(core) deal with SQLITE_BUSY: database is locked during tests

Summary:
During testing, and in some dev configurations, we make connections to
the Grist home database from multiple processes, and store it in Sqlite.
This is a situation prone to file locks in Sqlite. We set a busy_timeout
parameter to help, but transactions may still fail immediately. This
diff adds retries to our patch of TypeORM for Sqlite transactions
(which without the patch are already flakey for other reasons
https://github.com/typeorm/typeorm/issues/1884#issuecomment-380767213).

Without this change, I can tickly SQLITE_BUSY problems fairly
reliably with the DocApi tests. With this change, I can't so far.

This change should not affect deployments of our SaaS (which use
Postgres for home db) or grist-core/grist-ee (which access home
db from a single process in default configuration).

Test Plan: repeated running of DocApi tests

Reviewers: dsagal

Reviewed By: dsagal

Subscribers: dsagal

Differential Revision: https://phab.getgrist.com/D3466
This commit is contained in:
Paul Fitzpatrick 2022-06-03 14:38:26 -04:00
parent 1c6f80f956
commit ff77ecc6c6

View File

@ -13,6 +13,7 @@
// changed during construction of a query. // changed during construction of a query.
import * as sqlite3 from '@gristlabs/sqlite3'; import * as sqlite3 from '@gristlabs/sqlite3';
import {delay} from 'app/common/delay';
import * as log from 'app/server/lib/log'; import * as log from 'app/server/lib/log';
import {Mutex, MutexInterface} from 'async-mutex'; import {Mutex, MutexInterface} from 'async-mutex';
import isEqual = require('lodash/isEqual'); import isEqual = require('lodash/isEqual');
@ -97,27 +98,83 @@ export function applyPatch() {
} }
const queryRunner = this.connection.createQueryRunner(); const queryRunner = this.connection.createQueryRunner();
const runInTransaction = typeof arg1 === "function" ? arg1 : arg2; const runInTransaction = typeof arg1 === "function" ? arg1 : arg2;
const isSqlite = this.connection.driver.options.type === 'sqlite';
try { try {
await queryRunner.startTransaction(); async function runOrRollback() {
const result = await runInTransaction(queryRunner.manager); try {
await queryRunner.commitTransaction(); await queryRunner.startTransaction();
return result; const result = await runInTransaction(queryRunner.manager);
} catch (err) { await queryRunner.commitTransaction();
log.debug(`SQLite transaction error [${arg1} ${arg2}] - ${err}`); return result;
try { } catch (err) {
// we throw original error even if rollback thrown an error log.debug(`TypeORM transaction error [${arg1} ${arg2}] - ${err}`);
await queryRunner.rollbackTransaction(); try {
// tslint: disable-next-line // we throw original error even if rollback thrown an error
} catch (rollbackError) { await queryRunner.rollbackTransaction();
// tslint: disable-next-line // tslint: disable-next-line
} catch (rollbackError) {
// tslint: disable-next-line
}
throw err;
}
}
if (isSqlite) {
return await callWithRetry(runOrRollback, {
// Transactions may fail immediately if there are connections from
// multiple processes, regardless of busy_timeout setting. Add a
// retry for this kind of failure. This is relevant to tests, which
// use connections from multiple processes, but not to single-process
// instances of Grist, or instances of Grist that use Postgres for the
// home server.
worthRetry: (e) => Boolean(e.message.match(/SQLITE_BUSY/)),
firstDelayMsec: 10,
factor: 1.25,
maxTotalMsec: 3000,
});
} else {
// When not using SQLite, don't do anything special.
return await runOrRollback();
} }
throw err;
} finally { } finally {
await queryRunner.release(); await queryRunner.release();
} }
}; };
} }
/**
* Call an operation, and if it fails with an error that is worth retrying
* (or any error if worthRetry callback is not specified), retry it after
* a delay of firstDelayMsec. Retries are repeated with delays growing by
* the specified factor (or 2.0 if not specified). Stop if maxTotalMsec is
* specified and has passed.
*/
async function callWithRetry<T>(op: () => Promise<T>, options: {
worthRetry?: (err: Error) => boolean,
maxTotalMsec?: number,
firstDelayMsec: number,
factor?: number,
}): Promise<T> {
const startedAt = Date.now();
let dt = options.firstDelayMsec;
while (true) { // eslint-disable-line no-constant-condition
try {
return await op();
} catch (e) {
// throw if not worth retrying
if (options.worthRetry && e instanceof Error && !options.worthRetry(e)) {
throw e;
}
// throw if max time has expired
if (options.maxTotalMsec && Date.now() - startedAt > options.maxTotalMsec) {
throw e;
}
// otherwise wait a bit and retry
await delay(dt);
dt *= options.factor ?? 2.0;
}
}
}
/********************** /**********************
* Patch 2 * Patch 2