(core) deal with SQLITE_BUSY: database is locked during tests

Summary:
During testing, and in some dev configurations, we make connections to
the Grist home database from multiple processes, and store it in Sqlite.
This is a situation prone to file locks in Sqlite. We set a busy_timeout
parameter to help, but transactions may still fail immediately. This
diff adds retries to our patch of TypeORM for Sqlite transactions
(which without the patch are already flakey for other reasons
https://github.com/typeorm/typeorm/issues/1884#issuecomment-380767213).

Without this change, I can tickly SQLITE_BUSY problems fairly
reliably with the DocApi tests. With this change, I can't so far.

This change should not affect deployments of our SaaS (which use
Postgres for home db) or grist-core/grist-ee (which access home
db from a single process in default configuration).

Test Plan: repeated running of DocApi tests

Reviewers: dsagal

Reviewed By: dsagal

Subscribers: dsagal

Differential Revision: https://phab.getgrist.com/D3466
pull/214/head
Paul Fitzpatrick 2 years ago
parent 1c6f80f956
commit ff77ecc6c6

@ -13,6 +13,7 @@
// changed during construction of a query.
import * as sqlite3 from '@gristlabs/sqlite3';
import {delay} from 'app/common/delay';
import * as log from 'app/server/lib/log';
import {Mutex, MutexInterface} from 'async-mutex';
import isEqual = require('lodash/isEqual');
@ -97,27 +98,83 @@ export function applyPatch() {
}
const queryRunner = this.connection.createQueryRunner();
const runInTransaction = typeof arg1 === "function" ? arg1 : arg2;
const isSqlite = this.connection.driver.options.type === 'sqlite';
try {
await queryRunner.startTransaction();
const result = await runInTransaction(queryRunner.manager);
await queryRunner.commitTransaction();
return result;
} catch (err) {
log.debug(`SQLite transaction error [${arg1} ${arg2}] - ${err}`);
try {
// we throw original error even if rollback thrown an error
await queryRunner.rollbackTransaction();
// tslint: disable-next-line
} catch (rollbackError) {
// tslint: disable-next-line
async function runOrRollback() {
try {
await queryRunner.startTransaction();
const result = await runInTransaction(queryRunner.manager);
await queryRunner.commitTransaction();
return result;
} catch (err) {
log.debug(`TypeORM transaction error [${arg1} ${arg2}] - ${err}`);
try {
// we throw original error even if rollback thrown an error
await queryRunner.rollbackTransaction();
// tslint: disable-next-line
} catch (rollbackError) {
// tslint: disable-next-line
}
throw err;
}
}
if (isSqlite) {
return await callWithRetry(runOrRollback, {
// Transactions may fail immediately if there are connections from
// multiple processes, regardless of busy_timeout setting. Add a
// retry for this kind of failure. This is relevant to tests, which
// use connections from multiple processes, but not to single-process
// instances of Grist, or instances of Grist that use Postgres for the
// home server.
worthRetry: (e) => Boolean(e.message.match(/SQLITE_BUSY/)),
firstDelayMsec: 10,
factor: 1.25,
maxTotalMsec: 3000,
});
} else {
// When not using SQLite, don't do anything special.
return await runOrRollback();
}
throw err;
} finally {
await queryRunner.release();
}
};
}
/**
* Call an operation, and if it fails with an error that is worth retrying
* (or any error if worthRetry callback is not specified), retry it after
* a delay of firstDelayMsec. Retries are repeated with delays growing by
* the specified factor (or 2.0 if not specified). Stop if maxTotalMsec is
* specified and has passed.
*/
async function callWithRetry<T>(op: () => Promise<T>, options: {
worthRetry?: (err: Error) => boolean,
maxTotalMsec?: number,
firstDelayMsec: number,
factor?: number,
}): Promise<T> {
const startedAt = Date.now();
let dt = options.firstDelayMsec;
while (true) { // eslint-disable-line no-constant-condition
try {
return await op();
} catch (e) {
// throw if not worth retrying
if (options.worthRetry && e instanceof Error && !options.worthRetry(e)) {
throw e;
}
// throw if max time has expired
if (options.maxTotalMsec && Date.now() - startedAt > options.maxTotalMsec) {
throw e;
}
// otherwise wait a bit and retry
await delay(dt);
dt *= options.factor ?? 2.0;
}
}
}
/**********************
* Patch 2

Loading…
Cancel
Save