From ff77ecc6c60fe17d013108eca0c1b3baf5a4f72b Mon Sep 17 00:00:00 2001 From: Paul Fitzpatrick Date: Fri, 3 Jun 2022 14:38:26 -0400 Subject: [PATCH] (core) deal with SQLITE_BUSY: database is locked during tests Summary: During testing, and in some dev configurations, we make connections to the Grist home database from multiple processes, and store it in Sqlite. This is a situation prone to file locks in Sqlite. We set a busy_timeout parameter to help, but transactions may still fail immediately. This diff adds retries to our patch of TypeORM for Sqlite transactions (which without the patch are already flakey for other reasons https://github.com/typeorm/typeorm/issues/1884#issuecomment-380767213). Without this change, I can tickly SQLITE_BUSY problems fairly reliably with the DocApi tests. With this change, I can't so far. This change should not affect deployments of our SaaS (which use Postgres for home db) or grist-core/grist-ee (which access home db from a single process in default configuration). Test Plan: repeated running of DocApi tests Reviewers: dsagal Reviewed By: dsagal Subscribers: dsagal Differential Revision: https://phab.getgrist.com/D3466 --- app/gen-server/lib/TypeORMPatches.ts | 83 +++++++++++++++++++++++----- 1 file changed, 70 insertions(+), 13 deletions(-) diff --git a/app/gen-server/lib/TypeORMPatches.ts b/app/gen-server/lib/TypeORMPatches.ts index 12f1bc1a..f67145ff 100644 --- a/app/gen-server/lib/TypeORMPatches.ts +++ b/app/gen-server/lib/TypeORMPatches.ts @@ -13,6 +13,7 @@ // changed during construction of a query. import * as sqlite3 from '@gristlabs/sqlite3'; +import {delay} from 'app/common/delay'; import * as log from 'app/server/lib/log'; import {Mutex, MutexInterface} from 'async-mutex'; import isEqual = require('lodash/isEqual'); @@ -97,27 +98,83 @@ export function applyPatch() { } const queryRunner = this.connection.createQueryRunner(); const runInTransaction = typeof arg1 === "function" ? arg1 : arg2; + const isSqlite = this.connection.driver.options.type === 'sqlite'; try { - await queryRunner.startTransaction(); - const result = await runInTransaction(queryRunner.manager); - await queryRunner.commitTransaction(); - return result; - } catch (err) { - log.debug(`SQLite transaction error [${arg1} ${arg2}] - ${err}`); - try { - // we throw original error even if rollback thrown an error - await queryRunner.rollbackTransaction(); - // tslint: disable-next-line - } catch (rollbackError) { - // tslint: disable-next-line + async function runOrRollback() { + try { + await queryRunner.startTransaction(); + const result = await runInTransaction(queryRunner.manager); + await queryRunner.commitTransaction(); + return result; + } catch (err) { + log.debug(`TypeORM transaction error [${arg1} ${arg2}] - ${err}`); + try { + // we throw original error even if rollback thrown an error + await queryRunner.rollbackTransaction(); + // tslint: disable-next-line + } catch (rollbackError) { + // tslint: disable-next-line + } + throw err; + } + } + if (isSqlite) { + return await callWithRetry(runOrRollback, { + // Transactions may fail immediately if there are connections from + // multiple processes, regardless of busy_timeout setting. Add a + // retry for this kind of failure. This is relevant to tests, which + // use connections from multiple processes, but not to single-process + // instances of Grist, or instances of Grist that use Postgres for the + // home server. + worthRetry: (e) => Boolean(e.message.match(/SQLITE_BUSY/)), + firstDelayMsec: 10, + factor: 1.25, + maxTotalMsec: 3000, + }); + } else { + // When not using SQLite, don't do anything special. + return await runOrRollback(); } - throw err; } finally { await queryRunner.release(); } }; } +/** + * Call an operation, and if it fails with an error that is worth retrying + * (or any error if worthRetry callback is not specified), retry it after + * a delay of firstDelayMsec. Retries are repeated with delays growing by + * the specified factor (or 2.0 if not specified). Stop if maxTotalMsec is + * specified and has passed. + */ +async function callWithRetry(op: () => Promise, options: { + worthRetry?: (err: Error) => boolean, + maxTotalMsec?: number, + firstDelayMsec: number, + factor?: number, +}): Promise { + const startedAt = Date.now(); + let dt = options.firstDelayMsec; + while (true) { // eslint-disable-line no-constant-condition + try { + return await op(); + } catch (e) { + // throw if not worth retrying + if (options.worthRetry && e instanceof Error && !options.worthRetry(e)) { + throw e; + } + // throw if max time has expired + if (options.maxTotalMsec && Date.now() - startedAt > options.maxTotalMsec) { + throw e; + } + // otherwise wait a bit and retry + await delay(dt); + dt *= options.factor ?? 2.0; + } + } +} + /********************** * Patch 2