From 5b2de988b5dd1a1158de2e91c4424bb4d094d8c5 Mon Sep 17 00:00:00 2001 From: Dmitry S Date: Wed, 11 Nov 2020 16:29:11 -0500 Subject: [PATCH] (core) Perform migrations of Grist schema using only metadata tables when possible. Summary: Loading all user data to run a migration is risky (creates more than usual memory pressure), and almost never needed (only one migration requires it). This diff attempts to run migrations using only metadata (_grist_* tables), but retries if the sandbox tells it that all data is needed. The intent is for new migrations to avoid needing all data. Test Plan: Added a somewhat contrived unittest. Reviewers: paulfitz Reviewed By: paulfitz Differential Revision: https://phab.getgrist.com/D2659 --- app/server/lib/ActiveDoc.ts | 47 ++++++++++++++++++++++++++++++------ app/server/lib/DocStorage.ts | 19 +++------------ sandbox/grist/main.py | 4 +-- sandbox/grist/migrations.py | 31 +++++++++++++++++++----- 4 files changed, 70 insertions(+), 31 deletions(-) diff --git a/app/server/lib/ActiveDoc.ts b/app/server/lib/ActiveDoc.ts index c87baca4..56f4ec83 100644 --- a/app/server/lib/ActiveDoc.ts +++ b/app/server/lib/ActiveDoc.ts @@ -27,7 +27,7 @@ import {toTableDataAction} from 'app/common/DocActions'; import {DocData} from 'app/common/DocData'; import {DocSnapshots} from 'app/common/DocSnapshot'; import {EncActionBundleFromHub} from 'app/common/EncActionBundle'; -import {byteString} from 'app/common/gutil'; +import {byteString, countIf} from 'app/common/gutil'; import {InactivityTimer} from 'app/common/InactivityTimer'; import * as marshal from 'app/common/marshal'; import {Peer} from 'app/common/sharing'; @@ -1125,13 +1125,44 @@ export class ActiveDoc extends EventEmitter { * collaborators. */ private async _migrate(docSession: OptDocSession): Promise { - // TODO fetchAllTables() creates more memory pressure than usual since full data is present in - // memory at once both in node and in Python. This is liable to cause crashes. This doesn't - // even skip onDemand tables. We should try to migrate using metadata only. Data migrations - // would need to be done table-by-table, and differently still for on-demand tables. - const allTables = await this.docStorage.fetchAllTables(); - const docActions: DocAction[] = await this._dataEngine.pyCall('create_migrations', allTables); - this.logInfo(docSession, "_migrate: applying %d migration actions", docActions.length); + const tableNames = await this.docStorage.getAllTableNames(); + + // Fetch only metadata tables first, and try to migrate with only those. + const tableData: {[key: string]: Buffer|null} = {}; + for (const tableName of tableNames) { + if (tableName.startsWith('_grist_')) { + tableData[tableName] = await this.docStorage.fetchTable(tableName); + } + } + + let docActions: DocAction[]; + try { + // The last argument tells create_migrations() that only metadata is included. + docActions = await this._dataEngine.pyCall('create_migrations', tableData, true); + } catch (e) { + if (!/need all tables/.test(e.message)) { + throw e; + } + // If the migration failed because it needs all tables (i.e. involves changes to data), then + // fetch them all. TODO: This is used for some older migrations, and is relied on by tests. + // If a new migration needs this flag, more work is needed. The current approach creates + // more memory pressure than usual since full data is present in memory at once both in node + // and in Python; and it doesn't skip onDemand tables. This is liable to cause crashes. + this.logWarn(docSession, "_migrate: retrying with all tables"); + for (const tableName of tableNames) { + if (!tableData[tableName] && !tableName.startsWith('_gristsys_')) { + tableData[tableName] = await this.docStorage.fetchTable(tableName); + } + } + docActions = await this._dataEngine.pyCall('create_migrations', tableData); + } + + const processedTables = Object.keys(tableData); + const numSchema = countIf(processedTables, t => t.startsWith("_grist_")); + const numUser = countIf(processedTables, t => !t.startsWith("_grist_")); + this.logInfo(docSession, "_migrate: applying %d migration actions (processed %s schema, %s user tables)", + docActions.length, numSchema, numUser); + docActions.forEach((action, i) => this.logInfo(docSession, "_migrate: docAction %s: %s", i, shortDesc(action))); await this.docStorage.execTransaction(() => this.docStorage.applyStoredActions(docActions)); } diff --git a/app/server/lib/DocStorage.ts b/app/server/lib/DocStorage.ts index 505f04d6..fe0f5de1 100644 --- a/app/server/lib/DocStorage.ts +++ b/app/server/lib/DocStorage.ts @@ -714,22 +714,11 @@ export class DocStorage implements ISQLiteDB { } /** - * Fetches all tables from the database, and returns a dictionary mapping table names to encoded - * TableData objects (marshalled dicts mapping column ids to arrays of values). + * Fetches and returns the names of all tables in the database (including _gristsys_ tables). */ - public fetchAllTables(): Promise<{[key: string]: Buffer|null}> { - const tables: {[key: string]: Buffer|null} = {}; - return bluebird.Promise.each( - this.all("SELECT name FROM sqlite_master WHERE type='table'"), - (row: ResultRow) => { - if (!row.name.startsWith('_gristsys_')) { - return this.fetchTable(row.name) - .then(data => { - tables[row.name] = data; - }); - } - }) - .return(tables); + public async getAllTableNames(): Promise { + const rows = await this.all("SELECT name FROM sqlite_master WHERE type='table'"); + return rows.map(row => row.name); } /** diff --git a/sandbox/grist/main.py b/sandbox/grist/main.py index 75b53896..16742c18 100644 --- a/sandbox/grist/main.py +++ b/sandbox/grist/main.py @@ -97,9 +97,9 @@ def main(): return eng.load_table(table_data_from_db(table_name, table_data)) @export - def create_migrations(all_tables): + def create_migrations(all_tables, metadata_only=False): doc_actions = migrations.create_migrations( - {t: table_data_from_db(t, data) for t, data in all_tables.iteritems()}) + {t: table_data_from_db(t, data) for t, data in all_tables.iteritems()}, metadata_only) return map(actions.get_action_repr, doc_actions) @export diff --git a/sandbox/grist/migrations.py b/sandbox/grist/migrations.py index 4c984a78..664e836f 100644 --- a/sandbox/grist/migrations.py +++ b/sandbox/grist/migrations.py @@ -28,12 +28,18 @@ all_migrations = {} def noop_migration(_all_tables): return [] +# Each migration function includes a .need_all_tables attribute. See migration() decorator. +noop_migration.need_all_tables = False -def create_migrations(all_tables): + +def create_migrations(all_tables, metadata_only=False): """ Creates and returns a list of DocActions needed to bring this document to - schema.SCHEMA_VERSION. It requires as input all of the documents tables. - all_tables: all tables as a dictionary mapping table name to TableData. + schema.SCHEMA_VERSION. + all_tables: all tables or just the metadata tables (those named with _grist_ prefix) as a + dictionary mapping table name to TableData. + metadata_only: should be set if only metadata tables are passed in. If ALL tables are + required to process migrations, this method will raise a "need all tables..." exception. """ try: doc_version = all_tables['_grist_DocInfo'].columns["schemaVersion"][0] @@ -57,6 +63,7 @@ def create_migrations(all_tables): new_schema = {a.table_id: a for a in schema.schema_create_actions()} for table_id, data in sorted(all_tables.iteritems()): # User tables should already be in tdset; the rest must be metadata tables. + # (If metadata_only is true, there is simply nothing to skip here.) if table_id not in tdset.all_tables: new_col_info = {} if table_id in new_schema: @@ -71,6 +78,9 @@ def create_migrations(all_tables): migration_actions = [] for version in xrange(doc_version + 1, schema.SCHEMA_VERSION + 1): + migration_func = all_migrations.get(version, noop_migration) + if migration_func.need_all_tables and metadata_only: + raise Exception("need all tables for migration to %s" % version) migration_actions.extend(all_migrations.get(version, noop_migration)(tdset)) # Note that if we are downgrading versions (i.e. doc_version is higher), then the following is @@ -86,12 +96,19 @@ def get_last_migration_version(): """ return max(all_migrations.iterkeys()) -def migration(schema_version): +def migration(schema_version, need_all_tables=False): """ Decorator for migrations that associates the decorated migration function with the given - schema_version. This decorate function will be run to migrate forward to schema_version. + schema_version. This decorated function will be run to migrate forward to schema_version. + + Migrations are first attempted with only metadata tables, but if any required migration function + is marked with need_all_tables=True, then the migration will be retried with all tables. + + NOTE: new migrations should NOT set need_all_tables=True; it would require more work to process + very large documents safely (incuding those containing on-demand tables). """ def add_migration(migration_func): + migration_func.need_all_tables = need_all_tables all_migrations[schema_version] = migration_func return migration_func return add_migration @@ -653,7 +670,9 @@ def migration16(tdset): return tdset.apply_doc_actions(doc_actions) -@migration(schema_version=17) +# This is actually the only migration that requires all tables because it modifies user data +# (specifically, any columns of the deprecated "Image" type). +@migration(schema_version=17, need_all_tables=True) def migration17(tdset): """ There is no longer an "Image" type for columns, as "Attachments" now serves as a