From 64369df4c303393f5d4f2ceeb0b2a826ce8600ec Mon Sep 17 00:00:00 2001 From: Alex Hall Date: Thu, 7 Apr 2022 14:34:50 +0200 Subject: [PATCH] (core) Add /attachments/updateUsed DocApi endpoint to soft delete all unused attachments in document Summary: Builds on https://phab.getgrist.com/D3352 Add DocStorage.scanAttachmentsForUsageChanges to do fancy JSON query to find all attachment metadata rows whose soft deletion status needs updating. Add ActiveDoc.updateUsedAttachments which uses the above and then applies the appropriate user action if needed to soft delete/undelete metadata rows. Add endpoint in DocApi calling ActiveDoc method. Test Plan: Added DocApi test Reviewers: paulfitz Reviewed By: paulfitz Differential Revision: https://phab.getgrist.com/D3357 --- app/server/lib/ActiveDoc.ts | 22 ++++++ app/server/lib/DocApi.ts | 6 ++ app/server/lib/DocStorage.ts | 44 ++++++++++++ test/server/lib/DocApi.ts | 130 ++++++++++++++++++++++++++++++++++- 4 files changed, 200 insertions(+), 2 deletions(-) diff --git a/app/server/lib/ActiveDoc.ts b/app/server/lib/ActiveDoc.ts index a58b49db..312e8011 100644 --- a/app/server/lib/ActiveDoc.ts +++ b/app/server/lib/ActiveDoc.ts @@ -24,6 +24,7 @@ import { import {ApiError} from 'app/common/ApiError'; import {mapGetOrSet, MapWithTTL} from 'app/common/AsyncCreate'; import { + BulkUpdateRecord, CellValue, DocAction, TableDataAction, @@ -1298,6 +1299,27 @@ export class ActiveDoc extends EventEmitter { return sandboxActionBundle; } + /** + * Check which attachments in the _grist_Attachments metadata are actually used, + * i.e. referenced by some cell in an Attachments type column. + * Set timeDeleted to the current time on newly unused attachments, + * 'soft deleting' them so that they get cleaned up automatically from _gristsys_Files after enough time has passed. + * Set timeDeleted to null on used attachments that were previously soft deleted, + * so that undo can 'undelete' attachments. + */ + public async updateUsedAttachments() { + const changes = await this.docStorage.scanAttachmentsForUsageChanges(); + if (!changes.length) { + return; + } + const rowIds = changes.map(r => r.id); + const now = Date.now() / 1000; + const timeDeleted = changes.map(r => r.used ? null : now); + const action: BulkUpdateRecord = ["BulkUpdateRecord", "_grist_Attachments", rowIds, {timeDeleted}]; + // Don't use applyUserActions which may block the update action in delete-only mode + await this._applyUserActions(makeExceptionalDocSession('system'), [action]); + } + // Needed for test/server/migrations.js tests public async testGetVersionFromDataEngine() { return this._pyCall('get_version'); diff --git a/app/server/lib/DocApi.ts b/app/server/lib/DocApi.ts index 9a209e89..33786a3a 100644 --- a/app/server/lib/DocApi.ts +++ b/app/server/lib/DocApi.ts @@ -230,6 +230,12 @@ export class DocWorkerApi { .send(fileData); })); + // Mostly for testing + this._app.post('/api/docs/:docId/attachments/updateUsed', canEdit, withDoc(async (activeDoc, req, res) => { + await activeDoc.updateUsedAttachments(); + res.json(null); + })); + // Adds records given in a column oriented format, // returns an array of row IDs this._app.post('/api/docs/:docId/tables/:tableId/data', canEdit, diff --git a/app/server/lib/DocStorage.ts b/app/server/lib/DocStorage.ts index ad3203d0..61b4a4b6 100644 --- a/app/server/lib/DocStorage.ts +++ b/app/server/lib/DocStorage.ts @@ -1223,6 +1223,50 @@ export class DocStorage implements ISQLiteDB, OnDemandStorage { }); } + /** + * Returns an array of objects where: + * - `id` is a row ID of _grist_Attachments + * - `used` is true if and only if `id` is in a list in a cell of type Attachments + * - The value of `timeDeleted` in this row of _grist_Attachments needs to be updated + * because its truthiness doesn't match `used`, i.e. either: + * - a used attachment is marked as deleted, OR + * - an unused attachment is not marked as deleted + */ + public async scanAttachmentsForUsageChanges(): Promise<{ used: boolean, id: number }[]> { + // Array of SQL queries where attachment_ids contains JSON arrays (typically containg row IDs). + // Below we add one query for each column of type Attachments in the document. + // We always include this first dummy query because if the array is empty then the final SQL query + // will just have `()` causing a syntax error. + // We can't just return when there are no Attachments columns + // because we may still need to delete all remaining attachments. + const attachmentsQueries = ["SELECT '[0]' AS attachment_ids"]; + for (const [tableId, cols] of Object.entries(this._docSchema)) { + for (const [colId, type] of Object.entries(cols)) { + if (type === "Attachments") { + attachmentsQueries.push(` + SELECT t.${quoteIdent(colId)} AS attachment_ids + FROM ${quoteIdent(tableId)} AS t + WHERE json_valid(attachment_ids) + `); + } + } + } + + // `UNION ALL` instead of `UNION` because duplicate values are unlikely and deduplicating is not worth the cost + const allAttachmentsQuery = attachmentsQueries.join(' UNION ALL '); + + const sql = ` + WITH all_attachment_ids(id) AS ( + SELECT json_each.value AS id + FROM json_each(attachment_ids), (${allAttachmentsQuery}) + ) -- flatten out all the lists of IDs into a simple column of IDs + SELECT id, id IN all_attachment_ids AS used + FROM _grist_Attachments + WHERE used != (timeDeleted IS NULL); -- only include rows that need updating + `; + return (await this.all(sql)) as any[]; + } + public all(sql: string, ...args: any[]): Promise { return this._getDB().all(sql, ...args); } diff --git a/test/server/lib/DocApi.ts b/test/server/lib/DocApi.ts index 1f131043..422306f5 100644 --- a/test/server/lib/DocApi.ts +++ b/test/server/lib/DocApi.ts @@ -1,9 +1,9 @@ import {ActionSummary} from 'app/common/ActionSummary'; -import {BulkColValues} from 'app/common/DocActions'; +import {BulkColValues, UserAction} from 'app/common/DocActions'; import {arrayRepeat} from 'app/common/gutil'; import {DocState, UserAPIImpl} from 'app/common/UserAPI'; -import {AddOrUpdateRecord} from 'app/plugin/DocApiTypes'; import {teamFreeFeatures} from 'app/gen-server/entity/Product'; +import {AddOrUpdateRecord, Record as ApiRecord} from 'app/plugin/DocApiTypes'; import {CellValue, GristObjCode} from 'app/plugin/GristData'; import {applyQueryParameters, docDailyApiUsageKey} from 'app/server/lib/DocApi'; import * as log from 'app/server/lib/log'; @@ -1544,6 +1544,132 @@ function testDocApi() { assert.deepEqual(resp.headers['bad-header'], undefined); // Attempt to hack in more headers didn't work assert.deepEqual(resp.data, 'def'); }); + + it("POST /docs/{did}/attachments/updateUsed updates timeDeleted on metadata", async function() { + const wid = await getWorkspaceId(userApi, 'Private'); + const docId = await userApi.newDoc({name: 'TestDoc2'}, wid); + + // Apply the given user actions, + // POST to /attachments/updateUsed + // Check that Table1 and _grist_Attachments contain the expected rows + async function check( + actions: UserAction[], + userData: { id: number, Attached: any }[], + metaData: { id: number, deleted: boolean }[], + ) { + const docUrl = `${serverUrl}/api/docs/${docId}`; + + let resp = await axios.post(`${docUrl}/apply`, actions, chimpy); + assert.equal(resp.status, 200); + + resp = await axios.post(`${docUrl}/attachments/updateUsed`, actions, chimpy); + assert.equal(resp.status, 200); + + resp = await axios.get(`${docUrl}/tables/Table1/records`, chimpy); + const actualUserData = resp.data.records.map( + ({id, fields: {Attached}}: ApiRecord) => + ({id, Attached}) + ); + assert.deepEqual(actualUserData, userData); + + resp = await axios.get(`${docUrl}/tables/_grist_Attachments/records`, chimpy); + const actualMetaData = resp.data.records.map( + ({id, fields: {timeDeleted}}: ApiRecord) => + ({id, deleted: Boolean(timeDeleted)}) + ); + assert.deepEqual(actualMetaData, metaData); + } + + // Set up the document and initial data. + await check( + [ + ["AddColumn", "Table1", "Attached", {type: "Attachments"}], + ["BulkAddRecord", "Table1", [1, 2], {Attached: [['L', 1], ['L', 2, 3]]}], + // There's no actual attachments here but that doesn't matter + ["BulkAddRecord", "_grist_Attachments", [1, 2, 3], {}], + ], + [ + {id: 1, Attached: ['L', 1]}, + {id: 2, Attached: ['L', 2, 3]}, + ], + [ + {id: 1, deleted: false}, + {id: 2, deleted: false}, + {id: 3, deleted: false}, + ], + ); + + // Remove the record containing ['L', 2, 3], so the metadata for 2 and 3 now says deleted + await check( + [["RemoveRecord", "Table1", 2]], + [ + {id: 1, Attached: ['L', 1]}, + ], + [ + {id: 1, deleted: false}, + {id: 2, deleted: true}, // deleted here + {id: 3, deleted: true}, // deleted here + ], + ); + + // Add back a reference to attacument 2 to test 'undeletion', plus some junk values + await check( + [["BulkAddRecord", "Table1", [3, 4, 5], {Attached: [null, "foo", ['L', 2, 2, 4, 4, 5]]}]], + [ + {id: 1, Attached: ['L', 1]}, + {id: 3, Attached: null}, + {id: 4, Attached: "foo"}, + {id: 5, Attached: ['L', 2, 2, 4, 4, 5]}, + ], + [ + {id: 1, deleted: false}, + {id: 2, deleted: false}, // undeleted here + {id: 3, deleted: true}, + ], + ); + + // Remove the whole column to test what happens when there's no Attachment columns + await check( + [["RemoveColumn", "Table1", "Attached"]], + [ + {id: 1, Attached: undefined}, + {id: 3, Attached: undefined}, + {id: 4, Attached: undefined}, + {id: 5, Attached: undefined}, + ], + [ + {id: 1, deleted: true}, // deleted here + {id: 2, deleted: true}, // deleted here + {id: 3, deleted: true}, + ], + ); + + // Test performance with a large number of records and attachments. + // The maximum value of numRecords that doesn't return a 413 error is about 18,000. + // In that case it took about 5.7 seconds to apply the initial user actions (i.e. add the records), + // 0.3 seconds to call updateUsed once, and 0.1 seconds to call it again immediately after. + // That last time roughly measures the time taken to do the SQL query + // without having to apply any user actions after to update timeDeleted. + // 10,000 records is a compromise so that tests aren't too slow. + const numRecords = 10000; + const attachmentsPerRecord = 4; + const totalUsedAttachments = numRecords * attachmentsPerRecord; // 40,000 attachments referenced in user data + const totalAttachments = totalUsedAttachments * 1.1; // 44,000 attachment IDs listed in metadata + const attachedValues = _.chunk(_.range(1, totalUsedAttachments + 1), attachmentsPerRecord) + .map(arr => ['L', ...arr]); + await check( + [ + // Reset the state: add back the removed column and delete the previously added data + ["AddColumn", "Table1", "Attached", {type: "Attachments"}], + ["BulkRemoveRecord", "Table1", [1, 3, 4, 5]], + ["BulkRemoveRecord", "_grist_Attachments", [1, 2, 3]], + ["BulkAddRecord", "Table1", arrayRepeat(numRecords, null), {Attached: attachedValues}], + ["BulkAddRecord", "_grist_Attachments", arrayRepeat(totalAttachments, null), {}], + ], + attachedValues.map((Attached, index) => ({id: index + 1, Attached})), + _.range(totalAttachments).map(index => ({id: index + 1, deleted: index >= totalUsedAttachments})), + ); + }); }); it("GET /docs/{did}/download serves document", async function() {