diff --git a/app/server/lib/ActiveDoc.ts b/app/server/lib/ActiveDoc.ts index a58b49db..312e8011 100644 --- a/app/server/lib/ActiveDoc.ts +++ b/app/server/lib/ActiveDoc.ts @@ -24,6 +24,7 @@ import { import {ApiError} from 'app/common/ApiError'; import {mapGetOrSet, MapWithTTL} from 'app/common/AsyncCreate'; import { + BulkUpdateRecord, CellValue, DocAction, TableDataAction, @@ -1298,6 +1299,27 @@ export class ActiveDoc extends EventEmitter { return sandboxActionBundle; } + /** + * Check which attachments in the _grist_Attachments metadata are actually used, + * i.e. referenced by some cell in an Attachments type column. + * Set timeDeleted to the current time on newly unused attachments, + * 'soft deleting' them so that they get cleaned up automatically from _gristsys_Files after enough time has passed. + * Set timeDeleted to null on used attachments that were previously soft deleted, + * so that undo can 'undelete' attachments. + */ + public async updateUsedAttachments() { + const changes = await this.docStorage.scanAttachmentsForUsageChanges(); + if (!changes.length) { + return; + } + const rowIds = changes.map(r => r.id); + const now = Date.now() / 1000; + const timeDeleted = changes.map(r => r.used ? null : now); + const action: BulkUpdateRecord = ["BulkUpdateRecord", "_grist_Attachments", rowIds, {timeDeleted}]; + // Don't use applyUserActions which may block the update action in delete-only mode + await this._applyUserActions(makeExceptionalDocSession('system'), [action]); + } + // Needed for test/server/migrations.js tests public async testGetVersionFromDataEngine() { return this._pyCall('get_version'); diff --git a/app/server/lib/DocApi.ts b/app/server/lib/DocApi.ts index 9a209e89..33786a3a 100644 --- a/app/server/lib/DocApi.ts +++ b/app/server/lib/DocApi.ts @@ -230,6 +230,12 @@ export class DocWorkerApi { .send(fileData); })); + // Mostly for testing + this._app.post('/api/docs/:docId/attachments/updateUsed', canEdit, withDoc(async (activeDoc, req, res) => { + await activeDoc.updateUsedAttachments(); + res.json(null); + })); + // Adds records given in a column oriented format, // returns an array of row IDs this._app.post('/api/docs/:docId/tables/:tableId/data', canEdit, diff --git a/app/server/lib/DocStorage.ts b/app/server/lib/DocStorage.ts index ad3203d0..61b4a4b6 100644 --- a/app/server/lib/DocStorage.ts +++ b/app/server/lib/DocStorage.ts @@ -1223,6 +1223,50 @@ export class DocStorage implements ISQLiteDB, OnDemandStorage { }); } + /** + * Returns an array of objects where: + * - `id` is a row ID of _grist_Attachments + * - `used` is true if and only if `id` is in a list in a cell of type Attachments + * - The value of `timeDeleted` in this row of _grist_Attachments needs to be updated + * because its truthiness doesn't match `used`, i.e. either: + * - a used attachment is marked as deleted, OR + * - an unused attachment is not marked as deleted + */ + public async scanAttachmentsForUsageChanges(): Promise<{ used: boolean, id: number }[]> { + // Array of SQL queries where attachment_ids contains JSON arrays (typically containg row IDs). + // Below we add one query for each column of type Attachments in the document. + // We always include this first dummy query because if the array is empty then the final SQL query + // will just have `()` causing a syntax error. + // We can't just return when there are no Attachments columns + // because we may still need to delete all remaining attachments. + const attachmentsQueries = ["SELECT '[0]' AS attachment_ids"]; + for (const [tableId, cols] of Object.entries(this._docSchema)) { + for (const [colId, type] of Object.entries(cols)) { + if (type === "Attachments") { + attachmentsQueries.push(` + SELECT t.${quoteIdent(colId)} AS attachment_ids + FROM ${quoteIdent(tableId)} AS t + WHERE json_valid(attachment_ids) + `); + } + } + } + + // `UNION ALL` instead of `UNION` because duplicate values are unlikely and deduplicating is not worth the cost + const allAttachmentsQuery = attachmentsQueries.join(' UNION ALL '); + + const sql = ` + WITH all_attachment_ids(id) AS ( + SELECT json_each.value AS id + FROM json_each(attachment_ids), (${allAttachmentsQuery}) + ) -- flatten out all the lists of IDs into a simple column of IDs + SELECT id, id IN all_attachment_ids AS used + FROM _grist_Attachments + WHERE used != (timeDeleted IS NULL); -- only include rows that need updating + `; + return (await this.all(sql)) as any[]; + } + public all(sql: string, ...args: any[]): Promise { return this._getDB().all(sql, ...args); } diff --git a/test/server/lib/DocApi.ts b/test/server/lib/DocApi.ts index 1f131043..422306f5 100644 --- a/test/server/lib/DocApi.ts +++ b/test/server/lib/DocApi.ts @@ -1,9 +1,9 @@ import {ActionSummary} from 'app/common/ActionSummary'; -import {BulkColValues} from 'app/common/DocActions'; +import {BulkColValues, UserAction} from 'app/common/DocActions'; import {arrayRepeat} from 'app/common/gutil'; import {DocState, UserAPIImpl} from 'app/common/UserAPI'; -import {AddOrUpdateRecord} from 'app/plugin/DocApiTypes'; import {teamFreeFeatures} from 'app/gen-server/entity/Product'; +import {AddOrUpdateRecord, Record as ApiRecord} from 'app/plugin/DocApiTypes'; import {CellValue, GristObjCode} from 'app/plugin/GristData'; import {applyQueryParameters, docDailyApiUsageKey} from 'app/server/lib/DocApi'; import * as log from 'app/server/lib/log'; @@ -1544,6 +1544,132 @@ function testDocApi() { assert.deepEqual(resp.headers['bad-header'], undefined); // Attempt to hack in more headers didn't work assert.deepEqual(resp.data, 'def'); }); + + it("POST /docs/{did}/attachments/updateUsed updates timeDeleted on metadata", async function() { + const wid = await getWorkspaceId(userApi, 'Private'); + const docId = await userApi.newDoc({name: 'TestDoc2'}, wid); + + // Apply the given user actions, + // POST to /attachments/updateUsed + // Check that Table1 and _grist_Attachments contain the expected rows + async function check( + actions: UserAction[], + userData: { id: number, Attached: any }[], + metaData: { id: number, deleted: boolean }[], + ) { + const docUrl = `${serverUrl}/api/docs/${docId}`; + + let resp = await axios.post(`${docUrl}/apply`, actions, chimpy); + assert.equal(resp.status, 200); + + resp = await axios.post(`${docUrl}/attachments/updateUsed`, actions, chimpy); + assert.equal(resp.status, 200); + + resp = await axios.get(`${docUrl}/tables/Table1/records`, chimpy); + const actualUserData = resp.data.records.map( + ({id, fields: {Attached}}: ApiRecord) => + ({id, Attached}) + ); + assert.deepEqual(actualUserData, userData); + + resp = await axios.get(`${docUrl}/tables/_grist_Attachments/records`, chimpy); + const actualMetaData = resp.data.records.map( + ({id, fields: {timeDeleted}}: ApiRecord) => + ({id, deleted: Boolean(timeDeleted)}) + ); + assert.deepEqual(actualMetaData, metaData); + } + + // Set up the document and initial data. + await check( + [ + ["AddColumn", "Table1", "Attached", {type: "Attachments"}], + ["BulkAddRecord", "Table1", [1, 2], {Attached: [['L', 1], ['L', 2, 3]]}], + // There's no actual attachments here but that doesn't matter + ["BulkAddRecord", "_grist_Attachments", [1, 2, 3], {}], + ], + [ + {id: 1, Attached: ['L', 1]}, + {id: 2, Attached: ['L', 2, 3]}, + ], + [ + {id: 1, deleted: false}, + {id: 2, deleted: false}, + {id: 3, deleted: false}, + ], + ); + + // Remove the record containing ['L', 2, 3], so the metadata for 2 and 3 now says deleted + await check( + [["RemoveRecord", "Table1", 2]], + [ + {id: 1, Attached: ['L', 1]}, + ], + [ + {id: 1, deleted: false}, + {id: 2, deleted: true}, // deleted here + {id: 3, deleted: true}, // deleted here + ], + ); + + // Add back a reference to attacument 2 to test 'undeletion', plus some junk values + await check( + [["BulkAddRecord", "Table1", [3, 4, 5], {Attached: [null, "foo", ['L', 2, 2, 4, 4, 5]]}]], + [ + {id: 1, Attached: ['L', 1]}, + {id: 3, Attached: null}, + {id: 4, Attached: "foo"}, + {id: 5, Attached: ['L', 2, 2, 4, 4, 5]}, + ], + [ + {id: 1, deleted: false}, + {id: 2, deleted: false}, // undeleted here + {id: 3, deleted: true}, + ], + ); + + // Remove the whole column to test what happens when there's no Attachment columns + await check( + [["RemoveColumn", "Table1", "Attached"]], + [ + {id: 1, Attached: undefined}, + {id: 3, Attached: undefined}, + {id: 4, Attached: undefined}, + {id: 5, Attached: undefined}, + ], + [ + {id: 1, deleted: true}, // deleted here + {id: 2, deleted: true}, // deleted here + {id: 3, deleted: true}, + ], + ); + + // Test performance with a large number of records and attachments. + // The maximum value of numRecords that doesn't return a 413 error is about 18,000. + // In that case it took about 5.7 seconds to apply the initial user actions (i.e. add the records), + // 0.3 seconds to call updateUsed once, and 0.1 seconds to call it again immediately after. + // That last time roughly measures the time taken to do the SQL query + // without having to apply any user actions after to update timeDeleted. + // 10,000 records is a compromise so that tests aren't too slow. + const numRecords = 10000; + const attachmentsPerRecord = 4; + const totalUsedAttachments = numRecords * attachmentsPerRecord; // 40,000 attachments referenced in user data + const totalAttachments = totalUsedAttachments * 1.1; // 44,000 attachment IDs listed in metadata + const attachedValues = _.chunk(_.range(1, totalUsedAttachments + 1), attachmentsPerRecord) + .map(arr => ['L', ...arr]); + await check( + [ + // Reset the state: add back the removed column and delete the previously added data + ["AddColumn", "Table1", "Attached", {type: "Attachments"}], + ["BulkRemoveRecord", "Table1", [1, 3, 4, 5]], + ["BulkRemoveRecord", "_grist_Attachments", [1, 2, 3]], + ["BulkAddRecord", "Table1", arrayRepeat(numRecords, null), {Attached: attachedValues}], + ["BulkAddRecord", "_grist_Attachments", arrayRepeat(totalAttachments, null), {}], + ], + attachedValues.map((Attached, index) => ({id: index + 1, Attached})), + _.range(totalAttachments).map(index => ({id: index + 1, deleted: index >= totalUsedAttachments})), + ); + }); }); it("GET /docs/{did}/download serves document", async function() {