(core) Add /attachments/updateUsed DocApi endpoint to soft delete all unused attachments in document

Summary:
Builds on https://phab.getgrist.com/D3352

Add DocStorage.scanAttachmentsForUsageChanges to do fancy JSON query to find all attachment metadata rows whose soft deletion status needs updating.

Add ActiveDoc.updateUsedAttachments which uses the above and then applies the appropriate user action if needed to soft delete/undelete metadata rows.

Add endpoint in DocApi calling ActiveDoc method.

Test Plan: Added DocApi test

Reviewers: paulfitz

Reviewed By: paulfitz

Differential Revision: https://phab.getgrist.com/D3357
This commit is contained in:
Alex Hall 2022-04-07 14:34:50 +02:00
parent a6ba40558a
commit 64369df4c3
4 changed files with 200 additions and 2 deletions

View File

@ -24,6 +24,7 @@ import {
import {ApiError} from 'app/common/ApiError';
import {mapGetOrSet, MapWithTTL} from 'app/common/AsyncCreate';
import {
BulkUpdateRecord,
CellValue,
DocAction,
TableDataAction,
@ -1298,6 +1299,27 @@ export class ActiveDoc extends EventEmitter {
return sandboxActionBundle;
}
/**
* Check which attachments in the _grist_Attachments metadata are actually used,
* i.e. referenced by some cell in an Attachments type column.
* Set timeDeleted to the current time on newly unused attachments,
* 'soft deleting' them so that they get cleaned up automatically from _gristsys_Files after enough time has passed.
* Set timeDeleted to null on used attachments that were previously soft deleted,
* so that undo can 'undelete' attachments.
*/
public async updateUsedAttachments() {
const changes = await this.docStorage.scanAttachmentsForUsageChanges();
if (!changes.length) {
return;
}
const rowIds = changes.map(r => r.id);
const now = Date.now() / 1000;
const timeDeleted = changes.map(r => r.used ? null : now);
const action: BulkUpdateRecord = ["BulkUpdateRecord", "_grist_Attachments", rowIds, {timeDeleted}];
// Don't use applyUserActions which may block the update action in delete-only mode
await this._applyUserActions(makeExceptionalDocSession('system'), [action]);
}
// Needed for test/server/migrations.js tests
public async testGetVersionFromDataEngine() {
return this._pyCall('get_version');

View File

@ -230,6 +230,12 @@ export class DocWorkerApi {
.send(fileData);
}));
// Mostly for testing
this._app.post('/api/docs/:docId/attachments/updateUsed', canEdit, withDoc(async (activeDoc, req, res) => {
await activeDoc.updateUsedAttachments();
res.json(null);
}));
// Adds records given in a column oriented format,
// returns an array of row IDs
this._app.post('/api/docs/:docId/tables/:tableId/data', canEdit,

View File

@ -1223,6 +1223,50 @@ export class DocStorage implements ISQLiteDB, OnDemandStorage {
});
}
/**
* Returns an array of objects where:
* - `id` is a row ID of _grist_Attachments
* - `used` is true if and only if `id` is in a list in a cell of type Attachments
* - The value of `timeDeleted` in this row of _grist_Attachments needs to be updated
* because its truthiness doesn't match `used`, i.e. either:
* - a used attachment is marked as deleted, OR
* - an unused attachment is not marked as deleted
*/
public async scanAttachmentsForUsageChanges(): Promise<{ used: boolean, id: number }[]> {
// Array of SQL queries where attachment_ids contains JSON arrays (typically containg row IDs).
// Below we add one query for each column of type Attachments in the document.
// We always include this first dummy query because if the array is empty then the final SQL query
// will just have `()` causing a syntax error.
// We can't just return when there are no Attachments columns
// because we may still need to delete all remaining attachments.
const attachmentsQueries = ["SELECT '[0]' AS attachment_ids"];
for (const [tableId, cols] of Object.entries(this._docSchema)) {
for (const [colId, type] of Object.entries(cols)) {
if (type === "Attachments") {
attachmentsQueries.push(`
SELECT t.${quoteIdent(colId)} AS attachment_ids
FROM ${quoteIdent(tableId)} AS t
WHERE json_valid(attachment_ids)
`);
}
}
}
// `UNION ALL` instead of `UNION` because duplicate values are unlikely and deduplicating is not worth the cost
const allAttachmentsQuery = attachmentsQueries.join(' UNION ALL ');
const sql = `
WITH all_attachment_ids(id) AS (
SELECT json_each.value AS id
FROM json_each(attachment_ids), (${allAttachmentsQuery})
) -- flatten out all the lists of IDs into a simple column of IDs
SELECT id, id IN all_attachment_ids AS used
FROM _grist_Attachments
WHERE used != (timeDeleted IS NULL); -- only include rows that need updating
`;
return (await this.all(sql)) as any[];
}
public all(sql: string, ...args: any[]): Promise<ResultRow[]> {
return this._getDB().all(sql, ...args);
}

View File

@ -1,9 +1,9 @@
import {ActionSummary} from 'app/common/ActionSummary';
import {BulkColValues} from 'app/common/DocActions';
import {BulkColValues, UserAction} from 'app/common/DocActions';
import {arrayRepeat} from 'app/common/gutil';
import {DocState, UserAPIImpl} from 'app/common/UserAPI';
import {AddOrUpdateRecord} from 'app/plugin/DocApiTypes';
import {teamFreeFeatures} from 'app/gen-server/entity/Product';
import {AddOrUpdateRecord, Record as ApiRecord} from 'app/plugin/DocApiTypes';
import {CellValue, GristObjCode} from 'app/plugin/GristData';
import {applyQueryParameters, docDailyApiUsageKey} from 'app/server/lib/DocApi';
import * as log from 'app/server/lib/log';
@ -1544,6 +1544,132 @@ function testDocApi() {
assert.deepEqual(resp.headers['bad-header'], undefined); // Attempt to hack in more headers didn't work
assert.deepEqual(resp.data, 'def');
});
it("POST /docs/{did}/attachments/updateUsed updates timeDeleted on metadata", async function() {
const wid = await getWorkspaceId(userApi, 'Private');
const docId = await userApi.newDoc({name: 'TestDoc2'}, wid);
// Apply the given user actions,
// POST to /attachments/updateUsed
// Check that Table1 and _grist_Attachments contain the expected rows
async function check(
actions: UserAction[],
userData: { id: number, Attached: any }[],
metaData: { id: number, deleted: boolean }[],
) {
const docUrl = `${serverUrl}/api/docs/${docId}`;
let resp = await axios.post(`${docUrl}/apply`, actions, chimpy);
assert.equal(resp.status, 200);
resp = await axios.post(`${docUrl}/attachments/updateUsed`, actions, chimpy);
assert.equal(resp.status, 200);
resp = await axios.get(`${docUrl}/tables/Table1/records`, chimpy);
const actualUserData = resp.data.records.map(
({id, fields: {Attached}}: ApiRecord) =>
({id, Attached})
);
assert.deepEqual(actualUserData, userData);
resp = await axios.get(`${docUrl}/tables/_grist_Attachments/records`, chimpy);
const actualMetaData = resp.data.records.map(
({id, fields: {timeDeleted}}: ApiRecord) =>
({id, deleted: Boolean(timeDeleted)})
);
assert.deepEqual(actualMetaData, metaData);
}
// Set up the document and initial data.
await check(
[
["AddColumn", "Table1", "Attached", {type: "Attachments"}],
["BulkAddRecord", "Table1", [1, 2], {Attached: [['L', 1], ['L', 2, 3]]}],
// There's no actual attachments here but that doesn't matter
["BulkAddRecord", "_grist_Attachments", [1, 2, 3], {}],
],
[
{id: 1, Attached: ['L', 1]},
{id: 2, Attached: ['L', 2, 3]},
],
[
{id: 1, deleted: false},
{id: 2, deleted: false},
{id: 3, deleted: false},
],
);
// Remove the record containing ['L', 2, 3], so the metadata for 2 and 3 now says deleted
await check(
[["RemoveRecord", "Table1", 2]],
[
{id: 1, Attached: ['L', 1]},
],
[
{id: 1, deleted: false},
{id: 2, deleted: true}, // deleted here
{id: 3, deleted: true}, // deleted here
],
);
// Add back a reference to attacument 2 to test 'undeletion', plus some junk values
await check(
[["BulkAddRecord", "Table1", [3, 4, 5], {Attached: [null, "foo", ['L', 2, 2, 4, 4, 5]]}]],
[
{id: 1, Attached: ['L', 1]},
{id: 3, Attached: null},
{id: 4, Attached: "foo"},
{id: 5, Attached: ['L', 2, 2, 4, 4, 5]},
],
[
{id: 1, deleted: false},
{id: 2, deleted: false}, // undeleted here
{id: 3, deleted: true},
],
);
// Remove the whole column to test what happens when there's no Attachment columns
await check(
[["RemoveColumn", "Table1", "Attached"]],
[
{id: 1, Attached: undefined},
{id: 3, Attached: undefined},
{id: 4, Attached: undefined},
{id: 5, Attached: undefined},
],
[
{id: 1, deleted: true}, // deleted here
{id: 2, deleted: true}, // deleted here
{id: 3, deleted: true},
],
);
// Test performance with a large number of records and attachments.
// The maximum value of numRecords that doesn't return a 413 error is about 18,000.
// In that case it took about 5.7 seconds to apply the initial user actions (i.e. add the records),
// 0.3 seconds to call updateUsed once, and 0.1 seconds to call it again immediately after.
// That last time roughly measures the time taken to do the SQL query
// without having to apply any user actions after to update timeDeleted.
// 10,000 records is a compromise so that tests aren't too slow.
const numRecords = 10000;
const attachmentsPerRecord = 4;
const totalUsedAttachments = numRecords * attachmentsPerRecord; // 40,000 attachments referenced in user data
const totalAttachments = totalUsedAttachments * 1.1; // 44,000 attachment IDs listed in metadata
const attachedValues = _.chunk(_.range(1, totalUsedAttachments + 1), attachmentsPerRecord)
.map(arr => ['L', ...arr]);
await check(
[
// Reset the state: add back the removed column and delete the previously added data
["AddColumn", "Table1", "Attached", {type: "Attachments"}],
["BulkRemoveRecord", "Table1", [1, 3, 4, 5]],
["BulkRemoveRecord", "_grist_Attachments", [1, 2, 3]],
["BulkAddRecord", "Table1", arrayRepeat(numRecords, null), {Attached: attachedValues}],
["BulkAddRecord", "_grist_Attachments", arrayRepeat(totalAttachments, null), {}],
],
attachedValues.map((Attached, index) => ({id: index + 1, Attached})),
_.range(totalAttachments).map(index => ({id: index + 1, deleted: index >= totalUsedAttachments})),
);
});
});
it("GET /docs/{did}/download serves document", async function() {