(core) be careful when reassigning a doc to a worker it was on before

Summary:
Importing a .grist document is implemented in a somewhat clunky way in a multi-worker setup:

* First, a random worker receives the upload, and updates Grist's various stores appropriately (database, redis, s3).
* Then, a random worker is assigned to serve the document.

If the worker serving the document fails, there is a chance it will end up assigned to the worker that handled its upload. Currently the worker will misbehave in this case. This diff:

* Ports a multi-worker test from test/home to run in test/s3, and adds a test simulating a bad scenario seen in the wild.
* Fixes persistence of any existing document checksum in redis when a worker is assigned.
* Adds a check, when a worker is assigned a document to serve and finds that document already cached locally, that the cached copy is safe to use. It isn't safe to rely only on the document checksum in redis, since that may have expired.
* Explicitly claims the document on the uploading worker, so this situation becomes even less likely to arise.

Test Plan: added test

Reviewers: dsagal

Reviewed By: dsagal

Subscribers: dsagal

Differential Revision: https://phab.getgrist.com/D3305
commit c4d3d7d3bb
parent 321019217d
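For context on the fix below: the `doc-<docId>-checksum` key in redis is written with a TTL, so a worker that still holds a stale local copy of a document may find no checksum left to compare against. A minimal standalone sketch of that expiry behaviour (key names and TTLs here are illustrative, not Grist code):

// Toy model of an expiring checksum key; illustrative only.
type Store = Map<string, {value: string, expiresAt: number}>;

function setex(store: Store, key: string, ttlSec: number, value: string): void {
  store.set(key, {value, expiresAt: Date.now() + ttlSec * 1000});
}

function get(store: Store, key: string): string|null {
  const entry = store.get(key);
  return (entry && entry.expiresAt > Date.now()) ? entry.value : null;
}

const redis: Store = new Map();
setex(redis, 'doc-doc123-checksum', 1, 'md5-of-latest-version');  // written at assignment time

setTimeout(() => {
  // After the TTL elapses the checksum is simply gone, so a worker holding
  // an outdated local copy of doc123 sees no mismatch to warn it; hence the
  // extra check against external storage added by this commit.
  console.log(get(redis, 'doc-doc123-checksum'));  // prints: null
}, 1100);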
@@ -308,26 +308,8 @@ export class DocWorkerMap implements IDocWorkerMap {
    * refused and need to retry.
    */
   public async getDocWorker(docId: string): Promise<DocStatus|null> {
-    // Fetch the various elements that go into making a DocStatus
-    const props = await this._client.multi()
-      .hgetall(`doc-${docId}`)
-      .get(`doc-${docId}-checksum`)
-      .execAsync() as [{[key: string]: any}|null, string|null]|null;
-    if (!props) { return null; }
-
-    // If there is no worker, return null. An alternative would be to modify
-    // DocStatus so that it is possible for it to not have a worker assignment.
-    if (!props[0]) { return null; }
-
-    // Fields are JSON encoded since redis cannot store them directly.
-    const doc = mapValues(props[0], (val) => JSON.parse(val));
-
-    // Redis cannot store a null value, so we encode it as 'null', which does
-    // not match any possible MD5.
-    doc.docMD5 = props[1] === 'null' ? null : props[1];
-
-    // Ok, we have a valid DocStatus at this point.
-    return doc as DocStatus;
+    const {doc} = await this._getDocAndChecksum(docId);
+    return doc;
   }

   /**
@@ -377,7 +359,8 @@ export class DocWorkerMap implements IDocWorkerMap {
     try {
       // Now that we've locked, recheck that the worker hasn't been reassigned
       // in the meantime. Return immediately if it has.
-      docStatus = await this.getDocWorker(docId);
+      const docAndChecksum = await this._getDocAndChecksum(docId);
+      docStatus = docAndChecksum.doc;
       if (docStatus) { return docStatus; }

       if (!workerId) {
@@ -408,8 +391,9 @@ export class DocWorkerMap implements IDocWorkerMap {
       const docWorker = await this._client.hgetallAsync(`worker-${workerId}`) as DocWorkerInfo|null;
       if (!docWorker) { throw new Error('no doc worker contact info available'); }

-      // We can now construct a DocStatus.
-      const newDocStatus = {docMD5: null, docWorker, isActive: true};
+      // We can now construct a DocStatus, preserving any existing checksum.
+      const checksum = docAndChecksum.checksum;
+      const newDocStatus = {docMD5: checksum, docWorker, isActive: true};

       // We add the assignment to worker-{workerId}-docs and save doc-{docId}.
       const result = await this._client.multi()
@@ -418,7 +402,7 @@ export class DocWorkerMap implements IDocWorkerMap {
           docWorker: JSON.stringify(docWorker), // redis can't store nested objects, strings only
           isActive: JSON.stringify(true)        // redis can't store booleans, strings only
         })
-        .setex(`doc-${docId}-checksum`, CHECKSUM_TTL_MSEC / 1000.0, 'null')
+        .setex(`doc-${docId}-checksum`, CHECKSUM_TTL_MSEC / 1000.0, checksum || 'null')
         .execAsync();
       if (!result) { throw new Error('failed to store new assignment'); }
       return newDocStatus;
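The `'null'` sentinel above deserves a note: redis cannot store an actual null, so the code stores the literal string `'null'`, which can never equal a real MD5 hex digest. A sketch of that round-trip (these helper names are made up for illustration):

// Encode/decode the "no checksum" case for redis storage; illustrative helpers.
function encodeChecksum(md5: string|null): string {
  return md5 === null ? 'null' : md5;   // 'null' can't collide with a hex MD5
}

function decodeChecksum(stored: string|null): string|null {
  return stored === 'null' ? null : stored;
}

console.log(decodeChecksum(encodeChecksum(null)));        // null
console.log(decodeChecksum(encodeChecksum('d41d8cd9')));  // 'd41d8cd9'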
@@ -522,6 +506,28 @@ export class DocWorkerMap implements IDocWorkerMap {
   public async getDocGroup(docId: string): Promise<string|null> {
     return this._client.getAsync(`doc-${docId}-group`);
   }

+  /**
+   * Fetch the doc-<docId> hash and doc-<docId>-checksum key from redis.
+   * Return as a decoded DocStatus and a checksum.
+   */
+  private async _getDocAndChecksum(docId: string): Promise<{
+    doc: DocStatus|null,
+    checksum: string|null,
+  }> {
+    // Fetch the various elements that go into making a DocStatus.
+    const props = await this._client.multi()
+      .hgetall(`doc-${docId}`)
+      .get(`doc-${docId}-checksum`)
+      .execAsync() as [{[key: string]: any}|null, string|null]|null;
+    // Fields are JSON encoded since redis cannot store them directly.
+    const doc = props?.[0] ? mapValues(props[0], (val) => JSON.parse(val)) as DocStatus : null;
+    // Redis cannot store a null value, so we encode it as 'null', which does
+    // not match any possible MD5.
+    const checksum = (props?.[1] === 'null' ? null : props?.[1]) || null;
+    if (doc) { doc.docMD5 = checksum; }  // The checksum goes in the DocStatus too.
+    return {doc, checksum};
+  }
 }

 // If we don't have redis available and use a DummyDocWorker, it should be a singleton.
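Since `_getDocAndChecksum` decodes every hash field with `JSON.parse`, the write side must JSON-encode symmetrically, as the fields written in the earlier hunk do. A small sketch of the round-trip, assuming lodash's `mapValues` as used above:

import mapValues = require('lodash/mapValues');

// What goes into redis: every field is a JSON string.
const stored = {
  docWorker: JSON.stringify({id: 'worker1', publicUrl: 'http://localhost:8080'}),
  isActive: JSON.stringify(true),
};

// What comes back from hgetall: strings, which decode to real types.
const decoded = mapValues(stored, (val) => JSON.parse(val));
console.log(decoded.isActive === true);    // true, a boolean again
console.log(decoded.docWorker.publicUrl);  // 'http://localhost:8080'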
@@ -530,7 +530,13 @@ export class DocManager extends EventEmitter {
       const srcDocPath = uploadInfo.files[0].absPath;
       await checkAllegedGristDoc(docSession, srcDocPath);
       await docUtils.copyFile(srcDocPath, docPath);
-      await this.storageManager.addToStorage(docName);
+      // Go ahead and claim this document. If we wanted to serve it
+      // from a potentially different worker, we'd call addToStorage(docName)
+      // instead (we used to do this). The upload should already be happening
+      // on a randomly assigned worker due to the special treatment of the
+      // 'import' assignmentId.
+      await this.storageManager.prepareLocalDoc(docName);
       this.storageManager.markAsChanged(docName, 'edit');
       return {title: basename, id: docName};
     } else {
       const doc = await this.createNewEmptyDoc(docSession, id);
@@ -163,6 +163,18 @@ export class DocSnapshotInventory implements IInventory {
     });
   }

+  /**
+   * Wipe local cached state of the inventory.
+   */
+  public async clear(key: string) {
+    await this._mutex.runExclusive(key, async() => {
+      await this._flush(key);
+      const fname = await this._getFilename(key);
+      // NOTE: fse.remove succeeds also when the file does not exist.
+      await fse.remove(fname);
+    });
+  }
+
   /**
    * Remove a set of snapshots from the inventory, and then flush to S3.
    */
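`clear` relies on `this._mutex.runExclusive(key, ...)` to serialize operations per document. Grist has its own implementation; as a rough sketch of the assumed pattern (callbacks sharing a key run one at a time, distinct keys run independently):

class KeyedMutex {
  private _chains = new Map<string, Promise<unknown>>();

  public runExclusive<T>(key: string, fn: () => Promise<T>): Promise<T> {
    const prev = this._chains.get(key) || Promise.resolve();
    // Run fn only after the previous holder of this key settles.
    const next = prev.then(fn, fn);
    // Keep the chain alive even if fn rejects.
    this._chains.set(key, next.catch(() => undefined));
    return next;
  }
}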
@@ -158,9 +158,14 @@ export class FlexServer implements GristServer {
     log.info(`== Grist version is ${version.version} (commit ${version.gitcommit})`);
     this.info.push(['appRoot', this.appRoot]);
     // This directory holds Grist documents.
-    const docsRoot = path.resolve((this.options && this.options.dataDir) ||
+    let docsRoot = path.resolve((this.options && this.options.dataDir) ||
                                   process.env.GRIST_DATA_DIR ||
                                   getAppPathTo(this.appRoot, 'samples'));
+    // In testing, it can be useful to separate out document roots used
+    // by distinct FlexServers.
+    if (process.env.GRIST_TEST_ADD_PORT_TO_DOCS_ROOT === 'true') {
+      docsRoot = path.resolve(docsRoot, String(port));
+    }
     // Create directory if it doesn't exist.
     // TODO: track down all dependencies on 'samples' existing in tests and
     // in dev environment, and remove them. Then it would probably be best
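The effect of the new flag, with assumed example paths: under GRIST_DATA_DIR=/tmp/grist-docs and GRIST_TEST_ADD_PORT_TO_DOCS_ROOT=true, a worker listening on port 8080 keeps its documents under /tmp/grist-docs/8080 while a sibling on 8081 uses /tmp/grist-docs/8081, so multi-worker tests on one machine cannot clobber each other's files. Equivalent standalone logic:

import * as path from 'path';

function docsRootFor(dataDir: string, port: number): string {
  const base = path.resolve(dataDir);
  return process.env.GRIST_TEST_ADD_PORT_TO_DOCS_ROOT === 'true'
    ? path.resolve(base, String(port))   // per-port root, for side-by-side workers
    : base;
}

console.log(docsRootFor('/tmp/grist-docs', 8080));  // e.g. /tmp/grist-docs/8080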
@@ -1308,6 +1313,16 @@ export class FlexServer implements GristServer {
     return this.tag;
   }

+  /**
+   * Make sure external storage of all docs is up to date.
+   */
+  public async testFlushDocs() {
+    const assignments = await this._docWorkerMap.getAssignments(this.worker.id);
+    for (const assignment of assignments) {
+      await this._storageManager.flushDoc(assignment);
+    }
+  }
+
   // Adds endpoints that support imports and exports.
   private _addSupportPaths(docAccessMiddleware: express.RequestHandler[]) {
     if (!this._docWorker) { throw new Error("need DocWorker"); }
@@ -112,6 +112,7 @@ export class HostedStorageManager implements IDocStorageManager {

   // Latest version ids of documents.
   private _latestVersions = new Map<string, string>();
+  private _latestMetaVersions = new Map<string, string>();

   private _log = new LogMethods('HostedStorageManager ', (docId: string|null) => ({docId}));

@@ -158,7 +159,7 @@ export class HostedStorageManager implements IDocStorageManager {
       throw new Error('bug: external storage should be created for "meta" if it is created for "doc"');
     }
     this._extMeta = this._getChecksummedExternalStorage('meta', baseStoreMeta,
-                                                        new Map(),
+                                                        this._latestMetaVersions,
                                                         options);

     this._inventory = new DocSnapshotInventory(this._ext, this._extMeta,
@@ -570,9 +571,18 @@ export class HostedStorageManager implements IDocStorageManager {
     const existsLocally = await fse.pathExists(this.getPath(docName));
     if (existsLocally) {
       if (!docStatus.docMD5 || docStatus.docMD5 === DELETED_TOKEN) {
-        // New doc appears to already exist, but not in S3 (according to redis).
-        // Go ahead and use local version.
-        return true;
+        // New doc appears to already exist, but may not exist in S3.
+        // Let's check.
+        const head = await this._ext.head(docName);
+        const lastLocalVersionSeen = this._latestVersions.get(docName);
+        if (head && lastLocalVersionSeen !== head.snapshotId) {
+          // Exists in S3, with a version not known to be latest seen
+          // by this worker - so wipe local version and defer to S3.
+          await this._wipeCache(docName);
+        } else {
+          // Go ahead and use local version.
+          return true;
+        }
       } else {
         // Doc exists locally and in S3 (according to redis).
         // Make sure the checksum matches.
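The new branch boils down to a small decision: trust the local copy only if S3 has nothing newer than what this worker last saw. Restated as a standalone predicate (types simplified; `head` stands in for the result of the external-storage HEAD call):

interface SnapshotHead { snapshotId: string; }

function canUseLocalCopy(head: SnapshotHead|null, lastLocalVersionSeen: string|undefined): boolean {
  if (head && lastLocalVersionSeen !== head.snapshotId) {
    // S3 holds a version this worker is not known to have seen last:
    // the local copy may be stale, so wipe it and defer to S3.
    return false;
  }
  // Nothing in S3, or the local copy already matches the latest snapshot.
  return true;
}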
@@ -586,8 +596,7 @@ export class HostedStorageManager implements IDocStorageManager {
           // On the assumption that the local file is outdated, delete it.
           // TODO: may want to be more careful in case the local file has modifications that
           // simply never made it to S3 due to some kind of breakage.
-          // NOTE: fse.remove succeeds also when the file does not exist.
-          await fse.remove(this.getPath(docName));
+          await this._wipeCache(docName);
         }
       }
     }
@@ -598,6 +607,19 @@ export class HostedStorageManager implements IDocStorageManager {
     });
   }

+  /**
+   * Remove local version of a document, and state related to it.
+   */
+  private async _wipeCache(docName: string) {
+    // NOTE: fse.remove succeeds also when the file does not exist.
+    await fse.remove(this.getPath(docName));
+    await fse.remove(this._getHashFile(this.getPath(docName), 'doc'));
+    await fse.remove(this._getHashFile(this.getPath(docName), 'meta'));
+    await this._inventory.clear(docName);
+    this._latestVersions.delete(docName);
+    this._latestMetaVersions.delete(docName);
+  }
+
   /**
    * Fetch a document from s3 and save it locally as destId.grist
    *
@@ -16,6 +16,7 @@ export const ITestingHooks = t.iface([], {
   "closeDocs": t.func("void"),
   "setDocWorkerActivation": t.func("void", t.param("workerId", "string"), t.param("active", t.union(t.lit('active'), t.lit('inactive'), t.lit('crash')))),
   "flushAuthorizerCache": t.func("void"),
+  "flushDocs": t.func("void"),
   "getDocClientCounts": t.func(t.array(t.tuple("string", "number"))),
   "setActiveDocTimeout": t.func("number", t.param("seconds", "number")),
   "setDiscourseConnectVar": t.func(t.union("string", "null"), t.param("varName", "string"), t.param("value", t.union("string", "null"))),
@@ -12,6 +12,7 @@ export interface ITestingHooks {
   closeDocs(): Promise<void>;
   setDocWorkerActivation(workerId: string, active: 'active'|'inactive'|'crash'): Promise<void>;
   flushAuthorizerCache(): Promise<void>;
+  flushDocs(): Promise<void>;
   getDocClientCounts(): Promise<Array<[string, number]>>;
   setActiveDocTimeout(seconds: number): Promise<number>;
   setDiscourseConnectVar(varName: string, value: string|null): Promise<string|null>;
@@ -137,23 +137,27 @@ export class TestingHooks implements ITestingHooks {
   public async setDocWorkerActivation(workerId: string, active: 'active'|'inactive'|'crash'):
       Promise<void> {
     log.info("TestingHooks.setDocWorkerActivation called with", workerId, active);
-    for (const server of this._workerServers) {
-      if (server.worker.id === workerId || server.worker.publicUrl === workerId) {
-        switch (active) {
-          case 'active':
-            await server.restartListening();
-            break;
-          case 'inactive':
-            await server.stopListening();
-            break;
-          case 'crash':
-            await server.stopListening('crash');
-            break;
-        }
-        return;
-      }
+    const matches = this._workerServers.filter(
+      server => server.worker.id === workerId ||
+        server.worker.publicUrl === workerId ||
+        (server.worker.publicUrl.startsWith('http://localhost:') &&
+          workerId.startsWith('http://localhost:') &&
+          new URL(server.worker.publicUrl).host === new URL(workerId).host));
+    if (matches.length !== 1) {
+      throw new Error(`could not find worker: ${workerId}`);
+    }
+    const server = matches[0];
+    switch (active) {
+      case 'active':
+        await server.restartListening();
+        break;
+      case 'inactive':
+        await server.stopListening();
+        break;
+      case 'crash':
+        await server.stopListening('crash');
+        break;
     }
-    throw new Error(`could not find worker: ${workerId}`);
   }

   public async flushAuthorizerCache(): Promise<void> {
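The extra `localhost` clause matters because string equality is too strict for URLs: `'http://localhost:8080/'` and `'http://localhost:8080'` differ as strings but name the same server. Comparing parsed hosts, as the filter above does, tolerates such differences (Node's global `URL` suffices):

function sameLocalhostWorker(a: string, b: string): boolean {
  return a.startsWith('http://localhost:') && b.startsWith('http://localhost:') &&
    new URL(a).host === new URL(b).host;   // host includes the port
}

console.log(sameLocalhostWorker('http://localhost:8080/', 'http://localhost:8080'));  // true
console.log(sameLocalhostWorker('http://localhost:8080', 'http://localhost:8081'));   // false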
@@ -164,6 +168,13 @@ export class TestingHooks implements ITestingHooks {
     }
   }

+  public async flushDocs(): Promise<void> {
+    log.info("TestingHooks.flushDocs called");
+    for (const server of this._workerServers) {
+      await server.testFlushDocs();
+    }
+  }
+
   // Returns a Map from docId to number of connected clients for all open docs across servers,
   // but represented as an array of pairs, to be serializable.
   public async getDocClientCounts(): Promise<Array<[string, number]>> {
@@ -151,6 +151,21 @@ export async function testCurrentUrl(pattern: RegExp|string) {
   return (typeof pattern === 'string') ? url.includes(pattern) : pattern.test(url);
 }

+export async function getDocWorkerUrls(): Promise<string[]> {
+  const result = await driver.wait(() => driver.executeScript(`
+    return Array.from(window.gristApp.comm.listConnections().values());
+  `), 1000) as string[];
+  return result;
+}
+
+export async function getDocWorkerUrl(): Promise<string> {
+  const urls = await getDocWorkerUrls();
+  if (urls.length > 1) {
+    throw new Error(`Expected a single docWorker URL, received ${urls}`);
+  }
+  return urls[0] || '';
+}
+
 export async function waitForUrl(pattern: RegExp|string, waitMs: number = 2000) {
   await driver.wait(() => testCurrentUrl(pattern), waitMs);
 }
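A plausible use of these helpers in the ported multi-worker tests; this is a hedged sketch, since the test code itself is not shown in this diff (assuming chai's `assert`, as commonly used in grist tests):

import {assert} from 'chai';

// After simulating a worker crash, verify the doc reconnects via a
// different doc worker than before.
async function checkDocMovedWorker(urlBefore: string): Promise<void> {
  const urlAfter = await getDocWorkerUrl();
  assert.notEqual(urlAfter, urlBefore);
}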