gristlabs_grist-core/app/gen-server/lib/DocWorkerMap.ts
Paul Fitzpatrick 71519d9e5c (core) revamp snapshot inventory
Summary:
Deliberate changes:
 * save snapshots to s3 prior to migrations.
 * label migration snapshots in s3 metadata.
 * avoid pruning migration snapshots for a month.

Opportunistic changes:
 * Associate document timezone with snapshots, so pruning can respect timezones.
 * Associate actionHash/Num with snapshots.
 * Record time of last change in snapshots (rather than just s3 upload time, which could be a while later).

This ended up being a biggish change, because there was nowhere ideal to put tags (list of possibilities in diff).

Test Plan: added tests

Reviewers: dsagal

Reviewed By: dsagal

Differential Revision: https://phab.getgrist.com/D2646
2020-10-30 13:52:46 -04:00

458 lines
18 KiB
TypeScript

import {MapWithTTL} from 'app/common/AsyncCreate';
import * as version from 'app/common/version';
import {DocStatus, DocWorkerInfo, IDocWorkerMap} from 'app/server/lib/DocWorkerMap';
import * as log from 'app/server/lib/log';
import {checkPermitKey, formatPermitKey, Permit} from 'app/server/lib/Permit';
import {promisifyAll} from 'bluebird';
import mapValues = require('lodash/mapValues');
import {createClient, Multi, RedisClient} from 'redis';
import * as Redlock from 'redlock';
import * as uuidv4 from 'uuid/v4';
promisifyAll(RedisClient.prototype);
promisifyAll(Multi.prototype);
// Max time for which we will hold a lock, by default. In milliseconds.
const LOCK_TIMEOUT = 3000;
// How long do checksums stored in redis last. In milliseconds.
// Should be long enough to allow S3 to reach consistency with very high probability.
// Consistency failures shorter than this interval will be detectable, failures longer
// than this interval will not be detectable.
const CHECKSUM_TTL_MSEC = 24 * 60 * 60 * 1000; // 24 hours
// How long do permits stored in redis last, in milliseconds.
const PERMIT_TTL_MSEC = 1 * 60 * 1000; // 1 minute
class DummyDocWorkerMap implements IDocWorkerMap {
private _worker?: DocWorkerInfo;
private _available: boolean = false;
private _permits = new MapWithTTL<string, string>(PERMIT_TTL_MSEC);
private _elections = new MapWithTTL<string, string>(1); // default ttl never used
public async getDocWorker(docId: string) {
if (!this._worker) { throw new Error('no workers'); }
return {docMD5: 'unknown', docWorker: this._worker, isActive: true};
}
public async assignDocWorker(docId: string) {
if (!this._worker || !this._available) { throw new Error('no workers'); }
return {docMD5: 'unknown', docWorker: this._worker, isActive: true};
}
public async getDocWorkerOrAssign(docId: string, workerId: string): Promise<DocStatus> {
if (!this._worker || !this._available) { throw new Error('no workers'); }
if (this._worker.id !== workerId) { throw new Error('worker not known'); }
return {docMD5: 'unknown', docWorker: this._worker, isActive: true};
}
public async updateDocStatus(docId: string, checksum: string) {
// nothing to do
}
public async addWorker(info: DocWorkerInfo): Promise<void> {
this._worker = info;
}
public async removeWorker(workerId: string): Promise<void> {
this._worker = undefined;
}
public async setWorkerAvailability(workerId: string, available: boolean): Promise<void> {
this._available = available;
}
public async releaseAssignment(workerId: string, docId: string): Promise<void> {
// nothing to do
}
public async getAssignments(workerId: string): Promise<string[]> {
return [];
}
public async setPermit(permit: Permit): Promise<string> {
const key = formatPermitKey(uuidv4());
this._permits.set(key, JSON.stringify(permit));
return key;
}
public async getPermit(key: string): Promise<Permit> {
const result = this._permits.get(key);
return result ? JSON.parse(result) : null;
}
public async removePermit(key: string): Promise<void> {
this._permits.delete(key);
}
public async close(): Promise<void> {
this._permits.clear();
this._elections.clear();
}
public async getElection(name: string, durationInMs: number): Promise<string|null> {
if (this._elections.get(name)) { return null; }
const key = uuidv4();
this._elections.setWithCustomTTL(name, key, durationInMs);
return key;
}
public async removeElection(name: string, electionKey: string): Promise<void> {
if (this._elections.get(name) === electionKey) {
this._elections.delete(name);
}
}
public async updateChecksum(family: string, key: string, checksum: string) {
// nothing to do
}
public async getChecksum(family: string, key: string) {
return null;
}
}
/**
* Manage the relationship between document and workers. Backed by Redis.
* Can also assign workers to "groups" for serving particular documents.
* Keys used:
* workers - the set of known active workers, identified by workerId
* workers-available - the set of workers available for assignment (a subset of the workers set)
* workers-available-{group} - the set of workers available for a given group
* worker-{workerId} - a hash of contact information for a worker
* worker-{workerId}-docs - a set of docs assigned to a worker, identified by docId
* worker-{workerId}-group - if set, marks the worker as serving a particular group
* doc-${docId} - a hash containing (JSON serialized) DocStatus fields, other than docMD5.
* doc-${docId}-checksum - the docs docMD5, or 'null' if docMD5 is null
* doc-${docId}-group - if set, marks the doc as to be served by workers in a given group
* workers-lock - a lock used when working with the list of workers
* groups - a hash from groupIds (arbitrary strings) to desired number of workers in group
* elections-${deployment} - a hash, from groupId to a (serialized json) list of worker ids
*
* Assignments of documents to workers can end abruptly at any time. Clients
* should be prepared to retry if a worker is not responding or denies that a document
* is assigned to it.
*
* If the groups key is set, workers assign themselves to groupIds to
* fill the counts specified in groups (in order of groupIds), and
* once those are exhausted, get assigned to the special group
* "default".
*/
export class DocWorkerMap implements IDocWorkerMap {
private _client: RedisClient;
private _redlock: Redlock;
// Optional deploymentKey argument supplies a key unique to the deployment (this is important
// for maintaining groups across redeployments only)
constructor(_clients?: RedisClient[], private _deploymentKey?: string, private _options?: {
permitMsec?: number
}) {
this._deploymentKey = this._deploymentKey || version.version;
_clients = _clients || [createClient(process.env.REDIS_URL)];
this._redlock = new Redlock(_clients);
this._client = _clients[0]!;
}
public async addWorker(info: DocWorkerInfo): Promise<void> {
log.info(`DocWorkerMap.addWorker ${info.id}`);
const lock = await this._redlock.lock('workers-lock', LOCK_TIMEOUT);
try {
// Make a worker-{workerId} key with contact info, then add this worker to available set.
await this._client.hmsetAsync(`worker-${info.id}`, info);
await this._client.saddAsync('workers', info.id);
// Figure out if worker should belong to a group
const groups = await this._client.hgetallAsync('groups');
if (groups) {
const elections = await this._client.hgetallAsync(`elections-${this._deploymentKey}`) || {};
for (const group of Object.keys(groups).sort()) {
const count = parseInt(groups[group], 10) || 0;
if (count < 1) { continue; }
const elected: string[] = JSON.parse(elections[group] || '[]');
if (elected.length >= count) { continue; }
elected.push(info.id);
await this._client.setAsync(`worker-${info.id}-group`, group);
await this._client.hsetAsync(`elections-${this._deploymentKey}`, group, JSON.stringify(elected));
break;
}
}
} finally {
await lock.unlock();
}
}
public async removeWorker(workerId: string): Promise<void> {
log.info(`DocWorkerMap.removeWorker ${workerId}`);
const lock = await this._redlock.lock('workers-lock', LOCK_TIMEOUT);
try {
// Drop out of available set first.
await this._client.sremAsync('workers-available', workerId);
const group = await this._client.getAsync(`worker-${workerId}-group`) || 'default';
await this._client.sremAsync(`workers-available-${group}`, workerId);
// At this point, this worker should no longer be receiving new doc assignments, though
// clients may still be directed to the worker.
// If we were elected for anything, back out.
const elections = await this._client.hgetallAsync(`elections-${this._deploymentKey}`);
if (elections) {
if (group in elections) {
const elected: string[] = JSON.parse(elections[group]);
const newElected = elected.filter(worker => worker !== workerId);
if (elected.length !== newElected.length) {
if (newElected.length > 0) {
await this._client.hsetAsync(`elections-${this._deploymentKey}`, group,
JSON.stringify(newElected));
} else {
await this._client.hdelAsync(`elections-${this._deploymentKey}`, group);
delete elections[group];
}
}
// We're the last one involved in elections - remove the key entirely.
if (Object.keys(elected).length === 0) {
await this._client.delAsync(`elections-${this._deploymentKey}`);
}
}
}
// Now, we start removing the assignments.
const assignments = await this._client.smembersAsync(`worker-${workerId}-docs`);
if (assignments) {
const op = this._client.multi();
for (const doc of assignments) { op.del(`doc-${doc}`); }
await op.execAsync();
}
// Now remove worker-{workerId}* keys.
await this._client.delAsync(`worker-${workerId}-docs`);
await this._client.delAsync(`worker-${workerId}-group`);
await this._client.delAsync(`worker-${workerId}`);
// Forget about this worker completely.
await this._client.sremAsync('workers', workerId);
} finally {
await lock.unlock();
}
}
public async setWorkerAvailability(workerId: string, available: boolean): Promise<void> {
log.info(`DocWorkerMap.setWorkerAvailability ${workerId} ${available}`);
const group = await this._client.getAsync(`worker-${workerId}-group`) || 'default';
if (available) {
await this._client.saddAsync(`workers-available-${group}`, workerId);
await this._client.saddAsync('workers-available', workerId);
} else {
await this._client.sremAsync('workers-available', workerId);
await this._client.sremAsync(`workers-available-${group}`, workerId);
}
}
public async releaseAssignment(workerId: string, docId: string): Promise<void> {
const op = this._client.multi();
op.del(`doc-${docId}`);
op.srem(`worker-${workerId}-docs`, docId);
await op.execAsync();
}
public async getAssignments(workerId: string): Promise<string[]> {
return this._client.smembersAsync(`worker-${workerId}-docs`);
}
/**
* Defined by IDocWorkerMap.
*
* Looks up which DocWorker is responsible for this docId.
* Responsibility could change at any time after this call, so it
* should be treated as a hint, and clients should be prepared to be
* refused and need to retry.
*/
public async getDocWorker(docId: string): Promise<DocStatus|null> {
// Fetch the various elements that go into making a DocStatus
const props = await this._client.multi()
.hgetall(`doc-${docId}`)
.get(`doc-${docId}-checksum`)
.execAsync() as [{[key: string]: any}|null, string|null]|null;
if (!props) { return null; }
// If there is no worker, return null. An alternative would be to modify
// DocStatus so that it is possible for it to not have a worker assignment.
if (!props[0]) { return null; }
// Fields are JSON encoded since redis cannot store them directly.
const doc = mapValues(props[0], (val) => JSON.parse(val));
// Redis cannot store a null value, so we encode it as 'null', which does
// not match any possible MD5.
doc.docMD5 = props[1] === 'null' ? null : props[1];
// Ok, we have a valid DocStatus at this point.
return doc as DocStatus;
}
/**
*
* Defined by IDocWorkerMap.
*
* Assigns a DocWorker to this docId if one is not yet assigned.
* Note that the assignment could be unmade at any time after this
* call if the worker dies, is brought down, or for other potential
* reasons in the future such as migration of individual documents
* between workers.
*
* A preferred doc worker can be specified, which will be assigned
* if no assignment is already made.
*
*/
public async assignDocWorker(docId: string, workerId?: string): Promise<DocStatus> {
// Check if a DocWorker is already assigned; if so return result immediately
// without locking.
let docStatus = await this.getDocWorker(docId);
if (docStatus) { return docStatus; }
// No assignment yet, so let's lock and set an assignment up.
const lock = await this._redlock.lock(`workers-lock`, LOCK_TIMEOUT);
try {
// Now that we've locked, recheck that the worker hasn't been reassigned
// in the meantime. Return immediately if it has.
docStatus = await this.getDocWorker(docId);
if (docStatus) { return docStatus; }
if (!workerId) {
// Check if document has a preferred worker group set.
const group = await this._client.getAsync(`doc-${docId}-group`) || 'default';
// Let's start off by assigning documents to available workers randomly.
// TODO: use a smarter algorithm.
workerId = await this._client.srandmemberAsync(`workers-available-${group}`) || undefined;
if (!workerId) {
// No workers available in the desired worker group. Rather than refusing to
// open the document, we fall back on assigning a worker from any of the workers
// available, regardless of grouping.
// This limits the impact of operational misconfiguration (bad redis setup,
// or not starting enough workers). It has the downside of potentially disguising
// problems, so we log a warning.
log.warn(`DocWorkerMap.assignDocWorker ${docId} found no workers for group ${group}`);
workerId = await this._client.srandmemberAsync('workers-available') || undefined;
}
if (!workerId) { throw new Error('no doc workers available'); }
} else {
if (!await this._client.sismemberAsync('workers-available', workerId)) {
throw new Error(`worker ${workerId} not known or not available`);
}
}
// Look up how to contact the worker.
const docWorker = await this._client.hgetallAsync(`worker-${workerId}`) as DocWorkerInfo|null;
if (!docWorker) { throw new Error('no doc worker contact info available'); }
// We can now construct a DocStatus.
const newDocStatus = {docMD5: null, docWorker, isActive: true};
// We add the assignment to worker-{workerId}-docs and save doc-{docId}.
const result = await this._client.multi()
.sadd(`worker-${workerId}-docs`, docId)
.hmset(`doc-${docId}`, {
docWorker: JSON.stringify(docWorker), // redis can't store nested objects, strings only
isActive: JSON.stringify(true) // redis can't store booleans, strings only
})
.setex(`doc-${docId}-checksum`, CHECKSUM_TTL_MSEC / 1000.0, 'null')
.execAsync();
if (!result) { throw new Error('failed to store new assignment'); }
return newDocStatus;
} finally {
await lock.unlock();
}
}
/**
*
* Defined by IDocWorkerMap.
*
* Assigns a specific DocWorker to this docId if one is not yet assigned.
*
*/
public async getDocWorkerOrAssign(docId: string, workerId: string): Promise<DocStatus> {
return this.assignDocWorker(docId, workerId);
}
public async updateDocStatus(docId: string, checksum: string): Promise<void> {
this.updateChecksum('doc', docId, checksum);
}
public async updateChecksum(family: string, key: string, checksum: string) {
await this._client.setexAsync(`${family}-${key}-checksum`, CHECKSUM_TTL_MSEC / 1000.0, checksum);
}
public async getChecksum(family: string, key: string) {
const checksum = await this._client.getAsync(`${family}-${key}-checksum`);
return checksum === 'null' ? null : checksum;
}
public async setPermit(permit: Permit): Promise<string> {
const key = formatPermitKey(uuidv4());
const duration = (this._options && this._options.permitMsec) || PERMIT_TTL_MSEC;
// seems like only integer seconds are supported?
await this._client.setexAsync(key, Math.ceil(duration / 1000.0),
JSON.stringify(permit));
return key;
}
public async getPermit(key: string): Promise<Permit|null> {
if (!checkPermitKey(key)) { throw new Error('permit could not be read'); }
const result = await this._client.getAsync(key);
return result && JSON.parse(result);
}
public async removePermit(key: string): Promise<void> {
if (!checkPermitKey(key)) { throw new Error('permit could not be read'); }
await this._client.delAsync(key);
}
public async close(): Promise<void> {
// nothing to do
}
public async getElection(name: string, durationInMs: number): Promise<string|null> {
// Could use "set nx" for election, but redis docs don't encourage that any more,
// favoring redlock:
// https://redis.io/commands/setnx#design-pattern-locking-with-codesetnxcode
const redisKey = `nomination-${name}`;
const lock = await this._redlock.lock(`${redisKey}-lock`, LOCK_TIMEOUT);
try {
if (await this._client.getAsync(redisKey) !== null) { return null; }
const electionKey = uuidv4();
// seems like only integer seconds are supported?
await this._client.setexAsync(redisKey, Math.ceil(durationInMs / 1000.0), electionKey);
return electionKey;
} finally {
await lock.unlock();
}
}
public async removeElection(name: string, electionKey: string): Promise<void> {
const redisKey = `nomination-${name}`;
const lock = await this._redlock.lock(`${redisKey}-lock`, LOCK_TIMEOUT);
try {
const current = await this._client.getAsync(redisKey);
if (current === electionKey) {
await this._client.delAsync(redisKey);
} else if (current !== null) {
throw new Error('could not remove election');
}
} finally {
await lock.unlock();
}
}
}
// If we don't have redis available and use a DummyDocWorker, it should be a singleton.
let dummyDocWorkerMap: DummyDocWorkerMap|null = null;
export function getDocWorkerMap(): IDocWorkerMap {
if (process.env.REDIS_URL) {
return new DocWorkerMap();
} else {
dummyDocWorkerMap = dummyDocWorkerMap || new DummyDocWorkerMap();
return dummyDocWorkerMap;
}
}