gristlabs_grist-core/app/server/lib/workerExporter.ts

import {PassThrough} from 'stream';
import {ActiveDocSource} from 'app/server/lib/Export';
import * as ExportXLSX from 'app/server/lib/ExportXLSX';
import * as log from 'app/server/lib/log';
import {Rpc} from 'grain-rpc';
import {Stream} from 'stream';
import {MessagePort, threadId} from 'worker_threads';

// Exported entry points of this module. Each one wraps the ExportXLSX function of the same name
// with handleExport(), so it is invoked with a single {port, testDates, args} object instead of
// the wrapped function's own parameter list.
export const makeXLSX = handleExport(ExportXLSX.makeXLSX);
export const makeXLSXFromTable = handleExport(ExportXLSX.makeXLSXFromTable);
export const makeXLSXFromViewSection = handleExport(ExportXLSX.makeXLSXFromViewSection);

/**
 * Wraps an export function so that it communicates over a MessagePort.
 *
 * The returned async function:
 *  - builds an Rpc whose outgoing messages are posted to `port` and whose incoming messages are
 *    taken from `port`'s 'message' events;
 *  - obtains the "activeDocSource" stub from that Rpc and passes it to `make`;
 *  - gives `make` a PassThrough stream whose data is batched by bufferedPipe() and forwarded
 *    with rpc.postMessage();
 *  - closes `port` once `make` has resolved.
 *
 * @param make - Export implementation; receives the ActiveDocSource stub, the `testDates` flag,
 *   the output stream, and then the extra `args`.
 * @returns An async function taking `{port, testDates, args}`. On failure it rejects with a plain
 *   object (not an Error instance) carrying `message` plus the error's enumerable properties.
 */
function handleExport<T extends any[]>(
  make: (a: ActiveDocSource, testDates: boolean, output: Stream, ...args: T) => Promise<void>
) {
  return async function({port, testDates, args}: {port: MessagePort, testDates: boolean, args: T}) {
    try {
      const startedAt = Date.now();
      log.debug("workerExporter %s %s: started", threadId, make.name);

      // Rpc endpoint on this side of the port: info-level messages are dropped, warnings go to
      // the log.
      const channel = new Rpc({
        sendMessage: async (msg) => port.postMessage(msg),
        logger: {
          info: () => {},
          warn: (msg) => log.warn(msg),
        },
      });
      const docSource = channel.getStub<ActiveDocSource>("activeDocSource");
      port.on('message', (msg) => channel.receiveMessage(msg));

      // Whatever `make` writes to the sink is batched and then posted through the Rpc.
      const sink = new PassThrough();
      bufferedPipe(sink, (batch) => channel.postMessage(batch));

      await make(docSource, testDates, sink, ...args);
      port.close();

      const elapsedMs = Date.now() - startedAt;
      log.debug("workerExporter %s %s: done in %s ms", threadId, make.name, elapsedMs);
    } catch (err) {
      log.debug("workerExporter %s %s: error %s", threadId, make.name, String(err));
      // When Error objects move across threads, they keep only the 'message' property. Throwing
      // a plain object instead lets other properties (like 'status') survive. (No good reference
      // found for this; https://github.com/nodejs/node/issues/35506 is vaguely related.)
      const plainError = {message: err.message, ...err};
      throw plainError;
    }
  };
}

/**
 * Collects a stream's 'data' chunks and hands them to `callback` in larger batches.
 *
 * ExcelJS's WorkbookWriter produces many tiny writes (even though they pass through zipping).
 * Batching them reduces overhead and context switching. (In practice, this helps performance
 * only slightly.)
 *
 * @param stream - Source whose 'data' and 'end' events are observed.
 * @param callback - Receives each batch as one concatenated Buffer.
 * @param threshold - Number of pending bytes at which a batch is passed on (default 64 KiB).
 */
function bufferedPipe(stream: Stream, callback: (chunk: Buffer) => void, threshold = 64*1024) {
  let pending: Buffer[] = [];
  let pendingBytes = 0;
  let deliveredBytes = 0;

  // Passes everything collected so far to the callback as a single Buffer; does nothing when no
  // bytes are pending.
  const deliver = () => {
    if (pendingBytes > 0) {
      const batch = Buffer.concat(pending);
      deliveredBytes += batch.length;
      callback(batch);
      pending = [];
      pendingBytes = 0;
    }
  };

  stream.on('data', (piece) => {
    // Every incoming chunk is queued first.
    pending.push(piece);
    pendingBytes += piece.length;
    // Deliver once enough has accumulated. The very first chunk is delivered right away too:
    // since this becomes an HTTP response, a quick first chunk lets the browser prompt the user
    // more quickly about what to do with the download.
    const nothingDeliveredYet = deliveredBytes === 0;
    if (nothingDeliveredYet || pendingBytes >= threshold) {
      deliver();
    }
  });

  // Whatever is still queued when the stream ends goes out as the final batch.
  stream.on('end', deliver);
}
(core) For exporting XLSX, do it memory-efficiently in a worker thread. Summary: - Excel exports were awfully memory-inefficient, causing occasional docWorker crashes. The fix is to use the "streaming writer" option of ExcelJS https://github.com/exceljs/exceljs#streaming-xlsx-writercontents. (Empirically on one example, max memory went down from 3G to 100M) - It's also CPU intensive and synchronous, and can block node for tens of seconds. The fix is to use a worker-thread. This diff uses the "piscina" library for a pool of threads. - Additionally, adds ProcessMonitor that logs memory and cpu usage, particularly when those change significantly. - Also introduces request cancellation, so that a long download cancelled by the user will cancel the work being done in the worker thread. Test Plan: Updated previous export tests; memory and CPU performance tested manually by watching output of ProcessMonitor. Difference visible in these log excerpts: Before (total time to serve request 22 sec): ``` Telemetry processMonitor heapUsedMB=2187, heapTotalMB=2234, cpuAverage=1.13, intervalMs=17911 Telemetry processMonitor heapUsedMB=2188, heapTotalMB=2234, cpuAverage=0.66, intervalMs=5005 Telemetry processMonitor heapUsedMB=2188, heapTotalMB=2234, cpuAverage=0, intervalMs=5005 Telemetry processMonitor heapUsedMB=71, heapTotalMB=75, cpuAverage=0.13, intervalMs=5002 ``` After (total time to serve request 18 sec): ``` Telemetry processMonitor heapUsedMB=109, heapTotalMB=144, cpuAverage=0.5, intervalMs=5001 Telemetry processMonitor heapUsedMB=109, heapTotalMB=144, cpuAverage=1.39, intervalMs=5002 Telemetry processMonitor heapUsedMB=94, heapTotalMB=131, cpuAverage=1.13, intervalMs=5000 Telemetry processMonitor heapUsedMB=94, heapTotalMB=131, cpuAverage=1.35, intervalMs=5001 ``` Note in "Before" that heapTotalMB goes up to 2GB in the first case, and "intervalMs" of 17 seconds indicates that node was unresponsive for that long. 
In the second case, heapTotalMB stays low, and the main thread remains responsive the whole time. Reviewers: jarek Reviewed By: jarek Differential Revision: https://phab.getgrist.com/D3906 2023-06-01 13:09:50 +00:00			`import {PassThrough} from 'stream';`
			`import {ActiveDocSource} from 'app/server/lib/Export';`
			`import * as ExportXLSX from 'app/server/lib/ExportXLSX';`
			`import * as log from 'app/server/lib/log';`
			`import {Rpc} from 'grain-rpc';`
			`import {Stream} from 'stream';`
			`import {MessagePort, threadId} from 'worker_threads';`

			`export const makeXLSX = handleExport(ExportXLSX.makeXLSX);`
			`export const makeXLSXFromTable = handleExport(ExportXLSX.makeXLSXFromTable);`
			`export const makeXLSXFromViewSection = handleExport(ExportXLSX.makeXLSXFromViewSection);`

			`function handleExport<T extends any[]>(`
			`make: (a: ActiveDocSource, testDates: boolean, output: Stream, ...args: T) => Promise<void>`
			`) {`
			`return async function({port, testDates, args}: {port: MessagePort, testDates: boolean, args: T}) {`
			`try {`
			`const start = Date.now();`
			`log.debug("workerExporter %s %s: started", threadId, make.name);`
			`const rpc = new Rpc({`
			`sendMessage: async (m) => port.postMessage(m),`
			`logger: { info: m => {}, warn: m => log.warn(m) },`
			`});`
			`const activeDocSource = rpc.getStub<ActiveDocSource>("activeDocSource");`
			`port.on('message', (m) => rpc.receiveMessage(m));`
			`const outputStream = new PassThrough();`
			`bufferedPipe(outputStream, (chunk) => rpc.postMessage(chunk));`
			`await make(activeDocSource, testDates, outputStream, ...args);`
			`port.close();`
			`log.debug("workerExporter %s %s: done in %s ms", threadId, make.name, Date.now() - start);`
			`} catch (e) {`
			`log.debug("workerExporter %s %s: error %s", threadId, make.name, String(e));`
			`// When Error objects move across threads, they keep only the 'message' property. We can`
			`// keep other properties (like 'status') if we throw a plain object instead. (Didn't find a`
			`// good reference on this, https://github.com/nodejs/node/issues/35506 is vaguely related.)`
			`throw {message: e.message, ...e};`
			`}`
			`};`
			`}`

			`// ExcelJS's WorkbookWriter produces many tiny writes (even though they pass through zipping). To`
			`// reduce overhead and context switching, buffer them and pass on in chunks. (In practice, this`
			`// helps performance only slightly.)`
			`function bufferedPipe(stream: Stream, callback: (chunk: Buffer) => void, threshold = 64*1024) {`
			`let buffers: Buffer[] = [];`
			`let length = 0;`
			`let flushed = 0;`

			`function flush() {`
			`if (length > 0) {`
			`const data = Buffer.concat(buffers);`
			`flushed += data.length;`
			`callback(data);`
			`buffers = [];`
			`length = 0;`
			`}`
			`}`

			`stream.on('data', (chunk) => {`
			`// Whenever data is written to the stream, add it to the buffer.`
			`buffers.push(chunk);`
			`length += chunk.length;`
			`// If the buffer is large enough, post it to the callback. Also post the very first chunk:`
			`// since this becomes an HTTP response, a quick first chunk lets the browser prompt the user`
			`// more quickly about what to do with the download.`
			`if (length >= threshold \|\| flushed === 0) {`
			`flush();`
			`}`
			`});`

			`stream.on('end', flush);`
			`}`