gristlabs_grist-core/app/server/lib/ProcessMonitor.ts

import log from 'app/server/lib/log';
import { ITelemetry } from 'app/server/lib/Telemetry';

// How often, in milliseconds, memory and cpu usage get sampled.
const MONITOR_PERIOD_MS = 5 * 1000;
// Relative change in heap usage (versus the last reported value) that triggers a report.
const MEMORY_DELTA_FRACTION = 0.1;
// Absolute change in average cpu usage (versus the last reported value) that triggers a report.
const CPU_DELTA_FRACTION = 0.1;
// Longest time, in milliseconds, allowed to pass between two reports.
const MONITOR_LOG_PERIOD_MS = 10 * 60 * 1000;

// Handle of the sampling interval; undefined while no monitor is running.
let _timer: NodeJS.Timeout | undefined;
// Timestamp (ms since epoch) of the previous sample.
let _lastTickTime = Date.now();
// Timestamp (ms since epoch) of the previous report; 0 until something has been reported.
let _lastReportTime = 0;
// Heap usage, in bytes, included in the previous report.
let _lastReportedHeapUsed = 0;
// Cumulative cpu usage (microseconds) observed at the previous sample.
let _lastCpuUsage: NodeJS.CpuUsage = {system: 0, user: 0};
// Average cpu usage included in the previous report.
let _lastReportedCpuAverage = 0;

/**
 * Monitor process memory (heap) and CPU usage, reporting as telemetry on an interval, and more
 * often when usage ticks up or down by a big enough delta.
 *
 * There is a single global process monitor. It reports to the `telemetry` object passed into the
 * call to start() that actually started it; calls made while a monitor is already running are
 * ignored.
 *
 * Returns a function that stops the monitor, or undefined if there was already a process monitor
 * running, and no new one was started. The returned function only affects the monitor started by
 * this call: calling it again after a newer monitor has been started leaves the newer one alone.
 *
 * Reports:
 *  - heapUsedMB:   Size of JS heap in use, in MiB.
 *  - heapTotalMB:  Total heap size, in MiB, allocated for JS by v8.
 *  - cpuAverage:   Cpu usage as a fraction of elapsed time over the last tick (normally
 *                  MONITOR_PERIOD_MS). Usually between 0 and 1, but it includes usage from all
 *                  threads, so may exceed 1.
 *  - intervalMs:   Interval (in milliseconds) over which cpuAverage is reported. Being much
 *                  higher than MONITOR_PERIOD_MS is a sign of being CPU bound for that long.
 */
export function start(telemetry: ITelemetry): (() => void) | undefined {
  if (_timer) {
    // A monitor is already running; keep it, and tell the caller nothing was started.
    return undefined;
  }

  // Initialize variables needed for accurate first-tick measurement.
  _lastTickTime = Date.now();
  _lastCpuUsage = process.cpuUsage();

  // Keep a local reference to this monitor's interval, so that stop() acts on exactly this
  // interval rather than on whatever happens to be stored in the global later.
  const timer = setInterval(() => monitor(telemetry), MONITOR_PERIOD_MS);
  _timer = timer;

  return function stop() {
    // Clearing an already-cleared interval is a no-op, so repeated calls are harmless.
    clearInterval(timer);
    // Only release the global slot if it still belongs to this monitor; otherwise a stale
    // stop() would kill a monitor started after this one was stopped.
    if (_timer === timer) {
      _timer = undefined;
    }
  };
}

/**
 * Single sampling tick: measures heap usage and the cpu time consumed since the previous tick,
 * and sends a 'processMonitor' telemetry event when a report is due. Failures to log the event
 * are logged as errors rather than propagated.
 */
function monitor(telemetry: ITelemetry) {
  const mem = process.memoryUsage();
  const cpuNow = process.cpuUsage();
  const tickTime = Date.now();

  // Grab the previous sample, then record this one as the new baseline for the next tick.
  const cpuPrev = _lastCpuUsage;
  const intervalMs = tickTime - _lastTickTime;
  _lastCpuUsage = cpuNow;
  _lastTickTime = tickTime;

  // Cpu usage is measured in microseconds while intervalMs is in milliseconds, hence the
  // division by 1000 before taking the ratio.
  const cpuMicros = cpuNow.system + cpuNow.user - cpuPrev.system - cpuPrev.user;
  const cpuAverage = cpuMicros / 1000 / intervalMs;

  // A report is due when any of these holds:
  //  - it has been longer than MONITOR_LOG_PERIOD_MS since the last report;
  //  - heap usage moved by more than MEMORY_DELTA_FRACTION of the last reported value;
  //  - average cpu usage moved by more than CPU_DELTA_FRACTION since the last report.
  const periodElapsed = tickTime > _lastReportTime + MONITOR_LOG_PERIOD_MS;
  const heapChange = Math.abs(mem.heapUsed - _lastReportedHeapUsed);
  const heapMoved = heapChange > _lastReportedHeapUsed * MEMORY_DELTA_FRACTION;
  const cpuMoved = Math.abs(cpuAverage - _lastReportedCpuAverage) > CPU_DELTA_FRACTION;
  if (!(periodElapsed || heapMoved || cpuMoved)) {
    return;
  }

  const toMiB = (bytes: number) => Math.round(bytes / 1024 / 1024);
  telemetry.logEvent('processMonitor', {
    full: {
      heapUsedMB: toMiB(mem.heapUsed),
      heapTotalMB: toMiB(mem.heapTotal),
      // Rounded to two decimal places.
      cpuAverage: Math.round(cpuAverage * 100) / 100,
      intervalMs,
    },
  })
  .catch(e => log.error('failed to log telemetry event processMonitor', e));

  // Remember what was just reported; later ticks are compared against these values.
  _lastReportedHeapUsed = mem.heapUsed;
  _lastReportedCpuAverage = cpuAverage;
  _lastReportTime = tickTime;
}