gristlabs_grist-core/app/server/lib/ProcessMonitor.ts
George Gevoian 10f5f0cb37 (core) Add optional telemetry to grist-core
Summary:
Adds support for optional telemetry to grist-core.

A new environment variable, GRIST_TELEMETRY_LEVEL, controls the level of telemetry collected.

Test Plan: Server and unit tests.

Reviewers: paulfitz

Reviewed By: paulfitz

Subscribers: dsagal, anaisconce

Differential Revision: https://phab.getgrist.com/D3880
2023-06-07 12:00:51 -04:00

84 lines
3.4 KiB
TypeScript

import log from 'app/server/lib/log';
import { ITelemetry } from 'app/server/lib/Telemetry';
// Sampling cadence: memory and CPU usage are inspected once per period.
const MONITOR_PERIOD_MS = 5 * 1000;
// Relative change in heap usage (vs. the last reported value) that triggers a report.
const MEMORY_DELTA_FRACTION = 0.1;
// Absolute change in average CPU usage (vs. the last reported value) that triggers a report.
const CPU_DELTA_FRACTION = 0.1;
// Upper bound on the time between two reports, even when usage is stable.
const MONITOR_LOG_PERIOD_MS = 10 * 60 * 1000;

// Handle of the running sampling interval; undefined while no monitor is active.
let _timer: NodeJS.Timeout | undefined;
// Timestamp (ms since epoch) of the most recent sample.
let _lastTickTime = Date.now();
// Timestamp of the most recent report; starts at 0 so the first sample is always reported.
let _lastReportTime = 0;
// Heap usage (bytes) included in the most recent report.
let _lastReportedHeapUsed = 0;
// Cumulative CPU usage (microseconds) observed at the most recent sample.
let _lastCpuUsage: NodeJS.CpuUsage = {user: 0, system: 0};
// Average CPU usage included in the most recent report.
let _lastReportedCpuAverage = 0;
/**
 * Monitor process memory (heap) and CPU usage, reporting as telemetry on an interval, and more
 * often when usage ticks up or down by a big enough delta.
 *
 * There is a single global process monitor, reporting to the `telemetry` object passed into the
 * call to start() that actually started it.
 *
 * Returns a function that stops the monitor, or undefined if there was already a process monitor
 * running, and no new one was started. The returned function only ever stops the monitor created
 * by this particular call: invoking it again after a later start() leaves the newer monitor
 * running.
 *
 * Reports:
 *  - heapUsedMB:   Size of JS heap in use, in MiB.
 *  - heapTotalMB:  Total heap size, in MiB, allocated for JS by v8.
 *  - cpuAverage:   CPU usage over the last sampling interval, as a fraction of one core. Note it
 *                  includes usage from all threads, so may exceed 1.
 *  - intervalMs:   Interval (in milliseconds) over which cpuAverage is reported. Being much
 *                  higher than MONITOR_PERIOD_MS is a sign of being CPU bound for that long.
 */
export function start(telemetry: ITelemetry): (() => void) | undefined {
  if (_timer) {
    // A monitor is already running; keep it and signal that nothing new was started.
    return undefined;
  }

  // Initialize variables needed for accurate first-tick measurement.
  _lastTickTime = Date.now();
  _lastCpuUsage = process.cpuUsage();

  // Keep a local reference so that stop() acts on the timer created here, rather than on
  // whatever timer happens to be stored in the module-level variable when stop() is called.
  const timer = setInterval(() => monitor(telemetry), MONITOR_PERIOD_MS);
  _timer = timer;

  return function stop() {
    clearInterval(timer);
    // Only release the global slot if it still belongs to this monitor; a stale stop handle
    // must not tear down a monitor started by a later call to start().
    if (_timer === timer) {
      _timer = undefined;
    }
  };
}
/**
 * Take one sample of heap and CPU usage and report it through `telemetry` when warranted.
 *
 * Called on every tick of the interval set up by start(). Updates the module-level bookkeeping
 * (_lastCpuUsage, _lastTickTime) on every call, and the _lastReported* values whenever a report
 * is sent. Failures to log the telemetry event are logged and otherwise ignored.
 */
function monitor(telemetry: ITelemetry) {
  const memoryUsage = process.memoryUsage();
  const heapUsed = memoryUsage.heapUsed;
  const cpuUsage = process.cpuUsage();
  const now = Date.now();
  const intervalMs = now - _lastTickTime;
  const cpuUsedMicros = cpuUsage.system + cpuUsage.user - _lastCpuUsage.system - _lastCpuUsage.user;

  _lastCpuUsage = cpuUsage;
  _lastTickTime = now;

  // Date.now() is wall-clock time, so a clock adjustment can make the interval zero or negative.
  // Dividing by it would yield Infinity, NaN or a negative average, which would be reported and
  // then poison the delta comparison below. The baseline was just reset, so skip this tick.
  if (intervalMs <= 0) {
    return;
  }

  // Note that cpuUsage info is in microseconds, while intervalMs is milliseconds.
  const cpuAverage = cpuUsedMicros / 1000 / intervalMs;

  // Report usage when:
  // (a) enough time has passed (MONITOR_LOG_PERIOD_MS)
  // (b) memory usage ticked up or down enough since the last report
  // (c) average cpu usage ticked up or down enough since the last report
  if (
    now > _lastReportTime + MONITOR_LOG_PERIOD_MS ||
    Math.abs(heapUsed - _lastReportedHeapUsed) > _lastReportedHeapUsed * MEMORY_DELTA_FRACTION ||
    Math.abs(cpuAverage - _lastReportedCpuAverage) > CPU_DELTA_FRACTION
  ) {
    telemetry.logEvent('processMonitor', {
      full: {
        heapUsedMB: Math.round(memoryUsage.heapUsed/1024/1024),
        heapTotalMB: Math.round(memoryUsage.heapTotal/1024/1024),
        // Rounded to two decimal places.
        cpuAverage: Math.round(cpuAverage * 100) / 100,
        intervalMs,
      },
    })
    .catch(e => log.error('failed to log telemetry event processMonitor', e));
    _lastReportedHeapUsed = heapUsed;
    _lastReportedCpuAverage = cpuAverage;
    _lastReportTime = now;
  }
}