/**
 *
 * Simple CPU throttling implementation.
 *
 * For this setup, a sandbox attempting to use 100% of cpu over an
 * extended period will end up throttled, in the steady-state, to the
 * target fraction of cpu (targetRate, 25% with the defaults below).
 *
 * Very simple mechanism to begin with.  "ctime" is measured for the
 * sandbox, being the cumulative time charged to the user (directly or
 * indirectly) by the OS for that process.  If the average increase in
 * ctime over a time period exceeds targetRate times that time period,
 * throttling kicks in, and the process will be paused/unpaused via
 * signals on a duty cycle.
 *
 * Left for future work: more careful shaping of CPU throttling, and
 * factoring in a team-site level credit system or similar.
 *
 */

import pidusage from '@gristlabs/pidusage';
import log from 'app/server/lib/log';

/**
 * Parameters related to throttling.
 */
export interface ThrottleTiming {
  dutyCyclePositiveMs: number;       // when throttling, how much uninterrupted time to give
                                     // the process before pausing it.  The length of the
                                     // non-positive cycle is chosen to achieve the desired
                                     // cpu usage.
  samplePeriodMs: number;            // how often to sample cpu usage and update throttling
  targetAveragingPeriodMs: number;   // (rough) time span to average cpu usage over.
  minimumAveragingPeriodMs: number;  // minimum time span before throttling is considered.
                                     // No throttling will occur before a process has run
                                     // for at least this length of time.
  minimumLogPeriodMs: number;        // minimum time between log messages about throttling.
  targetRate: number;                // when throttling, aim for this fraction of cpu usage
                                     // per unit time.
  maxThrottle: number;               // maximum ratio of negative duty cycle phases to
                                     // positive.
  traceNudgeOffset: number;          // milliseconds to wait before sending a second signal
                                     // to a traced process.
}

/**
 * Some parameters that seem reasonable defaults.
 */
const defaultThrottleTiming: ThrottleTiming = {
  dutyCyclePositiveMs: 50,
  samplePeriodMs: 1000,
  targetAveragingPeriodMs: 20000,
  minimumAveragingPeriodMs: 6000,
  minimumLogPeriodMs: 10000,
  targetRate: 0.25,
  maxThrottle: 10,
  traceNudgeOffset: 5,  // unlikely to be honored very precisely, but doesn't need to be.
};

/**
 * A sample of cpu usage.
 */
interface MeterSample {
  time: number;         // time at which sample was made (as reported by Date.now())
  cpuDuration: number;  // accumulated "ctime" measured by pidusage
  offDuration: number;  // accumulated clock time for which process was paused (approximately)
}

/**
 * A throttling implementation for a process.  Supply a pid, and it will try to keep that
 * process from consuming too much cpu until stop() is called.
 */
export class Throttle {
  private _timing: ThrottleTiming;                         // overall timing parameters
  private _meteringInterval: NodeJS.Timeout | undefined;   // timer for cpu measurements
  private _dutyCycleTimeout: NodeJS.Timeout | undefined;   // driver for throttle duty cycle
  private _traceNudgeTimeout: NodeJS.Timeout | undefined;  // schedule a nudge to a traced process
  private _throttleFactor: number = 0;                     // relative length of paused phase
  private _sample: MeterSample | undefined;                // latest measurement.
  private _anchor: MeterSample | undefined;                // sample from past for averaging
  private _nextAnchor: MeterSample | undefined;            // upcoming replacement for _anchor
  private _lastLogTime: number | undefined;                // time of last throttle log message
  private _offDuration: number = 0;                        // cumulative time spent paused
  private _stopped: boolean = false;                       // set when stop has been called
  private _active: boolean = true;                         // set when we are not trying to pause process

  /**
   * Start monitoring the given process and throttle as needed.
   * If readPid is set, CPU usage will be read for that process.
   * If tracedPid is set, that process will be sent a STOP signal
   * whenever the main process is sent a STOP, and another STOP
   * signal will be sent to it again shortly afterwards.
   *
   * The tracedPid wrinkle is to deal with gvisor on a ptrace platform.
   * From `man ptrace`:
   *
   *   "While being traced, the tracee will stop each time a signal is
   *   delivered, even if the signal is being ignored.  (An exception is
   *   SIGKILL, which has its usual effect.)  The tracer will be
   *   notified at its next call to waitpid(2) (or one of the related
   *   "wait" system calls); that call will return a status value
   *   containing information that indicates the cause of the stop in
   *   the tracee.  While the tracee is stopped, the tracer can use
   *   various ptrace requests to inspect and modify the tracee.  The
   *   tracer then causes the tracee to continue, optionally ignoring
   *   the delivered signal (or even delivering a different signal
   *   instead)."
   *
   * So what sending a STOP to a process being traced by gvisor will
   * do is not obvious.  In practice it appears to have no effect
   * (other than presumably giving gvisor a chance to examine it).
   * So for gvisor, we send a STOP to the tracing process, and a STOP
   * to the tracee, and then a little later a STOP to the tracee again
   * (since there's no particular guarantee about the order of signal
   * delivery).  This isn't particularly elegant, but in tests it
   * seems to do the job, while sending STOP to any one process alone
   * does not.
   *
   * Alternatively, gvisor's runsc does have "pause" and "resume"
   * commands that could be looked into more.
   *
   */
  constructor(private readonly _options: {
    pid: number,         // main pid to stop/continue
    readPid?: number,    // pid to read cpu usage of, if different from the main pid
    tracedPid?: number,  // pid of a traced process to signal
    logMeta: log.ILogMeta,
    timing?: ThrottleTiming
  }) {
    this._timing = this._options.timing || defaultThrottleTiming;
    this._meteringInterval = setInterval(() => this._update(), this._timing.samplePeriodMs);
  }

  /**
   * Stop all activity.
   */
  public stop() {
    this._stopped = true;
    this._stopMetering();
    this._stopTraceNudge();
    this._stopThrottling();
  }

  /**
   * Read the last cpu usage sample made, for test purposes.
   */
  public get testStats(): MeterSample|undefined {
    return this._sample;
  }

  /**
   * Measure cpu usage and update whether and how much we are throttling the process.
   */
  private async _update() {
    // Measure cpu usage to date.
    let cpuDuration: number;
    try {
      cpuDuration = (await pidusage(this._options.readPid || this._options.pid)).ctime;
    } catch (e) {
      // process may have disappeared.
      this._log(`Throttle measurement error: ${e}`, this._options.logMeta);
      return;
    }

    const now = Date.now();
    const current: MeterSample = { time: now, cpuDuration, offDuration: this._offDuration };
    this._sample = current;

    // Measuring cpu usage was an async operation, so check that we haven't been stopped
    // in the meantime.  Otherwise we could sneak in and restart a throttle duty cycle.
    if (this._stopped) { return; }

    // We keep a reference point in the past called the "anchor".  Whenever the anchor
    // becomes sufficiently old, we replace it with something newer.
    if (!this._anchor) { this._anchor = current; }
    if (this._nextAnchor && now - this._anchor.time > this._timing.targetAveragingPeriodMs * 2) {
      this._anchor = this._nextAnchor;
      this._nextAnchor = undefined;
    }

    // Keep a replacement for the current anchor in mind.
    if (!this._nextAnchor && now - this._anchor.time > this._timing.targetAveragingPeriodMs) {
      this._nextAnchor = current;
    }

    // Check if the anchor is sufficiently old for averages to be meaningful enough
    // to support throttling.
    const dt = current.time - this._anchor.time;
    if (dt < this._timing.minimumAveragingPeriodMs) { return; }

    // Calculate the average cpu use per second since the anchor.
    const rate = (current.cpuDuration - this._anchor.cpuDuration) / dt;

    // If that rate is less than our target rate, don't bother throttling.
    const targetRate = this._timing.targetRate;
    if (rate <= targetRate) {
      this._updateThrottle(0);
      return;
    }

    // Calculate how much time the sandbox was paused since the anchor.  This is
    // approximate, since we don't line up duty cycles with this update function,
    // but it should be good enough for throttling purposes.
    const off = current.offDuration - this._anchor.offDuration;

    // If the sandbox was never allowed to run, wait a bit longer for a duty cycle to complete.
    // This should never happen unless time constants are set too tight relative to the
    // maximum length of a duty cycle.
    const on = dt - off;
    if (on <= 0) { return; }

    // Calculate the average cpu use per second while the sandbox is unpaused.
    const rateWithoutThrottling = (current.cpuDuration - this._anchor.cpuDuration) / on;

    // Now pick a throttle level such that, if the sandbox continues using cpu
    // at rateWithoutThrottling when it is unpaused, the overall rate matches
    // the targetRate.
    //   one duty cycle lasts: quantum * (1 + throttleFactor)
    //     (positive cycle lasts 1 quantum; non-positive cycle duration is that of
    //     the positive cycle scaled by throttleFactor)
    //   cpu use for this cycle is: quantum * rateWithoutThrottling
    //   cpu use per second is therefore: rateWithoutThrottling / (1 + throttleFactor)
    //   so: throttleFactor = (rateWithoutThrottling / targetRate) - 1
    // For example, with the default targetRate of 0.25, a sandbox using 100% of cpu
    // while unpaused gets throttleFactor 3: 50ms running followed by 150ms paused.
    const throttleFactor = rateWithoutThrottling / targetRate - 1;

    // Apply the throttle.  Place a cap on it so the duty cycle does not get too long.
    // This cap means that low targetRates could be unattainable.
    this._updateThrottle(Math.min(throttleFactor, this._timing.maxThrottle));

    if (!this._lastLogTime || now - this._lastLogTime > this._timing.minimumLogPeriodMs) {
      this._lastLogTime = now;
      this._log('throttle', {...this._options.logMeta,
                             throttle: Math.round(this._throttleFactor),
                             throttledRate: Math.round(rate * 100),
                             rate: Math.round(rateWithoutThrottling * 100)});
    }
  }

  /**
   * Start/stop the throttling duty cycle as necessary.
   */
  private _updateThrottle(factor: number) {
    // For small factors, let the process run continuously.
    if (factor < 0.001) {
      if (this._dutyCycleTimeout) { this._stopThrottling(); }
      this._throttleFactor = 0;
      return;
    }

    // Set the throttle factor to apply and make sure the duty cycle is running.
    this._throttleFactor = factor;
    if (!this._dutyCycleTimeout) {
      this._throttle(true);
    }
  }

  /**
   * Send CONTinue or STOP signal to process.
   */
  private _letProcessRun(on: boolean) {
    this._active = on;
    try {
      process.kill(this._options.pid, on ? 'SIGCONT' : 'SIGSTOP');
      const tracedPid = this._options.tracedPid;
      if (tracedPid && !on) {
        process.kill(tracedPid, 'SIGSTOP');
        if (this._timing.traceNudgeOffset > 0) {
          this._stopTraceNudge();
          this._traceNudgeTimeout = setTimeout(() => {
            if (!this._active) {
              process.kill(tracedPid, 'SIGSTOP');
            }
          }, this._timing.traceNudgeOffset);
        }
      }
    } catch (e) {
      // process may have disappeared
      this._log(`Throttle error: ${e}`, this._options.logMeta);
    }
  }

  /**
   * Send CONTinue or STOP signal to process, and schedule the next step
   * in the duty cycle.
   */
  private _throttle(on: boolean) {
    this._letProcessRun(on);
    const dt = this._timing.dutyCyclePositiveMs * (on ? 1.0 : this._throttleFactor);
    if (!on) {
      this._offDuration += dt;
    }
    this._dutyCycleTimeout = setTimeout(() => this._throttle(!on), dt);
  }

  /**
   * Make sure measurement of cpu is stopped.
   */
  private _stopMetering() {
    if (this._meteringInterval) {
      clearInterval(this._meteringInterval);
      this._meteringInterval = undefined;
    }
  }

  private _stopTraceNudge() {
    if (this._traceNudgeTimeout) {
      clearTimeout(this._traceNudgeTimeout);
      this._traceNudgeTimeout = undefined;
    }
  }

  /**
   * Make sure the duty cycle is stopped and the process is left in a running state.
   */
  private _stopThrottling() {
    if (this._dutyCycleTimeout) {
      clearTimeout(this._dutyCycleTimeout);
      this._dutyCycleTimeout = undefined;
      this._letProcessRun(true);
    }
  }

  private _log(msg: string, meta: log.ILogMeta) {
    log.rawDebug(msg, meta);
  }
}
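
/**
 * Illustrative sketch of how this class might be driven (not part of this
 * module; the spawned command, the import path, and the logMeta fields shown
 * here are assumptions for the example, not anything this file prescribes):
 *
 *   import { spawn } from 'child_process';
 *   import { Throttle } from 'app/server/lib/Throttle';
 *
 *   // Start a sandboxed process and throttle it with the default timing.
 *   const child = spawn('python3', ['sandbox_main.py']);
 *   const throttle = new Throttle({
 *     pid: child.pid!,
 *     logMeta: {sandboxPid: child.pid},
 *   });
 *
 *   // Keep throttling for the lifetime of the sandbox, then release it.
 *   child.on('exit', () => throttle.stop());
 */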