(core) Exit more cleanly on unhandled errors, and handle errors writing to Clients.

Summary: - Node strongly recommends assuming bad state and exiting promptly on unhandled exceptions and rejections. We follow that recommendation, and only make an effort to clean up before exiting and to log the error in a more standard way. - The only unhandled rejection seen in recent months was caused by attempting to write overly large JSON to a Client websocket. Ensure that case is handled, and add a test case that artificially reproduces this scenario. Test Plan: Added a test case for a failing write to a Client, and a test case verifying that unhandled errors do indeed kill the server, but with an attempt at cleanup. Reviewers: georgegevoian Reviewed By: georgegevoian Differential Revision: https://phab.getgrist.com/D4124
2026-03-02 04:09:24 +00:00 · 2023-11-30 14:08:46 -05:00
parent d89e008a75
commit 4d9bbf6263
12 changed files with 263 additions and 53 deletions
--- a/app/server/lib/Client.ts
+++ b/app/server/lib/Client.ts
@@ -34,9 +34,6 @@ export type ClientMethod = (client: Client, ...args: any[]) => Promise<unknown>;
 // How long the client state persists after a disconnect.
 const clientRemovalTimeoutMs = 300 * 1000;   // 300s = 5 minutes.

-// A hook for dependency injection.
-export const Deps = {clientRemovalTimeoutMs};
-
 // How much memory to allow using for large JSON responses before waiting for some to clear.
 // Max total across all clients and all JSON responses.
 const jsonResponseTotalReservation = 500 * 1024 * 1024;
@@ -45,6 +42,9 @@ const jsonResponseTotalReservation = 500 * 1024 * 1024;
 const jsonResponseReservation = 20 * 1024 * 1024;
 export const jsonMemoryPool = new MemoryPool(jsonResponseTotalReservation);

+// A hook for dependency injection.
+export const Deps = {clientRemovalTimeoutMs, jsonResponseReservation};
+
 /**
 * Generates and returns a random string to use as a clientId. This is better
 * than numbering clients with consecutive integers; otherwise a reconnecting
@@ -197,6 +197,19 @@ export class Client {
    }
  }

+  /**
+   * Sends a message to the client. If the send fails in a way that the message can't get queued
+   * (e.g. due to an unexpected exception in code), logs an error and interrupts the connection.
+   */
+  public async sendMessageOrInterrupt(messageObj: CommMessage|CommResponse|CommResponseError): Promise<void> {
+    try {
+      await this.sendMessage(messageObj);
+    } catch (e) {
+      this._log.error(null, 'sendMessage error', e);
+      this.interruptConnection();
+    }
+  }
+
  /**
   * Sends a message to the client, queuing it up on failure or if the client is disconnected.
   */
@@ -221,7 +234,7 @@ export class Client {
    // Overall, a better solution would be to stream large responses, or to have the client
    // request data piecemeal (as we'd have to for handling large data).

-    await jsonMemoryPool.withReserved(jsonResponseReservation, async (updateReservation) => {
+    await jsonMemoryPool.withReserved(Deps.jsonResponseReservation, async (updateReservation) => {
      if (this._destroyed) {
        // If this Client got destroyed while waiting, stop here and release the reservation.
        return;
@@ -562,7 +575,7 @@ export class Client {
        }
      }
    }
-    await this.sendMessage(response);
+    await this.sendMessageOrInterrupt(response);
  }

  // Fetch the user database record from profile.email, or null when profile is not set.
--- a/app/server/lib/Comm.ts
+++ b/app/server/lib/Comm.ts
@@ -37,7 +37,6 @@ import * as http from 'http';
 import * as https from 'https';
 import * as WebSocket from 'ws';

-import {CommDocEventType, CommMessage} from 'app/common/CommTypes';
 import {parseFirstUrlPart} from 'app/common/gristUrls';
 import {firstDefined, safeJsonParse} from 'app/common/gutil';
 import {UserProfile} from 'app/common/LoginSessionAPI';
@@ -270,19 +269,3 @@ export class Comm extends EventEmitter {
    return wss;
  }
 }
-
-/**
- * Sends a per-doc message to the given client.
- * @param {Object} client - The client object, as passed to all per-doc methods.
- * @param {Number} docFD - The document's file descriptor in the given client.
- * @param {String} type - The type of the message, e.g. 'docUserAction'.
- * @param {Object} messageData - The data for this type of message.
- * @param {Boolean} fromSelf - Whether `client` is the originator of this message.
- */
-export function sendDocMessage(
-  client: Client, docFD: number, type: CommDocEventType, data: unknown, fromSelf?: boolean
-) {
-  // TODO Warning disabled to preserve past behavior, but perhaps better to return the Promise?
-  // eslint-disable-next-line @typescript-eslint/no-floating-promises
-  client.sendMessage({type, docFD, data, fromSelf} as CommMessage);
-}
--- a/app/server/lib/DocClients.ts
+++ b/app/server/lib/DocClients.ts
@@ -3,12 +3,11 @@
 * open, and what FD they are using.
 */

-import {CommDocEventType} from 'app/common/CommTypes';
+import {CommDocEventType, CommMessage} from 'app/common/CommTypes';
 import {arrayRemove} from 'app/common/gutil';
 import {ActiveDoc} from 'app/server/lib/ActiveDoc';
 import {Authorizer} from 'app/server/lib/Authorizer';
 import {Client} from 'app/server/lib/Client';
-import {sendDocMessage} from 'app/server/lib/Comm';
 import {DocSession, OptDocSession} from 'app/server/lib/DocSession';
 import {LogMethods} from "app/server/lib/LogMethods";

@@ -85,7 +84,14 @@ export class DocClients {
  public async broadcastDocMessage(client: Client|null, type: CommDocEventType, messageData: any,
                                   filterMessage?: (docSession: OptDocSession,
                                                    messageData: any) => Promise<any>): Promise<void> {
-    const send = (curr: DocSession) => this._send(curr, client, type, messageData, filterMessage);
+    const send = async (target: DocSession) => {
+      const msg = await this._prepareMessage(target, type, messageData, filterMessage);
+      if (msg) {
+        const fromSelf = (target.client === client);
+        await target.client.sendMessageOrInterrupt({...msg, docFD: target.fd, fromSelf} as CommMessage);
+      }
+    };
+
    if (Deps.BROADCAST_ORDER === 'parallel') {
      await Promise.all(this._docSessions.map(send));
    } else {
@@ -101,30 +107,30 @@ export class DocClients {
  }

  /**
-   * Send a message to a single client. See broadcastDocMessage for parameters.
+   * Prepares a message to a single client. See broadcastDocMessage for parameters.
   */
-  private async _send(target: DocSession, client: Client|null, type: CommDocEventType, messageData: any,
-                      filterMessage?: (docSession: OptDocSession,
-                                       messageData: any) => Promise<any>): Promise<void> {
-    const fromSelf = (target.client === client);
+  private async _prepareMessage(
+    target: DocSession, type: CommDocEventType, messageData: any,
+    filterMessage?: (docSession: OptDocSession, messageData: any) => Promise<any>
+  ): Promise<{type: CommDocEventType, data: unknown}|undefined> {
    try {
      // Make sure user still has view access.
      await target.authorizer.assertAccess('viewers');
      if (!filterMessage) {
-        sendDocMessage(target.client, target.fd, type, messageData, fromSelf);
+        return {type, data: messageData};
      } else {
        try {
          const filteredMessageData = await filterMessage(target, messageData);
          if (filteredMessageData) {
-            sendDocMessage(target.client, target.fd, type, filteredMessageData, fromSelf);
+            return {type, data: filteredMessageData};
          } else {
            this._log.debug(target, 'skip broadcastDocMessage because it is not allowed for this client');
          }
        } catch (e) {
          if (e.code && e.code === 'NEED_RELOAD') {
-            sendDocMessage(target.client, target.fd, 'docShutdown', null, fromSelf);
+            return {type: 'docShutdown', data: null};
          } else {
-            sendDocMessage(target.client, target.fd, 'docUserAction', {error: String(e)}, fromSelf);
+            return {type: 'docUserAction', data: {error: String(e)}};
          }
        }
      }
@@ -134,9 +140,10 @@ export class DocClients {
        this._log.debug(target, 'skip broadcastDocMessage because AUTH_NO_VIEW');
        // Go further and trigger a shutdown for this user, in case they are granted
        // access again later.
-        sendDocMessage(target.client, target.fd, 'docShutdown', null, fromSelf);
+        return {type: 'docShutdown', data: null};
      } else {
-        throw(e);
+        // Propagate any totally unexpected exceptions.
+        throw e;
      }
    }
  }
--- a/app/server/lib/FlexServer.ts
+++ b/app/server/lib/FlexServer.ts
@@ -515,6 +515,26 @@ export class FlexServer implements GristServer {
    if (this._check('cleanup')) { return; }
    // Set up signal handlers. Note that nodemon sends SIGUSR2 to restart node.
    shutdown.cleanupOnSignals('SIGINT', 'SIGTERM', 'SIGHUP', 'SIGUSR2');
+
+    // We listen for uncaughtExceptions / unhandledRejections, but still exit when they happen.
+    // Exiting is Node's strong recommendation, which seems best to follow
+    // (https://nodejs.org/docs/latest-v18.x/api/process.html#warning-using-uncaughtexception-correctly).
+    // We do try to shut down cleanly (i.e. do any planned cleanup), which goes somewhat against
+    // the recommendation to do only synchronous work.
+
+    let counter = 0;
+
+    // Note that this event also catches unhandled rejections (origin should be either
+    // 'uncaughtException' or 'unhandledRejection').
+    process.on('uncaughtException', (err, origin) => {
+      log.error(`UNHANDLED ERROR ${origin} (${counter}):`, err);
+      if (counter === 0) {
+        // Only call shutdown once. It's async and could in theory fail, in which case it would be
+        // another unhandledRejection, and would get caught and reported by this same handler.
+        void(shutdown.exit(1));
+      }
+      counter++;
+    });
  }

  public addTagChecker() {
--- a/app/server/lib/ITestingHooks-ti.ts
+++ b/app/server/lib/ITestingHooks-ti.ts
@@ -4,6 +4,12 @@
 import * as t from "ts-interface-checker";
 // tslint:disable:object-literal-key-quotes

+export const ClientJsonMemoryLimits = t.iface([], {
+  "totalSize": t.opt("number"),
+  "jsonResponseReservation": t.opt("number"),
+  "maxReservationSize": t.opt(t.union("number", "null")),
+});
+
 export const ITestingHooks = t.iface([], {
  "getOwnPort": t.func("number"),
  "getPort": t.func("number"),
@@ -13,7 +19,7 @@ export const ITestingHooks = t.iface([], {
  "commShutdown": t.func("void"),
  "commRestart": t.func("void"),
  "commSetClientPersistence": t.func("number", t.param("ttlMs", "number")),
-  "commSetClientJsonMemoryLimit": t.func("number", t.param("newTotalSize", "number")),
+  "commSetClientJsonMemoryLimits": t.func("ClientJsonMemoryLimits", t.param("limits", "ClientJsonMemoryLimits")),
  "closeDocs": t.func("void"),
  "setDocWorkerActivation": t.func("void", t.param("workerId", "string"), t.param("active", t.union(t.lit('active'), t.lit('inactive'), t.lit('crash')))),
  "flushAuthorizerCache": t.func("void"),
@@ -23,9 +29,11 @@ export const ITestingHooks = t.iface([], {
  "setDiscourseConnectVar": t.func(t.union("string", "null"), t.param("varName", "string"), t.param("value", t.union("string", "null"))),
  "setWidgetRepositoryUrl": t.func("void", t.param("url", "string")),
  "getMemoryUsage": t.func("object"),
+  "tickleUnhandledErrors": t.func("void", t.param("errType", "string")),
 });

 const exportedTypeSuite: t.ITypeSuite = {
+  ClientJsonMemoryLimits,
  ITestingHooks,
 };
 export default exportedTypeSuite;
--- a/app/server/lib/ITestingHooks.ts
+++ b/app/server/lib/ITestingHooks.ts
@@ -1,5 +1,11 @@
 import {UserProfile} from 'app/common/LoginSessionAPI';

+export interface ClientJsonMemoryLimits {
+  totalSize?: number;
+  jsonResponseReservation?: number;
+  maxReservationSize?: number|null;
+}
+
 export interface ITestingHooks {
  getOwnPort(): Promise<number>;
  getPort(): Promise<number>;
@@ -9,7 +15,7 @@ export interface ITestingHooks {
  commShutdown(): Promise<void>;
  commRestart(): Promise<void>;
  commSetClientPersistence(ttlMs: number): Promise<number>;
-  commSetClientJsonMemoryLimit(newTotalSize: number): Promise<number>;
+  commSetClientJsonMemoryLimits(limits: ClientJsonMemoryLimits): Promise<ClientJsonMemoryLimits>;
  closeDocs(): Promise<void>;
  setDocWorkerActivation(workerId: string, active: 'active'|'inactive'|'crash'): Promise<void>;
  flushAuthorizerCache(): Promise<void>;
@@ -19,4 +25,5 @@ export interface ITestingHooks {
  setDiscourseConnectVar(varName: string, value: string|null): Promise<string|null>;
  setWidgetRepositoryUrl(url: string): Promise<void>;
  getMemoryUsage(): Promise<object>;  // actually NodeJS.MemoryUsage
+  tickleUnhandledErrors(errType: string): Promise<void>;
 }
--- a/app/server/lib/TestingHooks.ts
+++ b/app/server/lib/TestingHooks.ts
@@ -8,10 +8,11 @@ import * as Client from 'app/server/lib/Client';
 import {Comm} from 'app/server/lib/Comm';
 import log from 'app/server/lib/log';
 import {IMessage, Rpc} from 'grain-rpc';
+import {EventEmitter} from 'events';
 import {Request} from 'express';
 import * as t from 'ts-interface-checker';
 import {FlexServer} from './FlexServer';
-import {ITestingHooks} from './ITestingHooks';
+import {ClientJsonMemoryLimits, ITestingHooks} from './ITestingHooks';
 import ITestingHooksTI from './ITestingHooks-ti';
 import {connect, fromCallback} from './serverUtils';
 import {WidgetRepositoryImpl} from 'app/server/lib/WidgetRepository';
@@ -127,11 +128,42 @@ export class TestingHooks implements ITestingHooks {
    return prev;
  }

-  // Set the amount of memory Client.ts can use for JSON responses, in bytes.
-  // Returns the old limit.
-  public async commSetClientJsonMemoryLimit(newTotalSize: number): Promise<number> {
-    log.info("TestingHooks.commSetClientJsonMemoryLimit called with", newTotalSize);
-    return Client.jsonMemoryPool.setTotalSize(newTotalSize);
+  // Set one or more limits that Client.ts can use for JSON responses, in bytes.
+  // Returns the old limits (maxReservationSize is always reported as null).
+  // - totalSize limits the total amount of memory Client allocates to JSON responses
+  // - jsonResponseReservation sets the initial amount reserved for each response
+  // - maxReservationSize monkey-patches reservation logic to fail when the reservation exceeds
+  //      the given amount, to simulate unexpected failures; null restores the original logic.
+  public async commSetClientJsonMemoryLimits(limits: ClientJsonMemoryLimits): Promise<ClientJsonMemoryLimits> {
+    log.info("TestingHooks.commSetClientJsonMemoryLimits called with", limits);
+    const previous: ClientJsonMemoryLimits = {};
+    if (limits.totalSize !== undefined) {
+      previous.totalSize = Client.jsonMemoryPool.setTotalSize(limits.totalSize);
+    }
+    if (limits.jsonResponseReservation !== undefined) {
+      previous.jsonResponseReservation = CommClientDeps.jsonResponseReservation;
+      CommClientDeps.jsonResponseReservation = limits.jsonResponseReservation;
+    }
+    if (limits.maxReservationSize !== undefined) {
+      previous.maxReservationSize = null;
+      const orig = Object.getPrototypeOf(Client.jsonMemoryPool)._updateReserved;
+      if (limits.maxReservationSize === null) {
+        (Client.jsonMemoryPool as any)._updateReserved = orig;
+      } else {
+        // Monkey-patch reservation logic to simulate unexpected failures.
+        const jsonMemoryThrowLimit = limits.maxReservationSize;
+        function updateReservedWithLimit(this: typeof Client.jsonMemoryPool, sizeDelta: number) {
+          const newSize: number = (this as any)._reservedSize + sizeDelta;
+          log.warn(`TestingHooks _updateReserved reserving ${newSize}, limit ${jsonMemoryThrowLimit}`);
+          if (newSize > jsonMemoryThrowLimit) {
+            throw new Error(`TestingHooks: hit JsonMemoryThrowLimit: ${newSize} > ${jsonMemoryThrowLimit}`);
+          }
+          return orig.call(this, sizeDelta);
+        }
+        (Client.jsonMemoryPool as any)._updateReserved = updateReservedWithLimit;
+      }
+    }
+    return previous;
  }

  public async closeDocs(): Promise<void> {
@@ -229,4 +261,18 @@ export class TestingHooks implements ITestingHooks {
  public async getMemoryUsage(): Promise<NodeJS.MemoryUsage> {
    return process.memoryUsage();
  }
+
+  // This is for testing the handling of unhandled exceptions and rejections.
+  public async tickleUnhandledErrors(errType: 'exception'|'rejection'|'error-event'): Promise<void> {
+    if (errType === 'exception') {
+      setTimeout(() => { throw new Error("TestingHooks: Fake exception"); }, 0);
+    } else if (errType === 'rejection') {
+      void(Promise.resolve(null).then(() => { throw new Error("TestingHooks: Fake rejection"); }));
+    } else if (errType === 'error-event') {
+      const emitter = new EventEmitter();
+      setTimeout(() => emitter.emit('error', new Error('TestingHooks: Fake error-event')), 0);
+    } else {
+      throw new Error(`Unrecognized errType ${errType}`);
+    }
+  }
 }