(core) Improved error messages, retries, and handling of token limits in AI assistant

Summary:
In a nutshell:

- More specific and helpful error messages are shown to the user
- API requests are only retried when needed
- The system deals with reaching the maximum token limit better, especially by switching to a model with a bigger limit

In more detail:

- `COMPLETION_MODEL` configuration has been removed. By default `gpt-3.5-turbo-0613` is used which accepts 4k tokens. If that's not enough, `gpt-3.5-turbo-16k-0613` is used instead.
- Switching to the bigger model happens when either the prompt is too long by itself (the API immediately returns an error code) or the model reaches the 4k limit itself in the process of generating a response and thus returns an incomplete response. The latter case is made possible by removing the `max_tokens: 1500` in the request, which was very generous and would have led to switching to the more expensive model more often than needed. The downside is that the user has to wait a bit longer for the response.
- If the bigger 16k token limit is also exceeded, the assistant immediately responds (instead of retrying as before) with an error message including suggestions. The suggestions include restarting the conversation if and only if the user has sent multiple messages.
- If a request fails because Grist has reached its OpenAI monthly billing quota, the assistant immediately responds (instead of retrying as before) with an error message suggesting that the user try again tomorrow.
- If a request fails for some other reason, the assistant retries, and if all attempts fail then the user is told to try again in a few minutes and is shown the exact error message, including the API response if there is one.
- Retrying only happens when an API request fails, whereas previously the system also retried errors from a much bigger scope, which included calls to the sandbox. The downside is that the Hugging Face assistant no longer retries, although that code is currently disabled anyway.
- The assistant no longer waits an additional second after the final retry attempt fails.

Test Plan: Added a new server test file with several unit tests using faked OpenAI responses, including the happy path which wasn't really tested before.

Reviewers: dsagal

Reviewed By: dsagal

Subscribers: dsagal

Differential Revision: https://phab.getgrist.com/D3955
This commit is contained in:
Alex Hall 2023-07-18 12:03:31 +02:00
parent d894b60fd4
commit 7fd48364df
2 changed files with 116 additions and 44 deletions

View File

@ -11,10 +11,12 @@ import {DocAction} from 'app/common/DocActions';
* model at this time (it is a bit early for that). * model at this time (it is a bit early for that).
*/ */
export interface AssistanceState { export interface AssistanceState {
messages?: Array<{ messages?: AssistanceMessage[];
}
export interface AssistanceMessage {
role: string; role: string;
content: string; content: string;
}>;
} }
/** /**

View File

@ -2,14 +2,16 @@
* Module with functions used for AI formula assistance. * Module with functions used for AI formula assistance.
*/ */
import {AssistanceRequest, AssistanceResponse} from 'app/common/AssistancePrompts'; import {AssistanceMessage, AssistanceRequest, AssistanceResponse} from 'app/common/AssistancePrompts';
import {delay} from 'app/common/delay'; import {delay} from 'app/common/delay';
import {DocAction} from 'app/common/DocActions'; import {DocAction} from 'app/common/DocActions';
import {OptDocSession} from 'app/server/lib/DocSession'; import {OptDocSession} from 'app/server/lib/DocSession';
import log from 'app/server/lib/log'; import log from 'app/server/lib/log';
import fetch from 'node-fetch'; import fetch from 'node-fetch';
export const DEPS = { fetch }; // These are mocked/replaced in tests.
// fetch is also replacing in the runCompletion script to add caching.
export const DEPS = { fetch, delayTime: 1000 };
/** /**
* An assistant can help a user do things with their document, * An assistant can help a user do things with their document,
@ -44,13 +46,58 @@ export interface AssistanceSchemaPromptV1Context {
docString: string, docString: string,
} }
/**
 * Internal control-flow signal, never shown to the user: the default model's
 * token limit was exceeded, so the caller should redo the request against the
 * longer-context model (see `_fetchCompletionWithRetries`, which catches this
 * and retries with `longerContext = true`).
 */
class SwitchToLongerContext extends Error {}
/**
 * Base class for failures that should not be retried: the retry loop rethrows
 * these immediately, and the message is surfaced to the user as-is.
 */
class NonRetryableError extends Error {}
/**
 * User-facing error thrown when even the longer-context model cannot fit the
 * request and the conversation holds at most one user message
 * (`messages.length <= 2`), so suggesting a conversation restart would not help.
 */
class TokensExceededFirstMessage extends NonRetryableError {
  constructor() {
    const advice = "shorten your message or delete some columns.";
    super(`Sorry, there's too much information for the AI to process. You'll need to either ${advice}`);
  }
}
/**
 * User-facing error thrown when even the longer-context model cannot fit the
 * request and the user has already sent multiple messages, so restarting the
 * conversation is offered as one of the remedies.
 */
class TokensExceededLaterMessage extends NonRetryableError {
  constructor() {
    const advice = "shorten your message, restart the conversation, or delete some columns.";
    super(`Sorry, there's too much information for the AI to process. You'll need to either ${advice}`);
  }
}
/**
 * User-facing error thrown when OpenAI reports `insufficient_quota`, i.e. the
 * monthly billing quota has been reached; retrying would not help, so the user
 * is told to come back later.
 */
class QuotaExceededError extends NonRetryableError {
  constructor() {
    super(`Sorry, the assistant is facing some long term capacity issues. Maybe try again tomorrow.`);
  }
}
/**
 * User-facing error produced once every retry attempt has failed. The
 * underlying failure description is appended in parentheses so the user (and
 * support) can see the exact cause.
 */
class RetryableError extends Error {
  constructor(message: string) {
    super(`Sorry, the assistant is unavailable right now. Try again in a few minutes. \n(${message})`);
  }
}
/** /**
* A flavor of assistant for use with the OpenAI API. * A flavor of assistant for use with the OpenAI API.
* Tested primarily with gpt-3.5-turbo. * Tested primarily with gpt-3.5-turbo.
*/ */
export class OpenAIAssistant implements Assistant { export class OpenAIAssistant implements Assistant {
public static DEFAULT_MODEL = "gpt-3.5-turbo-0613";
public static LONGER_CONTEXT_MODEL = "gpt-3.5-turbo-16k-0613";
private _apiKey: string; private _apiKey: string;
private _model: string;
private _chatMode: boolean; private _chatMode: boolean;
private _endpoint: string; private _endpoint: string;
@ -60,8 +107,7 @@ export class OpenAIAssistant implements Assistant {
throw new Error('OPENAI_API_KEY not set'); throw new Error('OPENAI_API_KEY not set');
} }
this._apiKey = apiKey; this._apiKey = apiKey;
this._model = process.env.COMPLETION_MODEL || "gpt-3.5-turbo-0613"; this._chatMode = true;
this._chatMode = this._model.includes('turbo');
if (!this._chatMode) { if (!this._chatMode) {
throw new Error('Only turbo models are currently supported'); throw new Error('Only turbo models are currently supported');
} }
@ -114,7 +160,15 @@ export class OpenAIAssistant implements Assistant {
role: 'user', content: await makeSchemaPromptV1(optSession, doc, request), role: 'user', content: await makeSchemaPromptV1(optSession, doc, request),
}); });
} }
const completion: string = await this._getCompletion(messages);
const response = await completionToResponse(doc, request, completion, completion);
if (chatMode) {
response.state = {messages};
}
return response;
}
private async _fetchCompletion(messages: AssistanceMessage[], longerContext: boolean) {
const apiResponse = await DEPS.fetch( const apiResponse = await DEPS.fetch(
this._endpoint, this._endpoint,
{ {
@ -127,29 +181,61 @@ export class OpenAIAssistant implements Assistant {
...(!this._chatMode ? { ...(!this._chatMode ? {
prompt: messages[messages.length - 1].content, prompt: messages[messages.length - 1].content,
} : {messages}), } : {messages}),
max_tokens: 1500,
temperature: 0, temperature: 0,
model: this._model, model: longerContext ? OpenAIAssistant.LONGER_CONTEXT_MODEL : OpenAIAssistant.DEFAULT_MODEL,
stop: this._chatMode ? undefined : ["\n\n"], stop: this._chatMode ? undefined : ["\n\n"],
}), }),
}, },
); );
if (apiResponse.status !== 200) { const resultText = await apiResponse.text();
log.error(`OpenAI API returned ${apiResponse.status}: ${await apiResponse.text()}`); const result = JSON.parse(resultText);
throw new Error(`OpenAI API returned status ${apiResponse.status}`); const errorCode = result.error?.code;
if (errorCode === "context_length_exceeded" || result.choices?.[0].finish_reason === "length") {
if (!longerContext) {
log.info("Switching to longer context model...");
throw new SwitchToLongerContext();
} else if (messages.length <= 2) {
throw new TokensExceededFirstMessage();
} else {
throw new TokensExceededLaterMessage();
} }
const result = await apiResponse.json(); }
const completion: string = String(chatMode ? result.choices[0].message.content : result.choices[0].text); if (errorCode === "insufficient_quota") {
const history = { messages }; log.error("OpenAI billing quota exceeded!!!");
if (chatMode) { throw new QuotaExceededError();
history.messages.push(result.choices[0].message); }
if (apiResponse.status !== 200) {
throw new Error(`OpenAI API returned status ${apiResponse.status}: ${resultText}`);
}
return result;
} }
const response = await completionToResponse(doc, request, completion, completion); private async _fetchCompletionWithRetries(messages: AssistanceMessage[], longerContext: boolean): Promise<any> {
if (chatMode) { const maxAttempts = 3;
response.state = history; for (let attempt = 1; ; attempt++) {
try {
return await this._fetchCompletion(messages, longerContext);
} catch (e) {
if (e instanceof SwitchToLongerContext) {
return await this._fetchCompletionWithRetries(messages, true);
} else if (e instanceof NonRetryableError) {
throw e;
} else if (attempt === maxAttempts) {
throw new RetryableError(e.toString());
} }
return response; log.warn(`Waiting and then retrying after error: ${e}`);
await delay(DEPS.delayTime);
}
}
}
private async _getCompletion(messages: AssistanceMessage[]) {
const result = await this._fetchCompletionWithRetries(messages, false);
const completion: string = String(this._chatMode ? result.choices[0].message.content : result.choices[0].text);
if (this._chatMode) {
messages.push(result.choices[0].message);
}
return completion;
} }
} }
@ -275,31 +361,15 @@ export function getAssistant() {
} }
/** /**
* Service a request for assistance, with a little retry logic * Service a request for assistance.
* since these endpoints can be a bit flakey.
*/ */
export async function sendForCompletion( export async function sendForCompletion(
optSession: OptDocSession, optSession: OptDocSession,
doc: AssistanceDoc, doc: AssistanceDoc,
request: AssistanceRequest): Promise<AssistanceResponse> { request: AssistanceRequest,
): Promise<AssistanceResponse> {
const assistant = getAssistant(); const assistant = getAssistant();
return await assistant.apply(optSession, doc, request);
let retries: number = 0;
let response: AssistanceResponse|null = null;
while(retries++ < 3) {
try {
response = await assistant.apply(optSession, doc, request);
break;
} catch(e) {
log.error(`Completion error: ${e}`);
await delay(1000);
}
}
if (!response) {
throw new Error('Failed to get response from assistant');
}
return response;
} }
async function makeSchemaPromptV1(session: OptDocSession, doc: AssistanceDoc, request: AssistanceRequest) { async function makeSchemaPromptV1(session: OptDocSession, doc: AssistanceDoc, request: AssistanceRequest) {