(core) Porting the AI evaluation script

Summary: Ports the script that runs an evaluation against our formula dataset. To test, you need an OpenAI API key (see here: https://platform.openai.com/) or a Hugging Face API key (it should work as well); then check out the branch and run `OPENAI_API_KEY=<my_openai_api_key> node core/test/formula-dataset/runCompletion.js`. Test Plan: Needs manual testing; so far there is no plan to make it part of CI. The current score is somewhere around 34 successful prompts out of a total of 47. Reviewers: paulfitz Reviewed By: paulfitz Subscribers: jarek Differential Revision: https://phab.getgrist.com/D3816
2025-06-13 20:53:59 +00:00 · 2023-03-15 09:52:17 +01:00 · 2023-03-15 09:52:17 +01:00 · 1ff93f89c2
commit 1ff93f89c2
parent 2b2e19c5b5
8 changed files with 377 additions and 37 deletions
--- a/app/common/gutil.ts
+++ b/app/common/gutil.ts
@ -951,7 +951,7 @@ export function assertIsDefined<T>(name: string, value: T): asserts value is Non
 * Calls function `fn`, passes any thrown errors to function `recover`, and finally calls `fn`
 * once more if `recover` doesn't throw.
 */
- export async function retryOnce<T>(fn: () => Promise<T>, recover: (e: unknown) => Promise<void>): Promise<T> {
+export async function retryOnce<T>(fn: () => Promise<T>, recover: (e: unknown) => Promise<void>): Promise<T> {
  try {
    return await fn();
  } catch (e) {
@ -964,7 +964,7 @@ export function assertIsDefined<T>(name: string, value: T): asserts value is Non
 * Checks if value is 'empty' (like null, undefined, empty string, empty array/set/map, empty object).
 * Values like 0, true, false are not empty.
 */
- export function notSet(value: any) {
+export function notSet(value: any) {
  return value === undefined || value === null || value === ''
         || (Array.isArray(value) && !value.length)
         || (typeof value === 'object' && !Object.keys(value).length)
--- a/app/server/lib/ActiveDoc.ts
+++ b/app/server/lib/ActiveDoc.ts
@ -34,7 +34,7 @@ import {
  TransformRule
 } from 'app/common/ActiveDocAPI';
 import {ApiError} from 'app/common/ApiError';
-import {mapGetOrSet, MapWithTTL} from 'app/common/AsyncCreate';
+import {asyncOnce, mapGetOrSet, MapWithTTL} from 'app/common/AsyncCreate';
 import {AttachmentColumns, gatherAttachmentIds, getAttachmentColumns} from 'app/common/AttachmentColumns';
 import {
  BulkAddRecord,
@ -230,6 +230,11 @@ export class ActiveDoc extends EventEmitter {
  private _inactivityTimer = new InactivityTimer(() => this.shutdown(), Deps.ACTIVEDOC_TIMEOUT * 1000);
  private _recoveryMode: boolean = false;
  private _shuttingDown: boolean = false;
+  private _afterShutdownCallback?: () => Promise<void>;
+  // catch & report error so that asyncOnce does not get cleared.
+  private _doShutdown = asyncOnce(
+    () => this._doShutdownImpl().catch((e) => log.error('Uncaught shutdown error', e))
+  );

  /**
   * In cases where large numbers of documents are restarted simultaneously
@ -493,6 +498,14 @@ export class ActiveDoc extends EventEmitter {
  public async shutdown(options: {
    afterShutdown?: () => Promise<void>
  } = {}): Promise<void> {
+    if (options.afterShutdown) {
+      this._afterShutdownCallback = options.afterShutdown;
+    }
+    await this._doShutdown();
+  }
+
+
+  private async _doShutdownImpl(): Promise<void> {
    const docSession = makeExceptionalDocSession('system');
    this._log.debug(docSession, "shutdown starting");
    try {
@ -576,7 +589,7 @@ export class ActiveDoc extends EventEmitter {
      } catch (err) {
        this._log.error(docSession, "failed to shutdown some resources", err);
      }
-      await options.afterShutdown?.();
+      await this._afterShutdownCallback?.();
    } finally {
      this._docManager.removeActiveDoc(this);
    }
--- a/app/server/lib/Assistance.ts
+++ b/app/server/lib/Assistance.ts
@ -4,16 +4,25 @@

 import {delay} from 'app/common/delay';
 import log from 'app/server/lib/log';
-import fetch, { Response as FetchResponse} from 'node-fetch';
+import fetch from 'node-fetch';

+export const DEPS = { fetch };

 export async function sendForCompletion(prompt: string): Promise<string> {
  let completion: string|null = null;
-  if (process.env.OPENAI_API_KEY) {
-    completion = await sendForCompletionOpenAI(prompt);
-  }
-  if (process.env.HUGGINGFACE_API_KEY) {
-    completion = await sendForCompletionHuggingFace(prompt);
+  let retries: number = 0;
+  while(retries++ < 3) {
+    try {
+      if (process.env.OPENAI_API_KEY) {
+        completion = await sendForCompletionOpenAI(prompt);
+      }
+      if (process.env.HUGGINGFACE_API_KEY) {
+        completion = await sendForCompletionHuggingFace(prompt);
+      }
+      break;
+    } catch(e) {
+      await delay(1000);
+    }
  }
  if (completion === null) {
    throw new Error("Please set OPENAI_API_KEY or HUGGINGFACE_API_KEY (and optionally COMPLETION_MODEL)");
@ -29,7 +38,7 @@ async function sendForCompletionOpenAI(prompt: string) {
  if (!apiKey) {
    throw new Error("OPENAI_API_KEY not set");
  }
-  const response = await fetch(
+  const response = await DEPS.fetch(
    "https://api.openai.com/v1/completions",
    {
      method: "POST",
@ -73,31 +82,27 @@ async function sendForCompletionHuggingFace(prompt: string) {
      completionUrl = 'https://api-inference.huggingface.co/models/NovelAI/genji-python-6B';
    }
  }
-  let retries: number = 0;
-  let response!: FetchResponse;
-  while (retries++ < 3) {
-    response = await fetch(
-      completionUrl,
-      {
-        method: "POST",
-        headers: {
-          "Authorization": `Bearer ${apiKey}`,
-          "Content-Type": "application/json",
-        },
-        body: JSON.stringify({
-          inputs: prompt,
-          parameters: {
-            return_full_text: false,
-            max_new_tokens: 50,
-          },
-        }),
+
+  const response = await DEPS.fetch(
+    completionUrl,
+    {
+      method: "POST",
+      headers: {
+        "Authorization": `Bearer ${apiKey}`,
+        "Content-Type": "application/json",
      },
-    );
-    if (response.status === 503) {
-      log.error(`Sleeping for 10s - HuggingFace API returned ${response.status}: ${await response.text()}`);
-      await delay(10000);
-      continue;
-    }
+      body: JSON.stringify({
+        inputs: prompt,
+        parameters: {
+          return_full_text: false,
+          max_new_tokens: 50,
+        },
+      }),
+    },
+  );
+  if (response.status === 503) {
+    log.error(`Sleeping for 10s - HuggingFace API returned ${response.status}: ${await response.text()}`);
+    await delay(10000);
  }
  if (response.status !== 200) {
    const text = await response.text();
--- a/test/formula-dataset/.gitignore
+++ b/test/formula-dataset/.gitignore
@ -0,0 +1,2 @@
+data/templates
+data/cache
--- a/test/formula-dataset/data/formula-dataset-index.csv
+++ b/test/formula-dataset/data/formula-dataset-index.csv
@ -0,0 +1,55 @@
+table_id,col_id,doc_id,Description
+Contacts,Send_Email,hQHXqAQXceeQBPvRw5sSs1,"Link to compose an email, if there is one"
+Tasks,Today,hQHXqAQXceeQBPvRw5sSs1,Needs to be done today (or every day)
+Tasks,Week_Day,hQHXqAQXceeQBPvRw5sSs1,Full name of deadline weekday
+Expenses,Month,55Q2EtTbFvB1N6iizLh4Rk,e.g. 2022-01
+Payroll,Date_Range,5pHLanQNThxkEaEJHKJUf5,"The start date, followed by a dash (no spaces) and the end date if there is one. Dates are month/day with no leading zeroes."
+Payroll,Payment,5pHLanQNThxkEaEJHKJUf5,"Total payment amount for hours worked, rounded to the nearest cent."
+Payroll_summary_Pay_Period_Person,Dates,5pHLanQNThxkEaEJHKJUf5,"All date ranges in the group, separated by a comma and a space"
+People,Full_Name,5pHLanQNThxkEaEJHKJUf5,"e.g. Doe, John"
+General_Ledger,Quarter,2YwYBWpREY2a1N2NV7cb55,e.g. 2020 Q4
+General_Ledger,Year,2YwYBWpREY2a1N2NV7cb55,"Just the year of the date, as a string"
+Time_Calculator,Time_Worked,np7TVHmuvFcHmo1K8h7Ur4,Formatted as hours:minutes. No leading zeroes for hours.
+Time_Calculator,Seconds_Worked,np7TVHmuvFcHmo1K8h7Ur4,"Number of seconds between start/end times, if they're both there"
+Funding_Source_summary,Debt_to_Equity,qprycQa2TVwajAe6Hb3bUZ,Ratio of the total amounts in the group where the type is Debt vs Equity
+Invoices,Client,bReAxyLmzmEQfHF5L5Sc1e,Client's name followed by their address on the next line
+Invoices,Due,bReAxyLmzmEQfHF5L5Sc1e,30 days after the invoice date
+Invoices,Invoice_ID,bReAxyLmzmEQfHF5L5Sc1e,Invoice date followed by the client's name in brackets
+Projects,Project_Name,bReAxyLmzmEQfHF5L5Sc1e,"Client name and project name, e.g. John Doe: Big project"
+Time_Log,Date,bReAxyLmzmEQfHF5L5Sc1e,Start date if there is one
+Time_Log,Duration_hrs_,bReAxyLmzmEQfHF5L5Sc1e,Duration (if there is one) in hours rounded to two decimal places
+Time_Log,Duration_min_,bReAxyLmzmEQfHF5L5Sc1e,"Number of minutes between start and end time. If either time is missing, leave blank. If end is before start, give 0."
+Filtered_By_Formula,LabelCount,9nNr9uQwoXWAvxcWQDygh6,"1 if the state is CA, otherwise 0"
+Objects,Address,pyMHqncEspfZN5zfShCwT8,"City and state, separated by comma space"
+Books,search_terms,hdXy57qLiyNf35oNLzzgBG,"Title and author name, with a space in between"
+BOM_Items,Cost,e4gEm7dt4cgBMkouVBNMeY,Total cost if both quantity and cost are given
+Bill_Of_Materials,Cost,e4gEm7dt4cgBMkouVBNMeY,Total cost
+All_Responses,Entry,qvND7WUcuNb2fU4n1vBJ7f,"Name and submitted date in the format ""Name - month-day"""
+All_Responses,Month,qvND7WUcuNb2fU4n1vBJ7f,Submitted month (full name) and year
+Cap_Table,Common_Stock,iXggjrCPHut9u2BuhJxJkk,"If the class is Options, RSUs, or Option Pool, return 0, otherwise return the fully diluted value."
+Cap_Table,Fully_Diluted,iXggjrCPHut9u2BuhJxJkk,"The granted amount, minus the total pool used if the class is Option Pool"
+Cap_Table,Fully_Diluted_,iXggjrCPHut9u2BuhJxJkk,Fully diluted as a fraction of the total
+Classes,Spots_Left,swLvb3Fic22gVzrdczcAoZ,or Full
+All_Survey_Responses,Product_Experience_Score,4ktYzGV1mUipSiQFtkLGqm,"A number based on the experience:
+Very Dissatisfied: 1
+Somewhat Dissatisfied: 2
+Neutral: 3
+Somewhat Satisfied: 4
+Very Satisfied: 5"
+Time_Sheet_Entries_summary_Account_Employee_Month,Total_Spend,oGxD8EnzeVs6vSQK3QBrUv,Total hours worked times hourly rate
+Time_Sheets,Title,oGxD8EnzeVs6vSQK3QBrUv,Month number and employee full name separated by a space
+All_Products,SKU,sXsBGDTKau1F3fvxkCyoaJ,"Brand code, color code, and size, separated by dashes without spaces"
+All_Products,Stock_Alert,sXsBGDTKau1F3fvxkCyoaJ,"If the amount in stock and on order is more than 5: In Stock
+If it's 0: OUT OF STOCK
+Otherwise: Low Stock"
+Incoming_Order_Line_Items,Received_Qty,sXsBGDTKau1F3fvxkCyoaJ,"The quantity, but only if the order is received"
+Theaters,Latitude2,dKztiPYamcCpttT1LT1FnU,Coordinate before the comma
+Theaters,Longitude,dKztiPYamcCpttT1LT1FnU,Coordinate after the comma and space
+Families,Amount_Due,cJcSKdUC3nLNAv4wTjAxA6,"Total charged minus total paid, capped at 0"
+Gifts_summary_Occasion_Who_Year,Over_Budget_,dr6epxpXUcy9rsFVUoXTEe,Did we spend more than the budget for this person?
+Apartments,Have_Picture,5iMYwmESm33JpEECSqdZk2,Yes or No depending on if there's a picture
+Leases,Lease_End_Date,5iMYwmESm33JpEECSqdZk2,Start date plus the lease term in years minus one day
+Tenancies,Minor,5iMYwmESm33JpEECSqdZk2,"1 if the age is less than 18, otherwise 0"
+Game_Schedule,Loser,1xJAp2uxM7tFCVUbEofKoF,The team that won fewer sets
+Standings,Win_Rate,1xJAp2uxM7tFCVUbEofKoF,Ratio of wins to total games
+Prepare_Invoices,Due,9NH6D58FmxwPP43nw7uzQK,One month after the issued date if there is one
--- a/test/formula-dataset/runCompletion.js
+++ b/test/formula-dataset/runCompletion.js
@ -0,0 +1,12 @@
+#!/usr/bin/env node
+"use strict";
+const path = require('path');
+const codeRoot = path.dirname(path.dirname(path.dirname(__dirname)));
+
+process.env.DATA_PATH = path.join(__dirname, 'data');
+
+
+require('app-module-path').addPath(path.join(codeRoot, '_build'));
+require('app-module-path').addPath(path.join(codeRoot, '_build', 'core'));
+require('app-module-path').addPath(path.join(codeRoot, '_build', 'ext'));
+require('test/formula-dataset/runCompletion_impl').runCompletion().catch(console.error);
--- a/test/formula-dataset/runCompletion_impl.ts
+++ b/test/formula-dataset/runCompletion_impl.ts
@ -0,0 +1,252 @@
+/**
+ * This module holds an evaluation script for AI assistance. It tests AI assistance on the formula
+ * dataset. The formula dataset is made of an index file (formula-dataset-index.csv) and a list of
+ * Grist documents hosted on S3. Each row in the index file references one column (doc_id,
+ * table_id, col_id) among these documents, along with a free-text description.
+ *
+ * For each entry of the dataset, the script loads the document, requests assistance based on the
+ * description, and applies the suggested actions to the document. Then it compares the column
+ * values before and after. Finally it reverts the modification.
+ *
+ * The list of Grist documents for the formula dataset is a snapshot of all template documents
+ * taken somewhere in the beginning of Feb 2023.
+ *
+ * The script maintains a simple cache of all requests to the AI to save on AI requests.
+ *
+ * USAGE:
+ *  OPENAI_API_KEY=<my_openai_api_key> node core/test/formula-dataset/runCompletion.js
+ *
+ *  # WITH VERBOSE:
+ *  VERBOSE=1 OPENAI_API_KEY=<my_openai_api_key> node core/test/formula-dataset/runCompletion.js
+ *
+ *  # to reset cache (the cache is a directory holding one JSON file per request)
+ *  rm -r core/test/formula-dataset/data/cache
+ */
+
+
+import { ActiveDoc } from "app/server/lib/ActiveDoc";
+import { DEPS } from "app/server/lib/Assistance";
+import log from 'app/server/lib/log';
+import crypto from 'crypto';
+import parse from 'csv-parse/lib/sync';
+import fetch, {RequestInfo, RequestInit, Response} from 'node-fetch';
+import * as fs from "fs";
+import JSZip from "jszip";
+import { isEqual, MapCache } from "lodash";
+import path from 'path';
+import * as os from 'os';
+import { pipeline } from 'stream';
+import { createDocTools } from "test/server/docTools";
+import { promisify } from 'util';
+
+const streamPipeline = promisify(pipeline);  // promise-returning form of stream.pipeline
+
+const DATA_PATH = process.env.DATA_PATH || path.join(__dirname, 'data');  // env override, else ./data
+const PATH_TO_DOC = path.join(DATA_PATH, 'templates');  // directory the <doc_id>.grist files are loaded from
+const PATH_TO_CSV = path.join(DATA_PATH, 'formula-dataset-index.csv');  // dataset index file
+const PATH_TO_CACHE = path.join(DATA_PATH, 'cache');  // directory used by JsonCache
+const TEMPLATE_URL = "https://grist-static.com/datasets/grist_dataset_formulai_2023_02_20.zip";
+
+const oldFetch = DEPS.fetch;  // original DEPS.fetch, restored at the end of runCompletion()
+
+interface FormulaRec {  // one parsed row of formula-dataset-index.csv (fields match its header)
+  table_id: string;  // table that holds the column under test
+  col_id: string;  // column whose values are compared before/after
+  doc_id: string;  // base name of the template document (<doc_id>.grist)
+  Description: string;  // free-text description passed to getAssistance()
+}
+
+const _stats = {
+  callCount: 0,  // incremented by fetchWithCache each time a non-cached fetch completes
+};
+
+
+/**
+ * Downloads the template archive from TEMPLATE_URL into a fresh temporary directory and extracts
+ * its top-level entries into PATH_TO_DOC.
+ *
+ * PATH_TO_DOC is created only after the archive has been downloaded and every entry has been read
+ * into memory. A failed download or a corrupt archive therefore does not leave behind an empty
+ * directory that the next run would mistake for an already-populated dataset.
+ *
+ * Throws if the HTTP response is not ok, or if any download/extraction step fails.
+ */
+async function downloadTemplates(): Promise<void> {
+  // create tempdir
+  const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'grist-templates-'));
+  const destPath = path.join(dir, 'template.zip');
+
+  // start downloading
+  console.log(
+    `source url: ${TEMPLATE_URL}\n` +
+      `destination: ${destPath}\n` +
+      `download...`
+  );
+  const response = await fetch(TEMPLATE_URL);
+  if (!response.ok) { throw new Error(`unexpected response ${response.statusText}`); }
+  await streamPipeline(response.body, fs.createWriteStream(destPath));
+  console.log('done!\n\n' +
+              'start extraction...');
+
+  // read every top-level entry of the archive before touching PATH_TO_DOC
+  const data = fs.readFileSync(destPath);
+  const zip = await JSZip.loadAsync(data);
+  const extracted: Array<[string, Buffer]> = [];
+  for (const filename of Object.keys(zip.files)) {
+    // entries whose name contains a slash (nested paths, directories) are skipped
+    if (filename.includes('/')) { continue; }
+    extracted.push([filename, await zip.files[filename].async('nodebuffer')]);
+  }
+
+  // everything that can fail on the network/zip side is done: now create and fill the directory
+  fs.mkdirSync(PATH_TO_DOC, {recursive: true});
+  for (const [filename, fileBuffer] of extracted) {
+    fs.writeFileSync(path.join(PATH_TO_DOC, filename), fileBuffer);
+  }
+  console.log(
+    `Successfully extracted ${extracted.length} template files to ${PATH_TO_DOC}`
+  );
+}
+
+/**
+ * Runs the evaluation over every record of the index file.
+ *
+ * For each record it loads the document (reusing the current one when the doc id is unchanged),
+ * snapshots the column values, asks for assistance, applies the suggested actions, reads the
+ * column values again, reverts the change, and counts the record as successful when the values
+ * are equal. While running, DEPS.fetch is replaced by fetchWithCache; it is restored at the end.
+ *
+ * Any error thrown while processing a record aborts the run (the summary is still printed).
+ */
+export async function runCompletion() {
+
+  // if template directory not exists, download and extract the templates
+  if (!fs.existsSync(PATH_TO_DOC)) {
+    await downloadTemplates();
+  }
+
+  const content = fs.readFileSync(PATH_TO_CSV, {encoding: 'utf8'});
+  const records = parse(content, {columns: true}) as FormulaRec[];
+
+  // let's group by doc id to save on document loading time
+  records.sort((a, b) => a.doc_id.localeCompare(b.doc_id));
+
+  if (!process.env.VERBOSE) {
+    log.transports.file.level = 'error';  // Suppress most of log output.
+  }
+  let activeDoc: ActiveDoc|undefined;
+  const docTools = createDocTools();
+  const session = docTools.createFakeSession('owners');
+  await docTools.before();
+  let successCount = 0;
+
+  console.log('Testing AI assistance: ');
+
+  try {
+
+    DEPS.fetch = fetchWithCache;
+
+    for (const rec of records) {
+
+      // load new document
+      if (!activeDoc || activeDoc.docName !== rec.doc_id) {
+        const docPath = path.join(PATH_TO_DOC, rec.doc_id + '.grist');
+        activeDoc = await docTools.loadLocalDoc(docPath);
+        await activeDoc.waitForInitialization();
+      }
+
+      // get values
+      await activeDoc.docData!.fetchTable(rec.table_id);
+      const expected = activeDoc.docData!.getTable(rec.table_id)!.getColValues(rec.col_id)!.slice();
+
+      // send prompt
+      const tableId = rec.table_id;
+      const colId = rec.col_id;
+      const description = rec.Description;
+      const {suggestedActions} = await activeDoc.getAssistance(session, {tableId, colId, description});
+
+      // apply modification
+      const {actionNum} = await activeDoc.applyUserActions(session, suggestedActions);
+
+      // get new values
+      const newValues = activeDoc.docData!.getTable(rec.table_id)!.getColValues(rec.col_id)!.slice();
+
+      // revert modification
+      const [bundle] = await activeDoc.getActions([actionNum]);
+      await activeDoc.applyUserActionsById(session, [bundle!.actionNum], [bundle!.actionHash!], true);
+
+      // compare values
+      const success = isEqual(expected, newValues);
+
+      console.log(` ${success ? 'Successfully' : 'Failed to'} complete formula ` +
+        `for column ${rec.table_id}.${rec.col_id} (doc=${rec.doc_id})`);
+
+      if (success) {
+        successCount++;
+      } else {
+        // TODO: log the difference between expected and actual, similar to what mocha does on
+        // failure.
+        // console.log('expected=', expected);
+        // console.log('actual=', newValues);
+      }
+    }
+  } finally {
+    await docTools.after();
+    log.transports.file.level = 'debug';
+    printStats();
+    DEPS.fetch = oldFetch;
+    console.log(
+      `AI Assistance completed ${successCount} successful prompt on a total of ${records.length};`
+    );
+  }
+}
+
+/** Starts the evaluation; a rejection is printed with console.error rather than rethrown. */
+export function main() {
+  runCompletion().catch((err) => console.error(err));
+}
+
+/** Prints the number of requests counted in _stats.callCount. */
+function printStats() {
+  const {callCount} = _stats;
+  console.log(`Ai assistance requests stats: ${callCount} calls`);
+}
+
+/**
+ * A minimal file-backed cache. Each entry is stored as its own JSON file under PATH_TO_CACHE; the
+ * file name is the md5 hash of the key. Only get/has/set are supported.
+ */
+class JsonCache implements MapCache {
+  constructor() {
+    // Make sure the cache directory is there before any read or write.
+    const cacheDirMissing = !fs.existsSync(PATH_TO_CACHE);
+    if (cacheDirMissing) {
+      fs.mkdirSync(path.join(PATH_TO_CACHE), {recursive: true});
+    }
+  }
+
+  /** Returns the stored response body re-serialized as a JSON string, or undefined if absent. */
+  public get(key: string): any {
+    if (this.has(key)) {
+      const raw = fs.readFileSync(this._path(key), 'utf8');
+      const {responseBody} = JSON.parse(raw);
+      return JSON.stringify(responseBody);
+    }
+    return undefined;
+  }
+
+  /** True when a cache file exists for this key. */
+  public has(key: string): boolean {
+    const entryPath = this._path(key);
+    return fs.existsSync(entryPath);
+  }
+
+  /** Parses `value` as JSON and writes it, together with the key, to the entry's file. */
+  public set(key: string, value: any): JsonCache {
+    const responseBody = JSON.parse(value);
+    const serialized = JSON.stringify({requestBody: key, responseBody});
+    fs.writeFileSync(this._path(key), serialized);
+    return this;
+  }
+
+  public clear(): void {
+    throw new Error('not implemented');
+  }
+
+  public delete(_key: string): boolean {
+    throw new Error('not implemented');
+  }
+
+  /** Location of the file backing `key`. */
+  private _path(key: string) {
+    const fileName = this._hash(key) + '.json';
+    return path.join(PATH_TO_CACHE, fileName);
+  }
+
+  /** Hex-encoded md5 digest of `key`. */
+  private _hash(key: string) {
+    const hasher = crypto.createHash('md5');
+    hasher.update(key);
+    return hasher.digest('hex');
+  }
+}
+
+// Cache of status-200 responses, persisted on the filesystem.
+const _cache = new JsonCache();
+// Requests currently in flight, keyed like the cache. An entry is removed as soon as its request
+// settles, so a retry after a failure sends a new request instead of reusing the failed one.
+const _queue = new Map<string, Promise<{status: number, statusText: string, body: string}>>();
+
+/**
+ * Calls fetch and uses caching.
+ *
+ * The cache key is built from the url and the request body. A cached entry is served as a
+ * status-200 response. Otherwise the request is sent once; callers asking for the same key while
+ * it is in flight share its outcome, and each of them receives its own Response object carrying
+ * the real status and body text. Only status-200 responses are written to the cache.
+ *
+ * Note: the returned Response is rebuilt from the status and body text, so response headers are
+ * not preserved.
+ */
+async function fetchWithCache(rinfo: RequestInfo, init?: RequestInit): Promise<Response>
+async function fetchWithCache(rinfo: any, init?: RequestInit): Promise<Response> {
+  const url: string = rinfo.url || rinfo.href || rinfo;
+  const hash = JSON.stringify({url, body: init?.body});
+  if (_cache.has(hash)) { return new Response(_cache.get(hash), {status: 200}); }
+
+  let pending = _queue.get(hash);
+  if (!pending) {
+    pending = (async () => {
+      const response = await fetch(url, init);
+      _stats.callCount++;
+      // Read the body once here; every caller then builds its own Response from the text.
+      const body = await response.text();
+      if (response.status === 200) { _cache.set(hash, body); }
+      return {status: response.status, statusText: response.statusText, body};
+    })();
+    _queue.set(hash, pending);
+    // Registered before any caller awaits `pending`, so the entry is gone by the time they resume.
+    const forget = () => { _queue.delete(hash); };
+    void pending.then(forget, forget);
+  }
+
+  const {status, statusText, body} = await pending;
+  return new Response(body, {status, statusText});
+}
+
+// ts expect this function
+fetchWithCache.isRedirect = fetch.isRedirect;
--- a/test/server/docTools.ts
+++ b/test/server/docTools.ts
@ -1,3 +1,4 @@
+import {Role} from 'app/common/roles';
 import {getDocWorkerMap} from 'app/gen-server/lib/DocWorkerMap';
 import {ActiveDoc} from 'app/server/lib/ActiveDoc';
 import {DummyAuthorizer} from 'app/server/lib/Authorizer';
@ -82,8 +83,8 @@ export function createDocTools(options: {persistAcrossCases?: boolean,
  const systemSession = makeExceptionalDocSession('system');
  return {
    /** create a fake session for use when applying user actions to a document */
-    createFakeSession(): DocSession {
-      return {client: null, authorizer: new DummyAuthorizer('editors', 'doc')} as any as DocSession;
+    createFakeSession(role: Role = 'editors'): DocSession {
+      return {client: null, authorizer: new DummyAuthorizer(role, 'doc')} as any as DocSession;
    },

    /** create a throw-away, empty document for testing purposes */