(core) Porting the AI evaluation script

Summary: Ports the script that runs an evaluation against our formula dataset. To test, you need an OpenAI API key (see here: https://platform.openai.com/) or a Hugging Face key (it should work as well); then check out the branch and run `OPENAI_API_KEY=<my_openai_api_key> node core/test/formula-dataset/runCompletion.js`. Test Plan: Needs manual testing; so far there is no plan to make it part of CI. The current score is somewhere around 34 successful prompts out of a total of 47. Reviewers: paulfitz Reviewed By: paulfitz Subscribers: jarek Differential Revision: https://phab.getgrist.com/D3816
2025-06-13 20:53:59 +00:00 · 2023-03-15 09:52:17 +01:00 · 2023-03-15 09:52:17 +01:00 · 1ff93f89c2
commit 1ff93f89c2
parent 2b2e19c5b5
8 changed files with 377 additions and 37 deletions
--- a/app/common/gutil.ts
+++ b/app/common/gutil.ts
@ -951,7 +951,7 @@ export function assertIsDefined<T>(name: string, value: T): asserts value is Non
 * Calls function `fn`, passes any thrown errors to function `recover`, and finally calls `fn`
 * once more if `recover` doesn't throw.
 */
- export async function retryOnce<T>(fn: () => Promise<T>, recover: (e: unknown) => Promise<void>): Promise<T> {
+export async function retryOnce<T>(fn: () => Promise<T>, recover: (e: unknown) => Promise<void>): Promise<T> {
  try {
    return await fn();
  } catch (e) {
@ -964,7 +964,7 @@ export function assertIsDefined<T>(name: string, value: T): asserts value is Non
 * Checks if value is 'empty' (like null, undefined, empty string, empty array/set/map, empty object).
 * Values like 0, true, false are not empty.
 */
- export function notSet(value: any) {
+export function notSet(value: any) {
  return value === undefined || value === null || value === ''
         || (Array.isArray(value) && !value.length)
         || (typeof value === 'object' && !Object.keys(value).length)
--- a/app/server/lib/ActiveDoc.ts
+++ b/app/server/lib/ActiveDoc.ts
@ -34,7 +34,7 @@ import {
  TransformRule
 } from 'app/common/ActiveDocAPI';
 import {ApiError} from 'app/common/ApiError';
-import {mapGetOrSet, MapWithTTL} from 'app/common/AsyncCreate';
+import {asyncOnce, mapGetOrSet, MapWithTTL} from 'app/common/AsyncCreate';
 import {AttachmentColumns, gatherAttachmentIds, getAttachmentColumns} from 'app/common/AttachmentColumns';
 import {
  BulkAddRecord,
@ -230,6 +230,11 @@ export class ActiveDoc extends EventEmitter {
  private _inactivityTimer = new InactivityTimer(() => this.shutdown(), Deps.ACTIVEDOC_TIMEOUT * 1000);
  private _recoveryMode: boolean = false;
  private _shuttingDown: boolean = false;
  private _afterShutdownCallback?: () => Promise<void>;
  // catch & report error so that asyncOnce does not get cleared.
  private _doShutdown = asyncOnce(
    () => this._doShutdownImpl().catch((e) => log.error('Uncaught shutdown error', e))
  );
  /**
   * In cases where large numbers of documents are restarted simultaneously
@ -493,6 +498,14 @@ export class ActiveDoc extends EventEmitter {
  public async shutdown(options: {
    afterShutdown?: () => Promise<void>
  } = {}): Promise<void> {
    // Store the callback on the instance instead of passing it along: the shutdown work runs
    // through the argument-less `_doShutdown` wrapper, and `_doShutdownImpl` reads the stored
    // callback when it is done. A later call that supplies a callback replaces the stored one.
    if (options.afterShutdown) {
      this._afterShutdownCallback = options.afterShutdown;
    }
    // NOTE(review): `_doShutdown` wraps `_doShutdownImpl` in `asyncOnce`, presumably so that
    // overlapping calls share a single shutdown run -- confirm against `asyncOnce`.
    await this._doShutdown();
  }
  private async _doShutdownImpl(): Promise<void> {
    const docSession = makeExceptionalDocSession('system');
    this._log.debug(docSession, "shutdown starting");
    try {
@ -576,7 +589,7 @@ export class ActiveDoc extends EventEmitter {
      } catch (err) {
        this._log.error(docSession, "failed to shutdown some resources", err);
      }
-      await options.afterShutdown?.();
+      await this._afterShutdownCallback?.();
    } finally {
      this._docManager.removeActiveDoc(this);
    }
--- a/app/server/lib/Assistance.ts
+++ b/app/server/lib/Assistance.ts
@ -4,16 +4,25 @@
 import {delay} from 'app/common/delay';
 import log from 'app/server/lib/log';
-import fetch, { Response as FetchResponse} from 'node-fetch';
+import fetch from 'node-fetch';
 export const DEPS = { fetch };
 export async function sendForCompletion(prompt: string): Promise<string> {
  let completion: string|null = null;
-  if (process.env.OPENAI_API_KEY) {
+  let retries: number = 0;
-    completion = await sendForCompletionOpenAI(prompt);
+  while(retries++ < 3) {
-  }
+    try {
-  if (process.env.HUGGINGFACE_API_KEY) {
+      if (process.env.OPENAI_API_KEY) {
-    completion = await sendForCompletionHuggingFace(prompt);
+        completion = await sendForCompletionOpenAI(prompt);
      }
      if (process.env.HUGGINGFACE_API_KEY) {
        completion = await sendForCompletionHuggingFace(prompt);
      }
      break;
    } catch(e) {
      await delay(1000);
    }
  }
  if (completion === null) {
    throw new Error("Please set OPENAI_API_KEY or HUGGINGFACE_API_KEY (and optionally COMPLETION_MODEL)");
@ -29,7 +38,7 @@ async function sendForCompletionOpenAI(prompt: string) {
  if (!apiKey) {
    throw new Error("OPENAI_API_KEY not set");
  }
-  const response = await fetch(
+  const response = await DEPS.fetch(
    "https://api.openai.com/v1/completions",
    {
      method: "POST",
@ -73,31 +82,27 @@ async function sendForCompletionHuggingFace(prompt: string) {
      completionUrl = 'https://api-inference.huggingface.co/models/NovelAI/genji-python-6B';
    }
  }
-  let retries: number = 0;
+
-  let response!: FetchResponse;
+  const response = await DEPS.fetch(
-  while (retries++ < 3) {
+    completionUrl,
-    response = await fetch(
+    {
-      completionUrl,
+      method: "POST",
-      {
+      headers: {
-        method: "POST",
+        "Authorization": `Bearer ${apiKey}`,
-        headers: {
+        "Content-Type": "application/json",
          "Authorization": `Bearer ${apiKey}`,
          "Content-Type": "application/json",
        },
        body: JSON.stringify({
          inputs: prompt,
          parameters: {
            return_full_text: false,
            max_new_tokens: 50,
          },
        }),
      },
-    );
+      body: JSON.stringify({
-    if (response.status === 503) {
+        inputs: prompt,
-      log.error(`Sleeping for 10s - HuggingFace API returned ${response.status}: ${await response.text()}`);
+        parameters: {
-      await delay(10000);
+          return_full_text: false,
-      continue;
+          max_new_tokens: 50,
-    }
+        },
      }),
    },
  );
  if (response.status === 503) {
    log.error(`Sleeping for 10s - HuggingFace API returned ${response.status}: ${await response.text()}`);
    await delay(10000);
  }
  if (response.status !== 200) {
    const text = await response.text();
--- a/test/formula-dataset/.gitignore
+++ b/test/formula-dataset/.gitignore
@ -0,0 +1,2 @@
 data/templates
 data/cache
--- a/test/formula-dataset/data/formula-dataset-index.csv
+++ b/test/formula-dataset/data/formula-dataset-index.csv
@ -0,0 +1,55 @@
 table_id,col_id,doc_id,Description
 Contacts,Send_Email,hQHXqAQXceeQBPvRw5sSs1,"Link to compose an email, if there is one"
 Tasks,Today,hQHXqAQXceeQBPvRw5sSs1,Needs to be done today (or every day)
 Tasks,Week_Day,hQHXqAQXceeQBPvRw5sSs1,Full name of deadline weekday
 Expenses,Month,55Q2EtTbFvB1N6iizLh4Rk,e.g. 2022-01
 Payroll,Date_Range,5pHLanQNThxkEaEJHKJUf5,"The start date, followed by a dash (no spaces) and the end date if there is one. Dates are month/day with no leading zeroes."
 Payroll,Payment,5pHLanQNThxkEaEJHKJUf5,"Total payment amount for hours worked, rounded to the nearest cent."
 Payroll_summary_Pay_Period_Person,Dates,5pHLanQNThxkEaEJHKJUf5,"All date ranges in the group, separated by a comma and a space"
 People,Full_Name,5pHLanQNThxkEaEJHKJUf5,"e.g. Doe, John"
 General_Ledger,Quarter,2YwYBWpREY2a1N2NV7cb55,e.g. 2020 Q4
 General_Ledger,Year,2YwYBWpREY2a1N2NV7cb55,"Just the year of the date, as a string"
 Time_Calculator,Time_Worked,np7TVHmuvFcHmo1K8h7Ur4,Formatted as hours:minutes. No leading zeroes for hours.
 Time_Calculator,Seconds_Worked,np7TVHmuvFcHmo1K8h7Ur4,"Number of seconds between start/end times, if they're both there"
 Funding_Source_summary,Debt_to_Equity,qprycQa2TVwajAe6Hb3bUZ,Ratio of the total amounts in the group where the type is Debt vs Equity
 Invoices,Client,bReAxyLmzmEQfHF5L5Sc1e,Client's name followed by their address on the next line
 Invoices,Due,bReAxyLmzmEQfHF5L5Sc1e,30 days after the invoice date
 Invoices,Invoice_ID,bReAxyLmzmEQfHF5L5Sc1e,Invoice date followed by the client's name in brackets
 Projects,Project_Name,bReAxyLmzmEQfHF5L5Sc1e,"Client name and project name, e.g. John Doe: Big project"
 Time_Log,Date,bReAxyLmzmEQfHF5L5Sc1e,Start date if there is one
 Time_Log,Duration_hrs_,bReAxyLmzmEQfHF5L5Sc1e,Duration (if there is one) in hours rounded to two decimal places
 Time_Log,Duration_min_,bReAxyLmzmEQfHF5L5Sc1e,"Number of minutes between start and end time. If either time is missing, leave blank. If end is before start, give 0."
 Filtered_By_Formula,LabelCount,9nNr9uQwoXWAvxcWQDygh6,"1 if the state is CA, otherwise 0"
 Objects,Address,pyMHqncEspfZN5zfShCwT8,"City and state, separated by comma space"
 Books,search_terms,hdXy57qLiyNf35oNLzzgBG,"Title and author name, with a space in between"
 BOM_Items,Cost,e4gEm7dt4cgBMkouVBNMeY,Total cost if both quantity and cost are given
 Bill_Of_Materials,Cost,e4gEm7dt4cgBMkouVBNMeY,Total cost
 All_Responses,Entry,qvND7WUcuNb2fU4n1vBJ7f,"Name and submitted date in the format ""Name - month-day"""
 All_Responses,Month,qvND7WUcuNb2fU4n1vBJ7f,Submitted month (full name) and year
 Cap_Table,Common_Stock,iXggjrCPHut9u2BuhJxJkk,"If the class is Options, RSUs, or Option Pool, return 0, otherwise return the fully diluted value."
 Cap_Table,Fully_Diluted,iXggjrCPHut9u2BuhJxJkk,"The granted amount, minus the total pool used if the class is Option Pool"
 Cap_Table,Fully_Diluted_,iXggjrCPHut9u2BuhJxJkk,Fully diluted as a fraction of the total
 Classes,Spots_Left,swLvb3Fic22gVzrdczcAoZ,or Full
 All_Survey_Responses,Product_Experience_Score,4ktYzGV1mUipSiQFtkLGqm,"A number based on the experience:
 Very Dissatisfied: 1
 Somewhat Dissatisfied: 2
 Neutral: 3
 Somewhat Satisfied: 4
 Very Satisfied: 5"
 Time_Sheet_Entries_summary_Account_Employee_Month,Total_Spend,oGxD8EnzeVs6vSQK3QBrUv,Total hours worked times hourly rate
 Time_Sheets,Title,oGxD8EnzeVs6vSQK3QBrUv,Month number and employee full name separated by a space
 All_Products,SKU,sXsBGDTKau1F3fvxkCyoaJ,"Brand code, color code, and size, separated by dashes without spaces"
 All_Products,Stock_Alert,sXsBGDTKau1F3fvxkCyoaJ,"If the amount in stock and on order is more than 5: In Stock
 If it's 0: OUT OF STOCK
 Otherwise: Low Stock"
 Incoming_Order_Line_Items,Received_Qty,sXsBGDTKau1F3fvxkCyoaJ,"The quantity, but only if the order is received"
 Theaters,Latitude2,dKztiPYamcCpttT1LT1FnU,Coordinate before the comma
 Theaters,Longitude,dKztiPYamcCpttT1LT1FnU,Coordinate after the comma and space
 Families,Amount_Due,cJcSKdUC3nLNAv4wTjAxA6,"Total charged minus total paid, capped at 0"
 Gifts_summary_Occasion_Who_Year,Over_Budget_,dr6epxpXUcy9rsFVUoXTEe,Did we spend more than the budget for this person?
 Apartments,Have_Picture,5iMYwmESm33JpEECSqdZk2,Yes or No depending on if there's a picture
 Leases,Lease_End_Date,5iMYwmESm33JpEECSqdZk2,Start date plus the lease term in years minus one day
 Tenancies,Minor,5iMYwmESm33JpEECSqdZk2,"1 if the age is less than 18, otherwise 0"
 Game_Schedule,Loser,1xJAp2uxM7tFCVUbEofKoF,The team that won fewer sets
 Standings,Win_Rate,1xJAp2uxM7tFCVUbEofKoF,Ratio of wins to total games
 Prepare_Invoices,Due,9NH6D58FmxwPP43nw7uzQK,One month after the issued date if there is one
--- a/test/formula-dataset/runCompletion.js
+++ b/test/formula-dataset/runCompletion.js
@ -0,0 +1,12 @@
 #!/usr/bin/env node
 "use strict";

 // Launcher for the AI-assistance evaluation. It points DATA_PATH at the data directory next to
 // this file, registers the compiled output directories as module roots, and then runs the
 // compiled implementation module.
 const path = require('path');
 const appModulePath = require('app-module-path');

 // Three levels above this file's directory.
 const codeRoot = path.dirname(path.dirname(path.dirname(__dirname)));

 // The implementation reads DATA_PATH to locate the dataset index, templates and cache.
 process.env.DATA_PATH = path.join(__dirname, 'data');

 for (const parts of [['_build'], ['_build', 'core'], ['_build', 'ext']]) {
  appModulePath.addPath(path.join(codeRoot, ...parts));
 }

 require('test/formula-dataset/runCompletion_impl').runCompletion().catch((err) => {
  console.error(err);
  // Report the failure to the calling shell instead of exiting with status 0.
  process.exitCode = 1;
 });
--- a/test/formula-dataset/runCompletion_impl.ts
+++ b/test/formula-dataset/runCompletion_impl.ts
@ -0,0 +1,252 @@
 /**
 * This module holds an evaluation script for AI assistance. It tests AI assistance on the formula
 * dataset. The formula dataset is made of an index file (formula-dataset-index.csv) and a list of
 * Grist documents hosted on S3. A row in the index file references one column (doc_id, table_id,
 * col_id) among these documents, together with a free-text description.
 *
 * For each entry of the dataset, the script loads the document, requests assistance based on the
 * description, and applies the suggested actions to the document. Then it compares the column
 * values before and after. Finally it reverts the modification.
 *
 * The list of Grist documents for the formula dataset is a snapshot of all template documents
 * taken around the beginning of Feb 2023.
 *
 * The script maintains a simple cache of all requests to the AI service, to save on AI requests.
 *
 * USAGE:
 *  OPENAI_API_KEY=<my_openai_api_key> node core/test/formula-dataset/runCompletion.js
 *
 *  # WITH VERBOSE:
 *  VERBOSE=1 OPENAI_API_KEY=<my_openai_api_key> node core/test/formula-dataset/runCompletion.js
 *
 *  # to reset cache
 *  rm -r core/test/formula-dataset/data/cache
 */
 import { ActiveDoc } from "app/server/lib/ActiveDoc";
 import { DEPS } from "app/server/lib/Assistance";
 import log from 'app/server/lib/log';
 import crypto from 'crypto';
 import parse from 'csv-parse/lib/sync';
 import fetch, {RequestInfo, RequestInit, Response} from 'node-fetch';
 import * as fs from "fs";
 import JSZip from "jszip";
 import { isEqual, MapCache } from "lodash";
 import path from 'path';
 import * as os from 'os';
 import { pipeline } from 'stream';
 import { createDocTools } from "test/server/docTools";
 import { promisify } from 'util';
 const streamPipeline = promisify(pipeline);
 const DATA_PATH = process.env.DATA_PATH || path.join(__dirname, 'data');
 const PATH_TO_DOC = path.join(DATA_PATH, 'templates');
 const PATH_TO_CSV = path.join(DATA_PATH, 'formula-dataset-index.csv');
 const PATH_TO_CACHE = path.join(DATA_PATH, 'cache');
 const TEMPLATE_URL = "https://grist-static.com/datasets/grist_dataset_formulai_2023_02_20.zip";
 const oldFetch = DEPS.fetch;
 interface FormulaRec {
  table_id: string;
  col_id: string;
  doc_id: string;
  Description: string;
 }
 const _stats = {
  callCount: 0,
 };
 /**
 * Downloads the template archive from TEMPLATE_URL into a temporary directory and extracts its
 * top-level files into PATH_TO_DOC.
 *
 * PATH_TO_DOC is only created once the archive has been downloaded and every entry has been
 * decompressed. A failed download therefore does not leave an empty directory behind, which
 * would make later runs skip the download. The temporary download directory is always removed.
 */
 async function downloadTemplates(): Promise<void> {
  const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'grist-templates-'));
  const destPath = path.join(dir, 'template.zip');
  try {
    // start downloading
    console.log(
      `source url: ${TEMPLATE_URL}\n` +
        `destination: ${destPath}\n` +
        `download...`
    );
    const response = await fetch(TEMPLATE_URL);
    if (!response.ok) { throw new Error(`unexpected response ${response.statusText}`); }
    await streamPipeline(response.body, fs.createWriteStream(destPath));
    console.log('done!\n\n' +
                'start extraction...');

    // Decompress everything first, so that a corrupt archive fails before anything is written.
    const zip = await JSZip.loadAsync(fs.readFileSync(destPath));
    const entries: Array<[string, Buffer]> = [];
    for (const filename of Object.keys(zip.files)) {
      // Only top-level entries are extracted.
      if (filename.includes('/')) { continue; }
      entries.push([filename, await zip.files[filename].async('nodebuffer')]);
    }

    fs.mkdirSync(PATH_TO_DOC, {recursive: true});
    for (const [filename, fileBuffer] of entries) {
      fs.writeFileSync(path.join(PATH_TO_DOC, filename), fileBuffer);
    }
    console.log(
      `Successfully extracted ${entries.length} template files to ${PATH_TO_DOC}`
    );
  } finally {
    // Best-effort cleanup: a cleanup failure must not mask the original error.
    try {
      if (fs.existsSync(destPath)) { fs.unlinkSync(destPath); }
      fs.rmdirSync(dir);
    } catch (e) {
      console.warn(`could not remove temporary directory ${dir}`, e);
    }
  }
 }

 /**
 * Runs the evaluation over every record of the dataset index.
 *
 * For each record it loads the document, asks for assistance for the described column, applies
 * the suggested actions, compares the column values with the ones seen before the change, and
 * reverts the change. Requests made through `DEPS.fetch` go through `fetchWithCache` for the
 * duration of the run. A summary is printed at the end, including when the run aborts on an
 * error.
 */
 export async function runCompletion() {
  // if the template directory does not exist, download and extract the templates
  if (!fs.existsSync(PATH_TO_DOC)) {
    await downloadTemplates();
  }

  const content = fs.readFileSync(PATH_TO_CSV, {encoding: 'utf8'});
  const records = parse(content, {columns: true}) as FormulaRec[];

  // let's group by doc id to save on document loading time
  records.sort((a, b) => a.doc_id.localeCompare(b.doc_id));

  if (!process.env.VERBOSE) {
    log.transports.file.level = 'error';  // Suppress most of log output.
  }
  let activeDoc: ActiveDoc|undefined;
  const docTools = createDocTools();
  const session = docTools.createFakeSession('owners');
  await docTools.before();
  let successCount = 0;

  console.log('Testing AI assistance: ');

  try {
    DEPS.fetch = fetchWithCache;

    for (const rec of records) {
      // load new document
      if (!activeDoc || activeDoc.docName !== rec.doc_id) {
        const docPath = path.join(PATH_TO_DOC, rec.doc_id + '.grist');
        activeDoc = await docTools.loadLocalDoc(docPath);
        await activeDoc.waitForInitialization();
      }

      // get values
      await activeDoc.docData!.fetchTable(rec.table_id);
      const expected = activeDoc.docData!.getTable(rec.table_id)!.getColValues(rec.col_id)!.slice();

      // send prompt
      const tableId = rec.table_id;
      const colId = rec.col_id;
      const description = rec.Description;
      const {suggestedActions} = await activeDoc.getAssistance(session, {tableId, colId, description});

      // apply modification
      const {actionNum} = await activeDoc.applyUserActions(session, suggestedActions);

      // get new values
      const newValues = activeDoc.docData!.getTable(rec.table_id)!.getColValues(rec.col_id)!.slice();

      // revert modification
      const [bundle] = await activeDoc.getActions([actionNum]);
      await activeDoc.applyUserActionsById(session, [bundle!.actionNum], [bundle!.actionHash!], true);

      // compare values
      const success = isEqual(expected, newValues);

      console.log(` ${success ? 'Successfully' : 'Failed to'} complete formula ` +
        `for column ${rec.table_id}.${rec.col_id} (doc=${rec.doc_id})`);

      if (success) {
        successCount++;
      } else {
        // TODO: log the difference between expected and actual, similar to what mocha does on
        // failure.
        // console.log('expected=', expected);
        // console.log('actual=', newValues);
      }
    }
  } finally {
    try {
      await docTools.after();
    } finally {
      // Runs even if the teardown above throws, so that the fetch hook is always restored and
      // the summary is always printed.
      log.transports.file.level = 'debug';
      printStats();
      DEPS.fetch = oldFetch;
      console.log(
        `AI Assistance completed ${successCount} successful prompt on a total of ${records.length};`
      );
    }
  }
 }
 /**
 * Entry point: starts the evaluation and reports a rejection on stderr.
 */
 export function main() {
  const run = runCompletion();
  run.catch((err) => console.error(err));
 }
 /**
 * Prints how many requests were counted in `_stats`.
 */
 function printStats() {
  const {callCount} = _stats;
  console.log(`Ai assistance requests stats: ${callCount} calls`);
 }
 /**
 * Implements a simple cache that read/write from filesystem.
 */
 class JsonCache implements MapCache {
  /** Makes sure the cache directory exists. */
  constructor() {
    if (fs.existsSync(PATH_TO_CACHE)) { return; }
    fs.mkdirSync(path.join(PATH_TO_CACHE), {recursive: true});
  }

  /** Tells whether an entry file exists for `key`. */
  public has(key: string): boolean {
    const file = this._path(key);
    return fs.existsSync(file);
  }

  /** Returns the stored response body re-serialized as a JSON string, or undefined if absent. */
  public get(key: string): any {
    if (this.has(key)) {
      const raw = fs.readFileSync(this._path(key), 'utf8');
      const {responseBody} = JSON.parse(raw);
      return JSON.stringify(responseBody);
    }
    return undefined;
  }

  /** Stores `value` (a JSON string) together with its key in the entry file for `key`. */
  public set(key: string, value: any): JsonCache {
    const responseBody = JSON.parse(value);
    const serialized = JSON.stringify({requestBody: key, responseBody});
    fs.writeFileSync(this._path(key), serialized);
    return this;
  }

  public clear(): void {
    throw new Error('not implemented');
  }

  public delete(_key: string): boolean {
    throw new Error('not implemented');
  }

  /** Location of the entry file for `key`: the md5 hex digest of the key, plus `.json`. */
  private _path(key: string) {
    const fileName = `${this._hash(key)}.json`;
    return path.join(PATH_TO_CACHE, fileName);
  }

  private _hash(key: string) {
    const digest = crypto.createHash('md5');
    digest.update(key);
    return digest.digest('hex');
  }
 }
 /**
 * Calls fetch and uses caching.
 */
 const _cache = new JsonCache();
 const _queue = new Map<string, any>();
 /**
 * Drop-in replacement for `fetch` that serves status-200 responses from the on-disk cache.
 *
 * The cache key is built from the url and the request body. Only status-200 responses are
 * stored; a cached hit is returned as a bare status-200 response carrying the stored body.
 *
 * While a request for a key is in flight, other calls for the same key wait for it to settle.
 * They are then served from the cache if it got stored, and otherwise issue a request of their
 * own. The in-flight entry is removed as soon as the request settles, so a failed or non-200
 * attempt never affects later retries.
 */
 async function fetchWithCache(rinfo: RequestInfo, init?: RequestInit): Promise<Response>
 async function fetchWithCache(rinfo: any, init?: RequestInit): Promise<Response> {
  const url: string = rinfo.url || rinfo.href || rinfo;
  const hash = JSON.stringify({url, body: init?.body});

  for (;;) {
    if (_cache.has(hash)) { return new Response(_cache.get(hash), {status: 200}); }
    const inFlight: Promise<unknown>|undefined = _queue.get(hash);
    if (!inFlight) { break; }
    // Wait for the request to settle, whatever the outcome, and then look at the cache again.
    const ignore = () => undefined;
    await inFlight.then(ignore, ignore);
  }

  const request = (async (): Promise<Response> => {
    const response = await fetch(url, init);
    _stats.callCount++;
    if (response.status === 200) {
      _cache.set(hash, await response.clone().text()); // response cannot be read twice, hence clone
    }
    return response;
  })();
  _queue.set(hash, request);

  // Registered before any waiter can see the entry, so it runs before waiters resume.
  const release = () => {
    if (_queue.get(hash) === request) { _queue.delete(hash); }
  };
  void request.then(release, release);

  return request;
 }
 // ts expect this function
 fetchWithCache.isRedirect = fetch.isRedirect;
--- a/test/server/docTools.ts
+++ b/test/server/docTools.ts
@ -1,3 +1,4 @@
 import {Role} from 'app/common/roles';
 import {getDocWorkerMap} from 'app/gen-server/lib/DocWorkerMap';
 import {ActiveDoc} from 'app/server/lib/ActiveDoc';
 import {DummyAuthorizer} from 'app/server/lib/Authorizer';
@ -82,8 +83,8 @@ export function createDocTools(options: {persistAcrossCases?: boolean,
  const systemSession = makeExceptionalDocSession('system');
  return {
    /** create a fake session for use when applying user actions to a document */
-    createFakeSession(): DocSession {
+    createFakeSession(role: Role = 'editors'): DocSession {
-      return {client: null, authorizer: new DummyAuthorizer('editors', 'doc')} as any as DocSession;
+      return {client: null, authorizer: new DummyAuthorizer(role, 'doc')} as any as DocSession;
    },
    /** create a throw-away, empty document for testing purposes */