From 1ff93f89c237ec2587b9a94eec327104987edabe Mon Sep 17 00:00:00 2001 From: Cyprien P Date: Wed, 15 Mar 2023 09:52:17 +0100 Subject: [PATCH] (core) Porting the AI evaluation script Summary: Porting script that runs an evaluation against our formula dataset. To test you need an openai key (see here: https://platform.openai.com/) or hugging face (it should work as well), then check out the branch and run `OPENAI_API_KEY= node core/test/formula-dataset/runCompletion.js` Test Plan: Needs manual testing: so far there is no plan to make it part of CI. The current score is somewhere around 34 successful prompts over a total of 47. Reviewers: paulfitz Reviewed By: paulfitz Subscribers: jarek Differential Revision: https://phab.getgrist.com/D3816 --- app/common/gutil.ts | 4 +- app/server/lib/ActiveDoc.ts | 17 +- app/server/lib/Assistance.ts | 67 ++--- test/formula-dataset/.gitignore | 2 + .../data/formula-dataset-index.csv | 55 ++++ test/formula-dataset/runCompletion.js | 12 + test/formula-dataset/runCompletion_impl.ts | 252 ++++++++++++++++++ test/server/docTools.ts | 5 +- 8 files changed, 377 insertions(+), 37 deletions(-) create mode 100644 test/formula-dataset/.gitignore create mode 100644 test/formula-dataset/data/formula-dataset-index.csv create mode 100644 test/formula-dataset/runCompletion.js create mode 100644 test/formula-dataset/runCompletion_impl.ts diff --git a/app/common/gutil.ts b/app/common/gutil.ts index 7fb99119..4f22d7b5 100644 --- a/app/common/gutil.ts +++ b/app/common/gutil.ts @@ -951,7 +951,7 @@ export function assertIsDefined(name: string, value: T): asserts value is Non * Calls function `fn`, passes any thrown errors to function `recover`, and finally calls `fn` * once more if `recover` doesn't throw. 
*/ - export async function retryOnce(fn: () => Promise, recover: (e: unknown) => Promise): Promise { +export async function retryOnce(fn: () => Promise, recover: (e: unknown) => Promise): Promise { try { return await fn(); } catch (e) { @@ -964,7 +964,7 @@ export function assertIsDefined(name: string, value: T): asserts value is Non * Checks if value is 'empty' (like null, undefined, empty string, empty array/set/map, empty object). * Values like 0, true, false are not empty. */ - export function notSet(value: any) { +export function notSet(value: any) { return value === undefined || value === null || value === '' || (Array.isArray(value) && !value.length) || (typeof value === 'object' && !Object.keys(value).length) diff --git a/app/server/lib/ActiveDoc.ts b/app/server/lib/ActiveDoc.ts index 80fe53d0..bcc32f66 100644 --- a/app/server/lib/ActiveDoc.ts +++ b/app/server/lib/ActiveDoc.ts @@ -34,7 +34,7 @@ import { TransformRule } from 'app/common/ActiveDocAPI'; import {ApiError} from 'app/common/ApiError'; -import {mapGetOrSet, MapWithTTL} from 'app/common/AsyncCreate'; +import {asyncOnce, mapGetOrSet, MapWithTTL} from 'app/common/AsyncCreate'; import {AttachmentColumns, gatherAttachmentIds, getAttachmentColumns} from 'app/common/AttachmentColumns'; import { BulkAddRecord, @@ -230,6 +230,11 @@ export class ActiveDoc extends EventEmitter { private _inactivityTimer = new InactivityTimer(() => this.shutdown(), Deps.ACTIVEDOC_TIMEOUT * 1000); private _recoveryMode: boolean = false; private _shuttingDown: boolean = false; + private _afterShutdownCallback?: () => Promise; + // catch & report error so that asyncOnce does not get cleared. 
+ private _doShutdown = asyncOnce( + () => this._doShutdownImpl().catch((e) => log.error('Uncaught shutdown error', e)) + ); /** * In cases where large numbers of documents are restarted simultaneously @@ -493,6 +498,14 @@ export class ActiveDoc extends EventEmitter { public async shutdown(options: { afterShutdown?: () => Promise } = {}): Promise { + if (options.afterShutdown) { + this._afterShutdownCallback = options.afterShutdown; + } + await this._doShutdown(); + } + + + private async _doShutdownImpl(): Promise { const docSession = makeExceptionalDocSession('system'); this._log.debug(docSession, "shutdown starting"); try { @@ -576,7 +589,7 @@ export class ActiveDoc extends EventEmitter { } catch (err) { this._log.error(docSession, "failed to shutdown some resources", err); } - await options.afterShutdown?.(); + await this._afterShutdownCallback?.(); } finally { this._docManager.removeActiveDoc(this); } diff --git a/app/server/lib/Assistance.ts b/app/server/lib/Assistance.ts index ce994326..804506e7 100644 --- a/app/server/lib/Assistance.ts +++ b/app/server/lib/Assistance.ts @@ -4,16 +4,25 @@ import {delay} from 'app/common/delay'; import log from 'app/server/lib/log'; -import fetch, { Response as FetchResponse} from 'node-fetch'; +import fetch from 'node-fetch'; +export const DEPS = { fetch }; export async function sendForCompletion(prompt: string): Promise { let completion: string|null = null; - if (process.env.OPENAI_API_KEY) { - completion = await sendForCompletionOpenAI(prompt); - } - if (process.env.HUGGINGFACE_API_KEY) { - completion = await sendForCompletionHuggingFace(prompt); + let retries: number = 0; + while(retries++ < 3) { + try { + if (process.env.OPENAI_API_KEY) { + completion = await sendForCompletionOpenAI(prompt); + } + if (process.env.HUGGINGFACE_API_KEY) { + completion = await sendForCompletionHuggingFace(prompt); + } + break; + } catch(e) { + await delay(1000); + } } if (completion === null) { throw new Error("Please set OPENAI_API_KEY or 
HUGGINGFACE_API_KEY (and optionally COMPLETION_MODEL)"); @@ -29,7 +38,7 @@ async function sendForCompletionOpenAI(prompt: string) { if (!apiKey) { throw new Error("OPENAI_API_KEY not set"); } - const response = await fetch( + const response = await DEPS.fetch( "https://api.openai.com/v1/completions", { method: "POST", @@ -73,31 +82,27 @@ async function sendForCompletionHuggingFace(prompt: string) { completionUrl = 'https://api-inference.huggingface.co/models/NovelAI/genji-python-6B'; } } - let retries: number = 0; - let response!: FetchResponse; - while (retries++ < 3) { - response = await fetch( - completionUrl, - { - method: "POST", - headers: { - "Authorization": `Bearer ${apiKey}`, - "Content-Type": "application/json", - }, - body: JSON.stringify({ - inputs: prompt, - parameters: { - return_full_text: false, - max_new_tokens: 50, - }, - }), + + const response = await DEPS.fetch( + completionUrl, + { + method: "POST", + headers: { + "Authorization": `Bearer ${apiKey}`, + "Content-Type": "application/json", }, - ); - if (response.status === 503) { - log.error(`Sleeping for 10s - HuggingFace API returned ${response.status}: ${await response.text()}`); - await delay(10000); - continue; - } + body: JSON.stringify({ + inputs: prompt, + parameters: { + return_full_text: false, + max_new_tokens: 50, + }, + }), + }, + ); + if (response.status === 503) { + log.error(`Sleeping for 10s - HuggingFace API returned ${response.status}: ${await response.text()}`); + await delay(10000); } if (response.status !== 200) { const text = await response.text(); diff --git a/test/formula-dataset/.gitignore b/test/formula-dataset/.gitignore new file mode 100644 index 00000000..a9e1945b --- /dev/null +++ b/test/formula-dataset/.gitignore @@ -0,0 +1,2 @@ +data/templates +data/cache diff --git a/test/formula-dataset/data/formula-dataset-index.csv b/test/formula-dataset/data/formula-dataset-index.csv new file mode 100644 index 00000000..e94b8f41 --- /dev/null +++ 
b/test/formula-dataset/data/formula-dataset-index.csv @@ -0,0 +1,55 @@ +table_id,col_id,doc_id,Description +Contacts,Send_Email,hQHXqAQXceeQBPvRw5sSs1,"Link to compose an email, if there is one" +Tasks,Today,hQHXqAQXceeQBPvRw5sSs1,Needs to be done today (or every day) +Tasks,Week_Day,hQHXqAQXceeQBPvRw5sSs1,Full name of deadline weekday +Expenses,Month,55Q2EtTbFvB1N6iizLh4Rk,e.g. 2022-01 +Payroll,Date_Range,5pHLanQNThxkEaEJHKJUf5,"The start date, followed by a dash (no spaces) and the end date if there is one. Dates are month/day with no leading zeroes." +Payroll,Payment,5pHLanQNThxkEaEJHKJUf5,"Total payment amount for hours worked, rounded to the nearest cent." +Payroll_summary_Pay_Period_Person,Dates,5pHLanQNThxkEaEJHKJUf5,"All date ranges in the group, separated by a comma and a space" +People,Full_Name,5pHLanQNThxkEaEJHKJUf5,"e.g. Doe, John" +General_Ledger,Quarter,2YwYBWpREY2a1N2NV7cb55,e.g. 2020 Q4 +General_Ledger,Year,2YwYBWpREY2a1N2NV7cb55,"Just the year of the date, as a string" +Time_Calculator,Time_Worked,np7TVHmuvFcHmo1K8h7Ur4,Formatted as hours:minutes. No leading zeroes for hours. +Time_Calculator,Seconds_Worked,np7TVHmuvFcHmo1K8h7Ur4,"Number of seconds between start/end times, if they're both there" +Funding_Source_summary,Debt_to_Equity,qprycQa2TVwajAe6Hb3bUZ,Ratio of the total amounts in the group where the type is Debt vs Equity +Invoices,Client,bReAxyLmzmEQfHF5L5Sc1e,Client's name followed by their address on the next line +Invoices,Due,bReAxyLmzmEQfHF5L5Sc1e,30 days after the invoice date +Invoices,Invoice_ID,bReAxyLmzmEQfHF5L5Sc1e,Invoice date followed by the client's name in brackets +Projects,Project_Name,bReAxyLmzmEQfHF5L5Sc1e,"Client name and project name, e.g. 
John Doe: Big project" +Time_Log,Date,bReAxyLmzmEQfHF5L5Sc1e,Start date if there is one +Time_Log,Duration_hrs_,bReAxyLmzmEQfHF5L5Sc1e,Duration (if there is one) in hours rounded to two decimal places +Time_Log,Duration_min_,bReAxyLmzmEQfHF5L5Sc1e,"Number of minutes between start and end time. If either time is missing, leave blank. If end is before start, give 0." +Filtered_By_Formula,LabelCount,9nNr9uQwoXWAvxcWQDygh6,"1 if the state is CA, otherwise 0" +Objects,Address,pyMHqncEspfZN5zfShCwT8,"City and state, separated by comma space" +Books,search_terms,hdXy57qLiyNf35oNLzzgBG,"Title and author name, with a space in between" +BOM_Items,Cost,e4gEm7dt4cgBMkouVBNMeY,Total cost if both quantity and cost are given +Bill_Of_Materials,Cost,e4gEm7dt4cgBMkouVBNMeY,Total cost +All_Responses,Entry,qvND7WUcuNb2fU4n1vBJ7f,"Name and submitted date in the format ""Name - month-day""" +All_Responses,Month,qvND7WUcuNb2fU4n1vBJ7f,Submitted month (full name) and year +Cap_Table,Common_Stock,iXggjrCPHut9u2BuhJxJkk,"If the class is Options, RSUs, or Option Pool, return 0, otherwise return the fully diluted value." 
+Cap_Table,Fully_Diluted,iXggjrCPHut9u2BuhJxJkk,"The granted amount, minus the total pool used if the class is Option Pool" +Cap_Table,Fully_Diluted_,iXggjrCPHut9u2BuhJxJkk,Fully diluted as a fraction of the total +Classes,Spots_Left,swLvb3Fic22gVzrdczcAoZ,or Full +All_Survey_Responses,Product_Experience_Score,4ktYzGV1mUipSiQFtkLGqm,"A number based on the experience: +Very Dissatisfied: 1 +Somewhat Dissatisfied: 2 +Neutral: 3 +Somewhat Satisfied: 4 +Very Satisfied: 5" +Time_Sheet_Entries_summary_Account_Employee_Month,Total_Spend,oGxD8EnzeVs6vSQK3QBrUv,Total hours worked times hourly rate +Time_Sheets,Title,oGxD8EnzeVs6vSQK3QBrUv,Month number and employee full name separated by a space +All_Products,SKU,sXsBGDTKau1F3fvxkCyoaJ,"Brand code, color code, and size, separated by dashes without spaces" +All_Products,Stock_Alert,sXsBGDTKau1F3fvxkCyoaJ,"If the amount in stock and on order is more than 5: In Stock +If it's 0: OUT OF STOCK +Otherwise: Low Stock" +Incoming_Order_Line_Items,Received_Qty,sXsBGDTKau1F3fvxkCyoaJ,"The quantity, but only if the order is received" +Theaters,Latitude2,dKztiPYamcCpttT1LT1FnU,Coordinate before the comma +Theaters,Longitude,dKztiPYamcCpttT1LT1FnU,Coordinate after the comma and space +Families,Amount_Due,cJcSKdUC3nLNAv4wTjAxA6,"Total charged minus total paid, capped at 0" +Gifts_summary_Occasion_Who_Year,Over_Budget_,dr6epxpXUcy9rsFVUoXTEe,Did we spend more than the budget for this person? 
+Apartments,Have_Picture,5iMYwmESm33JpEECSqdZk2,Yes or No depending on if there's a picture +Leases,Lease_End_Date,5iMYwmESm33JpEECSqdZk2,Start date plus the lease term in years minus one day +Tenancies,Minor,5iMYwmESm33JpEECSqdZk2,"1 if the age is less than 18, otherwise 0" +Game_Schedule,Loser,1xJAp2uxM7tFCVUbEofKoF,The team that won fewer sets +Standings,Win_Rate,1xJAp2uxM7tFCVUbEofKoF,Ratio of wins to total games +Prepare_Invoices,Due,9NH6D58FmxwPP43nw7uzQK,One month after the issued date if there is one diff --git a/test/formula-dataset/runCompletion.js b/test/formula-dataset/runCompletion.js new file mode 100644 index 00000000..25409b5b --- /dev/null +++ b/test/formula-dataset/runCompletion.js @@ -0,0 +1,12 @@ +#!/usr/bin/env node +"use strict"; +const path = require('path'); +const codeRoot = path.dirname(path.dirname(path.dirname(__dirname))); + +process.env.DATA_PATH = path.join(__dirname, 'data'); + + +require('app-module-path').addPath(path.join(codeRoot, '_build')); +require('app-module-path').addPath(path.join(codeRoot, '_build', 'core')); +require('app-module-path').addPath(path.join(codeRoot, '_build', 'ext')); +require('test/formula-dataset/runCompletion_impl').runCompletion().catch(console.error); diff --git a/test/formula-dataset/runCompletion_impl.ts b/test/formula-dataset/runCompletion_impl.ts new file mode 100644 index 00000000..45c2e207 --- /dev/null +++ b/test/formula-dataset/runCompletion_impl.ts @@ -0,0 +1,252 @@ +/** + * This module holds an evaluation script for AI assistance. It tests AI assistance on the formula + * dataset. The formula dataset is made of an index file (formula-dataset-index.csv) and a list of + * Grist documents hosted on S3. A row in the index file references one column (doc_id, table_id, + * col_id) among these documents, together with a free-text description. 
+ * + * For each entry of the dataset, the script loads the document, requests assistance based on the + * description, and applies the suggested actions to the document. Then it compares the column values + * before and after. Finally it reverts the modification. + * + * The list of Grist documents for the formula dataset is a snapshot of all template documents + * taken around the beginning of Feb 2023. + * + * The script maintains a simple on-disk cache of all requests to the AI, to save on AI requests. + * + * USAGE: + * OPENAI_API_KEY= node core/test/formula-dataset/runCompletion.js + * + * # WITH VERBOSE: + * VERBOSE=1 OPENAI_API_KEY= node core/test/formula-dataset/runCompletion.js + * + * # to reset cache (a directory holding one JSON file per cached request) + * rm -r core/test/formula-dataset/data/cache + */ + + +import { ActiveDoc } from "app/server/lib/ActiveDoc"; +import { DEPS } from "app/server/lib/Assistance"; +import log from 'app/server/lib/log'; +import crypto from 'crypto'; +import parse from 'csv-parse/lib/sync'; +import fetch, {RequestInfo, RequestInit, Response} from 'node-fetch'; +import * as fs from "fs"; +import JSZip from "jszip"; +import { isEqual, MapCache } from "lodash"; +import path from 'path'; +import * as os from 'os'; +import { pipeline } from 'stream'; +import { createDocTools } from "test/server/docTools"; +import { promisify } from 'util'; + +const streamPipeline = promisify(pipeline); + +const DATA_PATH = process.env.DATA_PATH || path.join(__dirname, 'data'); +const PATH_TO_DOC = path.join(DATA_PATH, 'templates'); +const PATH_TO_CSV = path.join(DATA_PATH, 'formula-dataset-index.csv'); +const PATH_TO_CACHE = path.join(DATA_PATH, 'cache'); +const TEMPLATE_URL = "https://grist-static.com/datasets/grist_dataset_formulai_2023_02_20.zip"; + +const oldFetch = DEPS.fetch; + +interface FormulaRec { + table_id: string; + col_id: string; + doc_id: string; + Description: string; +} + +const _stats = { + callCount: 0, +}; + + +export async function runCompletion() { + + // if template 
directory not exists, make it + if (!fs.existsSync(path.join(PATH_TO_DOC))) { + fs.mkdirSync(path.join(PATH_TO_DOC), {recursive: true}); + + // create tempdir + const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'grist-templates-')); + const destPath = path.join(dir, 'template.zip'); + + // start downloading + console.log( + `source url: ${TEMPLATE_URL}\n` + + `destination: ${destPath}\n` + + `download...` + ); + const response = await fetch(TEMPLATE_URL); + if (!response.ok) { throw new Error(`unexpected response ${response.statusText}`); } + await streamPipeline(response.body, fs.createWriteStream(destPath)); + console.log('done!\n\n' + + 'start extraction...'); + + // unzip to directory + const data = fs.readFileSync(destPath); + const zip = await JSZip.loadAsync(data); + let count = 0; + for (const filename of Object.keys(zip.files)) { + if (filename.includes('/')) { continue; } + const fileBuffer = await zip.files[filename].async('nodebuffer'); + fs.writeFileSync(path.join(PATH_TO_DOC, filename), fileBuffer); + count++; + } + console.log( + `Successfully extracted ${count} template files to ${PATH_TO_DOC}` + ); + } + + const content = fs.readFileSync(PATH_TO_CSV, {encoding: 'utf8'}); + const records = parse(content, {columns: true}) as FormulaRec[]; + + // let's group by doc id to save on document loading time + records.sort((a, b) => a.doc_id.localeCompare(b.doc_id)); + + if (!process.env.VERBOSE) { + log.transports.file.level = 'error'; // Suppress most of log output. 
+ } + let activeDoc: ActiveDoc|undefined; + const docTools = createDocTools(); + const session = docTools.createFakeSession('owners'); + await docTools.before(); + let successCount = 0; + + console.log('Testing AI assistance: '); + + try { + + DEPS.fetch = fetchWithCache; + + for (const rec of records) { + + // load new document + if (!activeDoc || activeDoc.docName !== rec.doc_id) { + const docPath = path.join(PATH_TO_DOC, rec.doc_id + '.grist'); + activeDoc = await docTools.loadLocalDoc(docPath); + await activeDoc.waitForInitialization(); + } + + // get values + await activeDoc.docData!.fetchTable(rec.table_id); + const expected = activeDoc.docData!.getTable(rec.table_id)!.getColValues(rec.col_id)!.slice(); + + // send prompt + const tableId = rec.table_id; + const colId = rec.col_id; + const description = rec.Description; + const {suggestedActions} = await activeDoc.getAssistance(session, {tableId, colId, description}); + + // apply modification + const {actionNum} = await activeDoc.applyUserActions(session, suggestedActions); + + // get new values + const newValues = activeDoc.docData!.getTable(rec.table_id)!.getColValues(rec.col_id)!.slice(); + + // revert modification + const [bundle] = await activeDoc.getActions([actionNum]); + await activeDoc.applyUserActionsById(session, [bundle!.actionNum], [bundle!.actionHash!], true); + + // compare values + const success = isEqual(expected, newValues); + + console.log(` ${success ? 'Successfully' : 'Failed to'} complete formula ` + + `for column ${rec.table_id}.${rec.col_id} (doc=${rec.doc_id})`); + + if (success) { + successCount++; + } else { + // TODO: log the difference between expected and actual, similar to what mocha does on + // failure. 
+ // console.log('expected=', expected); + // console.log('actual=', newValues); + } + } + } finally { + await docTools.after(); + log.transports.file.level = 'debug'; + printStats(); + DEPS.fetch = oldFetch; + console.log( + `AI Assistance completed ${successCount} successful prompt on a total of ${records.length};` + ); + } +} + +export function main() { + runCompletion().catch(console.error); +} + +function printStats() { + console.log(`Ai assistance requests stats: ${_stats.callCount} calls`); +} + +/** + * Implements a simple cache that read/write from filesystem. + */ +class JsonCache implements MapCache { + constructor() { + if (!fs.existsSync(PATH_TO_CACHE)) { + fs.mkdirSync(path.join(PATH_TO_CACHE), {recursive: true}); + } + } + + public get(key: string): any { + if (!this.has(key)) { return undefined; } + const content = JSON.parse(fs.readFileSync(this._path(key), 'utf8')); + return JSON.stringify(content.responseBody); + } + + public has(key: string): boolean { + return fs.existsSync(this._path(key)); + } + + public set(key: string, value: any): JsonCache { + const content = { + requestBody: key, + responseBody: JSON.parse(value), + }; + fs.writeFileSync(this._path(key), JSON.stringify(content)); + return this; + } + + public clear(): void { + throw new Error('not implemented'); + } + + public delete(_key: string): boolean { + throw new Error('not implemented'); + } + + private _path(key: string) { + return path.join(PATH_TO_CACHE, this._hash(key) + '.json'); + } + + private _hash(key: string) { + return crypto.createHash('md5').update(key).digest('hex'); + } +} + +/** + * Calls fetch and uses caching. 
+ */ +const _cache = new JsonCache(); +const _queue = new Map(); +async function fetchWithCache(rinfo: RequestInfo, init?: RequestInit): Promise +async function fetchWithCache(rinfo: any, init?: RequestInit): Promise { + const url: string = rinfo.url || rinfo.href || rinfo; + const hash = JSON.stringify({url, body: init?.body}); + if (_cache.has(hash)) { return new Response(_cache.get(hash), {status: 200}); } + if (_queue.has(hash)) { return new Response(await _queue.get(hash), {status: 200}); } + _queue.set(hash, fetch(url, init)); + const response = await _queue.get(hash); + _stats.callCount++; + if (response.status === 200) { + _cache.set(hash, await response.clone().text()); // response cannot be read twice, hence clone + } + return response; +} + +// ts expect this function +fetchWithCache.isRedirect = fetch.isRedirect; diff --git a/test/server/docTools.ts b/test/server/docTools.ts index bb569b5f..2b8cadfe 100644 --- a/test/server/docTools.ts +++ b/test/server/docTools.ts @@ -1,3 +1,4 @@ +import {Role} from 'app/common/roles'; import {getDocWorkerMap} from 'app/gen-server/lib/DocWorkerMap'; import {ActiveDoc} from 'app/server/lib/ActiveDoc'; import {DummyAuthorizer} from 'app/server/lib/Authorizer'; @@ -82,8 +83,8 @@ export function createDocTools(options: {persistAcrossCases?: boolean, const systemSession = makeExceptionalDocSession('system'); return { /** create a fake session for use when applying user actions to a document */ - createFakeSession(): DocSession { - return {client: null, authorizer: new DummyAuthorizer('editors', 'doc')} as any as DocSession; + createFakeSession(role: Role = 'editors'): DocSession { + return {client: null, authorizer: new DummyAuthorizer(role, 'doc')} as any as DocSession; }, /** create a throw-away, empty document for testing purposes */