From 599545fb11ee90adf7c44f0940bbd3ab9b8d6a0c Mon Sep 17 00:00:00 2001 From: Alex Hall Date: Tue, 1 Mar 2022 14:50:12 +0200 Subject: [PATCH] (core) Fuller guessing of type and options when adding first data to blank columns Summary: Adds `common/ValueGuesser.ts` with logic for guessing column type and widget options (only for dates/datetimes) from an array of strings, and converting the strings to the guessed type in a lossless manner, so that converting back to Text gives the original values. Changes `_ensure_column_accepts_data` in Python to call an exported JS method using the new logic where possible. Test Plan: Added `test/common/ValueGuesser.ts` to unit test the core guessing logic and a DocApi end-to-end test for what happens to new columns. Reviewers: georgegevoian Reviewed By: georgegevoian Differential Revision: https://phab.getgrist.com/D3290 --- app/client/components/TypeConversion.ts | 2 +- app/common/DocData.ts | 19 ++- app/common/ValueGuesser.ts | 155 ++++++++++++++++++++++++ app/common/ValueParser.ts | 7 +- app/common/parseDate.ts | 18 ++- app/server/lib/ActiveDoc.ts | 3 + sandbox/grist/test_column_actions.py | 18 +-- sandbox/grist/useractions.py | 62 ++++++++-- 8 files changed, 246 insertions(+), 38 deletions(-) create mode 100644 app/common/ValueGuesser.ts diff --git a/app/client/components/TypeConversion.ts b/app/client/components/TypeConversion.ts index b3307535..904d4fc9 100644 --- a/app/client/components/TypeConversion.ts +++ b/app/client/components/TypeConversion.ts @@ -105,7 +105,7 @@ export async function prepTransformColInfo(docModel: DocModel, origCol: ColumnRe const colValues = tableData.getColValues(sourceCol.colId()) || []; dateFormat = guessDateFormat(colValues.map(String)) || "YYYY-MM-DD"; } - widgetOptions = dateTimeWidgetOptions(dateFormat); + widgetOptions = dateTimeWidgetOptions(dateFormat, true); break; } case 'Choice': { diff --git a/app/common/DocData.ts b/app/common/DocData.ts index 96687650..123efa8a 100644 --- a/app/common/DocData.ts +++ b/app/common/DocData.ts @@ -3,13 +3,17 @@ * subscribes to actions which change it, and forwards those actions to individual tables. * It also provides the interface to apply actions to data. */ +import {DocumentSettings} from 'app/common/DocumentSettings'; +import {safeJsonParse} from 'app/common/gutil'; import {schema, SchemaTypes} from 'app/common/schema'; import fromPairs = require('lodash/fromPairs'); import groupBy = require('lodash/groupBy'); import {ActionDispatcher} from './ActionDispatcher'; -import {BulkColValues, ColInfo, ColInfoWithId, ColValues, DocAction, - RowRecord, TableDataAction} from './DocActions'; -import {ColTypeMap, MetaTableData, TableData} from './TableData'; +import { + BulkColValues, ColInfo, ColInfoWithId, ColValues, DocAction, + RowRecord, TableDataAction +} from './DocActions'; +import {ColTypeMap, MetaRowRecord, MetaTableData, TableData} from './TableData'; type FetchTableFunc = (tableId: string) => Promise; @@ -108,6 +112,15 @@ export class DocData extends ActionDispatcher { } } + public docInfo(): MetaRowRecord<'_grist_DocInfo'> { + const docInfoTable = this.getMetaTable('_grist_DocInfo'); + return docInfoTable.getRecord(1)!; + } + + public docSettings(): DocumentSettings { + return safeJsonParse(this.docInfo().documentSettings, {}); + } + // ---- The following methods implement ActionDispatcher interface ---- protected onAddTable(action: DocAction, tableId: string, columns: ColInfoWithId[]): void { diff --git a/app/common/ValueGuesser.ts b/app/common/ValueGuesser.ts new file mode 100644 index 00000000..6820c140 --- /dev/null +++ b/app/common/ValueGuesser.ts @@ -0,0 +1,155 @@ +import {DocData} from 'app/common/DocData'; +import {DocumentSettings} from 'app/common/DocumentSettings'; +import {dateTimeWidgetOptions, guessDateFormat} from 'app/common/parseDate'; +import {createFormatter} from 'app/common/ValueFormatter'; +import * as moment from 'moment-timezone'; + +interface GuessedColInfo { + type: string; + widgetOptions?: object; +} + +interface GuessResult { + values?: any[]; + colInfo: GuessedColInfo; +} + +/** + * Class for guessing if an array of values should be interpreted as a specific column type. + * T is the type of values that strings should be parsed to and is stored in the column. + */ +abstract class ValueGuesser { + /** + * Guessed column type and maybe widget options. + */ + public abstract colInfo(): GuessedColInfo; + + /** + * Parse a single string to a typed value in such a way that formatting the value returns the original string. + * If the string cannot be parsed, return the original string. + */ + public abstract parse(value: string): T | string; + + /** + * Attempt to parse at least 90% the string values losslessly according to the guessed colInfo. + * Return null if this cannot be done. + */ + public guess(values: Array, docSettings: DocumentSettings): GuessResult | null { + const colInfo = this.colInfo(); + const {type, widgetOptions} = colInfo; + const formatter = createFormatter(type, widgetOptions || {}, docSettings); + const result: any[] = []; + const maxUnparsed = values.length * 0.1; // max number of non-parsed strings to allow before giving up + let unparsed = 0; + + for (const value of values) { + if (!value) { + if (this.allowBlank()) { + result.push(null); + continue; + } else { + return null; + } + } + + const parsed = this.parse(value); + // Give up if too many strings failed to parse or if the parsed value changes when converted back to text + if (typeof parsed === "string" && ++unparsed > maxUnparsed || formatter.formatAny(parsed) !== value) { + return null; + } + result.push(parsed); + } + return {values: result, colInfo}; + } + + /** + * Whether this type of column can store nulls directly. + */ + protected allowBlank(): boolean { + return true; + } +} + +class BoolGuesser extends ValueGuesser { + public colInfo(): GuessedColInfo { + return {type: 'Bool'}; + } + + public parse(value: string): boolean | string { + if (value === "true") { + return true; + } else if (value === "false") { + return false; + } else { + return value; + } + } + + /** + * This is the only type that can't store nulls, it converts them to false. + */ + protected allowBlank(): boolean { + return false; + } +} + +class NumericGuesser extends ValueGuesser { + public colInfo(): GuessedColInfo { + // TODO parse and guess options for formatted numbers, e.g. currency amounts + return {type: 'Numeric'}; + } + + public parse(value: string): number | string { + const parsed = Number(value); + return !isNaN(parsed) ? parsed : value; + } +} + +class DateGuesser extends ValueGuesser { + // _format should be a full moment format string + // _tz should be the document's default timezone + constructor(private _format: string, private _tz: string) { + super(); + } + + public colInfo(): GuessedColInfo { + const widgetOptions = dateTimeWidgetOptions(this._format, false); + let type; + if (widgetOptions.timeFormat) { + type = 'DateTime:' + this._tz; + } else { + type = 'Date'; + this._tz = "UTC"; + } + return {widgetOptions, type}; + } + + // Note that this parsing is much stricter than parseDate to prevent loss of information. + // Dates which can be parsed by parseDate based on the guessed widget options may not be parsed here. + public parse(value: string): number | string { + const m = moment.tz(value, this._format, true, this._tz); + return m.isValid() ? m.valueOf() / 1000 : value; + } +} + +export function guessColInfoWithDocData(values: Array, docData: DocData) { + return guessColInfo(values, docData.docSettings(), docData.docInfo().timezone); +} + +export function guessColInfo( + values: Array, docSettings: DocumentSettings, timezone: string +): GuessResult { + // Use short-circuiting of || to only do as much work as needed, + // in particular not guessing date formats before trying other types. + return ( + new BoolGuesser() + .guess(values, docSettings) || + new NumericGuesser() + .guess(values, docSettings) || + new DateGuesser(guessDateFormat(values, timezone) || "YYYY-MM-DD", timezone) + .guess(values, docSettings) || + // Don't return the same values back if there's no conversion to be done, + // as they have to be serialized and transferred over a pipe to Python. + {colInfo: {type: 'Text'}} + ); +} diff --git a/app/common/ValueParser.ts b/app/common/ValueParser.ts index a679c961..9b0e4afb 100644 --- a/app/common/ValueParser.ts +++ b/app/common/ValueParser.ts @@ -293,8 +293,6 @@ export function createParserOrFormatterArgumentsRaw( visibleColRef: number, ): [string, object, DocumentSettings] { const columnsTable = docData.getMetaTable('_grist_Tables_column'); - const docInfoTable = docData.getMetaTable('_grist_DocInfo'); - const widgetOpts = safeJsonParse(widgetOptions, {}); if (isFullReferencingType(type)) { @@ -305,10 +303,7 @@ export function createParserOrFormatterArgumentsRaw( widgetOpts.tableData = docData.getTable(getReferencedTableId(type)!); } - const docInfo = docInfoTable.getRecord(1); - const docSettings = safeJsonParse(docInfo!.documentSettings, {}) as DocumentSettings; - - return [type, widgetOpts, docSettings]; + return [type, widgetOpts, docData.docSettings()]; } /** diff --git a/app/common/parseDate.ts b/app/common/parseDate.ts index 2f17e567..a53eb1cf 100644 --- a/app/common/parseDate.ts +++ b/app/common/parseDate.ts @@ -1,6 +1,6 @@ import escapeRegExp = require('lodash/escapeRegExp'); import memoize = require('lodash/memoize'); -import {getDistinctValues} from 'app/common/gutil'; +import {getDistinctValues, isObject} from 'app/common/gutil'; // Simply importing 'moment-guess' inconsistently imports bundle.js or bundle.esm.js depending on environment import * as guessFormat from '@gristlabs/moment-guess/dist/bundle.js'; import * as moment from 'moment-timezone'; @@ -325,8 +325,9 @@ function standardizeTime(timeString: string): { remaining: string, time: string return {remaining: timeString.slice(0, match.index).trim(), time: `${hh}:${mm}:${ss}`}; } -export function guessDateFormat(values: string[], timezone: string = 'UTC'): string | null { - const sample = getDistinctValues(values, 100); +export function guessDateFormat(values: Array, timezone: string = 'UTC'): string | null { + const dateStrings: string[] = values.filter(isObject); + const sample = getDistinctValues(dateStrings, 100); const formats: Record = {}; for (const dateString of sample) { let guessed: string | string[]; @@ -348,7 +349,7 @@ export function guessDateFormat(values: string[], timezone: string = 'UTC'): str } for (const format of formatKeys) { - for (const dateString of values) { + for (const dateString of dateStrings) { const m = moment.tz(dateString, format, true, timezone); if (m.isValid()) { formats[format] += 1; @@ -380,10 +381,15 @@ export const timeFormatOptions = [ 'HH:mm:ss z', ]; -export function dateTimeWidgetOptions(fullFormat: string) { +/** + * Construct widget options for a Date or DateTime column based on a single moment string + * which may or may not contain both date and time parts. + * If defaultTimeFormat is true, fallback to a non-empty default time format when none is found in fullFormat. + */ +export function dateTimeWidgetOptions(fullFormat: string, defaultTimeFormat: boolean) { const index = fullFormat.match(/[hHkaAmsSzZT]|$/)!.index!; const dateFormat = fullFormat.substr(0, index).trim(); - const timeFormat = fullFormat.substr(index).trim() || timeFormatOptions[0]; + const timeFormat = fullFormat.substr(index).trim() || (defaultTimeFormat ? timeFormatOptions[0] : ""); return { dateFormat, timeFormat, diff --git a/app/server/lib/ActiveDoc.ts b/app/server/lib/ActiveDoc.ts index 7265652c..345a75ec 100644 --- a/app/server/lib/ActiveDoc.ts +++ b/app/server/lib/ActiveDoc.ts @@ -42,6 +42,7 @@ import {MetaRowRecord} from 'app/common/TableData'; import {FetchUrlOptions, UploadResult} from 'app/common/uploads'; import {DocReplacementOptions, DocState, DocStateComparison} from 'app/common/UserAPI'; import {convertFromColumn} from 'app/common/ValueConverter'; +import {guessColInfoWithDocData} from 'app/common/ValueGuesser'; import {parseUserAction} from 'app/common/ValueParser'; import {ParseOptions} from 'app/plugin/FileParserAPI'; import {GristDocAPI} from 'app/plugin/GristAPI'; @@ -1724,6 +1725,8 @@ export class ActiveDoc extends EventEmitter { preferredPythonVersion, sandboxOptions: { exports: { + guessColInfo: (values: Array) => + guessColInfoWithDocData(values, this.docData!), convertFromColumn: (...args: Parameters>) => convertFromColumn(this.docData!)(...args) } diff --git a/sandbox/grist/test_column_actions.py b/sandbox/grist/test_column_actions.py index af26f855..b165af2e 100644 --- a/sandbox/grist/test_column_actions.py +++ b/sandbox/grist/test_column_actions.py @@ -205,7 +205,7 @@ class TestColumnActions(test_engine.EngineTestCase): Column(14, "manualSort", "ManualSortPos", False, "", 0), Column(15, "A", "Text", False, "", 0), Column(16, "B", "Text", False, "", 0), - Column(17, "C", "Text", False, "", 0), + Column(17, "C", "Any", True, "", 0), ]), Table(3, "GristSummary_7_Address", 0, 1, columns=[ Column(18, "state", "Text", False, "", summarySourceCol=12), @@ -244,9 +244,9 @@ class TestColumnActions(test_engine.EngineTestCase): self.assertTableData('Address', data=self.address_table_data) self.assertTableData('Table1', data=[ ["id", "A", "B", "C", "manualSort"], - [ 1, "a", "d", "", 1.0], - [ 2, "b", "e", "", 2.0], - [ 3, "c", "f", "", 3.0], + [ 1, "a", "d", None, 1.0], + [ 2, "b", "e", None, 2.0], + [ 3, "c", "f", None, 3.0], ]) self.assertTableData("GristSummary_7_Address", cols="subset", data=[ [ "id", "state", "count", "amount" ], @@ -294,7 +294,7 @@ class TestColumnActions(test_engine.EngineTestCase): Table(2, "Table1", 2, 0, columns=[ Column(14, "manualSort", "ManualSortPos", False, "", 0), Column(15, "A", "Text", False, "", 0), - Column(17, "C", "Text", False, "", 0), + Column(17, "C", "Any", True, "", 0), ]), Table(3, "GristSummary_7_Address", 0, 1, columns=[ Column(18, "state", "Text", False, "", summarySourceCol=12), @@ -352,7 +352,7 @@ class TestColumnActions(test_engine.EngineTestCase): Table(2, "Table1", 2, 0, columns=[ Column(14, "manualSort", "ManualSortPos", False, "", 0), Column(15, "A", "Text", False, "", 0), - Column(17, "C", "Text", False, "", 0), + Column(17, "C", "Any", True, "", 0), ]), # Note that the summary table here switches to a new one, without the deleted group-by. Table(4, "GristSummary_7_Address2", 0, 1, columns=[ @@ -401,9 +401,9 @@ class TestColumnActions(test_engine.EngineTestCase): ]) self.assertTableData('Table1', data=[ ["id", "A", "C", "manualSort"], - [ 1, "a", "", 1.0], - [ 2, "b", "", 2.0], - [ 3, "c", "", 3.0], + [ 1, "a", None, 1.0], + [ 2, "b", None, 2.0], + [ 3, "c", None, 3.0], ]) self.assertTableData("GristSummary_7_Address2", cols="subset", data=[ [ "id", "count", "amount" ], diff --git a/sandbox/grist/useractions.py b/sandbox/grist/useractions.py index 06840883..9056dfcd 100644 --- a/sandbox/grist/useractions.py +++ b/sandbox/grist/useractions.py @@ -139,6 +139,34 @@ def _make_clean_col_info(col_info, col_id=None): return ret +def guess_col_info(values): + """ + Returns a pair col_info, values + where col_info is a dict which may contain a type and widgetOptions + and `values` is similar to the given argument but maybe converted to the guessed type. + """ + # If the values are all strings/None... + if set(map(type, values)) <= {str, six.text_type, type(None)}: + # If the values are all blank (None or empty string) leave the column empty + if not any(values): + return {}, [None] * len(values) + + # Use the exported guessColInfo if we're connected to JS + from sandbox import default_sandbox + if default_sandbox: + guess = default_sandbox.call_external("guessColInfo", values) + # When the result doesn't contain `values`, that means the guessed type is Text + # so there was nothing to convert. + values = guess.get("values", values) + col_info = guess["colInfo"] + if "widgetOptions" in col_info: + col_info["widgetOptions"] = json.dumps(col_info["widgetOptions"]) + return col_info, values + + # Fallback to the older guessing method, particularly for pure python tests. + return {'type': guess_type(values, convert=True)}, values + + def guess_type(values, convert=False): """ Returns a suitable type for the given iterable of values, optionally attempting conversions. @@ -305,7 +333,7 @@ class UserActions(object): def BulkAddRecord(self, table_id, row_ids, column_values): column_values = actions.decode_bulk_values(column_values) for col_id, values in six.iteritems(column_values): - self._ensure_column_accepts_data(table_id, col_id, values) + column_values[col_id] = self._ensure_column_accepts_data(table_id, col_id, values) method = self._overrides.get(('BulkAddRecord', table_id), self.doBulkAddOrReplace) return method(table_id, row_ids, column_values) @@ -456,7 +484,7 @@ class UserActions(object): # Check that the update is valid. for col_id, values in six.iteritems(columns): - self._ensure_column_accepts_data(table_id, col_id, values) + columns[col_id] = self._ensure_column_accepts_data(table_id, col_id, values) # Additionally check that we are not trying to modify group-by values in a summary column # (this check is only for updating records, not for adding). Note that col_rec will not be @@ -799,24 +827,32 @@ class UserActions(object): When we store values (via Add or Update), check that the column is a data column. If it is an empty column (formula column with an empty formula), convert to data. If it's a real formula column, then fail. + Return a list of values which may be the same as the original argument + or may have values converted to the newly guessed type of the column. """ schema_col = self._engine.schema[table_id].columns[col_id] if not schema_col.isFormula: # Plain old data column, OK to enter values. - return + return values - if not schema_col.formula: - # An empty column (isFormula=True, formula=""), now is the time to convert it to data. - if schema_col.type == 'Any': - # Guess the type when it starts out as Any. We unfortunately need to a separate - # ModifyColumn call for type conversion, to recompute type-specific defaults - # before they are used in formula->data conversion. - self.ModifyColumn(table_id, col_id, {'type': guess_type(values, convert=True)}) - self.ModifyColumn(table_id, col_id, {'isFormula': False}) - else: - # Otherwise, this is an error. We can't save individual values to formula columns. + if schema_col.formula: + # This is an error. We can't save individual values to formula columns. raise ValueError("Can't save value to formula column %s" % col_id) + # An empty column (isFormula=True, formula=""), now is the time to convert it to data. + if schema_col.type == 'Any': + # Guess the type when it starts out as Any. We unfortunately need to update the column + # separately for type conversion, to recompute type-specific defaults + # before they are used in formula->data conversion. + col_info, values = guess_col_info(values) + # If the values are all blank (None or empty string) leave the column empty + if not col_info: + return values + col_rec = self._docmodel.get_column_rec(table_id, col_id) + self._docmodel.update([col_rec], **col_info) + self.ModifyColumn(table_id, col_id, {'isFormula': False}) + return values + @useraction def AddOrUpdateRecord(self, table_id, require, col_values, options): """