(core) Fuller guessing of type and options when adding first data to blank columns

Summary:
Adds `common/ValueGuesser.ts` with logic for guessing column type and widget options (only for dates/datetimes) from an array of strings, and converting the strings to the guessed type in a lossless manner, so that converting back to Text gives the original values.

Changes `_ensure_column_accepts_data` in Python to call an exported JS method using the new logic where possible.

Test Plan: Added `test/common/ValueGuesser.ts` to unit test the core guessing logic and a DocApi end-to-end test for what happens to new columns.

Reviewers: georgegevoian

Reviewed By: georgegevoian

Differential Revision: https://phab.getgrist.com/D3290
This commit is contained in:
Alex Hall
2022-03-01 14:50:12 +02:00
parent ae6c857ac5
commit 599545fb11
8 changed files with 246 additions and 38 deletions

View File

@@ -3,13 +3,17 @@
* subscribes to actions which change it, and forwards those actions to individual tables.
* It also provides the interface to apply actions to data.
*/
import {DocumentSettings} from 'app/common/DocumentSettings';
import {safeJsonParse} from 'app/common/gutil';
import {schema, SchemaTypes} from 'app/common/schema';
import fromPairs = require('lodash/fromPairs');
import groupBy = require('lodash/groupBy');
import {ActionDispatcher} from './ActionDispatcher';
import {BulkColValues, ColInfo, ColInfoWithId, ColValues, DocAction,
RowRecord, TableDataAction} from './DocActions';
import {ColTypeMap, MetaTableData, TableData} from './TableData';
import {
BulkColValues, ColInfo, ColInfoWithId, ColValues, DocAction,
RowRecord, TableDataAction
} from './DocActions';
import {ColTypeMap, MetaRowRecord, MetaTableData, TableData} from './TableData';
// Signature of a callback that, given a table id, asynchronously produces a TableDataAction.
// NOTE(review): presumably the action carries the full data of that table -- confirm against callers.
type FetchTableFunc = (tableId: string) => Promise<TableDataAction>;
@@ -108,6 +112,15 @@ export class DocData extends ActionDispatcher {
}
}
/**
 * Returns the record of the `_grist_DocInfo` metadata table obtained via `getRecord(1)`.
 * The result is asserted to be non-null.
 */
public docInfo(): MetaRowRecord<'_grist_DocInfo'> {
  return this.getMetaTable('_grist_DocInfo').getRecord(1)!;
}
/**
 * Returns the document settings, parsed from the `documentSettings` field of the DocInfo record.
 * An empty object is passed to `safeJsonParse` as the default value.
 */
public docSettings(): DocumentSettings {
  const rawSettings = this.docInfo().documentSettings;
  return safeJsonParse(rawSettings, {});
}
// ---- The following methods implement ActionDispatcher interface ----
protected onAddTable(action: DocAction, tableId: string, columns: ColInfoWithId[]): void {

155
app/common/ValueGuesser.ts Normal file
View File

@@ -0,0 +1,155 @@
import {DocData} from 'app/common/DocData';
import {DocumentSettings} from 'app/common/DocumentSettings';
import {dateTimeWidgetOptions, guessDateFormat} from 'app/common/parseDate';
import {createFormatter} from 'app/common/ValueFormatter';
import * as moment from 'moment-timezone';
/**
 * Column metadata produced by a guesser.
 */
interface GuessedColInfo {
  // Column type string, e.g. 'Bool', 'Numeric', 'Date', 'Text', or 'DateTime:' followed by a timezone.
  type: string;
  // Widget options to go with the type; in this file only the date/datetime guesser supplies them.
  widgetOptions?: object;
}
/**
 * Outcome of guessing a column type for an array of strings.
 */
interface GuessResult {
  // The converted values. Omitted when no conversion is needed (the 'Text' fallback in guessColInfo).
  values?: any[];
  // The guessed column type and optional widget options.
  colInfo: GuessedColInfo;
}
/**
 * Base class for deciding whether an array of strings can be stored as one particular column type.
 * T is the type that successfully parsed strings turn into, i.e. what ends up stored in the column.
 */
abstract class ValueGuesser<T> {
  /**
   * The column type this guesser stands for, optionally with widget options.
   */
  public abstract colInfo(): GuessedColInfo;

  /**
   * Convert one string into a typed value such that formatting the value gives the string back.
   * A string that cannot be converted is returned unchanged.
   */
  public abstract parse(value: string): T | string;

  /**
   * Try to convert the given strings losslessly according to colInfo().
   * Returns null if more than 10% of the values fail to parse, if any parsed value does not
   * format back to exactly the original string, or if a blank is found and blanks aren't allowed.
   */
  public guess(values: Array<string | null>, docSettings: DocumentSettings): GuessResult | null {
    const guessedInfo = this.colInfo();
    const formatter = createFormatter(guessedInfo.type, guessedInfo.widgetOptions || {}, docSettings);
    const converted: any[] = [];
    // Number of strings that may stay unparsed before giving up.
    const failureLimit = values.length * 0.1;
    let failures = 0;

    for (const raw of values) {
      // Blank values (null or empty string) are stored as null when the column type permits it.
      if (!raw) {
        if (!this.allowBlank()) {
          return null;
        }
        converted.push(null);
        continue;
      }

      const typed = this.parse(raw);
      // A string result means parsing failed; tolerate only a limited number of those.
      if (typeof typed === "string") {
        failures += 1;
        if (failures > failureLimit) {
          return null;
        }
      }
      // Converting back to text must reproduce the input exactly, otherwise information is lost.
      if (formatter.formatAny(typed) !== raw) {
        return null;
      }
      converted.push(typed);
    }

    return {values: converted, colInfo: guessedInfo};
  }

  /**
   * Whether a column of this type can hold null directly.
   */
  protected allowBlank(): boolean {
    return true;
  }
}
/**
 * Guesser for Bool columns: recognises exactly the strings "true" and "false".
 */
class BoolGuesser extends ValueGuesser<boolean> {
  public colInfo(): GuessedColInfo {
    return {type: 'Bool'};
  }

  /**
   * Maps "true"/"false" to the corresponding boolean; any other string is returned as is.
   */
  public parse(value: string): boolean | string {
    switch (value) {
      case "true":
        return true;
      case "false":
        return false;
      default:
        return value;
    }
  }

  /**
   * Bool is the one column type that cannot hold nulls (they would be converted to false),
   * so blanks are not allowed here.
   */
  protected allowBlank(): boolean {
    return false;
  }
}
/**
 * Guesser for Numeric columns, relying on the built-in Number() conversion.
 */
class NumericGuesser extends ValueGuesser<number> {
  public colInfo(): GuessedColInfo {
    // TODO parse and guess options for formatted numbers, e.g. currency amounts
    return {type: 'Numeric'};
  }

  /**
   * Returns the numeric value of the string, or the string itself when Number() yields NaN.
   */
  public parse(value: string): number | string {
    const asNumber = Number(value);
    if (Number.isNaN(asNumber)) {
      return value;
    }
    return asNumber;
  }
}
/**
 * Guesser for Date and DateTime columns, based on a single moment format string.
 * A format with a time part gives a DateTime column in the document timezone;
 * a format without one gives a Date column whose values are parsed in UTC.
 */
class DateGuesser extends ValueGuesser<number> {
  // Timezone used by parse(). Decided once at construction so that parse() gives the same
  // result whether or not colInfo() has been called beforehand.
  private readonly _parseTz: string;

  // _format should be a full moment format string
  // _tz should be the document's default timezone
  constructor(private _format: string, private _tz: string) {
    super();
    const hasTime = Boolean(dateTimeWidgetOptions(_format, false).timeFormat);
    // Plain dates are always parsed in UTC; only datetimes use the document timezone.
    this._parseTz = hasTime ? _tz : "UTC";
  }

  /**
   * Returns the widget options derived from the format, along with type 'DateTime:<tz>'
   * when the format has a time part, or 'Date' otherwise.
   */
  public colInfo(): GuessedColInfo {
    const widgetOptions = dateTimeWidgetOptions(this._format, false);
    const type = widgetOptions.timeFormat ? 'DateTime:' + this._tz : 'Date';
    return {widgetOptions, type};
  }

  // Note that this parsing is much stricter than parseDate to prevent loss of information.
  // Dates which can be parsed by parseDate based on the guessed widget options may not be parsed here.
  /**
   * Strictly parses the string with the guessed format. Returns the timestamp in seconds
   * (moment's millisecond value divided by 1000), or the original string if it is not valid.
   */
  public parse(value: string): number | string {
    const m = moment.tz(value, this._format, true, this._parseTz);
    return m.isValid() ? m.valueOf() / 1000 : value;
  }
}
/**
 * Convenience wrapper around guessColInfo which takes the document settings and the
 * timezone from the given DocData.
 */
export function guessColInfoWithDocData(values: Array<string | null>, docData: DocData) {
  const settings = docData.docSettings();
  const tz = docData.docInfo().timezone;
  return guessColInfo(values, settings, tz);
}
/**
 * Guess the column type (and widget options, for dates) that fits the given strings.
 * Tries Bool, then Numeric, then Date/DateTime, and falls back to Text.
 * Each step only runs if the previous ones failed, so in particular the date format
 * is not guessed before the other types have been tried.
 */
export function guessColInfo(
  values: Array<string | null>, docSettings: DocumentSettings, timezone: string
): GuessResult {
  const asBool = new BoolGuesser().guess(values, docSettings);
  if (asBool) {
    return asBool;
  }

  const asNumeric = new NumericGuesser().guess(values, docSettings);
  if (asNumeric) {
    return asNumeric;
  }

  const dateFormat = guessDateFormat(values, timezone) || "YYYY-MM-DD";
  const asDate = new DateGuesser(dateFormat, timezone).guess(values, docSettings);
  if (asDate) {
    return asDate;
  }

  // Don't return the same values back if there's no conversion to be done,
  // as they have to be serialized and transferred over a pipe to Python.
  return {colInfo: {type: 'Text'}};
}

View File

@@ -293,8 +293,6 @@ export function createParserOrFormatterArgumentsRaw(
visibleColRef: number,
): [string, object, DocumentSettings] {
const columnsTable = docData.getMetaTable('_grist_Tables_column');
const docInfoTable = docData.getMetaTable('_grist_DocInfo');
const widgetOpts = safeJsonParse(widgetOptions, {});
if (isFullReferencingType(type)) {
@@ -305,10 +303,7 @@ export function createParserOrFormatterArgumentsRaw(
widgetOpts.tableData = docData.getTable(getReferencedTableId(type)!);
}
const docInfo = docInfoTable.getRecord(1);
const docSettings = safeJsonParse(docInfo!.documentSettings, {}) as DocumentSettings;
return [type, widgetOpts, docSettings];
return [type, widgetOpts, docData.docSettings()];
}
/**

View File

@@ -1,6 +1,6 @@
import escapeRegExp = require('lodash/escapeRegExp');
import memoize = require('lodash/memoize');
import {getDistinctValues} from 'app/common/gutil';
import {getDistinctValues, isObject} from 'app/common/gutil';
// Simply importing 'moment-guess' inconsistently imports bundle.js or bundle.esm.js depending on environment
import * as guessFormat from '@gristlabs/moment-guess/dist/bundle.js';
import * as moment from 'moment-timezone';
@@ -325,8 +325,9 @@ function standardizeTime(timeString: string): { remaining: string, time: string
return {remaining: timeString.slice(0, match.index).trim(), time: `${hh}:${mm}:${ss}`};
}
export function guessDateFormat(values: string[], timezone: string = 'UTC'): string | null {
const sample = getDistinctValues(values, 100);
export function guessDateFormat(values: Array<string | null>, timezone: string = 'UTC'): string | null {
const dateStrings: string[] = values.filter(isObject);
const sample = getDistinctValues(dateStrings, 100);
const formats: Record<string, number> = {};
for (const dateString of sample) {
let guessed: string | string[];
@@ -348,7 +349,7 @@ export function guessDateFormat(values: string[], timezone: string = 'UTC'): str
}
for (const format of formatKeys) {
for (const dateString of values) {
for (const dateString of dateStrings) {
const m = moment.tz(dateString, format, true, timezone);
if (m.isValid()) {
formats[format] += 1;
@@ -380,10 +381,15 @@ export const timeFormatOptions = [
'HH:mm:ss z',
];
export function dateTimeWidgetOptions(fullFormat: string) {
/**
* Construct widget options for a Date or DateTime column based on a single moment string
* which may or may not contain both date and time parts.
* If defaultTimeFormat is true, fallback to a non-empty default time format when none is found in fullFormat.
*/
export function dateTimeWidgetOptions(fullFormat: string, defaultTimeFormat: boolean) {
const index = fullFormat.match(/[hHkaAmsSzZT]|$/)!.index!;
const dateFormat = fullFormat.substr(0, index).trim();
const timeFormat = fullFormat.substr(index).trim() || timeFormatOptions[0];
const timeFormat = fullFormat.substr(index).trim() || (defaultTimeFormat ? timeFormatOptions[0] : "");
return {
dateFormat,
timeFormat,