(core) Lossless imports

Summary:
- Removed string parsing and some type guessing code from parse_data.py. That logic is now implicitly done by ValueGuesser by leaving the initial column type as Any. parse_data.py mostly comes into play when importing files (e.g. Excel) containing values that already have types, i.e. numbers and dates.
- 0s and 1s are treated as numbers instead of booleans to keep imports lossless.
- Removed dateguess.py and test_dateguess.py.
- Changed what `guessDateFormat` does when multiple date formats work equally well for the given data, in order to be consistent with the old dateguess.py.
- Columns containing numbers are now always imported as Numeric, never Int.
- Removed `NullIfEmptyParser` because it was interfering with the new system. Its purpose was to avoid pointlessly changing a column from Any to Text when no actual data was inserted. A different solution to that problem was already added to `_ensure_column_accepts_data` in the data engine in a recent related diff.

Test Plan:
- Added 2 `nbrowser/Importer2` tests.
- Updated various existing tests.
- Extended testing of `guessDateFormat`. Added `guessDateFormats` to show how ambiguous dates are handled internally.

Reviewers: georgegevoian

Reviewed By: georgegevoian

Differential Revision: https://phab.getgrist.com/D3302
This commit is contained in:
Alex Hall
2022-03-04 19:37:56 +02:00
parent 9522438967
commit 321019217d
14 changed files with 150 additions and 785 deletions

View File

@@ -49,6 +49,7 @@ export interface TransformColumn {
colId: string|null;
type: string;
formula: string;
widgetOptions: string;
}
export interface ImportResult {

View File

@@ -3,7 +3,7 @@ import {ApplyUAResult, QueryFilters} from 'app/common/ActiveDocAPI';
import {BaseAPI, IOptions} from 'app/common/BaseAPI';
import {BillingAPI, BillingAPIImpl} from 'app/common/BillingAPI';
import {BrowserSettings} from 'app/common/BrowserSettings';
import {BulkColValues, TableColValues, UserAction} from 'app/common/DocActions';
import {BulkColValues, TableColValues, TableRecordValue, TableRecordValues, UserAction} from 'app/common/DocActions';
import {DocCreationInfo, OpenDocMode} from 'app/common/DocListAPI';
import {Features} from 'app/common/Features';
import {ICustomWidget} from 'app/common/CustomWidget';
@@ -402,6 +402,11 @@ export interface UserAPI {
filters?: string;
}
/**
 * Optional parameters accepted by the table-reading methods (`getRows` and `getRecords`).
 */
interface GetRowsParams {
// When set, sent JSON-encoded in the `filter` query parameter of the request.
filters?: QueryFilters;
// When true, `immediate=true` is added to the request's query string.
immediate?: boolean;
}
/**
* Collect endpoints related to the content of a single document that we've been thinking
* of as the (restful) "Doc API". A few endpoints that could be here are not, for historical
@@ -411,8 +416,8 @@ export interface DocAPI {
// The immediate flag is a currently unadvertised feature, allowing a query to proceed without
// waiting for a document to be initialized. This is useful if the calculations done when
// opening a document are irrelevant.
getRows(tableId: string, options?: { filters?: QueryFilters,
immediate?: boolean }): Promise<TableColValues>;
getRows(tableId: string, options?: GetRowsParams): Promise<TableColValues>;
getRecords(tableId: string, options?: GetRowsParams): Promise<TableRecordValue[]>;
updateRows(tableId: string, changes: TableColValues): Promise<number[]>;
addRows(tableId: string, additions: BulkColValues): Promise<number[]>;
removeRows(tableId: string, removals: number[]): Promise<number[]>;
@@ -869,16 +874,13 @@ export class DocAPIImpl extends BaseAPI implements DocAPI {
this._url = `${url}/api/docs/${docId}`;
}
public async getRows(tableId: string, options?: { filters?: QueryFilters,
immediate?: boolean }): Promise<TableColValues> {
const url = new URL(`${this._url}/tables/${tableId}/data`);
if (options?.filters) {
url.searchParams.append('filter', JSON.stringify(options.filters));
}
if (options?.immediate) {
url.searchParams.append('immediate', 'true');
}
return this.requestJson(url.href);
/**
 * Fetches the contents of table `tableId` from its `data` endpoint.
 * Any `filters` / `immediate` settings in `options` are passed along as query parameters.
 */
public async getRows(tableId: string, options?: GetRowsParams): Promise<TableColValues> {
return this._getRecords(tableId, 'data', options);
}
/**
 * Fetches table `tableId` from its `records` endpoint and returns the `records`
 * array unwrapped from the response.
 */
public async getRecords(tableId: string, options?: GetRowsParams): Promise<TableRecordValue[]> {
  const {records}: TableRecordValues = await this._getRecords(tableId, 'records', options);
  return records;
}
public async updateRows(tableId: string, changes: TableColValues): Promise<number[]> {
@@ -967,6 +969,17 @@ export class DocAPIImpl extends BaseAPI implements DocAPI {
url.searchParams.append('code', code);
return this.requestJson(url.href);
}
/**
 * Shared implementation behind the table-reading methods: builds the URL for the
 * requested endpoint of table `tableId`, adds the query parameters described by
 * `options`, and requests the result as JSON.
 */
private _getRecords(tableId: string, endpoint: 'data' | 'records', options?: GetRowsParams): Promise<any> {
  const target = new URL(`${this._url}/tables/${tableId}/${endpoint}`);
  const query = target.searchParams;
  const filters = options?.filters;
  if (filters) {
    // Filters travel as a single JSON-encoded query parameter.
    query.append('filter', JSON.stringify(filters));
  }
  if (options?.immediate) {
    query.append('immediate', 'true');
  }
  return this.requestJson(target.href);
}
}
/**

View File

@@ -162,7 +162,7 @@ export function guessColInfo(
NumberParse.fromSettings(docSettings).guessOptions(values)
)
.guess(values, docSettings) ||
new DateGuesser(guessDateFormat(values, timezone) || "YYYY-MM-DD", timezone)
new DateGuesser(guessDateFormat(values, timezone), timezone)
.guess(values, docSettings) ||
// Don't return the same values back if there's no conversion to be done,
// as they have to be serialized and transferred over a pipe to Python.

View File

@@ -36,18 +36,6 @@ export class ValueParser {
/**
 * A parser that adds nothing of its own: the class body is empty, so all behavior
 * is inherited from ValueParser.
 */
class IdentityParser extends ValueParser {
}
/**
 * Variant of the base ValueParser that turns an empty string into null.
 * Every other value is handed to the base class's cleanParse.
 */
class NullIfEmptyParser extends ValueParser {
  public cleanParse(value: string): any {
    return value === "" ? null : super.cleanParse(value);
  }
}
export class NumericParser extends ValueParser {
private _parse: NumberParse;
@@ -225,7 +213,6 @@ export class ReferenceListParser extends ReferenceParser {
}
export const valueParserClasses: { [type: string]: typeof ValueParser } = {
Any: NullIfEmptyParser,
Numeric: NumericParser,
Int: NumericParser,
Date: DateParser,

View File

@@ -1,4 +1,5 @@
import escapeRegExp = require('lodash/escapeRegExp');
import last = require('lodash/last');
import memoize = require('lodash/memoize');
import {getDistinctValues, isObject} from 'app/common/gutil';
// Simply importing 'moment-guess' inconsistently imports bundle.js or bundle.esm.js depending on environment
@@ -325,7 +326,26 @@ function standardizeTime(timeString: string): { remaining: string, time: string
return {remaining: timeString.slice(0, match.index).trim(), time: `${hh}:${mm}:${ss}`};
}
export function guessDateFormat(values: Array<string | null>, timezone: string = 'UTC'): string | null {
/**
 * Guesses a full date[time] format that best matches the given strings.
 *
 * Candidates come from `guessDateFormats`. If several formats match equally well,
 * the last one in the returned list is picked, which is the last one lexicographically;
 * this matches the old date guessing and means formats with an early Y and/or M are favoured.
 *
 * @param values - The strings to examine; null entries are permitted.
 * @param timezone - Timezone passed through to `guessDateFormats`; defaults to 'UTC'.
 * @returns The chosen format, or the default "YYYY-MM-DD" when there are no candidates.
 */
export function guessDateFormat(values: Array<string | null>, timezone: string = 'UTC'): string {
  const formats = guessDateFormats(values, timezone);
  // `last` yields undefined for an empty array, so the default covers both a null
  // result and an empty candidate list, and the return is always a real string.
  return (formats && last(formats)) ?? "YYYY-MM-DD";
}
/**
* Returns all full date[time] formats that best match the given strings.
* If several formats match equally well, returns them all.
* May return null if there are no matching formats or choosing one is too expensive.
*/
export function guessDateFormats(values: Array<string | null>, timezone: string = 'UTC'): string[] | null {
const dateStrings: string[] = values.filter(isObject);
const sample = getDistinctValues(dateStrings, 100);
const formats: Record<string, number> = {};
@@ -358,7 +378,9 @@ export function guessDateFormat(values: Array<string | null>, timezone: string =
}
const maxCount = Math.max(...Object.values(formats));
return formatKeys.find(format => formats[format] === maxCount)!;
// Return all formats that tied for first place.
// Sort lexicographically for consistency in tests and with the old dateguess.py.
return formatKeys.filter(format => formats[format] === maxCount).sort();
}
export const dateFormatOptions = [