(core) Guess numeric formatting options

Summary:
Change NumberParse.parse to return not just the parsed number but also information it gathered along the way about how the input string was formatted.

Use this in the new NumberParse.guessOptions to guess the actual widget options based on an array of strings.

Use NumberParse.guessOptions in TypeConversion (for when a user explicitly chooses to change type) and in ValueGuesser (for guesses about strings entered into empty columns).

Test Plan: Adds unit tests for NumberParse and ValueGuesser and updates the TypeChange2 nbrowser test.

Reviewers: georgegevoian

Reviewed By: georgegevoian

Differential Revision: https://phab.getgrist.com/D3294
This commit is contained in:
Alex Hall 2022-03-03 20:48:25 +02:00
parent cc1af85594
commit 5213972d24
5 changed files with 182 additions and 20 deletions

View File

@ -4,11 +4,13 @@
*/ */
// tslint:disable:no-console // tslint:disable:no-console
import {isString} from 'app/client/lib/sessionObs';
import {DocModel} from 'app/client/models/DocModel'; import {DocModel} from 'app/client/models/DocModel';
import {ColumnRec} from 'app/client/models/entities/ColumnRec'; import {ColumnRec} from 'app/client/models/entities/ColumnRec';
import * as gristTypes from 'app/common/gristTypes'; import * as gristTypes from 'app/common/gristTypes';
import {isFullReferencingType} from 'app/common/gristTypes'; import {isFullReferencingType} from 'app/common/gristTypes';
import * as gutil from 'app/common/gutil'; import * as gutil from 'app/common/gutil';
import NumberParse from 'app/common/NumberParse';
import {dateTimeWidgetOptions, guessDateFormat} from 'app/common/parseDate'; import {dateTimeWidgetOptions, guessDateFormat} from 'app/common/parseDate';
import {TableData} from 'app/common/TableData'; import {TableData} from 'app/common/TableData';
import {decodeObject} from 'app/plugin/objtypes'; import {decodeObject} from 'app/plugin/objtypes';
@ -108,6 +110,17 @@ export async function prepTransformColInfo(docModel: DocModel, origCol: ColumnRe
widgetOptions = dateTimeWidgetOptions(dateFormat, true); widgetOptions = dateTimeWidgetOptions(dateFormat, true);
break; break;
} }
case 'Numeric':
case 'Int': {
if (["Numeric", "Int"].includes(sourceCol.type())) {
widgetOptions = prevOptions;
} else {
const numberParse = NumberParse.fromSettings(docModel.docData.docSettings());
const colValues = tableData.getColValues(sourceCol.colId()) || [];
widgetOptions = numberParse.guessOptions(colValues.filter(isString));
}
break;
}
case 'Choice': { case 'Choice': {
if (Array.isArray(prevOptions.choices)) { if (Array.isArray(prevOptions.choices)) {
// Use previous choices if they are set, e.g. if converting from ChoiceList // Use previous choices if they are set, e.g. if converting from ChoiceList

View File

@ -19,12 +19,14 @@
*/ */
import {clamp} from 'app/common/gutil'; import {clamp} from 'app/common/gutil';
import {StringUnion} from 'app/common/StringUnion';
import * as LocaleCurrency from "locale-currency"; import * as LocaleCurrency from "locale-currency";
import {FormatOptions} from 'app/common/ValueFormatter'; import {FormatOptions} from 'app/common/ValueFormatter';
import {DocumentSettings} from 'app/common/DocumentSettings'; import {DocumentSettings} from 'app/common/DocumentSettings';
// Options for number formatting. // Options for number formatting.
export type NumMode = 'currency' | 'decimal' | 'percent' | 'scientific'; export const NumMode = StringUnion('currency', 'decimal', 'percent', 'scientific');
export type NumMode = typeof NumMode.type;
export type NumSign = 'parens'; export type NumSign = 'parens';
export interface NumberFormatOptions extends FormatOptions { export interface NumberFormatOptions extends FormatOptions {

View File

@ -4,7 +4,9 @@
* not tied to documents or anything. * not tied to documents or anything.
*/ */
import { NumMode, parseNumMode } from 'app/common/NumberFormat'; import {DocumentSettings} from 'app/common/DocumentSettings';
import {getDistinctValues} from 'app/common/gutil';
import {getCurrency, NumberFormatOptions, NumMode, parseNumMode} from 'app/common/NumberFormat';
import escapeRegExp = require('lodash/escapeRegExp'); import escapeRegExp = require('lodash/escapeRegExp');
import last = require('lodash/last'); import last = require('lodash/last');
@ -30,6 +32,14 @@ function getDigitsMap(locale: string) {
return result; return result;
} }
interface ParsedOptions {
isPercent: boolean;
isCurrency: boolean;
isParenthesised: boolean;
hasDigitGroupSeparator: boolean;
isScientific: boolean;
}
export default class NumberParse { export default class NumberParse {
// Regex for whitespace and some control characters we need to remove // Regex for whitespace and some control characters we need to remove
// 200e = Left-to-right mark // 200e = Left-to-right mark
@ -37,6 +47,10 @@ export default class NumberParse {
// 061c = Arabic letter mark // 061c = Arabic letter mark
public static readonly removeCharsRegex = /[\s\u200e\u200f\u061c]/g; public static readonly removeCharsRegex = /[\s\u200e\u200f\u061c]/g;
public static fromSettings(docSettings: DocumentSettings, options: NumberFormatOptions = {}) {
return new NumberParse(docSettings.locale, getCurrency(options, docSettings));
}
// Many attributes are public for easy testing. // Many attributes are public for easy testing.
public readonly currencySymbol: string; public readonly currencySymbol: string;
public readonly percentageSymbol: string; public readonly percentageSymbol: string;
@ -45,6 +59,7 @@ export default class NumberParse {
public readonly exponentSeparator: string; public readonly exponentSeparator: string;
public readonly decimalSeparator: string; public readonly decimalSeparator: string;
public readonly minusSign: string; public readonly minusSign: string;
public readonly defaultNumDecimalsCurrency: number;
public readonly digitsMap: Map<string, string>; public readonly digitsMap: Map<string, string>;
@ -58,9 +73,8 @@ export default class NumberParse {
private readonly _replaceDigits: (s: string) => string; private readonly _replaceDigits: (s: string) => string;
constructor(locale: string, currency: string) { constructor(locale: string, currency: string) {
const numModes: NumMode[] = ['currency', 'percent', 'scientific', 'decimal'];
const parts = new Map<NumMode, Intl.NumberFormatPart[]>(); const parts = new Map<NumMode, Intl.NumberFormatPart[]>();
for (const numMode of numModes) { for (const numMode of NumMode.values) {
const formatter = Intl.NumberFormat(locale, parseNumMode(numMode, currency)); const formatter = Intl.NumberFormat(locale, parseNumMode(numMode, currency));
const formatParts = formatter.formatToParts(-1234567.5678); const formatParts = formatter.formatToParts(-1234567.5678);
parts.set(numMode, formatParts); parts.set(numMode, formatParts);
@ -87,6 +101,10 @@ export default class NumberParse {
// A few locales format negative currency amounts ending in '-', e.g. '€ 1,00-' // A few locales format negative currency amounts ending in '-', e.g. '€ 1,00-'
this.currencyEndsInMinusSign = last(parts.get('currency'))!.type === 'minusSign'; this.currencyEndsInMinusSign = last(parts.get('currency'))!.type === 'minusSign';
// Default number of fractional digits for currency,
// e.g. this is 2 for USD because 1 is formatted as $1.00
this.defaultNumDecimalsCurrency = getPart("fraction", "currency")?.length || 0;
// Since JS and Python allow both e and E for scientific notation, it seems fair that other // Since JS and Python allow both e and E for scientific notation, it seems fair that other
// locales should be case insensitive for this. // locales should be case insensitive for this.
this._exponentSeparatorRegex = new RegExp(escapeRegExp(this.exponentSeparator), 'i'); this._exponentSeparatorRegex = new RegExp(escapeRegExp(this.exponentSeparator), 'i');
@ -113,11 +131,16 @@ export default class NumberParse {
} }
/** /**
* Returns a number if the string looks like that number formatted by Grist using this parser's locale and currency * If the string looks like a number formatted by Grist using this parser's locale and currency (or at least close)
* (or at least close). * then returns an object where:
* - `result` is that number, the only thing most callers need
* - `cleaned` is a string derived from `value` which can be parsed directly by Number, although `result`
* is still processed a bit further than that, e.g. dividing by 100 for percentages.
* - `options` describes how the number was apparently formatted.
*
* Returns null otherwise. * Returns null otherwise.
*/ */
public parse(value: string): number | null { public parse(value: string): { result: number, cleaned: string, options: ParsedOptions } | null {
// Remove characters before checking for parentheses on the ends of the string. // Remove characters before checking for parentheses on the ends of the string.
const [value2, isCurrency] = removeSymbol(value, this.currencySymbol); const [value2, isCurrency] = removeSymbol(value, this.currencySymbol);
const [value3, isPercent] = removeSymbol(value2, this.percentageSymbol); const [value3, isPercent] = removeSymbol(value2, this.percentageSymbol);
@ -125,8 +148,8 @@ export default class NumberParse {
// Remove whitespace and special characters, after currency because some currencies contain spaces. // Remove whitespace and special characters, after currency because some currencies contain spaces.
value = value3.replace(NumberParse.removeCharsRegex, ""); value = value3.replace(NumberParse.removeCharsRegex, "");
const parenthesised = value[0] === "(" && value[value.length - 1] === ")"; const isParenthesised = value[0] === "(" && value[value.length - 1] === ")";
if (parenthesised) { if (isParenthesised) {
value = value.substring(1, value.length - 1); value = value.substring(1, value.length - 1);
} }
@ -144,13 +167,18 @@ export default class NumberParse {
// Check for exponent separator before replacing digits // Check for exponent separator before replacing digits
// because it can contain locale-specific digits representing '10' as in 'x10^'. // because it can contain locale-specific digits representing '10' as in 'x10^'.
const withExponent = value;
value = value.replace(this._exponentSeparatorRegex, "e"); value = value.replace(this._exponentSeparatorRegex, "e");
const isScientific = withExponent !== value;
value = this._replaceDigits(value); value = this._replaceDigits(value);
// Must come after replacing digits because the regex uses \d // Must come after replacing digits because the regex uses \d
// which doesn't work for locale-specific digits. // which doesn't work for locale-specific digits.
// This simply removes the separators, $1 is a captured group of digits which we keep. // This simply removes the separators, $1 is a captured group of digits which we keep.
const withSeparators = value;
value = value.replace(this._digitGroupSeparatorRegex, "$1"); value = value.replace(this._digitGroupSeparatorRegex, "$1");
const hasDigitGroupSeparator = withSeparators !== value;
// Must come after the digit separator replacement // Must come after the digit separator replacement
// because the digit separator might be '.' // because the digit separator might be '.'
@ -174,7 +202,7 @@ export default class NumberParse {
// Parentheses represent a negative number, e.g. (123) -> -123 // Parentheses represent a negative number, e.g. (123) -> -123
// (-123) is treated as an error // (-123) is treated as an error
if (parenthesised) { if (isParenthesised) {
if (result <= 0) { if (result <= 0) {
return null; return null;
} }
@ -185,6 +213,109 @@ export default class NumberParse {
result *= 0.01; result *= 0.01;
} }
return {
result,
cleaned: value,
options: {isCurrency, isPercent, isParenthesised, hasDigitGroupSeparator, isScientific}
};
}
public guessOptions(values: Array<string | null>): NumberFormatOptions {
// null: undecided
// true: negative numbers should be parenthesised
// false: they should not
let parens: boolean | null = null;
// If any of the numbers have thousands separators, that's enough to guess that option
let anyHasDigitGroupSeparator = false;
// Minimum number of decimal places, guessed by looking for trailing 0s after the decimal point
let decimals = 0;
const decimalsRegex = /\.\d+/;
// Maximum number of decimal places. We never actually guess a value for this option,
// but for currencies we need to check if there are fewer decimal places than the default.
let maxDecimals = 0;
// Keep track of the number of modes seen to pick the most common
const modes = {} as Record<NumMode, number>;
for (const mode of NumMode.values) {
modes[mode] = 0;
}
for (const value of getDistinctValues(values)) {
if (!value) {
continue;
}
const parsed = this.parse(value);
if (!parsed) {
continue;
}
const {
result,
cleaned,
options: {isCurrency, isPercent, isParenthesised, hasDigitGroupSeparator, isScientific}
} = parsed;
if (result < 0 && !isParenthesised) {
// If we see a negative number not surrounded by parens, assume that any other parens mean something else
parens = false;
} else if (parens === null && isParenthesised) {
// If we're still unsure about parens (i.e. the above case hasn't been encountered)
// then one parenthesised number is enough to guess that the parens option should be used.
parens = true;
}
// If any of the numbers have thousands separators, that's enough to guess that option
anyHasDigitGroupSeparator = anyHasDigitGroupSeparator || hasDigitGroupSeparator;
let mode: NumMode = "decimal";
if (isCurrency) {
mode = "currency";
} else if (isPercent) {
mode = "percent";
} else if (isScientific) {
mode = "scientific";
}
modes[mode] += 1;
const decimalsMatch = decimalsRegex.exec(cleaned);
if (decimalsMatch) {
// Number of digits after the '.' (which is part of the match, hence the -1)
const numDecimals = decimalsMatch[0].length - 1;
maxDecimals = Math.max(maxDecimals, numDecimals);
if (decimalsMatch[0].endsWith("0")) {
decimals = Math.max(decimals, numDecimals);
}
}
}
const maxCount = Math.max(...Object.values(modes));
if (maxCount === 0) {
// No numbers parsed at all, so don't guess any options
return {};
}
const result: NumberFormatOptions = {};
// Find the most common mode.
const maxMode: NumMode = NumMode.values.find((k) => modes[k] === maxCount)!;
// 'decimal' is the default mode above when counting,
// but only guess it as an actual option if digit separators were used at least once.
if (maxMode !== "decimal" || anyHasDigitGroupSeparator) {
result.numMode = maxMode;
}
if (parens) {
result.numSign = "parens";
}
// Specify minimum number of decimal places if we saw any trailing 0s after '.'
// Otherwise explicitly set it to 0 if needed to suppress the default for that currency.
if (decimals > 0 || maxMode === "currency" && maxDecimals < this.defaultNumDecimalsCurrency) {
result.decimals = decimals;
}
return result; return result;
} }
} }

View File

@ -1,7 +1,11 @@
import {DocData} from 'app/common/DocData'; import {DocData} from 'app/common/DocData';
import {DocumentSettings} from 'app/common/DocumentSettings'; import {DocumentSettings} from 'app/common/DocumentSettings';
import {countIf} from 'app/common/gutil';
import {NumberFormatOptions} from 'app/common/NumberFormat';
import NumberParse from 'app/common/NumberParse';
import {dateTimeWidgetOptions, guessDateFormat} from 'app/common/parseDate'; import {dateTimeWidgetOptions, guessDateFormat} from 'app/common/parseDate';
import {createFormatter} from 'app/common/ValueFormatter'; import {createFormatter} from 'app/common/ValueFormatter';
import {createParserRaw, ValueParser} from 'app/common/ValueParser';
import * as moment from 'moment-timezone'; import * as moment from 'moment-timezone';
interface GuessedColInfo { interface GuessedColInfo {
@ -9,7 +13,7 @@ interface GuessedColInfo {
widgetOptions?: object; widgetOptions?: object;
} }
interface GuessResult { export interface GuessResult {
values?: any[]; values?: any[];
colInfo: GuessedColInfo; colInfo: GuessedColInfo;
} }
@ -39,7 +43,8 @@ abstract class ValueGuesser<T> {
const {type, widgetOptions} = colInfo; const {type, widgetOptions} = colInfo;
const formatter = createFormatter(type, widgetOptions || {}, docSettings); const formatter = createFormatter(type, widgetOptions || {}, docSettings);
const result: any[] = []; const result: any[] = [];
const maxUnparsed = values.length * 0.1; // max number of non-parsed strings to allow before giving up // max number of non-parsed strings to allow before giving up
const maxUnparsed = countIf(values, v => Boolean(v)) * 0.1;
let unparsed = 0; let unparsed = 0;
for (const value of values) { for (const value of values) {
@ -94,14 +99,22 @@ class BoolGuesser extends ValueGuesser<boolean> {
} }
class NumericGuesser extends ValueGuesser<number> { class NumericGuesser extends ValueGuesser<number> {
private _parser: ValueParser;
constructor(docSettings: DocumentSettings, private _options: NumberFormatOptions) {
super();
this._parser = createParserRaw('Numeric', _options, docSettings);
}
public colInfo(): GuessedColInfo { public colInfo(): GuessedColInfo {
// TODO parse and guess options for formatted numbers, e.g. currency amounts const result: GuessedColInfo = {type: 'Numeric'};
return {type: 'Numeric'}; if (Object.keys(this._options).length) {
result.widgetOptions = this._options;
}
return result;
} }
public parse(value: string): number | string { public parse(value: string): number | string {
const parsed = Number(value); return this._parser.cleanParse(value);
return !isNaN(parsed) ? parsed : value;
} }
} }
@ -144,7 +157,10 @@ export function guessColInfo(
return ( return (
new BoolGuesser() new BoolGuesser()
.guess(values, docSettings) || .guess(values, docSettings) ||
new NumericGuesser() new NumericGuesser(
docSettings,
NumberParse.fromSettings(docSettings).guessOptions(values)
)
.guess(values, docSettings) || .guess(values, docSettings) ||
new DateGuesser(guessDateFormat(values, timezone) || "YYYY-MM-DD", timezone) new DateGuesser(guessDateFormat(values, timezone) || "YYYY-MM-DD", timezone)
.guess(values, docSettings) || .guess(values, docSettings) ||

View File

@ -6,7 +6,7 @@ import * as gristTypes from 'app/common/gristTypes';
import {getReferencedTableId, isFullReferencingType} from 'app/common/gristTypes'; import {getReferencedTableId, isFullReferencingType} from 'app/common/gristTypes';
import * as gutil from 'app/common/gutil'; import * as gutil from 'app/common/gutil';
import {safeJsonParse} from 'app/common/gutil'; import {safeJsonParse} from 'app/common/gutil';
import {getCurrency, NumberFormatOptions} from 'app/common/NumberFormat'; import {NumberFormatOptions} from 'app/common/NumberFormat';
import NumberParse from 'app/common/NumberParse'; import NumberParse from 'app/common/NumberParse';
import {parseDateStrict, parseDateTime} from 'app/common/parseDate'; import {parseDateStrict, parseDateTime} from 'app/common/parseDate';
import {MetaRowRecord, TableData} from 'app/common/TableData'; import {MetaRowRecord, TableData} from 'app/common/TableData';
@ -53,11 +53,11 @@ export class NumericParser extends ValueParser {
constructor(type: string, options: NumberFormatOptions, docSettings: DocumentSettings) { constructor(type: string, options: NumberFormatOptions, docSettings: DocumentSettings) {
super(type, options, docSettings); super(type, options, docSettings);
this._parse = new NumberParse(docSettings.locale, getCurrency(options, docSettings)); this._parse = NumberParse.fromSettings(docSettings, options);
} }
public parse(value: string): number | null { public parse(value: string): number | null {
return this._parse.parse(value); return this._parse.parse(value)?.result ?? null;
} }
} }