From 5213972d24b493d120c9d0d77afd6c56967b41d4 Mon Sep 17 00:00:00 2001 From: Alex Hall Date: Thu, 3 Mar 2022 20:48:25 +0200 Subject: [PATCH] (core) Guess numeric formatting options Summary: Change NumberParse.parse to return not just the parsed number but also information it gathered along the way about how the input string was formatted. Use this in the new NumberParse.guessOptions to guess the actual widget options based on an array of strings. Use NumberParse.guessOptions in TypeConversion (for when a user explicitly chooses to change type) and in ValueGuesser (for guesses about strings entered into empty columns). Test Plan: Adds unit tests for NumberParse and ValueGuesser and updates the TypeChange2 nbrowser test. Reviewers: georgegevoian Reviewed By: georgegevoian Differential Revision: https://phab.getgrist.com/D3294 --- app/client/components/TypeConversion.ts | 13 +++ app/common/NumberFormat.ts | 4 +- app/common/NumberParse.ts | 149 ++++++++++++++++++++++-- app/common/ValueGuesser.ts | 30 +++-- app/common/ValueParser.ts | 6 +- 5 files changed, 182 insertions(+), 20 deletions(-) diff --git a/app/client/components/TypeConversion.ts b/app/client/components/TypeConversion.ts index 904d4fc9..b025d91c 100644 --- a/app/client/components/TypeConversion.ts +++ b/app/client/components/TypeConversion.ts @@ -4,11 +4,13 @@ */ // tslint:disable:no-console +import {isString} from 'app/client/lib/sessionObs'; import {DocModel} from 'app/client/models/DocModel'; import {ColumnRec} from 'app/client/models/entities/ColumnRec'; import * as gristTypes from 'app/common/gristTypes'; import {isFullReferencingType} from 'app/common/gristTypes'; import * as gutil from 'app/common/gutil'; +import NumberParse from 'app/common/NumberParse'; import {dateTimeWidgetOptions, guessDateFormat} from 'app/common/parseDate'; import {TableData} from 'app/common/TableData'; import {decodeObject} from 'app/plugin/objtypes'; @@ -108,6 +110,17 @@ export async function prepTransformColInfo(docModel: DocModel, origCol: ColumnRe widgetOptions = dateTimeWidgetOptions(dateFormat, true); break; } + case 'Numeric': + case 'Int': { + if (["Numeric", "Int"].includes(sourceCol.type())) { + widgetOptions = prevOptions; + } else { + const numberParse = NumberParse.fromSettings(docModel.docData.docSettings()); + const colValues = tableData.getColValues(sourceCol.colId()) || []; + widgetOptions = numberParse.guessOptions(colValues.filter(isString)); + } + break; + } case 'Choice': { if (Array.isArray(prevOptions.choices)) { // Use previous choices if they are set, e.g. if converting from ChoiceList diff --git a/app/common/NumberFormat.ts b/app/common/NumberFormat.ts index 99735ea3..ccfc8922 100644 --- a/app/common/NumberFormat.ts +++ b/app/common/NumberFormat.ts @@ -19,12 +19,14 @@ */ import {clamp} from 'app/common/gutil'; +import {StringUnion} from 'app/common/StringUnion'; import * as LocaleCurrency from "locale-currency"; import {FormatOptions} from 'app/common/ValueFormatter'; import {DocumentSettings} from 'app/common/DocumentSettings'; // Options for number formatting. -export type NumMode = 'currency' | 'decimal' | 'percent' | 'scientific'; +export const NumMode = StringUnion('currency', 'decimal', 'percent', 'scientific'); +export type NumMode = typeof NumMode.type; export type NumSign = 'parens'; export interface NumberFormatOptions extends FormatOptions { diff --git a/app/common/NumberParse.ts b/app/common/NumberParse.ts index ab68da65..bda8b1be 100644 --- a/app/common/NumberParse.ts +++ b/app/common/NumberParse.ts @@ -4,7 +4,9 @@ * not tied to documents or anything. */ -import { NumMode, parseNumMode } from 'app/common/NumberFormat'; +import {DocumentSettings} from 'app/common/DocumentSettings'; +import {getDistinctValues} from 'app/common/gutil'; +import {getCurrency, NumberFormatOptions, NumMode, parseNumMode} from 'app/common/NumberFormat'; import escapeRegExp = require('lodash/escapeRegExp'); import last = require('lodash/last'); @@ -30,6 +32,14 @@ function getDigitsMap(locale: string) { return result; } +interface ParsedOptions { + isPercent: boolean; + isCurrency: boolean; + isParenthesised: boolean; + hasDigitGroupSeparator: boolean; + isScientific: boolean; +} + export default class NumberParse { // Regex for whitespace and some control characters we need to remove // 200e = Left-to-right mark @@ -37,6 +47,10 @@ export default class NumberParse { // 061c = Arabic letter mark public static readonly removeCharsRegex = /[\s\u200e\u200f\u061c]/g; + public static fromSettings(docSettings: DocumentSettings, options: NumberFormatOptions = {}) { + return new NumberParse(docSettings.locale, getCurrency(options, docSettings)); + } + // Many attributes are public for easy testing. public readonly currencySymbol: string; public readonly percentageSymbol: string; @@ -45,6 +59,7 @@ export default class NumberParse { public readonly exponentSeparator: string; public readonly decimalSeparator: string; public readonly minusSign: string; + public readonly defaultNumDecimalsCurrency: number; public readonly digitsMap: Map; @@ -58,9 +73,8 @@ export default class NumberParse { private readonly _replaceDigits: (s: string) => string; constructor(locale: string, currency: string) { - const numModes: NumMode[] = ['currency', 'percent', 'scientific', 'decimal']; const parts = new Map(); - for (const numMode of numModes) { + for (const numMode of NumMode.values) { const formatter = Intl.NumberFormat(locale, parseNumMode(numMode, currency)); const formatParts = formatter.formatToParts(-1234567.5678); parts.set(numMode, formatParts); @@ -87,6 +101,10 @@ export default class NumberParse { // A few locales format negative currency amounts ending in '-', e.g. '€ 1,00-' this.currencyEndsInMinusSign = last(parts.get('currency'))!.type === 'minusSign'; + // Default number of fractional digits for currency, + // e.g. this is 2 for USD because 1 is formatted as $1.00 + this.defaultNumDecimalsCurrency = getPart("fraction", "currency")?.length || 0; + // Since JS and Python allow both e and E for scientific notation, it seems fair that other // locales should be case insensitive for this. this._exponentSeparatorRegex = new RegExp(escapeRegExp(this.exponentSeparator), 'i'); @@ -113,11 +131,16 @@ export default class NumberParse { } /** - * Returns a number if the string looks like that number formatted by Grist using this parser's locale and currency - * (or at least close). + * If the string looks like a number formatted by Grist using this parser's locale and currency (or at least close) + * then returns an object where: + * - `result` is that number, the only thing most callers need + * - `cleaned` is a string derived from `value` which can be parsed directly by Number, although `result` + * is still processed a bit further than that, e.g. dividing by 100 for percentages. + * - `options` describes how the number was apparently formatted. + * * Returns null otherwise. */ - public parse(value: string): number | null { + public parse(value: string): { result: number, cleaned: string, options: ParsedOptions } | null { // Remove characters before checking for parentheses on the ends of the string. const [value2, isCurrency] = removeSymbol(value, this.currencySymbol); const [value3, isPercent] = removeSymbol(value2, this.percentageSymbol); @@ -125,8 +148,8 @@ export default class NumberParse { // Remove whitespace and special characters, after currency because some currencies contain spaces. value = value3.replace(NumberParse.removeCharsRegex, ""); - const parenthesised = value[0] === "(" && value[value.length - 1] === ")"; - if (parenthesised) { + const isParenthesised = value[0] === "(" && value[value.length - 1] === ")"; + if (isParenthesised) { value = value.substring(1, value.length - 1); } @@ -144,13 +167,18 @@ export default class NumberParse { // Check for exponent separator before replacing digits // because it can contain locale-specific digits representing '10' as in 'x10^'. + const withExponent = value; value = value.replace(this._exponentSeparatorRegex, "e"); + const isScientific = withExponent !== value; + value = this._replaceDigits(value); // Must come after replacing digits because the regex uses \d // which doesn't work for locale-specific digits. // This simply removes the separators, $1 is a captured group of digits which we keep. + const withSeparators = value; value = value.replace(this._digitGroupSeparatorRegex, "$1"); + const hasDigitGroupSeparator = withSeparators !== value; // Must come after the digit separator replacement // because the digit separator might be '.' @@ -174,7 +202,7 @@ export default class NumberParse { // Parentheses represent a negative number, e.g. (123) -> -123 // (-123) is treated as an error - if (parenthesised) { + if (isParenthesised) { if (result <= 0) { return null; } @@ -185,6 +213,109 @@ export default class NumberParse { result *= 0.01; } + return { + result, + cleaned: value, + options: {isCurrency, isPercent, isParenthesised, hasDigitGroupSeparator, isScientific} + }; + } + + public guessOptions(values: Array): NumberFormatOptions { + // null: undecided + // true: negative numbers should be parenthesised + // false: they should not + let parens: boolean | null = null; + + // If any of the numbers have thousands separators, that's enough to guess that option + let anyHasDigitGroupSeparator = false; + + // Minimum number of decimal places, guessed by looking for trailing 0s after the decimal point + let decimals = 0; + const decimalsRegex = /\.\d+/; + // Maximum number of decimal places. We never actually guess a value for this option, + // but for currencies we need to check if there are fewer decimal places than the default. + let maxDecimals = 0; + + // Keep track of the number of modes seen to pick the most common + const modes = {} as Record; + for (const mode of NumMode.values) { + modes[mode] = 0; + } + + for (const value of getDistinctValues(values)) { + if (!value) { + continue; + } + const parsed = this.parse(value); + if (!parsed) { + continue; + } + const { + result, + cleaned, + options: {isCurrency, isPercent, isParenthesised, hasDigitGroupSeparator, isScientific} + } = parsed; + + if (result < 0 && !isParenthesised) { + // If we see a negative number not surrounded by parens, assume that any other parens mean something else + parens = false; + } else if (parens === null && isParenthesised) { + // If we're still unsure about parens (i.e. the above case hasn't been encountered) + // then one parenthesised number is enough to guess that the parens option should be used. + parens = true; + } + + // If any of the numbers have thousands separators, that's enough to guess that option + anyHasDigitGroupSeparator = anyHasDigitGroupSeparator || hasDigitGroupSeparator; + + let mode: NumMode = "decimal"; + if (isCurrency) { + mode = "currency"; + } else if (isPercent) { + mode = "percent"; + } else if (isScientific) { + mode = "scientific"; + } + modes[mode] += 1; + + const decimalsMatch = decimalsRegex.exec(cleaned); + if (decimalsMatch) { + // Number of digits after the '.' (which is part of the match, hence the -1) + const numDecimals = decimalsMatch[0].length - 1; + maxDecimals = Math.max(maxDecimals, numDecimals); + if (decimalsMatch[0].endsWith("0")) { + decimals = Math.max(decimals, numDecimals); + } + } + } + + const maxCount = Math.max(...Object.values(modes)); + if (maxCount === 0) { + // No numbers parsed at all, so don't guess any options + return {}; + } + + const result: NumberFormatOptions = {}; + + // Find the most common mode. + const maxMode: NumMode = NumMode.values.find((k) => modes[k] === maxCount)!; + + // 'decimal' is the default mode above when counting, + // but only guess it as an actual option if digit separators were used at least once. + if (maxMode !== "decimal" || anyHasDigitGroupSeparator) { + result.numMode = maxMode; + } + + if (parens) { + result.numSign = "parens"; + } + + // Specify minimum number of decimal places if we saw any trailing 0s after '.' + // Otherwise explicitly set it to 0 if needed to suppress the default for that currency. + if (decimals > 0 || maxMode === "currency" && maxDecimals < this.defaultNumDecimalsCurrency) { + result.decimals = decimals; + } + return result; } } diff --git a/app/common/ValueGuesser.ts b/app/common/ValueGuesser.ts index 6820c140..a3b16b1d 100644 --- a/app/common/ValueGuesser.ts +++ b/app/common/ValueGuesser.ts @@ -1,7 +1,11 @@ import {DocData} from 'app/common/DocData'; import {DocumentSettings} from 'app/common/DocumentSettings'; +import {countIf} from 'app/common/gutil'; +import {NumberFormatOptions} from 'app/common/NumberFormat'; +import NumberParse from 'app/common/NumberParse'; import {dateTimeWidgetOptions, guessDateFormat} from 'app/common/parseDate'; import {createFormatter} from 'app/common/ValueFormatter'; +import {createParserRaw, ValueParser} from 'app/common/ValueParser'; import * as moment from 'moment-timezone'; interface GuessedColInfo { @@ -9,7 +13,7 @@ interface GuessedColInfo { widgetOptions?: object; } -interface GuessResult { +export interface GuessResult { values?: any[]; colInfo: GuessedColInfo; } @@ -39,7 +43,8 @@ abstract class ValueGuesser { const {type, widgetOptions} = colInfo; const formatter = createFormatter(type, widgetOptions || {}, docSettings); const result: any[] = []; - const maxUnparsed = values.length * 0.1; // max number of non-parsed strings to allow before giving up + // max number of non-parsed strings to allow before giving up + const maxUnparsed = countIf(values, v => Boolean(v)) * 0.1; let unparsed = 0; for (const value of values) { @@ -94,14 +99,22 @@ class BoolGuesser extends ValueGuesser { } class NumericGuesser extends ValueGuesser { + private _parser: ValueParser; + constructor(docSettings: DocumentSettings, private _options: NumberFormatOptions) { + super(); + this._parser = createParserRaw('Numeric', _options, docSettings); + } + public colInfo(): GuessedColInfo { - // TODO parse and guess options for formatted numbers, e.g. currency amounts - return {type: 'Numeric'}; + const result: GuessedColInfo = {type: 'Numeric'}; + if (Object.keys(this._options).length) { + result.widgetOptions = this._options; + } + return result; } public parse(value: string): number | string { - const parsed = Number(value); - return !isNaN(parsed) ? parsed : value; + return this._parser.cleanParse(value); } } @@ -144,7 +157,10 @@ export function guessColInfo( return ( new BoolGuesser() .guess(values, docSettings) || - new NumericGuesser() + new NumericGuesser( + docSettings, + NumberParse.fromSettings(docSettings).guessOptions(values) + ) .guess(values, docSettings) || new DateGuesser(guessDateFormat(values, timezone) || "YYYY-MM-DD", timezone) .guess(values, docSettings) || diff --git a/app/common/ValueParser.ts b/app/common/ValueParser.ts index 9b0e4afb..519b1fd0 100644 --- a/app/common/ValueParser.ts +++ b/app/common/ValueParser.ts @@ -6,7 +6,7 @@ import * as gristTypes from 'app/common/gristTypes'; import {getReferencedTableId, isFullReferencingType} from 'app/common/gristTypes'; import * as gutil from 'app/common/gutil'; import {safeJsonParse} from 'app/common/gutil'; -import {getCurrency, NumberFormatOptions} from 'app/common/NumberFormat'; +import {NumberFormatOptions} from 'app/common/NumberFormat'; import NumberParse from 'app/common/NumberParse'; import {parseDateStrict, parseDateTime} from 'app/common/parseDate'; import {MetaRowRecord, TableData} from 'app/common/TableData'; @@ -53,11 +53,11 @@ export class NumericParser extends ValueParser { constructor(type: string, options: NumberFormatOptions, docSettings: DocumentSettings) { super(type, options, docSettings); - this._parse = new NumberParse(docSettings.locale, getCurrency(options, docSettings)); + this._parse = NumberParse.fromSettings(docSettings, options); } public parse(value: string): number | null { - return this._parse.parse(value); + return this._parse.parse(value)?.result ?? null; } }