gristlabs_grist-core/app/common/NumberParse.ts
Alex Hall 5213972d24 (core) Guess numeric formatting options
Summary:
Change NumberParse.parse to return not just the parsed number but also information it gathered along the way about how the input string was formatted.

Use this in the new NumberParse.guessOptions to guess the actual widget options based on an array of strings.

Use NumberParse.guessOptions in TypeConversion (for when a user explicitly chooses to change type) and in ValueGuesser (for guesses about strings entered into empty columns).

Test Plan: Adds unit tests for NumberParse and ValueGuesser and updates the TypeChange2 nbrowser test.

Reviewers: georgegevoian

Reviewed By: georgegevoian

Differential Revision: https://phab.getgrist.com/D3294
2022-03-03 21:32:03 +02:00

333 lines
13 KiB
TypeScript

/**
* Counterpart of NumberFormat.ts.
* Generic functionality for parsing numbers formatted by Intl.NumberFormat,
* not tied to documents or anything.
*/
import {DocumentSettings} from 'app/common/DocumentSettings';
import {getDistinctValues} from 'app/common/gutil';
import {getCurrency, NumberFormatOptions, NumMode, parseNumMode} from 'app/common/NumberFormat';
import escapeRegExp = require('lodash/escapeRegExp');
import last = require('lodash/last');
// Possible values of Intl.NumberFormat.formatToParts[i].type
// Seems Intl.NumberFormatPartTypes is not quite complete
type NumberFormatPartTypes = Intl.NumberFormatPartTypes | 'exponentSeparator';
/**
* Returns a map converting the decimal digits used in the given formatter
* to the digits 0123456789.
* Excludes digits which don't need conversion, so for many locales this is empty.
*/
function getDigitsMap(locale: string) {
const formatter = Intl.NumberFormat(locale);
const result = new Map<string, string>();
for (let i = 0; i < 10; i++) {
const digit = String(i);
const localeDigit = formatter.format(i);
if (localeDigit !== digit) {
result.set(localeDigit, digit);
}
}
return result;
}
interface ParsedOptions {
isPercent: boolean;
isCurrency: boolean;
isParenthesised: boolean;
hasDigitGroupSeparator: boolean;
isScientific: boolean;
}
export default class NumberParse {
// Regex for whitespace and some control characters we need to remove
// 200e = Left-to-right mark
// 200f = Right-to-left mark
// 061c = Arabic letter mark
public static readonly removeCharsRegex = /[\s\u200e\u200f\u061c]/g;
public static fromSettings(docSettings: DocumentSettings, options: NumberFormatOptions = {}) {
return new NumberParse(docSettings.locale, getCurrency(options, docSettings));
}
// Many attributes are public for easy testing.
public readonly currencySymbol: string;
public readonly percentageSymbol: string;
public readonly digitGroupSeparator: string;
public readonly digitGroupSeparatorCurrency: string;
public readonly exponentSeparator: string;
public readonly decimalSeparator: string;
public readonly minusSign: string;
public readonly defaultNumDecimalsCurrency: number;
public readonly digitsMap: Map<string, string>;
public readonly currencyEndsInMinusSign: boolean;
private readonly _exponentSeparatorRegex: RegExp;
private readonly _digitGroupSeparatorRegex: RegExp;
// Function which replaces keys of digitsMap (i.e. locale-specific digits)
// with corresponding digits from 0123456789.
private readonly _replaceDigits: (s: string) => string;
constructor(locale: string, currency: string) {
const parts = new Map<NumMode, Intl.NumberFormatPart[]>();
for (const numMode of NumMode.values) {
const formatter = Intl.NumberFormat(locale, parseNumMode(numMode, currency));
const formatParts = formatter.formatToParts(-1234567.5678);
parts.set(numMode, formatParts);
}
function getPart(partType: NumberFormatPartTypes, numMode: NumMode = "decimal"): string {
const part = parts.get(numMode)!.find(p => p.type === partType);
// Only time we expect `part` to be undefined is for digitGroupSeparatorCurrency
return part?.value || '';
}
this.currencySymbol = getPart('currency', 'currency');
this.percentageSymbol = getPart('percentSign', 'percent');
this.exponentSeparator = getPart('exponentSeparator', 'scientific');
this.minusSign = getPart('minusSign');
this.decimalSeparator = getPart('decimal');
// Separators for groups of digits, typically groups of 3, i.e. 'thousands separators'.
// A few locales have different separators for currency and non-currency.
// We check for both but don't check which one is used, currency or not.
this.digitGroupSeparator = getPart('group');
this.digitGroupSeparatorCurrency = getPart('group', 'currency');
// A few locales format negative currency amounts ending in '-', e.g. '€ 1,00-'
this.currencyEndsInMinusSign = last(parts.get('currency'))!.type === 'minusSign';
// Default number of fractional digits for currency,
// e.g. this is 2 for USD because 1 is formatted as $1.00
this.defaultNumDecimalsCurrency = getPart("fraction", "currency")?.length || 0;
// Since JS and Python allow both e and E for scientific notation, it seems fair that other
// locales should be case insensitive for this.
this._exponentSeparatorRegex = new RegExp(escapeRegExp(this.exponentSeparator), 'i');
// Overall the parser is quite lax about digit separators.
// We only require that the separator is followed by at least 2 digits,
// because India groups digits in pairs after the first 3.
// More careful checking is probably more complicated than is worth it.
this._digitGroupSeparatorRegex = new RegExp(
`[${escapeRegExp(
this.digitGroupSeparator +
this.digitGroupSeparatorCurrency
)}](\\d\\d)`,
'g'
);
const digitsMap = this.digitsMap = getDigitsMap(locale);
if (digitsMap.size === 0) {
this._replaceDigits = (s: string) => s;
} else {
const digitsRegex = new RegExp([...digitsMap.keys()].join("|"), "g");
this._replaceDigits = (s: string) => s.replace(digitsRegex, d => digitsMap.get(d) || d);
}
}
/**
* If the string looks like a number formatted by Grist using this parser's locale and currency (or at least close)
* then returns an object where:
* - `result` is that number, the only thing most callers need
* - `cleaned` is a string derived from `value` which can be parsed directly by Number, although `result`
* is still processed a bit further than that, e.g. dividing by 100 for percentages.
* - `options` describes how the number was apparently formatted.
*
* Returns null otherwise.
*/
public parse(value: string): { result: number, cleaned: string, options: ParsedOptions } | null {
// Remove characters before checking for parentheses on the ends of the string.
const [value2, isCurrency] = removeSymbol(value, this.currencySymbol);
const [value3, isPercent] = removeSymbol(value2, this.percentageSymbol);
// Remove whitespace and special characters, after currency because some currencies contain spaces.
value = value3.replace(NumberParse.removeCharsRegex, "");
const isParenthesised = value[0] === "(" && value[value.length - 1] === ")";
if (isParenthesised) {
value = value.substring(1, value.length - 1);
}
// Must check for empty string directly because Number('') is 0 :facepalm:
// Check early so we can return early for performance.
// Nothing after this should potentially produce an empty string.
if (value === '') {
return null;
}
// Replace various symbols with the standard versions recognised by JS Number.
// Note that this also allows the 'standard' symbols ('e', '.', '-', and '0123456789')
// even if the locale doesn't use them when formatting,
// although '.' will still be removed if it's a digit separator.
// Check for exponent separator before replacing digits
// because it can contain locale-specific digits representing '10' as in 'x10^'.
const withExponent = value;
value = value.replace(this._exponentSeparatorRegex, "e");
const isScientific = withExponent !== value;
value = this._replaceDigits(value);
// Must come after replacing digits because the regex uses \d
// which doesn't work for locale-specific digits.
// This simply removes the separators, $1 is a captured group of digits which we keep.
const withSeparators = value;
value = value.replace(this._digitGroupSeparatorRegex, "$1");
const hasDigitGroupSeparator = withSeparators !== value;
// Must come after the digit separator replacement
// because the digit separator might be '.'
value = value.replace(this.decimalSeparator, '.');
// .replace with a string only replaces once,
// and a number can contain two minus signs when using scientific notation
value = value.replace(this.minusSign, "-");
value = value.replace(this.minusSign, "-");
// Move '-' from the end to the beginning when appropriate (which is rare)
if (isCurrency && this.currencyEndsInMinusSign && value.endsWith("-")) {
value = "-" + value.substring(0, value.length - 1);
}
// Number is more strict than parseFloat which allows extra trailing characters.
let result = Number(value);
if (isNaN(result)) {
return null;
}
// Parentheses represent a negative number, e.g. (123) -> -123
// (-123) is treated as an error
if (isParenthesised) {
if (result <= 0) {
return null;
}
result = -result;
}
if (isPercent) {
result *= 0.01;
}
return {
result,
cleaned: value,
options: {isCurrency, isPercent, isParenthesised, hasDigitGroupSeparator, isScientific}
};
}
public guessOptions(values: Array<string | null>): NumberFormatOptions {
// null: undecided
// true: negative numbers should be parenthesised
// false: they should not
let parens: boolean | null = null;
// If any of the numbers have thousands separators, that's enough to guess that option
let anyHasDigitGroupSeparator = false;
// Minimum number of decimal places, guessed by looking for trailing 0s after the decimal point
let decimals = 0;
const decimalsRegex = /\.\d+/;
// Maximum number of decimal places. We never actually guess a value for this option,
// but for currencies we need to check if there are fewer decimal places than the default.
let maxDecimals = 0;
// Keep track of the number of modes seen to pick the most common
const modes = {} as Record<NumMode, number>;
for (const mode of NumMode.values) {
modes[mode] = 0;
}
for (const value of getDistinctValues(values)) {
if (!value) {
continue;
}
const parsed = this.parse(value);
if (!parsed) {
continue;
}
const {
result,
cleaned,
options: {isCurrency, isPercent, isParenthesised, hasDigitGroupSeparator, isScientific}
} = parsed;
if (result < 0 && !isParenthesised) {
// If we see a negative number not surrounded by parens, assume that any other parens mean something else
parens = false;
} else if (parens === null && isParenthesised) {
// If we're still unsure about parens (i.e. the above case hasn't been encountered)
// then one parenthesised number is enough to guess that the parens option should be used.
parens = true;
}
// If any of the numbers have thousands separators, that's enough to guess that option
anyHasDigitGroupSeparator = anyHasDigitGroupSeparator || hasDigitGroupSeparator;
let mode: NumMode = "decimal";
if (isCurrency) {
mode = "currency";
} else if (isPercent) {
mode = "percent";
} else if (isScientific) {
mode = "scientific";
}
modes[mode] += 1;
const decimalsMatch = decimalsRegex.exec(cleaned);
if (decimalsMatch) {
// Number of digits after the '.' (which is part of the match, hence the -1)
const numDecimals = decimalsMatch[0].length - 1;
maxDecimals = Math.max(maxDecimals, numDecimals);
if (decimalsMatch[0].endsWith("0")) {
decimals = Math.max(decimals, numDecimals);
}
}
}
const maxCount = Math.max(...Object.values(modes));
if (maxCount === 0) {
// No numbers parsed at all, so don't guess any options
return {};
}
const result: NumberFormatOptions = {};
// Find the most common mode.
const maxMode: NumMode = NumMode.values.find((k) => modes[k] === maxCount)!;
// 'decimal' is the default mode above when counting,
// but only guess it as an actual option if digit separators were used at least once.
if (maxMode !== "decimal" || anyHasDigitGroupSeparator) {
result.numMode = maxMode;
}
if (parens) {
result.numSign = "parens";
}
// Specify minimum number of decimal places if we saw any trailing 0s after '.'
// Otherwise explicitly set it to 0 if needed to suppress the default for that currency.
if (decimals > 0 || maxMode === "currency" && maxDecimals < this.defaultNumDecimalsCurrency) {
result.decimals = decimals;
}
return result;
}
}
/**
* Returns a tuple [removed, wasPresent]
* - `removed` is the given string `value` with `symbol` removed at most once.
* - `wasPresent` is `true` if `symbol` was present in `value` and was thus removed.
*/
function removeSymbol(value: string, symbol: string): [string, boolean] {
const removed = value.replace(symbol, "");
const wasPresent = removed.length < value.length;
return [removed, wasPresent];
}