2021-10-19 18:59:13 +00:00
|
|
|
/**
|
|
|
|
* Counterpart of NumberFormat.ts.
|
|
|
|
* Generic functionality for parsing numbers formatted by Intl.NumberFormat,
|
|
|
|
* not tied to documents or anything.
|
|
|
|
*/
|
|
|
|
|
2022-03-03 18:48:25 +00:00
|
|
|
import {DocumentSettings} from 'app/common/DocumentSettings';
|
|
|
|
import {getDistinctValues} from 'app/common/gutil';
|
|
|
|
import {getCurrency, NumberFormatOptions, NumMode, parseNumMode} from 'app/common/NumberFormat';
|
2024-01-30 03:15:34 +00:00
|
|
|
import {buildNumberFormat} from 'app/common/NumberFormat';
|
2021-10-19 18:59:13 +00:00
|
|
|
import escapeRegExp = require('lodash/escapeRegExp');
|
|
|
|
import last = require('lodash/last');
|
|
|
|
|
|
|
|
// The `type` values that can appear on the parts returned by
// Intl.NumberFormat.formatToParts. The built-in Intl.NumberFormatPartTypes
// union appears to be incomplete, so 'exponentSeparator' is added by hand.
type NumberFormatPartTypes = 'exponentSeparator' | Intl.NumberFormatPartTypes;
|
|
|
|
|
|
|
|
/**
 * Builds a lookup table from the decimal digits produced by the default
 * number formatter of `locale` to the corresponding ASCII digits 0-9.
 *
 * Only digits that actually differ from their ASCII form are included,
 * so the returned map is empty for locales that already use 0123456789.
 */
function getDigitsMap(locale: string) {
  const numberFormat = Intl.NumberFormat(locale);
  const conversions = new Map<string, string>();
  "0123456789".split("").forEach((asciiDigit, n) => {
    // Format each single-digit number and remember it if the locale renders it differently.
    const formatted = numberFormat.format(n);
    if (formatted !== asciiDigit) {
      conversions.set(formatted, asciiDigit);
    }
  });
  return conversions;
}
|
|
|
|
|
2022-03-03 18:48:25 +00:00
|
|
|
/**
 * Flags describing how a parsed string was apparently formatted.
 */
interface ParsedOptions {
  /** The currency symbol was found in the string. */
  isCurrency: boolean;
  /** The percentage symbol was found in the string. */
  isPercent: boolean;
  /** The exponent separator (scientific notation) was found in the string. */
  isScientific: boolean;
  /** The number was wrapped in parentheses, i.e. `(123)`. */
  isParenthesised: boolean;
  /** At least one digit group ('thousands') separator was found and removed. */
  hasDigitGroupSeparator: boolean;
}
|
|
|
|
|
2021-10-19 18:59:13 +00:00
|
|
|
/**
 * Parses strings that look like numbers formatted with Intl.NumberFormat for a
 * given locale and currency, and guesses number format options from samples.
 *
 * The locale-specific symbols (currency, percent, separators, minus sign, digits)
 * are discovered once in the constructor by formatting a sample number in every
 * NumMode and inspecting the resulting parts.
 */
export default class NumberParse {
  // Regex for whitespace and some control characters we need to remove
  // 200e = Left-to-right mark
  // 200f = Right-to-left mark
  // 061c = Arabic letter mark
  public static readonly removeCharsRegex = /[\s\u200e\u200f\u061c]/g;

  /**
   * Creates a parser using the locale of `docSettings` and the currency that
   * `getCurrency` resolves from `options` and `docSettings`.
   */
  public static fromSettings(docSettings: DocumentSettings, options: NumberFormatOptions = {}) {
    return new NumberParse(docSettings.locale, getCurrency(options, docSettings));
  }

  // Many attributes are public for easy testing.
  public readonly currencySymbol: string;
  public readonly percentageSymbol: string;
  public readonly digitGroupSeparator: string;
  public readonly digitGroupSeparatorCurrency: string;
  public readonly exponentSeparator: string;
  public readonly decimalSeparator: string;
  public readonly minusSign: string;
  public readonly defaultNumDecimalsCurrency: number;

  public readonly digitsMap: Map<string, string>;

  public readonly currencyEndsInMinusSign: boolean;

  private readonly _exponentSeparatorRegex: RegExp;
  private readonly _digitGroupSeparatorRegex: RegExp;

  // Function which replaces keys of digitsMap (i.e. locale-specific digits)
  // with corresponding digits from 0123456789.
  private readonly _replaceDigits: (s: string) => string;

  /**
   * @param locale Locale passed to Intl.NumberFormat when discovering symbols.
   * @param currency Currency passed to parseNumMode when building the format options.
   */
  constructor(public readonly locale: string, public readonly currency: string) {
    // Format one sample number in every mode and keep the parts, so that the
    // individual symbols can be looked up by part type below.
    const parts = new Map<NumMode, Intl.NumberFormatPart[]>();
    for (const numMode of NumMode.values) {
      const formatter = Intl.NumberFormat(locale, parseNumMode(numMode, currency));
      const formatParts = formatter.formatToParts(-1234567.5678);
      parts.set(numMode, formatParts);
    }

    // Returns the text of the first part of the given type, or '' if there is none.
    function getPart(partType: NumberFormatPartTypes, numMode: NumMode = "decimal"): string {
      const part = parts.get(numMode)!.find(p => p.type === partType);
      // Only time we expect `part` to be undefined is for digitGroupSeparatorCurrency
      return part?.value || '';
    }

    this.currencySymbol = getPart('currency', 'currency');
    this.percentageSymbol = getPart('percentSign', 'percent');
    this.exponentSeparator = getPart('exponentSeparator', 'scientific');
    this.minusSign = getPart('minusSign');
    this.decimalSeparator = getPart('decimal');

    // Separators for groups of digits, typically groups of 3, i.e. 'thousands separators'.
    // A few locales have different separators for currency and non-currency.
    // We check for both but don't check which one is used, currency or not.
    this.digitGroupSeparator = getPart('group');
    this.digitGroupSeparatorCurrency = getPart('group', 'currency');

    // A few locales format negative currency amounts ending in '-', e.g. '€ 1,00-'
    this.currencyEndsInMinusSign = last(parts.get('currency'))!.type === 'minusSign';

    // Default number of fractional digits for currency,
    // e.g. this is 2 for USD because 1 is formatted as $1.00
    // getPart always returns a string, so its length can be used directly.
    this.defaultNumDecimalsCurrency = getPart("fraction", "currency").length;

    // Since JS and Python allow both e and E for scientific notation, it seems fair that other
    // locales should be case insensitive for this.
    this._exponentSeparatorRegex = new RegExp(escapeRegExp(this.exponentSeparator), 'i');

    // Overall the parser is quite lax about digit separators.
    // We only require that the separator is followed by at least 2 digits,
    // because India groups digits in pairs after the first 3.
    // More careful checking is probably more complicated than is worth it.
    this._digitGroupSeparatorRegex = new RegExp(
      `[${escapeRegExp(
        this.digitGroupSeparator +
        this.digitGroupSeparatorCurrency
      )}](\\d\\d)`,
      'g'
    );

    const digitsMap = this.digitsMap = getDigitsMap(locale);
    if (digitsMap.size === 0) {
      // Nothing to convert for this locale, so skip the regex entirely.
      this._replaceDigits = (s: string) => s;
    } else {
      const digitsRegex = new RegExp([...digitsMap.keys()].join("|"), "g");
      this._replaceDigits = (s: string) => s.replace(digitsRegex, d => digitsMap.get(d) || d);
    }
  }

  /**
   * If the string looks like a number formatted by Grist using this parser's locale and currency (or at least close)
   * then returns an object where:
   *  - `result` is that number, the only thing most callers need
   *  - `cleaned` is a string derived from `value` which can be parsed directly by Number, although `result`
   *    is still processed a bit further than that, e.g. dividing by 100 for percentages.
   *  - `options` describes how the number was apparently formatted.
   *
   * Returns null otherwise.
   */
  public parse(value: string): { result: number, cleaned: string, options: ParsedOptions } | null {
    // Remove characters before checking for parentheses on the ends of the string.
    const [value2, isCurrency] = removeSymbol(value, this.currencySymbol);
    const [value3, isPercent] = removeSymbol(value2, this.percentageSymbol);

    // Remove whitespace and special characters, after currency because some currencies contain spaces.
    value = value3.replace(NumberParse.removeCharsRegex, "");

    const isParenthesised = value.startsWith("(") && value.endsWith(")");
    if (isParenthesised) {
      value = value.substring(1, value.length - 1);
    }

    // Must check for empty string directly because Number('') is 0 :facepalm:
    // Check early so we can return early for performance.
    // Nothing after this should potentially produce an empty string.
    if (value === '') {
      return null;
    }

    // Replace various symbols with the standard versions recognised by JS Number.
    // Note that this also allows the 'standard' symbols ('e', '.', '-', and '0123456789')
    // even if the locale doesn't use them when formatting,
    // although '.' will still be removed if it's a digit separator.

    // Check for exponent separator before replacing digits
    // because it can contain locale-specific digits representing '10' as in 'x10^'.
    // Test the regex rather than comparing the string before and after the replacement:
    // the regex is case insensitive, so an input that already uses a lowercase 'e'
    // is left unchanged by the replacement but is still scientific notation.
    const isScientific = this._exponentSeparatorRegex.test(value);
    value = value.replace(this._exponentSeparatorRegex, "e");

    value = this._replaceDigits(value);

    // Must come after replacing digits because the regex uses \d
    // which doesn't work for locale-specific digits.
    // This simply removes the separators, $1 is a captured group of digits which we keep.
    const withSeparators = value;
    value = value.replace(this._digitGroupSeparatorRegex, "$1");
    const hasDigitGroupSeparator = withSeparators !== value;

    // Must come after the digit separator replacement
    // because the digit separator might be '.'
    value = value.replace(this.decimalSeparator, '.');

    // .replace with a string only replaces once,
    // and a number can contain two minus signs when using scientific notation
    value = value.replace(this.minusSign, "-");
    value = value.replace(this.minusSign, "-");

    // Move '-' from the end to the beginning when appropriate (which is rare)
    if (isCurrency && this.currencyEndsInMinusSign && value.endsWith("-")) {
      value = "-" + value.substring(0, value.length - 1);
    }

    // Number is more strict than parseFloat which allows extra trailing characters.
    let result = Number(value);
    if (isNaN(result)) {
      return null;
    }

    // Parentheses represent a negative number, e.g. (123) -> -123
    // (-123) is treated as an error
    if (isParenthesised) {
      if (result <= 0) {
        return null;
      }
      result = -result;
    }

    if (isPercent) {
      // Divide rather than multiply by 0.01: 0.01 is not exactly representable,
      // so e.g. 35 * 0.01 is 0.35000000000000003 whereas 35 / 100 is 0.35.
      result /= 100;
    }

    return {
      result,
      cleaned: value,
      options: {isCurrency, isPercent, isParenthesised, hasDigitGroupSeparator, isScientific}
    };
  }

  /**
   * Guesses number format options from a sample of strings.
   *
   * Each distinct non-empty value is run through `parse`; values that don't parse are ignored.
   * Returns an empty object if no value could be parsed. Otherwise the result may contain:
   *  - `numMode`: the most common mode seen, except that 'decimal' is only
   *    reported when at least one value had a digit group separator.
   *  - `numSign`: "parens" when a parenthesised number was seen before any
   *    negative number written without parentheses.
   *  - `decimals`: the largest number of decimal places seen with a trailing 0, or an
   *    explicit value when currency values have fewer decimals than the currency default.
   *  - `maxDecimals`: only when the values have more decimal places than the
   *    formatter built from the other guessed options would show.
   */
  public guessOptions(values: Array<string | null>): NumberFormatOptions {
    // null: undecided
    // true: negative numbers should be parenthesised
    // false: they should not
    let parens: boolean | null = null;

    // If any of the numbers have thousands separators, that's enough to guess that option
    let anyHasDigitGroupSeparator = false;

    // Minimum number of decimal places, guessed by looking for trailing 0s after the decimal point
    let decimals = 0;
    const decimalsRegex = /\.\d+/;
    // Maximum number of decimal places. We never actually guess a value for this option,
    // but for currencies we need to check if there are fewer decimal places than the default.
    let maxDecimals = 0;

    // Keep track of the number of modes seen to pick the most common
    const modes = {} as Record<NumMode, number>;
    for (const mode of NumMode.values) {
      modes[mode] = 0;
    }

    for (const value of getDistinctValues(values)) {
      if (!value) {
        continue;
      }
      const parsed = this.parse(value);
      if (!parsed) {
        continue;
      }
      const {
        result,
        cleaned,
        options: {isCurrency, isPercent, isParenthesised, hasDigitGroupSeparator, isScientific}
      } = parsed;

      if (result < 0 && !isParenthesised) {
        // If we see a negative number not surrounded by parens, assume that any other parens mean something else
        parens = false;
      } else if (parens === null && isParenthesised) {
        // If we're still unsure about parens (i.e. the above case hasn't been encountered)
        // then one parenthesised number is enough to guess that the parens option should be used.
        parens = true;
      }

      // If any of the numbers have thousands separators, that's enough to guess that option
      anyHasDigitGroupSeparator = anyHasDigitGroupSeparator || hasDigitGroupSeparator;

      // Currency takes precedence over percent, which takes precedence over scientific.
      let mode: NumMode = "decimal";
      if (isCurrency) {
        mode = "currency";
      } else if (isPercent) {
        mode = "percent";
      } else if (isScientific) {
        mode = "scientific";
      }
      modes[mode] += 1;

      const decimalsMatch = decimalsRegex.exec(cleaned);
      if (decimalsMatch) {
        // Number of digits after the '.' (which is part of the match, hence the -1)
        const numDecimals = decimalsMatch[0].length - 1;
        maxDecimals = Math.max(maxDecimals, numDecimals);
        if (decimalsMatch[0].endsWith("0")) {
          decimals = Math.max(decimals, numDecimals);
        }
      }
    }

    const maxCount = Math.max(...Object.values(modes));
    if (maxCount === 0) {
      // No numbers parsed at all, so don't guess any options
      return {};
    }

    const result: NumberFormatOptions = {};

    // Find the most common mode.
    // On a tie, the mode that comes first in NumMode.values wins.
    const maxMode: NumMode = NumMode.values.find((k) => modes[k] === maxCount)!;

    // 'decimal' is the default mode above when counting,
    // but only guess it as an actual option if digit separators were used at least once.
    if (maxMode !== "decimal" || anyHasDigitGroupSeparator) {
      result.numMode = maxMode;
    }

    if (parens) {
      result.numSign = "parens";
    }

    // Specify minimum number of decimal places if we saw any trailing 0s after '.'
    // Otherwise explicitly set it to 0 if needed to suppress the default for that currency.
    if (decimals > 0 || maxMode === "currency" && maxDecimals < this.defaultNumDecimalsCurrency) {
      result.decimals = decimals;
    }

    // We should only set maxDecimals if the default maxDecimals is too low.
    const tmpNF = buildNumberFormat(result, {locale: this.locale, currency: this.currency}).resolvedOptions();
    if (maxDecimals > tmpNF.maximumFractionDigits) {
      result.maxDecimals = maxDecimals;
    }

    return result;
  }
}
|
|
|
|
|
|
|
|
/**
 * Strips the first occurrence of `symbol` from `value`.
 *
 * Returns a tuple [removed, wasPresent]:
 *  - `removed` is `value` with `symbol` taken out at most once.
 *  - `wasPresent` is `true` only if something was actually removed, so it is
 *    `false` when `symbol` is absent or is the empty string.
 */
function removeSymbol(value: string, symbol: string): [string, boolean] {
  const at = symbol === "" ? -1 : value.indexOf(symbol);
  if (at === -1) {
    return [value, false];
  }
  const stripped = value.slice(0, at) + value.slice(at + symbol.length);
  return [stripped, true];
}
|