gristlabs_grist-core/app/common/NumberParse.ts
Alex Hall 4894631ba4 (core) Generic number parsing functionality.
Summary:
Added NumberParse.ts, counterpart of NumberFormat.ts.

Contains generic functionality for parsing numbers formatted by Intl.NumberFormat, not tied to documents or anything.

This doesn't change any actual behaviour, applying this parsing when pasting/typing in numeric columns will be a separate diff.

Test Plan: New file with extensive unit tests.

Reviewers: dsagal

Reviewed By: dsagal

Subscribers: jarek

Differential Revision: https://phab.getgrist.com/D3078
2021-10-19 23:19:13 +02:00

202 lines
7.8 KiB
TypeScript

/**
* Counterpart of NumberFormat.ts.
* Generic functionality for parsing numbers formatted by Intl.NumberFormat,
* not tied to documents or anything.
*/
import { NumMode, parseNumMode } from 'app/common/NumberFormat';
import escapeRegExp = require('lodash/escapeRegExp');
import last = require('lodash/last');
// Possible values of Intl.NumberFormat.formatToParts[i].type
// Seems Intl.NumberFormatPartTypes is not quite complete
type NumberFormatPartTypes = Intl.NumberFormatPartTypes | 'exponentSeparator';
/**
* Returns a map converting the decimal digits used in the given formatter
* to the digits 0123456789.
* Excludes digits which don't need conversion, so for many locales this is empty.
*/
function getDigitsMap(locale: string) {
const formatter = Intl.NumberFormat(locale);
const result = new Map<string, string>();
for (let i = 0; i < 10; i++) {
const digit = String(i);
const localeDigit = formatter.format(i);
if (localeDigit !== digit) {
result.set(localeDigit, digit);
}
}
return result;
}
export default class NumberParse {
// Regex for whitespace and some control characters we need to remove
// 200e = Left-to-right mark
// 200f = Right-to-left mark
// 061c = Arabic letter mark
public static readonly removeCharsRegex = /[\s\u200e\u200f\u061c]/g;
// Many attributes are public for easy testing.
public readonly currencySymbol: string;
public readonly percentageSymbol: string;
public readonly digitGroupSeparator: string;
public readonly digitGroupSeparatorCurrency: string;
public readonly exponentSeparator: string;
public readonly decimalSeparator: string;
public readonly minusSign: string;
public readonly digitsMap: Map<string, string>;
public readonly currencyEndsInMinusSign: boolean;
private readonly _exponentSeparatorRegex: RegExp;
private readonly _digitGroupSeparatorRegex: RegExp;
// Function which replaces keys of digitsMap (i.e. locale-specific digits)
// with corresponding digits from 0123456789.
private readonly _replaceDigits: (s: string) => string;
constructor(locale: string, currency: string) {
const numModes: NumMode[] = ['currency', 'percent', 'scientific', 'decimal'];
const parts = new Map<NumMode, Intl.NumberFormatPart[]>();
for (const numMode of numModes) {
const formatter = Intl.NumberFormat(locale, parseNumMode(numMode, currency));
const formatParts = formatter.formatToParts(-1234567.5678);
parts.set(numMode, formatParts);
}
function getPart(partType: NumberFormatPartTypes, numMode: NumMode = "decimal"): string {
const part = parts.get(numMode)!.find(p => p.type === partType);
// Only time we expect `part` to be undefined is for digitGroupSeparatorCurrency
return part?.value || '';
}
this.currencySymbol = getPart('currency', 'currency');
this.percentageSymbol = getPart('percentSign', 'percent');
this.exponentSeparator = getPart('exponentSeparator', 'scientific');
this.minusSign = getPart('minusSign');
this.decimalSeparator = getPart('decimal');
// Separators for groups of digits, typically groups of 3, i.e. 'thousands separators'.
// A few locales have different separators for currency and non-currency.
// We check for both but don't check which one is used, currency or not.
this.digitGroupSeparator = getPart('group');
this.digitGroupSeparatorCurrency = getPart('group', 'currency');
// A few locales format negative currency amounts ending in '-', e.g. '€ 1,00-'
this.currencyEndsInMinusSign = last(parts.get('currency'))!.type === 'minusSign';
// Since JS and Python allow both e and E for scientific notation, it seems fair that other
// locales should be case insensitive for this.
this._exponentSeparatorRegex = new RegExp(escapeRegExp(this.exponentSeparator), 'i');
// Overall the parser is quite lax about digit separators.
// We only require that the separator is followed by at least 2 digits,
// because India groups digits in pairs after the first 3.
// More careful checking is probably more complicated than is worth it.
this._digitGroupSeparatorRegex = new RegExp(
`[${escapeRegExp(
this.digitGroupSeparator +
this.digitGroupSeparatorCurrency
)}](\\d\\d)`,
'g'
);
const digitsMap = this.digitsMap = getDigitsMap(locale);
if (digitsMap.size === 0) {
this._replaceDigits = (s: string) => s;
} else {
const digitsRegex = new RegExp([...digitsMap.keys()].join("|"), "g");
this._replaceDigits = (s: string) => s.replace(digitsRegex, d => digitsMap.get(d) || d);
}
}
/**
* Returns a number if the string looks like that number formatted by Grist using this parser's locale and currency
* (or at least close).
* Returns null otherwise.
*/
public parse(value: string): number | null {
// Remove characters before checking for parentheses on the ends of the string.
const [value2, isCurrency] = removeSymbol(value, this.currencySymbol);
const [value3, isPercent] = removeSymbol(value2, this.percentageSymbol);
// Remove whitespace and special characters, after currency because some currencies contain spaces.
value = value3.replace(NumberParse.removeCharsRegex, "");
const parenthesised = value[0] === "(" && value[value.length - 1] === ")";
if (parenthesised) {
value = value.substring(1, value.length - 1);
}
// Must check for empty string directly because Number('') is 0 :facepalm:
// Check early so we can return early for performance.
// Nothing after this should potentially produce an empty string.
if (value === '') {
return null;
}
// Replace various symbols with the standard versions recognised by JS Number.
// Note that this also allows the 'standard' symbols ('e', '.', '-', and '0123456789')
// even if the locale doesn't use them when formatting,
// although '.' will still be removed if it's a digit separator.
// Check for exponent separator before replacing digits
// because it can contain locale-specific digits representing '10' as in 'x10^'.
value = value.replace(this._exponentSeparatorRegex, "e");
value = this._replaceDigits(value);
// Must come after replacing digits because the regex uses \d
// which doesn't work for locale-specific digits.
// This simply removes the separators, $1 is a captured group of digits which we keep.
value = value.replace(this._digitGroupSeparatorRegex, "$1");
// Must come after the digit separator replacement
// because the digit separator might be '.'
value = value.replace(this.decimalSeparator, '.');
// .replace with a string only replaces once,
// and a number can contain two minus signs when using scientific notation
value = value.replace(this.minusSign, "-");
value = value.replace(this.minusSign, "-");
// Move '-' from the end to the beginning when appropriate (which is rare)
if (isCurrency && this.currencyEndsInMinusSign && value.endsWith("-")) {
value = "-" + value.substring(0, value.length - 1);
}
// Number is more strict than parseFloat which allows extra trailing characters.
let result = Number(value);
if (isNaN(result)) {
return null;
}
// Parentheses represent a negative number, e.g. (123) -> -123
// (-123) is treated as an error
if (parenthesised) {
if (result <= 0) {
return null;
}
result = -result;
}
if (isPercent) {
result *= 0.01;
}
return result;
}
}
/**
* Returns a tuple [removed, wasPresent]
* - `removed` is the given string `value` with `symbol` removed at most once.
* - `wasPresent` is `true` if `symbol` was present in `value` and was thus removed.
*/
function removeSymbol(value: string, symbol: string): [string, boolean] {
const removed = value.replace(symbol, "");
const wasPresent = removed.length < value.length;
return [removed, wasPresent];
}