gristlabs_grist-core/app/common/parseDate.ts
Alex Hall 5b352211c4 (core) Guess date format during type conversion
Summary:
- Adds a dependency moment-guess (https://github.com/apoorv-mishra/moment-guess) to guess date formats from strings. However the npm package is missing source maps which leads to an ugly warning, so currently using a fork until https://github.com/apoorv-mishra/moment-guess/pull/22 is resolved.
- Adds guessDateFormat using moment-guess to determine the best candidate date format. The logic may be refined for e.g. lossless imports where the stakes are higher, but for now we're just trying to make type conversions smoother.
- Uses guessDateFormat to guess widget options when changing column type to date or datetime.
- Uses the date format of the original column when possible instead of guessing.
- Fixes a bug where choices were guessed based on the display column instead of the visible column, which made the guessed choices influenced by which values were referenced as well as completely broken when converting from reflist.
- @dsagal @georgegevoian This builds on https://phab.getgrist.com/D3265, currently unmerged. That diff was created first to alert to the change. Without it there would still be similar test failures/changes here as the date format would often be concretely guessed and saved as YYYY-MM-DD instead of being left as the default `undefined` which is shows as YYYY-MM-DD in the dropdown.

Test Plan: Added a unit test to `parseDate.ts`. Updated several browser tests which show the guessing in action during type conversion quite nicely.

Reviewers: georgegevoian

Reviewed By: georgegevoian

Subscribers: dsagal, georgegevoian

Differential Revision: https://phab.getgrist.com/D3264
2022-02-21 22:39:47 +02:00

394 lines
13 KiB
TypeScript

import escapeRegExp = require('lodash/escapeRegExp');
import memoize = require('lodash/memoize');
import {getDistinctValues} from 'app/common/gutil';
// Simply importing 'moment-guess' inconsistently imports bundle.js or bundle.esm.js depending on environment
import * as guessFormat from '@gristlabs/moment-guess/dist/bundle.js';
import * as moment from 'moment-timezone';
// When using YY format, use a consistent interpretation in datepicker and in moment parsing: add
// 2000 if the result is at most 10 years greater than the current year; otherwise add 1900. See
// https://bootstrap-datepicker.readthedocs.io/en/latest/options.html#assumenearbyyear and
// "Parsing two digit years" in https://momentjs.com/docs/#/parsing/string-format/.
export const TWO_DIGIT_YEAR_THRESHOLD = 10;
const MAX_TWO_DIGIT_YEAR = new Date().getFullYear() + TWO_DIGIT_YEAR_THRESHOLD - 2000;
// Moment suggests that overriding this is fine, but we need to force TypeScript to allow it.
(moment as any).parseTwoDigitYear = function(yearString: string): number {
const year = parseInt(yearString, 10);
return year + (year > MAX_TWO_DIGIT_YEAR ? 1900 : 2000);
};
// Order of formats to try if the date cannot be parsed as the currently set format.
// Formats are parsed in momentjs strict mode, but separator matching and the MM/DD
// two digit requirement are ignored. Also, partial completion is permitted, so formats
// may match even if only beginning elements are provided.
// TODO: These should be affected by the user's locale/settings.
// TODO: We may want to consider adding default time formats as well to support more
// time formats.
const PARSER_FORMATS: string[] = [
'M D YYYY',
'M D YY',
'M D',
'M',
'MMMM D YYYY',
'MMMM D',
'MMMM Do YYYY',
'MMMM Do',
'D MMMM YYYY',
'D MMMM',
'Do MMMM YYYY',
'Do MMMM',
'MMMM',
'MMM D YYYY',
'MMM D',
'MMM Do YYYY',
'MMM Do',
'D MMM YYYY',
'D MMM',
'Do MMM YYYY',
'Do MMM',
'MMM',
'YYYY M D',
'YYYY M',
'YYYY',
'D M YYYY',
'D M YY',
'D M',
'D'
];
const UNAMBIGUOUS_FORMATS = [
'YYYY M D',
...PARSER_FORMATS.filter(f => f.includes("MMM")),
];
const TIME_REGEX = /(?:^|\s+|T)(?:(\d\d?)(?::(\d\d?)(?::(\d\d?))?)?|(\d\d?)(\d\d))\s*([ap]m?)?$/i;
// [^a-zA-Z] because no letters are allowed directly before the abbreviation
const UTC_REGEX = /[^a-zA-Z](UTC?|GMT|Z)$/i;
const NUMERIC_TZ_REGEX = /([+-]\d\d?)(?::?(\d\d))?$/i;
// Not picky about separators, so replace them in the date and format strings to be spaces.
const SEPARATORS = /[\W_]+/g;
const tzAbbreviations = memoize((tzName: string): RegExp => {
// Some abbreviations are just e.g. +05
// and escaping the + seems better than filtering
const abbreviations = new Set(moment.tz.zone(tzName)!.abbrs.map(escapeRegExp));
const union = [...abbreviations].join('|');
// [^a-zA-Z] because no letters are allowed directly before the abbreviation
// so for example CEST won't match even if EST does
return new RegExp(`[^a-zA-Z](${union})$`, 'i');
});
interface ParseOptions {
time?: string;
dateFormat?: string;
timeFormat?: string;
timezone?: string;
}
/**
* parseDate - Attempts to parse a date string using several common formats. Returns the
* timestamp of the parsed date in seconds since epoch, or returns null on failure.
* @param {String} date - The date string to parse.
* @param {String} options.dateFormat - The preferred momentjs format to use to parse the
* date. This is attempted before the default formats.
* @param {String} options.time - The time string to parse.
* @param {String} options.timeFormat - The momentjs format to use to parse the time. This
* must be given if options.time is given.
* @param {String} options.timezone - The timezone string for the date/time, which affects
* the resulting timestamp.
*/
export function parseDate(date: string, options: ParseOptions = {}): number | null {
// If no date, return null.
if (!date) {
return null;
}
const dateFormat = options.dateFormat || "YYYY-MM-DD";
const dateFormats = [..._buildVariations(dateFormat, date), ...PARSER_FORMATS];
const cleanDate = date.replace(SEPARATORS, ' ');
let datetime = cleanDate.trim();
let timeformat = '';
let time = options.time;
if (time) {
const parsedTimeZone = parseTimeZone(time, options.timezone!);
const parsedTime = standardizeTime(parsedTimeZone.remaining);
if (!parsedTime || parsedTime.remaining) {
return null;
}
time = parsedTime.time;
const {tzOffset} = parsedTimeZone;
datetime += ' ' + time + tzOffset;
timeformat = ' HH:mm:ss' + (tzOffset ? 'Z' : '');
}
for (const f of dateFormats) {
const fullFormat = f + timeformat;
const m = moment.tz(datetime, fullFormat, true, options.timezone || 'UTC');
if (m.isValid()) {
return m.valueOf() / 1000;
}
}
return null;
}
/**
* Similar to parseDate, with these differences:
* - Only for a date (no time part)
* - Only falls back to UNAMBIGUOUS_FORMATS, not the full PARSER_FORMATS
* - Optionally adds all dates which match some format to `results`, otherwise returns first match.
* This is safer so it can be used for parsing when pasting a large number of dates
* and won't silently swap around day and month.
*/
export function parseDateStrict(
date: string, dateFormat: string | null, results?: Set<number>, timezone: string = 'UTC'
): number | undefined {
if (!date) {
return;
}
dateFormat = dateFormat || "YYYY-MM-DD";
const dateFormats = [..._buildVariations(dateFormat, date), ...UNAMBIGUOUS_FORMATS];
const cleanDate = date.replace(SEPARATORS, ' ').trim();
for (const format of dateFormats) {
const m = moment.tz(cleanDate, format, true, timezone);
if (m.isValid()) {
const value = m.valueOf() / 1000;
if (results) {
results.add(value);
} else {
return value;
}
}
}
}
export function parseDateTime(dateTime: string, options: ParseOptions): number | undefined {
dateTime = dateTime.trim();
if (!dateTime) {
return;
}
const dateFormat = options.dateFormat || "YYYY-MM-DD";
const timezone = options.timezone || "UTC";
const dateOnly = parseDateStrict(dateTime, dateFormat, undefined, timezone);
if (dateOnly) {
return dateOnly;
}
const parsedTimeZone = parseTimeZone(dateTime, timezone);
let tzOffset = '';
if (parsedTimeZone) {
tzOffset = parsedTimeZone.tzOffset;
dateTime = parsedTimeZone.remaining;
}
const parsedTime = standardizeTime(dateTime);
if (!parsedTime) {
return;
}
dateTime = parsedTime.remaining;
const date = parseDateStrict(dateTime, dateFormat);
if (!date) {
return;
}
// date is a timestamp of midnight in UTC, so to get a formatted representation (for parsing
// together with time), take care to interpret it in UTC.
const dateString = moment.unix(date).utc().format("YYYY-MM-DD");
dateTime = dateString + ' ' + parsedTime.time + tzOffset;
const fullFormat = "YYYY-MM-DD HH:mm:ss" + (tzOffset ? 'Z' : '');
return moment.tz(dateTime, fullFormat, true, timezone).valueOf() / 1000;
}
// Helper function to get the partial format string based on the input. Momentjs has a feature
// which allows defaulting to the current year, month and/or day if not accounted for in the
// parser. We remove any parts of the parser not given in the input to take advantage of this
// feature.
function _getPartialFormat(input: string, format: string): string {
// Define a regular expression to match contiguous non-separators.
const re = /Y+|M+o?|D+o?|[a-zA-Z0-9]+/ig;
// Count the number of meaningful parts in the input.
const numInputParts = input.match(re)?.length || 0;
// Count the number of parts in the format string.
let numFormatParts = format.match(re)?.length || 0;
if (numFormatParts > numInputParts) {
// Remove year from format first, to default to current year.
if (/Y+/.test(format)) {
format = format.replace(/Y+/, ' ').trim();
numFormatParts -= 1;
}
if (numFormatParts > numInputParts) {
// Remove month from format next.
format = format.replace(/M+/, ' ').trim();
}
}
return format;
}
// Moment non-strict mode is considered bad, as it's far too lax. But moment's strict mode is too
// strict. We want to allow YY|YYYY for either year specifier, as well as M for MMM or MMMM month
// specifiers. It's silly that we need to create multiple format variations to support this.
function _buildVariations(dateFormat: string, date: string) {
// Momentjs has an undesirable feature in strict mode where MM and DD
// matches require two digit numbers. Change MM, DD to M, D.
let format = dateFormat.replace(/MM+/g, m => (m === 'MM' ? 'M' : m))
.replace(/DD+/g, m => (m === 'DD' ? 'D' : m))
.replace(SEPARATORS, ' ')
.trim();
// Allow the input date to end with a 4-digit year even if the format doesn't mention the year
if (
format.includes("M") &&
format.includes("D") &&
!format.includes("Y")
) {
format += " YYYY";
}
format = _getPartialFormat(date, format);
// Consider some alternatives to the preferred format.
const variations = new Set<string>([format]);
const otherYear = format.replace(/Y{2,4}/, (m) => (m === 'YY' ? 'YYYY' : (m === 'YYYY' ? 'YY' : m)));
variations.add(otherYear);
variations.add(format.replace(/MMM+/, 'M'));
if (otherYear !== format) {
variations.add(otherYear.replace(/MMM+/, 'M'));
}
return variations;
}
// Based on private calculateOffset in moment source code.
function calculateOffset(tzMatch: string[]): string {
const [, hhOffset, mmOffset] = tzMatch;
const sign = hhOffset.slice(0, 1);
return sign + hhOffset.slice(1).padStart(2, '0') + ':' + (mmOffset || '0').padStart(2, '0');
}
function parseTimeZone(str: string, timezone: string): { remaining: string, tzOffset: string } {
str = str.trim();
let tzMatch = UTC_REGEX.exec(str);
let matchStart = 0;
let tzOffset = '';
if (tzMatch) {
tzOffset = '+00:00';
matchStart = tzMatch.index + 1; // skip [^a-zA-Z] at regex start
} else {
tzMatch = NUMERIC_TZ_REGEX.exec(str);
if (tzMatch) {
tzOffset = calculateOffset(tzMatch);
matchStart = tzMatch.index;
} else if (timezone) {
// Abbreviations are simply stripped and ignored, so tzOffset is not set in this case
tzMatch = tzAbbreviations(timezone).exec(str);
if (tzMatch) {
matchStart = tzMatch.index + 1; // skip [^a-zA-Z] at regex start
}
}
}
if (tzMatch) {
str = str.slice(0, matchStart).trim();
}
return {remaining: str, tzOffset};
}
// Parses time of the form, roughly, HH[:MM[:SS]][am|pm]. Returns the time in the
// standardized HH:mm:ss format.
// This turns out easier than coaxing moment to parse time sensibly and flexibly.
function standardizeTime(timeString: string): { remaining: string, time: string } | undefined {
const match = TIME_REGEX.exec(timeString);
if (!match) {
return;
}
let hours = parseInt(match[1] || match[4], 10);
const mm = (match[2] || match[5] || '0').padStart(2, '0');
const ss = (match[3] || '0').padStart(2, '0');
const ampm = (match[6] || '').toLowerCase();
if (hours < 12 && hours > 0 && ampm.startsWith('p')) {
hours += 12;
} else if (hours === 12 && ampm.startsWith('a')) {
hours = 0;
}
const hh = String(hours).padStart(2, '0');
return {remaining: timeString.slice(0, match.index).trim(), time: `${hh}:${mm}:${ss}`};
}
export function guessDateFormat(values: string[], timezone: string = 'UTC'): string | null {
const sample = getDistinctValues(values, 100);
const formats: Record<string, number> = {};
for (const dateString of sample) {
let guessed: string | string[];
try {
guessed = guessFormat(dateString);
} catch {
continue;
}
if (typeof guessed === "string") {
guessed = [guessed];
}
for (const guess of guessed) {
formats[guess] = 0;
}
}
const formatKeys = Object.keys(formats);
if (!formatKeys.length || formatKeys.length > 10) {
return null;
}
for (const format of formatKeys) {
for (const dateString of values) {
const m = moment.tz(dateString, format, true, timezone);
if (m.isValid()) {
formats[format] += 1;
}
}
}
const maxCount = Math.max(...Object.values(formats));
return formatKeys.find(format => formats[format] === maxCount)!;
}
export const dateFormatOptions = [
'YYYY-MM-DD',
'MM-DD-YYYY',
'MM/DD/YYYY',
'MM-DD-YY',
'MM/DD/YY',
'DD MMM YYYY',
'MMMM Do, YYYY',
'DD-MM-YYYY',
];
export const timeFormatOptions = [
'h:mma',
'h:mma z',
'HH:mm',
'HH:mm z',
'HH:mm:ss',
'HH:mm:ss z',
];
export function dateTimeWidgetOptions(fullFormat: string) {
const index = fullFormat.match(/[hHkaAmsSzZT]|$/)!.index!;
const dateFormat = fullFormat.substr(0, index).trim();
const timeFormat = fullFormat.substr(index).trim() || timeFormatOptions[0];
return {
dateFormat,
timeFormat,
isCustomDateFormat: !dateFormatOptions.includes(dateFormat),
isCustomTimeFormat: !timeFormatOptions.includes(timeFormat),
};
}