diff --git a/app/client/components/Importer.ts b/app/client/components/Importer.ts index 8b5957bc..895a33ad 100644 --- a/app/client/components/Importer.ts +++ b/app/client/components/Importer.ts @@ -353,6 +353,7 @@ export class Importer extends DisposableWithEvents { label: field.label(), colId: destTableId ? field.colId() : null, // if inserting into new table, colId isn't defined type: field.column().type(), + widgetOptions: field.column().widgetOptions(), formula: field.column().formula() })), sourceCols: sourceFields.map((field) => field.colId()) diff --git a/app/client/components/TypeConversion.ts b/app/client/components/TypeConversion.ts index b025d91c..921d3a9b 100644 --- a/app/client/components/TypeConversion.ts +++ b/app/client/components/TypeConversion.ts @@ -105,7 +105,7 @@ export async function prepTransformColInfo(docModel: DocModel, origCol: ColumnRe let {dateFormat} = prevOptions; if (!dateFormat) { const colValues = tableData.getColValues(sourceCol.colId()) || []; - dateFormat = guessDateFormat(colValues.map(String)) || "YYYY-MM-DD"; + dateFormat = guessDateFormat(colValues.map(String)); } widgetOptions = dateTimeWidgetOptions(dateFormat, true); break; diff --git a/app/common/ActiveDocAPI.ts b/app/common/ActiveDocAPI.ts index 7c0a369b..acafc2bb 100644 --- a/app/common/ActiveDocAPI.ts +++ b/app/common/ActiveDocAPI.ts @@ -49,6 +49,7 @@ export interface TransformColumn { colId: string|null; type: string; formula: string; + widgetOptions: string; } export interface ImportResult { diff --git a/app/common/UserAPI.ts b/app/common/UserAPI.ts index 90d9f509..19c94c40 100644 --- a/app/common/UserAPI.ts +++ b/app/common/UserAPI.ts @@ -3,7 +3,7 @@ import {ApplyUAResult, QueryFilters} from 'app/common/ActiveDocAPI'; import {BaseAPI, IOptions} from 'app/common/BaseAPI'; import {BillingAPI, BillingAPIImpl} from 'app/common/BillingAPI'; import {BrowserSettings} from 'app/common/BrowserSettings'; -import {BulkColValues, TableColValues, UserAction} from 'app/common/DocActions'; +import {BulkColValues, TableColValues, TableRecordValue, TableRecordValues, UserAction} from 'app/common/DocActions'; import {DocCreationInfo, OpenDocMode} from 'app/common/DocListAPI'; import {Features} from 'app/common/Features'; import {ICustomWidget} from 'app/common/CustomWidget'; @@ -402,6 +402,11 @@ export interface UserAPI { filters?: string; } +interface GetRowsParams { + filters?: QueryFilters; + immediate?: boolean; +} + /** * Collect endpoints related to the content of a single document that we've been thinking * of as the (restful) "Doc API". A few endpoints that could be here are not, for historical @@ -411,8 +416,8 @@ export interface DocAPI { // Immediate flag is a currently not-advertised feature, allowing a query to proceed without // waiting for a document to be initialized. This is useful if the calculations done when // opening a document are irrelevant. 
-  getRows(tableId: string, options?: { filters?: QueryFilters,
-                                       immediate?: boolean }): Promise<TableColValues>;
+  getRows(tableId: string, options?: GetRowsParams): Promise<TableColValues>;
+  getRecords(tableId: string, options?: GetRowsParams): Promise<TableRecordValue[]>;
   updateRows(tableId: string, changes: TableColValues): Promise<number[]>;
   addRows(tableId: string, additions: BulkColValues): Promise<number[]>;
   removeRows(tableId: string, removals: number[]): Promise<number[]>;
@@ -869,16 +874,13 @@ export class DocAPIImpl extends BaseAPI implements DocAPI {
     this._url = `${url}/api/docs/${docId}`;
   }
 
-  public async getRows(tableId: string, options?: { filters?: QueryFilters,
-                                                    immediate?: boolean }): Promise<TableColValues> {
-    const url = new URL(`${this._url}/tables/${tableId}/data`);
-    if (options?.filters) {
-      url.searchParams.append('filter', JSON.stringify(options.filters));
-    }
-    if (options?.immediate) {
-      url.searchParams.append('immediate', 'true');
-    }
-    return this.requestJson(url.href);
+  public async getRows(tableId: string, options?: GetRowsParams): Promise<TableColValues> {
+    return this._getRecords(tableId, 'data', options);
+  }
+
+  public async getRecords(tableId: string, options?: GetRowsParams): Promise<TableRecordValue[]> {
+    const response: TableRecordValues = await this._getRecords(tableId, 'records', options);
+    return response.records;
   }
 
   public async updateRows(tableId: string, changes: TableColValues): Promise<number[]> {
@@ -967,6 +969,17 @@ export class DocAPIImpl extends BaseAPI implements DocAPI {
     url.searchParams.append('code', code);
     return this.requestJson(url.href);
   }
+
+  private _getRecords(tableId: string, endpoint: 'data' | 'records', options?: GetRowsParams): Promise<any> {
+    const url = new URL(`${this._url}/tables/${tableId}/${endpoint}`);
+    if (options?.filters) {
+      url.searchParams.append('filter', JSON.stringify(options.filters));
+    }
+    if (options?.immediate) {
+      url.searchParams.append('immediate', 'true');
+    }
+    return this.requestJson(url.href);
+  }
 }
 
 /**
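
For orientation, here is a short usage sketch of the reworked `DocAPI` surface. The table name, the filter column, and the `{id, fields}` record shape are illustrative assumptions based on the `TableRecordValue` import above, not something this diff spells out:

```typescript
// Hypothetical caller of the new DocAPI methods; 'Orders' and 'Status' are made-up names.
async function demo(docApi: DocAPI): Promise<void> {
  // getRows keeps the old column-oriented shape, e.g. {id: [1, 2], Status: ['paid', 'paid']}.
  const cols: TableColValues = await docApi.getRows('Orders', {filters: {Status: ['paid']}});

  // getRecords returns row-oriented records from the /records endpoint instead.
  const recs: TableRecordValue[] = await docApi.getRecords('Orders', {filters: {Status: ['paid']}});
  for (const rec of recs) {
    console.log(rec.id, rec.fields);  // assumed shape: row id plus a fields object
  }
}
```

Both methods now funnel through the shared `_getRecords` helper, so the `filter` and `immediate` query parameters behave identically for the two endpoints.
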
- */ -class NullIfEmptyParser extends ValueParser { - public cleanParse(value: string): any { - if (value === "") { - return null; - } - return super.cleanParse(value); - } -} - export class NumericParser extends ValueParser { private _parse: NumberParse; @@ -225,7 +213,6 @@ export class ReferenceListParser extends ReferenceParser { } export const valueParserClasses: { [type: string]: typeof ValueParser } = { - Any: NullIfEmptyParser, Numeric: NumericParser, Int: NumericParser, Date: DateParser, diff --git a/app/common/parseDate.ts b/app/common/parseDate.ts index a53eb1cf..0d7d98db 100644 --- a/app/common/parseDate.ts +++ b/app/common/parseDate.ts @@ -1,4 +1,5 @@ import escapeRegExp = require('lodash/escapeRegExp'); +import last = require('lodash/last'); import memoize = require('lodash/memoize'); import {getDistinctValues, isObject} from 'app/common/gutil'; // Simply importing 'moment-guess' inconsistently imports bundle.js or bundle.esm.js depending on environment @@ -325,7 +326,26 @@ function standardizeTime(timeString: string): { remaining: string, time: string return {remaining: timeString.slice(0, match.index).trim(), time: `${hh}:${mm}:${ss}`}; } -export function guessDateFormat(values: Array, timezone: string = 'UTC'): string | null { +/** + * Guesses a full date[time] format that best matches the given strings. + * If several formats match equally well, picks the last one lexicographically to match the old date guessing. + * This means formats with an early Y and/or M are favoured. + * If no formats match, returns the default YYYY-MM-DD. + */ +export function guessDateFormat(values: Array, timezone: string = 'UTC'): string { + const formats = guessDateFormats(values, timezone); + if (!formats) { + return "YYYY-MM-DD"; + } + return last(formats)!; +} + +/** + * Returns all full date[time] formats that best match the given strings. + * If several formats match equally well, returns them all. + * May return null if there are no matching formats or choosing one is too expensive. + */ +export function guessDateFormats(values: Array, timezone: string = 'UTC'): string[] | null { const dateStrings: string[] = values.filter(isObject); const sample = getDistinctValues(dateStrings, 100); const formats: Record = {}; @@ -358,7 +378,9 @@ export function guessDateFormat(values: Array, timezone: string = } const maxCount = Math.max(...Object.values(formats)); - return formatKeys.find(format => formats[format] === maxCount)!; + // Return all formats that tied for first place. + // Sort lexicographically for consistency in tests and with the old dateguess.py. + return formatKeys.filter(format => formats[format] === maxCount).sort(); } export const dateFormatOptions = [ diff --git a/app/server/lib/ActiveDocImport.ts b/app/server/lib/ActiveDocImport.ts index a645f95e..93e253dd 100644 --- a/app/server/lib/ActiveDocImport.ts +++ b/app/server/lib/ActiveDocImport.ts @@ -294,7 +294,7 @@ export class ActiveDocImport { const origTableName = table.table_name ? table.table_name : ''; const transformRule = transformRuleMap && transformRuleMap.hasOwnProperty(origTableName) ? 
diff --git a/app/server/lib/ActiveDocImport.ts b/app/server/lib/ActiveDocImport.ts
index a645f95e..93e253dd 100644
--- a/app/server/lib/ActiveDocImport.ts
+++ b/app/server/lib/ActiveDocImport.ts
@@ -294,7 +294,7 @@ export class ActiveDocImport {
       const origTableName = table.table_name ? table.table_name : '';
       const transformRule = transformRuleMap && transformRuleMap.hasOwnProperty(origTableName) ?
                             transformRuleMap[origTableName] : null;
-      const columnMetadata = addLabelsIfPossible(table.column_metadata);
+      const columnMetadata = cleanColumnMetadata(table.column_metadata);
       const result: ApplyUAResult = await this._activeDoc.applyUserActions(docSession,
         [["AddTable", hiddenTableName, columnMetadata]]);
       const retValue: AddTableRetValue = result.retValues[0];
@@ -313,7 +313,9 @@ export class ActiveDocImport {
       const ruleCanBeApplied = (transformRule != null) &&
                                _.difference(transformRule.sourceCols, hiddenTableColIds).length === 0;
       await this._activeDoc.applyUserActions(docSession,
-        [["ReplaceTableData", hiddenTableId, rowIdColumn, columnValues]], {parseStrings: true});
+        // BulkAddRecord rather than ReplaceTableData so that type guessing is applied to Any columns.
+        // Don't use parseStrings; rely only on the strict parsing in ValueGuesser to keep the import lossless.
+        [["BulkAddRecord", hiddenTableId, rowIdColumn, columnValues]]);
 
       // data parsed and put into hiddenTableId
       // For preview_table (isHidden) do GenImporterView to make views and formulas and cols
@@ -433,14 +435,15 @@ export class ActiveDocImport {
 
     // If destination is a new table, we need to create it.
     if (intoNewTable) {
-      const colSpecs = destCols.map(({type, colId: id, label}) => ({type, id, label}));
+      const colSpecs = destCols.map(({type, colId: id, label, widgetOptions}) => ({type, id, label, widgetOptions}));
       const newTable = await this._activeDoc.applyUserActions(docSession, [['AddTable', destTableId, colSpecs]]);
       destTableId = newTable.retValues[0].table_id;
     }
 
     await this._activeDoc.applyUserActions(docSession, [['BulkAddRecord', destTableId,
       gutil.arrayRepeat(hiddenTableData.id.length, null), columnData]],
-      {parseStrings: true});
+      // Don't use parseStrings for new tables, to keep the import lossless.
+      {parseStrings: !intoNewTable});
 
     return destTableId;
   }
@@ -586,6 +589,7 @@ export class ActiveDocImport {
         colId: destTableId ? id as string : null,
         label: fields.label as string,
         type: fields.type as string,
+        widgetOptions: fields.widgetOptions as string,
         formula: srcColIds.includes(id as string) ? `$${id}` : ''
       });
     }
@@ -730,10 +734,21 @@ function getMergeFunction({type}: MergeStrategy): MergeFunction {
 }
 
 /**
+ * Tweaks the column metadata used in the AddTable action.
  * If `columns` is populated with non-blank column ids, adds labels to all
- * columns using the values set for the column ids. Otherwise, returns
- * a copy of columns with no modifications made.
+ * columns using the values set for the column ids.
+ * Also ensures that columns of type Any start out as formula columns, i.e. empty columns,
+ * so that type guessing is triggered when new data is added.
  */
-function addLabelsIfPossible(columns: GristColumn[]) {
-  return columns.map(c => (c.id ? {...c, label: c.id} : c));
+function cleanColumnMetadata(columns: GristColumn[]) {
+  return columns.map(c => {
+    const newCol: any = {...c};
+    if (c.id) {
+      newCol.label = c.id;
+    }
+    if (c.type === "Any") {
+      newCol.isFormula = true;
+    }
+    return newCol;
+  });
 }
diff --git a/sandbox/grist/import_actions.py b/sandbox/grist/import_actions.py
index b65a1d37..753f7bde 100644
--- a/sandbox/grist/import_actions.py
+++ b/sandbox/grist/import_actions.py
@@ -1,12 +1,11 @@
-from collections import defaultdict, namedtuple
+from collections import namedtuple
 
-import six
-from six.moves import zip, xrange
+from six.moves import zip
 
 import column
 import identifiers
-
 import logger
+
 log = logger.Logger(__name__, logger.INFO)
 
 # Prefix for transform columns created during imports.
@@ -103,6 +102,7 @@ class ImportActions(object): "label": c.label, "colId": c.colId if dest_table_id else None, #should be None if into new table "type": c.type, + "widgetOptions": getattr(c, "widgetOptions", ""), "formula": ("$" + c.colId) if (c.colId in src_cols) else '' }) @@ -162,6 +162,7 @@ class ImportActions(object): new_col_spec = { "label": c.label, "type": c.type, + "widgetOptions": getattr(c, "widgetOptions", ""), "isFormula": True, "formula": c.formula} result = self._useractions.doAddColumn(hidden_table_id, new_col_id, new_col_spec) diff --git a/sandbox/grist/imports/dateguess.py b/sandbox/grist/imports/dateguess.py deleted file mode 100644 index d06bafe5..00000000 --- a/sandbox/grist/imports/dateguess.py +++ /dev/null @@ -1,490 +0,0 @@ -"""This module guesses possible formats of dates which can be parsed using datetime.strptime -based on samples. - -dateguesser.guess(sample) -dateguesser.guess takes a sample date string and returns a set of -datetime.strftime/strptime-compliant date format strings that will correctly parse. - -dateguesser.guess_bulk(list_of_samples, error_rate=0) -dateguesser.guess_bulk takes a list of sample date strings and acceptable error rate -and returns a list of datetime.strftime/strptime-compliant date format strings -sorted by error rate that will correctly parse. - -Algorithm: - - 1. Tokenize input string into chunks based on character type: digits, alphas, the rest. - 2. Analyze each token independently in terms what format codes could represent - 3. For given list of tokens generate all permutations of format codes - 4. During generating permutations check for validness of generated format and skip if invalid. - 5. Use rules listed below to decide if format is invalid: - -Invalid format checks: - - Rule #1: Year MUST be in the date. Year is the minimum possible parsable date. - Rule #2. No holes (missing parts) in the format parts. - Rule #3. Time parts are neighbors to each other. No interleaving time with the date. - Rule #4. It's highly impossible that minutes coming before hour, millis coming before seconds etc - Rule #5. Pattern can't have some part of date/time defined more than once. - Rule #6: Separators between elements of the time group should be the same. - Rule #7: If am/pm is in date we assume that 12-hour dates are allowed only. Otherwise it's 24-hour - Rule #8: Year can't be between other date elements - -Note: - dateguess doesn't support defaulting to current year because parsing should be deterministic, - it's better to to fail guessing the format then to guess it incorrectly. - -Examples: - >>> guess('2014/05/05 14:00:00 UTC') - set(['%Y/%d/%m %H:%M:%S %Z', '%Y/%m/%d %H:%M:%S %Z']) - >>> guess('12/12/12') - set(['%y/%m/%d', '%d/%m/%y', '%m/%d/%y', '%y/%d/%m']) - >>> guess_bulk(['12-11-2014', '12-25-2014']) - ['%m-%d-%Y'] - >>> guess_bulk(['12-11-2014', '25-25-2014']) - [] - >>> guess_bulk(['12-11-2013', '13-8-2013', '05-25-2013', '12-25-2013'], error_rate=0.5) - ['%m-%d-%Y'] -""" - - -import calendar -import itertools -import logging -import re -from collections import defaultdict - -from backports.functools_lru_cache import lru_cache -import moment - - -MONTH_NAME = calendar.month_name -MONTH_ABBR = calendar.month_abbr -TZ_VALID_NAMES = {z[0] for z in moment.get_tz_data().items()} -AM_PM = {'am', 'pm'} -DAYS_OF_WEEK_NAME = calendar.day_name -DAYS_OF_WEEK_ABBR = calendar.day_abbr -ASCII_DIGITS_RE = re.compile(r'^[0-9]+$') - -# Using x.isdigit() matches strings like u'\xb2' (superscripts) which we don't want. 
-# Use isdigit(x) instead, to only match ASCII digits 0-9. -isdigit = ASCII_DIGITS_RE.match - -DATE_ELEMENTS = [ - # Name Pattern Predicate Group (mutual exclusive) Consumes N prev elements - ("Year", "%Y", lambda x, p, v: isdigit(x) and len(x) == 4, "Y", 0), - ("Year short", "%y", lambda x, p, v: isdigit(x) and len(x) == 2, "Y", 0), - ("Month", "%m", lambda x, p, v: isdigit(x) and len(x) <= 2 and 0 < int(x) <= 12, "m", 0), - ("Month name full", "%B", lambda x, p, v: x.isalpha() and x.capitalize() in MONTH_NAME, "m", 0), - ("Month name abbr", "%b", lambda x, p, v: x.isalpha() and x.capitalize() in MONTH_ABBR, "m", 0), - ("Day", "%d", lambda x, p, v: isdigit(x) and len(x) <= 2 and 0 < int(x) <= 31, "d", 0), - ("Day of week", "%A", lambda x, p, v: x.isalpha() - and x.capitalize() in DAYS_OF_WEEK_NAME, "a", 0), - ("Day of week abbr", "%a", lambda x, p, v: x.isalpha() - and x.capitalize() in DAYS_OF_WEEK_ABBR, "a", 0), - - ("Compound HHMMSS", "%H%M%S", lambda x, p, v: isdigit(x) and len(x) == 6 - and 0 <= int(x[0:2]) < 24 - and 0 <= int(x[2:4]) < 60 - and 0 <= int(x[4:6]) < 60, "HMS", 0), - - ("Hour", "%H", lambda x, p, v: isdigit(x) and len(x) <= 2 and 0 <= int(x) <= 23, "H", 0), - ("Hour in 12hr mode", "%I", lambda x, p, v: isdigit(x) and len(x) <= 2 - and 0 <= int(x) <= 11, "H", 0), - ("AM/PM", "%p", lambda x, p, v: x.isalpha() and len(x) == 2 and x.lower() in AM_PM, "p", 0), - ("Minutes", "%M", lambda x, p, v: isdigit(x) and len(x) <= 2 and 0 <= int(x) <= 59, "M", 0), - ("Seconds", "%S", lambda x, p, v: isdigit(x) and len(x) <= 2 and 0 <= int(x) <= 59, "S", 0), - ("Fraction of second", "%f", lambda x, p, v: isdigit(x) and p is not None - and p.val == '.', "f", 0), - ("Timezone name", "%Z", lambda x, p, v: x.isalpha() and len(x) > 2 - and x in TZ_VALID_NAMES, "Z", 0), - ("Timezone +HHMM", "%z", lambda x, p, v: isdigit(x) and len(x) == 4 and 0 <= int(x[0:2]) < 15 - and 0 <= int(x[2:4]) < 60 and p is not None - and p.val == '+', "Z", 1), - ("Timezone -HHMM", "%z", lambda x, p, v: isdigit(x) and len(x) == 4 and 0 <= int(x[0:2]) < 15 - and 0 <= int(x[2:4]) < 60 and p is not None - and p.val == '-', "Z", 1), -] - - -class Token(object): - """Represents a part of a date string that's being parsed. - Note that __hash__ and __eq__ are overridden in order - to compare only meaningful parts of an object. - """ - def __init__(self, val, length): - self.val = val - self.length = length - self.compatible_types = () - - def __hash__(self): - h = hash(self.length) + hash(self.compatible_types) - if not self.compatible_types: - h += hash(self.val) - return hash(h) - - def __eq__(self, other): - """ - Two tokens are equal when these both are true: - a) length and compatible types are equal - b) if it is separator (no compatible types), separator values must be equal - """ - if self.length != other.length or self.compatible_types != other.compatible_types: - return False - if not other.compatible_types and self.val != other.val: - return False - return True - - -def _check_rule_1(pattern, types_used): - """Rule #1: Year MUST be in the date. Year is the minimum possible parsable date. - - Examples: - >>> _check_rule_1('%Y/%m/%d', 'Ymd') - True - >>> _check_rule_1('%m/%d', 'md') - False - """ - if 'Y' not in types_used: - logging.debug("Rule #1 is violated for pattern %s. Types used: %s", pattern, types_used) - return False - return True - - -def _check_rule_2(pattern, types_used): - """Rule #2: No holes (missing parts) in the format parts. 
- - Examples: - >>> _check_rule_2('%Y:%H', 'YH') - False - >>> _check_rule_2('%Y/%m/%d %H', 'YmdH') - True - """ - priorities = 'YmdHMSf' - seen_parts = [p in types_used for p in priorities] - if sorted(seen_parts, reverse=True) != seen_parts: - logging.debug("Rule #2 is violated for pattern %s. Types used: %s", pattern, types_used) - return False - return True - - -def _check_rule_3(pattern, types_used): - """Rule #3: Time parts are neighbors to time only. No interleaving time with the date. - - Examples: - >>> _check_rule_3('%m/%d %H:%M %Y', 'mdHMY') - True - >>> _check_rule_3('%m/%d %H:%Y:%M', 'mdHYM') - False - """ - time_parts = 'HMSf' - time_parts_highlighted = [t in time_parts for t in types_used] - time_parts_deduplicated = [a[0] for a in itertools.groupby(time_parts_highlighted)] - if len(list(filter(lambda x: x, time_parts_deduplicated))) > 1: - logging.debug("Rule #3 is violated for pattern %s. Types used: %s", pattern, types_used) - return False - return True - - -def _check_rule_4(pattern, types_used): - """Rule #4: It's highly impossible that minutes coming before hours, - millis coming before seconds etc. - - Examples: - >>> _check_rule_4('%H:%M', 'HM') - True - >>> _check_rule_4('%S:%M', 'SM') - False - """ - time_parts_priority = 'HMSf' - time_parts_indexes = list(filter(lambda x: x >= 0, - [time_parts_priority.find(t) for t in types_used])) - if sorted(time_parts_indexes) != time_parts_indexes: - logging.debug("Rule #4 is violated for pattern %s. Types used: %s", pattern, types_used) - return False - return True - - -def _check_rule_5(pattern, types_used): - """Rule #5: Pattern can't have some part of date/time defined more than once. - - Examples: - >>> _check_rule_5('%Y/%Y', 'YY') - False - >>> _check_rule_5('%m/%b', 'mm') - False - >>> _check_rule_5('%Y/%m', 'Ym') - True - """ - if len(types_used) != len(set(types_used)): - logging.debug("Rule #5 is violated for pattern %s. Types used: %s", pattern, types_used) - return False - return True - - -def _check_rule_6(tokens_chosen, pattern, types_used): - """Rule #6: Separators between elements of the time group should be the same. - - Examples: - _check_rule_5(tokens_chosen_1, '%Y-%m-%dT%H:%M:%S', 'YmdHMS') => True - _check_rule_5(tokens_chosen_2, '%Y-%m-%dT%H %M %S', 'YmdHMS') => True - _check_rule_5(tokens_chosen_3, '%Y-%m-%dT%H-%M:%S', 'YmdHMS') => False (different separators - ('-' and ':') in time group) - """ - time_parts = 'HMS' - num_of_time_parts_used = len(list(filter(lambda x: x in time_parts, types_used))) - time_parts_seen = 0 - separators_seen = [] - previous_was_a_separator = False - - for token in tokens_chosen: - if token[1] is not None and token[1][3] in time_parts: - # This rule doesn't work for separator-less time group so when we found the type - # and it's three letters then it's (see type "Compound HHMMSS") then stop iterating - if len(token[1][3]) == 3: - break - # If not a first time then - if time_parts_seen > 0 and not previous_was_a_separator: - separators_seen.append(None) - time_parts_seen += 1 - if time_parts_seen == num_of_time_parts_used: - break - previous_was_a_separator = False - else: - if time_parts_seen > 0: - separators_seen.append(token[0].val) - previous_was_a_separator = True - - if len(set(separators_seen)) > 1: - logging.debug("Rule #6 is violated for pattern %s. Seen separators: %s", - pattern, separators_seen) - return False - return True - - -def _check_rule_7a(pattern): - """Rule #7a: If am/pm is in date we assume that 12-hour dates are allowed only. 
- Otherwise it's 24-hour. - - Examples: - >>> _check_rule_7a('%Y/%m/%d %H:%M %p') - False - >>> _check_rule_7a('%Y/%m/%d %I:%M %p') - True - """ - if '%p' in pattern and '%H' in pattern: - logging.debug("Rule #7a is violated for pattern %s", pattern) - return False - return True - - -def _check_rule_7b(pattern): - """Rule #7b: If am/pm is in date we assume that 12-hour dates are allowed only. - Otherwise it's 24-hour. - - Examples: - >>> _check_rule_7b('%Y/%m/%d %I:%M') - False - >>> _check_rule_7b('%Y/%m/%d %I:%M %p') - True - """ - if '%I' in pattern and '%p' not in pattern: - logging.debug("Rule #7b is violated for pattern %s", pattern) - return False - return True - - -def _check_rule_8(pattern, types_used): - """Rule #9: Year can't be between other date elements - - Examples: - >>> _check_rule_8('%m/%Y/%d %I:%M', 'mYdIM') - False - """ - if 'mYd' in types_used or 'dYm' in types_used: - logging.debug("Rule #8 is violated for pattern %s", pattern) - return False - return True - - -def _tokenize_by_character_class(s): - """Return a list of strings by splitting s (tokenizing) by character class. - - Example: - >>> t = _tokenize_by_character_class('Thu, May 14th, 2014 1:15 pm +0000') - >>> [i.val for i in t] - ['Thu', ',', ' ', 'May', ' ', '14', 'th', ',', ' ', '2014', ' ', '1', ':', '15', ' ', 'pm', ' ', '+', '0000'] - - >>> t = _tokenize_by_character_class('5/14/2014') - >>> [i.val for i in t] - ['5', '/', '14', '/', '2014'] - """ - res = re.split(r'(\d+)|(\W)|(_)', s) - return [Token(i, len(i)) for i in res if i] - - -def _sliding_triplets(tokens): - for idx, t in enumerate(tokens): - yield (t, tokens[idx-1] if idx > 0 else None, tokens[idx+1] if idx < len(tokens)-1 else None) - - -def _analyze_tokens(tokens): - """Analyze each token and find out compatible types for it.""" - for token, prev, nxt in _sliding_triplets(tokens): - token.compatible_types = tuple([t for t in DATE_ELEMENTS if t[2](token.val, prev, nxt)]) - - -@lru_cache() -def _generate_all_permutations(tokens): - """Generate all permutations of format codes for given list of tokens. - - Brute-forcing of all possible permutations and rules checking eats most of the time or date - parsing. But since the input is expected to be highly uniform then we can expect that - memoization of this step will be very efficient. - - Token contains values for date parts but due to overridden eq and hash methods, - we treat two tokens having the same length and same possible formats as equal - tokens and separators should be the same - """ - all_patterns = set() - _generate_all_permutations_recursive(tokens, 0, [], "", all_patterns, "") - - return all_patterns - - -def _check_is_pattern_valid_quick_fail_rules(pattern, types_used): - """Apply rules which are applicable for partially constructed patterns. - - Example: duplicates of a date part in a pattern. - """ - return _check_rule_5(pattern, types_used) \ - and _check_rule_4(pattern, types_used) \ - and _check_rule_7a(pattern) - - -def _check_is_pattern_valid_full_pattern_rules(tokens_chosen, pattern, types_used): - """Apply rules which are applicable for full pattern only. - - Example: existence of Year part in the pattern. 
- """ - return _check_rule_1(pattern, types_used) \ - and _check_rule_2(pattern, types_used) \ - and _check_rule_3(pattern, types_used) \ - and _check_rule_6(tokens_chosen, pattern, types_used) \ - and _check_rule_7b(pattern) \ - and _check_rule_8(pattern, types_used) - - -def _generate_all_permutations_recursive(tokens, token_idx, tokens_chosen, pattern, found_patterns, - types_used): - """Generate all format elements permutations recursively. - - Args: - tokens (list[Token]): List of tokens. - token_idx (int): Index of token processing this cycle. - tokens_chosen (list[(Token, Token.compatible_type)]): List of tuples - containing token and compatible type - pattern (str): String containing format for parsing - found_patterns (set): Set of guessed patterns - types_used (str): String of types used to build pattern. - - Returns: - list: List of permutations - """ - if not _check_is_pattern_valid_quick_fail_rules(pattern, types_used): - return - - if token_idx < len(tokens): - t = tokens[token_idx] - if t.compatible_types: - for ct in t.compatible_types: - _generate_all_permutations_recursive(tokens, token_idx+1, tokens_chosen[:] + [(t, ct)], - (pattern if ct[4] == 0 else pattern[:-ct[4]]) + ct[1], - found_patterns, types_used + ct[3]) - else: - # if no compatible types it should be separator, add it to the pattern - _generate_all_permutations_recursive(tokens, token_idx+1, - tokens_chosen[:] + [(t, None)], pattern + t.val, - found_patterns, types_used) - else: - if _check_is_pattern_valid_full_pattern_rules(tokens_chosen, pattern, types_used): - found_patterns.add(pattern) - - -def guess(date): - """Guesses datetime.strftime/strptime-compliant date formats for date string. - - Args: - date (str): Date string. - - Returns: - set: Set of datetime.strftime/strptime-compliant date format strings - - Examples: - >>> guess('2014/05/05 14:00:00 UTC') - set(['%Y/%d/%m %H:%M:%S %Z', '%Y/%m/%d %H:%M:%S %Z']) - >>> guess('12/12/12') - set(['%y/%m/%d', '%d/%m/%y', '%m/%d/%y', '%y/%d/%m']) - """ - # Don't attempt to parse strings that are so long as to be certainly non-dates. Somewhat long - # strings could be dates (like "Wednesday, September 16, 2020 A.D. 08:47:02.2667911 AM -06:00", - # and who knows what other languages do). A limit is important also because the current approach - # can run into "maximum recursion depth exceeded" on a very long string. - if len(date) > 150: - return set() - tokens = _tokenize_by_character_class(date) - _analyze_tokens(tokens) - return _generate_all_permutations(tuple(tokens)) - - -def guess_bulk(dates, error_rate=0): - """Guesses datetime.strftime/strptime-compliant date formats for list of the samples. - - Args: - dates (list): List of samples date strings. 
- error_rate (float): Acceptable error rate (default 0.0) - - Returns: - list: List of datetime.strftime/strptime-compliant date format strings sorted by error rate - - Examples: - >>> guess_bulk(['12-11-2014', '12-25-2014']) - ['%m-%d-%Y'] - >>> guess_bulk(['12-11-2014', '25-25-2014']) - [] - >>> guess_bulk(['12-11-2013', '13-8-2013', '05-25-2013', '12-25-2013'], error_rate=0.5) - ['%m-%d-%Y'] - """ - if error_rate == 0.0: - patterns = None - for date in dates: - guesses_patterns = guess(date) - if patterns is None: - patterns = guesses_patterns - else: - patterns = patterns.intersection(guesses_patterns) - if not patterns: - break # No need to iterate more if zero patterns found - return list(patterns) - else: - found_dates = 0 - pattern_counters = defaultdict(lambda: 0) - num_dates = len(dates) - min_num_dates_to_be_found = num_dates - num_dates * error_rate - - for idx, date in enumerate(dates): - patterns = guess(date) - if patterns: - found_dates += 1 - for pattern in patterns: - pattern_counters[pattern] = pattern_counters[pattern] + 1 - - # Early return if number of strings that can't be date is already over error rate - cells_left = num_dates - idx - 1 - cannot_be_found = float(found_dates + cells_left) < min_num_dates_to_be_found - if cannot_be_found: - return [] - - patterns = [(v, k) for k, v in pattern_counters.items() - if v > min_num_dates_to_be_found] - patterns.sort(reverse=True) - return [k for (v, k) in patterns] diff --git a/sandbox/grist/imports/test_dateguess.py b/sandbox/grist/imports/test_dateguess.py deleted file mode 100644 index 8e960cd9..00000000 --- a/sandbox/grist/imports/test_dateguess.py +++ /dev/null @@ -1,102 +0,0 @@ -import unittest -from imports.dateguess import guess, guess_bulk - - -class TestGuesser(unittest.TestCase): - def assertDate(self, input_str, fmt_list): - guessed = guess(input_str) - self.assertEqual(set(guessed), set(fmt_list)) - - def assertDates(self, input_lst, error_rate, fmt_list): - guessed = guess_bulk(input_lst, error_rate=error_rate) - self.assertEqual(set(guessed), set(fmt_list)) - - def test_guess_dates(self): - self.assertDate('', []) - self.assertDate("2013-13-13", []) - self.assertDate("25/25/1911", []) - - self.assertDate("2014-01-11", ['%Y-%m-%d', '%Y-%d-%m']) - self.assertDate("2014-11-01", ['%Y-%m-%d', '%Y-%d-%m']) - self.assertDate("1990-05-05", ['%Y-%m-%d', '%Y-%d-%m']) - self.assertDate("2013-12-13", ['%Y-%m-%d']) - - self.assertDate("12/31/1999", ['%m/%d/%Y']) - self.assertDate("11/11/1911", ['%m/%d/%Y', '%d/%m/%Y']) - self.assertDate("5/9/1981", ['%m/%d/%Y', '%d/%m/%Y']) - self.assertDate("6/3/1985", ['%m/%d/%Y', '%d/%m/%Y']) - - self.assertDate("12/31/99", ['%m/%d/%y']) - self.assertDate("11/11/11", ['%y/%m/%d', '%y/%d/%m', '%m/%d/%y', '%d/%m/%y']) - self.assertDate("5/9/81", ['%m/%d/%y', '%d/%m/%y']) - self.assertDate("6/3/85", ['%m/%d/%y', '%d/%m/%y']) - - self.assertDate("31.12.91", ['%d.%m.%y']) - self.assertDate("4.4.87", ['%m.%d.%y', '%d.%m.%y']) - - self.assertDate("13.2.8", ['%y.%m.%d', '%y.%d.%m']) - self.assertDate("31.12.1991", ['%d.%m.%Y']) - self.assertDate("4.4.1987", ['%m.%d.%Y', '%d.%m.%Y']) - self.assertDate("13.2.2008", ['%d.%m.%Y']) - self.assertDate("31.12.91", ['%d.%m.%y']) - self.assertDate("4.4.87", ['%m.%d.%y', '%d.%m.%y']) - self.assertDate("13.2.8", ['%y.%m.%d', '%y.%d.%m']) - - self.assertDate("9 May 1981", ['%d %b %Y', '%d %B %Y']) - self.assertDate("31 Dec 1999", ['%d %b %Y']) - self.assertDate("1 Jan 2012", ['%d %b %Y']) - self.assertDate("3 August 2009", ['%d %B %Y']) - 
self.assertDate("2 May 1980", ['%d %B %Y', '%d %b %Y']) - - self.assertDate("13/1/2012", ['%d/%m/%Y']) - - self.assertDate("Aug 1st 2014", ['%b %dst %Y']) - self.assertDate("12/22/2015 00:00:00.10", ['%m/%d/%Y %H:%M:%S.%f']) - - def test_guess_datetimes(self): - self.assertDate("Thu Sep 25 10:36:28 2003", ['%a %b %d %H:%M:%S %Y']) - self.assertDate("Thu Sep 25 2003 10:36:28", ['%a %b %d %Y %H:%M:%S']) - self.assertDate("10:36:28 Thu Sep 25 2003", ['%H:%M:%S %a %b %d %Y']) - - self.assertDate("2014-01-11T12:21:05", ['%Y-%m-%dT%H:%M:%S', '%Y-%d-%mT%H:%M:%S']) - self.assertDate("2015-02-16T16:05:31", ['%Y-%m-%dT%H:%M:%S']) - # TODO remove all except first one - self.assertDate("2015-02-16T16:05", ['%Y-%m-%dT%H:%M', '%Y-%H-%MT%d:%m', - '%Y-%m-%HT%M:%d', '%Y-%d-%HT%M:%m']) - self.assertDate("2015-02-16T16", ['%Y-%m-%dT%H', '%Y-%m-%HT%d']) #TODO remove second one - - self.assertDate("Mon Jan 13 9:52:52 am MST 2014", ['%a %b %d %I:%M:%S %p %Z %Y']) - self.assertDate("Tue Jan 21 3:30:00 PM EST 2014", ['%a %b %d %I:%M:%S %p %Z %Y']) - self.assertDate("Mon Jan 13 09:52:52 MST 2014", ['%a %b %d %H:%M:%S %Z %Y']) - self.assertDate("Tue Jan 21 15:30:00 EST 2014", ['%a %b %d %H:%M:%S %Z %Y']) - self.assertDate("Mon Jan 13 9:52 am MST 2014", ['%a %b %d %I:%M %p %Z %Y']) - self.assertDate("Tue Jan 21 3:30 PM EST 2014", ['%a %b %d %I:%M %p %Z %Y']) - - self.assertDate("2014-01-11T12:21:05", ['%Y-%m-%dT%H:%M:%S', '%Y-%d-%mT%H:%M:%S']) - self.assertDate("2015-02-16T16:05:31", ['%Y-%m-%dT%H:%M:%S']) - self.assertDate("Thu Sep 25 10:36:28 2003", ['%a %b %d %H:%M:%S %Y']) - self.assertDate("10:36:28 Thu Sep 25 2003", ['%H:%M:%S %a %b %d %Y']) - - self.assertDate("2014-01-11T12:21:05+0000", ['%Y-%d-%mT%H:%M:%S%z', '%Y-%m-%dT%H:%M:%S%z']) - self.assertDate("2015-02-16T16:05:31-0400", ['%Y-%m-%dT%H:%M:%S%z']) - self.assertDate("Thu, 25 Sep 2003 10:49:41 -0300", ['%a, %d %b %Y %H:%M:%S %z']) - self.assertDate("Thu, 25 Sep 2003 10:49:41 +0300", ['%a, %d %b %Y %H:%M:%S %z']) - - self.assertDate("2003-09-25T10:49:41", ['%Y-%m-%dT%H:%M:%S']) - self.assertDate("2003-09-25T10:49", ['%Y-%m-%dT%H:%M']) - - def test_guess_bulk_dates(self): - self.assertDates(["11/11/1911", "25/11/1911", "11/11/1911", "11/11/1911"], 0.0, ['%d/%m/%Y']) - self.assertDates(["25/11/1911", "25/25/1911", "11/11/1911", "11/11/1911"], 0.0, []) - self.assertDates(["25/11/1911", "25/25/1911", "11/11/1911", "11/11/1911"], 0.5, ['%d/%m/%Y']) - - self.assertDates(["25/11/1911", "25/25/1911", "11/11/1911", "11/11/1911"], 0.1, []) - self.assertDates(["23/11/1911", '2004 May 12', "11/11/1911", "11/11/1911"], 0.5, ['%d/%m/%Y']) - - self.assertDates(['2004 May 12', "11/11/1911", "11/11/1911", "23/11/1911"], 0.5, ['%d/%m/%Y']) - self.assertDates(['2004 May 12', "11/11/1911", "11/11/1911", "23/11/1911"], 0.0, []) - self.assertDates(['12/22/2015', "12/22/2015 1:15pm", "2018-02-27 16:08:39 +0000"], 0.1, []) - - -if __name__ == "__main__": - unittest.main() diff --git a/sandbox/grist/imports/test_import_csv.py b/sandbox/grist/imports/test_import_csv.py index 4a45513d..67364c8d 100644 --- a/sandbox/grist/imports/test_import_csv.py +++ b/sandbox/grist/imports/test_import_csv.py @@ -4,8 +4,6 @@ import textwrap import unittest from six import BytesIO, text_type import csv -import calendar -import datetime from imports import import_csv @@ -22,9 +20,15 @@ def bytes_io_from_str(string): class TestImportCSV(unittest.TestCase): - def _check_col(self, sheet, index, name, typename, values): + def _check_col(self, sheet, index, name, _typename, values): 
self.assertEqual(sheet["column_metadata"][index]["id"], name) - self.assertEqual(sheet["column_metadata"][index]["type"], typename) + # Previously, strings were parsed and types were guessed in CSV imports. + # Now all data is kept as strings and the column type is left as Any + # so that type guessing and parsing can happen elsewhere. + # To avoid updating 85 calls to _check_col, the typename argument was kept but can be ignored, + # and all values are converted back to strings for comparison. + self.assertEqual(sheet["column_metadata"][index]["type"], "Any") + values = [text_type(v) for v in values] self.assertEqual(sheet["table_data"][index], values) def _check_num_cols(self, sheet, exp_cols): @@ -40,18 +44,16 @@ class TestImportCSV(unittest.TestCase): self._check_col(sheet, 1, "int2", "Int", [5, '', '']) self._check_col(sheet, 2, "textint", "Text", ["12345678902345689", '', '']) self._check_col(sheet, 3, "bigint", "Text", ["320150170634561830", '', '']) - self._check_col(sheet, 4, "num2", "Numeric", [123456789.123456, '', '']) - self._check_col(sheet, 5, "bignum", "Numeric", [7.22597e+86, '', '']) + self._check_col(sheet, 4, "num2", "Numeric", ['123456789.1234560000', '', '']) + self._check_col(sheet, 5, "bignum", "Numeric", ['7.22597E+86', '', '']) self._check_col(sheet, 6, "date1", "DateTime", - [calendar.timegm(datetime.datetime(2015, 12, 22, 11, 59, 00).timetuple()), None, None]) + [u'12/22/15 11:59 AM', u'', u'']) self._check_col(sheet, 7, "date2", "Date", - [calendar.timegm(datetime.datetime(2015, 12, 20, 0, 0, 0).timetuple()), None, None]) + [u'December 20, 2015', u'', u'']) self._check_col(sheet, 8, "datetext", "Date", - [calendar.timegm(datetime.date(2015, 12, 22).timetuple()), None, None]) + [u'12/22/2015', u'', u'']) self._check_col(sheet, 9, "datetimetext", "DateTime", - [calendar.timegm(datetime.datetime(2015, 12, 22, 0, 0, 0).timetuple()), - calendar.timegm(datetime.datetime(2015, 12, 22, 13, 15, 0).timetuple()), - calendar.timegm(datetime.datetime(2018, 2, 27, 16, 8, 39).timetuple())]) + [u'12/22/2015 00:00:00', u'12/22/2015 13:15:00', u'02/27/2018 16:08:39']) def test_user_parse_options(self): @@ -68,7 +70,11 @@ class TestImportCSV(unittest.TestCase): self._check_col(parsed_file, 2, "PHONE", "Text", ['201-343-3434', '201.343.3434', '2013433434', '(201)343-3434']) self._check_col(parsed_file, 3, "VALUE", "Int", [45, 4545, 0, 4]) - self._check_col(parsed_file, 4, "DATE", "DateTime", [1519747719.0, 1519744119.0, 1519751319.0, None]) + self._check_col(parsed_file, 4, "DATE", "DateTime", + [u'2018-02-27 16:08:39 +0000', + u'2018-02-27 16:08:39 +0100', + u'2018-02-27 16:08:39 -0100', + u'']) def test_wrong_cols1(self): file_obj = bytes_io_from_str(textwrap.dedent( diff --git a/sandbox/grist/imports/test_import_xls.py b/sandbox/grist/imports/test_import_xls.py index d5a4fbc0..7a4a3326 100644 --- a/sandbox/grist/imports/test_import_xls.py +++ b/sandbox/grist/imports/test_import_xls.py @@ -16,31 +16,33 @@ class TestImportXLS(unittest.TestCase): def _check_col(self, sheet, index, name, typename, values): self.assertEqual(sheet["column_metadata"][index]["id"], name) self.assertEqual(sheet["column_metadata"][index]["type"], typename) + if typename == "Any": + # Convert values to strings to reduce changes to tests after imports were overhauled. 
+ values = [str(v) for v in values] self.assertEqual(sheet["table_data"][index], values) def test_excel(self): parsed_file = import_xls.parse_file(*_get_fixture('test_excel.xlsx')) - # check that column type was correctly set to int and values are properly parsed - self.assertEqual(parsed_file[1][0]["column_metadata"][0], {"type": "Int", "id": "numbers"}) + # check that column type was correctly set to numeric and values are properly parsed + self.assertEqual(parsed_file[1][0]["column_metadata"][0], {"type": "Numeric", "id": "numbers"}) self.assertEqual(parsed_file[1][0]["table_data"][0], [1, 2, 3, 4, 5, 6, 7, 8]) # check that column type was correctly set to text and values are properly parsed - self.assertEqual(parsed_file[1][0]["column_metadata"][1], {"type": "Text", "id": "letters"}) + self.assertEqual(parsed_file[1][0]["column_metadata"][1], {"type": "Any", "id": "letters"}) self.assertEqual(parsed_file[1][0]["table_data"][1], ["a", "b", "c", "d", "e", "f", "g", "h"]) - # messy tables does not support bool types yet, it classifies them as ints - self.assertEqual(parsed_file[1][0]["column_metadata"][2], {"type": "Bool", "id": "boolean"}) - self.assertEqual(parsed_file[1][False]["table_data"][2], - [True, False, True, False, True, False, True, False]) + # 0s and 1s become Numeric, not boolean like in the past + self.assertEqual(parsed_file[1][0]["column_metadata"][2], {"type": "Numeric", "id": "boolean"}) + self.assertEqual(parsed_file[1][0]["table_data"][2], [1, 0, 1, 0, 1, 0, 1, 0]) # check that column type was correctly set to text and values are properly parsed self.assertEqual(parsed_file[1][0]["column_metadata"][3], - {"type": "Text", "id": "corner-cases"}) + {"type": "Any", "id": "corner-cases"}) self.assertEqual(parsed_file[1][0]["table_data"][3], # The type is detected as text, so all values should be text. 
- [u'=function()', '3.0', u'two spaces after ', + [u'=function()', u'3.0', u'two spaces after ', u' two spaces before', u'!@#$', u'€€€', u'√∫abc$$', u'line\nbreak']) # check that multiple tables are created when there are multiple sheets in a document @@ -51,23 +53,19 @@ class TestImportXLS(unittest.TestCase): def test_excel_types(self): parsed_file = import_xls.parse_file(*_get_fixture('test_excel_types.xlsx')) sheet = parsed_file[1][0] - self._check_col(sheet, 0, "int1", "Int", [-1234123, '', '']) - self._check_col(sheet, 1, "int2", "Int", [5, '', '']) - self._check_col(sheet, 2, "textint", "Text", ["12345678902345689", '', '']) - self._check_col(sheet, 3, "bigint", "Text", ["320150170634561830", '', '']) + self._check_col(sheet, 0, "int1", "Numeric", [-1234123, '', '']) + self._check_col(sheet, 1, "int2", "Numeric", [5, '', '']) + self._check_col(sheet, 2, "textint", "Any", ["12345678902345689", '', '']) + self._check_col(sheet, 3, "bigint", "Any", ["320150170634561830", '', '']) self._check_col(sheet, 4, "num2", "Numeric", [123456789.123456, '', '']) self._check_col(sheet, 5, "bignum", "Numeric", [math.exp(200), '', '']) self._check_col(sheet, 6, "date1", "DateTime", [calendar.timegm(datetime.datetime(2015, 12, 22, 11, 59, 00).timetuple()), None, None]) self._check_col(sheet, 7, "date2", "Date", [calendar.timegm(datetime.datetime(2015, 12, 20, 0, 0, 0).timetuple()), None, None]) - self._check_col(sheet, 8, "datetext", "Date", - [calendar.timegm(datetime.date(2015, 12, 22).timetuple()), None, None]) - # TODO: all dates have different format - # self._check_col(sheet, 9, "datetimetext", "DateTime", - # [calendar.timegm(datetime.datetime(2015, 12, 22, 0, 0, 0).timetuple()), - # calendar.timegm(datetime.datetime(2015, 12, 22, 13, 15, 0).timetuple()), - # calendar.timegm(datetime.datetime(2018, 02, 27, 16, 8, 39).timetuple())]) + self._check_col(sheet, 8, "datetext", "Any", ['12/22/2015', '', '']) + self._check_col(sheet, 9, "datetimetext", "Any", + [u'12/22/2015', u'12/22/2015 1:15pm', u'2018-02-27 16:08:39 +0000']) def test_excel_type_detection(self): # This tests goes over the second sheet of the fixture doc, which has multiple rows that try @@ -81,23 +79,20 @@ class TestImportXLS(unittest.TestCase): 1454544000.0, 1199577600.0, 1451692800.0, 1451549340.0, 1483214940.0]) self._check_col(sheet, 1, "float_not_int", "Numeric", [1,2,3,4,5,"",6,7,8,9,10,10.25,11,12,13,14,15,16,17,18]) - self._check_col(sheet, 2, "int_not_bool", "Int", + self._check_col(sheet, 2, "int_not_bool", "Any", [0, 0, 1, 0, 1, 0, 0, 1, 0, 2, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0]) - self._check_col(sheet, 3, "float_not_bool", "Numeric", + self._check_col(sheet, 3, "float_not_bool", "Any", [0, 0, 1, 0, 1, 0, 0, 1, 0, 0.5, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0]) - self._check_col(sheet, 4, "text_as_bool", "Bool", + self._check_col(sheet, 4, "text_as_bool", "Any", [0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0]) - self._check_col(sheet, 5, "int_as_bool", "Bool", + self._check_col(sheet, 5, "int_as_bool", "Numeric", [0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0]) - self._check_col(sheet, 6, "float_not_date", "Numeric", + self._check_col(sheet, 6, "float_not_date", "Any", [4.0, 6.0, 4.0, 4.0, 6.0, 4.0, '--', 6.0, 4.0, 4.0, 4.0, 4.0, 4.0, 6.0, 6.0, 4.0, 6.0, '3-4', 4.0, 6.5]) self._check_col(sheet, 7, "float_not_text", "Numeric", - [-10.25, -8.00, -5.75, -3.50, "n/a", 1.00, " ??? ", 5.50, "", "-", + [-10.25, -8.00, -5.75, -3.50, "n/a", ' 1. ', " ??? 
", 5.50, "", "-", 12.25, 0.00, "", 0.00, "--", 23.50, "NA", 28.00, 30.25, 32.50]) - self._check_col(sheet, 8, "dollar_amts", "Numeric", - [0.00, 0.75, 1.50, '', 3.00, 0.00, 0.75, 1.50, '--', 3.00, 1234.56, 1000, - 1001.50, '-', 3000000.000, 0000.00, 1234.56, 1000, 1001.50, 1000.01]) def test_excel_single_merged_cell(self): # An older version of xlrd had a bug where a single cell marked as 'merged' would cause an @@ -107,11 +102,11 @@ class TestImportXLS(unittest.TestCase): self.assertEqual(tables, [{ 'table_name': u'Transaction Report', 'column_metadata': [ - {'type': 'Text', 'id': u''}, + {'type': 'Any', 'id': u''}, {'type': 'Numeric', 'id': u'Start'}, {'type': 'Numeric', 'id': u''}, {'type': 'Numeric', 'id': u''}, - {'type': 'Text', 'id': u'Seek no easy ways'}, + {'type': 'Any', 'id': u'Seek no easy ways'}, ], 'table_data': [ [u'SINGLE MERGED', u'The End'], @@ -133,15 +128,15 @@ class TestImportXLS(unittest.TestCase): self.assertEqual(tables, [{ 'table_name': u'Sheet1', 'column_metadata': [ - {'id': 'a', 'type': 'Text'}, + {'id': 'a', 'type': 'Any'}, {'id': 'b', 'type': 'Date'}, - {'id': 'c', 'type': 'Text'}, - {'id': 'd', 'type': 'Text'}, + {'id': 'c', 'type': 'Any'}, + {'id': 'd', 'type': 'Any'}, {'id': 'e', 'type': 'Numeric'}, - {'id': 'f', 'type': 'Int'}, - {'id': 'g', 'type': 'Date'}, + {'id': 'f', 'type': 'Numeric'}, + {'id': 'g', 'type': 'Any'}, {'id': 'h', 'type': 'Date'}, - {'id': 'i', 'type': 'Bool'}, + {'id': 'i', 'type': 'Numeric'}, ], 'table_data': [ [u'21:14:00'], @@ -150,9 +145,9 @@ class TestImportXLS(unittest.TestCase): [u'10:20:30'], [4.180902777777778], [20], - [-6106060800.0], + [u'7/4/1776'], [205286400.0], - [False], # This is not great either, we should be able to distinguish 0 from FALSE. + [0], ], }]) diff --git a/sandbox/grist/parse_data.py b/sandbox/grist/parse_data.py index 460fa6f8..c5aa1cf4 100644 --- a/sandbox/grist/parse_data.py +++ b/sandbox/grist/parse_data.py @@ -7,13 +7,11 @@ dictionary with "type" and "data" fields, where "type" is a Grist type string, a of values. All "data" lists will have the same length. """ -from imports import dateguess import datetime import logging import re import messytables import moment # TODO grist internal libraries might not be available to plugins in the future. -import dateutil.parser as date_parser import six from six.moves import zip, xrange @@ -25,12 +23,17 @@ log = logging.getLogger(__name__) # Our approach to type detection is different from that of messytables. -# We first go through each cell in a sample of rows, trying to convert it to each of the basic +# We first go through each cell in a sample of rows, checking if it's one of the basic # types, and keep a count of successes for each. We use the counts to decide the basic types (e.g. # numeric vs text). Then we go through the full data set converting to the chosen basic type. # During this process, we keep counts of suitable Grist types to consider (e.g. Int vs Numeric). # We use those counts to produce the selected Grist type at the end. +# Previously string values were used here for type guessing and were parsed to typed values. +# That process now happens elsewhere, and this module only handles the case +# where the imported data already contains actual numbers or dates. +# This happens for Excel sheets but not CSV files. 
+ class BaseConverter(object): @classmethod @@ -57,50 +60,19 @@ class BaseConverter(object): class NumericConverter(BaseConverter): - """Handles numeric values, including Grist types Numeric and Int.""" - - # A number matching this is probably an identifier of some sort. Converting it to a float will - # lose precision, so it's better not to consider it numeric. - _unlikely_float = re.compile(r'\d{17}|^0\d') - - # Integers outside this range will be represented as floats. This is the limit for values that can - # be stored in a JS Int32Array. - _max_js_int = 1<<31 - - # The thousands separator. It should be locale-specific, but we don't currently have a way to - # detect locale from the data. (Also, the sandbox's locale module isn't fully functional.) - _thousands_sep = ',' + """Handles the Grist Numeric type""" @classmethod def convert(cls, value): if type(value) in six.integer_types + (float, complex): return value - if type(value) in (str, six.text_type) and not cls._unlikely_float.search(value): - return float(value.strip().lstrip('$').replace(cls._thousands_sep, "")) raise ValueError() - @classmethod - def _is_integer(cls, value): - ttype = type(value) - if ttype == int or (ttype == float and value.is_integer()): - return -cls._max_js_int <= value < cls._max_js_int - return False - @classmethod def get_grist_column(cls, values): - if all(cls._is_integer(v) for v in values): - return ("Int", [int(v) for v in values]) return ("Numeric", values) -class DateParserInfo(date_parser.parserinfo): - def validate(self, res): - # Avoid this bogus combination which accepts plain numbers. - if res.day and not res.month: - return False - return super(DateParserInfo, self).validate(res) - - class SimpleDateTimeConverter(BaseConverter): """Handles Date and DateTime values which are already instances of datetime.datetime.""" @@ -124,66 +96,18 @@ class SimpleDateTimeConverter(BaseConverter): return grist_type, grist_values -class DateTimeCoverter(BaseConverter): - """Handles dateformats by guessed format.""" - - def __init__(self, date_format): - self._format = date_format - - def convert(self, value): - if value == "": - return None - if type(value) in (str, six.text_type): - # datetime.strptime doesn't handle %z and %Z tags in Python 2. - if '%z' in self._format or '%Z' in self._format: - return date_parser.parse(value) - else: - try: - return datetime.datetime.strptime(value, self._format) - except ValueError: - return date_parser.parse(value) - - raise ValueError() - - def _is_date(self, value): - return value is None or value.time() == datetime.time() - - def get_grist_column(self, values): - grist_type = "Date" if all(self._is_date(v) for v in values) else "DateTime" - grist_values = [(v if (v is None) else moment.dt_to_ts(v)) - for v in values] - return grist_type, grist_values - - -class BoolConverter(BaseConverter): - """Handles Boolean type.""" - - _true_values = (1, '1', 'true', 'yes') - _false_values = (0, '0', 'false', 'no') - - @classmethod - def convert(cls, value): - v = value.strip().lower() if type(value) in (str, six.text_type) else value - if v in cls._true_values: - return True - elif v in cls._false_values: - return False - raise ValueError() - - @classmethod - def get_grist_column(cls, values): - return ("Bool", values) - - -class TextConverter(BaseConverter): - """Fallback converter that converts everything to strings.""" +class AnyConverter(BaseConverter): + """ + Fallback converter that converts everything to strings. 
+ Type guessing and parsing of the strings will happen elsewhere. + """ @classmethod def convert(cls, value): return six.text_type(value) @classmethod def get_grist_column(cls, values): - return ("Text", values) + return ("Any", values) class ColumnDetector(object): @@ -194,7 +118,7 @@ class ColumnDetector(object): """ # Converters are listed in the order of preference, which is only used if two converters succeed # on the same exact number of values. Text is always a fallback. - converters = [SimpleDateTimeConverter, BoolConverter, NumericConverter] + converters = [SimpleDateTimeConverter, NumericConverter] # If this many non-junk values or more can't be converted, fall back to text. _text_threshold = 0.10 @@ -221,19 +145,11 @@ class ColumnDetector(object): self._counts[i] += 1 def get_converter(self): - if sum(self._counts) == 0: - # if not already guessed as int, bool or datetime then we should try to guess date pattern - str_data = [d for d in self._data if isinstance(d, six.string_types)] - data_formats = dateguess.guess_bulk(str_data, error_rate=self._text_threshold) - data_format = data_formats[0] if data_formats else None - if data_format: - return DateTimeCoverter(data_format) - # We find the max by count, and secondarily by minimum index in the converters list. count, neg_index = max((c, -i) for (i, c) in enumerate(self._counts)) if count > 0 and count >= self._count_nonjunk * (1 - self._text_threshold): return self.converters[-neg_index] - return TextConverter + return AnyConverter def _guess_basic_types(rows, num_columns):
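
To make the counting approach described in parse_data.py concrete, here is a rough TypeScript paraphrase of the detection loop; the names and the Map-based API are invented for illustration, and the Python module above remains the actual implementation (including its order-based tiebreak between converters):

```typescript
// Sketch of per-column type detection: try each converter on every non-junk cell,
// count successes, and pick a converter only if it handles at least ~90% of values
// (1 - textThreshold); otherwise fall back to "Any" and keep the strings as-is,
// leaving type guessing and parsing to ValueGuesser/ValueParser.
type Converter = (value: unknown) => unknown;  // throws on values it cannot handle

function detectColumnType(values: unknown[],
                          converters: Map<string, Converter>,
                          textThreshold = 0.10): string {
  const counts = new Map<string, number>();
  let nonJunk = 0;
  for (const value of values) {
    if (value === null || value === '') { continue; }  // skip empty/junk cells
    nonJunk++;
    for (const [name, convert] of converters) {
      try {
        convert(value);
        counts.set(name, (counts.get(name) ?? 0) + 1);
      } catch {
        // Conversion failed; this converter simply doesn't get a point.
      }
    }
  }
  for (const [name, count] of counts) {
    if (count > 0 && count >= nonJunk * (1 - textThreshold)) {
      return name;  // first sufficiently successful converter wins in this sketch
    }
  }
  return 'Any';
}
```
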