(core) Lossless imports

Summary:
- Removed string parsing and some type guessing code from parse_data.py. That logic is now implicitly done by ValueGuesser by leaving the initial column type as Any. parse_data.py mostly comes into play when importing files (e.g. Excel) containing values that already have types, i.e. numbers and dates.
- 0s and 1s are treated as numbers instead of booleans to keep imports lossless.
- Removed dateguess.py and test_dateguess.py.
- Changed what `guessDateFormat` does when multiple date formats work equally well for the given data, in order to be consistent with the old dateguess.py.
- Columns containing numbers are now always imported as Numeric, never Int.
- Removed `NullIfEmptyParser` because it was interfering with the new system. Its purpose was to avoid pointlessly changing a column from Any to Text when no actual data was inserted. A different solution to that problem was already added to `_ensure_column_accepts_data` in the data engine in a recent related diff.

Test Plan:
- Added 2 `nbrowser/Importer2` tests.
- Updated various existing tests.
- Extended testing of `guessDateFormat`. Added `guessDateFormats` to show how ambiguous dates are handled internally.

Reviewers: georgegevoian

Reviewed By: georgegevoian

Differential Revision: https://phab.getgrist.com/D3302
This commit is contained in:
Alex Hall 2022-03-04 19:37:56 +02:00
parent 9522438967
commit 321019217d
14 changed files with 150 additions and 785 deletions

View File

@ -353,6 +353,7 @@ export class Importer extends DisposableWithEvents {
label: field.label(),
colId: destTableId ? field.colId() : null, // if inserting into new table, colId isn't defined
type: field.column().type(),
widgetOptions: field.column().widgetOptions(),
formula: field.column().formula()
})),
sourceCols: sourceFields.map((field) => field.colId())

View File

@ -105,7 +105,7 @@ export async function prepTransformColInfo(docModel: DocModel, origCol: ColumnRe
let {dateFormat} = prevOptions;
if (!dateFormat) {
const colValues = tableData.getColValues(sourceCol.colId()) || [];
dateFormat = guessDateFormat(colValues.map(String)) || "YYYY-MM-DD";
dateFormat = guessDateFormat(colValues.map(String));
}
widgetOptions = dateTimeWidgetOptions(dateFormat, true);
break;

View File

@ -49,6 +49,7 @@ export interface TransformColumn {
colId: string|null;
type: string;
formula: string;
widgetOptions: string;
}
export interface ImportResult {

View File

@ -3,7 +3,7 @@ import {ApplyUAResult, QueryFilters} from 'app/common/ActiveDocAPI';
import {BaseAPI, IOptions} from 'app/common/BaseAPI';
import {BillingAPI, BillingAPIImpl} from 'app/common/BillingAPI';
import {BrowserSettings} from 'app/common/BrowserSettings';
import {BulkColValues, TableColValues, UserAction} from 'app/common/DocActions';
import {BulkColValues, TableColValues, TableRecordValue, TableRecordValues, UserAction} from 'app/common/DocActions';
import {DocCreationInfo, OpenDocMode} from 'app/common/DocListAPI';
import {Features} from 'app/common/Features';
import {ICustomWidget} from 'app/common/CustomWidget';
@ -402,6 +402,11 @@ export interface UserAPI {
filters?: string;
}
/**
 * Optional settings shared by the table-reading calls `getRows` and `getRecords`.
 */
interface GetRowsParams {
  // When set, serialized with JSON.stringify and sent as the `filter` query parameter.
  filters?: QueryFilters;
  // When true, `immediate=true` is added to the query string.
  immediate?: boolean;
}
/**
* Collect endpoints related to the content of a single document that we've been thinking
* of as the (restful) "Doc API". A few endpoints that could be here are not, for historical
@ -411,8 +416,8 @@ export interface DocAPI {
// The immediate flag is a currently unadvertised feature, allowing a query to proceed without
// waiting for a document to be initialized. This is useful if the calculations done when
// opening a document are irrelevant.
getRows(tableId: string, options?: { filters?: QueryFilters,
immediate?: boolean }): Promise<TableColValues>;
getRows(tableId: string, options?: GetRowsParams): Promise<TableColValues>;
getRecords(tableId: string, options?: GetRowsParams): Promise<TableRecordValue[]>;
updateRows(tableId: string, changes: TableColValues): Promise<number[]>;
addRows(tableId: string, additions: BulkColValues): Promise<number[]>;
removeRows(tableId: string, removals: number[]): Promise<number[]>;
@ -869,16 +874,13 @@ export class DocAPIImpl extends BaseAPI implements DocAPI {
this._url = `${url}/api/docs/${docId}`;
}
public async getRows(tableId: string, options?: { filters?: QueryFilters,
immediate?: boolean }): Promise<TableColValues> {
const url = new URL(`${this._url}/tables/${tableId}/data`);
if (options?.filters) {
url.searchParams.append('filter', JSON.stringify(options.filters));
public async getRows(tableId: string, options?: GetRowsParams): Promise<TableColValues> {
return this._getRecords(tableId, 'data', options);
}
if (options?.immediate) {
url.searchParams.append('immediate', 'true');
}
return this.requestJson(url.href);
public async getRecords(tableId: string, options?: GetRowsParams): Promise<TableRecordValue[]> {
const response: TableRecordValues = await this._getRecords(tableId, 'records', options);
return response.records;
}
public async updateRows(tableId: string, changes: TableColValues): Promise<number[]> {
@ -967,6 +969,17 @@ export class DocAPIImpl extends BaseAPI implements DocAPI {
url.searchParams.append('code', code);
return this.requestJson(url.href);
}
/**
 * Fetches the contents of a table from either the `data` or the `records` endpoint.
 *
 * @param tableId - Table whose contents are requested.
 * @param endpoint - Last path segment of the request URL; it selects the response shape.
 * @param options - Optional filters and the `immediate` flag, forwarded as query parameters.
 * @returns The parsed JSON response, untyped because its shape depends on `endpoint`.
 */
private _getRecords(tableId: string, endpoint: 'data' | 'records', options?: GetRowsParams): Promise<any> {
  const filters = options?.filters;
  const immediate = options?.immediate;
  const target = new URL(`${this._url}/tables/${tableId}/${endpoint}`);
  const query = target.searchParams;
  if (filters) {
    query.append('filter', JSON.stringify(filters));
  }
  if (immediate) {
    query.append('immediate', 'true');
  }
  return this.requestJson(target.href);
}
}
/**

View File

@ -162,7 +162,7 @@ export function guessColInfo(
NumberParse.fromSettings(docSettings).guessOptions(values)
)
.guess(values, docSettings) ||
new DateGuesser(guessDateFormat(values, timezone) || "YYYY-MM-DD", timezone)
new DateGuesser(guessDateFormat(values, timezone), timezone)
.guess(values, docSettings) ||
// Don't return the same values back if there's no conversion to be done,
// as they have to be serialized and transferred over a pipe to Python.

View File

@ -36,18 +36,6 @@ export class ValueParser {
class IdentityParser extends ValueParser {
}
/**
* Same as basic Value parser, but will return null if a value is an empty string.
*/
class NullIfEmptyParser extends ValueParser {
public cleanParse(value: string): any {
if (value === "") {
return null;
}
return super.cleanParse(value);
}
}
export class NumericParser extends ValueParser {
private _parse: NumberParse;
@ -225,7 +213,6 @@ export class ReferenceListParser extends ReferenceParser {
}
export const valueParserClasses: { [type: string]: typeof ValueParser } = {
Any: NullIfEmptyParser,
Numeric: NumericParser,
Int: NumericParser,
Date: DateParser,

View File

@ -1,4 +1,5 @@
import escapeRegExp = require('lodash/escapeRegExp');
import last = require('lodash/last');
import memoize = require('lodash/memoize');
import {getDistinctValues, isObject} from 'app/common/gutil';
// Simply importing 'moment-guess' inconsistently imports bundle.js or bundle.esm.js depending on environment
@ -325,7 +326,26 @@ function standardizeTime(timeString: string): { remaining: string, time: string
return {remaining: timeString.slice(0, match.index).trim(), time: `${hh}:${mm}:${ss}`};
}
export function guessDateFormat(values: Array<string | null>, timezone: string = 'UTC'): string | null {
/**
 * Guesses a full date[time] format that best matches the given strings.
 * If several formats match equally well, picks the last one lexicographically to match the old date guessing.
 * This means formats with an early Y and/or M are favoured.
 * If no formats match, returns the default YYYY-MM-DD.
 *
 * @param values - Sample cell values; the matching itself is delegated to `guessDateFormats`.
 * @param timezone - Timezone passed through to `guessDateFormats`.
 * @returns A single format string, never null or undefined.
 */
export function guessDateFormat(values: Array<string | null>, timezone: string = 'UTC'): string {
  // `last` returns undefined for both a null and an empty list, so a single fallback
  // covers "no guess at all" as well as "no candidates" without a non-null assertion.
  return last(guessDateFormats(values, timezone)) || "YYYY-MM-DD";
}
/**
* Returns all full date[time] formats that best match the given strings.
* If several formats match equally well, returns them all.
* May return null if there are no matching formats or choosing one is too expensive.
*/
export function guessDateFormats(values: Array<string | null>, timezone: string = 'UTC'): string[] | null {
const dateStrings: string[] = values.filter(isObject);
const sample = getDistinctValues(dateStrings, 100);
const formats: Record<string, number> = {};
@ -358,7 +378,9 @@ export function guessDateFormat(values: Array<string | null>, timezone: string =
}
const maxCount = Math.max(...Object.values(formats));
return formatKeys.find(format => formats[format] === maxCount)!;
// Return all formats that tied for first place.
// Sort lexicographically for consistency in tests and with the old dateguess.py.
return formatKeys.filter(format => formats[format] === maxCount).sort();
}
export const dateFormatOptions = [

View File

@ -294,7 +294,7 @@ export class ActiveDocImport {
const origTableName = table.table_name ? table.table_name : '';
const transformRule = transformRuleMap && transformRuleMap.hasOwnProperty(origTableName) ?
transformRuleMap[origTableName] : null;
const columnMetadata = addLabelsIfPossible(table.column_metadata);
const columnMetadata = cleanColumnMetadata(table.column_metadata);
const result: ApplyUAResult = await this._activeDoc.applyUserActions(docSession,
[["AddTable", hiddenTableName, columnMetadata]]);
const retValue: AddTableRetValue = result.retValues[0];
@ -313,7 +313,9 @@ export class ActiveDocImport {
const ruleCanBeApplied = (transformRule != null) &&
_.difference(transformRule.sourceCols, hiddenTableColIds).length === 0;
await this._activeDoc.applyUserActions(docSession,
[["ReplaceTableData", hiddenTableId, rowIdColumn, columnValues]], {parseStrings: true});
// BulkAddRecord rather than ReplaceTableData so that type guessing is applied to Any columns.
// Don't use parseStrings, only use the strict parsing in ValueGuesser to make the import lossless.
[["BulkAddRecord", hiddenTableId, rowIdColumn, columnValues]]);
// data parsed and put into hiddenTableId
// For preview_table (isHidden) do GenImporterView to make views and formulas and cols
@ -433,14 +435,15 @@ export class ActiveDocImport {
// If destination is a new table, we need to create it.
if (intoNewTable) {
const colSpecs = destCols.map(({type, colId: id, label}) => ({type, id, label}));
const colSpecs = destCols.map(({type, colId: id, label, widgetOptions}) => ({type, id, label, widgetOptions}));
const newTable = await this._activeDoc.applyUserActions(docSession, [['AddTable', destTableId, colSpecs]]);
destTableId = newTable.retValues[0].table_id;
}
await this._activeDoc.applyUserActions(docSession,
[['BulkAddRecord', destTableId, gutil.arrayRepeat(hiddenTableData.id.length, null), columnData]],
{parseStrings: true});
// Don't use parseStrings for new tables to make the import lossless.
{parseStrings: !intoNewTable});
return destTableId;
}
@ -586,6 +589,7 @@ export class ActiveDocImport {
colId: destTableId ? id as string : null,
label: fields.label as string,
type: fields.type as string,
widgetOptions: fields.widgetOptions as string,
formula: srcColIds.includes(id as string) ? `$${id}` : ''
});
}
@ -730,10 +734,21 @@ function getMergeFunction({type}: MergeStrategy): MergeFunction {
}
/**
* Tweak the column metadata used in the AddTable action.
* If `columns` is populated with non-blank column ids, adds labels to all
* columns using the values set for the column ids. Otherwise, returns
* a copy of columns with no modifications made.
* columns using the values set for the column ids.
* Ensure that columns of type Any start out as formula columns, i.e. empty columns,
* so that type guessing is triggered when new data is added.
*/
function addLabelsIfPossible(columns: GristColumn[]) {
return columns.map(c => (c.id ? {...c, label: c.id} : c));
function cleanColumnMetadata(columns: GristColumn[]) {
  // Each column is copied, so the metadata passed in is left untouched.
  return columns.map((column): any => ({
    ...column,
    // A non-blank column id doubles as the label.
    ...(column.id ? {label: column.id} : {}),
    // Columns of type Any start out as formula (i.e. empty) columns.
    ...(column.type === "Any" ? {isFormula: true} : {}),
  }));
}

View File

@ -1,12 +1,11 @@
from collections import defaultdict, namedtuple
from collections import namedtuple
import six
from six.moves import zip, xrange
from six.moves import zip
import column
import identifiers
import logger
log = logger.Logger(__name__, logger.INFO)
# Prefix for transform columns created during imports.
@ -103,6 +102,7 @@ class ImportActions(object):
"label": c.label,
"colId": c.colId if dest_table_id else None, #should be None if into new table
"type": c.type,
"widgetOptions": getattr(c, "widgetOptions", ""),
"formula": ("$" + c.colId) if (c.colId in src_cols) else ''
})
@ -162,6 +162,7 @@ class ImportActions(object):
new_col_spec = {
"label": c.label,
"type": c.type,
"widgetOptions": getattr(c, "widgetOptions", ""),
"isFormula": True,
"formula": c.formula}
result = self._useractions.doAddColumn(hidden_table_id, new_col_id, new_col_spec)

View File

@ -1,490 +0,0 @@
"""This module guesses possible formats of dates which can be parsed using datetime.strptime
based on samples.
dateguesser.guess(sample)
dateguesser.guess takes a sample date string and returns a set of
datetime.strftime/strptime-compliant date format strings that will correctly parse.
dateguesser.guess_bulk(list_of_samples, error_rate=0)
dateguesser.guess_bulk takes a list of sample date strings and acceptable error rate
and returns a list of datetime.strftime/strptime-compliant date format strings
sorted by error rate that will correctly parse.
Algorithm:
1. Tokenize input string into chunks based on character type: digits, alphas, the rest.
2. Analyze each token independently in terms what format codes could represent
3. For given list of tokens generate all permutations of format codes
4. During generating permutations check for validness of generated format and skip if invalid.
5. Use rules listed below to decide if format is invalid:
Invalid format checks:
Rule #1: Year MUST be in the date. Year is the minimum possible parsable date.
Rule #2. No holes (missing parts) in the format parts.
Rule #3. Time parts are neighbors to each other. No interleaving time with the date.
Rule #4. It's highly unlikely that minutes come before hours, millis before seconds, etc.
Rule #5. Pattern can't have some part of date/time defined more than once.
Rule #6: Separators between elements of the time group should be the same.
Rule #7: If am/pm is in date we assume that 12-hour dates are allowed only. Otherwise it's 24-hour
Rule #8: Year can't be between other date elements
Note:
dateguess doesn't support defaulting to the current year because parsing should be deterministic;
it's better to fail to guess the format than to guess it incorrectly.
Examples:
>>> guess('2014/05/05 14:00:00 UTC')
set(['%Y/%d/%m %H:%M:%S %Z', '%Y/%m/%d %H:%M:%S %Z'])
>>> guess('12/12/12')
set(['%y/%m/%d', '%d/%m/%y', '%m/%d/%y', '%y/%d/%m'])
>>> guess_bulk(['12-11-2014', '12-25-2014'])
['%m-%d-%Y']
>>> guess_bulk(['12-11-2014', '25-25-2014'])
[]
>>> guess_bulk(['12-11-2013', '13-8-2013', '05-25-2013', '12-25-2013'], error_rate=0.5)
['%m-%d-%Y']
"""
import calendar
import itertools
import logging
import re
from collections import defaultdict
from backports.functools_lru_cache import lru_cache
import moment
# Lookup tables used by the DATE_ELEMENTS predicates to recognise alphabetic tokens.
# Month and weekday names come from the `calendar` module; predicates compare the
# capitalized token against them.
MONTH_NAME = calendar.month_name
MONTH_ABBR = calendar.month_abbr
# Keys of moment.get_tz_data(); the "Timezone name" predicate tests tokens for membership.
TZ_VALID_NAMES = {z[0] for z in moment.get_tz_data().items()}
# Compared against the lower-cased token.
AM_PM = {'am', 'pm'}
DAYS_OF_WEEK_NAME = calendar.day_name
DAYS_OF_WEEK_ABBR = calendar.day_abbr
ASCII_DIGITS_RE = re.compile(r'^[0-9]+$')
# Using x.isdigit() matches strings like u'\xb2' (superscripts) which we don't want.
# Use isdigit(x) instead, to only match ASCII digits 0-9.
isdigit = ASCII_DIGITS_RE.match
# Every entry describes one strptime directive that a token may stand for:
#   (name, pattern, predicate, group, consumed)
# - predicate(x, p, v) receives the token text, the previous token and the next token
#   (either neighbour may be None) and says whether the token can be this element.
# - group is appended to the "types used" string that the rule checks inspect.
# - consumed is the number of characters stripped from the end of the pattern built so
#   far before this element's pattern is appended (used for the timezone sign).
DATE_ELEMENTS = [
  # Name                Pattern  Predicate  Group (mutual exclusive)  Consumes N prev elements
  ("Year", "%Y", lambda x, p, v: isdigit(x) and len(x) == 4, "Y", 0),
  ("Year short", "%y", lambda x, p, v: isdigit(x) and len(x) == 2, "Y", 0),
  ("Month", "%m", lambda x, p, v: isdigit(x) and len(x) <= 2 and 0 < int(x) <= 12, "m", 0),
  ("Month name full", "%B", lambda x, p, v: x.isalpha() and x.capitalize() in MONTH_NAME, "m", 0),
  ("Month name abbr", "%b", lambda x, p, v: x.isalpha() and x.capitalize() in MONTH_ABBR, "m", 0),
  ("Day", "%d", lambda x, p, v: isdigit(x) and len(x) <= 2 and 0 < int(x) <= 31, "d", 0),
  ("Day of week", "%A", lambda x, p, v: x.isalpha()
                                        and x.capitalize() in DAYS_OF_WEEK_NAME, "a", 0),
  ("Day of week abbr", "%a", lambda x, p, v: x.isalpha()
                                             and x.capitalize() in DAYS_OF_WEEK_ABBR, "a", 0),
  ("Compound HHMMSS", "%H%M%S", lambda x, p, v: isdigit(x) and len(x) == 6
                                                and 0 <= int(x[0:2]) < 24
                                                and 0 <= int(x[2:4]) < 60
                                                and 0 <= int(x[4:6]) < 60, "HMS", 0),
  ("Hour", "%H", lambda x, p, v: isdigit(x) and len(x) <= 2 and 0 <= int(x) <= 23, "H", 0),
  # strptime's %I is a 12-hour clock value in 1..12: "12" is valid and "0" is not.
  ("Hour in 12hr mode", "%I", lambda x, p, v: isdigit(x) and len(x) <= 2
                                              and 1 <= int(x) <= 12, "H", 0),
  ("AM/PM", "%p", lambda x, p, v: x.isalpha() and len(x) == 2 and x.lower() in AM_PM, "p", 0),
  ("Minutes", "%M", lambda x, p, v: isdigit(x) and len(x) <= 2 and 0 <= int(x) <= 59, "M", 0),
  ("Seconds", "%S", lambda x, p, v: isdigit(x) and len(x) <= 2 and 0 <= int(x) <= 59, "S", 0),
  ("Fraction of second", "%f", lambda x, p, v: isdigit(x) and p is not None
                                               and p.val == '.', "f", 0),
  ("Timezone name", "%Z", lambda x, p, v: x.isalpha() and len(x) > 2
                                          and x in TZ_VALID_NAMES, "Z", 0),
  ("Timezone +HHMM", "%z", lambda x, p, v: isdigit(x) and len(x) == 4 and 0 <= int(x[0:2]) < 15
                                           and 0 <= int(x[2:4]) < 60 and p is not None
                                           and p.val == '+', "Z", 1),
  ("Timezone -HHMM", "%z", lambda x, p, v: isdigit(x) and len(x) == 4 and 0 <= int(x[0:2]) < 15
                                           and 0 <= int(x[2:4]) < 60 and p is not None
                                           and p.val == '-', "Z", 1),
]
class Token(object):
  """Represents a part of a date string that's being parsed.

  Note that __hash__, __eq__ and __ne__ are overridden in order
  to compare only meaningful parts of an object: two typed tokens are
  interchangeable when they have the same length and the same compatible types,
  whatever their text; separators (no compatible types) must also match by text.

  Attributes are `val` (the token text), `length` (its length as given by the
  caller) and `compatible_types` (a tuple of candidate elements, empty until
  the token has been analyzed).
  """

  def __init__(self, val, length):
    self.val = val
    self.length = length
    self.compatible_types = ()

  def __hash__(self):
    h = hash(self.length) + hash(self.compatible_types)
    # Only separators take their text into account, mirroring __eq__.
    if not self.compatible_types:
      h += hash(self.val)
    return hash(h)

  def __eq__(self, other):
    """
    Two tokens are equal when these both are true:
    a) length and compatible types are equal
    b) if it is separator (no compatible types), separator values must be equal
    """
    # Let Python fall back to its default comparison for foreign objects
    # instead of failing with an AttributeError.
    if not isinstance(other, Token):
      return NotImplemented
    if self.length != other.length or self.compatible_types != other.compatible_types:
      return False
    if not other.compatible_types and self.val != other.val:
      return False
    return True

  def __ne__(self, other):
    # Python 2 does not derive != from __eq__, so define it explicitly.
    result = self.__eq__(other)
    return result if result is NotImplemented else not result
def _check_rule_1(pattern, types_used):
"""Rule #1: Year MUST be in the date. Year is the minimum possible parsable date.
Examples:
>>> _check_rule_1('%Y/%m/%d', 'Ymd')
True
>>> _check_rule_1('%m/%d', 'md')
False
"""
if 'Y' not in types_used:
logging.debug("Rule #1 is violated for pattern %s. Types used: %s", pattern, types_used)
return False
return True
def _check_rule_2(pattern, types_used):
"""Rule #2: No holes (missing parts) in the format parts.
Examples:
>>> _check_rule_2('%Y:%H', 'YH')
False
>>> _check_rule_2('%Y/%m/%d %H', 'YmdH')
True
"""
priorities = 'YmdHMSf'
seen_parts = [p in types_used for p in priorities]
if sorted(seen_parts, reverse=True) != seen_parts:
logging.debug("Rule #2 is violated for pattern %s. Types used: %s", pattern, types_used)
return False
return True
def _check_rule_3(pattern, types_used):
"""Rule #3: Time parts are neighbors to time only. No interleaving time with the date.
Examples:
>>> _check_rule_3('%m/%d %H:%M %Y', 'mdHMY')
True
>>> _check_rule_3('%m/%d %H:%Y:%M', 'mdHYM')
False
"""
time_parts = 'HMSf'
time_parts_highlighted = [t in time_parts for t in types_used]
time_parts_deduplicated = [a[0] for a in itertools.groupby(time_parts_highlighted)]
if len(list(filter(lambda x: x, time_parts_deduplicated))) > 1:
logging.debug("Rule #3 is violated for pattern %s. Types used: %s", pattern, types_used)
return False
return True
def _check_rule_4(pattern, types_used):
"""Rule #4: It's highly impossible that minutes coming before hours,
millis coming before seconds etc.
Examples:
>>> _check_rule_4('%H:%M', 'HM')
True
>>> _check_rule_4('%S:%M', 'SM')
False
"""
time_parts_priority = 'HMSf'
time_parts_indexes = list(filter(lambda x: x >= 0,
[time_parts_priority.find(t) for t in types_used]))
if sorted(time_parts_indexes) != time_parts_indexes:
logging.debug("Rule #4 is violated for pattern %s. Types used: %s", pattern, types_used)
return False
return True
def _check_rule_5(pattern, types_used):
"""Rule #5: Pattern can't have some part of date/time defined more than once.
Examples:
>>> _check_rule_5('%Y/%Y', 'YY')
False
>>> _check_rule_5('%m/%b', 'mm')
False
>>> _check_rule_5('%Y/%m', 'Ym')
True
"""
if len(types_used) != len(set(types_used)):
logging.debug("Rule #5 is violated for pattern %s. Types used: %s", pattern, types_used)
return False
return True
def _check_rule_6(tokens_chosen, pattern, types_used):
  """Rule #6: Separators between elements of the time group should be the same.

  Args:
    tokens_chosen: Sequence of (token, chosen_element) pairs; chosen_element is None
      for separators, otherwise its item [3] is the element's group string.
    pattern (str): Format being checked, used only for the debug message.
    types_used (str): Group letters of the elements that make up the pattern.

  Returns:
    bool: False when more than one distinct separator was seen inside the time group.

  Examples:
  _check_rule_6(tokens_chosen_1, '%Y-%m-%dT%H:%M:%S', 'YmdHMS') => True
  _check_rule_6(tokens_chosen_2, '%Y-%m-%dT%H %M %S', 'YmdHMS') => True
  _check_rule_6(tokens_chosen_3, '%Y-%m-%dT%H-%M:%S', 'YmdHMS') => False (different separators
  ('-' and ':') in time group)
  """
  time_parts = 'HMS'
  num_of_time_parts_used = len(list(filter(lambda x: x in time_parts, types_used)))
  time_parts_seen = 0
  separators_seen = []
  previous_was_a_separator = False
  for token in tokens_chosen:
    # Substring test: matches the single-letter groups as well as the compound 'HMS' group.
    if token[1] is not None and token[1][3] in time_parts:
      # This rule doesn't work for separator-less time group so when we found the type
      # and it's three letters then it's (see type "Compound HHMMSS") then stop iterating
      if len(token[1][3]) == 3:
        break
      # If not a first time then
      # (two adjacent time parts with nothing in between count as a None separator)
      if time_parts_seen > 0 and not previous_was_a_separator:
        separators_seen.append(None)
      time_parts_seen += 1
      # Everything after the last time part is outside the time group.
      if time_parts_seen == num_of_time_parts_used:
        break
      previous_was_a_separator = False
    else:
      # Only tokens that follow the first time part belong to the time group.
      if time_parts_seen > 0:
        separators_seen.append(token[0].val)
      previous_was_a_separator = True
  if len(set(separators_seen)) > 1:
    logging.debug("Rule #6 is violated for pattern %s. Seen separators: %s",
                  pattern, separators_seen)
    return False
  return True
def _check_rule_7a(pattern):
"""Rule #7a: If am/pm is in date we assume that 12-hour dates are allowed only.
Otherwise it's 24-hour.
Examples:
>>> _check_rule_7a('%Y/%m/%d %H:%M %p')
False
>>> _check_rule_7a('%Y/%m/%d %I:%M %p')
True
"""
if '%p' in pattern and '%H' in pattern:
logging.debug("Rule #7a is violated for pattern %s", pattern)
return False
return True
def _check_rule_7b(pattern):
"""Rule #7b: If am/pm is in date we assume that 12-hour dates are allowed only.
Otherwise it's 24-hour.
Examples:
>>> _check_rule_7b('%Y/%m/%d %I:%M')
False
>>> _check_rule_7b('%Y/%m/%d %I:%M %p')
True
"""
if '%I' in pattern and '%p' not in pattern:
logging.debug("Rule #7b is violated for pattern %s", pattern)
return False
return True
def _check_rule_8(pattern, types_used):
"""Rule #9: Year can't be between other date elements
Examples:
>>> _check_rule_8('%m/%Y/%d %I:%M', 'mYdIM')
False
"""
if 'mYd' in types_used or 'dYm' in types_used:
logging.debug("Rule #8 is violated for pattern %s", pattern)
return False
return True
def _tokenize_by_character_class(s):
  """Split s into Token objects, one per run of digits, run of letters, or single separator.

  Example:
  >>> t = _tokenize_by_character_class('Thu, May 14th, 2014 1:15 pm +0000')
  >>> [i.val for i in t]
  ['Thu', ',', ' ', 'May', ' ', '14', 'th', ',', ' ', '2014', ' ', '1', ':', '15', ' ', 'pm', ' ', '+', '0000']
  >>> t = _tokenize_by_character_class('5/14/2014')
  >>> [i.val for i in t]
  ['5', '/', '14', '/', '2014']
  """
  tokens = []
  for piece in re.split(r'(\d+)|(\W)|(_)', s):
    # re.split yields None for groups that did not match and '' between adjacent matches.
    if piece:
      tokens.append(Token(piece, len(piece)))
  return tokens
def _sliding_triplets(tokens):
for idx, t in enumerate(tokens):
yield (t, tokens[idx-1] if idx > 0 else None, tokens[idx+1] if idx < len(tokens)-1 else None)
def _analyze_tokens(tokens):
  """Fill in `compatible_types` on every token, in place.

  A date element is compatible when its predicate accepts the token's text together
  with the neighbouring tokens.
  """
  for token, prev, nxt in _sliding_triplets(tokens):
    matching = []
    for element in DATE_ELEMENTS:
      predicate = element[2]
      if predicate(token.val, prev, nxt):
        matching.append(element)
    token.compatible_types = tuple(matching)
@lru_cache()
def _generate_all_permutations(tokens):
  """Return the set of all valid format strings for the given tuple of tokens.

  Brute-forcing every permutation and checking the rules takes up most of the
  date parsing time. Input is expected to be highly uniform, so memoizing this
  step should be very effective.

  The cache key relies on Token's overridden __eq__ and __hash__: two tokens with the
  same length and the same possible formats are treated as equal, whatever their
  values, while separators must match exactly.
  """
  found = set()
  _generate_all_permutations_recursive(
    tokens, token_idx=0, tokens_chosen=[], pattern="", found_patterns=found, types_used="")
  return found
def _check_is_pattern_valid_quick_fail_rules(pattern, types_used):
  """Run the rules that can already reject a partially constructed pattern.

  Example: duplicates of a date part in a pattern.
  Checks stop at the first rule that fails.
  """
  if not _check_rule_5(pattern, types_used):
    return False
  if not _check_rule_4(pattern, types_used):
    return False
  return _check_rule_7a(pattern)
def _check_is_pattern_valid_full_pattern_rules(tokens_chosen, pattern, types_used):
  """Run the rules that only make sense for a complete pattern.

  Example: existence of Year part in the pattern.
  Checks run in order and stop at the first rule that fails.
  """
  checks = (
    lambda: _check_rule_1(pattern, types_used),
    lambda: _check_rule_2(pattern, types_used),
    lambda: _check_rule_3(pattern, types_used),
    lambda: _check_rule_6(tokens_chosen, pattern, types_used),
    lambda: _check_rule_7b(pattern),
    lambda: _check_rule_8(pattern, types_used),
  )
  # The generator keeps evaluation lazy, so later rules are skipped after a failure.
  return all(check() for check in checks)
def _generate_all_permutations_recursive(tokens, token_idx, tokens_chosen, pattern, found_patterns,
                                         types_used):
  """Extend the partial pattern with every candidate for the token at token_idx, recursively.

  Complete patterns that pass all rules are added to found_patterns; nothing is returned.

  Args:
    tokens (list[Token]): List of tokens.
    token_idx (int): Index of the token handled by this call.
    tokens_chosen (list[(Token, Token.compatible_type)]): Pairs of token and the element
      chosen for it so far (None for separators).
    pattern (str): Format string built so far.
    found_patterns (set): Output set collecting the valid patterns.
    types_used (str): Group letters of the elements used to build pattern.
  """
  # Prune early: a partial pattern that already breaks a rule can't become valid.
  if not _check_is_pattern_valid_quick_fail_rules(pattern, types_used):
    return

  if token_idx >= len(tokens):
    # Every token has been assigned; keep the pattern if it passes the full checks.
    if _check_is_pattern_valid_full_pattern_rules(tokens_chosen, pattern, types_used):
      found_patterns.add(pattern)
    return

  token = tokens[token_idx]
  next_idx = token_idx + 1

  if not token.compatible_types:
    # if no compatible types it should be separator, add it to the pattern
    _generate_all_permutations_recursive(tokens, next_idx,
                                         tokens_chosen + [(token, None)], pattern + token.val,
                                         found_patterns, types_used)
    return

  for element in token.compatible_types:
    fmt, group, consumed = element[1], element[3], element[4]
    # Some elements swallow characters already appended to the pattern.
    base = pattern[:-consumed] if consumed else pattern
    _generate_all_permutations_recursive(tokens, next_idx,
                                         tokens_chosen + [(token, element)], base + fmt,
                                         found_patterns, types_used + group)
def guess(date):
  """Guess the datetime.strftime/strptime-compliant formats that fit a date string.

  Args:
    date (str): Date string.

  Returns:
    set: Set of datetime.strftime/strptime-compliant date format strings

  Examples:
  >>> guess('2014/05/05 14:00:00 UTC')
  set(['%Y/%d/%m %H:%M:%S %Z', '%Y/%m/%d %H:%M:%S %Z'])
  >>> guess('12/12/12')
  set(['%y/%m/%d', '%d/%m/%y', '%m/%d/%y', '%y/%d/%m'])
  """
  # Don't attempt to parse strings that are so long as to be certainly non-dates. Somewhat long
  # strings could be dates (like "Wednesday, September 16, 2020 A.D. 08:47:02.2667911 AM -06:00",
  # and who knows what other languages do). A limit is important also because the current approach
  # can run into "maximum recursion depth exceeded" on a very long string.
  max_length = 150
  if len(date) > max_length:
    return set()

  analyzed = _tokenize_by_character_class(date)
  _analyze_tokens(analyzed)
  # The memoized generator needs a hashable argument, hence the tuple.
  return _generate_all_permutations(tuple(analyzed))
def guess_bulk(dates, error_rate=0):
  """Guesses datetime.strftime/strptime-compliant date formats for list of the samples.

  With error_rate 0 the result is the set of formats that fit every sample, in no
  particular order. Otherwise formats are counted per sample and those matching more
  than `len(dates) * (1 - error_rate)` samples are returned, most frequent first.

  Args:
    dates (list): List of samples date strings.
    error_rate (float): Acceptable error rate (default 0.0)

  Returns:
    list: List of datetime.strftime/strptime-compliant date format strings sorted by error rate.
      Empty when `dates` is empty or no format qualifies.

  Examples:
  >>> guess_bulk(['12-11-2014', '12-25-2014'])
  ['%m-%d-%Y']
  >>> guess_bulk(['12-11-2014', '25-25-2014'])
  []
  >>> guess_bulk(['12-11-2013', '13-8-2013', '05-25-2013', '12-25-2013'], error_rate=0.5)
  ['%m-%d-%Y']
  >>> guess_bulk([])
  []
  """
  if error_rate == 0.0:
    patterns = None
    for date in dates:
      guesses_patterns = guess(date)
      if patterns is None:
        patterns = guesses_patterns
      else:
        patterns = patterns.intersection(guesses_patterns)
      if not patterns:
        break  # No need to iterate more if zero patterns found
    # `patterns` is still None when `dates` is empty; report "no formats" instead of crashing.
    return list(patterns) if patterns else []

  found_dates = 0
  pattern_counters = defaultdict(int)
  num_dates = len(dates)
  min_num_dates_to_be_found = num_dates - num_dates * error_rate

  for idx, date in enumerate(dates):
    patterns = guess(date)
    if patterns:
      found_dates += 1
    for pattern in patterns:
      pattern_counters[pattern] += 1

    # Early return if number of strings that can't be date is already over error rate
    cells_left = num_dates - idx - 1
    cannot_be_found = float(found_dates + cells_left) < min_num_dates_to_be_found
    if cannot_be_found:
      return []

  # Sort by (count, pattern) descending so the most frequently matching formats come first.
  patterns = [(v, k) for k, v in pattern_counters.items()
              if v > min_num_dates_to_be_found]
  patterns.sort(reverse=True)
  return [k for (v, k) in patterns]

View File

@ -1,102 +0,0 @@
import unittest
from imports.dateguess import guess, guess_bulk
class TestGuesser(unittest.TestCase):
  """
  Tests for the `guess` and `guess_bulk` date-format guessers.

  Expected and actual formats are always compared as sets, so the order in which
  formats are listed (or returned) does not matter.
  """

  def assertDate(self, input_str, fmt_list):
    """Asserts that `guess(input_str)` yields exactly the formats in `fmt_list`."""
    actual = set(guess(input_str))
    self.assertEqual(actual, set(fmt_list))

  def assertDates(self, input_lst, error_rate, fmt_list):
    """
    Asserts that `guess_bulk` called on `input_lst` with the given `error_rate`
    yields exactly the formats in `fmt_list`.
    """
    actual = set(guess_bulk(input_lst, error_rate=error_rate))
    self.assertEqual(actual, set(fmt_list))

  def test_guess_dates(self):
    """Checks the formats guessed for single date strings."""
    # Each entry is (input string, expected formats); entries are checked in order.
    cases = [
      ('', []),
      ("2013-13-13", []),
      ("25/25/1911", []),

      ("2014-01-11", ['%Y-%m-%d', '%Y-%d-%m']),
      ("2014-11-01", ['%Y-%m-%d', '%Y-%d-%m']),
      ("1990-05-05", ['%Y-%m-%d', '%Y-%d-%m']),
      ("2013-12-13", ['%Y-%m-%d']),

      ("12/31/1999", ['%m/%d/%Y']),
      ("11/11/1911", ['%m/%d/%Y', '%d/%m/%Y']),
      ("5/9/1981", ['%m/%d/%Y', '%d/%m/%Y']),
      ("6/3/1985", ['%m/%d/%Y', '%d/%m/%Y']),

      ("12/31/99", ['%m/%d/%y']),
      ("11/11/11", ['%y/%m/%d', '%y/%d/%m', '%m/%d/%y', '%d/%m/%y']),
      ("5/9/81", ['%m/%d/%y', '%d/%m/%y']),
      ("6/3/85", ['%m/%d/%y', '%d/%m/%y']),

      ("31.12.91", ['%d.%m.%y']),
      ("4.4.87", ['%m.%d.%y', '%d.%m.%y']),
      ("13.2.8", ['%y.%m.%d', '%y.%d.%m']),

      ("31.12.1991", ['%d.%m.%Y']),
      ("4.4.1987", ['%m.%d.%Y', '%d.%m.%Y']),
      ("13.2.2008", ['%d.%m.%Y']),

      ("31.12.91", ['%d.%m.%y']),
      ("4.4.87", ['%m.%d.%y', '%d.%m.%y']),
      ("13.2.8", ['%y.%m.%d', '%y.%d.%m']),

      ("9 May 1981", ['%d %b %Y', '%d %B %Y']),
      ("31 Dec 1999", ['%d %b %Y']),
      ("1 Jan 2012", ['%d %b %Y']),
      ("3 August 2009", ['%d %B %Y']),
      ("2 May 1980", ['%d %B %Y', '%d %b %Y']),

      ("13/1/2012", ['%d/%m/%Y']),

      ("Aug 1st 2014", ['%b %dst %Y']),
      ("12/22/2015 00:00:00.10", ['%m/%d/%Y %H:%M:%S.%f']),
    ]
    for text, expected_formats in cases:
      self.assertDate(text, expected_formats)

  def test_guess_datetimes(self):
    """Checks the formats guessed for single date-time strings."""
    # Each entry is (input string, expected formats); entries are checked in order.
    cases = [
      ("Thu Sep 25 10:36:28 2003", ['%a %b %d %H:%M:%S %Y']),
      ("Thu Sep 25 2003 10:36:28", ['%a %b %d %Y %H:%M:%S']),
      ("10:36:28 Thu Sep 25 2003", ['%H:%M:%S %a %b %d %Y']),

      ("2014-01-11T12:21:05", ['%Y-%m-%dT%H:%M:%S', '%Y-%d-%mT%H:%M:%S']),
      ("2015-02-16T16:05:31", ['%Y-%m-%dT%H:%M:%S']),
      # TODO remove all except first one
      ("2015-02-16T16:05", ['%Y-%m-%dT%H:%M', '%Y-%H-%MT%d:%m',
                            '%Y-%m-%HT%M:%d', '%Y-%d-%HT%M:%m']),
      # TODO remove second one
      ("2015-02-16T16", ['%Y-%m-%dT%H', '%Y-%m-%HT%d']),

      ("Mon Jan 13 9:52:52 am MST 2014", ['%a %b %d %I:%M:%S %p %Z %Y']),
      ("Tue Jan 21 3:30:00 PM EST 2014", ['%a %b %d %I:%M:%S %p %Z %Y']),
      ("Mon Jan 13 09:52:52 MST 2014", ['%a %b %d %H:%M:%S %Z %Y']),
      ("Tue Jan 21 15:30:00 EST 2014", ['%a %b %d %H:%M:%S %Z %Y']),
      ("Mon Jan 13 9:52 am MST 2014", ['%a %b %d %I:%M %p %Z %Y']),
      ("Tue Jan 21 3:30 PM EST 2014", ['%a %b %d %I:%M %p %Z %Y']),

      ("2014-01-11T12:21:05", ['%Y-%m-%dT%H:%M:%S', '%Y-%d-%mT%H:%M:%S']),
      ("2015-02-16T16:05:31", ['%Y-%m-%dT%H:%M:%S']),
      ("Thu Sep 25 10:36:28 2003", ['%a %b %d %H:%M:%S %Y']),
      ("10:36:28 Thu Sep 25 2003", ['%H:%M:%S %a %b %d %Y']),

      ("2014-01-11T12:21:05+0000", ['%Y-%d-%mT%H:%M:%S%z', '%Y-%m-%dT%H:%M:%S%z']),
      ("2015-02-16T16:05:31-0400", ['%Y-%m-%dT%H:%M:%S%z']),

      ("Thu, 25 Sep 2003 10:49:41 -0300", ['%a, %d %b %Y %H:%M:%S %z']),
      ("Thu, 25 Sep 2003 10:49:41 +0300", ['%a, %d %b %Y %H:%M:%S %z']),

      ("2003-09-25T10:49:41", ['%Y-%m-%dT%H:%M:%S']),
      ("2003-09-25T10:49", ['%Y-%m-%dT%H:%M']),
    ]
    for text, expected_formats in cases:
      self.assertDate(text, expected_formats)

  def test_guess_bulk_dates(self):
    """Checks the formats guessed for lists of strings at various error rates."""
    # Each entry is (input strings, error_rate, expected formats); checked in order.
    cases = [
      (["11/11/1911", "25/11/1911", "11/11/1911", "11/11/1911"], 0.0, ['%d/%m/%Y']),
      (["25/11/1911", "25/25/1911", "11/11/1911", "11/11/1911"], 0.0, []),
      (["25/11/1911", "25/25/1911", "11/11/1911", "11/11/1911"], 0.5, ['%d/%m/%Y']),
      (["25/11/1911", "25/25/1911", "11/11/1911", "11/11/1911"], 0.1, []),
      (["23/11/1911", '2004 May 12', "11/11/1911", "11/11/1911"], 0.5, ['%d/%m/%Y']),

      (['2004 May 12', "11/11/1911", "11/11/1911", "23/11/1911"], 0.5, ['%d/%m/%Y']),
      (['2004 May 12', "11/11/1911", "11/11/1911", "23/11/1911"], 0.0, []),
      (['12/22/2015', "12/22/2015 1:15pm", "2018-02-27 16:08:39 +0000"], 0.1, []),
    ]
    for texts, error_rate, expected_formats in cases:
      self.assertDates(texts, error_rate, expected_formats)
# Run the tests in this module when the file is executed directly as a script.
if __name__ == "__main__":
  unittest.main()

View File

@ -4,8 +4,6 @@ import textwrap
import unittest
from six import BytesIO, text_type
import csv
import calendar
import datetime
from imports import import_csv
@ -22,9 +20,15 @@ def bytes_io_from_str(string):
class TestImportCSV(unittest.TestCase):
def _check_col(self, sheet, index, name, typename, values):
def _check_col(self, sheet, index, name, _typename, values):
self.assertEqual(sheet["column_metadata"][index]["id"], name)
self.assertEqual(sheet["column_metadata"][index]["type"], typename)
# Previously, strings were parsed and types were guessed in CSV imports.
# Now all data is kept as strings and the column type is left as Any
# so that type guessing and parsing can happen elsewhere.
# To avoid updating 85 calls to _check_col, the typename argument was kept but can be ignored,
# and all values are converted back to strings for comparison.
self.assertEqual(sheet["column_metadata"][index]["type"], "Any")
values = [text_type(v) for v in values]
self.assertEqual(sheet["table_data"][index], values)
def _check_num_cols(self, sheet, exp_cols):
@ -40,18 +44,16 @@ class TestImportCSV(unittest.TestCase):
self._check_col(sheet, 1, "int2", "Int", [5, '', ''])
self._check_col(sheet, 2, "textint", "Text", ["12345678902345689", '', ''])
self._check_col(sheet, 3, "bigint", "Text", ["320150170634561830", '', ''])
self._check_col(sheet, 4, "num2", "Numeric", [123456789.123456, '', ''])
self._check_col(sheet, 5, "bignum", "Numeric", [7.22597e+86, '', ''])
self._check_col(sheet, 4, "num2", "Numeric", ['123456789.1234560000', '', ''])
self._check_col(sheet, 5, "bignum", "Numeric", ['7.22597E+86', '', ''])
self._check_col(sheet, 6, "date1", "DateTime",
[calendar.timegm(datetime.datetime(2015, 12, 22, 11, 59, 00).timetuple()), None, None])
[u'12/22/15 11:59 AM', u'', u''])
self._check_col(sheet, 7, "date2", "Date",
[calendar.timegm(datetime.datetime(2015, 12, 20, 0, 0, 0).timetuple()), None, None])
[u'December 20, 2015', u'', u''])
self._check_col(sheet, 8, "datetext", "Date",
[calendar.timegm(datetime.date(2015, 12, 22).timetuple()), None, None])
[u'12/22/2015', u'', u''])
self._check_col(sheet, 9, "datetimetext", "DateTime",
[calendar.timegm(datetime.datetime(2015, 12, 22, 0, 0, 0).timetuple()),
calendar.timegm(datetime.datetime(2015, 12, 22, 13, 15, 0).timetuple()),
calendar.timegm(datetime.datetime(2018, 2, 27, 16, 8, 39).timetuple())])
[u'12/22/2015 00:00:00', u'12/22/2015 13:15:00', u'02/27/2018 16:08:39'])
def test_user_parse_options(self):
@ -68,7 +70,11 @@ class TestImportCSV(unittest.TestCase):
self._check_col(parsed_file, 2, "PHONE", "Text", ['201-343-3434', '201.343.3434',
'2013433434', '(201)343-3434'])
self._check_col(parsed_file, 3, "VALUE", "Int", [45, 4545, 0, 4])
self._check_col(parsed_file, 4, "DATE", "DateTime", [1519747719.0, 1519744119.0, 1519751319.0, None])
self._check_col(parsed_file, 4, "DATE", "DateTime",
[u'2018-02-27 16:08:39 +0000',
u'2018-02-27 16:08:39 +0100',
u'2018-02-27 16:08:39 -0100',
u''])
def test_wrong_cols1(self):
file_obj = bytes_io_from_str(textwrap.dedent(

View File

@ -16,31 +16,33 @@ class TestImportXLS(unittest.TestCase):
def _check_col(self, sheet, index, name, typename, values):
  """
  Asserts that column number `index` of the parsed `sheet` has id `name`, type `typename`,
  and data equal to `values`.

  When `typename` is "Any", each expected value is passed through `str()` before the
  comparison, so callers may list such values as numbers.
  """
  column_info = sheet["column_metadata"][index]
  self.assertEqual(column_info["id"], name)
  self.assertEqual(column_info["type"], typename)
  # Stringifying expected values for "Any" columns reduces changes to tests after
  # imports were overhauled.
  expected = [str(item) for item in values] if typename == "Any" else values
  self.assertEqual(sheet["table_data"][index], expected)
def test_excel(self):
parsed_file = import_xls.parse_file(*_get_fixture('test_excel.xlsx'))
# check that column type was correctly set to int and values are properly parsed
self.assertEqual(parsed_file[1][0]["column_metadata"][0], {"type": "Int", "id": "numbers"})
# check that column type was correctly set to numeric and values are properly parsed
self.assertEqual(parsed_file[1][0]["column_metadata"][0], {"type": "Numeric", "id": "numbers"})
self.assertEqual(parsed_file[1][0]["table_data"][0], [1, 2, 3, 4, 5, 6, 7, 8])
# check that column type was correctly set to text and values are properly parsed
self.assertEqual(parsed_file[1][0]["column_metadata"][1], {"type": "Text", "id": "letters"})
self.assertEqual(parsed_file[1][0]["column_metadata"][1], {"type": "Any", "id": "letters"})
self.assertEqual(parsed_file[1][0]["table_data"][1],
["a", "b", "c", "d", "e", "f", "g", "h"])
# messy tables does not support bool types yet, it classifies them as ints
self.assertEqual(parsed_file[1][0]["column_metadata"][2], {"type": "Bool", "id": "boolean"})
self.assertEqual(parsed_file[1][False]["table_data"][2],
[True, False, True, False, True, False, True, False])
# 0s and 1s become Numeric, not boolean like in the past
self.assertEqual(parsed_file[1][0]["column_metadata"][2], {"type": "Numeric", "id": "boolean"})
self.assertEqual(parsed_file[1][0]["table_data"][2], [1, 0, 1, 0, 1, 0, 1, 0])
# check that column type was correctly set to text and values are properly parsed
self.assertEqual(parsed_file[1][0]["column_metadata"][3],
{"type": "Text", "id": "corner-cases"})
{"type": "Any", "id": "corner-cases"})
self.assertEqual(parsed_file[1][0]["table_data"][3],
# The type is detected as text, so all values should be text.
[u'=function()', '3.0', u'two spaces after ',
[u'=function()', u'3.0', u'two spaces after ',
u' two spaces before', u'!@#$', u'€€€', u'√∫abc$$', u'line\nbreak'])
# check that multiple tables are created when there are multiple sheets in a document
@ -51,23 +53,19 @@ class TestImportXLS(unittest.TestCase):
def test_excel_types(self):
parsed_file = import_xls.parse_file(*_get_fixture('test_excel_types.xlsx'))
sheet = parsed_file[1][0]
self._check_col(sheet, 0, "int1", "Int", [-1234123, '', ''])
self._check_col(sheet, 1, "int2", "Int", [5, '', ''])
self._check_col(sheet, 2, "textint", "Text", ["12345678902345689", '', ''])
self._check_col(sheet, 3, "bigint", "Text", ["320150170634561830", '', ''])
self._check_col(sheet, 0, "int1", "Numeric", [-1234123, '', ''])
self._check_col(sheet, 1, "int2", "Numeric", [5, '', ''])
self._check_col(sheet, 2, "textint", "Any", ["12345678902345689", '', ''])
self._check_col(sheet, 3, "bigint", "Any", ["320150170634561830", '', ''])
self._check_col(sheet, 4, "num2", "Numeric", [123456789.123456, '', ''])
self._check_col(sheet, 5, "bignum", "Numeric", [math.exp(200), '', ''])
self._check_col(sheet, 6, "date1", "DateTime",
[calendar.timegm(datetime.datetime(2015, 12, 22, 11, 59, 00).timetuple()), None, None])
self._check_col(sheet, 7, "date2", "Date",
[calendar.timegm(datetime.datetime(2015, 12, 20, 0, 0, 0).timetuple()), None, None])
self._check_col(sheet, 8, "datetext", "Date",
[calendar.timegm(datetime.date(2015, 12, 22).timetuple()), None, None])
# TODO: all dates have different format
# self._check_col(sheet, 9, "datetimetext", "DateTime",
# [calendar.timegm(datetime.datetime(2015, 12, 22, 0, 0, 0).timetuple()),
# calendar.timegm(datetime.datetime(2015, 12, 22, 13, 15, 0).timetuple()),
# calendar.timegm(datetime.datetime(2018, 02, 27, 16, 8, 39).timetuple())])
self._check_col(sheet, 8, "datetext", "Any", ['12/22/2015', '', ''])
self._check_col(sheet, 9, "datetimetext", "Any",
[u'12/22/2015', u'12/22/2015 1:15pm', u'2018-02-27 16:08:39 +0000'])
def test_excel_type_detection(self):
# This test goes over the second sheet of the fixture doc, which has multiple rows that try
@ -81,23 +79,20 @@ class TestImportXLS(unittest.TestCase):
1454544000.0, 1199577600.0, 1451692800.0, 1451549340.0, 1483214940.0])
self._check_col(sheet, 1, "float_not_int", "Numeric",
[1,2,3,4,5,"",6,7,8,9,10,10.25,11,12,13,14,15,16,17,18])
self._check_col(sheet, 2, "int_not_bool", "Int",
self._check_col(sheet, 2, "int_not_bool", "Any",
[0, 0, 1, 0, 1, 0, 0, 1, 0, 2, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0])
self._check_col(sheet, 3, "float_not_bool", "Numeric",
self._check_col(sheet, 3, "float_not_bool", "Any",
[0, 0, 1, 0, 1, 0, 0, 1, 0, 0.5, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0])
self._check_col(sheet, 4, "text_as_bool", "Bool",
self._check_col(sheet, 4, "text_as_bool", "Any",
[0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0])
self._check_col(sheet, 5, "int_as_bool", "Bool",
self._check_col(sheet, 5, "int_as_bool", "Numeric",
[0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0])
self._check_col(sheet, 6, "float_not_date", "Numeric",
self._check_col(sheet, 6, "float_not_date", "Any",
[4.0, 6.0, 4.0, 4.0, 6.0, 4.0, '--', 6.0, 4.0, 4.0, 4.0, 4.0, 4.0, 6.0, 6.0,
4.0, 6.0, '3-4', 4.0, 6.5])
self._check_col(sheet, 7, "float_not_text", "Numeric",
[-10.25, -8.00, -5.75, -3.50, "n/a", 1.00, " ??? ", 5.50, "", "-",
[-10.25, -8.00, -5.75, -3.50, "n/a", ' 1. ', " ??? ", 5.50, "", "-",
12.25, 0.00, "", 0.00, "--", 23.50, "NA", 28.00, 30.25, 32.50])
self._check_col(sheet, 8, "dollar_amts", "Numeric",
[0.00, 0.75, 1.50, '', 3.00, 0.00, 0.75, 1.50, '--', 3.00, 1234.56, 1000,
1001.50, '-', 3000000.000, 0000.00, 1234.56, 1000, 1001.50, 1000.01])
def test_excel_single_merged_cell(self):
# An older version of xlrd had a bug where a single cell marked as 'merged' would cause an
@ -107,11 +102,11 @@ class TestImportXLS(unittest.TestCase):
self.assertEqual(tables, [{
'table_name': u'Transaction Report',
'column_metadata': [
{'type': 'Text', 'id': u''},
{'type': 'Any', 'id': u''},
{'type': 'Numeric', 'id': u'Start'},
{'type': 'Numeric', 'id': u''},
{'type': 'Numeric', 'id': u''},
{'type': 'Text', 'id': u'Seek no easy ways'},
{'type': 'Any', 'id': u'Seek no easy ways'},
],
'table_data': [
[u'SINGLE MERGED', u'The End'],
@ -133,15 +128,15 @@ class TestImportXLS(unittest.TestCase):
self.assertEqual(tables, [{
'table_name': u'Sheet1',
'column_metadata': [
{'id': 'a', 'type': 'Text'},
{'id': 'a', 'type': 'Any'},
{'id': 'b', 'type': 'Date'},
{'id': 'c', 'type': 'Text'},
{'id': 'd', 'type': 'Text'},
{'id': 'c', 'type': 'Any'},
{'id': 'd', 'type': 'Any'},
{'id': 'e', 'type': 'Numeric'},
{'id': 'f', 'type': 'Int'},
{'id': 'g', 'type': 'Date'},
{'id': 'f', 'type': 'Numeric'},
{'id': 'g', 'type': 'Any'},
{'id': 'h', 'type': 'Date'},
{'id': 'i', 'type': 'Bool'},
{'id': 'i', 'type': 'Numeric'},
],
'table_data': [
[u'21:14:00'],
@ -150,9 +145,9 @@ class TestImportXLS(unittest.TestCase):
[u'10:20:30'],
[4.180902777777778],
[20],
[-6106060800.0],
[u'7/4/1776'],
[205286400.0],
[False], # This is not great either, we should be able to distinguish 0 from FALSE.
[0],
],
}])

View File

@ -7,13 +7,11 @@ dictionary with "type" and "data" fields, where "type" is a Grist type string, a
of values. All "data" lists will have the same length.
"""
from imports import dateguess
import datetime
import logging
import re
import messytables
import moment # TODO grist internal libraries might not be available to plugins in the future.
import dateutil.parser as date_parser
import six
from six.moves import zip, xrange
@ -25,12 +23,17 @@ log = logging.getLogger(__name__)
# Our approach to type detection is different from that of messytables.
# We first go through each cell in a sample of rows, trying to convert it to each of the basic
# We first go through each cell in a sample of rows, checking if it's one of the basic
# types, and keep a count of successes for each. We use the counts to decide the basic types (e.g.
# numeric vs text). Then we go through the full data set converting to the chosen basic type.
# During this process, we keep counts of suitable Grist types to consider (e.g. Int vs Numeric).
# We use those counts to produce the selected Grist type at the end.
# Previously string values were used here for type guessing and were parsed to typed values.
# That process now happens elsewhere, and this module only handles the case
# where the imported data already contains actual numbers or dates.
# This happens for Excel sheets but not CSV files.
class BaseConverter(object):
@classmethod
@ -57,50 +60,19 @@ class BaseConverter(object):
class NumericConverter(BaseConverter):
"""Handles numeric values, including Grist types Numeric and Int."""
# A number matching this is probably an identifier of some sort. Converting it to a float will
# lose precision, so it's better not to consider it numeric.
_unlikely_float = re.compile(r'\d{17}|^0\d')
# Integers outside this range will be represented as floats. This is the limit for values that can
# be stored in a JS Int32Array.
_max_js_int = 1<<31
# The thousands separator. It should be locale-specific, but we don't currently have a way to
# detect locale from the data. (Also, the sandbox's locale module isn't fully functional.)
_thousands_sep = ','
"""Handles the Grist Numeric type"""
@classmethod
def convert(cls, value):
if type(value) in six.integer_types + (float, complex):
return value
if type(value) in (str, six.text_type) and not cls._unlikely_float.search(value):
return float(value.strip().lstrip('$').replace(cls._thousands_sep, ""))
raise ValueError()
@classmethod
def _is_integer(cls, value):
ttype = type(value)
if ttype == int or (ttype == float and value.is_integer()):
return -cls._max_js_int <= value < cls._max_js_int
return False
@classmethod
def get_grist_column(cls, values):
if all(cls._is_integer(v) for v in values):
return ("Int", [int(v) for v in values])
return ("Numeric", values)
class DateParserInfo(date_parser.parserinfo):
  """A `parserinfo` subclass that applies one extra check when validating parse results."""

  def validate(self, res):
    """
    Returns False for a result that has a day but no month: that bogus combination
    accepts plain numbers. Any other result is passed to the base class's `validate`.
    """
    if not res.day or res.month:
      return super(DateParserInfo, self).validate(res)
    return False
class SimpleDateTimeConverter(BaseConverter):
"""Handles Date and DateTime values which are already instances of datetime.datetime."""
@ -124,66 +96,18 @@ class SimpleDateTimeConverter(BaseConverter):
return grist_type, grist_values
class DateTimeCoverter(BaseConverter):
  """
  Converts date strings to datetimes using a previously guessed `strptime`-style format,
  and produces a Grist Date or DateTime column from the converted values.
  """

  def __init__(self, date_format):
    # Format string passed to datetime.strptime() in convert().
    self._format = date_format

  def convert(self, value):
    """
    Returns None for an empty string and a parsed datetime for any other string.
    Raises ValueError for values that are not strings.
    """
    if value == "":
      return None
    if type(value) not in (str, six.text_type):
      raise ValueError()
    # datetime.strptime doesn't handle %z and %Z tags in Python 2, so formats that
    # contain them go straight to the dateutil parser.
    if '%z' not in self._format and '%Z' not in self._format:
      try:
        return datetime.datetime.strptime(value, self._format)
      except ValueError:
        # The value doesn't match the guessed format; let dateutil try instead.
        return date_parser.parse(value)
    return date_parser.parse(value)

  def _is_date(self, value):
    """Returns True if `value` is None or its time-of-day part is exactly midnight."""
    if value is None:
      return True
    return value.time() == datetime.time()

  def get_grist_column(self, values):
    """
    Returns a `(grist_type, grist_values)` pair. The type is "Date" when every value
    is None or falls on midnight, and "DateTime" otherwise. Non-None values are passed
    through `moment.dt_to_ts` (presumably yielding timestamps -- confirm in `moment`).
    """
    only_dates = all(self._is_date(item) for item in values)
    converted = [None if item is None else moment.dt_to_ts(item) for item in values]
    return ("Date" if only_dates else "DateTime"), converted
class BoolConverter(BaseConverter):
  """Recognizes boolean-like values and produces a Grist Bool column."""

  # Values accepted as True / False. Strings are stripped and lowercased before the lookup.
  _true_values = (1, '1', 'true', 'yes')
  _false_values = (0, '0', 'false', 'no')

  @classmethod
  def convert(cls, value):
    """
    Returns True or False if `value` (after stripping and lowercasing, when it is a string)
    is one of the recognized true/false values. Raises ValueError otherwise.
    """
    if type(value) in (str, six.text_type):
      normalized = value.strip().lower()
    else:
      normalized = value

    if normalized in cls._true_values:
      return True
    if normalized in cls._false_values:
      return False
    raise ValueError()

  @classmethod
  def get_grist_column(cls, values):
    """Returns the `(grist_type, grist_values)` pair, leaving `values` untouched."""
    return ("Bool", values)
class TextConverter(BaseConverter):
"""Fallback converter that converts everything to strings."""
class AnyConverter(BaseConverter):
"""
Fallback converter that converts everything to strings.
Type guessing and parsing of the strings will happen elsewhere.
"""
@classmethod
def convert(cls, value):
return six.text_type(value)
@classmethod
def get_grist_column(cls, values):
return ("Text", values)
return ("Any", values)
class ColumnDetector(object):
@ -194,7 +118,7 @@ class ColumnDetector(object):
"""
# Converters are listed in the order of preference, which is only used if two converters succeed
# on the same exact number of values. Text is always a fallback.
converters = [SimpleDateTimeConverter, BoolConverter, NumericConverter]
converters = [SimpleDateTimeConverter, NumericConverter]
# If this many non-junk values or more can't be converted, fall back to text.
_text_threshold = 0.10
@ -221,19 +145,11 @@ class ColumnDetector(object):
self._counts[i] += 1
def get_converter(self):
if sum(self._counts) == 0:
# if not already guessed as int, bool or datetime then we should try to guess date pattern
str_data = [d for d in self._data if isinstance(d, six.string_types)]
data_formats = dateguess.guess_bulk(str_data, error_rate=self._text_threshold)
data_format = data_formats[0] if data_formats else None
if data_format:
return DateTimeCoverter(data_format)
# We find the max by count, and secondarily by minimum index in the converters list.
count, neg_index = max((c, -i) for (i, c) in enumerate(self._counts))
if count > 0 and count >= self._count_nonjunk * (1 - self._text_threshold):
return self.converters[-neg_index]
return TextConverter
return AnyConverter
def _guess_basic_types(rows, num_columns):