Mirror of https://github.com/gristlabs/grist-core.git, synced 2024-10-27 20:44:07 +00:00
(core) Lossless imports
Summary:
- Removed string parsing and some type guessing code from parse_data.py. That logic is now implicitly done by ValueGuesser by leaving the initial column type as Any. parse_data.py mostly comes into play when importing files (e.g. Excel) containing values that already have types, i.e. numbers and dates.
- 0s and 1s are treated as numbers instead of booleans to keep imports lossless.
- Removed dateguess.py and test_dateguess.py.
- Changed what `guessDateFormat` does when multiple date formats work equally well for the given data, in order to be consistent with the old dateguess.py.
- Columns containing numbers are now always imported as Numeric, never Int.
- Removed `NullIfEmptyParser` because it was interfering with the new system. Its purpose was to avoid pointlessly changing a column from Any to Text when no actual data was inserted. A different solution to that problem was already added to `_ensure_column_accepts_data` in the data engine in a recent related diff.

Test Plan:
- Added 2 `nbrowser/Importer2` tests.
- Updated various existing tests.
- Extended testing of `guessDateFormat`. Added `guessDateFormats` to show how ambiguous dates are handled internally.

Reviewers: georgegevoian

Reviewed By: georgegevoian

Differential Revision: https://phab.getgrist.com/D3302
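As an illustration of what "lossless" means here, a minimal sketch (not the actual ValueGuesser code) of the round-trip rule implied above: a string is converted to a number only when formatting the parsed value reproduces the input exactly; otherwise the original text is kept.

// Hypothetical sketch of the lossless round-trip idea; strictParseNumber is
// an illustrative name, not a Grist function.
function strictParseNumber(text: string): number | string {
  const n = Number(text);
  // Keep the text when the conversion cannot be reversed (e.g. "00123" -> 123).
  return Number.isFinite(n) && String(n) === text ? n : text;
}

strictParseNumber("123");    // => 123 (round-trips, safe to convert)
strictParseNumber("00123");  // => "00123" (kept as text; the import stays lossless)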
This commit is contained in:
parent 9522438967
commit 321019217d
@@ -353,6 +353,7 @@ export class Importer extends DisposableWithEvents {
       label: field.label(),
       colId: destTableId ? field.colId() : null, // if inserting into new table, colId isn't defined
       type: field.column().type(),
+      widgetOptions: field.column().widgetOptions(),
       formula: field.column().formula()
     })),
     sourceCols: sourceFields.map((field) => field.colId())
@@ -105,7 +105,7 @@ export async function prepTransformColInfo(docModel: DocModel, origCol: ColumnRe
       let {dateFormat} = prevOptions;
       if (!dateFormat) {
         const colValues = tableData.getColValues(sourceCol.colId()) || [];
-        dateFormat = guessDateFormat(colValues.map(String)) || "YYYY-MM-DD";
+        dateFormat = guessDateFormat(colValues.map(String));
       }
       widgetOptions = dateTimeWidgetOptions(dateFormat, true);
       break;
@@ -49,6 +49,7 @@ export interface TransformColumn {
   colId: string|null;
   type: string;
   formula: string;
+  widgetOptions: string;
 }

 export interface ImportResult {
@@ -3,7 +3,7 @@ import {ApplyUAResult, QueryFilters} from 'app/common/ActiveDocAPI';
 import {BaseAPI, IOptions} from 'app/common/BaseAPI';
 import {BillingAPI, BillingAPIImpl} from 'app/common/BillingAPI';
 import {BrowserSettings} from 'app/common/BrowserSettings';
-import {BulkColValues, TableColValues, UserAction} from 'app/common/DocActions';
+import {BulkColValues, TableColValues, TableRecordValue, TableRecordValues, UserAction} from 'app/common/DocActions';
 import {DocCreationInfo, OpenDocMode} from 'app/common/DocListAPI';
 import {Features} from 'app/common/Features';
 import {ICustomWidget} from 'app/common/CustomWidget';
@@ -402,6 +402,11 @@ export interface UserAPI {
   filters?: string;
 }

+interface GetRowsParams {
+  filters?: QueryFilters;
+  immediate?: boolean;
+}
+
 /**
  * Collect endpoints related to the content of a single document that we've been thinking
  * of as the (restful) "Doc API". A few endpoints that could be here are not, for historical
@@ -411,8 +416,8 @@ export interface DocAPI {
   // Immediate flag is a currently not-advertised feature, allowing a query to proceed without
   // waiting for a document to be initialized. This is useful if the calculations done when
   // opening a document are irrelevant.
-  getRows(tableId: string, options?: { filters?: QueryFilters,
-                                       immediate?: boolean }): Promise<TableColValues>;
+  getRows(tableId: string, options?: GetRowsParams): Promise<TableColValues>;
+  getRecords(tableId: string, options?: GetRowsParams): Promise<TableRecordValue[]>;
   updateRows(tableId: string, changes: TableColValues): Promise<number[]>;
   addRows(tableId: string, additions: BulkColValues): Promise<number[]>;
   removeRows(tableId: string, removals: number[]): Promise<number[]>;
@@ -869,16 +874,13 @@ export class DocAPIImpl extends BaseAPI implements DocAPI {
     this._url = `${url}/api/docs/${docId}`;
   }

-  public async getRows(tableId: string, options?: { filters?: QueryFilters,
-                                                    immediate?: boolean }): Promise<TableColValues> {
-    const url = new URL(`${this._url}/tables/${tableId}/data`);
-    if (options?.filters) {
-      url.searchParams.append('filter', JSON.stringify(options.filters));
-    }
-    if (options?.immediate) {
-      url.searchParams.append('immediate', 'true');
-    }
-    return this.requestJson(url.href);
+  public async getRows(tableId: string, options?: GetRowsParams): Promise<TableColValues> {
+    return this._getRecords(tableId, 'data', options);
+  }
+
+  public async getRecords(tableId: string, options?: GetRowsParams): Promise<TableRecordValue[]> {
+    const response: TableRecordValues = await this._getRecords(tableId, 'records', options);
+    return response.records;
   }

   public async updateRows(tableId: string, changes: TableColValues): Promise<number[]> {
@@ -967,6 +969,17 @@ export class DocAPIImpl extends BaseAPI implements DocAPI {
     url.searchParams.append('code', code);
     return this.requestJson(url.href);
   }
+
+  private _getRecords(tableId: string, endpoint: 'data' | 'records', options?: GetRowsParams): Promise<any> {
+    const url = new URL(`${this._url}/tables/${tableId}/${endpoint}`);
+    if (options?.filters) {
+      url.searchParams.append('filter', JSON.stringify(options.filters));
+    }
+    if (options?.immediate) {
+      url.searchParams.append('immediate', 'true');
+    }
+    return this.requestJson(url.href);
+  }
 }

 /**
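For context, a hypothetical usage of the two public methods above, run inside an async function with a DocAPIImpl instance `docApi`. Both calls are routed through _getRecords and differ only in endpoint and result shape; the filter shape and the record shape shown for getRecords are assumptions for illustration.

const cols = await docApi.getRows('Table1', {filters: {Category: ['A']}});
// column-oriented TableColValues, e.g. {id: [1, 2], Category: ['A', 'A'], ...}
const recs = await docApi.getRecords('Table1', {immediate: true});
// row-oriented TableRecordValue[], e.g. [{id: 1, fields: {Category: 'A'}}, ...]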
@@ -162,7 +162,7 @@ export function guessColInfo(
       NumberParse.fromSettings(docSettings).guessOptions(values)
     )
       .guess(values, docSettings) ||
-    new DateGuesser(guessDateFormat(values, timezone) || "YYYY-MM-DD", timezone)
+    new DateGuesser(guessDateFormat(values, timezone), timezone)
       .guess(values, docSettings) ||
     // Don't return the same values back if there's no conversion to be done,
     // as they have to be serialized and transferred over a pipe to Python.
@@ -36,18 +36,6 @@ export class ValueParser {
 class IdentityParser extends ValueParser {
 }

-/**
- * Same as basic Value parser, but will return null if a value is an empty string.
- */
-class NullIfEmptyParser extends ValueParser {
-  public cleanParse(value: string): any {
-    if (value === "") {
-      return null;
-    }
-    return super.cleanParse(value);
-  }
-}
-
 export class NumericParser extends ValueParser {
   private _parse: NumberParse;

@@ -225,7 +213,6 @@ export class ReferenceListParser extends ReferenceParser {
 }

 export const valueParserClasses: { [type: string]: typeof ValueParser } = {
-  Any: NullIfEmptyParser,
   Numeric: NumericParser,
   Int: NumericParser,
   Date: DateParser,
@@ -1,4 +1,5 @@
 import escapeRegExp = require('lodash/escapeRegExp');
+import last = require('lodash/last');
 import memoize = require('lodash/memoize');
 import {getDistinctValues, isObject} from 'app/common/gutil';
 // Simply importing 'moment-guess' inconsistently imports bundle.js or bundle.esm.js depending on environment
@@ -325,7 +326,26 @@ function standardizeTime(timeString: string): { remaining: string, time: string
   return {remaining: timeString.slice(0, match.index).trim(), time: `${hh}:${mm}:${ss}`};
 }

-export function guessDateFormat(values: Array<string | null>, timezone: string = 'UTC'): string | null {
+/**
+ * Guesses a full date[time] format that best matches the given strings.
+ * If several formats match equally well, picks the last one lexicographically to match the old date guessing.
+ * This means formats with an early Y and/or M are favoured.
+ * If no formats match, returns the default YYYY-MM-DD.
+ */
+export function guessDateFormat(values: Array<string | null>, timezone: string = 'UTC'): string {
+  const formats = guessDateFormats(values, timezone);
+  if (!formats) {
+    return "YYYY-MM-DD";
+  }
+  return last(formats)!;
+}
+
+/**
+ * Returns all full date[time] formats that best match the given strings.
+ * If several formats match equally well, returns them all.
+ * May return null if there are no matching formats or choosing one is too expensive.
+ */
+export function guessDateFormats(values: Array<string | null>, timezone: string = 'UTC'): string[] | null {
   const dateStrings: string[] = values.filter(isObject);
   const sample = getDistinctValues(dateStrings, 100);
   const formats: Record<string, number> = {};
@@ -358,7 +378,9 @@ export function guessDateFormat(values: Array<string | null>, timezone: string =
   }

   const maxCount = Math.max(...Object.values(formats));
-  return formatKeys.find(format => formats[format] === maxCount)!;
+  // Return all formats that tied for first place.
+  // Sort lexicographically for consistency in tests and with the old dateguess.py.
+  return formatKeys.filter(format => formats[format] === maxCount).sort();
 }

 export const dateFormatOptions = [
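Illustrative expectations for the new behaviour (the concrete format strings are assumed for the example, not taken from the tests):

guessDateFormats(["01/02/2000"]);  // e.g. ["DD/MM/YYYY", "MM/DD/YYYY"] (a sorted tie)
guessDateFormat(["01/02/2000"]);   // e.g. "MM/DD/YYYY" (last of the sorted tie)
guessDateFormat(["not a date"]);   // "YYYY-MM-DD" (default when guessDateFormats returns null)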
|
@ -294,7 +294,7 @@ export class ActiveDocImport {
|
|||||||
const origTableName = table.table_name ? table.table_name : '';
|
const origTableName = table.table_name ? table.table_name : '';
|
||||||
const transformRule = transformRuleMap && transformRuleMap.hasOwnProperty(origTableName) ?
|
const transformRule = transformRuleMap && transformRuleMap.hasOwnProperty(origTableName) ?
|
||||||
transformRuleMap[origTableName] : null;
|
transformRuleMap[origTableName] : null;
|
||||||
const columnMetadata = addLabelsIfPossible(table.column_metadata);
|
const columnMetadata = cleanColumnMetadata(table.column_metadata);
|
||||||
const result: ApplyUAResult = await this._activeDoc.applyUserActions(docSession,
|
const result: ApplyUAResult = await this._activeDoc.applyUserActions(docSession,
|
||||||
[["AddTable", hiddenTableName, columnMetadata]]);
|
[["AddTable", hiddenTableName, columnMetadata]]);
|
||||||
const retValue: AddTableRetValue = result.retValues[0];
|
const retValue: AddTableRetValue = result.retValues[0];
|
||||||
@@ -313,7 +313,9 @@
     const ruleCanBeApplied = (transformRule != null) &&
       _.difference(transformRule.sourceCols, hiddenTableColIds).length === 0;
     await this._activeDoc.applyUserActions(docSession,
-      [["ReplaceTableData", hiddenTableId, rowIdColumn, columnValues]], {parseStrings: true});
+      // BulkAddRecord rather than ReplaceTableData so that type guessing is applied to Any columns.
+      // Don't use parseStrings, only use the strict parsing in ValueGuesser to make the import lossless.
+      [["BulkAddRecord", hiddenTableId, rowIdColumn, columnValues]]);

     // data parsed and put into hiddenTableId
     // For preview_table (isHidden) do GenImporterView to make views and formulas and cols
@@ -433,14 +435,15 @@

     // If destination is a new table, we need to create it.
     if (intoNewTable) {
-      const colSpecs = destCols.map(({type, colId: id, label}) => ({type, id, label}));
+      const colSpecs = destCols.map(({type, colId: id, label, widgetOptions}) => ({type, id, label, widgetOptions}));
       const newTable = await this._activeDoc.applyUserActions(docSession, [['AddTable', destTableId, colSpecs]]);
       destTableId = newTable.retValues[0].table_id;
     }

     await this._activeDoc.applyUserActions(docSession,
       [['BulkAddRecord', destTableId, gutil.arrayRepeat(hiddenTableData.id.length, null), columnData]],
-      {parseStrings: true});
+      // Don't use parseStrings for new tables to make the import lossless.
+      {parseStrings: !intoNewTable});

     return destTableId;
   }
@@ -586,6 +589,7 @@
       colId: destTableId ? id as string : null,
       label: fields.label as string,
       type: fields.type as string,
+      widgetOptions: fields.widgetOptions as string,
       formula: srcColIds.includes(id as string) ? `$${id}` : ''
     });
   }
@@ -730,10 +734,21 @@ function getMergeFunction({type}: MergeStrategy): MergeFunction {
 }

 /**
+ * Tweak the column metadata used in the AddTable action.
  * If `columns` is populated with non-blank column ids, adds labels to all
- * columns using the values set for the column ids. Otherwise, returns
- * a copy of columns with no modifications made.
+ * columns using the values set for the column ids.
+ * Ensure that columns of type Any start out as formula columns, i.e. empty columns,
+ * so that type guessing is triggered when new data is added.
  */
-function addLabelsIfPossible(columns: GristColumn[]) {
-  return columns.map(c => (c.id ? {...c, label: c.id} : c));
+function cleanColumnMetadata(columns: GristColumn[]) {
+  return columns.map(c => {
+    const newCol: any = {...c};
+    if (c.id) {
+      newCol.label = c.id;
+    }
+    if (c.type === "Any") {
+      newCol.isFormula = true;
+    }
+    return newCol;
+  });
 }
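A quick sketch of cleanColumnMetadata applied to hypothetical import metadata, following the code above:

cleanColumnMetadata([{id: "Name", type: "Text"}, {id: "Misc", type: "Any"}]);
// => [{id: "Name", type: "Text", label: "Name"},
//     {id: "Misc", type: "Any", label: "Misc", isFormula: true}]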
@@ -1,12 +1,11 @@
-from collections import defaultdict, namedtuple
+from collections import namedtuple

-import six
-from six.moves import zip, xrange
+from six.moves import zip

 import column
 import identifiers

 import logger

 log = logger.Logger(__name__, logger.INFO)

 # Prefix for transform columns created during imports.
@@ -103,6 +102,7 @@ class ImportActions(object):
         "label": c.label,
         "colId": c.colId if dest_table_id else None, #should be None if into new table
         "type": c.type,
+        "widgetOptions": getattr(c, "widgetOptions", ""),
         "formula": ("$" + c.colId) if (c.colId in src_cols) else ''
       })

@@ -162,6 +162,7 @@ class ImportActions(object):
       new_col_spec = {
         "label": c.label,
         "type": c.type,
+        "widgetOptions": getattr(c, "widgetOptions", ""),
         "isFormula": True,
         "formula": c.formula}
       result = self._useractions.doAddColumn(hidden_table_id, new_col_id, new_col_spec)
@@ -1,490 +0,0 @@
-"""This module guesses possible formats of dates which can be parsed using datetime.strptime
-based on samples.
-
-dateguesser.guess(sample)
-dateguesser.guess takes a sample date string and returns a set of
-datetime.strftime/strptime-compliant date format strings that will correctly parse.
-
-dateguesser.guess_bulk(list_of_samples, error_rate=0)
-dateguesser.guess_bulk takes a list of sample date strings and acceptable error rate
-and returns a list of datetime.strftime/strptime-compliant date format strings
-sorted by error rate that will correctly parse.
-
-Algorithm:
-
-1. Tokenize input string into chunks based on character type: digits, alphas, the rest.
-2. Analyze each token independently in terms what format codes could represent
-3. For given list of tokens generate all permutations of format codes
-4. During generating permutations check for validness of generated format and skip if invalid.
-5. Use rules listed below to decide if format is invalid:
-
-Invalid format checks:
-
-Rule #1: Year MUST be in the date. Year is the minimum possible parsable date.
-Rule #2. No holes (missing parts) in the format parts.
-Rule #3. Time parts are neighbors to each other. No interleaving time with the date.
-Rule #4. It's highly impossible that minutes coming before hour, millis coming before seconds etc
-Rule #5. Pattern can't have some part of date/time defined more than once.
-Rule #6: Separators between elements of the time group should be the same.
-Rule #7: If am/pm is in date we assume that 12-hour dates are allowed only. Otherwise it's 24-hour
-Rule #8: Year can't be between other date elements
-
-Note:
-dateguess doesn't support defaulting to current year because parsing should be deterministic,
-it's better to to fail guessing the format then to guess it incorrectly.
-
-Examples:
-  >>> guess('2014/05/05 14:00:00 UTC')
-  set(['%Y/%d/%m %H:%M:%S %Z', '%Y/%m/%d %H:%M:%S %Z'])
-  >>> guess('12/12/12')
-  set(['%y/%m/%d', '%d/%m/%y', '%m/%d/%y', '%y/%d/%m'])
-  >>> guess_bulk(['12-11-2014', '12-25-2014'])
-  ['%m-%d-%Y']
-  >>> guess_bulk(['12-11-2014', '25-25-2014'])
-  []
-  >>> guess_bulk(['12-11-2013', '13-8-2013', '05-25-2013', '12-25-2013'], error_rate=0.5)
-  ['%m-%d-%Y']
-"""
-
-
-import calendar
-import itertools
-import logging
-import re
-from collections import defaultdict
-
-from backports.functools_lru_cache import lru_cache
-import moment
-
-
-MONTH_NAME = calendar.month_name
-MONTH_ABBR = calendar.month_abbr
-TZ_VALID_NAMES = {z[0] for z in moment.get_tz_data().items()}
-AM_PM = {'am', 'pm'}
-DAYS_OF_WEEK_NAME = calendar.day_name
-DAYS_OF_WEEK_ABBR = calendar.day_abbr
-ASCII_DIGITS_RE = re.compile(r'^[0-9]+$')
-
-# Using x.isdigit() matches strings like u'\xb2' (superscripts) which we don't want.
-# Use isdigit(x) instead, to only match ASCII digits 0-9.
-isdigit = ASCII_DIGITS_RE.match
-
-DATE_ELEMENTS = [
-  # Name               Pattern   Predicate                     Group (mutual exclusive)   Consumes N prev elements
-  ("Year", "%Y", lambda x, p, v: isdigit(x) and len(x) == 4, "Y", 0),
-  ("Year short", "%y", lambda x, p, v: isdigit(x) and len(x) == 2, "Y", 0),
-  ("Month", "%m", lambda x, p, v: isdigit(x) and len(x) <= 2 and 0 < int(x) <= 12, "m", 0),
-  ("Month name full", "%B", lambda x, p, v: x.isalpha() and x.capitalize() in MONTH_NAME, "m", 0),
-  ("Month name abbr", "%b", lambda x, p, v: x.isalpha() and x.capitalize() in MONTH_ABBR, "m", 0),
-  ("Day", "%d", lambda x, p, v: isdigit(x) and len(x) <= 2 and 0 < int(x) <= 31, "d", 0),
-  ("Day of week", "%A", lambda x, p, v: x.isalpha()
-   and x.capitalize() in DAYS_OF_WEEK_NAME, "a", 0),
-  ("Day of week abbr", "%a", lambda x, p, v: x.isalpha()
-   and x.capitalize() in DAYS_OF_WEEK_ABBR, "a", 0),
-
-  ("Compound HHMMSS", "%H%M%S", lambda x, p, v: isdigit(x) and len(x) == 6
-   and 0 <= int(x[0:2]) < 24
-   and 0 <= int(x[2:4]) < 60
-   and 0 <= int(x[4:6]) < 60, "HMS", 0),
-
-  ("Hour", "%H", lambda x, p, v: isdigit(x) and len(x) <= 2 and 0 <= int(x) <= 23, "H", 0),
-  ("Hour in 12hr mode", "%I", lambda x, p, v: isdigit(x) and len(x) <= 2
-   and 0 <= int(x) <= 11, "H", 0),
-  ("AM/PM", "%p", lambda x, p, v: x.isalpha() and len(x) == 2 and x.lower() in AM_PM, "p", 0),
-  ("Minutes", "%M", lambda x, p, v: isdigit(x) and len(x) <= 2 and 0 <= int(x) <= 59, "M", 0),
-  ("Seconds", "%S", lambda x, p, v: isdigit(x) and len(x) <= 2 and 0 <= int(x) <= 59, "S", 0),
-  ("Fraction of second", "%f", lambda x, p, v: isdigit(x) and p is not None
-   and p.val == '.', "f", 0),
-  ("Timezone name", "%Z", lambda x, p, v: x.isalpha() and len(x) > 2
-   and x in TZ_VALID_NAMES, "Z", 0),
-  ("Timezone +HHMM", "%z", lambda x, p, v: isdigit(x) and len(x) == 4 and 0 <= int(x[0:2]) < 15
-   and 0 <= int(x[2:4]) < 60 and p is not None
-   and p.val == '+', "Z", 1),
-  ("Timezone -HHMM", "%z", lambda x, p, v: isdigit(x) and len(x) == 4 and 0 <= int(x[0:2]) < 15
-   and 0 <= int(x[2:4]) < 60 and p is not None
-   and p.val == '-', "Z", 1),
-]
-
-
-class Token(object):
-  """Represents a part of a date string that's being parsed.
-  Note that __hash__ and __eq__ are overridden in order
-  to compare only meaningful parts of an object.
-  """
-  def __init__(self, val, length):
-    self.val = val
-    self.length = length
-    self.compatible_types = ()
-
-  def __hash__(self):
-    h = hash(self.length) + hash(self.compatible_types)
-    if not self.compatible_types:
-      h += hash(self.val)
-    return hash(h)
-
-  def __eq__(self, other):
-    """
-    Two tokens are equal when these both are true:
-    a) length and compatible types are equal
-    b) if it is separator (no compatible types), separator values must be equal
-    """
-    if self.length != other.length or self.compatible_types != other.compatible_types:
-      return False
-    if not other.compatible_types and self.val != other.val:
-      return False
-    return True
-
-
-def _check_rule_1(pattern, types_used):
-  """Rule #1: Year MUST be in the date. Year is the minimum possible parsable date.
-
-  Examples:
-    >>> _check_rule_1('%Y/%m/%d', 'Ymd')
-    True
-    >>> _check_rule_1('%m/%d', 'md')
-    False
-  """
-  if 'Y' not in types_used:
-    logging.debug("Rule #1 is violated for pattern %s. Types used: %s", pattern, types_used)
-    return False
-  return True
-
-
-def _check_rule_2(pattern, types_used):
-  """Rule #2: No holes (missing parts) in the format parts.
-
-  Examples:
-    >>> _check_rule_2('%Y:%H', 'YH')
-    False
-    >>> _check_rule_2('%Y/%m/%d %H', 'YmdH')
-    True
-  """
-  priorities = 'YmdHMSf'
-  seen_parts = [p in types_used for p in priorities]
-  if sorted(seen_parts, reverse=True) != seen_parts:
-    logging.debug("Rule #2 is violated for pattern %s. Types used: %s", pattern, types_used)
-    return False
-  return True
-
-
-def _check_rule_3(pattern, types_used):
-  """Rule #3: Time parts are neighbors to time only. No interleaving time with the date.
-
-  Examples:
-    >>> _check_rule_3('%m/%d %H:%M %Y', 'mdHMY')
-    True
-    >>> _check_rule_3('%m/%d %H:%Y:%M', 'mdHYM')
-    False
-  """
-  time_parts = 'HMSf'
-  time_parts_highlighted = [t in time_parts for t in types_used]
-  time_parts_deduplicated = [a[0] for a in itertools.groupby(time_parts_highlighted)]
-  if len(list(filter(lambda x: x, time_parts_deduplicated))) > 1:
-    logging.debug("Rule #3 is violated for pattern %s. Types used: %s", pattern, types_used)
-    return False
-  return True
-
-
-def _check_rule_4(pattern, types_used):
-  """Rule #4: It's highly impossible that minutes coming before hours,
-  millis coming before seconds etc.
-
-  Examples:
-    >>> _check_rule_4('%H:%M', 'HM')
-    True
-    >>> _check_rule_4('%S:%M', 'SM')
-    False
-  """
-  time_parts_priority = 'HMSf'
-  time_parts_indexes = list(filter(lambda x: x >= 0,
-                                   [time_parts_priority.find(t) for t in types_used]))
-  if sorted(time_parts_indexes) != time_parts_indexes:
-    logging.debug("Rule #4 is violated for pattern %s. Types used: %s", pattern, types_used)
-    return False
-  return True
-
-
-def _check_rule_5(pattern, types_used):
-  """Rule #5: Pattern can't have some part of date/time defined more than once.
-
-  Examples:
-    >>> _check_rule_5('%Y/%Y', 'YY')
-    False
-    >>> _check_rule_5('%m/%b', 'mm')
-    False
-    >>> _check_rule_5('%Y/%m', 'Ym')
-    True
-  """
-  if len(types_used) != len(set(types_used)):
-    logging.debug("Rule #5 is violated for pattern %s. Types used: %s", pattern, types_used)
-    return False
-  return True
-
-
-def _check_rule_6(tokens_chosen, pattern, types_used):
-  """Rule #6: Separators between elements of the time group should be the same.
-
-  Examples:
-    _check_rule_5(tokens_chosen_1, '%Y-%m-%dT%H:%M:%S', 'YmdHMS') => True
-    _check_rule_5(tokens_chosen_2, '%Y-%m-%dT%H %M %S', 'YmdHMS') => True
-    _check_rule_5(tokens_chosen_3, '%Y-%m-%dT%H-%M:%S', 'YmdHMS') => False (different separators
-      ('-' and ':') in time group)
-  """
-  time_parts = 'HMS'
-  num_of_time_parts_used = len(list(filter(lambda x: x in time_parts, types_used)))
-  time_parts_seen = 0
-  separators_seen = []
-  previous_was_a_separator = False
-
-  for token in tokens_chosen:
-    if token[1] is not None and token[1][3] in time_parts:
-      # This rule doesn't work for separator-less time group so when we found the type
-      # and it's three letters then it's (see type "Compound HHMMSS") then stop iterating
-      if len(token[1][3]) == 3:
-        break
-      # If not a first time then
-      if time_parts_seen > 0 and not previous_was_a_separator:
-        separators_seen.append(None)
-      time_parts_seen += 1
-      if time_parts_seen == num_of_time_parts_used:
-        break
-      previous_was_a_separator = False
-    else:
-      if time_parts_seen > 0:
-        separators_seen.append(token[0].val)
-      previous_was_a_separator = True
-
-  if len(set(separators_seen)) > 1:
-    logging.debug("Rule #6 is violated for pattern %s. Seen separators: %s",
-                  pattern, separators_seen)
-    return False
-  return True
-
-
-def _check_rule_7a(pattern):
-  """Rule #7a: If am/pm is in date we assume that 12-hour dates are allowed only.
-  Otherwise it's 24-hour.
-
-  Examples:
-    >>> _check_rule_7a('%Y/%m/%d %H:%M %p')
-    False
-    >>> _check_rule_7a('%Y/%m/%d %I:%M %p')
-    True
-  """
-  if '%p' in pattern and '%H' in pattern:
-    logging.debug("Rule #7a is violated for pattern %s", pattern)
-    return False
-  return True
-
-
-def _check_rule_7b(pattern):
-  """Rule #7b: If am/pm is in date we assume that 12-hour dates are allowed only.
-  Otherwise it's 24-hour.
-
-  Examples:
-    >>> _check_rule_7b('%Y/%m/%d %I:%M')
-    False
-    >>> _check_rule_7b('%Y/%m/%d %I:%M %p')
-    True
-  """
-  if '%I' in pattern and '%p' not in pattern:
-    logging.debug("Rule #7b is violated for pattern %s", pattern)
-    return False
-  return True
-
-
-def _check_rule_8(pattern, types_used):
-  """Rule #9: Year can't be between other date elements
-
-  Examples:
-    >>> _check_rule_8('%m/%Y/%d %I:%M', 'mYdIM')
-    False
-  """
-  if 'mYd' in types_used or 'dYm' in types_used:
-    logging.debug("Rule #8 is violated for pattern %s", pattern)
-    return False
-  return True
-
-
-def _tokenize_by_character_class(s):
-  """Return a list of strings by splitting s (tokenizing) by character class.
-
-  Example:
-    >>> t = _tokenize_by_character_class('Thu, May 14th, 2014 1:15 pm +0000')
-    >>> [i.val for i in t]
-    ['Thu', ',', ' ', 'May', ' ', '14', 'th', ',', ' ', '2014', ' ', '1', ':', '15', ' ', 'pm', ' ', '+', '0000']
-
-    >>> t = _tokenize_by_character_class('5/14/2014')
-    >>> [i.val for i in t]
-    ['5', '/', '14', '/', '2014']
-  """
-  res = re.split(r'(\d+)|(\W)|(_)', s)
-  return [Token(i, len(i)) for i in res if i]
-
-
-def _sliding_triplets(tokens):
-  for idx, t in enumerate(tokens):
-    yield (t, tokens[idx-1] if idx > 0 else None, tokens[idx+1] if idx < len(tokens)-1 else None)
-
-
-def _analyze_tokens(tokens):
-  """Analyze each token and find out compatible types for it."""
-  for token, prev, nxt in _sliding_triplets(tokens):
-    token.compatible_types = tuple([t for t in DATE_ELEMENTS if t[2](token.val, prev, nxt)])
-
-
-@lru_cache()
-def _generate_all_permutations(tokens):
-  """Generate all permutations of format codes for given list of tokens.
-
-  Brute-forcing of all possible permutations and rules checking eats most of the time or date
-  parsing. But since the input is expected to be highly uniform then we can expect that
-  memoization of this step will be very efficient.
-
-  Token contains values for date parts but due to overridden eq and hash methods,
-  we treat two tokens having the same length and same possible formats as equal
-  tokens and separators should be the same
-  """
-  all_patterns = set()
-  _generate_all_permutations_recursive(tokens, 0, [], "", all_patterns, "")
-
-  return all_patterns
-
-
-def _check_is_pattern_valid_quick_fail_rules(pattern, types_used):
-  """Apply rules which are applicable for partially constructed patterns.
-
-  Example: duplicates of a date part in a pattern.
-  """
-  return _check_rule_5(pattern, types_used) \
-    and _check_rule_4(pattern, types_used) \
-    and _check_rule_7a(pattern)
-
-
-def _check_is_pattern_valid_full_pattern_rules(tokens_chosen, pattern, types_used):
-  """Apply rules which are applicable for full pattern only.
-
-  Example: existence of Year part in the pattern.
-  """
-  return _check_rule_1(pattern, types_used) \
-    and _check_rule_2(pattern, types_used) \
-    and _check_rule_3(pattern, types_used) \
-    and _check_rule_6(tokens_chosen, pattern, types_used) \
-    and _check_rule_7b(pattern) \
-    and _check_rule_8(pattern, types_used)
-
-
-def _generate_all_permutations_recursive(tokens, token_idx, tokens_chosen, pattern, found_patterns,
-                                         types_used):
-  """Generate all format elements permutations recursively.
-
-  Args:
-    tokens (list[Token]): List of tokens.
-    token_idx (int): Index of token processing this cycle.
-    tokens_chosen (list[(Token, Token.compatible_type)]): List of tuples
-      containing token and compatible type
-    pattern (str): String containing format for parsing
-    found_patterns (set): Set of guessed patterns
-    types_used (str): String of types used to build pattern.
-
-  Returns:
-    list: List of permutations
-  """
-  if not _check_is_pattern_valid_quick_fail_rules(pattern, types_used):
-    return
-
-  if token_idx < len(tokens):
-    t = tokens[token_idx]
-    if t.compatible_types:
-      for ct in t.compatible_types:
-        _generate_all_permutations_recursive(tokens, token_idx+1, tokens_chosen[:] + [(t, ct)],
-                                             (pattern if ct[4] == 0 else pattern[:-ct[4]]) + ct[1],
-                                             found_patterns, types_used + ct[3])
-    else:
-      # if no compatible types it should be separator, add it to the pattern
-      _generate_all_permutations_recursive(tokens, token_idx+1,
-                                           tokens_chosen[:] + [(t, None)], pattern + t.val,
-                                           found_patterns, types_used)
-  else:
-    if _check_is_pattern_valid_full_pattern_rules(tokens_chosen, pattern, types_used):
-      found_patterns.add(pattern)
-
-
-def guess(date):
-  """Guesses datetime.strftime/strptime-compliant date formats for date string.
-
-  Args:
-    date (str): Date string.
-
-  Returns:
-    set: Set of datetime.strftime/strptime-compliant date format strings
-
-  Examples:
-    >>> guess('2014/05/05 14:00:00 UTC')
-    set(['%Y/%d/%m %H:%M:%S %Z', '%Y/%m/%d %H:%M:%S %Z'])
-    >>> guess('12/12/12')
-    set(['%y/%m/%d', '%d/%m/%y', '%m/%d/%y', '%y/%d/%m'])
-  """
-  # Don't attempt to parse strings that are so long as to be certainly non-dates. Somewhat long
-  # strings could be dates (like "Wednesday, September 16, 2020 A.D. 08:47:02.2667911 AM -06:00",
-  # and who knows what other languages do). A limit is important also because the current approach
-  # can run into "maximum recursion depth exceeded" on a very long string.
-  if len(date) > 150:
-    return set()
-  tokens = _tokenize_by_character_class(date)
-  _analyze_tokens(tokens)
-  return _generate_all_permutations(tuple(tokens))
-
-
-def guess_bulk(dates, error_rate=0):
-  """Guesses datetime.strftime/strptime-compliant date formats for list of the samples.
-
-  Args:
-    dates (list): List of samples date strings.
-    error_rate (float): Acceptable error rate (default 0.0)
-
-  Returns:
-    list: List of datetime.strftime/strptime-compliant date format strings sorted by error rate
-
-  Examples:
-    >>> guess_bulk(['12-11-2014', '12-25-2014'])
-    ['%m-%d-%Y']
-    >>> guess_bulk(['12-11-2014', '25-25-2014'])
-    []
-    >>> guess_bulk(['12-11-2013', '13-8-2013', '05-25-2013', '12-25-2013'], error_rate=0.5)
-    ['%m-%d-%Y']
-  """
-  if error_rate == 0.0:
-    patterns = None
-    for date in dates:
-      guesses_patterns = guess(date)
-      if patterns is None:
-        patterns = guesses_patterns
-      else:
-        patterns = patterns.intersection(guesses_patterns)
-      if not patterns:
-        break  # No need to iterate more if zero patterns found
-    return list(patterns)
-  else:
-    found_dates = 0
-    pattern_counters = defaultdict(lambda: 0)
-    num_dates = len(dates)
-    min_num_dates_to_be_found = num_dates - num_dates * error_rate
-
-    for idx, date in enumerate(dates):
-      patterns = guess(date)
-      if patterns:
-        found_dates += 1
-        for pattern in patterns:
-          pattern_counters[pattern] = pattern_counters[pattern] + 1
-
-      # Early return if number of strings that can't be date is already over error rate
-      cells_left = num_dates - idx - 1
-      cannot_be_found = float(found_dates + cells_left) < min_num_dates_to_be_found
-      if cannot_be_found:
-        return []
-
-    patterns = [(v, k) for k, v in pattern_counters.items()
-                if v > min_num_dates_to_be_found]
-    patterns.sort(reverse=True)
-    return [k for (v, k) in patterns]
@@ -1,102 +0,0 @@
-import unittest
-
-from imports.dateguess import guess, guess_bulk
-
-
-class TestGuesser(unittest.TestCase):
-  def assertDate(self, input_str, fmt_list):
-    guessed = guess(input_str)
-    self.assertEqual(set(guessed), set(fmt_list))
-
-  def assertDates(self, input_lst, error_rate, fmt_list):
-    guessed = guess_bulk(input_lst, error_rate=error_rate)
-    self.assertEqual(set(guessed), set(fmt_list))
-
-  def test_guess_dates(self):
-    self.assertDate('', [])
-    self.assertDate("2013-13-13", [])
-    self.assertDate("25/25/1911", [])
-
-    self.assertDate("2014-01-11", ['%Y-%m-%d', '%Y-%d-%m'])
-    self.assertDate("2014-11-01", ['%Y-%m-%d', '%Y-%d-%m'])
-    self.assertDate("1990-05-05", ['%Y-%m-%d', '%Y-%d-%m'])
-    self.assertDate("2013-12-13", ['%Y-%m-%d'])
-
-    self.assertDate("12/31/1999", ['%m/%d/%Y'])
-    self.assertDate("11/11/1911", ['%m/%d/%Y', '%d/%m/%Y'])
-    self.assertDate("5/9/1981", ['%m/%d/%Y', '%d/%m/%Y'])
-    self.assertDate("6/3/1985", ['%m/%d/%Y', '%d/%m/%Y'])
-
-    self.assertDate("12/31/99", ['%m/%d/%y'])
-    self.assertDate("11/11/11", ['%y/%m/%d', '%y/%d/%m', '%m/%d/%y', '%d/%m/%y'])
-    self.assertDate("5/9/81", ['%m/%d/%y', '%d/%m/%y'])
-    self.assertDate("6/3/85", ['%m/%d/%y', '%d/%m/%y'])
-
-    self.assertDate("31.12.91", ['%d.%m.%y'])
-    self.assertDate("4.4.87", ['%m.%d.%y', '%d.%m.%y'])
-
-    self.assertDate("13.2.8", ['%y.%m.%d', '%y.%d.%m'])
-    self.assertDate("31.12.1991", ['%d.%m.%Y'])
-    self.assertDate("4.4.1987", ['%m.%d.%Y', '%d.%m.%Y'])
-    self.assertDate("13.2.2008", ['%d.%m.%Y'])
-    self.assertDate("31.12.91", ['%d.%m.%y'])
-    self.assertDate("4.4.87", ['%m.%d.%y', '%d.%m.%y'])
-    self.assertDate("13.2.8", ['%y.%m.%d', '%y.%d.%m'])
-
-    self.assertDate("9 May 1981", ['%d %b %Y', '%d %B %Y'])
-    self.assertDate("31 Dec 1999", ['%d %b %Y'])
-    self.assertDate("1 Jan 2012", ['%d %b %Y'])
-    self.assertDate("3 August 2009", ['%d %B %Y'])
-    self.assertDate("2 May 1980", ['%d %B %Y', '%d %b %Y'])
-
-    self.assertDate("13/1/2012", ['%d/%m/%Y'])
-
-    self.assertDate("Aug 1st 2014", ['%b %dst %Y'])
-    self.assertDate("12/22/2015 00:00:00.10", ['%m/%d/%Y %H:%M:%S.%f'])
-
-  def test_guess_datetimes(self):
-    self.assertDate("Thu Sep 25 10:36:28 2003", ['%a %b %d %H:%M:%S %Y'])
-    self.assertDate("Thu Sep 25 2003 10:36:28", ['%a %b %d %Y %H:%M:%S'])
-    self.assertDate("10:36:28 Thu Sep 25 2003", ['%H:%M:%S %a %b %d %Y'])
-
-    self.assertDate("2014-01-11T12:21:05", ['%Y-%m-%dT%H:%M:%S', '%Y-%d-%mT%H:%M:%S'])
-    self.assertDate("2015-02-16T16:05:31", ['%Y-%m-%dT%H:%M:%S'])
-    # TODO remove all except first one
-    self.assertDate("2015-02-16T16:05", ['%Y-%m-%dT%H:%M', '%Y-%H-%MT%d:%m',
-                                         '%Y-%m-%HT%M:%d', '%Y-%d-%HT%M:%m'])
-    self.assertDate("2015-02-16T16", ['%Y-%m-%dT%H', '%Y-%m-%HT%d']) #TODO remove second one
-
-    self.assertDate("Mon Jan 13 9:52:52 am MST 2014", ['%a %b %d %I:%M:%S %p %Z %Y'])
-    self.assertDate("Tue Jan 21 3:30:00 PM EST 2014", ['%a %b %d %I:%M:%S %p %Z %Y'])
-    self.assertDate("Mon Jan 13 09:52:52 MST 2014", ['%a %b %d %H:%M:%S %Z %Y'])
-    self.assertDate("Tue Jan 21 15:30:00 EST 2014", ['%a %b %d %H:%M:%S %Z %Y'])
-    self.assertDate("Mon Jan 13 9:52 am MST 2014", ['%a %b %d %I:%M %p %Z %Y'])
-    self.assertDate("Tue Jan 21 3:30 PM EST 2014", ['%a %b %d %I:%M %p %Z %Y'])
-
-    self.assertDate("2014-01-11T12:21:05", ['%Y-%m-%dT%H:%M:%S', '%Y-%d-%mT%H:%M:%S'])
-    self.assertDate("2015-02-16T16:05:31", ['%Y-%m-%dT%H:%M:%S'])
-    self.assertDate("Thu Sep 25 10:36:28 2003", ['%a %b %d %H:%M:%S %Y'])
-    self.assertDate("10:36:28 Thu Sep 25 2003", ['%H:%M:%S %a %b %d %Y'])
-
-    self.assertDate("2014-01-11T12:21:05+0000", ['%Y-%d-%mT%H:%M:%S%z', '%Y-%m-%dT%H:%M:%S%z'])
-    self.assertDate("2015-02-16T16:05:31-0400", ['%Y-%m-%dT%H:%M:%S%z'])
-    self.assertDate("Thu, 25 Sep 2003 10:49:41 -0300", ['%a, %d %b %Y %H:%M:%S %z'])
-    self.assertDate("Thu, 25 Sep 2003 10:49:41 +0300", ['%a, %d %b %Y %H:%M:%S %z'])
-
-    self.assertDate("2003-09-25T10:49:41", ['%Y-%m-%dT%H:%M:%S'])
-    self.assertDate("2003-09-25T10:49", ['%Y-%m-%dT%H:%M'])
-
-  def test_guess_bulk_dates(self):
-    self.assertDates(["11/11/1911", "25/11/1911", "11/11/1911", "11/11/1911"], 0.0, ['%d/%m/%Y'])
-    self.assertDates(["25/11/1911", "25/25/1911", "11/11/1911", "11/11/1911"], 0.0, [])
-    self.assertDates(["25/11/1911", "25/25/1911", "11/11/1911", "11/11/1911"], 0.5, ['%d/%m/%Y'])
-
-    self.assertDates(["25/11/1911", "25/25/1911", "11/11/1911", "11/11/1911"], 0.1, [])
-    self.assertDates(["23/11/1911", '2004 May 12', "11/11/1911", "11/11/1911"], 0.5, ['%d/%m/%Y'])
-
-    self.assertDates(['2004 May 12', "11/11/1911", "11/11/1911", "23/11/1911"], 0.5, ['%d/%m/%Y'])
-    self.assertDates(['2004 May 12', "11/11/1911", "11/11/1911", "23/11/1911"], 0.0, [])
-    self.assertDates(['12/22/2015', "12/22/2015 1:15pm", "2018-02-27 16:08:39 +0000"], 0.1, [])
-
-
-if __name__ == "__main__":
-  unittest.main()
@@ -4,8 +4,6 @@ import textwrap
 import unittest
 from six import BytesIO, text_type
 import csv
-import calendar
-import datetime

 from imports import import_csv

@@ -22,9 +20,15 @@ def bytes_io_from_str(string):

 class TestImportCSV(unittest.TestCase):

-  def _check_col(self, sheet, index, name, typename, values):
+  def _check_col(self, sheet, index, name, _typename, values):
     self.assertEqual(sheet["column_metadata"][index]["id"], name)
-    self.assertEqual(sheet["column_metadata"][index]["type"], typename)
+    # Previously, strings were parsed and types were guessed in CSV imports.
+    # Now all data is kept as strings and the column type is left as Any
+    # so that type guessing and parsing can happen elsewhere.
+    # To avoid updating 85 calls to _check_col, the typename argument was kept but can be ignored,
+    # and all values are converted back to strings for comparison.
+    self.assertEqual(sheet["column_metadata"][index]["type"], "Any")
+    values = [text_type(v) for v in values]
     self.assertEqual(sheet["table_data"][index], values)

   def _check_num_cols(self, sheet, exp_cols):
@@ -40,18 +44,16 @@ class TestImportCSV(unittest.TestCase):
     self._check_col(sheet, 1, "int2", "Int", [5, '', ''])
     self._check_col(sheet, 2, "textint", "Text", ["12345678902345689", '', ''])
     self._check_col(sheet, 3, "bigint", "Text", ["320150170634561830", '', ''])
-    self._check_col(sheet, 4, "num2", "Numeric", [123456789.123456, '', ''])
-    self._check_col(sheet, 5, "bignum", "Numeric", [7.22597e+86, '', ''])
+    self._check_col(sheet, 4, "num2", "Numeric", ['123456789.1234560000', '', ''])
+    self._check_col(sheet, 5, "bignum", "Numeric", ['7.22597E+86', '', ''])
     self._check_col(sheet, 6, "date1", "DateTime",
-                    [calendar.timegm(datetime.datetime(2015, 12, 22, 11, 59, 00).timetuple()), None, None])
+                    [u'12/22/15 11:59 AM', u'', u''])
     self._check_col(sheet, 7, "date2", "Date",
-                    [calendar.timegm(datetime.datetime(2015, 12, 20, 0, 0, 0).timetuple()), None, None])
+                    [u'December 20, 2015', u'', u''])
     self._check_col(sheet, 8, "datetext", "Date",
-                    [calendar.timegm(datetime.date(2015, 12, 22).timetuple()), None, None])
+                    [u'12/22/2015', u'', u''])
     self._check_col(sheet, 9, "datetimetext", "DateTime",
-                    [calendar.timegm(datetime.datetime(2015, 12, 22, 0, 0, 0).timetuple()),
-                     calendar.timegm(datetime.datetime(2015, 12, 22, 13, 15, 0).timetuple()),
-                     calendar.timegm(datetime.datetime(2018, 2, 27, 16, 8, 39).timetuple())])
+                    [u'12/22/2015 00:00:00', u'12/22/2015 13:15:00', u'02/27/2018 16:08:39'])

   def test_user_parse_options(self):
@@ -68,7 +70,11 @@
     self._check_col(parsed_file, 2, "PHONE", "Text", ['201-343-3434', '201.343.3434',
                                                       '2013433434', '(201)343-3434'])
     self._check_col(parsed_file, 3, "VALUE", "Int", [45, 4545, 0, 4])
-    self._check_col(parsed_file, 4, "DATE", "DateTime", [1519747719.0, 1519744119.0, 1519751319.0, None])
+    self._check_col(parsed_file, 4, "DATE", "DateTime",
+                    [u'2018-02-27 16:08:39 +0000',
+                     u'2018-02-27 16:08:39 +0100',
+                     u'2018-02-27 16:08:39 -0100',
+                     u''])

   def test_wrong_cols1(self):
     file_obj = bytes_io_from_str(textwrap.dedent(
|
@ -16,31 +16,33 @@ class TestImportXLS(unittest.TestCase):
|
|||||||
def _check_col(self, sheet, index, name, typename, values):
|
def _check_col(self, sheet, index, name, typename, values):
|
||||||
self.assertEqual(sheet["column_metadata"][index]["id"], name)
|
self.assertEqual(sheet["column_metadata"][index]["id"], name)
|
||||||
self.assertEqual(sheet["column_metadata"][index]["type"], typename)
|
self.assertEqual(sheet["column_metadata"][index]["type"], typename)
|
||||||
|
if typename == "Any":
|
||||||
|
# Convert values to strings to reduce changes to tests after imports were overhauled.
|
||||||
|
values = [str(v) for v in values]
|
||||||
self.assertEqual(sheet["table_data"][index], values)
|
self.assertEqual(sheet["table_data"][index], values)
|
||||||
|
|
||||||
def test_excel(self):
|
def test_excel(self):
|
||||||
parsed_file = import_xls.parse_file(*_get_fixture('test_excel.xlsx'))
|
parsed_file = import_xls.parse_file(*_get_fixture('test_excel.xlsx'))
|
||||||
|
|
||||||
# check that column type was correctly set to int and values are properly parsed
|
# check that column type was correctly set to numeric and values are properly parsed
|
||||||
self.assertEqual(parsed_file[1][0]["column_metadata"][0], {"type": "Int", "id": "numbers"})
|
self.assertEqual(parsed_file[1][0]["column_metadata"][0], {"type": "Numeric", "id": "numbers"})
|
||||||
self.assertEqual(parsed_file[1][0]["table_data"][0], [1, 2, 3, 4, 5, 6, 7, 8])
|
self.assertEqual(parsed_file[1][0]["table_data"][0], [1, 2, 3, 4, 5, 6, 7, 8])
|
||||||
|
|
||||||
# check that column type was correctly set to text and values are properly parsed
|
# check that column type was correctly set to text and values are properly parsed
|
||||||
self.assertEqual(parsed_file[1][0]["column_metadata"][1], {"type": "Text", "id": "letters"})
|
self.assertEqual(parsed_file[1][0]["column_metadata"][1], {"type": "Any", "id": "letters"})
|
||||||
self.assertEqual(parsed_file[1][0]["table_data"][1],
|
self.assertEqual(parsed_file[1][0]["table_data"][1],
|
||||||
["a", "b", "c", "d", "e", "f", "g", "h"])
|
["a", "b", "c", "d", "e", "f", "g", "h"])
|
||||||
|
|
||||||
# messy tables does not support bool types yet, it classifies them as ints
|
# 0s and 1s become Numeric, not boolean like in the past
|
||||||
self.assertEqual(parsed_file[1][0]["column_metadata"][2], {"type": "Bool", "id": "boolean"})
|
self.assertEqual(parsed_file[1][0]["column_metadata"][2], {"type": "Numeric", "id": "boolean"})
|
||||||
self.assertEqual(parsed_file[1][False]["table_data"][2],
|
self.assertEqual(parsed_file[1][0]["table_data"][2], [1, 0, 1, 0, 1, 0, 1, 0])
|
||||||
[True, False, True, False, True, False, True, False])
|
|
||||||
|
|
||||||
# check that column type was correctly set to text and values are properly parsed
|
# check that column type was correctly set to text and values are properly parsed
|
||||||
self.assertEqual(parsed_file[1][0]["column_metadata"][3],
|
self.assertEqual(parsed_file[1][0]["column_metadata"][3],
|
||||||
{"type": "Text", "id": "corner-cases"})
|
{"type": "Any", "id": "corner-cases"})
|
||||||
self.assertEqual(parsed_file[1][0]["table_data"][3],
|
self.assertEqual(parsed_file[1][0]["table_data"][3],
|
||||||
# The type is detected as text, so all values should be text.
|
# The type is detected as text, so all values should be text.
|
||||||
[u'=function()', '3.0', u'two spaces after ',
|
[u'=function()', u'3.0', u'two spaces after ',
|
||||||
u' two spaces before', u'!@#$', u'€€€', u'√∫abc$$', u'line\nbreak'])
|
u' two spaces before', u'!@#$', u'€€€', u'√∫abc$$', u'line\nbreak'])
|
||||||
|
|
||||||
# check that multiple tables are created when there are multiple sheets in a document
|
# check that multiple tables are created when there are multiple sheets in a document
|
||||||
@@ -51,23 +53,19 @@
   def test_excel_types(self):
     parsed_file = import_xls.parse_file(*_get_fixture('test_excel_types.xlsx'))
     sheet = parsed_file[1][0]
-    self._check_col(sheet, 0, "int1", "Int", [-1234123, '', ''])
-    self._check_col(sheet, 1, "int2", "Int", [5, '', ''])
-    self._check_col(sheet, 2, "textint", "Text", ["12345678902345689", '', ''])
-    self._check_col(sheet, 3, "bigint", "Text", ["320150170634561830", '', ''])
+    self._check_col(sheet, 0, "int1", "Numeric", [-1234123, '', ''])
+    self._check_col(sheet, 1, "int2", "Numeric", [5, '', ''])
+    self._check_col(sheet, 2, "textint", "Any", ["12345678902345689", '', ''])
+    self._check_col(sheet, 3, "bigint", "Any", ["320150170634561830", '', ''])
     self._check_col(sheet, 4, "num2", "Numeric", [123456789.123456, '', ''])
     self._check_col(sheet, 5, "bignum", "Numeric", [math.exp(200), '', ''])
     self._check_col(sheet, 6, "date1", "DateTime",
                     [calendar.timegm(datetime.datetime(2015, 12, 22, 11, 59, 00).timetuple()), None, None])
     self._check_col(sheet, 7, "date2", "Date",
                     [calendar.timegm(datetime.datetime(2015, 12, 20, 0, 0, 0).timetuple()), None, None])
-    self._check_col(sheet, 8, "datetext", "Date",
-                    [calendar.timegm(datetime.date(2015, 12, 22).timetuple()), None, None])
-    # TODO: all dates have different format
-    # self._check_col(sheet, 9, "datetimetext", "DateTime",
-    #                 [calendar.timegm(datetime.datetime(2015, 12, 22, 0, 0, 0).timetuple()),
-    #                  calendar.timegm(datetime.datetime(2015, 12, 22, 13, 15, 0).timetuple()),
-    #                  calendar.timegm(datetime.datetime(2018, 02, 27, 16, 8, 39).timetuple())])
+    self._check_col(sheet, 8, "datetext", "Any", ['12/22/2015', '', ''])
+    self._check_col(sheet, 9, "datetimetext", "Any",
+                    [u'12/22/2015', u'12/22/2015 1:15pm', u'2018-02-27 16:08:39 +0000'])

   def test_excel_type_detection(self):
     # This tests goes over the second sheet of the fixture doc, which has multiple rows that try
@@ -81,23 +79,20 @@ class TestImportXLS(unittest.TestCase):
                     1454544000.0, 1199577600.0, 1451692800.0, 1451549340.0, 1483214940.0])
     self._check_col(sheet, 1, "float_not_int", "Numeric",
                     [1,2,3,4,5,"",6,7,8,9,10,10.25,11,12,13,14,15,16,17,18])
-    self._check_col(sheet, 2, "int_not_bool", "Int",
+    self._check_col(sheet, 2, "int_not_bool", "Any",
                     [0, 0, 1, 0, 1, 0, 0, 1, 0, 2, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0])
-    self._check_col(sheet, 3, "float_not_bool", "Numeric",
+    self._check_col(sheet, 3, "float_not_bool", "Any",
                     [0, 0, 1, 0, 1, 0, 0, 1, 0, 0.5, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0])
-    self._check_col(sheet, 4, "text_as_bool", "Bool",
+    self._check_col(sheet, 4, "text_as_bool", "Any",
                     [0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0])
-    self._check_col(sheet, 5, "int_as_bool", "Bool",
+    self._check_col(sheet, 5, "int_as_bool", "Numeric",
                     [0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0])
-    self._check_col(sheet, 6, "float_not_date", "Numeric",
+    self._check_col(sheet, 6, "float_not_date", "Any",
                     [4.0, 6.0, 4.0, 4.0, 6.0, 4.0, '--', 6.0, 4.0, 4.0, 4.0, 4.0, 4.0, 6.0, 6.0,
                      4.0, 6.0, '3-4', 4.0, 6.5])
     self._check_col(sheet, 7, "float_not_text", "Numeric",
-                    [-10.25, -8.00, -5.75, -3.50, "n/a", 1.00, " ??? ", 5.50, "", "-",
+                    [-10.25, -8.00, -5.75, -3.50, "n/a", ' 1. ', " ??? ", 5.50, "", "-",
                      12.25, 0.00, "", 0.00, "--", 23.50, "NA", 28.00, 30.25, 32.50])
-    self._check_col(sheet, 8, "dollar_amts", "Numeric",
-                    [0.00, 0.75, 1.50, '', 3.00, 0.00, 0.75, 1.50, '--', 3.00, 1234.56, 1000,
-                     1001.50, '-', 3000000.000, 0000.00, 1234.56, 1000, 1001.50, 1000.01])

   def test_excel_single_merged_cell(self):
     # An older version of xlrd had a bug where a single cell marked as 'merged' would cause an
@@ -107,11 +102,11 @@ class TestImportXLS(unittest.TestCase):
     self.assertEqual(tables, [{
       'table_name': u'Transaction Report',
       'column_metadata': [
-        {'type': 'Text', 'id': u''},
+        {'type': 'Any', 'id': u''},
         {'type': 'Numeric', 'id': u'Start'},
         {'type': 'Numeric', 'id': u''},
         {'type': 'Numeric', 'id': u''},
-        {'type': 'Text', 'id': u'Seek no easy ways'},
+        {'type': 'Any', 'id': u'Seek no easy ways'},
       ],
       'table_data': [
         [u'SINGLE MERGED', u'The End'],
@@ -133,15 +128,15 @@ class TestImportXLS(unittest.TestCase):
     self.assertEqual(tables, [{
       'table_name': u'Sheet1',
       'column_metadata': [
-        {'id': 'a', 'type': 'Text'},
+        {'id': 'a', 'type': 'Any'},
         {'id': 'b', 'type': 'Date'},
-        {'id': 'c', 'type': 'Text'},
-        {'id': 'd', 'type': 'Text'},
+        {'id': 'c', 'type': 'Any'},
+        {'id': 'd', 'type': 'Any'},
         {'id': 'e', 'type': 'Numeric'},
-        {'id': 'f', 'type': 'Int'},
-        {'id': 'g', 'type': 'Date'},
+        {'id': 'f', 'type': 'Numeric'},
+        {'id': 'g', 'type': 'Any'},
         {'id': 'h', 'type': 'Date'},
-        {'id': 'i', 'type': 'Bool'},
+        {'id': 'i', 'type': 'Numeric'},
       ],
       'table_data': [
         [u'21:14:00'],
@@ -150,9 +145,9 @@ class TestImportXLS(unittest.TestCase):
         [u'10:20:30'],
         [4.180902777777778],
         [20],
-        [-6106060800.0],
+        [u'7/4/1776'],
         [205286400.0],
-        [False], # This is not great either, we should be able to distinguish 0 from FALSE.
+        [0],
       ],
     }])

@@ -7,13 +7,11 @@ dictionary with "type" and "data" fields, where "type" is a Grist type string, a
of values. All "data" lists will have the same length.
|
of values. All "data" lists will have the same length.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from imports import dateguess
|
|
||||||
import datetime
|
import datetime
|
||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
import messytables
|
import messytables
|
||||||
import moment # TODO grist internal libraries might not be available to plugins in the future.
|
import moment # TODO grist internal libraries might not be available to plugins in the future.
|
||||||
import dateutil.parser as date_parser
|
|
||||||
import six
|
import six
|
||||||
from six.moves import zip, xrange
|
from six.moves import zip, xrange
|
||||||
|
|
||||||
@@ -25,12 +23,17 @@ log = logging.getLogger(__name__)


 # Our approach to type detection is different from that of messytables.
-# We first go through each cell in a sample of rows, trying to convert it to each of the basic
+# We first go through each cell in a sample of rows, checking if it's one of the basic
 # types, and keep a count of successes for each. We use the counts to decide the basic types (e.g.
 # numeric vs text). Then we go through the full data set converting to the chosen basic type.
 # During this process, we keep counts of suitable Grist types to consider (e.g. Int vs Numeric).
 # We use those counts to produce the selected Grist type at the end.

+# Previously string values were used here for type guessing and were parsed to typed values.
+# That process now happens elsewhere, and this module only handles the case
+# where the imported data already contains actual numbers or dates.
+# This happens for Excel sheets but not CSV files.


 class BaseConverter(object):
   @classmethod
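To make the comment block above concrete, here is a rough sketch of the two-pass counting approach it describes, using hypothetical names (the real logic lives in ColumnDetector further down in this file):

# Pass 1 (sketch): count how many sampled values each converter accepts,
# then pick the winner; pass 2 converts the full column with that winner.
def pick_converter(sample, converters):
  counts = [0] * len(converters)
  for value in sample:
    for i, converter in enumerate(converters):
      try:
        converter.convert(value)
        counts[i] += 1
      except ValueError:
        pass
  # Highest count wins; ties go to the earlier (more preferred) converter.
  best = max(range(len(converters)), key=lambda i: (counts[i], -i))
  return converters[best]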
@@ -57,50 +60,19 @@ class BaseConverter(object):


 class NumericConverter(BaseConverter):
-  """Handles numeric values, including Grist types Numeric and Int."""
-
-  # A number matching this is probably an identifier of some sort. Converting it to a float will
-  # lose precision, so it's better not to consider it numeric.
-  _unlikely_float = re.compile(r'\d{17}|^0\d')
-
-  # Integers outside this range will be represented as floats. This is the limit for values that can
-  # be stored in a JS Int32Array.
-  _max_js_int = 1<<31
-
-  # The thousands separator. It should be locale-specific, but we don't currently have a way to
-  # detect locale from the data. (Also, the sandbox's locale module isn't fully functional.)
-  _thousands_sep = ','
+  """Handles the Grist Numeric type"""

   @classmethod
   def convert(cls, value):
     if type(value) in six.integer_types + (float, complex):
       return value
-    if type(value) in (str, six.text_type) and not cls._unlikely_float.search(value):
-      return float(value.strip().lstrip('$').replace(cls._thousands_sep, ""))
     raise ValueError()

-  @classmethod
-  def _is_integer(cls, value):
-    ttype = type(value)
-    if ttype == int or (ttype == float and value.is_integer()):
-      return -cls._max_js_int <= value < cls._max_js_int
-    return False
-
   @classmethod
   def get_grist_column(cls, values):
-    if all(cls._is_integer(v) for v in values):
-      return ("Int", [int(v) for v in values])
     return ("Numeric", values)


-class DateParserInfo(date_parser.parserinfo):
-  def validate(self, res):
-    # Avoid this bogus combination which accepts plain numbers.
-    if res.day and not res.month:
-      return False
-    return super(DateParserInfo, self).validate(res)
-
-
 class SimpleDateTimeConverter(BaseConverter):
   """Handles Date and DateTime values which are already instances of datetime.datetime."""

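The effect of the NumericConverter change, sketched as a hypothetical check: values that are already numbers pass through untouched, while strings (which the removed code used to parse, e.g. stripping '$' and thousands separators) are now rejected and left for parsing elsewhere:

assert NumericConverter.convert(5) == 5      # already a number: accepted as-is
assert NumericConverter.convert(1.5) == 1.5
try:
  NumericConverter.convert("$1,000")         # strings are no longer parsed here
except ValueError:
  pass                                       # string parsing now happens elsewhere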
@@ -124,66 +96,18 @@ class SimpleDateTimeConverter(BaseConverter):
     return grist_type, grist_values


-class DateTimeCoverter(BaseConverter):
-  """Handles dateformats by guessed format."""
-
-  def __init__(self, date_format):
-    self._format = date_format
-
-  def convert(self, value):
-    if value == "":
-      return None
-    if type(value) in (str, six.text_type):
-      # datetime.strptime doesn't handle %z and %Z tags in Python 2.
-      if '%z' in self._format or '%Z' in self._format:
-        return date_parser.parse(value)
-      else:
-        try:
-          return datetime.datetime.strptime(value, self._format)
-        except ValueError:
-          return date_parser.parse(value)
-
-    raise ValueError()
-
-  def _is_date(self, value):
-    return value is None or value.time() == datetime.time()
-
-  def get_grist_column(self, values):
-    grist_type = "Date" if all(self._is_date(v) for v in values) else "DateTime"
-    grist_values = [(v if (v is None) else moment.dt_to_ts(v))
-                    for v in values]
-    return grist_type, grist_values
-
-
-class BoolConverter(BaseConverter):
-  """Handles Boolean type."""
-
-  _true_values = (1, '1', 'true', 'yes')
-  _false_values = (0, '0', 'false', 'no')
-
-  @classmethod
-  def convert(cls, value):
-    v = value.strip().lower() if type(value) in (str, six.text_type) else value
-    if v in cls._true_values:
-      return True
-    elif v in cls._false_values:
-      return False
-    raise ValueError()
-
-  @classmethod
-  def get_grist_column(cls, values):
-    return ("Bool", values)
-
-
-class TextConverter(BaseConverter):
-  """Fallback converter that converts everything to strings."""
+class AnyConverter(BaseConverter):
+  """
+  Fallback converter that converts everything to strings.
+  Type guessing and parsing of the strings will happen elsewhere.
+  """
   @classmethod
   def convert(cls, value):
     return six.text_type(value)

   @classmethod
   def get_grist_column(cls, values):
-    return ("Text", values)
+    return ("Any", values)


 class ColumnDetector(object):
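AnyConverter's behavior, shown as a hypothetical usage sketch: every value is stringified and the column is reported as type Any, so the data engine's ValueGuesser can decide later whether the column really holds dates, numbers, or text:

values = [AnyConverter.convert(v) for v in [u'12/22/2015', u'n/a', 3.0]]
assert values == [u'12/22/2015', u'n/a', u'3.0']          # all strings now
assert AnyConverter.get_grist_column(values) == ("Any", values)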
@@ -194,7 +118,7 @@ class ColumnDetector(object):
"""
|
"""
|
||||||
# Converters are listed in the order of preference, which is only used if two converters succeed
|
# Converters are listed in the order of preference, which is only used if two converters succeed
|
||||||
# on the same exact number of values. Text is always a fallback.
|
# on the same exact number of values. Text is always a fallback.
|
||||||
converters = [SimpleDateTimeConverter, BoolConverter, NumericConverter]
|
converters = [SimpleDateTimeConverter, NumericConverter]
|
||||||
|
|
||||||
# If this many non-junk values or more can't be converted, fall back to text.
|
# If this many non-junk values or more can't be converted, fall back to text.
|
||||||
_text_threshold = 0.10
|
_text_threshold = 0.10
|
||||||
@@ -221,19 +145,11 @@ class ColumnDetector(object):
       self._counts[i] += 1

   def get_converter(self):
-    if sum(self._counts) == 0:
-      # if not already guessed as int, bool or datetime then we should try to guess date pattern
-      str_data = [d for d in self._data if isinstance(d, six.string_types)]
-      data_formats = dateguess.guess_bulk(str_data, error_rate=self._text_threshold)
-      data_format = data_formats[0] if data_formats else None
-      if data_format:
-        return DateTimeCoverter(data_format)
-
     # We find the max by count, and secondarily by minimum index in the converters list.
     count, neg_index = max((c, -i) for (i, c) in enumerate(self._counts))
     if count > 0 and count >= self._count_nonjunk * (1 - self._text_threshold):
       return self.converters[-neg_index]
-    return TextConverter
+    return AnyConverter


 def _guess_basic_types(rows, num_columns):
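A worked example of the tuple-max idiom in get_converter above (illustrative only): tuples compare element-wise, so ties on count fall through to the negated index, which makes max() prefer the earliest converter in the preference list:

counts = [7, 7, 2]   # e.g. SimpleDateTimeConverter ties with NumericConverter
count, neg_index = max((c, -i) for (i, c) in enumerate(counts))
assert (count, -neg_index) == (7, 0)   # index 0, the preferred converter, wins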