mirror of
				https://github.com/gristlabs/grist-core.git
				synced 2025-06-13 20:53:59 +00:00 
			
		
		
		
	(core) Lossless imports
Summary: - Removed string parsing and some type guessing code from parse_data.py. That logic is now implicitly done by ValueGuesser by leaving the initial column type as Any. parse_data.py mostly comes into play when importing files (e.g. Excel) containing values that already have types, i.e. numbers and dates. - 0s and 1s are treated as numbers instead of booleans to keep imports lossless. - Removed dateguess.py and test_dateguess.py. - Changed what `guessDateFormat` does when multiple date formats work equally well for the given data, in order to be consistent with the old dateguess.py. - Columns containing numbers are now always imported as Numeric, never Int. - Removed `NullIfEmptyParser` because it was interfering with the new system. Its purpose was to avoid pointlessly changing a column from Any to Text when no actual data was inserted. A different solution to that problem was already added to `_ensure_column_accepts_data` in the data engine in a recent related diff. Test Plan: - Added 2 `nbrowser/Importer2` tests. - Updated various existing tests. - Extended testing of `guessDateFormat`. Added `guessDateFormats` to show how ambiguous dates are handled internally. Reviewers: georgegevoian Reviewed By: georgegevoian Differential Revision: https://phab.getgrist.com/D3302
This commit is contained in:
		
							parent
							
								
									9522438967
								
							
						
					
					
						commit
						321019217d
					
				@ -353,6 +353,7 @@ export class Importer extends DisposableWithEvents {
 | 
			
		||||
        label: field.label(),
 | 
			
		||||
        colId: destTableId ? field.colId() : null, // if inserting into new table, colId isn't defined
 | 
			
		||||
        type: field.column().type(),
 | 
			
		||||
        widgetOptions: field.column().widgetOptions(),
 | 
			
		||||
        formula: field.column().formula()
 | 
			
		||||
      })),
 | 
			
		||||
      sourceCols: sourceFields.map((field) => field.colId())
 | 
			
		||||
 | 
			
		||||
@ -105,7 +105,7 @@ export async function prepTransformColInfo(docModel: DocModel, origCol: ColumnRe
 | 
			
		||||
      let {dateFormat} = prevOptions;
 | 
			
		||||
      if (!dateFormat) {
 | 
			
		||||
        const colValues = tableData.getColValues(sourceCol.colId()) || [];
 | 
			
		||||
        dateFormat = guessDateFormat(colValues.map(String)) || "YYYY-MM-DD";
 | 
			
		||||
        dateFormat = guessDateFormat(colValues.map(String));
 | 
			
		||||
      }
 | 
			
		||||
      widgetOptions = dateTimeWidgetOptions(dateFormat, true);
 | 
			
		||||
      break;
 | 
			
		||||
 | 
			
		||||
@ -49,6 +49,7 @@ export interface TransformColumn {
 | 
			
		||||
  colId: string|null;
 | 
			
		||||
  type: string;
 | 
			
		||||
  formula: string;
 | 
			
		||||
  widgetOptions: string;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
export interface ImportResult {
 | 
			
		||||
 | 
			
		||||
@ -3,7 +3,7 @@ import {ApplyUAResult, QueryFilters} from 'app/common/ActiveDocAPI';
 | 
			
		||||
import {BaseAPI, IOptions} from 'app/common/BaseAPI';
 | 
			
		||||
import {BillingAPI, BillingAPIImpl} from 'app/common/BillingAPI';
 | 
			
		||||
import {BrowserSettings} from 'app/common/BrowserSettings';
 | 
			
		||||
import {BulkColValues, TableColValues, UserAction} from 'app/common/DocActions';
 | 
			
		||||
import {BulkColValues, TableColValues, TableRecordValue, TableRecordValues, UserAction} from 'app/common/DocActions';
 | 
			
		||||
import {DocCreationInfo, OpenDocMode} from 'app/common/DocListAPI';
 | 
			
		||||
import {Features} from 'app/common/Features';
 | 
			
		||||
import {ICustomWidget} from 'app/common/CustomWidget';
 | 
			
		||||
@ -402,6 +402,11 @@ export interface UserAPI {
 | 
			
		||||
  filters?: string;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
interface GetRowsParams {
 | 
			
		||||
  filters?: QueryFilters;
 | 
			
		||||
  immediate?: boolean;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/**
 | 
			
		||||
 * Collect endpoints related to the content of a single document that we've been thinking
 | 
			
		||||
 * of as the (restful) "Doc API".  A few endpoints that could be here are not, for historical
 | 
			
		||||
@ -411,8 +416,8 @@ export interface DocAPI {
 | 
			
		||||
  // Immediate flag is a currently not-advertised feature, allowing a query to proceed without
 | 
			
		||||
  // waiting for a document to be initialized. This is useful if the calculations done when
 | 
			
		||||
  // opening a document are irrelevant.
 | 
			
		||||
  getRows(tableId: string, options?: { filters?: QueryFilters,
 | 
			
		||||
                                       immediate?: boolean }): Promise<TableColValues>;
 | 
			
		||||
  getRows(tableId: string, options?: GetRowsParams): Promise<TableColValues>;
 | 
			
		||||
  getRecords(tableId: string, options?: GetRowsParams): Promise<TableRecordValue[]>;
 | 
			
		||||
  updateRows(tableId: string, changes: TableColValues): Promise<number[]>;
 | 
			
		||||
  addRows(tableId: string, additions: BulkColValues): Promise<number[]>;
 | 
			
		||||
  removeRows(tableId: string, removals: number[]): Promise<number[]>;
 | 
			
		||||
@ -869,16 +874,13 @@ export class DocAPIImpl extends BaseAPI implements DocAPI {
 | 
			
		||||
    this._url = `${url}/api/docs/${docId}`;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  public async getRows(tableId: string, options?: { filters?: QueryFilters,
 | 
			
		||||
                                                    immediate?: boolean }): Promise<TableColValues> {
 | 
			
		||||
    const url = new URL(`${this._url}/tables/${tableId}/data`);
 | 
			
		||||
    if (options?.filters) {
 | 
			
		||||
      url.searchParams.append('filter', JSON.stringify(options.filters));
 | 
			
		||||
    }
 | 
			
		||||
    if (options?.immediate) {
 | 
			
		||||
      url.searchParams.append('immediate', 'true');
 | 
			
		||||
    }
 | 
			
		||||
    return this.requestJson(url.href);
 | 
			
		||||
  public async getRows(tableId: string, options?: GetRowsParams): Promise<TableColValues> {
 | 
			
		||||
    return this._getRecords(tableId, 'data', options);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  public async getRecords(tableId: string, options?: GetRowsParams): Promise<TableRecordValue[]> {
 | 
			
		||||
    const response: TableRecordValues = await this._getRecords(tableId, 'records', options);
 | 
			
		||||
    return response.records;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  public async updateRows(tableId: string, changes: TableColValues): Promise<number[]> {
 | 
			
		||||
@ -967,6 +969,17 @@ export class DocAPIImpl extends BaseAPI implements DocAPI {
 | 
			
		||||
    url.searchParams.append('code', code);
 | 
			
		||||
    return this.requestJson(url.href);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  private _getRecords(tableId: string, endpoint: 'data' | 'records', options?: GetRowsParams): Promise<any> {
 | 
			
		||||
    const url = new URL(`${this._url}/tables/${tableId}/${endpoint}`);
 | 
			
		||||
    if (options?.filters) {
 | 
			
		||||
      url.searchParams.append('filter', JSON.stringify(options.filters));
 | 
			
		||||
    }
 | 
			
		||||
    if (options?.immediate) {
 | 
			
		||||
      url.searchParams.append('immediate', 'true');
 | 
			
		||||
    }
 | 
			
		||||
    return this.requestJson(url.href);
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/**
 | 
			
		||||
 | 
			
		||||
@ -162,7 +162,7 @@ export function guessColInfo(
 | 
			
		||||
      NumberParse.fromSettings(docSettings).guessOptions(values)
 | 
			
		||||
    )
 | 
			
		||||
      .guess(values, docSettings) ||
 | 
			
		||||
    new DateGuesser(guessDateFormat(values, timezone) || "YYYY-MM-DD", timezone)
 | 
			
		||||
    new DateGuesser(guessDateFormat(values, timezone), timezone)
 | 
			
		||||
      .guess(values, docSettings) ||
 | 
			
		||||
    // Don't return the same values back if there's no conversion to be done,
 | 
			
		||||
    // as they have to be serialized and transferred over a pipe to Python.
 | 
			
		||||
 | 
			
		||||
@ -36,18 +36,6 @@ export class ValueParser {
 | 
			
		||||
class IdentityParser extends ValueParser {
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/**
 | 
			
		||||
 * Same as basic Value parser, but will return null if a value is an empty string.
 | 
			
		||||
 */
 | 
			
		||||
class NullIfEmptyParser extends ValueParser {
 | 
			
		||||
  public cleanParse(value: string): any {
 | 
			
		||||
    if (value === "") {
 | 
			
		||||
      return null;
 | 
			
		||||
    }
 | 
			
		||||
    return super.cleanParse(value);
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
export class NumericParser extends ValueParser {
 | 
			
		||||
  private _parse: NumberParse;
 | 
			
		||||
 | 
			
		||||
@ -225,7 +213,6 @@ export class ReferenceListParser extends ReferenceParser {
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
export const valueParserClasses: { [type: string]: typeof ValueParser } = {
 | 
			
		||||
  Any: NullIfEmptyParser,
 | 
			
		||||
  Numeric: NumericParser,
 | 
			
		||||
  Int: NumericParser,
 | 
			
		||||
  Date: DateParser,
 | 
			
		||||
 | 
			
		||||
@ -1,4 +1,5 @@
 | 
			
		||||
import escapeRegExp = require('lodash/escapeRegExp');
 | 
			
		||||
import last = require('lodash/last');
 | 
			
		||||
import memoize = require('lodash/memoize');
 | 
			
		||||
import {getDistinctValues, isObject} from 'app/common/gutil';
 | 
			
		||||
// Simply importing 'moment-guess' inconsistently imports bundle.js or bundle.esm.js depending on environment
 | 
			
		||||
@ -325,7 +326,26 @@ function standardizeTime(timeString: string): { remaining: string, time: string
 | 
			
		||||
  return {remaining: timeString.slice(0, match.index).trim(), time: `${hh}:${mm}:${ss}`};
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
export function guessDateFormat(values: Array<string | null>, timezone: string = 'UTC'): string | null {
 | 
			
		||||
/**
 | 
			
		||||
 * Guesses a full date[time] format that best matches the given strings.
 | 
			
		||||
 * If several formats match equally well, picks the last one lexicographically to match the old date guessing.
 | 
			
		||||
 * This means formats with an early Y and/or M are favoured.
 | 
			
		||||
 * If no formats match, returns the default YYYY-MM-DD.
 | 
			
		||||
 */
 | 
			
		||||
export function guessDateFormat(values: Array<string | null>, timezone: string = 'UTC'): string {
 | 
			
		||||
  const formats = guessDateFormats(values, timezone);
 | 
			
		||||
  if (!formats) {
 | 
			
		||||
    return "YYYY-MM-DD";
 | 
			
		||||
  }
 | 
			
		||||
  return last(formats)!;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/**
 | 
			
		||||
 * Returns all full date[time] formats that best match the given strings.
 | 
			
		||||
 * If several formats match equally well, returns them all.
 | 
			
		||||
 * May return null if there are no matching formats or choosing one is too expensive.
 | 
			
		||||
 */
 | 
			
		||||
export function guessDateFormats(values: Array<string | null>, timezone: string = 'UTC'): string[] | null {
 | 
			
		||||
  const dateStrings: string[] = values.filter(isObject);
 | 
			
		||||
  const sample = getDistinctValues(dateStrings, 100);
 | 
			
		||||
  const formats: Record<string, number> = {};
 | 
			
		||||
@ -358,7 +378,9 @@ export function guessDateFormat(values: Array<string | null>, timezone: string =
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  const maxCount = Math.max(...Object.values(formats));
 | 
			
		||||
  return formatKeys.find(format => formats[format] === maxCount)!;
 | 
			
		||||
  // Return all formats that tied for first place.
 | 
			
		||||
  // Sort lexicographically for consistency in tests and with the old dateguess.py.
 | 
			
		||||
  return formatKeys.filter(format => formats[format] === maxCount).sort();
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
export const dateFormatOptions = [
 | 
			
		||||
 | 
			
		||||
@ -294,7 +294,7 @@ export class ActiveDocImport {
 | 
			
		||||
      const origTableName = table.table_name ? table.table_name : '';
 | 
			
		||||
      const transformRule = transformRuleMap && transformRuleMap.hasOwnProperty(origTableName) ?
 | 
			
		||||
        transformRuleMap[origTableName] : null;
 | 
			
		||||
      const columnMetadata = addLabelsIfPossible(table.column_metadata);
 | 
			
		||||
      const columnMetadata = cleanColumnMetadata(table.column_metadata);
 | 
			
		||||
      const result: ApplyUAResult = await this._activeDoc.applyUserActions(docSession,
 | 
			
		||||
        [["AddTable", hiddenTableName, columnMetadata]]);
 | 
			
		||||
      const retValue: AddTableRetValue = result.retValues[0];
 | 
			
		||||
@ -313,7 +313,9 @@ export class ActiveDocImport {
 | 
			
		||||
      const ruleCanBeApplied = (transformRule != null) &&
 | 
			
		||||
                               _.difference(transformRule.sourceCols, hiddenTableColIds).length === 0;
 | 
			
		||||
      await this._activeDoc.applyUserActions(docSession,
 | 
			
		||||
        [["ReplaceTableData", hiddenTableId, rowIdColumn, columnValues]], {parseStrings: true});
 | 
			
		||||
        // BulkAddRecord rather than ReplaceTableData so that type guessing is applied to Any columns.
 | 
			
		||||
        // Don't use parseStrings, only use the strict parsing in ValueGuesser to make the import lossless.
 | 
			
		||||
        [["BulkAddRecord", hiddenTableId, rowIdColumn, columnValues]]);
 | 
			
		||||
 | 
			
		||||
      // data parsed and put into hiddenTableId
 | 
			
		||||
      // For preview_table (isHidden) do GenImporterView to make views and formulas and cols
 | 
			
		||||
@ -433,14 +435,15 @@ export class ActiveDocImport {
 | 
			
		||||
 | 
			
		||||
    // If destination is a new table, we need to create it.
 | 
			
		||||
    if (intoNewTable) {
 | 
			
		||||
      const colSpecs = destCols.map(({type, colId: id, label}) => ({type, id, label}));
 | 
			
		||||
      const colSpecs = destCols.map(({type, colId: id, label, widgetOptions}) => ({type, id, label, widgetOptions}));
 | 
			
		||||
      const newTable = await this._activeDoc.applyUserActions(docSession, [['AddTable', destTableId, colSpecs]]);
 | 
			
		||||
      destTableId = newTable.retValues[0].table_id;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    await this._activeDoc.applyUserActions(docSession,
 | 
			
		||||
      [['BulkAddRecord', destTableId, gutil.arrayRepeat(hiddenTableData.id.length, null), columnData]],
 | 
			
		||||
      {parseStrings: true});
 | 
			
		||||
      // Don't use parseStrings for new tables to make the import lossless.
 | 
			
		||||
      {parseStrings: !intoNewTable});
 | 
			
		||||
 | 
			
		||||
    return destTableId;
 | 
			
		||||
  }
 | 
			
		||||
@ -586,6 +589,7 @@ export class ActiveDocImport {
 | 
			
		||||
        colId: destTableId ? id as string : null,
 | 
			
		||||
        label: fields.label as string,
 | 
			
		||||
        type: fields.type as string,
 | 
			
		||||
        widgetOptions: fields.widgetOptions as string,
 | 
			
		||||
        formula: srcColIds.includes(id as string) ? `$${id}` :  ''
 | 
			
		||||
      });
 | 
			
		||||
    }
 | 
			
		||||
@ -730,10 +734,21 @@ function getMergeFunction({type}: MergeStrategy): MergeFunction {
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/**
 | 
			
		||||
 * Tweak the column metadata used in the AddTable action.
 | 
			
		||||
 * If `columns` is populated with non-blank column ids, adds labels to all
 | 
			
		||||
 * columns using the values set for the column ids. Otherwise, returns
 | 
			
		||||
 * a copy of columns with no modifications made.
 | 
			
		||||
 * columns using the values set for the column ids.
 | 
			
		||||
 * Ensure that columns of type Any start out as formula columns, i.e. empty columns,
 | 
			
		||||
 * so that type guessing is triggered when new data is added.
 | 
			
		||||
 */
 | 
			
		||||
function addLabelsIfPossible(columns: GristColumn[]) {
 | 
			
		||||
  return columns.map(c => (c.id ? {...c, label: c.id} : c));
 | 
			
		||||
function cleanColumnMetadata(columns: GristColumn[]) {
 | 
			
		||||
  return columns.map(c => {
 | 
			
		||||
    const newCol: any = {...c};
 | 
			
		||||
    if (c.id) {
 | 
			
		||||
      newCol.label = c.id;
 | 
			
		||||
    }
 | 
			
		||||
    if (c.type === "Any") {
 | 
			
		||||
      newCol.isFormula = true;
 | 
			
		||||
    }
 | 
			
		||||
    return newCol;
 | 
			
		||||
  });
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
@ -1,12 +1,11 @@
 | 
			
		||||
from collections import defaultdict, namedtuple
 | 
			
		||||
from collections import namedtuple
 | 
			
		||||
 | 
			
		||||
import six
 | 
			
		||||
from six.moves import zip, xrange
 | 
			
		||||
from six.moves import zip
 | 
			
		||||
 | 
			
		||||
import column
 | 
			
		||||
import identifiers
 | 
			
		||||
 | 
			
		||||
import logger
 | 
			
		||||
 | 
			
		||||
log = logger.Logger(__name__, logger.INFO)
 | 
			
		||||
 | 
			
		||||
# Prefix for transform columns created during imports.
 | 
			
		||||
@ -103,6 +102,7 @@ class ImportActions(object):
 | 
			
		||||
          "label":    c.label,
 | 
			
		||||
          "colId":    c.colId if dest_table_id else None, #should be None if into new table
 | 
			
		||||
          "type":     c.type,
 | 
			
		||||
          "widgetOptions": getattr(c, "widgetOptions", ""),
 | 
			
		||||
          "formula":  ("$" + c.colId) if (c.colId in src_cols) else ''
 | 
			
		||||
        })
 | 
			
		||||
 | 
			
		||||
@ -162,6 +162,7 @@ class ImportActions(object):
 | 
			
		||||
        new_col_spec = {
 | 
			
		||||
          "label": c.label,
 | 
			
		||||
          "type": c.type,
 | 
			
		||||
          "widgetOptions": getattr(c, "widgetOptions", ""),
 | 
			
		||||
          "isFormula": True,
 | 
			
		||||
          "formula": c.formula}
 | 
			
		||||
        result = self._useractions.doAddColumn(hidden_table_id, new_col_id, new_col_spec)
 | 
			
		||||
 | 
			
		||||
@ -1,490 +0,0 @@
 | 
			
		||||
"""This module guesses possible formats of dates which can be parsed using datetime.strptime
 | 
			
		||||
based on samples.
 | 
			
		||||
 | 
			
		||||
dateguesser.guess(sample)
 | 
			
		||||
dateguesser.guess takes a sample date string and returns a set of
 | 
			
		||||
datetime.strftime/strptime-compliant date format strings that will correctly parse.
 | 
			
		||||
 | 
			
		||||
dateguesser.guess_bulk(list_of_samples, error_rate=0)
 | 
			
		||||
dateguesser.guess_bulk takes a list of sample date strings and acceptable error rate
 | 
			
		||||
and returns a list of datetime.strftime/strptime-compliant date format strings
 | 
			
		||||
sorted by error rate that will correctly parse.
 | 
			
		||||
 | 
			
		||||
Algorithm:
 | 
			
		||||
 | 
			
		||||
  1. Tokenize input string into chunks based on character type: digits, alphas, the rest.
 | 
			
		||||
  2. Analyze each token independently in terms of what format codes it could represent
 | 
			
		||||
  3. For given list of tokens generate all permutations of format codes
 | 
			
		||||
  4. While generating permutations, check the validity of each generated format and skip it if invalid.
 | 
			
		||||
  5. Use rules listed below to decide if format is invalid:
 | 
			
		||||
 | 
			
		||||
Invalid format checks:
 | 
			
		||||
 | 
			
		||||
  Rule #1: Year MUST be in the date. Year is the minimum possible parsable date.
 | 
			
		||||
  Rule #2. No holes (missing parts) in the format parts.
 | 
			
		||||
  Rule #3. Time parts are neighbors to each other. No interleaving time with the date.
 | 
			
		||||
  Rule #4. It's highly unlikely that minutes come before hours, millis come before seconds, etc.
 | 
			
		||||
  Rule #5. Pattern can't have some part of date/time defined more than once.
 | 
			
		||||
  Rule #6: Separators between elements of the time group should be the same.
 | 
			
		||||
  Rule #7: If am/pm is in date we assume that 12-hour dates are allowed only. Otherwise it's 24-hour
 | 
			
		||||
  Rule #8: Year can't be between other date elements
 | 
			
		||||
 | 
			
		||||
Note:
 | 
			
		||||
  dateguess doesn't support defaulting to current year because parsing should be deterministic,
 | 
			
		||||
  it's better to fail guessing the format than to guess it incorrectly.
 | 
			
		||||
 | 
			
		||||
Examples:
 | 
			
		||||
  >>> guess('2014/05/05 14:00:00 UTC')
 | 
			
		||||
  set(['%Y/%d/%m %H:%M:%S %Z', '%Y/%m/%d %H:%M:%S %Z'])
 | 
			
		||||
  >>> guess('12/12/12')
 | 
			
		||||
  set(['%y/%m/%d', '%d/%m/%y', '%m/%d/%y', '%y/%d/%m'])
 | 
			
		||||
  >>> guess_bulk(['12-11-2014', '12-25-2014'])
 | 
			
		||||
  ['%m-%d-%Y']
 | 
			
		||||
  >>> guess_bulk(['12-11-2014', '25-25-2014'])
 | 
			
		||||
  []
 | 
			
		||||
  >>> guess_bulk(['12-11-2013', '13-8-2013', '05-25-2013', '12-25-2013'], error_rate=0.5)
 | 
			
		||||
  ['%m-%d-%Y']
 | 
			
		||||
"""
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
import calendar
 | 
			
		||||
import itertools
 | 
			
		||||
import logging
 | 
			
		||||
import re
 | 
			
		||||
from collections import defaultdict
 | 
			
		||||
 | 
			
		||||
from backports.functools_lru_cache import lru_cache
 | 
			
		||||
import moment
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
MONTH_NAME = calendar.month_name
 | 
			
		||||
MONTH_ABBR = calendar.month_abbr
 | 
			
		||||
TZ_VALID_NAMES = {z[0] for z in moment.get_tz_data().items()}
 | 
			
		||||
AM_PM = {'am', 'pm'}
 | 
			
		||||
DAYS_OF_WEEK_NAME = calendar.day_name
 | 
			
		||||
DAYS_OF_WEEK_ABBR = calendar.day_abbr
 | 
			
		||||
ASCII_DIGITS_RE = re.compile(r'^[0-9]+$')
 | 
			
		||||
 | 
			
		||||
# Using x.isdigit() matches strings like u'\xb2' (superscripts) which we don't want.
 | 
			
		||||
# Use isdigit(x) instead, to only match ASCII digits 0-9.
 | 
			
		||||
isdigit = ASCII_DIGITS_RE.match
 | 
			
		||||
 | 
			
		||||
DATE_ELEMENTS = [
 | 
			
		||||
  # Name   Pattern  Predicate               Group (mutual exclusive)  Consumes N prev elements
 | 
			
		||||
  ("Year", "%Y", lambda x, p, v: isdigit(x) and len(x) == 4, "Y", 0),
 | 
			
		||||
  ("Year short", "%y", lambda x, p, v: isdigit(x) and len(x) == 2, "Y", 0),
 | 
			
		||||
  ("Month", "%m", lambda x, p, v: isdigit(x) and len(x) <= 2 and 0 < int(x) <= 12, "m", 0),
 | 
			
		||||
  ("Month name full", "%B", lambda x, p, v: x.isalpha() and x.capitalize() in MONTH_NAME, "m", 0),
 | 
			
		||||
  ("Month name abbr", "%b", lambda x, p, v: x.isalpha() and x.capitalize() in MONTH_ABBR, "m", 0),
 | 
			
		||||
  ("Day", "%d", lambda x, p, v: isdigit(x) and len(x) <= 2 and 0 < int(x) <= 31, "d", 0),
 | 
			
		||||
  ("Day of week", "%A", lambda x, p, v: x.isalpha()
 | 
			
		||||
                                        and x.capitalize() in DAYS_OF_WEEK_NAME, "a", 0),
 | 
			
		||||
  ("Day of week abbr", "%a", lambda x, p, v: x.isalpha()
 | 
			
		||||
                                             and x.capitalize() in DAYS_OF_WEEK_ABBR, "a", 0),
 | 
			
		||||
 | 
			
		||||
  ("Compound HHMMSS", "%H%M%S", lambda x, p, v: isdigit(x) and len(x) == 6
 | 
			
		||||
                                                and 0 <= int(x[0:2]) < 24
 | 
			
		||||
                                                and 0 <= int(x[2:4]) < 60
 | 
			
		||||
                                                and 0 <= int(x[4:6]) < 60, "HMS", 0),
 | 
			
		||||
 | 
			
		||||
  ("Hour", "%H", lambda x, p, v: isdigit(x) and len(x) <= 2 and 0 <= int(x) <= 23, "H", 0),
 | 
			
		||||
  ("Hour in 12hr mode", "%I", lambda x, p, v: isdigit(x) and len(x) <= 2
 | 
			
		||||
                                              and 0 <= int(x) <= 11, "H", 0),
 | 
			
		||||
  ("AM/PM", "%p", lambda x, p, v: x.isalpha() and len(x) == 2 and x.lower() in AM_PM, "p", 0),
 | 
			
		||||
  ("Minutes", "%M", lambda x, p, v: isdigit(x) and len(x) <= 2 and 0 <= int(x) <= 59, "M", 0),
 | 
			
		||||
  ("Seconds", "%S", lambda x, p, v: isdigit(x) and len(x) <= 2 and 0 <= int(x) <= 59, "S", 0),
 | 
			
		||||
  ("Fraction of second", "%f", lambda x, p, v: isdigit(x) and p is not None
 | 
			
		||||
                                               and p.val == '.', "f", 0),
 | 
			
		||||
  ("Timezone name", "%Z", lambda x, p, v: x.isalpha() and len(x) > 2
 | 
			
		||||
                                          and x in TZ_VALID_NAMES, "Z", 0),
 | 
			
		||||
  ("Timezone +HHMM", "%z", lambda x, p, v: isdigit(x) and len(x) == 4 and 0 <= int(x[0:2]) < 15
 | 
			
		||||
                                           and 0 <= int(x[2:4]) < 60 and p is not None
 | 
			
		||||
                                           and p.val == '+', "Z", 1),
 | 
			
		||||
  ("Timezone -HHMM", "%z", lambda x, p, v: isdigit(x) and len(x) == 4 and 0 <= int(x[0:2]) < 15
 | 
			
		||||
                                           and 0 <= int(x[2:4]) < 60 and p is not None
 | 
			
		||||
                                           and p.val == '-', "Z", 1),
 | 
			
		||||
]
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class Token(object):
 | 
			
		||||
  """Represents a part of a date string that's being parsed.
 | 
			
		||||
  Note that __hash__ and __eq__ are overridden in order
 | 
			
		||||
  to compare only meaningful parts of an object.
 | 
			
		||||
  """
 | 
			
		||||
  def __init__(self, val, length):
 | 
			
		||||
    self.val = val
 | 
			
		||||
    self.length = length
 | 
			
		||||
    self.compatible_types = ()
 | 
			
		||||
 | 
			
		||||
  def __hash__(self):
 | 
			
		||||
    h = hash(self.length) + hash(self.compatible_types)
 | 
			
		||||
    if not self.compatible_types:
 | 
			
		||||
      h += hash(self.val)
 | 
			
		||||
    return hash(h)
 | 
			
		||||
 | 
			
		||||
  def __eq__(self, other):
 | 
			
		||||
    """
 | 
			
		||||
    Two tokens are equal when these both are true:
 | 
			
		||||
    a) length and compatible types are equal
 | 
			
		||||
    b) if it is separator (no compatible types), separator values must be equal
 | 
			
		||||
    """
 | 
			
		||||
    if self.length != other.length or self.compatible_types != other.compatible_types:
 | 
			
		||||
      return False
 | 
			
		||||
    if not other.compatible_types and self.val != other.val:
 | 
			
		||||
      return False
 | 
			
		||||
    return True
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def _check_rule_1(pattern, types_used):
 | 
			
		||||
  """Rule #1: Year MUST be in the date. Year is the minimum possible parsable date.
 | 
			
		||||
 | 
			
		||||
  Examples:
 | 
			
		||||
    >>> _check_rule_1('%Y/%m/%d', 'Ymd')
 | 
			
		||||
    True
 | 
			
		||||
    >>> _check_rule_1('%m/%d', 'md')
 | 
			
		||||
    False
 | 
			
		||||
  """
 | 
			
		||||
  if 'Y' not in types_used:
 | 
			
		||||
    logging.debug("Rule #1 is violated for pattern %s. Types used: %s", pattern, types_used)
 | 
			
		||||
    return False
 | 
			
		||||
  return True
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def _check_rule_2(pattern, types_used):
 | 
			
		||||
  """Rule #2: No holes (missing parts) in the format parts.
 | 
			
		||||
 | 
			
		||||
  Examples:
 | 
			
		||||
    >>> _check_rule_2('%Y:%H', 'YH')
 | 
			
		||||
    False
 | 
			
		||||
    >>> _check_rule_2('%Y/%m/%d %H', 'YmdH')
 | 
			
		||||
    True
 | 
			
		||||
  """
 | 
			
		||||
  priorities = 'YmdHMSf'
 | 
			
		||||
  seen_parts = [p in types_used for p in priorities]
 | 
			
		||||
  if sorted(seen_parts, reverse=True) != seen_parts:
 | 
			
		||||
    logging.debug("Rule #2 is violated for pattern %s. Types used: %s", pattern, types_used)
 | 
			
		||||
    return False
 | 
			
		||||
  return True
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def _check_rule_3(pattern, types_used):
 | 
			
		||||
  """Rule #3: Time parts are neighbors to time only. No interleaving time with the date.
 | 
			
		||||
 | 
			
		||||
  Examples:
 | 
			
		||||
    >>> _check_rule_3('%m/%d %H:%M %Y', 'mdHMY')
 | 
			
		||||
    True
 | 
			
		||||
    >>> _check_rule_3('%m/%d %H:%Y:%M', 'mdHYM')
 | 
			
		||||
    False
 | 
			
		||||
  """
 | 
			
		||||
  time_parts = 'HMSf'
 | 
			
		||||
  time_parts_highlighted = [t in time_parts for t in types_used]
 | 
			
		||||
  time_parts_deduplicated = [a[0] for a in itertools.groupby(time_parts_highlighted)]
 | 
			
		||||
  if len(list(filter(lambda x: x, time_parts_deduplicated))) > 1:
 | 
			
		||||
    logging.debug("Rule #3 is violated for pattern %s. Types used: %s", pattern, types_used)
 | 
			
		||||
    return False
 | 
			
		||||
  return True
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def _check_rule_4(pattern, types_used):
 | 
			
		||||
  """Rule #4: It's highly impossible that minutes coming before hours,
 | 
			
		||||
  millis coming before seconds etc.
 | 
			
		||||
 | 
			
		||||
  Examples:
 | 
			
		||||
    >>> _check_rule_4('%H:%M', 'HM')
 | 
			
		||||
    True
 | 
			
		||||
    >>> _check_rule_4('%S:%M', 'SM')
 | 
			
		||||
    False
 | 
			
		||||
  """
 | 
			
		||||
  time_parts_priority = 'HMSf'
 | 
			
		||||
  time_parts_indexes = list(filter(lambda x: x >= 0,
 | 
			
		||||
                                              [time_parts_priority.find(t) for t in types_used]))
 | 
			
		||||
  if sorted(time_parts_indexes) != time_parts_indexes:
 | 
			
		||||
    logging.debug("Rule #4 is violated for pattern %s. Types used: %s", pattern, types_used)
 | 
			
		||||
    return False
 | 
			
		||||
  return True
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def _check_rule_5(pattern, types_used):
 | 
			
		||||
  """Rule #5: Pattern can't have some part of date/time defined more than once.
 | 
			
		||||
 | 
			
		||||
  Examples:
 | 
			
		||||
    >>> _check_rule_5('%Y/%Y', 'YY')
 | 
			
		||||
    False
 | 
			
		||||
    >>> _check_rule_5('%m/%b', 'mm')
 | 
			
		||||
    False
 | 
			
		||||
    >>> _check_rule_5('%Y/%m', 'Ym')
 | 
			
		||||
    True
 | 
			
		||||
  """
 | 
			
		||||
  if len(types_used) != len(set(types_used)):
 | 
			
		||||
    logging.debug("Rule #5 is violated for pattern %s. Types used: %s", pattern, types_used)
 | 
			
		||||
    return False
 | 
			
		||||
  return True
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def _check_rule_6(tokens_chosen, pattern, types_used):
 | 
			
		||||
  """Rule #6: Separators between elements of the time group should be the same.
 | 
			
		||||
 | 
			
		||||
  Examples:
 | 
			
		||||
    _check_rule_6(tokens_chosen_1, '%Y-%m-%dT%H:%M:%S', 'YmdHMS') => True
 | 
			
		||||
    _check_rule_6(tokens_chosen_2, '%Y-%m-%dT%H %M %S', 'YmdHMS') => True
 | 
			
		||||
    _check_rule_6(tokens_chosen_3, '%Y-%m-%dT%H-%M:%S', 'YmdHMS') => False (different separators
 | 
			
		||||
                                                                  ('-' and ':') in time group)
 | 
			
		||||
  """
 | 
			
		||||
  time_parts = 'HMS'
 | 
			
		||||
  num_of_time_parts_used = len(list(filter(lambda x: x in time_parts, types_used)))
 | 
			
		||||
  time_parts_seen = 0
 | 
			
		||||
  separators_seen = []
 | 
			
		||||
  previous_was_a_separator = False
 | 
			
		||||
 | 
			
		||||
  for token in tokens_chosen:
 | 
			
		||||
    if token[1] is not None and token[1][3] in time_parts:
 | 
			
		||||
      # This rule doesn't work for separator-less time group so when we found the type
 | 
			
		||||
      # and it's three letters long (see type "Compound HHMMSS"), then stop iterating
 | 
			
		||||
      if len(token[1][3]) == 3:
 | 
			
		||||
        break
 | 
			
		||||
      # If this is not the first time part and no separator preceded it, record None as the separator
 | 
			
		||||
      if time_parts_seen > 0 and not previous_was_a_separator:
 | 
			
		||||
        separators_seen.append(None)
 | 
			
		||||
      time_parts_seen += 1
 | 
			
		||||
      if time_parts_seen == num_of_time_parts_used:
 | 
			
		||||
        break
 | 
			
		||||
      previous_was_a_separator = False
 | 
			
		||||
    else:
 | 
			
		||||
      if time_parts_seen > 0:
 | 
			
		||||
        separators_seen.append(token[0].val)
 | 
			
		||||
      previous_was_a_separator = True
 | 
			
		||||
 | 
			
		||||
  if len(set(separators_seen)) > 1:
 | 
			
		||||
    logging.debug("Rule #6 is violated for pattern %s. Seen separators: %s",
 | 
			
		||||
                  pattern, separators_seen)
 | 
			
		||||
    return False
 | 
			
		||||
  return True
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def _check_rule_7a(pattern):
 | 
			
		||||
  """Rule #7a: If am/pm is in date we assume that 12-hour dates are allowed only.
 | 
			
		||||
  Otherwise it's 24-hour.
 | 
			
		||||
 | 
			
		||||
  Examples:
 | 
			
		||||
    >>> _check_rule_7a('%Y/%m/%d %H:%M %p')
 | 
			
		||||
    False
 | 
			
		||||
    >>> _check_rule_7a('%Y/%m/%d %I:%M %p')
 | 
			
		||||
    True
 | 
			
		||||
  """
 | 
			
		||||
  if '%p' in pattern and '%H' in pattern:
 | 
			
		||||
    logging.debug("Rule #7a is violated for pattern %s", pattern)
 | 
			
		||||
    return False
 | 
			
		||||
  return True
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def _check_rule_7b(pattern):
 | 
			
		||||
  """Rule #7b: If am/pm is in date we assume that 12-hour dates are allowed only.
 | 
			
		||||
  Otherwise it's 24-hour.
 | 
			
		||||
 | 
			
		||||
  Examples:
 | 
			
		||||
    >>> _check_rule_7b('%Y/%m/%d %I:%M')
 | 
			
		||||
    False
 | 
			
		||||
    >>> _check_rule_7b('%Y/%m/%d %I:%M %p')
 | 
			
		||||
    True
 | 
			
		||||
  """
 | 
			
		||||
  if '%I' in pattern and '%p' not in pattern:
 | 
			
		||||
    logging.debug("Rule #7b is violated for pattern %s", pattern)
 | 
			
		||||
    return False
 | 
			
		||||
  return True
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def _check_rule_8(pattern, types_used):
 | 
			
		||||
  """Rule #9: Year can't be between other date elements
 | 
			
		||||
 | 
			
		||||
  Examples:
 | 
			
		||||
    >>> _check_rule_8('%m/%Y/%d %I:%M', 'mYdIM')
 | 
			
		||||
    False
 | 
			
		||||
  """
 | 
			
		||||
  if 'mYd' in types_used or 'dYm' in types_used:
 | 
			
		||||
    logging.debug("Rule #8 is violated for pattern %s", pattern)
 | 
			
		||||
    return False
 | 
			
		||||
  return True
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def _tokenize_by_character_class(s):
  """Return a list of Token objects obtained by splitting s by character class.

  Runs of digits, single non-word characters and single underscores each become
  their own token; the text between them is kept as tokens too. Every token is
  built as Token(text, len(text)).

  Example:
    >>> t = _tokenize_by_character_class('Thu, May 14th, 2014 1:15 pm +0000')
    >>> [i.val for i in t]
    ['Thu', ',', ' ', 'May', ' ', '14', 'th', ',', ' ', '2014', ' ', '1', ':', '15', ' ', 'pm', ' ', '+', '0000']

    >>> t = _tokenize_by_character_class('5/14/2014')
    >>> [i.val for i in t]
    ['5', '/', '14', '/', '2014']
  """
  tokens = []
  for piece in re.split(r'(\d+)|(\W)|(_)', s):
    # re.split with capturing groups yields None for the groups that did not match and ''
    # between adjacent separators; neither is a real token.
    if piece:
      tokens.append(Token(piece, len(piece)))
  return tokens
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def _sliding_triplets(tokens):
 | 
			
		||||
  for idx, t in enumerate(tokens):
 | 
			
		||||
    yield (t, tokens[idx-1] if idx > 0 else None, tokens[idx+1] if idx < len(tokens)-1 else None)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def _analyze_tokens(tokens):
  """Attach to every token the tuple of DATE_ELEMENTS entries compatible with it.

  Each DATE_ELEMENTS entry carries a predicate at index 2 which is called with the
  token's value and its previous/next neighbours (None at the edges). The entries
  whose predicate returns a truthy value are stored, in DATE_ELEMENTS order, on
  `token.compatible_types`. Tokens are modified in place; nothing is returned.
  """
  for current, before, after in _sliding_triplets(tokens):
    matching = []
    for element in DATE_ELEMENTS:
      if element[2](current.val, before, after):
        matching.append(element)
    current.compatible_types = tuple(matching)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@lru_cache()
def _generate_all_permutations(tokens):
  """Generate all permutations of format codes for the given tuple of tokens.

  Brute-forcing all possible permutations and checking the rules takes most of the
  time of date parsing. Since the input is expected to be highly uniform, memoizing
  this step is expected to be very efficient; `tokens` must therefore be hashable
  (callers pass a tuple).

  Per the original note, Token overrides equality and hashing so that two tokens with
  the same length and the same possible formats are treated as equal, and separators
  must be the same. Token is defined outside this block.

  Returns:
    set: All format strings that passed the rule checks.
  """
  found = set()
  _generate_all_permutations_recursive(tokens, 0, [], "", found, "")
  return found
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def _check_is_pattern_valid_quick_fail_rules(pattern, types_used):
  """Apply the rules that can already be checked on a partially constructed pattern.

  Rules 5, 4 and 7a are evaluated in that order, stopping at the first one that fails.

  Example: duplicates of a date part in a pattern.
  """
  verdict = _check_rule_5(pattern, types_used)
  if verdict:
    verdict = _check_rule_4(pattern, types_used)
  if verdict:
    verdict = _check_rule_7a(pattern)
  return verdict
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def _check_is_pattern_valid_full_pattern_rules(tokens_chosen, pattern, types_used):
  """Apply the rules that only make sense once the whole pattern has been built.

  Rules 1, 2, 3, 6, 7b and 8 are evaluated in that order, stopping at the first one
  that fails.

  Example: existence of Year part in the pattern.
  """
  verdict = _check_rule_1(pattern, types_used)
  if verdict:
    verdict = _check_rule_2(pattern, types_used)
  if verdict:
    verdict = _check_rule_3(pattern, types_used)
  if verdict:
    verdict = _check_rule_6(tokens_chosen, pattern, types_used)
  if verdict:
    verdict = _check_rule_7b(pattern)
  if verdict:
    verdict = _check_rule_8(pattern, types_used)
  return verdict
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def _generate_all_permutations_recursive(tokens, token_idx, tokens_chosen, pattern, found_patterns,
                                         types_used):
  """Walk the tokens depth-first, collecting every format string that passes the rules.

  Args:
    tokens (list[Token]): List of tokens.
    token_idx (int): Index of the token handled by this call.
    tokens_chosen (list[(Token, Token.compatible_type)]): Pairs of token and the
      compatible type picked for it so far (None for separators).
    pattern (str): Format string built so far.
    found_patterns (set): Accumulator; valid complete patterns are added to it.
    types_used (str): Type codes used so far to build the pattern.

  Returns:
    None. Results are reported through `found_patterns`.
  """
  # Prune this branch as soon as the partial pattern breaks a quick-fail rule.
  if not _check_is_pattern_valid_quick_fail_rules(pattern, types_used):
    return

  # All tokens consumed: the pattern is complete, so run the full-pattern rules.
  if token_idx >= len(tokens):
    if _check_is_pattern_valid_full_pattern_rules(tokens_chosen, pattern, types_used):
      found_patterns.add(pattern)
    return

  current = tokens[token_idx]
  next_idx = token_idx + 1

  if not current.compatible_types:
    # if no compatible types it should be separator, add it to the pattern
    _generate_all_permutations_recursive(tokens, next_idx,
                                         tokens_chosen + [(current, None)],
                                         pattern + current.val,
                                         found_patterns, types_used)
    return

  for candidate in current.compatible_types:
    # candidate[4] is how many trailing characters of the pattern this type replaces;
    # zero is special-cased because pattern[:-0] would be the empty string.
    chars_to_drop = candidate[4]
    base = pattern[:-chars_to_drop] if chars_to_drop != 0 else pattern
    _generate_all_permutations_recursive(tokens, next_idx,
                                         tokens_chosen + [(current, candidate)],
                                         base + candidate[1],
                                         found_patterns, types_used + candidate[3])
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def guess(date):
  """Guesses datetime.strftime/strptime-compliant date formats for date string.

  Args:
    date (str): Date string.

  Returns:
    set: Set of datetime.strftime/strptime-compliant date format strings. Empty when
      the string is longer than 150 characters or no format fits.

  Examples:
    >>> guess('2014/05/05 14:00:00 UTC')
    set(['%Y/%d/%m %H:%M:%S %Z', '%Y/%m/%d %H:%M:%S %Z'])
    >>> guess('12/12/12')
    set(['%y/%m/%d', '%d/%m/%y', '%m/%d/%y', '%y/%d/%m'])
  """
  # Only strings of a plausible length are analyzed. Somewhat long strings can still be
  # dates (e.g. "Wednesday, September 16, 2020 A.D. 08:47:02.2667911 AM -06:00", and other
  # languages may be longer), but anything beyond the limit is certainly not a date. The
  # limit also matters because the recursive approach can hit "maximum recursion depth
  # exceeded" on a very long string.
  if len(date) <= 150:
    tokens = _tokenize_by_character_class(date)
    _analyze_tokens(tokens)
    # A tuple is passed because the memoized generator needs a hashable argument.
    return _generate_all_permutations(tuple(tokens))

  return set()
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def guess_bulk(dates, error_rate=0):
  """Guesses datetime.strftime/strptime-compliant date formats for list of the samples.

  With error_rate == 0 a format is returned only if it fits every sample; the order of
  the result is then unspecified. With a non-zero error_rate, a format is returned when
  it fits strictly more than `len(dates) * (1 - error_rate)` samples, and the result is
  ordered by the number of matching samples, highest first.

  Args:
    dates (list): List of samples date strings.
    error_rate (float): Acceptable error rate (default 0.0)

  Returns:
    list: List of datetime.strftime/strptime-compliant date format strings. Empty when
      `dates` is empty or no format qualifies.

  Examples:
    >>> guess_bulk(['12-11-2014', '12-25-2014'])
    ['%m-%d-%Y']
    >>> guess_bulk(['12-11-2014', '25-25-2014'])
    []
    >>> guess_bulk(['12-11-2013', '13-8-2013', '05-25-2013', '12-25-2013'], error_rate=0.5)
    ['%m-%d-%Y']
  """
  if error_rate == 0.0:
    # Strict mode: keep only the formats shared by every sample.
    patterns = None
    for date in dates:
      guessed = guess(date)
      patterns = guessed if patterns is None else patterns.intersection(guessed)
      if not patterns:
        break   # No need to iterate more if zero patterns found
    # `patterns` is still None when `dates` is empty; return "no formats" rather than
    # letting list(None) raise TypeError.
    return list(patterns) if patterns else []

  found_dates = 0
  pattern_counters = defaultdict(int)
  num_dates = len(dates)
  min_num_dates_to_be_found = num_dates - num_dates * error_rate

  for idx, date in enumerate(dates):
    patterns = guess(date)
    if patterns:
      found_dates += 1
    for pattern in patterns:
      pattern_counters[pattern] += 1

    # Early return if number of strings that can't be date is already over error rate
    cells_left = num_dates - idx - 1
    if float(found_dates + cells_left) < min_num_dates_to_be_found:
      return []

  # Sorting (count, pattern) pairs in reverse puts the most frequent formats first; ties
  # are broken by the pattern string, also in reverse order.
  ranked = sorted(((count, pattern) for pattern, count in pattern_counters.items()
                   if count > min_num_dates_to_be_found),
                  reverse=True)
  return [pattern for _, pattern in ranked]
 | 
			
		||||
@ -1,102 +0,0 @@
 | 
			
		||||
import unittest
 | 
			
		||||
from imports.dateguess import guess, guess_bulk
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class TestGuesser(unittest.TestCase):
  """Checks the formats produced by guess() and guess_bulk() for sample inputs."""

  def assertDate(self, input_str, fmt_list):
    """Assert that guess(input_str) yields exactly the formats in fmt_list (order ignored)."""
    self.assertEqual(set(guess(input_str)), set(fmt_list))

  def assertDates(self, input_lst, error_rate, fmt_list):
    """Assert that guess_bulk() yields exactly the formats in fmt_list (order ignored)."""
    self.assertEqual(set(guess_bulk(input_lst, error_rate=error_rate)), set(fmt_list))

  def _assert_all_dates(self, cases):
    """Run assertDate for every (input string, expected formats) pair, in order."""
    for input_str, fmt_list in cases:
      self.assertDate(input_str, fmt_list)

  def test_guess_dates(self):
    self._assert_all_dates([
      # Empty and impossible inputs.
      ('', []),
      ("2013-13-13", []),
      ("25/25/1911", []),

      # ISO-like dates.
      ("2014-01-11", ['%Y-%m-%d', '%Y-%d-%m']),
      ("2014-11-01", ['%Y-%m-%d', '%Y-%d-%m']),
      ("1990-05-05", ['%Y-%m-%d', '%Y-%d-%m']),
      ("2013-12-13", ['%Y-%m-%d']),

      # Slash-separated, four-digit year.
      ("12/31/1999", ['%m/%d/%Y']),
      ("11/11/1911", ['%m/%d/%Y', '%d/%m/%Y']),
      ("5/9/1981", ['%m/%d/%Y', '%d/%m/%Y']),
      ("6/3/1985", ['%m/%d/%Y', '%d/%m/%Y']),

      # Slash-separated, two-digit year.
      ("12/31/99", ['%m/%d/%y']),
      ("11/11/11", ['%y/%m/%d', '%y/%d/%m', '%m/%d/%y', '%d/%m/%y']),
      ("5/9/81", ['%m/%d/%y', '%d/%m/%y']),
      ("6/3/85", ['%m/%d/%y', '%d/%m/%y']),

      # Dot-separated.
      ("31.12.91", ['%d.%m.%y']),
      ("4.4.87", ['%m.%d.%y', '%d.%m.%y']),

      ("13.2.8", ['%y.%m.%d', '%y.%d.%m']),
      ("31.12.1991", ['%d.%m.%Y']),
      ("4.4.1987", ['%m.%d.%Y', '%d.%m.%Y']),
      ("13.2.2008", ['%d.%m.%Y']),
      ("31.12.91", ['%d.%m.%y']),
      ("4.4.87", ['%m.%d.%y', '%d.%m.%y']),
      ("13.2.8", ['%y.%m.%d', '%y.%d.%m']),

      # Month names.
      ("9 May 1981", ['%d %b %Y', '%d %B %Y']),
      ("31 Dec 1999", ['%d %b %Y']),
      ("1 Jan 2012", ['%d %b %Y']),
      ("3 August 2009", ['%d %B %Y']),
      ("2 May 1980", ['%d %B %Y', '%d %b %Y']),

      ("13/1/2012", ['%d/%m/%Y']),

      ("Aug 1st 2014", ['%b %dst %Y']),
      ("12/22/2015 00:00:00.10", ['%m/%d/%Y %H:%M:%S.%f']),
    ])

  def test_guess_datetimes(self):
    self._assert_all_dates([
      ("Thu Sep 25 10:36:28 2003", ['%a %b %d %H:%M:%S %Y']),
      ("Thu Sep 25 2003 10:36:28", ['%a %b %d %Y %H:%M:%S']),
      ("10:36:28 Thu Sep 25 2003", ['%H:%M:%S %a %b %d %Y']),

      ("2014-01-11T12:21:05", ['%Y-%m-%dT%H:%M:%S', '%Y-%d-%mT%H:%M:%S']),
      ("2015-02-16T16:05:31", ['%Y-%m-%dT%H:%M:%S']),
      # TODO remove all except first one
      ("2015-02-16T16:05", ['%Y-%m-%dT%H:%M', '%Y-%H-%MT%d:%m',
                            '%Y-%m-%HT%M:%d', '%Y-%d-%HT%M:%m']),
      ("2015-02-16T16", ['%Y-%m-%dT%H', '%Y-%m-%HT%d']),    #TODO remove second one

      ("Mon Jan 13 9:52:52 am MST 2014", ['%a %b %d %I:%M:%S %p %Z %Y']),
      ("Tue Jan 21 3:30:00 PM EST 2014", ['%a %b %d %I:%M:%S %p %Z %Y']),
      ("Mon Jan 13 09:52:52 MST 2014", ['%a %b %d %H:%M:%S %Z %Y']),
      ("Tue Jan 21 15:30:00 EST 2014", ['%a %b %d %H:%M:%S %Z %Y']),
      ("Mon Jan 13 9:52 am MST 2014", ['%a %b %d %I:%M %p %Z %Y']),
      ("Tue Jan 21 3:30 PM EST 2014", ['%a %b %d %I:%M %p %Z %Y']),

      ("2014-01-11T12:21:05", ['%Y-%m-%dT%H:%M:%S', '%Y-%d-%mT%H:%M:%S']),
      ("2015-02-16T16:05:31", ['%Y-%m-%dT%H:%M:%S']),
      ("Thu Sep 25 10:36:28 2003", ['%a %b %d %H:%M:%S %Y']),
      ("10:36:28 Thu Sep 25 2003", ['%H:%M:%S %a %b %d %Y']),

      # Numeric UTC offsets.
      ("2014-01-11T12:21:05+0000", ['%Y-%d-%mT%H:%M:%S%z', '%Y-%m-%dT%H:%M:%S%z']),
      ("2015-02-16T16:05:31-0400", ['%Y-%m-%dT%H:%M:%S%z']),
      ("Thu, 25 Sep 2003 10:49:41 -0300", ['%a, %d %b %Y %H:%M:%S %z']),
      ("Thu, 25 Sep 2003 10:49:41 +0300", ['%a, %d %b %Y %H:%M:%S %z']),

      ("2003-09-25T10:49:41", ['%Y-%m-%dT%H:%M:%S']),
      ("2003-09-25T10:49", ['%Y-%m-%dT%H:%M']),
    ])

  def test_guess_bulk_dates(self):
    # Each case is (samples, acceptable error rate, expected formats).
    cases = [
      (["11/11/1911", "25/11/1911", "11/11/1911", "11/11/1911"], 0.0, ['%d/%m/%Y']),
      (["25/11/1911", "25/25/1911", "11/11/1911", "11/11/1911"], 0.0, []),
      (["25/11/1911", "25/25/1911", "11/11/1911", "11/11/1911"], 0.5, ['%d/%m/%Y']),

      (["25/11/1911", "25/25/1911", "11/11/1911", "11/11/1911"], 0.1, []),
      (["23/11/1911", '2004 May 12', "11/11/1911", "11/11/1911"], 0.5, ['%d/%m/%Y']),

      (['2004 May 12', "11/11/1911", "11/11/1911", "23/11/1911"], 0.5, ['%d/%m/%Y']),
      (['2004 May 12', "11/11/1911", "11/11/1911", "23/11/1911"], 0.0, []),
      (['12/22/2015', "12/22/2015 1:15pm", "2018-02-27 16:08:39 +0000"], 0.1, []),
    ]
    for input_lst, error_rate, fmt_list in cases:
      self.assertDates(input_lst, error_rate, fmt_list)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Run the test cases in this module when the file is executed directly as a script.
if __name__ == "__main__":
  unittest.main()
 | 
			
		||||
@ -4,8 +4,6 @@ import textwrap
 | 
			
		||||
import unittest
 | 
			
		||||
from six import BytesIO, text_type
 | 
			
		||||
import csv
 | 
			
		||||
import calendar
 | 
			
		||||
import datetime
 | 
			
		||||
 | 
			
		||||
from imports import import_csv
 | 
			
		||||
 | 
			
		||||
@ -22,9 +20,15 @@ def bytes_io_from_str(string):
 | 
			
		||||
 | 
			
		||||
class TestImportCSV(unittest.TestCase):
 | 
			
		||||
 | 
			
		||||
  def _check_col(self, sheet, index, name, typename, values):
 | 
			
		||||
  def _check_col(self, sheet, index, name, _typename, values):
 | 
			
		||||
    self.assertEqual(sheet["column_metadata"][index]["id"], name)
 | 
			
		||||
    self.assertEqual(sheet["column_metadata"][index]["type"], typename)
 | 
			
		||||
    # Previously, strings were parsed and types were guessed in CSV imports.
 | 
			
		||||
    # Now all data is kept as strings and the column type is left as Any
 | 
			
		||||
    # so that type guessing and parsing can happen elsewhere.
 | 
			
		||||
    # To avoid updating 85 calls to _check_col, the typename argument was kept but can be ignored,
 | 
			
		||||
    # and all values are converted back to strings for comparison.
 | 
			
		||||
    self.assertEqual(sheet["column_metadata"][index]["type"], "Any")
 | 
			
		||||
    values = [text_type(v) for v in values]
 | 
			
		||||
    self.assertEqual(sheet["table_data"][index], values)
 | 
			
		||||
 | 
			
		||||
  def _check_num_cols(self, sheet, exp_cols):
 | 
			
		||||
@ -40,18 +44,16 @@ class TestImportCSV(unittest.TestCase):
 | 
			
		||||
    self._check_col(sheet, 1, "int2", "Int", [5, '', ''])
 | 
			
		||||
    self._check_col(sheet, 2, "textint", "Text", ["12345678902345689", '', ''])
 | 
			
		||||
    self._check_col(sheet, 3, "bigint", "Text", ["320150170634561830", '', ''])
 | 
			
		||||
    self._check_col(sheet, 4, "num2", "Numeric", [123456789.123456, '', ''])
 | 
			
		||||
    self._check_col(sheet, 5, "bignum", "Numeric", [7.22597e+86, '', ''])
 | 
			
		||||
    self._check_col(sheet, 4, "num2", "Numeric", ['123456789.1234560000', '', ''])
 | 
			
		||||
    self._check_col(sheet, 5, "bignum", "Numeric", ['7.22597E+86', '', ''])
 | 
			
		||||
    self._check_col(sheet, 6, "date1", "DateTime",
 | 
			
		||||
                    [calendar.timegm(datetime.datetime(2015, 12, 22, 11, 59, 00).timetuple()), None, None])
 | 
			
		||||
                    [u'12/22/15 11:59 AM', u'', u''])
 | 
			
		||||
    self._check_col(sheet, 7, "date2", "Date",
 | 
			
		||||
                    [calendar.timegm(datetime.datetime(2015, 12, 20, 0, 0, 0).timetuple()), None, None])
 | 
			
		||||
                    [u'December 20, 2015', u'', u''])
 | 
			
		||||
    self._check_col(sheet, 8, "datetext", "Date",
 | 
			
		||||
                    [calendar.timegm(datetime.date(2015, 12, 22).timetuple()), None, None])
 | 
			
		||||
                    [u'12/22/2015', u'', u''])
 | 
			
		||||
    self._check_col(sheet, 9, "datetimetext", "DateTime",
 | 
			
		||||
                    [calendar.timegm(datetime.datetime(2015, 12, 22, 0, 0, 0).timetuple()),
 | 
			
		||||
                     calendar.timegm(datetime.datetime(2015, 12, 22, 13, 15, 0).timetuple()),
 | 
			
		||||
                     calendar.timegm(datetime.datetime(2018, 2, 27, 16, 8, 39).timetuple())])
 | 
			
		||||
                    [u'12/22/2015 00:00:00', u'12/22/2015 13:15:00', u'02/27/2018 16:08:39'])
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  def test_user_parse_options(self):
 | 
			
		||||
@ -68,7 +70,11 @@ class TestImportCSV(unittest.TestCase):
 | 
			
		||||
    self._check_col(parsed_file, 2, "PHONE", "Text", ['201-343-3434', '201.343.3434',
 | 
			
		||||
                                                      '2013433434', '(201)343-3434'])
 | 
			
		||||
    self._check_col(parsed_file, 3, "VALUE", "Int", [45, 4545, 0, 4])
 | 
			
		||||
    self._check_col(parsed_file, 4, "DATE", "DateTime", [1519747719.0, 1519744119.0, 1519751319.0, None])
 | 
			
		||||
    self._check_col(parsed_file, 4, "DATE", "DateTime",
 | 
			
		||||
                    [u'2018-02-27 16:08:39 +0000',
 | 
			
		||||
                     u'2018-02-27 16:08:39 +0100',
 | 
			
		||||
                     u'2018-02-27 16:08:39 -0100',
 | 
			
		||||
                     u''])
 | 
			
		||||
 | 
			
		||||
  def test_wrong_cols1(self):
 | 
			
		||||
    file_obj = bytes_io_from_str(textwrap.dedent(
 | 
			
		||||
 | 
			
		||||
@ -16,31 +16,33 @@ class TestImportXLS(unittest.TestCase):
 | 
			
		||||
  def _check_col(self, sheet, index, name, typename, values):
 | 
			
		||||
    self.assertEqual(sheet["column_metadata"][index]["id"], name)
 | 
			
		||||
    self.assertEqual(sheet["column_metadata"][index]["type"], typename)
 | 
			
		||||
    if typename == "Any":
 | 
			
		||||
      # Convert values to strings to reduce changes to tests after imports were overhauled.
 | 
			
		||||
      values = [str(v) for v in values]
 | 
			
		||||
    self.assertEqual(sheet["table_data"][index], values)
 | 
			
		||||
 | 
			
		||||
  def test_excel(self):
 | 
			
		||||
    parsed_file = import_xls.parse_file(*_get_fixture('test_excel.xlsx'))
 | 
			
		||||
 | 
			
		||||
    # check that column type was correctly set to int and values are properly parsed
 | 
			
		||||
    self.assertEqual(parsed_file[1][0]["column_metadata"][0], {"type": "Int", "id": "numbers"})
 | 
			
		||||
    # check that column type was correctly set to numeric and values are properly parsed
 | 
			
		||||
    self.assertEqual(parsed_file[1][0]["column_metadata"][0], {"type": "Numeric", "id": "numbers"})
 | 
			
		||||
    self.assertEqual(parsed_file[1][0]["table_data"][0], [1, 2, 3, 4, 5, 6, 7, 8])
 | 
			
		||||
 | 
			
		||||
    # check that column type was correctly set to text and values are properly parsed
 | 
			
		||||
    self.assertEqual(parsed_file[1][0]["column_metadata"][1], {"type": "Text", "id": "letters"})
 | 
			
		||||
    self.assertEqual(parsed_file[1][0]["column_metadata"][1], {"type": "Any", "id": "letters"})
 | 
			
		||||
    self.assertEqual(parsed_file[1][0]["table_data"][1],
 | 
			
		||||
      ["a", "b", "c", "d", "e", "f", "g", "h"])
 | 
			
		||||
 | 
			
		||||
    # messy tables does not support bool types yet, it classifies them as ints
 | 
			
		||||
    self.assertEqual(parsed_file[1][0]["column_metadata"][2], {"type": "Bool", "id": "boolean"})
 | 
			
		||||
    self.assertEqual(parsed_file[1][False]["table_data"][2],
 | 
			
		||||
      [True, False, True, False, True, False, True, False])
 | 
			
		||||
    # 0s and 1s become Numeric, not boolean like in the past
 | 
			
		||||
    self.assertEqual(parsed_file[1][0]["column_metadata"][2], {"type": "Numeric", "id": "boolean"})
 | 
			
		||||
    self.assertEqual(parsed_file[1][0]["table_data"][2], [1, 0, 1, 0, 1, 0, 1, 0])
 | 
			
		||||
 | 
			
		||||
    # check that column type was correctly set to text and values are properly parsed
 | 
			
		||||
    self.assertEqual(parsed_file[1][0]["column_metadata"][3],
 | 
			
		||||
                     {"type": "Text", "id": "corner-cases"})
 | 
			
		||||
                     {"type": "Any", "id": "corner-cases"})
 | 
			
		||||
    self.assertEqual(parsed_file[1][0]["table_data"][3],
 | 
			
		||||
      # The type is detected as text, so all values should be text.
 | 
			
		||||
      [u'=function()', '3.0', u'two spaces after  ',
 | 
			
		||||
      [u'=function()', u'3.0', u'two spaces after  ',
 | 
			
		||||
        u'  two spaces before', u'!@#$', u'€€€', u'√∫abc$$', u'line\nbreak'])
 | 
			
		||||
 | 
			
		||||
    # check that multiple tables are created when there are multiple sheets in a document
 | 
			
		||||
@ -51,23 +53,19 @@ class TestImportXLS(unittest.TestCase):
 | 
			
		||||
  def test_excel_types(self):
 | 
			
		||||
    parsed_file = import_xls.parse_file(*_get_fixture('test_excel_types.xlsx'))
 | 
			
		||||
    sheet = parsed_file[1][0]
 | 
			
		||||
    self._check_col(sheet, 0, "int1", "Int", [-1234123, '', ''])
 | 
			
		||||
    self._check_col(sheet, 1, "int2", "Int", [5, '', ''])
 | 
			
		||||
    self._check_col(sheet, 2, "textint", "Text", ["12345678902345689", '', ''])
 | 
			
		||||
    self._check_col(sheet, 3, "bigint", "Text", ["320150170634561830", '', ''])
 | 
			
		||||
    self._check_col(sheet, 0, "int1", "Numeric", [-1234123, '', ''])
 | 
			
		||||
    self._check_col(sheet, 1, "int2", "Numeric", [5, '', ''])
 | 
			
		||||
    self._check_col(sheet, 2, "textint", "Any", ["12345678902345689", '', ''])
 | 
			
		||||
    self._check_col(sheet, 3, "bigint", "Any", ["320150170634561830", '', ''])
 | 
			
		||||
    self._check_col(sheet, 4, "num2", "Numeric", [123456789.123456, '', ''])
 | 
			
		||||
    self._check_col(sheet, 5, "bignum", "Numeric", [math.exp(200), '', ''])
 | 
			
		||||
    self._check_col(sheet, 6, "date1", "DateTime",
 | 
			
		||||
             [calendar.timegm(datetime.datetime(2015, 12, 22, 11, 59, 00).timetuple()), None, None])
 | 
			
		||||
    self._check_col(sheet, 7, "date2", "Date",
 | 
			
		||||
             [calendar.timegm(datetime.datetime(2015, 12, 20, 0, 0, 0).timetuple()), None, None])
 | 
			
		||||
    self._check_col(sheet, 8, "datetext", "Date",
 | 
			
		||||
             [calendar.timegm(datetime.date(2015, 12, 22).timetuple()), None, None])
 | 
			
		||||
    # TODO: all dates have different format
 | 
			
		||||
    # self._check_col(sheet, 9, "datetimetext", "DateTime",
 | 
			
		||||
    #          [calendar.timegm(datetime.datetime(2015, 12, 22, 0, 0, 0).timetuple()),
 | 
			
		||||
    #           calendar.timegm(datetime.datetime(2015, 12, 22, 13, 15, 0).timetuple()),
 | 
			
		||||
    #           calendar.timegm(datetime.datetime(2018, 02, 27, 16, 8, 39).timetuple())])
 | 
			
		||||
    self._check_col(sheet, 8, "datetext", "Any", ['12/22/2015', '', ''])
 | 
			
		||||
    self._check_col(sheet, 9, "datetimetext", "Any",
 | 
			
		||||
                    [u'12/22/2015', u'12/22/2015 1:15pm', u'2018-02-27 16:08:39 +0000'])
 | 
			
		||||
 | 
			
		||||
  def test_excel_type_detection(self):
 | 
			
		||||
    # This tests goes over the second sheet of the fixture doc, which has multiple rows that try
 | 
			
		||||
@ -81,23 +79,20 @@ class TestImportXLS(unittest.TestCase):
 | 
			
		||||
                     1454544000.0, 1199577600.0, 1451692800.0, 1451549340.0, 1483214940.0])
 | 
			
		||||
    self._check_col(sheet, 1, "float_not_int", "Numeric",
 | 
			
		||||
                    [1,2,3,4,5,"",6,7,8,9,10,10.25,11,12,13,14,15,16,17,18])
 | 
			
		||||
    self._check_col(sheet, 2, "int_not_bool", "Int",
 | 
			
		||||
    self._check_col(sheet, 2, "int_not_bool", "Any",
 | 
			
		||||
                    [0, 0, 1, 0, 1, 0, 0, 1, 0, 2, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0])
 | 
			
		||||
    self._check_col(sheet, 3, "float_not_bool", "Numeric",
 | 
			
		||||
    self._check_col(sheet, 3, "float_not_bool", "Any",
 | 
			
		||||
                    [0, 0, 1, 0, 1, 0, 0, 1, 0, 0.5, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0])
 | 
			
		||||
    self._check_col(sheet, 4, "text_as_bool", "Bool",
 | 
			
		||||
    self._check_col(sheet, 4, "text_as_bool", "Any",
 | 
			
		||||
                    [0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0])
 | 
			
		||||
    self._check_col(sheet, 5, "int_as_bool", "Bool",
 | 
			
		||||
    self._check_col(sheet, 5, "int_as_bool", "Numeric",
 | 
			
		||||
                    [0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0])
 | 
			
		||||
    self._check_col(sheet, 6, "float_not_date", "Numeric",
 | 
			
		||||
    self._check_col(sheet, 6, "float_not_date", "Any",
 | 
			
		||||
                    [4.0, 6.0, 4.0, 4.0, 6.0, 4.0, '--', 6.0, 4.0, 4.0, 4.0, 4.0, 4.0, 6.0, 6.0,
 | 
			
		||||
                     4.0, 6.0, '3-4', 4.0, 6.5])
 | 
			
		||||
    self._check_col(sheet, 7, "float_not_text", "Numeric",
 | 
			
		||||
                    [-10.25, -8.00, -5.75, -3.50, "n/a", 1.00, "   ???   ", 5.50, "", "-",
 | 
			
		||||
                    [-10.25, -8.00, -5.75, -3.50, "n/a", '  1.  ', "   ???   ", 5.50, "", "-",
 | 
			
		||||
                     12.25, 0.00, "", 0.00, "--", 23.50, "NA", 28.00, 30.25, 32.50])
 | 
			
		||||
    self._check_col(sheet, 8, "dollar_amts", "Numeric",
 | 
			
		||||
                    [0.00, 0.75, 1.50, '', 3.00, 0.00, 0.75, 1.50, '--', 3.00, 1234.56, 1000,
 | 
			
		||||
                     1001.50, '-', 3000000.000, 0000.00, 1234.56, 1000, 1001.50, 1000.01])
 | 
			
		||||
 | 
			
		||||
  def test_excel_single_merged_cell(self):
 | 
			
		||||
    # An older version of xlrd had a bug where a single cell marked as 'merged' would cause an
 | 
			
		||||
@ -107,11 +102,11 @@ class TestImportXLS(unittest.TestCase):
 | 
			
		||||
    self.assertEqual(tables, [{
 | 
			
		||||
      'table_name': u'Transaction Report',
 | 
			
		||||
      'column_metadata': [
 | 
			
		||||
        {'type': 'Text', 'id': u''},
 | 
			
		||||
        {'type': 'Any', 'id': u''},
 | 
			
		||||
        {'type': 'Numeric', 'id': u'Start'},
 | 
			
		||||
        {'type': 'Numeric', 'id': u''},
 | 
			
		||||
        {'type': 'Numeric', 'id': u''},
 | 
			
		||||
        {'type': 'Text', 'id': u'Seek no easy ways'},
 | 
			
		||||
        {'type': 'Any', 'id': u'Seek no easy ways'},
 | 
			
		||||
      ],
 | 
			
		||||
      'table_data': [
 | 
			
		||||
        [u'SINGLE MERGED', u'The End'],
 | 
			
		||||
@ -133,15 +128,15 @@ class TestImportXLS(unittest.TestCase):
 | 
			
		||||
    self.assertEqual(tables, [{
 | 
			
		||||
      'table_name': u'Sheet1',
 | 
			
		||||
      'column_metadata': [
 | 
			
		||||
        {'id': 'a', 'type': 'Text'},
 | 
			
		||||
        {'id': 'a', 'type': 'Any'},
 | 
			
		||||
        {'id': 'b', 'type': 'Date'},
 | 
			
		||||
        {'id': 'c', 'type': 'Text'},
 | 
			
		||||
        {'id': 'd', 'type': 'Text'},
 | 
			
		||||
        {'id': 'c', 'type': 'Any'},
 | 
			
		||||
        {'id': 'd', 'type': 'Any'},
 | 
			
		||||
        {'id': 'e', 'type': 'Numeric'},
 | 
			
		||||
        {'id': 'f', 'type': 'Int'},
 | 
			
		||||
        {'id': 'g', 'type': 'Date'},
 | 
			
		||||
        {'id': 'f', 'type': 'Numeric'},
 | 
			
		||||
        {'id': 'g', 'type': 'Any'},
 | 
			
		||||
        {'id': 'h', 'type': 'Date'},
 | 
			
		||||
        {'id': 'i', 'type': 'Bool'},
 | 
			
		||||
        {'id': 'i', 'type': 'Numeric'},
 | 
			
		||||
      ],
 | 
			
		||||
      'table_data': [
 | 
			
		||||
        [u'21:14:00'],
 | 
			
		||||
@ -150,9 +145,9 @@ class TestImportXLS(unittest.TestCase):
 | 
			
		||||
        [u'10:20:30'],
 | 
			
		||||
        [4.180902777777778],
 | 
			
		||||
        [20],
 | 
			
		||||
        [-6106060800.0],
 | 
			
		||||
        [u'7/4/1776'],
 | 
			
		||||
        [205286400.0],
 | 
			
		||||
        [False],    # This is not great either, we should be able to distinguish 0 from FALSE.
 | 
			
		||||
        [0],
 | 
			
		||||
      ],
 | 
			
		||||
    }])
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -7,13 +7,11 @@ dictionary with "type" and "data" fields, where "type" is a Grist type string, a
 | 
			
		||||
of values. All "data" lists will have the same length.
 | 
			
		||||
"""
 | 
			
		||||
 | 
			
		||||
from imports import dateguess
 | 
			
		||||
import datetime
 | 
			
		||||
import logging
 | 
			
		||||
import re
 | 
			
		||||
import messytables
 | 
			
		||||
import moment # TODO grist internal libraries might not be available to plugins in the future.
 | 
			
		||||
import dateutil.parser as date_parser
 | 
			
		||||
import six
 | 
			
		||||
from six.moves import zip, xrange
 | 
			
		||||
 | 
			
		||||
@ -25,12 +23,17 @@ log = logging.getLogger(__name__)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Our approach to type detection is different from that of messytables.
 | 
			
		||||
# We first go through each cell in a sample of rows, trying to convert it to each of the basic
 | 
			
		||||
# We first go through each cell in a sample of rows, checking if it's one of the basic
 | 
			
		||||
# types, and keep a count of successes for each. We use the counts to decide the basic types (e.g.
 | 
			
		||||
# numeric vs text). Then we go through the full data set converting to the chosen basic type.
 | 
			
		||||
# During this process, we keep counts of suitable Grist types to consider (e.g. Int vs Numeric).
 | 
			
		||||
# We use those counts to produce the selected Grist type at the end.
 | 
			
		||||
 | 
			
		||||
# Previously string values were used here for type guessing and were parsed to typed values.
 | 
			
		||||
# That process now happens elsewhere, and this module only handles the case
 | 
			
		||||
# where the imported data already contains actual numbers or dates.
 | 
			
		||||
# This happens for Excel sheets but not CSV files.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class BaseConverter(object):
 | 
			
		||||
  @classmethod
 | 
			
		||||
@ -57,50 +60,19 @@ class BaseConverter(object):
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class NumericConverter(BaseConverter):
 | 
			
		||||
  """Handles numeric values, including Grist types Numeric and Int."""
 | 
			
		||||
 | 
			
		||||
  # A number matching this is probably an identifier of some sort. Converting it to a float will
 | 
			
		||||
  # lose precision, so it's better not to consider it numeric.
 | 
			
		||||
  _unlikely_float = re.compile(r'\d{17}|^0\d')
 | 
			
		||||
 | 
			
		||||
  # Integers outside this range will be represented as floats. This is the limit for values that can
 | 
			
		||||
  # be stored in a JS Int32Array.
 | 
			
		||||
  _max_js_int = 1<<31
 | 
			
		||||
 | 
			
		||||
  # The thousands separator. It should be locale-specific, but we don't currently have a way to
 | 
			
		||||
  # detect locale from the data. (Also, the sandbox's locale module isn't fully functional.)
 | 
			
		||||
  _thousands_sep = ','
 | 
			
		||||
  """Handles the Grist Numeric type"""
 | 
			
		||||
 | 
			
		||||
  @classmethod
 | 
			
		||||
  def convert(cls, value):
 | 
			
		||||
    if type(value) in six.integer_types + (float, complex):
 | 
			
		||||
      return value
 | 
			
		||||
    if type(value) in (str, six.text_type) and not cls._unlikely_float.search(value):
 | 
			
		||||
      return float(value.strip().lstrip('$').replace(cls._thousands_sep, ""))
 | 
			
		||||
    raise ValueError()
 | 
			
		||||
 | 
			
		||||
  @classmethod
 | 
			
		||||
  def _is_integer(cls, value):
 | 
			
		||||
    ttype = type(value)
 | 
			
		||||
    if ttype == int or (ttype == float and value.is_integer()):
 | 
			
		||||
      return -cls._max_js_int <= value < cls._max_js_int
 | 
			
		||||
    return False
 | 
			
		||||
 | 
			
		||||
  @classmethod
 | 
			
		||||
  def get_grist_column(cls, values):
 | 
			
		||||
    if all(cls._is_integer(v) for v in values):
 | 
			
		||||
      return ("Int", [int(v) for v in values])
 | 
			
		||||
    return ("Numeric", values)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class DateParserInfo(date_parser.parserinfo):
 | 
			
		||||
  def validate(self, res):
 | 
			
		||||
    # Avoid this bogus combination which accepts plain numbers.
 | 
			
		||||
    if res.day and not res.month:
 | 
			
		||||
      return False
 | 
			
		||||
    return super(DateParserInfo, self).validate(res)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class SimpleDateTimeConverter(BaseConverter):
 | 
			
		||||
  """Handles Date and DateTime values which are already instances of datetime.datetime."""
 | 
			
		||||
 | 
			
		||||
@ -124,66 +96,18 @@ class SimpleDateTimeConverter(BaseConverter):
 | 
			
		||||
    return grist_type, grist_values
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class DateTimeCoverter(BaseConverter):
 | 
			
		||||
  """Handles dateformats by guessed format."""
 | 
			
		||||
 | 
			
		||||
  def __init__(self, date_format):
 | 
			
		||||
    self._format = date_format
 | 
			
		||||
 | 
			
		||||
  def convert(self, value):
 | 
			
		||||
    if value == "":
 | 
			
		||||
      return None
 | 
			
		||||
    if type(value) in (str, six.text_type):
 | 
			
		||||
      # datetime.strptime doesn't handle %z and %Z tags in Python 2.
 | 
			
		||||
      if '%z' in self._format or '%Z' in self._format:
 | 
			
		||||
        return date_parser.parse(value)
 | 
			
		||||
      else:
 | 
			
		||||
        try:
 | 
			
		||||
          return datetime.datetime.strptime(value, self._format)
 | 
			
		||||
        except ValueError:
 | 
			
		||||
          return date_parser.parse(value)
 | 
			
		||||
 | 
			
		||||
    raise ValueError()
 | 
			
		||||
 | 
			
		||||
  def _is_date(self, value):
 | 
			
		||||
    return value is None or value.time() == datetime.time()
 | 
			
		||||
 | 
			
		||||
  def get_grist_column(self, values):
 | 
			
		||||
    grist_type = "Date" if all(self._is_date(v) for v in values) else "DateTime"
 | 
			
		||||
    grist_values = [(v if (v is None) else moment.dt_to_ts(v))
 | 
			
		||||
                    for v in values]
 | 
			
		||||
    return grist_type, grist_values
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class BoolConverter(BaseConverter):
 | 
			
		||||
  """Handles Boolean type."""
 | 
			
		||||
 | 
			
		||||
  _true_values = (1, '1', 'true', 'yes')
 | 
			
		||||
  _false_values = (0, '0', 'false', 'no')
 | 
			
		||||
 | 
			
		||||
  @classmethod
 | 
			
		||||
  def convert(cls, value):
 | 
			
		||||
    v = value.strip().lower() if type(value) in (str, six.text_type) else value
 | 
			
		||||
    if v in cls._true_values:
 | 
			
		||||
      return True
 | 
			
		||||
    elif v in cls._false_values:
 | 
			
		||||
      return False
 | 
			
		||||
    raise ValueError()
 | 
			
		||||
 | 
			
		||||
  @classmethod
 | 
			
		||||
  def get_grist_column(cls, values):
 | 
			
		||||
    return ("Bool", values)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class TextConverter(BaseConverter):
 | 
			
		||||
  """Fallback converter that converts everything to strings."""
 | 
			
		||||
class AnyConverter(BaseConverter):
 | 
			
		||||
  """
 | 
			
		||||
  Fallback converter that converts everything to strings.
 | 
			
		||||
  Type guessing and parsing of the strings will happen elsewhere.
 | 
			
		||||
  """
 | 
			
		||||
  @classmethod
 | 
			
		||||
  def convert(cls, value):
 | 
			
		||||
    return six.text_type(value)
 | 
			
		||||
 | 
			
		||||
  @classmethod
 | 
			
		||||
  def get_grist_column(cls, values):
 | 
			
		||||
    return ("Text", values)
 | 
			
		||||
    return ("Any", values)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class ColumnDetector(object):
 | 
			
		||||
@ -194,7 +118,7 @@ class ColumnDetector(object):
 | 
			
		||||
  """
 | 
			
		||||
  # Converters are listed in the order of preference, which is only used if two converters succeed
 | 
			
		||||
  # on the same exact number of values. Text is always a fallback.
 | 
			
		||||
  converters = [SimpleDateTimeConverter, BoolConverter, NumericConverter]
 | 
			
		||||
  converters = [SimpleDateTimeConverter, NumericConverter]
 | 
			
		||||
 | 
			
		||||
  # If this many non-junk values or more can't be converted, fall back to text.
 | 
			
		||||
  _text_threshold = 0.10
 | 
			
		||||
@ -221,19 +145,11 @@ class ColumnDetector(object):
 | 
			
		||||
        self._counts[i] += 1
 | 
			
		||||
 | 
			
		||||
  def get_converter(self):
 | 
			
		||||
    if sum(self._counts) == 0:
 | 
			
		||||
      # if not already guessed as int, bool or datetime then we should try to guess date pattern
 | 
			
		||||
      str_data = [d for d in self._data if isinstance(d, six.string_types)]
 | 
			
		||||
      data_formats = dateguess.guess_bulk(str_data, error_rate=self._text_threshold)
 | 
			
		||||
      data_format = data_formats[0] if data_formats else None
 | 
			
		||||
      if data_format:
 | 
			
		||||
        return DateTimeCoverter(data_format)
 | 
			
		||||
 | 
			
		||||
    # We find the max by count, and secondarily by minimum index in the converters list.
 | 
			
		||||
    count, neg_index = max((c, -i) for (i, c) in enumerate(self._counts))
 | 
			
		||||
    if count > 0 and count >= self._count_nonjunk * (1 - self._text_threshold):
 | 
			
		||||
      return self.converters[-neg_index]
 | 
			
		||||
    return TextConverter
 | 
			
		||||
    return AnyConverter
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def _guess_basic_types(rows, num_columns):
 | 
			
		||||
 | 
			
		||||
		Loading…
	
		Reference in New Issue
	
	Block a user