(core) Move file import plugins into core/sandbox/grist

Summary:
Move all the plugins python code into the main folder with the core code.

Register file importing functions in the same main.py entrypoint as the data engine.

Remove options relating to different entrypoints and code directories. The only remaining plugin-specific option in NSandbox is the import directory/mount, i.e. where files to be parsed are placed.

Test Plan: this

Reviewers: paulfitz

Reviewed By: paulfitz

Subscribers: dsagal

Differential Revision: https://phab.getgrist.com/D2965
This commit is contained in:
Alex Hall
2021-08-09 16:51:43 +02:00
parent 5b92a43849
commit 4d526da58f
26 changed files with 95 additions and 353 deletions

View File

@@ -0,0 +1,479 @@
"""This module guesses possible formats of dates which can be parsed using datetime.strptime
based on samples.
dateguesser.guess(sample)
dateguesser.guess takes a sample date string and returns a set of
datetime.strftime/strptime-compliant date format strings that will correctly parse.
dateguesser.guess_bulk(list_of_samples, error_rate=0)
dateguesser.guess_bulk takes a list of sample date strings and acceptable error rate
and returns a list of datetime.strftime/strptime-compliant date format strings
sorted by error rate that will correctly parse.
Algorithm:
1. Tokenize input string into chunks based on character type: digits, alphas, the rest.
2. Analyze each token independently in terms what format codes could represent
3. For given list of tokens generate all permutations of format codes
4. During generating permutations check for validness of generated format and skip if invalid.
5. Use rules listed below to decide if format is invalid:
Invalid format checks:
Rule #1: Year MUST be in the date. Year is the minimum possible parsable date.
Rule #2. No holes (missing parts) in the format parts.
Rule #3. Time parts are neighbors to each other. No interleaving time with the date.
Rule #4. It's highly unlikely that minutes come before hours, millis before seconds, etc.
Rule #5. Pattern can't have some part of date/time defined more than once.
Rule #6: Separators between elements of the time group should be the same.
Rule #7: If am/pm is in date we assume that 12-hour dates are allowed only. Otherwise it's 24-hour
Rule #8: Year can't be between other date elements
Note:
dateguess doesn't support defaulting to the current year because parsing should be deterministic;
it's better to fail guessing the format than to guess it incorrectly.
Examples:
>>> guess('2014/05/05 14:00:00 UTC')
set(['%Y/%d/%m %H:%M:%S %Z', '%Y/%m/%d %H:%M:%S %Z'])
>>> guess('12/12/12')
set(['%y/%m/%d', '%d/%m/%y', '%m/%d/%y', '%y/%d/%m'])
>>> guess_bulk(['12-11-2014', '12-25-2014'])
['%m-%d-%Y']
>>> guess_bulk(['12-11-2014', '25-25-2014'])
[]
>>> guess_bulk(['12-11-2013', '13-8-2013', '05-25-2013', '12-25-2013'], error_rate=0.5)
['%m-%d-%Y']
"""
import calendar
import itertools
import logging
import re
from collections import defaultdict
from backports.functools_lru_cache import lru_cache
import moment
# Locale-independent month/day name tables from the calendar module.
MONTH_NAME = calendar.month_name
MONTH_ABBR = calendar.month_abbr
# All timezone abbreviations known to moment's tz database (e.g. 'UTC', 'EST').
TZ_VALID_NAMES = {z[0] for z in moment.get_tz_data().items()}
AM_PM = {'am', 'pm'}
DAYS_OF_WEEK_NAME = calendar.day_name
DAYS_OF_WEEK_ABBR = calendar.day_abbr

# Candidate date-format elements. Each entry is a 5-tuple:
#   Name      - human-readable label (unused by the algorithm itself);
#   Pattern   - strftime/strptime format code(s) the token maps to;
#   Predicate - f(token_value, prev_token, next_token) -> bool: may this token be this element?
#   Group     - one-char type key; elements sharing a group are mutually exclusive;
#   Consumes  - number of trailing pattern chars the element consumes when applied
#               (e.g. the '+'/'-' sign preceding %z).
DATE_ELEMENTS = [
  # Name Pattern Predicate Group (mutual exclusive) Consumes N prev elements
  ("Year", "%Y", lambda x, p, v: x.isdigit() and len(x) == 4, "Y", 0),
  ("Year short", "%y", lambda x, p, v: x.isdigit() and len(x) == 2, "Y", 0),
  ("Month", "%m", lambda x, p, v: x.isdigit() and len(x) <= 2 and 0 < int(x) <= 12, "m", 0),
  ("Month name full", "%B", lambda x, p, v: x.isalpha() and x.capitalize() in MONTH_NAME, "m", 0),
  ("Month name abbr", "%b", lambda x, p, v: x.isalpha() and x.capitalize() in MONTH_ABBR, "m", 0),
  ("Day", "%d", lambda x, p, v: x.isdigit() and len(x) <= 2 and 0 < int(x) <= 31, "d", 0),
  ("Day of week", "%A", lambda x, p, v: x.isalpha()
                        and x.capitalize() in DAYS_OF_WEEK_NAME, "a", 0),
  ("Day of week abbr", "%a", lambda x, p, v: x.isalpha()
                             and x.capitalize() in DAYS_OF_WEEK_ABBR, "a", 0),
  ("Compound HHMMSS", "%H%M%S", lambda x, p, v: x.isdigit() and len(x) == 6
                                and 0 <= int(x[0:2]) < 24
                                and 0 <= int(x[2:4]) < 60
                                and 0 <= int(x[4:6]) < 60, "HMS", 0),
  ("Hour", "%H", lambda x, p, v: x.isdigit() and len(x) <= 2 and 0 <= int(x) <= 23, "H", 0),
  ("Hour in 12hr mode", "%I", lambda x, p, v: x.isdigit() and len(x) <= 2
                              and 0 <= int(x) <= 11, "H", 0),
  ("AM/PM", "%p", lambda x, p, v: x.isalpha() and len(x) == 2 and x.lower() in AM_PM, "p", 0),
  ("Minutes", "%M", lambda x, p, v: x.isdigit() and len(x) <= 2 and 0 <= int(x) <= 59, "M", 0),
  ("Seconds", "%S", lambda x, p, v: x.isdigit() and len(x) <= 2 and 0 <= int(x) <= 59, "S", 0),
  # %f is only plausible right after a '.' separator.
  ("Fraction of second", "%f", lambda x, p, v: x.isdigit() and p is not None
                               and p.val == '.', "f", 0),
  ("Timezone name", "%Z", lambda x, p, v: x.isalpha() and len(x) > 2
                          and x in TZ_VALID_NAMES, "Z", 0),
  # The numeric-offset forms consume the preceding '+'/'-' token (Consumes == 1).
  ("Timezone +HHMM", "%z", lambda x, p, v: x.isdigit() and len(x) == 4 and 0 <= int(x[0:2]) < 15
                           and 0 <= int(x[2:4]) < 60 and p is not None
                           and p.val == '+', "Z", 1),
  ("Timezone -HHMM", "%z", lambda x, p, v: x.isdigit() and len(x) == 4 and 0 <= int(x[0:2]) < 15
                           and 0 <= int(x[2:4]) < 60 and p is not None
                           and p.val == '-', "Z", 1),
]
class Token(object):
  """Represents a part of a date string that's being parsed.

  Note that __hash__ and __eq__ are overridden in order
  to compare only meaningful parts of an object: the token length and the
  set of compatible format types, plus the raw value for separator tokens
  (tokens with no compatible types).
  """
  def __init__(self, val, length):
    self.val = val                # Raw text of the token.
    self.length = length          # len(val); cached for hashing/equality.
    self.compatible_types = ()    # Filled in later by _analyze_tokens().

  def __hash__(self):
    h = hash(self.length) + hash(self.compatible_types)
    if not self.compatible_types:
      h += hash(self.val)
    return hash(h)

  def __eq__(self, other):
    """
    Two tokens are equal when these both are true:
      a) length and compatible types are equal
      b) if it is separator (no compatible types), separator values must be equal
    """
    if self.length != other.length or self.compatible_types != other.compatible_types:
      return False
    if not other.compatible_types and self.val != other.val:
      return False
    return True

  def __ne__(self, other):
    # Fix: Python 2 does not derive __ne__ from __eq__, so without this,
    # `a != b` would fall back to identity comparison.
    return not self.__eq__(other)
def _check_rule_1(pattern, types_used):
"""Rule #1: Year MUST be in the date. Year is the minimum possible parsable date.
Examples:
>>> _check_rule_1('%Y/%m/%d', 'Ymd')
True
>>> _check_rule_1('%m/%d', 'md')
False
"""
if 'Y' not in types_used:
logging.debug("Rule #1 is violated for pattern %s. Types used: %s", pattern, types_used)
return False
return True
def _check_rule_2(pattern, types_used):
"""Rule #2: No holes (missing parts) in the format parts.
Examples:
>>> _check_rule_2('%Y:%H', 'YH')
False
>>> _check_rule_2('%Y/%m/%d %H', 'YmdH')
True
"""
priorities = 'YmdHMSf'
seen_parts = [p in types_used for p in priorities]
if sorted(seen_parts, reverse=True) != seen_parts:
logging.debug("Rule #2 is violated for pattern %s. Types used: %s", pattern, types_used)
return False
return True
def _check_rule_3(pattern, types_used):
"""Rule #3: Time parts are neighbors to time only. No interleaving time with the date.
Examples:
>>> _check_rule_3('%m/%d %H:%M %Y', 'mdHMY')
True
>>> _check_rule_3('%m/%d %H:%Y:%M', 'mdHYM')
False
"""
time_parts = 'HMSf'
time_parts_highlighted = [t in time_parts for t in types_used]
time_parts_deduplicated = [a[0] for a in itertools.groupby(time_parts_highlighted)]
if len(list(filter(lambda x: x, time_parts_deduplicated))) > 1:
logging.debug("Rule #3 is violated for pattern %s. Types used: %s", pattern, types_used)
return False
return True
def _check_rule_4(pattern, types_used):
"""Rule #4: It's highly impossible that minutes coming before hours,
millis coming before seconds etc.
Examples:
>>> _check_rule_4('%H:%M', 'HM')
True
>>> _check_rule_4('%S:%M', 'SM')
False
"""
time_parts_priority = 'HMSf'
time_parts_indexes = list(filter(lambda x: x >= 0,
[time_parts_priority.find(t) for t in types_used]))
if sorted(time_parts_indexes) != time_parts_indexes:
logging.debug("Rule #4 is violated for pattern %s. Types used: %s", pattern, types_used)
return False
return True
def _check_rule_5(pattern, types_used):
"""Rule #5: Pattern can't have some part of date/time defined more than once.
Examples:
>>> _check_rule_5('%Y/%Y', 'YY')
False
>>> _check_rule_5('%m/%b', 'mm')
False
>>> _check_rule_5('%Y/%m', 'Ym')
True
"""
if len(types_used) != len(set(types_used)):
logging.debug("Rule #5 is violated for pattern %s. Types used: %s", pattern, types_used)
return False
return True
def _check_rule_6(tokens_chosen, pattern, types_used):
  """Rule #6: Separators between elements of the time group should be the same.

  Examples:
    _check_rule_6(tokens_chosen_1, '%Y-%m-%dT%H:%M:%S', 'YmdHMS') => True
    _check_rule_6(tokens_chosen_2, '%Y-%m-%dT%H %M %S', 'YmdHMS') => True
    _check_rule_6(tokens_chosen_3, '%Y-%m-%dT%H-%M:%S', 'YmdHMS') => False (different separators
                                                                   ('-' and ':') in time group)
  """
  time_parts = 'HMS'
  num_of_time_parts_used = len(list(filter(lambda x: x in time_parts, types_used)))
  time_parts_seen = 0
  separators_seen = []
  previous_was_a_separator = False

  # Each entry of tokens_chosen is (Token, element-or-None); element[3] is the
  # element's type group key.
  for token in tokens_chosen:
    if token[1] is not None and token[1][3] in time_parts:
      # This rule can't apply to a separator-less time group: the "Compound HHMMSS"
      # element is the only one whose group key is three letters, so stop iterating
      # when we encounter it.
      if len(token[1][3]) == 3:
        break
      # Two adjacent time parts with nothing between them count as a None separator,
      # so mixing "no separator" with a real one also fails the rule.
      if time_parts_seen > 0 and not previous_was_a_separator:
        separators_seen.append(None)
      time_parts_seen += 1
      if time_parts_seen == num_of_time_parts_used:
        break
      previous_was_a_separator = False
    else:
      # Only record separators that appear inside the time group (after the first
      # time part has been seen).
      if time_parts_seen > 0:
        separators_seen.append(token[0].val)
      previous_was_a_separator = True

  if len(set(separators_seen)) > 1:
    logging.debug("Rule #6 is violated for pattern %s. Seen separators: %s",
                  pattern, separators_seen)
    return False
  return True
def _check_rule_7a(pattern):
"""Rule #7a: If am/pm is in date we assume that 12-hour dates are allowed only.
Otherwise it's 24-hour.
Examples:
>>> _check_rule_7a('%Y/%m/%d %H:%M %p')
False
>>> _check_rule_7a('%Y/%m/%d %I:%M %p')
True
"""
if '%p' in pattern and '%H' in pattern:
logging.debug("Rule #7a is violated for pattern %s", pattern)
return False
return True
def _check_rule_7b(pattern):
"""Rule #7b: If am/pm is in date we assume that 12-hour dates are allowed only.
Otherwise it's 24-hour.
Examples:
>>> _check_rule_7b('%Y/%m/%d %I:%M')
False
>>> _check_rule_7b('%Y/%m/%d %I:%M %p')
True
"""
if '%I' in pattern and '%p' not in pattern:
logging.debug("Rule #7b is violated for pattern %s", pattern)
return False
return True
def _check_rule_8(pattern, types_used):
"""Rule #9: Year can't be between other date elements
Examples:
>>> _check_rule_8('%m/%Y/%d %I:%M', 'mYdIM')
False
"""
if 'mYd' in types_used or 'dYm' in types_used:
logging.debug("Rule #8 is violated for pattern %s", pattern)
return False
return True
def _tokenize_by_character_class(s):
  """Return a list of Tokens by splitting s by character class: runs of digits,
  runs of word characters, and every other character individually.

  Example:
    >>> t = _tokenize_by_character_class('Thu, May 14th, 2014 1:15 pm +0000')
    >>> [i.val for i in t]
    ['Thu', ',', ' ', 'May', ' ', '14', 'th', ',', ' ', '2014', ' ', '1', ':', '15', ' ', 'pm', ' ', '+', '0000']
    >>> t = _tokenize_by_character_class('5/14/2014')
    >>> [i.val for i in t]
    ['5', '/', '14', '/', '2014']
  """
  # re.split with capturing groups keeps the delimiters; empty chunks are dropped.
  chunks = re.split(r'(\d+)|(\W)|(_)', s)
  return [Token(chunk, len(chunk)) for chunk in chunks if chunk]
def _sliding_triplets(tokens):
for idx, t in enumerate(tokens):
yield (t, tokens[idx-1] if idx > 0 else None, tokens[idx+1] if idx < len(tokens)-1 else None)
def _analyze_tokens(tokens):
  """Annotate each token in place with the tuple of DATE_ELEMENTS it could represent."""
  for token, prev_token, next_token in _sliding_triplets(tokens):
    matches = []
    for element in DATE_ELEMENTS:
      # element[2] is the predicate f(value, prev, next) -> bool.
      if element[2](token.val, prev_token, next_token):
        matches.append(element)
    token.compatible_types = tuple(matches)
@lru_cache()
def _generate_all_permutations(tokens):
  """Generate all permutations of format codes for given list of tokens.

  Brute-forcing of all possible permutations and rules checking eats most of the time of date
  parsing. But since the input is expected to be highly uniform then we can expect that
  memoization of this step will be very efficient.

  Token contains values for date parts but due to overridden eq and hash methods,
  we treat two tokens having the same length and same possible formats as equal
  tokens (and for separators the values must also match), which makes the
  lru_cache hit whenever a structurally identical date string is seen again.

  Args:
    tokens (tuple[Token]): tuple (hashable, as required by lru_cache) of analyzed tokens.

  Returns:
    set: all valid strptime patterns for this token sequence.
  """
  all_patterns = set()
  _generate_all_permutations_recursive(tokens, 0, [], "", all_patterns, "")
  return all_patterns
def _check_is_pattern_valid_quick_fail_rules(pattern, types_used):
  """Apply the rules that can reject a partially constructed pattern early,
  e.g. a date part defined twice."""
  return (_check_rule_5(pattern, types_used)
          and _check_rule_4(pattern, types_used)
          and _check_rule_7a(pattern))
def _check_is_pattern_valid_full_pattern_rules(tokens_chosen, pattern, types_used):
  """Apply the rules that only make sense for a completed pattern,
  e.g. the year part must be present."""
  # all() over a generator of lambdas keeps the original short-circuit order.
  checks = (
      lambda: _check_rule_1(pattern, types_used),
      lambda: _check_rule_2(pattern, types_used),
      lambda: _check_rule_3(pattern, types_used),
      lambda: _check_rule_6(tokens_chosen, pattern, types_used),
      lambda: _check_rule_7b(pattern),
      lambda: _check_rule_8(pattern, types_used),
  )
  return all(check() for check in checks)
def _generate_all_permutations_recursive(tokens, token_idx, tokens_chosen, pattern, found_patterns,
                                         types_used):
  """Generate all format elements permutations recursively.

  Args:
    tokens (list[Token]): List of tokens.
    token_idx (int): Index of token processing this cycle.
    tokens_chosen (list[(Token, Token.compatible_type)]): List of tuples
      containing token and compatible type
    pattern (str): String containing format for parsing
    found_patterns (set): Set of guessed patterns
    types_used (str): String of types used to build pattern.

  Returns:
    None: valid complete patterns are accumulated into found_patterns.
  """
  # Prune the whole branch as soon as the partial pattern violates a quick-fail rule.
  if not _check_is_pattern_valid_quick_fail_rules(pattern, types_used):
    return

  if token_idx < len(tokens):
    t = tokens[token_idx]
    if t.compatible_types:
      for ct in t.compatible_types:
        # ct[1] is the format code, ct[3] the type key, and ct[4] how many trailing
        # pattern chars the element consumes (e.g. the '+'/'-' sign before %z).
        _generate_all_permutations_recursive(tokens, token_idx+1, tokens_chosen[:] + [(t, ct)],
                                             (pattern if ct[4] == 0 else pattern[:-ct[4]]) + ct[1],
                                             found_patterns, types_used + ct[3])
    else:
      # if no compatible types it should be separator, add it to the pattern
      _generate_all_permutations_recursive(tokens, token_idx+1,
                                           tokens_chosen[:] + [(t, None)], pattern + t.val,
                                           found_patterns, types_used)
  else:
    # All tokens consumed: keep the pattern only if it passes the full-pattern rules.
    if _check_is_pattern_valid_full_pattern_rules(tokens_chosen, pattern, types_used):
      found_patterns.add(pattern)
def guess(date):
  """Guesses datetime.strftime/strptime-compliant date formats for date string.

  Args:
    date (str): Date string.

  Returns:
    set: Set of datetime.strftime/strptime-compliant date format strings

  Examples:
    >>> guess('2014/05/05 14:00:00 UTC')
    set(['%Y/%d/%m %H:%M:%S %Z', '%Y/%m/%d %H:%M:%S %Z'])
    >>> guess('12/12/12')
    set(['%y/%m/%d', '%d/%m/%y', '%m/%d/%y', '%y/%d/%m'])
  """
  token_list = _tokenize_by_character_class(date)
  _analyze_tokens(token_list)
  # A tuple is required because _generate_all_permutations is memoized (lru_cache).
  return _generate_all_permutations(tuple(token_list))
def guess_bulk(dates, error_rate=0):
  """Guesses datetime.strftime/strptime-compliant date formats for a list of samples.

  Args:
    dates (list): List of sample date strings.
    error_rate (float): Acceptable fraction of samples that may fail to parse
      (default 0.0, i.e. every sample must parse).

  Returns:
    list: List of datetime.strftime/strptime-compliant date format strings sorted by error rate

  Examples:
    >>> guess_bulk(['12-11-2014', '12-25-2014'])
    ['%m-%d-%Y']
    >>> guess_bulk(['12-11-2014', '25-25-2014'])
    []
    >>> guess_bulk(['12-11-2013', '13-8-2013', '05-25-2013', '12-25-2013'], error_rate=0.5)
    ['%m-%d-%Y']
  """
  if error_rate == 0.0:
    # Strict mode: keep only the patterns that parse every sample.
    patterns = None
    for date in dates:
      guessed_patterns = guess(date)
      patterns = guessed_patterns if patterns is None else patterns & guessed_patterns
      if not patterns:
        break  # No need to iterate more if zero patterns found
    # Fix: `patterns` stays None when `dates` is empty; previously this crashed
    # with TypeError on list(None). Return [] instead.
    return list(patterns) if patterns else []
  else:
    found_dates = 0
    pattern_counters = defaultdict(int)
    num_dates = len(dates)
    min_num_dates_to_be_found = num_dates - num_dates * error_rate

    for idx, date in enumerate(dates):
      patterns = guess(date)
      if patterns:
        found_dates += 1
      for pattern in patterns:
        pattern_counters[pattern] += 1

      # Early return if the number of unparsable strings is already over the error rate.
      cells_left = num_dates - idx - 1
      if float(found_dates + cells_left) < min_num_dates_to_be_found:
        return []

    ranked = [(cnt, pattern) for pattern, cnt in pattern_counters.items()
              if cnt > min_num_dates_to_be_found]
    ranked.sort(reverse=True)
    return [pattern for (_cnt, pattern) in ranked]

Binary file not shown.

Binary file not shown.

View File

@@ -0,0 +1 @@
int1,int2,textint,bigint,num2,bignum,date1,date2,datetext,datetimetext
1 int1 int2 textint bigint num2 bignum date1 date2 datetext datetimetext -1234123 5 12345678902345689 320150170634561830 123456789.1234560000 7.22597E+86 12/22/15 11:59 AM December 20, 2015 12/22/2015 12/22/2015 00:00:00 12/22/2015 13:15:00 02/27/2018 16:08:39

Binary file not shown.

View File

@@ -0,0 +1,5 @@
FIRST_NAME,LAST_NAME,PHONE,VALUE,DATE
John,Moor,201-343-3434,45,2018-02-27 16:08:39 +0000
Tim,Kale,201.343.3434,4545,2018-02-27 16:08:39 +0100
Jenny,Jo,2013433434,0,2018-02-27 16:08:39 -0100
Lily,Smit,(201)343-3434,4,
1 FIRST_NAME LAST_NAME PHONE VALUE DATE
2 John Moor 201-343-3434 45 2018-02-27 16:08:39 +0000
3 Tim Kale 201.343.3434 4545 2018-02-27 16:08:39 +0100
4 Jenny Jo 2013433434 0 2018-02-27 16:08:39 -0100
5 Lily Smit (201)343-3434 4

View File

@@ -0,0 +1,197 @@
"""
Plugin for importing CSV files
"""
import os
import logging
import chardet
import messytables
import six
from six.moves import zip
import parse_data
from imports import import_utils
log = logging.getLogger(__name__)
# Parse options presented in the CSV import dialog. Each entry describes one
# option: its key in parse_options, its UI label, its value type, and whether
# it is shown to the user.
SCHEMA = [
  {
    'name': 'lineterminator',
    'label': 'Line terminator',
    'type': 'string',
    'visible': True,
  },
  {
    'name': 'include_col_names_as_headers',
    'label': 'First row contains headers',
    'type': 'boolean',
    'visible': True,
  },
  {
    'name': 'delimiter',
    'label': 'Field separator',
    'type': 'string',
    'visible': True,
  },
  {
    'name': 'skipinitialspace',
    'label': 'Skip leading whitespace',
    'type': 'boolean',
    'visible': True,
  },
  {
    'name': 'quotechar',
    'label': 'Quote character',
    'type': 'string',
    'visible': True,
  },
  {
    'name': 'doublequote',
    'label': 'Quotes in fields are doubled',
    'type': 'boolean',
    'visible': True,
  },
  {
    'name': 'quoting',
    'label': 'Convert quoted fields',
    'type': 'number',
    'visible': False,     # Not supported by messytables
  },
  {
    'name': 'escapechar',
    'label': 'Escape character',
    'type': 'string',
    'visible': False,     # Not supported by messytables
  },
  {
    'name': 'start_with_row',
    'label': 'Start with row',
    'type': 'number',
    'visible': False,     # Not yet implemented
  },
  {
    'name': 'NUM_ROWS',
    'label': 'Number of rows',
    'type': 'number',
    'visible': False,
  }]
def parse_file_source(file_source, options):
  """Plugin entry point: parse the uploaded file described by `file_source`
  (a dict with a 'path' key, relative to the import directory) and return a
  dict with the parse options used and the parsed tables."""
  path = import_utils.get_path(file_source["path"])
  parsing_options, export_list = parse_file(path, options)
  return {"parseOptions": parsing_options, "tables": export_list}
def parse_file(file_path, parse_options=None):
  """
  Reads a file path and parse options that are passed in using ActiveDoc.importFile()
  and returns a tuple with parsing options (users' or guessed) and an object formatted so that
  it can be used by grist for a bulk add records action.
  """
  with open(file_path, "rb") as f:
    # Normalize a missing parse_options to an empty dict before delegating.
    return _parse_open_file(f, parse_options=parse_options or {})
def _parse_open_file(file_obj, parse_options=None):
  """Parse an open binary CSV file object into (options, export_list).

  `options` echoes the csv dialect actually used (user-supplied values take
  precedence over the dialect messytables guessed) plus the options SCHEMA;
  `export_list` is a list of {table_name, column_metadata, table_data} dicts.

  NOTE(review): despite the `parse_options=None` default, the body calls
  `parse_options.get(...)` unconditionally — callers must pass a dict
  (parse_file() guarantees this).
  """
  options = {}
  csv_keys = ['delimiter', 'quotechar', 'lineterminator', 'doublequote', 'skipinitialspace']
  csv_options = {k: parse_options.get(k) for k in csv_keys}
  if six.PY2:
    # Python 2's csv module requires byte-string dialect parameters.
    csv_options = {k: v.encode('utf8') if isinstance(v, six.text_type) else v
                   for k, v in csv_options.items()}

  table_set = messytables.CSVTableSet(file_obj,
                                      delimiter=csv_options['delimiter'],
                                      quotechar=csv_options['quotechar'],
                                      lineterminator=csv_options['lineterminator'],
                                      doublequote=csv_options['doublequote'],
                                      skipinitialspace=csv_options['skipinitialspace'])

  num_rows = parse_options.get('NUM_ROWS', 0)

  # Messytable's encoding detection uses too small a sample, so we override it here.
  sample = file_obj.read(100000)
  table_set.encoding = chardet.detect(sample)['encoding']
  # In addition, always prefer UTF8 over ASCII.
  if table_set.encoding == 'ascii':
    table_set.encoding = 'utf8'

  export_list = []
  # A table set is a collection of tables:
  for row_set in table_set.tables:
    table_name = None
    sample_rows = list(row_set.sample)
    # Messytables doesn't guess whether headers are present, so we need to step in.
    data_offset, headers = import_utils.headers_guess(sample_rows)

    # Make sure all header values are strings.
    for i, header in enumerate(headers):
      if not isinstance(header, six.string_types):
        headers[i] = six.text_type(header)

    log.info("Guessed data_offset as %s", data_offset)
    log.info("Guessed headers as: %s", headers)

    have_guessed_headers = any(headers)
    include_col_names_as_headers = parse_options.get('include_col_names_as_headers',
                                                     have_guessed_headers)

    if include_col_names_as_headers and not have_guessed_headers:
      # use first line as headers
      data_offset, first_row = import_utils.find_first_non_empty_row(sample_rows)
      headers = import_utils.expand_headers(first_row, data_offset, sample_rows)

    elif not include_col_names_as_headers and have_guessed_headers:
      # move guessed headers to data
      data_offset -= 1
      headers = [''] * len(headers)

    row_set.register_processor(messytables.offset_processor(data_offset))

    table_data_with_types = parse_data.get_table_data(row_set, len(headers), num_rows)

    # Identify and remove empty columns, and populate separate metadata and data lists.
    column_metadata = []
    table_data = []
    for col_data, header in zip(table_data_with_types, headers):
      if not header and all(val == "" for val in col_data["data"]):
        continue  # empty column
      data = col_data.pop("data")
      col_data["id"] = header
      column_metadata.append(col_data)
      table_data.append(data)

    if not table_data:
      # Don't add tables with no columns.
      continue

    guessed = row_set._dialect
    # NOTE(review): 'quoting' is read but never used — messytables doesn't support it.
    quoting = parse_options.get('quoting')
    # Echo back the effective dialect; if the file yields several tables, the
    # options of the last table processed win.
    options = {"delimiter": parse_options.get('delimiter', guessed.delimiter),
               "doublequote": parse_options.get('doublequote', guessed.doublequote),
               "lineterminator": parse_options.get('lineterminator', guessed.lineterminator),
               "quotechar": parse_options.get('quotechar', guessed.quotechar),
               "skipinitialspace": parse_options.get('skipinitialspace', guessed.skipinitialspace),
               "include_col_names_as_headers": include_col_names_as_headers,
               "start_with_row": 1,
               "NUM_ROWS": num_rows,
               "SCHEMA": SCHEMA
               }

    log.info("Output table %r with %d columns", table_name, len(column_metadata))
    for c in column_metadata:
      log.debug("Output column %s", c)
    export_list.append({
      "table_name": table_name,
      "column_metadata": column_metadata,
      "table_data": table_data
    })

  return options, export_list
def get_version():
  """Return name and version of plug-in (stub; currently returns None)."""
  return None

View File

@@ -0,0 +1,257 @@
"""
The import_json module converts json file into a list of grist tables.
It supports data being structured as a list of record, turning each
object into a row and each object's key into a column. For
example:
```
[{'a': 1, 'b': 'tree'}, {'a': 4, 'b': 'flowers'}, ... ]
```
is turned into a table with two columns 'a' of type 'Int' and 'b' of
type 'Text'.
Nested object are stored as references to a distinct table where the
nested object is stored. For example:
```
[{'a': {'b': 4}}, ...]
```
is turned into a column 'a' of type 'Ref:my_import_name.a', and into
another table 'my_import_name.a' with a column 'b' of type
'Int'. (Nested-nested objects are supported as well and the module
assumes no limit to the number of level of nesting you can do.)
Each value which is not an object will be stored into a column with id
'' (empty string). For example:
```
['apple', 'peach', ... ]
```
is turned into a table with an un-named column that stores the values.
Arrays are stored as a list of references to a table where the content
of the array is stored. For example:
```
[{'items': [{'a':'apple'}, {'a':'peach'}]}, {'items': [{'a':'cucumber'}, {'a':'carots'}, ...]}, ...]
```
is turned into a column named 'items' of type
'RefList:my_import_name.items' which points to another table named
'my_import_name.items' which has a column 'a' of type Text.
Data could be structured with an object at the root as well in which
case, the object is considered to represent a single row, and gets
turned into a table with one row.
A column's type is defined by the type of its first value that is not
None (ie: if another value with different type is stored in the same
column, the column's type remains unchanged), 'Text' otherwise.
Usage:
import import_json
# if you have a file to parse
import_json.parse_file(file_path)
# if data is already encoded with python's standard containers (dict and list)
import_json.dumps(data, import_name)
TODO:
- references should map to appropriate column type ie: `Ref:{$colname}` and
`RefList:{$colname}` (which depends on T413).
- Allows user to set the uniqueValues options per table.
- User should be able to choose some objects to be imported as
indexes: for instance:
```
{
'pink lady': {'type': 'apple', 'taste': 'juicy'},
'gala': {'type': 'apple', 'taste': 'tart'},
'comice': {'type': 'pear', 'taste': 'lemon'},
...
}
```
could be mapped to columns 'type', 'taste' and a 3rd that holds the
property 'name'.
"""
import os
import json
from collections import OrderedDict, namedtuple
from itertools import count, chain
import six
from imports import import_utils
# A reference to a row in another table: the target table's name and a 1-based row id.
Ref = namedtuple('Ref', ['table_name', 'rowid'])
# A row being built: its column values, the parent Row (for rows created from
# nested lists), and the Ref by which other rows point at it.
Row = namedtuple('Row', ['values', 'parent', 'ref'])
# A column: its grist type and the list of values, one per row.
Col = namedtuple('Col', ['type', 'values'])

# Maps python value types to grist column types; anything missing falls back to 'Text'.
GRIST_TYPES={
  float: "Numeric",
  bool: "Bool",
}

for typ in six.integer_types:
  GRIST_TYPES[typ] = "Int"

for typ in six.string_types:
  GRIST_TYPES[typ] = "Text"

# Parse options presented in the import dialog.
# NOTE(review): 'seperated' is a typo, but these labels are runtime strings shown
# to the user and possibly persisted; confirm before changing.
SCHEMA = [{
  'name': 'includes',
  'label': 'Includes (list of tables seperated by semicolon)',
  'type': 'string',
  'visible': True
}, {
  'name': 'excludes',
  'label': 'Excludes (list of tables seperated by semicolon)',
  'type': 'string',
  'visible': True
}]

# Defaults merged into user-supplied parse options.
DEFAULT_PARSE_OPTIONS = {
  'includes': '',
  'excludes': '',
  'SCHEMA': SCHEMA
}
def parse_file(file_source, parse_options):
  """Deserialize the JSON file described by `file_source` into a python object
  and dump it into jgrist form.

  Args:
    file_source (dict): {'path': ..., 'origName': ...} for the uploaded file.
    parse_options (dict): user parse options; missing keys are filled in from
      DEFAULT_PARSE_OPTIONS.

  Returns:
    dict: {'tables': ..., 'parseOptions': ...} as produced by dumps().
  """
  path = import_utils.get_path(file_source['path'])
  # The table name comes from the original file name without its extension.
  name = os.path.splitext(file_source['origName'])[0]
  # Fill defaults key-by-key. (Fix: the previous `update(DEFAULT_PARSE_OPTIONS)`
  # clobbered user-supplied 'includes'/'excludes' whenever 'SCHEMA' was absent.)
  for key, value in DEFAULT_PARSE_OPTIONS.items():
    parse_options.setdefault(key, value)
  with open(path, 'r') as json_file:
    data = json.load(json_file)
  return dumps(data, name, parse_options)
def dumps(data, name="", parse_options=None):
  """Serializes `data` to a jgrist formatted object.

  Args:
    data: decoded JSON data (dict, list, or scalar).
    name (str): import name, used as the root table name.
    parse_options (dict): parse options; defaults to a copy of
      DEFAULT_PARSE_OPTIONS. (Fix: the module-level dict itself used to be the
      default AND was returned inside the result, so any caller mutating the
      result's 'parseOptions' would corrupt the shared default.)

  Returns:
    dict: {'tables': [...], 'parseOptions': {...}}
  """
  if parse_options is None:
    parse_options = dict(DEFAULT_PARSE_OPTIONS)
  tables = Tables(parse_options)
  # A bare object at the root represents a single row; wrap it in a list.
  rows = data if isinstance(data, list) else [data]
  for value in rows:
    tables.add_row(name, value)
  return {
    'tables': tables.dumps(),
    'parseOptions': parse_options
  }
class Tables(object):
  """
  Tables maintains the list of tables indexed by their name. Each table
  is a list of rows. A row is a dictionary mapping column ids to values.
  """
  def __init__(self, parse_options):
    self._tables = OrderedDict()
    # Semicolon-separated lists of property-path prefixes to include/exclude.
    self._includes_opt = list(filter(None, parse_options['includes'].split(';')))
    self._excludes_opt = list(filter(None, parse_options['excludes'].split(';')))

  def dumps(self):
    " Dumps tables in jgrist format "
    return [_dump_table(name, rows) for name, rows in six.iteritems(self._tables)]

  def add_row(self, table, value, parent = None):
    """
    Adds a row to `table` and fill it with the content of value, then
    returns a Row whose .ref points to this row. Returns None if the row
    was excluded. Calls itself recursively to add nested objects and
    lists.
    """
    row = None
    if self._is_included(table):
      rows = self._tables.setdefault(table, [])
      # Row ids are 1-based, hence len(rows)+1.
      row = Row(OrderedDict(), parent, Ref(table, len(rows)+1))
      rows.append(row)

    # we need a dictionary to map values to the row's columns
    value = _dictify(value)

    for (k, val) in sorted(six.iteritems(value)):
      if isinstance(val, dict):
        # Nested object: store it in its own child table and keep a Ref to it.
        val = self.add_row(table + '_' + k, val)
        if row and val:
          row.values[k] = val.ref
      elif isinstance(val, list):
        # List: each item becomes a row of a child table pointing back at this row.
        for list_val in val:
          self.add_row(table + '_' + k, list_val, row)
      else:
        # Scalar: store directly, if the column's property path is included.
        if row and self._is_included(table + '_' + k):
          row.values[k] = val
    return row

  def _is_included(self, property_path):
    # Included iff it matches the includes list (or that list is empty) and
    # does not match the excludes list.
    is_included = (any(property_path.startswith(inc) for inc in self._includes_opt)
                   if self._includes_opt else True)
    is_excluded = (any(property_path.startswith(exc) for exc in self._excludes_opt)
                   if self._excludes_opt else False)
    return is_included and not is_excluded
def first_available_key(dictionary, name):
  """
  Returns the first of (name, name2, name3 ...) that is not a key of
  dictionary.
  """
  if name not in dictionary:
    return name
  for suffix in count(2):
    candidate = '{}{}'.format(name, suffix)
    if candidate not in dictionary:
      return candidate
def _dictify(value):
"""
Converts non-dictionary value to a dictionary with a single
empty-string key mapping to the given value. Or returns the value
itself if it's already a dictionary. This is useful to map values to
row's columns.
"""
return value if isinstance(value, dict) else {'': value}
def _dump_table(name, rows):
  "Converts a list of rows into a jgrist table and set 'table_name' to name."
  columns = _transpose([r.values for r in rows])
  # find ref to first parent
  ref = next((r.parent.ref for r in rows if r.parent), None)
  if ref:
    # adds a column to store refs to parents; pick a column id that doesn't
    # collide with an existing column.
    col_id = first_available_key(columns, ref.table_name)
    columns[col_id] = Col(_grist_type(ref),
                          [row.parent.ref if row.parent else None for row in rows])
  return {
    'column_metadata': [{'id': key, 'type': col.type} for (key, col) in six.iteritems(columns)],
    'table_data': [[_dump_value(val) for val in col.values] for col in columns.values()],
    'table_name': name
  }
def _transpose(rows):
  """
  Transposes a collection of row dictionaries (key -> value) into a dictionary
  mapping each key to a Col: the grist type inferred from that key's value in
  the earliest row that defines it, plus the list of values across all rows
  (None where a row lacks the key).
  """
  transpose = OrderedDict()
  values = OrderedDict()
  # Iterate in reverse so that, for each key, `values` ends up holding the value
  # from the EARLIEST row defining it (later update() calls overwrite earlier ones).
  for row in reversed(rows):
    values.update(row)
  for key, val in six.iteritems(values):
    transpose[key] = Col(_grist_type(val), [row.get(key, None) for row in rows])
  return transpose
def _dump_value(value):
  """Serialize a single cell value; Ref objects become their numeric row id."""
  return value.rowid if isinstance(value, Ref) else value
def _grist_type(value):
  """Returns the grist type for value; Refs map to 'Ref:<table>',
  unknown python types fall back to 'Text'."""
  if type(value) == Ref:
    return 'Ref:{}'.format(value.table_name)
  return GRIST_TYPES.get(type(value), 'Text')

View File

@@ -0,0 +1,120 @@
"""
Helper functions for import plugins
"""
import sys
import itertools
import logging
import os
# Include /thirdparty into module search paths, in particular for messytables.
sys.path.append('/thirdparty')
import six
from six.moves import zip
log = logging.getLogger(__name__)
# Get path to an imported file.
def get_path(file_source):
  """Return the full path of an imported file inside the import directory.

  The directory comes from the IMPORTDIR environment variable, falling back to
  '/importdir' when unset or empty.
  """
  base_dir = os.environ.get('IMPORTDIR')
  if not base_dir:
    base_dir = '/importdir'
  return os.path.join(base_dir, file_source)
def capitalize(word):
  """Capitalize the first character in the word (without lowercasing the rest).

  Fix: returns the empty string unchanged instead of raising IndexError on word[0].
  """
  if not word:
    return word
  return word[0].capitalize() + word[1:]
def _is_numeric(text):
  """Return True if text parses as an int, float, or complex number."""
  for converter in six.integer_types + (float, complex):
    try:
      converter(text)
    except (ValueError, OverflowError):
      continue
    return True
  return False
def _is_header(header, data_rows):
  """
  Returns whether header can be considered a legitimate header for data_rows.

  A row qualifies as a header when every cell holds non-numeric text and no
  cell's value repeats in the same column of the data rows.
  """
  # See if the row has any non-text values.
  for cell in header:
    if not isinstance(cell.value, six.string_types) or _is_numeric(cell.value):
      return False

  # If it's all text, see if the values in the first row repeat in other rows. That's uncommon for
  # a header. (Fix: removed an unused `count_repeats` list that was built but never read.)
  for row in data_rows:
    for cell, header_cell in zip(row, header):
      if cell.value and cell.value == header_cell.value:
        return False
  return True
def _count_nonempty(row):
"""
Returns the count of cells in row, ignoring trailing empty cells.
"""
count = 0
for i, c in enumerate(row):
if not c.empty:
count = i + 1
return count
def find_first_non_empty_row(rows):
  """
  Return (data_offset, header) for the first row containing a non-empty cell,
  or (0, []) when every row is empty.
  """
  for index, current_row in enumerate(rows):
    if _count_nonempty(current_row) > 0:
      # Data starts on the line after this candidate header row.
      return index + 1, current_row
  # No non-empty rows.
  return 0, []
def expand_headers(headers, data_offset, rows):
  """
  Return stripped header values, padded with empty strings so that there is one
  entry for every column present in the sampled data rows.
  """
  data_sample = itertools.islice(rows, data_offset, None)
  width = max(itertools.chain([len(headers)],
                              (_count_nonempty(r) for r in data_sample)))
  padding = [u''] * (width - len(headers))
  return [h.value.strip() for h in headers] + padding
def headers_guess(rows):
  """
  Our own smarter version of messytables.headers_guess, which also guesses as to whether one of
  the first rows is in fact a header. Returns (data_offset, headers) where data_offset is the
  index of the first line of data, and headers is the list of guessed headers (which will contain
  empty strings if the file had no headers).
  """
  # Messytables guesses at the length of data rows, and then assumes that the first row that has
  # close to that many non-empty fields is the header, where by "close" it means 1 less.
  #
  # For Grist, it's better to mistake headers for data than to mistake data for headers. Note that
  # there is csv.Sniffer().has_header(), which tries to be clever, but it's messes up too much.
  #
  # We only consider for the header the first row with non-empty cells. It is a header if
  # - it has no non-text fields
  # - none of the fields have a value that repeats in that column of data

  # Locate the first row that has any content at all.
  data_offset, header = find_first_non_empty_row(rows)
  if not header:
    return data_offset, header

  # Demote the candidate row back to data if it doesn't look like a real header.
  if not _is_header(header, itertools.islice(rows, data_offset, None)):
    data_offset -= 1
    header = []

  # Pad the header so it covers every column seen in the sample.
  return data_offset, expand_headers(header, data_offset, rows)

View File

@@ -0,0 +1,150 @@
"""
This module reads a file path that is passed in using ActiveDoc.importFile()
and returns a object formatted so that it can be used by grist for a bulk add records action
"""
import csv
import logging
import os
import chardet
import messytables
import messytables.excel
import six
from six.moves import zip
import parse_data
from imports import import_utils
log = logging.getLogger(__name__)
def import_file(file_source, parse_options):
  """Parse the imported file described by file_source; return parse options and tables."""
  full_path = import_utils.get_path(file_source["path"])
  parse_options, tables = parse_file(full_path, file_source["origName"], parse_options)
  return {"parseOptions": parse_options, "tables": tables}
# messytable is painfully un-extensible, so we have to jump through dumb hoops to override any
# behavior.
# Here we patch the CSVRowSet._dialect property so that tab-delimited files always use the
# standard excel_tab dialect instead of whatever the sniffer produced.
orig_dialect = messytables.CSVRowSet._dialect
def override_dialect(self):
  # NOTE(review): assumes self.delimiter is populated before _dialect is read -- confirm
  # against the messytables CSVRowSet implementation.
  if self.delimiter == '\t':
    return csv.excel_tab
  # Defer to the original property getter for all other delimiters.
  return orig_dialect.fget(self)
messytables.CSVRowSet._dialect = property(override_dialect)
def parse_file(file_path, orig_name, parse_options=None, table_name_hint=None, num_rows=None):
  """
  Parse the file at file_path, delegating to parse_open_file.

  Returns (parse_options, tables). The parse_options and num_rows parameters are
  accepted for interface compatibility but are currently unused.
  """
  # pylint: disable=unused-argument
  with open(file_path, "rb") as f:
    try:
      return parse_open_file(f, orig_name, table_name_hint=table_name_hint)
    except Exception as e:
      # Log the full error, but simplify the thrown error to omit the unhelpful extra args.
      log.info("import_xls parse_file failed: %s", e)
      if six.PY2 and e.args and isinstance(e.args[0], six.string_types):
        # Python 2 only: re-raise with just the message so extra args don't leak upward.
        raise Exception(e.args[0])
      raise
def parse_open_file(file_obj, orig_name, table_name_hint=None):
  """
  Parse an open file object (CSV or Excel) using messytables.

  Returns (parse_options, export_list) where parse_options is currently always an
  empty dict, and export_list holds one entry per non-empty table with keys
  "table_name", "column_metadata" and "table_data".
  """
  file_root, file_ext = os.path.splitext(orig_name)
  table_set = messytables.any.any_tableset(file_obj, extension=file_ext, auto_detect=False)

  # Messytable's encoding detection uses too small a sample, so we override it here.
  if isinstance(table_set, messytables.CSVTableSet):
    sample = file_obj.read(100000)
    table_set.encoding = chardet.detect(sample)['encoding']
    # In addition, always prefer UTF8 over ASCII.
    if table_set.encoding == 'ascii':
      table_set.encoding = 'utf8'

  export_list = []
  # A table set is a collection of tables:
  for row_set in table_set.tables:
    table_name = row_set.name

    if isinstance(row_set, messytables.CSVRowSet):
      # For csv files, we can do better for table_name by using the filename.
      # Fix: os.path.splitext returns str on Python 3, where the previous
      # unconditional .decode('utf8') raised AttributeError; only decode bytes.
      if isinstance(file_root, six.binary_type):
        file_root = file_root.decode('utf8')
      table_name = import_utils.capitalize(table_name_hint or
                                           os.path.basename(file_root))

      # Messytables doesn't guess whether headers are present, so we need to step in.
      data_offset, headers = import_utils.headers_guess(list(row_set.sample))
    else:
      # Let messytables guess header names and the offset of the header.
      offset, headers = messytables.headers_guess(row_set.sample)
      data_offset = offset + 1  # Add the header line

    # Make sure all header values are strings.
    for i, header in enumerate(headers):
      if not isinstance(header, six.string_types):
        headers[i] = six.text_type(header)

    log.debug("Guessed data_offset as %s", data_offset)
    log.debug("Guessed headers as: %s", headers)

    row_set.register_processor(messytables.offset_processor(data_offset))

    table_data_with_types = parse_data.get_table_data(row_set, len(headers))

    # Identify and remove empty columns, and populate separate metadata and data lists.
    column_metadata = []
    table_data = []
    for col_data, header in zip(table_data_with_types, headers):
      if not header and all(val == "" for val in col_data["data"]):
        continue   # empty column
      data = col_data.pop("data")
      col_data["id"] = header
      column_metadata.append(col_data)
      table_data.append(data)

    if not table_data:
      # Don't add tables with no columns.
      continue

    log.info("Output table %r with %d columns", table_name, len(column_metadata))
    for c in column_metadata:
      log.debug("Output column %s", c)
    export_list.append({
      "table_name": table_name,
      "column_metadata": column_metadata,
      "table_data": table_data
    })

  parse_options = {}
  return parse_options, export_list
# This change was initially introduced in https://phab.getgrist.com/D2145
# Monkey-patching done in https://phab.getgrist.com/D2965
# to move towards normal dependency management
@staticmethod
def from_xlrdcell(xlrd_cell, sheet, col, row):
  """Replacement for messytables.excel.XLSCell.from_xlrdcell that tolerates bad dates.

  Identical to the upstream implementation except that date-conversion failures are
  swallowed, leaving the raw numeric cell value in place.
  """
  from messytables.excel import (
    XLS_TYPES, StringType, DateType, InvalidDateError, xlrd, time, datetime, XLSCell
  )
  value = xlrd_cell.value
  cell_type = XLS_TYPES.get(xlrd_cell.ctype, StringType())
  if cell_type == DateType(None):
    # Try-catch added by Dmitry, to avoid failing even if we see a date we can't handle.
    try:
      # Zero means a missing/invalid date in xlrd's encoding.
      if value == 0:
        raise InvalidDateError
      year, month, day, hour, minute, second = \
          xlrd.xldate_as_tuple(value, sheet.book.datemode)
      if (year, month, day) == (0, 0, 0):
        # Date part absent: this is a pure time-of-day value.
        value = time(hour, minute, second)
      else:
        value = datetime(year, month, day, hour, minute, second)
    except Exception:
      # Keep going, and we'll just interpret the date as a number.
      pass
  messy_cell = XLSCell(value, type=cell_type)
  messy_cell.sheet = sheet
  messy_cell.xlrd_cell = xlrd_cell
  messy_cell.xlrd_pos = (row, col)  # necessary for properties, note not (x,y)
  return messy_cell
messytables.excel.XLSCell.from_xlrdcell = from_xlrdcell

View File

@@ -1,60 +0,0 @@
"""This module loads a file_importer that implements the Grist import
API, and calls its selected method passing argument received from
PluginManager.sandboxImporter(). It returns an object formatted so
that it can be used by Grist.
"""
import sys
import argparse
import logging
import imp
import json
import marshal
log = logging.getLogger(__name__)
# Include /thirdparty into module search paths, in particular for messytables.
# pylint: disable=wrong-import-position
sys.path.append('/thirdparty')
def marshal_data(export_list):
  """Serialize export_list to bytes using marshal format version 2."""
  return marshal.dumps(export_list, 2)
def main():
  """Command-line entry point: load a compiled import-plugin module and invoke one
  of its actions on the input file, writing the marshalled result to stdout."""
  parser = argparse.ArgumentParser()
  parser.add_argument('-d', '--debug', action='store_true',
                      help="Print debug instead of producing normal binary output")
  parser.add_argument('-t', '--table',
                      help="Suggested table name to use with CSV imports")
  parser.add_argument('-n', '--plugin-name', required=True,
                      help="Name of a python module implementing the import API.")
  parser.add_argument('-p', '--plugin-path',
                      help="Location of the module.")
  parser.add_argument('--action-options',
                      help="Options to pass to the action. See API documentation.")
  parser.add_argument('action', help='Action to call',
                      choices=['can_parse', 'parse_file'])
  parser.add_argument('input', help='File to convert')
  args = parser.parse_args()
  # Route log output through a handler with timestamps; verbosity depends on --debug.
  s = logging.StreamHandler()
  s.setFormatter(logging.Formatter(fmt='%(asctime)s.%(msecs)03d %(message)s',
                                   datefmt='%Y-%m-%d %H:%M:%S'))
  rootLogger = logging.getLogger()
  rootLogger.addHandler(s)
  rootLogger.setLevel(logging.DEBUG if args.debug else logging.INFO)
  # NOTE(review): imp.load_compiled expects a compiled (.pyc) module and the imp
  # module is deprecated -- presumably this ran under Python 2; confirm before reuse.
  import_plugin = imp.load_compiled(
    args.plugin_name,
    args.plugin_path)
  options = {}
  if args.action_options:
    options = json.loads(args.action_options)
  # Dispatch to the requested action ('can_parse' or 'parse_file') on the plugin.
  parsed_data = getattr(import_plugin, args.action)(args.input, **options)
  marshalled_data = marshal_data(parsed_data)
  log.info("Marshalled data has %d bytes", len(marshalled_data))
  if not args.debug:
    # NOTE(review): writes marshalled bytes to sys.stdout, which assumes a binary
    # stdout (Python 2 behavior) -- would need sys.stdout.buffer on Python 3.
    sys.stdout.write(marshalled_data)

View File

@@ -0,0 +1,18 @@
def register_import_parsers(sandbox):
  """Register the CSV, Excel and JSON file-import parse functions with sandbox.

  The actual parser modules are imported lazily inside each wrapper, so that
  registering the functions stays cheap until a parse is actually requested.
  """
  def parse_csv(file_source, options):
    from imports.import_csv import parse_file_source
    return parse_file_source(file_source, options)

  def parse_excel(file_source, parse_options):
    from imports.import_xls import import_file
    return import_file(file_source, parse_options)

  def parse_json(file_source, parse_options):
    from imports.import_json import parse_file
    return parse_file(file_source, parse_options)

  sandbox.register("csv_parser.parseFile", parse_csv)
  sandbox.register("xls_parser.parseFile", parse_excel)
  sandbox.register("json_parser.parseFile", parse_json)

View File

@@ -0,0 +1,102 @@
import unittest
from imports.dateguess import guess, guess_bulk
class TestGuesser(unittest.TestCase):
  """Unit tests for imports.dateguess.

  Each assertion feeds a sample date string (or list of samples) to the guesser and
  checks the exact set of strftime/strptime format strings it proposes.
  """

  def assertDate(self, input_str, fmt_list):
    # Single-sample guess: the set of candidate formats must match exactly (order-free).
    guessed = guess(input_str)
    self.assertEqual(set(guessed), set(fmt_list))

  def assertDates(self, input_lst, error_rate, fmt_list):
    # Bulk guess: error_rate is the fraction of non-conforming samples tolerated.
    guessed = guess_bulk(input_lst, error_rate=error_rate)
    self.assertEqual(set(guessed), set(fmt_list))

  def test_guess_dates(self):
    # Empty or impossible dates produce no candidates.
    self.assertDate('', [])
    self.assertDate("2013-13-13", [])
    self.assertDate("25/25/1911", [])
    # Ambiguous day/month positions yield every plausible format.
    self.assertDate("2014-01-11", ['%Y-%m-%d', '%Y-%d-%m'])
    self.assertDate("2014-11-01", ['%Y-%m-%d', '%Y-%d-%m'])
    self.assertDate("1990-05-05", ['%Y-%m-%d', '%Y-%d-%m'])
    self.assertDate("2013-12-13", ['%Y-%m-%d'])
    self.assertDate("12/31/1999", ['%m/%d/%Y'])
    self.assertDate("11/11/1911", ['%m/%d/%Y', '%d/%m/%Y'])
    self.assertDate("5/9/1981", ['%m/%d/%Y', '%d/%m/%Y'])
    self.assertDate("6/3/1985", ['%m/%d/%Y', '%d/%m/%Y'])
    self.assertDate("12/31/99", ['%m/%d/%y'])
    self.assertDate("11/11/11", ['%y/%m/%d', '%y/%d/%m', '%m/%d/%y', '%d/%m/%y'])
    self.assertDate("5/9/81", ['%m/%d/%y', '%d/%m/%y'])
    self.assertDate("6/3/85", ['%m/%d/%y', '%d/%m/%y'])
    self.assertDate("31.12.91", ['%d.%m.%y'])
    self.assertDate("4.4.87", ['%m.%d.%y', '%d.%m.%y'])
    self.assertDate("13.2.8", ['%y.%m.%d', '%y.%d.%m'])
    self.assertDate("31.12.1991", ['%d.%m.%Y'])
    self.assertDate("4.4.1987", ['%m.%d.%Y', '%d.%m.%Y'])
    self.assertDate("13.2.2008", ['%d.%m.%Y'])
    self.assertDate("31.12.91", ['%d.%m.%y'])
    self.assertDate("4.4.87", ['%m.%d.%y', '%d.%m.%y'])
    self.assertDate("13.2.8", ['%y.%m.%d', '%y.%d.%m'])
    # Textual month names (abbreviated and full).
    self.assertDate("9 May 1981", ['%d %b %Y', '%d %B %Y'])
    self.assertDate("31 Dec 1999", ['%d %b %Y'])
    self.assertDate("1 Jan 2012", ['%d %b %Y'])
    self.assertDate("3 August 2009", ['%d %B %Y'])
    self.assertDate("2 May 1980", ['%d %B %Y', '%d %b %Y'])
    self.assertDate("13/1/2012", ['%d/%m/%Y'])
    self.assertDate("Aug 1st 2014", ['%b %dst %Y'])
    self.assertDate("12/22/2015 00:00:00.10", ['%m/%d/%Y %H:%M:%S.%f'])

  def test_guess_datetimes(self):
    self.assertDate("Thu Sep 25 10:36:28 2003", ['%a %b %d %H:%M:%S %Y'])
    self.assertDate("Thu Sep 25 2003 10:36:28", ['%a %b %d %Y %H:%M:%S'])
    self.assertDate("10:36:28 Thu Sep 25 2003", ['%H:%M:%S %a %b %d %Y'])
    self.assertDate("2014-01-11T12:21:05", ['%Y-%m-%dT%H:%M:%S', '%Y-%d-%mT%H:%M:%S'])
    self.assertDate("2015-02-16T16:05:31", ['%Y-%m-%dT%H:%M:%S'])
    # TODO remove all except first one
    self.assertDate("2015-02-16T16:05", ['%Y-%m-%dT%H:%M', '%Y-%H-%MT%d:%m',
                                         '%Y-%m-%HT%M:%d', '%Y-%d-%HT%M:%m'])
    self.assertDate("2015-02-16T16", ['%Y-%m-%dT%H', '%Y-%m-%HT%d']) #TODO remove second one
    self.assertDate("Mon Jan 13 9:52:52 am MST 2014", ['%a %b %d %I:%M:%S %p %Z %Y'])
    self.assertDate("Tue Jan 21 3:30:00 PM EST 2014", ['%a %b %d %I:%M:%S %p %Z %Y'])
    self.assertDate("Mon Jan 13 09:52:52 MST 2014", ['%a %b %d %H:%M:%S %Z %Y'])
    self.assertDate("Tue Jan 21 15:30:00 EST 2014", ['%a %b %d %H:%M:%S %Z %Y'])
    self.assertDate("Mon Jan 13 9:52 am MST 2014", ['%a %b %d %I:%M %p %Z %Y'])
    self.assertDate("Tue Jan 21 3:30 PM EST 2014", ['%a %b %d %I:%M %p %Z %Y'])
    self.assertDate("2014-01-11T12:21:05", ['%Y-%m-%dT%H:%M:%S', '%Y-%d-%mT%H:%M:%S'])
    self.assertDate("2015-02-16T16:05:31", ['%Y-%m-%dT%H:%M:%S'])
    self.assertDate("Thu Sep 25 10:36:28 2003", ['%a %b %d %H:%M:%S %Y'])
    self.assertDate("10:36:28 Thu Sep 25 2003", ['%H:%M:%S %a %b %d %Y'])
    # Timezone offsets.
    self.assertDate("2014-01-11T12:21:05+0000", ['%Y-%d-%mT%H:%M:%S%z', '%Y-%m-%dT%H:%M:%S%z'])
    self.assertDate("2015-02-16T16:05:31-0400", ['%Y-%m-%dT%H:%M:%S%z'])
    self.assertDate("Thu, 25 Sep 2003 10:49:41 -0300", ['%a, %d %b %Y %H:%M:%S %z'])
    self.assertDate("Thu, 25 Sep 2003 10:49:41 +0300", ['%a, %d %b %Y %H:%M:%S %z'])
    self.assertDate("2003-09-25T10:49:41", ['%Y-%m-%dT%H:%M:%S'])
    self.assertDate("2003-09-25T10:49", ['%Y-%m-%dT%H:%M'])

  def test_guess_bulk_dates(self):
    # With error_rate 0 every sample must fit the resulting format.
    self.assertDates(["11/11/1911", "25/11/1911", "11/11/1911", "11/11/1911"], 0.0, ['%d/%m/%Y'])
    self.assertDates(["25/11/1911", "25/25/1911", "11/11/1911", "11/11/1911"], 0.0, [])
    self.assertDates(["25/11/1911", "25/25/1911", "11/11/1911", "11/11/1911"], 0.5, ['%d/%m/%Y'])
    self.assertDates(["25/11/1911", "25/25/1911", "11/11/1911", "11/11/1911"], 0.1, [])
    self.assertDates(["23/11/1911", '2004 May 12', "11/11/1911", "11/11/1911"], 0.5, ['%d/%m/%Y'])
    self.assertDates(['2004 May 12', "11/11/1911", "11/11/1911", "23/11/1911"], 0.5, ['%d/%m/%Y'])
    self.assertDates(['2004 May 12', "11/11/1911", "11/11/1911", "23/11/1911"], 0.0, [])
    self.assertDates(['12/22/2015', "12/22/2015 1:15pm", "2018-02-27 16:08:39 +0000"], 0.1, [])
# Allow running this test module directly.
if __name__ == "__main__":
  unittest.main()

View File

@@ -0,0 +1,333 @@
# This Python file uses the following encoding: utf-8
import os
import textwrap
import unittest
from six import BytesIO, text_type
import csv
import calendar
import datetime
from imports import import_csv
def _get_fixture(filename):
return os.path.join(os.path.dirname(__file__), "fixtures", filename)
def bytes_io_from_str(string):
  """Return a BytesIO over string, encoding text input as UTF-8 first."""
  raw = string.encode("utf8") if isinstance(string, text_type) else string
  return BytesIO(raw)
class TestImportCSV(unittest.TestCase):
  """Tests for imports.import_csv: type guessing, header detection, ragged rows
  and user-supplied parse options."""

  def _check_col(self, sheet, index, name, typename, values):
    # Verify one output column's id, guessed grist type, and data.
    self.assertEqual(sheet["column_metadata"][index]["id"], name)
    self.assertEqual(sheet["column_metadata"][index]["type"], typename)
    self.assertEqual(sheet["table_data"][index], values)

  def _check_num_cols(self, sheet, exp_cols):
    self.assertEqual(len(sheet["column_metadata"]), exp_cols)
    self.assertEqual(len(sheet["table_data"]), exp_cols)

  def test_csv_types(self):
    # Columns of various numeric/date flavors should map to the expected grist types.
    parsed_file = import_csv.parse_file(_get_fixture('test_excel_types.csv'), parse_options='')
    sheet = parsed_file[1][0]
    self._check_col(sheet, 0, "int1", "Int", [-1234123, '', ''])
    self._check_col(sheet, 1, "int2", "Int", [5, '', ''])
    self._check_col(sheet, 2, "textint", "Text", ["12345678902345689", '', ''])
    self._check_col(sheet, 3, "bigint", "Text", ["320150170634561830", '', ''])
    self._check_col(sheet, 4, "num2", "Numeric", [123456789.123456, '', ''])
    self._check_col(sheet, 5, "bignum", "Numeric", [7.22597e+86, '', ''])
    self._check_col(sheet, 6, "date1", "DateTime",
                    [calendar.timegm(datetime.datetime(2015, 12, 22, 11, 59, 00).timetuple()), None, None])
    self._check_col(sheet, 7, "date2", "Date",
                    [calendar.timegm(datetime.datetime(2015, 12, 20, 0, 0, 0).timetuple()), None, None])
    self._check_col(sheet, 8, "datetext", "Date",
                    [calendar.timegm(datetime.date(2015, 12, 22).timetuple()), None, None])
    self._check_col(sheet, 9, "datetimetext", "DateTime",
                    [calendar.timegm(datetime.datetime(2015, 12, 22, 0, 0, 0).timetuple()),
                     calendar.timegm(datetime.datetime(2015, 12, 22, 13, 15, 0).timetuple()),
                     calendar.timegm(datetime.datetime(2018, 2, 27, 16, 8, 39).timetuple())])

  def test_user_parse_options(self):
    # Options as supplied by the UI should be honored when parsing.
    options = {u'parse_options': {"escapechar": None, "include_col_names_as_headers": True,
                                  "lineterminator": "\n", "skipinitialspace": False,
                                  "limit_rows": False, "quoting": 0, "start_with_row": 1,
                                  "delimiter": ",", "NUM_ROWS":10,
                                  "quotechar": "\"", "doublequote":True}}
    parsed_file = import_csv.parse_file(_get_fixture('test_import_csv.csv'),
                                        **options)[1][0]
    self._check_num_cols(parsed_file, 5)
    self._check_col(parsed_file, 0, "FIRST_NAME", "Text", ['John', 'Tim', 'Jenny', 'Lily'])
    self._check_col(parsed_file, 1, "LAST_NAME", "Text", ['Moor', 'Kale', 'Jo', 'Smit'])
    self._check_col(parsed_file, 2, "PHONE", "Text", ['201-343-3434', '201.343.3434',
                                                      '2013433434', '(201)343-3434'])
    self._check_col(parsed_file, 3, "VALUE", "Int", [45, 4545, 0, 4])
    self._check_col(parsed_file, 4, "DATE", "DateTime", [1519747719.0, 1519744119.0, 1519751319.0, None])

  def test_wrong_cols1(self):
    # Short data rows should be padded with empty strings.
    file_obj = bytes_io_from_str(textwrap.dedent(
      """\
      name1, name2, name3
      a1,b1,c1
      a2,b2
      a3
      """))
    parsed_file = import_csv._parse_open_file(file_obj, parse_options={})[1][0]
    self._check_num_cols(parsed_file, 3)
    self._check_col(parsed_file, 0, "name1", "Text", ["a1", "a2", "a3"])
    self._check_col(parsed_file, 1, "name2", "Text", ["b1", "b2", ""])
    self._check_col(parsed_file, 2, "name3", "Text", ["c1", "", ""])

  def test_wrong_cols2(self):
    # Data rows wider than the header should grow extra unnamed columns.
    file_obj = bytes_io_from_str(textwrap.dedent(
      """\
      name1
      a1,b1
      a2,b2,c2
      """))
    parsed_file = import_csv._parse_open_file(file_obj, parse_options={})[1][0]
    self._check_num_cols(parsed_file, 3)
    self._check_col(parsed_file, 0, "name1", "Text", ["a1", "a2"])
    self._check_col(parsed_file, 1, "", "Text", ["b1", "b2"])
    self._check_col(parsed_file, 2, "", "Text", ["", "c2"])

  def test_offset(self):
    # Leading empty rows should be skipped before header detection.
    file_obj = bytes_io_from_str(textwrap.dedent(
      """\
      ,,,,,,,
      name1,name2,name3
      a1,b1,c1
      a2,b2,c2
      a3,b3,c3,d4
      """))
    parsed_file = import_csv._parse_open_file(file_obj, parse_options={})[1][0]
    self._check_num_cols(parsed_file, 4)
    self._check_col(parsed_file, 0, "name1", "Text", ["a1", "a2", "a3"])
    self._check_col(parsed_file, 1, "name2", "Text", ["b1", "b2", "b3"])
    self._check_col(parsed_file, 2, "name3", "Text", ["c1", "c2", "c3"])
    self._check_col(parsed_file, 3, "", "Text", ["", "", "d4"])

  def test_offset_no_header(self):
    # Numeric first row means no header; all columns stay unnamed.
    file_obj = bytes_io_from_str(textwrap.dedent(
      """\
      4,b1,c1
      4,b2,c2
      4,b3,c3
      """))
    parsed_file = import_csv._parse_open_file(file_obj, parse_options={})[1][0]
    self._check_num_cols(parsed_file, 3)
    self._check_col(parsed_file, 0, "", "Int", [4, 4, 4])
    self._check_col(parsed_file, 1, "", "Text", ["b1", "b2", "b3"])
    self._check_col(parsed_file, 2, "", "Text", ["c1", "c2", "c3"])

  def test_empty_headers(self):
    # Partially-empty header rows keep their empty names.
    file_obj = bytes_io_from_str(textwrap.dedent(
      """\
      ,,-,-
      b,a,a,a,a
      b,a,a,a,a
      b,a,a,a,a
      """))
    parsed_file = import_csv._parse_open_file(file_obj, parse_options={})[1][0]
    self._check_num_cols(parsed_file, 5)
    self._check_col(parsed_file, 0, "", "Text", ["b", "b", "b"])
    self._check_col(parsed_file, 1, "", "Text", ["a", "a", "a"])
    self._check_col(parsed_file, 2, "-", "Text", ["a", "a", "a"])
    self._check_col(parsed_file, 3, "-", "Text", ["a", "a", "a"])
    self._check_col(parsed_file, 4, "", "Text", ["a", "a", "a"])

    file_obj = bytes_io_from_str(textwrap.dedent(
      """\
      -,-,-,-,-,-
      b,a,a,a,a
      b,a,a,a,a
      b,a,a,a,a
      """))
    parsed_file = import_csv._parse_open_file(file_obj, parse_options={})[1][0]
    self._check_num_cols(parsed_file, 6)
    self._check_col(parsed_file, 0, "-", "Text", ["b", "b", "b"])
    self._check_col(parsed_file, 1, "-", "Text", ["a", "a", "a"])
    self._check_col(parsed_file, 2, "-", "Text", ["a", "a", "a"])
    self._check_col(parsed_file, 3, "-", "Text", ["a", "a", "a"])
    self._check_col(parsed_file, 4, "-", "Text", ["a", "a", "a"])
    self._check_col(parsed_file, 5, "-", "Text", ["", "", ""])

  def test_guess_missing_user_option(self):
    # With an explicit ';' delimiter the commas stay part of the values.
    file_obj = bytes_io_from_str(textwrap.dedent(
      """\
      name1,;name2,;name3
      a1,;b1,;c1
      a2,;b2,;c2
      a3,;b3,;c3
      """))
    parse_options = {"delimiter": ';',
                     "escapechar": None,
                     "lineterminator": '\r\n',
                     "quotechar": '"',
                     "quoting": csv.QUOTE_MINIMAL}
    parsed_file = import_csv._parse_open_file(file_obj, parse_options=parse_options)[1][0]
    self._check_num_cols(parsed_file, 3)
    self._check_col(parsed_file, 0, "name1,", "Text", ["a1,", "a2,", "a3,"])
    self._check_col(parsed_file, 1, "name2,", "Text", ["b1,", "b2,", "b3,"])
    self._check_col(parsed_file, 2, "name3", "Text", ["c1", "c2", "c3"])

    # Sniffer detects delimiters in order [',', '\t', ';', ' ', ':'],
    # so for this file_obj it will be ','
    parsed_file = import_csv._parse_open_file(file_obj, parse_options={})[1][0]
    self._check_num_cols(parsed_file, 3)
    self._check_col(parsed_file, 0, "name1", "Text", ["a1", "a2", "a3"])
    self._check_col(parsed_file, 1, ";name2", "Text", [";b1", ";b2", ";b3"])
    self._check_col(parsed_file, 2, ";name3", "Text", [";c1", ";c2", ";c3"])

  def test_one_line_file_no_header(self):
    file_obj = bytes_io_from_str(textwrap.dedent(
      """\
      2,name2,name3
      """))
    parsed_file = import_csv._parse_open_file(file_obj, parse_options={})[1][0]
    self._check_num_cols(parsed_file, 3)
    self._check_col(parsed_file, 0, "", "Int", [2])
    self._check_col(parsed_file, 1, "", "Text", ["name2"])
    self._check_col(parsed_file, 2, "", "Text", ["name3"])

  def test_one_line_file_with_header(self):
    file_obj = bytes_io_from_str(textwrap.dedent(
      """\
      name1,name2,name3
      """))
    parsed_file = import_csv._parse_open_file(file_obj, parse_options={})[1][0]
    self._check_num_cols(parsed_file, 3)
    self._check_col(parsed_file, 0, "name1", "Text", [])
    self._check_col(parsed_file, 1, "name2", "Text", [])
    self._check_col(parsed_file, 2, "name3", "Text", [])

  def test_empty_file(self):
    file_obj = bytes_io_from_str(textwrap.dedent(
      """\
      """))
    parsed_file = import_csv._parse_open_file(file_obj, parse_options={})
    self.assertEqual(parsed_file, ({}, []))

  def test_option_num_rows(self):
    # NUM_ROWS truncates output; a limit larger than the file is a no-op.
    file_obj = bytes_io_from_str(textwrap.dedent(
      """\
      name1,name2,name3
      a1,b1,c1
      a2,b2,c2
      a3,b3,c3
      """))
    parse_options = {}
    parsed_file = import_csv._parse_open_file(file_obj, parse_options=parse_options)[1][0]
    self._check_num_cols(parsed_file, 3)
    self._check_col(parsed_file, 0, "name1", "Text", ['a1', 'a2', 'a3'])
    self._check_col(parsed_file, 1, "name2", "Text", ['b1', 'b2', 'b3'])
    self._check_col(parsed_file, 2, "name3", "Text", ['c1', 'c2', 'c3'])

    parse_options = {"NUM_ROWS": 2}
    parsed_file = import_csv._parse_open_file(file_obj, parse_options=parse_options)[1][0]
    self._check_num_cols(parsed_file, 3)
    self._check_col(parsed_file, 0, "name1", "Text", ["a1", "a2"])
    self._check_col(parsed_file, 1, "name2", "Text", ["b1", "b2"])
    self._check_col(parsed_file, 2, "name3", "Text", ["c1", "c2"])

    parse_options = {"NUM_ROWS": 10}
    parsed_file = import_csv._parse_open_file(file_obj, parse_options=parse_options)[1][0]
    self._check_num_cols(parsed_file, 3)
    self._check_col(parsed_file, 0, "name1", "Text", ['a1', 'a2', 'a3'])
    self._check_col(parsed_file, 1, "name2", "Text", ['b1', 'b2', 'b3'])
    self._check_col(parsed_file, 2, "name3", "Text", ['c1', 'c2', 'c3'])

  def test_option_num_rows_no_header(self):
    file_obj = bytes_io_from_str(textwrap.dedent(
      """\
      ,,
      ,,
      a1,1,c1
      a2,2,c2
      a3,3,c3
      """))
    parse_options = {}
    parsed_file = import_csv._parse_open_file(file_obj, parse_options=parse_options)[1][0]
    self._check_num_cols(parsed_file, 3)
    self._check_col(parsed_file, 0, "", "Text", ['a1', 'a2', 'a3'])
    self._check_col(parsed_file, 1, "", "Int", [1, 2, 3])
    self._check_col(parsed_file, 2, "", "Text", ['c1', 'c2', 'c3'])

    parse_options = {"NUM_ROWS": 2}
    parsed_file = import_csv._parse_open_file(file_obj, parse_options=parse_options)[1][0]
    self._check_num_cols(parsed_file, 3)
    self._check_col(parsed_file, 0, "", "Text", ['a1', 'a2'])
    self._check_col(parsed_file, 1, "", "Int", [1, 2])
    self._check_col(parsed_file, 2, "", "Text", ['c1', 'c2'])

  def test_option_use_col_name_as_header(self):
    # include_col_names_as_headers toggles whether the first row becomes headers.
    file_obj = bytes_io_from_str(textwrap.dedent(
      """\
      name1,name2,name3
      a1,1,c1
      a2,2,c2
      a3,3,c3
      """))
    parse_options = {"include_col_names_as_headers": False}
    parsed_file = import_csv._parse_open_file(file_obj, parse_options=parse_options)[1][0]
    self._check_num_cols(parsed_file, 3)
    self._check_col(parsed_file, 0, "", "Text", ["name1", "a1", "a2", "a3"])
    self._check_col(parsed_file, 1, "", "Text", ["name2", "1", "2", "3"])
    self._check_col(parsed_file, 2, "", "Text", ["name3", "c1", "c2", "c3"])

    parse_options = {"include_col_names_as_headers": True}
    parsed_file = import_csv._parse_open_file(file_obj, parse_options=parse_options)[1][0]
    self._check_num_cols(parsed_file, 3)
    self._check_col(parsed_file, 0, "name1", "Text", ["a1", "a2", "a3"])
    self._check_col(parsed_file, 1, "name2", "Int", [1, 2, 3])
    self._check_col(parsed_file, 2, "name3", "Text", ["c1", "c2", "c3"])

  def test_option_use_col_name_as_header_no_headers(self):
    file_obj = bytes_io_from_str(textwrap.dedent(
      """\
      ,,,
      ,,,
      n1,2,n3
      a1,1,c1,d1
      a2,4,c2
      a3,5,c3
      """))
    parse_options = {"include_col_names_as_headers": False}
    parsed_file = import_csv._parse_open_file(file_obj, parse_options=parse_options)[1][0]
    self._check_num_cols(parsed_file, 4)
    self._check_col(parsed_file, 0, "", "Text", ["n1", "a1", "a2", "a3"])
    self._check_col(parsed_file, 1, "", "Int", [2, 1, 4, 5])
    self._check_col(parsed_file, 2, "", "Text", ["n3", "c1", "c2", "c3"])
    self._check_col(parsed_file, 3, "", "Text", ["", "d1", "", ""])

    parse_options = {"include_col_names_as_headers": True}
    parsed_file = import_csv._parse_open_file(file_obj, parse_options=parse_options)[1][0]
    self._check_num_cols(parsed_file, 4)
    self._check_col(parsed_file, 0, "n1", "Text", ["a1", "a2", "a3"])
    self._check_col(parsed_file, 1, "2", "Int", [1, 4, 5])
    self._check_col(parsed_file, 2, "n3", "Text", ["c1", "c2", "c3"])
    self._check_col(parsed_file, 3, "", "Text", [ "d1", "", ""])
# Allow running this test module directly.
if __name__ == '__main__':
  unittest.main()

View File

@@ -0,0 +1,259 @@
from unittest import TestCase
from imports import import_json
class TestImportJSON(TestCase):
  """Tests for imports.import_json.dumps: flattening JSON values into grist tables,
  including nested objects (Ref columns), lists (child tables) and type guessing."""

  maxDiff = None

  def test_simple_json_array(self):
    # A flat array of objects becomes one table with one column per key.
    grist_tables = import_json.dumps([{'a': 1, 'b': 'baba'}, {'a': 4, 'b': 'abab'}], '')
    self.assertEqual(grist_tables['tables'], [{
      'column_metadata': [
        {'id': 'a', 'type': 'Int'}, {'id': 'b', 'type': 'Text'}],
      'table_data': [[1, 4], ['baba', 'abab']],
      'table_name': ''
    }])

  def test_missing_data(self):
    # Keys missing from a record show up as None in that row.
    grist_tables = import_json.dumps([{'a': 1}, {'b': 'abab'}, {'a': 4}])
    self.assertEqual(grist_tables['tables'], [{
      'column_metadata': [
        {'id': 'a', 'type': 'Int'}, {'id': 'b', 'type': 'Text'}],
      'table_data': [[1, None, 4], [None, 'abab', None]],
      'table_name': ''
    }])

  def test_even_more_simple_array(self):
    # Scalars map to a single unnamed column.
    self.assertEqual(
      import_json.dumps(['apple', 'pear', 'banana'], '')['tables'],
      [{
        'column_metadata': [
          {'id': '', 'type': 'Text'}],
        'table_data': [['apple', 'pear', 'banana']],
        'table_name': ''
      }])

  def test_mixing_simple_and_even_more_simple(self):
    self.assertEqual(
      import_json.dumps(['apple', 'pear', {'a': 'some cucumbers'}, 'banana'], '')['tables'],
      [{
        'column_metadata': [
          {'id': '', 'type': 'Text'},
          {'id': 'a', 'type': 'Text'}],
        'table_data': [['apple', 'pear', None, 'banana'], [None, None, 'some cucumbers', None]],
        'table_name': ''
      }])

  def test_array_with_reference(self):
    # Nested objects split into a child table linked by a Ref column.
    # todo: reference should follow Grist's format
    self.assertEqual(
      import_json.dumps([{'a': {'b': 2}, 'c': 'foo'}], 'Hello')['tables'],
      [{
        'column_metadata': [
          {'id': 'a', 'type': 'Ref:Hello_a'}, {'id': 'c', 'type': 'Text'}
        ],
        'table_data': [[1], ['foo']],
        'table_name': 'Hello'
      }, {
        'column_metadata': [
          {'id': 'b', 'type': 'Int'}
        ],
        'table_data': [[2]],
        'table_name': 'Hello_a'
      }])

  def test_nested_nested_object(self):
    self.assertEqual(
      import_json.dumps([{'a': {'b': 2, 'd': {'a': 'sugar'}}, 'c': 'foo'}], 'Hello')['tables'],
      [{
        'column_metadata': [
          {'id': 'a', 'type': 'Ref:Hello_a'}, {'id': 'c', 'type': 'Text'}
        ],
        'table_data': [[1], ['foo']],
        'table_name': 'Hello'
      }, {
        'column_metadata': [
          {'id': 'b', 'type': 'Int'}, {'id': 'd', 'type': 'Ref:Hello_a_d'}
        ],
        'table_data': [[2], [1]],
        'table_name': 'Hello_a'
      }, {
        'column_metadata': [
          {'id': 'a', 'type': 'Text'}
        ],
        'table_data': [['sugar']],
        'table_name': 'Hello_a_d'
      }])

  def test_array_with_list(self):
    # Lists become a child table with a back-reference to the parent row.
    self.assertEqual(
      import_json.dumps([{'a': ['ES', 'FR', 'US']}, {'a': ['FR']}], 'Hello')['tables'],
      [{
        'column_metadata': [],
        'table_data': [],
        'table_name': 'Hello'
      }, {
        'column_metadata': [{'id': '', 'type': 'Text'}, {'id': 'Hello', 'type': 'Ref:Hello'}],
        'table_data': [['ES', 'FR', 'US', 'FR'], [1, 1, 1, 2]],
        'table_name': 'Hello_a'
      }])

  def test_array_with_list_of_dict(self):
    self.assertEqual(
      import_json.dumps([{'a': [{'b': 1}, {'b': 4}]}, {'c': 2}], 'Hello')['tables'],
      [ {
        'column_metadata': [{'id': 'c', 'type': 'Int'}],
        'table_data': [[None, 2]],
        'table_name': 'Hello'
      }, {
        'column_metadata': [
          {'id': 'b', 'type': 'Int'},
          {'id': 'Hello', 'type': 'Ref:Hello'}
        ],
        'table_data': [[1, 4], [1, 1]],
        'table_name': 'Hello_a'
      }])

  def test_array_of_array(self):
    self.assertEqual(
      import_json.dumps([['FR', 'US'], ['ES', 'CH']], 'Hello')['tables'],
      [{
        'column_metadata': [],
        'table_data': [],
        'table_name': 'Hello'
      }, {
        'column_metadata': [{'id': '', 'type': 'Text'}, {'id': 'Hello', 'type': 'Ref:Hello'}],
        'table_data': [['FR', 'US', 'ES', 'CH'], [1, 1, 2, 2]],
        'table_name': 'Hello_'
      }, ])

  def test_json_dict(self):
    # A top-level dict: list values become child tables, dict values become Refs.
    self.assertEqual(
      import_json.dumps({
        'foo': [{'a': 1, 'b': 'santa'}, {'a': 4, 'b': 'cats'}],
        'bar': [{'c': 2, 'd': 'ducks'}, {'c': 5, 'd': 'dogs'}],
        'status': {'success': True, 'time': '5s'}
      }, 'Hello')['tables'], [{
        'table_name': 'Hello',
        'column_metadata': [{'id': 'status', 'type': 'Ref:Hello_status'}],
        'table_data': [[1]]
      }, {
        'table_name': 'Hello_bar',
        'column_metadata': [
          {'id': 'c', 'type': 'Int'},
          {'id': 'd', 'type': 'Text'},
          {'id': 'Hello', 'type': 'Ref:Hello'}
        ],
        'table_data': [[2, 5], ['ducks', 'dogs'], [1, 1]]
      }, {
        'table_name': 'Hello_foo',
        'column_metadata': [
          {'id': 'a', 'type': 'Int'},
          {'id': 'b', 'type': 'Text'},
          {'id': 'Hello', 'type': 'Ref:Hello'}],
        'table_data': [[1, 4], ['santa', 'cats'], [1, 1]]
      }, {
        'table_name': 'Hello_status',
        'column_metadata': [
          {'id': 'success', 'type': 'Bool'},
          {'id': 'time', 'type': 'Text'}
        ],
        'table_data': [[True], ['5s']]
      }])

  def test_json_types(self):
    # Each JSON scalar type maps to the expected grist column type.
    self.assertEqual(import_json.dumps({
      'a': 3, 'b': 3.14, 'c': True, 'd': 'name', 'e': -4, 'f': '3.14', 'g': None
    }, 'Hello')['tables'],
      [{
        'table_name': 'Hello',
        'column_metadata': [
          {'id': 'a', 'type': 'Int'},
          {'id': 'b', 'type': 'Numeric'},
          {'id': 'c', 'type': 'Bool'},
          {'id': 'd', 'type': 'Text'},
          {'id': 'e', 'type': 'Int'},
          {'id': 'f', 'type': 'Text'},
          {'id': 'g', 'type': 'Text'}
        ],
        'table_data': [[3], [3.14], [True], ['name'], [-4], ['3.14'], [None]]
      }])

  def test_type_is_defined_with_first_value(self):
    # The first value encountered in a column fixes the column's type.
    tables = import_json.dumps([{'a': 'some text'}, {'a': 3}], '')
    self.assertIsNotNone(tables['tables'])
    self.assertIsNotNone(tables['tables'][0])
    self.assertIsNotNone(tables['tables'][0]['column_metadata'])
    self.assertIsNotNone(tables['tables'][0]['column_metadata'][0])
    self.assertEqual(tables['tables'][0]['column_metadata'][0]['type'], 'Text')

  def test_first_unique_key(self):
    # first_available_key appends an increasing suffix until the key is unused.
    self.assertEqual(import_json.first_available_key({'a': 1}, 'a'), 'a2')
    self.assertEqual(import_json.first_available_key({'a': 1}, 'b'), 'b')
    self.assertEqual(import_json.first_available_key({'a': 1, 'a2': 1}, 'a'), 'a3')
def dump_tables(options):
  """Run import_json.dumps on a fixed sample document and return its tables list."""
  sample = {
    "foos": [
      {'foo': 1, 'link': [1, 2]},
      {'foo': 2, 'link': [1, 2]}
    ],
    "bar": {'hi': 'santa'}
  }
  return list(import_json.dumps(sample, 'FooBar', options)['tables'])
class TestParseOptions(TestCase):
maxDiff = None
# helpers
  def assertColInTable(self, tables, **kwargs):
    """Assert that the named table does (or does not) contain the given column id."""
    table = next(t for t in tables if t['table_name'] == kwargs['table_name'])
    self.assertEqual(any(col['id'] == kwargs['col_id'] for col in table['column_metadata']),
                     kwargs['present'])
def assertTableNamesEqual(self, tables, expected_table_names):
table_names = [t['table_name'] for t in tables]
self.assertEqual(sorted(table_names), sorted(expected_table_names))
def test_including_empty_string_includes_all(self):
tables = dump_tables({'includes': '', 'excludes': ''})
self.assertTableNamesEqual(tables, ['FooBar', 'FooBar_bar', 'FooBar_foos', 'FooBar_foos_link'])
def test_including_foos_includes_nested_object_and_removes_ref_to_table_not_included(self):
tables = dump_tables({'includes': 'FooBar_foos', 'excludes': ''})
self.assertTableNamesEqual(tables, ['FooBar_foos', 'FooBar_foos_link'])
self.assertColInTable(tables, table_name='FooBar_foos', col_id='FooBar', present=False)
tables = dump_tables({'includes': 'FooBar_foos_link', 'excludes': ''})
self.assertTableNamesEqual(tables, ['FooBar_foos_link'])
self.assertColInTable(tables, table_name='FooBar_foos_link', col_id='FooBar_foos',
present=False)
def test_excluding_foos_excludes_nested_object_and_removes_link_to_excluded_table(self):
tables = dump_tables({'includes': '', 'excludes': 'FooBar_foos'})
self.assertTableNamesEqual(tables, ['FooBar', 'FooBar_bar'])
self.assertColInTable(tables, table_name='FooBar', col_id='foos', present=False)
def test_excludes_works_on_nested_object_that_are_included(self):
tables = dump_tables({'includes': 'FooBar_foos', 'excludes': 'FooBar_foos_link'})
self.assertTableNamesEqual(tables, ['FooBar_foos'])
def test_excludes_works_on_property(self):
tables = dump_tables({'includes': '', 'excludes': 'FooBar_foos_foo'})
self.assertTableNamesEqual(tables, ['FooBar', 'FooBar_foos', 'FooBar_foos_link', 'FooBar_bar'])
self.assertColInTable(tables, table_name='FooBar_foos', col_id='foo', present=False)
def test_works_with_multiple_includes(self):
tables = dump_tables({'includes': 'FooBar_foos_link', 'excludes': ''})
self.assertTableNamesEqual(tables, ['FooBar_foos_link'])
tables = dump_tables({'includes': 'FooBar_foos_link;FooBar_bar', 'excludes': ''})
self.assertTableNamesEqual(tables, ['FooBar_bar', 'FooBar_foos_link'])
def test_works_with_multiple_excludes(self):
tables = dump_tables({'includes': '', 'excludes': 'FooBar_foos_link;FooBar_bar'})
self.assertTableNamesEqual(tables, ['FooBar', 'FooBar_foos'])

View File

@@ -0,0 +1,160 @@
# This Python file uses the following encoding: utf-8
import calendar
import datetime
import math
import os
import unittest
from imports import import_xls
def _get_fixture(filename):
return [os.path.join(os.path.dirname(__file__), "fixtures", filename), filename]
class TestImportXLS(unittest.TestCase):
  """Tests for parsing Excel files via import_xls.parse_file.

  parse_file returns a pair whose second element is a list of parsed sheets; each
  sheet is a dict with 'table_name', 'column_metadata' and 'table_data' (column-major).
  """

  def _check_col(self, sheet, index, name, typename, values):
    # Check one parsed column's id, Grist type, and data against expectations.
    self.assertEqual(sheet["column_metadata"][index]["id"], name)
    self.assertEqual(sheet["column_metadata"][index]["type"], typename)
    self.assertEqual(sheet["table_data"][index], values)

  def test_excel(self):
    parsed_file = import_xls.parse_file(*_get_fixture('test_excel.xlsx'))

    # check that column type was correctly set to int and values are properly parsed
    self.assertEqual(parsed_file[1][0]["column_metadata"][0], {"type": "Int", "id": "numbers"})
    self.assertEqual(parsed_file[1][0]["table_data"][0], [1, 2, 3, 4, 5, 6, 7, 8])

    # check that column type was correctly set to text and values are properly parsed
    self.assertEqual(parsed_file[1][0]["column_metadata"][1], {"type": "Text", "id": "letters"})
    self.assertEqual(parsed_file[1][0]["table_data"][1],
                     ["a", "b", "c", "d", "e", "f", "g", "h"])

    # check that column type was correctly set to bool and values are properly parsed
    self.assertEqual(parsed_file[1][0]["column_metadata"][2], {"type": "Bool", "id": "boolean"})
    # BUGFIX: the index was previously written as [False]; it only worked because
    # False == 0 in Python. Use the explicit sheet index 0.
    self.assertEqual(parsed_file[1][0]["table_data"][2],
                     [True, False, True, False, True, False, True, False])

    # check that column type was correctly set to text and values are properly parsed
    self.assertEqual(parsed_file[1][0]["column_metadata"][3],
                     {"type": "Text", "id": "corner-cases"})
    self.assertEqual(parsed_file[1][0]["table_data"][3],
                     # The type is detected as text, so all values should be text.
                     [u'=function()', '3.0', u'two spaces after ',
                      u' two spaces before', u'!@#$', u'€€€', u'√∫abc$$', u'line\nbreak'])

    # check that multiple tables are created when there are multiple sheets in a document
    self.assertEqual(parsed_file[1][0]["table_name"], u"Sheet1")
    self.assertEqual(parsed_file[1][1]["table_name"], u"Sheet2")
    self.assertEqual(parsed_file[1][1]["table_data"][0], ["a", "b", "c", "d"])

  def test_excel_types(self):
    parsed_file = import_xls.parse_file(*_get_fixture('test_excel_types.xlsx'))
    sheet = parsed_file[1][0]
    self._check_col(sheet, 0, "int1", "Int", [-1234123, '', ''])
    self._check_col(sheet, 1, "int2", "Int", [5, '', ''])
    self._check_col(sheet, 2, "textint", "Text", ["12345678902345689", '', ''])
    self._check_col(sheet, 3, "bigint", "Text", ["320150170634561830", '', ''])
    self._check_col(sheet, 4, "num2", "Numeric", [123456789.123456, '', ''])
    self._check_col(sheet, 5, "bignum", "Numeric", [math.exp(200), '', ''])
    self._check_col(sheet, 6, "date1", "DateTime",
                    [calendar.timegm(datetime.datetime(2015, 12, 22, 11, 59, 00).timetuple()), None, None])
    self._check_col(sheet, 7, "date2", "Date",
                    [calendar.timegm(datetime.datetime(2015, 12, 20, 0, 0, 0).timetuple()), None, None])
    self._check_col(sheet, 8, "datetext", "Date",
                    [calendar.timegm(datetime.date(2015, 12, 22).timetuple()), None, None])
    # TODO: all dates have different format
    # self._check_col(sheet, 9, "datetimetext", "DateTime",
    #                 [calendar.timegm(datetime.datetime(2015, 12, 22, 0, 0, 0).timetuple()),
    #                  calendar.timegm(datetime.datetime(2015, 12, 22, 13, 15, 0).timetuple()),
    #                  calendar.timegm(datetime.datetime(2018, 02, 27, 16, 8, 39).timetuple())])

  def test_excel_type_detection(self):
    # This tests goes over the second sheet of the fixture doc, which has multiple rows that try
    # to throw off the type detection.
    parsed_file = import_xls.parse_file(*_get_fixture('test_excel_types.xlsx'))
    sheet = parsed_file[1][1]
    self._check_col(sheet, 0, "date_with_other", "DateTime",
                    [1467676800.0, 1451606400.0, 1451692800.0, 1454544000.0, 1199577600.0,
                     1467732614.0, u'n/a', 1207958400.0, 1451865600.0, 1451952000.0,
                     None, 1452038400.0, 1451549340.0, 1483214940.0, None,
                     1454544000.0, 1199577600.0, 1451692800.0, 1451549340.0, 1483214940.0])
    self._check_col(sheet, 1, "float_not_int", "Numeric",
                    [1,2,3,4,5,"",6,7,8,9,10,10.25,11,12,13,14,15,16,17,18])
    self._check_col(sheet, 2, "int_not_bool", "Int",
                    [0, 0, 1, 0, 1, 0, 0, 1, 0, 2, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0])
    self._check_col(sheet, 3, "float_not_bool", "Numeric",
                    [0, 0, 1, 0, 1, 0, 0, 1, 0, 0.5, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0])
    self._check_col(sheet, 4, "text_as_bool", "Bool",
                    [0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0])
    self._check_col(sheet, 5, "int_as_bool", "Bool",
                    [0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0])
    self._check_col(sheet, 6, "float_not_date", "Numeric",
                    [4.0, 6.0, 4.0, 4.0, 6.0, 4.0, '--', 6.0, 4.0, 4.0, 4.0, 4.0, 4.0, 6.0, 6.0,
                     4.0, 6.0, '3-4', 4.0, 6.5])
    self._check_col(sheet, 7, "float_not_text", "Numeric",
                    [-10.25, -8.00, -5.75, -3.50, "n/a", 1.00, " ??? ", 5.50, "", "-",
                     12.25, 0.00, "", 0.00, "--", 23.50, "NA", 28.00, 30.25, 32.50])
    self._check_col(sheet, 8, "dollar_amts", "Numeric",
                    [0.00, 0.75, 1.50, '', 3.00, 0.00, 0.75, 1.50, '--', 3.00, 1234.56, 1000,
                     1001.50, '-', 3000000.000, 0000.00, 1234.56, 1000, 1001.50, 1000.01])

  def test_excel_single_merged_cell(self):
    # An older version of xlrd had a bug where a single cell marked as 'merged' would cause an
    # exception.
    parsed_file = import_xls.parse_file(*_get_fixture('test_single_merged_cell.xlsx'))
    tables = parsed_file[1]
    self.assertEqual(tables, [{
      'table_name': u'Transaction Report',
      'column_metadata': [
        {'type': 'Text', 'id': u''},
        {'type': 'Numeric', 'id': u'Start'},
        {'type': 'Numeric', 'id': u''},
        {'type': 'Numeric', 'id': u''},
        {'type': 'Text', 'id': u'Seek no easy ways'},
      ],
      'table_data': [
        [u'SINGLE MERGED', u'The End'],
        [1637384.52, u''],
        [2444344.06, u''],
        [2444344.06, u''],
        [u'', u''],
      ],
    }])

  def test_excel_strange_dates(self):
    # TODO fails with xlrd.xldate.XLDateAmbiguous: 4.180902777777778
    # Check that we don't fail when encountering unusual dates and times (e.g. 0 or 38:00:00).
    parsed_file = import_xls.parse_file(*_get_fixture('strange_dates.xlsx'))
    tables = parsed_file[1]
    # We test non-failure, but the result is not really what we want. E.g. "1:10" and "100:20:30"
    # would be best left as text, but here become "01:10:00" (after xlrd parses the first as
    # datetime.time), and as 4.18... (after xlrd fails and we resort to the numerical value).
    self.assertEqual(tables, [{
      'table_name': u'Sheet1',
      'column_metadata': [
        {'id': 'a', 'type': 'Text'},
        {'id': 'b', 'type': 'Date'},
        {'id': 'c', 'type': 'Text'},
        {'id': 'd', 'type': 'Text'},
        {'id': 'e', 'type': 'Numeric'},
        {'id': 'f', 'type': 'Int'},
        {'id': 'g', 'type': 'Date'},
        {'id': 'h', 'type': 'Date'},
        {'id': 'i', 'type': 'Bool'},
      ],
      'table_data': [
        [u'21:14:00'],
        [1568851200.0],
        [u'01:10:00'],
        [u'10:20:30'],
        [4.180902777777778],
        [20],
        [-6106060800.0],
        [205286400.0],
        [False],  # This is not great either, we should be able to distinguish 0 from FALSE.
      ],
    }])
# Allow running this test module directly, e.g. `python test_import_xls.py`.
if __name__ == '__main__':
  unittest.main()

View File

@@ -11,14 +11,15 @@ import functools
import six
from acl_formula import parse_acl_formula
import actions
from sandbox import get_default_sandbox
import engine
import migrations
import schema
import useractions
import objtypes
from acl_formula import parse_acl_formula
from sandbox import get_default_sandbox
from imports.register import register_import_parsers
import logger
log = logger.Logger(__name__, logger.INFO)
@@ -107,6 +108,8 @@ def run(sandbox):
export(eng.load_empty)
export(eng.load_done)
register_import_parsers(sandbox)
sandbox.run()
def main():

299
sandbox/grist/parse_data.py Normal file
View File

@@ -0,0 +1,299 @@
"""
This module implements a way to detect and convert types that's better than messytables (at least
in some relevant cases).
It has a simple interface: get_table_data(row_set, num_columns, num_rows=0), which returns a list
of columns, each a dictionary with "type" and "data" fields, where "type" is a Grist type string,
and "data" is a list of values. All "data" lists will have the same length.
"""
from imports import dateguess
import datetime
import logging
import re
import messytables
import moment # TODO grist internal libraries might not be available to plugins in the future.
import dateutil.parser as date_parser
import six
from six.moves import zip, xrange
log = logging.getLogger(__name__)
# Typecheck using type(value) instead of isinstance(value, some_type) makes parsing 25% faster
# pylint:disable=unidiomatic-typecheck
# Our approach to type detection is different from that of messytables.
# We first go through each cell in a sample of rows, trying to convert it to each of the basic
# types, and keep a count of successes for each. We use the counts to decide the basic types (e.g.
# numeric vs text). Then we go through the full data set converting to the chosen basic type.
# During this process, we keep counts of suitable Grist types to consider (e.g. Int vs Numeric).
# We use those counts to produce the selected Grist type at the end.
class BaseConverter(object):
  """Abstract base for converters: a conversion attempt plus Grist-column production."""

  @classmethod
  def test(cls, value):
    """Return True if `value` is convertible by this converter, False otherwise."""
    try:
      cls.convert(value)
    except Exception:
      return False
    return True

  @classmethod
  def convert(cls, value):
    """Implement to convert imported value to a basic type."""
    raise NotImplementedError()

  @classmethod
  def get_grist_column(cls, values):
    """
    Given an array of values returned successfully by convert(), return a tuple of
    (grist_type_string, grist_values), where grist_values is an array of values suitable for the
    returned grist type.
    """
    raise NotImplementedError()
class NumericConverter(BaseConverter):
  """Handles numeric values, including Grist types Numeric and Int."""

  # A number matching this is probably an identifier of some sort. Converting it to a float will
  # lose precision, so it's better not to consider it numeric.
  _unlikely_float = re.compile(r'\d{17}|^0\d')

  # Integers outside this range will be represented as floats. This is the limit for values
  # that can be stored in a JS Int32Array.
  _max_js_int = 1<<31

  # The thousands separator. It should be locale-specific, but we don't currently have a way to
  # detect locale from the data. (Also, the sandbox's locale module isn't fully functional.)
  _thousands_sep = ','

  @classmethod
  def convert(cls, value):
    if type(value) in six.integer_types + (float, complex):
      return value
    if type(value) in (str, six.text_type) and not cls._unlikely_float.search(value):
      # Strip whitespace, a leading dollar sign, and thousands separators before parsing.
      cleaned = value.strip().lstrip('$').replace(cls._thousands_sep, "")
      return float(cleaned)
    raise ValueError()

  @classmethod
  def _is_integer(cls, value):
    # A whole-valued float (e.g. 3.0) also counts, as long as it fits the JS int32 range.
    kind = type(value)
    if kind == int or (kind == float and value.is_integer()):
      return -cls._max_js_int <= value < cls._max_js_int
    return False

  @classmethod
  def get_grist_column(cls, values):
    # Use Int only when every value is integral and in range; otherwise keep Numeric.
    if all(cls._is_integer(v) for v in values):
      return ("Int", [int(v) for v in values])
    return ("Numeric", values)
class DateParserInfo(date_parser.parserinfo):
  """dateutil parserinfo subclass that tightens validation of parse results."""

  def validate(self, res):
    # Avoid this bogus combination which accepts plain numbers.
    # (A result with a day but no month means a bare number got parsed as a date.)
    if res.day and not res.month:
      return False
    return super(DateParserInfo, self).validate(res)
class SimpleDateTimeConverter(BaseConverter):
  """Handles Date and DateTime values which are already instances of datetime.datetime."""

  @classmethod
  def convert(cls, value):
    if type(value) is datetime.datetime:
      return value
    if value == "":
      return None
    raise ValueError()

  @classmethod
  def _is_date(cls, value):
    # A missing value, or a datetime at exactly midnight, counts as a plain date.
    return value is None or value.time() == datetime.time()

  @classmethod
  def get_grist_column(cls, values):
    # Use "Date" only if every value is date-like; otherwise fall back to "DateTime".
    if all(cls._is_date(v) for v in values):
      grist_type = "Date"
    else:
      grist_type = "DateTime"
    grist_values = [None if v is None else moment.dt_to_ts(v) for v in values]
    return grist_type, grist_values
class DateTimeCoverter(BaseConverter):
  """Converts date strings using a guessed strptime format.

  NOTE: the class name is a historical typo ("Coverter"); it is referenced elsewhere
  in this module, so it is kept as-is.
  """

  def __init__(self, date_format):
    self._format = date_format

  def convert(self, value):
    if value == "":
      return None
    if type(value) not in (str, six.text_type):
      raise ValueError()
    # datetime.strptime doesn't handle %z and %Z tags in Python 2.
    if '%z' in self._format or '%Z' in self._format:
      return date_parser.parse(value)
    try:
      return datetime.datetime.strptime(value, self._format)
    except ValueError:
      # The guessed format didn't fit this particular value; let dateutil take a shot.
      return date_parser.parse(value)

  def _is_date(self, value):
    # A missing value, or a datetime at exactly midnight, counts as a plain date.
    return value is None or value.time() == datetime.time()

  def get_grist_column(self, values):
    # Use "Date" only if every value is date-like; otherwise fall back to "DateTime".
    if all(self._is_date(v) for v in values):
      grist_type = "Date"
    else:
      grist_type = "DateTime"
    grist_values = [None if v is None else moment.dt_to_ts(v) for v in values]
    return grist_type, grist_values
class BoolConverter(BaseConverter):
  """Handles Boolean type."""

  _true_values = (1, '1', 'true', 'yes')
  _false_values = (0, '0', 'false', 'no')

  @classmethod
  def convert(cls, value):
    # Normalize strings for a case- and whitespace-insensitive comparison.
    if type(value) in (str, six.text_type):
      candidate = value.strip().lower()
    else:
      candidate = value
    if candidate in cls._true_values:
      return True
    if candidate in cls._false_values:
      return False
    raise ValueError()

  @classmethod
  def get_grist_column(cls, values):
    return ("Bool", values)
class TextConverter(BaseConverter):
  """Fallback converter that converts everything to strings."""

  @classmethod
  def convert(cls, value):
    # six.text_type is unicode on Python 2, str on Python 3.
    return six.text_type(value)

  @classmethod
  def get_grist_column(cls, values):
    return ("Text", values)
class ColumnDetector(object):
  """
  ColumnDetector accepts calls to `add_value()`, and keeps track of successful conversions to
  different basic types. At the end `get_converter()` method returns the class of the most
  suitable converter.
  """
  # Converters are listed in the order of preference, which is only used if two converters succeed
  # on the same exact number of values. Text is always a fallback.
  converters = [SimpleDateTimeConverter, BoolConverter, NumericConverter]

  # If this many non-junk values or more can't be converted, fall back to text.
  _text_threshold = 0.10

  # Junk values: these aren't counted when deciding whether to fall back to text.
  _junk_re = re.compile(r'^\s*(|-+|\?+|n/?a)\s*$', re.I)

  def __init__(self):
    self._counts = [0] * len(self.converters)   # Successful conversions per converter.
    self._count_nonjunk = 0                     # Values that were neither None nor junk.
    self._count_total = 0                       # All values seen, including junk.
    self._data = []                             # Non-junk values, kept for date-format guessing.

  def add_value(self, value):
    # Record one sample value, testing it against each candidate converter.
    self._count_total += 1
    if value is None or (type(value) in (str, six.text_type) and self._junk_re.match(value)):
      return
    self._data.append(value)
    self._count_nonjunk += 1
    for i, conv in enumerate(self.converters):
      if conv.test(value):
        self._counts[i] += 1

  def get_converter(self):
    # Returns the best converter: either a converter class, or a DateTimeCoverter
    # instance when a date format had to be guessed from string samples.
    if sum(self._counts) == 0:
      # if not already guessed as int, bool or datetime then we should try to guess date pattern
      str_data = [d for d in self._data if isinstance(d, six.string_types)]
      data_formats = dateguess.guess_bulk(str_data, error_rate=self._text_threshold)
      data_format = data_formats[0] if data_formats else None
      if data_format:
        return DateTimeCoverter(data_format)
    # We find the max by count, and secondarily by minimum index in the converters list.
    # (Negating the index makes the tuple-max prefer earlier converters on count ties.)
    count, neg_index = max((c, -i) for (i, c) in enumerate(self._counts))
    if count > 0 and count >= self._count_nonjunk * (1 - self._text_threshold):
      return self.converters[-neg_index]
    return TextConverter
def _guess_basic_types(rows, num_columns):
  """Feed the sample rows to per-column detectors; return the chosen converter per column."""
  detectors = [ColumnDetector() for _ in xrange(num_columns)]
  for row in rows:
    for cell, detector in zip(row, detectors):
      detector.add_value(cell.value)
  return [d.get_converter() for d in detectors]
class ColumnConverter(object):
  """
  ColumnConverter converts and collects values using the passed-in converter object. At the end
  `get_grist_column()` method returns a column of converted data.
  """
  def __init__(self, converter):
    self._converter = converter
    self._all_col_values = []     # None placeholders for converted values, text for failures.
    self._converted_values = []   # All successfully converted values.
    self._converted_indices = []  # Positions of converted values within self._all_col_values.

  def convert_and_add(self, value):
    """Convert one value; on conversion failure keep its text form in the column instead."""
    # For some reason, we get 'str' type rather than 'unicode' for empty strings.
    # Correct this, since all text should be unicode.
    if value == "":
      value = u""
    try:
      converted = self._converter.convert(value)
    except Exception:
      self._all_col_values.append(six.text_type(value))
    else:
      self._converted_values.append(converted)
      self._converted_indices.append(len(self._all_col_values))
      self._all_col_values.append(None)

  def get_grist_column(self):
    """
    Returns a dictionary {"type": grist_type, "data": grist_value_array}.
    """
    grist_type, grist_values = self._converter.get_grist_column(self._converted_values)
    for pos, converted in zip(self._converted_indices, grist_values):
      self._all_col_values[pos] = converted
    return {"type": grist_type, "data": self._all_col_values}
def get_table_data(row_set, num_columns, num_rows=0):
  """
  Returns a list of columns for the given messytables row_set: each column is a dict
  {"type": grist_type, "data": values}, and all "data" lists have equal length.
  Basic types are guessed from row_set.sample; num_rows, if non-zero, limits how many
  rows of row_set are processed.
  """
  converters = _guess_basic_types(row_set.sample, num_columns)
  col_converters = [ColumnConverter(c) for c in converters]
  for num, row in enumerate(row_set):
    if num_rows and num == num_rows:
      break

    # Periodic progress logging; imports of large files can take a while.
    if num % 10000 == 0:
      log.info("Processing row %d", num)

    # Make sure we have a value for every column.
    missing_values = len(converters) - len(row)
    if missing_values > 0:
      row.extend([messytables.Cell("")] * missing_values)

    for cell, conv in zip(row, col_converters):
      conv.convert_and_add(cell.value)

  return [conv.get_grist_column() for conv in col_converters]