mirror of
https://github.com/gristlabs/grist-core.git
synced 2026-03-02 04:09:24 +00:00
(core) Move file import plugins into core/sandbox/grist
Summary: Move all the plugins python code into the main folder with the core code. Register file importing functions in the same main.py entrypoint as the data engine. Remove options relating to different entrypoints and code directories. The only remaining plugin-specific option in NSandbox is the import directory/mount, i.e. where files to be parsed are placed. Test Plan: this Reviewers: paulfitz Reviewed By: paulfitz Subscribers: dsagal Differential Revision: https://phab.getgrist.com/D2965
This commit is contained in:
@@ -20,5 +20,4 @@ contributions:
|
||||
|
||||
scripts:
|
||||
build:
|
||||
# Note that ${XUNIT:+xxx} inserts "xxx" when XUNIT is set, and nothing otherwise.
|
||||
test: $GRIST_PYTHON -m runtests discover -v -s /sandbox ${XUNIT:+--xunit}
|
||||
test:
|
||||
|
||||
@@ -1 +0,0 @@
|
||||
__path__ = __import__('pkgutil').extend_path(__path__, __name__)
|
||||
@@ -1,184 +0,0 @@
|
||||
from __future__ import absolute_import
|
||||
|
||||
import functools
|
||||
from collections import namedtuple
|
||||
from threading import RLock
|
||||
|
||||
_CacheInfo = namedtuple("CacheInfo", ["hits", "misses", "maxsize", "currsize"])
|
||||
|
||||
|
||||
@functools.wraps(functools.update_wrapper)
def update_wrapper(wrapper,
                   wrapped,
                   assigned = functools.WRAPPER_ASSIGNMENTS,
                   updated = functools.WRAPPER_UPDATES):
  """Drop-in replacement for functools.update_wrapper that patches two bugs.

  First, attributes listed in `assigned` that are missing on `wrapped` are
  skipped rather than raising (http://bugs.python.org/issue3445). Second, the
  `__wrapped__` attribute is always set to `wrapped`
  (https://bugs.python.org/issue17482).
  """
  present = tuple(attr for attr in assigned if hasattr(wrapped, attr))
  result = functools.update_wrapper(wrapper, wrapped, present, updated)
  result.__wrapped__ = wrapped
  return result
|
||||
|
||||
|
||||
class _HashedSeq(list):
|
||||
__slots__ = 'hashvalue'
|
||||
|
||||
def __init__(self, tup, hash=hash):
|
||||
self[:] = tup
|
||||
self.hashvalue = hash(tup)
|
||||
|
||||
def __hash__(self):
|
||||
return self.hashvalue
|
||||
|
||||
|
||||
def _make_key(args, kwds, typed,
|
||||
kwd_mark=(object(),),
|
||||
fasttypes=set([int, str, frozenset, type(None)]),
|
||||
sorted=sorted, tuple=tuple, type=type, len=len):
|
||||
'Make a cache key from optionally typed positional and keyword arguments'
|
||||
key = args
|
||||
if kwds:
|
||||
sorted_items = sorted(kwds.items())
|
||||
key += kwd_mark
|
||||
for item in sorted_items:
|
||||
key += item
|
||||
if typed:
|
||||
key += tuple(type(v) for v in args)
|
||||
if kwds:
|
||||
key += tuple(type(v) for k, v in sorted_items)
|
||||
elif len(key) == 1 and type(key[0]) in fasttypes:
|
||||
return key[0]
|
||||
return _HashedSeq(key)
|
||||
|
||||
|
||||
def lru_cache(maxsize=100, typed=False):
  """Least-recently-used cache decorator.

  If *maxsize* is set to None, the LRU features are disabled and the cache
  can grow without bound.

  If *typed* is True, arguments of different types will be cached separately.
  For example, f(3.0) and f(3) will be treated as distinct calls with
  distinct results.

  Arguments to the cached function must be hashable.

  View the cache statistics named tuple (hits, misses, maxsize, currsize) with
  f.cache_info(). Clear the cache and statistics with f.cache_clear().
  Access the underlying function with f.__wrapped__.

  See: http://en.wikipedia.org/wiki/Cache_algorithms#Least_Recently_Used

  """

  # Users should only access the lru_cache through its public API:
  # cache_info, cache_clear, and f.__wrapped__
  # The internals of the lru_cache are encapsulated for thread safety and
  # to allow the implementation to change (including a possible C version).

  def decorating_function(user_function):

    cache = dict()
    stats = [0, 0]          # make statistics updateable non-locally
    HITS, MISSES = 0, 1     # names for the stats fields
    make_key = _make_key
    cache_get = cache.get   # bound method to lookup key or return None
    _len = len              # localize the global len() function
    lock = RLock()          # because linkedlist updates aren't threadsafe
    # Recency is tracked with a circular doubly linked list of 4-item links
    # [PREV, NEXT, KEY, RESULT], with `root` marking the boundary between the
    # oldest (root.NEXT) and newest (root.PREV) entries.
    root = []               # root of the circular doubly linked list
    root[:] = [root, root, None, None]  # initialize by pointing to self
    nonlocal_root = [root]  # make updateable non-locally
    PREV, NEXT, KEY, RESULT = 0, 1, 2, 3  # names for the link fields

    if maxsize == 0:

      def wrapper(*args, **kwds):
        # no caching, just do a statistics update after a successful call
        result = user_function(*args, **kwds)
        stats[MISSES] += 1
        return result

    elif maxsize is None:

      def wrapper(*args, **kwds):
        # simple caching without ordering or size limit
        key = make_key(args, kwds, typed)
        result = cache_get(key, root)  # root used here as a unique not-found sentinel
        if result is not root:
          stats[HITS] += 1
          return result
        result = user_function(*args, **kwds)
        cache[key] = result
        stats[MISSES] += 1
        return result

    else:

      def wrapper(*args, **kwds):
        # size limited caching that tracks accesses by recency
        key = make_key(args, kwds, typed) if kwds or typed else args
        with lock:
          link = cache_get(key)
          if link is not None:
            # record recent use of the key by moving it to the front of the list
            root, = nonlocal_root
            link_prev, link_next, key, result = link
            link_prev[NEXT] = link_next
            link_next[PREV] = link_prev
            last = root[PREV]
            last[NEXT] = root[PREV] = link
            link[PREV] = last
            link[NEXT] = root
            stats[HITS] += 1
            return result
        # NOTE: the lock is intentionally released while the user function
        # runs, so concurrent callers can compute the same key in parallel.
        result = user_function(*args, **kwds)
        with lock:
          root, = nonlocal_root
          if key in cache:
            # getting here means that this same key was added to the
            # cache while the lock was released.  since the link
            # update is already done, we need only return the
            # computed result and update the count of misses.
            pass
          elif _len(cache) >= maxsize:
            # use the old root to store the new key and result
            oldroot = root
            oldroot[KEY] = key
            oldroot[RESULT] = result
            # empty the oldest link and make it the new root
            root = nonlocal_root[0] = oldroot[NEXT]
            oldkey = root[KEY]
            root[KEY] = root[RESULT] = None
            # now update the cache dictionary for the new links
            del cache[oldkey]
            cache[key] = oldroot
          else:
            # put result in a new link at the front of the list
            last = root[PREV]
            link = [last, root, key, result]
            last[NEXT] = root[PREV] = cache[key] = link
          stats[MISSES] += 1
        return result

    def cache_info():
      """Report cache statistics"""
      with lock:
        return _CacheInfo(stats[HITS], stats[MISSES], maxsize, len(cache))

    def cache_clear():
      """Clear the cache and cache statistics"""
      with lock:
        cache.clear()
        root = nonlocal_root[0]
        root[:] = [root, root, None, None]
        stats[:] = [0, 0]

    wrapper.__wrapped__ = user_function
    wrapper.cache_info = cache_info
    wrapper.cache_clear = cache_clear
    return update_wrapper(wrapper, user_function)

  return decorating_function
|
||||
@@ -1,479 +0,0 @@
|
||||
"""This module guesses possible formats of dates which can be parsed using datetime.strptime
|
||||
based on samples.
|
||||
|
||||
dateguesser.guess(sample)
|
||||
dateguesser.guess takes a sample date string and returns a set of
|
||||
datetime.strftime/strptime-compliant date format strings that will correctly parse.
|
||||
|
||||
dateguesser.guess_bulk(list_of_samples, error_rate=0)
|
||||
dateguesser.guess_bulk takes a list of sample date strings and acceptable error rate
|
||||
and returns a list of datetime.strftime/strptime-compliant date format strings
|
||||
sorted by error rate that will correctly parse.
|
||||
|
||||
Algorithm:
|
||||
|
||||
1. Tokenize input string into chunks based on character type: digits, alphas, the rest.
|
||||
2. Analyze each token independently in terms what format codes could represent
|
||||
3. For given list of tokens generate all permutations of format codes
|
||||
4. During generating permutations check for validness of generated format and skip if invalid.
|
||||
5. Use rules listed below to decide if format is invalid:
|
||||
|
||||
Invalid format checks:
|
||||
|
||||
Rule #1: Year MUST be in the date. Year is the minimum possible parsable date.
|
||||
Rule #2. No holes (missing parts) in the format parts.
|
||||
Rule #3. Time parts are neighbors to each other. No interleaving time with the date.
|
||||
Rule #4. It's highly impossible that minutes coming before hour, millis coming before seconds etc
|
||||
Rule #5. Pattern can't have some part of date/time defined more than once.
|
||||
Rule #6: Separators between elements of the time group should be the same.
|
||||
Rule #7: If am/pm is in date we assume that 12-hour dates are allowed only. Otherwise it's 24-hour
|
||||
Rule #8: Year can't be between other date elements
|
||||
|
||||
Note:
|
||||
dateguess doesn't support defaulting to current year because parsing should be deterministic,
|
||||
it's better to fail guessing the format than to guess it incorrectly.
|
||||
|
||||
Examples:
|
||||
>>> guess('2014/05/05 14:00:00 UTC')
|
||||
set(['%Y/%d/%m %H:%M:%S %Z', '%Y/%m/%d %H:%M:%S %Z'])
|
||||
>>> guess('12/12/12')
|
||||
set(['%y/%m/%d', '%d/%m/%y', '%m/%d/%y', '%y/%d/%m'])
|
||||
>>> guess_bulk(['12-11-2014', '12-25-2014'])
|
||||
['%m-%d-%Y']
|
||||
>>> guess_bulk(['12-11-2014', '25-25-2014'])
|
||||
[]
|
||||
>>> guess_bulk(['12-11-2013', '13-8-2013', '05-25-2013', '12-25-2013'], error_rate=0.5)
|
||||
['%m-%d-%Y']
|
||||
"""
|
||||
|
||||
|
||||
import calendar
|
||||
import itertools
|
||||
import logging
|
||||
import re
|
||||
from collections import defaultdict
|
||||
|
||||
from backports.functools_lru_cache import lru_cache
|
||||
import moment
|
||||
|
||||
|
||||
# Calendar-derived lookup tables used by the DATE_ELEMENTS predicates below.
MONTH_NAME = calendar.month_name
MONTH_ABBR = calendar.month_abbr
# All recognized timezone names (e.g. 'UTC'), taken from moment's tz database.
TZ_VALID_NAMES = {z[0] for z in moment.get_tz_data().items()}
AM_PM = {'am', 'pm'}
DAYS_OF_WEEK_NAME = calendar.day_name
DAYS_OF_WEEK_ABBR = calendar.day_abbr

# Each entry: (name, strptime code, predicate(token_value, prev_token,
# next_token), group letter(s) -- mutually exclusive within a pattern, and the
# number of preceding separator characters the element consumes, e.g. the '+'
# before a %z offset).
DATE_ELEMENTS = [
  # Name              Pattern  Predicate                                       Group (mutual exclusive)  Consumes N prev elements
  ("Year", "%Y", lambda x, p, v: x.isdigit() and len(x) == 4, "Y", 0),
  ("Year short", "%y", lambda x, p, v: x.isdigit() and len(x) == 2, "Y", 0),
  ("Month", "%m", lambda x, p, v: x.isdigit() and len(x) <= 2 and 0 < int(x) <= 12, "m", 0),
  ("Month name full", "%B", lambda x, p, v: x.isalpha() and x.capitalize() in MONTH_NAME, "m", 0),
  ("Month name abbr", "%b", lambda x, p, v: x.isalpha() and x.capitalize() in MONTH_ABBR, "m", 0),
  ("Day", "%d", lambda x, p, v: x.isdigit() and len(x) <= 2 and 0 < int(x) <= 31, "d", 0),
  ("Day of week", "%A", lambda x, p, v: x.isalpha()
   and x.capitalize() in DAYS_OF_WEEK_NAME, "a", 0),
  ("Day of week abbr", "%a", lambda x, p, v: x.isalpha()
   and x.capitalize() in DAYS_OF_WEEK_ABBR, "a", 0),

  ("Compound HHMMSS", "%H%M%S", lambda x, p, v: x.isdigit() and len(x) == 6
   and 0 <= int(x[0:2]) < 24
   and 0 <= int(x[2:4]) < 60
   and 0 <= int(x[4:6]) < 60, "HMS", 0),

  ("Hour", "%H", lambda x, p, v: x.isdigit() and len(x) <= 2 and 0 <= int(x) <= 23, "H", 0),
  ("Hour in 12hr mode", "%I", lambda x, p, v: x.isdigit() and len(x) <= 2
   and 0 <= int(x) <= 11, "H", 0),
  ("AM/PM", "%p", lambda x, p, v: x.isalpha() and len(x) == 2 and x.lower() in AM_PM, "p", 0),
  ("Minutes", "%M", lambda x, p, v: x.isdigit() and len(x) <= 2 and 0 <= int(x) <= 59, "M", 0),
  ("Seconds", "%S", lambda x, p, v: x.isdigit() and len(x) <= 2 and 0 <= int(x) <= 59, "S", 0),
  ("Fraction of second", "%f", lambda x, p, v: x.isdigit() and p is not None
   and p.val == '.', "f", 0),
  ("Timezone name", "%Z", lambda x, p, v: x.isalpha() and len(x) > 2
   and x in TZ_VALID_NAMES, "Z", 0),
  ("Timezone +HHMM", "%z", lambda x, p, v: x.isdigit() and len(x) == 4 and 0 <= int(x[0:2]) < 15
   and 0 <= int(x[2:4]) < 60 and p is not None
   and p.val == '+', "Z", 1),
  ("Timezone -HHMM", "%z", lambda x, p, v: x.isdigit() and len(x) == 4 and 0 <= int(x[0:2]) < 15
   and 0 <= int(x[2:4]) < 60 and p is not None
   and p.val == '-', "Z", 1),
]
|
||||
|
||||
|
||||
class Token(object):
  """One chunk of a date string under analysis.

  Equality and hashing deliberately ignore the literal value of tokens that
  have compatible date-element types, so that two dates with the same shape
  (e.g. '12' and '31') compare equal for memoization purposes. Separator
  tokens (no compatible types) compare by literal value.
  """
  def __init__(self, val, length):
    self.val = val
    self.length = length
    self.compatible_types = ()

  def __hash__(self):
    acc = hash(self.length) + hash(self.compatible_types)
    if not self.compatible_types:
      # Separators must hash on their literal value too.
      acc += hash(self.val)
    return hash(acc)

  def __eq__(self, other):
    """Equal when length and compatible types match; separators must also
    match by literal value."""
    if self.length != other.length:
      return False
    if self.compatible_types != other.compatible_types:
      return False
    return bool(other.compatible_types) or self.val == other.val
|
||||
|
||||
|
||||
def _check_rule_1(pattern, types_used):
|
||||
"""Rule #1: Year MUST be in the date. Year is the minimum possible parsable date.
|
||||
|
||||
Examples:
|
||||
>>> _check_rule_1('%Y/%m/%d', 'Ymd')
|
||||
True
|
||||
>>> _check_rule_1('%m/%d', 'md')
|
||||
False
|
||||
"""
|
||||
if 'Y' not in types_used:
|
||||
logging.debug("Rule #1 is violated for pattern %s. Types used: %s", pattern, types_used)
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
def _check_rule_2(pattern, types_used):
|
||||
"""Rule #2: No holes (missing parts) in the format parts.
|
||||
|
||||
Examples:
|
||||
>>> _check_rule_2('%Y:%H', 'YH')
|
||||
False
|
||||
>>> _check_rule_2('%Y/%m/%d %H', 'YmdH')
|
||||
True
|
||||
"""
|
||||
priorities = 'YmdHMSf'
|
||||
seen_parts = [p in types_used for p in priorities]
|
||||
if sorted(seen_parts, reverse=True) != seen_parts:
|
||||
logging.debug("Rule #2 is violated for pattern %s. Types used: %s", pattern, types_used)
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
def _check_rule_3(pattern, types_used):
|
||||
"""Rule #3: Time parts are neighbors to time only. No interleaving time with the date.
|
||||
|
||||
Examples:
|
||||
>>> _check_rule_3('%m/%d %H:%M %Y', 'mdHMY')
|
||||
True
|
||||
>>> _check_rule_3('%m/%d %H:%Y:%M', 'mdHYM')
|
||||
False
|
||||
"""
|
||||
time_parts = 'HMSf'
|
||||
time_parts_highlighted = [t in time_parts for t in types_used]
|
||||
time_parts_deduplicated = [a[0] for a in itertools.groupby(time_parts_highlighted)]
|
||||
if len(list(filter(lambda x: x, time_parts_deduplicated))) > 1:
|
||||
logging.debug("Rule #3 is violated for pattern %s. Types used: %s", pattern, types_used)
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
def _check_rule_4(pattern, types_used):
|
||||
"""Rule #4: It's highly impossible that minutes coming before hours,
|
||||
millis coming before seconds etc.
|
||||
|
||||
Examples:
|
||||
>>> _check_rule_4('%H:%M', 'HM')
|
||||
True
|
||||
>>> _check_rule_4('%S:%M', 'SM')
|
||||
False
|
||||
"""
|
||||
time_parts_priority = 'HMSf'
|
||||
time_parts_indexes = list(filter(lambda x: x >= 0,
|
||||
[time_parts_priority.find(t) for t in types_used]))
|
||||
if sorted(time_parts_indexes) != time_parts_indexes:
|
||||
logging.debug("Rule #4 is violated for pattern %s. Types used: %s", pattern, types_used)
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
def _check_rule_5(pattern, types_used):
|
||||
"""Rule #5: Pattern can't have some part of date/time defined more than once.
|
||||
|
||||
Examples:
|
||||
>>> _check_rule_5('%Y/%Y', 'YY')
|
||||
False
|
||||
>>> _check_rule_5('%m/%b', 'mm')
|
||||
False
|
||||
>>> _check_rule_5('%Y/%m', 'Ym')
|
||||
True
|
||||
"""
|
||||
if len(types_used) != len(set(types_used)):
|
||||
logging.debug("Rule #5 is violated for pattern %s. Types used: %s", pattern, types_used)
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
def _check_rule_6(tokens_chosen, pattern, types_used):
  """Rule #6: Separators between elements of the time group should be the same.

  Examples:
    _check_rule_6(tokens_chosen_1, '%Y-%m-%dT%H:%M:%S', 'YmdHMS') => True
    _check_rule_6(tokens_chosen_2, '%Y-%m-%dT%H %M %S', 'YmdHMS') => True
    _check_rule_6(tokens_chosen_3, '%Y-%m-%dT%H-%M:%S', 'YmdHMS') => False (different separators
                                                                   ('-' and ':') in time group)
  """
  # tokens_chosen is a list of (Token, date_element-or-None) pairs; for a
  # chosen date element, token[1][3] is its group letter(s) per DATE_ELEMENTS.
  time_parts = 'HMS'
  num_of_time_parts_used = len(list(filter(lambda x: x in time_parts, types_used)))
  time_parts_seen = 0
  separators_seen = []
  previous_was_a_separator = False

  for token in tokens_chosen:
    if token[1] is not None and token[1][3] in time_parts:
      # This rule doesn't work for separator-less time group so when we found the type
      # and it's three letters then it's (see type "Compound HHMMSS") then stop iterating
      if len(token[1][3]) == 3:
        break
      # Two adjacent time parts with no separator between them: record None
      # so it counts as a distinct "separator" value.
      if time_parts_seen > 0 and not previous_was_a_separator:
        separators_seen.append(None)
      time_parts_seen += 1
      if time_parts_seen == num_of_time_parts_used:
        break
      previous_was_a_separator = False
    else:
      # Only separators *inside* the time group matter, i.e. after the first
      # time part has been seen.
      if time_parts_seen > 0:
        separators_seen.append(token[0].val)
        previous_was_a_separator = True

  if len(set(separators_seen)) > 1:
    logging.debug("Rule #6 is violated for pattern %s. Seen separators: %s",
                  pattern, separators_seen)
    return False
  return True
|
||||
|
||||
|
||||
def _check_rule_7a(pattern):
|
||||
"""Rule #7a: If am/pm is in date we assume that 12-hour dates are allowed only.
|
||||
Otherwise it's 24-hour.
|
||||
|
||||
Examples:
|
||||
>>> _check_rule_7a('%Y/%m/%d %H:%M %p')
|
||||
False
|
||||
>>> _check_rule_7a('%Y/%m/%d %I:%M %p')
|
||||
True
|
||||
"""
|
||||
if '%p' in pattern and '%H' in pattern:
|
||||
logging.debug("Rule #7a is violated for pattern %s", pattern)
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
def _check_rule_7b(pattern):
|
||||
"""Rule #7b: If am/pm is in date we assume that 12-hour dates are allowed only.
|
||||
Otherwise it's 24-hour.
|
||||
|
||||
Examples:
|
||||
>>> _check_rule_7b('%Y/%m/%d %I:%M')
|
||||
False
|
||||
>>> _check_rule_7b('%Y/%m/%d %I:%M %p')
|
||||
True
|
||||
"""
|
||||
if '%I' in pattern and '%p' not in pattern:
|
||||
logging.debug("Rule #7b is violated for pattern %s", pattern)
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
def _check_rule_8(pattern, types_used):
|
||||
"""Rule #9: Year can't be between other date elements
|
||||
|
||||
Examples:
|
||||
>>> _check_rule_8('%m/%Y/%d %I:%M', 'mYdIM')
|
||||
False
|
||||
"""
|
||||
if 'mYd' in types_used or 'dYm' in types_used:
|
||||
logging.debug("Rule #8 is violated for pattern %s", pattern)
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
def _tokenize_by_character_class(s):
  """Split s into Tokens at character-class boundaries (digits / word chars /
  everything else), dropping empty fragments.

  Example:
    >>> t = _tokenize_by_character_class('Thu, May 14th, 2014 1:15 pm +0000')
    >>> [i.val for i in t]
    ['Thu', ',', ' ', 'May', ' ', '14', 'th', ',', ' ', '2014', ' ', '1', ':', '15', ' ', 'pm', ' ', '+', '0000']

    >>> t = _tokenize_by_character_class('5/14/2014')
    >>> [i.val for i in t]
    ['5', '/', '14', '/', '2014']
  """
  pieces = re.split(r'(\d+)|(\W)|(_)', s)
  return [Token(piece, len(piece)) for piece in pieces if piece]
|
||||
|
||||
|
||||
def _sliding_triplets(tokens):
|
||||
for idx, t in enumerate(tokens):
|
||||
yield (t, tokens[idx-1] if idx > 0 else None, tokens[idx+1] if idx < len(tokens)-1 else None)
|
||||
|
||||
|
||||
def _analyze_tokens(tokens):
  """Analyze each token in place, recording the DATE_ELEMENTS entries whose
  predicate accepts it given its neighboring tokens."""
  for token, prev, nxt in _sliding_triplets(tokens):
    token.compatible_types = tuple(elem for elem in DATE_ELEMENTS
                                   if elem[2](token.val, prev, nxt))
|
||||
|
||||
|
||||
@lru_cache()
def _generate_all_permutations(tokens):
  """Return the set of all valid format-code permutations for `tokens`.

  Brute-forcing permutations plus rule checking dominates date-guessing time,
  but real input is highly uniform, so memoizing on the token tuple is very
  effective. Token's overridden __eq__/__hash__ make two tokens with the same
  length and the same candidate formats (and equal separator values) share a
  cache entry.
  """
  found = set()
  _generate_all_permutations_recursive(tokens, 0, [], "", found, "")
  return found
|
||||
|
||||
|
||||
def _check_is_pattern_valid_quick_fail_rules(pattern, types_used):
  """Apply the rules that can reject a partially constructed pattern early.

  Example: duplicates of a date part in a pattern.
  """
  return (_check_rule_5(pattern, types_used)
          and _check_rule_4(pattern, types_used)
          and _check_rule_7a(pattern))
|
||||
|
||||
|
||||
def _check_is_pattern_valid_full_pattern_rules(tokens_chosen, pattern, types_used):
  """Apply the rules that only make sense on a fully constructed pattern.

  Example: existence of a Year part in the pattern.
  """
  return (_check_rule_1(pattern, types_used)
          and _check_rule_2(pattern, types_used)
          and _check_rule_3(pattern, types_used)
          and _check_rule_6(tokens_chosen, pattern, types_used)
          and _check_rule_7b(pattern)
          and _check_rule_8(pattern, types_used))
|
||||
|
||||
|
||||
def _generate_all_permutations_recursive(tokens, token_idx, tokens_chosen, pattern, found_patterns,
                                         types_used):
  """Generate all format elements permutations recursively.

  Args:
    tokens (list[Token]): List of tokens.
    token_idx (int): Index of token processing this cycle.
    tokens_chosen (list[(Token, Token.compatible_type)]): List of tuples
      containing token and compatible type
    pattern (str): String containing format for parsing
    found_patterns (set): Set of guessed patterns
    types_used (str): String of types used to build pattern.

  Returns:
    list: List of permutations
  """
  # Prune this branch as soon as the partial pattern breaks a quick-fail rule.
  if not _check_is_pattern_valid_quick_fail_rules(pattern, types_used):
    return

  if token_idx < len(tokens):
    t = tokens[token_idx]
    if t.compatible_types:
      for ct in t.compatible_types:
        # ct is a DATE_ELEMENTS entry: ct[1] is the strptime code, ct[3] the
        # group letter(s), and ct[4] the number of preceding separator
        # characters it consumes (e.g. the '+' before a %z offset), which are
        # stripped from the pattern before the code is appended.
        _generate_all_permutations_recursive(tokens, token_idx+1, tokens_chosen[:] + [(t, ct)],
                                             (pattern if ct[4] == 0 else pattern[:-ct[4]]) + ct[1],
                                             found_patterns, types_used + ct[3])
    else:
      # if no compatible types it should be separator, add it to the pattern
      _generate_all_permutations_recursive(tokens, token_idx+1,
                                           tokens_chosen[:] + [(t, None)], pattern + t.val,
                                           found_patterns, types_used)
  else:
    # All tokens consumed: keep the pattern only if the full-pattern rules pass.
    if _check_is_pattern_valid_full_pattern_rules(tokens_chosen, pattern, types_used):
      found_patterns.add(pattern)
|
||||
|
||||
|
||||
def guess(date):
  """Guesses datetime.strftime/strptime-compliant date formats for date string.

  Args:
    date (str): Date string.

  Returns:
    set: Set of datetime.strftime/strptime-compliant date format strings

  Examples:
    >>> guess('2014/05/05 14:00:00 UTC')
    set(['%Y/%d/%m %H:%M:%S %Z', '%Y/%m/%d %H:%M:%S %Z'])
    >>> guess('12/12/12')
    set(['%y/%m/%d', '%d/%m/%y', '%m/%d/%y', '%y/%d/%m'])
  """
  parts = _tokenize_by_character_class(date)
  _analyze_tokens(parts)
  # A tuple is required because the permutation generator is memoized.
  return _generate_all_permutations(tuple(parts))
|
||||
|
||||
|
||||
def guess_bulk(dates, error_rate=0):
  """Guesses datetime.strftime/strptime-compliant date formats for list of the samples.

  Args:
    dates (list): List of samples date strings.
    error_rate (float): Acceptable error rate (default 0.0)

  Returns:
    list: List of datetime.strftime/strptime-compliant date format strings sorted by error rate

  Examples:
    >>> guess_bulk(['12-11-2014', '12-25-2014'])
    ['%m-%d-%Y']
    >>> guess_bulk(['12-11-2014', '25-25-2014'])
    []
    >>> guess_bulk(['12-11-2013', '13-8-2013', '05-25-2013', '12-25-2013'], error_rate=0.5)
    ['%m-%d-%Y']
  """
  if error_rate == 0.0:
    # Strict mode: keep only the formats that parse every sample.
    patterns = None
    for date in dates:
      guessed_patterns = guess(date)
      if patterns is None:
        patterns = guessed_patterns
      else:
        patterns = patterns.intersection(guessed_patterns)
      if not patterns:
        break  # No need to iterate more if zero patterns found
    # Fix: an empty `dates` list used to crash with list(None); return [].
    return list(patterns) if patterns else []
  else:
    found_dates = 0
    pattern_counters = defaultdict(int)  # pattern -> number of samples it parses
    num_dates = len(dates)
    min_num_dates_to_be_found = num_dates - num_dates * error_rate

    for idx, date in enumerate(dates):
      patterns = guess(date)
      if patterns:
        found_dates += 1
        for pattern in patterns:
          pattern_counters[pattern] += 1

      # Early return if the number of strings that can't be a date is already
      # over the error rate, even if every remaining sample parses.
      cells_left = num_dates - idx - 1
      if found_dates + cells_left < min_num_dates_to_be_found:
        return []

    # Keep the formats seen often enough, most frequent first.
    scored = [(count, pattern) for pattern, count in pattern_counters.items()
              if count > min_num_dates_to_be_found]
    scored.sort(reverse=True)
    return [pattern for (count, pattern) in scored]
|
||||
@@ -1,197 +0,0 @@
|
||||
"""
|
||||
Plugin for importing CSV files
|
||||
"""
|
||||
import os
|
||||
import logging
|
||||
|
||||
import chardet
|
||||
import messytables
|
||||
import six
|
||||
from six.moves import zip
|
||||
|
||||
import parse_data
|
||||
import import_utils
|
||||
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
# Parse options exposed for CSV imports. Each entry describes one option: its
# key ('name'), the label shown to the user, its value type, and whether it is
# shown in the import dialog ('visible').
SCHEMA = [
  {
    'name': 'lineterminator',
    'label': 'Line terminator',
    'type': 'string',
    'visible': True,
  },
  {
    'name': 'include_col_names_as_headers',
    'label': 'First row contains headers',
    'type': 'boolean',
    'visible': True,
  },
  {
    'name': 'delimiter',
    'label': 'Field separator',
    'type': 'string',
    'visible': True,
  },
  {
    'name': 'skipinitialspace',
    'label': 'Skip leading whitespace',
    'type': 'boolean',
    'visible': True,
  },
  {
    'name': 'quotechar',
    'label': 'Quote character',
    'type': 'string',
    'visible': True,
  },
  {
    'name': 'doublequote',
    'label': 'Quotes in fields are doubled',
    'type': 'boolean',
    'visible': True,
  },

  {
    'name': 'quoting',
    'label': 'Convert quoted fields',
    'type': 'number',
    'visible': False,  # Not supported by messytables
  },
  {
    'name': 'escapechar',
    'label': 'Escape character',
    'type': 'string',
    'visible': False,  # Not supported by messytables
  },
  {
    'name': 'start_with_row',
    'label': 'Start with row',
    'type': 'number',
    'visible': False,  # Not yet implemented
  },
  {
    'name': 'NUM_ROWS',
    'label': 'Number of rows',
    'type': 'number',
    'visible': False,
  }]
|
||||
|
||||
def parse_file_source(file_source, options):
  """Parse the uploaded file referenced by file_source["path"] and package the
  result as a {parseOptions, tables} dict."""
  path = import_utils.get_path(file_source["path"])
  parsing_options, export_list = parse_file(path, options)
  return {"parseOptions": parsing_options, "tables": export_list}
|
||||
|
||||
def parse_file(file_path, parse_options=None):
  """
  Reads a file path and parse options that are passed in using ActiveDoc.importFile()
  and returns a tuple with parsing options (users' or guessed) and an object formatted so that
  it can be used by grist for a bulk add records action.
  """
  if not parse_options:
    parse_options = {}
  with open(file_path, "rb") as f:
    return _parse_open_file(f, parse_options=parse_options)
|
||||
|
||||
|
||||
def _parse_open_file(file_obj, parse_options=None):
  """Parse an open binary CSV file object.

  Returns (options, export_list): `options` is the effective parse options
  (user-supplied values taking precedence over messytables' guessed dialect,
  plus SCHEMA), and `export_list` is a list of dicts with keys 'table_name',
  'column_metadata', and 'table_data' suitable for a bulk add-records action.
  """
  options = {}
  csv_keys = ['delimiter', 'quotechar', 'lineterminator', 'doublequote', 'skipinitialspace']
  csv_options = {k: parse_options.get(k) for k in csv_keys}
  if six.PY2:
    # Python 2's csv module requires byte-string dialect parameters.
    csv_options = {k: v.encode('utf8') if isinstance(v, six.text_type) else v
                   for k, v in csv_options.items()}

  table_set = messytables.CSVTableSet(file_obj,
                                      delimiter=csv_options['delimiter'],
                                      quotechar=csv_options['quotechar'],
                                      lineterminator=csv_options['lineterminator'],
                                      doublequote=csv_options['doublequote'],
                                      skipinitialspace=csv_options['skipinitialspace'])

  num_rows = parse_options.get('NUM_ROWS', 0)

  # Messytable's encoding detection uses too small a sample, so we override it here.
  sample = file_obj.read(100000)
  table_set.encoding = chardet.detect(sample)['encoding']
  # In addition, always prefer UTF8 over ASCII.
  if table_set.encoding == 'ascii':
    table_set.encoding = 'utf8'

  export_list = []
  # A table set is a collection of tables:
  for row_set in table_set.tables:
    table_name = None
    sample_rows = list(row_set.sample)
    # Messytables doesn't guess whether headers are present, so we need to step in.
    data_offset, headers = import_utils.headers_guess(sample_rows)

    # Make sure all header values are strings.
    for i, header in enumerate(headers):
      if not isinstance(header, six.string_types):
        headers[i] = six.text_type(header)

    log.info("Guessed data_offset as %s", data_offset)
    log.info("Guessed headers as: %s", headers)

    have_guessed_headers = any(headers)
    include_col_names_as_headers = parse_options.get('include_col_names_as_headers',
                                                     have_guessed_headers)

    if include_col_names_as_headers and not have_guessed_headers:
      # use first line as headers
      data_offset, first_row = import_utils.find_first_non_empty_row(sample_rows)
      headers = import_utils.expand_headers(first_row, data_offset, sample_rows)

    elif not include_col_names_as_headers and have_guessed_headers:
      # move guessed headers to data
      data_offset -= 1
      headers = [''] * len(headers)

    # Skip past the header rows when iterating the data.
    row_set.register_processor(messytables.offset_processor(data_offset))

    table_data_with_types = parse_data.get_table_data(row_set, len(headers), num_rows)

    # Identify and remove empty columns, and populate separate metadata and data lists.
    column_metadata = []
    table_data = []
    for col_data, header in zip(table_data_with_types, headers):
      if not header and all(val == "" for val in col_data["data"]):
        continue  # empty column
      data = col_data.pop("data")
      col_data["id"] = header
      column_metadata.append(col_data)
      table_data.append(data)

    if not table_data:
      # Don't add tables with no columns.
      continue

    guessed = row_set._dialect
    # NOTE(review): 'quoting' is read but never used below (messytables does
    # not support it per SCHEMA) — confirm before removing.
    quoting = parse_options.get('quoting')
    options = {"delimiter": parse_options.get('delimiter', guessed.delimiter),
               "doublequote": parse_options.get('doublequote', guessed.doublequote),
               "lineterminator": parse_options.get('lineterminator', guessed.lineterminator),
               "quotechar": parse_options.get('quotechar', guessed.quotechar),
               "skipinitialspace": parse_options.get('skipinitialspace', guessed.skipinitialspace),
               "include_col_names_as_headers": include_col_names_as_headers,
               "start_with_row": 1,
               "NUM_ROWS": num_rows,
               "SCHEMA": SCHEMA
               }

    log.info("Output table %r with %d columns", table_name, len(column_metadata))
    for c in column_metadata:
      log.debug("Output column %s", c)
    export_list.append({
      "table_name": table_name,
      "column_metadata": column_metadata,
      "table_data": table_data
    })

  return options, export_list
|
||||
|
||||
def get_version():
|
||||
""" Return name and version of plug-in"""
|
||||
pass
|
||||
@@ -1,257 +0,0 @@
|
||||
"""
|
||||
The import_json module converts json file into a list of grist tables.
|
||||
|
||||
It supports data being structured as a list of record, turning each
|
||||
object into a row and each object's key into a column. For
|
||||
example:
|
||||
```
|
||||
[{'a': 1, 'b': 'tree'}, {'a': 4, 'b': 'flowers'}, ... ]
|
||||
```
|
||||
is turned into a table with two columns 'a' of type 'Int' and 'b' of
|
||||
type 'Text'.
|
||||
|
||||
Nested object are stored as references to a distinct table where the
|
||||
nested object is stored. For example:
|
||||
```
|
||||
[{'a': {'b': 4}}, ...]
|
||||
```
|
||||
is turned into a column 'a' of type 'Ref:my_import_name.a', and into
|
||||
another table 'my_import_name.a' with a column 'b' of type
|
||||
'Int'. (Nested-nested objects are supported as well and the module
|
||||
assumes no limit to the number of level of nesting you can do.)
|
||||
|
||||
Each value which is not an object will be stored into a column with id
|
||||
'' (empty string). For example:
|
||||
```
|
||||
['apple', 'peach', ... ]
|
||||
```
|
||||
is turned into a table with an un-named column that stores the values.
|
||||
|
||||
Arrays are stored as a list of references to a table where the content
|
||||
of the array is stored. For example:
|
||||
```
|
||||
[{'items': [{'a':'apple'}, {'a':'peach'}]}, {'items': [{'a':'cucumber'}, {'a':'carots'}, ...]}, ...]
|
||||
```
|
||||
is turned into a column named 'items' of type
|
||||
'RefList:my_import_name.items' which points to another table named
|
||||
'my_import_name.items' which has a column 'a' of type Text.
|
||||
|
||||
Data could be structured with an object at the root as well in which
|
||||
case, the object is considered to represent a single row, and gets
|
||||
turned into a table with one row.
|
||||
|
||||
A column's type is defined by the type of its first value that is not
|
||||
None (ie: if another value with different type is stored in the same
|
||||
column, the column's type remains unchanged), 'Text' otherwise.
|
||||
|
||||
Usage:
|
||||
import import_json
|
||||
# if you have a file to parse
|
||||
import_json.parse_file(file_path)
|
||||
|
||||
# if data is already encoded with python's standard containers (dict and list)
|
||||
import_json.dumps(data, import_name)
|
||||
|
||||
|
||||
TODO:
|
||||
- references should map to appropriate column type ie: `Ref:{$colname}` and
|
||||
`RefList:{$colname}` (which depends on T413).
|
||||
- Allows user to set the uniqueValues options per table.
|
||||
- User should be able to choose some objects to be imported as
|
||||
indexes: for instance:
|
||||
```
|
||||
{
|
||||
'pink lady': {'type': 'apple', 'taste': 'juicy'},
|
||||
'gala': {'type': 'apple', 'taste': 'tart'},
|
||||
'comice': {'type': 'pear', 'taste': 'lemon'},
|
||||
...
|
||||
}
|
||||
```
|
||||
could be mapped to columns 'type', 'taste' and a 3rd that holds the
|
||||
property 'name'.
|
||||
|
||||
"""
|
||||
import os
|
||||
import json
|
||||
from collections import OrderedDict, namedtuple
|
||||
from itertools import count, chain
|
||||
|
||||
import six
|
||||
|
||||
import import_utils
|
||||
|
||||
Ref = namedtuple('Ref', ['table_name', 'rowid'])
|
||||
Row = namedtuple('Row', ['values', 'parent', 'ref'])
|
||||
Col = namedtuple('Col', ['type', 'values'])
|
||||
|
||||
GRIST_TYPES={
|
||||
float: "Numeric",
|
||||
bool: "Bool",
|
||||
}
|
||||
|
||||
for typ in six.integer_types:
|
||||
GRIST_TYPES[typ] = "Int"
|
||||
|
||||
for typ in six.string_types:
|
||||
GRIST_TYPES[typ] = "Text"
|
||||
|
||||
SCHEMA = [{
|
||||
'name': 'includes',
|
||||
'label': 'Includes (list of tables seperated by semicolon)',
|
||||
'type': 'string',
|
||||
'visible': True
|
||||
}, {
|
||||
'name': 'excludes',
|
||||
'label': 'Excludes (list of tables seperated by semicolon)',
|
||||
'type': 'string',
|
||||
'visible': True
|
||||
}]
|
||||
|
||||
DEFAULT_PARSE_OPTIONS = {
|
||||
'includes': '',
|
||||
'excludes': '',
|
||||
'SCHEMA': SCHEMA
|
||||
}
|
||||
|
||||
def parse_file(file_source, parse_options):
|
||||
"Deserialize `file_source` into a python object and dumps it into jgrist form"
|
||||
path = import_utils.get_path(file_source['path'])
|
||||
name, ext = os.path.splitext(file_source['origName'])
|
||||
if 'SCHEMA' not in parse_options:
|
||||
parse_options.update(DEFAULT_PARSE_OPTIONS)
|
||||
with open(path, 'r') as json_file:
|
||||
data = json.loads(json_file.read())
|
||||
|
||||
return dumps(data, name, parse_options)
|
||||
|
||||
def dumps(data, name = "", parse_options = DEFAULT_PARSE_OPTIONS):
|
||||
" Serializes `data` to a jgrist formatted object. "
|
||||
tables = Tables(parse_options)
|
||||
if not isinstance(data, list):
|
||||
# put simple record into a list
|
||||
data = [data]
|
||||
for val in data:
|
||||
tables.add_row(name, val)
|
||||
return {
|
||||
'tables': tables.dumps(),
|
||||
'parseOptions': parse_options
|
||||
}
|
||||
|
||||
|
||||
class Tables(object):
|
||||
"""
|
||||
Tables maintains the list of tables indexed by their name. Each table
|
||||
is a list of row. A row is a dictionary mapping columns id to a value.
|
||||
"""
|
||||
|
||||
def __init__(self, parse_options):
|
||||
self._tables = OrderedDict()
|
||||
self._includes_opt = list(filter(None, parse_options['includes'].split(';')))
|
||||
self._excludes_opt = list(filter(None, parse_options['excludes'].split(';')))
|
||||
|
||||
|
||||
def dumps(self):
|
||||
" Dumps tables in jgrist format "
|
||||
return [_dump_table(name, rows) for name, rows in six.iteritems(self._tables)]
|
||||
|
||||
def add_row(self, table, value, parent = None):
|
||||
"""
|
||||
Adds a row to `table` and fill it with the content of value, then
|
||||
returns a Ref object pointing to this row. Returns None if the row
|
||||
was excluded. Calls itself recursively to add nested object and
|
||||
lists.
|
||||
"""
|
||||
row = None
|
||||
if self._is_included(table):
|
||||
rows = self._tables.setdefault(table, [])
|
||||
row = Row(OrderedDict(), parent, Ref(table, len(rows)+1))
|
||||
rows.append(row)
|
||||
|
||||
# we need a dictionary to map values to the row's columns
|
||||
value = _dictify(value)
|
||||
for (k, val) in sorted(six.iteritems(value)):
|
||||
if isinstance(val, dict):
|
||||
val = self.add_row(table + '_' + k, val)
|
||||
if row and val:
|
||||
row.values[k] = val.ref
|
||||
elif isinstance(val, list):
|
||||
for list_val in val:
|
||||
self.add_row(table + '_' + k, list_val, row)
|
||||
else:
|
||||
if row and self._is_included(table + '_' + k):
|
||||
row.values[k] = val
|
||||
return row
|
||||
|
||||
|
||||
def _is_included(self, property_path):
|
||||
is_included = (any(property_path.startswith(inc) for inc in self._includes_opt)
|
||||
if self._includes_opt else True)
|
||||
is_excluded = (any(property_path.startswith(exc) for exc in self._excludes_opt)
|
||||
if self._excludes_opt else False)
|
||||
return is_included and not is_excluded
|
||||
|
||||
|
||||
def first_available_key(dictionary, name):
|
||||
"""
|
||||
Returns the first of (name, name2, name3 ...) that is not a key of
|
||||
dictionary.
|
||||
"""
|
||||
names = chain([name], ("{}{}".format(name, i) for i in count(2)))
|
||||
return next(n for n in names if n not in dictionary)
|
||||
|
||||
|
||||
def _dictify(value):
|
||||
"""
|
||||
Converts non-dictionary value to a dictionary with a single
|
||||
empty-string key mapping to the given value. Or returns the value
|
||||
itself if it's already a dictionary. This is useful to map values to
|
||||
row's columns.
|
||||
"""
|
||||
return value if isinstance(value, dict) else {'': value}
|
||||
|
||||
|
||||
def _dump_table(name, rows):
|
||||
"Converts a list of rows into a jgrist table and set 'table_name' to name."
|
||||
columns = _transpose([r.values for r in rows])
|
||||
# find ref to first parent
|
||||
ref = next((r.parent.ref for r in rows if r.parent), None)
|
||||
if ref:
|
||||
# adds a column to store ref to parent
|
||||
col_id = first_available_key(columns, ref.table_name)
|
||||
columns[col_id] = Col(_grist_type(ref),
|
||||
[row.parent.ref if row.parent else None for row in rows])
|
||||
return {
|
||||
'column_metadata': [{'id': key, 'type': col.type} for (key, col) in six.iteritems(columns)],
|
||||
'table_data': [[_dump_value(val) for val in col.values] for col in columns.values()],
|
||||
'table_name': name
|
||||
}
|
||||
|
||||
def _transpose(rows):
|
||||
"""
|
||||
Transposes a collection of dictionary mapping key to values into a
|
||||
dictionary mapping key to values. Values are encoded into a tuple
|
||||
made of the grist_type of the first value that is not None and the
|
||||
collection of values.
|
||||
"""
|
||||
transpose = OrderedDict()
|
||||
values = OrderedDict()
|
||||
for row in reversed(rows):
|
||||
values.update(row)
|
||||
for key, val in six.iteritems(values):
|
||||
transpose[key] = Col(_grist_type(val), [row.get(key, None) for row in rows])
|
||||
return transpose
|
||||
|
||||
|
||||
def _dump_value(value):
|
||||
" Serialize a value."
|
||||
if isinstance(value, Ref):
|
||||
return value.rowid
|
||||
return value
|
||||
|
||||
|
||||
def _grist_type(value):
|
||||
" Returns the grist type for value. "
|
||||
val_type = type(value)
|
||||
if val_type == Ref:
|
||||
return 'Ref:{}'.format(value.table_name)
|
||||
return GRIST_TYPES.get(val_type, 'Text')
|
||||
@@ -1,120 +0,0 @@
|
||||
"""
|
||||
Helper functions for import plugins
|
||||
"""
|
||||
import sys
|
||||
import itertools
|
||||
import logging
|
||||
import os
|
||||
|
||||
# Include /thirdparty into module search paths, in particular for messytables.
|
||||
sys.path.append('/thirdparty')
|
||||
|
||||
import six
|
||||
from six.moves import zip
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
# Get path to an imported file.
|
||||
def get_path(file_source):
|
||||
importdir = os.environ.get('IMPORTDIR') or '/importdir'
|
||||
return os.path.join(importdir, file_source)
|
||||
|
||||
def capitalize(word):
|
||||
"""Capitalize the first character in the word (without lowercasing the rest)."""
|
||||
return word[0].capitalize() + word[1:]
|
||||
|
||||
def _is_numeric(text):
|
||||
for t in six.integer_types + (float, complex):
|
||||
try:
|
||||
t(text)
|
||||
return True
|
||||
except (ValueError, OverflowError):
|
||||
pass
|
||||
return False
|
||||
|
||||
|
||||
def _is_header(header, data_rows):
|
||||
"""
|
||||
Returns whether header can be considered a legitimate header for data_rows.
|
||||
"""
|
||||
# See if the row has any non-text values.
|
||||
for cell in header:
|
||||
if not isinstance(cell.value, six.string_types) or _is_numeric(cell.value):
|
||||
return False
|
||||
|
||||
|
||||
# If it's all text, see if the values in the first row repeat in other rows. That's uncommon for
|
||||
# a header.
|
||||
count_repeats = [0 for cell in header]
|
||||
for row in data_rows:
|
||||
for cell, header_cell in zip(row, header):
|
||||
if cell.value and cell.value == header_cell.value:
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
def _count_nonempty(row):
|
||||
"""
|
||||
Returns the count of cells in row, ignoring trailing empty cells.
|
||||
"""
|
||||
count = 0
|
||||
for i, c in enumerate(row):
|
||||
if not c.empty:
|
||||
count = i + 1
|
||||
return count
|
||||
|
||||
|
||||
def find_first_non_empty_row(rows):
|
||||
"""
|
||||
Returns (data_offset, header) of the first row with non-empty fields
|
||||
or (0, []) if there are no non-empty rows.
|
||||
"""
|
||||
for i, row in enumerate(rows):
|
||||
if _count_nonempty(row) > 0:
|
||||
return i + 1, row
|
||||
# No non-empty rows.
|
||||
return 0, []
|
||||
|
||||
|
||||
def expand_headers(headers, data_offset, rows):
|
||||
"""
|
||||
Returns expanded header to have enough columns for all rows in the given sample.
|
||||
"""
|
||||
row_length = max(itertools.chain([len(headers)],
|
||||
(_count_nonempty(r) for r in itertools.islice(rows, data_offset,
|
||||
None))))
|
||||
header_values = [h.value.strip() for h in headers] + [u''] * (row_length - len(headers))
|
||||
return header_values
|
||||
|
||||
|
||||
def headers_guess(rows):
|
||||
"""
|
||||
Our own smarter version of messytables.headers_guess, which also guesses as to whether one of
|
||||
the first rows is in fact a header. Returns (data_offset, headers) where data_offset is the
|
||||
index of the first line of data, and headers is the list of guessed headers (which will contain
|
||||
empty strings if the file had no headers).
|
||||
"""
|
||||
# Messytables guesses at the length of data rows, and then assumes that the first row that has
|
||||
# close to that many non-empty fields is the header, where by "close" it means 1 less.
|
||||
#
|
||||
# For Grist, it's better to mistake headers for data than to mistake data for headers. Note that
|
||||
# there is csv.Sniffer().has_header(), which tries to be clever, but it's messes up too much.
|
||||
#
|
||||
# We only consider for the header the first row with non-empty cells. It is a header if
|
||||
# - it has no non-text fields
|
||||
# - none of the fields have a value that repeats in that column of data
|
||||
|
||||
# Find the first row with non-empty fields.
|
||||
data_offset, header = find_first_non_empty_row(rows)
|
||||
if not header:
|
||||
return data_offset, header
|
||||
|
||||
# Let's see if row is really a header.
|
||||
if not _is_header(header, itertools.islice(rows, data_offset, None)):
|
||||
data_offset -= 1
|
||||
header = []
|
||||
|
||||
# Expand header to have enough columns for all rows in the given sample.
|
||||
header_values = expand_headers(header, data_offset, rows)
|
||||
|
||||
return data_offset, header_values
|
||||
@@ -1,118 +0,0 @@
|
||||
"""
|
||||
This module reads a file path that is passed in using ActiveDoc.importFile()
|
||||
and returns a object formatted so that it can be used by grist for a bulk add records action
|
||||
"""
|
||||
import os
|
||||
import csv
|
||||
import itertools
|
||||
import logging
|
||||
|
||||
import chardet
|
||||
import messytables
|
||||
import six
|
||||
from six.moves import zip
|
||||
|
||||
import parse_data
|
||||
import import_utils
|
||||
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def import_file(file_source, parse_options):
|
||||
path = import_utils.get_path(file_source["path"])
|
||||
orig_name = file_source["origName"]
|
||||
parse_options, tables = parse_file(path, orig_name, parse_options)
|
||||
return {"parseOptions": parse_options, "tables": tables}
|
||||
|
||||
# messytable is painfully un-extensible, so we have to jump through dumb hoops to override any
|
||||
# behavior.
|
||||
orig_dialect = messytables.CSVRowSet._dialect
|
||||
def override_dialect(self):
|
||||
if self.delimiter == '\t':
|
||||
return csv.excel_tab
|
||||
return orig_dialect.fget(self)
|
||||
messytables.CSVRowSet._dialect = property(override_dialect)
|
||||
|
||||
def parse_file(file_path, orig_name, parse_options=None, table_name_hint=None, num_rows=None):
|
||||
# pylint: disable=unused-argument
|
||||
with open(file_path, "rb") as f:
|
||||
try:
|
||||
return parse_open_file(f, orig_name, table_name_hint=table_name_hint)
|
||||
except Exception as e:
|
||||
# Log the full error, but simplify the thrown error to omit the unhelpful extra args.
|
||||
log.info("import_xls parse_file failed: %s", e)
|
||||
if six.PY2 and e.args and isinstance(e.args[0], six.string_types):
|
||||
raise Exception(e.args[0])
|
||||
raise
|
||||
|
||||
|
||||
def parse_open_file(file_obj, orig_name, table_name_hint=None):
|
||||
file_root, file_ext = os.path.splitext(orig_name)
|
||||
table_set = messytables.any.any_tableset(file_obj, extension=file_ext, auto_detect=False)
|
||||
|
||||
# Messytable's encoding detection uses too small a sample, so we override it here.
|
||||
if isinstance(table_set, messytables.CSVTableSet):
|
||||
sample = file_obj.read(100000)
|
||||
table_set.encoding = chardet.detect(sample)['encoding']
|
||||
# In addition, always prefer UTF8 over ASCII.
|
||||
if table_set.encoding == 'ascii':
|
||||
table_set.encoding = 'utf8'
|
||||
|
||||
export_list = []
|
||||
# A table set is a collection of tables:
|
||||
for row_set in table_set.tables:
|
||||
table_name = row_set.name
|
||||
|
||||
if isinstance(row_set, messytables.CSVRowSet):
|
||||
# For csv files, we can do better for table_name by using the filename.
|
||||
table_name = import_utils.capitalize(table_name_hint or
|
||||
os.path.basename(file_root.decode('utf8')))
|
||||
|
||||
# Messytables doesn't guess whether headers are present, so we need to step in.
|
||||
data_offset, headers = import_utils.headers_guess(list(row_set.sample))
|
||||
else:
|
||||
# Let messytables guess header names and the offset of the header.
|
||||
offset, headers = messytables.headers_guess(row_set.sample)
|
||||
data_offset = offset + 1 # Add the header line
|
||||
|
||||
# Make sure all header values are strings.
|
||||
for i, header in enumerate(headers):
|
||||
if not isinstance(header, six.string_types):
|
||||
headers[i] = six.text_type(header)
|
||||
|
||||
log.debug("Guessed data_offset as %s", data_offset)
|
||||
log.debug("Guessed headers as: %s", headers)
|
||||
|
||||
row_set.register_processor(messytables.offset_processor(data_offset))
|
||||
|
||||
|
||||
table_data_with_types = parse_data.get_table_data(row_set, len(headers))
|
||||
|
||||
# Identify and remove empty columns, and populate separate metadata and data lists.
|
||||
column_metadata = []
|
||||
table_data = []
|
||||
for col_data, header in zip(table_data_with_types, headers):
|
||||
if not header and all(val == "" for val in col_data["data"]):
|
||||
continue # empty column
|
||||
data = col_data.pop("data")
|
||||
col_data["id"] = header
|
||||
column_metadata.append(col_data)
|
||||
table_data.append(data)
|
||||
|
||||
if not table_data:
|
||||
# Don't add tables with no columns.
|
||||
continue
|
||||
|
||||
log.info("Output table %r with %d columns", table_name, len(column_metadata))
|
||||
for c in column_metadata:
|
||||
log.debug("Output column %s", c)
|
||||
export_list.append({
|
||||
"table_name": table_name,
|
||||
"column_metadata": column_metadata,
|
||||
"table_data": table_data
|
||||
})
|
||||
|
||||
parse_options = {}
|
||||
|
||||
return parse_options, export_list
|
||||
@@ -1,25 +0,0 @@
|
||||
import logging
|
||||
import sandbox
|
||||
|
||||
import import_csv
|
||||
import import_xls
|
||||
import import_json
|
||||
|
||||
def main():
|
||||
s = logging.StreamHandler()
|
||||
s.setFormatter(logging.Formatter(fmt='%(asctime)s.%(msecs)03d %(message)s',
|
||||
datefmt='%Y-%m-%d %H:%M:%S'))
|
||||
rootLogger = logging.getLogger()
|
||||
rootLogger.addHandler(s)
|
||||
rootLogger.setLevel(logging.INFO)
|
||||
|
||||
# Todo: Grist should expose a register method accepting arguments as
|
||||
# follow: register('csv_parser', 'canParse', can_parse)
|
||||
sandbox.register("csv_parser.parseFile", import_csv.parse_file_source)
|
||||
sandbox.register("xls_parser.parseFile", import_xls.import_file)
|
||||
sandbox.register("json_parser.parseFile", import_json.parse_file)
|
||||
|
||||
sandbox.run()
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -1,299 +0,0 @@
|
||||
"""
|
||||
This module implements a way to detect and convert types that's better than messytables (at least
|
||||
in some relevant cases).
|
||||
|
||||
It has a simple interface: get_table_data(row_set) which returns a list of columns, each a
|
||||
dictionary with "type" and "data" fields, where "type" is a Grist type string, and data is a list
|
||||
of values. All "data" lists will have the same length.
|
||||
"""
|
||||
|
||||
import dateguess
|
||||
import datetime
|
||||
import logging
|
||||
import re
|
||||
import messytables
|
||||
import moment # TODO grist internal libraries might not be available to plugins in the future.
|
||||
import dateutil.parser as date_parser
|
||||
import six
|
||||
from six.moves import zip, xrange
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# Typecheck using type(value) instead of isinstance(value, some_type) makes parsing 25% faster
|
||||
# pylint:disable=unidiomatic-typecheck
|
||||
|
||||
|
||||
# Our approach to type detection is different from that of messytables.
|
||||
# We first go through each cell in a sample of rows, trying to convert it to each of the basic
|
||||
# types, and keep a count of successes for each. We use the counts to decide the basic types (e.g.
|
||||
# numeric vs text). Then we go through the full data set converting to the chosen basic type.
|
||||
# During this process, we keep counts of suitable Grist types to consider (e.g. Int vs Numeric).
|
||||
# We use those counts to produce the selected Grist type at the end.
|
||||
|
||||
|
||||
class BaseConverter(object):
|
||||
@classmethod
|
||||
def test(cls, value):
|
||||
try:
|
||||
cls.convert(value)
|
||||
return True
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
@classmethod
|
||||
def convert(cls, value):
|
||||
"""Implement to convert imported value to a basic type."""
|
||||
raise NotImplementedError()
|
||||
|
||||
@classmethod
|
||||
def get_grist_column(cls, values):
|
||||
"""
|
||||
Given an array of values returned successfully by convert(), return a tuple of
|
||||
(grist_type_string, grist_values), where grist_values is an array of values suitable for the
|
||||
returned grist type.
|
||||
"""
|
||||
raise NotImplementedError()
|
||||
|
||||
|
||||
class NumericConverter(BaseConverter):
|
||||
"""Handles numeric values, including Grist types Numeric and Int."""
|
||||
|
||||
# A number matching this is probably an identifier of some sort. Converting it to a float will
|
||||
# lose precision, so it's better not to consider it numeric.
|
||||
_unlikely_float = re.compile(r'\d{17}|^0\d')
|
||||
|
||||
# Integers outside this range will be represented as floats. This is the limit for values that can
|
||||
# be stored in a JS Int32Array.
|
||||
_max_js_int = 1<<31
|
||||
|
||||
# The thousands separator. It should be locale-specific, but we don't currently have a way to
|
||||
# detect locale from the data. (Also, the sandbox's locale module isn't fully functional.)
|
||||
_thousands_sep = ','
|
||||
|
||||
@classmethod
|
||||
def convert(cls, value):
|
||||
if type(value) in six.integer_types + (float, complex):
|
||||
return value
|
||||
if type(value) in (str, six.text_type) and not cls._unlikely_float.search(value):
|
||||
return float(value.strip().lstrip('$').replace(cls._thousands_sep, ""))
|
||||
raise ValueError()
|
||||
|
||||
@classmethod
|
||||
def _is_integer(cls, value):
|
||||
ttype = type(value)
|
||||
if ttype == int or (ttype == float and value.is_integer()):
|
||||
return -cls._max_js_int <= value < cls._max_js_int
|
||||
return False
|
||||
|
||||
@classmethod
|
||||
def get_grist_column(cls, values):
|
||||
if all(cls._is_integer(v) for v in values):
|
||||
return ("Int", [int(v) for v in values])
|
||||
return ("Numeric", values)
|
||||
|
||||
|
||||
class DateParserInfo(date_parser.parserinfo):
|
||||
def validate(self, res):
|
||||
# Avoid this bogus combination which accepts plain numbers.
|
||||
if res.day and not res.month:
|
||||
return False
|
||||
return super(DateParserInfo, self).validate(res)
|
||||
|
||||
|
||||
class SimpleDateTimeConverter(BaseConverter):
|
||||
"""Handles Date and DateTime values which are already instances of datetime.datetime."""
|
||||
|
||||
@classmethod
|
||||
def convert(cls, value):
|
||||
if type(value) is datetime.datetime:
|
||||
return value
|
||||
elif value == "":
|
||||
return None
|
||||
raise ValueError()
|
||||
|
||||
@classmethod
|
||||
def _is_date(cls, value):
|
||||
return value is None or value.time() == datetime.time()
|
||||
|
||||
@classmethod
|
||||
def get_grist_column(cls, values):
|
||||
grist_type = "Date" if all(cls._is_date(v) for v in values) else "DateTime"
|
||||
grist_values = [(v if (v is None) else moment.dt_to_ts(v))
|
||||
for v in values]
|
||||
return grist_type, grist_values
|
||||
|
||||
|
||||
class DateTimeCoverter(BaseConverter):
|
||||
"""Handles dateformats by guessed format."""
|
||||
|
||||
def __init__(self, date_format):
|
||||
self._format = date_format
|
||||
|
||||
def convert(self, value):
|
||||
if value == "":
|
||||
return None
|
||||
if type(value) in (str, six.text_type):
|
||||
# datetime.strptime doesn't handle %z and %Z tags in Python 2.
|
||||
if '%z' in self._format or '%Z' in self._format:
|
||||
return date_parser.parse(value)
|
||||
else:
|
||||
try:
|
||||
return datetime.datetime.strptime(value, self._format)
|
||||
except ValueError:
|
||||
return date_parser.parse(value)
|
||||
|
||||
raise ValueError()
|
||||
|
||||
def _is_date(self, value):
|
||||
return value is None or value.time() == datetime.time()
|
||||
|
||||
def get_grist_column(self, values):
|
||||
grist_type = "Date" if all(self._is_date(v) for v in values) else "DateTime"
|
||||
grist_values = [(v if (v is None) else moment.dt_to_ts(v))
|
||||
for v in values]
|
||||
return grist_type, grist_values
|
||||
|
||||
|
||||
class BoolConverter(BaseConverter):
|
||||
"""Handles Boolean type."""
|
||||
|
||||
_true_values = (1, '1', 'true', 'yes')
|
||||
_false_values = (0, '0', 'false', 'no')
|
||||
|
||||
@classmethod
|
||||
def convert(cls, value):
|
||||
v = value.strip().lower() if type(value) in (str, six.text_type) else value
|
||||
if v in cls._true_values:
|
||||
return True
|
||||
elif v in cls._false_values:
|
||||
return False
|
||||
raise ValueError()
|
||||
|
||||
@classmethod
|
||||
def get_grist_column(cls, values):
|
||||
return ("Bool", values)
|
||||
|
||||
|
||||
class TextConverter(BaseConverter):
|
||||
"""Fallback converter that converts everything to strings."""
|
||||
@classmethod
|
||||
def convert(cls, value):
|
||||
return six.text_type(value)
|
||||
|
||||
@classmethod
|
||||
def get_grist_column(cls, values):
|
||||
return ("Text", values)
|
||||
|
||||
|
||||
class ColumnDetector(object):
|
||||
"""
|
||||
ColumnDetector accepts calls to `add_value()`, and keeps track of successful conversions to
|
||||
different basic types. At the end `get_converter()` method returns the class of the most
|
||||
suitable converter.
|
||||
"""
|
||||
# Converters are listed in the order of preference, which is only used if two converters succeed
|
||||
# on the same exact number of values. Text is always a fallback.
|
||||
converters = [SimpleDateTimeConverter, BoolConverter, NumericConverter]
|
||||
|
||||
# If this many non-junk values or more can't be converted, fall back to text.
|
||||
_text_threshold = 0.10
|
||||
|
||||
# Junk values: these aren't counted when deciding whether to fall back to text.
|
||||
_junk_re = re.compile(r'^\s*(|-+|\?+|n/?a)\s*$', re.I)
|
||||
|
||||
def __init__(self):
|
||||
self._counts = [0] * len(self.converters)
|
||||
self._count_nonjunk = 0
|
||||
self._count_total = 0
|
||||
self._data = []
|
||||
|
||||
def add_value(self, value):
|
||||
self._count_total += 1
|
||||
if value is None or (type(value) in (str, six.text_type) and self._junk_re.match(value)):
|
||||
return
|
||||
|
||||
self._data.append(value)
|
||||
|
||||
self._count_nonjunk += 1
|
||||
for i, conv in enumerate(self.converters):
|
||||
if conv.test(value):
|
||||
self._counts[i] += 1
|
||||
|
||||
def get_converter(self):
|
||||
if sum(self._counts) == 0:
|
||||
# if not already guessed as int, bool or datetime then we should try to guess date pattern
|
||||
str_data = [d for d in self._data if isinstance(d, six.string_types)]
|
||||
data_formats = dateguess.guess_bulk(str_data, error_rate=self._text_threshold)
|
||||
data_format = data_formats[0] if data_formats else None
|
||||
if data_format:
|
||||
return DateTimeCoverter(data_format)
|
||||
|
||||
# We find the max by count, and secondarily by minimum index in the converters list.
|
||||
count, neg_index = max((c, -i) for (i, c) in enumerate(self._counts))
|
||||
if count > 0 and count >= self._count_nonjunk * (1 - self._text_threshold):
|
||||
return self.converters[-neg_index]
|
||||
return TextConverter
|
||||
|
||||
|
||||
def _guess_basic_types(rows, num_columns):
|
||||
column_detectors = [ColumnDetector() for i in xrange(num_columns)]
|
||||
for row in rows:
|
||||
for cell, detector in zip(row, column_detectors):
|
||||
detector.add_value(cell.value)
|
||||
|
||||
return [detector.get_converter() for detector in column_detectors]
|
||||
|
||||
|
||||
class ColumnConverter(object):
|
||||
"""
|
||||
ColumnConverter converts and collects values using the passed-in converter object. At the end
|
||||
`get_grist_column()` method returns a column of converted data.
|
||||
"""
|
||||
def __init__(self, converter):
|
||||
self._converter = converter
|
||||
self._all_col_values = [] # Initially this has None's for converted values
|
||||
self._converted_values = [] # A list of all converted values
|
||||
self._converted_indices = [] # Indices of the converted values into self._all_col_values
|
||||
|
||||
def convert_and_add(self, value):
|
||||
# For some reason, we get 'str' type rather than 'unicode' for empty strings.
|
||||
# Correct this, since all text should be unicode.
|
||||
value = u"" if value == "" else value
|
||||
try:
|
||||
conv = self._converter.convert(value)
|
||||
self._converted_values.append(conv)
|
||||
self._converted_indices.append(len(self._all_col_values))
|
||||
self._all_col_values.append(None)
|
||||
except Exception:
|
||||
self._all_col_values.append(six.text_type(value))
|
||||
|
||||
def get_grist_column(self):
|
||||
"""
|
||||
Returns a dictionary {"type": grist_type, "data": grist_value_array}.
|
||||
"""
|
||||
grist_type, grist_values = self._converter.get_grist_column(self._converted_values)
|
||||
for i, v in zip(self._converted_indices, grist_values):
|
||||
self._all_col_values[i] = v
|
||||
return {"type": grist_type, "data": self._all_col_values}
|
||||
|
||||
|
||||
def get_table_data(row_set, num_columns, num_rows=0):
|
||||
converters = _guess_basic_types(row_set.sample, num_columns)
|
||||
col_converters = [ColumnConverter(c) for c in converters]
|
||||
for num, row in enumerate(row_set):
|
||||
if num_rows and num == num_rows:
|
||||
break
|
||||
|
||||
if num % 10000 == 0:
|
||||
log.info("Processing row %d", num)
|
||||
|
||||
# Make sure we have a value for every column.
|
||||
missing_values = len(converters) - len(row)
|
||||
if missing_values > 0:
|
||||
row.extend([messytables.Cell("")] * missing_values)
|
||||
|
||||
for cell, conv in zip(row, col_converters):
|
||||
conv.convert_and_add(cell.value)
|
||||
|
||||
return [conv.get_grist_column() for conv in col_converters]
|
||||
Binary file not shown.
BIN
plugins/core/sandbox/test/fixtures/test_excel.xlsx
vendored
BIN
plugins/core/sandbox/test/fixtures/test_excel.xlsx
vendored
Binary file not shown.
@@ -1 +0,0 @@
|
||||
int1,int2,textint,bigint,num2,bignum,date1,date2,datetext,datetimetext
|
||||
|
Binary file not shown.
@@ -1,5 +0,0 @@
|
||||
FIRST_NAME,LAST_NAME,PHONE,VALUE,DATE
|
||||
John,Moor,201-343-3434,45,2018-02-27 16:08:39 +0000
|
||||
Tim,Kale,201.343.3434,4545,2018-02-27 16:08:39 +0100
|
||||
Jenny,Jo,2013433434,0,2018-02-27 16:08:39 -0100
|
||||
Lily,Smit,(201)343-3434,4,
|
||||
|
Binary file not shown.
@@ -1,102 +0,0 @@
|
||||
import unittest
|
||||
from dateguess import guess, guess_bulk
|
||||
|
||||
|
||||
class TestGuesser(unittest.TestCase):
|
||||
def assertDate(self, input_str, fmt_list):
|
||||
guessed = guess(input_str)
|
||||
self.assertEqual(set(guessed), set(fmt_list))
|
||||
|
||||
def assertDates(self, input_lst, error_rate, fmt_list):
|
||||
guessed = guess_bulk(input_lst, error_rate=error_rate)
|
||||
self.assertEqual(set(guessed), set(fmt_list))
|
||||
|
||||
def test_guess_dates(self):
|
||||
self.assertDate('', [])
|
||||
self.assertDate("2013-13-13", [])
|
||||
self.assertDate("25/25/1911", [])
|
||||
|
||||
self.assertDate("2014-01-11", ['%Y-%m-%d', '%Y-%d-%m'])
|
||||
self.assertDate("2014-11-01", ['%Y-%m-%d', '%Y-%d-%m'])
|
||||
self.assertDate("1990-05-05", ['%Y-%m-%d', '%Y-%d-%m'])
|
||||
self.assertDate("2013-12-13", ['%Y-%m-%d'])
|
||||
|
||||
self.assertDate("12/31/1999", ['%m/%d/%Y'])
|
||||
self.assertDate("11/11/1911", ['%m/%d/%Y', '%d/%m/%Y'])
|
||||
self.assertDate("5/9/1981", ['%m/%d/%Y', '%d/%m/%Y'])
|
||||
self.assertDate("6/3/1985", ['%m/%d/%Y', '%d/%m/%Y'])
|
||||
|
||||
self.assertDate("12/31/99", ['%m/%d/%y'])
|
||||
self.assertDate("11/11/11", ['%y/%m/%d', '%y/%d/%m', '%m/%d/%y', '%d/%m/%y'])
|
||||
self.assertDate("5/9/81", ['%m/%d/%y', '%d/%m/%y'])
|
||||
self.assertDate("6/3/85", ['%m/%d/%y', '%d/%m/%y'])
|
||||
|
||||
self.assertDate("31.12.91", ['%d.%m.%y'])
|
||||
self.assertDate("4.4.87", ['%m.%d.%y', '%d.%m.%y'])
|
||||
|
||||
self.assertDate("13.2.8", ['%y.%m.%d', '%y.%d.%m'])
|
||||
self.assertDate("31.12.1991", ['%d.%m.%Y'])
|
||||
self.assertDate("4.4.1987", ['%m.%d.%Y', '%d.%m.%Y'])
|
||||
self.assertDate("13.2.2008", ['%d.%m.%Y'])
|
||||
self.assertDate("31.12.91", ['%d.%m.%y'])
|
||||
self.assertDate("4.4.87", ['%m.%d.%y', '%d.%m.%y'])
|
||||
self.assertDate("13.2.8", ['%y.%m.%d', '%y.%d.%m'])
|
||||
|
||||
self.assertDate("9 May 1981", ['%d %b %Y', '%d %B %Y'])
|
||||
self.assertDate("31 Dec 1999", ['%d %b %Y'])
|
||||
self.assertDate("1 Jan 2012", ['%d %b %Y'])
|
||||
self.assertDate("3 August 2009", ['%d %B %Y'])
|
||||
self.assertDate("2 May 1980", ['%d %B %Y', '%d %b %Y'])
|
||||
|
||||
self.assertDate("13/1/2012", ['%d/%m/%Y'])
|
||||
|
||||
self.assertDate("Aug 1st 2014", ['%b %dst %Y'])
|
||||
self.assertDate("12/22/2015 00:00:00.10", ['%m/%d/%Y %H:%M:%S.%f'])
|
||||
|
||||
def test_guess_datetimes(self):
|
||||
self.assertDate("Thu Sep 25 10:36:28 2003", ['%a %b %d %H:%M:%S %Y'])
|
||||
self.assertDate("Thu Sep 25 2003 10:36:28", ['%a %b %d %Y %H:%M:%S'])
|
||||
self.assertDate("10:36:28 Thu Sep 25 2003", ['%H:%M:%S %a %b %d %Y'])
|
||||
|
||||
self.assertDate("2014-01-11T12:21:05", ['%Y-%m-%dT%H:%M:%S', '%Y-%d-%mT%H:%M:%S'])
|
||||
self.assertDate("2015-02-16T16:05:31", ['%Y-%m-%dT%H:%M:%S'])
|
||||
# TODO remove all except first one
|
||||
self.assertDate("2015-02-16T16:05", ['%Y-%m-%dT%H:%M', '%Y-%H-%MT%d:%m',
|
||||
'%Y-%m-%HT%M:%d', '%Y-%d-%HT%M:%m'])
|
||||
self.assertDate("2015-02-16T16", ['%Y-%m-%dT%H', '%Y-%m-%HT%d']) #TODO remove second one
|
||||
|
||||
self.assertDate("Mon Jan 13 9:52:52 am MST 2014", ['%a %b %d %I:%M:%S %p %Z %Y'])
|
||||
self.assertDate("Tue Jan 21 3:30:00 PM EST 2014", ['%a %b %d %I:%M:%S %p %Z %Y'])
|
||||
self.assertDate("Mon Jan 13 09:52:52 MST 2014", ['%a %b %d %H:%M:%S %Z %Y'])
|
||||
self.assertDate("Tue Jan 21 15:30:00 EST 2014", ['%a %b %d %H:%M:%S %Z %Y'])
|
||||
self.assertDate("Mon Jan 13 9:52 am MST 2014", ['%a %b %d %I:%M %p %Z %Y'])
|
||||
self.assertDate("Tue Jan 21 3:30 PM EST 2014", ['%a %b %d %I:%M %p %Z %Y'])
|
||||
|
||||
self.assertDate("2014-01-11T12:21:05", ['%Y-%m-%dT%H:%M:%S', '%Y-%d-%mT%H:%M:%S'])
|
||||
self.assertDate("2015-02-16T16:05:31", ['%Y-%m-%dT%H:%M:%S'])
|
||||
self.assertDate("Thu Sep 25 10:36:28 2003", ['%a %b %d %H:%M:%S %Y'])
|
||||
self.assertDate("10:36:28 Thu Sep 25 2003", ['%H:%M:%S %a %b %d %Y'])
|
||||
|
||||
self.assertDate("2014-01-11T12:21:05+0000", ['%Y-%d-%mT%H:%M:%S%z', '%Y-%m-%dT%H:%M:%S%z'])
|
||||
self.assertDate("2015-02-16T16:05:31-0400", ['%Y-%m-%dT%H:%M:%S%z'])
|
||||
self.assertDate("Thu, 25 Sep 2003 10:49:41 -0300", ['%a, %d %b %Y %H:%M:%S %z'])
|
||||
self.assertDate("Thu, 25 Sep 2003 10:49:41 +0300", ['%a, %d %b %Y %H:%M:%S %z'])
|
||||
|
||||
self.assertDate("2003-09-25T10:49:41", ['%Y-%m-%dT%H:%M:%S'])
|
||||
self.assertDate("2003-09-25T10:49", ['%Y-%m-%dT%H:%M'])
|
||||
|
||||
def test_guess_bulk_dates(self):
|
||||
self.assertDates(["11/11/1911", "25/11/1911", "11/11/1911", "11/11/1911"], 0.0, ['%d/%m/%Y'])
|
||||
self.assertDates(["25/11/1911", "25/25/1911", "11/11/1911", "11/11/1911"], 0.0, [])
|
||||
self.assertDates(["25/11/1911", "25/25/1911", "11/11/1911", "11/11/1911"], 0.5, ['%d/%m/%Y'])
|
||||
|
||||
self.assertDates(["25/11/1911", "25/25/1911", "11/11/1911", "11/11/1911"], 0.1, [])
|
||||
self.assertDates(["23/11/1911", '2004 May 12', "11/11/1911", "11/11/1911"], 0.5, ['%d/%m/%Y'])
|
||||
|
||||
self.assertDates(['2004 May 12', "11/11/1911", "11/11/1911", "23/11/1911"], 0.5, ['%d/%m/%Y'])
|
||||
self.assertDates(['2004 May 12', "11/11/1911", "11/11/1911", "23/11/1911"], 0.0, [])
|
||||
self.assertDates(['12/22/2015', "12/22/2015 1:15pm", "2018-02-27 16:08:39 +0000"], 0.1, [])
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
@@ -1,341 +0,0 @@
|
||||
# This Python file uses the following encoding: utf-8
|
||||
# Run tests with:
|
||||
#
|
||||
# ./sandbox/nacl/bin/sel_ldr -E PYTHONPATH=/grist:/thirdparty -B ./sandbox/nacl/lib/irt_core.nexe -l /dev/null -m ./sandbox/nacl/root:/:ro -m ./plugins/core/sandbox:/sandbox:ro ./sandbox/nacl/lib/runnable-ld.so --library-path /slib /python/bin/python2.7.nexe -m unittest discover -v -s /sandbox #pylint: disable=line-too-long
|
||||
#
|
||||
#
|
||||
# TODO: run test automatically
|
||||
#
|
||||
import math
|
||||
import os
|
||||
import textwrap
|
||||
import unittest
|
||||
from six import BytesIO, text_type
|
||||
import csv
|
||||
import calendar
|
||||
import datetime
|
||||
|
||||
import import_csv
|
||||
|
||||
|
||||
def _get_fixture(filename):
|
||||
return os.path.join(os.path.dirname(__file__), "test/fixtures", filename)
|
||||
|
||||
|
||||
def bytes_io_from_str(string):
|
||||
if isinstance(string, text_type):
|
||||
string = string.encode("utf8")
|
||||
return BytesIO(string)
|
||||
|
||||
|
||||
class TestImportCSV(unittest.TestCase):
|
||||
|
||||
def _check_col(self, sheet, index, name, typename, values):
|
||||
self.assertEqual(sheet["column_metadata"][index]["id"], name)
|
||||
self.assertEqual(sheet["column_metadata"][index]["type"], typename)
|
||||
self.assertEqual(sheet["table_data"][index], values)
|
||||
|
||||
def _check_num_cols(self, sheet, exp_cols):
|
||||
self.assertEqual(len(sheet["column_metadata"]), exp_cols)
|
||||
self.assertEqual(len(sheet["table_data"]), exp_cols)
|
||||
|
||||
|
||||
def test_csv_types(self):
|
||||
parsed_file = import_csv.parse_file(_get_fixture('test_excel_types.csv'), parse_options='')
|
||||
sheet = parsed_file[1][0]
|
||||
|
||||
self._check_col(sheet, 0, "int1", "Int", [-1234123, '', ''])
|
||||
self._check_col(sheet, 1, "int2", "Int", [5, '', ''])
|
||||
self._check_col(sheet, 2, "textint", "Text", ["12345678902345689", '', ''])
|
||||
self._check_col(sheet, 3, "bigint", "Text", ["320150170634561830", '', ''])
|
||||
self._check_col(sheet, 4, "num2", "Numeric", [123456789.123456, '', ''])
|
||||
self._check_col(sheet, 5, "bignum", "Numeric", [7.22597e+86, '', ''])
|
||||
self._check_col(sheet, 6, "date1", "DateTime",
|
||||
[calendar.timegm(datetime.datetime(2015, 12, 22, 11, 59, 00).timetuple()), None, None])
|
||||
self._check_col(sheet, 7, "date2", "Date",
|
||||
[calendar.timegm(datetime.datetime(2015, 12, 20, 0, 0, 0).timetuple()), None, None])
|
||||
self._check_col(sheet, 8, "datetext", "Date",
|
||||
[calendar.timegm(datetime.date(2015, 12, 22).timetuple()), None, None])
|
||||
self._check_col(sheet, 9, "datetimetext", "DateTime",
|
||||
[calendar.timegm(datetime.datetime(2015, 12, 22, 0, 0, 0).timetuple()),
|
||||
calendar.timegm(datetime.datetime(2015, 12, 22, 13, 15, 0).timetuple()),
|
||||
calendar.timegm(datetime.datetime(2018, 2, 27, 16, 8, 39).timetuple())])
|
||||
|
||||
|
||||
def test_user_parse_options(self):
|
||||
options = {u'parse_options': {"escapechar": None, "include_col_names_as_headers": True,
|
||||
"lineterminator": "\n", "skipinitialspace": False,
|
||||
"limit_rows": False, "quoting": 0, "start_with_row": 1,
|
||||
"delimiter": ",", "NUM_ROWS":10,
|
||||
"quotechar": "\"", "doublequote":True}}
|
||||
parsed_file = import_csv.parse_file(_get_fixture('test_import_csv.csv'),
|
||||
**options)[1][0]
|
||||
self._check_num_cols(parsed_file, 5)
|
||||
self._check_col(parsed_file, 0, "FIRST_NAME", "Text", ['John', 'Tim', 'Jenny', 'Lily'])
|
||||
self._check_col(parsed_file, 1, "LAST_NAME", "Text", ['Moor', 'Kale', 'Jo', 'Smit'])
|
||||
self._check_col(parsed_file, 2, "PHONE", "Text", ['201-343-3434', '201.343.3434',
|
||||
'2013433434', '(201)343-3434'])
|
||||
self._check_col(parsed_file, 3, "VALUE", "Int", [45, 4545, 0, 4])
|
||||
self._check_col(parsed_file, 4, "DATE", "DateTime", [1519747719.0, 1519744119.0, 1519751319.0, None])
|
||||
|
||||
def test_wrong_cols1(self):
|
||||
file_obj = bytes_io_from_str(textwrap.dedent(
|
||||
"""\
|
||||
name1, name2, name3
|
||||
a1,b1,c1
|
||||
a2,b2
|
||||
a3
|
||||
"""))
|
||||
|
||||
parsed_file = import_csv._parse_open_file(file_obj, parse_options={})[1][0]
|
||||
self._check_num_cols(parsed_file, 3)
|
||||
self._check_col(parsed_file, 0, "name1", "Text", ["a1", "a2", "a3"])
|
||||
self._check_col(parsed_file, 1, "name2", "Text", ["b1", "b2", ""])
|
||||
self._check_col(parsed_file, 2, "name3", "Text", ["c1", "", ""])
|
||||
|
||||
def test_wrong_cols2(self):
|
||||
file_obj = bytes_io_from_str(textwrap.dedent(
|
||||
"""\
|
||||
name1
|
||||
a1,b1
|
||||
a2,b2,c2
|
||||
"""))
|
||||
|
||||
parsed_file = import_csv._parse_open_file(file_obj, parse_options={})[1][0]
|
||||
self._check_num_cols(parsed_file, 3)
|
||||
self._check_col(parsed_file, 0, "name1", "Text", ["a1", "a2"])
|
||||
self._check_col(parsed_file, 1, "", "Text", ["b1", "b2"])
|
||||
self._check_col(parsed_file, 2, "", "Text", ["", "c2"])
|
||||
|
||||
def test_offset(self):
|
||||
file_obj = bytes_io_from_str(textwrap.dedent(
|
||||
"""\
|
||||
,,,,,,,
|
||||
name1,name2,name3
|
||||
a1,b1,c1
|
||||
a2,b2,c2
|
||||
a3,b3,c3,d4
|
||||
"""))
|
||||
|
||||
parsed_file = import_csv._parse_open_file(file_obj, parse_options={})[1][0]
|
||||
self._check_num_cols(parsed_file, 4)
|
||||
self._check_col(parsed_file, 0, "name1", "Text", ["a1", "a2", "a3"])
|
||||
self._check_col(parsed_file, 1, "name2", "Text", ["b1", "b2", "b3"])
|
||||
self._check_col(parsed_file, 2, "name3", "Text", ["c1", "c2", "c3"])
|
||||
self._check_col(parsed_file, 3, "", "Text", ["", "", "d4"])
|
||||
|
||||
def test_offset_no_header(self):
|
||||
file_obj = bytes_io_from_str(textwrap.dedent(
|
||||
"""\
|
||||
4,b1,c1
|
||||
4,b2,c2
|
||||
4,b3,c3
|
||||
"""))
|
||||
|
||||
parsed_file = import_csv._parse_open_file(file_obj, parse_options={})[1][0]
|
||||
self._check_num_cols(parsed_file, 3)
|
||||
self._check_col(parsed_file, 0, "", "Int", [4, 4, 4])
|
||||
self._check_col(parsed_file, 1, "", "Text", ["b1", "b2", "b3"])
|
||||
self._check_col(parsed_file, 2, "", "Text", ["c1", "c2", "c3"])
|
||||
|
||||
def test_empty_headers(self):
|
||||
file_obj = bytes_io_from_str(textwrap.dedent(
|
||||
"""\
|
||||
,,-,-
|
||||
b,a,a,a,a
|
||||
b,a,a,a,a
|
||||
b,a,a,a,a
|
||||
"""))
|
||||
|
||||
parsed_file = import_csv._parse_open_file(file_obj, parse_options={})[1][0]
|
||||
self._check_num_cols(parsed_file, 5)
|
||||
self._check_col(parsed_file, 0, "", "Text", ["b", "b", "b"])
|
||||
self._check_col(parsed_file, 1, "", "Text", ["a", "a", "a"])
|
||||
self._check_col(parsed_file, 2, "-", "Text", ["a", "a", "a"])
|
||||
self._check_col(parsed_file, 3, "-", "Text", ["a", "a", "a"])
|
||||
self._check_col(parsed_file, 4, "", "Text", ["a", "a", "a"])
|
||||
|
||||
file_obj = bytes_io_from_str(textwrap.dedent(
|
||||
"""\
|
||||
-,-,-,-,-,-
|
||||
b,a,a,a,a
|
||||
b,a,a,a,a
|
||||
b,a,a,a,a
|
||||
"""))
|
||||
|
||||
parsed_file = import_csv._parse_open_file(file_obj, parse_options={})[1][0]
|
||||
self._check_num_cols(parsed_file, 6)
|
||||
self._check_col(parsed_file, 0, "-", "Text", ["b", "b", "b"])
|
||||
self._check_col(parsed_file, 1, "-", "Text", ["a", "a", "a"])
|
||||
self._check_col(parsed_file, 2, "-", "Text", ["a", "a", "a"])
|
||||
self._check_col(parsed_file, 3, "-", "Text", ["a", "a", "a"])
|
||||
self._check_col(parsed_file, 4, "-", "Text", ["a", "a", "a"])
|
||||
self._check_col(parsed_file, 5, "-", "Text", ["", "", ""])
|
||||
|
||||
def test_guess_missing_user_option(self):
|
||||
file_obj = bytes_io_from_str(textwrap.dedent(
|
||||
"""\
|
||||
name1,;name2,;name3
|
||||
a1,;b1,;c1
|
||||
a2,;b2,;c2
|
||||
a3,;b3,;c3
|
||||
"""))
|
||||
parse_options = {"delimiter": ';',
|
||||
"escapechar": None,
|
||||
"lineterminator": '\r\n',
|
||||
"quotechar": '"',
|
||||
"quoting": csv.QUOTE_MINIMAL}
|
||||
|
||||
parsed_file = import_csv._parse_open_file(file_obj, parse_options=parse_options)[1][0]
|
||||
self._check_num_cols(parsed_file, 3)
|
||||
self._check_col(parsed_file, 0, "name1,", "Text", ["a1,", "a2,", "a3,"])
|
||||
self._check_col(parsed_file, 1, "name2,", "Text", ["b1,", "b2,", "b3,"])
|
||||
self._check_col(parsed_file, 2, "name3", "Text", ["c1", "c2", "c3"])
|
||||
|
||||
# Sniffer detects delimiters in order [',', '\t', ';', ' ', ':'],
|
||||
# so for this file_obj it will be ','
|
||||
parsed_file = import_csv._parse_open_file(file_obj, parse_options={})[1][0]
|
||||
self._check_num_cols(parsed_file, 3)
|
||||
self._check_col(parsed_file, 0, "name1", "Text", ["a1", "a2", "a3"])
|
||||
self._check_col(parsed_file, 1, ";name2", "Text", [";b1", ";b2", ";b3"])
|
||||
self._check_col(parsed_file, 2, ";name3", "Text", [";c1", ";c2", ";c3"])
|
||||
|
||||
def test_one_line_file_no_header(self):
|
||||
file_obj = bytes_io_from_str(textwrap.dedent(
|
||||
"""\
|
||||
2,name2,name3
|
||||
"""))
|
||||
|
||||
parsed_file = import_csv._parse_open_file(file_obj, parse_options={})[1][0]
|
||||
self._check_num_cols(parsed_file, 3)
|
||||
self._check_col(parsed_file, 0, "", "Int", [2])
|
||||
self._check_col(parsed_file, 1, "", "Text", ["name2"])
|
||||
self._check_col(parsed_file, 2, "", "Text", ["name3"])
|
||||
|
||||
def test_one_line_file_with_header(self):
|
||||
file_obj = bytes_io_from_str(textwrap.dedent(
|
||||
"""\
|
||||
name1,name2,name3
|
||||
"""))
|
||||
|
||||
parsed_file = import_csv._parse_open_file(file_obj, parse_options={})[1][0]
|
||||
self._check_num_cols(parsed_file, 3)
|
||||
self._check_col(parsed_file, 0, "name1", "Text", [])
|
||||
self._check_col(parsed_file, 1, "name2", "Text", [])
|
||||
self._check_col(parsed_file, 2, "name3", "Text", [])
|
||||
|
||||
def test_empty_file(self):
|
||||
file_obj = bytes_io_from_str(textwrap.dedent(
|
||||
"""\
|
||||
"""))
|
||||
|
||||
parsed_file = import_csv._parse_open_file(file_obj, parse_options={})
|
||||
self.assertEqual(parsed_file, ({}, []))
|
||||
|
||||
def test_option_num_rows(self):
|
||||
file_obj = bytes_io_from_str(textwrap.dedent(
|
||||
"""\
|
||||
name1,name2,name3
|
||||
a1,b1,c1
|
||||
a2,b2,c2
|
||||
a3,b3,c3
|
||||
"""))
|
||||
|
||||
parse_options = {}
|
||||
parsed_file = import_csv._parse_open_file(file_obj, parse_options=parse_options)[1][0]
|
||||
self._check_num_cols(parsed_file, 3)
|
||||
self._check_col(parsed_file, 0, "name1", "Text", ['a1', 'a2', 'a3'])
|
||||
self._check_col(parsed_file, 1, "name2", "Text", ['b1', 'b2', 'b3'])
|
||||
self._check_col(parsed_file, 2, "name3", "Text", ['c1', 'c2', 'c3'])
|
||||
|
||||
parse_options = {"NUM_ROWS": 2}
|
||||
parsed_file = import_csv._parse_open_file(file_obj, parse_options=parse_options)[1][0]
|
||||
self._check_num_cols(parsed_file, 3)
|
||||
self._check_col(parsed_file, 0, "name1", "Text", ["a1", "a2"])
|
||||
self._check_col(parsed_file, 1, "name2", "Text", ["b1", "b2"])
|
||||
self._check_col(parsed_file, 2, "name3", "Text", ["c1", "c2"])
|
||||
|
||||
parse_options = {"NUM_ROWS": 10}
|
||||
parsed_file = import_csv._parse_open_file(file_obj, parse_options=parse_options)[1][0]
|
||||
self._check_num_cols(parsed_file, 3)
|
||||
self._check_col(parsed_file, 0, "name1", "Text", ['a1', 'a2', 'a3'])
|
||||
self._check_col(parsed_file, 1, "name2", "Text", ['b1', 'b2', 'b3'])
|
||||
self._check_col(parsed_file, 2, "name3", "Text", ['c1', 'c2', 'c3'])
|
||||
|
||||
def test_option_num_rows_no_header(self):
|
||||
file_obj = bytes_io_from_str(textwrap.dedent(
|
||||
"""\
|
||||
,,
|
||||
,,
|
||||
a1,1,c1
|
||||
a2,2,c2
|
||||
a3,3,c3
|
||||
"""))
|
||||
|
||||
parse_options = {}
|
||||
parsed_file = import_csv._parse_open_file(file_obj, parse_options=parse_options)[1][0]
|
||||
self._check_num_cols(parsed_file, 3)
|
||||
self._check_col(parsed_file, 0, "", "Text", ['a1', 'a2', 'a3'])
|
||||
self._check_col(parsed_file, 1, "", "Int", [1, 2, 3])
|
||||
self._check_col(parsed_file, 2, "", "Text", ['c1', 'c2', 'c3'])
|
||||
|
||||
parse_options = {"NUM_ROWS": 2}
|
||||
parsed_file = import_csv._parse_open_file(file_obj, parse_options=parse_options)[1][0]
|
||||
self._check_num_cols(parsed_file, 3)
|
||||
self._check_col(parsed_file, 0, "", "Text", ['a1', 'a2'])
|
||||
self._check_col(parsed_file, 1, "", "Int", [1, 2])
|
||||
self._check_col(parsed_file, 2, "", "Text", ['c1', 'c2'])
|
||||
|
||||
def test_option_use_col_name_as_header(self):
|
||||
file_obj = bytes_io_from_str(textwrap.dedent(
|
||||
"""\
|
||||
name1,name2,name3
|
||||
a1,1,c1
|
||||
a2,2,c2
|
||||
a3,3,c3
|
||||
"""))
|
||||
|
||||
parse_options = {"include_col_names_as_headers": False}
|
||||
parsed_file = import_csv._parse_open_file(file_obj, parse_options=parse_options)[1][0]
|
||||
self._check_num_cols(parsed_file, 3)
|
||||
self._check_col(parsed_file, 0, "", "Text", ["name1", "a1", "a2", "a3"])
|
||||
self._check_col(parsed_file, 1, "", "Text", ["name2", "1", "2", "3"])
|
||||
self._check_col(parsed_file, 2, "", "Text", ["name3", "c1", "c2", "c3"])
|
||||
|
||||
parse_options = {"include_col_names_as_headers": True}
|
||||
parsed_file = import_csv._parse_open_file(file_obj, parse_options=parse_options)[1][0]
|
||||
self._check_num_cols(parsed_file, 3)
|
||||
self._check_col(parsed_file, 0, "name1", "Text", ["a1", "a2", "a3"])
|
||||
self._check_col(parsed_file, 1, "name2", "Int", [1, 2, 3])
|
||||
self._check_col(parsed_file, 2, "name3", "Text", ["c1", "c2", "c3"])
|
||||
|
||||
def test_option_use_col_name_as_header_no_headers(self):
|
||||
file_obj = bytes_io_from_str(textwrap.dedent(
|
||||
"""\
|
||||
,,,
|
||||
,,,
|
||||
n1,2,n3
|
||||
a1,1,c1,d1
|
||||
a2,4,c2
|
||||
a3,5,c3
|
||||
"""))
|
||||
|
||||
parse_options = {"include_col_names_as_headers": False}
|
||||
parsed_file = import_csv._parse_open_file(file_obj, parse_options=parse_options)[1][0]
|
||||
self._check_num_cols(parsed_file, 4)
|
||||
self._check_col(parsed_file, 0, "", "Text", ["n1", "a1", "a2", "a3"])
|
||||
self._check_col(parsed_file, 1, "", "Int", [2, 1, 4, 5])
|
||||
self._check_col(parsed_file, 2, "", "Text", ["n3", "c1", "c2", "c3"])
|
||||
self._check_col(parsed_file, 3, "", "Text", ["", "d1", "", ""])
|
||||
|
||||
parse_options = {"include_col_names_as_headers": True}
|
||||
parsed_file = import_csv._parse_open_file(file_obj, parse_options=parse_options)[1][0]
|
||||
self._check_num_cols(parsed_file, 4)
|
||||
self._check_col(parsed_file, 0, "n1", "Text", ["a1", "a2", "a3"])
|
||||
self._check_col(parsed_file, 1, "2", "Int", [1, 4, 5])
|
||||
self._check_col(parsed_file, 2, "n3", "Text", ["c1", "c2", "c3"])
|
||||
self._check_col(parsed_file, 3, "", "Text", [ "d1", "", ""])
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
@@ -1,259 +0,0 @@
|
||||
from unittest import TestCase
|
||||
import import_json
|
||||
|
||||
class TestImportJSON(TestCase):
|
||||
|
||||
maxDiff = None
|
||||
|
||||
def test_simple_json_array(self):
|
||||
grist_tables = import_json.dumps([{'a': 1, 'b': 'baba'}, {'a': 4, 'b': 'abab'}], '')
|
||||
self.assertEqual(grist_tables['tables'], [{
|
||||
'column_metadata': [
|
||||
{'id': 'a', 'type': 'Int'}, {'id': 'b', 'type': 'Text'}],
|
||||
'table_data': [[1, 4], ['baba', 'abab']],
|
||||
'table_name': ''
|
||||
}])
|
||||
|
||||
def test_missing_data(self):
|
||||
grist_tables = import_json.dumps([{'a': 1}, {'b': 'abab'}, {'a': 4}])
|
||||
self.assertEqual(grist_tables['tables'], [{
|
||||
'column_metadata': [
|
||||
{'id': 'a', 'type': 'Int'}, {'id': 'b', 'type': 'Text'}],
|
||||
'table_data': [[1, None, 4], [None, 'abab', None]],
|
||||
'table_name': ''
|
||||
}])
|
||||
|
||||
def test_even_more_simple_array(self):
|
||||
self.assertEqual(
|
||||
import_json.dumps(['apple', 'pear', 'banana'], '')['tables'],
|
||||
[{
|
||||
'column_metadata': [
|
||||
{'id': '', 'type': 'Text'}],
|
||||
'table_data': [['apple', 'pear', 'banana']],
|
||||
'table_name': ''
|
||||
}])
|
||||
|
||||
def test_mixing_simple_and_even_more_simple(self):
|
||||
self.assertEqual(
|
||||
import_json.dumps(['apple', 'pear', {'a': 'some cucumbers'}, 'banana'], '')['tables'],
|
||||
[{
|
||||
'column_metadata': [
|
||||
{'id': '', 'type': 'Text'},
|
||||
{'id': 'a', 'type': 'Text'}],
|
||||
'table_data': [['apple', 'pear', None, 'banana'], [None, None, 'some cucumbers', None]],
|
||||
'table_name': ''
|
||||
}])
|
||||
|
||||
def test_array_with_reference(self):
|
||||
# todo: reference should follow Grist's format
|
||||
self.assertEqual(
|
||||
import_json.dumps([{'a': {'b': 2}, 'c': 'foo'}], 'Hello')['tables'],
|
||||
[{
|
||||
'column_metadata': [
|
||||
{'id': 'a', 'type': 'Ref:Hello_a'}, {'id': 'c', 'type': 'Text'}
|
||||
],
|
||||
'table_data': [[1], ['foo']],
|
||||
'table_name': 'Hello'
|
||||
}, {
|
||||
'column_metadata': [
|
||||
{'id': 'b', 'type': 'Int'}
|
||||
],
|
||||
'table_data': [[2]],
|
||||
'table_name': 'Hello_a'
|
||||
}])
|
||||
|
||||
def test_nested_nested_object(self):
|
||||
self.assertEqual(
|
||||
import_json.dumps([{'a': {'b': 2, 'd': {'a': 'sugar'}}, 'c': 'foo'}], 'Hello')['tables'],
|
||||
[{
|
||||
'column_metadata': [
|
||||
{'id': 'a', 'type': 'Ref:Hello_a'}, {'id': 'c', 'type': 'Text'}
|
||||
],
|
||||
'table_data': [[1], ['foo']],
|
||||
'table_name': 'Hello'
|
||||
}, {
|
||||
'column_metadata': [
|
||||
{'id': 'b', 'type': 'Int'}, {'id': 'd', 'type': 'Ref:Hello_a_d'}
|
||||
],
|
||||
'table_data': [[2], [1]],
|
||||
'table_name': 'Hello_a'
|
||||
}, {
|
||||
'column_metadata': [
|
||||
{'id': 'a', 'type': 'Text'}
|
||||
],
|
||||
'table_data': [['sugar']],
|
||||
'table_name': 'Hello_a_d'
|
||||
}])
|
||||
|
||||
|
||||
def test_array_with_list(self):
|
||||
self.assertEqual(
|
||||
import_json.dumps([{'a': ['ES', 'FR', 'US']}, {'a': ['FR']}], 'Hello')['tables'],
|
||||
[{
|
||||
'column_metadata': [],
|
||||
'table_data': [],
|
||||
'table_name': 'Hello'
|
||||
}, {
|
||||
'column_metadata': [{'id': '', 'type': 'Text'}, {'id': 'Hello', 'type': 'Ref:Hello'}],
|
||||
'table_data': [['ES', 'FR', 'US', 'FR'], [1, 1, 1, 2]],
|
||||
'table_name': 'Hello_a'
|
||||
}])
|
||||
|
||||
def test_array_with_list_of_dict(self):
|
||||
self.assertEqual(
|
||||
import_json.dumps([{'a': [{'b': 1}, {'b': 4}]}, {'c': 2}], 'Hello')['tables'],
|
||||
[ {
|
||||
'column_metadata': [{'id': 'c', 'type': 'Int'}],
|
||||
'table_data': [[None, 2]],
|
||||
'table_name': 'Hello'
|
||||
}, {
|
||||
'column_metadata': [
|
||||
{'id': 'b', 'type': 'Int'},
|
||||
{'id': 'Hello', 'type': 'Ref:Hello'}
|
||||
],
|
||||
'table_data': [[1, 4], [1, 1]],
|
||||
'table_name': 'Hello_a'
|
||||
}])
|
||||
|
||||
|
||||
def test_array_of_array(self):
|
||||
self.assertEqual(
|
||||
import_json.dumps([['FR', 'US'], ['ES', 'CH']], 'Hello')['tables'],
|
||||
[{
|
||||
'column_metadata': [],
|
||||
'table_data': [],
|
||||
'table_name': 'Hello'
|
||||
}, {
|
||||
'column_metadata': [{'id': '', 'type': 'Text'}, {'id': 'Hello', 'type': 'Ref:Hello'}],
|
||||
'table_data': [['FR', 'US', 'ES', 'CH'], [1, 1, 2, 2]],
|
||||
'table_name': 'Hello_'
|
||||
}, ])
|
||||
|
||||
|
||||
def test_json_dict(self):
|
||||
self.assertEqual(
|
||||
import_json.dumps({
|
||||
'foo': [{'a': 1, 'b': 'santa'}, {'a': 4, 'b': 'cats'}],
|
||||
'bar': [{'c': 2, 'd': 'ducks'}, {'c': 5, 'd': 'dogs'}],
|
||||
'status': {'success': True, 'time': '5s'}
|
||||
}, 'Hello')['tables'], [{
|
||||
'table_name': 'Hello',
|
||||
'column_metadata': [{'id': 'status', 'type': 'Ref:Hello_status'}],
|
||||
'table_data': [[1]]
|
||||
}, {
|
||||
'table_name': 'Hello_bar',
|
||||
'column_metadata': [
|
||||
{'id': 'c', 'type': 'Int'},
|
||||
{'id': 'd', 'type': 'Text'},
|
||||
{'id': 'Hello', 'type': 'Ref:Hello'}
|
||||
],
|
||||
'table_data': [[2, 5], ['ducks', 'dogs'], [1, 1]]
|
||||
}, {
|
||||
'table_name': 'Hello_foo',
|
||||
'column_metadata': [
|
||||
{'id': 'a', 'type': 'Int'},
|
||||
{'id': 'b', 'type': 'Text'},
|
||||
{'id': 'Hello', 'type': 'Ref:Hello'}],
|
||||
'table_data': [[1, 4], ['santa', 'cats'], [1, 1]]
|
||||
}, {
|
||||
'table_name': 'Hello_status',
|
||||
'column_metadata': [
|
||||
{'id': 'success', 'type': 'Bool'},
|
||||
{'id': 'time', 'type': 'Text'}
|
||||
],
|
||||
'table_data': [[True], ['5s']]
|
||||
}])
|
||||
|
||||
def test_json_types(self):
|
||||
self.assertEqual(import_json.dumps({
|
||||
'a': 3, 'b': 3.14, 'c': True, 'd': 'name', 'e': -4, 'f': '3.14', 'g': None
|
||||
}, 'Hello')['tables'],
|
||||
[{
|
||||
'table_name': 'Hello',
|
||||
'column_metadata': [
|
||||
{'id': 'a', 'type': 'Int'},
|
||||
{'id': 'b', 'type': 'Numeric'},
|
||||
{'id': 'c', 'type': 'Bool'},
|
||||
{'id': 'd', 'type': 'Text'},
|
||||
{'id': 'e', 'type': 'Int'},
|
||||
{'id': 'f', 'type': 'Text'},
|
||||
{'id': 'g', 'type': 'Text'}
|
||||
],
|
||||
'table_data': [[3], [3.14], [True], ['name'], [-4], ['3.14'], [None]]
|
||||
}])
|
||||
|
||||
def test_type_is_defined_with_first_value(self):
|
||||
tables = import_json.dumps([{'a': 'some text'}, {'a': 3}], '')
|
||||
self.assertIsNotNone(tables['tables'])
|
||||
self.assertIsNotNone(tables['tables'][0])
|
||||
self.assertIsNotNone(tables['tables'][0]['column_metadata'])
|
||||
self.assertIsNotNone(tables['tables'][0]['column_metadata'][0])
|
||||
self.assertEqual(tables['tables'][0]['column_metadata'][0]['type'], 'Text')
|
||||
|
||||
def test_first_unique_key(self):
|
||||
self.assertEqual(import_json.first_available_key({'a': 1}, 'a'), 'a2')
|
||||
self.assertEqual(import_json.first_available_key({'a': 1}, 'b'), 'b')
|
||||
self.assertEqual(import_json.first_available_key({'a': 1, 'a2': 1}, 'a'), 'a3')
|
||||
|
||||
|
||||
def dump_tables(options):
|
||||
data = {
|
||||
"foos": [
|
||||
{'foo': 1, 'link': [1, 2]},
|
||||
{'foo': 2, 'link': [1, 2]}
|
||||
],
|
||||
"bar": {'hi': 'santa'}
|
||||
}
|
||||
return [t for t in import_json.dumps(data, 'FooBar', options)['tables']]
|
||||
|
||||
|
||||
class TestParseOptions(TestCase):
|
||||
|
||||
maxDiff = None
|
||||
|
||||
# helpers
|
||||
def assertColInTable(self, tables, **kwargs):
|
||||
table = next(t for t in tables if t['table_name'] == kwargs['table_name'])
|
||||
self.assertEqual(any(col['id'] == kwargs['col_id'] for col in table['column_metadata']),
|
||||
kwargs['present'])
|
||||
|
||||
def assertTableNamesEqual(self, tables, expected_table_names):
|
||||
table_names = [t['table_name'] for t in tables]
|
||||
self.assertEqual(sorted(table_names), sorted(expected_table_names))
|
||||
|
||||
def test_including_empty_string_includes_all(self):
|
||||
tables = dump_tables({'includes': '', 'excludes': ''})
|
||||
self.assertTableNamesEqual(tables, ['FooBar', 'FooBar_bar', 'FooBar_foos', 'FooBar_foos_link'])
|
||||
|
||||
def test_including_foos_includes_nested_object_and_removes_ref_to_table_not_included(self):
|
||||
tables = dump_tables({'includes': 'FooBar_foos', 'excludes': ''})
|
||||
self.assertTableNamesEqual(tables, ['FooBar_foos', 'FooBar_foos_link'])
|
||||
self.assertColInTable(tables, table_name='FooBar_foos', col_id='FooBar', present=False)
|
||||
tables = dump_tables({'includes': 'FooBar_foos_link', 'excludes': ''})
|
||||
self.assertTableNamesEqual(tables, ['FooBar_foos_link'])
|
||||
self.assertColInTable(tables, table_name='FooBar_foos_link', col_id='FooBar_foos',
|
||||
present=False)
|
||||
|
||||
def test_excluding_foos_excludes_nested_object_and_removes_link_to_excluded_table(self):
|
||||
tables = dump_tables({'includes': '', 'excludes': 'FooBar_foos'})
|
||||
self.assertTableNamesEqual(tables, ['FooBar', 'FooBar_bar'])
|
||||
self.assertColInTable(tables, table_name='FooBar', col_id='foos', present=False)
|
||||
|
||||
def test_excludes_works_on_nested_object_that_are_included(self):
|
||||
tables = dump_tables({'includes': 'FooBar_foos', 'excludes': 'FooBar_foos_link'})
|
||||
self.assertTableNamesEqual(tables, ['FooBar_foos'])
|
||||
|
||||
def test_excludes_works_on_property(self):
|
||||
tables = dump_tables({'includes': '', 'excludes': 'FooBar_foos_foo'})
|
||||
self.assertTableNamesEqual(tables, ['FooBar', 'FooBar_foos', 'FooBar_foos_link', 'FooBar_bar'])
|
||||
self.assertColInTable(tables, table_name='FooBar_foos', col_id='foo', present=False)
|
||||
|
||||
def test_works_with_multiple_includes(self):
|
||||
tables = dump_tables({'includes': 'FooBar_foos_link', 'excludes': ''})
|
||||
self.assertTableNamesEqual(tables, ['FooBar_foos_link'])
|
||||
tables = dump_tables({'includes': 'FooBar_foos_link;FooBar_bar', 'excludes': ''})
|
||||
self.assertTableNamesEqual(tables, ['FooBar_bar', 'FooBar_foos_link'])
|
||||
|
||||
def test_works_with_multiple_excludes(self):
|
||||
tables = dump_tables({'includes': '', 'excludes': 'FooBar_foos_link;FooBar_bar'})
|
||||
self.assertTableNamesEqual(tables, ['FooBar', 'FooBar_foos'])
|
||||
@@ -1,160 +0,0 @@
|
||||
# This Python file uses the following encoding: utf-8
|
||||
import calendar
|
||||
import datetime
|
||||
import math
|
||||
import os
|
||||
import unittest
|
||||
|
||||
import import_xls
|
||||
|
||||
def _get_fixture(filename):
|
||||
return [os.path.join(os.path.dirname(__file__), "test/fixtures", filename), filename]
|
||||
|
||||
|
||||
class TestImportXLS(unittest.TestCase):
|
||||
|
||||
def _check_col(self, sheet, index, name, typename, values):
|
||||
self.assertEqual(sheet["column_metadata"][index]["id"], name)
|
||||
self.assertEqual(sheet["column_metadata"][index]["type"], typename)
|
||||
self.assertEqual(sheet["table_data"][index], values)
|
||||
|
||||
def test_excel(self):
|
||||
parsed_file = import_xls.parse_file(*_get_fixture('test_excel.xlsx'))
|
||||
|
||||
# check that column type was correctly set to int and values are properly parsed
|
||||
self.assertEqual(parsed_file[1][0]["column_metadata"][0], {"type": "Int", "id": "numbers"})
|
||||
self.assertEqual(parsed_file[1][0]["table_data"][0], [1, 2, 3, 4, 5, 6, 7, 8])
|
||||
|
||||
# check that column type was correctly set to text and values are properly parsed
|
||||
self.assertEqual(parsed_file[1][0]["column_metadata"][1], {"type": "Text", "id": "letters"})
|
||||
self.assertEqual(parsed_file[1][0]["table_data"][1],
|
||||
["a", "b", "c", "d", "e", "f", "g", "h"])
|
||||
|
||||
# messy tables does not support bool types yet, it classifies them as ints
|
||||
self.assertEqual(parsed_file[1][0]["column_metadata"][2], {"type": "Bool", "id": "boolean"})
|
||||
self.assertEqual(parsed_file[1][False]["table_data"][2],
|
||||
[True, False, True, False, True, False, True, False])
|
||||
|
||||
# check that column type was correctly set to text and values are properly parsed
|
||||
self.assertEqual(parsed_file[1][0]["column_metadata"][3],
|
||||
{"type": "Text", "id": "corner-cases"})
|
||||
self.assertEqual(parsed_file[1][0]["table_data"][3],
|
||||
# The type is detected as text, so all values should be text.
|
||||
[u'=function()', '3.0', u'two spaces after ',
|
||||
u' two spaces before', u'!@#$', u'€€€', u'√∫abc$$', u'line\nbreak'])
|
||||
|
||||
# check that multiple tables are created when there are multiple sheets in a document
|
||||
self.assertEqual(parsed_file[1][0]["table_name"], u"Sheet1")
|
||||
self.assertEqual(parsed_file[1][1]["table_name"], u"Sheet2")
|
||||
self.assertEqual(parsed_file[1][1]["table_data"][0], ["a", "b", "c", "d"])
|
||||
|
||||
def test_excel_types(self):
  """Check that each column of the typed fixture sheet gets the expected type and values."""
  parsed_file = import_xls.parse_file(*_get_fixture('test_excel_types.xlsx'))
  sheet = parsed_file[1][0]

  # (index, column id, expected type, expected values) for every column we verify.
  expected_columns = [
    (0, "int1", "Int", [-1234123, '', '']),
    (1, "int2", "Int", [5, '', '']),
    (2, "textint", "Text", ["12345678902345689", '', '']),
    (3, "bigint", "Text", ["320150170634561830", '', '']),
    (4, "num2", "Numeric", [123456789.123456, '', '']),
    (5, "bignum", "Numeric", [math.exp(200), '', '']),
    (6, "date1", "DateTime",
     [calendar.timegm(datetime.datetime(2015, 12, 22, 11, 59, 00).timetuple()), None, None]),
    (7, "date2", "Date",
     [calendar.timegm(datetime.datetime(2015, 12, 20, 0, 0, 0).timetuple()), None, None]),
    (8, "datetext", "Date",
     [calendar.timegm(datetime.date(2015, 12, 22).timetuple()), None, None]),
  ]
  for index, col_id, col_type, values in expected_columns:
    self._check_col(sheet, index, col_id, col_type, values)

  # TODO: all dates have different format
  # self._check_col(sheet, 9, "datetimetext", "DateTime",
  #                 [calendar.timegm(datetime.datetime(2015, 12, 22, 0, 0, 0).timetuple()),
  #                  calendar.timegm(datetime.datetime(2015, 12, 22, 13, 15, 0).timetuple()),
  #                  calendar.timegm(datetime.datetime(2018, 02, 27, 16, 8, 39).timetuple())])
||||
def test_excel_type_detection(self):
  """Exercise type detection against a sheet whose rows try to throw it off.

  This test goes over the second sheet of the fixture doc, which has multiple rows
  designed to confuse the type-guessing logic (mixed dates/text, floats among ints,
  bool-like ints, etc.).
  """
  parsed_file = import_xls.parse_file(*_get_fixture('test_excel_types.xlsx'))
  sheet = parsed_file[1][1]

  # (index, column id, expected type, expected values) for every column we verify.
  column_specs = [
    (0, "date_with_other", "DateTime",
     [1467676800.0, 1451606400.0, 1451692800.0, 1454544000.0, 1199577600.0,
      1467732614.0, u'n/a', 1207958400.0, 1451865600.0, 1451952000.0,
      None, 1452038400.0, 1451549340.0, 1483214940.0, None,
      1454544000.0, 1199577600.0, 1451692800.0, 1451549340.0, 1483214940.0]),
    (1, "float_not_int", "Numeric",
     [1, 2, 3, 4, 5, "", 6, 7, 8, 9, 10, 10.25, 11, 12, 13, 14, 15, 16, 17, 18]),
    (2, "int_not_bool", "Int",
     [0, 0, 1, 0, 1, 0, 0, 1, 0, 2, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0]),
    (3, "float_not_bool", "Numeric",
     [0, 0, 1, 0, 1, 0, 0, 1, 0, 0.5, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0]),
    (4, "text_as_bool", "Bool",
     [0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0]),
    (5, "int_as_bool", "Bool",
     [0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0]),
    (6, "float_not_date", "Numeric",
     [4.0, 6.0, 4.0, 4.0, 6.0, 4.0, '--', 6.0, 4.0, 4.0, 4.0, 4.0, 4.0, 6.0, 6.0,
      4.0, 6.0, '3-4', 4.0, 6.5]),
    (7, "float_not_text", "Numeric",
     [-10.25, -8.00, -5.75, -3.50, "n/a", 1.00, " ??? ", 5.50, "", "-",
      12.25, 0.00, "", 0.00, "--", 23.50, "NA", 28.00, 30.25, 32.50]),
    (8, "dollar_amts", "Numeric",
     [0.00, 0.75, 1.50, '', 3.00, 0.00, 0.75, 1.50, '--', 3.00, 1234.56, 1000,
      1001.50, '-', 3000000.000, 0000.00, 1234.56, 1000, 1001.50, 1000.01]),
  ]
  for index, col_id, col_type, values in column_specs:
    self._check_col(sheet, index, col_id, col_type, values)
||||
def test_excel_single_merged_cell(self):
  """Parse a sheet containing a single merged cell without raising.

  An older version of xlrd had a bug where a single cell marked as 'merged'
  would cause an exception; this guards against a regression.
  """
  parsed_file = import_xls.parse_file(*_get_fixture('test_single_merged_cell.xlsx'))
  tables = parsed_file[1]
  expected_table = {
    'table_name': u'Transaction Report',
    'column_metadata': [
      {'type': 'Text', 'id': u''},
      {'type': 'Numeric', 'id': u'Start'},
      {'type': 'Numeric', 'id': u''},
      {'type': 'Numeric', 'id': u''},
      {'type': 'Text', 'id': u'Seek no easy ways'},
    ],
    'table_data': [
      [u'SINGLE MERGED', u'The End'],
      [1637384.52, u''],
      [2444344.06, u''],
      [2444344.06, u''],
      [u'', u''],
    ],
  }
  self.assertEqual(tables, [expected_table])
||||
def test_excel_strange_dates(self):
  """Check that unusual dates and times (e.g. 0 or 38:00:00) don't make parsing fail.

  TODO fails with xlrd.xldate.XLDateAmbiguous: 4.180902777777778

  We test non-failure, but the result is not really what we want. E.g. "1:10" and
  "100:20:30" would be best left as text, but here become "01:10:00" (after xlrd parses
  the first as datetime.time), and as 4.18... (after xlrd fails and we resort to the
  numerical value).
  """
  parsed_file = import_xls.parse_file(*_get_fixture('strange_dates.xlsx'))
  tables = parsed_file[1]
  expected_table = {
    'table_name': u'Sheet1',
    'column_metadata': [
      {'id': 'a', 'type': 'Text'},
      {'id': 'b', 'type': 'Date'},
      {'id': 'c', 'type': 'Text'},
      {'id': 'd', 'type': 'Text'},
      {'id': 'e', 'type': 'Numeric'},
      {'id': 'f', 'type': 'Int'},
      {'id': 'g', 'type': 'Date'},
      {'id': 'h', 'type': 'Date'},
      {'id': 'i', 'type': 'Bool'},
    ],
    'table_data': [
      [u'21:14:00'],
      [1568851200.0],
      [u'01:10:00'],
      [u'10:20:30'],
      [4.180902777777778],
      [20],
      [-6106060800.0],
      [205286400.0],
      [False],  # This is not great either, we should be able to distinguish 0 from FALSE.
    ],
  }
  self.assertEqual(tables, [expected_table])
||||
# Allow running this test module directly (outside a test runner).
if __name__ == '__main__':
  unittest.main()
||||
Reference in New Issue
Block a user