Summary:

* Moves essential plugins to grist-core, so that basic imports (e.g. csv) work.
* Adds support for a `GRIST_SANDBOX_FLAVOR` flag that can systematically override how the data engine is run.
  - `GRIST_SANDBOX_FLAVOR=pynbox` is the "classic" nacl-based sandbox.
  - `GRIST_SANDBOX_FLAVOR=docker` runs engines in individual docker containers. It requires an image specified in `sandbox/docker` (alternative images can be named with the `GRIST_SANDBOX` flag - they need to contain python and the engine requirements). It is a simple reference implementation for sandboxing.
  - `GRIST_SANDBOX_FLAVOR=unsandboxed` runs whatever local version of python is specified by the `GRIST_SANDBOX` flag directly, with no sandboxing. The engine requirements must be installed, so an absolute path to a python executable in a virtualenv is easiest to manage.
  - `GRIST_SANDBOX_FLAVOR=gvisor` runs the data engine via gvisor's runsc. Experimental, with the implementation not included in grist-core. Since gvisor runs on Linux only, this flavor supports wrapping the sandboxes in a single shared docker container.
* Tweaks some recent express query parameter code to work in grist-core, which has a slightly different version of express (the smoke test doesn't catch this since in Jenkins core is built within a workspace that has node_modules, and wires get crossed - in a dev environment the problem on master can be seen by doing `buildtools/build_core.sh /tmp/any_path_outside_grist`).

The new sandbox options do not have tests yet, nor do they change the behavior of grist servers today. They are there to clean up and consolidate a collection of patches I've been using that were getting cumbersome, and to make it easier to run experiments.

I haven't looked closely at imports beyond core.

Test Plan: tested manually against regular grist and grist-core, including imports

Reviewers: alexmojaki, dsagal

Reviewed By: alexmojaki

Differential Revision: https://phab.getgrist.com/D2942
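For example (hypothetical invocations - the start command and paths are illustrative, not prescribed by this change):

    # classic nacl-based sandbox
    GRIST_SANDBOX_FLAVOR=pynbox yarn start

    # no sandboxing; python comes from a virtualenv with the engine requirements installed
    GRIST_SANDBOX_FLAVOR=unsandboxed GRIST_SANDBOX=/path/to/venv/bin/python yarn start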
parent cd0c6de53e
commit bb8cb2593d
@ -0,0 +1,11 @@
#!/bin/bash

set -e

if [ ! -e sandbox_venv3 ]; then
  virtualenv -ppython3 sandbox_venv3
fi

. sandbox_venv3/bin/activate

pip install --no-deps -r sandbox/requirements3.txt
@ -0,0 +1,24 @@
name: core
version: 0.0.0
description: Grist core features
components:
  safePython: sandbox/main.py
contributions:
  fileParsers:
    - fileExtensions: ["csv"]
      parseFile:
        component: safePython
        name: csv_parser
    - fileExtensions: ["xls", "xlsx", "tsv", "txt", "xlsm"]
      parseFile:
        component: safePython
        name: xls_parser
    - fileExtensions: ["json"]
      parseFile:
        component: safePython
        name: json_parser

scripts:
  build:
  # Note that ${XUNIT:+xxx} inserts "xxx" when XUNIT is set, and nothing otherwise.
  test: $GRIST_PYTHON -m runtests discover -v -s /sandbox ${XUNIT:+--xunit}
@ -0,0 +1 @@
__path__ = __import__('pkgutil').extend_path(__path__, __name__)
@ -0,0 +1,184 @@
from __future__ import absolute_import

import functools
from collections import namedtuple
from threading import RLock

_CacheInfo = namedtuple("CacheInfo", ["hits", "misses", "maxsize", "currsize"])


@functools.wraps(functools.update_wrapper)
def update_wrapper(wrapper,
                   wrapped,
                   assigned = functools.WRAPPER_ASSIGNMENTS,
                   updated = functools.WRAPPER_UPDATES):
    """
    Patch two bugs in functools.update_wrapper.
    """
    # workaround for http://bugs.python.org/issue3445
    assigned = tuple(attr for attr in assigned if hasattr(wrapped, attr))
    wrapper = functools.update_wrapper(wrapper, wrapped, assigned, updated)
    # workaround for https://bugs.python.org/issue17482
    wrapper.__wrapped__ = wrapped
    return wrapper


class _HashedSeq(list):
    __slots__ = 'hashvalue'

    def __init__(self, tup, hash=hash):
        self[:] = tup
        self.hashvalue = hash(tup)

    def __hash__(self):
        return self.hashvalue


def _make_key(args, kwds, typed,
              kwd_mark=(object(),),
              fasttypes=set([int, str, frozenset, type(None)]),
              sorted=sorted, tuple=tuple, type=type, len=len):
    'Make a cache key from optionally typed positional and keyword arguments'
    key = args
    if kwds:
        sorted_items = sorted(kwds.items())
        key += kwd_mark
        for item in sorted_items:
            key += item
    if typed:
        key += tuple(type(v) for v in args)
        if kwds:
            key += tuple(type(v) for k, v in sorted_items)
    elif len(key) == 1 and type(key[0]) in fasttypes:
        return key[0]
    return _HashedSeq(key)


def lru_cache(maxsize=100, typed=False):
    """Least-recently-used cache decorator.

    If *maxsize* is set to None, the LRU features are disabled and the cache
    can grow without bound.

    If *typed* is True, arguments of different types will be cached separately.
    For example, f(3.0) and f(3) will be treated as distinct calls with
    distinct results.

    Arguments to the cached function must be hashable.

    View the cache statistics named tuple (hits, misses, maxsize, currsize) with
    f.cache_info(). Clear the cache and statistics with f.cache_clear().
    Access the underlying function with f.__wrapped__.

    See: http://en.wikipedia.org/wiki/Cache_algorithms#Least_Recently_Used

    """

    # Users should only access the lru_cache through its public API:
    # cache_info, cache_clear, and f.__wrapped__
    # The internals of the lru_cache are encapsulated for thread safety and
    # to allow the implementation to change (including a possible C version).

    def decorating_function(user_function):

        cache = dict()
        stats = [0, 0]                  # make statistics updateable non-locally
        HITS, MISSES = 0, 1             # names for the stats fields
        make_key = _make_key
        cache_get = cache.get           # bound method to lookup key or return None
        _len = len                      # localize the global len() function
        lock = RLock()                  # because linkedlist updates aren't threadsafe
        root = []                       # root of the circular doubly linked list
        root[:] = [root, root, None, None]      # initialize by pointing to self
        nonlocal_root = [root]          # make updateable non-locally
        PREV, NEXT, KEY, RESULT = 0, 1, 2, 3    # names for the link fields

        if maxsize == 0:

            def wrapper(*args, **kwds):
                # no caching, just do a statistics update after a successful call
                result = user_function(*args, **kwds)
                stats[MISSES] += 1
                return result

        elif maxsize is None:

            def wrapper(*args, **kwds):
                # simple caching without ordering or size limit
                key = make_key(args, kwds, typed)
                result = cache_get(key, root)   # root used here as a unique not-found sentinel
                if result is not root:
                    stats[HITS] += 1
                    return result
                result = user_function(*args, **kwds)
                cache[key] = result
                stats[MISSES] += 1
                return result

        else:

            def wrapper(*args, **kwds):
                # size limited caching that tracks accesses by recency
                key = make_key(args, kwds, typed) if kwds or typed else args
                with lock:
                    link = cache_get(key)
                    if link is not None:
                        # record recent use of the key by moving it to the front of the list
                        root, = nonlocal_root
                        link_prev, link_next, key, result = link
                        link_prev[NEXT] = link_next
                        link_next[PREV] = link_prev
                        last = root[PREV]
                        last[NEXT] = root[PREV] = link
                        link[PREV] = last
                        link[NEXT] = root
                        stats[HITS] += 1
                        return result
                result = user_function(*args, **kwds)
                with lock:
                    root, = nonlocal_root
                    if key in cache:
                        # getting here means that this same key was added to the
                        # cache while the lock was released. since the link
                        # update is already done, we need only return the
                        # computed result and update the count of misses.
                        pass
                    elif _len(cache) >= maxsize:
                        # use the old root to store the new key and result
                        oldroot = root
                        oldroot[KEY] = key
                        oldroot[RESULT] = result
                        # empty the oldest link and make it the new root
                        root = nonlocal_root[0] = oldroot[NEXT]
                        oldkey = root[KEY]
                        root[KEY] = root[RESULT] = None
                        # now update the cache dictionary for the new links
                        del cache[oldkey]
                        cache[key] = oldroot
                    else:
                        # put result in a new link at the front of the list
                        last = root[PREV]
                        link = [last, root, key, result]
                        last[NEXT] = root[PREV] = cache[key] = link
                    stats[MISSES] += 1
                return result

        def cache_info():
            """Report cache statistics"""
            with lock:
                return _CacheInfo(stats[HITS], stats[MISSES], maxsize, len(cache))

        def cache_clear():
            """Clear the cache and cache statistics"""
            with lock:
                cache.clear()
                root = nonlocal_root[0]
                root[:] = [root, root, None, None]
                stats[:] = [0, 0]

        wrapper.__wrapped__ = user_function
        wrapper.cache_info = cache_info
        wrapper.cache_clear = cache_clear
        return update_wrapper(wrapper, user_function)

    return decorating_function
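# Example usage (an illustrative sketch, not part of the vendored module):
#
#   from backports.functools_lru_cache import lru_cache
#
#   @lru_cache(maxsize=32)
#   def fib(n):
#       return n if n < 2 else fib(n - 1) + fib(n - 2)
#
#   fib(10)            # -> 55; each distinct n is computed only once
#   fib.cache_info()   # -> CacheInfo(hits=8, misses=11, maxsize=32, currsize=11)
#   fib.cache_clear()  # resets both the cache and the statistics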
@ -0,0 +1,479 @@
"""This module guesses possible formats of dates which can be parsed using datetime.strptime
based on samples.

dateguesser.guess(sample)
dateguesser.guess takes a sample date string and returns a set of
datetime.strftime/strptime-compliant date format strings that will correctly parse.

dateguesser.guess_bulk(list_of_samples, error_rate=0)
dateguesser.guess_bulk takes a list of sample date strings and an acceptable error rate,
and returns a list of datetime.strftime/strptime-compliant date format strings,
sorted by error rate, that will correctly parse.

Algorithm:

1. Tokenize the input string into chunks based on character type: digits, alphas, the rest.
2. Analyze each token independently in terms of what format codes it could represent.
3. For the given list of tokens, generate all permutations of format codes.
4. While generating permutations, check the validity of each generated format and skip it
   if invalid.
5. Use the rules listed below to decide if a format is invalid:

Invalid format checks:

Rule #1: Year MUST be in the date. Year is the minimum possible parsable date.
Rule #2: No holes (missing parts) in the format parts.
Rule #3: Time parts are neighbors to each other. No interleaving time with the date.
Rule #4: It's highly unlikely that minutes come before hours, millis before seconds, etc.
Rule #5: A pattern can't have some part of date/time defined more than once.
Rule #6: Separators between elements of the time group should be the same.
Rule #7: If am/pm is in the date, we assume that only 12-hour hours are allowed;
         otherwise it's 24-hour.
Rule #8: Year can't be between other date elements.

Note:
dateguess doesn't support defaulting to the current year because parsing should be
deterministic; it's better to fail guessing the format than to guess it incorrectly.

Examples:
  >>> guess('2014/05/05 14:00:00 UTC')
  set(['%Y/%d/%m %H:%M:%S %Z', '%Y/%m/%d %H:%M:%S %Z'])
  >>> guess('12/12/12')
  set(['%y/%m/%d', '%d/%m/%y', '%m/%d/%y', '%y/%d/%m'])
  >>> guess_bulk(['12-11-2014', '12-25-2014'])
  ['%m-%d-%Y']
  >>> guess_bulk(['12-11-2014', '25-25-2014'])
  []
  >>> guess_bulk(['12-11-2013', '13-8-2013', '05-25-2013', '12-25-2013'], error_rate=0.5)
  ['%m-%d-%Y']
"""


import calendar
import itertools
import logging
import re
from collections import defaultdict

from backports.functools_lru_cache import lru_cache
import moment


MONTH_NAME = calendar.month_name
MONTH_ABBR = calendar.month_abbr
TZ_VALID_NAMES = {z[0] for z in moment.get_tz_data().items()}
AM_PM = {'am', 'pm'}
DAYS_OF_WEEK_NAME = calendar.day_name
DAYS_OF_WEEK_ABBR = calendar.day_abbr

DATE_ELEMENTS = [
  # Name               Pattern   Predicate   Group (mutually exclusive)   Consumes N prev elements
  ("Year", "%Y", lambda x, p, v: x.isdigit() and len(x) == 4, "Y", 0),
  ("Year short", "%y", lambda x, p, v: x.isdigit() and len(x) == 2, "Y", 0),
  ("Month", "%m", lambda x, p, v: x.isdigit() and len(x) <= 2 and 0 < int(x) <= 12, "m", 0),
  ("Month name full", "%B", lambda x, p, v: x.isalpha() and x.capitalize() in MONTH_NAME, "m", 0),
  ("Month name abbr", "%b", lambda x, p, v: x.isalpha() and x.capitalize() in MONTH_ABBR, "m", 0),
  ("Day", "%d", lambda x, p, v: x.isdigit() and len(x) <= 2 and 0 < int(x) <= 31, "d", 0),
  ("Day of week", "%A", lambda x, p, v: x.isalpha()
                                        and x.capitalize() in DAYS_OF_WEEK_NAME, "a", 0),
  ("Day of week abbr", "%a", lambda x, p, v: x.isalpha()
                                             and x.capitalize() in DAYS_OF_WEEK_ABBR, "a", 0),

  ("Compound HHMMSS", "%H%M%S", lambda x, p, v: x.isdigit() and len(x) == 6
                                                and 0 <= int(x[0:2]) < 24
                                                and 0 <= int(x[2:4]) < 60
                                                and 0 <= int(x[4:6]) < 60, "HMS", 0),

  ("Hour", "%H", lambda x, p, v: x.isdigit() and len(x) <= 2 and 0 <= int(x) <= 23, "H", 0),
  ("Hour in 12hr mode", "%I", lambda x, p, v: x.isdigit() and len(x) <= 2
                                              and 0 <= int(x) <= 11, "H", 0),
  ("AM/PM", "%p", lambda x, p, v: x.isalpha() and len(x) == 2 and x.lower() in AM_PM, "p", 0),
  ("Minutes", "%M", lambda x, p, v: x.isdigit() and len(x) <= 2 and 0 <= int(x) <= 59, "M", 0),
  ("Seconds", "%S", lambda x, p, v: x.isdigit() and len(x) <= 2 and 0 <= int(x) <= 59, "S", 0),
  ("Fraction of second", "%f", lambda x, p, v: x.isdigit() and p is not None
                                               and p.val == '.', "f", 0),
  ("Timezone name", "%Z", lambda x, p, v: x.isalpha() and len(x) > 2
                                          and x in TZ_VALID_NAMES, "Z", 0),
  ("Timezone +HHMM", "%z", lambda x, p, v: x.isdigit() and len(x) == 4 and 0 <= int(x[0:2]) < 15
                                           and 0 <= int(x[2:4]) < 60 and p is not None
                                           and p.val == '+', "Z", 1),
  ("Timezone -HHMM", "%z", lambda x, p, v: x.isdigit() and len(x) == 4 and 0 <= int(x[0:2]) < 15
                                           and 0 <= int(x[2:4]) < 60 and p is not None
                                           and p.val == '-', "Z", 1),
]


class Token(object):
  """Represents a part of a date string that's being parsed.
  Note that __hash__ and __eq__ are overridden in order
  to compare only meaningful parts of an object.
  """
  def __init__(self, val, length):
    self.val = val
    self.length = length
    self.compatible_types = ()

  def __hash__(self):
    h = hash(self.length) + hash(self.compatible_types)
    if not self.compatible_types:
      h += hash(self.val)
    return hash(h)

  def __eq__(self, other):
    """
    Two tokens are equal when both of these are true:
    a) their lengths and compatible types are equal
    b) if a token is a separator (no compatible types), the separator values must be equal
    """
    if self.length != other.length or self.compatible_types != other.compatible_types:
      return False
    if not other.compatible_types and self.val != other.val:
      return False
    return True


def _check_rule_1(pattern, types_used):
  """Rule #1: Year MUST be in the date. Year is the minimum possible parsable date.

  Examples:
    >>> _check_rule_1('%Y/%m/%d', 'Ymd')
    True
    >>> _check_rule_1('%m/%d', 'md')
    False
  """
  if 'Y' not in types_used:
    logging.debug("Rule #1 is violated for pattern %s. Types used: %s", pattern, types_used)
    return False
  return True


def _check_rule_2(pattern, types_used):
  """Rule #2: No holes (missing parts) in the format parts.

  Examples:
    >>> _check_rule_2('%Y:%H', 'YH')
    False
    >>> _check_rule_2('%Y/%m/%d %H', 'YmdH')
    True
  """
  priorities = 'YmdHMSf'
  seen_parts = [p in types_used for p in priorities]
  if sorted(seen_parts, reverse=True) != seen_parts:
    logging.debug("Rule #2 is violated for pattern %s. Types used: %s", pattern, types_used)
    return False
  return True


def _check_rule_3(pattern, types_used):
  """Rule #3: Time parts are neighbors to time only. No interleaving time with the date.

  Examples:
    >>> _check_rule_3('%m/%d %H:%M %Y', 'mdHMY')
    True
    >>> _check_rule_3('%m/%d %H:%Y:%M', 'mdHYM')
    False
  """
  time_parts = 'HMSf'
  time_parts_highlighted = [t in time_parts for t in types_used]
  time_parts_deduplicated = [a[0] for a in itertools.groupby(time_parts_highlighted)]
  if len(list(filter(lambda x: x, time_parts_deduplicated))) > 1:
    logging.debug("Rule #3 is violated for pattern %s. Types used: %s", pattern, types_used)
    return False
  return True


def _check_rule_4(pattern, types_used):
  """Rule #4: It's highly unlikely that minutes come before hours,
  millis before seconds, etc.

  Examples:
    >>> _check_rule_4('%H:%M', 'HM')
    True
    >>> _check_rule_4('%S:%M', 'SM')
    False
  """
  time_parts_priority = 'HMSf'
  time_parts_indexes = list(filter(lambda x: x >= 0,
                                   [time_parts_priority.find(t) for t in types_used]))
  if sorted(time_parts_indexes) != time_parts_indexes:
    logging.debug("Rule #4 is violated for pattern %s. Types used: %s", pattern, types_used)
    return False
  return True


def _check_rule_5(pattern, types_used):
  """Rule #5: A pattern can't have some part of date/time defined more than once.

  Examples:
    >>> _check_rule_5('%Y/%Y', 'YY')
    False
    >>> _check_rule_5('%m/%b', 'mm')
    False
    >>> _check_rule_5('%Y/%m', 'Ym')
    True
  """
  if len(types_used) != len(set(types_used)):
    logging.debug("Rule #5 is violated for pattern %s. Types used: %s", pattern, types_used)
    return False
  return True


def _check_rule_6(tokens_chosen, pattern, types_used):
  """Rule #6: Separators between elements of the time group should be the same.

  Examples:
    _check_rule_6(tokens_chosen_1, '%Y-%m-%dT%H:%M:%S', 'YmdHMS') => True
    _check_rule_6(tokens_chosen_2, '%Y-%m-%dT%H %M %S', 'YmdHMS') => True
    _check_rule_6(tokens_chosen_3, '%Y-%m-%dT%H-%M:%S', 'YmdHMS') => False (different separators
                                                                    ('-' and ':') in time group)
  """
  time_parts = 'HMS'
  num_of_time_parts_used = len(list(filter(lambda x: x in time_parts, types_used)))
  time_parts_seen = 0
  separators_seen = []
  previous_was_a_separator = False

  for token in tokens_chosen:
    if token[1] is not None and token[1][3] in time_parts:
      # This rule doesn't work for the separator-less time group, so when the type we found
      # is three letters long (see type "Compound HHMMSS"), stop iterating.
      if len(token[1][3]) == 3:
        break
      # If this is not the first time part and no separator preceded it, record that.
      if time_parts_seen > 0 and not previous_was_a_separator:
        separators_seen.append(None)
      time_parts_seen += 1
      if time_parts_seen == num_of_time_parts_used:
        break
      previous_was_a_separator = False
    else:
      if time_parts_seen > 0:
        separators_seen.append(token[0].val)
      previous_was_a_separator = True

  if len(set(separators_seen)) > 1:
    logging.debug("Rule #6 is violated for pattern %s. Seen separators: %s",
                  pattern, separators_seen)
    return False
  return True


def _check_rule_7a(pattern):
  """Rule #7a: If am/pm is in the date, we assume that only 12-hour hours are allowed.
  Otherwise it's 24-hour.

  Examples:
    >>> _check_rule_7a('%Y/%m/%d %H:%M %p')
    False
    >>> _check_rule_7a('%Y/%m/%d %I:%M %p')
    True
  """
  if '%p' in pattern and '%H' in pattern:
    logging.debug("Rule #7a is violated for pattern %s", pattern)
    return False
  return True


def _check_rule_7b(pattern):
  """Rule #7b: If am/pm is in the date, we assume that only 12-hour hours are allowed.
  Otherwise it's 24-hour.

  Examples:
    >>> _check_rule_7b('%Y/%m/%d %I:%M')
    False
    >>> _check_rule_7b('%Y/%m/%d %I:%M %p')
    True
  """
  if '%I' in pattern and '%p' not in pattern:
    logging.debug("Rule #7b is violated for pattern %s", pattern)
    return False
  return True


def _check_rule_8(pattern, types_used):
  """Rule #8: Year can't be between other date elements.

  Examples:
    >>> _check_rule_8('%m/%Y/%d %I:%M', 'mYdIM')
    False
  """
  if 'mYd' in types_used or 'dYm' in types_used:
    logging.debug("Rule #8 is violated for pattern %s", pattern)
    return False
  return True


def _tokenize_by_character_class(s):
  """Return a list of strings by splitting s (tokenizing) by character class.

  Examples:
    >>> t = _tokenize_by_character_class('Thu, May 14th, 2014 1:15 pm +0000')
    >>> [i.val for i in t]
    ['Thu', ',', ' ', 'May', ' ', '14', 'th', ',', ' ', '2014', ' ', '1', ':', '15', ' ', 'pm', ' ', '+', '0000']

    >>> t = _tokenize_by_character_class('5/14/2014')
    >>> [i.val for i in t]
    ['5', '/', '14', '/', '2014']
  """
  res = re.split(r'(\d+)|(\W)|(_)', s)
  return [Token(i, len(i)) for i in res if i]


def _sliding_triplets(tokens):
  for idx, t in enumerate(tokens):
    yield (t, tokens[idx-1] if idx > 0 else None, tokens[idx+1] if idx < len(tokens)-1 else None)


def _analyze_tokens(tokens):
  """Analyze each token and find out compatible types for it."""
  for token, prev, nxt in _sliding_triplets(tokens):
    token.compatible_types = tuple([t for t in DATE_ELEMENTS if t[2](token.val, prev, nxt)])


@lru_cache()
def _generate_all_permutations(tokens):
  """Generate all permutations of format codes for the given list of tokens.

  Brute-forcing all possible permutations and checking the rules eats most of the time of date
  parsing. But since the input is expected to be highly uniform, memoization of this step
  is very effective.

  A Token holds the value of a date part, but thanks to the overridden __eq__ and __hash__
  methods, two tokens with the same length and the same possible formats are treated as equal
  (for separator tokens, the values themselves must also be equal).
  """
  all_patterns = set()
  _generate_all_permutations_recursive(tokens, 0, [], "", all_patterns, "")

  return all_patterns


def _check_is_pattern_valid_quick_fail_rules(pattern, types_used):
  """Apply rules which are applicable to partially constructed patterns.

  Example: duplicates of a date part in a pattern.
  """
  return _check_rule_5(pattern, types_used) \
         and _check_rule_4(pattern, types_used) \
         and _check_rule_7a(pattern)


def _check_is_pattern_valid_full_pattern_rules(tokens_chosen, pattern, types_used):
  """Apply rules which are applicable to the full pattern only.

  Example: existence of the Year part in the pattern.
  """
  return _check_rule_1(pattern, types_used) \
         and _check_rule_2(pattern, types_used) \
         and _check_rule_3(pattern, types_used) \
         and _check_rule_6(tokens_chosen, pattern, types_used) \
         and _check_rule_7b(pattern) \
         and _check_rule_8(pattern, types_used)


def _generate_all_permutations_recursive(tokens, token_idx, tokens_chosen, pattern, found_patterns,
                                         types_used):
  """Generate all format-element permutations recursively.

  Args:
    tokens (list[Token]): List of tokens.
    token_idx (int): Index of the token processed this cycle.
    tokens_chosen (list[(Token, Token.compatible_type)]): List of tuples
      containing a token and a compatible type
    pattern (str): String containing the format for parsing
    found_patterns (set): Set of guessed patterns
    types_used (str): String of types used to build the pattern.

  Returns:
    list: List of permutations
  """
  if not _check_is_pattern_valid_quick_fail_rules(pattern, types_used):
    return

  if token_idx < len(tokens):
    t = tokens[token_idx]
    if t.compatible_types:
      for ct in t.compatible_types:
        _generate_all_permutations_recursive(tokens, token_idx+1, tokens_chosen[:] + [(t, ct)],
                                             (pattern if ct[4] == 0 else pattern[:-ct[4]]) + ct[1],
                                             found_patterns, types_used + ct[3])
    else:
      # If there are no compatible types, it must be a separator; add it to the pattern.
      _generate_all_permutations_recursive(tokens, token_idx+1,
                                           tokens_chosen[:] + [(t, None)], pattern + t.val,
                                           found_patterns, types_used)
  else:
    if _check_is_pattern_valid_full_pattern_rules(tokens_chosen, pattern, types_used):
      found_patterns.add(pattern)


def guess(date):
  """Guesses datetime.strftime/strptime-compliant date formats for a date string.

  Args:
    date (str): Date string.

  Returns:
    set: Set of datetime.strftime/strptime-compliant date format strings

  Examples:
    >>> guess('2014/05/05 14:00:00 UTC')
    set(['%Y/%d/%m %H:%M:%S %Z', '%Y/%m/%d %H:%M:%S %Z'])
    >>> guess('12/12/12')
    set(['%y/%m/%d', '%d/%m/%y', '%m/%d/%y', '%y/%d/%m'])
  """
  tokens = _tokenize_by_character_class(date)
  _analyze_tokens(tokens)
  return _generate_all_permutations(tuple(tokens))


def guess_bulk(dates, error_rate=0):
  """Guesses datetime.strftime/strptime-compliant date formats for a list of samples.

  Args:
    dates (list): List of sample date strings.
    error_rate (float): Acceptable error rate (default 0.0)

  Returns:
    list: List of datetime.strftime/strptime-compliant date format strings sorted by error rate

  Examples:
    >>> guess_bulk(['12-11-2014', '12-25-2014'])
    ['%m-%d-%Y']
    >>> guess_bulk(['12-11-2014', '25-25-2014'])
    []
    >>> guess_bulk(['12-11-2013', '13-8-2013', '05-25-2013', '12-25-2013'], error_rate=0.5)
    ['%m-%d-%Y']
  """
  if error_rate == 0.0:
    patterns = None
    for date in dates:
      guesses_patterns = guess(date)
      if patterns is None:
        patterns = guesses_patterns
      else:
        patterns = patterns.intersection(guesses_patterns)
      if not patterns:
        break  # No need to iterate more if zero patterns found
    return list(patterns)
  else:
    found_dates = 0
    pattern_counters = defaultdict(lambda: 0)
    num_dates = len(dates)
    min_num_dates_to_be_found = num_dates - num_dates * error_rate
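    # E.g. for 10 samples with error_rate=0.2, a pattern must successfully parse at least
    # 10 - 10*0.2 = 8 of them to be returned.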

    for idx, date in enumerate(dates):
      patterns = guess(date)
      if patterns:
        found_dates += 1
        for pattern in patterns:
          pattern_counters[pattern] = pattern_counters[pattern] + 1

      # Early return if the number of strings that can't be dates is already over the error rate.
      cells_left = num_dates - idx - 1
      cannot_be_found = float(found_dates + cells_left) < min_num_dates_to_be_found
      if cannot_be_found:
        return []

    patterns = [(v, k) for k, v in pattern_counters.items()
                if v > min_num_dates_to_be_found]
    patterns.sort(reverse=True)
    return [k for (v, k) in patterns]
@ -0,0 +1,197 @@
"""
Plugin for importing CSV files
"""
import os
import logging

import chardet
import messytables
import six
from six.moves import zip

import parse_data
import import_utils


log = logging.getLogger(__name__)

SCHEMA = [
  {
    'name': 'lineterminator',
    'label': 'Line terminator',
    'type': 'string',
    'visible': True,
  },
  {
    'name': 'include_col_names_as_headers',
    'label': 'First row contains headers',
    'type': 'boolean',
    'visible': True,
  },
  {
    'name': 'delimiter',
    'label': 'Field separator',
    'type': 'string',
    'visible': True,
  },
  {
    'name': 'skipinitialspace',
    'label': 'Skip leading whitespace',
    'type': 'boolean',
    'visible': True,
  },
  {
    'name': 'quotechar',
    'label': 'Quote character',
    'type': 'string',
    'visible': True,
  },
  {
    'name': 'doublequote',
    'label': 'Quotes in fields are doubled',
    'type': 'boolean',
    'visible': True,
  },

  {
    'name': 'quoting',
    'label': 'Convert quoted fields',
    'type': 'number',
    'visible': False,   # Not supported by messytables
  },
  {
    'name': 'escapechar',
    'label': 'Escape character',
    'type': 'string',
    'visible': False,   # Not supported by messytables
  },
  {
    'name': 'start_with_row',
    'label': 'Start with row',
    'type': 'number',
    'visible': False,   # Not yet implemented
  },
  {
    'name': 'NUM_ROWS',
    'label': 'Number of rows',
    'type': 'number',
    'visible': False,
  }]

def parse_file_source(file_source, options):
  parsing_options, export_list = parse_file(import_utils.get_path(file_source["path"]), options)
  return {"parseOptions": parsing_options, "tables": export_list}

def parse_file(file_path, parse_options=None):
  """
  Reads a file path and parse options that are passed in using ActiveDoc.importFile()
  and returns a tuple with parsing options (the user's or guessed) and an object formatted so
  that it can be used by grist for a bulk add records action.
  """
  parse_options = parse_options or {}

  with open(file_path, "rb") as f:
    parsing_options, export_list = _parse_open_file(f, parse_options=parse_options)
    return parsing_options, export_list


def _parse_open_file(file_obj, parse_options=None):
  options = {}
  csv_keys = ['delimiter', 'quotechar', 'lineterminator', 'doublequote', 'skipinitialspace']
  csv_options = {k: parse_options.get(k) for k in csv_keys}
  if six.PY2:
    csv_options = {k: v.encode('utf8') if isinstance(v, six.text_type) else v
                   for k, v in csv_options.items()}

  table_set = messytables.CSVTableSet(file_obj,
                                      delimiter=csv_options['delimiter'],
                                      quotechar=csv_options['quotechar'],
                                      lineterminator=csv_options['lineterminator'],
                                      doublequote=csv_options['doublequote'],
                                      skipinitialspace=csv_options['skipinitialspace'])

  num_rows = parse_options.get('NUM_ROWS', 0)

  # Messytables' encoding detection uses too small a sample, so we override it here.
  sample = file_obj.read(100000)
  table_set.encoding = chardet.detect(sample)['encoding']
  # In addition, always prefer UTF8 over ASCII.
  if table_set.encoding == 'ascii':
    table_set.encoding = 'utf8'

  export_list = []
  # A table set is a collection of tables:
  for row_set in table_set.tables:
    table_name = None
    sample_rows = list(row_set.sample)
    # Messytables doesn't guess whether headers are present, so we need to step in.
    data_offset, headers = import_utils.headers_guess(sample_rows)

    # Make sure all header values are strings.
    for i, header in enumerate(headers):
      if not isinstance(header, six.string_types):
        headers[i] = six.text_type(header)

    log.info("Guessed data_offset as %s", data_offset)
    log.info("Guessed headers as: %s", headers)

    have_guessed_headers = any(headers)
    include_col_names_as_headers = parse_options.get('include_col_names_as_headers',
                                                     have_guessed_headers)

    if include_col_names_as_headers and not have_guessed_headers:
      # use first line as headers
      data_offset, first_row = import_utils.find_first_non_empty_row(sample_rows)
      headers = import_utils.expand_headers(first_row, data_offset, sample_rows)

    elif not include_col_names_as_headers and have_guessed_headers:
      # move guessed headers to data
      data_offset -= 1
      headers = [''] * len(headers)

    row_set.register_processor(messytables.offset_processor(data_offset))

    table_data_with_types = parse_data.get_table_data(row_set, len(headers), num_rows)

    # Identify and remove empty columns, and populate separate metadata and data lists.
    column_metadata = []
    table_data = []
    for col_data, header in zip(table_data_with_types, headers):
      if not header and all(val == "" for val in col_data["data"]):
        continue   # empty column
      data = col_data.pop("data")
      col_data["id"] = header
      column_metadata.append(col_data)
      table_data.append(data)

    if not table_data:
      # Don't add tables with no columns.
      continue

    guessed = row_set._dialect
    quoting = parse_options.get('quoting')
    options = {"delimiter": parse_options.get('delimiter', guessed.delimiter),
               "doublequote": parse_options.get('doublequote', guessed.doublequote),
               "lineterminator": parse_options.get('lineterminator', guessed.lineterminator),
               "quotechar": parse_options.get('quotechar', guessed.quotechar),
               "skipinitialspace": parse_options.get('skipinitialspace', guessed.skipinitialspace),
               "include_col_names_as_headers": include_col_names_as_headers,
               "start_with_row": 1,
               "NUM_ROWS": num_rows,
               "SCHEMA": SCHEMA
              }

    log.info("Output table %r with %d columns", table_name, len(column_metadata))
    for c in column_metadata:
      log.debug("Output column %s", c)
    export_list.append({
      "table_name": table_name,
      "column_metadata": column_metadata,
      "table_data": table_data
    })

  return options, export_list

def get_version():
  """ Return name and version of plug-in"""
  pass
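# Example (an illustrative sketch; assumes a CSV file is present at the given path):
#
#   parse_options, tables = parse_file('/importdir/example.csv')
#   tables[0]['table_name']                          # -> None (csv tables are named by the client)
#   [c['id'] for c in tables[0]['column_metadata']]  # -> the guessed headers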
@ -0,0 +1,257 @@
"""
The import_json module converts a json file into a list of grist tables.

It supports data being structured as a list of records, turning each
object into a row and each object's key into a column. For
example:
```
[{'a': 1, 'b': 'tree'}, {'a': 4, 'b': 'flowers'}, ... ]
```
is turned into a table with two columns 'a' of type 'Int' and 'b' of
type 'Text'.

Nested objects are stored as references to a distinct table where the
nested object is stored. For example:
```
[{'a': {'b': 4}}, ...]
```
is turned into a column 'a' of type 'Ref:my_import_name.a', and into
another table 'my_import_name.a' with a column 'b' of type
'Int'. (Nested-nested objects are supported as well, and the module
assumes no limit to the number of levels of nesting you can do.)

Each value which is not an object will be stored into a column with id
'' (empty string). For example:
```
['apple', 'peach', ... ]
```
is turned into a table with an unnamed column that stores the values.

Arrays are stored as a list of references to a table where the content
of the array is stored. For example:
```
[{'items': [{'a':'apple'}, {'a':'peach'}]}, {'items': [{'a':'cucumber'}, {'a':'carrots'}, ...]}, ...]
```
is turned into a column named 'items' of type
'RefList:my_import_name.items' which points to another table named
'my_import_name.items' which has a column 'a' of type Text.

Data could be structured with an object at the root as well, in which
case the object is considered to represent a single row, and gets
turned into a table with one row.

A column's type is defined by the type of its first value that is not
None (i.e. if another value with a different type is stored in the same
column, the column's type remains unchanged), and is 'Text' otherwise.

Usage:
import import_json
# if you have a file to parse
import_json.parse_file(file_path)

# if data is already encoded with python's standard containers (dict and list)
import_json.dumps(data, import_name)


TODO:
  - references should map to the appropriate column type, ie: `Ref:{$colname}` and
    `RefList:{$colname}` (which depends on T413).
  - Allow the user to set the uniqueValues options per table.
  - The user should be able to choose some objects to be imported as
    indexes: for instance:
```
{
  'pink lady': {'type': 'apple', 'taste': 'juicy'},
  'gala': {'type': 'apple', 'taste': 'tart'},
  'comice': {'type': 'pear', 'taste': 'lemon'},
  ...
}
```
    could be mapped to columns 'type', 'taste' and a 3rd that holds the
    property 'name'.

"""
import os
import json
from collections import OrderedDict, namedtuple
from itertools import count, chain

import six

import import_utils

Ref = namedtuple('Ref', ['table_name', 'rowid'])
Row = namedtuple('Row', ['values', 'parent', 'ref'])
Col = namedtuple('Col', ['type', 'values'])

GRIST_TYPES = {
  float: "Numeric",
  bool: "Bool",
}

for typ in six.integer_types:
  GRIST_TYPES[typ] = "Int"

for typ in six.string_types:
  GRIST_TYPES[typ] = "Text"

SCHEMA = [{
  'name': 'includes',
  'label': 'Includes (list of tables separated by semicolon)',
  'type': 'string',
  'visible': True
}, {
  'name': 'excludes',
  'label': 'Excludes (list of tables separated by semicolon)',
  'type': 'string',
  'visible': True
}]

DEFAULT_PARSE_OPTIONS = {
  'includes': '',
  'excludes': '',
  'SCHEMA': SCHEMA
}

def parse_file(file_source, parse_options):
  "Deserializes `file_source` into a python object and dumps it into jgrist form."
  path = import_utils.get_path(file_source['path'])
  name, ext = os.path.splitext(file_source['origName'])
  if 'SCHEMA' not in parse_options:
    parse_options.update(DEFAULT_PARSE_OPTIONS)
  with open(path, 'r') as json_file:
    data = json.loads(json_file.read())

  return dumps(data, name, parse_options)

def dumps(data, name="", parse_options=DEFAULT_PARSE_OPTIONS):
  "Serializes `data` to a jgrist formatted object."
  tables = Tables(parse_options)
  if not isinstance(data, list):
    # put a simple record into a list
    data = [data]
  for val in data:
    tables.add_row(name, val)
  return {
    'tables': tables.dumps(),
    'parseOptions': parse_options
  }


class Tables(object):
  """
  Tables maintains the list of tables indexed by their name. Each table
  is a list of rows. A row is a dictionary mapping column ids to values.
  """

  def __init__(self, parse_options):
    self._tables = OrderedDict()
    self._includes_opt = list(filter(None, parse_options['includes'].split(';')))
    self._excludes_opt = list(filter(None, parse_options['excludes'].split(';')))


  def dumps(self):
    "Dumps tables in jgrist format."
    return [_dump_table(name, rows) for name, rows in six.iteritems(self._tables)]

  def add_row(self, table, value, parent=None):
    """
    Adds a row to `table` and fills it with the content of value, then
    returns a Ref object pointing to this row. Returns None if the row
    was excluded. Calls itself recursively to add nested objects and
    lists.
    """
    row = None
    if self._is_included(table):
      rows = self._tables.setdefault(table, [])
      row = Row(OrderedDict(), parent, Ref(table, len(rows)+1))
      rows.append(row)

    # we need a dictionary to map values to the row's columns
    value = _dictify(value)
    for (k, val) in sorted(six.iteritems(value)):
      if isinstance(val, dict):
        val = self.add_row(table + '_' + k, val)
        if row and val:
          row.values[k] = val.ref
      elif isinstance(val, list):
        for list_val in val:
          self.add_row(table + '_' + k, list_val, row)
      else:
        if row and self._is_included(table + '_' + k):
          row.values[k] = val
    return row


  def _is_included(self, property_path):
    is_included = (any(property_path.startswith(inc) for inc in self._includes_opt)
                   if self._includes_opt else True)
    is_excluded = (any(property_path.startswith(exc) for exc in self._excludes_opt)
                   if self._excludes_opt else False)
    return is_included and not is_excluded


def first_available_key(dictionary, name):
  """
  Returns the first of (name, name2, name3 ...) that is not a key of
  dictionary.
  """
  names = chain([name], ("{}{}".format(name, i) for i in count(2)))
  return next(n for n in names if n not in dictionary)


def _dictify(value):
  """
  Converts a non-dictionary value to a dictionary with a single
  empty-string key mapping to the given value. Or returns the value
  itself if it's already a dictionary. This is useful to map values to
  a row's columns.
  """
  return value if isinstance(value, dict) else {'': value}


def _dump_table(name, rows):
  "Converts a list of rows into a jgrist table and sets 'table_name' to name."
  columns = _transpose([r.values for r in rows])
  # find ref to first parent
  ref = next((r.parent.ref for r in rows if r.parent), None)
  if ref:
    # adds a column to store the ref to the parent
    col_id = first_available_key(columns, ref.table_name)
    columns[col_id] = Col(_grist_type(ref),
                          [row.parent.ref if row.parent else None for row in rows])
  return {
    'column_metadata': [{'id': key, 'type': col.type} for (key, col) in six.iteritems(columns)],
    'table_data': [[_dump_value(val) for val in col.values] for col in columns.values()],
    'table_name': name
  }

def _transpose(rows):
  """
  Transposes a collection of dictionaries mapping keys to values into a
  dictionary mapping each key to a column of values. Each column is a Col
  tuple holding the grist type of the first value that is not None and the
  collection of values.
  """
  transpose = OrderedDict()
  values = OrderedDict()
  for row in reversed(rows):
    values.update(row)
  for key, val in six.iteritems(values):
    transpose[key] = Col(_grist_type(val), [row.get(key, None) for row in rows])
  return transpose


def _dump_value(value):
  "Serializes a value."
  if isinstance(value, Ref):
    return value.rowid
  return value


def _grist_type(value):
  "Returns the grist type for value."
  val_type = type(value)
  if val_type == Ref:
    return 'Ref:{}'.format(value.table_name)
  return GRIST_TYPES.get(val_type, 'Text')
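# Example (an illustrative sketch of the dumps() output shape):
#
#   dumps([{'a': 1, 'b': 'tree'}], 'fruits')
#   # -> {'tables': [{'table_name': 'fruits',
#   #                 'column_metadata': [{'id': 'a', 'type': 'Int'},
#   #                                     {'id': 'b', 'type': 'Text'}],
#   #                 'table_data': [[1], ['tree']]}],
#   #     'parseOptions': <the parse options passed in>}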
@ -0,0 +1,120 @@
"""
Helper functions for import plugins
"""
import sys
import itertools
import logging
import os

# Include /thirdparty into module search paths, in particular for messytables.
sys.path.append('/thirdparty')

import six
from six.moves import zip

log = logging.getLogger(__name__)

# Get path to an imported file.
def get_path(file_source):
  importdir = os.environ.get('IMPORTDIR') or '/importdir'
  return os.path.join(importdir, file_source)

def capitalize(word):
  """Capitalize the first character in the word (without lowercasing the rest)."""
  return word[0].capitalize() + word[1:]

def _is_numeric(text):
  for t in six.integer_types + (float, complex):
    try:
      t(text)
      return True
    except (ValueError, OverflowError):
      pass
  return False


def _is_header(header, data_rows):
  """
  Returns whether header can be considered a legitimate header for data_rows.
  """
  # See if the row has any non-text values.
  for cell in header:
    if not isinstance(cell.value, six.string_types) or _is_numeric(cell.value):
      return False

  # If it's all text, see if the values in the first row repeat in other rows. That's uncommon
  # for a header.
  count_repeats = [0 for cell in header]
  for row in data_rows:
    for cell, header_cell in zip(row, header):
      if cell.value and cell.value == header_cell.value:
        return False

  return True

def _count_nonempty(row):
  """
  Returns the count of cells in row, ignoring trailing empty cells.
  """
  count = 0
  for i, c in enumerate(row):
    if not c.empty:
      count = i + 1
  return count


def find_first_non_empty_row(rows):
  """
  Returns (data_offset, header) of the first row with non-empty fields,
  or (0, []) if there are no non-empty rows.
  """
  for i, row in enumerate(rows):
    if _count_nonempty(row) > 0:
      return i + 1, row
  # No non-empty rows.
  return 0, []


def expand_headers(headers, data_offset, rows):
  """
  Returns an expanded header that has enough columns for all rows in the given sample.
  """
  row_length = max(itertools.chain([len(headers)],
                                   (_count_nonempty(r) for r in itertools.islice(rows, data_offset,
                                                                                 None))))
  header_values = [h.value.strip() for h in headers] + [u''] * (row_length - len(headers))
  return header_values


def headers_guess(rows):
  """
  Our own smarter version of messytables.headers_guess, which also guesses whether one of
  the first rows is in fact a header. Returns (data_offset, headers) where data_offset is the
  index of the first line of data, and headers is the list of guessed headers (which will contain
  empty strings if the file had no headers).
  """
  # Messytables guesses at the length of data rows, and then assumes that the first row that has
  # close to that many non-empty fields is the header, where by "close" it means 1 less.
  #
  # For Grist, it's better to mistake headers for data than to mistake data for headers. Note that
  # there is csv.Sniffer().has_header(), which tries to be clever, but it messes up too much.
  #
  # We only consider for the header the first row with non-empty cells. It is a header if
  # - it has no non-text fields
  # - none of the fields have a value that repeats in that column of data

  # Find the first row with non-empty fields.
  data_offset, header = find_first_non_empty_row(rows)
  if not header:
    return data_offset, header

  # Let's see if the row is really a header.
  if not _is_header(header, itertools.islice(rows, data_offset, None)):
    data_offset -= 1
    header = []

  # Expand the header to have enough columns for all rows in the given sample.
  header_values = expand_headers(header, data_offset, rows)

  return data_offset, header_values
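# Example (an illustrative sketch; Cell is a stand-in for messytables cells, which
# expose .value and .empty):
#
#   from collections import namedtuple
#   Cell = namedtuple('Cell', ['value', 'empty'])
#   rows = [[Cell('Name', False), Cell('Age', False)],
#           [Cell('Alice', False), Cell(30, False)]]
#   headers_guess(rows)   # -> (1, ['Name', 'Age'])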
@ -0,0 +1,118 @@
"""
This module reads a file path that is passed in using ActiveDoc.importFile()
and returns an object formatted so that it can be used by grist for a bulk add records action
"""
import os
import csv
import itertools
import logging

import chardet
import messytables
import six
from six.moves import zip

import parse_data
import import_utils


log = logging.getLogger(__name__)


def import_file(file_source, parse_options):
  path = import_utils.get_path(file_source["path"])
  orig_name = file_source["origName"]
  parse_options, tables = parse_file(path, orig_name, parse_options)
  return {"parseOptions": parse_options, "tables": tables}

# Messytables is painfully un-extensible, so we have to jump through dumb hoops to override any
# behavior.
orig_dialect = messytables.CSVRowSet._dialect
def override_dialect(self):
  if self.delimiter == '\t':
    return csv.excel_tab
  return orig_dialect.fget(self)
messytables.CSVRowSet._dialect = property(override_dialect)

def parse_file(file_path, orig_name, parse_options=None, table_name_hint=None, num_rows=None):
  # pylint: disable=unused-argument
  with open(file_path, "rb") as f:
    try:
      return parse_open_file(f, orig_name, table_name_hint=table_name_hint)
    except Exception as e:
      # Log the full error, but simplify the thrown error to omit the unhelpful extra args.
      log.info("import_xls parse_file failed: %s", e)
      if six.PY2 and e.args and isinstance(e.args[0], six.string_types):
        raise Exception(e.args[0])
      raise


def parse_open_file(file_obj, orig_name, table_name_hint=None):
  file_root, file_ext = os.path.splitext(orig_name)
  table_set = messytables.any.any_tableset(file_obj, extension=file_ext, auto_detect=False)

  # Messytables' encoding detection uses too small a sample, so we override it here.
  if isinstance(table_set, messytables.CSVTableSet):
    sample = file_obj.read(100000)
    table_set.encoding = chardet.detect(sample)['encoding']
    # In addition, always prefer UTF8 over ASCII.
    if table_set.encoding == 'ascii':
      table_set.encoding = 'utf8'

  export_list = []
  # A table set is a collection of tables:
  for row_set in table_set.tables:
    table_name = row_set.name

    if isinstance(row_set, messytables.CSVRowSet):
      # For csv files, we can do better for table_name by using the filename.
      table_name = import_utils.capitalize(table_name_hint or
                                           os.path.basename(file_root.decode('utf8')))

      # Messytables doesn't guess whether headers are present, so we need to step in.
      data_offset, headers = import_utils.headers_guess(list(row_set.sample))
    else:
      # Let messytables guess header names and the offset of the header.
      offset, headers = messytables.headers_guess(row_set.sample)
      data_offset = offset + 1   # Add the header line

    # Make sure all header values are strings.
    for i, header in enumerate(headers):
      if not isinstance(header, six.string_types):
        headers[i] = six.text_type(header)

    log.debug("Guessed data_offset as %s", data_offset)
    log.debug("Guessed headers as: %s", headers)

    row_set.register_processor(messytables.offset_processor(data_offset))

    table_data_with_types = parse_data.get_table_data(row_set, len(headers))

    # Identify and remove empty columns, and populate separate metadata and data lists.
    column_metadata = []
    table_data = []
    for col_data, header in zip(table_data_with_types, headers):
      if not header and all(val == "" for val in col_data["data"]):
        continue   # empty column
      data = col_data.pop("data")
      col_data["id"] = header
      column_metadata.append(col_data)
      table_data.append(data)

    if not table_data:
      # Don't add tables with no columns.
      continue

    log.info("Output table %r with %d columns", table_name, len(column_metadata))
    for c in column_metadata:
      log.debug("Output column %s", c)
    export_list.append({
      "table_name": table_name,
      "column_metadata": column_metadata,
      "table_data": table_data
    })

  parse_options = {}

  return parse_options, export_list
@ -0,0 +1,25 @@
import logging
import sandbox

import import_csv
import import_xls
import import_json

def main():
  s = logging.StreamHandler()
  s.setFormatter(logging.Formatter(fmt='%(asctime)s.%(msecs)03d %(message)s',
                                   datefmt='%Y-%m-%d %H:%M:%S'))
  rootLogger = logging.getLogger()
  rootLogger.addHandler(s)
  rootLogger.setLevel(logging.INFO)

  # Todo: Grist should expose a register method accepting arguments as
  # follows: register('csv_parser', 'canParse', can_parse)
  sandbox.register("csv_parser.parseFile", import_csv.parse_file_source)
  sandbox.register("xls_parser.parseFile", import_xls.import_file)
  sandbox.register("json_parser.parseFile", import_json.parse_file)

  sandbox.run()

if __name__ == "__main__":
  main()
@ -0,0 +1,299 @@
"""
This module implements a way to detect and convert types that's better than messytables (at least
in some relevant cases).

It has a simple interface: get_table_data(row_set) which returns a list of columns, each a
dictionary with "type" and "data" fields, where "type" is a Grist type string, and data is a list
of values. All "data" lists will have the same length.
"""

import dateguess
import datetime
import logging
import re
import messytables
import moment   # TODO grist internal libraries might not be available to plugins in the future.
import dateutil.parser as date_parser
import six
from six.moves import zip, xrange

log = logging.getLogger(__name__)


# Typechecking using type(value) instead of isinstance(value, some_type) makes parsing 25% faster
# pylint:disable=unidiomatic-typecheck


# Our approach to type detection is different from that of messytables.
# We first go through each cell in a sample of rows, trying to convert it to each of the basic
# types, and keep a count of successes for each. We use the counts to decide the basic types (e.g.
# numeric vs text). Then we go through the full data set converting to the chosen basic type.
# During this process, we keep counts of suitable Grist types to consider (e.g. Int vs Numeric).
# We use those counts to produce the selected Grist type at the end.
||||
|
||||
class BaseConverter(object):
|
||||
@classmethod
|
||||
def test(cls, value):
|
||||
try:
|
||||
cls.convert(value)
|
||||
return True
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
@classmethod
|
||||
def convert(cls, value):
|
||||
"""Implement to convert imported value to a basic type."""
|
||||
raise NotImplementedError()
|
||||
|
||||
@classmethod
|
||||
def get_grist_column(cls, values):
|
||||
"""
|
||||
Given an array of values returned successfully by convert(), return a tuple of
|
||||
(grist_type_string, grist_values), where grist_values is an array of values suitable for the
|
||||
returned grist type.
|
||||
"""
|
||||
raise NotImplementedError()
|
||||
|
||||
|
||||
class NumericConverter(BaseConverter):
|
||||
"""Handles numeric values, including Grist types Numeric and Int."""
|
||||
|
||||
# A number matching this is probably an identifier of some sort. Converting it to a float will
|
||||
# lose precision, so it's better not to consider it numeric.
|
||||
_unlikely_float = re.compile(r'\d{17}|^0\d')
|
||||
|
||||
# Integers outside this range will be represented as floats. This is the limit for values that can
|
||||
# be stored in a JS Int32Array.
|
||||
_max_js_int = 1<<31
|
||||
|
||||
# The thousands separator. It should be locale-specific, but we don't currently have a way to
|
||||
# detect locale from the data. (Also, the sandbox's locale module isn't fully functional.)
|
||||
_thousands_sep = ','
|
||||
|
||||
@classmethod
|
||||
def convert(cls, value):
|
||||
if type(value) in six.integer_types + (float, complex):
|
||||
return value
|
||||
if type(value) in (str, six.text_type) and not cls._unlikely_float.search(value):
|
||||
return float(value.strip().lstrip('$').replace(cls._thousands_sep, ""))
|
||||
raise ValueError()
|
||||
|
||||
@classmethod
|
||||
def _is_integer(cls, value):
|
||||
ttype = type(value)
|
||||
if ttype == int or (ttype == float and value.is_integer()):
|
||||
return -cls._max_js_int <= value < cls._max_js_int
|
||||
return False
|
||||
|
||||
@classmethod
|
||||
def get_grist_column(cls, values):
|
||||
if all(cls._is_integer(v) for v in values):
|
||||
return ("Int", [int(v) for v in values])
|
||||
return ("Numeric", values)


class DateParserInfo(date_parser.parserinfo):
  def validate(self, res):
    # Avoid this bogus combination which accepts plain numbers.
    if res.day and not res.month:
      return False
    return super(DateParserInfo, self).validate(res)
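
# (Without the check above, dateutil will happily read a bare number such as "5"
# as a day-of-month with no month set, so plain integers could be mistaken for
# dates.)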


class SimpleDateTimeConverter(BaseConverter):
  """Handles Date and DateTime values which are already instances of datetime.datetime."""

  @classmethod
  def convert(cls, value):
    if type(value) is datetime.datetime:
      return value
    elif value == "":
      return None
    raise ValueError()

  @classmethod
  def _is_date(cls, value):
    return value is None or value.time() == datetime.time()

  @classmethod
  def get_grist_column(cls, values):
    grist_type = "Date" if all(cls._is_date(v) for v in values) else "DateTime"
    grist_values = [(v if (v is None) else moment.dt_to_ts(v))
                    for v in values]
    return grist_type, grist_values


class DateTimeConverter(BaseConverter):
  """Handles date/datetime strings parsed using a guessed format string."""

  def __init__(self, date_format):
    self._format = date_format

  def convert(self, value):
    if value == "":
      return None
    if type(value) in (str, six.text_type):
      # datetime.strptime doesn't handle %z and %Z tags in Python 2.
      if '%z' in self._format or '%Z' in self._format:
        return date_parser.parse(value)
      else:
        try:
          return datetime.datetime.strptime(value, self._format)
        except ValueError:
          return date_parser.parse(value)

    raise ValueError()

  def _is_date(self, value):
    return value is None or value.time() == datetime.time()

  def get_grist_column(self, values):
    grist_type = "Date" if all(self._is_date(v) for v in values) else "DateTime"
    grist_values = [(v if (v is None) else moment.dt_to_ts(v))
                    for v in values]
    return grist_type, grist_values


class BoolConverter(BaseConverter):
  """Handles the Boolean type."""

  _true_values = (1, '1', 'true', 'yes')
  _false_values = (0, '0', 'false', 'no')

  @classmethod
  def convert(cls, value):
    v = value.strip().lower() if type(value) in (str, six.text_type) else value
    if v in cls._true_values:
      return True
    elif v in cls._false_values:
      return False
    raise ValueError()

  @classmethod
  def get_grist_column(cls, values):
    return ("Bool", values)


class TextConverter(BaseConverter):
  """Fallback converter that converts everything to strings."""
  @classmethod
  def convert(cls, value):
    return six.text_type(value)

  @classmethod
  def get_grist_column(cls, values):
    return ("Text", values)


class ColumnDetector(object):
  """
  ColumnDetector accepts calls to `add_value()`, and keeps track of successful conversions to
  different basic types. At the end, the `get_converter()` method returns the most suitable
  converter.
  """
  # Converters are listed in the order of preference, which is only used if two converters succeed
  # on the same exact number of values. Text is always a fallback.
  converters = [SimpleDateTimeConverter, BoolConverter, NumericConverter]

  # If this fraction of non-junk values or more can't be converted, fall back to text.
  _text_threshold = 0.10

  # Junk values: these aren't counted when deciding whether to fall back to text.
  _junk_re = re.compile(r'^\s*(|-+|\?+|n/?a)\s*$', re.I)

  def __init__(self):
    self._counts = [0] * len(self.converters)
    self._count_nonjunk = 0
    self._count_total = 0
    self._data = []

  def add_value(self, value):
    self._count_total += 1
    if value is None or (type(value) in (str, six.text_type) and self._junk_re.match(value)):
      return

    self._data.append(value)

    self._count_nonjunk += 1
    for i, conv in enumerate(self.converters):
      if conv.test(value):
        self._counts[i] += 1

  def get_converter(self):
    if sum(self._counts) == 0:
      # If nothing was detected as int, bool or datetime, try to guess a date pattern.
      str_data = [d for d in self._data if isinstance(d, six.string_types)]
      data_formats = dateguess.guess_bulk(str_data, error_rate=self._text_threshold)
      data_format = data_formats[0] if data_formats else None
      if data_format:
        return DateTimeConverter(data_format)

    # We find the max by count, and secondarily by minimum index in the converters list.
    count, neg_index = max((c, -i) for (i, c) in enumerate(self._counts))
    if count > 0 and count >= self._count_nonjunk * (1 - self._text_threshold):
      return self.converters[-neg_index]
    return TextConverter
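
# A sketch of how a detector is driven (see _guess_basic_types below; note that
# get_converter() usually returns a converter *class*, but returns a
# DateTimeConverter *instance* when a date format had to be guessed):
#
#   detector = ColumnDetector()
#   for value in sampled_cell_values:   # name is illustrative
#     detector.add_value(value)
#   converter = detector.get_converter()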


def _guess_basic_types(rows, num_columns):
  column_detectors = [ColumnDetector() for i in xrange(num_columns)]
  for row in rows:
    for cell, detector in zip(row, column_detectors):
      detector.add_value(cell.value)

  return [detector.get_converter() for detector in column_detectors]


class ColumnConverter(object):
  """
  ColumnConverter converts and collects values using the passed-in converter object. At the end,
  the `get_grist_column()` method returns a column of converted data.
  """
  def __init__(self, converter):
    self._converter = converter
    self._all_col_values = []     # Initially this has None's for converted values
    self._converted_values = []   # A list of all converted values
    self._converted_indices = []  # Indices of the converted values into self._all_col_values

  def convert_and_add(self, value):
    # For some reason, we get 'str' type rather than 'unicode' for empty strings.
    # Correct this, since all text should be unicode.
    value = u"" if value == "" else value
    try:
      conv = self._converter.convert(value)
      self._converted_values.append(conv)
      self._converted_indices.append(len(self._all_col_values))
      self._all_col_values.append(None)
    except Exception:
      self._all_col_values.append(six.text_type(value))

  def get_grist_column(self):
    """
    Returns a dictionary {"type": grist_type, "data": grist_value_array}.
    """
    grist_type, grist_values = self._converter.get_grist_column(self._converted_values)
    for i, v in zip(self._converted_indices, grist_values):
      self._all_col_values[i] = v
    return {"type": grist_type, "data": self._all_col_values}


def get_table_data(row_set, num_columns, num_rows=0):
  converters = _guess_basic_types(row_set.sample, num_columns)
  col_converters = [ColumnConverter(c) for c in converters]
  for num, row in enumerate(row_set):
    if num_rows and num == num_rows:
      break

    if num % 10000 == 0:
      log.info("Processing row %d", num)

    # Make sure we have a value for every column.
    missing_values = len(converters) - len(row)
    if missing_values > 0:
      row.extend([messytables.Cell("")] * missing_values)

    for cell, conv in zip(row, col_converters):
      conv.convert_and_add(cell.value)

  return [conv.get_grist_column() for conv in col_converters]
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,102 @@
import unittest
from dateguess import guess, guess_bulk


class TestGuesser(unittest.TestCase):
  def assertDate(self, input_str, fmt_list):
    guessed = guess(input_str)
    self.assertEqual(set(guessed), set(fmt_list))

  def assertDates(self, input_lst, error_rate, fmt_list):
    guessed = guess_bulk(input_lst, error_rate=error_rate)
    self.assertEqual(set(guessed), set(fmt_list))

  def test_guess_dates(self):
    self.assertDate('', [])
    self.assertDate("2013-13-13", [])
    self.assertDate("25/25/1911", [])

    self.assertDate("2014-01-11", ['%Y-%m-%d', '%Y-%d-%m'])
    self.assertDate("2014-11-01", ['%Y-%m-%d', '%Y-%d-%m'])
    self.assertDate("1990-05-05", ['%Y-%m-%d', '%Y-%d-%m'])
    self.assertDate("2013-12-13", ['%Y-%m-%d'])

    self.assertDate("12/31/1999", ['%m/%d/%Y'])
    self.assertDate("11/11/1911", ['%m/%d/%Y', '%d/%m/%Y'])
    self.assertDate("5/9/1981", ['%m/%d/%Y', '%d/%m/%Y'])
    self.assertDate("6/3/1985", ['%m/%d/%Y', '%d/%m/%Y'])

    self.assertDate("12/31/99", ['%m/%d/%y'])
    self.assertDate("11/11/11", ['%y/%m/%d', '%y/%d/%m', '%m/%d/%y', '%d/%m/%y'])
    self.assertDate("5/9/81", ['%m/%d/%y', '%d/%m/%y'])
    self.assertDate("6/3/85", ['%m/%d/%y', '%d/%m/%y'])

    self.assertDate("31.12.91", ['%d.%m.%y'])
    self.assertDate("4.4.87", ['%m.%d.%y', '%d.%m.%y'])

    self.assertDate("13.2.8", ['%y.%m.%d', '%y.%d.%m'])
    self.assertDate("31.12.1991", ['%d.%m.%Y'])
    self.assertDate("4.4.1987", ['%m.%d.%Y', '%d.%m.%Y'])
    self.assertDate("13.2.2008", ['%d.%m.%Y'])
    self.assertDate("31.12.91", ['%d.%m.%y'])
    self.assertDate("4.4.87", ['%m.%d.%y', '%d.%m.%y'])
    self.assertDate("13.2.8", ['%y.%m.%d', '%y.%d.%m'])

    self.assertDate("9 May 1981", ['%d %b %Y', '%d %B %Y'])
    self.assertDate("31 Dec 1999", ['%d %b %Y'])
    self.assertDate("1 Jan 2012", ['%d %b %Y'])
    self.assertDate("3 August 2009", ['%d %B %Y'])
    self.assertDate("2 May 1980", ['%d %B %Y', '%d %b %Y'])

    self.assertDate("13/1/2012", ['%d/%m/%Y'])

    self.assertDate("Aug 1st 2014", ['%b %dst %Y'])
    self.assertDate("12/22/2015 00:00:00.10", ['%m/%d/%Y %H:%M:%S.%f'])

  def test_guess_datetimes(self):
    self.assertDate("Thu Sep 25 10:36:28 2003", ['%a %b %d %H:%M:%S %Y'])
    self.assertDate("Thu Sep 25 2003 10:36:28", ['%a %b %d %Y %H:%M:%S'])
    self.assertDate("10:36:28 Thu Sep 25 2003", ['%H:%M:%S %a %b %d %Y'])

    self.assertDate("2014-01-11T12:21:05", ['%Y-%m-%dT%H:%M:%S', '%Y-%d-%mT%H:%M:%S'])
    self.assertDate("2015-02-16T16:05:31", ['%Y-%m-%dT%H:%M:%S'])
    # TODO remove all except first one
    self.assertDate("2015-02-16T16:05", ['%Y-%m-%dT%H:%M', '%Y-%H-%MT%d:%m',
                                         '%Y-%m-%HT%M:%d', '%Y-%d-%HT%M:%m'])
    self.assertDate("2015-02-16T16", ['%Y-%m-%dT%H', '%Y-%m-%HT%d'])  # TODO remove second one

    self.assertDate("Mon Jan 13 9:52:52 am MST 2014", ['%a %b %d %I:%M:%S %p %Z %Y'])
    self.assertDate("Tue Jan 21 3:30:00 PM EST 2014", ['%a %b %d %I:%M:%S %p %Z %Y'])
    self.assertDate("Mon Jan 13 09:52:52 MST 2014", ['%a %b %d %H:%M:%S %Z %Y'])
    self.assertDate("Tue Jan 21 15:30:00 EST 2014", ['%a %b %d %H:%M:%S %Z %Y'])
    self.assertDate("Mon Jan 13 9:52 am MST 2014", ['%a %b %d %I:%M %p %Z %Y'])
    self.assertDate("Tue Jan 21 3:30 PM EST 2014", ['%a %b %d %I:%M %p %Z %Y'])

    self.assertDate("2014-01-11T12:21:05", ['%Y-%m-%dT%H:%M:%S', '%Y-%d-%mT%H:%M:%S'])
    self.assertDate("2015-02-16T16:05:31", ['%Y-%m-%dT%H:%M:%S'])
    self.assertDate("Thu Sep 25 10:36:28 2003", ['%a %b %d %H:%M:%S %Y'])
    self.assertDate("10:36:28 Thu Sep 25 2003", ['%H:%M:%S %a %b %d %Y'])

    self.assertDate("2014-01-11T12:21:05+0000", ['%Y-%d-%mT%H:%M:%S%z', '%Y-%m-%dT%H:%M:%S%z'])
    self.assertDate("2015-02-16T16:05:31-0400", ['%Y-%m-%dT%H:%M:%S%z'])
    self.assertDate("Thu, 25 Sep 2003 10:49:41 -0300", ['%a, %d %b %Y %H:%M:%S %z'])
    self.assertDate("Thu, 25 Sep 2003 10:49:41 +0300", ['%a, %d %b %Y %H:%M:%S %z'])

    self.assertDate("2003-09-25T10:49:41", ['%Y-%m-%dT%H:%M:%S'])
    self.assertDate("2003-09-25T10:49", ['%Y-%m-%dT%H:%M'])

  def test_guess_bulk_dates(self):
    self.assertDates(["11/11/1911", "25/11/1911", "11/11/1911", "11/11/1911"], 0.0, ['%d/%m/%Y'])
    self.assertDates(["25/11/1911", "25/25/1911", "11/11/1911", "11/11/1911"], 0.0, [])
    self.assertDates(["25/11/1911", "25/25/1911", "11/11/1911", "11/11/1911"], 0.5, ['%d/%m/%Y'])

    self.assertDates(["25/11/1911", "25/25/1911", "11/11/1911", "11/11/1911"], 0.1, [])
    self.assertDates(["23/11/1911", '2004 May 12', "11/11/1911", "11/11/1911"], 0.5, ['%d/%m/%Y'])

    self.assertDates(['2004 May 12', "11/11/1911", "11/11/1911", "23/11/1911"], 0.5, ['%d/%m/%Y'])
    self.assertDates(['2004 May 12', "11/11/1911", "11/11/1911", "23/11/1911"], 0.0, [])
    self.assertDates(['12/22/2015', "12/22/2015 1:15pm", "2018-02-27 16:08:39 +0000"], 0.1, [])


if __name__ == "__main__":
  unittest.main()
@@ -0,0 +1,341 @@
# This Python file uses the following encoding: utf-8
# Run tests with:
#
# ./sandbox/nacl/bin/sel_ldr -E PYTHONPATH=/grist:/thirdparty -B ./sandbox/nacl/lib/irt_core.nexe -l /dev/null -m ./sandbox/nacl/root:/:ro -m ./plugins/core/sandbox:/sandbox:ro ./sandbox/nacl/lib/runnable-ld.so --library-path /slib /python/bin/python2.7.nexe -m unittest discover -v -s /sandbox #pylint: disable=line-too-long
#
#
# TODO: run test automatically
#
import math
import os
import textwrap
import unittest
from six import BytesIO, text_type
import csv
import calendar
import datetime

import import_csv


def _get_fixture(filename):
  return os.path.join(os.path.dirname(__file__), "test/fixtures", filename)


def bytes_io_from_str(string):
  if isinstance(string, text_type):
    string = string.encode("utf8")
  return BytesIO(string)


class TestImportCSV(unittest.TestCase):

  def _check_col(self, sheet, index, name, typename, values):
    self.assertEqual(sheet["column_metadata"][index]["id"], name)
    self.assertEqual(sheet["column_metadata"][index]["type"], typename)
    self.assertEqual(sheet["table_data"][index], values)

  def _check_num_cols(self, sheet, exp_cols):
    self.assertEqual(len(sheet["column_metadata"]), exp_cols)
    self.assertEqual(len(sheet["table_data"]), exp_cols)

  def test_csv_types(self):
    parsed_file = import_csv.parse_file(_get_fixture('test_excel_types.csv'), parse_options='')
    sheet = parsed_file[1][0]

    self._check_col(sheet, 0, "int1", "Int", [-1234123, '', ''])
    self._check_col(sheet, 1, "int2", "Int", [5, '', ''])
    self._check_col(sheet, 2, "textint", "Text", ["12345678902345689", '', ''])
    self._check_col(sheet, 3, "bigint", "Text", ["320150170634561830", '', ''])
    self._check_col(sheet, 4, "num2", "Numeric", [123456789.123456, '', ''])
    self._check_col(sheet, 5, "bignum", "Numeric", [7.22597e+86, '', ''])
    self._check_col(sheet, 6, "date1", "DateTime",
                    [calendar.timegm(datetime.datetime(2015, 12, 22, 11, 59, 00).timetuple()),
                     None, None])
    self._check_col(sheet, 7, "date2", "Date",
                    [calendar.timegm(datetime.datetime(2015, 12, 20, 0, 0, 0).timetuple()),
                     None, None])
    self._check_col(sheet, 8, "datetext", "Date",
                    [calendar.timegm(datetime.date(2015, 12, 22).timetuple()), None, None])
    self._check_col(sheet, 9, "datetimetext", "DateTime",
                    [calendar.timegm(datetime.datetime(2015, 12, 22, 0, 0, 0).timetuple()),
                     calendar.timegm(datetime.datetime(2015, 12, 22, 13, 15, 0).timetuple()),
                     calendar.timegm(datetime.datetime(2018, 2, 27, 16, 8, 39).timetuple())])

  def test_user_parse_options(self):
    options = {u'parse_options': {"escapechar": None, "include_col_names_as_headers": True,
                                  "lineterminator": "\n", "skipinitialspace": False,
                                  "limit_rows": False, "quoting": 0, "start_with_row": 1,
                                  "delimiter": ",", "NUM_ROWS": 10,
                                  "quotechar": "\"", "doublequote": True}}
    parsed_file = import_csv.parse_file(_get_fixture('test_import_csv.csv'),
                                        **options)[1][0]
    self._check_num_cols(parsed_file, 5)
    self._check_col(parsed_file, 0, "FIRST_NAME", "Text", ['John', 'Tim', 'Jenny', 'Lily'])
    self._check_col(parsed_file, 1, "LAST_NAME", "Text", ['Moor', 'Kale', 'Jo', 'Smit'])
    self._check_col(parsed_file, 2, "PHONE", "Text", ['201-343-3434', '201.343.3434',
                                                      '2013433434', '(201)343-3434'])
    self._check_col(parsed_file, 3, "VALUE", "Int", [45, 4545, 0, 4])
    self._check_col(parsed_file, 4, "DATE", "DateTime",
                    [1519747719.0, 1519744119.0, 1519751319.0, None])

  def test_wrong_cols1(self):
    file_obj = bytes_io_from_str(textwrap.dedent(
      """\
      name1, name2, name3
      a1,b1,c1
      a2,b2
      a3
      """))

    parsed_file = import_csv._parse_open_file(file_obj, parse_options={})[1][0]
    self._check_num_cols(parsed_file, 3)
    self._check_col(parsed_file, 0, "name1", "Text", ["a1", "a2", "a3"])
    self._check_col(parsed_file, 1, "name2", "Text", ["b1", "b2", ""])
    self._check_col(parsed_file, 2, "name3", "Text", ["c1", "", ""])

  def test_wrong_cols2(self):
    file_obj = bytes_io_from_str(textwrap.dedent(
      """\
      name1
      a1,b1
      a2,b2,c2
      """))

    parsed_file = import_csv._parse_open_file(file_obj, parse_options={})[1][0]
    self._check_num_cols(parsed_file, 3)
    self._check_col(parsed_file, 0, "name1", "Text", ["a1", "a2"])
    self._check_col(parsed_file, 1, "", "Text", ["b1", "b2"])
    self._check_col(parsed_file, 2, "", "Text", ["", "c2"])

  def test_offset(self):
    file_obj = bytes_io_from_str(textwrap.dedent(
      """\
      ,,,,,,,
      name1,name2,name3
      a1,b1,c1
      a2,b2,c2
      a3,b3,c3,d4
      """))

    parsed_file = import_csv._parse_open_file(file_obj, parse_options={})[1][0]
    self._check_num_cols(parsed_file, 4)
    self._check_col(parsed_file, 0, "name1", "Text", ["a1", "a2", "a3"])
    self._check_col(parsed_file, 1, "name2", "Text", ["b1", "b2", "b3"])
    self._check_col(parsed_file, 2, "name3", "Text", ["c1", "c2", "c3"])
    self._check_col(parsed_file, 3, "", "Text", ["", "", "d4"])

  def test_offset_no_header(self):
    file_obj = bytes_io_from_str(textwrap.dedent(
      """\
      4,b1,c1
      4,b2,c2
      4,b3,c3
      """))

    parsed_file = import_csv._parse_open_file(file_obj, parse_options={})[1][0]
    self._check_num_cols(parsed_file, 3)
    self._check_col(parsed_file, 0, "", "Int", [4, 4, 4])
    self._check_col(parsed_file, 1, "", "Text", ["b1", "b2", "b3"])
    self._check_col(parsed_file, 2, "", "Text", ["c1", "c2", "c3"])

  def test_empty_headers(self):
    file_obj = bytes_io_from_str(textwrap.dedent(
      """\
      ,,-,-
      b,a,a,a,a
      b,a,a,a,a
      b,a,a,a,a
      """))

    parsed_file = import_csv._parse_open_file(file_obj, parse_options={})[1][0]
    self._check_num_cols(parsed_file, 5)
    self._check_col(parsed_file, 0, "", "Text", ["b", "b", "b"])
    self._check_col(parsed_file, 1, "", "Text", ["a", "a", "a"])
    self._check_col(parsed_file, 2, "-", "Text", ["a", "a", "a"])
    self._check_col(parsed_file, 3, "-", "Text", ["a", "a", "a"])
    self._check_col(parsed_file, 4, "", "Text", ["a", "a", "a"])

    file_obj = bytes_io_from_str(textwrap.dedent(
      """\
      -,-,-,-,-,-
      b,a,a,a,a
      b,a,a,a,a
      b,a,a,a,a
      """))

    parsed_file = import_csv._parse_open_file(file_obj, parse_options={})[1][0]
    self._check_num_cols(parsed_file, 6)
    self._check_col(parsed_file, 0, "-", "Text", ["b", "b", "b"])
    self._check_col(parsed_file, 1, "-", "Text", ["a", "a", "a"])
    self._check_col(parsed_file, 2, "-", "Text", ["a", "a", "a"])
    self._check_col(parsed_file, 3, "-", "Text", ["a", "a", "a"])
    self._check_col(parsed_file, 4, "-", "Text", ["a", "a", "a"])
    self._check_col(parsed_file, 5, "-", "Text", ["", "", ""])

  def test_guess_missing_user_option(self):
    file_obj = bytes_io_from_str(textwrap.dedent(
      """\
      name1,;name2,;name3
      a1,;b1,;c1
      a2,;b2,;c2
      a3,;b3,;c3
      """))
    parse_options = {"delimiter": ';',
                     "escapechar": None,
                     "lineterminator": '\r\n',
                     "quotechar": '"',
                     "quoting": csv.QUOTE_MINIMAL}

    parsed_file = import_csv._parse_open_file(file_obj, parse_options=parse_options)[1][0]
    self._check_num_cols(parsed_file, 3)
    self._check_col(parsed_file, 0, "name1,", "Text", ["a1,", "a2,", "a3,"])
    self._check_col(parsed_file, 1, "name2,", "Text", ["b1,", "b2,", "b3,"])
    self._check_col(parsed_file, 2, "name3", "Text", ["c1", "c2", "c3"])

    # Sniffer detects delimiters in order [',', '\t', ';', ' ', ':'],
    # so for this file_obj it will be ','.
    parsed_file = import_csv._parse_open_file(file_obj, parse_options={})[1][0]
    self._check_num_cols(parsed_file, 3)
    self._check_col(parsed_file, 0, "name1", "Text", ["a1", "a2", "a3"])
    self._check_col(parsed_file, 1, ";name2", "Text", [";b1", ";b2", ";b3"])
    self._check_col(parsed_file, 2, ";name3", "Text", [";c1", ";c2", ";c3"])

  def test_one_line_file_no_header(self):
    file_obj = bytes_io_from_str(textwrap.dedent(
      """\
      2,name2,name3
      """))

    parsed_file = import_csv._parse_open_file(file_obj, parse_options={})[1][0]
    self._check_num_cols(parsed_file, 3)
    self._check_col(parsed_file, 0, "", "Int", [2])
    self._check_col(parsed_file, 1, "", "Text", ["name2"])
    self._check_col(parsed_file, 2, "", "Text", ["name3"])

  def test_one_line_file_with_header(self):
    file_obj = bytes_io_from_str(textwrap.dedent(
      """\
      name1,name2,name3
      """))

    parsed_file = import_csv._parse_open_file(file_obj, parse_options={})[1][0]
    self._check_num_cols(parsed_file, 3)
    self._check_col(parsed_file, 0, "name1", "Text", [])
    self._check_col(parsed_file, 1, "name2", "Text", [])
    self._check_col(parsed_file, 2, "name3", "Text", [])

  def test_empty_file(self):
    file_obj = bytes_io_from_str(textwrap.dedent(
      """\
      """))

    parsed_file = import_csv._parse_open_file(file_obj, parse_options={})
    self.assertEqual(parsed_file, ({}, []))

  def test_option_num_rows(self):
    file_obj = bytes_io_from_str(textwrap.dedent(
      """\
      name1,name2,name3
      a1,b1,c1
      a2,b2,c2
      a3,b3,c3
      """))

    parse_options = {}
    parsed_file = import_csv._parse_open_file(file_obj, parse_options=parse_options)[1][0]
    self._check_num_cols(parsed_file, 3)
    self._check_col(parsed_file, 0, "name1", "Text", ['a1', 'a2', 'a3'])
    self._check_col(parsed_file, 1, "name2", "Text", ['b1', 'b2', 'b3'])
    self._check_col(parsed_file, 2, "name3", "Text", ['c1', 'c2', 'c3'])

    parse_options = {"NUM_ROWS": 2}
    parsed_file = import_csv._parse_open_file(file_obj, parse_options=parse_options)[1][0]
    self._check_num_cols(parsed_file, 3)
    self._check_col(parsed_file, 0, "name1", "Text", ["a1", "a2"])
    self._check_col(parsed_file, 1, "name2", "Text", ["b1", "b2"])
    self._check_col(parsed_file, 2, "name3", "Text", ["c1", "c2"])

    parse_options = {"NUM_ROWS": 10}
    parsed_file = import_csv._parse_open_file(file_obj, parse_options=parse_options)[1][0]
    self._check_num_cols(parsed_file, 3)
    self._check_col(parsed_file, 0, "name1", "Text", ['a1', 'a2', 'a3'])
    self._check_col(parsed_file, 1, "name2", "Text", ['b1', 'b2', 'b3'])
    self._check_col(parsed_file, 2, "name3", "Text", ['c1', 'c2', 'c3'])

  def test_option_num_rows_no_header(self):
    file_obj = bytes_io_from_str(textwrap.dedent(
      """\
      ,,
      ,,
      a1,1,c1
      a2,2,c2
      a3,3,c3
      """))

    parse_options = {}
    parsed_file = import_csv._parse_open_file(file_obj, parse_options=parse_options)[1][0]
    self._check_num_cols(parsed_file, 3)
    self._check_col(parsed_file, 0, "", "Text", ['a1', 'a2', 'a3'])
    self._check_col(parsed_file, 1, "", "Int", [1, 2, 3])
    self._check_col(parsed_file, 2, "", "Text", ['c1', 'c2', 'c3'])

    parse_options = {"NUM_ROWS": 2}
    parsed_file = import_csv._parse_open_file(file_obj, parse_options=parse_options)[1][0]
    self._check_num_cols(parsed_file, 3)
    self._check_col(parsed_file, 0, "", "Text", ['a1', 'a2'])
    self._check_col(parsed_file, 1, "", "Int", [1, 2])
    self._check_col(parsed_file, 2, "", "Text", ['c1', 'c2'])

  def test_option_use_col_name_as_header(self):
    file_obj = bytes_io_from_str(textwrap.dedent(
      """\
      name1,name2,name3
      a1,1,c1
      a2,2,c2
      a3,3,c3
      """))

    parse_options = {"include_col_names_as_headers": False}
    parsed_file = import_csv._parse_open_file(file_obj, parse_options=parse_options)[1][0]
    self._check_num_cols(parsed_file, 3)
    self._check_col(parsed_file, 0, "", "Text", ["name1", "a1", "a2", "a3"])
    self._check_col(parsed_file, 1, "", "Text", ["name2", "1", "2", "3"])
    self._check_col(parsed_file, 2, "", "Text", ["name3", "c1", "c2", "c3"])

    parse_options = {"include_col_names_as_headers": True}
    parsed_file = import_csv._parse_open_file(file_obj, parse_options=parse_options)[1][0]
    self._check_num_cols(parsed_file, 3)
    self._check_col(parsed_file, 0, "name1", "Text", ["a1", "a2", "a3"])
    self._check_col(parsed_file, 1, "name2", "Int", [1, 2, 3])
    self._check_col(parsed_file, 2, "name3", "Text", ["c1", "c2", "c3"])

  def test_option_use_col_name_as_header_no_headers(self):
    file_obj = bytes_io_from_str(textwrap.dedent(
      """\
      ,,,
      ,,,
      n1,2,n3
      a1,1,c1,d1
      a2,4,c2
      a3,5,c3
      """))

    parse_options = {"include_col_names_as_headers": False}
    parsed_file = import_csv._parse_open_file(file_obj, parse_options=parse_options)[1][0]
    self._check_num_cols(parsed_file, 4)
    self._check_col(parsed_file, 0, "", "Text", ["n1", "a1", "a2", "a3"])
    self._check_col(parsed_file, 1, "", "Int", [2, 1, 4, 5])
    self._check_col(parsed_file, 2, "", "Text", ["n3", "c1", "c2", "c3"])
    self._check_col(parsed_file, 3, "", "Text", ["", "d1", "", ""])

    parse_options = {"include_col_names_as_headers": True}
    parsed_file = import_csv._parse_open_file(file_obj, parse_options=parse_options)[1][0]
    self._check_num_cols(parsed_file, 4)
    self._check_col(parsed_file, 0, "n1", "Text", ["a1", "a2", "a3"])
    self._check_col(parsed_file, 1, "2", "Int", [1, 4, 5])
    self._check_col(parsed_file, 2, "n3", "Text", ["c1", "c2", "c3"])
    self._check_col(parsed_file, 3, "", "Text", ["d1", "", ""])


if __name__ == '__main__':
  unittest.main()
@@ -0,0 +1,259 @@
from unittest import TestCase
import import_json


class TestImportJSON(TestCase):

  maxDiff = None

  def test_simple_json_array(self):
    grist_tables = import_json.dumps([{'a': 1, 'b': 'baba'}, {'a': 4, 'b': 'abab'}], '')
    self.assertEqual(grist_tables['tables'], [{
      'column_metadata': [
        {'id': 'a', 'type': 'Int'}, {'id': 'b', 'type': 'Text'}],
      'table_data': [[1, 4], ['baba', 'abab']],
      'table_name': ''
    }])

  def test_missing_data(self):
    grist_tables = import_json.dumps([{'a': 1}, {'b': 'abab'}, {'a': 4}])
    self.assertEqual(grist_tables['tables'], [{
      'column_metadata': [
        {'id': 'a', 'type': 'Int'}, {'id': 'b', 'type': 'Text'}],
      'table_data': [[1, None, 4], [None, 'abab', None]],
      'table_name': ''
    }])

  def test_even_more_simple_array(self):
    self.assertEqual(
      import_json.dumps(['apple', 'pear', 'banana'], '')['tables'],
      [{
        'column_metadata': [
          {'id': '', 'type': 'Text'}],
        'table_data': [['apple', 'pear', 'banana']],
        'table_name': ''
      }])

  def test_mixing_simple_and_even_more_simple(self):
    self.assertEqual(
      import_json.dumps(['apple', 'pear', {'a': 'some cucumbers'}, 'banana'], '')['tables'],
      [{
        'column_metadata': [
          {'id': '', 'type': 'Text'},
          {'id': 'a', 'type': 'Text'}],
        'table_data': [['apple', 'pear', None, 'banana'], [None, None, 'some cucumbers', None]],
        'table_name': ''
      }])

  def test_array_with_reference(self):
    # TODO: references should follow Grist's format
    self.assertEqual(
      import_json.dumps([{'a': {'b': 2}, 'c': 'foo'}], 'Hello')['tables'],
      [{
        'column_metadata': [
          {'id': 'a', 'type': 'Ref:Hello_a'}, {'id': 'c', 'type': 'Text'}
        ],
        'table_data': [[1], ['foo']],
        'table_name': 'Hello'
      }, {
        'column_metadata': [
          {'id': 'b', 'type': 'Int'}
        ],
        'table_data': [[2]],
        'table_name': 'Hello_a'
      }])

  def test_nested_nested_object(self):
    self.assertEqual(
      import_json.dumps([{'a': {'b': 2, 'd': {'a': 'sugar'}}, 'c': 'foo'}], 'Hello')['tables'],
      [{
        'column_metadata': [
          {'id': 'a', 'type': 'Ref:Hello_a'}, {'id': 'c', 'type': 'Text'}
        ],
        'table_data': [[1], ['foo']],
        'table_name': 'Hello'
      }, {
        'column_metadata': [
          {'id': 'b', 'type': 'Int'}, {'id': 'd', 'type': 'Ref:Hello_a_d'}
        ],
        'table_data': [[2], [1]],
        'table_name': 'Hello_a'
      }, {
        'column_metadata': [
          {'id': 'a', 'type': 'Text'}
        ],
        'table_data': [['sugar']],
        'table_name': 'Hello_a_d'
      }])

  def test_array_with_list(self):
    self.assertEqual(
      import_json.dumps([{'a': ['ES', 'FR', 'US']}, {'a': ['FR']}], 'Hello')['tables'],
      [{
        'column_metadata': [],
        'table_data': [],
        'table_name': 'Hello'
      }, {
        'column_metadata': [{'id': '', 'type': 'Text'}, {'id': 'Hello', 'type': 'Ref:Hello'}],
        'table_data': [['ES', 'FR', 'US', 'FR'], [1, 1, 1, 2]],
        'table_name': 'Hello_a'
      }])

  def test_array_with_list_of_dict(self):
    self.assertEqual(
      import_json.dumps([{'a': [{'b': 1}, {'b': 4}]}, {'c': 2}], 'Hello')['tables'],
      [{
        'column_metadata': [{'id': 'c', 'type': 'Int'}],
        'table_data': [[None, 2]],
        'table_name': 'Hello'
      }, {
        'column_metadata': [
          {'id': 'b', 'type': 'Int'},
          {'id': 'Hello', 'type': 'Ref:Hello'}
        ],
        'table_data': [[1, 4], [1, 1]],
        'table_name': 'Hello_a'
      }])

  def test_array_of_array(self):
    self.assertEqual(
      import_json.dumps([['FR', 'US'], ['ES', 'CH']], 'Hello')['tables'],
      [{
        'column_metadata': [],
        'table_data': [],
        'table_name': 'Hello'
      }, {
        'column_metadata': [{'id': '', 'type': 'Text'}, {'id': 'Hello', 'type': 'Ref:Hello'}],
        'table_data': [['FR', 'US', 'ES', 'CH'], [1, 1, 2, 2]],
        'table_name': 'Hello_'
      }])

  def test_json_dict(self):
    self.assertEqual(
      import_json.dumps({
        'foo': [{'a': 1, 'b': 'santa'}, {'a': 4, 'b': 'cats'}],
        'bar': [{'c': 2, 'd': 'ducks'}, {'c': 5, 'd': 'dogs'}],
        'status': {'success': True, 'time': '5s'}
      }, 'Hello')['tables'], [{
        'table_name': 'Hello',
        'column_metadata': [{'id': 'status', 'type': 'Ref:Hello_status'}],
        'table_data': [[1]]
      }, {
        'table_name': 'Hello_bar',
        'column_metadata': [
          {'id': 'c', 'type': 'Int'},
          {'id': 'd', 'type': 'Text'},
          {'id': 'Hello', 'type': 'Ref:Hello'}
        ],
        'table_data': [[2, 5], ['ducks', 'dogs'], [1, 1]]
      }, {
        'table_name': 'Hello_foo',
        'column_metadata': [
          {'id': 'a', 'type': 'Int'},
          {'id': 'b', 'type': 'Text'},
          {'id': 'Hello', 'type': 'Ref:Hello'}],
        'table_data': [[1, 4], ['santa', 'cats'], [1, 1]]
      }, {
        'table_name': 'Hello_status',
        'column_metadata': [
          {'id': 'success', 'type': 'Bool'},
          {'id': 'time', 'type': 'Text'}
        ],
        'table_data': [[True], ['5s']]
      }])

  def test_json_types(self):
    self.assertEqual(import_json.dumps({
      'a': 3, 'b': 3.14, 'c': True, 'd': 'name', 'e': -4, 'f': '3.14', 'g': None
    }, 'Hello')['tables'],
      [{
        'table_name': 'Hello',
        'column_metadata': [
          {'id': 'a', 'type': 'Int'},
          {'id': 'b', 'type': 'Numeric'},
          {'id': 'c', 'type': 'Bool'},
          {'id': 'd', 'type': 'Text'},
          {'id': 'e', 'type': 'Int'},
          {'id': 'f', 'type': 'Text'},
          {'id': 'g', 'type': 'Text'}
        ],
        'table_data': [[3], [3.14], [True], ['name'], [-4], ['3.14'], [None]]
      }])

  def test_type_is_defined_with_first_value(self):
    tables = import_json.dumps([{'a': 'some text'}, {'a': 3}], '')
    self.assertIsNotNone(tables['tables'])
    self.assertIsNotNone(tables['tables'][0])
    self.assertIsNotNone(tables['tables'][0]['column_metadata'])
    self.assertIsNotNone(tables['tables'][0]['column_metadata'][0])
    self.assertEqual(tables['tables'][0]['column_metadata'][0]['type'], 'Text')

  def test_first_unique_key(self):
    self.assertEqual(import_json.first_available_key({'a': 1}, 'a'), 'a2')
    self.assertEqual(import_json.first_available_key({'a': 1}, 'b'), 'b')
    self.assertEqual(import_json.first_available_key({'a': 1, 'a2': 1}, 'a'), 'a3')


def dump_tables(options):
  data = {
    "foos": [
      {'foo': 1, 'link': [1, 2]},
      {'foo': 2, 'link': [1, 2]}
    ],
    "bar": {'hi': 'santa'}
  }
  return [t for t in import_json.dumps(data, 'FooBar', options)['tables']]


class TestParseOptions(TestCase):

  maxDiff = None

  # helpers
  def assertColInTable(self, tables, **kwargs):
    table = next(t for t in tables if t['table_name'] == kwargs['table_name'])
    self.assertEqual(any(col['id'] == kwargs['col_id'] for col in table['column_metadata']),
                     kwargs['present'])

  def assertTableNamesEqual(self, tables, expected_table_names):
    table_names = [t['table_name'] for t in tables]
    self.assertEqual(sorted(table_names), sorted(expected_table_names))

  def test_including_empty_string_includes_all(self):
    tables = dump_tables({'includes': '', 'excludes': ''})
    self.assertTableNamesEqual(tables, ['FooBar', 'FooBar_bar', 'FooBar_foos', 'FooBar_foos_link'])

  def test_including_foos_includes_nested_object_and_removes_ref_to_table_not_included(self):
    tables = dump_tables({'includes': 'FooBar_foos', 'excludes': ''})
    self.assertTableNamesEqual(tables, ['FooBar_foos', 'FooBar_foos_link'])
    self.assertColInTable(tables, table_name='FooBar_foos', col_id='FooBar', present=False)
    tables = dump_tables({'includes': 'FooBar_foos_link', 'excludes': ''})
    self.assertTableNamesEqual(tables, ['FooBar_foos_link'])
    self.assertColInTable(tables, table_name='FooBar_foos_link', col_id='FooBar_foos',
                          present=False)

  def test_excluding_foos_excludes_nested_object_and_removes_link_to_excluded_table(self):
    tables = dump_tables({'includes': '', 'excludes': 'FooBar_foos'})
    self.assertTableNamesEqual(tables, ['FooBar', 'FooBar_bar'])
    self.assertColInTable(tables, table_name='FooBar', col_id='foos', present=False)

  def test_excludes_works_on_nested_objects_that_are_included(self):
    tables = dump_tables({'includes': 'FooBar_foos', 'excludes': 'FooBar_foos_link'})
    self.assertTableNamesEqual(tables, ['FooBar_foos'])

  def test_excludes_works_on_property(self):
    tables = dump_tables({'includes': '', 'excludes': 'FooBar_foos_foo'})
    self.assertTableNamesEqual(tables, ['FooBar', 'FooBar_foos', 'FooBar_foos_link', 'FooBar_bar'])
    self.assertColInTable(tables, table_name='FooBar_foos', col_id='foo', present=False)

  def test_works_with_multiple_includes(self):
    tables = dump_tables({'includes': 'FooBar_foos_link', 'excludes': ''})
    self.assertTableNamesEqual(tables, ['FooBar_foos_link'])
    tables = dump_tables({'includes': 'FooBar_foos_link;FooBar_bar', 'excludes': ''})
    self.assertTableNamesEqual(tables, ['FooBar_bar', 'FooBar_foos_link'])

  def test_works_with_multiple_excludes(self):
    tables = dump_tables({'includes': '', 'excludes': 'FooBar_foos_link;FooBar_bar'})
    self.assertTableNamesEqual(tables, ['FooBar', 'FooBar_foos'])
@@ -0,0 +1,160 @@
# This Python file uses the following encoding: utf-8
import calendar
import datetime
import math
import os
import unittest

import import_xls


def _get_fixture(filename):
  return [os.path.join(os.path.dirname(__file__), "test/fixtures", filename), filename]


class TestImportXLS(unittest.TestCase):

  def _check_col(self, sheet, index, name, typename, values):
    self.assertEqual(sheet["column_metadata"][index]["id"], name)
    self.assertEqual(sheet["column_metadata"][index]["type"], typename)
    self.assertEqual(sheet["table_data"][index], values)

  def test_excel(self):
    parsed_file = import_xls.parse_file(*_get_fixture('test_excel.xlsx'))

    # check that column type was correctly set to int and values are properly parsed
    self.assertEqual(parsed_file[1][0]["column_metadata"][0], {"type": "Int", "id": "numbers"})
    self.assertEqual(parsed_file[1][0]["table_data"][0], [1, 2, 3, 4, 5, 6, 7, 8])

    # check that column type was correctly set to text and values are properly parsed
    self.assertEqual(parsed_file[1][0]["column_metadata"][1], {"type": "Text", "id": "letters"})
    self.assertEqual(parsed_file[1][0]["table_data"][1],
                     ["a", "b", "c", "d", "e", "f", "g", "h"])

    # booleans come through as Bool (messytables itself doesn't support bool types
    # and would classify them as ints; our own detection handles them)
    self.assertEqual(parsed_file[1][0]["column_metadata"][2], {"type": "Bool", "id": "boolean"})
    self.assertEqual(parsed_file[1][0]["table_data"][2],
                     [True, False, True, False, True, False, True, False])

    # check that column type was correctly set to text and values are properly parsed
    self.assertEqual(parsed_file[1][0]["column_metadata"][3],
                     {"type": "Text", "id": "corner-cases"})
    self.assertEqual(parsed_file[1][0]["table_data"][3],
                     # The type is detected as text, so all values should be text.
                     [u'=function()', '3.0', u'two spaces after ',
                      u' two spaces before', u'!@#$', u'€€€', u'√∫abc$$', u'line\nbreak'])

    # check that multiple tables are created when there are multiple sheets in a document
    self.assertEqual(parsed_file[1][0]["table_name"], u"Sheet1")
    self.assertEqual(parsed_file[1][1]["table_name"], u"Sheet2")
    self.assertEqual(parsed_file[1][1]["table_data"][0], ["a", "b", "c", "d"])

  def test_excel_types(self):
    parsed_file = import_xls.parse_file(*_get_fixture('test_excel_types.xlsx'))
    sheet = parsed_file[1][0]
    self._check_col(sheet, 0, "int1", "Int", [-1234123, '', ''])
    self._check_col(sheet, 1, "int2", "Int", [5, '', ''])
    self._check_col(sheet, 2, "textint", "Text", ["12345678902345689", '', ''])
    self._check_col(sheet, 3, "bigint", "Text", ["320150170634561830", '', ''])
    self._check_col(sheet, 4, "num2", "Numeric", [123456789.123456, '', ''])
    self._check_col(sheet, 5, "bignum", "Numeric", [math.exp(200), '', ''])
    self._check_col(sheet, 6, "date1", "DateTime",
                    [calendar.timegm(datetime.datetime(2015, 12, 22, 11, 59, 00).timetuple()),
                     None, None])
    self._check_col(sheet, 7, "date2", "Date",
                    [calendar.timegm(datetime.datetime(2015, 12, 20, 0, 0, 0).timetuple()),
                     None, None])
    self._check_col(sheet, 8, "datetext", "Date",
                    [calendar.timegm(datetime.date(2015, 12, 22).timetuple()), None, None])
    # TODO: these dates all have different formats
    # self._check_col(sheet, 9, "datetimetext", "DateTime",
    #                 [calendar.timegm(datetime.datetime(2015, 12, 22, 0, 0, 0).timetuple()),
    #                  calendar.timegm(datetime.datetime(2015, 12, 22, 13, 15, 0).timetuple()),
    #                  calendar.timegm(datetime.datetime(2018, 02, 27, 16, 8, 39).timetuple())])

  def test_excel_type_detection(self):
    # This test goes over the second sheet of the fixture doc, which has multiple rows that try
    # to throw off the type detection.
    parsed_file = import_xls.parse_file(*_get_fixture('test_excel_types.xlsx'))
    sheet = parsed_file[1][1]
    self._check_col(sheet, 0, "date_with_other", "DateTime",
                    [1467676800.0, 1451606400.0, 1451692800.0, 1454544000.0, 1199577600.0,
                     1467732614.0, u'n/a', 1207958400.0, 1451865600.0, 1451952000.0,
                     None, 1452038400.0, 1451549340.0, 1483214940.0, None,
                     1454544000.0, 1199577600.0, 1451692800.0, 1451549340.0, 1483214940.0])
    self._check_col(sheet, 1, "float_not_int", "Numeric",
                    [1, 2, 3, 4, 5, "", 6, 7, 8, 9, 10, 10.25, 11, 12, 13, 14, 15, 16, 17, 18])
    self._check_col(sheet, 2, "int_not_bool", "Int",
                    [0, 0, 1, 0, 1, 0, 0, 1, 0, 2, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0])
    self._check_col(sheet, 3, "float_not_bool", "Numeric",
                    [0, 0, 1, 0, 1, 0, 0, 1, 0, 0.5, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0])
    self._check_col(sheet, 4, "text_as_bool", "Bool",
                    [0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0])
    self._check_col(sheet, 5, "int_as_bool", "Bool",
                    [0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0])
    self._check_col(sheet, 6, "float_not_date", "Numeric",
                    [4.0, 6.0, 4.0, 4.0, 6.0, 4.0, '--', 6.0, 4.0, 4.0, 4.0, 4.0, 4.0, 6.0, 6.0,
                     4.0, 6.0, '3-4', 4.0, 6.5])
    self._check_col(sheet, 7, "float_not_text", "Numeric",
                    [-10.25, -8.00, -5.75, -3.50, "n/a", 1.00, " ??? ", 5.50, "", "-",
                     12.25, 0.00, "", 0.00, "--", 23.50, "NA", 28.00, 30.25, 32.50])
    self._check_col(sheet, 8, "dollar_amts", "Numeric",
                    [0.00, 0.75, 1.50, '', 3.00, 0.00, 0.75, 1.50, '--', 3.00, 1234.56, 1000,
                     1001.50, '-', 3000000.000, 0000.00, 1234.56, 1000, 1001.50, 1000.01])

  def test_excel_single_merged_cell(self):
    # An older version of xlrd had a bug where a single cell marked as 'merged' would cause an
    # exception.
    parsed_file = import_xls.parse_file(*_get_fixture('test_single_merged_cell.xlsx'))
    tables = parsed_file[1]
    self.assertEqual(tables, [{
      'table_name': u'Transaction Report',
      'column_metadata': [
        {'type': 'Text', 'id': u''},
        {'type': 'Numeric', 'id': u'Start'},
        {'type': 'Numeric', 'id': u''},
        {'type': 'Numeric', 'id': u''},
        {'type': 'Text', 'id': u'Seek no easy ways'},
      ],
      'table_data': [
        [u'SINGLE MERGED', u'The End'],
        [1637384.52, u''],
        [2444344.06, u''],
        [2444344.06, u''],
        [u'', u''],
      ],
    }])

  def test_excel_strange_dates(self):
    # TODO fails with xlrd.xldate.XLDateAmbiguous: 4.180902777777778
    # Check that we don't fail when encountering unusual dates and times (e.g. 0 or 38:00:00).
    parsed_file = import_xls.parse_file(*_get_fixture('strange_dates.xlsx'))
    tables = parsed_file[1]
    # We test non-failure, but the result is not really what we want. E.g. "1:10" and "100:20:30"
    # would be best left as text, but here become "01:10:00" (after xlrd parses the first as
    # datetime.time), and 4.18... (after xlrd fails and we resort to the numerical value).
    self.assertEqual(tables, [{
      'table_name': u'Sheet1',
      'column_metadata': [
        {'id': 'a', 'type': 'Text'},
        {'id': 'b', 'type': 'Date'},
        {'id': 'c', 'type': 'Text'},
        {'id': 'd', 'type': 'Text'},
        {'id': 'e', 'type': 'Numeric'},
        {'id': 'f', 'type': 'Int'},
        {'id': 'g', 'type': 'Date'},
        {'id': 'h', 'type': 'Date'},
        {'id': 'i', 'type': 'Bool'},
      ],
      'table_data': [
        [u'21:14:00'],
        [1568851200.0],
        [u'01:10:00'],
        [u'10:20:30'],
        [4.180902777777778],
        [20],
        [-6106060800.0],
        [205286400.0],
        [False],  # This is not great either, we should be able to distinguish 0 from FALSE.
      ],
    }])


if __name__ == '__main__':
  unittest.main()
@@ -0,0 +1,14 @@
FROM python:3.9

COPY requirements3.txt /tmp/requirements3.txt

RUN \
  pip3 install -r /tmp/requirements3.txt

RUN \
  apt-get update && \
  apt-get install -y faketime

RUN useradd --shell /bin/bash sandbox
USER sandbox
WORKDIR /
@@ -0,0 +1,3 @@
image:
	cp ../requirements3.txt .  # docker build requires files to be present in the build context.
	docker build -t grist-docker-sandbox .
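# A quick smoke test of the built image (illustrative, not part of the build;
# it checks that the packages from requirements3.txt were installed):
#   docker run --rm grist-docker-sandbox python3 -c "import messytables"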
@@ -0,0 +1,21 @@
astroid==2.5.7  # this version differs between the python 2 and 3 lists; everything else is the same
asttokens==2.0.5
backports.functools-lru-cache==1.6.4
chardet==2.3.0
enum34==1.1.10
html5lib==0.999999999
iso8601==0.1.12
json_table_schema==0.2.1
lazy_object_proxy==1.6.0
lxml==4.6.3  # used in csv plugin only?
messytables==0.15.2
python_dateutil==2.6.0
python_magic==0.4.12
roman==2.0.0
singledispatch==3.6.2
six==1.16.0
sortedcontainers==1.5.7
webencodings==0.5
wrapt==1.12.1
xlrd==1.2.0
unittest-xml-reporting==2.0.0