mirror of
https://github.com/gristlabs/grist-core.git
synced 2024-10-27 20:44:07 +00:00
4d526da58f
Summary: Move all the plugins python code into the main folder with the core code. Register file importing functions in the same main.py entrypoint as the data engine. Remove options relating to different entrypoints and code directories. The only remaining plugin-specific option in NSandbox is the import directory/mount, i.e. where files to be parsed are placed. Test Plan: this Reviewers: paulfitz Reviewed By: paulfitz Subscribers: dsagal Differential Revision: https://phab.getgrist.com/D2965
300 lines
10 KiB
Python
300 lines
10 KiB
Python
"""
|
|
This module implements a way to detect and convert types that's better than messytables (at least
|
|
in some relevant cases).
|
|
|
|
Interface: get_table_data(row_set, num_columns, num_rows=0) returns a list of columns, each a
|
|
dictionary with "type" and "data" fields, where "type" is a Grist type string, and data is a list
|
|
of values. All "data" lists will have the same length.
|
|
"""
|
|
|
|
from imports import dateguess
|
|
import datetime
|
|
import logging
|
|
import re
|
|
import messytables
|
|
import moment # TODO grist internal libraries might not be available to plugins in the future.
|
|
import dateutil.parser as date_parser
|
|
import six
|
|
from six.moves import zip, xrange
|
|
|
|
log = logging.getLogger(__name__)
|
|
|
|
|
|
# Typechecking with type(value) instead of isinstance(value, some_type) makes parsing 25% faster
|
|
# pylint:disable=unidiomatic-typecheck
|
|
|
|
|
|
# Our approach to type detection is different from that of messytables.
|
|
# We first go through each cell in a sample of rows, trying to convert it to each of the basic
|
|
# types, and keep a count of successes for each. We use the counts to decide the basic types (e.g.
|
|
# numeric vs text). Then we go through the full data set converting to the chosen basic type.
|
|
# During this process, we keep counts of suitable Grist types to consider (e.g. Int vs Numeric).
|
|
# We use those counts to produce the selected Grist type at the end.
|
|
|
|
|
|
class BaseConverter(object):
    """
    Common interface for converters from imported values to a basic type.

    Subclasses provide `convert()` and `get_grist_column()`; `test()` is built on `convert()`.
    """

    @classmethod
    def test(cls, value):
        """Return True if `convert(value)` succeeds, False if it raises any Exception."""
        try:
            cls.convert(value)
        except Exception:
            return False
        else:
            return True

    @classmethod
    def convert(cls, value):
        """Implement to convert imported value to a basic type."""
        raise NotImplementedError()

    @classmethod
    def get_grist_column(cls, values):
        """
        Given an array of values returned successfully by convert(), return a tuple of
        (grist_type_string, grist_values), where grist_values is an array of values suitable
        for the returned grist type.
        """
        raise NotImplementedError()
|
|
|
|
|
|
class NumericConverter(BaseConverter):
    """Handles numeric values, including Grist types Numeric and Int."""

    # A number matching this is probably an identifier of some sort. Converting it to a float will
    # lose precision, so it's better not to consider it numeric.
    _unlikely_float = re.compile(r'\d{17}|^0\d')

    # Integers outside this range will be represented as floats. This is the limit for values
    # that can be stored in a JS Int32Array.
    _max_js_int = 1<<31

    # The thousands separator. It should be locale-specific, but we don't currently have a way to
    # detect locale from the data. (Also, the sandbox's locale module isn't fully functional.)
    _thousands_sep = ','

    @classmethod
    def convert(cls, value):
        """
        Return `value` as a number, or raise ValueError if it doesn't look numeric.

        Values whose exact type is an integer type, float or complex are returned unchanged.
        Strings are stripped of surrounding whitespace, then of leading '$' characters and of
        thousands separators, and parsed with float(). Strings that look like identifiers (17 or
        more consecutive digits, or a leading zero followed by a digit) are rejected.
        """
        value_type = type(value)
        if value_type in six.integer_types + (float, complex):
            return value
        if value_type in (str, six.text_type):
            # Strip before the identifier check: float() ignores surrounding whitespace, so
            # without this a value like " 0123" would slip past the `^0\d` guard and lose its
            # leading zero by being parsed as 123.0.
            text = value.strip()
            if not cls._unlikely_float.search(text):
                return float(text.lstrip('$').replace(cls._thousands_sep, ""))
        raise ValueError()

    @classmethod
    def _is_integer(cls, value):
        """Return True if `value` is an int or integral float within the signed 32-bit range."""
        ttype = type(value)
        if ttype == int or (ttype == float and value.is_integer()):
            return -cls._max_js_int <= value < cls._max_js_int
        return False

    @classmethod
    def get_grist_column(cls, values):
        """
        Return ("Int", values converted with int()) when every value passes _is_integer(),
        otherwise ("Numeric", values) with the values left as they are.
        """
        if all(cls._is_integer(v) for v in values):
            return ("Int", [int(v) for v in values])
        return ("Numeric", values)
|
|
|
|
|
|
class DateParserInfo(date_parser.parserinfo):
    """A dateutil parserinfo that rejects parse results having a day but no month."""

    def validate(self, res):
        """Return False for a day-without-month result, else defer to the base validation."""
        # A day with no month is a bogus combination which accepts plain numbers; everything
        # else is left to the parent class.
        if not res.day or res.month:
            return super(DateParserInfo, self).validate(res)
        return False
|
|
|
|
|
|
class SimpleDateTimeConverter(BaseConverter):
    """Handles Date and DateTime values which are already instances of datetime.datetime."""

    @classmethod
    def convert(cls, value):
        """
        Return `value` if its exact type is datetime.datetime, None for an empty string, and
        raise ValueError for anything else.
        """
        if type(value) is not datetime.datetime:
            if value == "":
                return None
            raise ValueError()
        return value

    @classmethod
    def _is_date(cls, value):
        """Return True for None and for datetimes whose time part is exactly midnight."""
        if value is None:
            return True
        return value.time() == datetime.time()

    @classmethod
    def get_grist_column(cls, values):
        """
        Return (grist_type, grist_values): the type is "Date" when every value passes _is_date(),
        otherwise "DateTime"; values are passed through moment.dt_to_ts(), with None kept as is.
        """
        grist_type = "Date"
        for candidate in values:
            if not cls._is_date(candidate):
                grist_type = "DateTime"
                break

        grist_values = [None if dt is None else moment.dt_to_ts(dt) for dt in values]
        return grist_type, grist_values
|
|
|
|
|
|
class DateTimeCoverter(BaseConverter):
    """
    Handles dateformats by guessed format.

    Unlike the other converters, this one is used as an instance constructed with the guessed
    strptime-style format. NOTE: the inherited classmethod test() calls cls.convert(value)
    without an instance, so it does not work for this class; call convert() on an instance.
    """

    def __init__(self, date_format):
        """Remember `date_format`, the format string used by convert()."""
        self._format = date_format

    def convert(self, value):
        """
        Parse `value` into a datetime.

        An empty string gives None. Strings are parsed with datetime.strptime() using the stored
        format, falling back to dateutil's parser if that raises ValueError; formats containing
        %z or %Z go to dateutil's parser directly. Non-string values raise ValueError.
        """
        if value == "":
            return None
        if type(value) not in (str, six.text_type):
            raise ValueError()

        fmt = self._format
        # datetime.strptime doesn't handle %z and %Z tags in Python 2.
        has_timezone_tag = '%z' in fmt or '%Z' in fmt
        if has_timezone_tag:
            return date_parser.parse(value)
        try:
            return datetime.datetime.strptime(value, fmt)
        except ValueError:
            return date_parser.parse(value)

    def _is_date(self, value):
        """Return True for None and for datetimes whose time part is exactly midnight."""
        if value is None:
            return True
        return value.time() == datetime.time()

    def get_grist_column(self, values):
        """
        Return (grist_type, grist_values): the type is "Date" when every value passes _is_date(),
        otherwise "DateTime"; values are passed through moment.dt_to_ts(), with None kept as is.
        """
        grist_type = "Date"
        for candidate in values:
            if not self._is_date(candidate):
                grist_type = "DateTime"
                break

        grist_values = [None if dt is None else moment.dt_to_ts(dt) for dt in values]
        return grist_type, grist_values
|
|
|
|
|
|
class BoolConverter(BaseConverter):
    """Handles Boolean type."""

    _true_values = (1, '1', 'true', 'yes')
    _false_values = (0, '0', 'false', 'no')

    @classmethod
    def convert(cls, value):
        """
        Return True or False if `value` is one of the recognized true/false values, comparing
        strings after stripping whitespace and lowercasing. Raise ValueError otherwise.
        """
        if type(value) in (str, six.text_type):
            normalized = value.strip().lower()
        else:
            normalized = value

        # True values are checked first, matching the order of the class attributes above.
        for result, accepted in ((True, cls._true_values), (False, cls._false_values)):
            if normalized in accepted:
                return result
        raise ValueError()

    @classmethod
    def get_grist_column(cls, values):
        """Return the values unchanged, labelled with the Grist type "Bool"."""
        column_type = "Bool"
        return (column_type, values)
|
|
|
|
|
|
class TextConverter(BaseConverter):
    """Fallback converter that converts everything to strings."""

    @classmethod
    def convert(cls, value):
        """Return `value` converted with six.text_type()."""
        as_text = six.text_type(value)
        return as_text

    @classmethod
    def get_grist_column(cls, values):
        """Return the values unchanged, labelled with the Grist type "Text"."""
        column_type = "Text"
        return (column_type, values)
|
|
|
|
|
|
class ColumnDetector(object):
    """
    ColumnDetector accepts calls to `add_value()`, and keeps track of successful conversions to
    different basic types. At the end `get_converter()` method returns the most suitable
    converter: one of the converter classes, or a DateTimeCoverter instance for a guessed format.
    """
    # Converters are listed in the order of preference, which is only used if two converters
    # succeed on the same exact number of values. Text is always a fallback.
    converters = [SimpleDateTimeConverter, BoolConverter, NumericConverter]

    # If this many non-junk values or more can't be converted, fall back to text.
    _text_threshold = 0.10

    # Junk values: these aren't counted when deciding whether to fall back to text.
    _junk_re = re.compile(r'^\s*(|-+|\?+|n/?a)\s*$', re.I)

    def __init__(self):
        """Start with no values seen and a zero success count for every converter."""
        self._data = []
        self._count_total = 0
        self._count_nonjunk = 0
        self._counts = [0] * len(self.converters)

    def add_value(self, value):
        """
        Record one cell value. None and junk strings only bump the total count; any other value
        is stored and tried against every converter, incrementing the count of each that accepts.
        """
        self._count_total += 1
        if value is None:
            return
        if type(value) in (str, six.text_type) and self._junk_re.match(value):
            return

        self._data.append(value)
        self._count_nonjunk += 1

        for index, converter in enumerate(self.converters):
            if converter.test(value):
                self._counts[index] += 1

    def get_converter(self):
        """
        Pick the converter for this column.

        If no basic converter accepted any value, try to guess a date format from the string
        values and return a DateTimeCoverter for it. Otherwise return the converter with the most
        successes (earliest in `converters` on a tie), provided it accepted at least a
        (1 - _text_threshold) share of the non-junk values; failing that, return TextConverter.
        """
        if not any(self._counts):
            # if not already guessed as int, bool or datetime then we should try to guess date
            # pattern
            text_values = [item for item in self._data if isinstance(item, six.string_types)]
            guesses = dateguess.guess_bulk(text_values, error_rate=self._text_threshold)
            if guesses:
                best_format = guesses[0]
                if best_format:
                    return DateTimeCoverter(best_format)

        # We find the max by count, and secondarily by minimum index in the converters list:
        # list.index() returns the first position holding the maximum.
        best_count = max(self._counts)
        best_index = self._counts.index(best_count)
        required = self._count_nonjunk * (1 - self._text_threshold)
        if best_count > 0 and best_count >= required:
            return self.converters[best_index]
        return TextConverter
|
|
|
|
|
|
def _guess_basic_types(rows, num_columns):
    """
    Feed the cell values of `rows` into one ColumnDetector per column and return the list of
    converters the detectors choose, in column order.

    Cells are paired with detectors positionally, so cells beyond `num_columns` are ignored and
    short rows leave the remaining detectors untouched.
    """
    detectors = []
    for _ in xrange(num_columns):
        detectors.append(ColumnDetector())

    for row in rows:
        for cell, detector in zip(row, detectors):
            detector.add_value(cell.value)

    chosen = []
    for detector in detectors:
        chosen.append(detector.get_converter())
    return chosen
|
|
|
|
|
|
class ColumnConverter(object):
    """
    ColumnConverter converts and collects values using the passed-in converter object. At the end
    `get_grist_column()` method returns a column of converted data.
    """

    def __init__(self, converter):
        """Store `converter` and start with an empty column."""
        self._converter = converter
        # One entry per added value; holds None as a placeholder for converted values until
        # get_grist_column() fills them in, and the text form of values that failed to convert.
        self._all_col_values = []
        # A list of all converted values.
        self._converted_values = []
        # Indices of the converted values into self._all_col_values.
        self._converted_indices = []

    def convert_and_add(self, value):
        """
        Convert `value` and append it to the column. If the converter raises, the value is
        stored as text instead.
        """
        # For some reason, we get 'str' type rather than 'unicode' for empty strings.
        # Correct this, since all text should be unicode.
        if value == "":
            value = u""

        try:
            converted = self._converter.convert(value)
        except Exception:
            self._all_col_values.append(six.text_type(value))
        else:
            slot = len(self._all_col_values)
            self._converted_values.append(converted)
            self._converted_indices.append(slot)
            self._all_col_values.append(None)

    def get_grist_column(self):
        """
        Returns a dictionary {"type": grist_type, "data": grist_value_array}.
        """
        result = self._converter.get_grist_column(self._converted_values)
        grist_type, grist_values = result

        # Put each final value back at the position where its placeholder was left.
        column = self._all_col_values
        for position, grist_value in zip(self._converted_indices, grist_values):
            column[position] = grist_value
        return {"type": grist_type, "data": column}
|
|
|
|
|
|
def get_table_data(row_set, num_columns, num_rows=0):
    """
    Detect column types from `row_set.sample`, then convert the rows of `row_set`.

    Returns one {"type": ..., "data": ...} dictionary per column, as produced by
    ColumnConverter.get_grist_column(). If `num_rows` is non-zero, at most that many rows are
    read. Rows with fewer cells than there are columns are extended in place with empty cells.
    """
    converters = _guess_basic_types(row_set.sample, num_columns)
    col_converters = [ColumnConverter(converter) for converter in converters]
    expected_width = len(converters)

    for row_index, row in enumerate(row_set):
        if num_rows and row_index == num_rows:
            break

        if row_index % 10000 == 0:
            log.info("Processing row %d", row_index)

        # Make sure we have a value for every column.
        shortfall = expected_width - len(row)
        if shortfall > 0:
            row.extend([messytables.Cell("")] * shortfall)

        for cell, column in zip(row, col_converters):
            column.convert_and_add(cell.value)

    return [column.get_grist_column() for column in col_converters]
|