mirror of
https://github.com/gristlabs/grist-core.git
synced 2026-03-02 04:09:24 +00:00
(core) Lossless imports
Summary: - Removed string parsing and some type guessing code from parse_data.py. That logic is now implicitly done by ValueGuesser by leaving the initial column type as Any. parse_data.py mostly comes into play when importing files (e.g. Excel) containing values that already have types, i.e. numbers and dates. - 0s and 1s are treated as numbers instead of booleans to keep imports lossless. - Removed dateguess.py and test_dateguess.py. - Changed what `guessDateFormat` does when multiple date formats work equally well for the given data, in order to be consistent with the old dateguess.py. - Columns containing numbers are now always imported as Numeric, never Int. - Removed `NullIfEmptyParser` because it was interfering with the new system. Its purpose was to avoid pointlessly changing a column from Any to Text when no actual data was inserted. A different solution to that problem was already added to `_ensure_column_accepts_data` in the data engine in a recent related diff. Test Plan: - Added 2 `nbrowser/Importer2` tests. - Updated various existing tests. - Extended testing of `guessDateFormat`. Added `guessDateFormats` to show how ambiguous dates are handled internally. Reviewers: georgegevoian Reviewed By: georgegevoian Differential Revision: https://phab.getgrist.com/D3302
This commit is contained in:
@@ -7,13 +7,11 @@ dictionary with "type" and "data" fields, where "type" is a Grist type string, a
|
||||
of values. All "data" lists will have the same length.
|
||||
"""
|
||||
|
||||
from imports import dateguess
|
||||
import datetime
|
||||
import logging
|
||||
import re
|
||||
import messytables
|
||||
import moment # TODO grist internal libraries might not be available to plugins in the future.
|
||||
import dateutil.parser as date_parser
|
||||
import six
|
||||
from six.moves import zip, xrange
|
||||
|
||||
@@ -25,12 +23,17 @@ log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# Our approach to type detection is different from that of messytables.
|
||||
# We first go through each cell in a sample of rows, trying to convert it to each of the basic
|
||||
# We first go through each cell in a sample of rows, checking if it's one of the basic
|
||||
# types, and keep a count of successes for each. We use the counts to decide the basic types (e.g.
|
||||
# numeric vs text). Then we go through the full data set converting to the chosen basic type.
|
||||
# During this process, we keep counts of suitable Grist types to consider (e.g. Int vs Numeric).
|
||||
# We use those counts to produce the selected Grist type at the end.
|
||||
|
||||
# Previously string values were used here for type guessing and were parsed to typed values.
|
||||
# That process now happens elsewhere, and this module only handles the case
|
||||
# where the imported data already contains actual numbers or dates.
|
||||
# This happens for Excel sheets but not CSV files.
|
||||
|
||||
|
||||
class BaseConverter(object):
|
||||
@classmethod
|
||||
@@ -57,50 +60,19 @@ class BaseConverter(object):
|
||||
|
||||
|
||||
class NumericConverter(BaseConverter):
|
||||
"""Handles numeric values, including Grist types Numeric and Int."""
|
||||
|
||||
# A number matching this is probably an identifier of some sort. Converting it to a float will
|
||||
# lose precision, so it's better not to consider it numeric.
|
||||
_unlikely_float = re.compile(r'\d{17}|^0\d')
|
||||
|
||||
# Integers outside this range will be represented as floats. This is the limit for values that can
|
||||
# be stored in a JS Int32Array.
|
||||
_max_js_int = 1<<31
|
||||
|
||||
# The thousands separator. It should be locale-specific, but we don't currently have a way to
|
||||
# detect locale from the data. (Also, the sandbox's locale module isn't fully functional.)
|
||||
_thousands_sep = ','
|
||||
"""Handles the Grist Numeric type"""
|
||||
|
||||
@classmethod
|
||||
def convert(cls, value):
|
||||
if type(value) in six.integer_types + (float, complex):
|
||||
return value
|
||||
if type(value) in (str, six.text_type) and not cls._unlikely_float.search(value):
|
||||
return float(value.strip().lstrip('$').replace(cls._thousands_sep, ""))
|
||||
raise ValueError()
|
||||
|
||||
@classmethod
|
||||
def _is_integer(cls, value):
|
||||
ttype = type(value)
|
||||
if ttype == int or (ttype == float and value.is_integer()):
|
||||
return -cls._max_js_int <= value < cls._max_js_int
|
||||
return False
|
||||
|
||||
@classmethod
|
||||
def get_grist_column(cls, values):
|
||||
if all(cls._is_integer(v) for v in values):
|
||||
return ("Int", [int(v) for v in values])
|
||||
return ("Numeric", values)
|
||||
|
||||
|
||||
class DateParserInfo(date_parser.parserinfo):
|
||||
def validate(self, res):
|
||||
# Avoid this bogus combination which accepts plain numbers.
|
||||
if res.day and not res.month:
|
||||
return False
|
||||
return super(DateParserInfo, self).validate(res)
|
||||
|
||||
|
||||
class SimpleDateTimeConverter(BaseConverter):
|
||||
"""Handles Date and DateTime values which are already instances of datetime.datetime."""
|
||||
|
||||
@@ -124,66 +96,18 @@ class SimpleDateTimeConverter(BaseConverter):
|
||||
return grist_type, grist_values
|
||||
|
||||
|
||||
class DateTimeCoverter(BaseConverter):
|
||||
"""Handles dateformats by guessed format."""
|
||||
|
||||
def __init__(self, date_format):
|
||||
self._format = date_format
|
||||
|
||||
def convert(self, value):
|
||||
if value == "":
|
||||
return None
|
||||
if type(value) in (str, six.text_type):
|
||||
# datetime.strptime doesn't handle %z and %Z tags in Python 2.
|
||||
if '%z' in self._format or '%Z' in self._format:
|
||||
return date_parser.parse(value)
|
||||
else:
|
||||
try:
|
||||
return datetime.datetime.strptime(value, self._format)
|
||||
except ValueError:
|
||||
return date_parser.parse(value)
|
||||
|
||||
raise ValueError()
|
||||
|
||||
def _is_date(self, value):
|
||||
return value is None or value.time() == datetime.time()
|
||||
|
||||
def get_grist_column(self, values):
|
||||
grist_type = "Date" if all(self._is_date(v) for v in values) else "DateTime"
|
||||
grist_values = [(v if (v is None) else moment.dt_to_ts(v))
|
||||
for v in values]
|
||||
return grist_type, grist_values
|
||||
|
||||
|
||||
class BoolConverter(BaseConverter):
|
||||
"""Handles Boolean type."""
|
||||
|
||||
_true_values = (1, '1', 'true', 'yes')
|
||||
_false_values = (0, '0', 'false', 'no')
|
||||
|
||||
@classmethod
|
||||
def convert(cls, value):
|
||||
v = value.strip().lower() if type(value) in (str, six.text_type) else value
|
||||
if v in cls._true_values:
|
||||
return True
|
||||
elif v in cls._false_values:
|
||||
return False
|
||||
raise ValueError()
|
||||
|
||||
@classmethod
|
||||
def get_grist_column(cls, values):
|
||||
return ("Bool", values)
|
||||
|
||||
|
||||
class TextConverter(BaseConverter):
|
||||
"""Fallback converter that converts everything to strings."""
|
||||
class AnyConverter(BaseConverter):
|
||||
"""
|
||||
Fallback converter that converts everything to strings.
|
||||
Type guessing and parsing of the strings will happen elsewhere.
|
||||
"""
|
||||
@classmethod
|
||||
def convert(cls, value):
|
||||
return six.text_type(value)
|
||||
|
||||
@classmethod
|
||||
def get_grist_column(cls, values):
|
||||
return ("Text", values)
|
||||
return ("Any", values)
|
||||
|
||||
|
||||
class ColumnDetector(object):
|
||||
@@ -194,7 +118,7 @@ class ColumnDetector(object):
|
||||
"""
|
||||
# Converters are listed in the order of preference, which is only used if two converters succeed
|
||||
# on exactly the same number of values. AnyConverter is always the fallback.
|
||||
converters = [SimpleDateTimeConverter, BoolConverter, NumericConverter]
|
||||
converters = [SimpleDateTimeConverter, NumericConverter]
|
||||
|
||||
# If more than this fraction of non-junk values can't be converted, fall back to AnyConverter.
|
||||
_text_threshold = 0.10
|
||||
@@ -221,19 +145,11 @@ class ColumnDetector(object):
|
||||
self._counts[i] += 1
|
||||
|
||||
def get_converter(self):
|
||||
if sum(self._counts) == 0:
|
||||
# if not already guessed as int, bool or datetime then we should try to guess date pattern
|
||||
str_data = [d for d in self._data if isinstance(d, six.string_types)]
|
||||
data_formats = dateguess.guess_bulk(str_data, error_rate=self._text_threshold)
|
||||
data_format = data_formats[0] if data_formats else None
|
||||
if data_format:
|
||||
return DateTimeCoverter(data_format)
|
||||
|
||||
# We find the max by count, and secondarily by minimum index in the converters list.
|
||||
count, neg_index = max((c, -i) for (i, c) in enumerate(self._counts))
|
||||
if count > 0 and count >= self._count_nonjunk * (1 - self._text_threshold):
|
||||
return self.converters[-neg_index]
|
||||
return TextConverter
|
||||
return AnyConverter
|
||||
|
||||
|
||||
def _guess_basic_types(rows, num_columns):
|
||||
|
||||
Reference in New Issue
Block a user