gristlabs_grist-core/sandbox/grist/parse_data.py

"""
This module implements a way to detect and convert types that's better than messytables (at least
in some relevant cases).

It has a simple interface: get_table_data(row_set, num_columns, num_rows=0) which returns a list
of columns, each a dictionary with "type" and "data" fields, where "type" is a Grist type string,
and "data" is a list of values. All "data" lists will have the same length.
"""

from imports import dateguess
import datetime
import logging
import re
import messytables
import moment # TODO grist internal libraries might not be available to plugins in the future.
import dateutil.parser as date_parser
import six
from six.moves import zip, xrange

log = logging.getLogger(__name__)


# Typecheck using type(value) instead of isinstance(value, some_type) makes parsing 25% faster
# pylint:disable=unidiomatic-typecheck


# Our approach to type detection is different from that of messytables.
# We first go through each cell in a sample of rows, trying to convert it to each of the basic
# types, and keep a count of successes for each. We use the counts to decide the basic types (e.g.
# numeric vs text). Then we go through the full data set converting to the chosen basic type.
# During this process, we keep counts of suitable Grist types to consider (e.g. Int vs Numeric).
# We use those counts to produce the selected Grist type at the end.


class BaseConverter(object):
  """
  Interface shared by all converters. Subclasses supply convert() and get_grist_column(); test()
  is a ready-made probe built on top of convert().
  """

  @classmethod
  def test(cls, value):
    """Return True if convert() accepts `value`, or False if it raises any Exception."""
    try:
      cls.convert(value)
    except Exception:
      return False
    else:
      return True

  @classmethod
  def convert(cls, value):
    """Override to turn an imported value into a basic type; raise to reject the value."""
    raise NotImplementedError()

  @classmethod
  def get_grist_column(cls, values):
    """
    Override to map an array of values that convert() returned successfully to a tuple
    (grist_type_string, grist_values), with grist_values holding values suitable for that
    Grist type.
    """
    raise NotImplementedError()


class NumericConverter(BaseConverter):
  """Handles numeric values, including Grist types Numeric and Int."""

  # A number matching this is probably an identifier of some sort. Converting it to a float will
  # lose precision, so it's better not to consider it numeric.
  _unlikely_float = re.compile(r'\d{17}|^0\d')

  # Integers outside this range will be represented as floats. This is the limit for values that can
  # be stored in a JS Int32Array.
  _max_js_int = 1<<31

  # The thousands separator. It should be locale-specific, but we don't currently have a way to
  # detect locale from the data. (Also, the sandbox's locale module isn't fully functional.)
  _thousands_sep = ','

  @classmethod
  def convert(cls, value):
    """
    Returns `value` unchanged if it is already an int, float or complex; parses strings into a
    float after removing surrounding whitespace, leading '$' characters and thousands separators.
    Raises ValueError for other types, for strings that look like identifiers (see
    _unlikely_float), and for strings that float() cannot parse.
    """
    value_type = type(value)
    if value_type in six.integer_types + (float, complex):
      return value
    if value_type in (str, six.text_type):
      # Strip before the identifier check: the '^0\d' part of the pattern is anchored at the start
      # of the string, so leading whitespace would otherwise let values like " 0123" through.
      text = value.strip()
      if not cls._unlikely_float.search(text):
        return float(text.lstrip('$').replace(cls._thousands_sep, ""))
    raise ValueError()

  @classmethod
  def _is_integer(cls, value):
    """
    Returns whether `value` is an integer (or a float with an integral value) that lies in the
    range [-_max_js_int, _max_js_int).
    """
    ttype = type(value)
    # Use the same set of integer types that convert() accepts (this includes `long` on Python 2).
    if ttype in six.integer_types or (ttype == float and value.is_integer()):
      return -cls._max_js_int <= value < cls._max_js_int
    return False

  @classmethod
  def get_grist_column(cls, values):
    """
    Returns ("Int", values converted with int()) if every value passes _is_integer(); otherwise
    returns ("Numeric", values) with the values left as they are.
    """
    if all(cls._is_integer(v) for v in values):
      return ("Int", [int(v) for v in values])
    return ("Numeric", values)


class DateParserInfo(date_parser.parserinfo):
  """A dateutil parserinfo that refuses parse results having a day but no month."""

  def validate(self, res):
    """Return False for a day-without-month result; otherwise defer to the base class."""
    # Avoid this bogus combination which accepts plain numbers.
    day_without_month = res.day and not res.month
    if day_without_month:
      return False
    return super(DateParserInfo, self).validate(res)


class SimpleDateTimeConverter(BaseConverter):
  """Accepts values that are already datetime.datetime instances; yields Date or DateTime."""

  @classmethod
  def convert(cls, value):
    """Return datetimes unchanged and map "" to None; raise ValueError for anything else."""
    if type(value) is datetime.datetime:
      return value
    if value == "":
      return None
    raise ValueError()

  @classmethod
  def _is_date(cls, value):
    """Return True for None and for datetimes whose time part equals datetime.time()."""
    if value is None:
      return True
    return value.time() == datetime.time()

  @classmethod
  def get_grist_column(cls, values):
    """
    Return (grist_type, grist_values): the type is "Date" when every value passes _is_date() and
    "DateTime" otherwise; non-None values are passed through moment.dt_to_ts(), None is kept.
    """
    date_only = all(cls._is_date(v) for v in values)
    converted = [None if v is None else moment.dt_to_ts(v) for v in values]
    return ("Date" if date_only else "DateTime"), converted


class DateTimeCoverter(BaseConverter):
  """
  Parses date strings using a guessed strptime-style format. Each instance carries its own
  format, so its methods are instance methods rather than classmethods.
  """

  def __init__(self, date_format):
    self._format = date_format

  def convert(self, value):
    """
    Map "" to None and parse strings into datetimes; raise ValueError for any other type. Parsing
    uses strptime with the stored format, falling back to dateutil's parser when strptime fails
    or when the format contains timezone tags.
    """
    if value == "":
      return None
    if type(value) not in (str, six.text_type):
      raise ValueError()

    fmt = self._format
    # datetime.strptime doesn't handle %z and %Z tags in Python 2.
    if '%z' in fmt or '%Z' in fmt:
      return date_parser.parse(value)
    try:
      return datetime.datetime.strptime(value, fmt)
    except ValueError:
      return date_parser.parse(value)

  def _is_date(self, value):
    """Return True for None and for datetimes whose time part equals datetime.time()."""
    if value is None:
      return True
    return value.time() == datetime.time()

  def get_grist_column(self, values):
    """
    Return (grist_type, grist_values): the type is "Date" when every value passes _is_date() and
    "DateTime" otherwise; non-None values are passed through moment.dt_to_ts(), None is kept.
    """
    date_only = all(self._is_date(v) for v in values)
    converted = [None if v is None else moment.dt_to_ts(v) for v in values]
    return ("Date" if date_only else "DateTime"), converted


class BoolConverter(BaseConverter):
  """Recognizes a fixed set of true/false spellings and produces the Grist Bool type."""

  _true_values = (1, '1', 'true', 'yes')
  _false_values = (0, '0', 'false', 'no')

  @classmethod
  def convert(cls, value):
    """
    Return True or False when `value` is among the recognized true or false values; strings are
    compared after stripping whitespace and lowercasing. Raise ValueError otherwise.
    """
    if type(value) in (str, six.text_type):
      value = value.strip().lower()
    if value in cls._true_values:
      return True
    if value in cls._false_values:
      return False
    raise ValueError()

  @classmethod
  def get_grist_column(cls, values):
    """Return the converted values unchanged, tagged with the "Bool" type."""
    return "Bool", values


class TextConverter(BaseConverter):
  """Converter of last resort: every value is turned into a text string."""

  @classmethod
  def convert(cls, value):
    """Return `value` converted with six.text_type()."""
    text = six.text_type(value)
    return text

  @classmethod
  def get_grist_column(cls, values):
    """Return the converted values unchanged, tagged with the "Text" type."""
    return "Text", values


class ColumnDetector(object):
  """
  Collects sample values for one column through `add_value()`, counting for each basic converter
  how many values it accepts. `get_converter()` then picks the converter to use for the column:
  one of the classes in `converters`, a DateTimeCoverter instance, or TextConverter.
  """
  # Listed in order of preference; the order only matters when two converters succeed on exactly
  # the same number of values. Text is always the fallback.
  converters = [SimpleDateTimeConverter, BoolConverter, NumericConverter]

  # Fraction of non-junk values that may fail to convert: a converter is kept only if it succeeds
  # on at least (1 - _text_threshold) of the non-junk values, otherwise we fall back to text.
  _text_threshold = 0.10

  # Junk values (empty, dashes, question marks, "na", "n/a"): these are ignored when deciding
  # whether to fall back to text.
  _junk_re = re.compile(r'^\s*(|-+|\?+|n/?a)\s*$', re.I)

  def __init__(self):
    self._count_total = 0                     # Every value seen, junk included.
    self._count_nonjunk = 0                   # Values that were neither None nor junk.
    self._counts = [0 for _ in self.converters]   # Successes, parallel to `converters`.
    self._data = []                           # The non-junk values themselves.

  def add_value(self, value):
    """Record one sample value, updating the success count of each converter that accepts it."""
    self._count_total += 1
    if value is None:
      return
    if type(value) in (str, six.text_type) and self._junk_re.match(value):
      return

    self._data.append(value)
    self._count_nonjunk += 1
    for index, converter in enumerate(self.converters):
      if converter.test(value):
        self._counts[index] += 1

  def get_converter(self):
    """Return the converter judged most suitable for the values added so far."""
    if not any(self._counts):
      # Nothing looked like a datetime, bool or number, so try to guess a date pattern from the
      # string values.
      text_values = [v for v in self._data if isinstance(v, six.string_types)]
      guessed = dateguess.guess_bulk(text_values, error_rate=self._text_threshold)
      if guessed and guessed[0]:
        return DateTimeCoverter(guessed[0])

    # Highest count wins; among equal counts, the converter listed first wins.
    best_index, best_count = max(enumerate(self._counts),
                                 key=lambda pair: (pair[1], -pair[0]))
    required = self._count_nonjunk * (1 - self._text_threshold)
    if best_count > 0 and best_count >= required:
      return self.converters[best_index]
    return TextConverter


def _guess_basic_types(rows, num_columns):
  """
  Feeds the `.value` of each cell in `rows` to a per-column ColumnDetector and returns the list of
  converters chosen, one for each of the `num_columns` columns. Cells beyond `num_columns` in a
  row are ignored.
  """
  detectors = [ColumnDetector() for _ in xrange(num_columns)]
  for sample_row in rows:
    for sample_cell, column_detector in zip(sample_row, detectors):
      column_detector.add_value(sample_cell.value)

  return [column_detector.get_converter() for column_detector in detectors]


class ColumnConverter(object):
  """
  Accumulates the values of a single column, converting each with the converter given at
  construction. Values that fail to convert are kept as text. `get_grist_column()` produces the
  finished column.
  """
  def __init__(self, converter):
    self._converter = converter
    self._all_col_values = []     # One slot per value; None placeholders for converted values
    self._converted_values = []   # Just the successfully converted values, in order
    self._converted_indices = []  # Position in self._all_col_values of each converted value

  def convert_and_add(self, value):
    """Convert `value` and record it; if the converter raises, record its text form instead."""
    # For some reason, we get 'str' type rather than 'unicode' for empty strings.
    # Correct this, since all text should be unicode.
    if value == "":
      value = u""
    try:
      converted = self._converter.convert(value)
    except Exception:
      self._all_col_values.append(six.text_type(value))
    else:
      self._converted_values.append(converted)
      self._converted_indices.append(len(self._all_col_values))
      # Placeholder, filled in by get_grist_column().
      self._all_col_values.append(None)

  def get_grist_column(self):
    """
    Returns a dictionary {"type": grist_type, "data": grist_value_array}.
    """
    column = self._all_col_values
    grist_type, grist_values = self._converter.get_grist_column(self._converted_values)
    # Put each final value back at the position its original occupied in the column.
    for position, grist_value in zip(self._converted_indices, grist_values):
      column[position] = grist_value
    return {"type": grist_type, "data": column}


def get_table_data(row_set, num_columns, num_rows=0):
  """
  Detects column types from `row_set.sample`, then converts the rows of `row_set` accordingly.

  At most `num_rows` rows are processed when `num_rows` is non-zero; otherwise all rows are. Rows
  with fewer cells than there are columns are extended in place with empty cells.

  Returns a list with one dictionary {"type": grist_type, "data": values} per column.
  """
  basic_converters = _guess_basic_types(row_set.sample, num_columns)
  columns = [ColumnConverter(converter) for converter in basic_converters]
  expected_width = len(basic_converters)

  for row_index, row in enumerate(row_set):
    if num_rows and row_index == num_rows:
      break

    if row_index % 10000 == 0:
      log.info("Processing row %d", row_index)

    # Make sure we have a value for every column.
    shortfall = expected_width - len(row)
    if shortfall > 0:
      row.extend([messytables.Cell("")] * shortfall)

    for cell, column in zip(row, columns):
      column.convert_and_add(cell.value)

  return [column.get_grist_column() for column in columns]