gristlabs_grist-core/sandbox/grist/parse_data.py

"""
This module implements a way to detect and convert types that's better than messytables (at least
in some relevant cases).

It has a simple interface: get_table_data(row_set, num_columns, num_rows=0) which returns a list
of columns, each a dictionary with "type" and "data" fields, where "type" is a Grist type string,
and "data" is a list of values. All "data" lists will have the same length.
"""

from imports import dateguess
import datetime
import logging
import re
import messytables
import moment # TODO grist internal libraries might not be available to plugins in the future.
import dateutil.parser as date_parser
import six
from six.moves import zip, xrange

log = logging.getLogger(__name__)


# Type-checking with type(value) instead of isinstance(value, some_type) makes parsing 25% faster.
# pylint:disable=unidiomatic-typecheck


# Our approach to type detection is different from that of messytables.
# We first go through each cell in a sample of rows, trying to convert it to each of the basic
# types, and keep a count of successes for each. We use the counts to decide the basic types (e.g.
# numeric vs text). Then we go through the full data set converting to the chosen basic type.
# During this process, we keep counts of suitable Grist types to consider (e.g. Int vs Numeric).
# We use those counts to produce the selected Grist type at the end.


class BaseConverter(object):
  """
  Common interface for converters: subclasses supply convert() and get_grist_column(), while
  test() reports whether convert() accepts a given value.
  """

  @classmethod
  def test(cls, value):
    """Return True if cls.convert(value) completes without raising, False otherwise."""
    try:
      cls.convert(value)
    except Exception:
      # Any failure to convert simply means the value does not fit this converter.
      return False
    return True

  @classmethod
  def convert(cls, value):
    """Override to turn an imported value into a basic type; raise if it cannot be converted."""
    raise NotImplementedError()

  @classmethod
  def get_grist_column(cls, values):
    """
    Override to turn a list of values previously returned by convert() into a tuple
    (grist_type_string, grist_values), with grist_values being a list of values appropriate for
    that Grist type.
    """
    raise NotImplementedError()


class NumericConverter(BaseConverter):
  """Converter for numeric values; produces either the Int or the Numeric Grist type."""

  # Strings matching this (17 digits in a row, or a leading zero followed by a digit) are probably
  # identifiers of some sort. Turning them into floats would lose precision, so they are not
  # treated as numeric.
  _unlikely_float = re.compile(r'\d{17}|^0\d')

  # Integers outside [-_max_js_int, _max_js_int) are represented as floats. This is the limit for
  # values that can be stored in a JS Int32Array.
  _max_js_int = 1<<31

  # The thousands separator. It should be locale-specific, but we don't currently have a way to
  # detect locale from the data. (Also, the sandbox's locale module isn't fully functional.)
  _thousands_sep = ','

  @classmethod
  def convert(cls, value):
    """
    Return numbers unchanged; parse strings as floats after trimming whitespace, removing leading
    '$' characters and dropping thousands separators. Raises ValueError for anything else.
    """
    value_type = type(value)
    if value_type in six.integer_types + (float, complex):
      return value
    if value_type not in (str, six.text_type) or cls._unlikely_float.search(value):
      raise ValueError()
    cleaned = value.strip().lstrip('$').replace(cls._thousands_sep, "")
    return float(cleaned)

  @classmethod
  def _is_integer(cls, value):
    """Return True for an int, or a float with no fractional part, that fits in the JS int range."""
    value_type = type(value)
    if value_type != int and not (value_type == float and value.is_integer()):
      return False
    return -cls._max_js_int <= value < cls._max_js_int

  @classmethod
  def get_grist_column(cls, values):
    """Return ("Int", ints) when every value is integer-like, otherwise ("Numeric", values)."""
    for v in values:
      if not cls._is_integer(v):
        return ("Numeric", values)
    return ("Int", [int(v) for v in values])


class DateParserInfo(date_parser.parserinfo):
  """A parserinfo whose validate() rejects results that have a day but no month."""

  def validate(self, res):
    # A day without a month is a bogus combination which accepts plain numbers; reject it.
    day_without_month = res.day and not res.month
    if day_without_month:
      return False
    return super(DateParserInfo, self).validate(res)


class SimpleDateTimeConverter(BaseConverter):
  """Converter for values that are already datetime.datetime instances (Date and DateTime)."""

  @classmethod
  def convert(cls, value):
    """Return datetime.datetime values unchanged and "" as None; raise ValueError otherwise."""
    if type(value) is not datetime.datetime:
      if value == "":
        return None
      raise ValueError()
    return value

  @classmethod
  def _is_date(cls, value):
    """Return True if value is None or its time-of-day part equals the default datetime.time()."""
    if value is None:
      return True
    return value.time() == datetime.time()

  @classmethod
  def get_grist_column(cls, values):
    """
    Return ("Date", values) when no value carries a time of day, else ("DateTime", values).
    Non-None values are passed through moment.dt_to_ts(); None values are kept as None.
    """
    has_time_part = any(not cls._is_date(v) for v in values)
    grist_type = "DateTime" if has_time_part else "Date"
    grist_values = [None if v is None else moment.dt_to_ts(v) for v in values]
    return grist_type, grist_values


class DateTimeCoverter(BaseConverter):
  """
  Handles date/datetime strings using a guessed date format.

  Unlike the other converters, this one is used as an instance, since it carries the guessed
  format; its methods are therefore instance methods rather than classmethods.
  """

  def __init__(self, date_format):
    # Format string handed to datetime.strptime() in convert().
    self._format = date_format

  def test(self, value):
    """
    Return True if self.convert(value) succeeds, False otherwise.

    This overrides BaseConverter.test, which is a classmethod calling cls.convert(value). Since
    convert() here is an instance method, the inherited version would always fail with a
    TypeError (swallowed by its except clause) and report False for every value.
    """
    try:
      self.convert(value)
      return True
    except Exception:
      return False

  def convert(self, value):
    """
    Convert a string to a datetime; the empty string becomes None.

    The value is parsed with the guessed format, falling back to dateutil's parser when that
    fails or when the format contains timezone tags. Raises ValueError for non-string values;
    errors from the dateutil parser propagate to the caller.
    """
    if value == "":
      return None
    if type(value) in (str, six.text_type):
      # datetime.strptime doesn't handle %z and %Z tags in Python 2.
      if '%z' in self._format or '%Z' in self._format:
        return date_parser.parse(value)
      else:
        try:
          return datetime.datetime.strptime(value, self._format)
        except ValueError:
          # The value doesn't match the guessed format; let dateutil try to make sense of it.
          return date_parser.parse(value)

    raise ValueError()

  def _is_date(self, value):
    """Return True if value is None or its time-of-day part equals the default datetime.time()."""
    return value is None or value.time() == datetime.time()

  def get_grist_column(self, values):
    """
    Return ("Date", values) when no value carries a time of day, else ("DateTime", values).
    Non-None values are passed through moment.dt_to_ts(); None values are kept as None.
    """
    grist_type = "Date" if all(self._is_date(v) for v in values) else "DateTime"
    grist_values = [(v if (v is None) else moment.dt_to_ts(v))
                    for v in values]
    return grist_type, grist_values


class BoolConverter(BaseConverter):
  """Converter for the Boolean type."""

  # Values (after trimming and lowercasing strings) recognized as True and as False.
  _true_values = (1, '1', 'true', 'yes')
  _false_values = (0, '0', 'false', 'no')

  @classmethod
  def convert(cls, value):
    """Map a recognized true/false value to a bool; raise ValueError for anything else."""
    if type(value) in (str, six.text_type):
      v = value.strip().lower()
    else:
      v = value
    for result, candidates in ((True, cls._true_values), (False, cls._false_values)):
      if v in candidates:
        return result
    raise ValueError()

  @classmethod
  def get_grist_column(cls, values):
    """Return the values unchanged, typed as "Bool"."""
    grist_type = "Bool"
    return (grist_type, values)


class TextConverter(BaseConverter):
  """Fallback converter: turns every value into a text string."""

  @classmethod
  def convert(cls, value):
    """Return value converted with six.text_type()."""
    text = six.text_type(value)
    return text

  @classmethod
  def get_grist_column(cls, values):
    """Return the values unchanged, typed as "Text"."""
    grist_type = "Text"
    return (grist_type, values)


class ColumnDetector(object):
  """
  Collects the values of one column through `add_value()`, counting how many of them each basic
  converter accepts. `get_converter()` then returns the most suitable converter: one of the
  converter classes, or a DateTimeCoverter instance when a date format could be guessed.
  """
  # Order expresses preference, and only matters when two converters accept exactly the same
  # number of values. Text is always the fallback.
  converters = [SimpleDateTimeConverter, BoolConverter, NumericConverter]

  # If more than this fraction of non-junk values can't be converted, fall back to text.
  _text_threshold = 0.10

  # Junk values (empty, dashes, question marks, "na" or "n/a"): these are ignored when deciding
  # whether to fall back to text.
  _junk_re = re.compile(r'^\s*(|-+|\?+|n/?a)\s*$', re.I)

  def __init__(self):
    self._count_total = 0     # Every value seen, junk included
    self._count_nonjunk = 0   # Values that are neither None nor junk
    self._counts = [0 for _ in self.converters]   # Per-converter count of accepted values
    self._data = []           # The non-junk values themselves, used for date format guessing

  def add_value(self, value):
    """Record one value of the column, updating the success count of every converter."""
    self._count_total += 1
    if value is None:
      return
    if type(value) in (str, six.text_type) and self._junk_re.match(value):
      return

    self._data.append(value)
    self._count_nonjunk += 1

    for index, converter in enumerate(self.converters):
      if converter.test(value):
        self._counts[index] += 1

  def get_converter(self):
    """Return the converter to use for this column, based on the values added so far."""
    counts = self._counts
    if not any(counts):
      # Nothing was recognized as datetime, bool or number, so try to guess a date pattern from
      # the string values.
      text_values = [d for d in self._data if isinstance(d, six.string_types)]
      guessed = dateguess.guess_bulk(text_values, error_rate=self._text_threshold)
      if guessed:
        first_format = guessed[0]
        if first_format:
          return DateTimeCoverter(first_format)

    # Pick the highest count, breaking ties in favor of the earliest converter in the list.
    best_index = max(range(len(counts)), key=lambda i: (counts[i], -i))
    best_count = counts[best_index]
    if best_count > 0 and best_count >= self._count_nonjunk * (1 - self._text_threshold):
      return self.converters[best_index]
    return TextConverter


def _guess_basic_types(rows, num_columns):
  """
  Feed the cell values of the given rows into one ColumnDetector per column, and return the list
  of converters the detectors settle on (one per column, in column order).
  """
  detectors = []
  for _ in xrange(num_columns):
    detectors.append(ColumnDetector())

  for row in rows:
    # zip() stops at the shorter of the two, so extra cells or missing cells are simply skipped.
    for cell, detector in zip(row, detectors):
      detector.add_value(cell.value)

  converters = []
  for detector in detectors:
    converters.append(detector.get_converter())
  return converters


class ColumnConverter(object):
  """
  Accumulates the values of one column, converting each with the given converter object. Values
  the converter rejects are kept as text. `get_grist_column()` returns the finished column.
  """
  def __init__(self, converter):
    self._converter = converter
    # One entry per added value: None as a placeholder for converted values (filled in by
    # get_grist_column), or the text form of values that failed to convert.
    self._all_col_values = []
    # The successfully converted values, in the order they were added.
    self._converted_values = []
    # For each converted value, its position in self._all_col_values.
    self._converted_indices = []

  def convert_and_add(self, value):
    """Convert value and append it to the column; on failure, append its text form instead."""
    # For some reason, we get 'str' type rather than 'unicode' for empty strings.
    # Correct this, since all text should be unicode.
    if value == "":
      value = u""
    try:
      converted = self._converter.convert(value)
    except Exception:
      self._all_col_values.append(six.text_type(value))
      return
    position = len(self._all_col_values)
    self._all_col_values.append(None)
    self._converted_indices.append(position)
    self._converted_values.append(converted)

  def get_grist_column(self):
    """
    Returns a dictionary {"type": grist_type, "data": grist_value_array}.
    """
    grist_type, grist_values = self._converter.get_grist_column(self._converted_values)
    column = self._all_col_values
    # Put each final converted value into the placeholder slot reserved for it.
    for position, converted in zip(self._converted_indices, grist_values):
      column[position] = converted
    return {"type": grist_type, "data": column}


def get_table_data(row_set, num_columns, num_rows=0):
  """
  Detect column types from row_set.sample, then convert the rows of row_set accordingly.

  Returns a list with one {"type": ..., "data": ...} dictionary per column. If num_rows is
  non-zero, at most that many rows are processed.
  """
  basic_converters = _guess_basic_types(row_set.sample, num_columns)
  column_converters = [ColumnConverter(converter) for converter in basic_converters]
  expected_width = len(basic_converters)

  for row_index, row in enumerate(row_set):
    if num_rows and row_index == num_rows:
      break

    if row_index % 10000 == 0:
      log.info("Processing row %d", row_index)

    # Pad short rows with empty cells so that every column receives a value.
    shortfall = expected_width - len(row)
    if shortfall > 0:
      row.extend([messytables.Cell("")] * shortfall)

    for cell, column_converter in zip(row, column_converters):
      column_converter.convert_and_add(cell.value)

  columns = []
  for column_converter in column_converters:
    columns.append(column_converter.get_grist_column())
  return columns
(core) support python3 in grist-core, and running engine via docker and/or gvisor Summary: * Moves essential plugins to grist-core, so that basic imports (e.g. csv) work. * Adds support for a `GRIST_SANDBOX_FLAVOR` flag that can systematically override how the data engine is run. - `GRIST_SANDBOX_FLAVOR=pynbox` is "classic" nacl-based sandbox. - `GRIST_SANDBOX_FLAVOR=docker` runs engines in individual docker containers. It requires an image specified in `sandbox/docker` (alternative images can be named with `GRIST_SANDBOX` flag - need to contain python and engine requirements). It is a simple reference implementation for sandboxing. - `GRIST_SANDBOX_FLAVOR=unsandboxed` runs whatever local version of python is specified by a `GRIST_SANDBOX` flag directly, with no sandboxing. Engine requirements must be installed, so an absolute path to a python executable in a virtualenv is easiest to manage. - `GRIST_SANDBOX_FLAVOR=gvisor` runs the data engine via gvisor's runsc. Experimental, with implementation not included in grist-core. Since gvisor runs on Linux only, this flavor supports wrapping the sandboxes in a single shared docker container. * Tweaks some recent express query parameter code to work in grist-core, which has a slightly different version of express (smoke test doesn't catch this since in Jenkins core is built within a workspace that has node_modules, and wires get crossed - in a dev environment the problem on master can be seen by doing `buildtools/build_core.sh /tmp/any_path_outside_grist`). The new sandbox options do not have tests yet, nor does this they change the behavior of grist servers today. They are there to clean up and consolidate a collection of patches I've been using that were getting cumbersome, and make it easier to run experiments. I haven't looked closely at imports beyond core. 
Test Plan: tested manually against regular grist and grist-core, including imports Reviewers: alexmojaki, dsagal Reviewed By: alexmojaki Differential Revision: https://phab.getgrist.com/D2942 2021-07-27 23:43:21 +00:00			`"""`
			`This module implements a way to detect and convert types that's better than messytables (at least`
			`in some relevant cases).`

			`It has a simple interface: get_table_data(row_set) which returns a list of columns, each a`
			`dictionary with "type" and "data" fields, where "type" is a Grist type string, and data is a list`
			`of values. All "data" lists will have the same length.`
			`"""`

(core) Move file import plugins into core/sandbox/grist Summary: Move all the plugins python code into the main folder with the core code. Register file importing functions in the same main.py entrypoint as the data engine. Remove options relating to different entrypoints and code directories. The only remaining plugin-specific option in NSandbox is the import directory/mount, i.e. where files to be parsed are placed. Test Plan: this Reviewers: paulfitz Reviewed By: paulfitz Subscribers: dsagal Differential Revision: https://phab.getgrist.com/D2965 2021-08-09 14:51:43 +00:00			`from imports import dateguess`
(core) support python3 in grist-core, and running engine via docker and/or gvisor Summary: * Moves essential plugins to grist-core, so that basic imports (e.g. csv) work. * Adds support for a `GRIST_SANDBOX_FLAVOR` flag that can systematically override how the data engine is run. - `GRIST_SANDBOX_FLAVOR=pynbox` is "classic" nacl-based sandbox. - `GRIST_SANDBOX_FLAVOR=docker` runs engines in individual docker containers. It requires an image specified in `sandbox/docker` (alternative images can be named with `GRIST_SANDBOX` flag - need to contain python and engine requirements). It is a simple reference implementation for sandboxing. - `GRIST_SANDBOX_FLAVOR=unsandboxed` runs whatever local version of python is specified by a `GRIST_SANDBOX` flag directly, with no sandboxing. Engine requirements must be installed, so an absolute path to a python executable in a virtualenv is easiest to manage. - `GRIST_SANDBOX_FLAVOR=gvisor` runs the data engine via gvisor's runsc. Experimental, with implementation not included in grist-core. Since gvisor runs on Linux only, this flavor supports wrapping the sandboxes in a single shared docker container. * Tweaks some recent express query parameter code to work in grist-core, which has a slightly different version of express (smoke test doesn't catch this since in Jenkins core is built within a workspace that has node_modules, and wires get crossed - in a dev environment the problem on master can be seen by doing `buildtools/build_core.sh /tmp/any_path_outside_grist`). The new sandbox options do not have tests yet, nor does this they change the behavior of grist servers today. They are there to clean up and consolidate a collection of patches I've been using that were getting cumbersome, and make it easier to run experiments. I haven't looked closely at imports beyond core. 
Test Plan: tested manually against regular grist and grist-core, including imports Reviewers: alexmojaki, dsagal Reviewed By: alexmojaki Differential Revision: https://phab.getgrist.com/D2942 2021-07-27 23:43:21 +00:00			`import datetime`
			`import logging`
			`import re`
			`import messytables`
			`import moment # TODO grist internal libraries might not be available to plugins in the future.`
			`import dateutil.parser as date_parser`
			`import six`
			`from six.moves import zip, xrange`

			`log = logging.getLogger(__name__)`


			`# Typecheck using type(value) instead of isinstance(value, some_type) makes parsing 25% faster`
			`# pylint:disable=unidiomatic-typecheck`


			`# Our approach to type detection is different from that of messytables.`
			`# We first go through each cell in a sample of rows, trying to convert it to each of the basic`
			`# types, and keep a count of successes for each. We use the counts to decide the basic types (e.g.`
			`# numeric vs text). Then we go through the full data set converting to the chosen basic type.`
			`# During this process, we keep counts of suitable Grist types to consider (e.g. Int vs Numeric).`
			`# We use those counts to produce the selected Grist type at the end.`


			`class BaseConverter(object):`
			`@classmethod`
			`def test(cls, value):`
			`try:`
			`cls.convert(value)`
			`return True`
			`except Exception:`
			`return False`

			`@classmethod`
			`def convert(cls, value):`
			`"""Implement to convert imported value to a basic type."""`
			`raise NotImplementedError()`

			`@classmethod`
			`def get_grist_column(cls, values):`
			`"""`
			`Given an array of values returned successfully by convert(), return a tuple of`
			`(grist_type_string, grist_values), where grist_values is an array of values suitable for the`
			`returned grist type.`
			`"""`
			`raise NotImplementedError()`


			`class NumericConverter(BaseConverter):`
			`"""Handles numeric values, including Grist types Numeric and Int."""`

			`# A number matching this is probably an identifier of some sort. Converting it to a float will`
			`# lose precision, so it's better not to consider it numeric.`
			`_unlikely_float = re.compile(r'\d{17}\|^0\d')`

			`# Integers outside this range will be represented as floats. This is the limit for values that can`
			`# be stored in a JS Int32Array.`
			`_max_js_int = 1<<31`

			`# The thousands separator. It should be locale-specific, but we don't currently have a way to`
			`# detect locale from the data. (Also, the sandbox's locale module isn't fully functional.)`
			`_thousands_sep = ','`

			`@classmethod`
			`def convert(cls, value):`
			`if type(value) in six.integer_types + (float, complex):`
			`return value`
			`if type(value) in (str, six.text_type) and not cls._unlikely_float.search(value):`
			`return float(value.strip().lstrip('$').replace(cls._thousands_sep, ""))`
			`raise ValueError()`

			`@classmethod`
			`def _is_integer(cls, value):`
			`ttype = type(value)`
			`if ttype == int or (ttype == float and value.is_integer()):`
			`return -cls._max_js_int <= value < cls._max_js_int`
			`return False`

			`@classmethod`
			`def get_grist_column(cls, values):`
			`if all(cls._is_integer(v) for v in values):`
			`return ("Int", [int(v) for v in values])`
			`return ("Numeric", values)`


			`class DateParserInfo(date_parser.parserinfo):`
			`def validate(self, res):`
			`# Avoid this bogus combination which accepts plain numbers.`
			`if res.day and not res.month:`
			`return False`
			`return super(DateParserInfo, self).validate(res)`


			`class SimpleDateTimeConverter(BaseConverter):`
			`"""Handles Date and DateTime values which are already instances of datetime.datetime."""`

			`@classmethod`
			`def convert(cls, value):`
			`if type(value) is datetime.datetime:`
			`return value`
			`elif value == "":`
			`return None`
			`raise ValueError()`

			`@classmethod`
			`def _is_date(cls, value):`
			`return value is None or value.time() == datetime.time()`

			`@classmethod`
			`def get_grist_column(cls, values):`
			`grist_type = "Date" if all(cls._is_date(v) for v in values) else "DateTime"`
			`grist_values = [(v if (v is None) else moment.dt_to_ts(v))`
			`for v in values]`
			`return grist_type, grist_values`


			`class DateTimeCoverter(BaseConverter):`
			`"""Handles dateformats by guessed format."""`

			`def __init__(self, date_format):`
			`self._format = date_format`

			`def convert(self, value):`
			`if value == "":`
			`return None`
			`if type(value) in (str, six.text_type):`
			`# datetime.strptime doesn't handle %z and %Z tags in Python 2.`
			`if '%z' in self._format or '%Z' in self._format:`
			`return date_parser.parse(value)`
			`else:`
			`try:`
			`return datetime.datetime.strptime(value, self._format)`
			`except ValueError:`
			`return date_parser.parse(value)`

			`raise ValueError()`

			`def _is_date(self, value):`
			`return value is None or value.time() == datetime.time()`

			`def get_grist_column(self, values):`
			`grist_type = "Date" if all(self._is_date(v) for v in values) else "DateTime"`
			`grist_values = [(v if (v is None) else moment.dt_to_ts(v))`
			`for v in values]`
			`return grist_type, grist_values`


			`class BoolConverter(BaseConverter):`
			`"""Handles Boolean type."""`

			`_true_values = (1, '1', 'true', 'yes')`
			`_false_values = (0, '0', 'false', 'no')`

			`@classmethod`
			`def convert(cls, value):`
			`v = value.strip().lower() if type(value) in (str, six.text_type) else value`
			`if v in cls._true_values:`
			`return True`
			`elif v in cls._false_values:`
			`return False`
			`raise ValueError()`

			`@classmethod`
			`def get_grist_column(cls, values):`
			`return ("Bool", values)`


			`class TextConverter(BaseConverter):`
			`"""Fallback converter that converts everything to strings."""`
			`@classmethod`
			`def convert(cls, value):`
			`return six.text_type(value)`

			`@classmethod`
			`def get_grist_column(cls, values):`
			`return ("Text", values)`


			`class ColumnDetector(object):`
			`"""`
			ColumnDetector accepts calls to `add_value()`, and keeps track of successful conversions to
			different basic types. At the end `get_converter()` method returns the class of the most
			`suitable converter.`
			`"""`
			`# Converters are listed in the order of preference, which is only used if two converters succeed`
			`# on the same exact number of values. Text is always a fallback.`
			`converters = [SimpleDateTimeConverter, BoolConverter, NumericConverter]`

			`# If this many non-junk values or more can't be converted, fall back to text.`
			`_text_threshold = 0.10`

			`# Junk values: these aren't counted when deciding whether to fall back to text.`
			`_junk_re = re.compile(r'^\s(\|-+\|\?+\|n/?a)\s$', re.I)`

			`def __init__(self):`
			`self._counts = [0] * len(self.converters)`
			`self._count_nonjunk = 0`
			`self._count_total = 0`
			`self._data = []`

			`def add_value(self, value):`
			`self._count_total += 1`
			`if value is None or (type(value) in (str, six.text_type) and self._junk_re.match(value)):`
			`return`

			`self._data.append(value)`

			`self._count_nonjunk += 1`
			`for i, conv in enumerate(self.converters):`
			`if conv.test(value):`
			`self._counts[i] += 1`

			`def get_converter(self):`
			`if sum(self._counts) == 0:`
			`# if not already guessed as int, bool or datetime then we should try to guess date pattern`
			`str_data = [d for d in self._data if isinstance(d, six.string_types)]`
			`data_formats = dateguess.guess_bulk(str_data, error_rate=self._text_threshold)`
			`data_format = data_formats[0] if data_formats else None`
			`if data_format:`
			`return DateTimeCoverter(data_format)`

			`# We find the max by count, and secondarily by minimum index in the converters list.`
			`count, neg_index = max((c, -i) for (i, c) in enumerate(self._counts))`
			`if count > 0 and count >= self._count_nonjunk * (1 - self._text_threshold):`
			`return self.converters[-neg_index]`
			`return TextConverter`


			`def _guess_basic_types(rows, num_columns):`
			`column_detectors = [ColumnDetector() for i in xrange(num_columns)]`
			`for row in rows:`
			`for cell, detector in zip(row, column_detectors):`
			`detector.add_value(cell.value)`

			`return [detector.get_converter() for detector in column_detectors]`


			`class ColumnConverter(object):`
			`"""`
			`ColumnConverter converts and collects values using the passed-in converter object. At the end`
			`get_grist_column()` method returns a column of converted data.
			`"""`
			`def __init__(self, converter):`
			`self._converter = converter`
			`self._all_col_values = [] # Initially this has None's for converted values`
			`self._converted_values = [] # A list of all converted values`
			`self._converted_indices = [] # Indices of the converted values into self._all_col_values`

			`def convert_and_add(self, value):`
			`# For some reason, we get 'str' type rather than 'unicode' for empty strings.`
			`# Correct this, since all text should be unicode.`
			`value = u"" if value == "" else value`
			`try:`
			`conv = self._converter.convert(value)`
			`self._converted_values.append(conv)`
			`self._converted_indices.append(len(self._all_col_values))`
			`self._all_col_values.append(None)`
			`except Exception:`
			`self._all_col_values.append(six.text_type(value))`

			`def get_grist_column(self):`
			`"""`
			`Returns a dictionary {"type": grist_type, "data": grist_value_array}.`
			`"""`
			`grist_type, grist_values = self._converter.get_grist_column(self._converted_values)`
			`for i, v in zip(self._converted_indices, grist_values):`
			`self._all_col_values[i] = v`
			`return {"type": grist_type, "data": self._all_col_values}`


			`def get_table_data(row_set, num_columns, num_rows=0):`
			`converters = _guess_basic_types(row_set.sample, num_columns)`
			`col_converters = [ColumnConverter(c) for c in converters]`
			`for num, row in enumerate(row_set):`
			`if num_rows and num == num_rows:`
			`break`

			`if num % 10000 == 0:`
			`log.info("Processing row %d", num)`

			`# Make sure we have a value for every column.`
			`missing_values = len(converters) - len(row)`
			`if missing_values > 0:`
			`row.extend([messytables.Cell("")] * missing_values)`

			`for cell, conv in zip(row, col_converters):`
			`conv.convert_and_add(cell.value)`

			`return [conv.get_grist_column() for conv in col_converters]`