(core) Switch excel import parsing from messytables+xlrd to openpyxl, and ignore empty rows

Summary: Use openpyxl instead of messytables (which used xlrd internally) in import_xls.py. Skip empty rows since excel files can easily contain huge numbers of them. Drop support for xls files (which openpyxl doesn't support) in favour of the newer xlsx format. Fix some details relating to python virtualenvs and dependencies, as Jenkins was failing to find new Python dependencies. Test Plan: Mostly relying on existing tests. Updated various tests which referred to xls files instead of xlsx. Added a Python test for skipping empty rows. Reviewers: georgegevoian Reviewed By: georgegevoian Differential Revision: https://phab.getgrist.com/D3406
2026-03-02 04:09:24 +00:00 · 2022-05-06 17:26:45 +02:00
parent 0a61d740ba
commit 6c90de4d62
15 changed files with 127 additions and 135 deletions
--- a/sandbox/grist/parse_data.py
+++ b/sandbox/grist/parse_data.py
@@ -10,7 +10,6 @@ of values. All "data" lists will have the same length.
 import datetime
 import logging
 import re
-import messytables
 import moment # TODO grist internal libraries might not be available to plugins in the future.
 import six
 from six.moves import zip, xrange
@@ -59,12 +58,16 @@ class BaseConverter(object):
    raise NotImplementedError()


+numeric_types = six.integer_types + (float, complex, type(None))
+
 class NumericConverter(BaseConverter):
  """Handles the Grist Numeric type"""

  @classmethod
  def convert(cls, value):
-    if type(value) in six.integer_types + (float, complex):
+    if type(value) is bool:
+      return int(value)
+    elif type(value) in numeric_types:
      return value
    raise ValueError()

@@ -80,7 +83,7 @@ class SimpleDateTimeConverter(BaseConverter):
  def convert(cls, value):
    if type(value) is datetime.datetime:
      return value
-    elif value == "":
+    elif not value:
      return None
    raise ValueError()

@@ -103,6 +106,8 @@ class AnyConverter(BaseConverter):
  """
  @classmethod
  def convert(cls, value):
+    if value is None:
+      return u''
    return six.text_type(value)

  @classmethod
@@ -156,7 +161,7 @@ def _guess_basic_types(rows, num_columns):
  column_detectors = [ColumnDetector() for i in xrange(num_columns)]
  for row in rows:
    for cell, detector in zip(row, column_detectors):
-      detector.add_value(cell.value)
+      detector.add_value(cell)

  return [detector.get_converter() for detector in column_detectors]

@@ -194,10 +199,10 @@ class ColumnConverter(object):
    return {"type": grist_type, "data": self._all_col_values}


-def get_table_data(row_set, num_columns, num_rows=0):
-  converters = _guess_basic_types(row_set.sample, num_columns)
+def get_table_data(rows, num_columns, num_rows=0):
+  converters = _guess_basic_types(rows[:1000], num_columns)
  col_converters = [ColumnConverter(c) for c in converters]
-  for num, row in enumerate(row_set):
+  for num, row in enumerate(rows):
    if num_rows and num == num_rows:
      break

@@ -207,9 +212,9 @@ def get_table_data(row_set, num_columns, num_rows=0):
    # Make sure we have a value for every column.
    missing_values = len(converters) - len(row)
    if missing_values > 0:
-      row.extend([messytables.Cell("")] * missing_values)
+      row.extend([""] * missing_values)

    for cell, conv in zip(row, col_converters):
-      conv.convert_and_add(cell.value)
+      conv.convert_and_add(cell)

  return [conv.get_grist_column() for conv in col_converters]