Remove messytables dependency from xlsx import

2026-03-02 04:09:24 +00:00 · 2022-09-02 12:13:28 +02:00
parent b2b0950c9c
commit 54703e2794
4 changed files with 33 additions and 21 deletions
--- a/sandbox/grist/imports/import_utils.py
+++ b/sandbox/grist/imports/import_utils.py
@@ -4,6 +4,7 @@ Helper functions for import plugins
 import itertools
 import logging
 import os
+from collections import defaultdict

 import six
 from six.moves import zip
@@ -14,6 +15,20 @@ if six.PY2:

 log = logging.getLogger(__name__)

+def column_count_modal(rows):
+  """ Return the most common number of non-empty cells per row, ignoring rows
+  with fewer than two such cells; return 0 when no row qualifies. The result
+  can be assumed to be the number of columns of the table. """
+  counts = defaultdict(int)
+  for row in rows:
+    length = sum(1 for c in row if not empty(c))  # count without a temp list
+    if length > 1:  # skip rows with fewer than two non-empty cells
+      counts[length] += 1
+  if not counts:
+    return 0
+  return max(counts.items(), key=lambda k_v: k_v[1])[0]  # most frequent length
+
+

 def empty(value):
  """ Stringify the value and check that it has a length. """
@@ -33,7 +48,7 @@ def capitalize(word):
  return word[0].capitalize() + word[1:]

 def _is_numeric(text):
-  for t in six.integer_types + (float, complex):
+  for t in six.integer_types + (float,):
    try:
      t(text)
      return True
@@ -54,7 +69,6 @@ def _is_header(header, data_rows):

  # If it's all text, see if the values in the first row repeat in other rows. That's uncommon for
  # a header.
-  count_repeats = [0 for cell in header]
  for row in data_rows:
    for cell, header_cell in zip(row, header):
      if cell and cell == header_cell:
@@ -78,8 +92,11 @@ def find_first_non_empty_row(rows):
  Returns (data_offset, header) of the first row with non-empty fields
  or (0, []) if there are no non-empty rows.
  """
+  tolerance = 1
+  modal = column_count_modal(rows)
  for i, row in enumerate(rows):
-    if _count_nonempty(row) > 0:
+    length = _count_nonempty(row)
+    if length >= modal - tolerance:
      return i + 1, row
  # No non-empty rows.
  return 0, []
--- a/sandbox/grist/imports/import_xls.py
+++ b/sandbox/grist/imports/import_xls.py
@@ -4,7 +4,6 @@ and returns a object formatted so that it can be used by grist for a bulk add re
 """
 import logging

-import messytables
 import six
 import openpyxl
 from openpyxl.utils.datetime import from_excel
@@ -66,15 +65,10 @@ def parse_open_file(file_obj):
      # `if not any(row)` would be slightly faster, but would count `0` as empty.
      if not set(row) <= {None, ""}
    ]
-    sample = [
-      # Create messytables.Cells for the sake of messytables.headers_guess
-      [messytables.Cell(cell) for cell in row]
-      # Resetting dimensions via openpyxl causes rows to not be padded. Make sure
-      # sample rows are padded; get_table_data will handle padding the rest.
-      for row in _with_padding(rows[:1000])
-    ]
-    offset, headers = messytables.headers_guess(sample)
-    data_offset = offset + 1  # Add the header line
+    # Resetting dimensions via openpyxl causes rows to not be padded. Make sure
+    # sample rows are padded; get_table_data will handle padding the rest.
+    sample = _with_padding(rows[:1000])
+    data_offset, headers = import_utils.headers_guess(sample)
    rows = rows[data_offset:]

    # Make sure all header values are strings.
--- a/sandbox/grist/imports/import_xls_test.py
+++ b/sandbox/grist/imports/import_xls_test.py
@@ -13,6 +13,8 @@ def _get_fixture(filename):

 class TestImportXLS(unittest.TestCase):

+  maxDiff = None  # Display full diff if any.
+
  def _check_col(self, sheet, index, name, typename, values):
    self.assertEqual(sheet["column_metadata"][index]["id"], name)
    self.assertEqual(sheet["column_metadata"][index]["type"], typename)
@@ -103,17 +105,17 @@ class TestImportXLS(unittest.TestCase):
      'table_name': u'Transaction Report',
      'column_metadata': [
        {'type': 'Any', 'id': u''},
-        {'type': 'Numeric', 'id': u'Start'},
+        {'type': 'Any', 'id': u''},
        {'type': 'Numeric', 'id': u''},
        {'type': 'Numeric', 'id': u''},
-        {'type': 'Any', 'id': u'Seek no easy ways'},
+        {'type': 'Any', 'id': u''},
      ],
      'table_data': [
-        [u'SINGLE MERGED', u'The End'],
-        [1637384.52, None],
-        [2444344.06, None],
-        [2444344.06, None],
-        [u'', u''],
+        ['', u'SINGLE MERGED', u'The End'],
+        ['Start', '1637384.52', ''],
+        [None, 2444344.06, None],
+        [None, 2444344.06, None],
+        ['Seek no easy ways', u'', u''],
      ],
    }])

--- a/sandbox/requirements3.txt
+++ b/sandbox/requirements3.txt
@@ -19,7 +19,6 @@ jdcal==1.4.1
 json_table_schema==0.2.1
 lazy_object_proxy==1.6.0
 lxml==4.6.3                # used in csv plugin only?
-messytables==0.15.2
 python_dateutil==2.8.2
 openpyxl==3.0.10
 python_magic==0.4.12