diff --git a/sandbox/grist/imports/import_utils.py b/sandbox/grist/imports/import_utils.py
index 8ace2224..40bb08c7 100644
--- a/sandbox/grist/imports/import_utils.py
+++ b/sandbox/grist/imports/import_utils.py
@@ -4,6 +4,7 @@ Helper functions for import plugins
 import itertools
 import logging
 import os
+from collections import defaultdict
 
 import six
 from six.moves import zip
@@ -14,6 +15,20 @@ if six.PY2:
 
 log = logging.getLogger(__name__)
 
+def column_count_modal(rows):
+  """ Return the most common number of non-empty cells per row, counting only
+  rows with more than one non-empty cell; 0 if no row qualifies. This can be
+  assumed to be the number of columns of the table. """
+  counts = defaultdict(int)
+  for row in rows:
+    length = sum(1 for c in row if not empty(c))
+    if length > 1:
+      counts[length] += 1
+  if not counts:
+    return 0
+  return max(counts, key=counts.get)
+
+
 
 def empty(value):
   """ Stringify the value and check that it has a length. """
@@ -33,7 +48,7 @@ def capitalize(word):
   return word[0].capitalize() + word[1:]
 
 def _is_numeric(text):
-  for t in six.integer_types + (float, complex):
+  for t in six.integer_types + (float,):
     try:
       t(text)
       return True
@@ -54,7 +69,6 @@ def _is_header(header, data_rows):
 
   # If it's all text, see if the values in the first row repeat in other rows. That's uncommon for
   # a header.
-  count_repeats = [0 for cell in header]
   for row in data_rows:
     for cell, header_cell in zip(row, header):
       if cell and cell == header_cell:
@@ -78,8 +92,13 @@ def find_first_non_empty_row(rows):
   Returns (data_offset, header) of the first row with non-empty fields
   or (0, []) if there are no non-empty rows.
   """
+  tolerance = 1
+  modal = column_count_modal(rows)
+  # Never accept a blank row: when modal is 0 (no row has more than one non-empty cell),
+  # `modal - tolerance` is negative and would match every row, including empty ones.
+  threshold = max(1, modal - tolerance)
   for i, row in enumerate(rows):
-    if _count_nonempty(row) > 0:
+    if _count_nonempty(row) >= threshold:
       return i + 1, row
   # No non-empty rows.
   return 0, []
diff --git a/sandbox/grist/imports/import_xls.py b/sandbox/grist/imports/import_xls.py
index 2f3adfe3..0b525e92 100644
--- a/sandbox/grist/imports/import_xls.py
+++ b/sandbox/grist/imports/import_xls.py
@@ -4,7 +4,6 @@ and returns a object formatted so that it can be used by grist for a bulk add re
 """
 
 import logging
-import messytables
 import six
 import openpyxl
 from openpyxl.utils.datetime import from_excel
@@ -66,15 +65,10 @@ def parse_open_file(file_obj):
       # `if not any(row)` would be slightly faster, but would count `0` as empty.
       if not set(row) <= {None, ""}
     ]
-    sample = [
-      # Create messytables.Cells for the sake of messytables.headers_guess
-      [messytables.Cell(cell) for cell in row]
-      # Resetting dimensions via openpyxl causes rows to not be padded. Make sure
-      # sample rows are padded; get_table_data will handle padding the rest.
-      for row in _with_padding(rows[:1000])
-    ]
-    offset, headers = messytables.headers_guess(sample)
-    data_offset = offset + 1  # Add the header line
+    # Resetting dimensions via openpyxl causes rows to not be padded. Make sure
+    # sample rows are padded; get_table_data will handle padding the rest.
+    sample = _with_padding(rows[:1000])
+    data_offset, headers = import_utils.headers_guess(sample)
     rows = rows[data_offset:]
 
     # Make sure all header values are strings.
diff --git a/sandbox/grist/imports/import_xls_test.py b/sandbox/grist/imports/import_xls_test.py
index 1c7134cb..18c0fb47 100644
--- a/sandbox/grist/imports/import_xls_test.py
+++ b/sandbox/grist/imports/import_xls_test.py
@@ -13,6 +13,8 @@ def _get_fixture(filename):
 
 
 class TestImportXLS(unittest.TestCase):
+  maxDiff = None  # Display full diff if any.
+
   def _check_col(self, sheet, index, name, typename, values):
     self.assertEqual(sheet["column_metadata"][index]["id"], name)
     self.assertEqual(sheet["column_metadata"][index]["type"], typename)
@@ -103,17 +105,17 @@ class TestImportXLS(unittest.TestCase):
       'table_name': u'Transaction Report',
       'column_metadata': [
         {'type': 'Any', 'id': u''},
-        {'type': 'Numeric', 'id': u'Start'},
+        {'type': 'Any', 'id': u''},
         {'type': 'Numeric', 'id': u''},
         {'type': 'Numeric', 'id': u''},
-        {'type': 'Any', 'id': u'Seek no easy ways'},
+        {'type': 'Any', 'id': u''},
       ],
       'table_data': [
-        [u'SINGLE MERGED', u'The End'],
-        [1637384.52, None],
-        [2444344.06, None],
-        [2444344.06, None],
-        [u'', u''],
+        ['', u'SINGLE MERGED', u'The End'],
+        ['Start', '1637384.52', ''],
+        [None, 2444344.06, None],
+        [None, 2444344.06, None],
+        ['Seek no easy ways', u'', u''],
       ],
     }])
 
diff --git a/sandbox/requirements3.txt b/sandbox/requirements3.txt
index 7f016bfc..2088b691 100644
--- a/sandbox/requirements3.txt
+++ b/sandbox/requirements3.txt
@@ -19,7 +19,6 @@ jdcal==1.4.1
 json_table_schema==0.2.1
 lazy_object_proxy==1.6.0
 lxml==4.6.3 # used in csv plugin only?
-messytables==0.15.2
 python_dateutil==2.8.2
 openpyxl==3.0.10
 python_magic==0.4.12