Merge pull request #289 from yohanboniface/xls-messytables

Remove messytables dependency from xlsx import
This commit is contained in:
Alex Hall 2022-09-28 16:49:11 +02:00 committed by GitHub
commit 0875153596
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 33 additions and 21 deletions

View File

@ -4,6 +4,7 @@ Helper functions for import plugins
import itertools import itertools
import logging import logging
import os import os
from collections import defaultdict
import six import six
from six.moves import zip from six.moves import zip
@ -14,6 +15,20 @@ if six.PY2:
log = logging.getLogger(__name__) log = logging.getLogger(__name__)
def column_count_modal(rows):
    """
    Return the modal (most frequent) number of non-empty cells per row.

    This can be assumed to be the number of columns of the table.

    `rows` is an iterable of rows, each an iterable of cell values; it is
    iterated exactly once. A cell counts as non-empty when `empty(cell)` is
    false. Rows with fewer than two non-empty cells are not counted at all.

    Returns 0 when no row has at least two non-empty cells (including when
    `rows` itself is empty).
    """
    counts = defaultdict(int)
    for row in rows:
        length = sum(1 for cell in row if not empty(cell))
        # Rows with zero or one non-empty cell are skipped -- presumably so
        # that title/note lines do not skew the result (TODO confirm intent).
        if length > 1:
            counts[length] += 1
    if not counts:
        return 0
    # On a tie, max() keeps the first candidate in the dict's iteration order,
    # which matches taking max() over counts.items() keyed by the count.
    return max(counts, key=counts.get)
def empty(value): def empty(value):
""" Stringify the value and check that it has a length. """ """ Stringify the value and check that it has a length. """
@ -33,7 +48,7 @@ def capitalize(word):
return word[0].capitalize() + word[1:] return word[0].capitalize() + word[1:]
def _is_numeric(text): def _is_numeric(text):
for t in six.integer_types + (float, complex): for t in six.integer_types + (float,):
try: try:
t(text) t(text)
return True return True
@ -54,7 +69,6 @@ def _is_header(header, data_rows):
# If it's all text, see if the values in the first row repeat in other rows. That's uncommon for # If it's all text, see if the values in the first row repeat in other rows. That's uncommon for
# a header. # a header.
count_repeats = [0 for cell in header]
for row in data_rows: for row in data_rows:
for cell, header_cell in zip(row, header): for cell, header_cell in zip(row, header):
if cell and cell == header_cell: if cell and cell == header_cell:
@ -78,8 +92,11 @@ def find_first_non_empty_row(rows):
Returns (data_offset, header) of the first row with non-empty fields Returns (data_offset, header) of the first row with non-empty fields
or (0, []) if there are no non-empty rows. or (0, []) if there are no non-empty rows.
""" """
tolerance = 1
modal = column_count_modal(rows)
for i, row in enumerate(rows): for i, row in enumerate(rows):
if _count_nonempty(row) > 0: length = _count_nonempty(row)
if length >= modal - tolerance:
return i + 1, row return i + 1, row
# No non-empty rows. # No non-empty rows.
return 0, [] return 0, []

View File

@ -4,7 +4,6 @@ and returns a object formatted so that it can be used by grist for a bulk add re
""" """
import logging import logging
import messytables
import six import six
import openpyxl import openpyxl
from openpyxl.utils.datetime import from_excel from openpyxl.utils.datetime import from_excel
@ -66,15 +65,10 @@ def parse_open_file(file_obj):
# `if not any(row)` would be slightly faster, but would count `0` as empty. # `if not any(row)` would be slightly faster, but would count `0` as empty.
if not set(row) <= {None, ""} if not set(row) <= {None, ""}
] ]
sample = [ # Resetting dimensions via openpyxl causes rows to not be padded. Make sure
# Create messytables.Cells for the sake of messytables.headers_guess # sample rows are padded; get_table_data will handle padding the rest.
[messytables.Cell(cell) for cell in row] sample = _with_padding(rows[:1000])
# Resetting dimensions via openpyxl causes rows to not be padded. Make sure data_offset, headers = import_utils.headers_guess(sample)
# sample rows are padded; get_table_data will handle padding the rest.
for row in _with_padding(rows[:1000])
]
offset, headers = messytables.headers_guess(sample)
data_offset = offset + 1 # Add the header line
rows = rows[data_offset:] rows = rows[data_offset:]
# Make sure all header values are strings. # Make sure all header values are strings.

View File

@ -13,6 +13,8 @@ def _get_fixture(filename):
class TestImportXLS(unittest.TestCase): class TestImportXLS(unittest.TestCase):
maxDiff = None # Display full diff if any.
def _check_col(self, sheet, index, name, typename, values): def _check_col(self, sheet, index, name, typename, values):
self.assertEqual(sheet["column_metadata"][index]["id"], name) self.assertEqual(sheet["column_metadata"][index]["id"], name)
self.assertEqual(sheet["column_metadata"][index]["type"], typename) self.assertEqual(sheet["column_metadata"][index]["type"], typename)
@ -103,17 +105,17 @@ class TestImportXLS(unittest.TestCase):
'table_name': u'Transaction Report', 'table_name': u'Transaction Report',
'column_metadata': [ 'column_metadata': [
{'type': 'Any', 'id': u''}, {'type': 'Any', 'id': u''},
{'type': 'Numeric', 'id': u'Start'}, {'type': 'Any', 'id': u''},
{'type': 'Numeric', 'id': u''}, {'type': 'Numeric', 'id': u''},
{'type': 'Numeric', 'id': u''}, {'type': 'Numeric', 'id': u''},
{'type': 'Any', 'id': u'Seek no easy ways'}, {'type': 'Any', 'id': u''},
], ],
'table_data': [ 'table_data': [
[u'SINGLE MERGED', u'The End'], ['', u'SINGLE MERGED', u'The End'],
[1637384.52, None], ['Start', '1637384.52', ''],
[2444344.06, None], [None, 2444344.06, None],
[2444344.06, None], [None, 2444344.06, None],
[u'', u''], ['Seek no easy ways', u'', u''],
], ],
}]) }])

View File

@ -19,7 +19,6 @@ jdcal==1.4.1
json_table_schema==0.2.1 json_table_schema==0.2.1
lazy_object_proxy==1.6.0 lazy_object_proxy==1.6.0
lxml==4.6.3 # used in csv plugin only? lxml==4.6.3 # used in csv plugin only?
messytables==0.15.2
python_dateutil==2.8.2 python_dateutil==2.8.2
openpyxl==3.0.10 openpyxl==3.0.10
python_magic==0.4.12 python_magic==0.4.12