mirror of
https://github.com/gristlabs/grist-core.git
synced 2024-10-27 20:44:07 +00:00
Merge pull request #289 from yohanboniface/xls-messytables
Remove messytables dependency from xlsx import
This commit is contained in:
commit
0875153596
@ -4,6 +4,7 @@ Helper functions for import plugins
|
|||||||
import itertools
|
import itertools
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
|
from collections import defaultdict
|
||||||
|
|
||||||
import six
|
import six
|
||||||
from six.moves import zip
|
from six.moves import zip
|
||||||
@ -14,6 +15,20 @@ if six.PY2:
|
|||||||
|
|
||||||
log = logging.getLogger(__name__)
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
def column_count_modal(rows):
  """Return the most frequent count of non-empty cells among the given rows.

  Each row is measured by how many of its cells are not considered empty by
  `empty()`. Rows with at most one non-empty cell are left out of the tally.
  The most common remaining count is returned and can be taken as the number
  of columns of the table. If several counts are equally common, the one
  seen first wins. Returns 0 when no row contributes to the tally.
  """
  tally = defaultdict(int)
  for row in rows:
    filled = sum(1 for cell in row if not empty(cell))
    # Rows with zero or one populated cell (blank lines, lone titles) are skipped.
    if filled > 1:
      tally[filled] += 1

  if not tally:
    return 0

  # max() keeps the first maximal key in insertion order, the same tie-breaking
  # as taking the max over the (count, frequency) items.
  return max(tally, key=tally.get)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def empty(value):
|
def empty(value):
|
||||||
""" Stringify the value and check that it has a length. """
|
""" Stringify the value and check that it has a length. """
|
||||||
@ -33,7 +48,7 @@ def capitalize(word):
|
|||||||
return word[0].capitalize() + word[1:]
|
return word[0].capitalize() + word[1:]
|
||||||
|
|
||||||
def _is_numeric(text):
|
def _is_numeric(text):
|
||||||
for t in six.integer_types + (float, complex):
|
for t in six.integer_types + (float,):
|
||||||
try:
|
try:
|
||||||
t(text)
|
t(text)
|
||||||
return True
|
return True
|
||||||
@ -54,7 +69,6 @@ def _is_header(header, data_rows):
|
|||||||
|
|
||||||
# If it's all text, see if the values in the first row repeat in other rows. That's uncommon for
|
# If it's all text, see if the values in the first row repeat in other rows. That's uncommon for
|
||||||
# a header.
|
# a header.
|
||||||
count_repeats = [0 for cell in header]
|
|
||||||
for row in data_rows:
|
for row in data_rows:
|
||||||
for cell, header_cell in zip(row, header):
|
for cell, header_cell in zip(row, header):
|
||||||
if cell and cell == header_cell:
|
if cell and cell == header_cell:
|
||||||
@ -78,8 +92,11 @@ def find_first_non_empty_row(rows):
|
|||||||
Returns (data_offset, header) of the first row with non-empty fields
|
Returns (data_offset, header) of the first row with non-empty fields
|
||||||
or (0, []) if there are no non-empty rows.
|
or (0, []) if there are no non-empty rows.
|
||||||
"""
|
"""
|
||||||
|
tolerance = 1
|
||||||
|
modal = column_count_modal(rows)
|
||||||
for i, row in enumerate(rows):
|
for i, row in enumerate(rows):
|
||||||
if _count_nonempty(row) > 0:
|
length = _count_nonempty(row)
|
||||||
|
if length >= modal - tolerance:
|
||||||
return i + 1, row
|
return i + 1, row
|
||||||
# No non-empty rows.
|
# No non-empty rows.
|
||||||
return 0, []
|
return 0, []
|
||||||
|
@ -4,7 +4,6 @@ and returns a object formatted so that it can be used by grist for a bulk add re
|
|||||||
"""
|
"""
|
||||||
import logging
|
import logging
|
||||||
|
|
||||||
import messytables
|
|
||||||
import six
|
import six
|
||||||
import openpyxl
|
import openpyxl
|
||||||
from openpyxl.utils.datetime import from_excel
|
from openpyxl.utils.datetime import from_excel
|
||||||
@ -66,15 +65,10 @@ def parse_open_file(file_obj):
|
|||||||
# `if not any(row)` would be slightly faster, but would count `0` as empty.
|
# `if not any(row)` would be slightly faster, but would count `0` as empty.
|
||||||
if not set(row) <= {None, ""}
|
if not set(row) <= {None, ""}
|
||||||
]
|
]
|
||||||
sample = [
|
# Resetting dimensions via openpyxl causes rows to not be padded. Make sure
|
||||||
# Create messytables.Cells for the sake of messytables.headers_guess
|
# sample rows are padded; get_table_data will handle padding the rest.
|
||||||
[messytables.Cell(cell) for cell in row]
|
sample = _with_padding(rows[:1000])
|
||||||
# Resetting dimensions via openpyxl causes rows to not be padded. Make sure
|
data_offset, headers = import_utils.headers_guess(sample)
|
||||||
# sample rows are padded; get_table_data will handle padding the rest.
|
|
||||||
for row in _with_padding(rows[:1000])
|
|
||||||
]
|
|
||||||
offset, headers = messytables.headers_guess(sample)
|
|
||||||
data_offset = offset + 1 # Add the header line
|
|
||||||
rows = rows[data_offset:]
|
rows = rows[data_offset:]
|
||||||
|
|
||||||
# Make sure all header values are strings.
|
# Make sure all header values are strings.
|
||||||
|
@ -13,6 +13,8 @@ def _get_fixture(filename):
|
|||||||
|
|
||||||
class TestImportXLS(unittest.TestCase):
|
class TestImportXLS(unittest.TestCase):
|
||||||
|
|
||||||
|
maxDiff = None # Display full diff if any.
|
||||||
|
|
||||||
def _check_col(self, sheet, index, name, typename, values):
|
def _check_col(self, sheet, index, name, typename, values):
|
||||||
self.assertEqual(sheet["column_metadata"][index]["id"], name)
|
self.assertEqual(sheet["column_metadata"][index]["id"], name)
|
||||||
self.assertEqual(sheet["column_metadata"][index]["type"], typename)
|
self.assertEqual(sheet["column_metadata"][index]["type"], typename)
|
||||||
@ -103,17 +105,17 @@ class TestImportXLS(unittest.TestCase):
|
|||||||
'table_name': u'Transaction Report',
|
'table_name': u'Transaction Report',
|
||||||
'column_metadata': [
|
'column_metadata': [
|
||||||
{'type': 'Any', 'id': u''},
|
{'type': 'Any', 'id': u''},
|
||||||
{'type': 'Numeric', 'id': u'Start'},
|
{'type': 'Any', 'id': u''},
|
||||||
{'type': 'Numeric', 'id': u''},
|
{'type': 'Numeric', 'id': u''},
|
||||||
{'type': 'Numeric', 'id': u''},
|
{'type': 'Numeric', 'id': u''},
|
||||||
{'type': 'Any', 'id': u'Seek no easy ways'},
|
{'type': 'Any', 'id': u''},
|
||||||
],
|
],
|
||||||
'table_data': [
|
'table_data': [
|
||||||
[u'SINGLE MERGED', u'The End'],
|
['', u'SINGLE MERGED', u'The End'],
|
||||||
[1637384.52, None],
|
['Start', '1637384.52', ''],
|
||||||
[2444344.06, None],
|
[None, 2444344.06, None],
|
||||||
[2444344.06, None],
|
[None, 2444344.06, None],
|
||||||
[u'', u''],
|
['Seek no easy ways', u'', u''],
|
||||||
],
|
],
|
||||||
}])
|
}])
|
||||||
|
|
||||||
|
@ -19,7 +19,6 @@ jdcal==1.4.1
|
|||||||
json_table_schema==0.2.1
|
json_table_schema==0.2.1
|
||||||
lazy_object_proxy==1.6.0
|
lazy_object_proxy==1.6.0
|
||||||
lxml==4.6.3 # used in csv plugin only?
|
lxml==4.6.3 # used in csv plugin only?
|
||||||
messytables==0.15.2
|
|
||||||
python_dateutil==2.8.2
|
python_dateutil==2.8.2
|
||||||
openpyxl==3.0.10
|
openpyxl==3.0.10
|
||||||
python_magic==0.4.12
|
python_magic==0.4.12
|
||||||
|
Loading…
Reference in New Issue
Block a user