(core) Handle importing xls files with invalid dimensions

Summary:
This addresses a rare bug where xlsx files with invalid dimensions
could not be imported into Grist due to how openpyxl handles
parsing them.

Test Plan: Server test.

Reviewers: alexmojaki

Reviewed By: alexmojaki

Differential Revision: https://phab.getgrist.com/D3485
This commit is contained in:
George Gevoian 2022-06-16 00:50:30 -07:00
parent 561d9696aa
commit 9b08666f96
3 changed files with 39 additions and 2 deletions

View File

@ -39,6 +39,10 @@ def parse_open_file(file_obj):
export_list = []
# A table set is a collection of tables:
for sheet in workbook:
# openpyxl fails to read xlsx files with incorrect dimensions; we reset here as a precaution.
# See https://openpyxl.readthedocs.io/en/stable/optimized.html#worksheet-dimensions.
sheet.reset_dimensions()
table_name = sheet.title
rows = [
list(row)
@ -50,7 +54,9 @@ def parse_open_file(file_obj):
sample = [
# Create messytables.Cells for the sake of messytables.headers_guess
[messytables.Cell(cell) for cell in row]
for row in rows[:1000]
# Resetting dimensions via openpyxl causes rows to not be padded. Make sure
# sample rows are padded; get_table_data will handle padding the rest.
for row in _with_padding(rows[:1000])
]
offset, headers = messytables.headers_guess(sample)
data_offset = offset + 1 # Add the header line
@ -100,3 +106,14 @@ def parse_open_file(file_obj):
parse_options = {}
return parse_options, export_list
def _with_padding(rows):
if not rows:
return []
max_width = max(len(row) for row in rows)
min_width = min(len(row) for row in rows)
if min_width == max_width:
return rows
for row in rows:
row.extend([""] * (max_width - len(row)))
return rows

View File

@ -163,7 +163,27 @@ class TestImportXLS(unittest.TestCase):
],
'table_data': [
[0, None, 1],
[None, 0, 2],
[u'', 0, 2],
],
}])
def test_invalid_dimensions(self):
    # Files whose recorded dimensions are invalid (typically written by
    # software that produces a malformed xlsx file) must still import every
    # row. Before the fix, Grist imported no rows at all from such files
    # because of the way openpyxl parses them.
    fixture_args = _get_fixture('test_invalid_dimensions.xlsx')
    tables = import_xls.parse_file(*fixture_args)[1]

    expected_columns = [
        {'id': u'A', 'type': 'Numeric'},
        {'id': u'B', 'type': 'Numeric'},
        {'id': u'C', 'type': 'Numeric'},
    ]
    expected_data = [
        [1, 2, 3],
        [4, 5, 6],
        [7, 8, 9],
    ]
    expected_sheet = {
        'table_name': 'Sheet1',
        'column_metadata': expected_columns,
        'table_data': expected_data,
    }
    self.assertEqual(tables, [expected_sheet])