diff --git a/sandbox/grist/imports/fixtures/test_invalid_dimensions.xlsx b/sandbox/grist/imports/fixtures/test_invalid_dimensions.xlsx new file mode 100644 index 00000000..78ce4723 Binary files /dev/null and b/sandbox/grist/imports/fixtures/test_invalid_dimensions.xlsx differ diff --git a/sandbox/grist/imports/import_xls.py b/sandbox/grist/imports/import_xls.py index 3c019c2f..ebd15150 100644 --- a/sandbox/grist/imports/import_xls.py +++ b/sandbox/grist/imports/import_xls.py @@ -39,6 +39,10 @@ def parse_open_file(file_obj): export_list = [] # A table set is a collection of tables: for sheet in workbook: + # openpyxl fails to read xlsx files with incorrect dimensions; we reset here as a precaution. + # See https://openpyxl.readthedocs.io/en/stable/optimized.html#worksheet-dimensions. + sheet.reset_dimensions() + table_name = sheet.title rows = [ list(row) @@ -50,7 +54,9 @@ def parse_open_file(file_obj): sample = [ # Create messytables.Cells for the sake of messytables.headers_guess [messytables.Cell(cell) for cell in row] - for row in rows[:1000] + # Resetting dimensions via openpyxl causes rows to not be padded. Make sure + # sample rows are padded; get_table_data will handle padding the rest. + for row in _with_padding(rows[:1000]) ] offset, headers = messytables.headers_guess(sample) data_offset = offset + 1 # Add the header line @@ -100,3 +106,14 @@ def parse_open_file(file_obj): parse_options = {} return parse_options, export_list + +def _with_padding(rows): + if not rows: + return [] + max_width = max(len(row) for row in rows) + min_width = min(len(row) for row in rows) + if min_width == max_width: + return rows + for row in rows: + row.extend([""] * (max_width - len(row))) + return rows diff --git a/sandbox/grist/imports/test_import_xls.py b/sandbox/grist/imports/test_import_xls.py index 284b9671..86d5da1d 100644 --- a/sandbox/grist/imports/test_import_xls.py +++ b/sandbox/grist/imports/test_import_xls.py @@ -163,7 +163,27 @@ class TestImportXLS(unittest.TestCase): ], 'table_data': [ [0, None, 1], - [None, 0, 2], + [u'', 0, 2], + ], + }]) + + def test_invalid_dimensions(self): + # Check that files with invalid dimensions (typically a result of software + # incorrectly writing the xlsx file) are imported correctly. Previously, Grist + # would fail to import any rows from such files due to how openpyxl parses them. + parsed_file = import_xls.parse_file(*_get_fixture('test_invalid_dimensions.xlsx')) + tables = parsed_file[1] + self.assertEqual(tables, [{ + 'table_name': 'Sheet1', + 'column_metadata': [ + {'id': u'A', 'type': 'Numeric'}, + {'id': u'B', 'type': 'Numeric'}, + {'id': u'C', 'type': 'Numeric'}, + ], + 'table_data': [ + [1, 2, 3], + [4, 5, 6], + [7, 8, 9], ], }])