From 9b08666f96929f2fe527e858d0f4be2fc8814fe0 Mon Sep 17 00:00:00 2001 From: George Gevoian Date: Thu, 16 Jun 2022 00:50:30 -0700 Subject: [PATCH] (core) Handle importing xls files with invalid dimensions Summary: This addresses a rare bug where xls files with invalid dimensions could not be imported into Grist due to how openpyxl handles parsing them. Test Plan: Server test. Reviewers: alexmojaki Reviewed By: alexmojaki Differential Revision: https://phab.getgrist.com/D3485 --- .../fixtures/test_invalid_dimensions.xlsx | Bin 0 -> 4894 bytes sandbox/grist/imports/import_xls.py | 19 ++++++++++++++- sandbox/grist/imports/test_import_xls.py | 22 +++++++++++++++++- 3 files changed, 39 insertions(+), 2 deletions(-) create mode 100644 sandbox/grist/imports/fixtures/test_invalid_dimensions.xlsx diff --git a/sandbox/grist/imports/fixtures/test_invalid_dimensions.xlsx b/sandbox/grist/imports/fixtures/test_invalid_dimensions.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..78ce4723ebef46d117130df458affa93ed9d48bd GIT binary patch literal 4894 zcmZ`-2UJr{w@rXhL+@RB?}`X$K#(fENiU&yLzMs`U8I9FgD6F%qx2R80RibvNhBZw zQl(0f_JY3m`^o$M+?9LRx@+xoXU>_k_ny(!#Kofl00;>IO?M&sjwRvsvj6}<4hH}r z!9J~F4(>i;qUZPW)FBOA35uAFkho-cKpm~JeRXT>@DsI=u&n-5+pF2)jD8{T2tu>s zB7!9K(4B#+N;%V1jz8iMs&Oh_C1yhLiN_4^f;_VQRpwSVlL4+lV-0R{XkW>Ar~B@u zR8Qa=X7NuJdL4JHWan&(SQ=yOr|sd99+uX)XZA~bvBi5BXUtK8am;>XW>xX3N=(4K z6^_cakvWt6WRZP-6)~w6N}VBP1EXh5+Rg4_0f%sHLjFnf*R!#Aa~lk+>DM z^YU^N4fJqNFzEF}i&M1j3*lirM66tyGjL53lL6BdFMG37Rp{K}XnM9X^3O1o4n+be zC35L&Br`49_slQ|w`JD$JI{^!J8a*|>o$r;qPo`5P}wYrAx;xerkP^DsLj;C%Jg>O zz>Tq>jS;v5)R;q>WTyTe=t{lPbXdMxvDQG!?dly5Xc&DR&ResedK9-$a!9&lBcl%4 z!Il}EF$sg@SNPWyz-oz|7~}lc?zO^EkuNJalCBOoSy8V?en1?SnxL!`GPi>7JL~r) zn)VifTx&*OicJOdq{zf9jd)EH`d-E|4j0=}l%}vqz z#u5E)@W2`ZL? z9lyxiv2CwCXEp5S1zs15dc71UL!+I`@?fQw?#g`Ia?g4h^>zX41Jr|Mwc%L!Fr} zVH@R$9snTyy+(X|@3}kpoUf4eJJX&|#2H#=4DKMUIb2d-Q0w~TX8B7Qxe03J_p8U%X z&QM5!XaEE3JH9wLEN7#OU1MJf`>Yl!=prU z-%vn8<1t8LB_ZVgqJ*+cJPy)MVvLyEuB$d>Bsxq8S;#B%L}7MGRN1vKxSK^&`8hqv zN{IrLJ!}5-n(qUC-ThfQJ*7GHcNU4Ym2ass(O`+2Li8+X1^r9K{Yxxm+H|f@rCFf*Ctx_ExPSy`X%1f!;Kir*}O}=0@E3zB{exOTLS>(ms@( zTkRNhz3C!X`hFOc|4i8o+6y%R4(#L$4vfw^mdoZe%}BNSJS-UrL;=U6dYfq^ufUZ8 zESo6Qi26XPyG>x*=vN(8q#;xR!PN0MaZ({hU2E3H@SUZs9AZIDv#yTRaq3~vjY-R(=RO*+!Oe)oOnqm> zQczymJH0SzRz(i(#;fCdpv3{CeZ7$u6*KH>l)~5AQ{KxkVU@95<%JGBb7~@;)y@}M zw5gj?f0eSC%XGUvi|7ie>Cv<8p)rt6BH;EYHRlv`unSIGW-TU4+3gn!fG73?$ z9r}BYTW+xqYB%T(2>^o}jjRra5Yt3Y8naEgW$ttr&NgDOpE`vBUhp+kn#jluZ1p4g z92vM4{pe<8&^2NS5Rujtt9oh?oYBGppSJ@)_~ra7*`Y*@uDhhi#LH5-Y86LAh{0ZJ`OInw{^lKOWklbR^#Ye8wCZ%Gw5#v| zX?V1^nTe}%lWR`^@tLAmVUl4qjZ=2F7^Ga9wB7C60H?t#vO_oR(F_t40-Gm;KPQox zdZue$!)g3w!jC@&gXK8b5_)HUzEG1U;qprCk1=@d&5AdPD==jNn)W*^DF{V{TYx-T z!QT6gq64U>4E%mYc&|1i>~e7`Z7x4Kd{k{C@8%m`63naOr#AM$kT@S3RHmZKcAg1w9FTAp|GLVR{lwx|>}sRsTq z6>GHZoLZh=#J&tp{#=;(IanbCqv^?T7o$12`Wf6);aD6}3<*uQPN9z?d zOU3pL)qV;$sJMf;04W|;j)^0e*<@SZh=tyQ>DWRYnG+#q2oXmkde=;E9rQJ4X{*bS zXUE#k$v@NiYAq{3YLUaawI6rqIb)2IsT+%iI;{*!#{8)SA4?A`)b;omwp{l(tJv6a zowAs%fZALg{hrX30x692QUJ;TQ^NuNyk<>}$!46y2~O{Ij{|$RlG0ZkxxH<0R%&J# zP{SJbjt%-RB4Aatab-tqKZ_a$1rXe6YL*d`RF4o;aES=IA~Uzrln9qph1`a^1Cb(B zOZ&lw5*$;YYhyu1I|5H%U^m&@ILeC}W@ooS=naa*3+{6&9aqBN#F*a7&}OEOUvK*` z>}CtLQ(rveVA5ha3U;y;DjQy1B>LcJk%&1V_(Lh@mQ-i+FlZF3t~^+!W%{kMzg_7+ z{n&4Znxf_QP8=M)aYC!I-=$6fjhAVSSFAia0Cwxm4R=bXPS&1HBqm9)>v6bneF4JB`iZNrG9pi3I)G@x+L3J|h{THsU9KCtcFeuAU7vlc zyb?1VjPrLU6agPES70+BhxMgc%YII-KF+o<2YUlwn2V>=FWZqY)K1(X4#uBt*3pA~ z)TX3#ku%&R+6DH}S=wqjr9DKp8c}%7ZSh%eQya`cD?PKWalaAIGkTVsGmhVMM~0Pr z>4U7v|;dBL?+?Is9_n`X994iBq)f(57rW{Q5$k zhNl?s6`vqSEn(-tC-gXlT*kWe`pj2uEwg6?-0c2N-PO%jPSvet;j9cou8+B>y7&QI zuA+w71(_4_ElLmk4DXsv5W{={j~u&0dO!ZdjLYLiu$EUN`MQxN+w_{K47rJP84yiDKoBHtm^H1QQ z<+wx_bnSF#FpkfO8gvfIX92n{A9?>2x<&eTMcQ?n`c|-&2*Or{;-3|<_4Yn5NN%#u zdB=5a9MQDz=E#t6q|Y^!rRNf(IhdP4L&j$0XYY(Pey)G>7~X4n^7QF}dXRhb9Rmah z@4BT9TWJo>Y@rE?6}~MkpXsL0O`}cpm34O>=b`%{E?W95BTfDR zZE;`Shr#&*+AIfoeTbG>2j;_6xDsmIPt{h@o7j-CLSb9=Gh(YvcVS^DWI3cEvj3|) zfH_`aHXhU0ULYP(_jdI&l6a3Md_>GCBRJwrl!r zmh}P7!aP(>3GaLhSs9)HsIfRN!nYGHpS+|1Oxz3WAkFJ;OT2^c+cCx z$MQTit%hCsd8pn#zfkw9#0@0$%| zrjEZf`W3Hf#Lnmv;`UM&_cIc%!V1`SO@pRbue)p?ojJ*kcB^$C{TQ1{hMilaad1~* z=d$!}0XFYdLC1Tq=V_~UwhqE+RS1STNK+ziS2Ob;mJs!><0l0)5%Tr^s1Vawh%`s{ zgcAr?-*L#5AxPK&R@BPCGDBqwV%_1vBIS@HtM-)@HHDHa;imyyRwBMiC>%Ss&8A$M zv zE!J5&s`7YCUXDpUy>yRcv&@ng_sOT7yv^6l`(Id2U3QU+WSvm&10+G-TaJJ{^3I{$ z*KGIk3EHl1Wy|FoJW`$wUt>S9qAWf`FwL3MUqZSjNlu2Qj|J#z0&!^0`R~6RSzycY zkKaEIFfIZwZt?%1h_I!-u;IT5zPR`M0~W#h*mGw7&ra|h%7*)U!2jPPGyFngKM1IK zpn(6M@BWKp2K{!-7u@p2fnAi#KLbl6{uTYdi08#P7scjJoLsE8|2OHm7~vwF|A`O+ z{ugq;2*1cZf8gX~|A7B56J11K?9e}GKJ0PMzx1oFCJ38-0DuU4nP3Iz^*P@G{s-D? BQXv2U literal 0 HcmV?d00001 diff --git a/sandbox/grist/imports/import_xls.py b/sandbox/grist/imports/import_xls.py index 3c019c2f..ebd15150 100644 --- a/sandbox/grist/imports/import_xls.py +++ b/sandbox/grist/imports/import_xls.py @@ -39,6 +39,10 @@ def parse_open_file(file_obj): export_list = [] # A table set is a collection of tables: for sheet in workbook: + # openpyxl fails to read xlsx files with incorrect dimensions; we reset here as a precaution. + # See https://openpyxl.readthedocs.io/en/stable/optimized.html#worksheet-dimensions. + sheet.reset_dimensions() + table_name = sheet.title rows = [ list(row) @@ -50,7 +54,9 @@ def parse_open_file(file_obj): sample = [ # Create messytables.Cells for the sake of messytables.headers_guess [messytables.Cell(cell) for cell in row] - for row in rows[:1000] + # Resetting dimensions via openpyxl causes rows to not be padded. Make sure + # sample rows are padded; get_table_data will handle padding the rest. + for row in _with_padding(rows[:1000]) ] offset, headers = messytables.headers_guess(sample) data_offset = offset + 1 # Add the header line @@ -100,3 +106,14 @@ def parse_open_file(file_obj): parse_options = {} return parse_options, export_list + +def _with_padding(rows): + if not rows: + return [] + max_width = max(len(row) for row in rows) + min_width = min(len(row) for row in rows) + if min_width == max_width: + return rows + for row in rows: + row.extend([""] * (max_width - len(row))) + return rows diff --git a/sandbox/grist/imports/test_import_xls.py b/sandbox/grist/imports/test_import_xls.py index 284b9671..86d5da1d 100644 --- a/sandbox/grist/imports/test_import_xls.py +++ b/sandbox/grist/imports/test_import_xls.py @@ -163,7 +163,27 @@ class TestImportXLS(unittest.TestCase): ], 'table_data': [ [0, None, 1], - [None, 0, 2], + [u'', 0, 2], + ], + }]) + + def test_invalid_dimensions(self): + # Check that files with invalid dimensions (typically a result of software + # incorrectly writing the xlsx file) are imported correctly. Previously, Grist + # would fail to import any rows from such files due to how openpyxl parses them. + parsed_file = import_xls.parse_file(*_get_fixture('test_invalid_dimensions.xlsx')) + tables = parsed_file[1] + self.assertEqual(tables, [{ + 'table_name': 'Sheet1', + 'column_metadata': [ + {'id': u'A', 'type': 'Numeric'}, + {'id': u'B', 'type': 'Numeric'}, + {'id': u'C', 'type': 'Numeric'}, + ], + 'table_data': [ + [1, 2, 3], + [4, 5, 6], + [7, 8, 9], ], }])