From 6c90de4d62e91df0a954cd73e8b826ca275fb0d4 Mon Sep 17 00:00:00 2001 From: Alex Hall Date: Fri, 6 May 2022 17:26:45 +0200 Subject: [PATCH] (core) Switch excel import parsing from messytables+xlrd to openpyxl, and ignore empty rows Summary: Use openpyxl instead of messytables (which used xlrd internally) in import_xls.py. Skip empty rows since excel files can easily contain huge numbers of them. Drop support for xls files (which openpyxl doesn't support) in favour of the newer xlsx format. Fix some details relating to python virtualenvs and dependencies, as Jenkins was failing to find new Python dependencies. Test Plan: Mostly relying on existing tests. Updated various tests which referred to xls files instead of xlsx. Added a Python test for skipping empty rows. Reviewers: georgegevoian Reviewed By: georgegevoian Differential Revision: https://phab.getgrist.com/D3406 --- app/client/lib/uploads.ts | 2 +- app/server/lib/ActiveDocImport.ts | 4 +- app/server/lib/DocPluginManager.ts | 1 - app/server/lib/NSandbox.ts | 10 +- app/server/lib/guessExt.ts | 2 +- plugins/core/manifest.yml | 4 +- sandbox/grist/imports/__init__.py | 16 +++ .../imports/fixtures/test_empty_rows.xlsx | Bin 0 -> 5206 bytes sandbox/grist/imports/import_csv.py | 7 +- sandbox/grist/imports/import_xls.py | 128 +++++------------- sandbox/grist/imports/register.py | 3 +- sandbox/grist/imports/test_import_xls.py | 62 +++++---- sandbox/grist/parse_data.py | 23 ++-- sandbox/requirements.txt | 3 + sandbox/requirements3.txt | 3 + 15 files changed, 130 insertions(+), 138 deletions(-) create mode 100644 sandbox/grist/imports/fixtures/test_empty_rows.xlsx diff --git a/app/client/lib/uploads.ts b/app/client/lib/uploads.ts index be90a8c8..a7df3e66 100644 --- a/app/client/lib/uploads.ts +++ b/app/client/lib/uploads.ts @@ -32,7 +32,7 @@ export interface SelectFileOptions extends UploadOptions { // e.g. [".jpg", ".png"] } -export const IMPORTABLE_EXTENSIONS = [".grist", ".csv", ".tsv", ".txt", ".xls", ".xlsx", ".xlsm"]; +export const IMPORTABLE_EXTENSIONS = [".grist", ".csv", ".tsv", ".txt", ".xlsx", ".xlsm"]; /** * Shows the file-picker dialog with the given options, and uploads the selected files. If under diff --git a/app/server/lib/ActiveDocImport.ts b/app/server/lib/ActiveDocImport.ts index 0b285ff9..ff3f9d1c 100644 --- a/app/server/lib/ActiveDocImport.ts +++ b/app/server/lib/ActiveDocImport.ts @@ -271,8 +271,8 @@ export class ActiveDocImport { /** * Imports the data stored at tmpPath. * - * Currently it starts a python parser (that relies on the messytables library) as a child process - * outside the sandbox, and supports xls(x), csv, txt, and perhaps some other formats. It may + * Currently it starts a python parser as a child process + * outside the sandbox, and supports xlsx, csv, and perhaps some other formats. It may * result in the import of multiple tables, in case of e.g. Excel formats. * @param {OptDocSession} docSession: Session instance to use for importing. * @param {String} tmpPath: The path from of the original file. diff --git a/app/server/lib/DocPluginManager.ts b/app/server/lib/DocPluginManager.ts index e02933d9..bd0310f4 100644 --- a/app/server/lib/DocPluginManager.ts +++ b/app/server/lib/DocPluginManager.ts @@ -136,7 +136,6 @@ export class DocPluginManager { if (messages.length) { const extToType: Record = { '.xlsx' : 'Excel', - '.xls' : 'Excel', '.json' : 'JSON', '.csv' : 'CSV', }; diff --git a/app/server/lib/NSandbox.ts b/app/server/lib/NSandbox.ts index bb6d35b6..81fec8ea 100644 --- a/app/server/lib/NSandbox.ts +++ b/app/server/lib/NSandbox.ts @@ -559,7 +559,8 @@ function gvisor(options: ISandboxOptions): SandboxProcess { // if checkpoints are in use. const venv = path.join(process.cwd(), pythonVersion === '2' ? 'venv' : 'sandbox_venv3'); - if (fs.existsSync(venv) && !process.env.GRIST_CHECKPOINT) { + const useCheckpoint = process.env.GRIST_CHECKPOINT && !paths.importDir; + if (fs.existsSync(venv) && !useCheckpoint) { wrapperArgs.addMount(venv); wrapperArgs.push('-s', path.join(venv, 'bin', 'python')); } @@ -570,17 +571,16 @@ function gvisor(options: ISandboxOptions): SandboxProcess { // between the checkpoint and how it gets used later). // If a sandbox is being used for import, it will have a special mount we can't // deal with easily right now. Should be possible to do in future if desired. - if (options.useGristEntrypoint && pythonVersion === '3' && !paths.importDir && - process.env.GRIST_CHECKPOINT) { + if (options.useGristEntrypoint && pythonVersion === '3' && useCheckpoint) { if (process.env.GRIST_CHECKPOINT_MAKE) { const child = - spawn(command, [...wrapperArgs.get(), '--checkpoint', process.env.GRIST_CHECKPOINT, + spawn(command, [...wrapperArgs.get(), '--checkpoint', process.env.GRIST_CHECKPOINT!, `python${pythonVersion}`, '--', ...pythonArgs]); // We don't want process control for this. return {child, control: new NoProcessControl(child)}; } wrapperArgs.push('--restore'); - wrapperArgs.push(process.env.GRIST_CHECKPOINT); + wrapperArgs.push(process.env.GRIST_CHECKPOINT!); } const child = spawn(command, [...wrapperArgs.get(), `python${pythonVersion}`, '--', ...pythonArgs]); // For gvisor under ptrace, main work is done by a traced process identifiable as diff --git a/app/server/lib/guessExt.ts b/app/server/lib/guessExt.ts index 00ed9319..95afeac5 100644 --- a/app/server/lib/guessExt.ts +++ b/app/server/lib/guessExt.ts @@ -21,7 +21,7 @@ export async function guessExt(filePath: string, fileName: string, mimeType: str return mimeExt; } - if (origExt === ".csv" || origExt === ".xls") { + if (origExt === ".csv") { // File type detection doesn't work for these, and mime type can't be trusted. E.g. Windows // may report "application/vnd.ms-excel" for .csv files. See // https://github.com/ManifoldScholar/manifold/issues/2409#issuecomment-545152220 diff --git a/plugins/core/manifest.yml b/plugins/core/manifest.yml index e9e27f61..2a468b36 100644 --- a/plugins/core/manifest.yml +++ b/plugins/core/manifest.yml @@ -5,11 +5,11 @@ components: safePython: sandbox/main.py contributions: fileParsers: - - fileExtensions: ["csv"] + - fileExtensions: ["csv", "tsv", "txt"] parseFile: component: safePython name: csv_parser - - fileExtensions: ["xls", "xlsx", "tsv", "txt", "xlsm"] + - fileExtensions: ["xlsx", "xlsm"] parseFile: component: safePython name: xls_parser diff --git a/sandbox/grist/imports/__init__.py b/sandbox/grist/imports/__init__.py index e69de29b..5c61320b 100644 --- a/sandbox/grist/imports/__init__.py +++ b/sandbox/grist/imports/__init__.py @@ -0,0 +1,16 @@ +import warnings + +import six + +original_formatwarning = warnings.formatwarning + + +def formatwarning(*args, **kwargs): + """ + Fixes an error on Jenkins where byte strings (instead of unicode) + were being written to stderr due to a warning from an internal library. + """ + return six.ensure_text(original_formatwarning(*args, **kwargs)) + + +warnings.formatwarning = formatwarning diff --git a/sandbox/grist/imports/fixtures/test_empty_rows.xlsx b/sandbox/grist/imports/fixtures/test_empty_rows.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..5134bca11af43f176ac03acdbec2cfefb8855989 GIT binary patch literal 5206 zcmaJ_1z1$w)~0g^Ns*S49%+z91cn|$;-gbKhVB#)kdzQmN>Dl^q`OB#8fGXd=@3NX z4u1E)aQ*M+I?tXt&vVY&XYaM%z1F*ynj$J12@)0-77}M(z9!Nw!A0B~xmeh_a&uk( zmL#;PbN~V2yFO8DC;ao0SlRCyl;j)e`Dj`h9VkCW=5~f0e1A+#1c<5Zh;{Z4xHQd& zDLHV!fXs@9ky^TuC{4^}eg&Nam(E_?0z*0e-FJlKz>YF(E-zIO3gV;8;3u!N^M;g~*9i1NFYA0@l~_#?6Dus8FMS7L1Skp;lG;C-i2(72 zryaM4ql>MnqoXaCr@dW_MxVnhkg(~59SvsBX=p?F3iWAh9MU{tL0?+J6EfQ?FMz5PPW!yyGg64T0K|jOxM<|nMmrx&-b6Ar|3xy zav4t#ZO$L$4T+C=?~eFbm}%1s;KFOY(D`d+<^yx&-zyKrD^(rZn+1~BptS0psS_$) zq~mspgogc~1sJ@#|2zgt!2e)N5+EPj3DeH0u=~gn7FtkB|59+s%8+32T6)b)FKN40K|jalt&GM}LeS z_Kd->TAHi2egxbN7+TbeiOA`_BAu=Y@m@*W&Rny~#_@}yC|N&P^>>-HQa6^x&4{&J zhIwF`J~bJ%Tog5>IoQgk)*NI;7Gn>qSdS7SQOTqV+WbhyztXbNyIm|H_7!8(Fq~6M zWEc0t79^NN1c#kRA$mw!t#yt(US`Xra6V&|n-K7o$S3nA8ryT*?P{Ttsr88a7%!59&t!fW4i z=Q#!5Q^nY@Paj=KQ07q$NGPV-4E9%$gXd&Bt|0eshiUulFxI#y`Sx#KQuDSv?c5~4 z#>@*(-t7})BqRsaKVb&%Z_KztyzDGouMxAYW1u<bD^3Iz9GFf>-NK7AF zB^f&wHn$#$E@5Ur0)&Q*ae?N}vZR;IKsqgPD@| z7&Y_=akHVQ8a5#CvV1V+GB5)p7#O)M&(-ypP6*M*Br-b);Xy|&N$Un01Gh&$xcnd) z^Z~m^iV8hOlF9;9>c%{U>TI4h!mvn3^a-MXcD$c&X-Llyyx$*UhRKLNf&qh$ZOS%& zAq!rF4u5M)j&z14KZ_P4lw+!RWh49?iT_YmOS*)~t~urX;blUmEjXr|MNC?(_0tKc zVA$>=<~t#XqA!{8ye4BY+DXP4(@$Ns5`8&hY%sXo(rfWaQGaWsP7#`tUabBMy9cq3 zcyZkbPJC*JS>s8sW-GkRZx_q}KE z)OZz-SN}~(NZS8h_Jc~X9vdkxJTt6g7;hZohN3|M};EBILig~sB|Achy{V&6OCGAHanumFxpH0VMN#!)BoE+#Un91t!KE8W{?jKwR3^P};Q?CZ8`S%$ViK$PajN zU2O-d9`HQFN9RT5kcwxKF#sycHN_~s4{{E8A2;$KmPNfS6vsA{_u?L4kSD&K>75pG z{{-DV#;V8gl^j0K8IMag(bDNGko}LN2Zj?__>MyU;ZGwpbK3^-!5sH&J{S4YHU)sK z;+w;D+K&9Rnwb_7qGqvJw?fwLLWv3db3>HmanE>NHWj<`kPnjWaBa(tK)babo-*MR z)I*7d^`feEbl3*7sCd&HqO@5CfaU#-KFHFN?-U&6`@0iipzuygXJx`*^;_6!0ogg| z0bS9!mRV!h1(M_&!0LLy6Q2dlk_#x+>~Ci`V?}@|=5@Ombr=0w*u}s$uoA3Iu`Yw|K&u}w?(z88_iFG+Tg`Ms&Qdfr>r?nQ zy@K!ZiQT)0i6hlN>0<<~6~54VxkwZcAW`IOXXZN8YBJ$f5gqA?2B!=jVHCiHw~wC1 z&NpJCVpfcb6ZABF8jva})Y~i~(=3KBSCv1u=O^Z47EH)LS&Ir73frF0k<@Rf!0gbOolEf{F1%iPhK7oR^%crF13jnB$5)2A zfiF06#aX}Y6Y%LDMjcAHKTi=;jXU(_gu=^n&dc{&;J!t}{&l$@?SgIfYHD?W$3Hpk zp3QE~Y%GT}T&;;l;sfz3m=EEEnan{~zRM<*#f+Pi3OzgU<{vHB@nJucO?w8B>_l+> z6d&&W9UuOtleekT-_$Z*+4dK;ydZsY(xrfD7A@2iEmd}LirlTfG~6kWF!AxqpNmo- z$6{+Zad`5uS^jvFFJ?2MQwD(XAqOq2t;-#&Dd=KndyztWCJnSJCnt0w3@JrJLc5WWbBl{m;#lA}1r=DWyFAI4C7Vy9xH?pMJ^~QC?o-I-H z-#TYJJg7~REh&S|`=VT1Ms@Fp_b6iDOn;=0B>$!Y)+R0%<{A)}XAV}s7=d4`GEO^? zFzh7j{vhm~J3Tl7Lhp$pCaKgTFsx+5%<1iuOSkivIPAn<^8_nV=V8UmJd>y6ba8)e ziQ(PjS{wV06kL@LQ;ApWAf^zhQXkY!dQcJmaIHQ$VLqbGh14oWO1WlAL#L01bXiJt`_BIjJ1MaWC?>Z+ibE;wYJ)ck8-zuG)KBr@IB$(OLTYZ2d zy%Lnizg#&uGT^=VInIhBK3el*C>3?vTQBEC_2f~#j;c~p>wz{;sC;_T2P*p#E!OUE z2Pr~gp)knGyNO_(luaUb=@xlxA^Z+~j#i>49Bn1pWl!WFmFAFY%|`}q7}2JMl7ZD2 za#Eq6^>Fo4i^xS*rCIC5*y`A*d)57bz9a(5#C!@YZWnT=_CKmOpB&fQy@^6$tJGj% z9hc+)ex}Its}J&0rlFa{)$BB(`nlNoa(BWxNyTyl@;z(O=tRFmywq)^a;>uq02u#E zx3(ke9J#LXe&t*}Cb#4ncaaK@^fnv2>DDNxUq{e9ZDtRt-MV~6d4OE{r>fXxK4&ZM zhoi>BS`+1K+i>=T()H9DgV*ytENZha`IxzC=Cp@9WKP#4$l%w0S~f4EdWK-8vwwsm z;cY*eI5}OrDKoyop&dvPej+Yw?#3j5tpIR|X4A@yo1vGTk28A{Skzf#8nUo$D~WCi zg*cpDx|$3u53^w2ZK`?qWGv||T68t&wS7$U>zCNFkp%4zM)XnnMS|oMFWZ}+d@}~z z^CiOWp0p+p);HezijT24rXx@>I_*qIAHGaBTRKJrE14rOMuB3nKBPjui*B>zAPHgO z!e7)atUM5XV) zbkmzktEXuNC93C=s#da1h@YOKQ|uL|D!Yzik3-RiBKnJ>YpRnfIb9VR8dHm+NGL2$ z)i;Y9n#F@vUnD(l;5%Qt+YHaRLcX@9bZomkA;OwGdEX=qx~;~LMsqw zn61Nfk@nG5cR8}vr-j>)?zO#nW^tQHwpJ{&XEc$Tl52K9RTBrzs4d^?*vhVn zKt<^?7vT?$f`Xw!+Ch2vwy12HIQC4(Cbo*E5A;WgdcS)5%BE`5B&qtXNRDmc<9zzy z=hA}RQ{oVqyHd0L0xX=<{UunPwx3a3fdwkP?Hy>_MKG6d1a*=W5Gy6>HaIUe*7TyI zGgl(}+;C?X;}g9yy>V8%n|>R8n#MQU_W-;Ver9^l1yKXg>vUi12H!F7$e(3RGRov( z#;yGNeMoIkowsvZNQM)2x)piS)FU4L`XQ8hP4pKs2N(vd1#MpzJ^`mMw9yYlTS;kvlJ5q$(VKq%joy1%R5CJ@(U#Eo!Z{O7y>uc-KamfOkp znrz&N5`vf@7Vr<$ 0: - row.extend([messytables.Cell("")] * missing_values) + row.extend([""] * missing_values) for cell, conv in zip(row, col_converters): - conv.convert_and_add(cell.value) + conv.convert_and_add(cell) return [conv.get_grist_column() for conv in col_converters] diff --git a/sandbox/requirements.txt b/sandbox/requirements.txt index fc206ca0..492eb4c7 100644 --- a/sandbox/requirements.txt +++ b/sandbox/requirements.txt @@ -3,12 +3,15 @@ asttokens==2.0.5 backports.functools-lru-cache==1.6.4 chardet==4.0.0 enum34==1.1.10 +et-xmlfile==1.0.1 html5lib==0.999999999 iso8601==0.1.12 +jdcal==1.4.1 json_table_schema==0.2.1 lazy_object_proxy==1.6.0 lxml==4.6.3 # used in csv plugin only? messytables==0.15.2 +openpyxl==2.6.4 python_dateutil==2.6.0 python_magic==0.4.12 roman==2.0.0 diff --git a/sandbox/requirements3.txt b/sandbox/requirements3.txt index 0c2448c8..b673a446 100644 --- a/sandbox/requirements3.txt +++ b/sandbox/requirements3.txt @@ -3,12 +3,15 @@ asttokens==2.0.5 backports.functools-lru-cache==1.6.4 chardet==4.0.0 enum34==1.1.10 +et-xmlfile==1.0.1 html5lib==0.999999999 iso8601==0.1.12 +jdcal==1.4.1 json_table_schema==0.2.1 lazy_object_proxy==1.6.0 lxml==4.6.3 # used in csv plugin only? messytables==0.15.2 +openpyxl==2.6.4 python_dateutil==2.6.0 python_magic==0.4.12 roman==2.0.0