gristlabs_grist-core/app/server/lib/guessExt.ts
Alex Hall 6c90de4d62 (core) Switch excel import parsing from messytables+xlrd to openpyxl, and ignore empty rows
Summary:
Use openpyxl instead of messytables (which used xlrd internally) in import_xls.py.

Skip empty rows since excel files can easily contain huge numbers of them.

Drop support for xls files (which openpyxl doesn't support) in favour of the newer xlsx format.

Fix some details relating to python virtualenvs and dependencies, as Jenkins was failing to find new Python dependencies.

Test Plan: Mostly relying on existing tests. Updated various tests which referred to xls files instead of xlsx. Added a Python test for skipping empty rows.

Reviewers: georgegevoian

Reviewed By: georgegevoian

Differential Revision: https://phab.getgrist.com/D3406
2022-05-12 14:43:21 +02:00

52 lines
2.0 KiB
TypeScript

import {fromFile} from 'file-type';
import {extension, lookup} from 'mime-types';
import * as path from 'path';
/**
 * Picks the most plausible file extension for an upload by weighing three hints: the extension
 * on the user-supplied file name, the MIME type reported alongside it (by the browser upload or
 * some API), and the file's contents.
 *
 * The result decides which import parser runs, and which extension is shown when the file is
 * offered back to the user as an attachment.
 *
 * @param filePath - Path handed to file-type's `fromFile`; only consulted when the name and the
 *   MIME type do not settle the question.
 * @param fileName - Original name as received from the user; its extension is lower-cased.
 * @param mimeType - Reported MIME type, or null when none is available.
 * @returns An extension with a leading dot, such as ".xlsx".
 *   NOTE(review): when the name has no extension, the MIME type maps to nothing and content
 *   detection finds nothing, the final fallback evaluates to null despite the declared type.
 */
export async function guessExt(filePath: string, fileName: string, mimeType: string|null): Promise<string> {
  // Extension taken from the name, in the form ".xls".
  const origExt = path.extname(fileName).toLowerCase();

  // The MIME lookup yields the bare form "xls" (or a falsy value); switch to the dotted form
  // ".xls" so it can be compared directly with origExt.
  let mimeExt = extension(mimeType);
  if (mimeExt) {
    mimeExt = "." + mimeExt;
  } else {
    mimeExt = null;
  }

  // JSON APIs commonly declare their MIME type, whereas a name derived from a URL may contain
  // periods that are not a meaningful extension, so the MIME type wins here.
  if (mimeExt === ".json") {
    return mimeExt;
  }

  // Content detection does not work for CSV and the MIME type can't be trusted: Windows, for
  // one, may report "application/vnd.ms-excel" for .csv files. See
  // https://github.com/ManifoldScholar/manifold/issues/2409#issuecomment-545152220
  if (origExt === ".csv") {
    return origExt;
  }

  // Name and MIME type agreeing is good enough, whether they match as extensions or as types.
  if (origExt) {
    const sameExt = origExt === mimeExt;
    if (sameExt || lookup(origExt.slice(1)) === mimeType) {
      return origExt;
    }
  }

  // They disagree (or one of them is missing), so look at the file contents. For the types
  // where detection works, its answer is preferred over either hint.
  const sniffed = await fromFile(filePath);
  if (sniffed) {
    return "." + sniffed.ext;
  }

  // Nothing detected: it's a tough call, so prefer the name's extension and only then the
  // MIME-derived one. This also covers text/plain (".txt") and application/octet-stream
  // (".bin"), which are too generic to use unless there is nothing better.
  return origExt || mimeExt;
}