(core) Switch excel import parsing from messytables+xlrd to openpyxl, and ignore empty rows

Summary:
Use openpyxl instead of messytables (which used xlrd internally) in import_xls.py.

Skip empty rows since excel files can easily contain huge numbers of them.

Drop support for xls files (which openpyxl doesn't support) in favour of the newer xlsx format.

Fix some details relating to python virtualenvs and dependencies, as Jenkins was failing to find new Python dependencies.

Test Plan: Mostly relying on existing tests. Updated various tests which referred to xls files instead of xlsx. Added a Python test for skipping empty rows.

Reviewers: georgegevoian

Reviewed By: georgegevoian

Differential Revision: https://phab.getgrist.com/D3406
This commit is contained in:
Alex Hall
2022-05-06 17:26:45 +02:00
parent 0a61d740ba
commit 6c90de4d62
15 changed files with 127 additions and 135 deletions

View File

@@ -271,8 +271,8 @@ export class ActiveDocImport {
/**
* Imports the data stored at tmpPath.
*
* Currently it starts a python parser (that relies on the messytables library) as a child process
* outside the sandbox, and supports xls(x), csv, txt, and perhaps some other formats. It may
* Currently it starts a python parser as a child process
* outside the sandbox, and supports xlsx, csv, and perhaps some other formats. It may
* result in the import of multiple tables, in case of e.g. Excel formats.
* @param {OptDocSession} docSession: Session instance to use for importing.
* @param {String} tmpPath: The path from of the original file.

View File

@@ -136,7 +136,6 @@ export class DocPluginManager {
if (messages.length) {
const extToType: Record<string, string> = {
'.xlsx' : 'Excel',
'.xls' : 'Excel',
'.json' : 'JSON',
'.csv' : 'CSV',
};

View File

@@ -559,7 +559,8 @@ function gvisor(options: ISandboxOptions): SandboxProcess {
// if checkpoints are in use.
const venv = path.join(process.cwd(),
pythonVersion === '2' ? 'venv' : 'sandbox_venv3');
if (fs.existsSync(venv) && !process.env.GRIST_CHECKPOINT) {
const useCheckpoint = process.env.GRIST_CHECKPOINT && !paths.importDir;
if (fs.existsSync(venv) && !useCheckpoint) {
wrapperArgs.addMount(venv);
wrapperArgs.push('-s', path.join(venv, 'bin', 'python'));
}
@@ -570,17 +571,16 @@ function gvisor(options: ISandboxOptions): SandboxProcess {
// between the checkpoint and how it gets used later).
// If a sandbox is being used for import, it will have a special mount we can't
// deal with easily right now. Should be possible to do in future if desired.
if (options.useGristEntrypoint && pythonVersion === '3' && !paths.importDir &&
process.env.GRIST_CHECKPOINT) {
if (options.useGristEntrypoint && pythonVersion === '3' && useCheckpoint) {
if (process.env.GRIST_CHECKPOINT_MAKE) {
const child =
spawn(command, [...wrapperArgs.get(), '--checkpoint', process.env.GRIST_CHECKPOINT,
spawn(command, [...wrapperArgs.get(), '--checkpoint', process.env.GRIST_CHECKPOINT!,
`python${pythonVersion}`, '--', ...pythonArgs]);
// We don't want process control for this.
return {child, control: new NoProcessControl(child)};
}
wrapperArgs.push('--restore');
wrapperArgs.push(process.env.GRIST_CHECKPOINT);
wrapperArgs.push(process.env.GRIST_CHECKPOINT!);
}
const child = spawn(command, [...wrapperArgs.get(), `python${pythonVersion}`, '--', ...pythonArgs]);
// For gvisor under ptrace, main work is done by a traced process identifiable as

View File

@@ -21,7 +21,7 @@ export async function guessExt(filePath: string, fileName: string, mimeType: str
return mimeExt;
}
if (origExt === ".csv" || origExt === ".xls") {
if (origExt === ".csv") {
// File type detection doesn't work for these, and mime type can't be trusted. E.g. Windows
// may report "application/vnd.ms-excel" for .csv files. See
// https://github.com/ManifoldScholar/manifold/issues/2409#issuecomment-545152220