mirror of
https://github.com/gristlabs/grist-core.git
synced 2026-03-02 04:09:24 +00:00
(core) Switch excel import parsing from messytables+xlrd to openpyxl, and ignore empty rows
Summary: Use openpyxl instead of messytables (which used xlrd internally) in import_xls.py. Skip empty rows since excel files can easily contain huge numbers of them. Drop support for xls files (which openpyxl doesn't support) in favour of the newer xlsx format. Fix some details relating to python virtualenvs and dependencies, as Jenkins was failing to find new Python dependencies. Test Plan: Mostly relying on existing tests. Updated various tests which referred to xls files instead of xlsx. Added a Python test for skipping empty rows. Reviewers: georgegevoian Reviewed By: georgegevoian Differential Revision: https://phab.getgrist.com/D3406
This commit is contained in:
@@ -271,8 +271,8 @@ export class ActiveDocImport {
|
||||
/**
|
||||
* Imports the data stored at tmpPath.
|
||||
*
|
||||
* Currently it starts a python parser (that relies on the messytables library) as a child process
|
||||
* outside the sandbox, and supports xls(x), csv, txt, and perhaps some other formats. It may
|
||||
* Currently it starts a python parser as a child process
|
||||
* outside the sandbox, and supports xlsx, csv, and perhaps some other formats. It may
|
||||
* result in the import of multiple tables, in case of e.g. Excel formats.
|
||||
* @param {OptDocSession} docSession: Session instance to use for importing.
|
||||
* @param {String} tmpPath: The path from of the original file.
|
||||
|
||||
@@ -136,7 +136,6 @@ export class DocPluginManager {
|
||||
if (messages.length) {
|
||||
const extToType: Record<string, string> = {
|
||||
'.xlsx' : 'Excel',
|
||||
'.xls' : 'Excel',
|
||||
'.json' : 'JSON',
|
||||
'.csv' : 'CSV',
|
||||
};
|
||||
|
||||
@@ -559,7 +559,8 @@ function gvisor(options: ISandboxOptions): SandboxProcess {
|
||||
// if checkpoints are in use.
|
||||
const venv = path.join(process.cwd(),
|
||||
pythonVersion === '2' ? 'venv' : 'sandbox_venv3');
|
||||
if (fs.existsSync(venv) && !process.env.GRIST_CHECKPOINT) {
|
||||
const useCheckpoint = process.env.GRIST_CHECKPOINT && !paths.importDir;
|
||||
if (fs.existsSync(venv) && !useCheckpoint) {
|
||||
wrapperArgs.addMount(venv);
|
||||
wrapperArgs.push('-s', path.join(venv, 'bin', 'python'));
|
||||
}
|
||||
@@ -570,17 +571,16 @@ function gvisor(options: ISandboxOptions): SandboxProcess {
|
||||
// between the checkpoint and how it gets used later).
|
||||
// If a sandbox is being used for import, it will have a special mount we can't
|
||||
// deal with easily right now. Should be possible to do in future if desired.
|
||||
if (options.useGristEntrypoint && pythonVersion === '3' && !paths.importDir &&
|
||||
process.env.GRIST_CHECKPOINT) {
|
||||
if (options.useGristEntrypoint && pythonVersion === '3' && useCheckpoint) {
|
||||
if (process.env.GRIST_CHECKPOINT_MAKE) {
|
||||
const child =
|
||||
spawn(command, [...wrapperArgs.get(), '--checkpoint', process.env.GRIST_CHECKPOINT,
|
||||
spawn(command, [...wrapperArgs.get(), '--checkpoint', process.env.GRIST_CHECKPOINT!,
|
||||
`python${pythonVersion}`, '--', ...pythonArgs]);
|
||||
// We don't want process control for this.
|
||||
return {child, control: new NoProcessControl(child)};
|
||||
}
|
||||
wrapperArgs.push('--restore');
|
||||
wrapperArgs.push(process.env.GRIST_CHECKPOINT);
|
||||
wrapperArgs.push(process.env.GRIST_CHECKPOINT!);
|
||||
}
|
||||
const child = spawn(command, [...wrapperArgs.get(), `python${pythonVersion}`, '--', ...pythonArgs]);
|
||||
// For gvisor under ptrace, main work is done by a traced process identifiable as
|
||||
|
||||
@@ -21,7 +21,7 @@ export async function guessExt(filePath: string, fileName: string, mimeType: str
|
||||
return mimeExt;
|
||||
}
|
||||
|
||||
if (origExt === ".csv" || origExt === ".xls") {
|
||||
if (origExt === ".csv") {
|
||||
// File type detection doesn't work for these, and mime type can't be trusted. E.g. Windows
|
||||
// may report "application/vnd.ms-excel" for .csv files. See
|
||||
// https://github.com/ManifoldScholar/manifold/issues/2409#issuecomment-545152220
|
||||
|
||||
Reference in New Issue
Block a user