diff --git a/app/client/lib/uploads.ts b/app/client/lib/uploads.ts index be90a8c8..a7df3e66 100644 --- a/app/client/lib/uploads.ts +++ b/app/client/lib/uploads.ts @@ -32,7 +32,7 @@ export interface SelectFileOptions extends UploadOptions { // e.g. [".jpg", ".png"] } -export const IMPORTABLE_EXTENSIONS = [".grist", ".csv", ".tsv", ".txt", ".xls", ".xlsx", ".xlsm"]; +export const IMPORTABLE_EXTENSIONS = [".grist", ".csv", ".tsv", ".txt", ".xlsx", ".xlsm"]; /** * Shows the file-picker dialog with the given options, and uploads the selected files. If under diff --git a/app/server/lib/ActiveDocImport.ts b/app/server/lib/ActiveDocImport.ts index 0b285ff9..ff3f9d1c 100644 --- a/app/server/lib/ActiveDocImport.ts +++ b/app/server/lib/ActiveDocImport.ts @@ -271,8 +271,8 @@ export class ActiveDocImport { /** * Imports the data stored at tmpPath. * - * Currently it starts a python parser (that relies on the messytables library) as a child process - * outside the sandbox, and supports xls(x), csv, txt, and perhaps some other formats. It may + * Currently it starts a python parser as a child process + * outside the sandbox, and supports xlsx, csv, and perhaps some other formats. It may * result in the import of multiple tables, in case of e.g. Excel formats. * @param {OptDocSession} docSession: Session instance to use for importing. * @param {String} tmpPath: The path from of the original file. diff --git a/app/server/lib/DocPluginManager.ts b/app/server/lib/DocPluginManager.ts index e02933d9..bd0310f4 100644 --- a/app/server/lib/DocPluginManager.ts +++ b/app/server/lib/DocPluginManager.ts @@ -136,7 +136,6 @@ export class DocPluginManager { if (messages.length) { const extToType: Record = { '.xlsx' : 'Excel', - '.xls' : 'Excel', '.json' : 'JSON', '.csv' : 'CSV', }; diff --git a/app/server/lib/NSandbox.ts b/app/server/lib/NSandbox.ts index bb6d35b6..81fec8ea 100644 --- a/app/server/lib/NSandbox.ts +++ b/app/server/lib/NSandbox.ts @@ -559,7 +559,8 @@ function gvisor(options: ISandboxOptions): SandboxProcess { // if checkpoints are in use. const venv = path.join(process.cwd(), pythonVersion === '2' ? 'venv' : 'sandbox_venv3'); - if (fs.existsSync(venv) && !process.env.GRIST_CHECKPOINT) { + const useCheckpoint = process.env.GRIST_CHECKPOINT && !paths.importDir; + if (fs.existsSync(venv) && !useCheckpoint) { wrapperArgs.addMount(venv); wrapperArgs.push('-s', path.join(venv, 'bin', 'python')); } @@ -570,17 +571,16 @@ function gvisor(options: ISandboxOptions): SandboxProcess { // between the checkpoint and how it gets used later). // If a sandbox is being used for import, it will have a special mount we can't // deal with easily right now. Should be possible to do in future if desired. - if (options.useGristEntrypoint && pythonVersion === '3' && !paths.importDir && - process.env.GRIST_CHECKPOINT) { + if (options.useGristEntrypoint && pythonVersion === '3' && useCheckpoint) { if (process.env.GRIST_CHECKPOINT_MAKE) { const child = - spawn(command, [...wrapperArgs.get(), '--checkpoint', process.env.GRIST_CHECKPOINT, + spawn(command, [...wrapperArgs.get(), '--checkpoint', process.env.GRIST_CHECKPOINT!, `python${pythonVersion}`, '--', ...pythonArgs]); // We don't want process control for this. return {child, control: new NoProcessControl(child)}; } wrapperArgs.push('--restore'); - wrapperArgs.push(process.env.GRIST_CHECKPOINT); + wrapperArgs.push(process.env.GRIST_CHECKPOINT!); } const child = spawn(command, [...wrapperArgs.get(), `python${pythonVersion}`, '--', ...pythonArgs]); // For gvisor under ptrace, main work is done by a traced process identifiable as diff --git a/app/server/lib/guessExt.ts b/app/server/lib/guessExt.ts index 00ed9319..95afeac5 100644 --- a/app/server/lib/guessExt.ts +++ b/app/server/lib/guessExt.ts @@ -21,7 +21,7 @@ export async function guessExt(filePath: string, fileName: string, mimeType: str return mimeExt; } - if (origExt === ".csv" || origExt === ".xls") { + if (origExt === ".csv") { // File type detection doesn't work for these, and mime type can't be trusted. E.g. Windows // may report "application/vnd.ms-excel" for .csv files. See // https://github.com/ManifoldScholar/manifold/issues/2409#issuecomment-545152220 diff --git a/plugins/core/manifest.yml b/plugins/core/manifest.yml index e9e27f61..2a468b36 100644 --- a/plugins/core/manifest.yml +++ b/plugins/core/manifest.yml @@ -5,11 +5,11 @@ components: safePython: sandbox/main.py contributions: fileParsers: - - fileExtensions: ["csv"] + - fileExtensions: ["csv", "tsv", "txt"] parseFile: component: safePython name: csv_parser - - fileExtensions: ["xls", "xlsx", "tsv", "txt", "xlsm"] + - fileExtensions: ["xlsx", "xlsm"] parseFile: component: safePython name: xls_parser diff --git a/sandbox/grist/imports/__init__.py b/sandbox/grist/imports/__init__.py index e69de29b..5c61320b 100644 --- a/sandbox/grist/imports/__init__.py +++ b/sandbox/grist/imports/__init__.py @@ -0,0 +1,16 @@ +import warnings + +import six + +original_formatwarning = warnings.formatwarning + + +def formatwarning(*args, **kwargs): + """ + Fixes an error on Jenkins where byte strings (instead of unicode) + were being written to stderr due to a warning from an internal library. + """ + return six.ensure_text(original_formatwarning(*args, **kwargs)) + + +warnings.formatwarning = formatwarning diff --git a/sandbox/grist/imports/fixtures/test_empty_rows.xlsx b/sandbox/grist/imports/fixtures/test_empty_rows.xlsx new file mode 100644 index 00000000..5134bca1 Binary files /dev/null and b/sandbox/grist/imports/fixtures/test_empty_rows.xlsx differ diff --git a/sandbox/grist/imports/import_csv.py b/sandbox/grist/imports/import_csv.py index ff0ca70d..431b69a1 100644 --- a/sandbox/grist/imports/import_csv.py +++ b/sandbox/grist/imports/import_csv.py @@ -150,8 +150,11 @@ def _parse_open_file(file_obj, parse_options=None): headers = [''] * len(headers) row_set.register_processor(messytables.offset_processor(data_offset)) - - table_data_with_types = parse_data.get_table_data(row_set, len(headers), num_rows) + rows = [ + [cell.value for cell in row] + for row in row_set + ] + table_data_with_types = parse_data.get_table_data(rows, len(headers), num_rows) # Identify and remove empty columns, and populate separate metadata and data lists. column_metadata = [] diff --git a/sandbox/grist/imports/import_xls.py b/sandbox/grist/imports/import_xls.py index 93013e25..7acd5322 100644 --- a/sandbox/grist/imports/import_xls.py +++ b/sandbox/grist/imports/import_xls.py @@ -2,13 +2,10 @@ This module reads a file path that is passed in using ActiveDoc.importFile() and returns a object formatted so that it can be used by grist for a bulk add records action """ -import csv import logging -import os -import chardet import messytables -import messytables.excel +import openpyxl import six from six.moves import zip @@ -18,76 +15,58 @@ from imports import import_utils log = logging.getLogger(__name__) -def import_file(file_source, parse_options): +def import_file(file_source): path = import_utils.get_path(file_source["path"]) - orig_name = file_source["origName"] - parse_options, tables = parse_file(path, orig_name, parse_options) + parse_options, tables = parse_file(path) return {"parseOptions": parse_options, "tables": tables} -# messytable is painfully un-extensible, so we have to jump through dumb hoops to override any -# behavior. -orig_dialect = messytables.CSVRowSet._dialect -def override_dialect(self): - if self.delimiter == '\t': - return csv.excel_tab - return orig_dialect.fget(self) -messytables.CSVRowSet._dialect = property(override_dialect) - -def parse_file(file_path, orig_name, parse_options=None, table_name_hint=None, num_rows=None): - # pylint: disable=unused-argument + +def parse_file(file_path): with open(file_path, "rb") as f: - try: - return parse_open_file(f, orig_name, table_name_hint=table_name_hint) - except Exception as e: - # Log the full error, but simplify the thrown error to omit the unhelpful extra args. - log.warn("import_xls parse_file failed: %s", e) - if six.PY2 and e.args and isinstance(e.args[0], six.string_types): - raise Exception(e.args[0]) - raise - - -def parse_open_file(file_obj, orig_name, table_name_hint=None): - file_root, file_ext = os.path.splitext(orig_name) - table_set = messytables.any.any_tableset(file_obj, extension=file_ext, auto_detect=False) - - # Messytable's encoding detection uses too small a sample, so we override it here. - if isinstance(table_set, messytables.CSVTableSet): - sample = file_obj.read(100000) - table_set.encoding = chardet.detect(sample)['encoding'] - # In addition, always prefer UTF8 over ASCII. - if table_set.encoding == 'ascii': - table_set.encoding = 'utf8' + return parse_open_file(f) + + +def parse_open_file(file_obj): + workbook = openpyxl.load_workbook( + file_obj, + read_only=True, + keep_vba=False, + data_only=True, + keep_links=False, + ) skipped_tables = 0 export_list = [] # A table set is a collection of tables: - for row_set in table_set.tables: - table_name = row_set.name - - if isinstance(row_set, messytables.CSVRowSet): - # For csv files, we can do better for table_name by using the filename. - table_name = import_utils.capitalize(table_name_hint or - os.path.basename(file_root.decode('utf8'))) - - # Messytables doesn't guess whether headers are present, so we need to step in. - data_offset, headers = import_utils.headers_guess(list(row_set.sample)) - else: - # Let messytables guess header names and the offset of the header. - offset, headers = messytables.headers_guess(row_set.sample) - data_offset = offset + 1 # Add the header line + for sheet in workbook: + table_name = sheet.title + rows = [ + row + for row in sheet.iter_rows(values_only=True) + # Exclude empty rows, i.e. rows with only empty values. + # `if not any(row)` would be slightly faster, but would count `0` as empty. + if not set(row) <= {None, ""} + ] + sample = [ + # Create messytables.Cells for the sake of messytables.headers_guess + [messytables.Cell(cell) for cell in row] + for row in rows[:1000] + ] + offset, headers = messytables.headers_guess(sample) + data_offset = offset + 1 # Add the header line + rows = rows[data_offset:] # Make sure all header values are strings. for i, header in enumerate(headers): - if not isinstance(header, six.string_types): + if header is None: + headers[i] = u'' + elif not isinstance(header, six.string_types): headers[i] = six.text_type(header) log.debug("Guessed data_offset as %s", data_offset) log.debug("Guessed headers as: %s", headers) - row_set.register_processor(messytables.offset_processor(data_offset)) - - - table_data_with_types = parse_data.get_table_data(row_set, len(headers)) + table_data_with_types = parse_data.get_table_data(rows, len(headers)) # Identify and remove empty columns, and populate separate metadata and data lists. column_metadata = [] @@ -121,36 +100,3 @@ def parse_open_file(file_obj, orig_name, table_name_hint=None): parse_options = {} return parse_options, export_list - - -# This change was initially introduced in https://phab.getgrist.com/D2145 -# Monkey-patching done in https://phab.getgrist.com/D2965 -# to move towards normal dependency management -@staticmethod -def from_xlrdcell(xlrd_cell, sheet, col, row): - from messytables.excel import ( - XLS_TYPES, StringType, DateType, InvalidDateError, xlrd, time, datetime, XLSCell - ) - value = xlrd_cell.value - cell_type = XLS_TYPES.get(xlrd_cell.ctype, StringType()) - if cell_type == DateType(None): - # Try-catch added by Dmitry, to avoid failing even if we see a date we can't handle. - try: - if value == 0: - raise InvalidDateError - year, month, day, hour, minute, second = \ - xlrd.xldate_as_tuple(value, sheet.book.datemode) - if (year, month, day) == (0, 0, 0): - value = time(hour, minute, second) - else: - value = datetime(year, month, day, hour, minute, second) - except Exception: - # Keep going, and we'll just interpret the date as a number. - pass - messy_cell = XLSCell(value, type=cell_type) - messy_cell.sheet = sheet - messy_cell.xlrd_cell = xlrd_cell - messy_cell.xlrd_pos = (row, col) # necessary for properties, note not (x,y) - return messy_cell - -messytables.excel.XLSCell.from_xlrdcell = from_xlrdcell diff --git a/sandbox/grist/imports/register.py b/sandbox/grist/imports/register.py index b7e82c04..b72163ed 100644 --- a/sandbox/grist/imports/register.py +++ b/sandbox/grist/imports/register.py @@ -6,8 +6,9 @@ def register_import_parsers(sandbox): sandbox.register("csv_parser.parseFile", parse_csv) def parse_excel(file_source, parse_options): + # pylint: disable=unused-argument from imports.import_xls import import_file - return import_file(file_source, parse_options) + return import_file(file_source) sandbox.register("xls_parser.parseFile", parse_excel) diff --git a/sandbox/grist/imports/test_import_xls.py b/sandbox/grist/imports/test_import_xls.py index 7a4a3326..284b9671 100644 --- a/sandbox/grist/imports/test_import_xls.py +++ b/sandbox/grist/imports/test_import_xls.py @@ -8,7 +8,7 @@ import unittest from imports import import_xls def _get_fixture(filename): - return [os.path.join(os.path.dirname(__file__), "fixtures", filename), filename] + return [os.path.join(os.path.dirname(__file__), "fixtures", filename)] class TestImportXLS(unittest.TestCase): @@ -42,7 +42,7 @@ class TestImportXLS(unittest.TestCase): {"type": "Any", "id": "corner-cases"}) self.assertEqual(parsed_file[1][0]["table_data"][3], # The type is detected as text, so all values should be text. - [u'=function()', u'3.0', u'two spaces after ', + [u'=function()', u'3', u'two spaces after ', u' two spaces before', u'!@#$', u'€€€', u'√∫abc$$', u'line\nbreak']) # check that multiple tables are created when there are multiple sheets in a document @@ -53,12 +53,12 @@ class TestImportXLS(unittest.TestCase): def test_excel_types(self): parsed_file = import_xls.parse_file(*_get_fixture('test_excel_types.xlsx')) sheet = parsed_file[1][0] - self._check_col(sheet, 0, "int1", "Numeric", [-1234123, '', '']) - self._check_col(sheet, 1, "int2", "Numeric", [5, '', '']) + self._check_col(sheet, 0, "int1", "Numeric", [-1234123, None, None]) + self._check_col(sheet, 1, "int2", "Numeric", [5, None, None]) self._check_col(sheet, 2, "textint", "Any", ["12345678902345689", '', '']) self._check_col(sheet, 3, "bigint", "Any", ["320150170634561830", '', '']) - self._check_col(sheet, 4, "num2", "Numeric", [123456789.123456, '', '']) - self._check_col(sheet, 5, "bignum", "Numeric", [math.exp(200), '', '']) + self._check_col(sheet, 4, "num2", "Numeric", [123456789.123456, None, None]) + self._check_col(sheet, 5, "bignum", "Numeric", [math.exp(200), None, None]) self._check_col(sheet, 6, "date1", "DateTime", [calendar.timegm(datetime.datetime(2015, 12, 22, 11, 59, 00).timetuple()), None, None]) self._check_col(sheet, 7, "date2", "Date", @@ -78,7 +78,7 @@ class TestImportXLS(unittest.TestCase): None, 1452038400.0, 1451549340.0, 1483214940.0, None, 1454544000.0, 1199577600.0, 1451692800.0, 1451549340.0, 1483214940.0]) self._check_col(sheet, 1, "float_not_int", "Numeric", - [1,2,3,4,5,"",6,7,8,9,10,10.25,11,12,13,14,15,16,17,18]) + [1,2,3,4,5,None,6,7,8,9,10,10.25,11,12,13,14,15,16,17,18]) self._check_col(sheet, 2, "int_not_bool", "Any", [0, 0, 1, 0, 1, 0, 0, 1, 0, 2, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0]) self._check_col(sheet, 3, "float_not_bool", "Any", @@ -91,12 +91,11 @@ class TestImportXLS(unittest.TestCase): [4.0, 6.0, 4.0, 4.0, 6.0, 4.0, '--', 6.0, 4.0, 4.0, 4.0, 4.0, 4.0, 6.0, 6.0, 4.0, 6.0, '3-4', 4.0, 6.5]) self._check_col(sheet, 7, "float_not_text", "Numeric", - [-10.25, -8.00, -5.75, -3.50, "n/a", ' 1. ', " ??? ", 5.50, "", "-", - 12.25, 0.00, "", 0.00, "--", 23.50, "NA", 28.00, 30.25, 32.50]) + [-10.25, -8.00, -5.75, -3.50, "n/a", ' 1. ', " ??? ", 5.50, None, "-", + 12.25, 0.00, None, 0.00, "--", 23.50, "NA", 28.00, 30.25, 32.50]) def test_excel_single_merged_cell(self): - # An older version of xlrd had a bug where a single cell marked as 'merged' would cause an - # exception. + # An older version had a bug where a single cell marked as 'merged' would cause an exception. parsed_file = import_xls.parse_file(*_get_fixture('test_single_merged_cell.xlsx')) tables = parsed_file[1] self.assertEqual(tables, [{ @@ -110,21 +109,19 @@ class TestImportXLS(unittest.TestCase): ], 'table_data': [ [u'SINGLE MERGED', u'The End'], - [1637384.52, u''], - [2444344.06, u''], - [2444344.06, u''], + [1637384.52, None], + [2444344.06, None], + [2444344.06, None], [u'', u''], ], }]) def test_excel_strange_dates(self): - # TODO fails with xlrd.xldate.XLDateAmbiguous: 4.180902777777778 # Check that we don't fail when encountering unusual dates and times (e.g. 0 or 38:00:00). parsed_file = import_xls.parse_file(*_get_fixture('strange_dates.xlsx')) tables = parsed_file[1] # We test non-failure, but the result is not really what we want. E.g. "1:10" and "100:20:30" - # would be best left as text, but here become "01:10:00" (after xlrd parses the first as - # datetime.time), and as 4.18... (after xlrd fails and we resort to the numerical value). + # would be best left as text. self.assertEqual(tables, [{ 'table_name': u'Sheet1', 'column_metadata': [ @@ -132,22 +129,41 @@ class TestImportXLS(unittest.TestCase): {'id': 'b', 'type': 'Date'}, {'id': 'c', 'type': 'Any'}, {'id': 'd', 'type': 'Any'}, - {'id': 'e', 'type': 'Numeric'}, - {'id': 'f', 'type': 'Numeric'}, + {'id': 'e', 'type': 'DateTime'}, + {'id': 'f', 'type': 'Date'}, {'id': 'g', 'type': 'Any'}, {'id': 'h', 'type': 'Date'}, - {'id': 'i', 'type': 'Numeric'}, + {'id': 'i', 'type': 'Date'}, ], 'table_data': [ [u'21:14:00'], [1568851200.0], [u'01:10:00'], [u'10:20:30'], - [4.180902777777778], - [20], + [-2208713970.0], + [-2207347200.0], [u'7/4/1776'], [205286400.0], - [0], + [-2209161600.0], + ], + }]) + + def test_empty_rows(self): + # Check that empty rows aren't imported, + # and that files with lots of empty rows are imported quickly. + # The fixture file is mostly empty but has data in the last row, + # with over a million empty rows in between. + parsed_file = import_xls.parse_file(*_get_fixture('test_empty_rows.xlsx')) + tables = parsed_file[1] + self.assertEqual(tables, [{ + 'table_name': u'Sheet1', + 'column_metadata': [ + {'id': 'a', 'type': 'Numeric'}, + {'id': 'b', 'type': 'Numeric'}, + ], + 'table_data': [ + [0, None, 1], + [None, 0, 2], ], }]) diff --git a/sandbox/grist/parse_data.py b/sandbox/grist/parse_data.py index c5aa1cf4..47f95d04 100644 --- a/sandbox/grist/parse_data.py +++ b/sandbox/grist/parse_data.py @@ -10,7 +10,6 @@ of values. All "data" lists will have the same length. import datetime import logging import re -import messytables import moment # TODO grist internal libraries might not be available to plugins in the future. import six from six.moves import zip, xrange @@ -59,12 +58,16 @@ class BaseConverter(object): raise NotImplementedError() +numeric_types = six.integer_types + (float, complex, type(None)) + class NumericConverter(BaseConverter): """Handles the Grist Numeric type""" @classmethod def convert(cls, value): - if type(value) in six.integer_types + (float, complex): + if type(value) is bool: + return int(value) + elif type(value) in numeric_types: return value raise ValueError() @@ -80,7 +83,7 @@ class SimpleDateTimeConverter(BaseConverter): def convert(cls, value): if type(value) is datetime.datetime: return value - elif value == "": + elif not value: return None raise ValueError() @@ -103,6 +106,8 @@ class AnyConverter(BaseConverter): """ @classmethod def convert(cls, value): + if value is None: + return u'' return six.text_type(value) @classmethod @@ -156,7 +161,7 @@ def _guess_basic_types(rows, num_columns): column_detectors = [ColumnDetector() for i in xrange(num_columns)] for row in rows: for cell, detector in zip(row, column_detectors): - detector.add_value(cell.value) + detector.add_value(cell) return [detector.get_converter() for detector in column_detectors] @@ -194,10 +199,10 @@ class ColumnConverter(object): return {"type": grist_type, "data": self._all_col_values} -def get_table_data(row_set, num_columns, num_rows=0): - converters = _guess_basic_types(row_set.sample, num_columns) +def get_table_data(rows, num_columns, num_rows=0): + converters = _guess_basic_types(rows[:1000], num_columns) col_converters = [ColumnConverter(c) for c in converters] - for num, row in enumerate(row_set): + for num, row in enumerate(rows): if num_rows and num == num_rows: break @@ -207,9 +212,9 @@ def get_table_data(row_set, num_columns, num_rows=0): # Make sure we have a value for every column. missing_values = len(converters) - len(row) if missing_values > 0: - row.extend([messytables.Cell("")] * missing_values) + row.extend([""] * missing_values) for cell, conv in zip(row, col_converters): - conv.convert_and_add(cell.value) + conv.convert_and_add(cell) return [conv.get_grist_column() for conv in col_converters] diff --git a/sandbox/requirements.txt b/sandbox/requirements.txt index fc206ca0..492eb4c7 100644 --- a/sandbox/requirements.txt +++ b/sandbox/requirements.txt @@ -3,12 +3,15 @@ asttokens==2.0.5 backports.functools-lru-cache==1.6.4 chardet==4.0.0 enum34==1.1.10 +et-xmlfile==1.0.1 html5lib==0.999999999 iso8601==0.1.12 +jdcal==1.4.1 json_table_schema==0.2.1 lazy_object_proxy==1.6.0 lxml==4.6.3 # used in csv plugin only? messytables==0.15.2 +openpyxl==2.6.4 python_dateutil==2.6.0 python_magic==0.4.12 roman==2.0.0 diff --git a/sandbox/requirements3.txt b/sandbox/requirements3.txt index 0c2448c8..b673a446 100644 --- a/sandbox/requirements3.txt +++ b/sandbox/requirements3.txt @@ -3,12 +3,15 @@ asttokens==2.0.5 backports.functools-lru-cache==1.6.4 chardet==4.0.0 enum34==1.1.10 +et-xmlfile==1.0.1 html5lib==0.999999999 iso8601==0.1.12 +jdcal==1.4.1 json_table_schema==0.2.1 lazy_object_proxy==1.6.0 lxml==4.6.3 # used in csv plugin only? messytables==0.15.2 +openpyxl==2.6.4 python_dateutil==2.6.0 python_magic==0.4.12 roman==2.0.0