From 9bbf66e50e35c78eca7f57007a8b83e30bc81ecb Mon Sep 17 00:00:00 2001 From: Yohan Boniface Date: Thu, 1 Sep 2022 18:56:30 +0200 Subject: [PATCH] wip: remove dependency to messytables --- sandbox/grist/imports/import_csv.py | 191 +++++++++++----------- sandbox/grist/imports/import_csv_test.py | 36 ++-- sandbox/grist/imports/import_utils.py | 19 ++- sandbox/grist/imports/messytables_test.py | 22 --- 4 files changed, 130 insertions(+), 138 deletions(-) delete mode 100644 sandbox/grist/imports/messytables_test.py diff --git a/sandbox/grist/imports/import_csv.py b/sandbox/grist/imports/import_csv.py index 431b69a1..5c3cc046 100644 --- a/sandbox/grist/imports/import_csv.py +++ b/sandbox/grist/imports/import_csv.py @@ -1,11 +1,11 @@ """ Plugin for importing CSV files """ -import os +import codecs +import csv import logging import chardet -import messytables import six from six.moves import zip @@ -90,108 +90,117 @@ def parse_file(file_path, parse_options=None): """ parse_options = parse_options or {} - with open(file_path, "rb") as f: + with codecs.open(file_path, "rb") as f: + sample = f.read(100000) + encoding = chardet.detect(sample)['encoding'] or "utf8" + # In addition, always prefer UTF8 over ASCII. + if encoding == 'ascii': + encoding = 'utf8' + log.info("Using encoding %s" % encoding) + + with codecs.open(file_path, mode="r", encoding=encoding) as f: parsing_options, export_list = _parse_open_file(f, parse_options=parse_options) return parsing_options, export_list +def _guess_dialect(file_obj): + try: + # Restrict allowed delimiters to prevent guessing other char than this list. + dialect = csv.Sniffer().sniff(file_obj.read(100000), delimiters=['\t', ',', ';', '|']) + log.info("Guessed dialect %s" % dict(dialect.__dict__)) + except csv.Error: + log.info("Cannot guess dialect using Excel as fallback.") + return csv.excel + else: + return dialect + finally: + file_obj.seek(0) + def _parse_open_file(file_obj, parse_options=None): - options = {} csv_keys = ['delimiter', 'quotechar', 'lineterminator', 'doublequote', 'skipinitialspace'] - csv_options = {k: parse_options.get(k) for k in csv_keys} + options = {} + dialect = _guess_dialect(file_obj) + + csv_options = {k: parse_options.get(k, getattr(dialect, k, None)) for k in csv_keys} if six.PY2: csv_options = {k: v.encode('utf8') if isinstance(v, six.text_type) else v for k, v in csv_options.items()} - table_set = messytables.CSVTableSet(file_obj, - delimiter=csv_options['delimiter'], - quotechar=csv_options['quotechar'], - lineterminator=csv_options['lineterminator'], - doublequote=csv_options['doublequote'], - skipinitialspace=csv_options['skipinitialspace']) + csv_options = {k: v for k, v in csv_options.items() if v is not None} + reader = csv.reader(file_obj, **csv_options) num_rows = parse_options.get('NUM_ROWS', 0) - # Messytable's encoding detection uses too small a sample, so we override it here. - sample = file_obj.read(100000) - table_set.encoding = chardet.detect(sample)['encoding'] - # In addition, always prefer UTF8 over ASCII. - if table_set.encoding == 'ascii': - table_set.encoding = 'utf8' - - export_list = [] - # A table set is a collection of tables: - for row_set in table_set.tables: - table_name = None - sample_rows = list(row_set.sample) - # Messytables doesn't guess whether headers are present, so we need to step in. - data_offset, headers = import_utils.headers_guess(sample_rows) - - # Make sure all header values are strings. - for i, header in enumerate(headers): - if not isinstance(header, six.string_types): - headers[i] = six.text_type(header) - - log.info("Guessed data_offset as %s", data_offset) - log.info("Guessed headers as: %s", headers) - - have_guessed_headers = any(headers) - include_col_names_as_headers = parse_options.get('include_col_names_as_headers', - have_guessed_headers) - - if include_col_names_as_headers and not have_guessed_headers: - # use first line as headers - data_offset, first_row = import_utils.find_first_non_empty_row(sample_rows) - headers = import_utils.expand_headers(first_row, data_offset, sample_rows) - - elif not include_col_names_as_headers and have_guessed_headers: - # move guessed headers to data - data_offset -= 1 - headers = [''] * len(headers) - - row_set.register_processor(messytables.offset_processor(data_offset)) - rows = [ - [cell.value for cell in row] - for row in row_set - ] - table_data_with_types = parse_data.get_table_data(rows, len(headers), num_rows) - - # Identify and remove empty columns, and populate separate metadata and data lists. - column_metadata = [] - table_data = [] - for col_data, header in zip(table_data_with_types, headers): - if not header and all(val == "" for val in col_data["data"]): - continue # empty column - data = col_data.pop("data") - col_data["id"] = header - column_metadata.append(col_data) - table_data.append(data) - - if not table_data: - # Don't add tables with no columns. - continue - - guessed = row_set._dialect - quoting = parse_options.get('quoting') - options = {"delimiter": parse_options.get('delimiter', guessed.delimiter), - "doublequote": parse_options.get('doublequote', guessed.doublequote), - "lineterminator": parse_options.get('lineterminator', guessed.lineterminator), - "quotechar": parse_options.get('quotechar', guessed.quotechar), - "skipinitialspace": parse_options.get('skipinitialspace', guessed.skipinitialspace), - "include_col_names_as_headers": include_col_names_as_headers, - "start_with_row": 1, - "NUM_ROWS": num_rows, - "SCHEMA": SCHEMA - } - - log.info("Output table %r with %d columns", table_name, len(column_metadata)) - for c in column_metadata: - log.debug("Output column %s", c) - export_list.append({ - "table_name": table_name, - "column_metadata": column_metadata, - "table_data": table_data - }) + + table_name = None + rows = list(reader) + sample_len = 100 + sample_rows = rows[:sample_len] + data_offset, headers = import_utils.headers_guess(sample_rows) + + # Make sure all header values are strings. + for i, header in enumerate(headers): + if not isinstance(header, six.string_types): + headers[i] = six.text_type(header) + + log.info("Guessed data_offset as %s", data_offset) + log.info("Guessed headers as: %s", headers) + + have_guessed_headers = any(headers) + include_col_names_as_headers = parse_options.get('include_col_names_as_headers', + have_guessed_headers) + + if include_col_names_as_headers and not have_guessed_headers: + # use first line as headers + data_offset, first_row = import_utils.find_first_non_empty_row(sample_rows) + headers = import_utils.expand_headers(first_row, data_offset, sample_rows) + + elif not include_col_names_as_headers and have_guessed_headers: + # move guessed headers to data + data_offset -= 1 + headers = [''] * len(headers) + + rows = rows[data_offset:] # Use row.pop instead to make it faster ? + table_data_with_types = parse_data.get_table_data(rows, len(headers), num_rows) + + # Identify and remove empty columns, and populate separate metadata and data lists. + column_metadata = [] + table_data = [] + for col_data, header in zip(table_data_with_types, headers): + if not header and all(val == "" for val in col_data["data"]): + continue # empty column + data = col_data.pop("data") + col_data["id"] = header + column_metadata.append(col_data) + table_data.append(data) + + if not table_data: + log.info("No data found. Aborting CSV import.") + # Don't add tables with no columns. + return {}, [] + + guessed = reader.dialect + quoting = parse_options.get('quoting') + options = {"delimiter": parse_options.get('delimiter', guessed.delimiter), + "doublequote": parse_options.get('doublequote', guessed.doublequote), + "lineterminator": parse_options.get('lineterminator', guessed.lineterminator), + "quotechar": parse_options.get('quotechar', guessed.quotechar), + "skipinitialspace": parse_options.get('skipinitialspace', guessed.skipinitialspace), + "include_col_names_as_headers": include_col_names_as_headers, + "start_with_row": 1, + "NUM_ROWS": num_rows, + "SCHEMA": SCHEMA + } + + log.info("Output table %r with %d columns", table_name, len(column_metadata)) + for c in column_metadata: + log.debug("Output column %s", c) + + export_list = [{ + "table_name": table_name, + "column_metadata": column_metadata, + "table_data": table_data + }] return options, export_list diff --git a/sandbox/grist/imports/import_csv_test.py b/sandbox/grist/imports/import_csv_test.py index 320129ba..c715cd6e 100644 --- a/sandbox/grist/imports/import_csv_test.py +++ b/sandbox/grist/imports/import_csv_test.py @@ -2,7 +2,7 @@ import os import textwrap import unittest -from six import BytesIO, text_type +from six import StringIO, text_type import csv from imports import import_csv @@ -12,12 +12,6 @@ def _get_fixture(filename): return os.path.join(os.path.dirname(__file__), "fixtures", filename) -def bytes_io_from_str(string): - if isinstance(string, text_type): - string = string.encode("utf8") - return BytesIO(string) - - class TestImportCSV(unittest.TestCase): maxDiff = None @@ -107,7 +101,7 @@ class TestImportCSV(unittest.TestCase): u'']) def test_wrong_cols1(self): - file_obj = bytes_io_from_str(textwrap.dedent( + file_obj = StringIO(textwrap.dedent( """\ name1, name2, name3 a1,b1,c1 @@ -124,7 +118,7 @@ class TestImportCSV(unittest.TestCase): self._check_col(parsed_file, 2, "name3", "Text", ["c1", "", ""]) def test_wrong_cols2(self): - file_obj = bytes_io_from_str(textwrap.dedent( + file_obj = StringIO(textwrap.dedent( """\ name1 a1,b1 @@ -140,7 +134,7 @@ class TestImportCSV(unittest.TestCase): self._check_col(parsed_file, 2, "", "Text", ["", "c2"]) def test_offset(self): - file_obj = bytes_io_from_str(textwrap.dedent( + file_obj = StringIO(textwrap.dedent( """\ ,,,,,,, name1,name2,name3 @@ -160,7 +154,7 @@ class TestImportCSV(unittest.TestCase): self._check_col(parsed_file, 3, "", "Text", ["", "", "d4"]) def test_offset_no_header(self): - file_obj = bytes_io_from_str(textwrap.dedent( + file_obj = StringIO(textwrap.dedent( """\ 4,b1,c1 4,b2,c2 @@ -176,7 +170,7 @@ class TestImportCSV(unittest.TestCase): self._check_col(parsed_file, 2, "", "Text", ["c1", "c2", "c3"]) def test_empty_headers(self): - file_obj = bytes_io_from_str(textwrap.dedent( + file_obj = StringIO(textwrap.dedent( """\ ,,-,- b,a,a,a,a @@ -194,7 +188,7 @@ class TestImportCSV(unittest.TestCase): self._check_col(parsed_file, 3, "-", "Text", ["a", "a", "a"]) self._check_col(parsed_file, 4, "", "Text", ["a", "a", "a"]) - file_obj = bytes_io_from_str(textwrap.dedent( + file_obj = StringIO(textwrap.dedent( """\ -,-,-,-,-,- b,a,a,a,a @@ -212,7 +206,7 @@ class TestImportCSV(unittest.TestCase): self._check_col(parsed_file, 5, "-", "Text", ["", "", ""]) def test_guess_missing_user_option(self): - file_obj = bytes_io_from_str(textwrap.dedent( + file_obj = StringIO(textwrap.dedent( """\ name1,;name2,;name3 a1,;b1,;c1 @@ -242,7 +236,7 @@ class TestImportCSV(unittest.TestCase): self._check_col(parsed_file, 2, ";name3", "Text", [";c1", ";c2", ";c3"]) def test_one_line_file_no_header(self): - file_obj = bytes_io_from_str(textwrap.dedent( + file_obj = StringIO(textwrap.dedent( """\ 2,name2,name3 """)) @@ -256,7 +250,7 @@ class TestImportCSV(unittest.TestCase): self._check_col(parsed_file, 2, "", "Text", ["name3"]) def test_one_line_file_with_header(self): - file_obj = bytes_io_from_str(textwrap.dedent( + file_obj = StringIO(textwrap.dedent( """\ name1,name2,name3 """)) @@ -270,7 +264,7 @@ class TestImportCSV(unittest.TestCase): self._check_col(parsed_file, 2, "name3", "Text", []) def test_empty_file(self): - file_obj = bytes_io_from_str(textwrap.dedent( + file_obj = StringIO(textwrap.dedent( """\ """)) @@ -278,7 +272,7 @@ class TestImportCSV(unittest.TestCase): self.assertEqual(parsed_file, ({}, [])) def test_option_num_rows(self): - file_obj = bytes_io_from_str(textwrap.dedent( + file_obj = StringIO(textwrap.dedent( """\ name1,name2,name3 a1,b1,c1 @@ -310,7 +304,7 @@ class TestImportCSV(unittest.TestCase): self._check_col(parsed_file, 2, "name3", "Text", ['c1', 'c2', 'c3']) def test_option_num_rows_no_header(self): - file_obj = bytes_io_from_str(textwrap.dedent( + file_obj = StringIO(textwrap.dedent( """\ ,, ,, @@ -336,7 +330,7 @@ class TestImportCSV(unittest.TestCase): self._check_col(parsed_file, 2, "", "Text", ['c1', 'c2']) def test_option_use_col_name_as_header(self): - file_obj = bytes_io_from_str(textwrap.dedent( + file_obj = StringIO(textwrap.dedent( """\ name1,name2,name3 a1,1,c1 @@ -361,7 +355,7 @@ class TestImportCSV(unittest.TestCase): self._check_col(parsed_file, 2, "name3", "Text", ["c1", "c2", "c3"]) def test_option_use_col_name_as_header_no_headers(self): - file_obj = bytes_io_from_str(textwrap.dedent( + file_obj = StringIO(textwrap.dedent( """\ ,,, ,,, diff --git a/sandbox/grist/imports/import_utils.py b/sandbox/grist/imports/import_utils.py index 37671568..1de748a0 100644 --- a/sandbox/grist/imports/import_utils.py +++ b/sandbox/grist/imports/import_utils.py @@ -14,6 +14,17 @@ if six.PY2: log = logging.getLogger(__name__) + +def empty(value): + """ Stringify the value and check that it has a length. """ + if value is None: + return True + if not isinstance(value, six.string_types): + value = six.text_type(value) + if len(value.strip()): + return False + return True + # Get path to an imported file. def get_path(file_source): importdir = os.environ.get('IMPORTDIR') or '/importdir' @@ -39,7 +50,7 @@ def _is_header(header, data_rows): """ # See if the row has any non-text values. for cell in header: - if not isinstance(cell.value, six.string_types) or _is_numeric(cell.value): + if not isinstance(cell, six.string_types) or _is_numeric(cell): return False @@ -48,7 +59,7 @@ def _is_header(header, data_rows): count_repeats = [0 for cell in header] for row in data_rows: for cell, header_cell in zip(row, header): - if cell.value and cell.value == header_cell.value: + if cell and cell == header_cell: return False return True @@ -59,7 +70,7 @@ def _count_nonempty(row): """ count = 0 for i, c in enumerate(row): - if not c.empty: + if not empty(c): count = i + 1 return count @@ -83,7 +94,7 @@ def expand_headers(headers, data_offset, rows): row_length = max(itertools.chain([len(headers)], (_count_nonempty(r) for r in itertools.islice(rows, data_offset, None)))) - header_values = [h.value.strip() for h in headers] + [u''] * (row_length - len(headers)) + header_values = [h.strip() for h in headers] + [u''] * (row_length - len(headers)) return header_values diff --git a/sandbox/grist/imports/messytables_test.py b/sandbox/grist/imports/messytables_test.py deleted file mode 100644 index d36ce731..00000000 --- a/sandbox/grist/imports/messytables_test.py +++ /dev/null @@ -1,22 +0,0 @@ -import unittest -import messytables -import os - -class TestMessyTables(unittest.TestCase): - - # Just a skeleton test - def test_any_tableset(self): - path = os.path.join(os.path.dirname(__file__), - "fixtures", "nyc_schools_progress_report_ec_2013.xlsx") - with open(path, "rb") as f: - table_set = messytables.any.any_tableset(f, extension=os.path.splitext(path)[1]) - - self.assertIsInstance(table_set, messytables.XLSTableSet) - self.assertEqual([t.name for t in table_set.tables], - ['Summary', 'Student Progress', 'Student Performance', 'School Environment', - 'Closing the Achievement Gap', 'Middle School Course Metrics', - 'All Information', 'Peer Groups']) - - -if __name__ == "__main__": - unittest.main()