From 9bbf66e50e35c78eca7f57007a8b83e30bc81ecb Mon Sep 17 00:00:00 2001
From: Yohan Boniface <yohanboniface@free.fr>
Date: Thu, 1 Sep 2022 18:56:30 +0200
Subject: [PATCH] wip: remove dependency on messytables

---
 sandbox/grist/imports/import_csv.py       | 191 +++++++++++-----------
 sandbox/grist/imports/import_csv_test.py  |  36 ++--
 sandbox/grist/imports/import_utils.py     |  19 ++-
 sandbox/grist/imports/messytables_test.py |  22 ---
 4 files changed, 130 insertions(+), 138 deletions(-)
 delete mode 100644 sandbox/grist/imports/messytables_test.py

diff --git a/sandbox/grist/imports/import_csv.py b/sandbox/grist/imports/import_csv.py
index 431b69a1..5c3cc046 100644
--- a/sandbox/grist/imports/import_csv.py
+++ b/sandbox/grist/imports/import_csv.py
@@ -1,11 +1,11 @@
 """
 Plugin for importing CSV files
 """
-import os
+import codecs
+import csv
 import logging
 
 import chardet
-import messytables
 import six
 from six.moves import zip
 
@@ -90,108 +90,117 @@ def parse_file(file_path, parse_options=None):
   """
   parse_options = parse_options or {}
 
-  with open(file_path, "rb") as f:
+  with codecs.open(file_path, "rb") as f:
+    sample = f.read(100000)
+    encoding = chardet.detect(sample)['encoding'] or "utf8"
+    # Always prefer UTF8 over ASCII, since the detection above only looked at a sample.
+    if encoding == 'ascii':
+      encoding = 'utf8'
+  log.info("Using encoding %s" % encoding)
+
+  with codecs.open(file_path, mode="r", encoding=encoding) as f:
     parsing_options, export_list = _parse_open_file(f, parse_options=parse_options)
     return parsing_options, export_list
 
 
+def _guess_dialect(file_obj):
+  try:
+    # Restrict the candidate delimiters so the sniffer cannot pick a character outside this list.
+    dialect = csv.Sniffer().sniff(file_obj.read(100000), delimiters=['\t', ',', ';', '|'])
+    log.info("Guessed dialect %s" % dict(dialect.__dict__))
+  except csv.Error:
+    log.info("Cannot guess dialect using Excel as fallback.")
+    return csv.excel
+  else:
+    return dialect
+  finally:
+    file_obj.seek(0)
+
 def _parse_open_file(file_obj, parse_options=None):
-  options = {}
   csv_keys = ['delimiter', 'quotechar', 'lineterminator', 'doublequote', 'skipinitialspace']
-  csv_options = {k: parse_options.get(k) for k in csv_keys}
+  options = {}
+  dialect = _guess_dialect(file_obj)
+
+  csv_options = {k: parse_options.get(k, getattr(dialect, k, None)) for k in csv_keys}
   if six.PY2:
     csv_options = {k: v.encode('utf8') if isinstance(v, six.text_type) else v
                    for k, v in csv_options.items()}
 
-  table_set = messytables.CSVTableSet(file_obj,
-                                      delimiter=csv_options['delimiter'],
-                                      quotechar=csv_options['quotechar'],
-                                      lineterminator=csv_options['lineterminator'],
-                                      doublequote=csv_options['doublequote'],
-                                      skipinitialspace=csv_options['skipinitialspace'])
+  csv_options = {k: v for k, v in csv_options.items() if v is not None}
+  reader = csv.reader(file_obj, **csv_options)
 
   num_rows = parse_options.get('NUM_ROWS', 0)
 
-  # Messytable's encoding detection uses too small a sample, so we override it here.
-  sample = file_obj.read(100000)
-  table_set.encoding = chardet.detect(sample)['encoding']
-  # In addition, always prefer UTF8 over ASCII.
-  if table_set.encoding == 'ascii':
-    table_set.encoding = 'utf8'
-
-  export_list = []
-  # A table set is a collection of tables:
-  for row_set in table_set.tables:
-    table_name = None
-    sample_rows = list(row_set.sample)
-    # Messytables doesn't guess whether headers are present, so we need to step in.
-    data_offset, headers = import_utils.headers_guess(sample_rows)
-
-    # Make sure all header values are strings.
-    for i, header in enumerate(headers):
-      if not isinstance(header, six.string_types):
-        headers[i] = six.text_type(header)
-
-    log.info("Guessed data_offset as %s", data_offset)
-    log.info("Guessed headers as: %s", headers)
-
-    have_guessed_headers = any(headers)
-    include_col_names_as_headers = parse_options.get('include_col_names_as_headers',
-                                                     have_guessed_headers)
-
-    if include_col_names_as_headers and not have_guessed_headers:
-      # use first line as headers
-      data_offset, first_row = import_utils.find_first_non_empty_row(sample_rows)
-      headers = import_utils.expand_headers(first_row, data_offset, sample_rows)
-
-    elif not include_col_names_as_headers and have_guessed_headers:
-      # move guessed headers to data
-      data_offset -= 1
-      headers = [''] * len(headers)
-
-    row_set.register_processor(messytables.offset_processor(data_offset))
-    rows = [
-      [cell.value for cell in row]
-      for row in row_set
-    ]
-    table_data_with_types = parse_data.get_table_data(rows, len(headers), num_rows)
-
-    # Identify and remove empty columns, and populate separate metadata and data lists.
-    column_metadata = []
-    table_data = []
-    for col_data, header in zip(table_data_with_types, headers):
-      if not header and all(val == "" for val in col_data["data"]):
-        continue # empty column
-      data = col_data.pop("data")
-      col_data["id"] = header
-      column_metadata.append(col_data)
-      table_data.append(data)
-
-    if not table_data:
-      # Don't add tables with no columns.
-      continue
-
-    guessed = row_set._dialect
-    quoting = parse_options.get('quoting')
-    options = {"delimiter": parse_options.get('delimiter', guessed.delimiter),
-               "doublequote": parse_options.get('doublequote', guessed.doublequote),
-               "lineterminator": parse_options.get('lineterminator', guessed.lineterminator),
-               "quotechar": parse_options.get('quotechar', guessed.quotechar),
-               "skipinitialspace": parse_options.get('skipinitialspace', guessed.skipinitialspace),
-               "include_col_names_as_headers": include_col_names_as_headers,
-               "start_with_row": 1,
-               "NUM_ROWS": num_rows,
-               "SCHEMA": SCHEMA
-               }
-
-    log.info("Output table %r with %d columns", table_name, len(column_metadata))
-    for c in column_metadata:
-      log.debug("Output column %s", c)
-    export_list.append({
-      "table_name": table_name,
-      "column_metadata": column_metadata,
-      "table_data": table_data
-    })
+
+  table_name = None
+  rows = list(reader)
+  sample_len = 100
+  sample_rows = rows[:sample_len]
+  data_offset, headers = import_utils.headers_guess(sample_rows)
+
+  # Make sure all header values are strings.
+  for i, header in enumerate(headers):
+    if not isinstance(header, six.string_types):
+      headers[i] = six.text_type(header)
+
+  log.info("Guessed data_offset as %s", data_offset)
+  log.info("Guessed headers as: %s", headers)
+
+  have_guessed_headers = any(headers)
+  include_col_names_as_headers = parse_options.get('include_col_names_as_headers',
+                                                   have_guessed_headers)
+
+  if include_col_names_as_headers and not have_guessed_headers:
+    # use first line as headers
+    data_offset, first_row = import_utils.find_first_non_empty_row(sample_rows)
+    headers = import_utils.expand_headers(first_row, data_offset, sample_rows)
+
+  elif not include_col_names_as_headers and have_guessed_headers:
+    # move guessed headers to data
+    data_offset -= 1
+    headers = [''] * len(headers)
+
+  rows = rows[data_offset:]  # TODO: would popping the leading rows in place be faster?
+  table_data_with_types = parse_data.get_table_data(rows, len(headers), num_rows)
+
+  # Identify and remove empty columns, and populate separate metadata and data lists.
+  column_metadata = []
+  table_data = []
+  for col_data, header in zip(table_data_with_types, headers):
+    if not header and all(val == "" for val in col_data["data"]):
+      continue # empty column
+    data = col_data.pop("data")
+    col_data["id"] = header
+    column_metadata.append(col_data)
+    table_data.append(data)
+
+  if not table_data:
+    log.info("No data found. Aborting CSV import.")
+    # Don't add tables with no columns.
+    return {}, []
+
+  guessed = reader.dialect
+  quoting = parse_options.get('quoting')
+  options = {"delimiter": parse_options.get('delimiter', guessed.delimiter),
+             "doublequote": parse_options.get('doublequote', guessed.doublequote),
+             "lineterminator": parse_options.get('lineterminator', guessed.lineterminator),
+             "quotechar": parse_options.get('quotechar', guessed.quotechar),
+             "skipinitialspace": parse_options.get('skipinitialspace', guessed.skipinitialspace),
+             "include_col_names_as_headers": include_col_names_as_headers,
+             "start_with_row": 1,
+             "NUM_ROWS": num_rows,
+             "SCHEMA": SCHEMA
+             }
+
+  log.info("Output table %r with %d columns", table_name, len(column_metadata))
+  for c in column_metadata:
+    log.debug("Output column %s", c)
+
+  export_list = [{
+    "table_name": table_name,
+    "column_metadata": column_metadata,
+    "table_data": table_data
+  }]
 
   return options, export_list
 
diff --git a/sandbox/grist/imports/import_csv_test.py b/sandbox/grist/imports/import_csv_test.py
index 320129ba..c715cd6e 100644
--- a/sandbox/grist/imports/import_csv_test.py
+++ b/sandbox/grist/imports/import_csv_test.py
@@ -2,7 +2,7 @@
 import os
 import textwrap
 import unittest
-from six import BytesIO, text_type
+from six import StringIO, text_type
 import csv
 
 from imports import import_csv
@@ -12,12 +12,6 @@ def _get_fixture(filename):
   return os.path.join(os.path.dirname(__file__), "fixtures", filename)
 
 
-def bytes_io_from_str(string):
-  if isinstance(string, text_type):
-    string = string.encode("utf8")
-  return BytesIO(string)
-
-
 class TestImportCSV(unittest.TestCase):
 
   maxDiff = None
@@ -107,7 +101,7 @@ class TestImportCSV(unittest.TestCase):
                      u''])
 
   def test_wrong_cols1(self):
-    file_obj = bytes_io_from_str(textwrap.dedent(
+    file_obj = StringIO(textwrap.dedent(
       """\
       name1, name2, name3
       a1,b1,c1
@@ -124,7 +118,7 @@ class TestImportCSV(unittest.TestCase):
     self._check_col(parsed_file, 2, "name3", "Text", ["c1", "", ""])
 
   def test_wrong_cols2(self):
-    file_obj = bytes_io_from_str(textwrap.dedent(
+    file_obj = StringIO(textwrap.dedent(
       """\
       name1
       a1,b1
@@ -140,7 +134,7 @@ class TestImportCSV(unittest.TestCase):
     self._check_col(parsed_file, 2, "", "Text", ["", "c2"])
 
   def test_offset(self):
-    file_obj = bytes_io_from_str(textwrap.dedent(
+    file_obj = StringIO(textwrap.dedent(
       """\
       ,,,,,,,
       name1,name2,name3
@@ -160,7 +154,7 @@ class TestImportCSV(unittest.TestCase):
     self._check_col(parsed_file, 3, "", "Text", ["", "", "d4"])
 
   def test_offset_no_header(self):
-    file_obj = bytes_io_from_str(textwrap.dedent(
+    file_obj = StringIO(textwrap.dedent(
       """\
       4,b1,c1
       4,b2,c2
@@ -176,7 +170,7 @@ class TestImportCSV(unittest.TestCase):
     self._check_col(parsed_file, 2, "", "Text", ["c1", "c2", "c3"])
 
   def test_empty_headers(self):
-    file_obj = bytes_io_from_str(textwrap.dedent(
+    file_obj = StringIO(textwrap.dedent(
       """\
       ,,-,-
       b,a,a,a,a
@@ -194,7 +188,7 @@ class TestImportCSV(unittest.TestCase):
     self._check_col(parsed_file, 3, "-", "Text", ["a", "a", "a"])
     self._check_col(parsed_file, 4, "", "Text", ["a", "a", "a"])
 
-    file_obj = bytes_io_from_str(textwrap.dedent(
+    file_obj = StringIO(textwrap.dedent(
       """\
       -,-,-,-,-,-
       b,a,a,a,a
@@ -212,7 +206,7 @@ class TestImportCSV(unittest.TestCase):
     self._check_col(parsed_file, 5, "-", "Text", ["", "", ""])
 
   def test_guess_missing_user_option(self):
-    file_obj = bytes_io_from_str(textwrap.dedent(
+    file_obj = StringIO(textwrap.dedent(
       """\
       name1,;name2,;name3
       a1,;b1,;c1
@@ -242,7 +236,7 @@ class TestImportCSV(unittest.TestCase):
     self._check_col(parsed_file, 2, ";name3", "Text", [";c1", ";c2", ";c3"])
 
   def test_one_line_file_no_header(self):
-    file_obj = bytes_io_from_str(textwrap.dedent(
+    file_obj = StringIO(textwrap.dedent(
       """\
       2,name2,name3
       """))
@@ -256,7 +250,7 @@ class TestImportCSV(unittest.TestCase):
     self._check_col(parsed_file, 2, "", "Text", ["name3"])
 
   def test_one_line_file_with_header(self):
-    file_obj = bytes_io_from_str(textwrap.dedent(
+    file_obj = StringIO(textwrap.dedent(
       """\
       name1,name2,name3
       """))
@@ -270,7 +264,7 @@ class TestImportCSV(unittest.TestCase):
     self._check_col(parsed_file, 2, "name3", "Text", [])
 
   def test_empty_file(self):
-    file_obj = bytes_io_from_str(textwrap.dedent(
+    file_obj = StringIO(textwrap.dedent(
       """\
       """))
 
@@ -278,7 +272,7 @@ class TestImportCSV(unittest.TestCase):
     self.assertEqual(parsed_file, ({}, []))
 
   def test_option_num_rows(self):
-    file_obj = bytes_io_from_str(textwrap.dedent(
+    file_obj = StringIO(textwrap.dedent(
       """\
       name1,name2,name3
       a1,b1,c1
@@ -310,7 +304,7 @@ class TestImportCSV(unittest.TestCase):
     self._check_col(parsed_file, 2, "name3", "Text", ['c1', 'c2', 'c3'])
 
   def test_option_num_rows_no_header(self):
-    file_obj = bytes_io_from_str(textwrap.dedent(
+    file_obj = StringIO(textwrap.dedent(
       """\
       ,,
       ,,
@@ -336,7 +330,7 @@ class TestImportCSV(unittest.TestCase):
     self._check_col(parsed_file, 2, "", "Text", ['c1', 'c2'])
 
   def test_option_use_col_name_as_header(self):
-    file_obj = bytes_io_from_str(textwrap.dedent(
+    file_obj = StringIO(textwrap.dedent(
       """\
       name1,name2,name3
       a1,1,c1
@@ -361,7 +355,7 @@ class TestImportCSV(unittest.TestCase):
     self._check_col(parsed_file, 2, "name3", "Text", ["c1", "c2", "c3"])
 
   def test_option_use_col_name_as_header_no_headers(self):
-    file_obj = bytes_io_from_str(textwrap.dedent(
+    file_obj = StringIO(textwrap.dedent(
       """\
       ,,,
       ,,,
diff --git a/sandbox/grist/imports/import_utils.py b/sandbox/grist/imports/import_utils.py
index 37671568..1de748a0 100644
--- a/sandbox/grist/imports/import_utils.py
+++ b/sandbox/grist/imports/import_utils.py
@@ -14,6 +14,17 @@ if six.PY2:
 
 log = logging.getLogger(__name__)
 
+
+def empty(value):
+  """ Return True if value is None or its stringified form is empty or whitespace only. """
+  if value is None:
+    return True
+  if not isinstance(value, six.string_types):
+    value = six.text_type(value)
+  if len(value.strip()):
+    return False
+  return True
+
 # Get path to an imported file.
 def get_path(file_source):
   importdir = os.environ.get('IMPORTDIR') or '/importdir'
@@ -39,7 +50,7 @@ def _is_header(header, data_rows):
   """
   # See if the row has any non-text values.
   for cell in header:
-    if not isinstance(cell.value, six.string_types) or _is_numeric(cell.value):
+    if not isinstance(cell, six.string_types) or _is_numeric(cell):
       return False
 
 
@@ -48,7 +59,7 @@ def _is_header(header, data_rows):
   count_repeats = [0 for cell in header]
   for row in data_rows:
     for cell, header_cell in zip(row, header):
-      if cell.value and cell.value == header_cell.value:
+      if cell and cell == header_cell:
         return False
 
   return True
@@ -59,7 +70,7 @@ def _count_nonempty(row):
   """
   count = 0
   for i, c in enumerate(row):
-    if not c.empty:
+    if not empty(c):
       count = i + 1
   return count
 
@@ -83,7 +94,7 @@ def expand_headers(headers, data_offset, rows):
   row_length = max(itertools.chain([len(headers)],
                                    (_count_nonempty(r) for r in itertools.islice(rows, data_offset,
                                                                                  None))))
-  header_values = [h.value.strip() for h in headers] + [u''] * (row_length - len(headers))
+  header_values = [h.strip() for h in headers] + [u''] * (row_length - len(headers))
   return header_values
 
 
diff --git a/sandbox/grist/imports/messytables_test.py b/sandbox/grist/imports/messytables_test.py
deleted file mode 100644
index d36ce731..00000000
--- a/sandbox/grist/imports/messytables_test.py
+++ /dev/null
@@ -1,22 +0,0 @@
-import unittest
-import messytables
-import os
-
-class TestMessyTables(unittest.TestCase):
-
-  # Just a skeleton test
-  def test_any_tableset(self):
-    path = os.path.join(os.path.dirname(__file__),
-                        "fixtures", "nyc_schools_progress_report_ec_2013.xlsx")
-    with open(path, "rb") as f:
-      table_set = messytables.any.any_tableset(f, extension=os.path.splitext(path)[1])
-
-    self.assertIsInstance(table_set, messytables.XLSTableSet)
-    self.assertEqual([t.name for t in table_set.tables],
-                     ['Summary', 'Student Progress', 'Student Performance', 'School Environment',
-                      'Closing the Achievement Gap', 'Middle School Course Metrics',
-                      'All Information', 'Peer Groups'])
-
-
-if __name__ == "__main__":
-  unittest.main()