Merge pull request #265 from yohanboniface/bye-messytable

Draft: remove dependency to messytables
This commit is contained in:
Alex Hall 2022-09-20 19:40:33 +02:00 committed by GitHub
commit b2b0950c9c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 130 additions and 137 deletions

View File

@ -1,11 +1,11 @@
""" """
Plugin for importing CSV files Plugin for importing CSV files
""" """
import os import codecs
import csv
import logging import logging
import chardet import chardet
import messytables
import six import six
from six.moves import zip from six.moves import zip
@ -90,108 +90,116 @@ def parse_file(file_path, parse_options=None):
""" """
parse_options = parse_options or {} parse_options = parse_options or {}
with open(file_path, "rb") as f: with codecs.open(file_path, "rb") as f:
sample = f.read(100000)
encoding = chardet.detect(sample)['encoding'] or "utf8"
# In addition, always prefer UTF8 over ASCII.
if encoding == 'ascii':
encoding = 'utf8'
log.info("Using encoding %s", encoding)
with codecs.open(file_path, mode="r", encoding=encoding) as f:
parsing_options, export_list = _parse_open_file(f, parse_options=parse_options) parsing_options, export_list = _parse_open_file(f, parse_options=parse_options)
return parsing_options, export_list return parsing_options, export_list
def _guess_dialect(file_obj, sample_size=100000, delimiters="\t,;|"):
    """
    Guess the CSV dialect of an open file by sniffing a sample of its content.

    Up to `sample_size` units are read from `file_obj` and handed to
    `csv.Sniffer`. Whatever happens (success, sniffing failure, or an exception
    raised while reading), `file_obj` is rewound to position 0 before this
    function returns, so the caller can parse the file from the start.

    Args:
      file_obj: readable and seekable file-like object whose `read()` result is
        accepted by `csv.Sniffer().sniff`.
      sample_size: maximum amount of data to read for sniffing.
      delimiters: string of candidate delimiter characters; the sniffer will
        not pick a delimiter outside of it.

    Returns:
      The dialect class produced by the sniffer, with `lineterminator` forced
      to a bare newline and `doublequote` forced to True, or `csv.excel` when
      the sniffer cannot determine a dialect (`csv.Error`).
    """
    try:
        sample = file_obj.read(sample_size)
        # Restrict allowed delimiters to prevent guessing other char than this list.
        dialect = csv.Sniffer().sniff(sample, delimiters=delimiters)
        log.info("Guessed dialect %s", dict(dialect.__dict__))
        # Mimic messytables default for now.
        dialect.lineterminator = "\n"
        dialect.doublequote = True
        return dialect
    except csv.Error:
        # Sniffing fails e.g. on empty input or when no allowed delimiter fits.
        log.info("Cannot guess dialect using Excel as fallback.")
        return csv.excel
    finally:
        # Always rewind so the caller reads the file from the beginning.
        file_obj.seek(0)
def _parse_open_file(file_obj, parse_options=None): def _parse_open_file(file_obj, parse_options=None):
options = {}
csv_keys = ['delimiter', 'quotechar', 'lineterminator', 'doublequote', 'skipinitialspace'] csv_keys = ['delimiter', 'quotechar', 'lineterminator', 'doublequote', 'skipinitialspace']
csv_options = {k: parse_options.get(k) for k in csv_keys} options = {}
if six.PY2: dialect = _guess_dialect(file_obj)
csv_options = {k: v.encode('utf8') if isinstance(v, six.text_type) else v
for k, v in csv_options.items()}
table_set = messytables.CSVTableSet(file_obj, csv_options = {}
delimiter=csv_options['delimiter'], for key in csv_keys:
quotechar=csv_options['quotechar'], value = parse_options.get(key, getattr(dialect, key, None))
lineterminator=csv_options['lineterminator'], if value is not None:
doublequote=csv_options['doublequote'], csv_options[key] = value
skipinitialspace=csv_options['skipinitialspace'])
reader = csv.reader(file_obj, **csv_options)
rows = list(reader)
sample_len = 100
sample_rows = rows[:sample_len]
data_offset, headers = import_utils.headers_guess(sample_rows)
# Make sure all header values are strings.
for i, header in enumerate(headers):
if not isinstance(header, six.string_types):
headers[i] = six.text_type(header)
log.info("Guessed data_offset as %s", data_offset)
log.info("Guessed headers as: %s", headers)
have_guessed_headers = any(headers)
include_col_names_as_headers = parse_options.get('include_col_names_as_headers',
have_guessed_headers)
if include_col_names_as_headers and not have_guessed_headers:
# use first line as headers
data_offset, first_row = import_utils.find_first_non_empty_row(sample_rows)
headers = import_utils.expand_headers(first_row, data_offset, sample_rows)
elif not include_col_names_as_headers and have_guessed_headers:
# move guessed headers to data
data_offset -= 1
headers = [''] * len(headers)
rows = rows[data_offset:]
num_rows = parse_options.get('NUM_ROWS', 0) num_rows = parse_options.get('NUM_ROWS', 0)
table_data_with_types = parse_data.get_table_data(rows, len(headers), num_rows)
# Messytable's encoding detection uses too small a sample, so we override it here. # Identify and remove empty columns, and populate separate metadata and data lists.
sample = file_obj.read(100000) column_metadata = []
table_set.encoding = chardet.detect(sample)['encoding'] table_data = []
# In addition, always prefer UTF8 over ASCII. for col_data, header in zip(table_data_with_types, headers):
if table_set.encoding == 'ascii': if not header and all(val == "" for val in col_data["data"]):
table_set.encoding = 'utf8' continue # empty column
data = col_data.pop("data")
col_data["id"] = header
column_metadata.append(col_data)
table_data.append(data)
export_list = [] if not table_data:
# A table set is a collection of tables: log.info("No data found. Aborting CSV import.")
for row_set in table_set.tables: # Don't add tables with no columns.
table_name = None return {}, []
sample_rows = list(row_set.sample)
# Messytables doesn't guess whether headers are present, so we need to step in.
data_offset, headers = import_utils.headers_guess(sample_rows)
# Make sure all header values are strings. guessed = reader.dialect
for i, header in enumerate(headers): quoting = parse_options.get('quoting')
if not isinstance(header, six.string_types): options = {"delimiter": parse_options.get('delimiter', guessed.delimiter),
headers[i] = six.text_type(header) "doublequote": parse_options.get('doublequote', guessed.doublequote),
"lineterminator": parse_options.get('lineterminator', guessed.lineterminator),
"quotechar": parse_options.get('quotechar', guessed.quotechar),
"skipinitialspace": parse_options.get('skipinitialspace', guessed.skipinitialspace),
"include_col_names_as_headers": include_col_names_as_headers,
"start_with_row": 1,
"NUM_ROWS": num_rows,
"SCHEMA": SCHEMA
}
log.info("Guessed data_offset as %s", data_offset) log.info("Output table with %d columns", len(column_metadata))
log.info("Guessed headers as: %s", headers) for c in column_metadata:
log.debug("Output column %s", c)
have_guessed_headers = any(headers) export_list = [{
include_col_names_as_headers = parse_options.get('include_col_names_as_headers', "table_name": None,
have_guessed_headers) "column_metadata": column_metadata,
"table_data": table_data
if include_col_names_as_headers and not have_guessed_headers: }]
# use first line as headers
data_offset, first_row = import_utils.find_first_non_empty_row(sample_rows)
headers = import_utils.expand_headers(first_row, data_offset, sample_rows)
elif not include_col_names_as_headers and have_guessed_headers:
# move guessed headers to data
data_offset -= 1
headers = [''] * len(headers)
row_set.register_processor(messytables.offset_processor(data_offset))
rows = [
[cell.value for cell in row]
for row in row_set
]
table_data_with_types = parse_data.get_table_data(rows, len(headers), num_rows)
# Identify and remove empty columns, and populate separate metadata and data lists.
column_metadata = []
table_data = []
for col_data, header in zip(table_data_with_types, headers):
if not header and all(val == "" for val in col_data["data"]):
continue # empty column
data = col_data.pop("data")
col_data["id"] = header
column_metadata.append(col_data)
table_data.append(data)
if not table_data:
# Don't add tables with no columns.
continue
guessed = row_set._dialect
quoting = parse_options.get('quoting')
options = {"delimiter": parse_options.get('delimiter', guessed.delimiter),
"doublequote": parse_options.get('doublequote', guessed.doublequote),
"lineterminator": parse_options.get('lineterminator', guessed.lineterminator),
"quotechar": parse_options.get('quotechar', guessed.quotechar),
"skipinitialspace": parse_options.get('skipinitialspace', guessed.skipinitialspace),
"include_col_names_as_headers": include_col_names_as_headers,
"start_with_row": 1,
"NUM_ROWS": num_rows,
"SCHEMA": SCHEMA
}
log.info("Output table %r with %d columns", table_name, len(column_metadata))
for c in column_metadata:
log.debug("Output column %s", c)
export_list.append({
"table_name": table_name,
"column_metadata": column_metadata,
"table_data": table_data
})
return options, export_list return options, export_list

View File

@ -2,7 +2,7 @@
import os import os
import textwrap import textwrap
import unittest import unittest
from six import BytesIO, text_type from six import StringIO, text_type
import csv import csv
from imports import import_csv from imports import import_csv
@ -12,12 +12,6 @@ def _get_fixture(filename):
return os.path.join(os.path.dirname(__file__), "fixtures", filename) return os.path.join(os.path.dirname(__file__), "fixtures", filename)
def bytes_io_from_str(string):
if isinstance(string, text_type):
string = string.encode("utf8")
return BytesIO(string)
class TestImportCSV(unittest.TestCase): class TestImportCSV(unittest.TestCase):
maxDiff = None maxDiff = None
@ -107,7 +101,7 @@ class TestImportCSV(unittest.TestCase):
u'']) u''])
def test_wrong_cols1(self): def test_wrong_cols1(self):
file_obj = bytes_io_from_str(textwrap.dedent( file_obj = StringIO(textwrap.dedent(
"""\ """\
name1, name2, name3 name1, name2, name3
a1,b1,c1 a1,b1,c1
@ -124,7 +118,7 @@ class TestImportCSV(unittest.TestCase):
self._check_col(parsed_file, 2, "name3", "Text", ["c1", "", ""]) self._check_col(parsed_file, 2, "name3", "Text", ["c1", "", ""])
def test_wrong_cols2(self): def test_wrong_cols2(self):
file_obj = bytes_io_from_str(textwrap.dedent( file_obj = StringIO(textwrap.dedent(
"""\ """\
name1 name1
a1,b1 a1,b1
@ -140,7 +134,7 @@ class TestImportCSV(unittest.TestCase):
self._check_col(parsed_file, 2, "", "Text", ["", "c2"]) self._check_col(parsed_file, 2, "", "Text", ["", "c2"])
def test_offset(self): def test_offset(self):
file_obj = bytes_io_from_str(textwrap.dedent( file_obj = StringIO(textwrap.dedent(
"""\ """\
,,,,,,, ,,,,,,,
name1,name2,name3 name1,name2,name3
@ -160,7 +154,7 @@ class TestImportCSV(unittest.TestCase):
self._check_col(parsed_file, 3, "", "Text", ["", "", "d4"]) self._check_col(parsed_file, 3, "", "Text", ["", "", "d4"])
def test_offset_no_header(self): def test_offset_no_header(self):
file_obj = bytes_io_from_str(textwrap.dedent( file_obj = StringIO(textwrap.dedent(
"""\ """\
4,b1,c1 4,b1,c1
4,b2,c2 4,b2,c2
@ -176,7 +170,7 @@ class TestImportCSV(unittest.TestCase):
self._check_col(parsed_file, 2, "", "Text", ["c1", "c2", "c3"]) self._check_col(parsed_file, 2, "", "Text", ["c1", "c2", "c3"])
def test_empty_headers(self): def test_empty_headers(self):
file_obj = bytes_io_from_str(textwrap.dedent( file_obj = StringIO(textwrap.dedent(
"""\ """\
,,-,- ,,-,-
b,a,a,a,a b,a,a,a,a
@ -194,7 +188,7 @@ class TestImportCSV(unittest.TestCase):
self._check_col(parsed_file, 3, "-", "Text", ["a", "a", "a"]) self._check_col(parsed_file, 3, "-", "Text", ["a", "a", "a"])
self._check_col(parsed_file, 4, "", "Text", ["a", "a", "a"]) self._check_col(parsed_file, 4, "", "Text", ["a", "a", "a"])
file_obj = bytes_io_from_str(textwrap.dedent( file_obj = StringIO(textwrap.dedent(
"""\ """\
-,-,-,-,-,- -,-,-,-,-,-
b,a,a,a,a b,a,a,a,a
@ -212,7 +206,7 @@ class TestImportCSV(unittest.TestCase):
self._check_col(parsed_file, 5, "-", "Text", ["", "", ""]) self._check_col(parsed_file, 5, "-", "Text", ["", "", ""])
def test_guess_missing_user_option(self): def test_guess_missing_user_option(self):
file_obj = bytes_io_from_str(textwrap.dedent( file_obj = StringIO(textwrap.dedent(
"""\ """\
name1,;name2,;name3 name1,;name2,;name3
a1,;b1,;c1 a1,;b1,;c1
@ -242,7 +236,7 @@ class TestImportCSV(unittest.TestCase):
self._check_col(parsed_file, 2, ";name3", "Text", [";c1", ";c2", ";c3"]) self._check_col(parsed_file, 2, ";name3", "Text", [";c1", ";c2", ";c3"])
def test_one_line_file_no_header(self): def test_one_line_file_no_header(self):
file_obj = bytes_io_from_str(textwrap.dedent( file_obj = StringIO(textwrap.dedent(
"""\ """\
2,name2,name3 2,name2,name3
""")) """))
@ -256,7 +250,7 @@ class TestImportCSV(unittest.TestCase):
self._check_col(parsed_file, 2, "", "Text", ["name3"]) self._check_col(parsed_file, 2, "", "Text", ["name3"])
def test_one_line_file_with_header(self): def test_one_line_file_with_header(self):
file_obj = bytes_io_from_str(textwrap.dedent( file_obj = StringIO(textwrap.dedent(
"""\ """\
name1,name2,name3 name1,name2,name3
""")) """))
@ -270,7 +264,7 @@ class TestImportCSV(unittest.TestCase):
self._check_col(parsed_file, 2, "name3", "Text", []) self._check_col(parsed_file, 2, "name3", "Text", [])
def test_empty_file(self): def test_empty_file(self):
file_obj = bytes_io_from_str(textwrap.dedent( file_obj = StringIO(textwrap.dedent(
"""\ """\
""")) """))
@ -278,7 +272,7 @@ class TestImportCSV(unittest.TestCase):
self.assertEqual(parsed_file, ({}, [])) self.assertEqual(parsed_file, ({}, []))
def test_option_num_rows(self): def test_option_num_rows(self):
file_obj = bytes_io_from_str(textwrap.dedent( file_obj = StringIO(textwrap.dedent(
"""\ """\
name1,name2,name3 name1,name2,name3
a1,b1,c1 a1,b1,c1
@ -310,7 +304,7 @@ class TestImportCSV(unittest.TestCase):
self._check_col(parsed_file, 2, "name3", "Text", ['c1', 'c2', 'c3']) self._check_col(parsed_file, 2, "name3", "Text", ['c1', 'c2', 'c3'])
def test_option_num_rows_no_header(self): def test_option_num_rows_no_header(self):
file_obj = bytes_io_from_str(textwrap.dedent( file_obj = StringIO(textwrap.dedent(
"""\ """\
,, ,,
,, ,,
@ -336,7 +330,7 @@ class TestImportCSV(unittest.TestCase):
self._check_col(parsed_file, 2, "", "Text", ['c1', 'c2']) self._check_col(parsed_file, 2, "", "Text", ['c1', 'c2'])
def test_option_use_col_name_as_header(self): def test_option_use_col_name_as_header(self):
file_obj = bytes_io_from_str(textwrap.dedent( file_obj = StringIO(textwrap.dedent(
"""\ """\
name1,name2,name3 name1,name2,name3
a1,1,c1 a1,1,c1
@ -361,7 +355,7 @@ class TestImportCSV(unittest.TestCase):
self._check_col(parsed_file, 2, "name3", "Text", ["c1", "c2", "c3"]) self._check_col(parsed_file, 2, "name3", "Text", ["c1", "c2", "c3"])
def test_option_use_col_name_as_header_no_headers(self): def test_option_use_col_name_as_header_no_headers(self):
file_obj = bytes_io_from_str(textwrap.dedent( file_obj = StringIO(textwrap.dedent(
"""\ """\
,,, ,,,
,,, ,,,

View File

@ -14,6 +14,15 @@ if six.PY2:
log = logging.getLogger(__name__) log = logging.getLogger(__name__)
def empty(value):
    """
    Tell whether a cell value is blank.

    Returns True when `value` is None, or when its text form contains nothing
    but whitespace. Values that are not already strings are converted with
    `six.text_type` before being checked.
    """
    if value is None:
        return True
    text = value if isinstance(value, six.string_types) else six.text_type(value)
    return not text.strip()
# Get path to an imported file. # Get path to an imported file.
def get_path(file_source): def get_path(file_source):
importdir = os.environ.get('IMPORTDIR') or '/importdir' importdir = os.environ.get('IMPORTDIR') or '/importdir'
@ -39,7 +48,7 @@ def _is_header(header, data_rows):
""" """
# See if the row has any non-text values. # See if the row has any non-text values.
for cell in header: for cell in header:
if not isinstance(cell.value, six.string_types) or _is_numeric(cell.value): if not isinstance(cell, six.string_types) or _is_numeric(cell):
return False return False
@ -48,7 +57,7 @@ def _is_header(header, data_rows):
count_repeats = [0 for cell in header] count_repeats = [0 for cell in header]
for row in data_rows: for row in data_rows:
for cell, header_cell in zip(row, header): for cell, header_cell in zip(row, header):
if cell.value and cell.value == header_cell.value: if cell and cell == header_cell:
return False return False
return True return True
@ -59,7 +68,7 @@ def _count_nonempty(row):
""" """
count = 0 count = 0
for i, c in enumerate(row): for i, c in enumerate(row):
if not c.empty: if not empty(c):
count = i + 1 count = i + 1
return count return count
@ -83,7 +92,7 @@ def expand_headers(headers, data_offset, rows):
row_length = max(itertools.chain([len(headers)], row_length = max(itertools.chain([len(headers)],
(_count_nonempty(r) for r in itertools.islice(rows, data_offset, (_count_nonempty(r) for r in itertools.islice(rows, data_offset,
None)))) None))))
header_values = [h.value.strip() for h in headers] + [u''] * (row_length - len(headers)) header_values = [h.strip() for h in headers] + [u''] * (row_length - len(headers))
return header_values return header_values

View File

@ -1,22 +0,0 @@
import unittest
import messytables
import os
class TestMessyTables(unittest.TestCase):
    """Skeleton test checking that messytables can open the XLSX fixture."""

    def test_any_tableset(self):
        """The fixture is recognised as an XLS table set with the expected sheets."""
        fixture_path = os.path.join(
            os.path.dirname(__file__),
            "fixtures",
            "nyc_schools_progress_report_ec_2013.xlsx",
        )
        extension = os.path.splitext(fixture_path)[1]
        expected_names = [
            'Summary',
            'Student Progress',
            'Student Performance',
            'School Environment',
            'Closing the Achievement Gap',
            'Middle School Course Metrics',
            'All Information',
            'Peer Groups',
        ]
        with open(fixture_path, "rb") as fixture:
            table_set = messytables.any.any_tableset(fixture, extension=extension)
            self.assertIsInstance(table_set, messytables.XLSTableSet)
            self.assertEqual([table.name for table in table_set.tables], expected_names)
if __name__ == "__main__":
unittest.main()

View File

@ -9,6 +9,7 @@ tests under 'arc unit' and under Jenkins.
./sandbox/nacl/bin/run python /grist/runtests.py [--xunit] ./sandbox/nacl/bin/run python /grist/runtests.py [--xunit]
""" """
import codecs import codecs
import logging
import os import os
import sys import sys
import unittest import unittest
@ -30,6 +31,9 @@ def main():
utf8_stdout = codecs.getwriter('utf8')(utf8_stdout) utf8_stdout = codecs.getwriter('utf8')(utf8_stdout)
test_runner = xmlrunner.XMLTestRunner(stream=utf8_stdout) test_runner = xmlrunner.XMLTestRunner(stream=utf8_stdout)
if "-v" in argv or "--verbose" in argv:
logging.basicConfig(level=logging.DEBUG)
if all(arg.startswith("-") for arg in argv[1:]): if all(arg.startswith("-") for arg in argv[1:]):
argv.insert(1, "discover") argv.insert(1, "discover")