Merge pull request #265 from yohanboniface/bye-messytable

Draft: remove dependency on messytables
pull/289/head
Alex Hall 2 years ago committed by GitHub
commit b2b0950c9c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -1,11 +1,11 @@
""" """
Plugin for importing CSV files Plugin for importing CSV files
""" """
import os import codecs
import csv
import logging import logging
import chardet import chardet
import messytables
import six import six
from six.moves import zip from six.moves import zip
@ -90,108 +90,116 @@ def parse_file(file_path, parse_options=None):
""" """
parse_options = parse_options or {} parse_options = parse_options or {}
with open(file_path, "rb") as f: with codecs.open(file_path, "rb") as f:
sample = f.read(100000)
encoding = chardet.detect(sample)['encoding'] or "utf8"
# In addition, always prefer UTF8 over ASCII.
if encoding == 'ascii':
encoding = 'utf8'
log.info("Using encoding %s", encoding)
with codecs.open(file_path, mode="r", encoding=encoding) as f:
parsing_options, export_list = _parse_open_file(f, parse_options=parse_options) parsing_options, export_list = _parse_open_file(f, parse_options=parse_options)
return parsing_options, export_list return parsing_options, export_list
def _guess_dialect(file_obj):
    """Sniff the CSV dialect from a sample of file_obj.

    Reads up to 100000 characters for the sniffer, then always rewinds the
    file to the start so the caller can parse from the beginning.

    Returns a csv.Dialect: the sniffed dialect on success, csv.excel as a
    fallback when sniffing fails (e.g. empty or single-column input).
    """
    try:
        # Restrict allowed delimiters to prevent guessing other char than this list.
        dialect = csv.Sniffer().sniff(file_obj.read(100000), delimiters=['\t', ',', ';', '|'])
        log.info("Guessed dialect %s", dict(dialect.__dict__))
        # Mimic messytables default for now.
        dialect.lineterminator = "\n"
        dialect.doublequote = True
        return dialect
    except csv.Error:
        # Fix: the original message ("Cannot guess dialect using Excel as
        # fallback.") read as if Excel were used for guessing; make it clear
        # the excel dialect is the fallback.
        log.info("Cannot guess dialect; using the excel dialect as fallback.")
        return csv.excel
    finally:
        file_obj.seek(0)
def _parse_open_file(file_obj, parse_options=None): def _parse_open_file(file_obj, parse_options=None):
options = {}
csv_keys = ['delimiter', 'quotechar', 'lineterminator', 'doublequote', 'skipinitialspace'] csv_keys = ['delimiter', 'quotechar', 'lineterminator', 'doublequote', 'skipinitialspace']
csv_options = {k: parse_options.get(k) for k in csv_keys} options = {}
if six.PY2: dialect = _guess_dialect(file_obj)
csv_options = {k: v.encode('utf8') if isinstance(v, six.text_type) else v
for k, v in csv_options.items()}
table_set = messytables.CSVTableSet(file_obj,
delimiter=csv_options['delimiter'],
quotechar=csv_options['quotechar'],
lineterminator=csv_options['lineterminator'],
doublequote=csv_options['doublequote'],
skipinitialspace=csv_options['skipinitialspace'])
num_rows = parse_options.get('NUM_ROWS', 0) csv_options = {}
for key in csv_keys:
value = parse_options.get(key, getattr(dialect, key, None))
if value is not None:
csv_options[key] = value
# Messytable's encoding detection uses too small a sample, so we override it here. reader = csv.reader(file_obj, **csv_options)
sample = file_obj.read(100000)
table_set.encoding = chardet.detect(sample)['encoding'] rows = list(reader)
# In addition, always prefer UTF8 over ASCII. sample_len = 100
if table_set.encoding == 'ascii': sample_rows = rows[:sample_len]
table_set.encoding = 'utf8' data_offset, headers = import_utils.headers_guess(sample_rows)
export_list = [] # Make sure all header values are strings.
# A table set is a collection of tables: for i, header in enumerate(headers):
for row_set in table_set.tables: if not isinstance(header, six.string_types):
table_name = None headers[i] = six.text_type(header)
sample_rows = list(row_set.sample)
# Messytables doesn't guess whether headers are present, so we need to step in. log.info("Guessed data_offset as %s", data_offset)
data_offset, headers = import_utils.headers_guess(sample_rows) log.info("Guessed headers as: %s", headers)
# Make sure all header values are strings. have_guessed_headers = any(headers)
for i, header in enumerate(headers): include_col_names_as_headers = parse_options.get('include_col_names_as_headers',
if not isinstance(header, six.string_types): have_guessed_headers)
headers[i] = six.text_type(header)
if include_col_names_as_headers and not have_guessed_headers:
log.info("Guessed data_offset as %s", data_offset) # use first line as headers
log.info("Guessed headers as: %s", headers) data_offset, first_row = import_utils.find_first_non_empty_row(sample_rows)
headers = import_utils.expand_headers(first_row, data_offset, sample_rows)
have_guessed_headers = any(headers)
include_col_names_as_headers = parse_options.get('include_col_names_as_headers', elif not include_col_names_as_headers and have_guessed_headers:
have_guessed_headers) # move guessed headers to data
data_offset -= 1
if include_col_names_as_headers and not have_guessed_headers: headers = [''] * len(headers)
# use first line as headers
data_offset, first_row = import_utils.find_first_non_empty_row(sample_rows) rows = rows[data_offset:]
headers = import_utils.expand_headers(first_row, data_offset, sample_rows) num_rows = parse_options.get('NUM_ROWS', 0)
table_data_with_types = parse_data.get_table_data(rows, len(headers), num_rows)
elif not include_col_names_as_headers and have_guessed_headers:
# move guessed headers to data # Identify and remove empty columns, and populate separate metadata and data lists.
data_offset -= 1 column_metadata = []
headers = [''] * len(headers) table_data = []
for col_data, header in zip(table_data_with_types, headers):
row_set.register_processor(messytables.offset_processor(data_offset)) if not header and all(val == "" for val in col_data["data"]):
rows = [ continue # empty column
[cell.value for cell in row] data = col_data.pop("data")
for row in row_set col_data["id"] = header
] column_metadata.append(col_data)
table_data_with_types = parse_data.get_table_data(rows, len(headers), num_rows) table_data.append(data)
# Identify and remove empty columns, and populate separate metadata and data lists. if not table_data:
column_metadata = [] log.info("No data found. Aborting CSV import.")
table_data = [] # Don't add tables with no columns.
for col_data, header in zip(table_data_with_types, headers): return {}, []
if not header and all(val == "" for val in col_data["data"]):
continue # empty column guessed = reader.dialect
data = col_data.pop("data") quoting = parse_options.get('quoting')
col_data["id"] = header options = {"delimiter": parse_options.get('delimiter', guessed.delimiter),
column_metadata.append(col_data) "doublequote": parse_options.get('doublequote', guessed.doublequote),
table_data.append(data) "lineterminator": parse_options.get('lineterminator', guessed.lineterminator),
"quotechar": parse_options.get('quotechar', guessed.quotechar),
if not table_data: "skipinitialspace": parse_options.get('skipinitialspace', guessed.skipinitialspace),
# Don't add tables with no columns. "include_col_names_as_headers": include_col_names_as_headers,
continue "start_with_row": 1,
"NUM_ROWS": num_rows,
guessed = row_set._dialect "SCHEMA": SCHEMA
quoting = parse_options.get('quoting') }
options = {"delimiter": parse_options.get('delimiter', guessed.delimiter),
"doublequote": parse_options.get('doublequote', guessed.doublequote), log.info("Output table with %d columns", len(column_metadata))
"lineterminator": parse_options.get('lineterminator', guessed.lineterminator), for c in column_metadata:
"quotechar": parse_options.get('quotechar', guessed.quotechar), log.debug("Output column %s", c)
"skipinitialspace": parse_options.get('skipinitialspace', guessed.skipinitialspace),
"include_col_names_as_headers": include_col_names_as_headers, export_list = [{
"start_with_row": 1, "table_name": None,
"NUM_ROWS": num_rows, "column_metadata": column_metadata,
"SCHEMA": SCHEMA "table_data": table_data
} }]
log.info("Output table %r with %d columns", table_name, len(column_metadata))
for c in column_metadata:
log.debug("Output column %s", c)
export_list.append({
"table_name": table_name,
"column_metadata": column_metadata,
"table_data": table_data
})
return options, export_list return options, export_list

@ -2,7 +2,7 @@
import os import os
import textwrap import textwrap
import unittest import unittest
from six import BytesIO, text_type from six import StringIO, text_type
import csv import csv
from imports import import_csv from imports import import_csv
@ -12,12 +12,6 @@ def _get_fixture(filename):
return os.path.join(os.path.dirname(__file__), "fixtures", filename) return os.path.join(os.path.dirname(__file__), "fixtures", filename)
def bytes_io_from_str(string):
    """Return a BytesIO over `string`, encoding text to UTF-8 bytes first."""
    data = string.encode("utf8") if isinstance(string, text_type) else string
    return BytesIO(data)
class TestImportCSV(unittest.TestCase): class TestImportCSV(unittest.TestCase):
maxDiff = None maxDiff = None
@ -107,7 +101,7 @@ class TestImportCSV(unittest.TestCase):
u'']) u''])
def test_wrong_cols1(self): def test_wrong_cols1(self):
file_obj = bytes_io_from_str(textwrap.dedent( file_obj = StringIO(textwrap.dedent(
"""\ """\
name1, name2, name3 name1, name2, name3
a1,b1,c1 a1,b1,c1
@ -124,7 +118,7 @@ class TestImportCSV(unittest.TestCase):
self._check_col(parsed_file, 2, "name3", "Text", ["c1", "", ""]) self._check_col(parsed_file, 2, "name3", "Text", ["c1", "", ""])
def test_wrong_cols2(self): def test_wrong_cols2(self):
file_obj = bytes_io_from_str(textwrap.dedent( file_obj = StringIO(textwrap.dedent(
"""\ """\
name1 name1
a1,b1 a1,b1
@ -140,7 +134,7 @@ class TestImportCSV(unittest.TestCase):
self._check_col(parsed_file, 2, "", "Text", ["", "c2"]) self._check_col(parsed_file, 2, "", "Text", ["", "c2"])
def test_offset(self): def test_offset(self):
file_obj = bytes_io_from_str(textwrap.dedent( file_obj = StringIO(textwrap.dedent(
"""\ """\
,,,,,,, ,,,,,,,
name1,name2,name3 name1,name2,name3
@ -160,7 +154,7 @@ class TestImportCSV(unittest.TestCase):
self._check_col(parsed_file, 3, "", "Text", ["", "", "d4"]) self._check_col(parsed_file, 3, "", "Text", ["", "", "d4"])
def test_offset_no_header(self): def test_offset_no_header(self):
file_obj = bytes_io_from_str(textwrap.dedent( file_obj = StringIO(textwrap.dedent(
"""\ """\
4,b1,c1 4,b1,c1
4,b2,c2 4,b2,c2
@ -176,7 +170,7 @@ class TestImportCSV(unittest.TestCase):
self._check_col(parsed_file, 2, "", "Text", ["c1", "c2", "c3"]) self._check_col(parsed_file, 2, "", "Text", ["c1", "c2", "c3"])
def test_empty_headers(self): def test_empty_headers(self):
file_obj = bytes_io_from_str(textwrap.dedent( file_obj = StringIO(textwrap.dedent(
"""\ """\
,,-,- ,,-,-
b,a,a,a,a b,a,a,a,a
@ -194,7 +188,7 @@ class TestImportCSV(unittest.TestCase):
self._check_col(parsed_file, 3, "-", "Text", ["a", "a", "a"]) self._check_col(parsed_file, 3, "-", "Text", ["a", "a", "a"])
self._check_col(parsed_file, 4, "", "Text", ["a", "a", "a"]) self._check_col(parsed_file, 4, "", "Text", ["a", "a", "a"])
file_obj = bytes_io_from_str(textwrap.dedent( file_obj = StringIO(textwrap.dedent(
"""\ """\
-,-,-,-,-,- -,-,-,-,-,-
b,a,a,a,a b,a,a,a,a
@ -212,7 +206,7 @@ class TestImportCSV(unittest.TestCase):
self._check_col(parsed_file, 5, "-", "Text", ["", "", ""]) self._check_col(parsed_file, 5, "-", "Text", ["", "", ""])
def test_guess_missing_user_option(self): def test_guess_missing_user_option(self):
file_obj = bytes_io_from_str(textwrap.dedent( file_obj = StringIO(textwrap.dedent(
"""\ """\
name1,;name2,;name3 name1,;name2,;name3
a1,;b1,;c1 a1,;b1,;c1
@ -242,7 +236,7 @@ class TestImportCSV(unittest.TestCase):
self._check_col(parsed_file, 2, ";name3", "Text", [";c1", ";c2", ";c3"]) self._check_col(parsed_file, 2, ";name3", "Text", [";c1", ";c2", ";c3"])
def test_one_line_file_no_header(self): def test_one_line_file_no_header(self):
file_obj = bytes_io_from_str(textwrap.dedent( file_obj = StringIO(textwrap.dedent(
"""\ """\
2,name2,name3 2,name2,name3
""")) """))
@ -256,7 +250,7 @@ class TestImportCSV(unittest.TestCase):
self._check_col(parsed_file, 2, "", "Text", ["name3"]) self._check_col(parsed_file, 2, "", "Text", ["name3"])
def test_one_line_file_with_header(self): def test_one_line_file_with_header(self):
file_obj = bytes_io_from_str(textwrap.dedent( file_obj = StringIO(textwrap.dedent(
"""\ """\
name1,name2,name3 name1,name2,name3
""")) """))
@ -270,7 +264,7 @@ class TestImportCSV(unittest.TestCase):
self._check_col(parsed_file, 2, "name3", "Text", []) self._check_col(parsed_file, 2, "name3", "Text", [])
def test_empty_file(self): def test_empty_file(self):
file_obj = bytes_io_from_str(textwrap.dedent( file_obj = StringIO(textwrap.dedent(
"""\ """\
""")) """))
@ -278,7 +272,7 @@ class TestImportCSV(unittest.TestCase):
self.assertEqual(parsed_file, ({}, [])) self.assertEqual(parsed_file, ({}, []))
def test_option_num_rows(self): def test_option_num_rows(self):
file_obj = bytes_io_from_str(textwrap.dedent( file_obj = StringIO(textwrap.dedent(
"""\ """\
name1,name2,name3 name1,name2,name3
a1,b1,c1 a1,b1,c1
@ -310,7 +304,7 @@ class TestImportCSV(unittest.TestCase):
self._check_col(parsed_file, 2, "name3", "Text", ['c1', 'c2', 'c3']) self._check_col(parsed_file, 2, "name3", "Text", ['c1', 'c2', 'c3'])
def test_option_num_rows_no_header(self): def test_option_num_rows_no_header(self):
file_obj = bytes_io_from_str(textwrap.dedent( file_obj = StringIO(textwrap.dedent(
"""\ """\
,, ,,
,, ,,
@ -336,7 +330,7 @@ class TestImportCSV(unittest.TestCase):
self._check_col(parsed_file, 2, "", "Text", ['c1', 'c2']) self._check_col(parsed_file, 2, "", "Text", ['c1', 'c2'])
def test_option_use_col_name_as_header(self): def test_option_use_col_name_as_header(self):
file_obj = bytes_io_from_str(textwrap.dedent( file_obj = StringIO(textwrap.dedent(
"""\ """\
name1,name2,name3 name1,name2,name3
a1,1,c1 a1,1,c1
@ -361,7 +355,7 @@ class TestImportCSV(unittest.TestCase):
self._check_col(parsed_file, 2, "name3", "Text", ["c1", "c2", "c3"]) self._check_col(parsed_file, 2, "name3", "Text", ["c1", "c2", "c3"])
def test_option_use_col_name_as_header_no_headers(self): def test_option_use_col_name_as_header_no_headers(self):
file_obj = bytes_io_from_str(textwrap.dedent( file_obj = StringIO(textwrap.dedent(
"""\ """\
,,, ,,,
,,, ,,,

@ -14,6 +14,15 @@ if six.PY2:
log = logging.getLogger(__name__) log = logging.getLogger(__name__)
def empty(value):
    """ Stringify the value and check that it has a length. """
    if value is None:
        return True
    text = value if isinstance(value, six.string_types) else six.text_type(value)
    return len(text.strip()) == 0
# Get path to an imported file. # Get path to an imported file.
def get_path(file_source): def get_path(file_source):
importdir = os.environ.get('IMPORTDIR') or '/importdir' importdir = os.environ.get('IMPORTDIR') or '/importdir'
@ -39,7 +48,7 @@ def _is_header(header, data_rows):
""" """
# See if the row has any non-text values. # See if the row has any non-text values.
for cell in header: for cell in header:
if not isinstance(cell.value, six.string_types) or _is_numeric(cell.value): if not isinstance(cell, six.string_types) or _is_numeric(cell):
return False return False
@ -48,7 +57,7 @@ def _is_header(header, data_rows):
count_repeats = [0 for cell in header] count_repeats = [0 for cell in header]
for row in data_rows: for row in data_rows:
for cell, header_cell in zip(row, header): for cell, header_cell in zip(row, header):
if cell.value and cell.value == header_cell.value: if cell and cell == header_cell:
return False return False
return True return True
@ -59,7 +68,7 @@ def _count_nonempty(row):
""" """
count = 0 count = 0
for i, c in enumerate(row): for i, c in enumerate(row):
if not c.empty: if not empty(c):
count = i + 1 count = i + 1
return count return count
@ -83,7 +92,7 @@ def expand_headers(headers, data_offset, rows):
row_length = max(itertools.chain([len(headers)], row_length = max(itertools.chain([len(headers)],
(_count_nonempty(r) for r in itertools.islice(rows, data_offset, (_count_nonempty(r) for r in itertools.islice(rows, data_offset,
None)))) None))))
header_values = [h.value.strip() for h in headers] + [u''] * (row_length - len(headers)) header_values = [h.strip() for h in headers] + [u''] * (row_length - len(headers))
return header_values return header_values

@ -1,22 +0,0 @@
import unittest
import messytables
import os
class TestMessyTables(unittest.TestCase):
    """Sanity check that messytables can open and enumerate an XLSX fixture."""

    # Just a skeleton test
    def test_any_tableset(self):
        # Path to the xlsx fixture shipped next to this test module.
        path = os.path.join(os.path.dirname(__file__),
                            "fixtures", "nyc_schools_progress_report_ec_2013.xlsx")
        with open(path, "rb") as f:
            # any_tableset picks the parser from the file extension;
            # for .xlsx this should yield an XLSTableSet.
            table_set = messytables.any.any_tableset(f, extension=os.path.splitext(path)[1])
        self.assertIsInstance(table_set, messytables.XLSTableSet)
        # Sheet names must match the known contents of the fixture workbook.
        self.assertEqual([t.name for t in table_set.tables],
                         ['Summary', 'Student Progress', 'Student Performance', 'School Environment',
                          'Closing the Achievement Gap', 'Middle School Course Metrics',
                          'All Information', 'Peer Groups'])


if __name__ == "__main__":
    unittest.main()

@ -9,6 +9,7 @@ tests under 'arc unit' and under Jenkins.
./sandbox/nacl/bin/run python /grist/runtests.py [--xunit] ./sandbox/nacl/bin/run python /grist/runtests.py [--xunit]
""" """
import codecs import codecs
import logging
import os import os
import sys import sys
import unittest import unittest
@ -30,6 +31,9 @@ def main():
utf8_stdout = codecs.getwriter('utf8')(utf8_stdout) utf8_stdout = codecs.getwriter('utf8')(utf8_stdout)
test_runner = xmlrunner.XMLTestRunner(stream=utf8_stdout) test_runner = xmlrunner.XMLTestRunner(stream=utf8_stdout)
if "-v" in argv or "--verbose" in argv:
logging.basicConfig(level=logging.DEBUG)
if all(arg.startswith("-") for arg in argv[1:]): if all(arg.startswith("-") for arg in argv[1:]):
argv.insert(1, "discover") argv.insert(1, "discover")

Loading…
Cancel
Save