wip: remove dependency on messytables

This commit is contained in:
Yohan Boniface 2022-09-01 18:56:30 +02:00
parent 410cf61d94
commit 9bbf66e50e
4 changed files with 119 additions and 127 deletions

View File

@ -1,11 +1,11 @@
""" """
Plugin for importing CSV files Plugin for importing CSV files
""" """
import os import codecs
import csv
import logging import logging
import chardet import chardet
import messytables
import six import six
from six.moves import zip from six.moves import zip
@ -90,108 +90,117 @@ def parse_file(file_path, parse_options=None):
""" """
parse_options = parse_options or {} parse_options = parse_options or {}
with open(file_path, "rb") as f: with codecs.open(file_path, "rb") as f:
sample = f.read(100000)
encoding = chardet.detect(sample)['encoding'] or "utf8"
# In addition, always prefer UTF8 over ASCII.
if encoding == 'ascii':
encoding = 'utf8'
log.info("Using encoding %s" % encoding)
with codecs.open(file_path, mode="r", encoding=encoding) as f:
parsing_options, export_list = _parse_open_file(f, parse_options=parse_options) parsing_options, export_list = _parse_open_file(f, parse_options=parse_options)
return parsing_options, export_list return parsing_options, export_list
def _guess_dialect(file_obj, sample_size=100000):
    """
    Guess the CSV dialect of an open text file from a sample of its content.

    Reads up to `sample_size` characters from `file_obj` and hands them to
    `csv.Sniffer`. Whatever happens, `file_obj` is rewound to offset 0 before
    this function returns, so the caller can parse it from the beginning.

    Returns the sniffed dialect, or `csv.excel` when the sniffer raises
    `csv.Error`.
    """
    try:
        sample = file_obj.read(sample_size)
        try:
            # Restrict allowed delimiters so the sniffer cannot pick a character
            # outside this list.
            dialect = csv.Sniffer().sniff(sample, delimiters=['\t', ',', ';', '|'])
        except csv.Error:
            log.info("Cannot guess dialect using Excel as fallback.")
            return csv.excel
        # Lazy %-style arguments, matching the other log calls in this module.
        log.info("Guessed dialect %s", dict(dialect.__dict__))
        return dialect
    finally:
        # Always rewind, including when read() or the sniffer raised.
        file_obj.seek(0)
def _parse_open_file(file_obj, parse_options=None): def _parse_open_file(file_obj, parse_options=None):
options = {}
csv_keys = ['delimiter', 'quotechar', 'lineterminator', 'doublequote', 'skipinitialspace'] csv_keys = ['delimiter', 'quotechar', 'lineterminator', 'doublequote', 'skipinitialspace']
csv_options = {k: parse_options.get(k) for k in csv_keys} options = {}
dialect = _guess_dialect(file_obj)
csv_options = {k: parse_options.get(k, getattr(dialect, k, None)) for k in csv_keys}
if six.PY2: if six.PY2:
csv_options = {k: v.encode('utf8') if isinstance(v, six.text_type) else v csv_options = {k: v.encode('utf8') if isinstance(v, six.text_type) else v
for k, v in csv_options.items()} for k, v in csv_options.items()}
table_set = messytables.CSVTableSet(file_obj, csv_options = {k: v for k, v in csv_options.items() if v is not None}
delimiter=csv_options['delimiter'], reader = csv.reader(file_obj, **csv_options)
quotechar=csv_options['quotechar'],
lineterminator=csv_options['lineterminator'],
doublequote=csv_options['doublequote'],
skipinitialspace=csv_options['skipinitialspace'])
num_rows = parse_options.get('NUM_ROWS', 0) num_rows = parse_options.get('NUM_ROWS', 0)
# Messytable's encoding detection uses too small a sample, so we override it here.
sample = file_obj.read(100000)
table_set.encoding = chardet.detect(sample)['encoding']
# In addition, always prefer UTF8 over ASCII.
if table_set.encoding == 'ascii':
table_set.encoding = 'utf8'
export_list = [] table_name = None
# A table set is a collection of tables: rows = list(reader)
for row_set in table_set.tables: sample_len = 100
table_name = None sample_rows = rows[:sample_len]
sample_rows = list(row_set.sample) data_offset, headers = import_utils.headers_guess(sample_rows)
# Messytables doesn't guess whether headers are present, so we need to step in.
data_offset, headers = import_utils.headers_guess(sample_rows)
# Make sure all header values are strings. # Make sure all header values are strings.
for i, header in enumerate(headers): for i, header in enumerate(headers):
if not isinstance(header, six.string_types): if not isinstance(header, six.string_types):
headers[i] = six.text_type(header) headers[i] = six.text_type(header)
log.info("Guessed data_offset as %s", data_offset) log.info("Guessed data_offset as %s", data_offset)
log.info("Guessed headers as: %s", headers) log.info("Guessed headers as: %s", headers)
have_guessed_headers = any(headers) have_guessed_headers = any(headers)
include_col_names_as_headers = parse_options.get('include_col_names_as_headers', include_col_names_as_headers = parse_options.get('include_col_names_as_headers',
have_guessed_headers) have_guessed_headers)
if include_col_names_as_headers and not have_guessed_headers: if include_col_names_as_headers and not have_guessed_headers:
# use first line as headers # use first line as headers
data_offset, first_row = import_utils.find_first_non_empty_row(sample_rows) data_offset, first_row = import_utils.find_first_non_empty_row(sample_rows)
headers = import_utils.expand_headers(first_row, data_offset, sample_rows) headers = import_utils.expand_headers(first_row, data_offset, sample_rows)
elif not include_col_names_as_headers and have_guessed_headers: elif not include_col_names_as_headers and have_guessed_headers:
# move guessed headers to data # move guessed headers to data
data_offset -= 1 data_offset -= 1
headers = [''] * len(headers) headers = [''] * len(headers)
row_set.register_processor(messytables.offset_processor(data_offset)) rows = rows[data_offset:] # Use row.pop instead to make it faster ?
rows = [ table_data_with_types = parse_data.get_table_data(rows, len(headers), num_rows)
[cell.value for cell in row]
for row in row_set
]
table_data_with_types = parse_data.get_table_data(rows, len(headers), num_rows)
# Identify and remove empty columns, and populate separate metadata and data lists. # Identify and remove empty columns, and populate separate metadata and data lists.
column_metadata = [] column_metadata = []
table_data = [] table_data = []
for col_data, header in zip(table_data_with_types, headers): for col_data, header in zip(table_data_with_types, headers):
if not header and all(val == "" for val in col_data["data"]): if not header and all(val == "" for val in col_data["data"]):
continue # empty column continue # empty column
data = col_data.pop("data") data = col_data.pop("data")
col_data["id"] = header col_data["id"] = header
column_metadata.append(col_data) column_metadata.append(col_data)
table_data.append(data) table_data.append(data)
if not table_data: if not table_data:
# Don't add tables with no columns. log.info("No data found. Aborting CSV import.")
continue # Don't add tables with no columns.
return {}, []
guessed = row_set._dialect guessed = reader.dialect
quoting = parse_options.get('quoting') quoting = parse_options.get('quoting')
options = {"delimiter": parse_options.get('delimiter', guessed.delimiter), options = {"delimiter": parse_options.get('delimiter', guessed.delimiter),
"doublequote": parse_options.get('doublequote', guessed.doublequote), "doublequote": parse_options.get('doublequote', guessed.doublequote),
"lineterminator": parse_options.get('lineterminator', guessed.lineterminator), "lineterminator": parse_options.get('lineterminator', guessed.lineterminator),
"quotechar": parse_options.get('quotechar', guessed.quotechar), "quotechar": parse_options.get('quotechar', guessed.quotechar),
"skipinitialspace": parse_options.get('skipinitialspace', guessed.skipinitialspace), "skipinitialspace": parse_options.get('skipinitialspace', guessed.skipinitialspace),
"include_col_names_as_headers": include_col_names_as_headers, "include_col_names_as_headers": include_col_names_as_headers,
"start_with_row": 1, "start_with_row": 1,
"NUM_ROWS": num_rows, "NUM_ROWS": num_rows,
"SCHEMA": SCHEMA "SCHEMA": SCHEMA
} }
log.info("Output table %r with %d columns", table_name, len(column_metadata)) log.info("Output table %r with %d columns", table_name, len(column_metadata))
for c in column_metadata: for c in column_metadata:
log.debug("Output column %s", c) log.debug("Output column %s", c)
export_list.append({
"table_name": table_name, export_list = [{
"column_metadata": column_metadata, "table_name": table_name,
"table_data": table_data "column_metadata": column_metadata,
}) "table_data": table_data
}]
return options, export_list return options, export_list

View File

@ -2,7 +2,7 @@
import os import os
import textwrap import textwrap
import unittest import unittest
from six import BytesIO, text_type from six import StringIO, text_type
import csv import csv
from imports import import_csv from imports import import_csv
@ -12,12 +12,6 @@ def _get_fixture(filename):
return os.path.join(os.path.dirname(__file__), "fixtures", filename) return os.path.join(os.path.dirname(__file__), "fixtures", filename)
def bytes_io_from_str(string):
    """Return a BytesIO over `string`, UTF-8 encoding it first when it is text."""
    payload = string.encode("utf8") if isinstance(string, text_type) else string
    return BytesIO(payload)
class TestImportCSV(unittest.TestCase): class TestImportCSV(unittest.TestCase):
maxDiff = None maxDiff = None
@ -107,7 +101,7 @@ class TestImportCSV(unittest.TestCase):
u'']) u''])
def test_wrong_cols1(self): def test_wrong_cols1(self):
file_obj = bytes_io_from_str(textwrap.dedent( file_obj = StringIO(textwrap.dedent(
"""\ """\
name1, name2, name3 name1, name2, name3
a1,b1,c1 a1,b1,c1
@ -124,7 +118,7 @@ class TestImportCSV(unittest.TestCase):
self._check_col(parsed_file, 2, "name3", "Text", ["c1", "", ""]) self._check_col(parsed_file, 2, "name3", "Text", ["c1", "", ""])
def test_wrong_cols2(self): def test_wrong_cols2(self):
file_obj = bytes_io_from_str(textwrap.dedent( file_obj = StringIO(textwrap.dedent(
"""\ """\
name1 name1
a1,b1 a1,b1
@ -140,7 +134,7 @@ class TestImportCSV(unittest.TestCase):
self._check_col(parsed_file, 2, "", "Text", ["", "c2"]) self._check_col(parsed_file, 2, "", "Text", ["", "c2"])
def test_offset(self): def test_offset(self):
file_obj = bytes_io_from_str(textwrap.dedent( file_obj = StringIO(textwrap.dedent(
"""\ """\
,,,,,,, ,,,,,,,
name1,name2,name3 name1,name2,name3
@ -160,7 +154,7 @@ class TestImportCSV(unittest.TestCase):
self._check_col(parsed_file, 3, "", "Text", ["", "", "d4"]) self._check_col(parsed_file, 3, "", "Text", ["", "", "d4"])
def test_offset_no_header(self): def test_offset_no_header(self):
file_obj = bytes_io_from_str(textwrap.dedent( file_obj = StringIO(textwrap.dedent(
"""\ """\
4,b1,c1 4,b1,c1
4,b2,c2 4,b2,c2
@ -176,7 +170,7 @@ class TestImportCSV(unittest.TestCase):
self._check_col(parsed_file, 2, "", "Text", ["c1", "c2", "c3"]) self._check_col(parsed_file, 2, "", "Text", ["c1", "c2", "c3"])
def test_empty_headers(self): def test_empty_headers(self):
file_obj = bytes_io_from_str(textwrap.dedent( file_obj = StringIO(textwrap.dedent(
"""\ """\
,,-,- ,,-,-
b,a,a,a,a b,a,a,a,a
@ -194,7 +188,7 @@ class TestImportCSV(unittest.TestCase):
self._check_col(parsed_file, 3, "-", "Text", ["a", "a", "a"]) self._check_col(parsed_file, 3, "-", "Text", ["a", "a", "a"])
self._check_col(parsed_file, 4, "", "Text", ["a", "a", "a"]) self._check_col(parsed_file, 4, "", "Text", ["a", "a", "a"])
file_obj = bytes_io_from_str(textwrap.dedent( file_obj = StringIO(textwrap.dedent(
"""\ """\
-,-,-,-,-,- -,-,-,-,-,-
b,a,a,a,a b,a,a,a,a
@ -212,7 +206,7 @@ class TestImportCSV(unittest.TestCase):
self._check_col(parsed_file, 5, "-", "Text", ["", "", ""]) self._check_col(parsed_file, 5, "-", "Text", ["", "", ""])
def test_guess_missing_user_option(self): def test_guess_missing_user_option(self):
file_obj = bytes_io_from_str(textwrap.dedent( file_obj = StringIO(textwrap.dedent(
"""\ """\
name1,;name2,;name3 name1,;name2,;name3
a1,;b1,;c1 a1,;b1,;c1
@ -242,7 +236,7 @@ class TestImportCSV(unittest.TestCase):
self._check_col(parsed_file, 2, ";name3", "Text", [";c1", ";c2", ";c3"]) self._check_col(parsed_file, 2, ";name3", "Text", [";c1", ";c2", ";c3"])
def test_one_line_file_no_header(self): def test_one_line_file_no_header(self):
file_obj = bytes_io_from_str(textwrap.dedent( file_obj = StringIO(textwrap.dedent(
"""\ """\
2,name2,name3 2,name2,name3
""")) """))
@ -256,7 +250,7 @@ class TestImportCSV(unittest.TestCase):
self._check_col(parsed_file, 2, "", "Text", ["name3"]) self._check_col(parsed_file, 2, "", "Text", ["name3"])
def test_one_line_file_with_header(self): def test_one_line_file_with_header(self):
file_obj = bytes_io_from_str(textwrap.dedent( file_obj = StringIO(textwrap.dedent(
"""\ """\
name1,name2,name3 name1,name2,name3
""")) """))
@ -270,7 +264,7 @@ class TestImportCSV(unittest.TestCase):
self._check_col(parsed_file, 2, "name3", "Text", []) self._check_col(parsed_file, 2, "name3", "Text", [])
def test_empty_file(self): def test_empty_file(self):
file_obj = bytes_io_from_str(textwrap.dedent( file_obj = StringIO(textwrap.dedent(
"""\ """\
""")) """))
@ -278,7 +272,7 @@ class TestImportCSV(unittest.TestCase):
self.assertEqual(parsed_file, ({}, [])) self.assertEqual(parsed_file, ({}, []))
def test_option_num_rows(self): def test_option_num_rows(self):
file_obj = bytes_io_from_str(textwrap.dedent( file_obj = StringIO(textwrap.dedent(
"""\ """\
name1,name2,name3 name1,name2,name3
a1,b1,c1 a1,b1,c1
@ -310,7 +304,7 @@ class TestImportCSV(unittest.TestCase):
self._check_col(parsed_file, 2, "name3", "Text", ['c1', 'c2', 'c3']) self._check_col(parsed_file, 2, "name3", "Text", ['c1', 'c2', 'c3'])
def test_option_num_rows_no_header(self): def test_option_num_rows_no_header(self):
file_obj = bytes_io_from_str(textwrap.dedent( file_obj = StringIO(textwrap.dedent(
"""\ """\
,, ,,
,, ,,
@ -336,7 +330,7 @@ class TestImportCSV(unittest.TestCase):
self._check_col(parsed_file, 2, "", "Text", ['c1', 'c2']) self._check_col(parsed_file, 2, "", "Text", ['c1', 'c2'])
def test_option_use_col_name_as_header(self): def test_option_use_col_name_as_header(self):
file_obj = bytes_io_from_str(textwrap.dedent( file_obj = StringIO(textwrap.dedent(
"""\ """\
name1,name2,name3 name1,name2,name3
a1,1,c1 a1,1,c1
@ -361,7 +355,7 @@ class TestImportCSV(unittest.TestCase):
self._check_col(parsed_file, 2, "name3", "Text", ["c1", "c2", "c3"]) self._check_col(parsed_file, 2, "name3", "Text", ["c1", "c2", "c3"])
def test_option_use_col_name_as_header_no_headers(self): def test_option_use_col_name_as_header_no_headers(self):
file_obj = bytes_io_from_str(textwrap.dedent( file_obj = StringIO(textwrap.dedent(
"""\ """\
,,, ,,,
,,, ,,,

View File

@ -14,6 +14,17 @@ if six.PY2:
log = logging.getLogger(__name__) log = logging.getLogger(__name__)
def empty(value):
    """
    Return True when `value` is None or is blank once stringified and stripped
    of surrounding whitespace; return False otherwise.
    """
    if value is None:
        return True
    text = value if isinstance(value, six.string_types) else six.text_type(value)
    return not text.strip()
# Get path to an imported file. # Get path to an imported file.
def get_path(file_source): def get_path(file_source):
importdir = os.environ.get('IMPORTDIR') or '/importdir' importdir = os.environ.get('IMPORTDIR') or '/importdir'
@ -39,7 +50,7 @@ def _is_header(header, data_rows):
""" """
# See if the row has any non-text values. # See if the row has any non-text values.
for cell in header: for cell in header:
if not isinstance(cell.value, six.string_types) or _is_numeric(cell.value): if not isinstance(cell, six.string_types) or _is_numeric(cell):
return False return False
@ -48,7 +59,7 @@ def _is_header(header, data_rows):
count_repeats = [0 for cell in header] count_repeats = [0 for cell in header]
for row in data_rows: for row in data_rows:
for cell, header_cell in zip(row, header): for cell, header_cell in zip(row, header):
if cell.value and cell.value == header_cell.value: if cell and cell == header_cell:
return False return False
return True return True
@ -59,7 +70,7 @@ def _count_nonempty(row):
""" """
count = 0 count = 0
for i, c in enumerate(row): for i, c in enumerate(row):
if not c.empty: if not empty(c):
count = i + 1 count = i + 1
return count return count
@ -83,7 +94,7 @@ def expand_headers(headers, data_offset, rows):
row_length = max(itertools.chain([len(headers)], row_length = max(itertools.chain([len(headers)],
(_count_nonempty(r) for r in itertools.islice(rows, data_offset, (_count_nonempty(r) for r in itertools.islice(rows, data_offset,
None)))) None))))
header_values = [h.value.strip() for h in headers] + [u''] * (row_length - len(headers)) header_values = [h.strip() for h in headers] + [u''] * (row_length - len(headers))
return header_values return header_values

View File

@ -1,22 +0,0 @@
import unittest
import messytables
import os
class TestMessyTables(unittest.TestCase):
    """Skeleton test checking that messytables can open the XLSX fixture."""

    # Just a skeleton test
    def test_any_tableset(self):
        """Open the fixture workbook and check the table-set type and table names."""
        fixture_path = os.path.join(
            os.path.dirname(__file__),
            "fixtures",
            "nyc_schools_progress_report_ec_2013.xlsx",
        )
        extension = os.path.splitext(fixture_path)[1]
        expected_names = [
            'Summary', 'Student Progress', 'Student Performance', 'School Environment',
            'Closing the Achievement Gap', 'Middle School Course Metrics',
            'All Information', 'Peer Groups',
        ]
        with open(fixture_path, "rb") as fixture:
            table_set = messytables.any.any_tableset(fixture, extension=extension)
            self.assertIsInstance(table_set, messytables.XLSTableSet)
            actual_names = [table.name for table in table_set.tables]
            self.assertEqual(actual_names, expected_names)
if __name__ == "__main__":
unittest.main()