wip: remove dependency on messytables

This commit is contained in:
Yohan Boniface 2022-09-01 18:56:30 +02:00
parent 410cf61d94
commit 9bbf66e50e
4 changed files with 119 additions and 127 deletions

View File

@ -1,11 +1,11 @@
"""
Plugin for importing CSV files
"""
import os
import codecs
import csv
import logging
import chardet
import messytables
import six
from six.moves import zip
@ -90,108 +90,117 @@ def parse_file(file_path, parse_options=None):
"""
parse_options = parse_options or {}
with open(file_path, "rb") as f:
with codecs.open(file_path, "rb") as f:
sample = f.read(100000)
encoding = chardet.detect(sample)['encoding'] or "utf8"
# In addition, always prefer UTF8 over ASCII.
if encoding == 'ascii':
encoding = 'utf8'
log.info("Using encoding %s" % encoding)
with codecs.open(file_path, mode="r", encoding=encoding) as f:
parsing_options, export_list = _parse_open_file(f, parse_options=parse_options)
return parsing_options, export_list
def _guess_dialect(file_obj):
    """Guess the CSV dialect of an open text file.

    Reads up to 100000 characters from ``file_obj`` and hands them to
    ``csv.Sniffer``. Whatever the outcome, ``file_obj`` is rewound to
    position 0 before returning so that the caller can parse it from the start.

    Returns the sniffed dialect class, or ``csv.excel`` when the sniffer
    raises ``csv.Error``.
    """
    try:
        # Restrict the candidate delimiters so the sniffer cannot pick any
        # character outside of tab, comma, semicolon and pipe.
        dialect = csv.Sniffer().sniff(file_obj.read(100000), delimiters='\t,;|')
    except csv.Error:
        log.info("Cannot guess dialect using Excel as fallback.")
        return csv.excel
    finally:
        # Runs on both the success and the fallback path.
        file_obj.seek(0)
    log.info("Guessed dialect %s", dict(dialect.__dict__))
    return dialect
def _parse_open_file(file_obj, parse_options=None):
options = {}
csv_keys = ['delimiter', 'quotechar', 'lineterminator', 'doublequote', 'skipinitialspace']
csv_options = {k: parse_options.get(k) for k in csv_keys}
options = {}
dialect = _guess_dialect(file_obj)
csv_options = {k: parse_options.get(k, getattr(dialect, k, None)) for k in csv_keys}
if six.PY2:
csv_options = {k: v.encode('utf8') if isinstance(v, six.text_type) else v
for k, v in csv_options.items()}
table_set = messytables.CSVTableSet(file_obj,
delimiter=csv_options['delimiter'],
quotechar=csv_options['quotechar'],
lineterminator=csv_options['lineterminator'],
doublequote=csv_options['doublequote'],
skipinitialspace=csv_options['skipinitialspace'])
csv_options = {k: v for k, v in csv_options.items() if v is not None}
reader = csv.reader(file_obj, **csv_options)
num_rows = parse_options.get('NUM_ROWS', 0)
# Messytable's encoding detection uses too small a sample, so we override it here.
sample = file_obj.read(100000)
table_set.encoding = chardet.detect(sample)['encoding']
# In addition, always prefer UTF8 over ASCII.
if table_set.encoding == 'ascii':
table_set.encoding = 'utf8'
export_list = []
# A table set is a collection of tables:
for row_set in table_set.tables:
table_name = None
sample_rows = list(row_set.sample)
# Messytables doesn't guess whether headers are present, so we need to step in.
data_offset, headers = import_utils.headers_guess(sample_rows)
table_name = None
rows = list(reader)
sample_len = 100
sample_rows = rows[:sample_len]
data_offset, headers = import_utils.headers_guess(sample_rows)
# Make sure all header values are strings.
for i, header in enumerate(headers):
if not isinstance(header, six.string_types):
headers[i] = six.text_type(header)
# Make sure all header values are strings.
for i, header in enumerate(headers):
if not isinstance(header, six.string_types):
headers[i] = six.text_type(header)
log.info("Guessed data_offset as %s", data_offset)
log.info("Guessed headers as: %s", headers)
log.info("Guessed data_offset as %s", data_offset)
log.info("Guessed headers as: %s", headers)
have_guessed_headers = any(headers)
include_col_names_as_headers = parse_options.get('include_col_names_as_headers',
have_guessed_headers)
have_guessed_headers = any(headers)
include_col_names_as_headers = parse_options.get('include_col_names_as_headers',
have_guessed_headers)
if include_col_names_as_headers and not have_guessed_headers:
# use first line as headers
data_offset, first_row = import_utils.find_first_non_empty_row(sample_rows)
headers = import_utils.expand_headers(first_row, data_offset, sample_rows)
if include_col_names_as_headers and not have_guessed_headers:
# use first line as headers
data_offset, first_row = import_utils.find_first_non_empty_row(sample_rows)
headers = import_utils.expand_headers(first_row, data_offset, sample_rows)
elif not include_col_names_as_headers and have_guessed_headers:
# move guessed headers to data
data_offset -= 1
headers = [''] * len(headers)
elif not include_col_names_as_headers and have_guessed_headers:
# move guessed headers to data
data_offset -= 1
headers = [''] * len(headers)
row_set.register_processor(messytables.offset_processor(data_offset))
rows = [
[cell.value for cell in row]
for row in row_set
]
table_data_with_types = parse_data.get_table_data(rows, len(headers), num_rows)
rows = rows[data_offset:] # Use row.pop instead to make it faster ?
table_data_with_types = parse_data.get_table_data(rows, len(headers), num_rows)
# Identify and remove empty columns, and populate separate metadata and data lists.
column_metadata = []
table_data = []
for col_data, header in zip(table_data_with_types, headers):
if not header and all(val == "" for val in col_data["data"]):
continue # empty column
data = col_data.pop("data")
col_data["id"] = header
column_metadata.append(col_data)
table_data.append(data)
# Identify and remove empty columns, and populate separate metadata and data lists.
column_metadata = []
table_data = []
for col_data, header in zip(table_data_with_types, headers):
if not header and all(val == "" for val in col_data["data"]):
continue # empty column
data = col_data.pop("data")
col_data["id"] = header
column_metadata.append(col_data)
table_data.append(data)
if not table_data:
# Don't add tables with no columns.
continue
if not table_data:
log.info("No data found. Aborting CSV import.")
# Don't add tables with no columns.
return {}, []
guessed = row_set._dialect
quoting = parse_options.get('quoting')
options = {"delimiter": parse_options.get('delimiter', guessed.delimiter),
"doublequote": parse_options.get('doublequote', guessed.doublequote),
"lineterminator": parse_options.get('lineterminator', guessed.lineterminator),
"quotechar": parse_options.get('quotechar', guessed.quotechar),
"skipinitialspace": parse_options.get('skipinitialspace', guessed.skipinitialspace),
"include_col_names_as_headers": include_col_names_as_headers,
"start_with_row": 1,
"NUM_ROWS": num_rows,
"SCHEMA": SCHEMA
}
guessed = reader.dialect
quoting = parse_options.get('quoting')
options = {"delimiter": parse_options.get('delimiter', guessed.delimiter),
"doublequote": parse_options.get('doublequote', guessed.doublequote),
"lineterminator": parse_options.get('lineterminator', guessed.lineterminator),
"quotechar": parse_options.get('quotechar', guessed.quotechar),
"skipinitialspace": parse_options.get('skipinitialspace', guessed.skipinitialspace),
"include_col_names_as_headers": include_col_names_as_headers,
"start_with_row": 1,
"NUM_ROWS": num_rows,
"SCHEMA": SCHEMA
}
log.info("Output table %r with %d columns", table_name, len(column_metadata))
for c in column_metadata:
log.debug("Output column %s", c)
export_list.append({
"table_name": table_name,
"column_metadata": column_metadata,
"table_data": table_data
})
log.info("Output table %r with %d columns", table_name, len(column_metadata))
for c in column_metadata:
log.debug("Output column %s", c)
export_list = [{
"table_name": table_name,
"column_metadata": column_metadata,
"table_data": table_data
}]
return options, export_list

View File

@ -2,7 +2,7 @@
import os
import textwrap
import unittest
from six import BytesIO, text_type
from six import StringIO, text_type
import csv
from imports import import_csv
@ -12,12 +12,6 @@ def _get_fixture(filename):
return os.path.join(os.path.dirname(__file__), "fixtures", filename)
def bytes_io_from_str(string):
if isinstance(string, text_type):
string = string.encode("utf8")
return BytesIO(string)
class TestImportCSV(unittest.TestCase):
maxDiff = None
@ -107,7 +101,7 @@ class TestImportCSV(unittest.TestCase):
u''])
def test_wrong_cols1(self):
file_obj = bytes_io_from_str(textwrap.dedent(
file_obj = StringIO(textwrap.dedent(
"""\
name1, name2, name3
a1,b1,c1
@ -124,7 +118,7 @@ class TestImportCSV(unittest.TestCase):
self._check_col(parsed_file, 2, "name3", "Text", ["c1", "", ""])
def test_wrong_cols2(self):
file_obj = bytes_io_from_str(textwrap.dedent(
file_obj = StringIO(textwrap.dedent(
"""\
name1
a1,b1
@ -140,7 +134,7 @@ class TestImportCSV(unittest.TestCase):
self._check_col(parsed_file, 2, "", "Text", ["", "c2"])
def test_offset(self):
file_obj = bytes_io_from_str(textwrap.dedent(
file_obj = StringIO(textwrap.dedent(
"""\
,,,,,,,
name1,name2,name3
@ -160,7 +154,7 @@ class TestImportCSV(unittest.TestCase):
self._check_col(parsed_file, 3, "", "Text", ["", "", "d4"])
def test_offset_no_header(self):
file_obj = bytes_io_from_str(textwrap.dedent(
file_obj = StringIO(textwrap.dedent(
"""\
4,b1,c1
4,b2,c2
@ -176,7 +170,7 @@ class TestImportCSV(unittest.TestCase):
self._check_col(parsed_file, 2, "", "Text", ["c1", "c2", "c3"])
def test_empty_headers(self):
file_obj = bytes_io_from_str(textwrap.dedent(
file_obj = StringIO(textwrap.dedent(
"""\
,,-,-
b,a,a,a,a
@ -194,7 +188,7 @@ class TestImportCSV(unittest.TestCase):
self._check_col(parsed_file, 3, "-", "Text", ["a", "a", "a"])
self._check_col(parsed_file, 4, "", "Text", ["a", "a", "a"])
file_obj = bytes_io_from_str(textwrap.dedent(
file_obj = StringIO(textwrap.dedent(
"""\
-,-,-,-,-,-
b,a,a,a,a
@ -212,7 +206,7 @@ class TestImportCSV(unittest.TestCase):
self._check_col(parsed_file, 5, "-", "Text", ["", "", ""])
def test_guess_missing_user_option(self):
file_obj = bytes_io_from_str(textwrap.dedent(
file_obj = StringIO(textwrap.dedent(
"""\
name1,;name2,;name3
a1,;b1,;c1
@ -242,7 +236,7 @@ class TestImportCSV(unittest.TestCase):
self._check_col(parsed_file, 2, ";name3", "Text", [";c1", ";c2", ";c3"])
def test_one_line_file_no_header(self):
file_obj = bytes_io_from_str(textwrap.dedent(
file_obj = StringIO(textwrap.dedent(
"""\
2,name2,name3
"""))
@ -256,7 +250,7 @@ class TestImportCSV(unittest.TestCase):
self._check_col(parsed_file, 2, "", "Text", ["name3"])
def test_one_line_file_with_header(self):
file_obj = bytes_io_from_str(textwrap.dedent(
file_obj = StringIO(textwrap.dedent(
"""\
name1,name2,name3
"""))
@ -270,7 +264,7 @@ class TestImportCSV(unittest.TestCase):
self._check_col(parsed_file, 2, "name3", "Text", [])
def test_empty_file(self):
file_obj = bytes_io_from_str(textwrap.dedent(
file_obj = StringIO(textwrap.dedent(
"""\
"""))
@ -278,7 +272,7 @@ class TestImportCSV(unittest.TestCase):
self.assertEqual(parsed_file, ({}, []))
def test_option_num_rows(self):
file_obj = bytes_io_from_str(textwrap.dedent(
file_obj = StringIO(textwrap.dedent(
"""\
name1,name2,name3
a1,b1,c1
@ -310,7 +304,7 @@ class TestImportCSV(unittest.TestCase):
self._check_col(parsed_file, 2, "name3", "Text", ['c1', 'c2', 'c3'])
def test_option_num_rows_no_header(self):
file_obj = bytes_io_from_str(textwrap.dedent(
file_obj = StringIO(textwrap.dedent(
"""\
,,
,,
@ -336,7 +330,7 @@ class TestImportCSV(unittest.TestCase):
self._check_col(parsed_file, 2, "", "Text", ['c1', 'c2'])
def test_option_use_col_name_as_header(self):
file_obj = bytes_io_from_str(textwrap.dedent(
file_obj = StringIO(textwrap.dedent(
"""\
name1,name2,name3
a1,1,c1
@ -361,7 +355,7 @@ class TestImportCSV(unittest.TestCase):
self._check_col(parsed_file, 2, "name3", "Text", ["c1", "c2", "c3"])
def test_option_use_col_name_as_header_no_headers(self):
file_obj = bytes_io_from_str(textwrap.dedent(
file_obj = StringIO(textwrap.dedent(
"""\
,,,
,,,

View File

@ -14,6 +14,17 @@ if six.PY2:
log = logging.getLogger(__name__)
def empty(value):
    """Return True when ``value`` is None or stringifies to only whitespace.

    Values that are not strings are converted with ``six.text_type`` before
    the check, so for example ``0`` is not empty (it becomes ``"0"``).
    """
    if value is None:
        return True
    if not isinstance(value, six.string_types):
        value = six.text_type(value)
    # An empty string is falsy, so no explicit len() is needed.
    return not value.strip()
# Get path to an imported file.
def get_path(file_source):
importdir = os.environ.get('IMPORTDIR') or '/importdir'
@ -39,7 +50,7 @@ def _is_header(header, data_rows):
"""
# See if the row has any non-text values.
for cell in header:
if not isinstance(cell.value, six.string_types) or _is_numeric(cell.value):
if not isinstance(cell, six.string_types) or _is_numeric(cell):
return False
@ -48,7 +59,7 @@ def _is_header(header, data_rows):
count_repeats = [0 for cell in header]
for row in data_rows:
for cell, header_cell in zip(row, header):
if cell.value and cell.value == header_cell.value:
if cell and cell == header_cell:
return False
return True
@ -59,7 +70,7 @@ def _count_nonempty(row):
"""
count = 0
for i, c in enumerate(row):
if not c.empty:
if not empty(c):
count = i + 1
return count
@ -83,7 +94,7 @@ def expand_headers(headers, data_offset, rows):
row_length = max(itertools.chain([len(headers)],
(_count_nonempty(r) for r in itertools.islice(rows, data_offset,
None))))
header_values = [h.value.strip() for h in headers] + [u''] * (row_length - len(headers))
header_values = [h.strip() for h in headers] + [u''] * (row_length - len(headers))
return header_values

View File

@ -1,22 +0,0 @@
import unittest
import messytables
import os
class TestMessyTables(unittest.TestCase):
# Just a skeleton test
def test_any_tableset(self):
path = os.path.join(os.path.dirname(__file__),
"fixtures", "nyc_schools_progress_report_ec_2013.xlsx")
with open(path, "rb") as f:
table_set = messytables.any.any_tableset(f, extension=os.path.splitext(path)[1])
self.assertIsInstance(table_set, messytables.XLSTableSet)
self.assertEqual([t.name for t in table_set.tables],
['Summary', 'Student Progress', 'Student Performance', 'School Environment',
'Closing the Achievement Gap', 'Middle School Course Metrics',
'All Information', 'Peer Groups'])
if __name__ == "__main__":
unittest.main()