Mirror of https://github.com/gristlabs/grist-core.git (synced 2024-10-27 20:44:07 +00:00)

Merge pull request #265 from yohanboniface/bye-messytable
Draft: remove dependency on messytables
Commit b2b0950c9c

@@ -1,11 +1,11 @@
 """
 Plugin for importing CSV files
 """
-import os
+import codecs
+import csv
 import logging

 import chardet
-import messytables
 import six
 from six.moves import zip

@@ -90,41 +90,50 @@ def parse_file(file_path, parse_options=None):
   """
   parse_options = parse_options or {}

-  with open(file_path, "rb") as f:
+  with codecs.open(file_path, "rb") as f:
+    sample = f.read(100000)
+    encoding = chardet.detect(sample)['encoding'] or "utf8"
+    # In addition, always prefer UTF8 over ASCII.
+    if encoding == 'ascii':
+      encoding = 'utf8'
+    log.info("Using encoding %s", encoding)
+
+  with codecs.open(file_path, mode="r", encoding=encoding) as f:
     parsing_options, export_list = _parse_open_file(f, parse_options=parse_options)
     return parsing_options, export_list


+def _guess_dialect(file_obj):
+  try:
+    # Restrict the allowed delimiters to prevent guessing any character outside this list.
+    dialect = csv.Sniffer().sniff(file_obj.read(100000), delimiters=['\t', ',', ';', '|'])
+    log.info("Guessed dialect %s", dict(dialect.__dict__))
+    # Mimic messytables' defaults for now.
+    dialect.lineterminator = "\n"
+    dialect.doublequote = True
+    return dialect
+  except csv.Error:
+    log.info("Cannot guess dialect; using Excel as a fallback.")
+    return csv.excel
+  finally:
+    file_obj.seek(0)
+
+
 def _parse_open_file(file_obj, parse_options=None):
-  options = {}
   csv_keys = ['delimiter', 'quotechar', 'lineterminator', 'doublequote', 'skipinitialspace']
-  csv_options = {k: parse_options.get(k) for k in csv_keys}
-  if six.PY2:
-    csv_options = {k: v.encode('utf8') if isinstance(v, six.text_type) else v
-                   for k, v in csv_options.items()}
-
-  table_set = messytables.CSVTableSet(file_obj,
-                                      delimiter=csv_options['delimiter'],
-                                      quotechar=csv_options['quotechar'],
-                                      lineterminator=csv_options['lineterminator'],
-                                      doublequote=csv_options['doublequote'],
-                                      skipinitialspace=csv_options['skipinitialspace'])
-
-  num_rows = parse_options.get('NUM_ROWS', 0)
-
-  # Messytables' encoding detection uses too small a sample, so we override it here.
-  sample = file_obj.read(100000)
-  table_set.encoding = chardet.detect(sample)['encoding']
-  # In addition, always prefer UTF8 over ASCII.
-  if table_set.encoding == 'ascii':
-    table_set.encoding = 'utf8'
-
-  export_list = []
-  # A table set is a collection of tables:
-  for row_set in table_set.tables:
-    table_name = None
-    sample_rows = list(row_set.sample)
-    # Messytables doesn't guess whether headers are present, so we need to step in.
-    data_offset, headers = import_utils.headers_guess(sample_rows)
+  options = {}
+  dialect = _guess_dialect(file_obj)
+  csv_options = {}
+  for key in csv_keys:
+    value = parse_options.get(key, getattr(dialect, key, None))
+    if value is not None:
+      csv_options[key] = value
+
+  reader = csv.reader(file_obj, **csv_options)
+
+  rows = list(reader)
+  sample_len = 100
+  sample_rows = rows[:sample_len]
+
+  data_offset, headers = import_utils.headers_guess(sample_rows)

   # Make sure all header values are strings.
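
The two passes above (a binary read to sniff the encoding, then a text-mode reopen for parsing) plus the sniffer-based dialect guess can be exercised standalone. A minimal sketch under the same assumptions as the diff (chardet installed, a local file path; sniff_csv is a hypothetical helper for illustration, not part of the plugin):

import codecs
import csv

import chardet

def sniff_csv(path):
  # Pass 1: read raw bytes and let chardet guess the encoding.
  with codecs.open(path, "rb") as f:
    sample = f.read(100000)
  encoding = chardet.detect(sample)['encoding'] or "utf8"
  # UTF-8 is a superset of ASCII, so always prefer it.
  if encoding == 'ascii':
    encoding = 'utf8'

  # Pass 2: reopen as text and let csv.Sniffer guess the dialect,
  # restricted to a whitelist of plausible delimiters.
  with codecs.open(path, mode="r", encoding=encoding) as f:
    try:
      dialect = csv.Sniffer().sniff(f.read(100000), delimiters=['\t', ',', ';', '|'])
    except csv.Error:
      dialect = csv.excel  # same fallback as _guess_dialect above
    f.seek(0)
    return encoding, list(csv.reader(f, dialect))
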
@@ -149,11 +158,8 @@ def _parse_open_file(file_obj, parse_options=None):
     data_offset -= 1
     headers = [''] * len(headers)

-    row_set.register_processor(messytables.offset_processor(data_offset))
-    rows = [
-      [cell.value for cell in row]
-      for row in row_set
-    ]
+  rows = rows[data_offset:]
+  num_rows = parse_options.get('NUM_ROWS', 0)
   table_data_with_types = parse_data.get_table_data(rows, len(headers), num_rows)

   # Identify and remove empty columns, and populate separate metadata and data lists.
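
Because the whole file is now held in memory as a list of rows, messytables' offset_processor reduces to a plain list slice. A toy illustration (the values are hand-picked; headers_guess is the real helper from import_utils, whose result is assumed here):

rows = [["name", "age"], ["alice", "3"], ["bob", "4"]]

# Assume headers_guess identified row 0 as the header, so data starts at offset 1.
data_offset, headers = 1, ["name", "age"]

rows = rows[data_offset:]  # replaces messytables.offset_processor(data_offset)
assert rows == [["alice", "3"], ["bob", "4"]]
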
@@ -168,10 +174,11 @@ def _parse_open_file(file_obj, parse_options=None):
     table_data.append(data)

   if not table_data:
+    log.info("No data found. Aborting CSV import.")
     # Don't add tables with no columns.
-    continue
+    return {}, []

-  guessed = row_set._dialect
+  guessed = reader.dialect
   quoting = parse_options.get('quoting')
   options = {"delimiter": parse_options.get('delimiter', guessed.delimiter),
              "doublequote": parse_options.get('doublequote', guessed.doublequote),
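
User-supplied parse options still take precedence over the sniffed dialect, both when building csv_options and when reporting the options back. A condensed, self-contained illustration of that precedence (names follow the diff; the values are illustrative):

import csv

dialect = csv.excel                 # stand-in for the sniffed dialect
parse_options = {"delimiter": ";"}  # user override

csv_options = {}
for key in ['delimiter', 'quotechar', 'doublequote']:
  value = parse_options.get(key, getattr(dialect, key, None))
  if value is not None:
    csv_options[key] = value

assert csv_options["delimiter"] == ";"  # the user option wins
assert csv_options["quotechar"] == '"'  # falls back to the guessed dialect
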
@@ -184,14 +191,15 @@ def _parse_open_file(file_obj, parse_options=None):
              "SCHEMA": SCHEMA
             }

-  log.info("Output table %r with %d columns", table_name, len(column_metadata))
+  log.info("Output table with %d columns", len(column_metadata))
   for c in column_metadata:
     log.debug("Output column %s", c)
-  export_list.append({
-    "table_name": table_name,
+
+  export_list = [{
+    "table_name": None,
     "column_metadata": column_metadata,
     "table_data": table_data
-  })
+  }]

   return options, export_list

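
For a simple two-column file, the plugin now always returns a single-table export list. A hedged sketch of the return shape (the field values and the column_metadata keys are illustrative, not taken from a real run; the SCHEMA entry in options is omitted):

options = {
  "delimiter": ",",        # guessed by _guess_dialect unless overridden
  "doublequote": True,
  "quotechar": '"',
  "skipinitialspace": False,
}
export_list = [{
  "table_name": None,  # a CSV holds a single, unnamed table
  "column_metadata": [{"id": "name", "type": "Text"}, {"id": "age", "type": "Int"}],
  "table_data": [["alice", "bob"], [3, 4]],  # one list of cell values per column
}]
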
@@ -2,7 +2,7 @@
 import os
 import textwrap
 import unittest
-from six import BytesIO, text_type
+from six import StringIO, text_type
 import csv

 from imports import import_csv
@@ -12,12 +12,6 @@ def _get_fixture(filename):
   return os.path.join(os.path.dirname(__file__), "fixtures", filename)


-def bytes_io_from_str(string):
-  if isinstance(string, text_type):
-    string = string.encode("utf8")
-  return BytesIO(string)
-
-
 class TestImportCSV(unittest.TestCase):

   maxDiff = None
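
Under Python 3 the csv module consumes text rather than bytes, so the tests can feed StringIO directly; the old text_type check plus BytesIO round-trip is no longer needed. A condensed version of the pattern every updated test below uses:

from six import StringIO

from imports import import_csv

file_obj = StringIO("name1,name2\na1,b1\n")
options, export_list = import_csv._parse_open_file(file_obj, parse_options={})
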
@@ -107,7 +101,7 @@ class TestImportCSV(unittest.TestCase):
                      u''])

   def test_wrong_cols1(self):
-    file_obj = bytes_io_from_str(textwrap.dedent(
+    file_obj = StringIO(textwrap.dedent(
       """\
       name1, name2, name3
       a1,b1,c1
@@ -124,7 +118,7 @@ class TestImportCSV(unittest.TestCase):
     self._check_col(parsed_file, 2, "name3", "Text", ["c1", "", ""])

   def test_wrong_cols2(self):
-    file_obj = bytes_io_from_str(textwrap.dedent(
+    file_obj = StringIO(textwrap.dedent(
       """\
       name1
       a1,b1
@@ -140,7 +134,7 @@ class TestImportCSV(unittest.TestCase):
     self._check_col(parsed_file, 2, "", "Text", ["", "c2"])

   def test_offset(self):
-    file_obj = bytes_io_from_str(textwrap.dedent(
+    file_obj = StringIO(textwrap.dedent(
       """\
       ,,,,,,,
       name1,name2,name3
@@ -160,7 +154,7 @@ class TestImportCSV(unittest.TestCase):
     self._check_col(parsed_file, 3, "", "Text", ["", "", "d4"])

   def test_offset_no_header(self):
-    file_obj = bytes_io_from_str(textwrap.dedent(
+    file_obj = StringIO(textwrap.dedent(
       """\
       4,b1,c1
       4,b2,c2
@@ -176,7 +170,7 @@ class TestImportCSV(unittest.TestCase):
     self._check_col(parsed_file, 2, "", "Text", ["c1", "c2", "c3"])

   def test_empty_headers(self):
-    file_obj = bytes_io_from_str(textwrap.dedent(
+    file_obj = StringIO(textwrap.dedent(
       """\
       ,,-,-
       b,a,a,a,a
@@ -194,7 +188,7 @@ class TestImportCSV(unittest.TestCase):
     self._check_col(parsed_file, 3, "-", "Text", ["a", "a", "a"])
     self._check_col(parsed_file, 4, "", "Text", ["a", "a", "a"])

-    file_obj = bytes_io_from_str(textwrap.dedent(
+    file_obj = StringIO(textwrap.dedent(
       """\
       -,-,-,-,-,-
       b,a,a,a,a
@@ -212,7 +206,7 @@ class TestImportCSV(unittest.TestCase):
     self._check_col(parsed_file, 5, "-", "Text", ["", "", ""])

   def test_guess_missing_user_option(self):
-    file_obj = bytes_io_from_str(textwrap.dedent(
+    file_obj = StringIO(textwrap.dedent(
       """\
       name1,;name2,;name3
       a1,;b1,;c1
@@ -242,7 +236,7 @@ class TestImportCSV(unittest.TestCase):
     self._check_col(parsed_file, 2, ";name3", "Text", [";c1", ";c2", ";c3"])

   def test_one_line_file_no_header(self):
-    file_obj = bytes_io_from_str(textwrap.dedent(
+    file_obj = StringIO(textwrap.dedent(
       """\
       2,name2,name3
       """))
@@ -256,7 +250,7 @@ class TestImportCSV(unittest.TestCase):
     self._check_col(parsed_file, 2, "", "Text", ["name3"])

   def test_one_line_file_with_header(self):
-    file_obj = bytes_io_from_str(textwrap.dedent(
+    file_obj = StringIO(textwrap.dedent(
       """\
       name1,name2,name3
       """))
@@ -270,7 +264,7 @@ class TestImportCSV(unittest.TestCase):
     self._check_col(parsed_file, 2, "name3", "Text", [])

   def test_empty_file(self):
-    file_obj = bytes_io_from_str(textwrap.dedent(
+    file_obj = StringIO(textwrap.dedent(
       """\
       """))

@@ -278,7 +272,7 @@ class TestImportCSV(unittest.TestCase):
     self.assertEqual(parsed_file, ({}, []))

   def test_option_num_rows(self):
-    file_obj = bytes_io_from_str(textwrap.dedent(
+    file_obj = StringIO(textwrap.dedent(
       """\
       name1,name2,name3
       a1,b1,c1
@@ -310,7 +304,7 @@ class TestImportCSV(unittest.TestCase):
     self._check_col(parsed_file, 2, "name3", "Text", ['c1', 'c2', 'c3'])

   def test_option_num_rows_no_header(self):
-    file_obj = bytes_io_from_str(textwrap.dedent(
+    file_obj = StringIO(textwrap.dedent(
       """\
       ,,
       ,,
@@ -336,7 +330,7 @@ class TestImportCSV(unittest.TestCase):
     self._check_col(parsed_file, 2, "", "Text", ['c1', 'c2'])

   def test_option_use_col_name_as_header(self):
-    file_obj = bytes_io_from_str(textwrap.dedent(
+    file_obj = StringIO(textwrap.dedent(
       """\
       name1,name2,name3
       a1,1,c1
@@ -361,7 +355,7 @@ class TestImportCSV(unittest.TestCase):
     self._check_col(parsed_file, 2, "name3", "Text", ["c1", "c2", "c3"])

   def test_option_use_col_name_as_header_no_headers(self):
-    file_obj = bytes_io_from_str(textwrap.dedent(
+    file_obj = StringIO(textwrap.dedent(
       """\
       ,,,
       ,,,
@@ -14,6 +14,15 @@ if six.PY2:

 log = logging.getLogger(__name__)

+def empty(value):
+  """ Return True if the value is None or stringifies to an empty/blank string. """
+  if value is None:
+    return True
+  if not isinstance(value, six.string_types):
+    value = six.text_type(value)
+  return not value.strip()
+
+
 # Get path to an imported file.
 def get_path(file_source):
   importdir = os.environ.get('IMPORTDIR') or '/importdir'
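
Since rows are now plain lists of strings rather than messytables cell objects, the new empty() helper stands in for the old cell.empty property. A few spot checks of its behaviour (assuming the empty() helper above is in scope):

assert empty(None)
assert empty("")
assert empty("   ")
assert not empty("x")
assert not empty(0)  # stringifies to "0", which is non-blank
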
@@ -39,7 +48,7 @@ def _is_header(header, data_rows):
   """
   # See if the row has any non-text values.
   for cell in header:
-    if not isinstance(cell.value, six.string_types) or _is_numeric(cell.value):
+    if not isinstance(cell, six.string_types) or _is_numeric(cell):
       return False


@@ -48,7 +57,7 @@ def _is_header(header, data_rows):
   count_repeats = [0 for cell in header]
   for row in data_rows:
     for cell, header_cell in zip(row, header):
-      if cell.value and cell.value == header_cell.value:
+      if cell and cell == header_cell:
         return False

   return True
@@ -59,7 +68,7 @@ def _count_nonempty(row):
   """
   count = 0
   for i, c in enumerate(row):
-    if not c.empty:
+    if not empty(c):
       count = i + 1
   return count

@@ -83,7 +92,7 @@ def expand_headers(headers, data_offset, rows):
   row_length = max(itertools.chain([len(headers)],
                                    (_count_nonempty(r) for r in itertools.islice(rows, data_offset,
                                                                                  None))))
-  header_values = [h.value.strip() for h in headers] + [u''] * (row_length - len(headers))
+  header_values = [h.strip() for h in headers] + [u''] * (row_length - len(headers))
   return header_values

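
With the empty() helper in place, _count_nonempty returns the index one past the last non-blank cell, which expand_headers uses to size the header row. A toy check (assuming _count_nonempty is in scope):

assert _count_nonempty(["a", "", "b", "", ""]) == 3
assert _count_nonempty(["", "", ""]) == 0
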
@@ -1,22 +0,0 @@
-import unittest
-import messytables
-import os
-
-class TestMessyTables(unittest.TestCase):
-
-  # Just a skeleton test
-  def test_any_tableset(self):
-    path = os.path.join(os.path.dirname(__file__),
-                        "fixtures", "nyc_schools_progress_report_ec_2013.xlsx")
-    with open(path, "rb") as f:
-      table_set = messytables.any.any_tableset(f, extension=os.path.splitext(path)[1])
-
-    self.assertIsInstance(table_set, messytables.XLSTableSet)
-    self.assertEqual([t.name for t in table_set.tables],
-                     ['Summary', 'Student Progress', 'Student Performance', 'School Environment',
-                      'Closing the Achievement Gap', 'Middle School Course Metrics',
-                      'All Information', 'Peer Groups'])
-
-
-if __name__ == "__main__":
-  unittest.main()
@@ -9,6 +9,7 @@ tests under 'arc unit' and under Jenkins.
 ./sandbox/nacl/bin/run python /grist/runtests.py [--xunit]
 """
 import codecs
+import logging
 import os
 import sys
 import unittest
@@ -30,6 +31,9 @@ def main():
   utf8_stdout = codecs.getwriter('utf8')(utf8_stdout)
   test_runner = xmlrunner.XMLTestRunner(stream=utf8_stdout)

+  if "-v" in argv or "--verbose" in argv:
+    logging.basicConfig(level=logging.DEBUG)
+
   if all(arg.startswith("-") for arg in argv[1:]):
     argv.insert(1, "discover")
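
Usage note: with this change, passing -v or --verbose to the test runner (e.g. ./sandbox/nacl/bin/run python /grist/runtests.py -v) raises the logging level to DEBUG, so the log.info and log.debug output from the import plugin becomes visible while the tests run.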