|
|
|
@ -1,11 +1,11 @@
|
|
|
|
|
"""
|
|
|
|
|
Plugin for importing CSV files
|
|
|
|
|
"""
|
|
|
|
|
import os
|
|
|
|
|
import codecs
|
|
|
|
|
import csv
|
|
|
|
|
import logging
|
|
|
|
|
|
|
|
|
|
import chardet
|
|
|
|
|
import messytables
|
|
|
|
|
import six
|
|
|
|
|
from six.moves import zip
|
|
|
|
|
|
|
|
|
def parse_file(file_path, parse_options=None):
  """
  Parse the CSV file at `file_path` and return a tuple
  (parsing_options, export_list):

    parsing_options: dict of the options actually used (guessed dialect
        merged with any user-supplied overrides).
    export_list: list of table descriptions (table_name, column_metadata,
        table_data) ready for a bulk add-records action.

  The file encoding is detected with chardet from an initial 100000-byte
  sample; `None`/ASCII results are upgraded to UTF-8 (UTF-8 is a strict
  superset of ASCII, and non-ASCII bytes may occur beyond the sample).
  """
  parse_options = parse_options or {}

  # First pass: read a raw byte sample only to detect the encoding.
  with codecs.open(file_path, "rb") as f:
    sample = f.read(100000)
  encoding = chardet.detect(sample)['encoding'] or "utf8"
  # In addition, always prefer UTF8 over ASCII.
  if encoding == 'ascii':
    encoding = 'utf8'
  log.info("Using encoding %s", encoding)

  # Second pass: reopen in text mode with the detected encoding for parsing.
  with codecs.open(file_path, mode="r", encoding=encoding) as f:
    parsing_options, export_list = _parse_open_file(f, parse_options=parse_options)
    return parsing_options, export_list
|
|
|
|
def _guess_dialect(file_obj):
  """
  Guess the CSV dialect of an open file object by sniffing an initial
  100000-character sample.

  Returns the guessed `csv.Dialect`, falling back to `csv.excel` when
  sniffing raises `csv.Error`. The file position is always rewound to the
  start before returning, so the caller can re-read from the beginning.
  """
  try:
    # Restrict allowed delimiters to prevent guessing other char than this list.
    dialect = csv.Sniffer().sniff(file_obj.read(100000), delimiters=['\t', ',', ';', '|'])
  except csv.Error:
    log.info("Cannot guess dialect using Excel as fallback.")
    return csv.excel
  else:
    # Lazy %-args keep logging consistent with the rest of this module.
    log.info("Guessed dialect %s", dict(dialect.__dict__))
    return dialect
  finally:
    file_obj.seek(0)
|
|
|
|
|
def _parse_open_file(file_obj, parse_options=None):
  """
  Parse an already-open text-mode file object as CSV.

  Returns (options, export_list):
    options: dict echoing the parse options actually used — the guessed
        dialect merged with user overrides — plus import bookkeeping keys
        (include_col_names_as_headers, start_with_row, NUM_ROWS, SCHEMA).
    export_list: a one-element list describing the parsed table, or
        ({}, []) when no usable data is found.
  """
  parse_options = parse_options or {}

  csv_keys = ['delimiter', 'quotechar', 'lineterminator', 'doublequote', 'skipinitialspace']
  # Guess the dialect from the file contents; user-supplied options win.
  dialect = _guess_dialect(file_obj)
  csv_options = {k: parse_options.get(k, getattr(dialect, k, None)) for k in csv_keys}
  if six.PY2:
    # Python 2's csv module requires byte-string format parameters.
    csv_options = {k: v.encode('utf8') if isinstance(v, six.text_type) else v
                   for k, v in csv_options.items()}

  # Drop unset options so csv.reader falls back to its own defaults.
  csv_options = {k: v for k, v in csv_options.items() if v is not None}
  reader = csv.reader(file_obj, **csv_options)

  num_rows = parse_options.get('NUM_ROWS', 0)

  table_name = None
  rows = list(reader)
  sample_len = 100
  sample_rows = rows[:sample_len]
  data_offset, headers = import_utils.headers_guess(sample_rows)

  # Make sure all header values are strings.
  for i, header in enumerate(headers):
    if not isinstance(header, six.string_types):
      headers[i] = six.text_type(header)

  log.info("Guessed data_offset as %s", data_offset)
  log.info("Guessed headers as: %s", headers)

  have_guessed_headers = any(headers)
  include_col_names_as_headers = parse_options.get('include_col_names_as_headers',
                                                   have_guessed_headers)

  if include_col_names_as_headers and not have_guessed_headers:
    # use first line as headers
    data_offset, first_row = import_utils.find_first_non_empty_row(sample_rows)
    headers = import_utils.expand_headers(first_row, data_offset, sample_rows)
  elif not include_col_names_as_headers and have_guessed_headers:
    # move guessed headers to data
    data_offset -= 1
    headers = [''] * len(headers)

  rows = rows[data_offset:]
  table_data_with_types = parse_data.get_table_data(rows, len(headers), num_rows)

  # Identify and remove empty columns, and populate separate metadata and data lists.
  column_metadata = []
  table_data = []
  for col_data, header in zip(table_data_with_types, headers):
    if not header and all(val == "" for val in col_data["data"]):
      continue  # empty column
    data = col_data.pop("data")
    col_data["id"] = header
    column_metadata.append(col_data)
    table_data.append(data)

  if not table_data:
    log.info("No data found. Aborting CSV import.")
    # Don't add tables with no columns.
    return {}, []

  # Echo back the effective parse options so the UI can show/reuse them.
  guessed = reader.dialect
  options = {"delimiter": parse_options.get('delimiter', guessed.delimiter),
             "doublequote": parse_options.get('doublequote', guessed.doublequote),
             "lineterminator": parse_options.get('lineterminator', guessed.lineterminator),
             "quotechar": parse_options.get('quotechar', guessed.quotechar),
             "skipinitialspace": parse_options.get('skipinitialspace', guessed.skipinitialspace),
             "include_col_names_as_headers": include_col_names_as_headers,
             "start_with_row": 1,
             "NUM_ROWS": num_rows,
             "SCHEMA": SCHEMA
             }

  log.info("Output table %r with %d columns", table_name, len(column_metadata))
  for c in column_metadata:
    log.debug("Output column %s", c)

  export_list = [{
    "table_name": table_name,
    "column_metadata": column_metadata,
    "table_data": table_data
  }]

  return options, export_list