mirror of
https://github.com/gristlabs/grist-core.git
synced 2024-10-27 20:44:07 +00:00
(core) Switch excel import parsing from messytables+xlrd to openpyxl, and ignore empty rows
Summary: Use openpyxl instead of messytables (which used xlrd internally) in import_xls.py. Skip empty rows since excel files can easily contain huge numbers of them. Drop support for xls files (which openpyxl doesn't support) in favour of the newer xlsx format. Fix some details relating to python virtualenvs and dependencies, as Jenkins was failing to find new Python dependencies. Test Plan: Mostly relying on existing tests. Updated various tests which referred to xls files instead of xlsx. Added a Python test for skipping empty rows. Reviewers: georgegevoian Reviewed By: georgegevoian Differential Revision: https://phab.getgrist.com/D3406
This commit is contained in:
parent
0a61d740ba
commit
6c90de4d62
@ -32,7 +32,7 @@ export interface SelectFileOptions extends UploadOptions {
|
|||||||
// e.g. [".jpg", ".png"]
|
// e.g. [".jpg", ".png"]
|
||||||
}
|
}
|
||||||
|
|
||||||
export const IMPORTABLE_EXTENSIONS = [".grist", ".csv", ".tsv", ".txt", ".xls", ".xlsx", ".xlsm"];
|
export const IMPORTABLE_EXTENSIONS = [".grist", ".csv", ".tsv", ".txt", ".xlsx", ".xlsm"];
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Shows the file-picker dialog with the given options, and uploads the selected files. If under
|
* Shows the file-picker dialog with the given options, and uploads the selected files. If under
|
||||||
|
@ -271,8 +271,8 @@ export class ActiveDocImport {
|
|||||||
/**
|
/**
|
||||||
* Imports the data stored at tmpPath.
|
* Imports the data stored at tmpPath.
|
||||||
*
|
*
|
||||||
* Currently it starts a python parser (that relies on the messytables library) as a child process
|
* Currently it starts a python parser as a child process
|
||||||
* outside the sandbox, and supports xls(x), csv, txt, and perhaps some other formats. It may
|
* outside the sandbox, and supports xlsx, csv, and perhaps some other formats. It may
|
||||||
* result in the import of multiple tables, in case of e.g. Excel formats.
|
* result in the import of multiple tables, in case of e.g. Excel formats.
|
||||||
* @param {OptDocSession} docSession: Session instance to use for importing.
|
* @param {OptDocSession} docSession: Session instance to use for importing.
|
||||||
* @param {String} tmpPath: The path from of the original file.
|
* @param {String} tmpPath: The path from of the original file.
|
||||||
|
@ -136,7 +136,6 @@ export class DocPluginManager {
|
|||||||
if (messages.length) {
|
if (messages.length) {
|
||||||
const extToType: Record<string, string> = {
|
const extToType: Record<string, string> = {
|
||||||
'.xlsx' : 'Excel',
|
'.xlsx' : 'Excel',
|
||||||
'.xls' : 'Excel',
|
|
||||||
'.json' : 'JSON',
|
'.json' : 'JSON',
|
||||||
'.csv' : 'CSV',
|
'.csv' : 'CSV',
|
||||||
};
|
};
|
||||||
|
@ -559,7 +559,8 @@ function gvisor(options: ISandboxOptions): SandboxProcess {
|
|||||||
// if checkpoints are in use.
|
// if checkpoints are in use.
|
||||||
const venv = path.join(process.cwd(),
|
const venv = path.join(process.cwd(),
|
||||||
pythonVersion === '2' ? 'venv' : 'sandbox_venv3');
|
pythonVersion === '2' ? 'venv' : 'sandbox_venv3');
|
||||||
if (fs.existsSync(venv) && !process.env.GRIST_CHECKPOINT) {
|
const useCheckpoint = process.env.GRIST_CHECKPOINT && !paths.importDir;
|
||||||
|
if (fs.existsSync(venv) && !useCheckpoint) {
|
||||||
wrapperArgs.addMount(venv);
|
wrapperArgs.addMount(venv);
|
||||||
wrapperArgs.push('-s', path.join(venv, 'bin', 'python'));
|
wrapperArgs.push('-s', path.join(venv, 'bin', 'python'));
|
||||||
}
|
}
|
||||||
@ -570,17 +571,16 @@ function gvisor(options: ISandboxOptions): SandboxProcess {
|
|||||||
// between the checkpoint and how it gets used later).
|
// between the checkpoint and how it gets used later).
|
||||||
// If a sandbox is being used for import, it will have a special mount we can't
|
// If a sandbox is being used for import, it will have a special mount we can't
|
||||||
// deal with easily right now. Should be possible to do in future if desired.
|
// deal with easily right now. Should be possible to do in future if desired.
|
||||||
if (options.useGristEntrypoint && pythonVersion === '3' && !paths.importDir &&
|
if (options.useGristEntrypoint && pythonVersion === '3' && useCheckpoint) {
|
||||||
process.env.GRIST_CHECKPOINT) {
|
|
||||||
if (process.env.GRIST_CHECKPOINT_MAKE) {
|
if (process.env.GRIST_CHECKPOINT_MAKE) {
|
||||||
const child =
|
const child =
|
||||||
spawn(command, [...wrapperArgs.get(), '--checkpoint', process.env.GRIST_CHECKPOINT,
|
spawn(command, [...wrapperArgs.get(), '--checkpoint', process.env.GRIST_CHECKPOINT!,
|
||||||
`python${pythonVersion}`, '--', ...pythonArgs]);
|
`python${pythonVersion}`, '--', ...pythonArgs]);
|
||||||
// We don't want process control for this.
|
// We don't want process control for this.
|
||||||
return {child, control: new NoProcessControl(child)};
|
return {child, control: new NoProcessControl(child)};
|
||||||
}
|
}
|
||||||
wrapperArgs.push('--restore');
|
wrapperArgs.push('--restore');
|
||||||
wrapperArgs.push(process.env.GRIST_CHECKPOINT);
|
wrapperArgs.push(process.env.GRIST_CHECKPOINT!);
|
||||||
}
|
}
|
||||||
const child = spawn(command, [...wrapperArgs.get(), `python${pythonVersion}`, '--', ...pythonArgs]);
|
const child = spawn(command, [...wrapperArgs.get(), `python${pythonVersion}`, '--', ...pythonArgs]);
|
||||||
// For gvisor under ptrace, main work is done by a traced process identifiable as
|
// For gvisor under ptrace, main work is done by a traced process identifiable as
|
||||||
|
@ -21,7 +21,7 @@ export async function guessExt(filePath: string, fileName: string, mimeType: str
|
|||||||
return mimeExt;
|
return mimeExt;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (origExt === ".csv" || origExt === ".xls") {
|
if (origExt === ".csv") {
|
||||||
// File type detection doesn't work for these, and mime type can't be trusted. E.g. Windows
|
// File type detection doesn't work for these, and mime type can't be trusted. E.g. Windows
|
||||||
// may report "application/vnd.ms-excel" for .csv files. See
|
// may report "application/vnd.ms-excel" for .csv files. See
|
||||||
// https://github.com/ManifoldScholar/manifold/issues/2409#issuecomment-545152220
|
// https://github.com/ManifoldScholar/manifold/issues/2409#issuecomment-545152220
|
||||||
|
@ -5,11 +5,11 @@ components:
|
|||||||
safePython: sandbox/main.py
|
safePython: sandbox/main.py
|
||||||
contributions:
|
contributions:
|
||||||
fileParsers:
|
fileParsers:
|
||||||
- fileExtensions: ["csv"]
|
- fileExtensions: ["csv", "tsv", "txt"]
|
||||||
parseFile:
|
parseFile:
|
||||||
component: safePython
|
component: safePython
|
||||||
name: csv_parser
|
name: csv_parser
|
||||||
- fileExtensions: ["xls", "xlsx", "tsv", "txt", "xlsm"]
|
- fileExtensions: ["xlsx", "xlsm"]
|
||||||
parseFile:
|
parseFile:
|
||||||
component: safePython
|
component: safePython
|
||||||
name: xls_parser
|
name: xls_parser
|
||||||
|
@ -0,0 +1,16 @@
|
|||||||
|
import warnings
|
||||||
|
|
||||||
|
import six
|
||||||
|
|
||||||
|
original_formatwarning = warnings.formatwarning
|
||||||
|
|
||||||
|
|
||||||
|
def formatwarning(*args, **kwargs):
|
||||||
|
"""
|
||||||
|
Fixes an error on Jenkins where byte strings (instead of unicode)
|
||||||
|
were being written to stderr due to a warning from an internal library.
|
||||||
|
"""
|
||||||
|
return six.ensure_text(original_formatwarning(*args, **kwargs))
|
||||||
|
|
||||||
|
|
||||||
|
warnings.formatwarning = formatwarning
|
BIN
sandbox/grist/imports/fixtures/test_empty_rows.xlsx
Normal file
BIN
sandbox/grist/imports/fixtures/test_empty_rows.xlsx
Normal file
Binary file not shown.
@ -150,8 +150,11 @@ def _parse_open_file(file_obj, parse_options=None):
|
|||||||
headers = [''] * len(headers)
|
headers = [''] * len(headers)
|
||||||
|
|
||||||
row_set.register_processor(messytables.offset_processor(data_offset))
|
row_set.register_processor(messytables.offset_processor(data_offset))
|
||||||
|
rows = [
|
||||||
table_data_with_types = parse_data.get_table_data(row_set, len(headers), num_rows)
|
[cell.value for cell in row]
|
||||||
|
for row in row_set
|
||||||
|
]
|
||||||
|
table_data_with_types = parse_data.get_table_data(rows, len(headers), num_rows)
|
||||||
|
|
||||||
# Identify and remove empty columns, and populate separate metadata and data lists.
|
# Identify and remove empty columns, and populate separate metadata and data lists.
|
||||||
column_metadata = []
|
column_metadata = []
|
||||||
|
@ -2,13 +2,10 @@
|
|||||||
This module reads a file path that is passed in using ActiveDoc.importFile()
|
This module reads a file path that is passed in using ActiveDoc.importFile()
|
||||||
and returns a object formatted so that it can be used by grist for a bulk add records action
|
and returns a object formatted so that it can be used by grist for a bulk add records action
|
||||||
"""
|
"""
|
||||||
import csv
|
|
||||||
import logging
|
import logging
|
||||||
import os
|
|
||||||
|
|
||||||
import chardet
|
|
||||||
import messytables
|
import messytables
|
||||||
import messytables.excel
|
import openpyxl
|
||||||
import six
|
import six
|
||||||
from six.moves import zip
|
from six.moves import zip
|
||||||
|
|
||||||
@ -18,76 +15,58 @@ from imports import import_utils
|
|||||||
log = logging.getLogger(__name__)
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
def import_file(file_source, parse_options):
|
def import_file(file_source):
|
||||||
path = import_utils.get_path(file_source["path"])
|
path = import_utils.get_path(file_source["path"])
|
||||||
orig_name = file_source["origName"]
|
parse_options, tables = parse_file(path)
|
||||||
parse_options, tables = parse_file(path, orig_name, parse_options)
|
|
||||||
return {"parseOptions": parse_options, "tables": tables}
|
return {"parseOptions": parse_options, "tables": tables}
|
||||||
|
|
||||||
# messytable is painfully un-extensible, so we have to jump through dumb hoops to override any
|
|
||||||
# behavior.
|
|
||||||
orig_dialect = messytables.CSVRowSet._dialect
|
|
||||||
def override_dialect(self):
|
|
||||||
if self.delimiter == '\t':
|
|
||||||
return csv.excel_tab
|
|
||||||
return orig_dialect.fget(self)
|
|
||||||
messytables.CSVRowSet._dialect = property(override_dialect)
|
|
||||||
|
|
||||||
def parse_file(file_path, orig_name, parse_options=None, table_name_hint=None, num_rows=None):
|
def parse_file(file_path):
|
||||||
# pylint: disable=unused-argument
|
|
||||||
with open(file_path, "rb") as f:
|
with open(file_path, "rb") as f:
|
||||||
try:
|
return parse_open_file(f)
|
||||||
return parse_open_file(f, orig_name, table_name_hint=table_name_hint)
|
|
||||||
except Exception as e:
|
|
||||||
# Log the full error, but simplify the thrown error to omit the unhelpful extra args.
|
|
||||||
log.warn("import_xls parse_file failed: %s", e)
|
|
||||||
if six.PY2 and e.args and isinstance(e.args[0], six.string_types):
|
|
||||||
raise Exception(e.args[0])
|
|
||||||
raise
|
|
||||||
|
|
||||||
|
|
||||||
def parse_open_file(file_obj, orig_name, table_name_hint=None):
|
def parse_open_file(file_obj):
|
||||||
file_root, file_ext = os.path.splitext(orig_name)
|
workbook = openpyxl.load_workbook(
|
||||||
table_set = messytables.any.any_tableset(file_obj, extension=file_ext, auto_detect=False)
|
file_obj,
|
||||||
|
read_only=True,
|
||||||
# Messytable's encoding detection uses too small a sample, so we override it here.
|
keep_vba=False,
|
||||||
if isinstance(table_set, messytables.CSVTableSet):
|
data_only=True,
|
||||||
sample = file_obj.read(100000)
|
keep_links=False,
|
||||||
table_set.encoding = chardet.detect(sample)['encoding']
|
)
|
||||||
# In addition, always prefer UTF8 over ASCII.
|
|
||||||
if table_set.encoding == 'ascii':
|
|
||||||
table_set.encoding = 'utf8'
|
|
||||||
|
|
||||||
skipped_tables = 0
|
skipped_tables = 0
|
||||||
export_list = []
|
export_list = []
|
||||||
# A table set is a collection of tables:
|
# A table set is a collection of tables:
|
||||||
for row_set in table_set.tables:
|
for sheet in workbook:
|
||||||
table_name = row_set.name
|
table_name = sheet.title
|
||||||
|
rows = [
|
||||||
if isinstance(row_set, messytables.CSVRowSet):
|
row
|
||||||
# For csv files, we can do better for table_name by using the filename.
|
for row in sheet.iter_rows(values_only=True)
|
||||||
table_name = import_utils.capitalize(table_name_hint or
|
# Exclude empty rows, i.e. rows with only empty values.
|
||||||
os.path.basename(file_root.decode('utf8')))
|
# `if not any(row)` would be slightly faster, but would count `0` as empty.
|
||||||
|
if not set(row) <= {None, ""}
|
||||||
# Messytables doesn't guess whether headers are present, so we need to step in.
|
]
|
||||||
data_offset, headers = import_utils.headers_guess(list(row_set.sample))
|
sample = [
|
||||||
else:
|
# Create messytables.Cells for the sake of messytables.headers_guess
|
||||||
# Let messytables guess header names and the offset of the header.
|
[messytables.Cell(cell) for cell in row]
|
||||||
offset, headers = messytables.headers_guess(row_set.sample)
|
for row in rows[:1000]
|
||||||
data_offset = offset + 1 # Add the header line
|
]
|
||||||
|
offset, headers = messytables.headers_guess(sample)
|
||||||
|
data_offset = offset + 1 # Add the header line
|
||||||
|
rows = rows[data_offset:]
|
||||||
|
|
||||||
# Make sure all header values are strings.
|
# Make sure all header values are strings.
|
||||||
for i, header in enumerate(headers):
|
for i, header in enumerate(headers):
|
||||||
if not isinstance(header, six.string_types):
|
if header is None:
|
||||||
|
headers[i] = u''
|
||||||
|
elif not isinstance(header, six.string_types):
|
||||||
headers[i] = six.text_type(header)
|
headers[i] = six.text_type(header)
|
||||||
|
|
||||||
log.debug("Guessed data_offset as %s", data_offset)
|
log.debug("Guessed data_offset as %s", data_offset)
|
||||||
log.debug("Guessed headers as: %s", headers)
|
log.debug("Guessed headers as: %s", headers)
|
||||||
|
|
||||||
row_set.register_processor(messytables.offset_processor(data_offset))
|
table_data_with_types = parse_data.get_table_data(rows, len(headers))
|
||||||
|
|
||||||
|
|
||||||
table_data_with_types = parse_data.get_table_data(row_set, len(headers))
|
|
||||||
|
|
||||||
# Identify and remove empty columns, and populate separate metadata and data lists.
|
# Identify and remove empty columns, and populate separate metadata and data lists.
|
||||||
column_metadata = []
|
column_metadata = []
|
||||||
@ -121,36 +100,3 @@ def parse_open_file(file_obj, orig_name, table_name_hint=None):
|
|||||||
|
|
||||||
parse_options = {}
|
parse_options = {}
|
||||||
return parse_options, export_list
|
return parse_options, export_list
|
||||||
|
|
||||||
|
|
||||||
# This change was initially introduced in https://phab.getgrist.com/D2145
|
|
||||||
# Monkey-patching done in https://phab.getgrist.com/D2965
|
|
||||||
# to move towards normal dependency management
|
|
||||||
@staticmethod
|
|
||||||
def from_xlrdcell(xlrd_cell, sheet, col, row):
|
|
||||||
from messytables.excel import (
|
|
||||||
XLS_TYPES, StringType, DateType, InvalidDateError, xlrd, time, datetime, XLSCell
|
|
||||||
)
|
|
||||||
value = xlrd_cell.value
|
|
||||||
cell_type = XLS_TYPES.get(xlrd_cell.ctype, StringType())
|
|
||||||
if cell_type == DateType(None):
|
|
||||||
# Try-catch added by Dmitry, to avoid failing even if we see a date we can't handle.
|
|
||||||
try:
|
|
||||||
if value == 0:
|
|
||||||
raise InvalidDateError
|
|
||||||
year, month, day, hour, minute, second = \
|
|
||||||
xlrd.xldate_as_tuple(value, sheet.book.datemode)
|
|
||||||
if (year, month, day) == (0, 0, 0):
|
|
||||||
value = time(hour, minute, second)
|
|
||||||
else:
|
|
||||||
value = datetime(year, month, day, hour, minute, second)
|
|
||||||
except Exception:
|
|
||||||
# Keep going, and we'll just interpret the date as a number.
|
|
||||||
pass
|
|
||||||
messy_cell = XLSCell(value, type=cell_type)
|
|
||||||
messy_cell.sheet = sheet
|
|
||||||
messy_cell.xlrd_cell = xlrd_cell
|
|
||||||
messy_cell.xlrd_pos = (row, col) # necessary for properties, note not (x,y)
|
|
||||||
return messy_cell
|
|
||||||
|
|
||||||
messytables.excel.XLSCell.from_xlrdcell = from_xlrdcell
|
|
||||||
|
@ -6,8 +6,9 @@ def register_import_parsers(sandbox):
|
|||||||
sandbox.register("csv_parser.parseFile", parse_csv)
|
sandbox.register("csv_parser.parseFile", parse_csv)
|
||||||
|
|
||||||
def parse_excel(file_source, parse_options):
|
def parse_excel(file_source, parse_options):
|
||||||
|
# pylint: disable=unused-argument
|
||||||
from imports.import_xls import import_file
|
from imports.import_xls import import_file
|
||||||
return import_file(file_source, parse_options)
|
return import_file(file_source)
|
||||||
|
|
||||||
sandbox.register("xls_parser.parseFile", parse_excel)
|
sandbox.register("xls_parser.parseFile", parse_excel)
|
||||||
|
|
||||||
|
@ -8,7 +8,7 @@ import unittest
|
|||||||
from imports import import_xls
|
from imports import import_xls
|
||||||
|
|
||||||
def _get_fixture(filename):
|
def _get_fixture(filename):
|
||||||
return [os.path.join(os.path.dirname(__file__), "fixtures", filename), filename]
|
return [os.path.join(os.path.dirname(__file__), "fixtures", filename)]
|
||||||
|
|
||||||
|
|
||||||
class TestImportXLS(unittest.TestCase):
|
class TestImportXLS(unittest.TestCase):
|
||||||
@ -42,7 +42,7 @@ class TestImportXLS(unittest.TestCase):
|
|||||||
{"type": "Any", "id": "corner-cases"})
|
{"type": "Any", "id": "corner-cases"})
|
||||||
self.assertEqual(parsed_file[1][0]["table_data"][3],
|
self.assertEqual(parsed_file[1][0]["table_data"][3],
|
||||||
# The type is detected as text, so all values should be text.
|
# The type is detected as text, so all values should be text.
|
||||||
[u'=function()', u'3.0', u'two spaces after ',
|
[u'=function()', u'3', u'two spaces after ',
|
||||||
u' two spaces before', u'!@#$', u'€€€', u'√∫abc$$', u'line\nbreak'])
|
u' two spaces before', u'!@#$', u'€€€', u'√∫abc$$', u'line\nbreak'])
|
||||||
|
|
||||||
# check that multiple tables are created when there are multiple sheets in a document
|
# check that multiple tables are created when there are multiple sheets in a document
|
||||||
@ -53,12 +53,12 @@ class TestImportXLS(unittest.TestCase):
|
|||||||
def test_excel_types(self):
|
def test_excel_types(self):
|
||||||
parsed_file = import_xls.parse_file(*_get_fixture('test_excel_types.xlsx'))
|
parsed_file = import_xls.parse_file(*_get_fixture('test_excel_types.xlsx'))
|
||||||
sheet = parsed_file[1][0]
|
sheet = parsed_file[1][0]
|
||||||
self._check_col(sheet, 0, "int1", "Numeric", [-1234123, '', ''])
|
self._check_col(sheet, 0, "int1", "Numeric", [-1234123, None, None])
|
||||||
self._check_col(sheet, 1, "int2", "Numeric", [5, '', ''])
|
self._check_col(sheet, 1, "int2", "Numeric", [5, None, None])
|
||||||
self._check_col(sheet, 2, "textint", "Any", ["12345678902345689", '', ''])
|
self._check_col(sheet, 2, "textint", "Any", ["12345678902345689", '', ''])
|
||||||
self._check_col(sheet, 3, "bigint", "Any", ["320150170634561830", '', ''])
|
self._check_col(sheet, 3, "bigint", "Any", ["320150170634561830", '', ''])
|
||||||
self._check_col(sheet, 4, "num2", "Numeric", [123456789.123456, '', ''])
|
self._check_col(sheet, 4, "num2", "Numeric", [123456789.123456, None, None])
|
||||||
self._check_col(sheet, 5, "bignum", "Numeric", [math.exp(200), '', ''])
|
self._check_col(sheet, 5, "bignum", "Numeric", [math.exp(200), None, None])
|
||||||
self._check_col(sheet, 6, "date1", "DateTime",
|
self._check_col(sheet, 6, "date1", "DateTime",
|
||||||
[calendar.timegm(datetime.datetime(2015, 12, 22, 11, 59, 00).timetuple()), None, None])
|
[calendar.timegm(datetime.datetime(2015, 12, 22, 11, 59, 00).timetuple()), None, None])
|
||||||
self._check_col(sheet, 7, "date2", "Date",
|
self._check_col(sheet, 7, "date2", "Date",
|
||||||
@ -78,7 +78,7 @@ class TestImportXLS(unittest.TestCase):
|
|||||||
None, 1452038400.0, 1451549340.0, 1483214940.0, None,
|
None, 1452038400.0, 1451549340.0, 1483214940.0, None,
|
||||||
1454544000.0, 1199577600.0, 1451692800.0, 1451549340.0, 1483214940.0])
|
1454544000.0, 1199577600.0, 1451692800.0, 1451549340.0, 1483214940.0])
|
||||||
self._check_col(sheet, 1, "float_not_int", "Numeric",
|
self._check_col(sheet, 1, "float_not_int", "Numeric",
|
||||||
[1,2,3,4,5,"",6,7,8,9,10,10.25,11,12,13,14,15,16,17,18])
|
[1,2,3,4,5,None,6,7,8,9,10,10.25,11,12,13,14,15,16,17,18])
|
||||||
self._check_col(sheet, 2, "int_not_bool", "Any",
|
self._check_col(sheet, 2, "int_not_bool", "Any",
|
||||||
[0, 0, 1, 0, 1, 0, 0, 1, 0, 2, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0])
|
[0, 0, 1, 0, 1, 0, 0, 1, 0, 2, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0])
|
||||||
self._check_col(sheet, 3, "float_not_bool", "Any",
|
self._check_col(sheet, 3, "float_not_bool", "Any",
|
||||||
@ -91,12 +91,11 @@ class TestImportXLS(unittest.TestCase):
|
|||||||
[4.0, 6.0, 4.0, 4.0, 6.0, 4.0, '--', 6.0, 4.0, 4.0, 4.0, 4.0, 4.0, 6.0, 6.0,
|
[4.0, 6.0, 4.0, 4.0, 6.0, 4.0, '--', 6.0, 4.0, 4.0, 4.0, 4.0, 4.0, 6.0, 6.0,
|
||||||
4.0, 6.0, '3-4', 4.0, 6.5])
|
4.0, 6.0, '3-4', 4.0, 6.5])
|
||||||
self._check_col(sheet, 7, "float_not_text", "Numeric",
|
self._check_col(sheet, 7, "float_not_text", "Numeric",
|
||||||
[-10.25, -8.00, -5.75, -3.50, "n/a", ' 1. ', " ??? ", 5.50, "", "-",
|
[-10.25, -8.00, -5.75, -3.50, "n/a", ' 1. ', " ??? ", 5.50, None, "-",
|
||||||
12.25, 0.00, "", 0.00, "--", 23.50, "NA", 28.00, 30.25, 32.50])
|
12.25, 0.00, None, 0.00, "--", 23.50, "NA", 28.00, 30.25, 32.50])
|
||||||
|
|
||||||
def test_excel_single_merged_cell(self):
|
def test_excel_single_merged_cell(self):
|
||||||
# An older version of xlrd had a bug where a single cell marked as 'merged' would cause an
|
# An older version had a bug where a single cell marked as 'merged' would cause an exception.
|
||||||
# exception.
|
|
||||||
parsed_file = import_xls.parse_file(*_get_fixture('test_single_merged_cell.xlsx'))
|
parsed_file = import_xls.parse_file(*_get_fixture('test_single_merged_cell.xlsx'))
|
||||||
tables = parsed_file[1]
|
tables = parsed_file[1]
|
||||||
self.assertEqual(tables, [{
|
self.assertEqual(tables, [{
|
||||||
@ -110,21 +109,19 @@ class TestImportXLS(unittest.TestCase):
|
|||||||
],
|
],
|
||||||
'table_data': [
|
'table_data': [
|
||||||
[u'SINGLE MERGED', u'The End'],
|
[u'SINGLE MERGED', u'The End'],
|
||||||
[1637384.52, u''],
|
[1637384.52, None],
|
||||||
[2444344.06, u''],
|
[2444344.06, None],
|
||||||
[2444344.06, u''],
|
[2444344.06, None],
|
||||||
[u'', u''],
|
[u'', u''],
|
||||||
],
|
],
|
||||||
}])
|
}])
|
||||||
|
|
||||||
def test_excel_strange_dates(self):
|
def test_excel_strange_dates(self):
|
||||||
# TODO fails with xlrd.xldate.XLDateAmbiguous: 4.180902777777778
|
|
||||||
# Check that we don't fail when encountering unusual dates and times (e.g. 0 or 38:00:00).
|
# Check that we don't fail when encountering unusual dates and times (e.g. 0 or 38:00:00).
|
||||||
parsed_file = import_xls.parse_file(*_get_fixture('strange_dates.xlsx'))
|
parsed_file = import_xls.parse_file(*_get_fixture('strange_dates.xlsx'))
|
||||||
tables = parsed_file[1]
|
tables = parsed_file[1]
|
||||||
# We test non-failure, but the result is not really what we want. E.g. "1:10" and "100:20:30"
|
# We test non-failure, but the result is not really what we want. E.g. "1:10" and "100:20:30"
|
||||||
# would be best left as text, but here become "01:10:00" (after xlrd parses the first as
|
# would be best left as text.
|
||||||
# datetime.time), and as 4.18... (after xlrd fails and we resort to the numerical value).
|
|
||||||
self.assertEqual(tables, [{
|
self.assertEqual(tables, [{
|
||||||
'table_name': u'Sheet1',
|
'table_name': u'Sheet1',
|
||||||
'column_metadata': [
|
'column_metadata': [
|
||||||
@ -132,22 +129,41 @@ class TestImportXLS(unittest.TestCase):
|
|||||||
{'id': 'b', 'type': 'Date'},
|
{'id': 'b', 'type': 'Date'},
|
||||||
{'id': 'c', 'type': 'Any'},
|
{'id': 'c', 'type': 'Any'},
|
||||||
{'id': 'd', 'type': 'Any'},
|
{'id': 'd', 'type': 'Any'},
|
||||||
{'id': 'e', 'type': 'Numeric'},
|
{'id': 'e', 'type': 'DateTime'},
|
||||||
{'id': 'f', 'type': 'Numeric'},
|
{'id': 'f', 'type': 'Date'},
|
||||||
{'id': 'g', 'type': 'Any'},
|
{'id': 'g', 'type': 'Any'},
|
||||||
{'id': 'h', 'type': 'Date'},
|
{'id': 'h', 'type': 'Date'},
|
||||||
{'id': 'i', 'type': 'Numeric'},
|
{'id': 'i', 'type': 'Date'},
|
||||||
],
|
],
|
||||||
'table_data': [
|
'table_data': [
|
||||||
[u'21:14:00'],
|
[u'21:14:00'],
|
||||||
[1568851200.0],
|
[1568851200.0],
|
||||||
[u'01:10:00'],
|
[u'01:10:00'],
|
||||||
[u'10:20:30'],
|
[u'10:20:30'],
|
||||||
[4.180902777777778],
|
[-2208713970.0],
|
||||||
[20],
|
[-2207347200.0],
|
||||||
[u'7/4/1776'],
|
[u'7/4/1776'],
|
||||||
[205286400.0],
|
[205286400.0],
|
||||||
[0],
|
[-2209161600.0],
|
||||||
|
],
|
||||||
|
}])
|
||||||
|
|
||||||
|
def test_empty_rows(self):
|
||||||
|
# Check that empty rows aren't imported,
|
||||||
|
# and that files with lots of empty rows are imported quickly.
|
||||||
|
# The fixture file is mostly empty but has data in the last row,
|
||||||
|
# with over a million empty rows in between.
|
||||||
|
parsed_file = import_xls.parse_file(*_get_fixture('test_empty_rows.xlsx'))
|
||||||
|
tables = parsed_file[1]
|
||||||
|
self.assertEqual(tables, [{
|
||||||
|
'table_name': u'Sheet1',
|
||||||
|
'column_metadata': [
|
||||||
|
{'id': 'a', 'type': 'Numeric'},
|
||||||
|
{'id': 'b', 'type': 'Numeric'},
|
||||||
|
],
|
||||||
|
'table_data': [
|
||||||
|
[0, None, 1],
|
||||||
|
[None, 0, 2],
|
||||||
],
|
],
|
||||||
}])
|
}])
|
||||||
|
|
||||||
|
@ -10,7 +10,6 @@ of values. All "data" lists will have the same length.
|
|||||||
import datetime
|
import datetime
|
||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
import messytables
|
|
||||||
import moment # TODO grist internal libraries might not be available to plugins in the future.
|
import moment # TODO grist internal libraries might not be available to plugins in the future.
|
||||||
import six
|
import six
|
||||||
from six.moves import zip, xrange
|
from six.moves import zip, xrange
|
||||||
@ -59,12 +58,16 @@ class BaseConverter(object):
|
|||||||
raise NotImplementedError()
|
raise NotImplementedError()
|
||||||
|
|
||||||
|
|
||||||
|
numeric_types = six.integer_types + (float, complex, type(None))
|
||||||
|
|
||||||
class NumericConverter(BaseConverter):
|
class NumericConverter(BaseConverter):
|
||||||
"""Handles the Grist Numeric type"""
|
"""Handles the Grist Numeric type"""
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def convert(cls, value):
|
def convert(cls, value):
|
||||||
if type(value) in six.integer_types + (float, complex):
|
if type(value) is bool:
|
||||||
|
return int(value)
|
||||||
|
elif type(value) in numeric_types:
|
||||||
return value
|
return value
|
||||||
raise ValueError()
|
raise ValueError()
|
||||||
|
|
||||||
@ -80,7 +83,7 @@ class SimpleDateTimeConverter(BaseConverter):
|
|||||||
def convert(cls, value):
|
def convert(cls, value):
|
||||||
if type(value) is datetime.datetime:
|
if type(value) is datetime.datetime:
|
||||||
return value
|
return value
|
||||||
elif value == "":
|
elif not value:
|
||||||
return None
|
return None
|
||||||
raise ValueError()
|
raise ValueError()
|
||||||
|
|
||||||
@ -103,6 +106,8 @@ class AnyConverter(BaseConverter):
|
|||||||
"""
|
"""
|
||||||
@classmethod
|
@classmethod
|
||||||
def convert(cls, value):
|
def convert(cls, value):
|
||||||
|
if value is None:
|
||||||
|
return u''
|
||||||
return six.text_type(value)
|
return six.text_type(value)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
@ -156,7 +161,7 @@ def _guess_basic_types(rows, num_columns):
|
|||||||
column_detectors = [ColumnDetector() for i in xrange(num_columns)]
|
column_detectors = [ColumnDetector() for i in xrange(num_columns)]
|
||||||
for row in rows:
|
for row in rows:
|
||||||
for cell, detector in zip(row, column_detectors):
|
for cell, detector in zip(row, column_detectors):
|
||||||
detector.add_value(cell.value)
|
detector.add_value(cell)
|
||||||
|
|
||||||
return [detector.get_converter() for detector in column_detectors]
|
return [detector.get_converter() for detector in column_detectors]
|
||||||
|
|
||||||
@ -194,10 +199,10 @@ class ColumnConverter(object):
|
|||||||
return {"type": grist_type, "data": self._all_col_values}
|
return {"type": grist_type, "data": self._all_col_values}
|
||||||
|
|
||||||
|
|
||||||
def get_table_data(row_set, num_columns, num_rows=0):
|
def get_table_data(rows, num_columns, num_rows=0):
|
||||||
converters = _guess_basic_types(row_set.sample, num_columns)
|
converters = _guess_basic_types(rows[:1000], num_columns)
|
||||||
col_converters = [ColumnConverter(c) for c in converters]
|
col_converters = [ColumnConverter(c) for c in converters]
|
||||||
for num, row in enumerate(row_set):
|
for num, row in enumerate(rows):
|
||||||
if num_rows and num == num_rows:
|
if num_rows and num == num_rows:
|
||||||
break
|
break
|
||||||
|
|
||||||
@ -207,9 +212,9 @@ def get_table_data(row_set, num_columns, num_rows=0):
|
|||||||
# Make sure we have a value for every column.
|
# Make sure we have a value for every column.
|
||||||
missing_values = len(converters) - len(row)
|
missing_values = len(converters) - len(row)
|
||||||
if missing_values > 0:
|
if missing_values > 0:
|
||||||
row.extend([messytables.Cell("")] * missing_values)
|
row.extend([""] * missing_values)
|
||||||
|
|
||||||
for cell, conv in zip(row, col_converters):
|
for cell, conv in zip(row, col_converters):
|
||||||
conv.convert_and_add(cell.value)
|
conv.convert_and_add(cell)
|
||||||
|
|
||||||
return [conv.get_grist_column() for conv in col_converters]
|
return [conv.get_grist_column() for conv in col_converters]
|
||||||
|
@ -3,12 +3,15 @@ asttokens==2.0.5
|
|||||||
backports.functools-lru-cache==1.6.4
|
backports.functools-lru-cache==1.6.4
|
||||||
chardet==4.0.0
|
chardet==4.0.0
|
||||||
enum34==1.1.10
|
enum34==1.1.10
|
||||||
|
et-xmlfile==1.0.1
|
||||||
html5lib==0.999999999
|
html5lib==0.999999999
|
||||||
iso8601==0.1.12
|
iso8601==0.1.12
|
||||||
|
jdcal==1.4.1
|
||||||
json_table_schema==0.2.1
|
json_table_schema==0.2.1
|
||||||
lazy_object_proxy==1.6.0
|
lazy_object_proxy==1.6.0
|
||||||
lxml==4.6.3 # used in csv plugin only?
|
lxml==4.6.3 # used in csv plugin only?
|
||||||
messytables==0.15.2
|
messytables==0.15.2
|
||||||
|
openpyxl==2.6.4
|
||||||
python_dateutil==2.6.0
|
python_dateutil==2.6.0
|
||||||
python_magic==0.4.12
|
python_magic==0.4.12
|
||||||
roman==2.0.0
|
roman==2.0.0
|
||||||
|
@ -3,12 +3,15 @@ asttokens==2.0.5
|
|||||||
backports.functools-lru-cache==1.6.4
|
backports.functools-lru-cache==1.6.4
|
||||||
chardet==4.0.0
|
chardet==4.0.0
|
||||||
enum34==1.1.10
|
enum34==1.1.10
|
||||||
|
et-xmlfile==1.0.1
|
||||||
html5lib==0.999999999
|
html5lib==0.999999999
|
||||||
iso8601==0.1.12
|
iso8601==0.1.12
|
||||||
|
jdcal==1.4.1
|
||||||
json_table_schema==0.2.1
|
json_table_schema==0.2.1
|
||||||
lazy_object_proxy==1.6.0
|
lazy_object_proxy==1.6.0
|
||||||
lxml==4.6.3 # used in csv plugin only?
|
lxml==4.6.3 # used in csv plugin only?
|
||||||
messytables==0.15.2
|
messytables==0.15.2
|
||||||
|
openpyxl==2.6.4
|
||||||
python_dateutil==2.6.0
|
python_dateutil==2.6.0
|
||||||
python_magic==0.4.12
|
python_magic==0.4.12
|
||||||
roman==2.0.0
|
roman==2.0.0
|
||||||
|
Loading…
Reference in New Issue
Block a user