You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

130 lines
3.9 KiB

"""Helper functions for import plugins."""
import itertools
import logging
import os
import six
from six.moves import zip
# Refuse to load under Python 2: fail at import time with an explicit message.
if six.PY2:
    raise RuntimeError("Imports should use a Python 3 environment")

# Module-level logger, named after this module.
log = logging.getLogger(__name__)
def empty(value):
    """Return True if value is None or its text form is blank.

    Values that are not already strings are converted with six.text_type
    first. Leading and trailing whitespace is ignored, so a whitespace-only
    string counts as empty.
    """
    if value is None:
        return True
    text = value if isinstance(value, six.string_types) else six.text_type(value)
    return not text.strip()
# Resolve the location of an imported file inside the import directory.
def get_path(file_source):
    """Return file_source joined onto the import directory.

    The directory is read from the IMPORTDIR environment variable; when that
    variable is unset or empty, '/importdir' is used instead.
    """
    base_dir = os.environ.get('IMPORTDIR')
    if not base_dir:
        base_dir = '/importdir'
    return os.path.join(base_dir, file_source)
def capitalize(word):
    """Capitalize the first character in the word (without lowercasing the rest).

    An empty word is returned unchanged.
    """
    # Slice instead of indexing: word[0] raises IndexError for an empty string,
    # whereas word[:1] is simply '' in that case.
    return word[:1].capitalize() + word[1:]
def _is_numeric(text):
for t in six.integer_types + (float, complex):
return True
except (ValueError, OverflowError):
return False
def _is_header(header, data_rows):
    """Return whether header can be considered a legitimate header for data_rows.

    header is the candidate row (a sequence of cells) and data_rows is an
    iterable of the rows that follow it. The candidate is rejected when any of
    its cells is not a string or parses as a number, or when a non-empty cell
    in some data row equals the header cell of the same column.
    """
    # See if the row has any non-text values.
    for cell in header:
        if not isinstance(cell, six.string_types) or _is_numeric(cell):
            return False

    # If it's all text, see if the values in the first row repeat in other rows.
    # That's uncommon for a header.
    for row in data_rows:
        if any(cell and cell == header_cell for cell, header_cell in zip(row, header)):
            return False
    return True
def _count_nonempty(row):
    """Return the number of cells in row, ignoring trailing empty cells.

    This is the 1-based position of the last non-empty cell, or 0 when every
    cell is empty (or the row has no cells).
    """
    last_filled = 0
    for position, cell in enumerate(row, 1):
        if empty(cell):
            continue
        last_filled = position
    return last_filled
def find_first_non_empty_row(rows):
    """Locate the first row that has at least one non-empty field.

    Returns (data_offset, header), where header is that row and data_offset is
    the index of the row right after it. Returns (0, []) if there are no
    non-empty rows.
    """
    for next_index, candidate in enumerate(rows, 1):
        if _count_nonempty(candidate) > 0:
            return next_index, candidate

    # Every row was empty (or there were no rows at all).
    return 0, []
def expand_headers(headers, data_offset, rows):
    """Return headers expanded to have enough columns for all rows in the given sample.

    Each header is stripped of surrounding whitespace. If any row from index
    data_offset onward has more cells (ignoring trailing empty ones) than there
    are headers, empty-string headers are appended to make up the difference.
    """
    # Scan from data_offset to the end of rows (stop=None), as the call in
    # headers_guess does.
    data_rows = itertools.islice(rows, data_offset, None)
    # Seeding with len(headers) keeps max() from failing when there are no data
    # rows and guarantees the padding count below is never negative.
    row_length = max(itertools.chain([len(headers)],
                                     (_count_nonempty(r) for r in data_rows)))
    header_values = [h.strip() for h in headers] + [u''] * (row_length - len(headers))
    return header_values
def headers_guess(rows):
    """Guess the header row and where the data starts.

    Our own smarter version of messytables.headers_guess, which also guesses
    whether one of the first rows is in fact a header. Returns
    (data_offset, headers), where data_offset is the index of the first line of
    data and headers is the list of guessed headers (which will contain empty
    strings if the file had no headers).
    """
    # Messytables estimates the length of data rows and then takes the first row
    # with close to that many non-empty fields ("close" meaning one fewer) as the
    # header. For Grist, it's better to mistake headers for data than to mistake
    # data for headers. csv.Sniffer().has_header() also tries to be clever, but
    # it gets things wrong too often.
    #
    # Only the first row with non-empty cells is considered as a header. It is
    # accepted as one if
    # - it has no non-text fields
    # - none of its fields has a value that repeats in that column of data

    # Find the first row with non-empty fields.
    first_data_index, candidate = find_first_non_empty_row(rows)
    if not candidate:
        return first_data_index, candidate

    # Decide whether the candidate really is a header.
    following_rows = itertools.islice(rows, first_data_index, None)
    if _is_header(candidate, following_rows):
        accepted = candidate
    else:
        # Not a header: step back so the candidate row is treated as data.
        first_data_index -= 1
        accepted = []

    # Expand the header to have enough columns for all rows in the given sample.
    return first_data_index, expand_headers(accepted, first_data_index, rows)