gristlabs_grist-core/sandbox/grist/imports/import_csv.py
"""
Plugin for importing CSV files
"""
import codecs
import csv
import logging
import chardet
import six
from six.moves import zip
import parse_data
from imports import import_utils
log = logging.getLogger(__name__)
log.setLevel(logging.INFO)

SCHEMA = [
  {
    'name': 'lineterminator',
    'label': 'Line terminator',
    'type': 'string',
    'visible': True,
  },
  {
    'name': 'include_col_names_as_headers',
    'label': 'First row contains headers',
    'type': 'boolean',
    'visible': True,
  },
  {
    'name': 'delimiter',
    'label': 'Field separator',
    'type': 'string',
    'visible': True,
  },
  {
    'name': 'skipinitialspace',
    'label': 'Skip leading whitespace',
    'type': 'boolean',
    'visible': True,
  },
  {
    'name': 'quotechar',
    'label': 'Quote character',
    'type': 'string',
    'visible': True,
  },
  {
    'name': 'doublequote',
    'label': 'Quotes in fields are doubled',
    'type': 'boolean',
    'visible': True,
  },
  {
    'name': 'quoting',
    'label': 'Convert quoted fields',
    'type': 'number',
    'visible': False,   # Not supported by messytables
  },
  {
    'name': 'escapechar',
    'label': 'Escape character',
    'type': 'string',
    'visible': False,   # Not supported by messytables
  },
  {
    'name': 'start_with_row',
    'label': 'Start with row',
    'type': 'number',
    'visible': False,   # Not yet implemented
  },
  {
    'name': 'NUM_ROWS',
    'label': 'Number of rows',
    'type': 'number',
    'visible': False,
  },
  {
    'name': 'encoding',
    'label': 'Character encoding. See https://tinyurl.com/py3codecs',
    'type': 'string',
    'visible': True,
  },
]
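
# For reference, a parse_options dict accepted by parse_file() below uses the 'name'
# keys from SCHEMA. A hypothetical example (values are illustrative, not defaults):
#   {'delimiter': ';', 'quotechar': '"', 'include_col_names_as_headers': True,
#    'encoding': 'utf-8', 'NUM_ROWS': 0}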

def parse_file_source(file_source, options):
  parsing_options, export_list = parse_file(import_utils.get_path(file_source["path"]), options)
  return {"parseOptions": parsing_options, "tables": export_list}

def parse_file(file_path, parse_options=None):
  """
  Reads the file at file_path, using the parse options passed in via ActiveDoc.importFile(),
  and returns a tuple of the parsing options (the user's, or the guessed ones) and an object
  formatted so that it can be used by Grist for a bulk add-records action.
  """
  parse_options = parse_options or {}
  given_encoding = parse_options.get('encoding')
  encoding = given_encoding or detect_encoding(file_path)
  log.info("Using encoding %s (%s)", encoding, "given" if given_encoding else "detected")
  try:
    return _parse_with_encoding(file_path, parse_options, encoding)
  except Exception as e:
    # For valid encodings, we can do our best and report the count of errors. But an invalid
    # encoding, or one with a BOM, will produce an exception. For those, fall back to utf-8.
    encoding = 'utf-8'
    parsing_options, export_list = _parse_with_encoding(file_path, parse_options, encoding)
    parsing_options["WARNING"] = "{}: {}. Falling back to {}.\n{}".format(
        type(e).__name__, e, encoding, parsing_options.get("WARNING", ""))
    return parsing_options, export_list
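
# A minimal usage sketch, assuming a CSV file at a hypothetical path:
#   parsing_options, tables = parse_file('/tmp/example.csv', {'delimiter': ','})
#   tables[0]['column_metadata']   # per-column ids and guessed type info
#   tables[0]['table_data']        # list of columns, each a list of cell values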

def _parse_with_encoding(file_path, parse_options, encoding):
  codec_errors = CodecErrorsReplace()
  codecs.register_error('custom', codec_errors)
  with codecs.open(file_path, mode="r", encoding=encoding, errors="custom") as f:
    parsing_options, export_list = _parse_open_file(f, parse_options=parse_options)
  parsing_options["encoding"] = encoding
  if codec_errors.error_count:
    parsing_options["WARNING"] = (
        "Using encoding %s, encountered %s errors. Use Import Options to change" %
        (encoding, codec_errors.error_count))
  return parsing_options, export_list

def _guess_dialect(file_obj):
  try:
    # Restrict the allowed delimiters so the sniffer cannot guess a character outside this list.
    dialect = csv.Sniffer().sniff(file_obj.read(100000), delimiters=['\t', ',', ';', '|'])
    log.info("Guessed dialect %s", dict(dialect.__dict__))
    # Mimic the messytables defaults for now.
    dialect.lineterminator = "\n"
    dialect.doublequote = True
    return dialect
  except csv.Error:
    log.info("Cannot guess dialect; using the Excel dialect as a fallback.")
    return csv.excel
  finally:
    file_obj.seek(0)
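
# For example, csv.Sniffer().sniff('a;b;c\n1;2;3\n', delimiters=['\t', ',', ';', '|'])
# yields a dialect whose delimiter is ';' (illustrative input; here the sample is the
# first 100000 characters of the actual file).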

def _parse_open_file(file_obj, parse_options=None):
  # Guard against the None default, as parse_file() does.
  parse_options = parse_options or {}
  csv_keys = ['delimiter', 'quotechar', 'lineterminator', 'doublequote', 'skipinitialspace']
  dialect = _guess_dialect(file_obj)
  csv_options = {}
  for key in csv_keys:
    value = parse_options.get(key, getattr(dialect, key, None))
    if value is not None:
      csv_options[key] = value

  reader = csv.reader(file_obj, **csv_options)
  rows = list(reader)
  sample_len = 100
  sample_rows = rows[:sample_len]
  data_offset, headers = import_utils.headers_guess(sample_rows)

  # Make sure all header values are strings.
  for i, header in enumerate(headers):
    if not isinstance(header, six.string_types):
      headers[i] = six.text_type(header)

  log.info("Guessed data_offset as %s", data_offset)
  log.info("Guessed headers as: %s", headers)

  have_guessed_headers = any(headers)
  include_col_names_as_headers = parse_options.get('include_col_names_as_headers',
                                                   have_guessed_headers)

  if include_col_names_as_headers and not have_guessed_headers:
    # Use the first non-empty row as headers.
    data_offset, first_row = import_utils.find_first_non_empty_row(sample_rows)
    headers = import_utils.expand_headers(first_row, data_offset, sample_rows)
  elif not include_col_names_as_headers and have_guessed_headers:
    # Move the guessed headers back into the data.
    data_offset -= 1
    headers = [''] * len(headers)

  rows = rows[data_offset:]
  num_rows = parse_options.get('NUM_ROWS', 0)
  table_data_with_types = parse_data.get_table_data(rows, len(headers), num_rows)

  # Identify and remove empty columns, and populate separate metadata and data lists.
  column_metadata = []
  table_data = []
  for col_data, header in zip(table_data_with_types, headers):
    if not header and all(val == "" for val in col_data["data"]):
      continue  # empty column
    data = col_data.pop("data")
    col_data["id"] = header
    column_metadata.append(col_data)
    table_data.append(data)

  if not table_data:
    # Don't add tables with no columns.
    log.info("No data found. Aborting CSV import.")
    return {}, []

  guessed = reader.dialect
  # Note: 'quoting' is accepted in parse_options but not currently applied (see SCHEMA above).
  quoting = parse_options.get('quoting')
  options = {"delimiter": parse_options.get('delimiter', guessed.delimiter),
             "doublequote": parse_options.get('doublequote', guessed.doublequote),
             "lineterminator": parse_options.get('lineterminator', guessed.lineterminator),
             "quotechar": parse_options.get('quotechar', guessed.quotechar),
             "skipinitialspace": parse_options.get('skipinitialspace', guessed.skipinitialspace),
             "include_col_names_as_headers": include_col_names_as_headers,
             "start_with_row": 1,
             "NUM_ROWS": num_rows,
             "SCHEMA": SCHEMA}

  log.info("Output table with %d columns", len(column_metadata))
  for c in column_metadata:
    log.debug("Output column %s", c)

  export_list = [{
    "table_name": None,
    "column_metadata": column_metadata,
    "table_data": table_data,
  }]
  return options, export_list
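
# Shape of the return value, sketched with hypothetical data. For input rows
# [['name', 'age'], ['Ann', '3']] with headers on, export_list looks like:
#   [{'table_name': None,
#     'column_metadata': [{'id': 'name', ...}, {'id': 'age', ...}],
#     'table_data': [['Ann'], ['3']]}]
# where the elided metadata and any value conversions come from parse_data.get_table_data.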

class CodecErrorsReplace(object):
  def __init__(self):
    self.error_count = 0
    self.first_error = None

  def __call__(self, error):
    self.error_count += 1
    if not self.first_error:
      self.first_error = error
    return codecs.replace_errors(error)
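
# The callable above follows the codecs error-handler protocol: it is invoked with a
# UnicodeDecodeError and delegates to codecs.replace_errors, which substitutes U+FFFD
# and resumes decoding; this class additionally counts and remembers the errors.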

def detect_encoding(file_path):
  # Use line-by-line detection as suggested in
  # https://chardet.readthedocs.io/en/latest/usage.html#advanced-usage.
  # Using a fixed-size sample would be worse, since the sample may end mid-character.
  detector = chardet.UniversalDetector()
  with codecs.open(file_path, "rb") as f:
    for line in f.readlines():
      detector.feed(line)
      if detector.done:
        break
  detector.close()
  encoding = detector.result["encoding"]
  # Default to utf-8, and always prefer it over ASCII, since it is the most common superset.
  if not encoding or encoding == 'ascii':
    encoding = 'utf-8'
  return encoding
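
if __name__ == "__main__":
  # Minimal manual-test sketch (hypothetical invocation, not part of the plugin API):
  # run as `python import_csv.py some_file.csv` within the sandbox environment, where
  # the parse_data and import_utils modules are importable.
  import sys
  parsing_options, tables = parse_file(sys.argv[1])
  print("Parsed %d table(s) with options: %s" % (len(tables), parsing_options))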