gristlabs_grist-core/sandbox/grist/imports/import_csv.py

"""
Plugin for importing CSV files
"""
import os
import logging

import chardet
import messytables
import six
from six.moves import zip

import parse_data
from imports import import_utils


# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Descriptions of the parse options known to this importer. Each entry holds the option's
# 'name', a human-readable 'label', the 'type' of its value, and a 'visible' flag.
# The whole list is returned to the caller under the "SCHEMA" key of the parse options.
SCHEMA = [
  {'name': 'lineterminator', 'label': 'Line terminator',
   'type': 'string', 'visible': True},
  {'name': 'include_col_names_as_headers', 'label': 'First row contains headers',
   'type': 'boolean', 'visible': True},
  {'name': 'delimiter', 'label': 'Field separator',
   'type': 'string', 'visible': True},
  {'name': 'skipinitialspace', 'label': 'Skip leading whitespace',
   'type': 'boolean', 'visible': True},
  {'name': 'quotechar', 'label': 'Quote character',
   'type': 'string', 'visible': True},
  {'name': 'doublequote', 'label': 'Quotes in fields are doubled',
   'type': 'boolean', 'visible': True},

  # The remaining options are hidden.
  # Not supported by messytables:
  {'name': 'quoting', 'label': 'Convert quoted fields',
   'type': 'number', 'visible': False},
  # Not supported by messytables:
  {'name': 'escapechar', 'label': 'Escape character',
   'type': 'string', 'visible': False},
  # Not yet implemented:
  {'name': 'start_with_row', 'label': 'Start with row',
   'type': 'number', 'visible': False},
  {'name': 'NUM_ROWS', 'label': 'Number of rows',
   'type': 'number', 'visible': False},
]

def parse_file_source(file_source, options):
  """
  Parses the CSV file referenced by `file_source["path"]` using the given parse options.

  The path is first resolved with import_utils.get_path() and then handed to parse_file().
  Returns a dict with the resulting parse options under "parseOptions" and the list of
  parsed tables under "tables".
  """
  resolved_path = import_utils.get_path(file_source["path"])
  parsing_options, export_list = parse_file(resolved_path, options)
  result = {
    "parseOptions": parsing_options,
    "tables": export_list,
  }
  return result

def parse_file(file_path, parse_options=None):
  """
  Opens the file at `file_path` in binary mode and parses it as CSV.

  `parse_options` are the options passed in via ActiveDoc.importFile(); a missing or empty
  value is treated as an empty dict. Returns a tuple of the parsing options (the user's, or
  guessed ones) and an object formatted so that grist can use it for a bulk add records action.
  """
  effective_options = parse_options if parse_options else {}

  with open(file_path, "rb") as csv_file:
    parsed = _parse_open_file(csv_file, parse_options=effective_options)

  # The file is closed by now; unpack the (options, tables) pair and return it.
  parsing_options, export_list = parsed
  return parsing_options, export_list


def _parse_open_file(file_obj, parse_options=None):
  """
  Parses an already-open CSV file with messytables and returns `(options, export_list)`.

  Args:
    file_obj: File object to read from. parse_file() opens it in binary mode; up to 100000
      bytes are read from it here for encoding detection.
    parse_options: Optional dict of parse options. Keys read here are 'delimiter',
      'quotechar', 'lineterminator', 'doublequote', 'skipinitialspace', 'NUM_ROWS' and
      'include_col_names_as_headers'. None (the default) is treated as an empty dict.

  Returns:
    A tuple `(options, export_list)`:
      options: dict of the parse options in effect (a value present in `parse_options`
        wins, otherwise the dialect attribute of the row set is used), together with
        'include_col_names_as_headers', 'start_with_row', 'NUM_ROWS' and 'SCHEMA'.
        It stays an empty dict when no table is exported.
      export_list: list of dicts with keys "table_name", "column_metadata" and
        "table_data", one per table that has at least one non-empty column.
  """
  # The signature allows None, so normalize it before any .get() call (as parse_file() does).
  parse_options = parse_options or {}

  options = {}
  csv_keys = ['delimiter', 'quotechar', 'lineterminator', 'doublequote', 'skipinitialspace']
  csv_options = {k: parse_options.get(k) for k in csv_keys}
  if six.PY2:
    # On Python 2, pass byte strings rather than unicode for the CSV dialect settings.
    csv_options = {k: v.encode('utf8') if isinstance(v, six.text_type) else v
                   for k, v in csv_options.items()}

  table_set = messytables.CSVTableSet(file_obj,
                                      delimiter=csv_options['delimiter'],
                                      quotechar=csv_options['quotechar'],
                                      lineterminator=csv_options['lineterminator'],
                                      doublequote=csv_options['doublequote'],
                                      skipinitialspace=csv_options['skipinitialspace'])

  # Passed through to parse_data.get_table_data(); 0 when the caller did not specify it.
  num_rows = parse_options.get('NUM_ROWS', 0)

  # Messytable's encoding detection uses too small a sample, so we override it here.
  sample = file_obj.read(100000)
  table_set.encoding = chardet.detect(sample)['encoding']
  # In addition, always prefer UTF8 over ASCII.
  if table_set.encoding == 'ascii':
    table_set.encoding = 'utf8'

  export_list = []
  # A table set is a collection of tables:
  for row_set in table_set.tables:
    table_name = None
    sample_rows = list(row_set.sample)
    # Messytables doesn't guess whether headers are present, so we need to step in.
    data_offset, headers = import_utils.headers_guess(sample_rows)

    # Make sure all header values are strings.
    for i, header in enumerate(headers):
      if not isinstance(header, six.string_types):
        headers[i] = six.text_type(header)

    log.info("Guessed data_offset as %s", data_offset)
    log.info("Guessed headers as: %s", headers)

    # The user's explicit choice, when given, overrides the guess about header presence.
    have_guessed_headers = any(headers)
    include_col_names_as_headers = parse_options.get('include_col_names_as_headers',
                                                     have_guessed_headers)

    if include_col_names_as_headers and not have_guessed_headers:
      # use first line as headers
      data_offset, first_row = import_utils.find_first_non_empty_row(sample_rows)
      headers = import_utils.expand_headers(first_row, data_offset, sample_rows)

    elif not include_col_names_as_headers and have_guessed_headers:
      # move guessed headers to data
      data_offset -= 1
      headers = [''] * len(headers)

    # Skip everything before data_offset, then pull the plain cell values out of each row.
    row_set.register_processor(messytables.offset_processor(data_offset))
    rows = [
      [cell.value for cell in row]
      for row in row_set
    ]
    table_data_with_types = parse_data.get_table_data(rows, len(headers), num_rows)

    # Identify and remove empty columns, and populate separate metadata and data lists.
    column_metadata = []
    table_data = []
    for col_data, header in zip(table_data_with_types, headers):
      if not header and all(val == "" for val in col_data["data"]):
        continue # empty column
      # Split the column: values go to table_data, whatever remains is metadata with its id.
      data = col_data.pop("data")
      col_data["id"] = header
      column_metadata.append(col_data)
      table_data.append(data)

    if not table_data:
      # Don't add tables with no columns.
      continue

    # Report back the options in effect: the caller's values where present, otherwise the
    # dialect that messytables settled on for this row set.
    guessed = row_set._dialect
    options = {"delimiter": parse_options.get('delimiter', guessed.delimiter),
               "doublequote": parse_options.get('doublequote', guessed.doublequote),
               "lineterminator": parse_options.get('lineterminator', guessed.lineterminator),
               "quotechar": parse_options.get('quotechar', guessed.quotechar),
               "skipinitialspace": parse_options.get('skipinitialspace', guessed.skipinitialspace),
               "include_col_names_as_headers": include_col_names_as_headers,
               "start_with_row": 1,
               "NUM_ROWS": num_rows,
               "SCHEMA": SCHEMA
               }

    log.info("Output table %r with %d columns", table_name, len(column_metadata))
    for c in column_metadata:
      log.debug("Output column %s", c)
    export_list.append({
      "table_name": table_name,
      "column_metadata": column_metadata,
      "table_data": table_data
    })

  return options, export_list

def get_version():
  """
  Placeholder for reporting the name and version of the plug-in.

  Not implemented: it currently always returns None.
  """
  return None