From d5a4605d2a3d04e0e87ede334d9b9d0e54b13f08 Mon Sep 17 00:00:00 2001 From: Dmitry S Date: Fri, 18 Aug 2023 17:03:27 -0400 Subject: [PATCH] (core) Improve encoding detection for csv imports, and make encoding an editable option. Summary: - Using a sample of data was causing poor detection if the sample were cut mid-character. Switch to using line-based detection. - Add a simple option for changing encoding. No convenient UI is offered since config UI is auto-generated, but this at least makes it possible to recover from bad guesses. - Upgrades chardet library for good measure. - Also fixes python3-building step, to more reliably rebuild Python dependencies when requirements3.* files change. Test Plan: Added a python-side test case, and a browser test that encodings can be switched, errors are displayed, and wrong encodings fail recoverably. Reviewers: alexmojaki Reviewed By: alexmojaki Differential Revision: https://phab.getgrist.com/D3979 --- app/client/components/Importer.ts | 12 +++ app/client/components/ParseOptions.ts | 3 +- app/plugin/FileParserAPI-ti.ts | 1 + app/plugin/FileParserAPI.ts | 1 + .../imports/fixtures/test_encoding_utf8.csv | 4 + sandbox/grist/imports/import_csv.py | 75 +++++++++++++++---- sandbox/grist/imports/import_csv_test.py | 75 +++++++++++++++++-- sandbox/requirements3.txt | 2 +- test/server/testUtils.ts | 13 ++-- 9 files changed, 160 insertions(+), 26 deletions(-) create mode 100644 sandbox/grist/imports/fixtures/test_encoding_utf8.csv diff --git a/app/client/components/Importer.ts b/app/client/components/Importer.ts index b33c9f38..d8941e1f 100644 --- a/app/client/components/Importer.ts +++ b/app/client/components/Importer.ts @@ -1082,6 +1082,7 @@ export class Importer extends DisposableWithEvents { options, ) ]), + cssWarningText(dom.text(use => use(this._parseOptions)?.WARNING || ""), testId('warning')), dom.domComputed(use => { if (use(isSkipTable)) { return cssOverlay(t('Skip Table on Import'), testId('preview-overlay')); @@ -1098,6 
+1099,7 @@ export class Importer extends DisposableWithEvents { ) ); }); + const buttons = cssImportButtons(cssImportButtonsLine( bigPrimaryButton('Import', dom.on('click', () => this._maybeFinishImport(upload)), @@ -1369,6 +1371,10 @@ export class Importer extends DisposableWithEvents { (p: ParseOptions) => { anotherScreen.dispose(); this._parseOptions.set(p); + // Drop what we previously matched because we may have different columns. + // If user manually matched, then changed import options, they'll have to re-match; when + // columns change at all, the alternative has incorrect columns in UI and is more confusing. + this._sourceInfoArray.set([]); this._reImport(upload).catch((err) => reportError(err)); }, () => { @@ -1517,6 +1523,12 @@ const cssTabsWrapper = styled('div', ` flex-direction: column; `); +const cssWarningText = styled('div', ` + margin-bottom: 8px; + color: ${theme.errorText}; + white-space: pre-line; +`); + const cssTableList = styled('div', ` align-self: flex-start; max-width: 100%; diff --git a/app/client/components/ParseOptions.ts b/app/client/components/ParseOptions.ts index d2905759..41dcb08a 100644 --- a/app/client/components/ParseOptions.ts +++ b/app/client/components/ParseOptions.ts @@ -1,6 +1,7 @@ import {bigBasicButton, bigPrimaryButton} from 'app/client/ui2018/buttons'; import {squareCheckbox} from 'app/client/ui2018/checkbox'; import {testId, theme} from 'app/client/ui2018/cssVars'; +import {makeLinks} from 'app/client/ui2018/links'; import {cssModalButtons} from 'app/client/ui2018/modals'; import {ParseOptionSchema} from 'app/plugin/FileParserAPI'; import {Computed, dom, DomContents, IDisposableOwner, input, Observable, styled} from 'grainjs'; @@ -60,7 +61,7 @@ export function buildParseOptionsForm( return [ cssParseOptionForm( items.map((item) => cssParseOption( - cssParseOptionName(item.label), + cssParseOptionName(makeLinks(item.label)), optionToInput(owner, item.type, optionsMap.get(item.name)!), testId('parseopts-opt'), )), diff 
--git a/app/plugin/FileParserAPI-ti.ts b/app/plugin/FileParserAPI-ti.ts index d37da466..51d2cf7c 100644 --- a/app/plugin/FileParserAPI-ti.ts +++ b/app/plugin/FileParserAPI-ti.ts @@ -15,6 +15,7 @@ export const ParseFileAPI = t.iface([], { export const ParseOptions = t.iface([], { "NUM_ROWS": t.opt("number"), "SCHEMA": t.opt(t.array("ParseOptionSchema")), + "WARNING": t.opt("string"), }); export const ParseOptionSchema = t.iface([], { diff --git a/app/plugin/FileParserAPI.ts b/app/plugin/FileParserAPI.ts index 867f41b4..6a207984 100644 --- a/app/plugin/FileParserAPI.ts +++ b/app/plugin/FileParserAPI.ts @@ -20,6 +20,7 @@ export interface ParseFileAPI { export interface ParseOptions { NUM_ROWS?: number; SCHEMA?: ParseOptionSchema[]; + WARNING?: string; // Only on response, includes a warning from parsing, if any. } /** diff --git a/sandbox/grist/imports/fixtures/test_encoding_utf8.csv b/sandbox/grist/imports/fixtures/test_encoding_utf8.csv new file mode 100644 index 00000000..f2d30bca --- /dev/null +++ b/sandbox/grist/imports/fixtures/test_encoding_utf8.csv @@ -0,0 +1,4 @@ +Name,Age,Επάγγελμα,Πόλη +John Smith,30,Γιατρός,Athens +Μαρία Παπαδοπούλου,25,Engineer,Thessaloniki +Δημήτρης Johnson,40,Δικηγόρος,Piraeus diff --git a/sandbox/grist/imports/import_csv.py b/sandbox/grist/imports/import_csv.py index f35498c3..b6d3b2b0 100644 --- a/sandbox/grist/imports/import_csv.py +++ b/sandbox/grist/imports/import_csv.py @@ -14,7 +14,7 @@ from imports import import_utils log = logging.getLogger(__name__) -log.setLevel(logging.WARNING) +log.setLevel(logging.INFO) SCHEMA = [ { @@ -77,7 +77,14 @@ SCHEMA = [ 'label': 'Number of rows', 'type': 'number', 'visible': False, - }] + }, + { + 'name': 'encoding', + 'label': 'Character encoding. 
See https://tinyurl.com/py3codecs', + 'type': 'string', + 'visible': True, + } + ] def parse_file_source(file_source, options): parsing_options, export_list = parse_file(import_utils.get_path(file_source["path"]), options) @@ -91,16 +98,32 @@ def parse_file(file_path, parse_options=None): """ parse_options = parse_options or {} - with codecs.open(file_path, "rb") as f: - sample = f.read(100000) - encoding = chardet.detect(sample)['encoding'] or "utf8" - # In addition, always prefer UTF8 over ASCII. - if encoding == 'ascii': - encoding = 'utf8' - log.info("Using encoding %s", encoding) + given_encoding = parse_options.get('encoding') + encoding = given_encoding or detect_encoding(file_path) + log.info("Using encoding %s (%s)", encoding, "given" if given_encoding else "detected") - with codecs.open(file_path, mode="r", encoding=encoding) as f: + try: + return _parse_with_encoding(file_path, parse_options, encoding) + except Exception as e: + encoding = 'utf-8' + # For valid encodings, we can do our best and report count of errors. But an invalid encoding + # or one with a BOM will produce an exception. For those, fall back to utf-8. + parsing_options, export_list = _parse_with_encoding(file_path, parse_options, encoding) + parsing_options["WARNING"] = "{}: {}. Falling back to {}.\n{}".format( + type(e).__name__, e, encoding, parsing_options.get("WARNING", "")) + return parsing_options, export_list + + +def _parse_with_encoding(file_path, parse_options, encoding): + codec_errors = CodecErrorsReplace() + codecs.register_error('custom', codec_errors) + with codecs.open(file_path, mode="r", encoding=encoding, errors="custom") as f: parsing_options, export_list = _parse_open_file(f, parse_options=parse_options) + parsing_options["encoding"] = encoding + if codec_errors.error_count: + parsing_options["WARNING"] = ( + "Using encoding %s, encountered %s errors. 
Use Import Options to change" % + (encoding, codec_errors.error_count)) return parsing_options, export_list @@ -204,6 +227,32 @@ def _parse_open_file(file_obj, parse_options=None): return options, export_list -def get_version(): - """ Return name and version of plug-in""" - pass + +class CodecErrorsReplace(object): + def __init__(self): + self.error_count = 0 + self.first_error = None + + def __call__(self, error): + self.error_count += 1 + if not self.first_error: + self.first_error = error + return codecs.replace_errors(error) + + +def detect_encoding(file_path): + # Use line-by-line detection as suggested in + # https://chardet.readthedocs.io/en/latest/usage.html#advanced-usage. + # Using a fixed-sized sample is worse as the sample may end mid-character. + detector = chardet.UniversalDetector() + with codecs.open(file_path, "rb") as f: + for line in f.readlines(): + detector.feed(line) + if detector.done: + break + detector.close() + encoding = detector.result["encoding"] + # Default to utf-8, and always prefer it over ASCII as the most common superset. + if not encoding or encoding == 'ascii': + encoding = 'utf-8' + return encoding diff --git a/sandbox/grist/imports/import_csv_test.py b/sandbox/grist/imports/import_csv_test.py index c715cd6e..4d0f1a6f 100644 --- a/sandbox/grist/imports/import_csv_test.py +++ b/sandbox/grist/imports/import_csv_test.py @@ -1,9 +1,11 @@ # This Python file uses the following encoding: utf-8 +# pylint:disable=line-too-long +import csv import os import textwrap +import tempfile import unittest from six import StringIO, text_type -import csv from imports import import_csv @@ -11,6 +13,20 @@ from imports import import_csv def _get_fixture(filename): return os.path.join(os.path.dirname(__file__), "fixtures", filename) +# For a non-utf8 fixture, there is a problem with 'arc diff' which can't handle files with +# non-utf8 encodings. So create one on the fly. 
+non_utf8_fixture = None +non_utf8_file = None +def setUpModule(): + global non_utf8_file, non_utf8_fixture # pylint:disable=global-statement + with open(_get_fixture('test_encoding_utf8.csv')) as f: + non_utf8_file = tempfile.NamedTemporaryFile(mode='wb') + non_utf8_file.write(f.read().encode('iso-8859-7')) + non_utf8_file.flush() + non_utf8_fixture = non_utf8_file.name + +def tearDownModule(): + non_utf8_file.close() class TestImportCSV(unittest.TestCase): @@ -53,7 +69,7 @@ class TestImportCSV(unittest.TestCase): def test_csv_types(self): options, parsed_file = import_csv.parse_file(_get_fixture('test_excel_types.csv'), parse_options='') sheet = parsed_file[0] - self._check_options(options) + self._check_options(options, encoding='utf-8') self._check_col(sheet, 0, "int1", "Int", [-1234123, '', '']) self._check_col(sheet, 1, "int2", "Int", [5, '', '']) @@ -84,8 +100,9 @@ class TestImportCSV(unittest.TestCase): options["parse_options"].pop("limit_rows") options["parse_options"].pop("quoting") options["parse_options"].pop("escapechar") + options["parse_options"]["encoding"] = "utf-8" # Expected encoding self.assertEqual(options["parse_options"], parsed_options) - self._check_options(parsed_options) + self._check_options(parsed_options, encoding='utf-8') parsed_file = parsed_file[0] self._check_num_cols(parsed_file, 5) @@ -385,7 +402,7 @@ class TestImportCSV(unittest.TestCase): def test_csv_with_very_long_cell(self): options, parsed_file = import_csv.parse_file(_get_fixture('test_long_cell.csv'), parse_options='') - self._check_options(options) + self._check_options(options, encoding='utf-8') sheet = parsed_file[0] long_cell = sheet["table_data"][1][0] self.assertEqual(len(long_cell), 8058) @@ -394,13 +411,61 @@ class TestImportCSV(unittest.TestCase): def test_csv_with_surprising_isdigit(self): options, parsed_file = import_csv.parse_file(_get_fixture('test_isdigit.csv'), parse_options='') - self._check_options(options) + self._check_options(options, 
encoding='utf-8') sheet = parsed_file[0] self._check_num_cols(sheet, 3) self._check_col(sheet, 0, "PHONE", "Text", [u'201-¾᠓𑄺꤈꤈꧐꤆']) self._check_col(sheet, 1, "VALUE", "Text", [u'¹5']) self._check_col(sheet, 2, "DATE", "Text", [u'2018-0²-27 16:08:39 +0000']) + def test_csv_encoding_detection_utf8(self): + options, parsed_file = import_csv.parse_file(_get_fixture('test_encoding_utf8.csv'), parse_options='') + self._check_options(options, encoding='utf-8') + sheet = parsed_file[0] + self._check_col(sheet, 0, "Name", "Text", [u'John Smith', u'Μαρία Παπαδοπούλου', u'Δημήτρης Johnson']) + self._check_col(sheet, 2, "Επάγγελμα", "Text", [u'Γιατρός', u'Engineer', u'Δικηγόρος']) + + def test_csv_encoding_detection_greek(self): + # ISO-8859-7 is close to CP1253, and this fixture file would be identical in these two. + options, parsed_file = import_csv.parse_file(non_utf8_fixture, parse_options='') + self._check_options(options, encoding='ISO-8859-7') + sheet = parsed_file[0] + self._check_col(sheet, 0, "Name", "Text", [u'John Smith', u'Μαρία Παπαδοπούλου', u'Δημήτρης Johnson']) + self._check_col(sheet, 2, "Επάγγελμα", "Text", [u'Γιατρός', u'Engineer', u'Δικηγόρος']) + + # Similar enough encoding that the result is correct. + options, parsed_file = import_csv.parse_file(non_utf8_fixture, parse_options={"encoding": "cp1253"}) + self._check_options(options, encoding='cp1253') # The encoding should be respected + sheet = parsed_file[0] + self._check_col(sheet, 0, "Name", "Text", [u'John Smith', u'Μαρία Παπαδοπούλου', u'Δημήτρης Johnson']) + self._check_col(sheet, 2, "Επάγγελμα", "Text", [u'Γιατρός', u'Engineer', u'Δικηγόρος']) + + def test_csv_encoding_errors_are_handled(self): + # With ascii, we'll get many decoding errors, but parsing should still succeed. 
+ parse_options = { + "encoding": "ascii", + "include_col_names_as_headers": True, + } + options, parsed_file = import_csv.parse_file(non_utf8_fixture, parse_options=parse_options) + self._check_options(options, + encoding='ascii', + WARNING='Using encoding ascii, encountered 108 errors. Use Import Options to change') + sheet = parsed_file[0] + self._check_col(sheet, 0, "Name", "Text", [u'John Smith', u'����� ������������', u'�������� Johnson']) + self._check_col(sheet, 2, "���������", "Text", [u'�������', u'Engineer', u'���������']) + + def test_csv_encoding_mismatch(self): + # Here we use a wrong single-byte encoding, to check that it succeeds even if with nonsense. + parse_options = { + "encoding": "cp1254", + "include_col_names_as_headers": True, + } + options, parsed_file = import_csv.parse_file(non_utf8_fixture, parse_options=parse_options) + self._check_options(options, encoding='cp1254') + sheet = parsed_file[0] + self._check_col(sheet, 0, "Name", "Text", [u'John Smith', u'Ìáñßá Ğáğáäïğïıëïõ', u'ÄçìŞôñçò Johnson']) + self._check_col(sheet, 2, "ÅğÜããåëìá", "Text", [u'Ãéáôñüò', u'Engineer', u'Äéêçãüñïò']) + if __name__ == '__main__': unittest.main() diff --git a/sandbox/requirements3.txt b/sandbox/requirements3.txt index ef5dc1a8..d8168ae3 100644 --- a/sandbox/requirements3.txt +++ b/sandbox/requirements3.txt @@ -10,7 +10,7 @@ asttokens==2.2.1 # via # friendly-traceback # stack-data -chardet==4.0.0 +chardet==5.1.0 # via -r core/sandbox/requirements3.in et-xmlfile==1.0.1 # via openpyxl diff --git a/test/server/testUtils.ts b/test/server/testUtils.ts index 31642fcd..cdd4e72f 100644 --- a/test/server/testUtils.ts +++ b/test/server/testUtils.ts @@ -17,6 +17,7 @@ import * as path from 'path'; import * as fse from 'fs-extra'; import clone = require('lodash/clone'); import * as tmp from 'tmp-promise'; +import {Options as TmpOptions} from 'tmp'; import * as winston from 'winston'; import { serialize } from 'winston/lib/winston/common'; @@ -27,15 +28,15 @@ import { 
getAppRoot } from 'app/server/lib/places'; /** * Creates a temporary file with the given contents. * @param {String} content. Data to store in the file. - * @param {[Boolean]} optKeep. Optionally pass in true to keep the file from being deleted, which + * @param {[Boolean]} options.keep. Optionally pass in true to keep the file from being deleted, which * is useful to see the content while debugging a test. * @returns {Promise} A promise for the path of the new file. */ -export async function writeTmpFile(content: any, optKeep?: boolean) { +export async function writeTmpFile(content: any, options: TmpOptions = {}) { // discardDescriptor ensures tmp module closes it. It can lead to horrible bugs to close this // descriptor yourself, since tmp also closes it on exit, and if it's a different descriptor by // that time, it can lead to a crash. See https://github.com/raszi/node-tmp/issues/168 - const obj = await tmp.file({keep: optKeep, discardDescriptor: true}); + const obj = await tmp.file({discardDescriptor: true, ...options}); await fse.writeFile(obj.path, content); return obj.path; } @@ -44,17 +45,17 @@ export async function writeTmpFile(content: any, optKeep?: boolean) { * Creates a temporary file with `numLines` of generated data, each line about 30 bytes long. * This is useful for testing operations with large files. * @param {Number} numLines. How many lines to store in the file. - * @param {[Boolean]} optKeep. Optionally pass in true to keep the file from being deleted, which + * @param {[Boolean]} options.keep. Optionally pass in true to keep the file from being deleted, which * is useful to see the content while debugging a test. * @returns {Promise} A promise for the path of the new file. */ -export async function generateTmpFile(numLines: number, optKeep?: boolean) { +export async function generateTmpFile(numLines: number, options: TmpOptions = {}) { // Generate a bigger data file. 
const data = []; for (let i = 0; i < numLines; i++) { data.push(i + " abcdefghijklmnopqrstuvwxyz\n"); } - return writeTmpFile(data.join(""), optKeep); + return writeTmpFile(data.join(""), options); }