From d5a4605d2a3d04e0e87ede334d9b9d0e54b13f08 Mon Sep 17 00:00:00 2001 From: Dmitry S Date: Fri, 18 Aug 2023 17:03:27 -0400 Subject: [PATCH] (core) Improve encoding detection for csv imports, and make encoding an editable option. Summary: - Using a sample of data was causing poor detection if the sample were cut mid-character. Switch to using line-based detection. - Add a simple option for changing encoding. No convenient UI is offered since config UI is auto-generated, but this at least makes it possible to recover from bad guesses. - Upgrades chardet library for good measure. - Also fixes python3-building step, to more reliably rebuild Python dependencies when requirements3.* files change. Test Plan: Added a python-side test case, and a browser test that encodings can be switched, errors are displayed, and wrong encodings fail recoverably. Reviewers: alexmojaki Reviewed By: alexmojaki Differential Revision: https://phab.getgrist.com/D3979 --- app/client/components/Importer.ts | 12 +++ app/client/components/ParseOptions.ts | 3 +- app/plugin/FileParserAPI-ti.ts | 1 + app/plugin/FileParserAPI.ts | 1 + .../imports/fixtures/test_encoding_utf8.csv | 4 + sandbox/grist/imports/import_csv.py | 75 +++++++++++++++---- sandbox/grist/imports/import_csv_test.py | 75 +++++++++++++++++-- sandbox/requirements3.txt | 2 +- test/server/testUtils.ts | 13 ++-- 9 files changed, 160 insertions(+), 26 deletions(-) create mode 100644 sandbox/grist/imports/fixtures/test_encoding_utf8.csv diff --git a/app/client/components/Importer.ts b/app/client/components/Importer.ts index b33c9f38..d8941e1f 100644 --- a/app/client/components/Importer.ts +++ b/app/client/components/Importer.ts @@ -1082,6 +1082,7 @@ export class Importer extends DisposableWithEvents { options, ) ]), + cssWarningText(dom.text(use => use(this._parseOptions)?.WARNING || ""), testId('warning')), dom.domComputed(use => { if (use(isSkipTable)) { return cssOverlay(t('Skip Table on Import'), testId('preview-overlay')); @@ -1098,6 
+1099,7 @@ export class Importer extends DisposableWithEvents { ) ); }); + const buttons = cssImportButtons(cssImportButtonsLine( bigPrimaryButton('Import', dom.on('click', () => this._maybeFinishImport(upload)), @@ -1369,6 +1371,10 @@ export class Importer extends DisposableWithEvents { (p: ParseOptions) => { anotherScreen.dispose(); this._parseOptions.set(p); + // Drop what we previously matched because we may have different columns. + // If user manually matched, then changed import options, they'll have to re-match; when + // columns change at all, the alternative has incorrect columns in UI and is more confusing. + this._sourceInfoArray.set([]); this._reImport(upload).catch((err) => reportError(err)); }, () => { @@ -1517,6 +1523,12 @@ const cssTabsWrapper = styled('div', ` flex-direction: column; `); +const cssWarningText = styled('div', ` + margin-bottom: 8px; + color: ${theme.errorText}; + white-space: pre-line; +`); + const cssTableList = styled('div', ` align-self: flex-start; max-width: 100%; diff --git a/app/client/components/ParseOptions.ts b/app/client/components/ParseOptions.ts index d2905759..41dcb08a 100644 --- a/app/client/components/ParseOptions.ts +++ b/app/client/components/ParseOptions.ts @@ -1,6 +1,7 @@ import {bigBasicButton, bigPrimaryButton} from 'app/client/ui2018/buttons'; import {squareCheckbox} from 'app/client/ui2018/checkbox'; import {testId, theme} from 'app/client/ui2018/cssVars'; +import {makeLinks} from 'app/client/ui2018/links'; import {cssModalButtons} from 'app/client/ui2018/modals'; import {ParseOptionSchema} from 'app/plugin/FileParserAPI'; import {Computed, dom, DomContents, IDisposableOwner, input, Observable, styled} from 'grainjs'; @@ -60,7 +61,7 @@ export function buildParseOptionsForm( return [ cssParseOptionForm( items.map((item) => cssParseOption( - cssParseOptionName(item.label), + cssParseOptionName(makeLinks(item.label)), optionToInput(owner, item.type, optionsMap.get(item.name)!), testId('parseopts-opt'), )), diff 
--git a/app/plugin/FileParserAPI-ti.ts b/app/plugin/FileParserAPI-ti.ts index d37da466..51d2cf7c 100644 --- a/app/plugin/FileParserAPI-ti.ts +++ b/app/plugin/FileParserAPI-ti.ts @@ -15,6 +15,7 @@ export const ParseFileAPI = t.iface([], { export const ParseOptions = t.iface([], { "NUM_ROWS": t.opt("number"), "SCHEMA": t.opt(t.array("ParseOptionSchema")), + "WARNING": t.opt("string"), }); export const ParseOptionSchema = t.iface([], { diff --git a/app/plugin/FileParserAPI.ts b/app/plugin/FileParserAPI.ts index 867f41b4..6a207984 100644 --- a/app/plugin/FileParserAPI.ts +++ b/app/plugin/FileParserAPI.ts @@ -20,6 +20,7 @@ export interface ParseFileAPI { export interface ParseOptions { NUM_ROWS?: number; SCHEMA?: ParseOptionSchema[]; + WARNING?: string; // Only on response, includes a warning from parsing, if any. } /** diff --git a/sandbox/grist/imports/fixtures/test_encoding_utf8.csv b/sandbox/grist/imports/fixtures/test_encoding_utf8.csv new file mode 100644 index 00000000..f2d30bca --- /dev/null +++ b/sandbox/grist/imports/fixtures/test_encoding_utf8.csv @@ -0,0 +1,4 @@ +Name,Age,Επάγγελμα,Πόλη +John Smith,30,Γιατρός,Athens +Μαρία Παπαδοπούλου,25,Engineer,Thessaloniki +Δημήτρης Johnson,40,Δικηγόρος,Piraeus diff --git a/sandbox/grist/imports/import_csv.py b/sandbox/grist/imports/import_csv.py index f35498c3..b6d3b2b0 100644 --- a/sandbox/grist/imports/import_csv.py +++ b/sandbox/grist/imports/import_csv.py @@ -14,7 +14,7 @@ from imports import import_utils log = logging.getLogger(__name__) -log.setLevel(logging.WARNING) +log.setLevel(logging.INFO) SCHEMA = [ { @@ -77,7 +77,14 @@ SCHEMA = [ 'label': 'Number of rows', 'type': 'number', 'visible': False, - }] + }, + { + 'name': 'encoding', + 'label': 'Character encoding. 
See https://tinyurl.com/py3codecs', + 'type': 'string', + 'visible': True, + } + ] def parse_file_source(file_source, options): parsing_options, export_list = parse_file(import_utils.get_path(file_source["path"]), options) @@ -91,16 +98,32 @@ def parse_file(file_path, parse_options=None): """ parse_options = parse_options or {} - with codecs.open(file_path, "rb") as f: - sample = f.read(100000) - encoding = chardet.detect(sample)['encoding'] or "utf8" - # In addition, always prefer UTF8 over ASCII. - if encoding == 'ascii': - encoding = 'utf8' - log.info("Using encoding %s", encoding) + given_encoding = parse_options.get('encoding') + encoding = given_encoding or detect_encoding(file_path) + log.info("Using encoding %s (%s)", encoding, "given" if given_encoding else "detected") - with codecs.open(file_path, mode="r", encoding=encoding) as f: + try: + return _parse_with_encoding(file_path, parse_options, encoding) + except Exception as e: + encoding = 'utf-8' + # For valid encodings, we can do our best and report count of errors. But an invalid encoding + # or one with a BOM will produce an exception. For those, fall back to utf-8. + parsing_options, export_list = _parse_with_encoding(file_path, parse_options, encoding) + parsing_options["WARNING"] = "{}: {}. Falling back to {}.\n{}".format( + type(e).__name__, e, encoding, parsing_options.get("WARNING", "")) + return parsing_options, export_list + + +def _parse_with_encoding(file_path, parse_options, encoding): + codec_errors = CodecErrorsReplace() + codecs.register_error('custom', codec_errors) + with codecs.open(file_path, mode="r", encoding=encoding, errors="custom") as f: parsing_options, export_list = _parse_open_file(f, parse_options=parse_options) + parsing_options["encoding"] = encoding + if codec_errors.error_count: + parsing_options["WARNING"] = ( + "Using encoding %s, encountered %s errors. 
Use Import Options to change" % + (encoding, codec_errors.error_count)) return parsing_options, export_list @@ -204,6 +227,32 @@ def _parse_open_file(file_obj, parse_options=None): return options, export_list -def get_version(): - """ Return name and version of plug-in""" - pass + +class CodecErrorsReplace(object): + def __init__(self): + self.error_count = 0 + self.first_error = None + + def __call__(self, error): + self.error_count += 1 + if not self.first_error: + self.first_error = error + return codecs.replace_errors(error) + + +def detect_encoding(file_path): + # Use line-by-line detection as suggested in + # https://chardet.readthedocs.io/en/latest/usage.html#advanced-usage. + # Using a fixed-sized sample is worse as the sample may end mid-character. + detector = chardet.UniversalDetector() + with codecs.open(file_path, "rb") as f: + for line in f.readlines(): + detector.feed(line) + if detector.done: + break + detector.close() + encoding = detector.result["encoding"] + # Default to utf-8, and always prefer it over ASCII as the most common superset. + if not encoding or encoding == 'ascii': + encoding = 'utf-8' + return encoding diff --git a/sandbox/grist/imports/import_csv_test.py b/sandbox/grist/imports/import_csv_test.py index c715cd6e..4d0f1a6f 100644 --- a/sandbox/grist/imports/import_csv_test.py +++ b/sandbox/grist/imports/import_csv_test.py @@ -1,9 +1,11 @@ # This Python file uses the following encoding: utf-8 +# pylint:disable=line-too-long +import csv import os import textwrap +import tempfile import unittest from six import StringIO, text_type -import csv from imports import import_csv @@ -11,6 +13,20 @@ from imports import import_csv def _get_fixture(filename): return os.path.join(os.path.dirname(__file__), "fixtures", filename) +# For a non-utf8 fixture, there is a problem with 'arc diff' which can't handle files with +# non-utf8 encodings. So create one on the fly. 
+non_utf8_fixture = None +non_utf8_file = None +def setUpModule(): + global non_utf8_file, non_utf8_fixture # pylint:disable=global-statement + with open(_get_fixture('test_encoding_utf8.csv')) as f: + non_utf8_file = tempfile.NamedTemporaryFile(mode='wb') + non_utf8_file.write(f.read().encode('iso-8859-7')) + non_utf8_file.flush() + non_utf8_fixture = non_utf8_file.name + +def tearDownModule(): + non_utf8_file.close() class TestImportCSV(unittest.TestCase): @@ -53,7 +69,7 @@ class TestImportCSV(unittest.TestCase): def test_csv_types(self): options, parsed_file = import_csv.parse_file(_get_fixture('test_excel_types.csv'), parse_options='') sheet = parsed_file[0] - self._check_options(options) + self._check_options(options, encoding='utf-8') self._check_col(sheet, 0, "int1", "Int", [-1234123, '', '']) self._check_col(sheet, 1, "int2", "Int", [5, '', '']) @@ -84,8 +100,9 @@ class TestImportCSV(unittest.TestCase): options["parse_options"].pop("limit_rows") options["parse_options"].pop("quoting") options["parse_options"].pop("escapechar") + options["parse_options"]["encoding"] = "utf-8" # Expected encoding self.assertEqual(options["parse_options"], parsed_options) - self._check_options(parsed_options) + self._check_options(parsed_options, encoding='utf-8') parsed_file = parsed_file[0] self._check_num_cols(parsed_file, 5) @@ -385,7 +402,7 @@ class TestImportCSV(unittest.TestCase): def test_csv_with_very_long_cell(self): options, parsed_file = import_csv.parse_file(_get_fixture('test_long_cell.csv'), parse_options='') - self._check_options(options) + self._check_options(options, encoding='utf-8') sheet = parsed_file[0] long_cell = sheet["table_data"][1][0] self.assertEqual(len(long_cell), 8058) @@ -394,13 +411,61 @@ class TestImportCSV(unittest.TestCase): def test_csv_with_surprising_isdigit(self): options, parsed_file = import_csv.parse_file(_get_fixture('test_isdigit.csv'), parse_options='') - self._check_options(options) + self._check_options(options, 
encoding='utf-8') sheet = parsed_file[0] self._check_num_cols(sheet, 3) self._check_col(sheet, 0, "PHONE", "Text", [u'201-¾᠓𑄺꤈꤈꧐꤆']) self._check_col(sheet, 1, "VALUE", "Text", [u'¹5']) self._check_col(sheet, 2, "DATE", "Text", [u'2018-0²-27 16:08:39 +0000']) + def test_csv_encoding_detection_utf8(self): + options, parsed_file = import_csv.parse_file(_get_fixture('test_encoding_utf8.csv'), parse_options='') + self._check_options(options, encoding='utf-8') + sheet = parsed_file[0] + self._check_col(sheet, 0, "Name", "Text", [u'John Smith', u'Μαρία Παπαδοπούλου', u'Δημήτρης Johnson']) + self._check_col(sheet, 2, "Επάγγελμα", "Text", [u'Γιατρός', u'Engineer', u'Δικηγόρος']) + + def test_csv_encoding_detection_greek(self): + # ISO-8859-7 is close to CP1253, and this fixture file would be identical in these two. + options, parsed_file = import_csv.parse_file(non_utf8_fixture, parse_options='') + self._check_options(options, encoding='ISO-8859-7') + sheet = parsed_file[0] + self._check_col(sheet, 0, "Name", "Text", [u'John Smith', u'Μαρία Παπαδοπούλου', u'Δημήτρης Johnson']) + self._check_col(sheet, 2, "Επάγγελμα", "Text", [u'Γιατρός', u'Engineer', u'Δικηγόρος']) + + # Similar enough encoding that the result is correct. + options, parsed_file = import_csv.parse_file(non_utf8_fixture, parse_options={"encoding": "cp1253"}) + self._check_options(options, encoding='cp1253') # The encoding should be respected + sheet = parsed_file[0] + self._check_col(sheet, 0, "Name", "Text", [u'John Smith', u'Μαρία Παπαδοπούλου', u'Δημήτρης Johnson']) + self._check_col(sheet, 2, "Επάγγελμα", "Text", [u'Γιατρός', u'Engineer', u'Δικηγόρος']) + + def test_csv_encoding_errors_are_handled(self): + # With ascii, we'll get many decoding errors, but parsing should still succeed. 
+ parse_options = { + "encoding": "ascii", + "include_col_names_as_headers": True, + } + options, parsed_file = import_csv.parse_file(non_utf8_fixture, parse_options=parse_options) + self._check_options(options, + encoding='ascii', + WARNING='Using encoding ascii, encountered 108 errors. Use Import Options to change') + sheet = parsed_file[0] + self._check_col(sheet, 0, "Name", "Text", [u'John Smith', u'����� ������������', u'�������� Johnson']) + self._check_col(sheet, 2, "���������", "Text", [u'�������', u'Engineer', u'���������']) + + def test_csv_encoding_mismatch(self): + # Here we use a wrong single-byte encoding, to check that it succeeds even if with nonsense. + parse_options = { + "encoding": "cp1254", + "include_col_names_as_headers": True, + } + options, parsed_file = import_csv.parse_file(non_utf8_fixture, parse_options=parse_options) + self._check_options(options, encoding='cp1254') + sheet = parsed_file[0] + self._check_col(sheet, 0, "Name", "Text", [u'John Smith', u'Ìáñßá Ğáğáäïğïıëïõ', u'ÄçìŞôñçò Johnson']) + self._check_col(sheet, 2, "ÅğÜããåëìá", "Text", [u'Ãéáôñüò', u'Engineer', u'Äéêçãüñïò']) + if __name__ == '__main__': unittest.main() diff --git a/sandbox/requirements3.txt b/sandbox/requirements3.txt index ef5dc1a8..d8168ae3 100644 --- a/sandbox/requirements3.txt +++ b/sandbox/requirements3.txt @@ -10,7 +10,7 @@ asttokens==2.2.1 # via # friendly-traceback # stack-data -chardet==4.0.0 +chardet==5.1.0 # via -r core/sandbox/requirements3.in et-xmlfile==1.0.1 # via openpyxl diff --git a/test/server/testUtils.ts b/test/server/testUtils.ts index 31642fcd..cdd4e72f 100644 --- a/test/server/testUtils.ts +++ b/test/server/testUtils.ts @@ -17,6 +17,7 @@ import * as path from 'path'; import * as fse from 'fs-extra'; import clone = require('lodash/clone'); import * as tmp from 'tmp-promise'; +import {Options as TmpOptions} from 'tmp'; import * as winston from 'winston'; import { serialize } from 'winston/lib/winston/common'; @@ -27,15 +28,15 @@ import { 
getAppRoot } from 'app/server/lib/places'; /** * Creates a temporary file with the given contents. * @param {String} content. Data to store in the file. - * @param {[Boolean]} optKeep. Optionally pass in true to keep the file from being deleted, which + * @param {[Boolean]} options.keep. Optionally pass in true to keep the file from being deleted, which * is useful to see the content while debugging a test. * @returns {Promise} A promise for the path of the new file. */ -export async function writeTmpFile(content: any, optKeep?: boolean) { +export async function writeTmpFile(content: any, options: TmpOptions = {}) { // discardDescriptor ensures tmp module closes it. It can lead to horrible bugs to close this // descriptor yourself, since tmp also closes it on exit, and if it's a different descriptor by // that time, it can lead to a crash. See https://github.com/raszi/node-tmp/issues/168 - const obj = await tmp.file({keep: optKeep, discardDescriptor: true}); + const obj = await tmp.file({discardDescriptor: true, ...options}); await fse.writeFile(obj.path, content); return obj.path; } @@ -44,17 +45,17 @@ export async function writeTmpFile(content: any, optKeep?: boolean) { * Creates a temporary file with `numLines` of generated data, each line about 30 bytes long. * This is useful for testing operations with large files. * @param {Number} numLines. How many lines to store in the file. - * @param {[Boolean]} optKeep. Optionally pass in true to keep the file from being deleted, which + * @param {[Boolean]} options.keep. Optionally pass in true to keep the file from being deleted, which * is useful to see the content while debugging a test. * @returns {Promise} A promise for the path of the new file. */ -export async function generateTmpFile(numLines: number, optKeep?: boolean) { +export async function generateTmpFile(numLines: number, options: TmpOptions = {}) { // Generate a bigger data file. 
const data = []; for (let i = 0; i < numLines; i++) { data.push(i + " abcdefghijklmnopqrstuvwxyz\n"); } - return writeTmpFile(data.join(""), optKeep); + return writeTmpFile(data.join(""), options); }