(core) Improve encoding detection for csv imports, and make encoding an editable option.

Summary:
- Using a fixed-size sample of data caused poor detection when the sample
  was cut mid-character. Switch to line-based detection.
- Add a simple option for changing encoding. No convenient UI is offered
  since config UI is auto-generated, but this at least makes it possible to
  recover from bad guesses.
- Upgrade the chardet library for good measure.

- Also fix the python3-building step, to rebuild Python dependencies more
  reliably when requirements3.* files change.

Test Plan:
Added a python-side test case, and a browser test that encodings can
be switched, errors are displayed, and wrong encodings fail recoverably.

Reviewers: alexmojaki

Reviewed By: alexmojaki

Differential Revision: https://phab.getgrist.com/D3979
This commit is contained in:
Dmitry S
2023-08-18 17:03:27 -04:00
parent b9adcefcce
commit d5a4605d2a
9 changed files with 160 additions and 26 deletions

View File

@@ -0,0 +1,4 @@
Name,Age,Επάγγελμα,Πόλη
John Smith,30,Γιατρός,Athens
Μαρία Παπαδοπούλου,25,Engineer,Thessaloniki
Δημήτρης Johnson,40,Δικηγόρος,Piraeus
1 Name Age Επάγγελμα Πόλη
2 John Smith 30 Γιατρός Athens
3 Μαρία Παπαδοπούλου 25 Engineer Thessaloniki
4 Δημήτρης Johnson 40 Δικηγόρος Piraeus

View File

@@ -14,7 +14,7 @@ from imports import import_utils
log = logging.getLogger(__name__)
log.setLevel(logging.WARNING)
log.setLevel(logging.INFO)
SCHEMA = [
{
@@ -77,7 +77,14 @@ SCHEMA = [
'label': 'Number of rows',
'type': 'number',
'visible': False,
}]
},
{
'name': 'encoding',
'label': 'Character encoding. See https://tinyurl.com/py3codecs',
'type': 'string',
'visible': True,
}
]
def parse_file_source(file_source, options):
parsing_options, export_list = parse_file(import_utils.get_path(file_source["path"]), options)
@@ -91,16 +98,32 @@ def parse_file(file_path, parse_options=None):
"""
parse_options = parse_options or {}
with codecs.open(file_path, "rb") as f:
sample = f.read(100000)
encoding = chardet.detect(sample)['encoding'] or "utf8"
# In addition, always prefer UTF8 over ASCII.
if encoding == 'ascii':
encoding = 'utf8'
log.info("Using encoding %s", encoding)
given_encoding = parse_options.get('encoding')
encoding = given_encoding or detect_encoding(file_path)
log.info("Using encoding %s (%s)", encoding, "given" if given_encoding else "detected")
with codecs.open(file_path, mode="r", encoding=encoding) as f:
try:
return _parse_with_encoding(file_path, parse_options, encoding)
except Exception as e:
encoding = 'utf-8'
# For valid encodings, we can do our best and report count of errors. But an invalid encoding
# or one with a BOM will produce an exception. For those, fall back to utf-8.
parsing_options, export_list = _parse_with_encoding(file_path, parse_options, encoding)
parsing_options["WARNING"] = "{}: {}. Falling back to {}.\n{}".format(
type(e).__name__, e, encoding, parsing_options.get("WARNING", ""))
return parsing_options, export_list
def _parse_with_encoding(file_path, parse_options, encoding):
    """Parse the file at file_path using the given character encoding.

    Decode errors do not abort the parse: a counting error handler substitutes
    the standard replacement behavior, and if any errors occurred, a WARNING
    entry is added to the returned parsing options so the user can pick a
    different encoding. Returns (parsing_options, export_list).
    """
    error_tracker = CodecErrorsReplace()
    # codecs error registration is global and keyed by name; 'custom' is
    # re-registered on every call so it always refers to this call's tracker.
    codecs.register_error('custom', error_tracker)
    with codecs.open(file_path, mode="r", encoding=encoding, errors="custom") as data_file:
        parsing_options, export_list = _parse_open_file(data_file, parse_options=parse_options)
    parsing_options["encoding"] = encoding
    if error_tracker.error_count:
        parsing_options["WARNING"] = (
            "Using encoding %s, encountered %s errors. Use Import Options to change" %
            (encoding, error_tracker.error_count))
    return parsing_options, export_list
@@ -204,6 +227,32 @@ def _parse_open_file(file_obj, parse_options=None):
return options, export_list
def get_version():
    """ Return name and version of plug-in"""
    # NOTE(review): stub — currently returns None despite the docstring; confirm
    # whether any caller expects a real name/version value here.
    pass
class CodecErrorsReplace(object):
    """Codecs error handler that counts errors while applying the standard
    'replace' behavior (U+FFFD substitution on decode).

    Register an instance via codecs.register_error() and inspect error_count
    and first_error afterwards to report how lossy the decode was.
    """

    def __init__(self):
        # Total number of errors this handler has been asked to resolve.
        self.error_count = 0
        # The first error encountered, kept for diagnostics.
        self.first_error = None

    def __call__(self, error):
        """Record the error, then delegate to codecs' built-in replace handler."""
        self.error_count += 1
        if self.first_error is None:
            self.first_error = error
        return codecs.replace_errors(error)
def detect_encoding(file_path):
    """Detect the character encoding of the file at file_path using chardet.

    Feeds the detector line by line, as suggested in
    https://chardet.readthedocs.io/en/latest/usage.html#advanced-usage.
    Using a fixed-size sample is worse, as the sample may end mid-character.
    Returns 'utf-8' when detection fails or reports plain ASCII, since UTF-8
    is the most common superset.
    """
    detector = chardet.UniversalDetector()
    with codecs.open(file_path, "rb") as f:
        # Iterate the file object directly: f.readlines() would read the whole
        # file into memory up front, defeating the early break below.
        for line in f:
            detector.feed(line)
            if detector.done:
                break
    detector.close()
    encoding = detector.result["encoding"]
    # Default to utf-8, and always prefer it over ASCII as the most common superset.
    if not encoding or encoding == 'ascii':
        encoding = 'utf-8'
    return encoding

View File

@@ -1,9 +1,11 @@
# This Python file uses the following encoding: utf-8
# pylint:disable=line-too-long
import csv
import os
import textwrap
import tempfile
import unittest
from six import StringIO, text_type
import csv
from imports import import_csv
@@ -11,6 +13,20 @@ from imports import import_csv
def _get_fixture(filename):
    """Return the path to `filename` inside the fixtures directory next to this test module."""
    return os.path.join(os.path.dirname(__file__), "fixtures", filename)
# For a non-utf8 fixture, there is a problem with 'arc diff' which can't handle files with
# non-utf8 encodings. So create one on the fly.
non_utf8_fixture = None   # Path of the generated ISO-8859-7 fixture file.
non_utf8_file = None      # NamedTemporaryFile backing it; kept open so it isn't deleted.


def setUpModule():
    """Create an ISO-8859-7 encoded copy of the UTF-8 fixture for the encoding tests."""
    global non_utf8_file, non_utf8_fixture # pylint:disable=global-statement
    # Read the fixture as raw bytes and decode explicitly: a bare open() would
    # decode using the locale's default encoding, which may not be UTF-8.
    with open(_get_fixture('test_encoding_utf8.csv'), 'rb') as f:
        text = f.read().decode('utf-8')
    non_utf8_file = tempfile.NamedTemporaryFile(mode='wb')
    non_utf8_file.write(text.encode('iso-8859-7'))
    non_utf8_file.flush()
    non_utf8_fixture = non_utf8_file.name
def tearDownModule():
    """Close (and thereby delete) the temporary non-utf8 fixture file."""
    non_utf8_file.close()
class TestImportCSV(unittest.TestCase):
@@ -53,7 +69,7 @@ class TestImportCSV(unittest.TestCase):
def test_csv_types(self):
options, parsed_file = import_csv.parse_file(_get_fixture('test_excel_types.csv'), parse_options='')
sheet = parsed_file[0]
self._check_options(options)
self._check_options(options, encoding='utf-8')
self._check_col(sheet, 0, "int1", "Int", [-1234123, '', ''])
self._check_col(sheet, 1, "int2", "Int", [5, '', ''])
@@ -84,8 +100,9 @@ class TestImportCSV(unittest.TestCase):
options["parse_options"].pop("limit_rows")
options["parse_options"].pop("quoting")
options["parse_options"].pop("escapechar")
options["parse_options"]["encoding"] = "utf-8" # Expected encoding
self.assertEqual(options["parse_options"], parsed_options)
self._check_options(parsed_options)
self._check_options(parsed_options, encoding='utf-8')
parsed_file = parsed_file[0]
self._check_num_cols(parsed_file, 5)
@@ -385,7 +402,7 @@ class TestImportCSV(unittest.TestCase):
def test_csv_with_very_long_cell(self):
options, parsed_file = import_csv.parse_file(_get_fixture('test_long_cell.csv'), parse_options='')
self._check_options(options)
self._check_options(options, encoding='utf-8')
sheet = parsed_file[0]
long_cell = sheet["table_data"][1][0]
self.assertEqual(len(long_cell), 8058)
@@ -394,13 +411,61 @@ class TestImportCSV(unittest.TestCase):
def test_csv_with_surprising_isdigit(self):
options, parsed_file = import_csv.parse_file(_get_fixture('test_isdigit.csv'), parse_options='')
self._check_options(options)
self._check_options(options, encoding='utf-8')
sheet = parsed_file[0]
self._check_num_cols(sheet, 3)
self._check_col(sheet, 0, "PHONE", "Text", [u'201-¾᠓𑄺꤈꤈꧐꤆'])
self._check_col(sheet, 1, "VALUE", "Text", [u'¹5'])
self._check_col(sheet, 2, "DATE", "Text", [u'2018-0²-27 16:08:39 +0000'])
    def test_csv_encoding_detection_utf8(self):
        """A UTF-8 fixture with Greek text should be auto-detected as utf-8 and parsed intact."""
        options, parsed_file = import_csv.parse_file(_get_fixture('test_encoding_utf8.csv'), parse_options='')
        self._check_options(options, encoding='utf-8')
        sheet = parsed_file[0]
        # Mixed Greek/Latin values survive the import unchanged.
        self._check_col(sheet, 0, "Name", "Text", [u'John Smith', u'Μαρία Παπαδοπούλου', u'Δημήτρης Johnson'])
        self._check_col(sheet, 2, "Επάγγελμα", "Text", [u'Γιατρός', u'Engineer', u'Δικηγόρος'])
    def test_csv_encoding_detection_greek(self):
        """Auto-detection should pick ISO-8859-7 for the generated Greek fixture, and an
        explicitly requested close encoding should be honored with correct results."""
        # ISO-8859-7 is close to CP1253, and this fixture file would be identical in these two.
        options, parsed_file = import_csv.parse_file(non_utf8_fixture, parse_options='')
        self._check_options(options, encoding='ISO-8859-7')
        sheet = parsed_file[0]
        self._check_col(sheet, 0, "Name", "Text", [u'John Smith', u'Μαρία Παπαδοπούλου', u'Δημήτρης Johnson'])
        self._check_col(sheet, 2, "Επάγγελμα", "Text", [u'Γιατρός', u'Engineer', u'Δικηγόρος'])
        # Similar enough encoding that the result is correct.
        options, parsed_file = import_csv.parse_file(non_utf8_fixture, parse_options={"encoding": "cp1253"})
        self._check_options(options, encoding='cp1253') # The encoding should be respected
        sheet = parsed_file[0]
        self._check_col(sheet, 0, "Name", "Text", [u'John Smith', u'Μαρία Παπαδοπούλου', u'Δημήτρης Johnson'])
        self._check_col(sheet, 2, "Επάγγελμα", "Text", [u'Γιατρός', u'Engineer', u'Δικηγόρος'])
    def test_csv_encoding_errors_are_handled(self):
        """Decoding with a too-narrow encoding should still parse, with errors reported."""
        # With ascii, we'll get many decoding errors, but parsing should still succeed.
        parse_options = {
            "encoding": "ascii",
            "include_col_names_as_headers": True,
        }
        options, parsed_file = import_csv.parse_file(non_utf8_fixture, parse_options=parse_options)
        self._check_options(options,
            encoding='ascii',
            WARNING='Using encoding ascii, encountered 108 errors. Use Import Options to change')
        sheet = parsed_file[0]
        # NOTE(review): the expected values below should be runs of U+FFFD replacement
        # characters (one per undecodable byte); they appear mojibake'd ("<EFBFBD>")
        # here, likely from a display/copy step — confirm against the original file.
        self._check_col(sheet, 0, "Name", "Text", [u'John Smith', u'<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD> <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>', u'<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD> Johnson'])
        self._check_col(sheet, 2, "<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>", "Text", [u'<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>', u'Engineer', u'<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>'])
    def test_csv_encoding_mismatch(self):
        """A wrong (but valid) single-byte encoding should still parse recoverably."""
        # Here we use a wrong single-byte encoding, to check that it succeeds even if with nonsense.
        parse_options = {
            "encoding": "cp1254",
            "include_col_names_as_headers": True,
        }
        options, parsed_file = import_csv.parse_file(non_utf8_fixture, parse_options=parse_options)
        self._check_options(options, encoding='cp1254')
        sheet = parsed_file[0]
        # Every ISO-8859-7 byte is a valid cp1254 character, so parsing succeeds
        # but Greek text comes out as mojibake.
        self._check_col(sheet, 0, "Name", "Text", [u'John Smith', u'Ìáñßá Ğáğáäïğïıëïõ', u'ÄçìŞôñçò Johnson'])
        self._check_col(sheet, 2, "ÅğÜããåëìá", "Text", [u'Ãéáôñüò', u'Engineer', u'Äéêçãüñïò'])
# Allow running this test module directly with `python`.
if __name__ == '__main__':
    unittest.main()

View File

@@ -10,7 +10,7 @@ asttokens==2.2.1
# via
# friendly-traceback
# stack-data
chardet==4.0.0
chardet==5.1.0
# via -r core/sandbox/requirements3.in
et-xmlfile==1.0.1
# via openpyxl