(core) Run and test imports only in Python 3, upgrade openpyxl, fix weird date handling

Summary:
Python 2 only needs to be supported for the sake of old documents and formulas. This doesn't apply to the separate sandboxes that parse files for imports. Using Python 3 only allows using newer libraries and library versions. In particular, the latest version of openpyxl doesn't support Python 2. This will also make it easier to make other similar changes in the future, such as replacing messytables with a modern library. See https://grist.slack.com/archives/C0234CPPXPA/p1661261829343999?thread_ts=1661260442.837959&cid=C0234CPPXPA

The latest openpyxl is better at handling a particular edge case with broken dates in Excel, but still doesn't quite do what we want, so we monkeypatch it. Discussion: https://grist.slack.com/archives/C02EGJ1FUCV/p1661440851911869?thread_ts=1661154219.515549&cid=C02EGJ1FUCV

Setting `preferredPythonVersion` to '3' in SafePythonComponent ensures that JS always creates import sandboxes that use Python 3. Within Python, a module used by all imports will raise an error in Python 2. Python unit tests of imports are now only run in Python 3, using the `load_tests` protocol of `unittest`.

Test Plan: Mostly existing tests. Added another strange date to the Excel fixture.

Reviewers: dsagal

Reviewed By: dsagal

Subscribers: dsagal

Differential Revision: https://phab.getgrist.com/D3606
This commit is contained in:
Alex Hall 2022-09-02 16:02:34 +02:00
parent ecf2fdf71a
commit 42afb17e36
12 changed files with 68 additions and 18 deletions

View File

@ -7,6 +7,7 @@ import * as t from "ts-interface-checker";
export const ColumnToMap = t.iface([], { export const ColumnToMap = t.iface([], {
"name": "string", "name": "string",
"title": t.opt(t.union("string", "null")), "title": t.opt(t.union("string", "null")),
"description": t.opt(t.union("string", "null")),
"type": t.opt("string"), "type": t.opt("string"),
"optional": t.opt("boolean"), "optional": t.opt("boolean"),
"allowMultiple": t.opt("boolean"), "allowMultiple": t.opt("boolean"),

View File

@ -41,6 +41,7 @@ export class SafePythonComponent extends BaseComponent {
importMount: this._tmpDir, importMount: this._tmpDir,
logTimes: true, logTimes: true,
logMeta: this._logMeta, logMeta: this._logMeta,
preferredPythonVersion: '3',
}); });
} }

View File

@ -1,17 +1,17 @@
""" """
Helper functions for import plugins Helper functions for import plugins
""" """
import sys
import itertools import itertools
import logging import logging
import os import os
# Include /thirdparty into module search paths, in particular for messytables.
sys.path.append('/thirdparty')
import six import six
from six.moves import zip from six.moves import zip
if six.PY2:
raise RuntimeError("Imports should use a Python 3 environment")
log = logging.getLogger(__name__) log = logging.getLogger(__name__)
# Get path to an imported file. # Get path to an imported file.

View File

@ -5,8 +5,10 @@ and returns a object formatted so that it can be used by grist for a bulk add re
import logging import logging
import messytables import messytables
import openpyxl
import six import six
import openpyxl
from openpyxl.utils.datetime import from_excel
from openpyxl.worksheet import _reader
from six.moves import zip from six.moves import zip
import parse_data import parse_data
@ -15,6 +17,19 @@ from imports import import_utils
log = logging.getLogger(__name__) log = logging.getLogger(__name__)
# Some strange Excel files have values that are marked as dates but are invalid as dates.
# Normally, openpyxl converts these to `#VALUE!`, but keeping the original value is better.
# In the case where this was encountered, the original value is a large number:
# the 6281228502068 in test_excel_strange_dates.
# Here we monkeypatch openpyxl to keep the original value.
def new_from_excel(value, *args, **kwargs):
try:
return from_excel(value, *args, **kwargs)
except (ValueError, OverflowError):
return value
_reader.from_excel = new_from_excel
def import_file(file_source): def import_file(file_source):
path = import_utils.get_path(file_source["path"]) path = import_utils.get_path(file_source["path"])
parse_options, tables = parse_file(path) parse_options, tables = parse_file(path)

View File

@ -133,7 +133,8 @@ class TestImportXLS(unittest.TestCase):
{'id': 'f', 'type': 'Date'}, {'id': 'f', 'type': 'Date'},
{'id': 'g', 'type': 'Any'}, {'id': 'g', 'type': 'Any'},
{'id': 'h', 'type': 'Date'}, {'id': 'h', 'type': 'Date'},
{'id': 'i', 'type': 'Date'}, {'id': 'i', 'type': 'Any'},
{'id': 'j', 'type': 'Numeric'},
], ],
'table_data': [ 'table_data': [
[u'21:14:00'], [u'21:14:00'],
@ -144,7 +145,8 @@ class TestImportXLS(unittest.TestCase):
[-2207347200.0], [-2207347200.0],
[u'7/4/1776'], [u'7/4/1776'],
[205286400.0], [205286400.0],
[-2209161600.0], ['00:00:00'],
[6281228502068],
], ],
}]) }])

View File

@ -0,0 +1,41 @@
"""
Imports only run in Python 3 sandboxes, so the tests here only run in Python 3.
The test files in this directory have been renamed to look like 'import_xls_test.py' instead
of 'test_import_xls.py' so that they're not discovered automatically by default.
`load_tests` below then discovers that pattern directly, but only in Python 3.
This allows the tests to be skipped without having to specify a pattern when discovering tests.
The downside is that if you *do* want to specify a pattern, that probably won't work.
The reason for this method is that there's a bug in Python 2's unittest module
regarding packages and directories that we must handle.
This means that in Python 2, I wasn't able to prevent files like 'test_import_xls.py'
from being discovered automatically regardless of what I did in `load_tests`.
Compare https://docs.python.org/3.9/library/unittest.html#load-tests-protocol
vs https://docs.python.org/2/library/unittest.html#load-tests-protocol
from "If discovery is started" on both pages. Note in particular:
> Changed in version 3.5: Discovery no longer checks package names for matching pattern
due to the impossibility of package names matching the default pattern.
The reason for skipping entire files from being discovered instead of skipping TestCase classes
is that just importing the test file will fail with an error, both because we manually raise
an exception and because dependencies are missing.
"""
import os
import six
def load_tests(loader, standard_tests, _pattern):
if six.PY2:
return standard_tests
this_dir = os.path.join(os.path.dirname(__file__))
package_tests = list(loader.discover(start_dir=this_dir, pattern='*_test.py'))
if len(package_tests) < 3:
raise Exception("Expected more import tests to be discovered")
standard_tests.addTests(package_tests)
return standard_tests

View File

@ -3,22 +3,12 @@ asttokens==2.0.5
backports.functools-lru-cache==1.6.4 backports.functools-lru-cache==1.6.4
chardet==4.0.0 chardet==4.0.0
enum34==1.1.10 enum34==1.1.10
et-xmlfile==1.0.1
html5lib==1.1
iso8601==0.1.12 iso8601==0.1.12
jdcal==1.4.1
json_table_schema==0.2.1
lazy_object_proxy==1.6.0 lazy_object_proxy==1.6.0
lxml==4.6.3 # used in csv plugin only?
messytables==0.15.2
openpyxl==2.6.4
python_dateutil==2.8.2 python_dateutil==2.8.2
python_magic==0.4.12
roman==2.0.0 roman==2.0.0
singledispatch==3.6.2 singledispatch==3.6.2
six==1.16.0 six==1.16.0
sortedcontainers==2.4.0 sortedcontainers==2.4.0
webencodings==0.5
wrapt==1.12.1 wrapt==1.12.1
xlrd==1.2.0
unittest-xml-reporting==2.0.0 unittest-xml-reporting==2.0.0

View File

@ -20,8 +20,8 @@ json_table_schema==0.2.1
lazy_object_proxy==1.6.0 lazy_object_proxy==1.6.0
lxml==4.6.3 # used in csv plugin only? lxml==4.6.3 # used in csv plugin only?
messytables==0.15.2 messytables==0.15.2
openpyxl==2.6.4
python_dateutil==2.8.2 python_dateutil==2.8.2
openpyxl==3.0.10
python_magic==0.4.12 python_magic==0.4.12
roman==2.0.0 roman==2.0.0
singledispatch==3.6.2 singledispatch==3.6.2