(core) Fix import parsing choking on Python isdigit() surprises

Summary:
Python's str.isdigit() returns True for Unicode characters such as "²", which
then raise a ValueError when passed to int().

Instead, be explicit about only considering characters 0-9 to be digits.

Test Plan: Added a test case which produces an error without this change.

Reviewers: alexmojaki

Reviewed By: alexmojaki

Differential Revision: https://phab.getgrist.com/D3027
This commit is contained in:
Dmitry S 2021-09-20 14:38:02 -04:00
parent cd241a633a
commit 64d9faed5a
3 changed files with 27 additions and 12 deletions

View File

@ -63,39 +63,44 @@ TZ_VALID_NAMES = {z[0] for z in moment.get_tz_data().items()}
AM_PM = {'am', 'pm'} AM_PM = {'am', 'pm'}
DAYS_OF_WEEK_NAME = calendar.day_name DAYS_OF_WEEK_NAME = calendar.day_name
DAYS_OF_WEEK_ABBR = calendar.day_abbr DAYS_OF_WEEK_ABBR = calendar.day_abbr
# Matches strings consisting solely of one or more ASCII digits 0-9. The pattern is anchored
# with \Z rather than $, because $ also matches just before a trailing newline and would let
# a string like '12\n' through.
ASCII_DIGITS_RE = re.compile(r'^[0-9]+\Z')
# Using x.isdigit() matches strings like u'\xb2' (superscripts) which we don't want.
# Use isdigit(x) instead, to only match ASCII digits 0-9.
# Note: isdigit(x) returns a match object (truthy) or None (falsy), not a bool.
isdigit = ASCII_DIGITS_RE.match
DATE_ELEMENTS = [ DATE_ELEMENTS = [
# Name Pattern Predicate Group (mutual exclusive) Consumes N prev elements # Name Pattern Predicate Group (mutual exclusive) Consumes N prev elements
("Year", "%Y", lambda x, p, v: x.isdigit() and len(x) == 4, "Y", 0), ("Year", "%Y", lambda x, p, v: isdigit(x) and len(x) == 4, "Y", 0),
("Year short", "%y", lambda x, p, v: x.isdigit() and len(x) == 2, "Y", 0), ("Year short", "%y", lambda x, p, v: isdigit(x) and len(x) == 2, "Y", 0),
("Month", "%m", lambda x, p, v: x.isdigit() and len(x) <= 2 and 0 < int(x) <= 12, "m", 0), ("Month", "%m", lambda x, p, v: isdigit(x) and len(x) <= 2 and 0 < int(x) <= 12, "m", 0),
("Month name full", "%B", lambda x, p, v: x.isalpha() and x.capitalize() in MONTH_NAME, "m", 0), ("Month name full", "%B", lambda x, p, v: x.isalpha() and x.capitalize() in MONTH_NAME, "m", 0),
("Month name abbr", "%b", lambda x, p, v: x.isalpha() and x.capitalize() in MONTH_ABBR, "m", 0), ("Month name abbr", "%b", lambda x, p, v: x.isalpha() and x.capitalize() in MONTH_ABBR, "m", 0),
("Day", "%d", lambda x, p, v: x.isdigit() and len(x) <= 2 and 0 < int(x) <= 31, "d", 0), ("Day", "%d", lambda x, p, v: isdigit(x) and len(x) <= 2 and 0 < int(x) <= 31, "d", 0),
("Day of week", "%A", lambda x, p, v: x.isalpha() ("Day of week", "%A", lambda x, p, v: x.isalpha()
and x.capitalize() in DAYS_OF_WEEK_NAME, "a", 0), and x.capitalize() in DAYS_OF_WEEK_NAME, "a", 0),
("Day of week abbr", "%a", lambda x, p, v: x.isalpha() ("Day of week abbr", "%a", lambda x, p, v: x.isalpha()
and x.capitalize() in DAYS_OF_WEEK_ABBR, "a", 0), and x.capitalize() in DAYS_OF_WEEK_ABBR, "a", 0),
("Compound HHMMSS", "%H%M%S", lambda x, p, v: x.isdigit() and len(x) == 6 ("Compound HHMMSS", "%H%M%S", lambda x, p, v: isdigit(x) and len(x) == 6
and 0 <= int(x[0:2]) < 24 and 0 <= int(x[0:2]) < 24
and 0 <= int(x[2:4]) < 60 and 0 <= int(x[2:4]) < 60
and 0 <= int(x[4:6]) < 60, "HMS", 0), and 0 <= int(x[4:6]) < 60, "HMS", 0),
("Hour", "%H", lambda x, p, v: x.isdigit() and len(x) <= 2 and 0 <= int(x) <= 23, "H", 0), ("Hour", "%H", lambda x, p, v: isdigit(x) and len(x) <= 2 and 0 <= int(x) <= 23, "H", 0),
("Hour in 12hr mode", "%I", lambda x, p, v: x.isdigit() and len(x) <= 2 ("Hour in 12hr mode", "%I", lambda x, p, v: isdigit(x) and len(x) <= 2
and 0 <= int(x) <= 11, "H", 0), and 0 <= int(x) <= 11, "H", 0),
("AM/PM", "%p", lambda x, p, v: x.isalpha() and len(x) == 2 and x.lower() in AM_PM, "p", 0), ("AM/PM", "%p", lambda x, p, v: x.isalpha() and len(x) == 2 and x.lower() in AM_PM, "p", 0),
("Minutes", "%M", lambda x, p, v: x.isdigit() and len(x) <= 2 and 0 <= int(x) <= 59, "M", 0), ("Minutes", "%M", lambda x, p, v: isdigit(x) and len(x) <= 2 and 0 <= int(x) <= 59, "M", 0),
("Seconds", "%S", lambda x, p, v: x.isdigit() and len(x) <= 2 and 0 <= int(x) <= 59, "S", 0), ("Seconds", "%S", lambda x, p, v: isdigit(x) and len(x) <= 2 and 0 <= int(x) <= 59, "S", 0),
("Fraction of second", "%f", lambda x, p, v: x.isdigit() and p is not None ("Fraction of second", "%f", lambda x, p, v: isdigit(x) and p is not None
and p.val == '.', "f", 0), and p.val == '.', "f", 0),
("Timezone name", "%Z", lambda x, p, v: x.isalpha() and len(x) > 2 ("Timezone name", "%Z", lambda x, p, v: x.isalpha() and len(x) > 2
and x in TZ_VALID_NAMES, "Z", 0), and x in TZ_VALID_NAMES, "Z", 0),
("Timezone +HHMM", "%z", lambda x, p, v: x.isdigit() and len(x) == 4 and 0 <= int(x[0:2]) < 15 ("Timezone +HHMM", "%z", lambda x, p, v: isdigit(x) and len(x) == 4 and 0 <= int(x[0:2]) < 15
and 0 <= int(x[2:4]) < 60 and p is not None and 0 <= int(x[2:4]) < 60 and p is not None
and p.val == '+', "Z", 1), and p.val == '+', "Z", 1),
("Timezone -HHMM", "%z", lambda x, p, v: x.isdigit() and len(x) == 4 and 0 <= int(x[0:2]) < 15 ("Timezone -HHMM", "%z", lambda x, p, v: isdigit(x) and len(x) == 4 and 0 <= int(x[0:2]) < 15
and 0 <= int(x[2:4]) < 60 and p is not None and 0 <= int(x[2:4]) < 60 and p is not None
and p.val == '-', "Z", 1), and p.val == '-', "Z", 1),
] ]

View File

@ -0,0 +1,2 @@
PHONE,VALUE,DATE
201-¾᠓𑄺꤈꤈꧐꤆,¹5,2018-0²-27 16:08:39 +0000
1 PHONE VALUE DATE
2 201-¾᠓𑄺꤈꤈꧐꤆ ¹5 2018-0²-27 16:08:39 +0000

View File

@ -336,6 +336,14 @@ class TestImportCSV(unittest.TestCase):
self._check_col(sheet, 0, "ID", "Int", [17]) self._check_col(sheet, 0, "ID", "Int", [17])
self._check_col(sheet, 1, "LongText", "Text", [long_cell]) self._check_col(sheet, 1, "LongText", "Text", [long_cell])
def test_csv_with_surprising_isdigit(self):
    # The fixture's cells mix superscripts, a fraction and other non-ASCII characters into
    # phone-, number- and date-like values; all three columns are expected to be Text.
    fixture_path = _get_fixture('test_isdigit.csv')
    first_sheet = import_csv.parse_file(fixture_path, parse_options='')[1][0]
    self._check_num_cols(first_sheet, 3)
    expected_columns = (
        ("PHONE", [u'201-¾᠓𑄺꤈꤈꧐꤆']),
        ("VALUE", [u'¹5']),
        ("DATE", [u'2018-0²-27 16:08:39 +0000']),
    )
    for col_index, (col_name, col_values) in enumerate(expected_columns):
        self._check_col(first_sheet, col_index, col_name, "Text", col_values)
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()