(core) Fix import parsing choking on Python isdigit() surprises

Summary: Python's str.isdigit() returns True for Unicode characters such as "²", which then fail when passed as an argument to int(). Instead, be explicit about considering only the ASCII characters 0-9 to be digits. Test Plan: Added a test case that produces an error without this change. Reviewers: alexmojaki Reviewed By: alexmojaki Differential Revision: https://phab.getgrist.com/D3027
2026-03-02 04:09:24 +00:00 · 2021-09-20 14:38:02 -04:00
parent cd241a633a
commit 64d9faed5a
3 changed files with 27 additions and 12 deletions
--- a/sandbox/grist/imports/dateguess.py
+++ b/sandbox/grist/imports/dateguess.py
@@ -63,39 +63,44 @@ TZ_VALID_NAMES = {z[0] for z in moment.get_tz_data().items()}
 AM_PM = {'am', 'pm'}
 DAYS_OF_WEEK_NAME = calendar.day_name
 DAYS_OF_WEEK_ABBR = calendar.day_abbr
 ASCII_DIGITS_RE = re.compile(r'^[0-9]+$')
 # Using x.isdigit() matches strings like u'\xb2' (superscripts) which we don't want.
 # Use isdigit(x) instead, to only match ASCII digits 0-9.
 isdigit = ASCII_DIGITS_RE.match
 DATE_ELEMENTS = [
  # Name   Pattern  Predicate               Group (mutual exclusive)  Consumes N prev elements
-  ("Year", "%Y", lambda x, p, v: x.isdigit() and len(x) == 4, "Y", 0),
+  ("Year", "%Y", lambda x, p, v: isdigit(x) and len(x) == 4, "Y", 0),
-  ("Year short", "%y", lambda x, p, v: x.isdigit() and len(x) == 2, "Y", 0),
+  ("Year short", "%y", lambda x, p, v: isdigit(x) and len(x) == 2, "Y", 0),
-  ("Month", "%m", lambda x, p, v: x.isdigit() and len(x) <= 2 and 0 < int(x) <= 12, "m", 0),
+  ("Month", "%m", lambda x, p, v: isdigit(x) and len(x) <= 2 and 0 < int(x) <= 12, "m", 0),
  ("Month name full", "%B", lambda x, p, v: x.isalpha() and x.capitalize() in MONTH_NAME, "m", 0),
  ("Month name abbr", "%b", lambda x, p, v: x.isalpha() and x.capitalize() in MONTH_ABBR, "m", 0),
-  ("Day", "%d", lambda x, p, v: x.isdigit() and len(x) <= 2 and 0 < int(x) <= 31, "d", 0),
+  ("Day", "%d", lambda x, p, v: isdigit(x) and len(x) <= 2 and 0 < int(x) <= 31, "d", 0),
  ("Day of week", "%A", lambda x, p, v: x.isalpha()
                                        and x.capitalize() in DAYS_OF_WEEK_NAME, "a", 0),
  ("Day of week abbr", "%a", lambda x, p, v: x.isalpha()
                                             and x.capitalize() in DAYS_OF_WEEK_ABBR, "a", 0),
-  ("Compound HHMMSS", "%H%M%S", lambda x, p, v: x.isdigit() and len(x) == 6
+  ("Compound HHMMSS", "%H%M%S", lambda x, p, v: isdigit(x) and len(x) == 6
                                                and 0 <= int(x[0:2]) < 24
                                                and 0 <= int(x[2:4]) < 60
                                                and 0 <= int(x[4:6]) < 60, "HMS", 0),
-  ("Hour", "%H", lambda x, p, v: x.isdigit() and len(x) <= 2 and 0 <= int(x) <= 23, "H", 0),
+  ("Hour", "%H", lambda x, p, v: isdigit(x) and len(x) <= 2 and 0 <= int(x) <= 23, "H", 0),
-  ("Hour in 12hr mode", "%I", lambda x, p, v: x.isdigit() and len(x) <= 2
+  ("Hour in 12hr mode", "%I", lambda x, p, v: isdigit(x) and len(x) <= 2
                                              and 0 <= int(x) <= 11, "H", 0),
  ("AM/PM", "%p", lambda x, p, v: x.isalpha() and len(x) == 2 and x.lower() in AM_PM, "p", 0),
-  ("Minutes", "%M", lambda x, p, v: x.isdigit() and len(x) <= 2 and 0 <= int(x) <= 59, "M", 0),
+  ("Minutes", "%M", lambda x, p, v: isdigit(x) and len(x) <= 2 and 0 <= int(x) <= 59, "M", 0),
-  ("Seconds", "%S", lambda x, p, v: x.isdigit() and len(x) <= 2 and 0 <= int(x) <= 59, "S", 0),
+  ("Seconds", "%S", lambda x, p, v: isdigit(x) and len(x) <= 2 and 0 <= int(x) <= 59, "S", 0),
-  ("Fraction of second", "%f", lambda x, p, v: x.isdigit() and p is not None
+  ("Fraction of second", "%f", lambda x, p, v: isdigit(x) and p is not None
                                               and p.val == '.', "f", 0),
  ("Timezone name", "%Z", lambda x, p, v: x.isalpha() and len(x) > 2
                                          and x in TZ_VALID_NAMES, "Z", 0),
-  ("Timezone +HHMM", "%z", lambda x, p, v: x.isdigit() and len(x) == 4 and 0 <= int(x[0:2]) < 15
+  ("Timezone +HHMM", "%z", lambda x, p, v: isdigit(x) and len(x) == 4 and 0 <= int(x[0:2]) < 15
                                           and 0 <= int(x[2:4]) < 60 and p is not None
                                           and p.val == '+', "Z", 1),
-  ("Timezone -HHMM", "%z", lambda x, p, v: x.isdigit() and len(x) == 4 and 0 <= int(x[0:2]) < 15
+  ("Timezone -HHMM", "%z", lambda x, p, v: isdigit(x) and len(x) == 4 and 0 <= int(x[0:2]) < 15
                                           and 0 <= int(x[2:4]) < 60 and p is not None
                                           and p.val == '-', "Z", 1),
 ]
--- a/sandbox/grist/imports/fixtures/test_isdigit.csv
+++ b/sandbox/grist/imports/fixtures/test_isdigit.csv
@@ -0,0 +1,2 @@
 PHONE,VALUE,DATE
 201-¾᠓𑄺꤈꤈꧐꤆,¹5,2018-0²-27 16:08:39 +0000
--- a/sandbox/grist/imports/test_import_csv.py
+++ b/sandbox/grist/imports/test_import_csv.py
@@ -336,6 +336,14 @@ class TestImportCSV(unittest.TestCase):
    self._check_col(sheet, 0, "ID", "Int", [17])
    self._check_col(sheet, 1, "LongText", "Text", [long_cell])
  def test_csv_with_surprising_isdigit(self):
    # Regression test for cells containing characters such as u'¹' and u'²', for which
    # str.isdigit() is true but int() raises ValueError.
    parsed_file = import_csv.parse_file(_get_fixture('test_isdigit.csv'), parse_options='')
    # Index [1][0] of the parse result is used as the sheet under test
    # (presumably the first imported table -- confirm against parse_file).
    sheet = parsed_file[1][0]
    self._check_num_cols(sheet, 3)
    # The expected type passed for every column is "Text", and each expected value is
    # the fixture cell unchanged.
    self._check_col(sheet, 0, "PHONE", "Text", [u'201-¾᠓𑄺꤈꤈꧐꤆'])
    self._check_col(sheet, 1, "VALUE", "Text", [u'¹5'])
    self._check_col(sheet, 2, "DATE", "Text", [u'2018-0²-27 16:08:39 +0000'])
 if __name__ == '__main__':
  unittest.main()
		`@@ -0,0 +1,2 @@`
							`PHONE,VALUE,DATE`
							`201-¾᠓𑄺꤈꤈꧐꤆,¹5,2018-0²-27 16:08:39 +0000`