(core) Remove accents when picking identifiers

Summary: Uses Python's unicodedata module to normalise a string and remove combining characters, leaving behind more ASCII letters and fewer underscores.
Test Plan: Added unit test.
Reviewers: paulfitz
Reviewed By: paulfitz
Subscribers: dsagal
Differential Revision: https://phab.getgrist.com/D2994
2026-03-02 04:09:24 +00:00 · 2021-08-24 14:12:10 +02:00
parent b3636b97e2
commit 427a17d038
2 changed files with 62 additions and 4 deletions
--- a/sandbox/grist/identifiers.py
+++ b/sandbox/grist/identifiers.py
@@ -1,11 +1,17 @@
+# coding=utf-8
 """
 A module for creating and sanitizing identifiers
 """
-import re
-from string import ascii_uppercase
 import itertools
+import re
+import unicodedata
 from keyword import iskeyword
+from string import ascii_uppercase
+
+import six
+
 import logger
+
 log = logger.Logger(__name__, logger.INFO)

 _invalid_ident_char_re = re.compile(r'[^a-zA-Z0-9_]+')
@@ -20,7 +26,16 @@ def _sanitize_ident(ident, prefix="c", capitalize=False):
  Returns empty string if there are no valid identifier characters, so consider using as
  (_sanitize_ident(...) or "your_default").
  """
-  ident = "" if ident is None else str(ident)
+  ident = u"" if ident is None else six.text_type(ident)
+
+  # https://stackoverflow.com/a/517974/2482744
+  # Separate out combining characters (e.g. accents)
+  ident = unicodedata.normalize('NFKD', ident)
+  # then remove them completely
+  # This means that 'é' becomes 'e' instead of '_' or 'e_'
+  ident = "".join(c for c in ident if not unicodedata.combining(c))
+
+  # TODO allow non-ascii characters in identifiers when using Python 3
  ident = _invalid_ident_char_re.sub('_', ident).lstrip('_')
  ident = _invalid_ident_start_re.sub(prefix, ident)
  if not ident:
--- a/sandbox/grist/test_gencode.py
+++ b/sandbox/grist/test_gencode.py
@@ -1,3 +1,5 @@
+# coding=utf-8
+
 import unittest
 import difflib
 import re
@@ -6,7 +8,6 @@ from six.moves import xrange

 import gencode
 import identifiers
-import records
 import schema
 import table
 import testutil
@@ -83,6 +84,48 @@ class TestGenCode(unittest.TestCase):
    module = gcode.usercode
    self.assertTrue(isinstance(module.Students, table.UserTable))

+  def test_ident_combining_chars(self):
+    def check(label, ident):
+      self.assertEqual(ident, identifiers.pick_table_ident(label))
+      self.assertEqual(ident, identifiers.pick_col_ident(label))
+      self.assertEqual(ident.lower(), identifiers.pick_col_ident(label.lower()))
+
+    # Actual example table name from a user
+    # unicodedata.normalize can separate accents but doesn't help with Đ, which still becomes '_'
+    check(
+      u"Bảng_Đặc_Thù",
+      u"Bang__ac_Thu",
+    )
+
+    check(
+      u"Noëlle",
+      u"Noelle",
+    )
+    check(
+      u"Séamus",
+      u"Seamus",
+    )
+    check(
+      u"Hélène",
+      u"Helene",
+    )
+    check(
+      u"Dilâçar",
+      u"Dilacar",
+    )
+    check(
+      u"Erdoğan",
+      u"Erdogan",
+    )
+    check(
+      u"Ñwalme",
+      u"Nwalme",
+    )
+    check(
+      u"Árvíztűrő tükörfúrógép",
+      u"Arvizturo_tukorfurogep",
+    )
+
  def test_pick_col_ident(self):
    self.assertEqual(identifiers.pick_col_ident("asdf"), "asdf")
    self.assertEqual(identifiers.pick_col_ident(" a s==d!~@#$%^f"), "a_s_d_f")