(core) Remove accents when picking identifiers

Summary: Uses Python's unicodedata module to normalise a string and remove combining characters, leaving behind more ASCII letters and fewer underscores. Test Plan: Added unit test. Reviewers: paulfitz. Reviewed By: paulfitz. Subscribers: dsagal. Differential Revision: https://phab.getgrist.com/D2994
2026-03-02 04:09:24 +00:00 · 2021-08-24 14:12:10 +02:00
parent b3636b97e2
commit 427a17d038
2 changed files with 62 additions and 4 deletions
--- a/sandbox/grist/identifiers.py
+++ b/sandbox/grist/identifiers.py
@@ -1,11 +1,17 @@
+# coding=utf-8
 """
 A module for creating and sanitizing identifiers
 """
-import re
-from string import ascii_uppercase
 import itertools
+import re
+import unicodedata
 from keyword import iskeyword
+from string import ascii_uppercase
+
+import six
+
 import logger
+
 log = logger.Logger(__name__, logger.INFO)

 _invalid_ident_char_re = re.compile(r'[^a-zA-Z0-9_]+')
@@ -20,7 +26,16 @@ def _sanitize_ident(ident, prefix="c", capitalize=False):
  Returns empty string if there are no valid identifier characters, so consider using as
  (_sanitize_ident(...) or "your_default").
  """
-  ident = "" if ident is None else str(ident)
+  ident = u"" if ident is None else six.text_type(ident)
+
+  # https://stackoverflow.com/a/517974/2482744
+  # Separate out combining characters (e.g. accents)
+  ident = unicodedata.normalize('NFKD', ident)
+  # then remove them completely
+  # This means that 'é' becomes 'e' instead of '_' or 'e_'
+  ident = "".join(c for c in ident if not unicodedata.combining(c))
+
+  # TODO allow non-ascii characters in identifiers when using Python 3
  ident = _invalid_ident_char_re.sub('_', ident).lstrip('_')
  ident = _invalid_ident_start_re.sub(prefix, ident)
  if not ident: