(core) Remove accents when picking identifiers

Summary: Uses Python's unicodedata module to normalise a string and remove combining characters, leaving behind more ASCII letters and fewer underscores. Test Plan: Added unit test. Reviewers: paulfitz. Reviewed By: paulfitz. Subscribers: dsagal. Differential Revision: https://phab.getgrist.com/D2994
2025-06-13 20:53:59 +00:00 · 2021-08-24 14:12:10 +02:00 · 2021-08-24 14:12:10 +02:00 · 427a17d038
commit 427a17d038
parent b3636b97e2
2 changed files with 62 additions and 4 deletions
--- a/sandbox/grist/identifiers.py
+++ b/sandbox/grist/identifiers.py
@ -1,11 +1,17 @@
 # coding=utf-8
 """
 A module for creating and sanitizing identifiers
 """
 import re
 from string import ascii_uppercase
 import itertools
 import re
 import unicodedata
 from keyword import iskeyword
 from string import ascii_uppercase
 import six
 import logger
 log = logger.Logger(__name__, logger.INFO)
 _invalid_ident_char_re = re.compile(r'[^a-zA-Z0-9_]+')
@ -20,7 +26,16 @@ def _sanitize_ident(ident, prefix="c", capitalize=False):
  Returns empty string if there are no valid identifier characters, so consider using as
  (_sanitize_ident(...) or "your_default").
  """
-  ident = "" if ident is None else str(ident)
+  ident = u"" if ident is None else six.text_type(ident)
  # https://stackoverflow.com/a/517974/2482744
  # Separate out combining characters (e.g. accents)
  ident = unicodedata.normalize('NFKD', ident)
  # then remove them completely
  # This means that 'é' becomes 'e' instead of '_' or 'e_'
  ident = "".join(c for c in ident if not unicodedata.combining(c))
  # TODO allow non-ascii characters in identifiers when using Python 3
  ident = _invalid_ident_char_re.sub('_', ident).lstrip('_')
  ident = _invalid_ident_start_re.sub(prefix, ident)
  if not ident:
--- a/sandbox/grist/test_gencode.py
+++ b/sandbox/grist/test_gencode.py
@ -1,3 +1,5 @@
 # coding=utf-8
 import unittest
 import difflib
 import re
@ -6,7 +8,6 @@ from six.moves import xrange
 import gencode
 import identifiers
 import records
 import schema
 import table
 import testutil
@ -83,6 +84,48 @@ class TestGenCode(unittest.TestCase):
    module = gcode.usercode
    self.assertTrue(isinstance(module.Students, table.UserTable))
  def test_ident_combining_chars(self):
    # Accented labels should yield identifiers with the plain base letters.
    # Each entry pairs a label with the identifier expected from both the
    # table and the column pickers.
    cases = [
      # Actual example table name from a user.
      # unicodedata.normalize can separate accents but doesn't help with Đ,
      # which is why an extra underscore appears in the expected identifier.
      (u"Bảng_Đặc_Thù", u"Bang__ac_Thu"),
      (u"Noëlle", u"Noelle"),
      (u"Séamus", u"Seamus"),
      (u"Hélène", u"Helene"),
      (u"Dilâçar", u"Dilacar"),
      (u"Erdoğan", u"Erdogan"),
      (u"Ñwalme", u"Nwalme"),
      (u"Árvíztűrő tükörfúrógép", u"Arvizturo_tukorfurogep"),
    ]
    for label, expected in cases:
      self.assertEqual(expected, identifiers.pick_table_ident(label))
      self.assertEqual(expected, identifiers.pick_col_ident(label))
      # The lower-cased label must map to the lower-cased identifier as well.
      self.assertEqual(expected.lower(), identifiers.pick_col_ident(label.lower()))
  def test_pick_col_ident(self):
    self.assertEqual(identifiers.pick_col_ident("asdf"), "asdf")
    self.assertEqual(identifiers.pick_col_ident(" a s==d!~@#$%^f"), "a_s_d_f")