(core) Remove accents when picking identifiers

Summary: Uses Python's unicodedata module to normalise a string and remove combining characters, leaving behind more ASCII letters and fewer underscores.

Test Plan: Added unit test

Reviewers: paulfitz

Reviewed By: paulfitz

Subscribers: dsagal

Differential Revision: https://phab.getgrist.com/D2994
This commit is contained in:
Alex Hall 2021-08-24 14:12:10 +02:00
parent b3636b97e2
commit 427a17d038
2 changed files with 62 additions and 4 deletions

View File

@ -1,11 +1,17 @@
# coding=utf-8
"""
A module for creating and sanitizing identifiers
"""
import re
from string import ascii_uppercase
import itertools
import re
import unicodedata
from keyword import iskeyword
from string import ascii_uppercase
import six
import logger
# Module-level logger, constructed with this module's name and logger.INFO.
log = logger.Logger(__name__, logger.INFO)
# Matches each run of one or more characters that are not ASCII letters, digits or underscores.
_invalid_ident_char_re = re.compile(r'[^a-zA-Z0-9_]+')
@ -20,7 +26,16 @@ def _sanitize_ident(ident, prefix="c", capitalize=False):
Returns empty string if there are no valid identifier characters, so consider using as
(_sanitize_ident(...) or "your_default").
"""
ident = "" if ident is None else str(ident)
ident = u"" if ident is None else six.text_type(ident)
# https://stackoverflow.com/a/517974/2482744
# Separate out combining characters (e.g. accents)
ident = unicodedata.normalize('NFKD', ident)
# then remove them completely
# This means that 'é' becomes 'e' instead of '_' or 'e_'
ident = "".join(c for c in ident if not unicodedata.combining(c))
# TODO allow non-ascii characters in identifiers when using Python 3
ident = _invalid_ident_char_re.sub('_', ident).lstrip('_')
ident = _invalid_ident_start_re.sub(prefix, ident)
if not ident:

View File

@ -1,3 +1,5 @@
# coding=utf-8
import unittest
import difflib
import re
@ -6,7 +8,6 @@ from six.moves import xrange
import gencode
import identifiers
import records
import schema
import table
import testutil
@ -83,6 +84,48 @@ class TestGenCode(unittest.TestCase):
module = gcode.usercode
self.assertTrue(isinstance(module.Students, table.UserTable))
def test_ident_combining_chars(self):
    # Each pair is (label, expected identifier). For every pair, both
    # pick_table_ident and pick_col_ident must produce the expected identifier
    # from the label, and pick_col_ident must produce the lowercased expected
    # identifier from the lowercased label.
    cases = [
        # Actual example table name from a user
        # unicodedata.normalize can separate accents but doesn't help with Đ
        (u"Bảng_Đặc_Thù", u"Bang__ac_Thu"),
        (u"Noëlle", u"Noelle"),
        (u"Séamus", u"Seamus"),
        (u"Hélène", u"Helene"),
        (u"Dilâçar", u"Dilacar"),
        (u"Erdoğan", u"Erdogan"),
        (u"Ñwalme", u"Nwalme"),
        (u"Árvíztűrő tükörfúrógép", u"Arvizturo_tukorfurogep"),
    ]
    for raw_label, expected in cases:
        self.assertEqual(expected, identifiers.pick_table_ident(raw_label))
        self.assertEqual(expected, identifiers.pick_col_ident(raw_label))
        self.assertEqual(expected.lower(), identifiers.pick_col_ident(raw_label.lower()))
def test_pick_col_ident(self):
self.assertEqual(identifiers.pick_col_ident("asdf"), "asdf")
self.assertEqual(identifiers.pick_col_ident(" a s==d!~@#$%^f"), "a_s_d_f")