From 427a17d0388d5f5c4de4ed4f3287a48c210d7eed Mon Sep 17 00:00:00 2001 From: Alex Hall Date: Tue, 24 Aug 2021 14:12:10 +0200 Subject: [PATCH] (core) Remove accents when picking identifiers Summary: Uses Python's unicodedata module to normalise a string and remove combining characters, leaving behind more ASCII letters and fewer underscores Test Plan: Added unit test Reviewers: paulfitz Reviewed By: paulfitz Subscribers: dsagal Differential Revision: https://phab.getgrist.com/D2994 --- sandbox/grist/identifiers.py | 21 +++++++++++++--- sandbox/grist/test_gencode.py | 45 ++++++++++++++++++++++++++++++++++- 2 files changed, 62 insertions(+), 4 deletions(-) diff --git a/sandbox/grist/identifiers.py b/sandbox/grist/identifiers.py index f4696598..e4b847f0 100644 --- a/sandbox/grist/identifiers.py +++ b/sandbox/grist/identifiers.py @@ -1,11 +1,17 @@ +# coding=utf-8 """ A module for creating and sanitizing identifiers """ -import re -from string import ascii_uppercase import itertools +import re +import unicodedata from keyword import iskeyword +from string import ascii_uppercase + +import six + import logger + log = logger.Logger(__name__, logger.INFO) _invalid_ident_char_re = re.compile(r'[^a-zA-Z0-9_]+') @@ -20,7 +26,16 @@ def _sanitize_ident(ident, prefix="c", capitalize=False): Returns empty string if there are no valid identifier characters, so consider using as (_sanitize_ident(...) or "your_default"). """ - ident = "" if ident is None else str(ident) + ident = u"" if ident is None else six.text_type(ident) + + # https://stackoverflow.com/a/517974/2482744 + # Separate out combining characters (e.g.
accents) + ident = unicodedata.normalize('NFKD', ident) + # then remove them completely + # This means that 'é' becomes 'e' instead of '_' or 'e_' + ident = "".join(c for c in ident if not unicodedata.combining(c)) + + # TODO allow non-ascii characters in identifiers when using Python 3 ident = _invalid_ident_char_re.sub('_', ident).lstrip('_') ident = _invalid_ident_start_re.sub(prefix, ident) if not ident: diff --git a/sandbox/grist/test_gencode.py b/sandbox/grist/test_gencode.py index c73b76f1..3c9795c5 100644 --- a/sandbox/grist/test_gencode.py +++ b/sandbox/grist/test_gencode.py @@ -1,3 +1,5 @@ +# coding=utf-8 + import unittest import difflib import re @@ -6,7 +8,6 @@ from six.moves import xrange import gencode import identifiers -import records import schema import table import testutil @@ -83,6 +84,48 @@ class TestGenCode(unittest.TestCase): module = gcode.usercode self.assertTrue(isinstance(module.Students, table.UserTable)) + def test_ident_combining_chars(self): + def check(label, ident): + self.assertEqual(ident, identifiers.pick_table_ident(label)) + self.assertEqual(ident, identifiers.pick_col_ident(label)) + self.assertEqual(ident.lower(), identifiers.pick_col_ident(label.lower())) + + # Actual example table name from a user + # unicodedata.normalize can separate accents but doesn't help with Đ + check( + u"Bảng_Đặc_Thù", + u"Bang__ac_Thu", + ) + + check( + u"Noëlle", + u"Noelle", + ) + check( + u"Séamus", + u"Seamus", + ) + check( + u"Hélène", + u"Helene", + ) + check( + u"Dilâçar", + u"Dilacar", + ) + check( + u"Erdoğan", + u"Erdogan", + ) + check( + u"Ñwalme", + u"Nwalme", + ) + check( + u"Árvíztűrő tükörfúrógép", + u"Arvizturo_tukorfurogep", + ) + def test_pick_col_ident(self): self.assertEqual(identifiers.pick_col_ident("asdf"), "asdf") self.assertEqual(identifiers.pick_col_ident(" a s==d!~@#$%^f"), "a_s_d_f")