mirror of
https://github.com/gristlabs/grist-core.git
synced 2024-10-27 20:44:07 +00:00
(core) Remove accents when picking identifiers
Summary: Uses Python's unicodedata module to normalise a string and remove combining characters, leaving behind more ASCII letters and fewer underscores. Test Plan: Added unit test. Reviewers: paulfitz. Reviewed By: paulfitz. Subscribers: dsagal. Differential Revision: https://phab.getgrist.com/D2994
This commit is contained in:
parent
b3636b97e2
commit
427a17d038
@ -1,11 +1,17 @@
|
|||||||
|
# coding=utf-8
|
||||||
"""
|
"""
|
||||||
A module for creating and sanitizing identifiers
|
A module for creating and sanitizing identifiers
|
||||||
"""
|
"""
|
||||||
import re
|
|
||||||
from string import ascii_uppercase
|
|
||||||
import itertools
|
import itertools
|
||||||
|
import re
|
||||||
|
import unicodedata
|
||||||
from keyword import iskeyword
|
from keyword import iskeyword
|
||||||
|
from string import ascii_uppercase
|
||||||
|
|
||||||
|
import six
|
||||||
|
|
||||||
import logger
|
import logger
|
||||||
|
|
||||||
log = logger.Logger(__name__, logger.INFO)
|
log = logger.Logger(__name__, logger.INFO)
|
||||||
|
|
||||||
_invalid_ident_char_re = re.compile(r'[^a-zA-Z0-9_]+')
|
_invalid_ident_char_re = re.compile(r'[^a-zA-Z0-9_]+')
|
||||||
@ -20,7 +26,16 @@ def _sanitize_ident(ident, prefix="c", capitalize=False):
|
|||||||
Returns empty string if there are no valid identifier characters, so consider using as
|
Returns empty string if there are no valid identifier characters, so consider using as
|
||||||
(_sanitize_ident(...) or "your_default").
|
(_sanitize_ident(...) or "your_default").
|
||||||
"""
|
"""
|
||||||
ident = "" if ident is None else str(ident)
|
ident = u"" if ident is None else six.text_type(ident)
|
||||||
|
|
||||||
|
# https://stackoverflow.com/a/517974/2482744
|
||||||
|
# Separate out combining characters (e.g. accents)
|
||||||
|
ident = unicodedata.normalize('NFKD', ident)
|
||||||
|
# then remove them completely
|
||||||
|
# This means that 'é' becomes 'e' instead of '_' or 'e_'
|
||||||
|
ident = "".join(c for c in ident if not unicodedata.combining(c))
|
||||||
|
|
||||||
|
# TODO allow non-ascii characters in identifiers when using Python 3
|
||||||
ident = _invalid_ident_char_re.sub('_', ident).lstrip('_')
|
ident = _invalid_ident_char_re.sub('_', ident).lstrip('_')
|
||||||
ident = _invalid_ident_start_re.sub(prefix, ident)
|
ident = _invalid_ident_start_re.sub(prefix, ident)
|
||||||
if not ident:
|
if not ident:
|
||||||
|
@ -1,3 +1,5 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
|
||||||
import unittest
|
import unittest
|
||||||
import difflib
|
import difflib
|
||||||
import re
|
import re
|
||||||
@ -6,7 +8,6 @@ from six.moves import xrange
|
|||||||
|
|
||||||
import gencode
|
import gencode
|
||||||
import identifiers
|
import identifiers
|
||||||
import records
|
|
||||||
import schema
|
import schema
|
||||||
import table
|
import table
|
||||||
import testutil
|
import testutil
|
||||||
@ -83,6 +84,48 @@ class TestGenCode(unittest.TestCase):
|
|||||||
module = gcode.usercode
|
module = gcode.usercode
|
||||||
self.assertTrue(isinstance(module.Students, table.UserTable))
|
self.assertTrue(isinstance(module.Students, table.UserTable))
|
||||||
|
|
||||||
|
def test_ident_combining_chars(self):
    """
    Verify that accented letters in a label are reduced to their plain base
    letters when picking table and column identifiers, instead of being
    replaced with underscores.
    """
    # Each entry is (label given by the user, identifier expected back).
    cases = (
        # Actual example table name from a user.
        # unicodedata.normalize can separate accents but doesn't help with Đ,
        # so that one letter still comes out as an underscore.
        (u"Bảng_Đặc_Thù", u"Bang__ac_Thu"),
        (u"Noëlle", u"Noelle"),
        (u"Séamus", u"Seamus"),
        (u"Hélène", u"Helene"),
        (u"Dilâçar", u"Dilacar"),
        (u"Erdoğan", u"Erdogan"),
        (u"Ñwalme", u"Nwalme"),
        (u"Árvíztűrő tükörfúrógép", u"Arvizturo_tukorfurogep"),
    )

    for label, expected in cases:
        # Table and column pickers must agree on the same label.
        self.assertEqual(expected, identifiers.pick_table_ident(label))
        self.assertEqual(expected, identifiers.pick_col_ident(label))
        # A lower-cased label must produce the lower-cased identifier.
        self.assertEqual(expected.lower(), identifiers.pick_col_ident(label.lower()))
|
||||||
|
|
||||||
def test_pick_col_ident(self):
|
def test_pick_col_ident(self):
|
||||||
self.assertEqual(identifiers.pick_col_ident("asdf"), "asdf")
|
self.assertEqual(identifiers.pick_col_ident("asdf"), "asdf")
|
||||||
self.assertEqual(identifiers.pick_col_ident(" a s==d!~@#$%^f"), "a_s_d_f")
|
self.assertEqual(identifiers.pick_col_ident(" a s==d!~@#$%^f"), "a_s_d_f")
|
||||||
|
Loading…
Reference in New Issue
Block a user