(core) Remove accents when picking identifiers

Summary: Uses Python's unicodedata module to normalise a string and remove combining characters, leaving behind more ASCII letters and fewer underscores.

Test Plan: Added unit test

Reviewers: paulfitz

Reviewed By: paulfitz

Subscribers: dsagal

Differential Revision: https://phab.getgrist.com/D2994
This commit is contained in:
Alex Hall
2021-08-24 14:12:10 +02:00
parent b3636b97e2
commit 427a17d038
2 changed files with 62 additions and 4 deletions

View File

@@ -1,11 +1,17 @@
# coding=utf-8
"""
A module for creating and sanitizing identifiers
"""
import re
from string import ascii_uppercase
import itertools
import re
import unicodedata
from keyword import iskeyword
from string import ascii_uppercase
import six
import logger
# Logger named after this module; logger.INFO is presumably its level threshold
# (project-local logger API, not shown here -- confirm).
log = logger.Logger(__name__, logger.INFO)
# Matches each run of one or more characters that are not ASCII letters, digits, or
# underscore; e.g. _sanitize_ident replaces every such run with a single '_'.
_invalid_ident_char_re = re.compile(r'[^a-zA-Z0-9_]+')
@@ -20,7 +26,16 @@ def _sanitize_ident(ident, prefix="c", capitalize=False):
Returns empty string if there are no valid identifier characters, so consider using as
(_sanitize_ident(...) or "your_default").
"""
ident = "" if ident is None else str(ident)
ident = u"" if ident is None else six.text_type(ident)
# https://stackoverflow.com/a/517974/2482744
# Separate out combining characters (e.g. accents)
ident = unicodedata.normalize('NFKD', ident)
# then remove them completely
# This means that 'é' becomes 'e' instead of '_' or 'e_'
ident = "".join(c for c in ident if not unicodedata.combining(c))
# TODO allow non-ascii characters in identifiers when using Python 3
ident = _invalid_ident_char_re.sub('_', ident).lstrip('_')
ident = _invalid_ident_start_re.sub(prefix, ident)
if not ident: