gristlabs_grist-core/sandbox/grist/identifiers.py
Alex Hall 427a17d038 (core) Remove accents when picking identifiers
Summary: Uses python unicodedata module to normalise a string and remove combining characters, leaving behind more ascii letters and fewer underscores

Test Plan: Added unit test

Reviewers: paulfitz

Reviewed By: paulfitz

Subscribers: dsagal

Differential Revision: https://phab.getgrist.com/D2994
2021-08-24 14:47:50 +02:00

133 lines
4.5 KiB
Python

# coding=utf-8
"""
A module for creating and sanitizing identifiers
"""
import itertools
import re
import unicodedata
from keyword import iskeyword
from string import ascii_uppercase
import six
import logger
# Logger for this module, built by the project-local `logger` module with logger.INFO.
log = logger.Logger(__name__, logger.INFO)
# Matches each run of one or more characters other than ASCII letters, digits and underscore.
_invalid_ident_char_re = re.compile(r'[^a-zA-Z0-9_]+')
# Zero-width match at the very start of a string that begins with a digit or an underscore;
# substituting it inserts text in front of the string without consuming any character.
_invalid_ident_start_re = re.compile(r'^(?=[0-9_])')
def _sanitize_ident(ident, prefix="c", capitalize=False):
  """
  Turns a suggested name into an identifier that is valid for python (and sqlite). Helper for
  the pick_*_ident functions.

  The suggestion is converted to text (None counts as empty). Accents and other combining
  characters are dropped, each run of characters outside [a-zA-Z0-9_] becomes one underscore,
  and leading underscores are stripped. If what is left starts with a digit, `prefix` is put in
  front of it. When `capitalize` is set, only the first character is upper-cased. Finally,
  `prefix` is prepended for as long as the result is a Python keyword or None/True/False.

  Returns an empty string if no valid identifier characters remain, so callers may want to use
  (_sanitize_ident(...) or "your_default").
  """
  text = six.text_type(ident) if ident is not None else u""

  # https://stackoverflow.com/a/517974/2482744
  # NFKD splits accented characters into a base character followed by combining characters.
  decomposed = unicodedata.normalize('NFKD', text)
  # Dropping the combining characters means an e with an acute accent becomes a plain 'e'
  # rather than '_' or 'e_'.
  text = "".join(ch for ch in decomposed if not unicodedata.combining(ch))

  # TODO allow non-ascii characters in identifiers when using Python 3
  text = _invalid_ident_char_re.sub('_', text)
  text = text.lstrip('_')
  text = _invalid_ident_start_re.sub(prefix, text)
  if not text:
    return text

  if capitalize:
    # Only touch the first letter; unlike str.title(), the other letters keep their case.
    text = text[0].capitalize() + text[1:]

  # Names that must not be assigned to. iskeyword misses None/True/False in Python 2.x (it
  # catches them in 3.x). Assigning to None is an error there, and while Python 2.x permits
  # assigning to True or False, we don't want users to be able to do that.
  reserved = ('None', 'True', 'False')
  while iskeyword(text) or text in reserved:
    text = prefix + text
  return text
_ends_in_digit_re = re.compile(r'\d$')
def _add_suffix(ident_base, avoid=set(), next_suffix=1):
"""
Helper which appends a numerical suffix to ident_base, incrementing it until the result doesn't
conflict with anything in the `avoid` set.
"""
if _ends_in_digit_re.search(ident_base):
ident_base += "_"
while True:
ident = "%s%d" % (ident_base, next_suffix)
if ident.upper() not in avoid:
return ident
next_suffix += 1
def _maybe_add_suffix(ident, avoid):
"""
Returns the first of ident, ident2, ident3 etc. that's not in the `avoid` set.
"""
return ident if (ident.upper() not in avoid) else _add_suffix(ident, avoid, 2)
def _uppercase(avoid):
return {name.upper() for name in avoid}
def pick_table_ident(ident, avoid=set()):
  """
  Builds a sanitized table identifier from a suggestion (which may be None), adding a numerical
  suffix if needed so that the result doesn't clash, ignoring case, with anything in the `avoid`
  set.

  Table identifiers start with a capital letter, and "T" is the prefix used when sanitizing.
  If nothing usable remains of the suggestion, the result is "Table" plus a number, counting
  from 1.
  """
  taken = _uppercase(avoid)
  sanitized = _sanitize_ident(ident, prefix="T", capitalize=True)
  if not sanitized:
    return _add_suffix("Table", taken, 1)
  return _maybe_add_suffix(sanitized, taken)
def pick_col_ident(ident, avoid=set()):
  """
  Builds a sanitized column identifier from a suggestion (which may be None), adding a numerical
  suffix if needed so that the result doesn't clash, ignoring case, with anything in the `avoid`
  set.

  "c" is the prefix used when sanitizing. If nothing usable remains of the suggestion, a
  generated letter-based name (see _gen_ident) is returned instead.
  """
  taken = _uppercase(avoid)
  sanitized = _sanitize_ident(ident, prefix="c")
  if not sanitized:
    return _gen_ident(taken)
  return _maybe_add_suffix(sanitized, taken)
def pick_col_ident_list(ident_list, avoid=set()):
  """
  Converts a list of suggested identifiers (any of which may be invalid) into a list of valid,
  sanitized column identifiers, in the same order. The results clash neither with anything in
  the `avoid` set nor with each other.
  """
  # _uppercase returns a fresh set, so recording picked names below never modifies the
  # caller's `avoid` (or the shared default).
  taken = _uppercase(avoid)
  picked = []
  for suggestion in ident_list:
    col_id = pick_col_ident(suggestion, avoid=taken)
    # Reserve this name so that later suggestions can't end up with the same identifier.
    taken.add(col_id.upper())
    picked.append(col_id)
  return picked
def _gen_ident(avoid):
  """
  Generates a valid identifier for use when no suggested identifier or default is available.

  Candidates follow a pattern similar to the one excel uses for naming columns, i.e.
  A, B, ... Z, AA, AB, ... AZ, BA, etc. The first candidate that doesn't conflict with any
  element of the `avoid` set (compared in upper case) is returned.
  """
  taken = _uppercase(avoid)
  # _make_letters never runs out, so next() always finds a free name.
  return next(name for name in _make_letters() if name not in taken)
def _make_letters():
length = 1
while True:
for letters in itertools.product(ascii_uppercase, repeat=length):
yield ''.join(letters)
length +=1