gristlabs_grist-core/sandbox/grist/functions/text.py

# -*- coding: UTF-8 -*-

import datetime
import numbers
import re

import dateutil.parser
import phonenumbers
import six
from six import unichr         # pylint: disable=redefined-builtin
from six.moves import xrange

from usertypes import AltText  # pylint: disable=import-error
from .math import ROUND
from .unimplemented import unimplemented


def CHAR(table_number):
  """
  Convert a number into a character according to the current Unicode table.
  Same as `unichr(number)`.

  >>> CHAR(65)
  u'A'
  >>> CHAR(33)
  u'!'
  """
  return unichr(table_number)


# See http://stackoverflow.com/a/93029/328565
_control_chars = ''.join(map(unichr, list(xrange(0,32)) + list(xrange(127,160))))
_control_char_re = re.compile('[%s]' % re.escape(_control_chars))

def CLEAN(text):
  """
  Returns the text with the non-printable characters removed.

  This removes both characters with values 0 through 31, and other Unicode characters in the
  "control characters" category.

  >>> CLEAN(CHAR(9) + "Monthly report" + CHAR(10))
  u'Monthly report'
  """
  return _control_char_re.sub('', text)


def CODE(string):
  """
  Returns the numeric Unicode map value of the first character in the string provided.
  Same as `ord(string[0])`.

  >>> CODE("A")
  65
  >>> CODE("!")
  33
  >>> CODE("!A")
  33
  """
  return ord(string[0])


def CONCATENATE(string, *more_strings):
  u"""
  Joins together any number of text strings into one string. Also available under the name
  `CONCAT`. Similar to the Python expression `"".join(array_of_strings)`.

  >>> CONCATENATE("Stream population for ", "trout", " ", "species", " is ", 32, "/mile.")
  u'Stream population for trout species is 32/mile.'
  >>> CONCATENATE("In ", 4, " days it is ", datetime.date(2016,1,1))
  u'In 4 days it is 2016-01-01'
  >>> CONCATENATE("abc")
  u'abc'
  >>> CONCATENATE(0, "abc")
  u'0abc'
  >>> assert CONCATENATE(2, u" crème ", u"brûlée") == u'2 crème brûlée'
  >>> assert CONCATENATE(2,  " crème ", u"brûlée") == u'2 crème brûlée'
  >>> assert CONCATENATE(2,  " crème ",  "brûlée") == u'2 crème brûlée'
  """
  return u''.join(
    val.decode('utf8') if isinstance(val, six.binary_type) else   # pylint:disable=no-member
    six.text_type(val)
    for val in (string,) + more_strings
  )


def CONCAT(string, *more_strings):
  """
  Joins together any number of text strings into one string. Also available under the name
  `CONCATENATE`. Similar to the Python expression `"".join(array_of_strings)`.

  >>> CONCAT("Stream population for ", "trout", " ", "species", " is ", 32, "/mile.")
  u'Stream population for trout species is 32/mile.'
  >>> CONCAT("In ", 4, " days it is ", datetime.date(2016,1,1))
  u'In 4 days it is 2016-01-01'
  >>> CONCAT("abc")
  u'abc'
  >>> CONCAT(0, "abc")
  u'0abc'
  >>> assert CONCAT(2, u" crème ", u"brûlée") == u'2 crème brûlée'
  """
  return CONCATENATE(string, *more_strings)

def DOLLAR(number, decimals=2):
  """
  Formats a number into a formatted dollar amount, with decimals rounded to the specified place (.
  If decimals value is omitted, it defaults to 2.

  >>> DOLLAR(1234.567)
  '$1,234.57'
  >>> DOLLAR(1234.567, -2)
  '$1,200'
  >>> DOLLAR(-1234.567, -2)
  '($1,200)'
  >>> DOLLAR(-0.123, 4)
  '($0.1230)'
  >>> DOLLAR(99.888)
  '$99.89'
  >>> DOLLAR(0)
  '$0.00'
  >>> DOLLAR(10, 0)
  '$10'
  """
  formatted = "${:,.{}f}".format(ROUND(abs(number), decimals), max(0, decimals))
  return formatted if number >= 0 else "(" + formatted + ")"


def EXACT(string1, string2):
  """
  Tests whether two strings are identical. Same as `string2 == string2`.

  >>> EXACT("word", "word")
  True
  >>> EXACT("Word", "word")
  False
  >>> EXACT("w ord", "word")
  False
  """
  return string1 == string2


def FIND(find_text, within_text, start_num=1):
  """
  Returns the position at which a string is first found within text.

  Find is case-sensitive. The returned position is 1 if within_text starts with find_text.
  Start_num specifies the character at which to start the search, defaulting to 1 (the first
  character of within_text).

  If find_text is not found, or start_num is invalid, raises ValueError.

  >>> FIND("M", "Miriam McGovern")
  1
  >>> FIND("m", "Miriam McGovern")
  6
  >>> FIND("M", "Miriam McGovern", 3)
  8
  >>> FIND(" #", "Hello world # Test")
  12
  >>> FIND("gle", "Google", 1)
  4
  >>> FIND("GLE", "Google", 1)
  Traceback (most recent call last):
  ...
  ValueError: substring not found
  >>> FIND("page", "homepage")
  5
  >>> FIND("page", "homepage", 6)
  Traceback (most recent call last):
  ...
  ValueError: substring not found
  """
  return within_text.index(find_text, start_num - 1) + 1


def FIXED(number, decimals=2, no_commas=False):
  """
  Formats a number with a fixed number of decimal places (2 by default), and commas.
  If no_commas is True, then omits the commas.

  >>> FIXED(1234.567, 1)
  '1,234.6'
  >>> FIXED(1234.567, -1)
  '1,230'
  >>> FIXED(-1234.567, -1, True)
  '-1230'
  >>> FIXED(44.332)
  '44.33'
  >>> FIXED(3521.478, 2, False)
  '3,521.48'
  >>> FIXED(-3521.478, 1, True)
  '-3521.5'
  >>> FIXED(3521.478, 0, True)
  '3521'
  >>> FIXED(3521.478, -2, True)
  '3500'
  """
  comma_flag = '' if no_commas else ','
  return "{:{}.{}f}".format(ROUND(number, decimals), comma_flag, max(0, decimals))


def LEFT(string, num_chars=1):
  """
  Returns a substring of length num_chars from the beginning of the given string. If num_chars is
  omitted, it is assumed to be 1. Same as `string[:num_chars]`.

  >>> LEFT("Sale Price", 4)
  'Sale'
  >>> LEFT('Swededn')
  'S'
  >>> LEFT('Text', -1)
  Traceback (most recent call last):
  ...
  ValueError: num_chars invalid
  """
  if num_chars < 0:
    raise ValueError("num_chars invalid")
  return string[:num_chars]


def LEN(text):
  """
  Returns the number of characters in a text string, or the number of items in a list. Same as
  [`len`](https://docs.python.org/3/library/functions.html#len) in python.
  See [Record Set](#recordset) for an example of using `len` on a list of records.

  >>> LEN("Phoenix, AZ")
  11
  >>> LEN("")
  0
  >>> LEN("     One   ")
  11
  """
  return len(text)


def LOWER(text):
  """
  Converts a specified string to lowercase. Same as `text.lower()`.

  >>> LOWER("E. E. Cummings")
  'e. e. cummings'
  >>> LOWER("Apt. 2B")
  'apt. 2b'
  """
  return text.lower()


def MID(text, start_num, num_chars):
  """
  Returns a segment of a string, starting at start_num. The first character in text has
  start_num 1.

  >>> MID("Fluid Flow", 1, 5)
  'Fluid'
  >>> MID("Fluid Flow", 7, 20)
  'Flow'
  >>> MID("Fluid Flow", 20, 5)
  ''
  >>> MID("Fluid Flow", 0, 5)
  Traceback (most recent call last):
  ...
  ValueError: start_num invalid
  """
  if start_num < 1:
    raise ValueError("start_num invalid")
  return text[start_num - 1 : start_num - 1 + num_chars]


output_formats = {
    "+":        phonenumbers.PhoneNumberFormat.INTERNATIONAL,
    "INTL":     phonenumbers.PhoneNumberFormat.INTERNATIONAL,
    "#":        phonenumbers.PhoneNumberFormat.NATIONAL,
    "NATL":     phonenumbers.PhoneNumberFormat.NATIONAL,
    "*":        phonenumbers.PhoneNumberFormat.E164,
    "E164":     phonenumbers.PhoneNumberFormat.E164,
    "tel":      phonenumbers.PhoneNumberFormat.RFC3966,
    "RFC3966":  phonenumbers.PhoneNumberFormat.RFC3966,
}

def PHONE_FORMAT(value, country=None, format=None):  # pylint: disable=redefined-builtin
  """
  Formats a phone number.

  With no optional arguments, the number must start with "+" and the international dialing prefix,
  and will be formatted as an international number, e.g. `+12345678901` becomes `+1 234-567-8901`.

  The `country` argument allows specifying a 2-letter country code (e.g. "US" or "GB") for
  interpreting phone numbers that don't start with "+". E.g. `PHONE_FORMAT('2025555555', 'US')`
  would be seen as a US number and formatted as "(202) 555-5555". Phone numbers that start with
  "+" ignore `country`. E.g. `PHONE_FORMAT('+33555555555', 'US')` is a French number because '+33'
  is the international prefix for France.

  The `format` argument specifies the output format, according to this table:

    - `"#"` or `"NATL"` (default) - use the national format, without the international dialing
      prefix, when possible. E.g. `(234) 567-8901` for "US", or `02 34 56 78 90` for "FR". If
      `country` is omitted, or the number does not correspond to the given country, the
      international format is used instead.
    - `"+"` or `"INTL"` - international format, e.g. `+1 234-567-8901` or
      `+33 2 34 56 78 90`.
    - `"*"` or `"E164"` - E164 format, like international but with no separators, e.g.
      `+12345678901`.
    - `"tel"` or `"RFC3966"` - format suitable to use as a [hyperlink](col-types.md#hyperlinks),
      e.g. 'tel:+1-234-567-8901'.

  When specifying the `format` argument, you may omit the `country` argument. I.e.
  `PHONE_FORMAT(value, "tel")` is equivalent to `PHONE_FORMAT(value, None, "tel")`.

  For more details, see the [phonenumbers](https://github.com/daviddrysdale/python-phonenumbers)
  Python library, which underlies this function.

  >>> PHONE_FORMAT("+12345678901")
  u'+1 234-567-8901'
  >>> PHONE_FORMAT("2345678901", "US")
  u'(234) 567-8901'
  >>> PHONE_FORMAT("2345678901", "GB")
  u'023 4567 8901'
  >>> PHONE_FORMAT("2345678901", "GB", "+")
  u'+44 23 4567 8901'
  >>> PHONE_FORMAT("+442345678901", "GB")
  u'023 4567 8901'
  >>> PHONE_FORMAT("+12345678901", "GB")
  u'+1 234-567-8901'
  >>> PHONE_FORMAT("(234) 567-8901")    # doctest: +IGNORE_EXCEPTION_DETAIL
  Traceback (most recent call last):
  ...
  NumberParseException: (0) Missing or invalid default region.
  >>> PHONE_FORMAT("(234)567 89-01", "US", "tel")
  u'tel:+1-234-567-8901'
  >>> PHONE_FORMAT("2/3456/7890", "FR", '#')
  u'02 34 56 78 90'
  >>> PHONE_FORMAT("+33234567890", '#')
  u'+33 2 34 56 78 90'
  >>> PHONE_FORMAT("+33234567890", 'tel')
  u'tel:+33-2-34-56-78-90'
  >>> PHONE_FORMAT("tel:+1-234-567-8901", country="US", format="*")
  u'+12345678901'
  >>> PHONE_FORMAT(33234567890)
  Traceback (most recent call last):
  ...
  TypeError: Phone number must be a text value. \
If formatting a value from a Numeric column, convert that column to Text first.
  """
  if not value:
    return value

  if not isinstance(value, six.string_types):
    raise TypeError("Phone number must be a text value. " +
      "If formatting a value from a Numeric column, convert that column to Text first.")

  if format is None and country in output_formats:
    format = country
    country = None
  parsed = phonenumbers.parse(str(value), country)
  out_fmt = output_formats.get(format or "#")
  if out_fmt is None:
    raise ValueError("Unrecognized phone format; try +, INTL, #, NATL, *, E164, tel, or RFC3966")

  if out_fmt == phonenumbers.PhoneNumberFormat.NATIONAL and not country:
    # With no country, we lose info in NATIONAL format (because numbers must be specified with an
    # international prefix, and the output would discard it). Use INTERNATIONAL instead.
    out_fmt = phonenumbers.PhoneNumberFormat.INTERNATIONAL

  result = phonenumbers.format_number(parsed, out_fmt)

  # If using a national format with a country, check that we don't garble numbers with a different
  # international prefix. If so, use an international format. E.g. for
  # PHONE_FORMAT('+12345678901', 'FR'), the output should include the US dialing prefix.
  if (out_fmt == phonenumbers.PhoneNumberFormat.NATIONAL and country and
      phonenumbers.parse(result, country) != parsed):
    result = phonenumbers.format_number(parsed, phonenumbers.PhoneNumberFormat.INTERNATIONAL)

  return result


def PROPER(text):
  """
  Capitalizes each word in a specified string. It converts the first letter of each word to
  uppercase, and all other letters to lowercase. Same as `text.title()`.

  >>> PROPER('this is a TITLE')
  'This Is A Title'
  >>> PROPER('2-way street')
  '2-Way Street'
  >>> PROPER('76BudGet')
  '76Budget'
  """
  return text.title()


def REGEXEXTRACT(text, regular_expression):
  """
  Extracts the first part of text that matches regular_expression.

  >>> REGEXEXTRACT("Google Doc 101", "[0-9]+")
  '101'
  >>> REGEXEXTRACT("The price today is $826.25", "[0-9]*\\.[0-9]+[0-9]+")
  '826.25'

  If there is a parenthesized expression, it is returned instead of the whole match.
  >>> REGEXEXTRACT("(Content) between brackets", "\\(([A-Za-z]+)\\)")
  'Content'
  >>> REGEXEXTRACT("Foo", "Bar")
  Traceback (most recent call last):
  ...
  ValueError: REGEXEXTRACT text does not match
  """
  m = re.search(regular_expression, text)
  if not m:
    raise ValueError("REGEXEXTRACT text does not match")
  return m.group(1) if m.lastindex else m.group(0)


def REGEXMATCH(text, regular_expression):
  """
  Returns whether a piece of text matches a regular expression.

  >>> REGEXMATCH("Google Doc 101", "[0-9]+")
  True
  >>> REGEXMATCH("Google Doc", "[0-9]+")
  False
  >>> REGEXMATCH("The price today is $826.25", "[0-9]*\\.[0-9]+[0-9]+")
  True
  >>> REGEXMATCH("(Content) between brackets", "\\(([A-Za-z]+)\\)")
  True
  >>> REGEXMATCH("Foo", "Bar")
  False
  """
  return bool(re.search(regular_expression, text))


def REGEXREPLACE(text, regular_expression, replacement):
  """
  Replaces all parts of text matching the given regular expression with replacement text.

  >>> REGEXREPLACE("Google Doc 101", "[0-9]+", "777")
  'Google Doc 777'
  >>> REGEXREPLACE("Google Doc", "[0-9]+", "777")
  'Google Doc'
  >>> REGEXREPLACE("The price is $826.25", "[0-9]*\\.[0-9]+[0-9]+", "315.75")
  'The price is $315.75'
  >>> REGEXREPLACE("(Content) between brackets", "\\(([A-Za-z]+)\\)", "Word")
  'Word between brackets'
  >>> REGEXREPLACE("Foo", "Bar", "Baz")
  'Foo'
  """
  return re.sub(regular_expression, replacement, text)


def REPLACE(text, position, length, new_text):
  """
  Replaces part of a text string with a different text string. Position is counted from 1.

  >>> REPLACE("abcdefghijk", 6, 5, "*")
  'abcde*k'
  >>> REPLACE("2009", 3, 2, "10")
  '2010'
  >>> REPLACE('123456', 1, 3, '@')
  '@456'
  >>> REPLACE('foo', 1, 0, 'bar')
  'barfoo'
  >>> REPLACE('foo', 0, 1, 'bar')
  Traceback (most recent call last):
  ...
  ValueError: position invalid
  """
  if position < 1:
    raise ValueError("position invalid")
  return text[:position - 1] + new_text + text[position - 1 + length:]


def REPT(text, number_times):
  """
  Returns specified text repeated a number of times. Same as `text * number_times`.

  The result of the REPT function cannot be longer than 32767 characters, or it raises a
  ValueError.

  >>> REPT("*-", 3)
  '*-*-*-'
  >>> REPT('-', 10)
  '----------'
  >>> REPT('-', 0)
  ''
  >>> len(REPT('---', 10000))
  30000
  >>> REPT('---', 11000)
  Traceback (most recent call last):
  ...
  ValueError: number_times invalid
  >>> REPT('-', -1)
  Traceback (most recent call last):
  ...
  ValueError: number_times invalid
  """
  if number_times < 0 or len(text) * number_times > 32767:
    raise ValueError("number_times invalid")
  return text * int(number_times)


def RIGHT(string, num_chars=1):
  """
  Returns a substring of length num_chars from the end of a specified string. If num_chars is
  omitted, it is assumed to be 1. Same as `string[-num_chars:]`.

  >>> RIGHT("Sale Price", 5)
  'Price'
  >>> RIGHT('Stock Number')
  'r'
  >>> RIGHT('Text', 100)
  'Text'
  >>> RIGHT('Text', -1)
  Traceback (most recent call last):
  ...
  ValueError: num_chars invalid
  """
  if num_chars < 0:
    raise ValueError("num_chars invalid")
  return string[-num_chars:]


def SEARCH(find_text, within_text, start_num=1):
  """
  Returns the position at which a string is first found within text, ignoring case.

  Find is case-sensitive. The returned position is 1 if within_text starts with find_text.
  Start_num specifies the character at which to start the search, defaulting to 1 (the first
  character of within_text).

  If find_text is not found, or start_num is invalid, raises ValueError.
  >>> SEARCH("e", "Statements", 6)
  7
  >>> SEARCH("margin", "Profit Margin")
  8
  >>> SEARCH(" ", "Profit Margin")
  7
  >>> SEARCH('"', 'The "boss" is here.')
  5
  >>> SEARCH("gle", "Google")
  4
  >>> SEARCH("GLE", "Google")
  4
  """
  # .lower() isn't always correct for unicode. See http://stackoverflow.com/a/29247821/328565
  return within_text.lower().index(find_text.lower(), start_num - 1) + 1


def SUBSTITUTE(text, old_text, new_text, instance_num=None):
  u"""
  Replaces existing text with new text in a string. It is useful when you know the substring of
  text to replace. Use REPLACE when you know the position of text to replace.

  If instance_num is given, it specifies which occurrence of old_text to replace. If omitted, all
  occurrences are replaced.

  Same as `text.replace(old_text, new_text)` when instance_num is omitted.

  >>> SUBSTITUTE("Sales Data", "Sales", "Cost")
  u'Cost Data'
  >>> SUBSTITUTE("Quarter 1, 2008", "1", "2", 1)
  u'Quarter 2, 2008'
  >>> SUBSTITUTE("Quarter 1, 2011", "1", "2", 3)
  u'Quarter 1, 2012'

  More tests:
  >>> SUBSTITUTE("Hello world", "", "-")
  u'Hello world'
  >>> SUBSTITUTE("Hello world", " ", "-")
  u'Hello-world'
  >>> SUBSTITUTE("Hello world", " ", 12.1)
  u'Hello12.1world'
  >>> SUBSTITUTE(u"Hello world", u" ", 12.1)
  u'Hello12.1world'
  >>> SUBSTITUTE("Hello world", "world", "")
  u'Hello '
  >>> SUBSTITUTE("Hello", "world", "")
  u'Hello'

  Overlapping matches are all counted when looking for instance_num.
  >>> SUBSTITUTE('abababab', 'abab', 'xxxx')
  u'xxxxxxxx'
  >>> SUBSTITUTE('abababab', 'abab', 'xxxx', 1)
  u'xxxxabab'
  >>> SUBSTITUTE('abababab', 'abab', 'xxxx', 2)
  u'abxxxxab'
  >>> SUBSTITUTE('abababab', 'abab', 'xxxx', 3)
  u'ababxxxx'
  >>> SUBSTITUTE('abababab', 'abab', 'xxxx', 4)
  u'abababab'
  >>> SUBSTITUTE('abababab', 'abab', 'xxxx', 0)
  Traceback (most recent call last):
  ...
  ValueError: instance_num invalid
  >>> SUBSTITUTE( "crème",  "è", "e")
  u'creme'
  >>> SUBSTITUTE(u"crème", u"è", "e")
  u'creme'
  >>> SUBSTITUTE(u"crème",  "è", "e")
  u'creme'
  >>> SUBSTITUTE( "crème", u"è", "e")
  u'creme'
  """
  text = six.text_type(text)
  old_text = six.text_type(old_text)
  new_text = six.text_type(new_text)

  if not old_text:
    return text

  if instance_num is None:
    return text.replace(old_text, new_text)

  if instance_num <= 0:
    raise ValueError("instance_num invalid")

  # No trivial way to replace nth occurrence.
  i = -1
  for c in xrange(instance_num):
    i = text.find(old_text, i + 1)
    if i < 0:
      return text
  return text[:i] + new_text + text[i + len(old_text):]


def T(value):
  """
  Returns value if value is text, or the empty string when value is not text.

  >>> T('Text')
  u'Text'
  >>> T(826)
  u''
  >>> T('826')
  u'826'
  >>> T(False)
  u''
  >>> T('100 points')
  u'100 points'
  >>> T(AltText('Text'))
  u'Text'
  >>> T(float('nan'))
  u''
  """
  return (value.decode('utf8') if isinstance(value, six.binary_type) else
          value if isinstance(value, six.text_type) else
          six.text_type(value) if isinstance(value, AltText) else u"")

def TASTEME(food):
  chews = re.findall(r'\b[A-Z]+\b', food.upper())
  claw = slice(2, None)
  spit = lambda chow: chow[claw]
  return (chews or None) and not all(fang != snap
    for bite in chews for fang, snap in zip(bite, spit(bite)))


@unimplemented
def TEXT(number, format_type):    # pylint: disable=unused-argument
  """
  Converts a number into text according to a specified format. It is not yet implemented in
  Grist. You can use the similar Python functions str() to convert numbers into strings, and
  optionally format() to specify the number format.
  """
  raise NotImplementedError()


_trim_re = re.compile(r'  +')

def TRIM(text):
  """
  Removes all spaces from text except for single spaces between words. Note that TRIM does not
  remove other whitespace such as tab or newline characters.

  >>> TRIM(" First Quarter\\n    Earnings     ")
  'First Quarter\\n Earnings'
  >>> TRIM("")
  ''
  """
  return _trim_re.sub(' ', text.strip())


def UPPER(text):
  """
  Converts a specified string to uppercase. Same as `text.lower()`.

  >>> UPPER("e. e. cummings")
  'E. E. CUMMINGS'
  >>> UPPER("Apt. 2B")
  'APT. 2B'
  """
  return text.upper()


def VALUE(text):
  """
  Converts a string in accepted date, time or number formats into a number or date.

  >>> VALUE("$1,000")
  1000
  >>> assert VALUE("16:48:00") - VALUE("12:00:00") == datetime.timedelta(0, 17280)
  >>> VALUE("01/01/2012")
  datetime.datetime(2012, 1, 1, 0, 0)
  >>> VALUE("")
  0
  >>> VALUE(0)
  0
  >>> VALUE("826")
  826
  >>> VALUE("-826.123123123")
  -826.123123123
  >>> VALUE(float('nan'))
  nan
  >>> VALUE("Invalid")
  Traceback (most recent call last):
  ...
  ValueError: text cannot be parsed to a number
  >>> VALUE("13/13/13")
  Traceback (most recent call last):
  ...
  ValueError: text cannot be parsed to a number
  """
  # This is not particularly robust, but makes an attempt to handle a number of cases: numbers,
  # including optional comma separators, dates/times, leading dollar-sign.
  if isinstance(text, (numbers.Number, datetime.date)):
    return text
  text = text.strip().lstrip('$')
  nocommas = text.replace(',', '')
  if nocommas == "":
    return 0

  try:
    return int(nocommas)
  except ValueError:
    pass

  try:
    return float(nocommas)
  except ValueError:
    pass

  try:
    return dateutil.parser.parse(text)
  except ValueError:
    pass

  raise ValueError('text cannot be parsed to a number')