gristlabs_grist-core/sandbox/grist/functions/text.py

# -*- coding: UTF-8 -*-

import datetime
import numbers
import re

import dateutil.parser
import phonenumbers
import six
from six import unichr         # pylint: disable=redefined-builtin
from six.moves import xrange

from usertypes import AltText  # pylint: disable=import-error
from .math import ROUND
from .unimplemented import unimplemented


def CHAR(table_number):
  """
  Returns the character whose Unicode code point is the given number.
  Same as `unichr(number)`.

  >>> CHAR(65)
  u'A'
  >>> CHAR(33)
  u'!'
  """
  character = unichr(table_number)
  return character


# All code points in the ranges 0-31 and 127-159, joined into one string and wrapped in a regex
# character class. See http://stackoverflow.com/a/93029/328565
_control_chars = ''.join(
  unichr(code) for code in list(xrange(0, 32)) + list(xrange(127, 160))
)
_control_char_re = re.compile('[%s]' % re.escape(_control_chars))

def CLEAN(text):
  """
  Returns the text with the non-printable characters removed.

  The characters removed are those with values 0 through 31, as well as the further Unicode
  "control characters" with values 127 through 159.

  >>> CLEAN(CHAR(9) + "Monthly report" + CHAR(10))
  u'Monthly report'
  """
  cleaned = _control_char_re.sub('', text)
  return cleaned


def CODE(string):
  """
  Returns the numeric Unicode value of the first character of the given string; any further
  characters are ignored. Same as `ord(string[0])`.

  >>> CODE("A")
  65
  >>> CODE("!")
  33
  >>> CODE("!A")
  33
  """
  first_char = string[0]
  return ord(first_char)


def CONCATENATE(string, *more_strings):
  u"""
  Joins together any number of values into a single text string. Also available under the name
  `CONCAT`. Similar to the Python expression `"".join(array_of_strings)`, except that non-text
  values are converted to text first.

  >>> CONCATENATE("Stream population for ", "trout", " ", "species", " is ", 32, "/mile.")
  u'Stream population for trout species is 32/mile.'
  >>> CONCATENATE("In ", 4, " days it is ", datetime.date(2016,1,1))
  u'In 4 days it is 2016-01-01'
  >>> CONCATENATE("abc")
  u'abc'
  >>> CONCATENATE(0, "abc")
  u'0abc'
  >>> assert CONCATENATE(2, u" crème ", u"brûlée") == u'2 crème brûlée'
  >>> assert CONCATENATE(2,  " crème ", u"brûlée") == u'2 crème brûlée'
  >>> assert CONCATENATE(2,  " crème ",  "brûlée") == u'2 crème brûlée'
  """
  pieces = []
  for val in (string,) + more_strings:
    if isinstance(val, six.binary_type):
      # Byte strings are decoded as UTF-8 rather than converted with text_type().
      pieces.append(val.decode('utf8'))     # pylint:disable=no-member
    else:
      pieces.append(six.text_type(val))
  return u''.join(pieces)


def CONCAT(string, *more_strings):
  """
  Joins together any number of values into a single text string. This is an alias of
  `CONCATENATE`, to which all arguments are forwarded unchanged. Similar to the Python
  expression `"".join(array_of_strings)`.

  >>> CONCAT("Stream population for ", "trout", " ", "species", " is ", 32, "/mile.")
  u'Stream population for trout species is 32/mile.'
  >>> CONCAT("In ", 4, " days it is ", datetime.date(2016,1,1))
  u'In 4 days it is 2016-01-01'
  >>> CONCAT("abc")
  u'abc'
  >>> CONCAT(0, "abc")
  u'0abc'
  >>> assert CONCAT(2, u" crème ", u"brûlée") == u'2 crème brûlée'
  """
  all_values = (string,) + more_strings
  return CONCATENATE(*all_values)

def DOLLAR(number, decimals=2):
  """
  Formats a number as a dollar amount with thousands separators, rounded to the specified number
  of decimal places. If decimals is omitted, it defaults to 2. Negative amounts are shown in
  parentheses rather than with a minus sign.

  >>> DOLLAR(1234.567)
  '$1,234.57'
  >>> DOLLAR(1234.567, -2)
  '$1,200'
  >>> DOLLAR(-1234.567, -2)
  '($1,200)'
  >>> DOLLAR(-0.123, 4)
  '($0.1230)'
  >>> DOLLAR(99.888)
  '$99.89'
  >>> DOLLAR(0)
  '$0.00'
  >>> DOLLAR(10, 0)
  '$10'
  """
  magnitude = ROUND(abs(number), decimals)
  # A negative `decimals` affects only the rounding; no fractional digits are printed then.
  shown_places = max(0, decimals)
  formatted = "${:,.{}f}".format(magnitude, shown_places)
  if number >= 0:
    return formatted
  return "(" + formatted + ")"


def EXACT(string1, string2):
  """
  Tests whether two strings are identical, including case and spacing.
  Same as `string1 == string2`.

  >>> EXACT("word", "word")
  True
  >>> EXACT("Word", "word")
  False
  >>> EXACT("w ord", "word")
  False
  """
  identical = (string1 == string2)
  return identical


def FIND(find_text, within_text, start_num=1):
  """
  Returns the position at which a string is first found within text.

  Find is case-sensitive. The returned position is 1 if within_text starts with find_text.
  Start_num specifies the character at which to start the search, defaulting to 1 (the first
  character of within_text).

  If find_text is not found, or start_num is invalid (less than 1), raises ValueError.

  >>> FIND("M", "Miriam McGovern")
  1
  >>> FIND("m", "Miriam McGovern")
  6
  >>> FIND("M", "Miriam McGovern", 3)
  8
  >>> FIND(" #", "Hello world # Test")
  12
  >>> FIND("gle", "Google", 1)
  4
  >>> FIND("GLE", "Google", 1)
  Traceback (most recent call last):
  ...
  ValueError: substring not found
  >>> FIND("page", "homepage")
  5
  >>> FIND("page", "homepage", 6)
  Traceback (most recent call last):
  ...
  ValueError: substring not found
  >>> FIND("e", "Google", 0)
  Traceback (most recent call last):
  ...
  ValueError: start_num invalid
  """
  # Without this guard, start_num <= 0 would turn into a negative start index, which Python
  # interprets as an offset from the end of the string instead of rejecting it.
  if start_num < 1:
    raise ValueError("start_num invalid")
  return within_text.index(find_text, start_num - 1) + 1


def FIXED(number, decimals=2, no_commas=False):
  """
  Formats a number as text, rounded to a fixed number of decimal places (2 by default) and with
  commas as thousands separators. If no_commas is True, the commas are left out.

  >>> FIXED(1234.567, 1)
  '1,234.6'
  >>> FIXED(1234.567, -1)
  '1,230'
  >>> FIXED(-1234.567, -1, True)
  '-1230'
  >>> FIXED(44.332)
  '44.33'
  >>> FIXED(3521.478, 2, False)
  '3,521.48'
  >>> FIXED(-3521.478, 1, True)
  '-3521.5'
  >>> FIXED(3521.478, 0, True)
  '3521'
  >>> FIXED(3521.478, -2, True)
  '3500'
  """
  if no_commas:
    grouping = ''
  else:
    grouping = ','
  rounded = ROUND(number, decimals)
  # A negative `decimals` affects only the rounding; no fractional digits are printed then.
  shown_places = max(0, decimals)
  return "{:{}.{}f}".format(rounded, grouping, shown_places)


def LEFT(string, num_chars=1):
  """
  Returns the first num_chars characters of the given string. If num_chars is omitted, it is
  assumed to be 1. Same as `string[:num_chars]`. A negative num_chars raises ValueError.

  >>> LEFT("Sale Price", 4)
  'Sale'
  >>> LEFT('Swededn')
  'S'
  >>> LEFT('Text', -1)
  Traceback (most recent call last):
  ...
  ValueError: num_chars invalid
  """
  if num_chars < 0:
    raise ValueError("num_chars invalid")
  prefix = string[:num_chars]
  return prefix


def LEN(text):
  """
  Returns the number of characters in a text string, or the number of items in a list. Spaces
  count as characters. Same as
  [`len`](https://docs.python.org/3/library/functions.html#len) in python.
  See [Record Set](#recordset) for an example of using `len` on a list of records.

  >>> LEN("Phoenix, AZ")
  11
  >>> LEN("")
  0
  >>> LEN("     One   ")
  11
  """
  length = len(text)
  return length


def LOWER(text):
  """
  Returns the given string with all letters converted to lowercase. Same as `text.lower()`.

  >>> LOWER("E. E. Cummings")
  'e. e. cummings'
  >>> LOWER("Apt. 2B")
  'apt. 2b'
  """
  lowered = text.lower()
  return lowered


def MID(text, start_num, num_chars):
  """
  Returns a segment of a string, starting at start_num and at most num_chars long. The first
  character in text has start_num 1.

  Raises ValueError if start_num is less than 1 or if num_chars is negative.

  >>> MID("Fluid Flow", 1, 5)
  'Fluid'
  >>> MID("Fluid Flow", 7, 20)
  'Flow'
  >>> MID("Fluid Flow", 20, 5)
  ''
  >>> MID("Fluid Flow", 0, 5)
  Traceback (most recent call last):
  ...
  ValueError: start_num invalid
  >>> MID("Fluid Flow", 1, -1)
  Traceback (most recent call last):
  ...
  ValueError: num_chars invalid
  """
  if start_num < 1:
    raise ValueError("start_num invalid")
  # A negative length could make the slice end negative, i.e. count from the end of the text
  # and return an unrelated segment; reject it as LEFT and RIGHT do.
  if num_chars < 0:
    raise ValueError("num_chars invalid")
  begin = start_num - 1
  return text[begin : begin + num_chars]


# Maps every accepted `format` argument of PHONE_FORMAT to a phonenumbers output format. Each
# format is reachable under two names: a short symbol and a spelled-out alias.
output_formats = {
    alias: phone_format
    for aliases, phone_format in (
        (("+", "INTL"),       phonenumbers.PhoneNumberFormat.INTERNATIONAL),
        (("#", "NATL"),       phonenumbers.PhoneNumberFormat.NATIONAL),
        (("*", "E164"),       phonenumbers.PhoneNumberFormat.E164),
        (("tel", "RFC3966"),  phonenumbers.PhoneNumberFormat.RFC3966),
    )
    for alias in aliases
}

def PHONE_FORMAT(value, country=None, format=None):  # pylint: disable=redefined-builtin
  """
  Formats a phone number.

  With no optional arguments, the number must start with "+" and the international dialing prefix,
  and will be formatted as an international number, e.g. `+12345678901` becomes `+1 234-567-8901`.

  The `country` argument allows specifying a 2-letter country code (e.g. "US" or "GB") for
  interpreting phone numbers that don't start with "+". E.g. `PHONE_FORMAT('2025555555', 'US')`
  would be seen as a US number and formatted as "(202) 555-5555". Phone numbers that start with
  "+" ignore `country`. E.g. `PHONE_FORMAT('+33555555555', 'US')` is a French number because '+33'
  is the international prefix for France.

  The `format` argument specifies the output format, according to this table:

    - `"#"` or `"NATL"` (default) - use the national format, without the international dialing
      prefix, when possible. E.g. `(234) 567-8901` for "US", or `02 34 56 78 90` for "FR". If
      `country` is omitted, or the number does not correspond to the given country, the
      international format is used instead.
    - `"+"` or `"INTL"` - international format, e.g. `+1 234-567-8901` or
      `+33 2 34 56 78 90`.
    - `"*"` or `"E164"` - E164 format, like international but with no separators, e.g.
      `+12345678901`.
    - `"tel"` or `"RFC3966"` - format suitable to use as a [hyperlink](col-types.md#hyperlinks),
      e.g. 'tel:+1-234-567-8901'.

  When specifying the `format` argument, you may omit the `country` argument. I.e.
  `PHONE_FORMAT(value, "tel")` is equivalent to `PHONE_FORMAT(value, None, "tel")`.

  An empty value is returned unchanged. An unrecognized `format` raises ValueError.

  For more details, see the [phonenumbers](https://github.com/daviddrysdale/python-phonenumbers)
  Python library, which underlies this function.

  >>> PHONE_FORMAT("+12345678901")
  u'+1 234-567-8901'
  >>> PHONE_FORMAT("2345678901", "US")
  u'(234) 567-8901'
  >>> PHONE_FORMAT("2345678901", "GB")
  u'023 4567 8901'
  >>> PHONE_FORMAT("2345678901", "GB", "+")
  u'+44 23 4567 8901'
  >>> PHONE_FORMAT("+442345678901", "GB")
  u'023 4567 8901'
  >>> PHONE_FORMAT("+12345678901", "GB")
  u'+1 234-567-8901'
  >>> PHONE_FORMAT("(234) 567-8901")    # doctest: +IGNORE_EXCEPTION_DETAIL
  Traceback (most recent call last):
  ...
  NumberParseException: (0) Missing or invalid default region.
  >>> PHONE_FORMAT("(234)567 89-01", "US", "tel")
  u'tel:+1-234-567-8901'
  >>> PHONE_FORMAT("2/3456/7890", "FR", '#')
  u'02 34 56 78 90'
  >>> PHONE_FORMAT("+33234567890", '#')
  u'+33 2 34 56 78 90'
  >>> PHONE_FORMAT("+33234567890", 'tel')
  u'tel:+33-2-34-56-78-90'
  >>> PHONE_FORMAT("tel:+1-234-567-8901", country="US", format="*")
  u'+12345678901'
  """
  if not value:
    return value

  # Two-argument shorthand: a format name passed in the `country` position is treated as `format`.
  if format is None and country in output_formats:
    country, format = None, country

  number = phonenumbers.parse(str(value), country)
  style = output_formats.get(format or "#")
  if style is None:
    raise ValueError("Unrecognized phone format; try +, INTL, #, NATL, *, E164, tel, or RFC3966")

  national = phonenumbers.PhoneNumberFormat.NATIONAL
  international = phonenumbers.PhoneNumberFormat.INTERNATIONAL

  if style != national:
    return phonenumbers.format_number(number, style)

  if not country:
    # With no country, we lose info in NATIONAL format (because numbers must be specified with an
    # international prefix, and the output would discard it). Use INTERNATIONAL instead.
    return phonenumbers.format_number(number, international)

  rendered = phonenumbers.format_number(number, national)

  # Check that the national form doesn't garble numbers with a different international prefix:
  # it must parse back, under the given country, to the same number. If it doesn't, use the
  # international format. E.g. for PHONE_FORMAT('+12345678901', 'FR'), the output should include
  # the US dialing prefix.
  if phonenumbers.parse(rendered, country) != number:
    return phonenumbers.format_number(number, international)
  return rendered


def PROPER(text):
  """
  Capitalizes each word in a specified string: the first letter of every word becomes uppercase
  and all other letters become lowercase. Same as `text.title()`.

  >>> PROPER('this is a TITLE')
  'This Is A Title'
  >>> PROPER('2-way street')
  '2-Way Street'
  >>> PROPER('76BudGet')
  '76Budget'
  """
  titled = text.title()
  return titled


def REGEXEXTRACT(text, regular_expression):
  """
  Extracts the first part of text that matches regular_expression. Raises ValueError if nothing
  in text matches.

  >>> REGEXEXTRACT("Google Doc 101", "[0-9]+")
  '101'
  >>> REGEXEXTRACT("The price today is $826.25", "[0-9]*\\.[0-9]+[0-9]+")
  '826.25'

  If there is a parenthesized expression, it is returned instead of the whole match.
  >>> REGEXEXTRACT("(Content) between brackets", "\\(([A-Za-z]+)\\)")
  'Content'
  >>> REGEXEXTRACT("Foo", "Bar")
  Traceback (most recent call last):
  ...
  ValueError: REGEXEXTRACT text does not match
  """
  match = re.search(regular_expression, text)
  if match is None:
    raise ValueError("REGEXEXTRACT text does not match")
  # lastindex is set only when some capturing group took part in the match.
  if match.lastindex:
    return match.group(1)
  return match.group(0)


def REGEXMATCH(text, regular_expression):
  """
  Returns whether any part of a piece of text matches a regular expression.

  >>> REGEXMATCH("Google Doc 101", "[0-9]+")
  True
  >>> REGEXMATCH("Google Doc", "[0-9]+")
  False
  >>> REGEXMATCH("The price today is $826.25", "[0-9]*\\.[0-9]+[0-9]+")
  True
  >>> REGEXMATCH("(Content) between brackets", "\\(([A-Za-z]+)\\)")
  True
  >>> REGEXMATCH("Foo", "Bar")
  False
  """
  return re.search(regular_expression, text) is not None


def REGEXREPLACE(text, regular_expression, replacement):
  """
  Replaces all parts of text matching the given regular expression with replacement text. Text
  without any match is returned unchanged.

  >>> REGEXREPLACE("Google Doc 101", "[0-9]+", "777")
  'Google Doc 777'
  >>> REGEXREPLACE("Google Doc", "[0-9]+", "777")
  'Google Doc'
  >>> REGEXREPLACE("The price is $826.25", "[0-9]*\\.[0-9]+[0-9]+", "315.75")
  'The price is $315.75'
  >>> REGEXREPLACE("(Content) between brackets", "\\(([A-Za-z]+)\\)", "Word")
  'Word between brackets'
  >>> REGEXREPLACE("Foo", "Bar", "Baz")
  'Foo'
  """
  pattern = re.compile(regular_expression)
  return pattern.sub(replacement, text)


def REPLACE(text, position, length, new_text):
  """
  Replaces part of a text string with a different text string. Position is counted from 1, and
  length is the number of characters of text to replace (0 inserts new_text without removing
  anything).

  Raises ValueError if position is less than 1 or if length is negative.

  >>> REPLACE("abcdefghijk", 6, 5, "*")
  'abcde*k'
  >>> REPLACE("2009", 3, 2, "10")
  '2010'
  >>> REPLACE('123456', 1, 3, '@')
  '@456'
  >>> REPLACE('foo', 1, 0, 'bar')
  'barfoo'
  >>> REPLACE('foo', 0, 1, 'bar')
  Traceback (most recent call last):
  ...
  ValueError: position invalid
  >>> REPLACE('foo', 3, -1, 'bar')
  Traceback (most recent call last):
  ...
  ValueError: length invalid
  """
  if position < 1:
    raise ValueError("position invalid")
  # A negative length would make the kept tail start before the replaced position, duplicating
  # part of the original text in the result.
  if length < 0:
    raise ValueError("length invalid")
  start = position - 1
  return text[:start] + new_text + text[start + length:]


def REPT(text, number_times):
  """
  Returns specified text repeated a number of times. Same as `text * number_times`.

  Raises ValueError if number_times is negative, or if the result would be longer than 32767
  characters.

  >>> REPT("*-", 3)
  '*-*-*-'
  >>> REPT('-', 10)
  '----------'
  >>> REPT('-', 0)
  ''
  >>> len(REPT('---', 10000))
  30000
  >>> REPT('---', 11000)
  Traceback (most recent call last):
  ...
  ValueError: number_times invalid
  >>> REPT('-', -1)
  Traceback (most recent call last):
  ...
  ValueError: number_times invalid
  """
  if number_times < 0:
    raise ValueError("number_times invalid")
  if len(text) * number_times > 32767:
    raise ValueError("number_times invalid")
  repeat_count = int(number_times)
  return text * repeat_count


def RIGHT(string, num_chars=1):
  """
  Returns a substring of length num_chars from the end of a specified string. If num_chars is
  omitted, it is assumed to be 1. Same as `string[-num_chars:]`, except that a num_chars of 0
  gives an empty result. A negative num_chars raises ValueError.

  >>> RIGHT("Sale Price", 5)
  'Price'
  >>> RIGHT('Stock Number')
  'r'
  >>> RIGHT('Text', 100)
  'Text'
  >>> RIGHT('Text', 0)
  ''
  >>> RIGHT('Text', -1)
  Traceback (most recent call last):
  ...
  ValueError: num_chars invalid
  """
  if num_chars < 0:
    raise ValueError("num_chars invalid")
  if num_chars == 0:
    # string[-0:] is string[0:], i.e. the whole string, so zero needs its own case. Slicing
    # (rather than returning '') keeps the result the same type as the input.
    return string[:0]
  return string[-num_chars:]


def SEARCH(find_text, within_text, start_num=1):
  """
  Returns the position at which a string is first found within text, ignoring case.

  Unlike FIND, SEARCH is not case-sensitive. The returned position is 1 if within_text starts
  with find_text. Start_num specifies the character at which to start the search, defaulting to
  1 (the first character of within_text).

  If find_text is not found, or start_num is invalid (less than 1), raises ValueError.

  >>> SEARCH("e", "Statements", 6)
  7
  >>> SEARCH("margin", "Profit Margin")
  8
  >>> SEARCH(" ", "Profit Margin")
  7
  >>> SEARCH('"', 'The "boss" is here.')
  5
  >>> SEARCH("gle", "Google")
  4
  >>> SEARCH("GLE", "Google")
  4
  >>> SEARCH("e", "Google", 0)
  Traceback (most recent call last):
  ...
  ValueError: start_num invalid
  """
  # Without this guard, start_num <= 0 would turn into a negative start index, which Python
  # interprets as an offset from the end of the string instead of rejecting it.
  if start_num < 1:
    raise ValueError("start_num invalid")
  # .lower() isn't always correct for unicode. See http://stackoverflow.com/a/29247821/328565
  return within_text.lower().index(find_text.lower(), start_num - 1) + 1


def SUBSTITUTE(text, old_text, new_text, instance_num=None):
  u"""
  Replaces existing text with new text in a string. It is useful when you know the substring of
  text to replace. Use REPLACE when you know the position of text to replace.

  If instance_num is given, only that occurrence of old_text (counting from 1) is replaced. If
  omitted, all occurrences are replaced. An instance_num of 0 or less raises ValueError.

  Same as `text.replace(old_text, new_text)` when instance_num is omitted.

  >>> SUBSTITUTE("Sales Data", "Sales", "Cost")
  u'Cost Data'
  >>> SUBSTITUTE("Quarter 1, 2008", "1", "2", 1)
  u'Quarter 2, 2008'
  >>> SUBSTITUTE("Quarter 1, 2011", "1", "2", 3)
  u'Quarter 1, 2012'

  More tests:
  >>> SUBSTITUTE("Hello world", "", "-")
  u'Hello world'
  >>> SUBSTITUTE("Hello world", " ", "-")
  u'Hello-world'
  >>> SUBSTITUTE("Hello world", " ", 12.1)
  u'Hello12.1world'
  >>> SUBSTITUTE(u"Hello world", u" ", 12.1)
  u'Hello12.1world'
  >>> SUBSTITUTE("Hello world", "world", "")
  u'Hello '
  >>> SUBSTITUTE("Hello", "world", "")
  u'Hello'

  Overlapping matches are all counted when looking for instance_num.
  >>> SUBSTITUTE('abababab', 'abab', 'xxxx')
  u'xxxxxxxx'
  >>> SUBSTITUTE('abababab', 'abab', 'xxxx', 1)
  u'xxxxabab'
  >>> SUBSTITUTE('abababab', 'abab', 'xxxx', 2)
  u'abxxxxab'
  >>> SUBSTITUTE('abababab', 'abab', 'xxxx', 3)
  u'ababxxxx'
  >>> SUBSTITUTE('abababab', 'abab', 'xxxx', 4)
  u'abababab'
  >>> SUBSTITUTE('abababab', 'abab', 'xxxx', 0)
  Traceback (most recent call last):
  ...
  ValueError: instance_num invalid
  >>> SUBSTITUTE( "crème",  "è", "e")
  u'creme'
  >>> SUBSTITUTE(u"crème", u"è", "e")
  u'creme'
  >>> SUBSTITUTE(u"crème",  "è", "e")
  u'creme'
  >>> SUBSTITUTE( "crème", u"è", "e")
  u'creme'
  """
  # Work on text throughout, so that non-string arguments (e.g. numbers) are accepted.
  text, old_text, new_text = (six.text_type(arg) for arg in (text, old_text, new_text))

  # An empty search string leaves the text untouched.
  if not old_text:
    return text

  if instance_num is None:
    return text.replace(old_text, new_text)

  if instance_num <= 0:
    raise ValueError("instance_num invalid")

  # No trivial way to replace nth occurrence: step through the matches one at a time. Each search
  # resumes one character past the previous hit, which is why overlapping matches are counted.
  found_at = -1
  for _ in xrange(instance_num):
    found_at = text.find(old_text, found_at + 1)
    if found_at < 0:
      # Fewer than instance_num occurrences: nothing to replace.
      return text
  before = text[:found_at]
  after = text[found_at + len(old_text):]
  return before + new_text + after


def T(value):
  """
  Returns value if value is text, or the empty string when value is not text. Byte strings are
  decoded as UTF-8, and AltText values are converted to their text.

  >>> T('Text')
  u'Text'
  >>> T(826)
  u''
  >>> T('826')
  u'826'
  >>> T(False)
  u''
  >>> T('100 points')
  u'100 points'
  >>> T(AltText('Text'))
  u'Text'
  >>> T(float('nan'))
  u''
  """
  if isinstance(value, six.binary_type):
    return value.decode('utf8')
  if isinstance(value, six.text_type):
    return value
  if isinstance(value, AltText):
    return six.text_type(value)
  return u""


# NOTE(review): `unimplemented` is imported from .unimplemented; what it does to the decorated
# function is not visible in this file.
@unimplemented
def TEXT(number, format_type):    # pylint: disable=unused-argument
  """
  Converts a number into text according to a specified format. It is not yet implemented in
  Grist.
  """
  # Placeholder body: neither argument is used.
  raise NotImplementedError()


# Matches any run of two or more consecutive space characters.
_trim_re = re.compile(r'  +')

def TRIM(text):
  """
  Removes whitespace from both ends of text, and reduces every run of several spaces inside it to
  a single space. Note that other whitespace inside the text, such as tab or newline characters,
  is left in place.

  >>> TRIM(" First Quarter\\n    Earnings     ")
  'First Quarter\\n Earnings'
  >>> TRIM("")
  ''
  """
  stripped = text.strip()
  return _trim_re.sub(' ', stripped)


def UPPER(text):
  """
  Returns the given string with all letters converted to uppercase. Same as `text.upper()`.

  >>> UPPER("e. e. cummings")
  'E. E. CUMMINGS'
  >>> UPPER("Apt. 2B")
  'APT. 2B'
  """
  uppered = text.upper()
  return uppered


def VALUE(text):
  """
  Converts a string in accepted date, time or number formats into a number or date.

  Numbers and dates are returned unchanged, and an empty string gives 0. Text that cannot be
  interpreted as a number, date or time raises ValueError.

  >>> VALUE("$1,000")
  1000
  >>> assert VALUE("16:48:00") - VALUE("12:00:00") == datetime.timedelta(0, 17280)
  >>> VALUE("01/01/2012")
  datetime.datetime(2012, 1, 1, 0, 0)
  >>> VALUE("")
  0
  >>> VALUE(0)
  0
  >>> VALUE("826")
  826
  >>> VALUE("-826.123123123")
  -826.123123123
  >>> VALUE(float('nan'))
  nan
  >>> VALUE("Invalid")
  Traceback (most recent call last):
  ...
  ValueError: text cannot be parsed to a number
  >>> VALUE("13/13/13")
  Traceback (most recent call last):
  ...
  ValueError: text cannot be parsed to a number
  """
  # This is not particularly robust, but makes an attempt to handle a number of cases: numbers,
  # including optional comma separators, dates/times, leading dollar-sign.
  if isinstance(text, (numbers.Number, datetime.date)):
    return text
  text = text.strip().lstrip('$')
  nocommas = text.replace(',', '')
  if nocommas == "":
    return 0

  # Try the numeric interpretations first, integer before float, so "826" stays an int.
  for convert in (int, float):
    try:
      return convert(nocommas)
    except ValueError:
      pass

  # Dates are parsed from the text with its commas intact (e.g. "Jan 1, 2012").
  try:
    return dateutil.parser.parse(text)
  except (ValueError, OverflowError):
    # dateutil documents OverflowError for out-of-range dates; report it like any other
    # unparseable input rather than letting a different exception type escape.
    pass

  raise ValueError('text cannot be parsed to a number')