gristlabs_grist-core/sandbox/grist/functions/stats.py

# pylint: disable=redefined-builtin, line-too-long, unused-argument
import datetime

from .math import _chain, _chain_numeric, _chain_numeric_a, _chain_numeric_or_date
from .info import ISNUMBER, ISLOGICAL
from .date import DATE, DTIME       # pylint: disable=unused-import
from .unimplemented import unimplemented

def _average(iterable):
  """
  Returns the arithmetic mean of the values produced by iterable, as a float.

  The iterable is consumed in a single pass, so generators are accepted. Raises
  ZeroDivisionError when the iterable produces no values.
  """
  count = 0
  total = 0.0
  # enumerate() numbers the items from 1, so `count` ends up as the number of items seen, and
  # keeps its initial 0 when there are none.
  for count, item in enumerate(iterable, 1):
    total += item
  return total / count

def _default_if_empty(iterable, default):
  """
  Generator that passes through every value of iterable unchanged. If iterable turns out to have
  no values at all, yields the single value `default` instead.
  """
  remaining = iter(iterable)
  try:
    first = next(remaining)
  except StopIteration:
    # Nothing to pass through: substitute the default and finish.
    yield default
    return
  yield first
  for item in remaining:
    yield item


@unimplemented
def AVEDEV(value1, value2):
  """Calculates the average of the magnitudes of deviations of data from a dataset's mean."""
  # Stub only: no implementation exists yet, so the body unconditionally raises.
  # NOTE(review): the signature takes exactly two values; presumably a placeholder for a
  # variable-length dataset -- confirm when implementing.
  raise NotImplementedError()


def AVERAGE(value, *more_values):
  """
  Returns the arithmetic mean of the numbers in a dataset, skipping anything that is not a number.

  Every argument can be a single value or an array. Logical values, blanks, dates, and text
  (including text that looks like a number) take no part in the calculation. If no numbers
  remain, the division by a zero count raises ZeroDivisionError.

  >>> AVERAGE([2, -1.0, 11])
  4.0
  >>> AVERAGE([2, -1, 11, "Hello"])
  4.0
  >>> AVERAGE([2, -1, "Hello", DATE(2015,1,1)], True, [False, "123", "", 11])
  4.0
  >>> AVERAGE(False, True)
  Traceback (most recent call last):
    ...
  ZeroDivisionError: float division by zero
  """
  numbers = _chain_numeric(value, *more_values)
  return _average(numbers)


def AVERAGEA(value, *more_values):
  """
  Returns the arithmetic mean of a dataset in which every non-numerical value counts as 0.

  Every argument can be a single value or an array. Dates, blanks, and text (including text that
  looks like a number) each contribute a 0 (zero) to the average. The logical value True
  contributes 1, and False contributes 0.

  >>> AVERAGEA([2, -1.0, 11])
  4.0
  >>> AVERAGEA([2, -1, 11, "Hello"])
  3.0
  >>> AVERAGEA([2, -1, "Hello", DATE(2015,1,1)], True, [False, "123", "", 11.5])
  1.5
  >>> AVERAGEA(False, True)
  0.5
  """
  numbers = _chain_numeric_a(value, *more_values)
  return _average(numbers)

# Note that Google Sheets offers a similar function, called AVERAGE.WEIGHTED
# (https://support.google.com/docs/answer/9084098?hl=en)
def AVERAGE_WEIGHTED(pairs):
  """
  Computes the weighted average of a list of (value, weight) pairs: the sum of value * weight
  divided by the sum of the weights. A pair is skipped entirely when either its value or its
  weight is not a number (logical values do not count as numbers here).

  When the values and the weights live in two separate lists, combine them into pairs with the
  Python built-in zip() function.

  >>> AVERAGE_WEIGHTED(((95, .25), (90, .1), ("X", .5), (85, .15), (88, .2), (82, .3), (70, None)))
  87.7
  >>> AVERAGE_WEIGHTED(zip([95, 90, "X", 85, 88, 82, 70], [25, 10, 50, 15, 20, 30, None]))
  87.7
  >>> AVERAGE_WEIGHTED(zip([95, 90, False, 85, 88, 82, 70], [.25, .1, .5, .15, .2, .3, True]))
  87.7
  """
  def _is_plain_number(item):
    # The same type test that _chain_numeric applies: a number that is not a logical value.
    return ISNUMBER(item) and not ISLOGICAL(item)

  weighted_total = 0.0
  weight_total = 0.0
  for value, weight in pairs:
    if not (_is_plain_number(value) and _is_plain_number(weight)):
      continue
    weighted_total += value * weight
    weight_total += weight
  return weighted_total / weight_total


@unimplemented
def AVERAGEIF(criteria_range, criterion, average_range=None):
  """Returns the average of a range depending on criteria."""
  # Stub only: no implementation exists yet, so the body unconditionally raises.
  raise NotImplementedError()

@unimplemented
def AVERAGEIFS(average_range, criteria_range1, criterion1, *args):
  """Returns the average of a range depending on multiple criteria."""
  # Stub only: no implementation exists yet, so the body unconditionally raises.
  raise NotImplementedError()

@unimplemented
def BINOMDIST(num_successes, num_trials, prob_success, cumulative):
  """
  Calculates the probability of drawing a certain number of successes (or a maximum number of
  successes) in a certain number of tries given a population of a certain size containing a
  certain number of successes, with replacement of draws.
  """
  # Stub only: no implementation exists yet, so the body unconditionally raises.
  raise NotImplementedError()

@unimplemented
def CONFIDENCE(alpha, standard_deviation, pop_size):
  """Calculates the width of half the confidence interval for a normal distribution."""
  # Stub only: no implementation exists yet, so the body unconditionally raises.
  raise NotImplementedError()

@unimplemented
def CORREL(data_y, data_x):
  """Calculates r, the Pearson product-moment correlation coefficient of a dataset."""
  # Stub only: no implementation exists yet, so the body unconditionally raises.
  raise NotImplementedError()

def COUNT(value, *more_values):
  """
  Returns how many values in a dataset are numbers or dates/datetimes. Values of any other type
  are not counted.

  Every argument can be a single value or an array. Logical values, blanks, and text (including
  text that looks like a number) are left out of the count.

  >>> COUNT([2, -1.0, 11])
  3
  >>> COUNT([2, -1, 11, "Hello"])
  3
  >>> COUNT([DATE(2000, 1, 1), DATE(2000, 1, 2), DATE(2000, 1, 3), "Hello"])
  3
  >>> COUNT([2, -1, "Hello", DATE(2015,1,1)], True, [False, "123", "", 11.5])
  4
  >>> COUNT(False, True)
  0
  """
  tally = 0
  for _ in _chain_numeric_or_date(value, *more_values):
    tally += 1
  return tally


def COUNTA(value, *more_values):
  """
  Returns how many values a dataset contains in total, whatever their types.

  Every argument can be a single value or an array.

  >>> COUNTA([2, -1.0, 11])
  3
  >>> COUNTA([2, -1, 11, "Hello"])
  4
  >>> COUNTA([2, -1, "Hello", DATE(2015,1,1)], True, [False, "123", "", 11.5])
  9
  >>> COUNTA(False, True)
  2
  """
  tally = 0
  for _ in _chain(value, *more_values):
    tally += 1
  return tally


@unimplemented
def COVAR(data_y, data_x):
  """Calculates the covariance of a dataset."""
  # Stub only: no implementation exists yet, so the body unconditionally raises.
  raise NotImplementedError()

@unimplemented
def CRITBINOM(num_trials, prob_success, target_prob):
  """Calculates the smallest value for which the cumulative binomial distribution is greater than or equal to a specified criteria."""
  # Stub only: no implementation exists yet, so the body unconditionally raises.
  raise NotImplementedError()

@unimplemented
def DEVSQ(value1, value2):
  """Calculates the sum of squares of deviations based on a sample."""
  # Stub only: no implementation exists yet, so the body unconditionally raises.
  # NOTE(review): the signature takes exactly two values; presumably a placeholder for a
  # variable-length dataset -- confirm when implementing.
  raise NotImplementedError()

@unimplemented
def EXPONDIST(x, lambda_, cumulative):
  """Returns the value of the exponential distribution function with a specified lambda at a specified value."""
  # Stub only: no implementation exists yet, so the body unconditionally raises.
  raise NotImplementedError()

@unimplemented
def F_DIST(x, degrees_freedom1, degrees_freedom2, cumulative):
  """
  Calculates the left-tailed F probability distribution (degree of diversity) for two data sets
  with given input x. Alternately called Fisher-Snedecor distribution or Snedecor's F
  distribution.
  """
  # Stub only: no implementation exists yet, so the body unconditionally raises.
  raise NotImplementedError()

@unimplemented
def F_DIST_RT(x, degrees_freedom1, degrees_freedom2):
  """
  Calculates the right-tailed F probability distribution (degree of diversity) for two data sets
  with given input x. Alternately called Fisher-Snedecor distribution or Snedecor's F
  distribution.
  """
  # Stub only: no implementation exists yet, so the body unconditionally raises.
  raise NotImplementedError()

@unimplemented
def FDIST(x, degrees_freedom1, degrees_freedom2):
  """
  Calculates the right-tailed F probability distribution (degree of diversity) for two data sets
  with given input x. Alternately called Fisher-Snedecor distribution or Snedecor's F
  distribution.
  """
  # Stub only: no implementation exists yet, so the body unconditionally raises.
  raise NotImplementedError()

@unimplemented
def FISHER(value):
  """Returns the Fisher transformation of a specified value."""
  # Stub only: no implementation exists yet, so the body unconditionally raises.
  raise NotImplementedError()

@unimplemented
def FISHERINV(value):
  """Returns the inverse Fisher transformation of a specified value."""
  # Stub only: no implementation exists yet, so the body unconditionally raises.
  raise NotImplementedError()

@unimplemented
def FORECAST(x, data_y, data_x):
  """Calculates the expected y-value for a specified x based on a linear regression of a dataset."""
  # Stub only: no implementation exists yet, so the body unconditionally raises.
  raise NotImplementedError()

@unimplemented
def GEOMEAN(value1, value2):
  """Calculates the geometric mean of a dataset."""
  # Stub only: no implementation exists yet, so the body unconditionally raises.
  # NOTE(review): the signature takes exactly two values; presumably a placeholder for a
  # variable-length dataset -- confirm when implementing.
  raise NotImplementedError()

@unimplemented
def HARMEAN(value1, value2):
  """Calculates the harmonic mean of a dataset."""
  # Stub only: no implementation exists yet, so the body unconditionally raises.
  # NOTE(review): the signature takes exactly two values; presumably a placeholder for a
  # variable-length dataset -- confirm when implementing.
  raise NotImplementedError()

@unimplemented
def HYPGEOMDIST(num_successes, num_draws, successes_in_pop, pop_size):
  """Calculates the probability of drawing a certain number of successes in a certain number of tries given a population of a certain size containing a certain number of successes, without replacement of draws."""
  # Stub only: no implementation exists yet, so the body unconditionally raises.
  raise NotImplementedError()

@unimplemented
def INTERCEPT(data_y, data_x):
  """Calculates the y-value at which the line resulting from linear regression of a dataset will intersect the y-axis (x=0)."""
  # Stub only: no implementation exists yet, so the body unconditionally raises.
  raise NotImplementedError()

@unimplemented
def KURT(value1, value2):
  """Calculates the kurtosis of a dataset, which describes the shape, and in particular the "peakedness" of that dataset."""
  # Stub only: no implementation exists yet, so the body unconditionally raises.
  # NOTE(review): the signature takes exactly two values; presumably a placeholder for a
  # variable-length dataset -- confirm when implementing.
  raise NotImplementedError()

@unimplemented
def LARGE(data, n):
  """Returns the nth largest element from a data set, where n is user-defined."""
  # Stub only: no implementation exists yet, so the body unconditionally raises.
  raise NotImplementedError()

@unimplemented
def LOGINV(x, mean, standard_deviation):
  """Returns the value of the inverse log-normal cumulative distribution with given mean and standard deviation at a specified value."""
  # Stub only: no implementation exists yet, so the body unconditionally raises.
  raise NotImplementedError()

@unimplemented
def LOGNORMDIST(x, mean, standard_deviation):
  """Returns the value of the log-normal cumulative distribution with given mean and standard deviation at a specified value."""
  # Stub only: no implementation exists yet, so the body unconditionally raises.
  raise NotImplementedError()


def MAX(value, *more_values):
  """
  Returns the largest number or date/datetime in a dataset. Values of any other type are skipped.

  Every argument can be a single value or an array. Logical values, blanks, and text (including
  text that looks like a number) are skipped. When the arguments hold no numbers or dates at
  all, the result is 0.

  >>> MAX([2, -1.5, 11.5])
  11.5
  >>> MAX([2, -1.5, "Hello"], True, [False, "123", "", 11.5])
  11.5
  >>> MAX(True, -123)
  -123
  >>> MAX("123", -123)
  -123
  >>> MAX("Hello", "123", True, False)
  0
  >>> MAX(DATE(2015, 1, 1), DATE(2015, 1, 2))
  datetime.date(2015, 1, 2)
  >>> MAX(DATE(2015, 1, 1), datetime.datetime(2015, 1, 1, 12, 34, 56))
  datetime.datetime(2015, 1, 1, 12, 34, 56)
  >>> MAX(DATE(2015, 1, 2), datetime.datetime(2015, 1, 1, 12, 34, 56))
  datetime.date(2015, 1, 2)
  """
  candidates = _chain_numeric_or_date(value, *more_values)
  # A lone 0 stands in when nothing qualifies, so max() always has something to pick from. The
  # key function lets dates and datetimes be compared with each other.
  return max(_default_if_empty(candidates, 0), key=_compare_date_datetime_key)


def MAXA(value, *more_values):
  """
  Returns the largest numeric value in a dataset, with non-numerical values counted as 0.

  Every argument can be a single value or an array. Dates, blanks, and text (including text that
  looks like a number) each count as 0 (zero). The logical value True counts as 1, and False
  as 0. When the arguments hold no values at all, the result is 0.

  >>> MAXA([2, -1.5, 11.5])
  11.5
  >>> MAXA([2, -1.5, "Hello", DATE(2015, 1, 1)], True, [False, "123", "", 11.5])
  11.5
  >>> MAXA(True, -123)
  1
  >>> MAXA("123", -123)
  0
  >>> MAXA("Hello", "123", DATE(2015, 1, 1))
  0
  """
  numbers = _chain_numeric_a(value, *more_values)
  return max(_default_if_empty(numbers, 0))


def MEDIAN(value, *more_values):
  """
  Returns the median of the numbers in a dataset, skipping anything that is not a number.

  Every argument can be a single value or an array. Logical values, blanks, dates, and text
  (including text that looks like a number) are skipped.

  Raises an error when the arguments hold no numbers.

  The median is the value in the middle once all the numbers are sorted: half of the dataset is
  smaller than the median, and half is greater. For a dataset with an even number of values,
  the result is the average of the two numbers in the middle.

  >>> MEDIAN(1, 2, 3, 4, 5)
  3
  >>> MEDIAN(3, 5, 1, 4, 2)
  3
  >>> MEDIAN(range(10))
  4.5
  >>> MEDIAN("Hello", "123", DATE(2015, 1, 1), 12.3)
  12.3
  >>> MEDIAN("Hello", "123", DATE(2015, 1, 1))
  Traceback (most recent call last):
    ...
  ValueError: MEDIAN requires at least one number
  """
  ordered = list(_chain_numeric(value, *more_values))
  ordered.sort()
  if len(ordered) == 0:
    raise ValueError("MEDIAN requires at least one number")
  middle, is_odd = divmod(len(ordered), 2)
  if is_odd:
    # Odd count: the single middle element is returned as is (no conversion to float).
    return ordered[middle]
  # Even count: average the two elements on either side of the midpoint.
  return (ordered[middle - 1] + ordered[middle]) / 2.0


def _compare_date_datetime_key(x):
  """
  Sort key used by MAX and MIN so that dates and datetimes can be ordered against each other.

  Dates and datetimes are passed through DTIME, which is used here to turn dates and naive
  datetimes into timezone-aware datetimes. Any other value is returned unchanged.
  """
  # datetime.datetime is a subclass of datetime.date, so this one check covers both types.
  return DTIME(x) if isinstance(x, datetime.date) else x


def MIN(value, *more_values):
  """
  Returns the smallest number or date/datetime in a dataset. Values of any other type are skipped.

  Every argument can be a single value or an array. Logical values, blanks, and text (including
  text that looks like a number) are skipped. When the arguments hold no numbers or dates at
  all, the result is 0.

  >>> MIN([2, -1.5, 11.5])
  -1.5
  >>> MIN([2, -1.5, "Hello"], True, [False, "123", "", 11.5])
  -1.5
  >>> MIN(True, 123)
  123
  >>> MIN("-123", 123)
  123
  >>> MIN("Hello", "123", True, False)
  0
  >>> MIN(DATE(2015, 1, 1), DATE(2015, 1, 2))
  datetime.date(2015, 1, 1)
  >>> MIN(DATE(2015, 1, 1), datetime.datetime(2015, 1, 1, 12, 34, 56))
  datetime.date(2015, 1, 1)
  >>> MIN(DATE(2015, 1, 2), datetime.datetime(2015, 1, 1, 12, 34, 56))
  datetime.datetime(2015, 1, 1, 12, 34, 56)
  """
  candidates = _chain_numeric_or_date(value, *more_values)
  # A lone 0 stands in when nothing qualifies, so min() always has something to pick from. The
  # key function lets dates and datetimes be compared with each other.
  return min(_default_if_empty(candidates, 0), key=_compare_date_datetime_key)

def MINA(value, *more_values):
  """
  Returns the smallest numeric value in a dataset, with non-numerical values counted as 0.

  Every argument can be a single value or an array. Dates, blanks, and text (including text that
  looks like a number) each count as 0 (zero). The logical value True counts as 1, and False
  as 0. When the arguments hold no values at all, the result is 0.

  >>> MINA([2, -1.5, 11.5])
  -1.5
  >>> MINA([2, -1.5, "Hello", DATE(2015, 1, 1)], True, [False, "123", "", 11.5])
  -1.5
  >>> MINA(True, 123)
  1
  >>> MINA("-123", 123)
  0
  >>> MINA("Hello", "123", DATE(2015, 1, 1))
  0
  """
  numbers = _chain_numeric_a(value, *more_values)
  return min(_default_if_empty(numbers, 0))


@unimplemented
def MODE(value1, value2):
  """Returns the most commonly occurring value in a dataset."""
  # Stub only: no implementation exists yet, so the body unconditionally raises.
  # NOTE(review): the signature takes exactly two values; presumably a placeholder for a
  # variable-length dataset -- confirm when implementing.
  raise NotImplementedError()

@unimplemented
def NEGBINOMDIST(num_failures, num_successes, prob_success):
  """Calculates the probability of drawing a certain number of failures before a certain number of successes given a probability of success in independent trials."""
  # Stub only: no implementation exists yet, so the body unconditionally raises.
  raise NotImplementedError()

@unimplemented
def NORMDIST(x, mean, standard_deviation, cumulative):
  """
  Returns the value of the normal distribution function (or normal cumulative distribution
  function) for a specified value, mean, and standard deviation.
  """
  # Stub only: no implementation exists yet, so the body unconditionally raises.
  raise NotImplementedError()

@unimplemented
def NORMINV(x, mean, standard_deviation):
  """Returns the value of the inverse normal distribution function for a specified value, mean, and standard deviation."""
  # Stub only: no implementation exists yet, so the body unconditionally raises.
  raise NotImplementedError()

@unimplemented
def NORMSDIST(x):
  """Returns the value of the standard normal cumulative distribution function for a specified value."""
  # Stub only: no implementation exists yet, so the body unconditionally raises.
  raise NotImplementedError()

@unimplemented
def NORMSINV(x):
  """Returns the value of the inverse standard normal distribution function for a specified value."""
  # Stub only: no implementation exists yet, so the body unconditionally raises.
  raise NotImplementedError()

@unimplemented
def PEARSON(data_y, data_x):
  """Calculates r, the Pearson product-moment correlation coefficient of a dataset."""
  # Stub only: no implementation exists yet, so the body unconditionally raises.
  raise NotImplementedError()

@unimplemented
def PERCENTILE(data, percentile):
  """Returns the value at a given percentile of a dataset."""
  # Stub only: no implementation exists yet, so the body unconditionally raises.
  raise NotImplementedError()

@unimplemented
def PERCENTRANK(data, value, significant_digits=None):
  """Returns the percentage rank (percentile) of a specified value in a dataset."""
  # Stub only: no implementation exists yet, so the body unconditionally raises.
  raise NotImplementedError()

@unimplemented
def PERCENTRANK_EXC(data, value, significant_digits=None):
  """Returns the percentage rank (percentile) from 0 to 1 exclusive of a specified value in a dataset."""
  # Stub only: no implementation exists yet, so the body unconditionally raises.
  raise NotImplementedError()

@unimplemented
def PERCENTRANK_INC(data, value, significant_digits=None):
  """Returns the percentage rank (percentile) from 0 to 1 inclusive of a specified value in a dataset."""
  # Stub only: no implementation exists yet, so the body unconditionally raises.
  raise NotImplementedError()

@unimplemented
def PERMUT(n, k):
  """Returns the number of ways to choose some number of objects from a pool of a given size of objects, considering order."""
  # Stub only: no implementation exists yet, so the body unconditionally raises.
  raise NotImplementedError()

@unimplemented
def POISSON(x, mean, cumulative):
  """
  Returns the value of the Poisson distribution function (or Poisson cumulative distribution
  function) for a specified value and mean.
  """
  # Stub only: no implementation exists yet, so the body unconditionally raises.
  raise NotImplementedError()

@unimplemented
def PROB(data, probabilities, low_limit, high_limit=None):
  """Given a set of values and corresponding probabilities, calculates the probability that a value chosen at random falls between two limits."""
  # Stub only: no implementation exists yet, so the body unconditionally raises.
  raise NotImplementedError()

@unimplemented
def QUARTILE(data, quartile_number):
  """Returns a value nearest to a specified quartile of a dataset."""
  # Stub only: no implementation exists yet, so the body unconditionally raises.
  raise NotImplementedError()

@unimplemented
def RANK(value, data, is_ascending=None):
  """Returns the rank of a specified value in a dataset."""
  # Stub only: no implementation exists yet, so the body unconditionally raises.
  raise NotImplementedError()

@unimplemented
def RANK_AVG(value, data, is_ascending=None):
  """Returns the rank of a specified value in a dataset. If there is more than one entry of the same value in the dataset, the average rank of the entries will be returned."""
  # Stub only: no implementation exists yet, so the body unconditionally raises.
  raise NotImplementedError()

@unimplemented
def RANK_EQ(value, data, is_ascending=None):
  """Returns the rank of a specified value in a dataset. If there is more than one entry of the same value in the dataset, the top rank of the entries will be returned."""
  # Stub only: no implementation exists yet, so the body unconditionally raises.
  raise NotImplementedError()

@unimplemented
def RSQ(data_y, data_x):
  """Calculates the square of r, the Pearson product-moment correlation coefficient of a dataset."""
  # Stub only: no implementation exists yet, so the body unconditionally raises.
  raise NotImplementedError()

@unimplemented
def SKEW(value1, value2):
  """Calculates the skewness of a dataset, which describes the symmetry of that dataset about the mean."""
  # Stub only: no implementation exists yet, so the body unconditionally raises.
  # NOTE(review): the signature takes exactly two values; presumably a placeholder for a
  # variable-length dataset -- confirm when implementing.
  raise NotImplementedError()

@unimplemented
def SLOPE(data_y, data_x):
  """Calculates the slope of the line resulting from linear regression of a dataset."""
  # Stub only: no implementation exists yet, so the body unconditionally raises.
  raise NotImplementedError()

@unimplemented
def SMALL(data, n):
  """Returns the nth smallest element from a data set, where n is user-defined."""
  # Stub only: no implementation exists yet, so the body unconditionally raises.
  raise NotImplementedError()

@unimplemented
def STANDARDIZE(value, mean, standard_deviation):
  """Calculates the normalized equivalent of a random variable given mean and standard deviation of the distribution."""
  # Stub only: no implementation exists yet, so the body unconditionally raises.
  raise NotImplementedError()

# This should make us all cry a little. Because the sandbox does not do Python3 (which has
# statistics package), and because it does not do numpy (because it's native and hasn't been built
# for it), we have to implement simple stats functions by hand.
# TODO: switch to use the statistics package instead, once we upgrade to Python3.
#
# The following implementation of stdev is taken from https://stackoverflow.com/a/27758326/328565
def _mean(data):
  """
  Returns the arithmetic mean of data as a float. data must support both iteration and len().
  Raises ZeroDivisionError when data is empty.
  """
  total = sum(data)
  return total / float(len(data))

def _ss(data):
  """Returns the sum of the squared deviations of the values in data from their mean."""
  center = _mean(data)
  total = 0
  for item in data:
    total += (item - center) ** 2
  return total

def _stddev(data, ddof=0):
  """
  Returns the standard deviation of data: the square root of the sum of squared deviations
  divided by (len(data) - ddof).

  With the default ddof=0 this is the population standard deviation; pass ddof=1 to get the
  sample standard deviation.
  """
  divisor = len(data) - ddof
  variance = _ss(data) / divisor
  return variance ** 0.5

# The examples in the doctests below come from https://support.google.com/docs/answer/3094054 and
# related articles, which helps ensure correctness and compatibility.
def STDEV(value, *more_values):
  """
  Calculates the standard deviation of a sample, skipping any values that are not numbers.

  >>> STDEV([2, 5, 8, 13, 10])
  4.277849927241488
  >>> STDEV([2, 5, 8, 13, 10, True, False, "Test"])
  4.277849927241488
  >>> STDEV([2, 5, 8, 13, 10], 3, 12, 15)
  4.810702354423639
  >>> STDEV([2, 5, 8, 13, 10], [3, 12, 15])
  4.810702354423639
  >>> STDEV([5])
  Traceback (most recent call last):
    ...
  ZeroDivisionError: float division by zero
  """
  sample = list(_chain_numeric(value, *more_values))
  # ddof=1 selects the sample (rather than population) standard deviation.
  return _stddev(sample, ddof=1)

def STDEVA(value, *more_values):
  """
  Calculates the standard deviation of a sample, with text counted as the value `0`.

  >>> STDEVA([2, 5, 8, 13, 10])
  4.277849927241488
  >>> STDEVA([2, 5, 8, 13, 10, True, False, "Test"])
  4.969550137731641
  >>> STDEVA([2, 5, 8, 13, 10], 1, 0, 0)
  4.969550137731641
  >>> STDEVA([2, 5, 8, 13, 10], [1, 0, 0])
  4.969550137731641
  >>> STDEVA([5])
  Traceback (most recent call last):
    ...
  ZeroDivisionError: float division by zero
  """
  sample = list(_chain_numeric_a(value, *more_values))
  # ddof=1 selects the sample (rather than population) standard deviation.
  return _stddev(sample, ddof=1)

def STDEVP(value, *more_values):
  """
  Calculates the standard deviation of an entire population, skipping any values that are not
  numbers.

  >>> STDEVP([2, 5, 8, 13, 10])
  3.8262252939417984
  >>> STDEVP([2, 5, 8, 13, 10, True, False, "Test"])
  3.8262252939417984
  >>> STDEVP([2, 5, 8, 13, 10], 3, 12, 15)
  4.5
  >>> STDEVP([2, 5, 8, 13, 10], [3, 12, 15])
  4.5
  >>> STDEVP([5])
  0.0
  """
  population = list(_chain_numeric(value, *more_values))
  # ddof=0 selects the population (rather than sample) standard deviation.
  return _stddev(population, ddof=0)

def STDEVPA(value, *more_values):
  """
  Calculates the standard deviation of an entire population, with text counted as the value `0`.

  >>> STDEVPA([2, 5, 8, 13, 10])
  3.8262252939417984
  >>> STDEVPA([2, 5, 8, 13, 10, True, False, "Test"])
  4.648588495446763
  >>> STDEVPA([2, 5, 8, 13, 10], 1, 0, 0)
  4.648588495446763
  >>> STDEVPA([2, 5, 8, 13, 10], [1, 0, 0])
  4.648588495446763
  >>> STDEVPA([5])
  0.0
  """
  population = list(_chain_numeric_a(value, *more_values))
  # ddof=0 selects the population (rather than sample) standard deviation.
  return _stddev(population, ddof=0)

@unimplemented
def STEYX(data_y, data_x):
  """Calculates the standard error of the predicted y-value for each x in the regression of a dataset."""
  # Stub only: no implementation exists yet, so the body unconditionally raises.
  raise NotImplementedError()

@unimplemented
def T_INV(probability, degrees_freedom):
  """Calculates the negative inverse of the one-tailed TDIST function."""
  # Stub only: no implementation exists yet, so the body unconditionally raises.
  raise NotImplementedError()

@unimplemented
def T_INV_2T(probability, degrees_freedom):
  """Calculates the inverse of the two-tailed TDIST function."""
  # Stub only: no implementation exists yet, so the body unconditionally raises.
  raise NotImplementedError()

@unimplemented
def TDIST(x, degrees_freedom, tails):
  """Calculates the probability for Student's t-distribution with a given input (x)."""
  # Stub only: no implementation exists yet, so the body unconditionally raises.
  raise NotImplementedError()

@unimplemented
def TINV(probability, degrees_freedom):
  """Calculates the inverse of the two-tailed TDIST function."""
  # Stub only: no implementation exists yet, so the body unconditionally raises.
  raise NotImplementedError()

@unimplemented
def TRIMMEAN(data, exclude_proportion):
  """Calculates the mean of a dataset excluding some proportion of data from the high and low ends of the dataset."""
  # Stub only: no implementation exists yet, so the body unconditionally raises.
  raise NotImplementedError()

@unimplemented
def TTEST(range1, range2, tails, type):
  """Returns the probability associated with t-test. Determines whether two samples are likely to have come from the same two underlying populations that have the same mean."""
  # Stub only: no implementation exists yet, so the body unconditionally raises.
  # The `type` parameter shadows the Python builtin of the same name; the file-level pylint
  # directive disables the redefined-builtin warning for this.
  raise NotImplementedError()

@unimplemented
def VAR(value1, value2):
  """Calculates the variance based on a sample."""
  # Stub only: no implementation exists yet, so the body unconditionally raises.
  # NOTE(review): the signature takes exactly two values; presumably a placeholder for a
  # variable-length dataset -- confirm when implementing.
  raise NotImplementedError()

@unimplemented
def VARA(value1, value2):
  """Calculates an estimate of variance based on a sample, setting text to the value `0`."""
  # Stub only: no implementation exists yet, so the body unconditionally raises.
  # NOTE(review): the signature takes exactly two values; presumably a placeholder for a
  # variable-length dataset -- confirm when implementing.
  raise NotImplementedError()

@unimplemented
def VARP(value1, value2):
  """Calculates the variance based on an entire population."""
  # Stub only: no implementation exists yet, so the body unconditionally raises.
  # NOTE(review): the signature takes exactly two values; presumably a placeholder for a
  # variable-length dataset -- confirm when implementing.
  raise NotImplementedError()

@unimplemented
def VARPA(value1, value2):
  """Calculates the variance based on an entire population, setting text to the value `0`."""
  # Stub only: no implementation exists yet, so the body unconditionally raises.
  # NOTE(review): the signature takes exactly two values; presumably a placeholder for a
  # variable-length dataset -- confirm when implementing.
  raise NotImplementedError()

@unimplemented
def WEIBULL(x, shape, scale, cumulative):
  """
  Returns the value of the Weibull distribution function (or Weibull cumulative distribution
  function) for a specified shape and scale.
  """
  # Stub only: no implementation exists yet, so the body unconditionally raises.
  raise NotImplementedError()

@unimplemented
def ZTEST(data, value, standard_deviation):
  """Returns the two-tailed P-value of a Z-test with standard distribution."""
  # Stub only: no implementation exists yet, so the body unconditionally raises.
  raise NotImplementedError()