gristlabs_grist-core/sandbox/grist/functions/stats.py
Paul Fitzpatrick b82eec714a (core) move data engine code to core
Summary:
this moves sandbox/grist to core, and adds a requirements.txt
file for reconstructing the content of sandbox/thirdparty.

Test Plan:
existing tests pass.
Tested core functionality manually.  Tested docker build manually.

Reviewers: dsagal

Reviewed By: dsagal

Differential Revision: https://phab.getgrist.com/D2563
2020-07-29 08:57:25 -04:00

616 lines
21 KiB
Python

# pylint: disable=redefined-builtin, line-too-long, unused-argument
from math import _chain, _chain_numeric, _chain_numeric_a
from info import ISNUMBER, ISLOGICAL
from date import DATE # pylint: disable=unused-import
def _average(iterable):
total, count = 0.0, 0
for value in iterable:
total += value
count += 1
return total / count
def _default_if_empty(iterable, default):
"""
Yields all values from iterable, except when it is empty, yields just the single default value.
"""
empty = True
for value in iterable:
empty = False
yield value
if empty:
yield default
def AVEDEV(value1, value2):
"""Calculates the average of the magnitudes of deviations of data from a dataset's mean."""
raise NotImplementedError()
def AVERAGE(value, *more_values):
"""
Returns the numerical average value in a dataset, ignoring non-numerical values.
Each argument may be a value or an array. Values that are not numbers, including logical
and blank values, and text representations of numbers, are ignored.
>>> AVERAGE([2, -1.0, 11])
4.0
>>> AVERAGE([2, -1, 11, "Hello"])
4.0
>>> AVERAGE([2, -1, "Hello", DATE(2015,1,1)], True, [False, "123", "", 11])
4.0
>>> AVERAGE(False, True)
Traceback (most recent call last):
...
ZeroDivisionError: float division by zero
"""
return _average(_chain_numeric(value, *more_values))
def AVERAGEA(value, *more_values):
"""
Returns the numerical average value in a dataset, counting non-numerical values as 0.
Each argument may be a value of an array. Values that are not numbers, including dates and text
representations of numbers, are counted as 0 (zero). Logical value of True is counted as 1, and
False as 0.
>>> AVERAGEA([2, -1.0, 11])
4.0
>>> AVERAGEA([2, -1, 11, "Hello"])
3.0
>>> AVERAGEA([2, -1, "Hello", DATE(2015,1,1)], True, [False, "123", "", 11.5])
1.5
>>> AVERAGEA(False, True)
0.5
"""
return _average(_chain_numeric_a(value, *more_values))
# Note that Google Sheets offers a similar function, called AVERAGE.WEIGHTED
# (https://support.google.com/docs/answer/9084098?hl=en)
def AVERAGE_WEIGHTED(pairs):
"""
Given a list of (value, weight) pairs, finds the average of the values weighted by the
corresponding weights. Ignores any pairs with a non-numerical value or weight.
If you have two lists, of values and weights, use the Python built-in zip() function to create a
list of pairs.
>>> AVERAGE_WEIGHTED(((95, .25), (90, .1), ("X", .5), (85, .15), (88, .2), (82, .3), (70, None)))
87.7
>>> AVERAGE_WEIGHTED(zip([95, 90, "X", 85, 88, 82, 70], [25, 10, 50, 15, 20, 30, None]))
87.7
>>> AVERAGE_WEIGHTED(zip([95, 90, False, 85, 88, 82, 70], [.25, .1, .5, .15, .2, .3, True]))
87.7
"""
sum_value, sum_weight = 0.0, 0.0
for value, weight in pairs:
# The type-checking here is the same as used by _chain_numeric.
if ISNUMBER(value) and not ISLOGICAL(value) and ISNUMBER(weight) and not ISLOGICAL(weight):
sum_value += value * weight
sum_weight += weight
return sum_value / sum_weight
def AVERAGEIF(criteria_range, criterion, average_range=None):
"""Returns the average of a range depending on criteria."""
raise NotImplementedError()
def AVERAGEIFS(average_range, criteria_range1, criterion1, *args):
"""Returns the average of a range depending on multiple criteria."""
raise NotImplementedError()
def BINOMDIST(num_successes, num_trials, prob_success, cumulative):
"""
Calculates the probability of drawing a certain number of successes (or a maximum number of
successes) in a certain number of tries given a population of a certain size containing a
certain number of successes, with replacement of draws.
"""
raise NotImplementedError()
def CONFIDENCE(alpha, standard_deviation, pop_size):
"""Calculates the width of half the confidence interval for a normal distribution."""
raise NotImplementedError()
def CORREL(data_y, data_x):
"""Calculates r, the Pearson product-moment correlation coefficient of a dataset."""
raise NotImplementedError()
def COUNT(value, *more_values):
"""
Returns the count of numerical values in a dataset, ignoring non-numerical values.
Each argument may be a value or an array. Values that are not numbers, including logical
and blank values, and text representations of numbers, are ignored.
>>> COUNT([2, -1.0, 11])
3
>>> COUNT([2, -1, 11, "Hello"])
3
>>> COUNT([2, -1, "Hello", DATE(2015,1,1)], True, [False, "123", "", 11.5])
3
>>> COUNT(False, True)
0
"""
return sum(1 for v in _chain_numeric(value, *more_values))
def COUNTA(value, *more_values):
"""
Returns the count of all values in a dataset, including non-numerical values.
Each argument may be a value or an array.
>>> COUNTA([2, -1.0, 11])
3
>>> COUNTA([2, -1, 11, "Hello"])
4
>>> COUNTA([2, -1, "Hello", DATE(2015,1,1)], True, [False, "123", "", 11.5])
9
>>> COUNTA(False, True)
2
"""
return sum(1 for v in _chain(value, *more_values))
def COVAR(data_y, data_x):
"""Calculates the covariance of a dataset."""
raise NotImplementedError()
def CRITBINOM(num_trials, prob_success, target_prob):
"""Calculates the smallest value for which the cumulative binomial distribution is greater than or equal to a specified criteria."""
raise NotImplementedError()
def DEVSQ(value1, value2):
"""Calculates the sum of squares of deviations based on a sample."""
raise NotImplementedError()
def EXPONDIST(x, lambda_, cumulative):
"""Returns the value of the exponential distribution function with a specified lambda at a specified value."""
raise NotImplementedError()
def F_DIST(x, degrees_freedom1, degrees_freedom2, cumulative):
"""
Calculates the left-tailed F probability distribution (degree of diversity) for two data sets
with given input x. Alternately called Fisher-Snedecor distribution or Snedecor's F
distribution.
"""
raise NotImplementedError()
def F_DIST_RT(x, degrees_freedom1, degrees_freedom2):
"""
Calculates the right-tailed F probability distribution (degree of diversity) for two data sets
with given input x. Alternately called Fisher-Snedecor distribution or Snedecor's F
distribution.
"""
raise NotImplementedError()
def FDIST(x, degrees_freedom1, degrees_freedom2):
"""
Calculates the right-tailed F probability distribution (degree of diversity) for two data sets
with given input x. Alternately called Fisher-Snedecor distribution or Snedecor's F
distribution.
"""
raise NotImplementedError()
def FISHER(value):
"""Returns the Fisher transformation of a specified value."""
raise NotImplementedError()
def FISHERINV(value):
"""Returns the inverse Fisher transformation of a specified value."""
raise NotImplementedError()
def FORECAST(x, data_y, data_x):
"""Calculates the expected y-value for a specified x based on a linear regression of a dataset."""
raise NotImplementedError()
def GEOMEAN(value1, value2):
"""Calculates the geometric mean of a dataset."""
raise NotImplementedError()
def HARMEAN(value1, value2):
"""Calculates the harmonic mean of a dataset."""
raise NotImplementedError()
def HYPGEOMDIST(num_successes, num_draws, successes_in_pop, pop_size):
"""Calculates the probability of drawing a certain number of successes in a certain number of tries given a population of a certain size containing a certain number of successes, without replacement of draws."""
raise NotImplementedError()
def INTERCEPT(data_y, data_x):
"""Calculates the y-value at which the line resulting from linear regression of a dataset will intersect the y-axis (x=0)."""
raise NotImplementedError()
def KURT(value1, value2):
"""Calculates the kurtosis of a dataset, which describes the shape, and in particular the "peakedness" of that dataset."""
raise NotImplementedError()
def LARGE(data, n):
"""Returns the nth largest element from a data set, where n is user-defined."""
raise NotImplementedError()
def LOGINV(x, mean, standard_deviation):
"""Returns the value of the inverse log-normal cumulative distribution with given mean and standard deviation at a specified value."""
raise NotImplementedError()
def LOGNORMDIST(x, mean, standard_deviation):
"""Returns the value of the log-normal cumulative distribution with given mean and standard deviation at a specified value."""
raise NotImplementedError()
def MAX(value, *more_values):
"""
Returns the maximum value in a dataset, ignoring non-numerical values.
Each argument may be a value or an array. Values that are not numbers, including logical
and blank values, and text representations of numbers, are ignored. Returns 0 if the arguments
contain no numbers.
>>> MAX([2, -1.5, 11.5])
11.5
>>> MAX([2, -1.5, "Hello", DATE(2015, 1, 1)], True, [False, "123", "", 11.5])
11.5
>>> MAX(True, -123)
-123
>>> MAX("123", -123)
-123
>>> MAX("Hello", "123", DATE(2015, 1, 1))
0
"""
return max(_default_if_empty(_chain_numeric(value, *more_values), 0))
def MAXA(value, *more_values):
"""
Returns the maximum numeric value in a dataset.
Each argument may be a value of an array. Values that are not numbers, including dates and text
representations of numbers, are counted as 0 (zero). Logical value of True is counted as 1, and
False as 0. Returns 0 if the arguments contain no numbers.
>>> MAXA([2, -1.5, 11.5])
11.5
>>> MAXA([2, -1.5, "Hello", DATE(2015, 1, 1)], True, [False, "123", "", 11.5])
11.5
>>> MAXA(True, -123)
1
>>> MAXA("123", -123)
0
>>> MAXA("Hello", "123", DATE(2015, 1, 1))
0
"""
return max(_default_if_empty(_chain_numeric_a(value, *more_values), 0))
def MEDIAN(value, *more_values):
"""
Returns the median value in a numeric dataset, ignoring non-numerical values.
Each argument may be a value or an array. Values that are not numbers, including logical
and blank values, and text representations of numbers, are ignored.
Produces an error if the arguments contain no numbers.
The median is the middle number when all values are sorted. So half of the values in the dataset
are less than the median, and half of the values are greater. If there is an even number of
values in the dataset, returns the average of the two numbers in the middle.
>>> MEDIAN(1, 2, 3, 4, 5)
3
>>> MEDIAN(3, 5, 1, 4, 2)
3
>>> MEDIAN(xrange(10))
4.5
>>> MEDIAN("Hello", "123", DATE(2015, 1, 1), 12.3)
12.3
>>> MEDIAN("Hello", "123", DATE(2015, 1, 1))
Traceback (most recent call last):
...
ValueError: MEDIAN requires at least one number
"""
values = sorted(_chain_numeric(value, *more_values))
if not values:
raise ValueError("MEDIAN requires at least one number")
count = len(values)
if count % 2 == 0:
return (values[count / 2 - 1] + values[count / 2]) / 2.0
else:
return values[(count - 1) / 2]
def MIN(value, *more_values):
"""
Returns the minimum value in a dataset, ignoring non-numerical values.
Each argument may be a value or an array. Values that are not numbers, including logical
and blank values, and text representations of numbers, are ignored. Returns 0 if the arguments
contain no numbers.
>>> MIN([2, -1.5, 11.5])
-1.5
>>> MIN([2, -1.5, "Hello", DATE(2015, 1, 1)], True, [False, "123", "", 11.5])
-1.5
>>> MIN(True, 123)
123
>>> MIN("-123", 123)
123
>>> MIN("Hello", "123", DATE(2015, 1, 1))
0
"""
return min(_default_if_empty(_chain_numeric(value, *more_values), 0))
def MINA(value, *more_values):
"""
Returns the minimum numeric value in a dataset.
Each argument may be a value of an array. Values that are not numbers, including dates and text
representations of numbers, are counted as 0 (zero). Logical value of True is counted as 1, and
False as 0. Returns 0 if the arguments contain no numbers.
>>> MINA([2, -1.5, 11.5])
-1.5
>>> MINA([2, -1.5, "Hello", DATE(2015, 1, 1)], True, [False, "123", "", 11.5])
-1.5
>>> MINA(True, 123)
1
>>> MINA("-123", 123)
0
>>> MINA("Hello", "123", DATE(2015, 1, 1))
0
"""
return min(_default_if_empty(_chain_numeric_a(value, *more_values), 0))
def MODE(value1, value2):
"""Returns the most commonly occurring value in a dataset."""
raise NotImplementedError()
def NEGBINOMDIST(num_failures, num_successes, prob_success):
"""Calculates the probability of drawing a certain number of failures before a certain number of successes given a probability of success in independent trials."""
raise NotImplementedError()
def NORMDIST(x, mean, standard_deviation, cumulative):
"""
Returns the value of the normal distribution function (or normal cumulative distribution
function) for a specified value, mean, and standard deviation.
"""
raise NotImplementedError()
def NORMINV(x, mean, standard_deviation):
"""Returns the value of the inverse normal distribution function for a specified value, mean, and standard deviation."""
raise NotImplementedError()
def NORMSDIST(x):
"""Returns the value of the standard normal cumulative distribution function for a specified value."""
raise NotImplementedError()
def NORMSINV(x):
"""Returns the value of the inverse standard normal distribution function for a specified value."""
raise NotImplementedError()
def PEARSON(data_y, data_x):
"""Calculates r, the Pearson product-moment correlation coefficient of a dataset."""
raise NotImplementedError()
def PERCENTILE(data, percentile):
"""Returns the value at a given percentile of a dataset."""
raise NotImplementedError()
def PERCENTRANK(data, value, significant_digits=None):
"""Returns the percentage rank (percentile) of a specified value in a dataset."""
raise NotImplementedError()
def PERCENTRANK_EXC(data, value, significant_digits=None):
"""Returns the percentage rank (percentile) from 0 to 1 exclusive of a specified value in a dataset."""
raise NotImplementedError()
def PERCENTRANK_INC(data, value, significant_digits=None):
"""Returns the percentage rank (percentile) from 0 to 1 inclusive of a specified value in a dataset."""
raise NotImplementedError()
def PERMUT(n, k):
"""Returns the number of ways to choose some number of objects from a pool of a given size of objects, considering order."""
raise NotImplementedError()
def POISSON(x, mean, cumulative):
"""
Returns the value of the Poisson distribution function (or Poisson cumulative distribution
function) for a specified value and mean.
"""
raise NotImplementedError()
def PROB(data, probabilities, low_limit, high_limit=None):
"""Given a set of values and corresponding probabilities, calculates the probability that a value chosen at random falls between two limits."""
raise NotImplementedError()
def QUARTILE(data, quartile_number):
"""Returns a value nearest to a specified quartile of a dataset."""
raise NotImplementedError()
def RANK(value, data, is_ascending=None):
"""Returns the rank of a specified value in a dataset."""
raise NotImplementedError()
def RANK_AVG(value, data, is_ascending=None):
"""Returns the rank of a specified value in a dataset. If there is more than one entry of the same value in the dataset, the average rank of the entries will be returned."""
raise NotImplementedError()
def RANK_EQ(value, data, is_ascending=None):
"""Returns the rank of a specified value in a dataset. If there is more than one entry of the same value in the dataset, the top rank of the entries will be returned."""
raise NotImplementedError()
def RSQ(data_y, data_x):
"""Calculates the square of r, the Pearson product-moment correlation coefficient of a dataset."""
raise NotImplementedError()
def SKEW(value1, value2):
"""Calculates the skewness of a dataset, which describes the symmetry of that dataset about the mean."""
raise NotImplementedError()
def SLOPE(data_y, data_x):
"""Calculates the slope of the line resulting from linear regression of a dataset."""
raise NotImplementedError()
def SMALL(data, n):
"""Returns the nth smallest element from a data set, where n is user-defined."""
raise NotImplementedError()
def STANDARDIZE(value, mean, standard_deviation):
"""Calculates the normalized equivalent of a random variable given mean and standard deviation of the distribution."""
raise NotImplementedError()
# This should make us all cry a little. Because the sandbox does not do Python3 (which has
# statistics package), and because it does not do numpy (because it's native and hasn't been built
# for it), we have to implement simple stats functions by hand.
# TODO: switch to use the statistics package instead, once we upgrade to Python3.
#
# The following implementation of stdev is taken from https://stackoverflow.com/a/27758326/328565
def _mean(data):
return sum(data) / float(len(data))
def _ss(data):
"""Return sum of square deviations of sequence data."""
c = _mean(data)
return sum((x-c)**2 for x in data)
def _stddev(data, ddof=0):
"""Calculates the population standard deviation
by default; specify ddof=1 to compute the sample
standard deviation."""
n = len(data)
ss = _ss(data)
pvar = ss/(n-ddof)
return pvar**0.5
# The examples in the doctests below come from https://support.google.com/docs/answer/3094054 and
# related articles, which helps ensure correctness and compatibility.
def STDEV(value, *more_values):
"""
Calculates the standard deviation based on a sample, ignoring non-numerical values.
>>> STDEV([2, 5, 8, 13, 10])
4.277849927241488
>>> STDEV([2, 5, 8, 13, 10, True, False, "Test"])
4.277849927241488
>>> STDEV([2, 5, 8, 13, 10], 3, 12, 15)
4.810702354423639
>>> STDEV([2, 5, 8, 13, 10], [3, 12, 15])
4.810702354423639
>>> STDEV([5])
Traceback (most recent call last):
...
ZeroDivisionError: float division by zero
"""
return _stddev(list(_chain_numeric(value, *more_values)), 1)
def STDEVA(value, *more_values):
"""
Calculates the standard deviation based on a sample, setting text to the value `0`.
>>> STDEVA([2, 5, 8, 13, 10])
4.277849927241488
>>> STDEVA([2, 5, 8, 13, 10, True, False, "Test"])
4.969550137731641
>>> STDEVA([2, 5, 8, 13, 10], 1, 0, 0)
4.969550137731641
>>> STDEVA([2, 5, 8, 13, 10], [1, 0, 0])
4.969550137731641
>>> STDEVA([5])
Traceback (most recent call last):
...
ZeroDivisionError: float division by zero
"""
return _stddev(list(_chain_numeric_a(value, *more_values)), 1)
def STDEVP(value, *more_values):
"""
Calculates the standard deviation based on an entire population, ignoring non-numerical values.
>>> STDEVP([2, 5, 8, 13, 10])
3.8262252939417984
>>> STDEVP([2, 5, 8, 13, 10, True, False, "Test"])
3.8262252939417984
>>> STDEVP([2, 5, 8, 13, 10], 3, 12, 15)
4.5
>>> STDEVP([2, 5, 8, 13, 10], [3, 12, 15])
4.5
>>> STDEVP([5])
0.0
"""
return _stddev(list(_chain_numeric(value, *more_values)), 0)
def STDEVPA(value, *more_values):
"""
Calculates the standard deviation based on an entire population, setting text to the value `0`.
>>> STDEVPA([2, 5, 8, 13, 10])
3.8262252939417984
>>> STDEVPA([2, 5, 8, 13, 10, True, False, "Test"])
4.648588495446763
>>> STDEVPA([2, 5, 8, 13, 10], 1, 0, 0)
4.648588495446763
>>> STDEVPA([2, 5, 8, 13, 10], [1, 0, 0])
4.648588495446763
>>> STDEVPA([5])
0.0
"""
return _stddev(list(_chain_numeric_a(value, *more_values)), 0)
def STEYX(data_y, data_x):
"""Calculates the standard error of the predicted y-value for each x in the regression of a dataset."""
raise NotImplementedError()
def T_INV(probability, degrees_freedom):
"""Calculates the negative inverse of the one-tailed TDIST function."""
raise NotImplementedError()
def T_INV_2T(probability, degrees_freedom):
"""Calculates the inverse of the two-tailed TDIST function."""
raise NotImplementedError()
def TDIST(x, degrees_freedom, tails):
"""Calculates the probability for Student's t-distribution with a given input (x)."""
raise NotImplementedError()
def TINV(probability, degrees_freedom):
"""Calculates the inverse of the two-tailed TDIST function."""
raise NotImplementedError()
def TRIMMEAN(data, exclude_proportion):
"""Calculates the mean of a dataset excluding some proportion of data from the high and low ends of the dataset."""
raise NotImplementedError()
def TTEST(range1, range2, tails, type):
"""Returns the probability associated with t-test. Determines whether two samples are likely to have come from the same two underlying populations that have the same mean."""
raise NotImplementedError()
def VAR(value1, value2):
"""Calculates the variance based on a sample."""
raise NotImplementedError()
def VARA(value1, value2):
"""Calculates an estimate of variance based on a sample, setting text to the value `0`."""
raise NotImplementedError()
def VARP(value1, value2):
"""Calculates the variance based on an entire population."""
raise NotImplementedError()
def VARPA(value1, value2):
"""Calculates the variance based on an entire population, setting text to the value `0`."""
raise NotImplementedError()
def WEIBULL(x, shape, scale, cumulative):
"""
Returns the value of the Weibull distribution function (or Weibull cumulative distribution
function) for a specified shape and scale.
"""
raise NotImplementedError()
def ZTEST(data, value, standard_deviation):
"""Returns the two-tailed P-value of a Z-test with standard distribution."""
raise NotImplementedError()