"""
Contains the tester base class and several simple implementations
that simplify the work with statistical hypothesis tests.
"""
import warnings
import temci.utils.util as util
import typing as t
if util.can_import("scipy"):
import scipy.stats as st
import scipy.optimize as opti
import numpy as np
from temci.utils.typecheck import *
from temci.utils.registry import AbstractRegistry, register
import logging
Number = t.Union[int, float]


class TesterRegistry(AbstractRegistry):
settings_key_path = "stats"
use_key = "tester"
use_list = False
default = "t"
registry = {}
plugin_synonym = ("tester", "testers")
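
# Testers register themselves here; judging from the attributes above, the used tester
# is configured under the "stats" settings (key "tester", default "t").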

class Tester(object, metaclass=util.Singleton):
    """
    A tester tests the probability of the null hypothesis for two same-length lists of observations.
    This is a base class that shouldn't be instantiated directly.
    """
    scipy_stat_method = None  # type: t.Optional[str]
    """ Name of the scipy.stats method to use, if _test_impl isn't overridden """
name = ""
""" Name of the implemented statistical test """
    def __init__(self, misc_settings: dict, uncertainty_range: t.Tuple[float, float]):
        """
        Creates a new instance.

        :param misc_settings: additional settings
        :param uncertainty_range: (start, end) probability tuple that gives the range in which
                                  the tester doesn't give a definitive result on the null hypothesis check
        """
        assert isinstance(uncertainty_range, Tuple(Float(), Float()))
        self.uncertainty_range = uncertainty_range
        """
        (start, end) probability tuple that gives the range in which the tester doesn't give
        a definitive result on the null hypothesis check
        """
        self.misc_settings = misc_settings
        """ Additional settings """

    def test(self, data1: t.List[Number], data2: t.List[Number]) -> float:
        """
        Calculates the probability of the null hypothesis for two samples,
        truncating both to their common length.
        """
        min_len = min(len(data1), len(data2))
        with warnings.catch_warnings(record=True):
            return self._test_impl(data1[0:min_len], data2[0:min_len])

    def _test_impl(self, data1: t.List[Number], data2: t.List[Number]) -> float:
        """
        Calculates the probability of the null hypothesis for two equal-sized samples.
        """
        assert self.scipy_stat_method
        # the used scipy.stats test functions return a tuple whose last element is the p value
        return getattr(st, self.scipy_stat_method)(data1, data2)[-1]

    def is_uncertain(self, data1: t.List[Number], data2: t.List[Number]) -> bool:
        """ Does the probability of the null hypothesis for two samples lie in the uncertainty range? """
        val = self.test(data1, data2)
        # empty samples and NaN p values (val != val) also count as uncertain
        return min(len(data1), len(data2)) == 0 or \
               self.uncertainty_range[0] <= val <= self.uncertainty_range[1] or \
               val != val

    def is_equal(self, data1: t.List[Number], data2: t.List[Number]) -> bool:
        """ Are the two samples not significantly unequal regarding the probability of the null hypothesis? """
        return self.test(data1, data2) > max(*self.uncertainty_range)

    def is_unequal(self, data1: t.List[Number], data2: t.List[Number]) -> bool:
        """ Are the two samples significantly unequal regarding the probability of the null hypothesis? """
        return self.test(data1, data2) < min(*self.uncertainty_range)
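
    # Taken together, the three predicates partition the p value range; e.g. with a
    # hypothetical uncertainty_range of (0.05, 0.15):
    #   p < 0.05          -> is_unequal() is True
    #   0.05 <= p <= 0.15 -> is_uncertain() is True (as are empty samples and NaN p values)
    #   p > 0.15          -> is_equal() is True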

    def estimate_needed_runs(self, data1: list, data2: list,
                             run_bin_size: int, min_runs: int,
                             max_runs: int) -> int:
        """
        Calculates an approximation of the number of observations both samples need
        for the p value to lie outside the uncertainty range.
        It uses the simple observation that the graph of the p value plotted against
        the size of the samples has an exponential, logarithmic or root shape.

        :warning: Doesn't work well.

        :param data1: list of observations
        :param data2: list of observations
        :param run_bin_size: granularity of the observations (> 0)
        :param min_runs: minimum number of allowed runs
        :param max_runs: maximum number of allowed runs
        :return: approximation of the needed runs or float("inf")
        """
#print("###", max_runs)
if data1 == data2:
#logging.error("equal")
return min_runs
min_len = min(len(data1), len(data2))
#print("##", max_runs)
if min_len <= 5:
return max_runs
        # p values of the test on all sample prefixes of length 2 .. min_len - 1
        x_space = np.linspace(0, min_len - 2, min_len - 2)
        yn = [self.test(data1[0:i], data2[0:i]) for i in range(2, min_len)]
        def interpolate(func, name: str):
            """ Fits func to the p value curve and returns the first sample size whose
                extrapolated p value lies outside the uncertainty range. """
            try:
                popt, pcov = opti.curve_fit(func, x_space, yn, maxfev=10000)
                for i in range(min_len, max_runs + 1, run_bin_size):
                    ith = func(i, *popt)
                    if ith > max(self.uncertainty_range) or ith < min(self.uncertainty_range):
                        return i
                return max_runs
            except (TypeError, RuntimeWarning, RuntimeError) as err:
                logging.info("Interpolating {} with {} data points gave "
                             "the following error: {}".format(name, min_len, str(err)))
                return float("inf")
        # the p value curve is modelled as a * np.exp(-b * x) + c
        funcs = [
            (lambda x, a, b, c: a * np.exp(-b * x) + c, "exponential function")
        ]
        with warnings.catch_warnings(record=True):
            return min(interpolate(*f) for f in funcs)
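
    # Sketch of the extrapolation (hypothetical numbers): with an uncertainty_range of
    # (0.05, 0.15) and a fitted curve p(n) = 0.9 * exp(-0.02 * n) + 0.01, the p value
    # first drops below 0.05 at n ≈ 156, so roughly that many runs would be returned
    # (rounded up by the run_bin_size stepping).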
def __eq__(self, other) -> bool:
return isinstance(other, type(self))
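
# Minimal usage sketch (hypothetical sample values; in temci the tester is normally
# obtained via the TesterRegistry from the settings):
#
#     tester = TTester(misc_settings={}, uncertainty_range=(0.05, 0.15))
#     p = tester.test([5.1, 5.3, 5.2], [5.9, 6.1, 6.0])              # p value of the t-test
#     unequal = tester.is_unequal([5.1, 5.3, 5.2], [5.9, 6.1, 6.0])  # True iff p < 0.05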
[docs]@register(TesterRegistry, name="t", misc_type=Dict())
class TTester(Tester):
"""
Tester that uses the student's t test.
"""
scipy_stat_method = "ttest_ind"
name = "t"
[docs]@register(TesterRegistry, name="ks", misc_type=Dict())
class KSTester(Tester):
"""
Tester that uses the Kolmogorov-Smirnov statistic on 2 samples.
"""
scipy_stat_method = "ks_2samp"
name = "kolmogorov smirnov"
[docs]@register(TesterRegistry, name="anderson", misc_type=Dict())
class AndersonTester(Tester):
"""
Tester that uses the Anderson statistic on 2 samples.
"""
scipy_stat_method = "anderson_ksamp"
name = "anderson"
def _test_impl(self, data1: t.List[Number], data2: t.List[Number]) -> float:
return max(st.anderson_ksamp([data1, data2])[-1], 1)
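

# The registered testers share the same interface, so they can be compared directly
# (hypothetical sketch with made-up samples):
#
#     a, b = [5.1, 5.3, 5.2, 5.4], [5.9, 6.1, 6.0, 5.8]
#     for cls in (TTester, KSTester, AndersonTester):
#         print(cls.name, cls(misc_settings={}, uncertainty_range=(0.05, 0.15)).test(a, b))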