# Source code for varats.data.metrics

"""This module contains functions that calculate various metrics on data."""
import typing as tp

import numpy as np
import numpy.typing as npt
import pandas as pd



# [docs]
def lorenz_curve(data: pd.Series) -> pd.Series:
    """
    Computes the points of the Lorenz curve for the given data.

    Each entry of the result is the running total of ``data`` up to and
    including that position, divided by the overall total of ``data``.

    For more information see online
    `lorenz curve <https://en.wikipedia.org/wiki/Lorenz_curve>`_.

    Args:
        data: sorted series to calculate the lorenz curve for

    Returns:
        the values of the lorenz curve as a series
    """
    total = data.sum()
    running_total = data.cumsum()
    # The cast only informs the type checker; it does nothing at runtime.
    return tp.cast(pd.Series, running_total / total)




# [docs]
def gini_coefficient(distribution: pd.Series) -> float:
    """
    Computes the Gini coefficient of the given data.

    The coefficient is half of the relative mean absolute difference: the
    mean of the absolute differences between all pairs of values, divided by
    the mean of the values. All pairs are materialized at once, so an
    ``n x n`` array is built for a series of length ``n``.

    For more information see online
    `Gini coefficient <https://en.wikipedia.org/wiki/Gini_coefficient>`_.

    Args:
        distribution: sorted series to calculate the Gini coefficient for

    Returns:
        the Gini coefficient for the data
    """
    values: npt.NDArray[np.float64] = np.array(distribution)
    # Absolute difference of every value with every other value (and itself).
    pairwise_abs_diff = np.abs(np.subtract.outer(values, values))
    relative_mean_abs_diff = pairwise_abs_diff.mean() / np.mean(values)
    return 0.5 * float(relative_mean_abs_diff)




# [docs]
def normalized_gini_coefficient(distribution: pd.Series) -> float:
    """
    Computes the normalized Gini coefficient of the given data, i.e.,
    ``gini(data) * (n / (n - 1))`` where ``n`` is the length of the data.

    For data with at most one element the plain Gini coefficient is returned
    unchanged, since the correction factor would divide by zero for ``n = 1``.

    Args:
        distribution: sorted series to calculate the normalized Gini coefficient
                      for

    Returns:
        the normalized Gini coefficient for the data
    """
    sample_count = float(len(distribution))
    gini = gini_coefficient(distribution)

    if sample_count <= 1:
        return gini

    correction = sample_count / (sample_count - 1.0)
    return gini * correction




# [docs]
def apply_tukeys_fence(
    data: pd.DataFrame, column: str, k: float
) -> pd.DataFrame:
    """
    Drops all rows whose value in the given column is an outlier according to
    Tukey's fence.

    With ``q1`` and ``q3`` being the first and third quartile of the column,
    a value is kept if it lies inside the closed range
    ``[q1 - k * (q3 - q1), q3 + k * (q3 - q1)]``. Everything further than
    ``k`` times the interquartile range away from the first or third quartile
    is treated as an outlier and removed.

    Common values for ``k``:

    +-----+---------------------------------------------------------------+
    | 2.2 | (“Fine-Tuning Some Resistant Rules for Outlier Labeling”,     |
    |     |   Hoaglin and Iglewicz (1987))                                |
    +-----+---------------------------------------------------------------+
    | 1.5 | (outliers, “Exploratory Data Analysis”, John W. Tukey (1977)) |
    +-----+---------------------------------------------------------------+
    | 3.0 | (far out outliers, “Exploratory Data Analysis”,               |
    |     |  John W. Tukey (1977))                                        |
    +-----+---------------------------------------------------------------+

    Args:
        data: data to remove outliers from
        column: column to use for outlier detection
        k: multiplicative factor on the inter-quartile-range

    Returns:
        the data without outliers

    Test:
    >>> apply_tukeys_fence(pd.DataFrame({'foo': [1,1,2,2,10]})
    ...                    .rename_axis('cols', axis=1), 'foo', 3)
    cols  foo
    0       1
    1       1
    2       2
    3       2
    """
    observations = data[column]
    lower_quartile = observations.quantile(0.25)
    upper_quartile = observations.quantile(0.75)

    # Distance the fences extend beyond the quartiles.
    fence_offset = k * (upper_quartile - lower_quartile)
    lower_fence = lower_quartile - fence_offset
    upper_fence = upper_quartile + fence_offset

    # Both fences are inclusive: values exactly on a fence are kept.
    inside_fence = (observations >= lower_fence) & (
        observations <= upper_fence
    )
    return tp.cast(pd.DataFrame, data.loc[inside_fence])




# [docs]
def min_max_normalize(values: pd.Series) -> pd.Series:
    """
    Rescales a series with min-max normalization.

    Every value is shifted by the series' minimum and divided by the
    difference between its maximum and minimum. If all values are equal that
    difference is zero, so no meaningful normalization is produced.

    Args:
        values: the series to normalize

    Returns:
        the normalized series

    Test:
    >>> min_max_normalize(pd.Series([1,2,3]))
    0    0.0
    1    0.5
    2    1.0
    dtype: float64
    """
    lowest = values.min()
    value_range = values.max() - lowest
    shifted = values - lowest
    return tp.cast(pd.Series, shifted / value_range)



# Type of the classified items stored in a ``ConfusionMatrix``.
T = tp.TypeVar("T")


class ConfusionMatrix(tp.Generic[T]):
    """
    Helper class to automatically calculate classification results.

    +---------------------+-------------------------+-------------------------+
    |                     | Predicted Positive (PP) | Predicted Negative (PN) |
    +---------------------+-------------------------+-------------------------+
    | Actual Positive (P) | True Positive      (TP) | False Negative     (FN) |
    | Actual Negative (N) | False Positive     (FP) | True Negative      (TN) |
    +---------------------+-------------------------+-------------------------+

    The base counts (``P``, ``N``, ``PP``, ``PN``) are the lengths of the lists
    passed to the constructor, while ``TP`` and ``TN`` are sizes of set
    intersections. ``FP`` and ``FN`` are derived from these counts by
    subtraction, so the counts are only consistent with each other when the
    input lists contain no duplicate items.

    Reference: https://en.wikipedia.org/wiki/Precision_and_recall
    """

    def __init__(
        self, actual_positive_values: tp.List[T],
        actual_negative_values: tp.List[T],
        predicted_positive_values: tp.List[T],
        predicted_negative_values: tp.List[T]
    ) -> None:
        """
        Stores the ground truth and the prediction of a classification.

        Args:
            actual_positive_values: items that are actually positive
            actual_negative_values: items that are actually negative
            predicted_positive_values: items that were predicted as positive
            predicted_negative_values: items that were predicted as negative
        """
        self.__actual_positive_values = actual_positive_values
        self.__actual_negative_values = actual_negative_values
        self.__predicted_positive_values = predicted_positive_values
        self.__predicted_negative_values = predicted_negative_values

    ###################
    # Base metrics

    @property
    def P(self) -> int:  # pylint: disable=C0103
        """Number of actual positive values."""
        return len(self.__actual_positive_values)

    @property
    def N(self) -> int:  # pylint: disable=C0103
        """Number of actual negative values."""
        return len(self.__actual_negative_values)

    @property
    def PP(self) -> int:  # pylint: disable=C0103
        """Number of predicted positive values."""
        return len(self.__predicted_positive_values)

    @property
    def PN(self) -> int:  # pylint: disable=C0103
        """Number of predicted negative values."""
        return len(self.__predicted_negative_values)

    ###################
    # Combined metrics

    @property
    def TP(self) -> int:  # pylint: disable=C0103
        """Number of true positives, i.e., the size of ``getTPs()``."""
        return len(self.getTPs())

    @property
    def TN(self) -> int:  # pylint: disable=C0103
        """Number of true negatives, i.e., the size of ``getTNs()``."""
        return len(self.getTNs())

    @property
    def FP(self) -> int:  # pylint: disable=C0103
        """Number of false positives, computed as ``PP - TP``."""
        return self.PP - self.TP

    @property
    def FN(self) -> int:  # pylint: disable=C0103
        """Number of false negatives, computed as ``PN - TN``."""
        return self.PN - self.TN

    ###################
    # Combined values

    def getTPs(self) -> tp.Set[T]:  # pylint: disable=C0103
        """Items that are actually positive and were predicted positive."""
        return set(self.__actual_positive_values
                  ).intersection(self.__predicted_positive_values)

    def getTNs(self) -> tp.Set[T]:  # pylint: disable=C0103
        """Items that are actually negative and were predicted negative."""
        return set(self.__actual_negative_values
                  ).intersection(self.__predicted_negative_values)

    def getFPs(self) -> tp.Set[T]:  # pylint: disable=C0103
        """Items that were predicted positive but are actually negative."""
        return set(self.__predicted_positive_values
                  ).intersection(self.__actual_negative_values)

    def getFNs(self) -> tp.Set[T]:  # pylint: disable=C0103
        """Items that were predicted negative but are actually positive."""
        return set(self.__predicted_negative_values
                  ).intersection(self.__actual_positive_values)

    ###################
    # Interpretations

    def precision(self) -> float:
        """
        Positive predictive value (PPV)

        Returns:
            ``TP / PP`` or ``nan`` if nothing was predicted positive
        """
        if self.PP == 0:
            return float('nan')

        return self.TP / self.PP

    def recall(self) -> float:
        """
        True positive rate (TPR)

        Returns:
            ``TP / P`` or ``nan`` if there are no actual positives
        """
        if self.P == 0:
            return float('nan')

        return self.TP / self.P

    def specificity(self) -> float:
        """
        True negative rate (TNR)

        Returns:
            ``TN / N`` or ``nan`` if there are no actual negatives
        """
        if self.N == 0:
            return float('nan')

        return self.TN / self.N

    def accuracy(self) -> float:
        """
        Accuracy (ACC)

        Returns:
            ``(TP + TN) / (P + N)`` or ``nan`` if there are no actual values
        """
        if (self.P + self.N) == 0:
            return float('nan')

        return (self.TP + self.TN) / (self.P + self.N)

    def balanced_accuracy(self) -> float:
        """
        Balanced accuracy (BA)/(bACC)

        Balanced accuracy can serve as an overall performance metric for a
        model, whether or not the true labels are imbalanced in the data,
        assuming the cost of FN is the same as FP.

        Returns:
            the mean of recall and specificity; ``nan`` if either of them is
            ``nan``
        """
        return (self.recall() + self.specificity()) / 2

    def f1_score(self) -> float:
        """
        In statistical analysis of binary classification, the F-score or
        F-measure is a measure of a test's accuracy.

        Returns:
            ``2 * TP / (2 * TP + FP + FN)`` or ``nan`` if that denominator is
            zero
        """
        numerator = 2 * self.TP
        denominator = 2 * self.TP + self.FP + self.FN
        if denominator == 0.0:
            return float('nan')

        return numerator / denominator

    ###################
    # python underscore methods
    def __repr__(self) -> str:
        return str(self)

    def __str__(self) -> str:
        return f"""ConfM[TP={self.TP}, TN={self.TN}, FP={self.FP}, FN={self.FN}]
  ├─ Precision: {self.precision()}
  ├─ Recall:    {self.recall()}
  ├─ Accuracy:  {self.accuracy()}
  └─ F1_Score:  {self.f1_score()}
"""