Source code for varats.data.metrics

"""This module contains functions that calculate various metrics on data."""
import typing as tp

import numpy as np
import numpy.typing as npt
import pandas as pd


def lorenz_curve(data: pd.Series) -> pd.Series:
    """
    Compute the values of the Lorenz curve for the given data.

    Every entry of the result is the running total of ``data`` up to and
    including that position, divided by the total of all values. For more
    information see online
    `lorenz curve <https://en.wikipedia.org/wiki/Lorenz_curve>`_.

    Args:
        data: series to calculate the lorenz curve for; it is expected to be
              sorted already, this function does not sort it

    Returns:
        the values of the lorenz curve as a series
    """
    total = data.sum()
    running_totals = data.cumsum()
    return tp.cast(pd.Series, running_totals / total)
def gini_coefficient(distribution: pd.Series) -> float:
    """
    Calculates the Gini coefficient of the data.

    The Gini coefficient is half of the mean absolute difference between all
    pairs of values, divided by the mean of the values. Instead of building
    the full ``n x n`` matrix of pairwise differences (quadratic memory), the
    values are sorted and the equivalent rank-weighted sum is used::

        sum_{i,j} |x_i - x_j| = 2 * sum_i (2 * i - n - 1) * x_(i)

    where ``x_(i)`` is the i-th smallest value (1-based). This needs
    ``O(n log n)`` time and ``O(n)`` memory.

    For more information see online
    `Gini coefficient <https://en.wikipedia.org/wiki/Gini_coefficient>`_.

    Args:
        distribution: series to calculate the Gini coefficient for; the
                      values are sorted internally, so any order is accepted

    Returns:
        the Gini coefficient for the data; if the data is empty or its values
        sum to zero the division follows numpy semantics and yields ``nan``
        or ``inf``
    """
    sorted_values: npt.NDArray[np.float64] = np.sort(
        np.asarray(distribution, dtype=np.float64)
    )
    n = sorted_values.size

    # Weight of the i-th smallest value (1-based): 2 * i - n - 1.
    rank_weights = 2 * np.arange(1, n + 1) - n - 1
    weighted_sum = np.dot(rank_weights, sorted_values)

    # 0.5 * (2 * weighted_sum / n**2) / (sum / n) == weighted_sum / (n * sum).
    # Both operands are numpy scalars, so a zero denominator gives nan/inf
    # instead of raising ZeroDivisionError.
    return float(weighted_sum / (n * sorted_values.sum()))
def normalized_gini_coefficient(distribution: pd.Series) -> float:
    """
    Calculates the normalized Gini coefficient of the given data, i.e.,
    ``gini(data) * (n / (n - 1))`` where ``n`` is the length of the data.

    For data with at most one element the correction factor is not applied
    and the plain Gini coefficient is returned.

    Args:
        distribution: sorted series to calculate the normalized Gini
                      coefficient for

    Returns:
        the normalized Gini coefficient for the data
    """
    sample_count = float(len(distribution))
    plain_gini = gini_coefficient(distribution)
    if sample_count > 1:
        return plain_gini * (sample_count / (sample_count - 1.0))
    return plain_gini
def apply_tukeys_fence(
    data: pd.DataFrame, column: str, k: float
) -> pd.DataFrame:
    """
    Removes rows which are outliers in the given column using Tukey's fence.

    Tukey's fence defines all values to be outliers that are outside the range
    `[q1 - k * (q3 - q1), q3 + k * (q3 - q1)]`, i.e., values that are further
    than `k` times the interquartile range away from the first or third
    quartile. Values lying exactly on a bound are kept.

    Common values for ``k``:

    - ``2.2``: “Fine-Tuning Some Resistant Rules for Outlier Labeling”,
      Hoaglin and Iglewicz (1987)
    - ``1.5``: outliers, “Exploratory Data Analysis”, John W. Tukey (1977)
    - ``3.0``: far out outliers, “Exploratory Data Analysis”,
      John W. Tukey (1977)

    Args:
        data: data to remove outliers from
        column: column to use for outlier detection
        k: multiplicative factor on the inter-quartile-range

    Returns:
        the data without outliers

    Test:
    >>> apply_tukeys_fence(pd.DataFrame({'foo': [1,1,2,2,10]})
    ...     .rename_axis('cols', axis=1), 'foo', 3)
    cols  foo
    0       1
    1       1
    2       2
    3       2
    """
    column_values = data[column]
    lower_quartile = column_values.quantile(0.25)
    upper_quartile = column_values.quantile(0.75)
    fence_width = k * (upper_quartile - lower_quartile)

    # Both comparisons are inclusive so that boundary values survive.
    not_too_small = column_values >= lower_quartile - fence_width
    not_too_large = column_values <= upper_quartile + fence_width
    return tp.cast(pd.DataFrame, data.loc[not_too_small & not_too_large])
def min_max_normalize(values: pd.Series) -> pd.Series:
    """
    Min-Max normalize a series.

    Every value is shifted by the series minimum and divided by the distance
    between the series maximum and minimum. No special handling is performed
    when that distance is zero.

    Args:
        values: the series to normalize

    Returns:
        the normalized series

    Test:
    >>> min_max_normalize(pd.Series([1,2,3]))
    0    0.0
    1    0.5
    2    1.0
    dtype: float64
    """
    upper = values.max()
    lower = values.min()
    value_range = upper - lower
    shifted = values - lower
    return tp.cast(pd.Series, shifted / value_range)
T = tp.TypeVar("T")


class ConfusionMatrix(tp.Generic[T]):
    """
    Helper class to automatically calculate classification results.

    +---------------------+-------------------------+-------------------------+
    |                     | Predicted Positive (PP) | Predicted Negative (PN) |
    +---------------------+-------------------------+-------------------------+
    | Actual Positive (P) | True Positive (TP)      | False Negative (FN)     |
    | Actual Negative (N) | False Positive (FP)     | True Negative (TN)      |
    +---------------------+-------------------------+-------------------------+

    The base counts ``P``, ``N``, ``PP`` and ``PN`` are the lengths of the
    lists passed to the constructor. ``TP`` and ``TN`` are the sizes of the
    sets returned by :meth:`getTPs` and :meth:`getTNs`. ``FP`` and ``FN`` are
    derived by subtraction (``PP - TP`` and ``PN - TN``) and not from
    :meth:`getFPs` and :meth:`getFNs`.

    Reference: https://en.wikipedia.org/wiki/Precision_and_recall
    """

    def __init__(
        self, actual_positive_values: tp.List[T],
        actual_negative_values: tp.List[T],
        predicted_positive_values: tp.List[T],
        predicted_negative_values: tp.List[T]
    ) -> None:
        self.__actual_positive_values = actual_positive_values
        self.__actual_negative_values = actual_negative_values
        self.__predicted_positive_values = predicted_positive_values
        self.__predicted_negative_values = predicted_negative_values

    ###################
    # Base metrics

    @property
    def P(self) -> int:  # pylint: disable=C0103
        """Number of actual positive values."""
        return len(self.__actual_positive_values)

    @property
    def N(self) -> int:  # pylint: disable=C0103
        """Number of actual negative values."""
        return len(self.__actual_negative_values)

    @property
    def PP(self) -> int:  # pylint: disable=C0103
        """Number of predicted positive values."""
        return len(self.__predicted_positive_values)

    @property
    def PN(self) -> int:  # pylint: disable=C0103
        """Number of predicted negative values."""
        return len(self.__predicted_negative_values)

    ###################
    # Combined metrics

    @property
    def TP(self) -> int:  # pylint: disable=C0103
        """Number of distinct true positives."""
        true_positives = self.getTPs()
        return len(true_positives)

    @property
    def TN(self) -> int:  # pylint: disable=C0103
        """Number of distinct true negatives."""
        true_negatives = self.getTNs()
        return len(true_negatives)

    @property
    def FP(self) -> int:  # pylint: disable=C0103
        """Predicted positives that are not counted as true positives."""
        return self.PP - self.TP

    @property
    def FN(self) -> int:  # pylint: disable=C0103
        """Predicted negatives that are not counted as true negatives."""
        return self.PN - self.TN

    ###################
    # Combined values

    def getTPs(self) -> tp.Set[T]:  # pylint: disable=C0103
        """Values that are both actually positive and predicted positive."""
        actual_positives = set(self.__actual_positive_values)
        return actual_positives.intersection(self.__predicted_positive_values)

    def getTNs(self) -> tp.Set[T]:  # pylint: disable=C0103
        """Values that are both actually negative and predicted negative."""
        actual_negatives = set(self.__actual_negative_values)
        return actual_negatives.intersection(self.__predicted_negative_values)

    def getFPs(self) -> tp.Set[T]:  # pylint: disable=C0103
        """Values that are predicted positive but actually negative."""
        predicted_positives = set(self.__predicted_positive_values)
        return predicted_positives.intersection(self.__actual_negative_values)

    def getFNs(self) -> tp.Set[T]:  # pylint: disable=C0103
        """Values that are predicted negative but actually positive."""
        predicted_negatives = set(self.__predicted_negative_values)
        return predicted_negatives.intersection(self.__actual_positive_values)

    ###################
    # Interpretations

    def precision(self) -> float:
        """Positive predictive value (PPV); ``nan`` without predicted
        positives."""
        return float('nan') if self.PP == 0 else self.TP / self.PP

    def recall(self) -> float:
        """True positive rate (TPR); ``nan`` without actual positives."""
        return float('nan') if self.P == 0 else self.TP / self.P

    def specificity(self) -> float:
        """True negative rate (TNR); ``nan`` without actual negatives."""
        return float('nan') if self.N == 0 else self.TN / self.N

    def accuracy(self) -> float:
        """Accuracy (ACC); ``nan`` when there are no actual values at all."""
        if (self.P + self.N) != 0:
            return (self.TP + self.TN) / (self.P + self.N)
        return float('nan')

    def balanced_accuracy(self) -> float:
        """
        Balanced accuracy (BA)/(bACC)

        Balanced accuracy can serve as an overall performance metric for a
        model, whether or not the true labels are imbalanced in the data,
        assuming the cost of FN is the same as FP. It is the arithmetic mean
        of recall and specificity.
        """
        rate_sum = self.recall() + self.specificity()
        return rate_sum / 2

    def f1_score(self) -> float:
        """In statistical analysis of binary classification, the F-score or
        F-measure is a measure of a test's accuracy; computed here as
        ``2 * TP / (2 * TP + FP + FN)`` and ``nan`` if that denominator is
        zero."""
        doubled_true_positives = 2 * self.TP
        total = 2 * self.TP + self.FP + self.FN
        if total == 0.0:
            return float('nan')
        return doubled_true_positives / total

    ###################
    # python underscore methods

    def __repr__(self) -> str:
        return str(self)

    def __str__(self) -> str:
        # NOTE(review): the whitespace inside this text (line breaks, the
        # two-space indent and the padding after the labels) was reconstructed
        # from a listing that had collapsed all whitespace - confirm against
        # the original file.
        summary_lines = [
            f"ConfM[TP={self.TP}, TN={self.TN}, FP={self.FP}, FN={self.FN}]",
            f"  ├─ Precision: {self.precision()}",
            f"  ├─ Recall:    {self.recall()}",
            f"  ├─ Accuracy:  {self.accuracy()}",
            f"  └─ F1_Score:  {self.f1_score()}",
            "",
        ]
        return "\n".join(summary_lines)