# Source code for varats.data.databases.evaluationdatabase

"""Module for the base Database class."""
import abc
import typing as tp

import pandas as pd
from pygtrie import CharTrie

from varats.data.cache_helper import get_data_file_path
from varats.mapping.commit_map import CommitMap
from varats.paper.case_study import CaseStudy
from varats.utils.git_util import ShortCommitHash

# Unconstrained type variable. NOTE(review): it is not referenced in the part
# of the module visible here -- presumably kept for typing column selections
# elsewhere; confirm before removing.
AvailableColumns = tp.TypeVar("AvailableColumns")


class EvaluationDatabase(abc.ABC):
    """
    Base class for accessing report data.

    Subclasses have to provide the following:
        - an identifier for cache files, passed as the ``cache_id`` class
          keyword argument and stored in ``CACHE_ID``
        - the types of their additional columns, passed as the
          ``column_types`` class keyword argument; they are merged into
          ``COLUMN_TYPES`` after the inherited entries and ``COLUMNS`` is
          derived from the keys of the merged mapping
        - a function :func:`_load_dataframe` that loads and transparently
          caches report data
    """

    CACHE_ID: str
    COLUMN_TYPES = {"revision": 'str', "time_id": 'int32'}
    COLUMNS: tp.List[str]

    @classmethod
    def __init_subclass__(
        cls, *args: tp.Any, cache_id: str, column_types: tp.Dict[str, str],
        **kwargs: tp.Any
    ) -> None:
        """
        Register the cache id and the column layout of a new subclass.

        Args:
            cache_id: identifier used for the cache files of the subclass
            column_types: mapping from column name to dtype name for the
                          columns the subclass adds (or overrides)
        """
        super().__init_subclass__(*args, **kwargs)
        cls.CACHE_ID = cache_id
        # Inherited columns come first; the subclass' own columns follow.
        cls.COLUMN_TYPES = {**cls.COLUMN_TYPES, **column_types}
        cls.COLUMNS = list(cls.COLUMN_TYPES.keys())

    @classmethod
    @abc.abstractmethod
    def _load_dataframe(
        cls, project_name: str, commit_map: CommitMap,
        case_study: tp.Optional[CaseStudy], **kwargs: tp.Dict[str, tp.Any]
    ) -> pd.DataFrame:
        """
        Load and transparently cache the dataframe for this database class.

        NOTE: this function is not intended for external use.
        Use :func:`get_data_for_project` instead.

        Args:
            project_name: the project to load data for
            commit_map: the commit map to use
            case_study: the case_study to load data for
            kwargs: additional arguments used to load data

        Return:
            a pandas dataframe with all the cached data
        """

    @classmethod
    def __get_data_for_case_study(
        cls, project_name: str, columns: tp.List[str], commit_map: CommitMap,
        case_study: tp.Optional[CaseStudy], **kwargs: tp.Any
    ) -> pd.DataFrame:
        """
        Load the data for a single (optional) case study.

        Args:
            project_name: the project to retrieve data for
            columns: the columns the resulting dataframe should have
            commit_map: the commit map to use
            case_study: the case study to restrict the data to; ``None``
                        keeps all rows
            kwargs: additional arguments that are passed to
                    :func:`_load_dataframe()`

        Return:
            the loaded dataframe, reduced to ``columns`` and to the
            revisions of ``case_study`` if one was given

        Raises:
            AssertionError: if the loaded dataframe's columns differ from
                            ``COLUMNS``
            ValueError: if ``columns`` contains a name not in ``COLUMNS``
        """
        data: pd.DataFrame = cls._load_dataframe(
            project_name, commit_map, case_study, **kwargs
        )

        # Iterating a dataframe yields its column labels.
        if not [*data] == cls.COLUMNS:
            raise AssertionError(
                "Loaded dataframe does not match expected layout. "
                "Consider removing the cache file "
                f"{get_data_file_path(cls.CACHE_ID, project_name)}."
            )

        if not all(column in cls.COLUMNS for column in columns):
            raise ValueError(
                f"All values in 'columns' must be in {cls.__name__}.COLUMNS"
            )

        def cs_filter(data_frame: pd.DataFrame) -> pd.DataFrame:
            """Filter out all commits that are not in the case study if one
            was selected."""
            if case_study is None or data_frame.empty:
                return data_frame

            # use a trie for fast prefix lookup
            revisions = CharTrie()
            for revision in case_study.revisions:
                revisions[revision.hash] = True

            # NOTE(review): has_node() appears to report integer flags
            # rather than a plain bool (see the pygtrie docs), so the
            # comparison against 0 is what turns the result into a boolean
            # mask -- keep it.
            return data_frame[data_frame["revision"].apply(
                lambda x: revisions.has_node(x.hash) != 0
            )]

        # Convert all revisions to ShortCommitHash(es)
        # NOTE(review): this rewrites the column on the frame returned by
        # _load_dataframe in place -- confirm loaders hand out a fresh frame.
        data['revision'] = data['revision'].apply(ShortCommitHash)

        data = cs_filter(data)

        return data[columns]

    @classmethod
    def get_data_for_project(
        cls, project_name: str, columns: tp.List[str], commit_map: CommitMap,
        *case_studies: CaseStudy, **kwargs: tp.Any
    ) -> pd.DataFrame:
        """
        Retrieve data for a given project and case study.

        Args:
            project_name: the project to retrieve data for
            columns: the columns the resulting dataframe should have; all
                     column names must occur in the ``COLUMNS`` class
                     variable
            commit_map: the commit map to use
            case_studies: the case studies to retrieve data for
            kwargs: additional arguments that are passed to
                    :func:`_load_dataframe()`

        Return:
            a pandas dataframe with the given columns and the rows of the
            given case studies; without case studies all rows are returned,
            with several case studies the per-case-study frames are
            concatenated in the order given

        Raises:
            AssertionError: if called on the ``EvaluationDatabase`` base
                            class itself, or if the loaded dataframe does
                            not have the expected layout
            ValueError: if ``columns`` contains a name not in ``COLUMNS``
        """
        # Compare by identity: a check against a hard-coded class name goes
        # stale as soon as the class is renamed.
        if cls is EvaluationDatabase:
            raise AssertionError(
                "You must not call this function on the "
                "'EvaluationDatabase' base class."
            )

        if not case_studies:
            return cls.__get_data_for_case_study(
                project_name, columns, commit_map, None, **kwargs
            )

        data_frames: tp.List[pd.DataFrame] = [
            cls.__get_data_for_case_study(
                project_name, columns, commit_map, case_study, **kwargs
            ) for case_study in case_studies
        ]

        return pd.concat(data_frames)