# Source code for varats.data.databases.evaluationdatabase
"""Module for the base Database class."""
import abc
import typing as tp
import pandas as pd
from pygtrie import CharTrie
from varats.data.cache_helper import get_data_file_path
from varats.mapping.commit_map import CommitMap
from varats.paper.case_study import CaseStudy
from varats.utils.git_util import ShortCommitHash
# Generic type variable exported by this module. It is not referenced by any
# code in the visible part of this file.
# NOTE(review): presumably intended for typing column selections of database
# subclasses -- confirm against external users before removing.
AvailableColumns = tp.TypeVar("AvailableColumns")
# [docs]
class EvaluationDatabase(abc.ABC):
    """
    Base class for accessing report data.

    Subclasses have to provide the following:
        - an identifier for cache files, passed as the class keyword argument
          ``cache_id``; it is stored in ``CACHE_ID``
        - the subclass-specific columns and their types, passed as the class
          keyword argument ``column_types``; they are appended to the
          inherited ``COLUMN_TYPES`` and the resulting column names are
          stored in ``COLUMNS``
        - a function :func:`_load_dataframe` that loads and transparently
          caches report data

    Example::

        class MyDatabase(
            EvaluationDatabase,
            cache_id="my_data",
            column_types={"value": 'int64'}
        ):
            ...
    """

    # Identifier handed to ``get_data_file_path`` to name the cache file.
    CACHE_ID: str
    # Column name -> dtype name; subclasses extend this mapping.
    COLUMN_TYPES: tp.Dict[str, str] = {"revision": 'str', "time_id": 'int32'}
    # Column names in the exact order a loaded dataframe must have.
    COLUMNS: tp.List[str]

    @classmethod
    def __init_subclass__(
        cls, *args: tp.Any, cache_id: str, column_types: tp.Dict[str, str],
        **kwargs: tp.Any
    ) -> None:
        """
        Register the cache id and the column layout of a new subclass.

        Args:
            cache_id: the identifier for the subclass' cache files
            column_types: the additional columns of the subclass, mapped to
                          their types; entries with an already inherited name
                          override the inherited type
            args: forwarded to the parent ``__init_subclass__``
            kwargs: forwarded to the parent ``__init_subclass__``
        """
        super().__init_subclass__(*args, **kwargs)
        cls.CACHE_ID = cache_id
        # Build a new dict so that the mapping of the parent class is never
        # modified; inherited columns keep their position at the front.
        cls.COLUMN_TYPES = {**cls.COLUMN_TYPES, **column_types}
        cls.COLUMNS = list(cls.COLUMN_TYPES.keys())

    @classmethod
    @abc.abstractmethod
    def _load_dataframe(
        cls, project_name: str, commit_map: CommitMap,
        case_study: tp.Optional[CaseStudy], **kwargs: tp.Any
    ) -> pd.DataFrame:
        """
        Load and transparently cache the dataframe for this database class.

        NOTE: this function is not intended for external use.
        Use :func:`get_data_for_project` instead.

        Args:
            project_name: the project to load data for
            commit_map: the commit map to use
            case_study: the case_study to load data for
            kwargs: additional arguments used to load data

        Return:
            a pandas dataframe with all the cached data; its columns must be
            exactly ``COLUMNS`` (same names, same order)
        """

    @classmethod
    def __get_data_for_case_study(
        cls, project_name: str, columns: tp.List[str], commit_map: CommitMap,
        case_study: tp.Optional[CaseStudy], **kwargs: tp.Any
    ) -> pd.DataFrame:
        """
        Load the data for a single (optional) case study.

        Args:
            project_name: the project to retrieve data for
            columns: the columns the resulting dataframe should have
            commit_map: the commit map to use
            case_study: the case study to restrict the data to; ``None``
                        keeps all revisions
            kwargs: additional arguments that are passed to
                    :func:`_load_dataframe()`

        Return:
            a pandas dataframe with the requested columns

        Raises:
            AssertionError: if the loaded dataframe does not have exactly the
                            columns listed in ``COLUMNS``
            ValueError: if ``columns`` contains a name that is not in
                        ``COLUMNS``
        """
        data: pd.DataFrame = cls._load_dataframe(
            project_name, commit_map, case_study, **kwargs
        )

        # A layout mismatch usually means the cache file was written by an
        # older version of the database class.
        if list(data.columns) != cls.COLUMNS:
            raise AssertionError(
                "Loaded dataframe does not match expected layout. "
                "Consider removing the cache file "
                f"{get_data_file_path(cls.CACHE_ID, project_name)}."
            )

        if not all(column in cls.COLUMNS for column in columns):
            raise ValueError(
                f"All values in 'columns' must be in {cls.__name__}.COLUMNS"
            )

        def cs_filter(data_frame: pd.DataFrame) -> pd.DataFrame:
            """Filter out all commits that are not in the case study if one was
            selected."""
            if case_study is None or data_frame.empty:
                return data_frame

            # use a trie for fast prefix lookup
            revisions = CharTrie()
            for revision in case_study.revisions:
                revisions[revision.hash] = True

            # ``has_node`` is non-zero if the hash is a stored key or a
            # prefix of one, so shortened hashes match their full revision.
            return data_frame[data_frame["revision"].apply(
                lambda x: revisions.has_node(x.hash) != 0
            )]

        # Convert all revisions to ShortCommitHash(es); this replaces the
        # column in the dataframe returned by ``_load_dataframe``.
        data['revision'] = data['revision'].apply(ShortCommitHash)
        data = cs_filter(data)
        return data[columns]

    @classmethod
    def get_data_for_project(
        cls, project_name: str, columns: tp.List[str], commit_map: CommitMap,
        *case_studies: CaseStudy, **kwargs: tp.Any
    ) -> pd.DataFrame:
        """
        Retrieve data for a given project and case study.

        Args:
            project_name: the project to retrieve data for
            columns: the columns the resulting dataframe should have; all column
                     names must occur in the ``COLUMNS`` class variable
            commit_map: the commit map to use
            case_studies: the case studies to retrieve data for; if none are
                          given, the data is not filtered by revision
            kwargs: additional arguments that are passed to
                    :func:`_load_dataframe()`

        Return:
            a pandas dataframe with the given columns and the rows of all
            revisions that belong to the given case studies; the per-case-study
            results are concatenated without resetting the index

        Raises:
            AssertionError: if called on the ``EvaluationDatabase`` base class
                            itself or if the loaded data has an unexpected
                            layout
            ValueError: if ``columns`` contains a name that is not in
                        ``COLUMNS``
        """
        # Compare by identity: a check against a class name goes stale as soon
        # as the class is renamed and then lets the abstract loader run.
        if cls is EvaluationDatabase:
            raise AssertionError(
                "You must not call this function on the "
                "'EvaluationDatabase' base class."
            )

        if not case_studies:
            return cls.__get_data_for_case_study(
                project_name, columns, commit_map, None, **kwargs
            )

        data_frames: tp.List[pd.DataFrame] = [
            cls.__get_data_for_case_study(
                project_name, columns, commit_map, case_study, **kwargs
            ) for case_study in case_studies
        ]
        return pd.concat(data_frames)