"""
Implementation of case studies for VaRA-TS.
A case study is used to pin down the exact set of revisions that should be
analysed for a project.
"""
import typing as tp
from pathlib import Path
import benchbuild as bb
from varats.base.configuration import Configuration
from varats.base.sampling_method import (
NormalSamplingMethod,
SamplingMethod,
SamplingMethodBase,
)
from varats.base.version_header import VersionHeader
from varats.mapping.configuration_map import (
ConfigurationMap,
create_configuration_map_from_yaml_doc,
)
from varats.project.project_util import get_project_cls_by_name
from varats.provider.release.release_provider import ReleaseType
from varats.utils.git_util import CommitHash, FullCommitHash, ShortCommitHash
from varats.utils.yaml_util import load_yaml, store_as_yaml
# Types of the values that occur in the dict representation of a ``CSEntry``
# (see ``CSEntry.get_dict``): the commit hash (str), the commit id (int) and
# the list of config ids (list[int]).
CSEntryMapTypes = str | int | list[int]
[docs]
class CSEntry:
    """
    Entry for a case study.

    Combining a commit hash with a unique and ordered id, starting with 0 for
    the first commit in the repository. Each entry additionally carries the
    IDs of the configurations that were specified for the commit.
    """

    def __init__(
        self,
        commit_hash: FullCommitHash,
        commit_id: int,
        config_ids: list[int] | None = None,
    ) -> None:
        """
        Initialize a new case study entry.

        Args:
            commit_hash: full hash of the commit
            commit_id: unique ID for ordering of commits
            config_ids: IDs of the configurations specified for this commit;
                        ``None`` or an empty list results in a list that
                        only contains the DummyConfig ID
        """
        self.__commit_hash = commit_hash
        self.__commit_id = commit_id
        # An entry always carries at least one config ID: by default we add
        # a list with the DummyConfig ID if no configurations were provided.
        self.__config_ids: list[int] = (
            config_ids if config_ids else [ConfigurationMap.DUMMY_CONFIG_ID]
        )

    @property
    def commit_hash(self) -> FullCommitHash:
        """A commit hash from the git repository."""
        return self.__commit_hash

    @property
    def commit_id(self) -> int:
        """The order ID of the commit hash."""
        return self.__commit_id

    @property
    def config_ids(self) -> list[int]:
        """The IDs of the configurations specified for this commit."""
        return self.__config_ids

    def get_dict(self) -> dict[str, CSEntryMapTypes]:
        """
        Get a dict representation of this commit, its id, and its config ids.

        Returns:
            a dict with the keys ``commit_hash``, ``commit_id``, and
            ``config_ids``
        """
        return {
            "commit_hash": self.commit_hash.hash,
            "commit_id": self.commit_id,
            "config_ids": self.config_ids,
        }

    def __str__(self) -> str:
        """String representation of the commit and id."""
        return f"({self.commit_id}: #{self.commit_hash.hash})"

    def __repr__(self) -> str:
        """Same as ``__str__``: string representation of the commit and id."""
        return str(self)
[docs]
class CSStage:
    """
    A stage in a case-study, i.e., a collection of revisions.

    Stages are used to separate revisions into groups.
    """

    def __init__(
        self,
        name: str | None = None,
        sampling_method: SamplingMethod | None = None,
        release_type: ReleaseType | None = None,
        revisions: list[CSEntry] | None = None,
    ) -> None:
        """
        Initialize a new stage.

        Args:
            name: optional name of the stage
            sampling_method: optional sampling method used for this stage
            release_type: optional release type used for this stage
            revisions: initial entries of the stage; the passed list is
                       stored as is (not copied), ``None`` creates an empty
                       stage
        """
        self.__name: str | None = name
        self.__sampling_method: SamplingMethod | None = sampling_method
        self.__release_type: ReleaseType | None = release_type
        self.__revisions: list[CSEntry] = (
            revisions if revisions is not None else []
        )

    @property
    def revisions(self) -> list[FullCommitHash]:
        """Project revisions that are part of this stage."""
        return [entry.commit_hash for entry in self.__revisions]

    @property
    def name(self) -> str | None:
        """Name of the stage."""
        return self.__name

    @name.setter
    def name(self, name: str) -> None:
        """Setter for the name of the stage."""
        self.__name = name

    @property
    def sampling_method(self) -> SamplingMethod | None:
        """The sampling method used for this stage."""
        return self.__sampling_method

    @sampling_method.setter
    def sampling_method(self, sampling_method: NormalSamplingMethod) -> None:
        """Setter for the sampling method of the stage."""
        self.__sampling_method = sampling_method

    @property
    def release_type(self) -> ReleaseType | None:
        """The release type used for this stage."""
        return self.__release_type

    @release_type.setter
    def release_type(self, release_type: ReleaseType) -> None:
        """Setter for the release type of the stage."""
        self.__release_type = release_type

    def has_revision(self, revision: CommitHash) -> bool:
        """
        Check if a revision is part of this stage.

        A stored commit hash matches if it starts with the given revision.

        Args:
            revision: project revision to check

        Returns:
            ``True``, in case the revision is part of the stage,
            ``False`` otherwise.
        """
        return any(
            entry.commit_hash.startswith(revision)
            for entry in self.__revisions
        )

    def add_revision(
        self,
        revision: FullCommitHash,
        commit_id: int,
        config_ids: list[int] | None = None,
    ) -> None:
        """
        Add a new revision to this stage.

        Revisions that are already part of the stage are not added again.

        Args:
            revision: to add
            commit_id: unique ID for ordering of commits
            config_ids: list of configuration IDs
        """
        if not self.has_revision(revision):
            self.__revisions.append(CSEntry(revision, commit_id, config_ids))

    def get_config_ids_for_revision(self, revision: CommitHash) -> list[int]:
        """
        Returns a list of all configuration IDs specified for this revision.

        The DummyConfig ID is never part of the result and every ID is
        reported only once.

        Args:
            revision: i.e., a commit hash registered in this ``CSStage``

        Returns: list of config IDs in ascending order
        """
        # Collect into a set to drop duplicates, then sort so that the order
        # of the result does not depend on set iteration order.
        return sorted({
            config_id
            for entry in self.__revisions
            if entry.commit_hash.startswith(revision)
            for config_id in entry.config_ids
            if config_id != ConfigurationMap.DUMMY_CONFIG_ID
        })

    def sort(self, reverse: bool = True) -> None:
        """
        Sort the revisions of the stage by commit ID inplace.

        Args:
            reverse: if ``True`` (the default), the revisions are sorted in
                     descending order of their commit ID
        """
        self.__revisions.sort(key=lambda entry: entry.commit_id, reverse=reverse)

    def get_dict(self) -> dict[str, str | list[dict[str, CSEntryMapTypes]]]:
        """
        Get a dict representation of this stage.

        ``name``, ``sampling_method``, and ``release_type`` are only part of
        the dict if they are set; ``revisions`` is always present.
        """
        stage_dict: dict[str, str | list[dict[str, CSEntryMapTypes]]] = {}
        if self.name is not None:
            stage_dict['name'] = self.name
        if self.sampling_method is not None:
            stage_dict['sampling_method'] = self.sampling_method.name()
        if self.release_type is not None:
            stage_dict['release_type'] = self.release_type.name
        stage_dict['revisions'] = [
            entry.get_dict() for entry in self.__revisions
        ]
        return stage_dict
[docs]
class CaseStudy:
    """
    A case study persists a set of revisions of a project.

    This allows for easy re-evaluation.

    Stored values:
     - name of the related benchbuild.project
     - a set of revisions
    """

    def __init__(
        self,
        project_name: str,
        version: int,
        stages: list[CSStage] | None = None,
    ) -> None:
        """
        Initialize a new case study.

        Args:
            project_name: name of the related project
            version: version ID of this case study
            stages: initial stages; the passed list is stored as is (not
                    copied), ``None`` creates a case study without stages
        """
        self.__project_name = project_name
        self.__version = version
        self.__stages = stages if stages is not None else []

    @property
    def project_name(self) -> str:
        """
        Name of the related project.

        !! This name must match the name of the BB project !!
        """
        return self.__project_name

    @property
    def project_cls(self) -> type[bb.Project]:
        """
        Look up the BenchBuild project for this case study.

        Returns:
            project class
        """
        return get_project_cls_by_name(self.project_name)

    @property
    def version(self) -> int:
        """
        Version ID for this case study.

        The version differentiates case studies of the same project.
        """
        return self.__version

    @property
    def revisions(self) -> list[FullCommitHash]:
        """
        Project revisions that are part of this case study.

        Revisions that occur in several stages are listed only once, at the
        position of their first occurrence.
        """
        # dict.fromkeys removes duplicates while keeping insertion order
        return list(
            dict.fromkeys(
                revision
                for stage in self.__stages
                for revision in stage.revisions
            )
        )

    @property
    def stages(self) -> list[CSStage]:
        """Get a list with all stages."""
        # Return new list to forbid modification of the case-study
        return list(self.__stages)

    @property
    def num_stages(self) -> int:
        """Get number of stages."""
        return len(self.__stages)

    def get_stage_by_name(self, stage_name: str) -> CSStage | None:
        """
        Get a stage by its name.

        Since multiple stages can have the same name, the first matching
        stage is returned.

        Args:
            stage_name: name of the stage to lookup

        Returns:
            the stage, corresponding with the 'stage_name', or ``None``
        """
        for stage in self.__stages:
            if stage.name == stage_name:
                return stage
        return None

    def get_stage_index_by_name(self, stage_name: str) -> int | None:
        """
        Get a stage's index by its name.

        Since multiple stages can have the same name, the index of the first
        matching stage is returned.

        Args:
            stage_name: name of the stage to lookup

        Returns:
            the stage index, corresponding with the 'stage_name', or ``None``
        """
        for index, stage in enumerate(self.__stages):
            if stage.name == stage_name:
                return index
        return None

    def has_revision(self, revision: CommitHash) -> bool:
        """
        Check if a revision is part of this case study.

        Args:
            revision: project revision to check

        Returns:
            ``True``, if the revision was found in one of the stages,
            ``False`` otherwise
        """
        return any(stage.has_revision(revision) for stage in self.__stages)

    def has_revision_in_stage(
        self, revision: ShortCommitHash, num_stage: int
    ) -> bool:
        """
        Checks if a revision is in a specific stage.

        Args:
            revision: project revision to check
            num_stage: number of the stage to search in

        Returns:
            ``True``, if the revision was found in the specified stage,
            ``False`` otherwise, also if no stage with that number exists
        """
        if self.num_stages <= num_stage:
            return False
        return self.__stages[num_stage].has_revision(revision)

    def has_revision_configs_specified(self, revision: CommitHash) -> bool:
        """
        Checks whether a revision specifies different configurations.

        Args:
            revision: i.e., a commit hash registered in this case study

        Returns: True, if configurations have been specified for this revision
        """
        return bool(self.get_config_ids_for_revision(revision))

    def get_config_ids_for_revision(self, revision: CommitHash) -> list[int]:
        """
        Returns a list of all configuration IDs specified for this revision.

        The IDs of all stages are concatenated in stage order; the
        DummyConfig ID is never part of the result.

        Args:
            revision: i.e., a commit hash registered in this case study

        Returns: list of config IDs
        """
        return [
            config_id
            for stage in self.__stages
            for config_id in stage.get_config_ids_for_revision(revision)
            # drop every occurrence of the dummy ID, not only the first one
            if config_id != ConfigurationMap.DUMMY_CONFIG_ID
        ]

    def get_config_ids_for_revision_in_stage(
        self, revision: CommitHash, num_stage: int
    ) -> list[int]:
        """
        Returns a list of all configuration IDs specified for this revision
        in a specific stage.

        Args:
            revision: i.e., a commit hash registered in this case study
            num_stage: number of the stage to search in

        Returns: list of config IDs; empty if no stage with that number exists
        """
        if self.num_stages <= num_stage:
            return []
        return [
            config_id
            for config_id in
            self.__stages[num_stage].get_config_ids_for_revision(revision)
            if config_id != ConfigurationMap.DUMMY_CONFIG_ID
        ]

    def shift_stage(self, from_index: int, offset: int) -> None:
        """
        Shift a stage in the case-studies' stage list by an offset.

        A shift to the right (offset>0) inserts ``offset`` empty stages in
        front of the stage at ``from_index``. Beware that shifts to the left
        (offset<0) will destroy stages: the ``abs(offset)`` stages directly
        in front of ``from_index`` are removed.

        Args:
            from_index: index of the first stage to shift
            offset: amount to stages should be shifted

        Raises:
            AssertionError: if ``from_index`` is not a valid stage index or
                            if the shift would move the stage below index 0
        """
        # keep parens for clarification
        if not (0 <= from_index < len(self.__stages)):  # pylint: disable=C0325
            raise AssertionError("from_index out of bounds")
        if (from_index + offset) < 0:
            raise AssertionError("Shifting out of bounds")

        if offset > 0:
            # push the stage (and all following ones) to the right
            self.__stages[from_index:from_index] = [
                CSStage() for _ in range(offset)
            ]
        elif offset < 0:
            # remove the stages in [from_index + offset, from_index)
            del self.__stages[from_index + offset:from_index]

    def insert_empty_stage(self, pos: int) -> CSStage:
        """
        Insert a new stage at the given index.

        Shifts the list elements to the right. The newly created stage is
        returned.

        Args:
            pos: index position to insert an empty stage

        Returns:
            the newly inserted stage
        """
        new_stage = CSStage()
        self.__stages.insert(pos, new_stage)
        return new_stage

    def include_revision(
        self,
        revision: FullCommitHash,
        commit_id: int,
        stage_num: int = 0,
        sort_revs: bool = True,
    ) -> None:
        """
        Add a revision to this case study.

        Stages up to ``stage_num`` are created if they do not exist yet. A
        revision that is already part of the stage is not added again.

        Args:
            revision: to add
            commit_id: unique ID for ordering of commits
            stage_num: index number of the stage to add the revision to
            sort_revs: if True, the modified stage will be sorted afterwards
        """
        # Create missing stages
        while self.num_stages <= stage_num:
            self.__stages.append(CSStage())

        stage = self.__stages[stage_num]
        if not stage.has_revision(revision):
            stage.add_revision(revision, commit_id)
            if sort_revs:
                stage.sort()

    def include_revisions(
        self,
        revisions: list[tuple[FullCommitHash, int]],
        stage_num: int = 0,
        sort_revs: bool = True,
    ) -> None:
        """
        Add multiple revisions to this case study.

        Args:
            revisions: List of tuples with (commit_hash, id) to be inserted
            stage_num: The stage to insert the revisions
            sort_revs: True if the stage should be kept sorted
        """
        # Sorting is deferred until all revisions have been added.
        for commit_hash, commit_id in revisions:
            self.include_revision(commit_hash, commit_id, stage_num, False)

        # ``include_revision`` only creates missing stages when at least one
        # revision was given, so make sure the stage exists in any case.
        while self.num_stages <= stage_num:
            self.__stages.append(CSStage())

        if sort_revs and self.num_stages > 0:
            self.__stages[stage_num].sort()

    def name_stage(self, stage_num: int, name: str) -> None:
        """
        Names an already existing stage.

        Nothing happens if no stage with the given number exists.

        Args:
            stage_num: The number of the stage to name
            name: The new name of the stage
        """
        if stage_num < self.num_stages:
            self.__stages[stage_num].name = name

    def get_revision_filter(self) -> tp.Callable[[CommitHash], bool]:
        """
        Generate a case study specific revision filter.

        Returns:
            a callable filter function that returns ``True`` for revisions
            that are part of this case study
        """

        def revision_filter(revision: CommitHash) -> bool:
            return self.has_revision(revision)

        return revision_filter

    def get_dict(
        self,
    ) -> dict[
        str, str | int | list[dict[str, str | list[dict[str, CSEntryMapTypes]]]]
    ]:
        """
        Get a dict representation of this case study.

        Returns:
            a dict with the keys ``project_name``, ``version``, and
            ``stages``
        """
        return {
            "project_name": self.project_name,
            "version": self.version,
            "stages": [stage.get_dict() for stage in self.stages],
        }
[docs]
def load_case_study_from_file(file_path: Path) -> CaseStudy:
    """
    Load a case study from a file.

    The first YAML document of the file has to be a ``CaseStudy`` version
    header, the second one the case study itself.

    The ``config_ids`` of a revision can either be the string ``all``, which
    selects every ID of the configuration map stored in the same file, or a
    list whose items are single IDs or inclusive ranges written as
    ``"<begin>..<end>"``.

    Args:
        file_path: path to the case study file

    Returns:
        the case study described by the file
    """
    documents = load_yaml(file_path)
    version_header = VersionHeader(next(documents))
    version_header.raise_if_not_type("CaseStudy")
    version_header.raise_if_version_is_less_than(1)

    raw_case_study = next(documents)

    # IDs of the file's configuration map; loaded lazily and at most once,
    # instead of re-parsing the file for every revision that requests "all".
    all_config_ids: list[int] | None = None

    stages: list[CSStage] = []
    for raw_stage in raw_case_study['stages']:
        hash_id_tuples: list[CSEntry] = []
        for raw_hash_id_tuple in raw_stage['revisions']:
            raw_config_ids = raw_hash_id_tuple.get('config_ids', None)
            config_ids: list[int] = []

            if raw_config_ids == "all":
                if all_config_ids is None:
                    all_config_ids = list(
                        load_configuration_map_from_case_study_file(
                            file_path
                        ).ids()
                    )
                # Copy, so that the entries do not share one mutable list.
                config_ids = list(all_config_ids)
            elif raw_config_ids:
                for raw_id in raw_config_ids:
                    if isinstance(raw_id, str):
                        # "<begin>..<end>" denotes an inclusive range; a
                        # string without ".." is treated as a single ID.
                        parts = raw_id.split("..")
                        begin = int(parts[0])
                        end = int(parts[1]) if len(parts) > 1 else begin
                        config_ids.extend(range(begin, end + 1))
                    else:
                        config_ids.append(int(raw_id))

            hash_id_tuples.append(
                CSEntry(
                    FullCommitHash(raw_hash_id_tuple['commit_hash']),
                    raw_hash_id_tuple['commit_id'],
                    config_ids,
                )
            )

        # Missing and empty values are both treated as "not specified".
        sampling_method_name = raw_stage.get('sampling_method') or None
        if sampling_method_name:
            sampling_method: SamplingMethod | None = SamplingMethodBase[
                SamplingMethod
            ].get_sampling_method_type(sampling_method_name)()
        else:
            sampling_method = None

        release_type = raw_stage.get('release_type') or None

        stages.append(
            CSStage(
                raw_stage.get('name') or None,
                sampling_method,
                ReleaseType[release_type] if release_type is not None else None,
                hash_id_tuples,
            )
        )

    return CaseStudy(
        raw_case_study['project_name'], raw_case_study['version'], stages
    )
[docs]
def load_configuration_map_from_case_study_file(
    file_path: Path, concrete_config_type: type[Configuration] | None = None
) -> ConfigurationMap:
    """
    Load a configuration map from a case-study file.

    The documents that follow the case study document are searched for the
    first one that specifies a ``config_type``. If ``concrete_config_type``
    is given, the specified type additionally has to match its class name.

    Args:
        file_path: path to the case-study file
        concrete_config_type: type of the configuration objects that should be
                              created

    Returns: a new `ConfigurationMap` based on the parsed file, or an empty
             `ConfigurationMap` if the file contains no matching document
    """
    documents = load_yaml(file_path)
    version_header = VersionHeader(next(documents))
    version_header.raise_if_not_type("CaseStudy")
    version_header.raise_if_version_is_less_than(1)
    next(documents)  # skip case study document

    # Iterate the remaining documents directly instead of catching
    # StopIteration, so that errors raised while creating the configuration
    # map are not mistaken for "no configuration map found".
    for document in documents:
        raw_config_type = document.get("config_type", None)
        if raw_config_type is None:
            continue
        if (
            concrete_config_type is None
            or raw_config_type == concrete_config_type.__name__
        ):
            return create_configuration_map_from_yaml_doc(
                document, concrete_config_type
            )

    return ConfigurationMap()
[docs]
def store_case_study(case_study: CaseStudy, case_study_location: Path) -> None:
    """
    Write a case study to disk.

    Args:
        case_study: the case study to store
        case_study_location: either the path of a `.case_study` file, which
                             is used as is, or any other path (e.g., a
                             paper_config), to which the file name
                             ``<project_name>_<version>.case_study`` is
                             appended
    """
    target_file = case_study_location
    if target_file.suffix != '.case_study':
        # Not a case-study file yet: derive the file name from the case study
        target_file = target_file / (
            f"{case_study.project_name}_{case_study.version}.case_study"
        )

    __store_case_study_to_file(case_study, target_file)
def __store_case_study_to_file(case_study: CaseStudy, file_path: Path) -> None:
    """Write a ``CaseStudy`` version header and the case study to a file."""
    # The version header comes first, followed by the case study itself.
    header = VersionHeader.from_version_number('CaseStudy', 1)
    yaml_documents = [header, case_study]
    store_as_yaml(file_path, yaml_documents)