Source code for varats.utils.github_util
"""Utility module for working with the pygithub API."""
import codecs
import logging
import pickle # nosec
import re
import typing as tp
from pathlib import Path
import pandas as pd
from benchbuild.project import Project
from benchbuild.source import primary
from github import Auth, Github
from github.GithubObject import GithubObject
from varats.utils.settings import vara_cfg
if tp.TYPE_CHECKING:
# pylint: disable=unused-import,ungrouped-imports
from github.PaginatedList import PaginatedList
# Module-level logger, named after this module.
LOG = logging.getLogger(__name__)
# Matches HTTPS GitHub remote URLs that end in ``.git``. The two capture
# groups are the path components separated by the last ``/`` in front of the
# ``.git`` suffix; ``get_github_repo_name_for_project`` joins them with ``/``.
GITHUB_URL_PATTERN = re.compile(r"https://github\.com/(.*)/(.*)\.git")
[docs]
def get_github_instance() -> Github:
    """
    Build a client object for the GitHub API.

    The access token is read from the ``provider.github_access_token`` entry
    of the configuration returned by ``vara_cfg()``. If its string form is
    empty, an unauthenticated client is created instead.

    Returns:
        a Github instance
    """
    token = str(vara_cfg()["provider"]["github_access_token"])
    if not token:
        return Github()
    return Github(auth=Auth.Token(token))
# Name of the cache file; it is resolved relative to the ``data_cache``
# directory from ``vara_cfg()`` and read/written via pandas with
# ``compression='infer'``.
__PYGITHUB_CACHE_FILE_NAME = "pygithub.csv.gz"
# Column holding the unique key of a cached object or list header.
__PYGITHUB_KEY_COLUMN = "key"
# Column holding the number of elements; only set on list header rows.
__PYGITHUB_LIST_LENGTH_COLUMN = "length"
# Column holding the serialized (pickled + base64-encoded) object.
__PYGITHUB_OBJECT_COLUMN = "object"
# Type variable bound to GithubObject, so the public cache helpers can return
# the same type that the passed load function produces.
PyGithubObj = tp.TypeVar("PyGithubObj", bound=GithubObject)
def _dump_pygithub_object(obj: GithubObject) -> str:
    """
    Serialize a GithubObject into a text blob.

    The object's class, raw data, and raw headers are pickled together as a
    tuple and the resulting bytes are base64-encoded so that they can be
    stored as text.

    Args:
        obj: the object to serialize

    Returns:
        the base64-encoded pickle as a string
    """
    payload = (obj.__class__, obj.raw_data, obj.raw_headers)
    pickled = pickle.dumps(payload)
    encoded = codecs.encode(pickled, "base64")
    return encoded.decode()
def _load_pygithub_object(obj: str) -> GithubObject:
    """
    Restore a GithubObject from its serialized text form.

    The string is base64-decoded and unpickled; the resulting values are
    handed to ``create_from_raw_data`` of a Github instance to rebuild the
    object.

    Args:
        obj: the serialized object as produced by ``_dump_pygithub_object``

    Returns:
        the restored object
    """
    github = get_github_instance()
    raw_bytes = codecs.decode(obj.encode(), "base64")
    # The blob originates from the local cache file, hence the nosec marker.
    fields = pickle.loads(raw_bytes)  # nosec
    restored = github.create_from_raw_data(*fields)
    return tp.cast(GithubObject, restored)
def _load_cache_file() -> pd.DataFrame:
    """
    Read the pygithub cache into a data frame.

    Returns:
        the content of the cache file if it exists, otherwise an empty data
        frame with the key, object, and list-length columns
    """
    cache_path = Path(str(vara_cfg()["data_cache"])
                     ) / __PYGITHUB_CACHE_FILE_NAME

    if not cache_path.exists():
        # No cache yet: hand out an empty frame with the expected columns.
        empty_columns = [
            __PYGITHUB_KEY_COLUMN, __PYGITHUB_OBJECT_COLUMN,
            __PYGITHUB_LIST_LENGTH_COLUMN
        ]
        return pd.DataFrame(columns=empty_columns)

    return pd.read_csv(str(cache_path), index_col=0, compression='infer')
def _store_cache_file(cache_df: pd.DataFrame) -> None:
    """
    Write the given data frame to the pygithub cache file.

    Args:
        cache_df: the complete cache content to store
    """
    cache_dir = Path(str(vara_cfg()["data_cache"]))
    target = cache_dir / __PYGITHUB_CACHE_FILE_NAME
    cache_df.to_csv(str(target), compression='infer')
def _cache_pygithub_object(key: str, obj: GithubObject) -> None:
    """
    Add a single GithubObject to the cache file.

    The current cache is loaded, a row with the key and the serialized object
    is appended, and the result is written back.

    Args:
        key: the unique identifier for the object to store
        obj: the object to store
    """
    current_cache = _load_cache_file()
    new_row = pd.DataFrame({
        __PYGITHUB_KEY_COLUMN: [key],
        __PYGITHUB_OBJECT_COLUMN: [_dump_pygithub_object(obj)]
    })
    updated_cache = pd.concat((current_cache, new_row), ignore_index=True)
    _store_cache_file(updated_cache)
def _get_cached_pygithub_object(key: str) -> tp.Optional[GithubObject]:
    """
    Look up a single GithubObject in the cache.

    Args:
        key: the unique identifier of the object to load

    Returns:
        the restored object if a row with the given key exists, else ``None``
    """
    cache_df = _load_cache_file()
    key_matches = cache_df[__PYGITHUB_KEY_COLUMN] == key
    if not key_matches.any():
        return None

    # ``item()`` expects exactly one matching row.
    serialized = cache_df.loc[key_matches, __PYGITHUB_OBJECT_COLUMN].item()
    return _load_pygithub_object(serialized)
def _cache_pygithub_object_list(key: str, objs: tp.List[PyGithubObj]) -> None:
    """
    Cache a list of GithubObjects.

    The list is stored as one header row (``key`` plus the list length)
    followed by one row per element, keyed ``f"{key}_{idx}"``.

    Args:
        key: the unique identifier for the list to store
        objs: the objects to store, in the order they should be restored
    """
    cache_df = _load_cache_file()

    header_row = {
        __PYGITHUB_KEY_COLUMN: key,
        __PYGITHUB_LIST_LENGTH_COLUMN: len(objs)
    }
    element_rows = [{
        __PYGITHUB_KEY_COLUMN: f"{key}_{idx}",
        __PYGITHUB_OBJECT_COLUMN: _dump_pygithub_object(obj)
    } for idx, obj in enumerate(objs)]

    # Build all new rows at once and concatenate a single time;
    # ``DataFrame.append`` no longer exists in pandas >= 2.0 and appending
    # row by row copies the whole frame for every element.
    new_rows = pd.DataFrame(
        [header_row, *element_rows],
        columns=[
            __PYGITHUB_KEY_COLUMN, __PYGITHUB_OBJECT_COLUMN,
            __PYGITHUB_LIST_LENGTH_COLUMN
        ]
    )
    cache_df = pd.concat((cache_df, new_rows), ignore_index=True)
    _store_cache_file(cache_df)
def _get_cached_pygithub_object_list(
    key: str
) -> tp.Optional[tp.List[GithubObject]]:
    """
    Load a list of GithubObjects from the cache.

    The list header row (keyed ``key``) provides the list length; the
    elements are then looked up by their ``f"{key}_{idx}"`` keys in index
    order.

    Args:
        key: the unique identifier of the list to load

    Returns:
        the cached list if available (possibly empty), else ``None``

    Raises:
        AssertionError: if the number of element rows found does not match
                        the length stored in the list header
    """
    cache_df = _load_cache_file()
    keys = cache_df[__PYGITHUB_KEY_COLUMN]

    list_header = cache_df[keys == key]
    if list_header.empty:
        return None

    list_length: int = int(list_header[__PYGITHUB_LIST_LENGTH_COLUMN].item())

    # Collect the serialized elements per index instead of growing a data
    # frame: ``DataFrame.append`` was removed in pandas 2.0, and a column-less
    # empty frame would make the object-column lookup fail for empty lists.
    serialized_objs: tp.List[str] = []
    for idx in range(list_length):
        element_rows = cache_df[keys == f"{key}_{idx}"]
        serialized_objs.extend(element_rows[__PYGITHUB_OBJECT_COLUMN].tolist())

    if len(serialized_objs) != list_length:
        raise AssertionError("List length is not equal to list header.")

    return [_load_pygithub_object(obj) for obj in serialized_objs]
[docs]
def get_cached_github_object(
    cached_object_key: str, load_function: tp.Callable[[Github], PyGithubObj]
) -> PyGithubObj:
    """
    Return a GithubObj from the cache, loading and caching it on a miss.

    Args:
        cached_object_key: unique name to identify the GithubObj
        load_function: function that loads the GithubObj from a Github
                       instance; only called if the cache has no usable entry

    Returns:
        the fetched or cached GithubObj
    """
    cached = _get_cached_pygithub_object(cached_object_key)
    if cached:
        return tp.cast(PyGithubObj, cached)

    # Cache miss: fetch through the GitHub API and remember the result.
    fresh = load_function(get_github_instance())
    _cache_pygithub_object(cached_object_key, fresh)
    return fresh
[docs]
def get_cached_github_object_list(
    cached_object_key: str,
    load_function: 'tp.Callable[[Github], PaginatedList[PyGithubObj]]'
) -> tp.List[PyGithubObj]:
    """
    Return a list of GithubObjs from the cache or load it via the given
    function.

    A cached list is returned as-is, even if it is empty. On a cache miss the
    list is loaded with ``load_function``; note that this function currently
    does not write the loaded list back to the cache (see the disabled call
    below).

    Args:
        cached_object_key: unique name to identify the GithubObj list
        load_function: function that loads a PaginatedList of PygithubObjs

    Returns:
        the fetched or cached list of GithubObjs
    """
    cached_list = _get_cached_pygithub_object_list(cached_object_key)
    # Compare against ``None`` explicitly: an empty cached list is a valid
    # cache hit and must not trigger a reload.
    if cached_list is not None:
        return [tp.cast(PyGithubObj, obj) for obj in cached_list]

    obj_list_to_cache = list(load_function(get_github_instance()))
    # if list shall be cached manually:
    # _cache_pygithub_object_list(cached_object_key, obj_list_to_cache)
    return obj_list_to_cache
[docs]
def get_github_repo_name_for_project(
    project: tp.Type[Project]
) -> tp.Optional[str]:
    """
    Derive the github repo name from a project's primary source.

    The remote of the project's primary source is matched against
    ``GITHUB_URL_PATTERN``; on success both captured groups are joined with
    a ``/``.

    Args:
        project: class of the project to look up

    Returns:
        the github repo name for the project or ``None`` if the remote does
        not match the github URL pattern
    """
    remote_url = primary(*project.SOURCE).remote
    match = GITHUB_URL_PATTERN.match(remote_url)
    if match is None:
        return None
    return f"{match.group(1)}/{match.group(2)}"