Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/almanack/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

from .book import read
from .metrics.data import get_table as table
from .metrics.data import process_repo_for_analysis
from .metrics.data import process_repo_for_almanack, process_repo_for_analysis
from .metrics.entropy.calculate_entropy import (
calculate_aggregate_entropy,
calculate_normalized_entropy,
Expand Down
86 changes: 86 additions & 0 deletions src/almanack/metrics/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,9 @@
from typing import Any, Dict, List, Optional, Tuple, Union
from urllib.parse import urlparse

import awkward as ak
import defusedxml.ElementTree as ET
import pandas as pd
import pygit2
import yaml

Expand Down Expand Up @@ -555,6 +557,90 @@ def process_repo_for_analysis(
shutil.rmtree(temp_dir)


def table_to_wide(table_rows: list[dict]) -> Dict[str, Any]:
    """
    Transpose Almanack table (name->result), compute checks summary, flatten nested.
    `repo-file-info-entropy` is ignored due to scope and increased runtime for analysis.

    Args:
        table_rows (list[dict]):
            The Almanack metrics table as a list of dictionaries,
            each containing metric metadata and a "result" field.

    Returns:
        dict:
            A flattened dictionary mapping metric names to their results,
            including computed summary fields:
            - "checks_total": total number of sustainability-related checks
            - "checks_passed": number of checks passed
            - "checks_pct": percentage of checks passed

    """
    # Guard: no rows means no metrics and an undefined pass percentage.
    if not table_rows:
        return {"checks_total": 0, "checks_passed": 0, "checks_pct": None}

    df = pd.DataFrame(table_rows)

    # Dynamic sustainability checks: boolean metrics with a positive
    # sustainability correlation count toward the pass/fail summary.
    mask = (df["result-type"] == "bool") & (df["sustainability_correlation"] == 1)
    checks_total = int(mask.sum())
    # .eq(True) (not truthiness) so only explicit True results count as
    # passed; None/NaN results compare unequal and are treated as failures.
    checks_passed = int(df.loc[mask, "result"].eq(True).sum())
    checks_pct = (100.0 * checks_passed / checks_total) if checks_total else None

    # name -> result (wide format); on duplicate names the last row wins,
    # matching the previous transpose-based implementation.
    wide: Dict[str, Any] = df.set_index("name")["result"].to_dict()

    # Excluded for scope/runtime reasons (see docstring).
    wide.pop("repo-file-info-entropy", None)

    # Preserve nested values (dicts/lists) with awkward arrays so the result
    # stays parquet-friendly. This also covers `repo-almanack-score`, which
    # previously had a redundant special case doing exactly the same thing.
    for key, value in list(wide.items()):
        if isinstance(value, (dict, list)) and not key.endswith("_json"):
            wide[f"{key}_nested"] = ak.Array([value])
            del wide[key]

    # Attach computed check summaries
    wide["checks_total"] = checks_total
    wide["checks_passed"] = checks_passed
    wide["checks_pct"] = checks_pct

    return wide


def process_repo_for_almanack(repo_url: str) -> Dict[str, Any]:
    """
    Processes a single GitHub repository URL into a flat dictionary of Almanack metrics.

    Args:
        repo_url (str): The GitHub repository URL.

    Returns:
        dict: Flattened metrics for the repository, including sustainability checks.
            If the processing fails, returns an error entry with the repository URL
            and the exception message under "almanack_error".
    """
    try:
        # Merge the base repo identifier with flattened metrics in one dictionary
        return {
            "Repository URL": repo_url,
            **table_to_wide(get_table(repo_path=repo_url)),
        }

    except Exception as e:
        # Best-effort batch processing: record the failure instead of raising,
        # so one bad repository does not abort a multi-repo run.
        # LOGGER.exception captures the traceback; lazy %-args defer message
        # formatting until the record is actually emitted.
        LOGGER.exception("ERROR processing %s: %s", repo_url, e)
        return {"Repository URL": repo_url, "almanack_error": str(e)}


def _get_almanack_version() -> str:
"""
Seeks the current version of almanack using either pkg_resources
Expand Down
13 changes: 12 additions & 1 deletion src/almanack/metrics/remote.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
"""

import logging
import os
import pathlib
import time
from datetime import datetime, timezone
Expand Down Expand Up @@ -39,6 +40,16 @@ def get_api_data(
if params is None:
params = {}

# If available, use GitHub token for authenticated requests to increase rate limits
github_token = os.environ.get("GITHUB_TOKEN")
headers = {"accept": "application/json"}
if github_token and (
"github.com" in api_endpoint or "api.github.com" in api_endpoint
):
headers["Authorization"] = f"Bearer {github_token}"
else:
headers = {"accept": "application/json"}

max_retries = 100 # Number of attempts for rate limit errors
base_backoff = 5 # Base backoff time in seconds

Expand All @@ -47,7 +58,7 @@ def get_api_data(
# Perform the GET request with query parameters
response = requests.get(
api_endpoint,
headers={"accept": "application/json"},
headers=headers,
params=params,
timeout=300,
)
Expand Down
Loading
Loading