Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/almanack/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

from .book import read
from .metrics.data import get_table as table
from .metrics.data import process_repo_for_analysis
from .metrics.data import process_repo_for_almanack, process_repo_for_analysis
from .metrics.entropy.calculate_entropy import (
calculate_aggregate_entropy,
calculate_normalized_entropy,
Expand Down
86 changes: 86 additions & 0 deletions src/almanack/metrics/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,9 @@
from typing import Any, Dict, List, Optional, Tuple, Union
from urllib.parse import urlparse

import awkward as ak
import defusedxml.ElementTree as ET
import pandas as pd
import pygit2
import yaml

Expand Down Expand Up @@ -555,6 +557,90 @@ def process_repo_for_analysis(
shutil.rmtree(temp_dir)


def table_to_wide(table_rows: list[dict]) -> Dict[str, Any]:
    """
    Transpose Almanack table (name->result), compute checks summary, flatten nested.
    `repo-file-info-entropy` is ignored due to scope and increased runtime for analysis.

    Args:
        table_rows (list[dict]):
            The Almanack metrics table as a list of dictionaries,
            each containing metric metadata and a "result" field.

    Returns:
        dict:
            A flattened dictionary mapping metric names to their results,
            including computed summary fields:
            - "checks_total": total number of sustainability-related checks
            - "checks_passed": number of checks passed
            - "checks_pct": percentage of checks passed

    """
    # Guard: no rows means no metrics and an undefined pass percentage.
    if not table_rows:
        return {"checks_total": 0, "checks_passed": 0, "checks_pct": None}

    df = pd.DataFrame(table_rows)

    # Dynamic sustainability checks: boolean metrics with a positive
    # sustainability correlation count toward the pass/fail summary.
    mask = (df["result-type"] == "bool") & (df["sustainability_correlation"] == 1)
    checks_total = int(mask.sum())
    # .eq(True) (not truthiness) so only explicit True results count as
    # passed; None/NaN results compare unequal and are treated as failures.
    checks_passed = int(df.loc[mask, "result"].eq(True).sum())
    checks_pct = (100.0 * checks_passed / checks_total) if checks_total else None

    # name -> result (wide format); on duplicate names the last row wins,
    # matching the previous transpose-based implementation.
    wide: Dict[str, Any] = df.set_index("name")["result"].to_dict()

    # Excluded for scope/runtime reasons (see docstring).
    wide.pop("repo-file-info-entropy", None)

    # Preserve nested values (dicts/lists) with awkward arrays so the result
    # stays parquet-friendly. This also covers `repo-almanack-score`, which
    # previously had a redundant special case doing exactly the same thing.
    for key, value in list(wide.items()):
        if isinstance(value, (dict, list)) and not key.endswith("_json"):
            wide[f"{key}_nested"] = ak.Array([value])
            del wide[key]

    # Attach computed check summaries
    wide["checks_total"] = checks_total
    wide["checks_passed"] = checks_passed
    wide["checks_pct"] = checks_pct

    return wide


def process_repo_for_almanack(repo_url: str) -> Dict[str, Any]:
    """
    Processes a single GitHub repository URL into a flat dictionary of Almanack metrics.

    Args:
        repo_url (str): The GitHub repository URL.

    Returns:
        dict: Flattened metrics for the repository, including sustainability checks.
            If the processing fails, returns an error entry with the repository URL
            and the exception message under "almanack_error".
    """
    try:
        # Merge the base repo identifier with flattened metrics in one dictionary
        return {
            "Repository URL": repo_url,
            **table_to_wide(get_table(repo_path=repo_url)),
        }

    except Exception as e:
        # Best-effort batch processing: record the failure instead of raising,
        # so one bad repository does not abort a multi-repo run.
        # LOGGER.exception captures the traceback; lazy %-args defer message
        # formatting until the record is actually emitted.
        LOGGER.exception("ERROR processing %s: %s", repo_url, e)
        return {"Repository URL": repo_url, "almanack_error": str(e)}


def _get_almanack_version() -> str:
"""
Seeks the current version of almanack using either pkg_resources
Expand Down
13 changes: 12 additions & 1 deletion src/almanack/metrics/remote.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
"""

import logging
import os
import pathlib
import time
from datetime import datetime, timezone
Expand Down Expand Up @@ -39,6 +40,16 @@ def get_api_data(
if params is None:
params = {}

# If available, use GitHub token for authenticated requests to increase rate limits
github_token = os.environ.get("GITHUB_TOKEN")
headers = {"accept": "application/json"}
if github_token and (
"github.com" in api_endpoint or "api.github.com" in api_endpoint
):
headers["Authorization"] = f"Bearer {github_token}"
else:
headers = {"accept": "application/json"}

max_retries = 100 # Number of attempts for rate limit errors
base_backoff = 5 # Base backoff time in seconds

Expand All @@ -47,7 +58,7 @@ def get_api_data(
# Perform the GET request with query parameters
response = requests.get(
api_endpoint,
headers={"accept": "application/json"},
headers=headers,
params=params,
timeout=300,
)
Expand Down
Loading
Loading