
Commit 20d7c96

Merge pull request #91 from Carreau/misc
misc update and tidelift scraping
2 parents 4bdfc45 + 249bc47 commit 20d7c96

File tree: 3 files changed (+314, -3 lines)

tools/all_repos.py (new file, +183 lines)
```python
# https://packaging.python.org/en/latest/specifications/inline-script-metadata/
# /// script
# requires-python = ">=3.12"
# dependencies = [
#     "asks",
#     "beautifulsoup4",
#     "requests",
#     "rich",
#     "trio",
# ]
# ///
"""Cross-reference Jupyter GitHub repositories with PyPI packages.

This script lists the repositories of the default Jupyter-related GitHub
organizations, matches them against the packages published under the
"jupyter" PyPI organization, and reports likely matches, repositories with
no PyPI package, and packages with no matching repository.
"""

import os

import asks
import requests
import trio
from bs4 import BeautifulSoup
from rich import print


def get_packages(url):
    # Send a GET request to the webpage with a custom user agent
    headers = {"User-Agent": "python/request/jupyter"}
    response = requests.get(url, headers=headers, allow_redirects=True)

    if response.status_code != 200:
        print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
        exit(1)

    if "A required part of this site couldn’t load" in response.text:
        print("Fastly is blocking us. Status code: 403")
        exit(1)

    # Parse the HTML content
    soup = BeautifulSoup(response.content, "html.parser")

    # Find all <h3> tags and accumulate their text in a list
    h3_tags = [h3.get_text(strip=True) for h3 in soup.find_all("h3")]

    # Sort the list of <h3> contents
    h3_tags.sort()

    if not h3_tags:
        print("No packages found")
        exit(1)
    return h3_tags


default_orgs = [
    "binder-examples",
    "binderhub-ci-repos",
    "ipython",
    "jupyter",
    "jupyter-attic",
    "jupyter-book",
    "jupyter-governance",
    "jupyter-incubator",
    "jupyter-resources",
    "jupyter-server",
    "jupyter-standard",
    "jupyter-standards",
    "jupyter-widgets",
    "jupyter-xeus",
    "jupytercon",
    "jupyterhub",
    "jupyterlab",
    "voila-dashboards",
    "voila-gallery",
    "pickleshare",
]

token = os.getenv("GH_TOKEN")
if not token:
    print("[red]Error: GH_TOKEN environment variable not set[/red]")
    exit(1)

headers = {
    "Authorization": f"token {token}",
    "Accept": "application/vnd.github.v3+json",
}


async def list_repos(orgs):
    # Fetch the repository lists of all organizations concurrently;
    # the nursery guarantees all tasks are done when the block exits.
    async with trio.open_nursery() as nursery:
        results = []
        for org in orgs:

            async def _loc(results, org):
                results.append(await list_repos_for_org(org))

            nursery.start_soon(_loc, results, org)
    for org_repos in results:
        for org, repo in org_repos:
            yield org, repo


async def list_repos_for_org(org):
    reps = []
    for p in range(1, 10):
        response = await asks.get(
            f"https://api.github.com/orgs/{org}/repos?per_page=100&page={p}",
            headers=headers,
        )
        response.raise_for_status()
        repos = response.json()
        for repo in repos:
            reps.append((org, repo["name"]))
        if len(repos) < 100:
            break
    return reps


async def main():
    packages = get_packages("https://pypi.org/org/jupyter/")
    print(f"Found {len(packages)} packages in the PyPI jupyter org")

    package_map = {p.lower().replace("-", "_"): p for p in packages}

    todo = []
    async for org, repo in list_repos(default_orgs):
        lowname = repo.lower().replace("-", "_")
        if lowname in package_map:
            print(
                f"{org}/{repo}".ljust(40),
                f"https://pypi.org/project/{package_map[lowname]}",
                " in jupyter org",
            )
            del package_map[lowname]
        else:
            todo.append((org, repo))

    print()
    print("check potentially matching PyPI names:")

    async with trio.open_nursery() as nursery:
        targets = []
        for org, repo in todo:

            async def _loc(targets, org, repo):
                targets.append(
                    (
                        org,
                        repo,
                        (
                            await asks.get(f"https://pypi.org/pypi/{repo}/json")
                        ).status_code,
                    )
                )

            nursery.start_soon(_loc, targets, org, repo)
    corg = ""
    for org, repo, status in sorted(targets):
        if org != corg:
            print()
            corg = org
        if status == 200:
            print(
                f"https://github.com/{org}/{repo}".ljust(70),
                f"{status} for https://pypi.org/project/{repo}",
            )

    print()
    print("repos with no PyPI package:")
    corg = ""
    for org, repo, status in sorted(targets):
        if org != corg:
            print()
            corg = org
        if status != 200:
            print(f"https://github.com/{org}/{repo}")

    print()
    print("Packages with no repos.")
    print(package_map)


trio.run(main)
```
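Both new files embed PEP 723 inline script metadata, so a metadata-aware runner can resolve their dependencies on the fly, e.g. `GH_TOKEN=... uv run tools/all_repos.py` (assuming `uv` or another PEP 723-aware runner is available). The repo-to-package matching above hinges on a one-line canonicalization; the sketch below isolates it. The `canonical` helper is hypothetical, added for illustration, and is not part of the committed script:

```python
# Hypothetical helper, for illustration only: the script folds both repo and
# package names to lowercase with hyphens replaced by underscores, so
# "Jupyter-Server" and "jupyter_server" compare equal.
def canonical(name: str) -> str:
    return name.lower().replace("-", "_")


assert canonical("Jupyter-Server") == canonical("jupyter_server")
assert canonical("voila-dashboards") == "voila_dashboards"
```

This is looser than PEP 503 name normalization (which collapses runs of `-`, `_`, and `.`), but it is enough for the org inventories targeted here.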

tools/private_sec_report.py (+12, -3 lines)
```diff
@@ -72,11 +72,18 @@ async def get_private_report(session, org, repo):
     ) as repo_response:
         repo_info = await repo_response.json()
         archived = repo_info.get("archived", False)
+        private = repo_info.get("private", False)
     async with session.get(private_report_url, headers=headers) as response:
         if response.status == 200:
-            return org, repo, (await response.json()).get("enabled", False), archived
+            return (
+                org,
+                repo,
+                (await response.json()).get("enabled", False),
+                archived,
+                private,
+            )
         else:
-            return org, repo, False, archived
+            return org, repo, False, archived, private
 
 
 async def main():
@@ -90,14 +97,16 @@ async def main():
 
     results = await asyncio.gather(*tasks)
     prev_org = None
-    for org, repo, enabled, archived in results:
+    for org, repo, enabled, archived, private in results:
         if org != prev_org:
             print()
             print(f"[bold]{org}[/bold]")
             prev_org = org
         if enabled:
             print(f" [green]{repo}: {enabled}[/green]")
         else:
+            if private:
+                print(f" [yellow]{org}/{repo}: {enabled} (private)[/yellow]")
             if archived:
                 print(f" [yellow]{org}/{repo}: {enabled} (archived)[/yellow]")
             elif f"{org}/{repo}" in ignore_repos:
```
tools/tide.py (new file, +119 lines)
```python
# https://packaging.python.org/en/latest/specifications/inline-script-metadata/
# /// script
# requires-python = ">=3.12"
# dependencies = [
#     "requests",
#     "rich",
#     "beautifulsoup4",
# ]
# ///
import sys

import requests
from bs4 import BeautifulSoup
from rich import print
from rich.table import Table


def get_packages(url):
    # Send a GET request to the webpage with a custom user agent
    headers = {"User-Agent": "python/request/jupyter"}
    response = requests.get(url, headers=headers, allow_redirects=True)

    if response.status_code != 200:
        print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
        exit(1)

    if "A required part of this site couldn’t load" in response.text:
        print(f"Fastly is blocking us for {url}. Status code: 403")
        exit(1)

    # Parse the HTML content
    soup = BeautifulSoup(response.content, "html.parser")

    # Find all <h3> tags and accumulate their text in a list
    h3_tags = [h3.get_text(strip=True) for h3 in soup.find_all("h3")]

    # Sort the list of <h3> contents
    h3_tags.sort()

    if not h3_tags:
        print("No packages found")
        exit(1)
    return h3_tags


def get_tidelift_data(packages):
    packages_data = [{"platform": "pypi", "name": h3} for h3 in packages]

    data = {"packages": packages_data}
    res = requests.post(
        "https://tidelift.com/api/depci/estimate/bulk_estimates", json=data
    )

    res.raise_for_status()

    # Collect all package data for aligned printing
    package_data = []
    response_data = res.json()

    for package in response_data:
        name = package["name"]
        lifted = package["lifted"]
        estimated_money = package["estimated_money"]
        package_data.append((name, lifted, estimated_money))

    # Packages Tidelift did not report on get a row with unknown values.
    package_names = {p["name"] for p in response_data}
    for package in packages:
        if package not in package_names:
            package_data.append((package, None, None))

    # Create a table for aligned output
    table = Table(show_header=True, header_style="bold magenta")
    table.add_column("Package Name")
    table.add_column("Estimated Money")
    table.add_column("Lifted")

    def maybefloat(x):
        if x is None:
            return 0
        try:
            return float(x)
        except (TypeError, ValueError):
            return 0

    # Sort lifted=False entries first (by estimated amount descending, then
    # name), then lifted=True, with unknown (None) entries last.
    package_data.sort(key=lambda x: (x[1] is None, x[1], -maybefloat(x[2]), x[0]))
    for name, lifted, estimated_money in package_data:
        if lifted:
            table.add_row(name, "-- need login --", f"[green]{lifted}[/green]")
        else:
            table.add_row(name, str(estimated_money), f"[red]{lifted}[/red]")

    print(table)


if __name__ == "__main__":
    # Parse command-line arguments: --org ORG, --user USER, or --packages P1 P2 ...
    args = sys.argv[1:]
    packages = []
    while args:
        if args[0] == "--org":
            url = f"https://pypi.org/org/{args[1]}/"
            packages += get_packages(url)
            args = args[2:]
        elif args[0] == "--user":
            url = f"https://pypi.org/user/{args[1]}/"
            packages += get_packages(url)
            args = args[2:]
        elif args[0] == "--packages":
            packages += args[1:]
            args = []
        else:
            print(
                "Invalid argument. Please use either --org ORG, --user USER or --packages PACKAGE1 PACKAGE2 ..."
            )
            exit(1)
    get_tidelift_data(packages)
```
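Usage, as implied by the argument loop above: `tools/tide.py --org jupyter`, `--user USERNAME`, or `--packages PKG1 PKG2 ...`; `--org` and `--user` each consume one value and may be repeated or combined, while `--packages` consumes all remaining arguments. The table-building code assumes the bulk-estimates endpoint returns a list of objects with `name`, `lifted`, and `estimated_money` fields; a minimal illustration of that assumed shape (field names inferred from the parsing code, not from published Tidelift API documentation):

```python
# Illustrative response shape only -- field names inferred from the parsing
# code above, values invented for the example.
sample = [
    {"name": "ipython", "lifted": True, "estimated_money": None},
    {"name": "some-package", "lifted": False, "estimated_money": "123.45"},
]
for pkg in sample:
    print(pkg["name"], pkg["lifted"], pkg["estimated_money"])
```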
