
Commit 20d7c96

Merge pull request #91 from Carreau/misc
misc update and tidelift scraping
2 parents 4bdfc45 + 249bc47 commit 20d7c96

File tree: 3 files changed (+314, -3 lines)

tools/all_repos.py (new file, +183 lines)
```python
# https://packaging.python.org/en/latest/specifications/inline-script-metadata/
# /// script
# requires-python = ">=3.12"
# dependencies = [
#     "asks",
#     "beautifulsoup4",
#     "requests",
#     "rich",
#     "trio",
# ]
# ///
"""Cross-reference Jupyter GitHub repositories with PyPI packages.

This script lists the repositories of the default Jupyter-related GitHub
organizations, matches them against the packages published under the
"jupyter" PyPI organization, and reports likely matches, repositories with
no PyPI package, and packages with no matching repository.
"""

import os

import asks
import requests
import trio
from bs4 import BeautifulSoup
from rich import print


def get_packages(url):
    # Send a GET request to the webpage with a custom user agent
    headers = {"User-Agent": "python/request/jupyter"}
    response = requests.get(url, headers=headers, allow_redirects=True)

    if response.status_code != 200:
        print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
        exit(1)

    if "A required part of this site couldn’t load" in response.text:
        print("Fastly is blocking us. Status code: 403")
        exit(1)

    # Parse the HTML content
    soup = BeautifulSoup(response.content, "html.parser")

    # Find all <h3> tags and accumulate their text in a list
    h3_tags = [h3.get_text(strip=True) for h3 in soup.find_all("h3")]

    # Sort the list of <h3> contents
    h3_tags.sort()

    if not h3_tags:
        print("No packages found")
        exit(1)
    return h3_tags


default_orgs = [
    "binder-examples",
    "binderhub-ci-repos",
    "ipython",
    "jupyter",
    "jupyter-attic",
    "jupyter-book",
    "jupyter-governance",
    "jupyter-incubator",
    "jupyter-resources",
    "jupyter-server",
    "jupyter-standard",
    "jupyter-standards",
    "jupyter-widgets",
    "jupyter-xeus",
    "jupytercon",
    "jupyterhub",
    "jupyterlab",
    "voila-dashboards",
    "voila-gallery",
    "pickleshare",
]

token = os.getenv("GH_TOKEN")
if not token:
    print("[red]Error: GH_TOKEN environment variable not set[/red]")
    exit(1)

headers = {
    "Authorization": f"token {token}",
    "Accept": "application/vnd.github.v3+json",
}


async def list_repos(orgs):
    # Fetch the repository lists of all organizations concurrently;
    # the nursery guarantees all tasks are done when the block exits.
    async with trio.open_nursery() as nursery:
        results = []
        for org in orgs:

            async def _loc(results, org):
                results.append(await list_repos_for_org(org))

            nursery.start_soon(_loc, results, org)
    for org_repos in results:
        for org, repo in org_repos:
            yield org, repo


async def list_repos_for_org(org):
    reps = []
    for p in range(1, 10):
        response = await asks.get(
            f"https://api.github.com/orgs/{org}/repos?per_page=100&page={p}",
            headers=headers,
        )
        response.raise_for_status()
        repos = response.json()
        for repo in repos:
            reps.append((org, repo["name"]))
        if len(repos) < 100:
            break
    return reps


async def main():
    packages = get_packages("https://pypi.org/org/jupyter/")
    print(f"Found {len(packages)} packages in the PyPI jupyter org")

    package_map = {p.lower().replace("-", "_"): p for p in packages}

    todo = []
    async for org, repo in list_repos(default_orgs):
        lowname = repo.lower().replace("-", "_")
        if lowname in package_map:
            print(
                f"{org}/{repo}".ljust(40),
                f"https://pypi.org/project/{package_map[lowname]}",
                " in jupyter org",
            )
            del package_map[lowname]
        else:
            todo.append((org, repo))

    print()
    print("check potentially matching PyPI names:")

    async with trio.open_nursery() as nursery:
        targets = []
        for org, repo in todo:

            async def _loc(targets, org, repo):
                targets.append(
                    (
                        org,
                        repo,
                        (
                            await asks.get(f"https://pypi.org/pypi/{repo}/json")
                        ).status_code,
                    )
                )

            nursery.start_soon(_loc, targets, org, repo)
    corg = ""
    for org, repo, status in sorted(targets):
        if org != corg:
            print()
            corg = org
        if status == 200:
            print(
                f"https://github.com/{org}/{repo}".ljust(70),
                f"{status} for https://pypi.org/project/{repo}",
            )

    print()
    print("repos with no PyPI package:")
    corg = ""
    for org, repo, status in sorted(targets):
        if org != corg:
            print()
            corg = org
        if status != 200:
            print(f"https://github.com/{org}/{repo}")

    print()
    print("Packages with no repos.")
    print(package_map)


trio.run(main)
```
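Both new files embed PEP 723 inline script metadata, so a metadata-aware runner can resolve their dependencies on the fly, e.g. `GH_TOKEN=... uv run tools/all_repos.py` (assuming `uv` or another PEP 723-aware runner is available). The repo-to-package matching above hinges on a one-line canonicalization; the sketch below isolates it. The `canonical` helper is hypothetical, added for illustration, and is not part of the committed script:

```python
# Hypothetical helper, for illustration only: the script folds both repo and
# package names to lowercase with hyphens replaced by underscores, so
# "Jupyter-Server" and "jupyter_server" compare equal.
def canonical(name: str) -> str:
    return name.lower().replace("-", "_")


assert canonical("Jupyter-Server") == canonical("jupyter_server")
assert canonical("voila-dashboards") == "voila_dashboards"
```

This is looser than PEP 503 name normalization (which collapses runs of `-`, `_`, and `.`), but it is enough for the org inventories targeted here.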

tools/private_sec_report.py (+12, -3 lines)
```diff
@@ -72,11 +72,18 @@ async def get_private_report(session, org, repo):
     ) as repo_response:
         repo_info = await repo_response.json()
         archived = repo_info.get("archived", False)
+        private = repo_info.get("private", False)
     async with session.get(private_report_url, headers=headers) as response:
         if response.status == 200:
-            return org, repo, (await response.json()).get("enabled", False), archived
+            return (
+                org,
+                repo,
+                (await response.json()).get("enabled", False),
+                archived,
+                private,
+            )
         else:
-            return org, repo, False, archived
+            return org, repo, False, archived, private
 
 
 async def main():
@@ -90,14 +97,16 @@ async def main():
 
     results = await asyncio.gather(*tasks)
     prev_org = None
-    for org, repo, enabled, archived in results:
+    for org, repo, enabled, archived, private in results:
         if org != prev_org:
             print()
             print(f"[bold]{org}[/bold]")
             prev_org = org
         if enabled:
             print(f" [green]{repo}: {enabled}[/green]")
         else:
+            if private:
+                print(f" [yellow]{org}/{repo}: {enabled} (private)[/yellow]")
             if archived:
                 print(f" [yellow]{org}/{repo}: {enabled} (archived)[/yellow]")
             elif f"{org}/{repo}" in ignore_repos:
```
tools/tide.py (new file, +119 lines)
```python
# https://packaging.python.org/en/latest/specifications/inline-script-metadata/
# /// script
# requires-python = ">=3.12"
# dependencies = [
#     "requests",
#     "rich",
#     "beautifulsoup4",
# ]
# ///
import sys

import requests
from bs4 import BeautifulSoup
from rich import print
from rich.table import Table


def get_packages(url):
    # Send a GET request to the webpage with a custom user agent
    headers = {"User-Agent": "python/request/jupyter"}
    response = requests.get(url, headers=headers, allow_redirects=True)

    if response.status_code != 200:
        print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
        exit(1)

    if "A required part of this site couldn’t load" in response.text:
        print(f"Fastly is blocking us for {url}. Status code: 403")
        exit(1)

    # Parse the HTML content
    soup = BeautifulSoup(response.content, "html.parser")

    # Find all <h3> tags and accumulate their text in a list
    h3_tags = [h3.get_text(strip=True) for h3 in soup.find_all("h3")]

    # Sort the list of <h3> contents
    h3_tags.sort()

    if not h3_tags:
        print("No packages found")
        exit(1)
    return h3_tags


def get_tidelift_data(packages):
    packages_data = [{"platform": "pypi", "name": h3} for h3 in packages]

    data = {"packages": packages_data}
    res = requests.post(
        "https://tidelift.com/api/depci/estimate/bulk_estimates", json=data
    )

    res.raise_for_status()

    # Collect all package data for aligned printing
    package_data = []
    response_data = res.json()

    for package in response_data:
        name = package["name"]
        lifted = package["lifted"]
        estimated_money = package["estimated_money"]
        package_data.append((name, lifted, estimated_money))

    # Packages Tidelift did not report on get a row with unknown values.
    package_names = {p["name"] for p in response_data}
    for package in packages:
        if package not in package_names:
            package_data.append((package, None, None))

    # Create a table for aligned output
    table = Table(show_header=True, header_style="bold magenta")
    table.add_column("Package Name")
    table.add_column("Estimated Money")
    table.add_column("Lifted")

    def maybefloat(x):
        if x is None:
            return 0
        try:
            return float(x)
        except (TypeError, ValueError):
            return 0

    # Sort lifted=False entries first (by estimated amount descending, then
    # name), then lifted=True, with unknown (None) entries last.
    package_data.sort(key=lambda x: (x[1] is None, x[1], -maybefloat(x[2]), x[0]))
    for name, lifted, estimated_money in package_data:
        if lifted:
            table.add_row(name, "-- need login --", f"[green]{lifted}[/green]")
        else:
            table.add_row(name, str(estimated_money), f"[red]{lifted}[/red]")

    print(table)


if __name__ == "__main__":
    # Parse command-line arguments: --org ORG, --user USER, or --packages P1 P2 ...
    args = sys.argv[1:]
    packages = []
    while args:
        if args[0] == "--org":
            url = f"https://pypi.org/org/{args[1]}/"
            packages += get_packages(url)
            args = args[2:]
        elif args[0] == "--user":
            url = f"https://pypi.org/user/{args[1]}/"
            packages += get_packages(url)
            args = args[2:]
        elif args[0] == "--packages":
            packages += args[1:]
            args = []
        else:
            print(
                "Invalid argument. Please use either --org ORG, --user USER or --packages PACKAGE1 PACKAGE2 ..."
            )
            exit(1)
    get_tidelift_data(packages)
```
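Usage, as implied by the argument loop above: `tools/tide.py --org jupyter`, `--user USERNAME`, or `--packages PKG1 PKG2 ...`; `--org` and `--user` each consume one value and may be repeated or combined, while `--packages` consumes all remaining arguments. The table-building code assumes the bulk-estimates endpoint returns a list of objects with `name`, `lifted`, and `estimated_money` fields; a minimal illustration of that assumed shape (field names inferred from the parsing code, not from published Tidelift API documentation):

```python
# Illustrative response shape only -- field names inferred from the parsing
# code above, values invented for the example.
sample = [
    {"name": "ipython", "lifted": True, "estimated_money": None},
    {"name": "some-package", "lifted": False, "estimated_money": "123.45"},
]
for pkg in sample:
    print(pkg["name"], pkg["lifted"], pkg["estimated_money"])
```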
