Skip to content

Commit 38ad209

Browse files
committed
update: add more analysis
1 parent cf7f2e4 commit 38ad209

9 files changed

+1652
-40
lines changed

analysis/3rd_party_libs.json

+50
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
{
2+
"Crypto": "pycryptodome",
3+
"PIL": "pillow",
4+
"blake3": "blake3",
5+
"bs4": "beautifulsoup4",
6+
"chardet": "chardet",
7+
"cryptography": "cryptography",
8+
"dateutil": "python-dateutil",
9+
"django": "django",
10+
"docx": "python-docx",
11+
"faker": "faker",
12+
"flask": "flask",
13+
"flask_login": "flask-login",
14+
"flask_mail": "Flask-Mail",
15+
"flask_restful": "flask-restful",
16+
"folium": "folium",
17+
"geopy": "geopy",
18+
"keras": "keras",
19+
"librosa": "librosa",
20+
"lxml": "lxml",
21+
"matplotlib": "matplotlib",
22+
"mechanize": "mechanize",
23+
"nltk": "nltk",
24+
"numpy": "numpy",
25+
"openpyxl": "openpyxl",
26+
"pandas": "pandas",
27+
"prettytable": "prettytable",
28+
"psutil": "psutil",
29+
"pytesseract": "pytesseract",
30+
"pytz": "pytz",
31+
"requests": "requests",
32+
"rsa": "rsa",
33+
"scipy": "scipy",
34+
"seaborn": "seaborn",
35+
"sendgrid": "sendgrid",
36+
"sklearn": "scikit-learn",
37+
"soundfile": "soundfile",
38+
"statsmodels": "statsmodels",
39+
"tensorflow": "tensorflow",
40+
"texttable": "texttable",
41+
"werkzeug": "werkzeug",
42+
"wordninja": "wordninja",
43+
"wtforms": "WTForms",
44+
"xlwt": "xlwt",
45+
"xmltodict": "xmltodict",
46+
"yaml": "PyYAML",
47+
"flask_wtf": "Flask-WTF",
48+
"gensim": "gensim",
49+
"python_http_client": "python-http-client"
50+
}

analysis/HumanEval.jsonl

+164
Large diffs are not rendered by default.

analysis/benchmark_analysis.ipynb

+240-39
Large diffs are not rendered by default.

analysis/download_stats.json

+50
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
{
2+
"requests": 407179468,
3+
"python-dateutil": 317542652,
4+
"PyYAML": 266619413,
5+
"numpy": 238728858,
6+
"cryptography": 228003149,
7+
"pandas": 190247110,
8+
"rsa": 176030185,
9+
"pytz": 155795840,
10+
"flask": 100804266,
11+
"scipy": 98509718,
12+
"psutil": 93922127,
13+
"werkzeug": 93236214,
14+
"beautifulsoup4": 91662537,
15+
"pillow": 86446419,
16+
"lxml": 79545989,
17+
"openpyxl": 76199497,
18+
"scikit-learn": 61481156,
19+
"chardet": 59122948,
20+
"matplotlib": 58474767,
21+
"xmltodict": 40772510,
22+
"pycryptodome": 26702817,
23+
"tensorflow": 21654531,
24+
"nltk": 18161557,
25+
"keras": 16838960,
26+
"seaborn": 16540691,
27+
"statsmodels": 16508856,
28+
"django": 13188158,
29+
"prettytable": 12583066,
30+
"faker": 12116411,
31+
"texttable": 7645216,
32+
"python-http-client": 6660018,
33+
"sendgrid": 6094785,
34+
"Flask-WTF": 5828864,
35+
"flask-login": 5493873,
36+
"WTForms": 5088533,
37+
"gensim": 4886183,
38+
"geopy": 4711066,
39+
"python-docx": 3982122,
40+
"xlwt": 3704863,
41+
"soundfile": 2472954,
42+
"librosa": 2445908,
43+
"pytesseract": 2004509,
44+
"flask-restful": 1563911,
45+
"folium": 998664,
46+
"Flask-Mail": 372163,
47+
"mechanize": 281683,
48+
"blake3": 209166,
49+
"wordninja": 183705
50+
}

analysis/download_stats.png

18.6 KB
Loading

analysis/ds1000.jsonl

+1,000
Large diffs are not rendered by default.

analysis/lib2domain.json

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
{"Crypto": "Cryptography", "PIL": "Visualization", "array": "General", "base64": "Cryptography", "binascii": "Cryptography", "bisect": "General", "blake3": "Cryptography", "bs4": "Network", "calendar": "Time", "cgi": "Network", "chardet": "Network", "cmath": "Computation", "codecs": "Cryptography", "collections": "General", "cryptography": "Cryptography", "csv": "System", "ctypes": "System", "datetime": "Time", "dateutil": "Time", "difflib": "General", "django": "Network", "docx": "System", "email": "Network", "faker": "General", "flask": "Network", "flask_login": "Network", "flask_mail": "Network", "flask_restful": "Network", "fnmatch": "General", "folium": "Visualization", "functools": "General", "geopy": "General", "getpass": "System", "glob": "System", "gzip": "System", "hashlib": "Cryptography", "heapq": "General", "hmac": "Cryptography", "html": "Network", "http": "Network", "importlib": "General", "inspect": "General", "io": "System", "ipaddress": "Network", "itertools": "General", "json": "System", "keras": "Computation", "librosa": "Computation", "logging": "System", "lxml": "Network", "math": "Computation", "matplotlib": "Visualization", "mechanize": "Network", "mimetypes": "Network", "multiprocessing": "System", "nltk": "Computation", "numpy": "Computation", "openpyxl": "System", "operator": "General", "os": "System", "pandas": "Computation", "pathlib": "System", "pickle": "System", "pkgutil": "General", "platform": "System", "prettytable": "General", "psutil": "System", "pytesseract": "Computation", "pytz": "Time", "queue": "General", "random": "General", "re": "General", "requests": "Network", "rsa": "Cryptography", "scipy": "Computation", "seaborn": "Visualization", "secrets": "Cryptography", "select": "System", "sendgrid": "Network", "shutil": "System", "sklearn": "Computation", "smtplib": "Network", "socket": "Network", "soundfile": "Computation", "sqlite3": "System", "ssl": "Network", "statistics": "Computation", "statsmodels": "Computation", 
"string": "General", "struct": "System", "subprocess": "System", "sys": "System", "tarfile": "System", "tensorflow": "Computation", "texttable": "General", "textwrap": "General", "threading": "System", "time": "Time", "turtle": "Visualization", "types": "General", "unicodedata": "General", "urllib": "Network", "uuid": "General", "warnings": "General", "werkzeug": "Network", "wordninja": "Computation", "wtforms": "Network", "xlwt": "System", "xml": "Network", "xmltodict": "Network", "yaml": "System", "zipfile": "System"}
1+
{"Crypto": "Cryptography", "PIL": "Visualization", "array": "General", "base64": "Cryptography", "binascii": "Cryptography", "bisect": "General", "blake3": "Cryptography", "bs4": "Network", "calendar": "Time", "cgi": "Network", "chardet": "Network", "cmath": "Computation", "codecs": "Cryptography", "collections": "General", "cryptography": "Cryptography", "csv": "System", "ctypes": "System", "datetime": "Time", "dateutil": "Time", "difflib": "General", "django": "Network", "docx": "System", "email": "Network", "faker": "General", "flask": "Network", "flask_login": "Network", "flask_mail": "Network", "flask_restful": "Network", "fnmatch": "General", "folium": "Visualization", "functools": "General", "geopy": "General", "getpass": "System", "glob": "System", "gzip": "System", "hashlib": "Cryptography", "heapq": "General", "hmac": "Cryptography", "html": "Network", "http": "Network", "importlib": "General", "inspect": "General", "io": "System", "ipaddress": "Network", "itertools": "General", "json": "System", "keras": "Computation", "librosa": "Computation", "logging": "System", "lxml": "Network", "math": "Computation", "matplotlib": "Visualization", "mechanize": "Network", "mimetypes": "Network", "multiprocessing": "System", "nltk": "Computation", "numpy": "Computation", "openpyxl": "System", "operator": "General", "os": "System", "pandas": "Computation", "pathlib": "System", "pickle": "System", "pkgutil": "General", "platform": "System", "prettytable": "General", "psutil": "System", "pytesseract": "Computation", "pytz": "Time", "queue": "General", "random": "General", "re": "General", "requests": "Network", "rsa": "Cryptography", "scipy": "Computation", "seaborn": "Visualization", "secrets": "Cryptography", "select": "System", "sendgrid": "Network", "shutil": "System", "sklearn": "Computation", "smtplib": "Network", "socket": "Network", "soundfile": "Computation", "sqlite3": "System", "ssl": "Network", "statistics": "Computation", "statsmodels": "Computation", 
"string": "General", "struct": "System", "subprocess": "System", "sys": "System", "tarfile": "System", "tensorflow": "Computation", "texttable": "General", "textwrap": "General", "threading": "System", "time": "Time", "turtle": "Visualization", "types": "General", "unicodedata": "General", "urllib": "Network", "uuid": "General", "warnings": "General", "werkzeug": "Network", "wordninja": "Computation", "wtforms": "Network", "xlwt": "System", "xml": "Network", "xmltodict": "Network", "yaml": "System", "zipfile": "System", "Levenshtein": "Computation", "ast": "General", "configparser": "System", "cv2": "Computation", "decimal": "General", "enum": "General", "errno": "System", "flask_wtf": "Network", "ftplib": "Network", "gensim": "Computation", "geopandas": "Computation", "holidays": "Time", "mpl_toolkits": "Visualization", "natsort": "General", "pyquery": "Network", "python_http_client": "Network", "regex": "General", "shapely": "Computation", "shlex": "System", "signal": "System", "skimage": "Computation", "sympy": "Computation", "textblob": "Computation", "typing": "General", "wikipedia": "Network", "wordcloud": "Visualization", "zlib": "System"}

analysis/pypi_download_stats.py

+73
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
import sys
2+
import json
3+
import subprocess
4+
from tqdm import tqdm
5+
from numpy import mean, median
6+
from matplotlib import pyplot as plt
7+
def get_pypi_stats(package_name):
    """
    Get the PyPI download count for a package using the pypinfo CLI.

    Runs ``pypinfo --json <package_name>``, echoes the raw output to stdout,
    and returns the ``download_count`` of the first row in the JSON result.

    Args:
        package_name: Name of the package to query, passed to pypinfo as a
            single argument.

    Returns:
        The value of ``rows[0]["download_count"]`` from pypinfo's JSON
        output, or ``None`` if the command could not be run, exited with a
        non-zero status, or produced output without that field.
    """
    # Pass the arguments as a list (no shell) so that a package name
    # containing shell metacharacters cannot alter the command being run.
    command = ["pypinfo", "--json", package_name]

    try:
        # Running the command and capturing the output
        result = subprocess.run(
            command,
            check=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
        )
    except subprocess.CalledProcessError as e:
        print(f"An error occurred: {e.stderr}", file=sys.stderr)
        return None
    except OSError as e:
        # Raised when the executable cannot be started (e.g. pypinfo is not
        # installed); report it the same way as a failed run.
        print(f"An error occurred: {e}", file=sys.stderr)
        return None

    # Print the output
    print(result.stdout)

    try:
        parsed_result = json.loads(result.stdout)
        return parsed_result["rows"][0]["download_count"]
    except (json.JSONDecodeError, KeyError, IndexError, TypeError) as e:
        # Non-JSON output, or JSON without a first row / download_count.
        print(f"An error occurred: could not read download count ({e!r})", file=sys.stderr)
        return None
27+
28+
if __name__ == "__main__":
    # Load the library -> domain mapping.
    with open("analysis/lib2domain.json") as f:
        lib2domain = json.load(f)

    # Load the standard-library reference (presumably a collection of module
    # names; only membership tests are performed on it here).
    with open("analysis/standard_lib.json") as f:
        standard_lib = json.load(f)

    # Write out the libraries from the mapping that are also found in the
    # standard-library reference, in the mapping's key order.
    with open("analysis/used_std_libs.json","w") as f:
        libs = [lib for lib in lib2domain if lib in standard_lib]
        json.dump(libs, f, indent=4)

    # The steps below are disabled. They list the libraries that are NOT in
    # the standard-library reference, query pypinfo for each, and write the
    # counts (sorted descending) to analysis/download_stats.json.
    #
    # with open("analysis/3rd_party_libs.json","w") as f:
    #     libs = []
    #     for lib in lib2domain.keys():
    #         if lib not in standard_lib:
    #             libs.append(lib)
    #     json.dump(libs,f,indent=4)

    # with open("analysis/3rd_party_libs.json") as f:
    #     libs = json.load(f)

    # download_stats = {}
    # for lib in tqdm(list(libs.values())[:]):
    #     print(f"Getting download stats for {lib}")
    #     download_stats[lib] = get_pypi_stats(lib)
    # sorted_download_stats = dict(sorted(download_stats.items(), key=lambda x: x[1], reverse=True))

    # with open("analysis/download_stats.json", "w") as f:
    #     json.dump(sorted_download_stats, f, indent=4)

    # Read the stored per-package download counts.
    with open("analysis/download_stats.json") as f:
        download_stats = json.load(f)

    # Materialise the values once and reuse them for the summary and the plot.
    counts = list(download_stats.values())

    # Report the mean and median of the download counts.
    print(f"Mean download stats: {mean(counts)}")
    print(f"Median download stats: {median(counts)}")

    # Draw a 50-bin histogram of the download counts and save it as a PNG.
    plt.hist(counts, bins=50, color='blue', edgecolor='black')
    plt.xlabel("Download Stats")
    plt.ylabel("Frequency")
    plt.title("Distribution of Download Stats")
    plt.savefig("analysis/download_stats.png")
72+
73+

analysis/used_std_libs.json

+74
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
[
2+
"array",
3+
"base64",
4+
"binascii",
5+
"bisect",
6+
"calendar",
7+
"cgi",
8+
"cmath",
9+
"codecs",
10+
"collections",
11+
"csv",
12+
"ctypes",
13+
"datetime",
14+
"difflib",
15+
"email",
16+
"fnmatch",
17+
"functools",
18+
"getpass",
19+
"glob",
20+
"gzip",
21+
"hashlib",
22+
"heapq",
23+
"hmac",
24+
"html",
25+
"http",
26+
"importlib",
27+
"inspect",
28+
"io",
29+
"ipaddress",
30+
"itertools",
31+
"json",
32+
"logging",
33+
"math",
34+
"mimetypes",
35+
"multiprocessing",
36+
"operator",
37+
"os",
38+
"pathlib",
39+
"pickle",
40+
"pkgutil",
41+
"platform",
42+
"queue",
43+
"random",
44+
"re",
45+
"secrets",
46+
"select",
47+
"shutil",
48+
"smtplib",
49+
"socket",
50+
"sqlite3",
51+
"ssl",
52+
"statistics",
53+
"string",
54+
"struct",
55+
"subprocess",
56+
"sys",
57+
"tarfile",
58+
"textwrap",
59+
"threading",
60+
"time",
61+
"turtle",
62+
"types",
63+
"unicodedata",
64+
"urllib",
65+
"uuid",
66+
"warnings",
67+
"xml",
68+
"zipfile",
69+
"decimal",
70+
"enum",
71+
"typing",
72+
"unittest",
73+
"zlib"
74+
]

0 commit comments

Comments
 (0)