Skip to content

Commit 06e8b82

Browse files
committed
Add initial support for parsing Git commit messages
Signed-off-by: ziad hany <[email protected]>
1 parent dcb0511 commit 06e8b82

File tree

1 file changed

+126
-0
lines changed

1 file changed

+126
-0
lines changed

vulnerabilities/extract_commits.py

Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
import json
2+
import os
3+
import re
4+
5+
from black.trans import defaultdict
6+
from git import Repo
7+
8+
9+
def clone_repo(repo_url: str, clone_dir: str) -> str:
    """
    Clone ``repo_url`` into ``clone_dir`` and return the checkout location.

    The target directory is created first if it does not exist. Progress and
    failures are reported on stdout.

    Returns the working tree directory of the new clone, or an empty string
    when cloning raised any exception.
    """
    # Make sure the destination exists before handing it to git.
    os.makedirs(clone_dir, exist_ok=True)

    checkout_path = ""
    try:
        print(f"Cloning {repo_url} into {clone_dir}...")
        cloned = Repo.clone_from(repo_url, clone_dir)
        print("Clone successful.")
        checkout_path = cloned.working_tree_dir
    except Exception as error:
        # Best effort: the caller is told about the failure through the empty result.
        print(f"Failed to clone repository: {error}")
    return checkout_path
21+
22+
23+
def classify_commit_type(commit):
    """
    Label a commit by how many parents it has.

    Returns "root" for no parents, "normal" for exactly one parent and
    "merge" for two or more.
    """
    labels_by_parent_count = {
        0: "root",  # never a fix
        1: "normal",  # main source of fixes
    }
    # Anything with more than one parent is a merge, which is usually not a fix.
    return labels_by_parent_count.get(len(commit.parents), "merge")
35+
36+
37+
def detect_fix_commit(commit):
    """
    Classify a commit from the text of its message.

    Returns "vulnerability_fix" when the message mentions a CVE identifier
    (matched case-insensitively), otherwise "other". No pattern currently
    produces any other label.

    ``commit`` only needs a ``message`` attribute. The message may be ``str``
    or ``bytes`` (bytes are decoded as UTF-8 with undecodable bytes replaced);
    a missing or empty message is classified as "other".
    """
    message = commit.message
    if isinstance(message, bytes):
        # A str pattern cannot be searched in bytes, so normalise to text first.
        message = message.decode("utf-8", errors="replace")
    if not message:
        return "other"

    # Vulnerability/security fix patterns
    security_patterns = [r"cve-[0-9]{4}-[0-9]{4,19}"]

    if any(re.search(pattern, message, flags=re.IGNORECASE) for pattern in security_patterns):
        return "vulnerability_fix"

    return "other"
51+
52+
53+
def extract_cves(text: str) -> list[str]:
    """
    Return the distinct CVE identifiers mentioned in ``text``.

    Matching is case-insensitive and every identifier is normalised to
    upper case. Duplicates are dropped and the result keeps the order in
    which each identifier first appears, so the output is the same on every
    run. Empty or missing text yields an empty list.
    """
    if not text:
        return []

    matches = re.findall(r"cve-[0-9]{4}-[0-9]{4,19}", text, flags=re.IGNORECASE)

    # dict keys are unique and keep insertion order, unlike a set.
    return list(dict.fromkeys(match.upper() for match in matches))
61+
62+
63+
def classify_diff(commit) -> bool:
    """
    Tell whether a commit changes anything other than documentation.

    Return True  -> commit touches at least one non-doc file (i.e., code change)
    Return False -> commit touches ONLY doc/text files, or no files at all

    The changed paths are read from ``commit.stats.files`` (a mapping keyed
    by file path). A path counts as documentation only when its extension,
    compared case-insensitively, is in the list below; files without an
    extension (for example ``Makefile``) are therefore treated as code.
    """
    doc_extensions = {
        ".txt",
        ".md",
        ".rst",
        ".mdx",
        ".doc",
        ".docx",
        ".odt",
        ".rtf",
        ".pdf",
        ".adoc",
        ".asciidoc",
        ".tex",
        ".markdown",
    }

    def _extension(path) -> str:
        # NOTE(review): git numstat can render a rename as "old => new" or
        # "dir/{old => new}"; keep only the destination name. Confirm whether
        # the installed GitPython already disables rename detection.
        destination = str(path).rsplit(" => ", 1)[-1].rstrip("}")
        return os.path.splitext(destination)[1].lower()

    return any(_extension(path) not in doc_extensions for path in commit.stats.files)
86+
87+
88+
if __name__ == "__main__":
    # Script entry point: clone (or reuse) the OpenSSL repository, walk every
    # commit, and print a JSON mapping of CVE id -> commit URLs mentioning it.
    import hashlib  # only needed here, to derive a stable clone directory name

    repo_url = "https://github.com/openssl/openssl/"

    # hash() of a str is salted per interpreter process, so it would select a
    # different directory (and trigger a fresh clone) on every run. A digest
    # of the URL is stable across runs.
    url_digest = hashlib.sha256(repo_url.encode("utf-8")).hexdigest()[:16]
    clone_dir = f"/tmp/{url_digest}"

    if os.path.isdir(os.path.join(clone_dir, ".git")):
        # A checkout from an earlier run is already there; cloning into a
        # non-empty directory would fail, so reuse it.
        repo_path = clone_dir
    else:
        repo_path = clone_repo(repo_url, clone_dir=clone_dir)

    if not repo_path:
        # clone_repo() reports failure with an empty string.
        # NOTE(review): Repo("") appears to fall back to GIT_DIR / the current
        # directory rather than failing — confirm; either way stop here.
        raise SystemExit(f"Could not obtain a checkout of {repo_url}")

    repo = Repo(repo_path)
    commit_url_prefix = repo_url.rstrip("/") + "/commit/"
    commits_data = []  # metadata of matching commits; not part of the printed output
    cve_list = {}  # CVE id -> set of commit URLs

    for commit in repo.iter_commits("--all"):
        # - Root commits: never a fix, ignored in fix detection.
        # - Normal commits: main source of bug/security fixes.
        # - Merge commits: usually not the fix themselves, they just join two histories.
        if classify_commit_type(commit) not in ("normal", "merge"):
            continue
        # Exact comparison: a substring test against the label would also
        # accept values such as "" or "fix".
        if detect_fix_commit(commit) != "vulnerability_fix":
            continue
        # Skip commits that only touch documentation.
        if not classify_diff(commit):
            continue

        message = commit.message
        if isinstance(message, bytes):
            # Normalise to text so the CVE regex can be applied.
            message = message.decode("utf-8", errors="replace")
        message = message.strip()

        commits_data.append(
            {
                "hash": commit.hexsha,
                "author": commit.author.name,
                "email": commit.author.email,
                "date": commit.committed_datetime.isoformat(),
                "message": message,
            }
        )

        for cve_id in extract_cves(message):
            cve_list.setdefault(cve_id, set()).add(commit_url_prefix + commit.hexsha)

    # Convert sets to sorted lists: JSON-serializable and identical between runs.
    result = {cve_id: sorted(commit_urls) for cve_id, commit_urls in cve_list.items()}

    print(f"Found {len(result)} unique CVEs")
    print(json.dumps(result, indent=2))

0 commit comments

Comments
 (0)