|  | 
|  | 1 | +import json | 
|  | 2 | +import os | 
|  | 3 | +import re | 
|  | 4 | + | 
|  | 5 | +from black.trans import defaultdict | 
|  | 6 | +from git import Repo | 
|  | 7 | + | 
|  | 8 | + | 
def clone_repo(repo_url: str, clone_dir: str) -> str:
    """
    Clone ``repo_url`` into ``clone_dir`` and return the working-tree path.

    The target directory is created first if it is missing. Any exception
    raised while cloning is reported on stdout and turned into an empty
    string, so callers must check for ``""`` before using the result.
    """
    os.makedirs(clone_dir, exist_ok=True)

    working_tree = ""
    try:
        print(f"Cloning {repo_url} into {clone_dir}...")
        cloned = Repo.clone_from(repo_url, clone_dir)
        print("Clone successful.")
        working_tree = cloned.working_tree_dir
    except Exception as e:
        # Best-effort by design: report the failure and fall through to "".
        print(f"Failed to clone repository: {e}")
        working_tree = ""
    return working_tree
|  | 21 | + | 
|  | 22 | + | 
def classify_commit_type(commit):
    """
    Label a commit by how many parents it has.

    Returns "root" for no parents, "normal" for exactly one parent and
    "merge" for two or more.
    """
    parent_count = len(commit.parents)

    # Two or more parents: joins histories, usually not a fix itself.
    if parent_count > 1:
        return "merge"

    # One parent is the main source of fixes; zero parents is never a fix.
    return "normal" if parent_count else "root"
|  | 35 | + | 
|  | 36 | + | 
def detect_fix_commit(commit):
    """
    Decide whether a commit message marks a vulnerability fix.

    The message is lower-cased and searched for a CVE identifier.
    Returns "vulnerability_fix" when one is found, otherwise "other".
    """
    lowered_message = commit.message.lower()

    # Patterns whose presence marks the commit as a security fix.
    cve_patterns = ("cve-[0-9]{4}-[0-9]{4,19}",)

    for pattern in cve_patterns:
        if re.search(pattern, lowered_message) is not None:
            return "vulnerability_fix"

    return "other"
|  | 51 | + | 
|  | 52 | + | 
def extract_cves(text: str) -> list[str]:
    """
    Return the distinct CVE identifiers mentioned in ``text``.

    Matching is case-insensitive; every identifier is normalised to upper
    case. Duplicates are dropped and the result keeps first-occurrence
    order, so the output is the same on every run (a plain ``set`` would
    order the strings by their per-process salted hashes).

    Returns an empty list for empty or ``None`` input.
    """
    if not text:
        return []

    matches = re.findall("cve-[0-9]{4}-[0-9]{4,19}", text, flags=re.IGNORECASE)

    # dict keys are unique and preserve insertion order.
    return list(dict.fromkeys(match.upper() for match in matches))
|  | 61 | + | 
|  | 62 | + | 
def classify_diff(commit) -> bool:
    """
    Report whether a commit changes anything other than documentation.

    Files are judged purely by extension (case-insensitive); a path with no
    extension, or with one not listed below, counts as code.

    Return True  -> commit touches at least one non-doc file (i.e., code change)
    Return False -> commit touches ONLY doc/text files, or no files at all
    """
    doc_extensions = {
        ".txt",
        ".md",
        ".rst",
        ".mdx",
        ".doc",
        ".docx",
        ".odt",
        ".rtf",
        ".pdf",
        ".adoc",
        ".asciidoc",
        ".tex",
        ".markdown",
    }

    # Assumes commit.stats.files is a mapping keyed by changed file path, as
    # on GitPython commits. NOTE(review): confirm how renames are spelled in
    # those keys; an unrecognised spelling is treated as code here.
    changed_paths = commit.stats.files

    return any(
        os.path.splitext(str(path))[1].lower() not in doc_extensions
        for path in changed_paths
    )
|  | 86 | + | 
|  | 87 | + | 
def main() -> None:
    """
    Scan the OpenSSL history for CVE-tagged commits and print the findings.

    Clones the repository (or reuses an earlier clone), walks every commit
    reachable from any ref, and prints a JSON object mapping each CVE
    identifier to the GitHub URLs of the commits whose message mentions it.
    Exits with an error message if no working copy can be obtained.
    """
    # Local import: only this entry point needs a stable digest.
    import hashlib

    repo_url = "https://github.com/openssl/openssl/"

    # hash() of a str is salted per process, which gave every run a fresh
    # directory and a full re-clone; a SHA-256 prefix is stable across runs.
    url_digest = hashlib.sha256(repo_url.encode("utf-8")).hexdigest()[:16]
    clone_dir = f"/tmp/{url_digest}"

    # Reuse an existing clone; cloning into a populated directory would fail.
    if os.path.isdir(os.path.join(clone_dir, ".git")):
        repo_path = clone_dir
    else:
        repo_path = clone_repo(repo_url, clone_dir=clone_dir)

    # clone_repo signals failure with "", which Repo() cannot open.
    if not repo_path:
        raise SystemExit(f"Could not obtain a working copy of {repo_url}")

    repo = Repo(repo_path)
    commit_url_prefix = repo_url.rstrip("/") + "/commit/"

    # NOTE(review): commits_data is collected but not emitted anywhere yet.
    commits_data = []
    cve_commits = {}

    for commit in repo.iter_commits("--all"):
        # - Root commits are never a fix and are ignored in fix detection.
        # - Normal commits are the main source of bug/security fixes.
        # - Merge commits usually are not the fix themselves (they only join
        #   two histories) but are still accepted below.
        if detect_fix_commit(commit) != "vulnerability_fix":
            continue
        if classify_commit_type(commit) not in ("normal", "merge"):
            continue

        # Only commits that touch something besides documentation are
        # recorded in detail.
        if classify_diff(commit):
            commits_data.append(
                {
                    "hash": commit.hexsha,
                    "author": commit.author.name,
                    "email": commit.author.email,
                    "date": commit.committed_datetime.isoformat(),
                    "message": commit.message.strip(),
                }
            )

        for cve_id in extract_cves(commit.message.strip()):
            cve_commits.setdefault(cve_id, set()).add(commit_url_prefix + commit.hexsha)

    # Sets are not JSON-serialisable; sorting keeps the output reproducible.
    result = {cve_id: sorted(urls) for cve_id, urls in sorted(cve_commits.items())}

    print(f"Found {len(result)} unique CVEs")
    print(json.dumps(result, indent=2))


if __name__ == "__main__":
    main()
0 commit comments