"""
Collect vulnerability fix commits from a git repository.

Commit messages are scanned for known vulnerability identifiers (CVE, GHSA,
PYSEC, XSA) and one advisory is produced per identifier, referencing every
commit that mentions it.

The pipeline is registered in ``vulnerabilities/importers/__init__.py`` as
``collect_repo_fix_commits.CollectRepoFixCommitPipeline``.
"""

import re
import shutil
import tempfile
from collections import defaultdict

from git import Repo

from vulnerabilities.importer import AdvisoryData
from vulnerabilities.importer import ReferenceV2
from vulnerabilities.pipelines import VulnerableCodeBaseImporterPipelineV2

# Repository cloned when no ``repo_url`` has been set on the pipeline instance.
DEFAULT_REPO_URL = "https://github.com/torvalds/linux"

# Vulnerability identifier patterns searched for in commit messages.
# They are always applied case-insensitively.
SECURITY_PATTERNS = [
    r"\bCVE-\d{4}-\d{4,19}\b",
    r"\bGHSA-[a-z0-9]{4}-[a-z0-9]{4}-[a-z0-9]{4}\b",
    r"\bPYSEC-\d{4}-\d{1,6}\b",
    r"\bXSA-\d{1,4}\b",
]

# Compiled once at import time because the patterns run against every commit.
# Changes made to SECURITY_PATTERNS after import are therefore not picked up.
_SECURITY_REGEXES = tuple(
    re.compile(pattern, flags=re.IGNORECASE) for pattern in SECURITY_PATTERNS
)


def _normalize_vulnerability_id(raw_id: str) -> str:
    """
    Return ``raw_id`` in canonical case: upper-case prefix, lower-case remainder.

    For CVE, PYSEC and XSA the remainder is numeric, so only the prefix changes.
    For GHSA this yields the ``GHSA-xxxx-xxxx-xxxx`` spelling with a lower-case suffix.
    """
    prefix, _, suffix = raw_id.partition("-")
    return f"{prefix.upper()}-{suffix.lower()}"


def _commit_message_text(commit) -> str:
    """
    Return the message of ``commit`` as text.

    NOTE(review): GitPython appears to leave ``message`` as bytes when it cannot
    decode it; such messages are decoded leniently here so that the regex search
    does not raise a TypeError. Confirm against the GitPython version in use.
    """
    message = commit.message
    if isinstance(message, bytes):
        return message.decode("utf-8", errors="replace")
    return message


class CollectRepoFixCommitPipeline(VulnerableCodeBaseImporterPipelineV2):
    """
    Pipeline to collect fix commits from any git repository.

    Set ``repo_url`` on the instance before running to scan a repository other
    than ``DEFAULT_REPO_URL``.
    """

    pipeline_id = "repo_fix_commit"

    @classmethod
    def steps(cls):
        """Return the pipeline steps in execution order."""
        return (
            cls.clone,
            cls.collect_and_store_advisories,
            cls.clean_downloads,
        )

    def clone(self):
        """
        Clone the repository as a bare, blob-less clone into a temporary directory.

        A ``repo_url`` already present on the instance is honoured; otherwise
        ``DEFAULT_REPO_URL`` is used.
        """
        if not getattr(self, "repo_url", None):
            self.repo_url = DEFAULT_REPO_URL

        # Record the directory before cloning so that clean_downloads() can
        # remove it even when the clone itself fails and self.repo is never set.
        self.repo_path = tempfile.mkdtemp()
        self.repo = Repo.clone_from(
            url=self.repo_url,
            to_path=self.repo_path,
            bare=True,
            no_checkout=True,
            multi_options=["--filter=blob:none"],
        )

    def advisories_count(self) -> int:
        """
        Return the number of commits reachable from any ref.

        This is an upper bound on the number of advisories: every advisory is
        built from at least one commit, and most commits mention no identifier.
        ``--all`` matches the set of commits walked by collect_fix_commits().
        """
        return int(self.repo.git.rev_list("--count", "--all"))

    def classify_commit_type(self, commit) -> list[str]:
        """
        Extract vulnerability identifiers from a commit message.

        Return the matched identifiers in canonical case (see
        ``_normalize_vulnerability_id``), without duplicates, ordered by
        pattern and then by position in the message. Return an empty list when
        the message mentions no known identifier.
        """
        message = _commit_message_text(commit)
        found = (
            _normalize_vulnerability_id(match)
            for regex in _SECURITY_REGEXES
            for match in regex.findall(message)
        )
        # dict.fromkeys() drops repeats while keeping first-seen order, so a
        # commit mentioning the same identifier twice is recorded only once.
        return list(dict.fromkeys(found))

    def collect_fix_commits(self):
        """
        Iterate through repository commits and group them by vulnerability identifiers.

        Return a mapping of ``vuln_id`` to a list of ``(commit_id, commit_message)``
        tuples, one tuple per commit whose message mentions that identifier.
        """
        self.log("Processing git repository fix commits (grouped by vulnerability IDs).")

        grouped_commits = defaultdict(list)
        for commit in self.repo.iter_commits("--all"):
            matched_ids = self.classify_commit_type(commit)
            if not matched_ids:
                continue

            entry = (commit.hexsha, _commit_message_text(commit).strip())
            for vuln_id in matched_ids:
                grouped_commits[vuln_id].append(entry)

        self.log(f"Found {len(grouped_commits)} vulnerabilities with related commits.")
        self.log("Finished processing all commits.")
        return grouped_commits

    def collect_advisories(self):
        """
        Generate AdvisoryData objects for each vulnerability ID grouped with its related commits.

        Each advisory uses the vulnerability ID as both ``advisory_id`` and sole
        alias, lists the commits in its summary and carries one reference per
        commit. Reference URLs are built as ``<repo_url>/commit/<sha>``; this
        presumably only resolves on hosts using that layout (e.g. GitHub).
        """
        self.log("Generating AdvisoryData objects from grouped commits.")
        grouped_commits = self.collect_fix_commits()

        # A clone URL may end with "/" or ".git"; neither belongs in a commit URL.
        base_url = self.repo_url.rstrip("/").removesuffix(".git")

        for vuln_id, commits in grouped_commits.items():
            references = [ReferenceV2(url=f"{base_url}/commit/{cid}") for cid, _ in commits]

            summary_lines = [f"- {cid}: {msg}" for cid, msg in commits]
            summary = f"Commits fixing {vuln_id}:\n" + "\n".join(summary_lines)
            yield AdvisoryData(
                advisory_id=vuln_id,
                aliases=[vuln_id],
                summary=summary,
                references_v2=references,
                url=self.repo_url,
            )

    def clean_downloads(self):
        """
        Cleanup any temporary repository data.

        Safe to call more than once and when clone() failed part-way: a missing
        directory is ignored and other removal errors are logged rather than
        raised, so that cleanup never masks the error that triggered it.
        """
        self.log("Cleaning up local repository resources.")
        repo = getattr(self, "repo", None)
        repo_path = getattr(self, "repo_path", None)

        if repo is not None:
            repo_path = repo_path or repo.working_dir
            # Release resources held by GitPython before deleting the directory.
            repo.close()

        if not repo_path:
            return

        try:
            shutil.rmtree(path=repo_path)
        except FileNotFoundError:
            # Already removed by an earlier call.
            pass
        except OSError as error:
            self.log(f"Failed to remove {repo_path}: {error}")
        self.repo_path = None

    def on_failure(self):
        """Ensure cleanup is always performed on failure."""
        self.clean_downloads()
#
# Copyright (c) nexB Inc. and others. All rights reserved.
# VulnerableCode is a trademark of nexB Inc.
# SPDX-License-Identifier: Apache-2.0
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
# See https://github.com/aboutcode-org/vulnerablecode for support or download.
# See https://aboutcode.org for more information about nexB OSS projects.
#

import json
from pathlib import Path
from unittest import TestCase
from unittest.mock import MagicMock
from unittest.mock import patch

import pytest

from vulnerabilities.pipelines.v2_importers.collect_repo_fix_commits import (
    CollectRepoFixCommitPipeline,
)
from vulnerabilities.tests import util_tests


@pytest.fixture
def pipeline():
    """Provide a pipeline pointing at a dummy repository URL, with logging mocked out."""
    instance = CollectRepoFixCommitPipeline()
    instance.log = MagicMock()
    instance.repo_url = "https://github.com/test/repo"
    return instance


def test_classify_commit_type_extracts_ids(pipeline):
    """A message naming a CVE and a GHSA yields both identifiers, CVE first."""

    class CommitStub:
        message = "Fix for CVE-2023-1234 and GHSA-2479-qvv7-47qq"

    extracted = pipeline.classify_commit_type(CommitStub)
    assert extracted == ["CVE-2023-1234", "GHSA-2479-qvv7-47qq"]


@patch("vulnerabilities.pipelines.v2_importers.collect_repo_fix_commits.Repo")
def test_collect_fix_commits_groups_by_vuln(mock_repo, pipeline):
    """Commits are grouped under the identifier they mention; unrelated commits are dropped."""
    commits = [
        MagicMock(message="Fix CVE-2021-0001", hexsha="abc123"),
        MagicMock(message="Patch GHSA-dead-beef-baad", hexsha="def456"),
        MagicMock(message="Unrelated change", hexsha="ghi789"),
    ]

    pipeline.repo = MagicMock()
    pipeline.repo.iter_commits.return_value = commits

    def fake_classify(commit):
        # Stand-in for the real classifier: keyed on a substring of the message.
        if "CVE" in commit.message:
            return ["CVE-2021-0001"]
        if "GHSA" in commit.message:
            return ["GHSA-dead-beef-baad"]
        return []

    pipeline.classify_commit_type = MagicMock(side_effect=fake_classify)

    assert pipeline.collect_fix_commits() == {
        "CVE-2021-0001": [("abc123", "Fix CVE-2021-0001")],
        "GHSA-dead-beef-baad": [("def456", "Patch GHSA-dead-beef-baad")],
    }


TEST_DATA = Path(__file__).parent.parent.parent / "test_data" / "fix_commits"


class TestRepoFixCommitPipeline(TestCase):
    def test_collect_advisories_from_json(self):
        """Advisories built from the grouped-commit fixture match the expected JSON file."""
        expected_file = TEST_DATA / "expected_linux_advisory_output.json"
        input_file = TEST_DATA / "grouped_commits_input.json"
        grouped_commits = json.loads(input_file.read_text(encoding="utf-8"))

        importer = CollectRepoFixCommitPipeline()
        importer.log = MagicMock()
        importer.repo_url = "https://github.com/test/repo"
        # Bypass the repository walk: feed the fixture straight to collect_advisories().
        importer.collect_fix_commits = MagicMock(return_value=grouped_commits)

        advisories = []
        for advisory in importer.collect_advisories():
            advisories.append(advisory.to_dict())

        util_tests.check_results_against_json(advisories, expected_file)


@pytest.mark.parametrize(
    "commit_message, expected_ids",
    [
        ("Fix CVE-2023-12345 buffer overflow", ["CVE-2023-12345"]),
        ("Address GHSA-abcd-1234-efgh report", ["GHSA-abcd-1234-efgh"]),
        ("Python security PYSEC-2021-12345 fix", ["PYSEC-2021-12345"]),
        ("Xen XSA-43 security update", ["XSA-43"]),
        (
            "Fix CVE-2023-1111 and GHSA-aaaa-bbbb-cccc in kernel",
            ["CVE-2023-1111", "GHSA-aaaa-bbbb-cccc"],
        ),
        ("Refactor logging system with no security ID", []),
    ],
)
def test_classify_commit_type_detects_vuln_ids(pipeline, commit_message, expected_ids):
    """Ensure classify_commit_type correctly extracts vulnerability IDs."""

    class CommitStub:
        def __init__(self, text):
            self.message = text

    extracted = pipeline.classify_commit_type(CommitStub(commit_message))

    assert extracted == expected_ids, f"Unexpected result for message: {commit_message}"


def test_classify_commit_type_case_insensitive(pipeline):
    """Ensure pattern matching is case-insensitive."""

    class CommitStub:
        message = "fix cVe-2022-9999 and ghSa-dead-beef-baad"

    upper_cased = [found.upper() for found in pipeline.classify_commit_type(CommitStub)]
    assert any("CVE-2022-9999" in found for found in upper_cased)
    assert any("GHSA-DEAD-BEEF-BAAD" in found for found in upper_cased)
@@ +[ + { + "advisory_id": "CVE-2021-0001", + "aliases": [ + "CVE-2021-0001" + ], + "summary": "Commits fixing CVE-2021-0001:\n- abc123: Fix CVE-2021-0001", + "affected_packages": [], + "references_v2": [ + { + "reference_id": "", + "reference_type": "", + "url": "https://github.com/test/repo/commit/abc123" + } + ], + "severities": [], + "date_published": null, + "weaknesses": [], + "url": "https://github.com/test/repo" + }, + { + "advisory_id": "GHSA-dead-beef-baad", + "aliases": [ + "GHSA-dead-beef-baad" + ], + "summary": "Commits fixing GHSA-dead-beef-baad:\n- def456: Patch GHSA-dead-beef-baad", + "affected_packages": [], + "references_v2": [ + { + "reference_id": "", + "reference_type": "", + "url": "https://github.com/test/repo/commit/def456" + } + ], + "severities": [], + "date_published": null, + "weaknesses": [], + "url": "https://github.com/test/repo" + } +] \ No newline at end of file diff --git a/vulnerabilities/tests/test_data/fix_commits/grouped_commits_input.json b/vulnerabilities/tests/test_data/fix_commits/grouped_commits_input.json new file mode 100644 index 000000000..9c49d65a4 --- /dev/null +++ b/vulnerabilities/tests/test_data/fix_commits/grouped_commits_input.json @@ -0,0 +1,8 @@ +{ + "CVE-2021-0001": [ + ["abc123", "Fix CVE-2021-0001"] + ], + "GHSA-dead-beef-baad": [ + ["def456", "Patch GHSA-dead-beef-baad"] + ] +} \ No newline at end of file