-
-
Notifications
You must be signed in to change notification settings - Fork 238
Collect existing fix commits for project-kb #1987
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,125 @@ | ||
| # | ||
| # Copyright (c) nexB Inc. and others. All rights reserved. | ||
| # VulnerableCode is a trademark of nexB Inc. | ||
| # SPDX-License-Identifier: Apache-2.0 | ||
| # See http://www.apache.org/licenses/LICENSE-2.0 for the license text. | ||
| # See https://github.com/aboutcode-org/vulnerablecode for support or download. | ||
| # See https://aboutcode.org for more information about nexB OSS projects. | ||
| # | ||
|
|
||
| import json | ||
| from pathlib import Path | ||
| from typing import Iterable | ||
|
|
||
| import saneyaml | ||
| from fetchcode.vcs import fetch_via_vcs | ||
| from packageurl import PackageURL | ||
| from univers.maven import VersionRange | ||
|
|
||
| from vulnerabilities.importer import AdvisoryData | ||
| from vulnerabilities.importer import AffectedPackageV2 | ||
| from vulnerabilities.importer import ReferenceV2 | ||
| from vulnerabilities.pipelines import VulnerableCodeBaseImporterPipelineV2 | ||
| from vulnerabilities.utils import get_advisory_url | ||
|
|
||
|
|
||
class ProjectKBPipeline(VulnerableCodeBaseImporterPipelineV2):
    """
    ProjectKB Importer Pipeline

    Collect advisories from ProjectKB data:
    - YAML statements: https://github.com/SAP/project-kb/blob/vulnerability-data/statements/*/*.yaml

    Only files named ``statement.yaml`` below the ``statements`` directory of
    the cloned repository are read. Each statement that carries a
    ``vulnerability_id`` yields one ``AdvisoryData``.
    """

    pipeline_id = "project-kb_v2"
    spdx_license_expression = "Apache-2.0"
    license_url = "https://github.com/SAP/project-kb/blob/main/LICENSE.txt"
    repo_url = "git+https://github.com/SAP/project-kb@vulnerability-data"

    @classmethod
    def steps(cls):
        """Return the ordered pipeline steps: clone, import, then clean up."""
        return (cls.clone_repo, cls.collect_and_store_advisories, cls.clean_downloads)

    def clone_repo(self):
        """Clone ``repo_url`` and keep the VCS response on ``self.vcs_response``."""
        self.log("Processing ProjectKB advisory data...")
        self.vcs_response = fetch_via_vcs(self.repo_url)

    def _statements_dir(self):
        """Return the ``statements`` directory inside the cloned repository."""
        return Path(self.vcs_response.dest_dir) / "statements"

    def _statement_files(self):
        """Yield every ``statement.yaml`` file found below the statements directory."""
        for yaml_file in self._statements_dir().rglob("*.yaml"):
            if yaml_file.name == "statement.yaml":
                yield yaml_file

    def advisories_count(self):
        """
        Return the number of statement files that ``collect_advisories`` will read.

        The same file filter as ``collect_advisories`` is used so the estimate
        does not include unrelated YAML files.
        """
        count = sum(1 for _ in self._statement_files())
        self.log(f"Estimated advisories to process: {count}")
        return count

    @staticmethod
    def _commit_url(repository, commit_id):
        """
        Return the commit URL for ``commit_id`` in ``repository``.

        Only a trailing ``.git`` is removed; a ``.git`` that appears elsewhere
        in the URL (for example in ``user.github.io``) is left untouched.
        """
        repository = repository.rstrip("/")
        if repository.endswith(".git"):
            repository = repository[: -len(".git")]
        return repository + "/commit/" + commit_id

    def _collect_references(self, yaml_data):
        """Return one reference per fix commit that has both an id and a repository."""
        references = []
        for fix in yaml_data.get("fixes") or []:
            for commit in fix.get("commits") or []:
                commit_id = commit.get("id")
                repository = commit.get("repository")
                if not commit_id or not repository:
                    continue
                references.append(ReferenceV2.from_url(self._commit_url(repository, commit_id)))
        return references

    def _collect_affected_packages(self, yaml_data):
        """
        Return the ``AffectedPackageV2`` entries built from the statement artifacts.

        Artifacts that are not flagged ``affected``, that have no ``id``, whose
        ``id`` is not a valid purl, or whose purl has no version are skipped
        instead of aborting the whole import.
        """
        affected_packages = []
        for artifact in yaml_data.get("artifacts") or []:
            if not artifact.get("affected"):
                continue

            purl_str = artifact.get("id")
            if not purl_str:
                continue

            try:
                purl = PackageURL.from_string(purl_str)
            except ValueError as e:
                self.log(f"Skipping invalid purl {purl_str!r}: {e}")
                continue

            if not purl.version:
                self.log(f"Skipping purl without version: {purl_str!r}")
                continue

            # NOTE(review): the artifact is flagged ``affected`` yet its version is
            # recorded as ``fixed_version_range`` -- this looks inverted; confirm
            # against the ProjectKB statement semantics before relying on it.
            affected_packages.append(
                AffectedPackageV2(
                    package=PackageURL(type=purl.type, namespace=purl.namespace, name=purl.name),
                    fixed_version_range=VersionRange.from_version(purl.version),
                )
            )
        return affected_packages

    def collect_advisories(self) -> Iterable[AdvisoryData]:
        """Collect fix commits from YAML statements under /statements."""
        base_path = self._statements_dir()

        for yaml_file in self._statement_files():
            with open(yaml_file, encoding="utf-8") as f:
                yaml_data = saneyaml.load(f)

            # An empty document has nothing to import.
            if not yaml_data:
                continue

            vulnerability_id = yaml_data.get("vulnerability_id")
            if not vulnerability_id:
                continue

            note_texts = [note.get("text") for note in yaml_data.get("notes") or []]
            description = "\n".join(text for text in note_texts if text)

            advisory_url = get_advisory_url(
                file=yaml_file,
                base_path=base_path,
                url="https://github.com/SAP/project-kb/blob/vulnerability-data/statements/",
            )

            yield AdvisoryData(
                advisory_id=vulnerability_id,
                aliases=[],
                summary=description,
                affected_packages=self._collect_affected_packages(yaml_data),
                references_v2=self._collect_references(yaml_data),
                url=advisory_url,
                original_advisory_text=json.dumps(yaml_data, indent=2, ensure_ascii=False),
            )

    def clean_downloads(self):
        """Remove the cloned repository from disk."""
        self.log("Removing cloned repository...")
        # ``vcs_response`` is only set by ``clone_repo``; it is missing when the
        # clone itself failed and ``on_failure`` calls this method.
        vcs_response = getattr(self, "vcs_response", None)
        if vcs_response:
            vcs_response.delete()

    def on_failure(self):
        """Ensure cleanup happens on pipeline failure."""
        self.clean_downloads()
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,91 @@ | ||
| # | ||
| # Copyright (c) nexB Inc. and others. All rights reserved. | ||
| # VulnerableCode is a trademark of nexB Inc. | ||
| # SPDX-License-Identifier: Apache-2.0 | ||
| # See http://www.apache.org/licenses/LICENSE-2.0 for the license text. | ||
| # See https://github.com/aboutcode-org/vulnerablecode for support or download. | ||
| # See https://aboutcode.org for more information about nexB OSS projects. | ||
| # | ||
|
|
||
| import csv | ||
| from pathlib import Path | ||
|
|
||
| from fetchcode.vcs import fetch_via_vcs | ||
|
|
||
| from vulnerabilities.models import AdvisoryV2 | ||
| from vulnerabilities.models import CodeFixV2 | ||
| from vulnerabilities.pipelines import VulnerableCodePipeline | ||
|
|
||
|
|
||
class CollectFixCommitsProjectKBPipeline(VulnerableCodePipeline):
    """
    Pipeline to collect fix commits from Project KB:
    https://github.com/SAP/project-kb/blob/main/MSR2019/dataset/vulas_db_msr2019_release.csv

    Each CSV row is read as four columns; the first three are used as
    vulnerability id, repository URL and commit id. For every row whose
    vulnerability id matches an existing ``AdvisoryV2``, one ``CodeFixV2`` is
    created per package affected by that advisory.
    """

    pipeline_id = "kb_project_fix_commits"
    spdx_license_expression = "Apache-2.0"
    license_url = "https://github.com/SAP/project-kb/blob/main/LICENSE.txt"
    qualified_name = "kb_project_fix_commits"
    repo_url = "git+https://github.com/SAP/project-kb"

    @classmethod
    def steps(cls):
        """Return the ordered pipeline steps: clone, collect, then clean up."""
        return (
            cls.clone,
            cls.collect_fix_commits,
            # Remove the clone after a successful run as well, not only on failure.
            cls.clean_downloads,
        )

    def clone(self):
        """Clone ``repo_url`` and keep the VCS response on ``self.vcs_response``."""
        self.log("Cloning repositories for ProjectKB fix commits from CSV...")
        self.vcs_response = fetch_via_vcs(self.repo_url)

    def collect_fix_commits(self):
        """Read the MSR2019 CSV and create ``CodeFixV2`` rows for known advisories."""
        self.log("Collecting fix commits from ProjectKB...")

        csv_path = Path(self.vcs_response.dest_dir) / "MSR2019/dataset/vulas_db_msr2019_release.csv"

        with open(csv_path, newline="", encoding="utf-8") as f:
            reader = csv.reader(f)
            next(reader, None)  # skip header
            rows = [r for r in reader if len(r) == 4 and r[0]]

        vuln_ids = {r[0] for r in rows}
        # NOTE(review): "We do not want to merge the advisory info coming from
        # different sources." This lookup matches on ``advisory_id`` alone.
        advisories = AdvisoryV2.objects.filter(advisory_id__in=vuln_ids).prefetch_related(
            "impacted_packages__affecting_packages"
        )
        advisory_map = {a.advisory_id: a for a in advisories}

        codefixes = []
        for vuln_id, repo_url, commit, _ in rows:
            advisory = advisory_map.get(vuln_id)
            if not advisory:
                continue

            # A row without a repository or commit cannot yield a usable commit URL.
            if not repo_url or not commit:
                continue

            repo_url = repo_url.rstrip("/").removesuffix(".git")
            vcs_url = f"{repo_url}/commit/{commit}"

            # NOTE(review): "IMHO we should treat this as an advisory and update
            # the impacted package model to hold the fixed and affecting commits."
            # NOTE(review): "The main issue is how to relate a fix commit to an
            # impacted package. IMHO, we should have an advisory, but the code fix
            # should be considered a reference URL, with an optional relation to
            # the impacted packages, since we don't know which version or package
            # (purl) is going to be impacted by this commit."
            for impact in advisory.impacted_packages.all():
                for pkg in impact.affecting_packages.all():
                    codefixes.append(
                        CodeFixV2(
                            commits=[vcs_url],
                            advisory=advisory,
                            affected_package=pkg,
                        )
                    )

        if codefixes:
            # With ``ignore_conflicts=True`` conflicting rows are skipped, so the
            # number logged below is an upper bound on the rows actually inserted.
            CodeFixV2.objects.bulk_create(codefixes, ignore_conflicts=True)
            self.log(f"Created {len(codefixes)} CodeFix entries.")
        else:
            self.log("No CodeFix entries created.")

    def clean_downloads(self):
        """Remove the cloned repository from disk."""
        # ``vcs_response`` is only set by ``clone``; it is missing when the clone
        # itself failed and ``on_failure`` calls this method.
        vcs_response = getattr(self, "vcs_response", None)
        if vcs_response:
            self.log("Removing cloned repository")
            vcs_response.delete()

    def on_failure(self):
        """Ensure cleanup happens on pipeline failure."""
        self.clean_downloads()
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,89 @@ | ||
| # | ||
| # Copyright (c) nexB Inc. and others. All rights reserved. | ||
| # VulnerableCode is a trademark of nexB Inc. | ||
| # SPDX-License-Identifier: Apache-2.0 | ||
| # See http://www.apache.org/licenses/LICENSE-2.0 for the license text. | ||
| # See https://github.com/aboutcode-org/vulnerablecode for support or download. | ||
| # See https://aboutcode.org for more information about nexB OSS projects. | ||
| # | ||
| from datetime import datetime | ||
| from datetime import timezone | ||
| from pathlib import Path | ||
| from types import SimpleNamespace | ||
| from unittest import TestCase | ||
| from unittest.mock import patch | ||
|
|
||
| import pytest | ||
|
|
||
| from vulnerabilities.models import AdvisoryV2 | ||
| from vulnerabilities.models import CodeFixV2 | ||
| from vulnerabilities.models import ImpactedPackage | ||
| from vulnerabilities.models import PackageV2 | ||
| from vulnerabilities.pipelines.v2_importers.project_kb_importer import ProjectKBPipeline | ||
| from vulnerabilities.pipelines.v2_improvers.collect_commits_project_kb import ( | ||
| CollectFixCommitsProjectKBPipeline, | ||
| ) | ||
| from vulnerabilities.tests import util_tests | ||
|
|
||
| TEST_DATA = Path(__file__).parent.parent.parent / "test_data" / "kbmsr2019" | ||
|
|
||
|
|
||
class TestProjectKbImporterPipeline(TestCase):
    """
    Integration-style checks for the ProjectKB pipelines: YAML statements are
    converted to advisories and compared with expected JSON, and the fix-commit
    CSV is turned into CodeFixV2 rows. Repository access is replaced by local
    test data.
    """

    @patch(
        "vulnerabilities.pipelines.v2_importers.project_kb_importer.get_advisory_url",
        return_value="https://mocked.url/advisory",
    )
    def test_project_kb_collect_advisories_v2(self, mock_get_advisory_url):
        """Each sample statement must serialize to its expected JSON file."""
        rglob_target = "vulnerabilities.pipelines.v2_importers.project_kb_importer.Path.rglob"

        pipeline = ProjectKBPipeline()
        pipeline.vcs_response = SimpleNamespace(dest_dir=TEST_DATA)

        for idx in (1, 2, 3):
            statement_path = TEST_DATA / str(idx) / "statement.yaml"
            expected_path = TEST_DATA / f"statement-{idx}-expected.json"

            # Make the importer see exactly one statement file per iteration.
            with patch(rglob_target, return_value=[statement_path]):
                collected = pipeline.collect_advisories()
                result = [advisory.to_dict() for advisory in collected]

            util_tests.check_results_against_json(result, expected_path)

    @pytest.mark.django_db
    def test_collect_fix_commits_uses_existing_csv(self):
        """
        CollectFixCommitsProjectKBPipeline.collect_fix_commits() must read the
        ProjectKB CSV from the test data and create one CodeFixV2 entry per
        package affected by the matching advisory.
        """
        expected_commits = [
            "https://github.com/apache/tomcat/commit/2835bb4e030c1c741ed0847bb3b9c3822e4fbc8a"
        ]

        advisory = AdvisoryV2.objects.create(
            advisory_id="CVE-2018-8034",
            datasource_id="test-datasource",
            avid="TEST-1234",
            unique_content_id="unique-test-id",
            url="https://example.com/advisory/CVE-2018-8034",
            date_collected=datetime.now(timezone.utc),
        )

        packages = [
            PackageV2.objects.create(name="test_name1", type="test"),
            PackageV2.objects.create(name="test_name2", type="test"),
        ]

        impacted = ImpactedPackage.objects.create(advisory=advisory)
        impacted.affecting_packages.set(packages)

        pipeline = CollectFixCommitsProjectKBPipeline()
        pipeline.vcs_response = SimpleNamespace(dest_dir=TEST_DATA)
        pipeline.collect_fix_commits()

        fixes = CodeFixV2.objects.all()
        assert len(fixes) == 2
        assert [fix.commits for fix in fixes] == [expected_commits, expected_commits]
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This path is only valid for GitHub repos. Are we sure we only have GitHub repos in the Project KB advisories?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, Project KB Advisory is just one GitHub repository.