# Source: vulnerablecode/vulnerabilities/pipelines/v2_improvers/clamav_rules.py at 89a88128208209837b63575a7612846c2b4c2d0e · aboutcode-org/vulnerablecode · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
#
# Copyright (c) nexB Inc. and others. All rights reserved.
# VulnerableCode is a trademark of nexB Inc.
# SPDX-License-Identifier: Apache-2.0
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
# See https://github.com/aboutcode-org/vulnerablecode for support or download.
# See https://aboutcode.org for more information about nexB OSS projects.
#

import gzip
import io
import os
import shutil
import tarfile
import tempfile
from pathlib import Path
from typing import List

import requests

from vulnerabilities.models import AdvisoryAlias
from vulnerabilities.models import DetectionRule
from vulnerabilities.models import DetectionRuleTypes
from vulnerabilities.pipelines import VulnerableCodeBaseImporterPipelineV2
from vulnerabilities.utils import find_all_cve


def extract_cvd(cvd_path, output_dir):
    """
    Extract the CVD file at ``cvd_path`` into ``output_dir`` and return the
    output directory as a Path.

    A CVD file is a 512-byte header followed by a gzip-compressed tar archive.
    ``output_dir`` is created when missing and every extracted file is given
    mode 0o644.

    Raise tarfile.ExtractError, before anything is written, when the archive
    holds a member that is not a regular file or directory, or whose path
    would resolve outside ``output_dir``.
    """
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    with open(cvd_path, "rb") as f:
        f.seek(512)  # Skip the fixed-size CVD header
        compressed_data = f.read()

    tar_buffer = io.BytesIO(gzip.decompress(compressed_data))

    # Resolve once so the containment check below compares real paths even
    # when output_dir is reached through a symlink.
    destination = output_path.resolve()

    with tarfile.open(fileobj=tar_buffer, mode="r:") as tar:
        members = tar.getmembers()

        # The archive comes from a download, so validate every member first:
        # an entry such as "../x", an absolute path or a link could otherwise
        # write outside of output_dir.
        for member in members:
            if not (member.isfile() or member.isdir()):
                raise tarfile.ExtractError(
                    f"Refusing to extract non-regular member: {member.name!r}"
                )
            target = (destination / member.name).resolve()
            if target != destination and destination not in target.parents:
                raise tarfile.ExtractError(
                    f"Refusing to extract member outside of output directory: {member.name!r}"
                )

        # Also apply the standard library "data" extraction filter on Python
        # versions that provide it; older versions do not accept the argument.
        extract_kwargs = {"filter": "data"} if hasattr(tarfile, "data_filter") else {}
        tar.extractall(path=output_path, members=members, **extract_kwargs)

    for file in output_path.rglob("*"):
        if file.is_file():
            file.chmod(0o644)  # rw-r--r--
    return output_path


def parse_ndb_file(ndb_path: Path) -> List[dict]:
    """
    Return a list of dicts, one per signature line of the .ndb (extended
    signatures) file at ``ndb_path``.

    Blank lines, lines starting with "#" and lines with fewer than four
    colon-separated fields are skipped. Each dict holds the first four fields
    plus the 1-based ``line_num`` of the line in the file.
    """
    field_names = ("name", "target_type", "offset", "hex_signature")
    records = []

    with ndb_path.open("r", encoding="utf-8", errors="ignore") as handle:
        for position, raw in enumerate(handle, start=1):
            entry = raw.strip()
            if entry == "" or entry[0] == "#":
                continue

            columns = entry.split(":")
            if len(columns) < len(field_names):
                continue

            # zip() stops at the shortest input, so extra trailing fields are dropped.
            record = dict(zip(field_names, columns))
            record["line_num"] = position
            records.append(record)

    return records


def parse_hdb_file(hdb_path: Path) -> List[dict]:
    """
    Return a list of dicts, one per signature line of the .hdb (MD5 hash
    signatures) file at ``hdb_path``.

    Blank lines, lines starting with "#" and lines with fewer than three
    colon-separated fields are skipped. Each dict holds the first three fields
    plus the 1-based ``line_num`` of the line in the file.
    """
    field_names = ("hash", "file_size", "name")
    records = []

    with hdb_path.open("r", encoding="utf-8", errors="ignore") as handle:
        for position, raw in enumerate(handle, start=1):
            entry = raw.strip()
            if entry == "" or entry[0] == "#":
                continue

            columns = entry.split(":")
            if len(columns) < len(field_names):
                continue

            # zip() stops at the shortest input, so extra trailing fields are dropped.
            record = dict(zip(field_names, columns))
            record["line_num"] = position
            records.append(record)

    return records


def extract_cve_id(name: str):
    """
    Return the first CVE ID found in ``name``, upper-cased, or None.

    Underscores are replaced with dashes before searching, so that an
    underscore-separated ID is looked up in its dash-separated form.
    """
    dashed_name = name.replace("_", "-")
    matches = [found.upper() for found in find_all_cve(dashed_name)]
    if not matches:
        return None
    return matches[0]


class ClamVRulesImproverPipeline(VulnerableCodeBaseImporterPipelineV2):
    """
    Pipeline that downloads the ClamAV database (main.cvd), extracts it,
    parses its .ndb and .hdb signature files and saves them as detection rules.
    """

    pipeline_id = "clamv_rules"
    MAIN_DATABASE_URL = "https://database.clamav.net/main.cvd"
    license_url = "https://github.com/Cisco-Talos/clamav/blob/c73755d3fc130b0c60ccf4e8f8d28c62fc58c95b/README.md#licensing"
    # NOTE(review): this is a license name, not an SPDX identifier — confirm
    # which form the consumers of license_expression expect.
    license_expression = "GNU GENERAL PUBLIC LICENSE"

    @classmethod
    def steps(cls):
        """Return the pipeline steps in the order they are run."""
        return (
            cls.download_database,
            cls.extract_database,
            cls.collect_and_store_advisories,
            cls.clean_downloads,
        )

    def download_database(self):
        """
        Download main.cvd into a fresh temporary directory stored as
        ``self.db_dir``.

        Raise requests.HTTPError when the server answers with an error status.
        """

        self.log("Downloading ClamAV database…")
        # The directory returned by mkdtemp() is used directly as the working
        # directory: clean_downloads() removes self.db_dir, so nesting a
        # sub-directory inside it would leave the temporary parent behind.
        self.db_dir = Path(tempfile.mkdtemp(prefix="clamav_db_"))

        database_url = f"{self.MAIN_DATABASE_URL}?api-version=1"
        headers = {
            "User-Agent": "ClamAV-Client/1.0 (https://github.com/yourproject)",
            "Accept": "*/*",
        }

        filename = self.db_dir / "main.cvd"
        self.log(f"Downloading {database_url} → {filename}")

        resp = requests.get(database_url, headers=headers, stream=True, timeout=30)
        try:
            resp.raise_for_status()

            with filename.open("wb") as f:
                for chunk in resp.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
        finally:
            # A streamed response keeps its connection open until it is
            # closed, so release it on success and on failure alike.
            resp.close()

        self.log("ClamAV DB file downloaded successfully.")

    def extract_database(self):
        """Extract the downloaded CVD into ``self.db_dir / "extracted"``."""
        out_dir = self.db_dir / "extracted"
        self.extract_cvd_dir = extract_cvd(self.db_dir / "main.cvd", out_dir)
        self.log(f"Extracted CVD to {self.extract_cvd_dir}")

    def collect_and_store_advisories(self):
        """
        Parse main.hdb and main.ndb and store every signature as a
        DetectionRule.

        A signature whose name contains a CVE ID known as an advisory alias is
        stored once per advisory of that alias; any other signature is stored
        once with no advisory.
        """
        rule_entries = parse_hdb_file(self.extract_cvd_dir / "main.hdb") + parse_ndb_file(
            self.extract_cvd_dir / "main.ndb"
        )

        # Several signatures can name the same CVE: remember each lookup so the
        # alias query (and its "not found" log line) happens once per CVE.
        advisories_by_cve = {}

        for rule_entry in rule_entries:
            cve_id = extract_cve_id(rule_entry.get("name", ""))

            advisories = []
            if cve_id:
                if cve_id not in advisories_by_cve:
                    advisories_by_cve[cve_id] = self._advisories_for_alias(cve_id)
                advisories = advisories_by_cve[cve_id]

            # NOTE(review): rule_text is the repr of the parsed dict, which
            # includes "line_num"; a signature that moves to another line is
            # therefore stored as a new rule — confirm this is intended.
            # A rule without any matching advisory is still stored, unlinked.
            for adv in advisories or [None]:
                DetectionRule.objects.update_or_create(
                    rule_text=str(rule_entry),
                    rule_type=DetectionRuleTypes.CLAMAV,
                    advisory=adv,
                    defaults={
                        "source_url": self.MAIN_DATABASE_URL,
                    },
                )

    def _advisories_for_alias(self, cve_id):
        """Return the list of advisories of the alias ``cve_id``, empty when there is no such alias."""
        try:
            alias = AdvisoryAlias.objects.get(alias=cve_id)
        except AdvisoryAlias.DoesNotExist:
            self.log(f"Advisory {cve_id} not found.")
            return []
        return list(alias.advisories.all())

    def clean_downloads(self):
        """Remove the download directory, if one was created."""
        if getattr(self, "db_dir", None) and os.path.exists(self.db_dir):
            shutil.rmtree(self.db_dir, ignore_errors=True)
            self.log("Cleaned up downloaded files.")

    def on_failure(self):
        """Ensure cleanup on failure."""
        self.clean_downloads()