Skip to content

Commit c030f37

Browse files
committed
Add match test for approximate file matching #342
Signed-off-by: Jono Yang <[email protected]>
1 parent 95f09ee commit c030f37

File tree

9 files changed

+4763
-17
lines changed

9 files changed

+4763
-17
lines changed

matchcode/match_test_utils.py renamed to matchcode/match.py

+45-6
Original file line numberDiff line numberDiff line change
@@ -18,16 +18,22 @@
1818

1919
from matchcode.models import ApproximateDirectoryContentIndex
2020
from matchcode.models import ApproximateDirectoryStructureIndex
21+
from matchcode.models import ApproximateResourceContentIndex
2122
from matchcode.models import ExactFileIndex
2223
from matchcode.models import ExactPackageArchiveIndex
2324

2425

25-
# TODO: Refactor this file into functions/utilities used in
26-
# a scanpipe pipeline.
26+
"""
27+
These are convenience functions for running matching on a Codebase or
28+
VirtualCodebase for the purpose of testing. The functions that are used for
29+
matching are in `matchcode_pipeline/pipes/matching.py`.
30+
"""
31+
2732
# Integer identifiers for each supported match type. They are the keys of the
# mapping returned by get_matchers() and the `match_type` values that
# do_match() looks up in that mapping.
EXACT_PACKAGE_ARCHIVE_MATCH = 0
2833
APPROXIMATE_DIRECTORY_STRUCTURE_MATCH = 1
2934
APPROXIMATE_DIRECTORY_CONTENT_MATCH = 2
3035
EXACT_FILE_MATCH = 3
36+
APPROXIMATE_FILE_MATCH = 4
3137

3238

3339
def get_matchers():
@@ -36,15 +42,17 @@ def get_matchers():
3642
APPROXIMATE_DIRECTORY_CONTENT_MATCH: approximate_directory_content_match,
3743
APPROXIMATE_DIRECTORY_STRUCTURE_MATCH: approximate_directory_structure_match,
3844
EXACT_FILE_MATCH: individual_file_match,
45+
APPROXIMATE_FILE_MATCH: approximate_file_match,
3946
}
4047
return MATCHERS_BY_MATCH_TYPE
4148

4249

4350
def do_match(codebase, match_type):
4451
"""
45-
Perform Package matching on `codebase` by running matching functions of `match_type` on `codebase`
52+
Perform Package matching on `codebase` by running matching functions of
53+
`match_type` on `codebase`.
4654
47-
The total number of matches found is returned
55+
The total number of matches found is returned.
4856
"""
4957

5058
matcher = get_matchers().get(match_type)
@@ -135,9 +143,9 @@ def approximate_directory_structure_match(codebase):
135143

136144
def individual_file_match(codebase):
137145
"""
138-
Update Matches from detected Package files in `codebase`
146+
Update Matches from detected Package files in `codebase`.
139147
140-
Return the number of matches found in `codebase`
148+
Return the number of matches found in `codebase`.
141149
"""
142150
match_count = 0
143151
for resource in codebase.walk(topdown=True):
@@ -153,6 +161,25 @@ def individual_file_match(codebase):
153161
return match_count
154162

155163

164+
def approximate_file_match(codebase):
    """
    Update Matches from approximately matched Package files in `codebase`.

    Walk `codebase` top-down and, for every resource that is not a directory
    and is not already flagged as matched in its `extra_data`, look up
    approximate file matches and tag the resource with them.

    Return the number of approximate matches found in `codebase`.
    """
    total_matches = 0
    for resource in codebase.walk(topdown=True):
        # Directories and resources already claimed by an earlier matching
        # step are not candidates for approximate file matching.
        is_candidate = not (
            resource.is_dir or resource.extra_data.get('matched', False)
        )
        if is_candidate:
            matches, match_type = get_approximate_file_match(resource)
            if matches:
                total_matches += len(matches)
                tag_matched_resources(resource, codebase, matches, match_type)
    return total_matches
181+
182+
156183
def get_directory_content_match(resource):
157184
"""
158185
Match a directory to a Package using its contents
@@ -198,6 +225,18 @@ def get_file_match(resource):
198225
return file_matches, 'exact-file'
199226

200227

228+
def get_approximate_file_match(resource):
    """
    Approximately match an individual file back to the Package it is from.

    The content fingerprint is read from the `halo1` attribute of `resource`
    when it has one, otherwise from the `halo1` key of `resource.extra_data`
    (defaulting to an empty string).

    Return a two-tuple of the matches found by
    `ApproximateResourceContentIndex.match` and the match type string
    'approximate-file'.
    """
    try:
        fingerprint = resource.halo1
    except AttributeError:
        # No `halo1` attribute on this resource: use the value stored in
        # extra_data instead.
        fingerprint = resource.extra_data.get('halo1', '')
    matches = ApproximateResourceContentIndex.match(fingerprint)
    return matches, 'approximate-file'
238+
239+
201240
def tag_matched_resource(resource, codebase, purl):
202241
"""
203242
Set a resource to be flagged as matched, so it will not be considered in

matchcode/tests/test_match.py

+36-7
Original file line numberDiff line numberDiff line change
@@ -10,13 +10,16 @@
1010
import os
1111

1212
from packagedb.models import Package
13-
14-
from matchcode.match_test_utils import EXACT_PACKAGE_ARCHIVE_MATCH
15-
from matchcode.match_test_utils import APPROXIMATE_DIRECTORY_STRUCTURE_MATCH
16-
from matchcode.match_test_utils import APPROXIMATE_DIRECTORY_CONTENT_MATCH
17-
from matchcode.match_test_utils import EXACT_FILE_MATCH
18-
from matchcode.match_test_utils import path_suffixes
19-
from matchcode.match_test_utils import run_do_match_from_scan
13+
from packagedb.models import Resource
14+
15+
from matchcode.match import APPROXIMATE_DIRECTORY_STRUCTURE_MATCH
16+
from matchcode.match import APPROXIMATE_DIRECTORY_CONTENT_MATCH
17+
from matchcode.match import APPROXIMATE_FILE_MATCH
18+
from matchcode.match import EXACT_FILE_MATCH
19+
from matchcode.match import EXACT_PACKAGE_ARCHIVE_MATCH
20+
from matchcode.match import path_suffixes
21+
from matchcode.match import run_do_match_from_scan
22+
from matchcode.models import ApproximateResourceContentIndex
2023
from matchcode.utils import index_package_directories
2124
from matchcode.utils import index_package_files_sha1
2225
from matchcode.utils import index_packages_sha1
@@ -87,6 +90,26 @@ def setUp(self):
8790
index_package_directories(self.test_package4)
8891
index_package_files_sha1(self.test_package4, self.get_test_loc('models/match-test.json'))
8992

93+
# Add approximate file resource
94+
self.test_package5, _ = Package.objects.get_or_create(
95+
filename='inflate.tar.gz',
96+
sha1='deadfeed',
97+
type='generic',
98+
name='inflate',
99+
version='1.0.0',
100+
download_url='inflate.com/inflate.tar.gz',
101+
)
102+
self.test_resource5, _ = Resource.objects.get_or_create(
103+
path='inflate.c',
104+
package=self.test_package5
105+
)
106+
self.test_resource5_fingerprint = '000018fba23a49e4cd40718d1297be719e6564a4'
107+
ApproximateResourceContentIndex.index(
108+
self.test_resource5_fingerprint,
109+
self.test_resource5.path,
110+
self.test_package5
111+
)
112+
90113
def test_do_match_package_archive_match(self):
91114
input_file = self.get_test_loc('models/match-test.json')
92115
vc = run_do_match_from_scan(input_file, EXACT_PACKAGE_ARCHIVE_MATCH)
@@ -111,6 +134,12 @@ def test_do_match_package_file_match(self):
111134
expected = self.get_test_loc('models/match-test-exact-file-results.json')
112135
self.check_codebase(vc, expected, regen=FIXTURES_REGEN)
113136

137+
def test_do_match_approximate_package_file_match(self):
    # Run approximate file matching on the scan of the modified inflate.c
    # and compare the resulting codebase with the expected results file.
    scan_location = self.get_test_loc(
        'match/approximate-file-matching/approximate-match-test.json'
    )
    matched_codebase = run_do_match_from_scan(scan_location, APPROXIMATE_FILE_MATCH)
    expected_location = self.get_test_loc(
        'match/approximate-file-matching/approximate-match-test-results.json'
    )
    self.check_codebase(matched_codebase, expected_location, regen=FIXTURES_REGEN)
142+
114143

115144
class MatchNestedPackagesTestCase(MatchcodeTestCase):
116145
BASE_DIR = os.path.join(os.path.dirname(__file__), 'testfiles')
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
{
2+
"files": [
3+
{
4+
"path": "inflate-mod.c",
5+
"type": "file",
6+
"name": "inflate-mod.c",
7+
"base_name": "inflate-mod",
8+
"extension": ".c",
9+
"size": 55466,
10+
"date": "2024-04-16",
11+
"sha1": "12d8cd7cb0db81b8578f608ef3619304ac87f0e0",
12+
"md5": "b86e60b3ad49c08b6cfa0ab3b989b8d5",
13+
"sha256": "82f60fddd2fe80234b63f9c4757740fc17fc87ba0a2c9cb74799a02a82b43c7e",
14+
"mime_type": "text/x-c",
15+
"file_type": "C source, ASCII text",
16+
"programming_language": "C",
17+
"is_binary": false,
18+
"is_text": true,
19+
"is_archive": false,
20+
"is_media": false,
21+
"is_source": true,
22+
"is_script": false,
23+
"directory_content_fingerprint": null,
24+
"directory_structure_fingerprint": null,
25+
"halo1": "000018f4aa3a49e4cd40718d1297be519e6564a4",
26+
"matched_to": [
27+
"pkg:generic/[email protected]"
28+
],
29+
"files_count": 0,
30+
"dirs_count": 0,
31+
"size_count": 0,
32+
"scan_errors": []
33+
}
34+
]
35+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
{
2+
"headers": [
3+
{
4+
"tool_name": "scancode-toolkit",
5+
"tool_version": "32.1.0",
6+
"options": {
7+
"input": [
8+
"inflate-mod.c"
9+
],
10+
"--fingerprint": true,
11+
"--info": true,
12+
"--json-pp": "approximate-match-test.json"
13+
},
14+
"notice": "Generated with ScanCode and provided on an \"AS IS\" BASIS, WITHOUT WARRANTIES\nOR CONDITIONS OF ANY KIND, either express or implied. No content created from\nScanCode should be considered or used as legal advice. Consult an Attorney\nfor any legal advice.\nScanCode is a free software code scanning tool from nexB Inc. and others.\nVisit https://github.com/nexB/scancode-toolkit/ for support and download.",
15+
"start_timestamp": "2024-05-11T014449.669618",
16+
"end_timestamp": "2024-05-11T014449.971868",
17+
"output_format_version": "3.1.0",
18+
"duration": 0.302262544631958,
19+
"message": null,
20+
"errors": [],
21+
"warnings": [],
22+
"extra_data": {
23+
"system_environment": {
24+
"operating_system": "linux",
25+
"cpu_architecture": "64",
26+
"platform": "Linux-6.5.13-1-pve-x86_64-with-glibc2.35",
27+
"platform_version": "#1 SMP PREEMPT_DYNAMIC PMX 6.5.13-1 (2024-02-05T13:50Z)",
28+
"python_version": "3.11.8 (main, Mar 11 2024, 14:33:31) [GCC 11.4.0]"
29+
},
30+
"spdx_license_list_version": "3.23",
31+
"files_count": 1
32+
}
33+
}
34+
],
35+
"files": [
36+
{
37+
"path": "inflate-mod.c",
38+
"type": "file",
39+
"name": "inflate-mod.c",
40+
"base_name": "inflate-mod",
41+
"extension": ".c",
42+
"size": 55466,
43+
"date": "2024-04-16",
44+
"sha1": "12d8cd7cb0db81b8578f608ef3619304ac87f0e0",
45+
"md5": "b86e60b3ad49c08b6cfa0ab3b989b8d5",
46+
"sha256": "82f60fddd2fe80234b63f9c4757740fc17fc87ba0a2c9cb74799a02a82b43c7e",
47+
"mime_type": "text/x-c",
48+
"file_type": "C source, ASCII text",
49+
"programming_language": "C",
50+
"is_binary": false,
51+
"is_text": true,
52+
"is_archive": false,
53+
"is_media": false,
54+
"is_source": true,
55+
"is_script": false,
56+
"directory_content_fingerprint": null,
57+
"directory_structure_fingerprint": null,
58+
"halo1": "000018f4aa3a49e4cd40718d1297be519e6564a4",
59+
"files_count": 0,
60+
"dirs_count": 0,
61+
"size_count": 0,
62+
"scan_errors": []
63+
}
64+
]
65+
}

0 commit comments

Comments
 (0)