Skip to content

Commit 693c668

Browse files
committed
Collect WHEEL files for installed python wheels #4214
Parse the WHEEL file in .dist-info directories to extract wheel tags needed to reconstruct the original wheel filename for building more detailed PURLs.

Changes:
- Add parse_wheel_tags() to parse .dist-info/WHEEL files
- Add reconstruct_wheel_filename() to rebuild wheel filenames
- Modify parse_metadata() to collect wheel data into extra_data
- Add new test fixtures and tests for wheel tag collection
- Update existing expected test JSONs with wheel tag data

Signed-off-by: Kareem Samy <karim.elsayed401@eng-st.cu.edu.eg>
1 parent 022ddc8 commit 693c668

30 files changed

+2435
-1767
lines changed

AUTHORS.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,3 +105,4 @@ The following organizations or individuals have contributed to ScanCode:
105105
- Yash Sharma @yasharmaster
106106
- Yunus Rahbar @yns88
107107
- Stefano Zacchiroli @zacchiro
108+
- Kareem Samy @kaokab33

CHANGELOG.rst

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,11 @@ Changelog
44
Next release
55
--------------
66

7+
- Collect WHEEL files for installed python wheels to extract
8+
wheel tags needed to reconstruct the original wheel filename
9+
for building more detailed PURLs.
10+
https://github.com/aboutcode-org/scancode-toolkit/issues/4214
11+
712
v3.5.0 - 2026-01-15
813
-------------------
914

src/packagedcode/pypi.py

Lines changed: 88 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -956,6 +956,72 @@ def parse(cls, location, package_only=False):
956956

957957
META_DIR_SUFFIXES = '.dist-info', '.egg-info', 'EGG-INFO',
958958

959+
def parse_wheel_tags(dist_info_path):
    """
    Parse the WHEEL file of a .dist-info directory and return a mapping of
    the wheel metadata found there.

    ``dist_info_path`` is the .dist-info directory itself, given either as a
    ZipPath (a directory inside a wheel archive) or as a filesystem path
    string or Path object.

    Return an empty dict if the directory contains no WHEEL file. Otherwise
    return a dict with these keys:

    - 'wheel_version': the Wheel-Version value, or None if absent
    - 'generator': the Generator value, or None if absent
    - 'root_is_purelib': True if the Root-Is-Purelib value is "true"
      (ignoring case), False for any other value, or None if absent
    - 'tags': the list of non-empty Tag values in file order
      (e.g., ['cp310-cp310-manylinux_2_17_x86_64'])

    A WHEEL file looks like:
        Wheel-Version: 1.0
        Generator: bdist_wheel (0.37.1)
        Root-Is-Purelib: false
        Tag: cp310-cp310-manylinux_2_17_x86_64

    There can be multiple Tag: lines.
    """
    # ZipPath and pathlib.Path both support "/", exists() and read_text(),
    # so only the way the directory object is obtained differs.
    if isinstance(dist_info_path, ZipPath):
        dist_info_dir = dist_info_path
    else:
        dist_info_dir = Path(dist_info_path)

    wheel_path = dist_info_dir / 'WHEEL'
    if not wheel_path.exists():
        return {}

    content = wheel_path.read_text(encoding='utf-8')

    wheel_data = {
        'wheel_version': None,
        'generator': None,
        'root_is_purelib': None,
        'tags': [],
    }

    for line in content.strip().splitlines():
        line = line.strip()
        # Skip blank lines and anything that is not a "Key: value" header.
        if not line or ':' not in line:
            continue

        key, _, value = line.partition(':')
        key = key.strip()
        value = value.strip()

        if key == 'Tag':
            # An empty "Tag:" line carries no information and would yield a
            # malformed wheel filename if used later, so it is not collected.
            if value:
                wheel_data['tags'].append(value)
        elif key == 'Wheel-Version':
            wheel_data['wheel_version'] = value
        elif key == 'Generator':
            wheel_data['generator'] = value
        elif key == 'Root-Is-Purelib':
            wheel_data['root_is_purelib'] = value.lower() == 'true'

    return wheel_data
1012+
1013+
def reconstruct_wheel_filename(name, version, tag):
    """
    Return a wheel filename string rebuilt from a package ``name``, a
    ``version`` and a ``tag`` string as found in a "Tag:" line of a WHEEL
    file (i.e. "<python tag>-<abi tag>-<platform tag>").

    Hyphens in the name and in the version are replaced by underscores
    because "-" is the field separator of a wheel filename: a stray hyphen
    in either component would shift the fields and make the filename
    unparsable.

    For example:
    >>> reconstruct_wheel_filename('numpy', '1.23.0', 'cp310-cp310-manylinux_2_17_x86_64')
    'numpy-1.23.0-cp310-cp310-manylinux_2_17_x86_64.whl'
    >>> reconstruct_wheel_filename('my-package', '2.0', 'py3-none-any')
    'my_package-2.0-py3-none-any.whl'
    >>> reconstruct_wheel_filename('my-package', '2.0-1', 'py3-none-any')
    'my_package-2.0_1-py3-none-any.whl'
    """
    safe_name = name.replace('-', '_')
    safe_version = version.replace('-', '_')
    return f"{safe_name}-{safe_version}-{tag}.whl"
9591025

9601026
def parse_metadata(location, datasource_id, package_type, package_only=False):
9611027
"""
@@ -993,7 +1059,28 @@ def parse_metadata(location, datasource_id, package_type, package_only=False):
9931059
# nicely?
9941060
dependencies = get_dist_dependencies(dist)
9951061
file_references = list(get_file_references(dist))
996-
1062+
1063+
# ============= NEW CODE START =============
1064+
wheel_data = {}
1065+
if parent.name.endswith('.dist-info'):
1066+
wheel_data = parse_wheel_tags(path)
1067+
1068+
wheel_filename = None
1069+
if wheel_data and wheel_data.get('tags') and name and version:
1070+
# Use the first tag to reconstruct the filename
1071+
first_tag = wheel_data['tags'][0]
1072+
wheel_filename = reconstruct_wheel_filename(name, version, first_tag)
1073+
1074+
# Store all wheel metadata in extra_data
1075+
if wheel_data.get('tags'):
1076+
extra_data['wheel_tags'] = wheel_data['tags']
1077+
if wheel_data.get('wheel_version'):
1078+
extra_data['wheel_version'] = wheel_data['wheel_version']
1079+
if wheel_data.get('generator'):
1080+
extra_data['wheel_generator'] = wheel_data['generator']
1081+
if wheel_data.get('root_is_purelib') is not None:
1082+
extra_data['root_is_purelib'] = wheel_data['root_is_purelib']
1083+
9971084
package_data = dict(
9981085
datasource_id=datasource_id,
9991086
type=package_type,
@@ -1011,7 +1098,6 @@ def parse_metadata(location, datasource_id, package_type, package_only=False):
10111098
)
10121099
return models.PackageData.from_data(package_data, package_only)
10131100

1014-
10151101
def urlsafe_b64decode(data):
10161102
"""
10171103
urlsafe_b64decode without padding

tests/packagedcode/data/pypi/archive/atomicwrites-1.2.1-py2.py3-none-any.whl-expected.json

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -142,7 +142,15 @@
142142
],
143143
"is_private": false,
144144
"is_virtual": false,
145-
"extra_data": {},
145+
"extra_data": {
146+
"wheel_tags": [
147+
"py2-none-any",
148+
"py3-none-any"
149+
],
150+
"wheel_version": "1.0",
151+
"wheel_generator": "bdist_wheel (0.31.1)",
152+
"root_is_purelib": true
153+
},
146154
"dependencies": [],
147155
"repository_homepage_url": "https://pypi.org/project/atomicwrites",
148156
"repository_download_url": "https://pypi.org/packages/source/a/atomicwrites/atomicwrites-1.2.1.tar.gz",

0 commit comments

Comments
 (0)