Skip to content

Commit dff19f7

Browse files
committed
WIP Add functionality for working with RECORD files
1 parent 0055d4b commit dff19f7

File tree

1 file changed

+207
-0
lines changed

1 file changed

+207
-0
lines changed

src/packaging/record.py

Lines changed: 207 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,207 @@
1+
# This file is dual licensed under the terms of the Apache License, Version
2+
# 2.0, and the BSD License. See the LICENSE file in the root of this repository
3+
# for complete details.
4+
5+
"""
6+
Implements functionality for working with the file format described here:
7+
8+
https://packaging.python.org/en/latest/specifications/recording-installed-packages/#the-record-file
9+
"""
10+
11+
from __future__ import annotations
12+
13+
import base64
14+
import csv
15+
import functools
16+
import hashlib
17+
import os
18+
import re
19+
from collections.abc import Iterator
20+
from dataclasses import dataclass
21+
from io import StringIO
22+
23+
24+
@functools.lru_cache
25+
def _expected_digest_size(algorithm: str) -> int:
26+
return hashlib.new(algorithm).digest_size
27+
28+
29+
class InvalidRecordHash(ValueError):
    """A RECORD hash value is malformed or uses an unsupported algorithm."""
31+
32+
33+
@dataclass(frozen=True)
class RecordHash:
    """An immutable (algorithm, digest) pair for the hash field of a RECORD row.

    ``str()`` renders the value as ``<algorithm>=<digest>``, where the digest
    is URL-safe Base64 with the trailing ``=`` padding removed.

    Raises:
        TypeError: if ``algorithm`` is not a ``str`` or ``digest`` is not
            ``bytes``.
        InvalidRecordHash: if the algorithm is not listed in
            ``hashlib.algorithms_guaranteed``, does not have fixed-length
            digests, or the digest length does not match the algorithm.
    """

    algorithm: str
    digest: bytes

    def __post_init__(self) -> None:
        """Validate both fields after the dataclass has stored them."""
        if not isinstance(self.algorithm, str):
            raise TypeError("algorithm must have type 'str'")
        if self.algorithm not in hashlib.algorithms_guaranteed:
            raise InvalidRecordHash(
                f"{self.algorithm!r} is not a guaranteed hash algorithm"
            )

        expected_len = _expected_digest_size(self.algorithm)
        if expected_len < 1:
            # hashlib reports a digest_size of 0 for variable-length hashes such
            # as SHAKE-128. The spec takes no stance on them, but a reader could
            # not compute its own digest of a file for comparison without first
            # parsing the recorded digest to learn its length. That seems
            # unreasonable, so such algorithms are prohibited.
            raise InvalidRecordHash(
                f"{self.algorithm!r} does not have fixed length digests"
            )

        if not isinstance(self.digest, bytes):
            raise TypeError("digest must have type 'bytes'")
        if expected_len != len(self.digest):
            raise InvalidRecordHash(
                f"digest has wrong length: {len(self.digest)} (expected {expected_len})"
            )

    def __str__(self) -> str:
        """Return ``<algorithm>=<unpadded URL-safe Base64 digest>``."""
        encoded = base64.urlsafe_b64encode(self.digest)
        digest_str = encoded.rstrip(b"=").decode()
        return f"{self.algorithm}={digest_str}"
70+
71+
72+
# Matches zero or more characters drawn from the URL-safe Base64 alphabet
# (digits, ASCII letters, "_" and "-"); the "=" padding character is not
# part of the character class.
_RE_BASE64_URLSAFE = re.compile(r"[0-9A-Za-z_-]*")
73+
74+
75+
def parse_record_hash(hash_str: str, /) -> RecordHash:
    """Parse an ``<algorithm>=<digest>`` string into a ``RecordHash``.

    The text is split at the first ``=``. Everything after it must consist
    only of URL-safe Base64 characters with no ``=`` padding.

    Args:
        hash_str: The hash field to parse, e.g. ``"sha256=<base64>"``.

    Returns:
        The ``RecordHash`` built from the algorithm name and decoded digest.

    Raises:
        InvalidRecordHash: if there is no ``=`` separator, the digest is not
            valid unpadded URL-safe Base64, or ``RecordHash`` rejects the
            algorithm or digest length.
    """
    algorithm, sep, digest_str = hash_str.partition("=")

    if not sep:
        raise InvalidRecordHash("'=' not found")

    if not _RE_BASE64_URLSAFE.fullmatch(digest_str):
        raise InvalidRecordHash("invalid Base64 encoding")

    # A single leftover character carries only 6 bits, which cannot encode a
    # whole byte, so no amount of padding makes such a string valid.
    remainder = len(digest_str) % 4
    if remainder == 1:
        raise InvalidRecordHash("invalid Base64 encoding")

    # Restore exactly the padding that was stripped (0, 1 or 2 characters)
    # instead of relying on the decoder to tolerate a superfluous "====".
    padding = "=" * (-remainder % 4)
    digest = base64.urlsafe_b64decode(digest_str + padding)

    return RecordHash(algorithm, digest)
91+
92+
93+
class InvalidRecord(ValueError):
    """A RECORD entry has an invalid path or size."""
95+
96+
97+
@dataclass(frozen=True)
class Record:
    """A single RECORD entry: a file path with an optional hash and size.

    ``hash`` and ``size`` are keyword-only; either may be ``None`` when the
    entry does not record that value.

    Raises:
        TypeError: if ``path`` is not a ``str``, ``hash`` is neither ``None``
            nor a ``RecordHash``, or ``size`` is neither ``None`` nor an
            ``int``.
        InvalidRecord: if ``path`` is empty or ends with a path separator, or
            ``size`` is negative.
    """

    path: str
    hash: RecordHash | None = None
    size: int | None = None

    def __init__(
        self, path: str, *, hash: RecordHash | None = None, size: int | None = None
    ) -> None:
        # This constructor emulates kw_only (which isn't available until Python 3.10)
        # for hash and size.
        object.__setattr__(self, "path", path)
        object.__setattr__(self, "hash", hash)
        object.__setattr__(self, "size", size)
        # @dataclass leaves a hand-written __init__ in place, and only the
        # generated __init__ calls __post_init__, so validation has to be
        # triggered explicitly here.
        self.__post_init__()

    def __post_init__(self) -> None:
        """Validate the stored fields; see the class docstring for errors."""
        if not isinstance(self.path, str):
            raise TypeError("path must have type 'str'")

        if not self.path:
            raise InvalidRecord("path must not be empty")

        # os.altsep is None on platforms with a single separator, and
        # str.endswith() rejects a tuple that contains None.
        separators = tuple(sep for sep in (os.sep, os.altsep) if sep is not None)
        if self.path.endswith(separators):
            raise InvalidRecord("path must not be a directory path")

        if self.hash is not None and not isinstance(self.hash, RecordHash):
            raise TypeError("hash must be either None or have type 'RecordHash'")

        if self.size is not None:
            if not isinstance(self.size, int):
                raise TypeError("size must be either None or have type 'int'")

            if self.size < 0:
                raise InvalidRecord("size must not be negative")
131+
132+
133+
class InvalidRecordSet(ValueError):
    """A set of RECORD entries is malformed or contains a duplicate path."""
135+
136+
137+
class RecordSet:
    """A collection of ``Record`` objects indexed by their path.

    Iterating yields the records in the order they were stored.
    """

    _records: dict[str, Record]

    def __init__(self) -> None:
        # Start empty so that a directly constructed RecordSet is usable
        # (iteration, to_csv, lookups) instead of failing with AttributeError.
        self._records = {}

    def record_for_path(self, path: str, /) -> Record:
        """Return the record stored for *path*.

        Raises:
            KeyError: if no record with that path is present.
        """
        return self._records[path]

    def __iter__(self) -> Iterator[Record]:
        """Iterate over the stored records."""
        return iter(self._records.values())

    def to_csv(self) -> str:
        """Serialize the records as CSV rows of ``path,hash,size``.

        A missing hash or size is written as an empty field.
        """
        file = StringIO()
        writer = csv.writer(file)

        for record in self:
            writer.writerow(
                (
                    record.path,
                    "" if record.hash is None else str(record.hash),
                    "" if record.size is None else str(record.size),
                )
            )

        return file.getvalue()
160+
161+
162+
def parse_record_csv(data: str, /) -> RecordSet:
    """Parse the CSV text of a RECORD file into a ``RecordSet``.

    Every row must have exactly three fields: path, hash and size. An empty
    hash or size field is stored as ``None``.

    Raises:
        InvalidRecordSet: if a row does not have three fields, a field cannot
            be parsed or validated, or a path appears more than once.
    """
    builder = RecordSetBuilder()

    for row in csv.reader(StringIO(data)):
        if len(row) != 3:
            raise InvalidRecordSet(f"row has {len(row)} fields (expected 3)")

        path, hash_field, size_field = row

        try:
            # Keyword arguments are evaluated left to right, so the hash is
            # parsed before the size.
            record = Record(
                path,
                hash=parse_record_hash(hash_field) if hash_field else None,
                size=int(size_field) if size_field else None,
            )
        except ValueError as ex:
            raise InvalidRecordSet(f"invalid record for path {path!r}: {ex}") from ex

        builder.add(record)

    return builder.build()
192+
193+
194+
class RecordSetBuilder:
    """Collects records one at a time and produces a ``RecordSet`` from them."""

    def __init__(self) -> None:
        self._records: dict[str, Record] = {}

    def add(self, record: Record, /) -> None:
        """Store *record* under its path.

        Raises:
            InvalidRecordSet: if a record with the same path was already added.
        """
        key = record.path
        if key in self._records:
            raise InvalidRecordSet(f"duplicate record path {record.path}")
        self._records[key] = record

    def build(self) -> RecordSet:
        """Return a new ``RecordSet`` holding a copy of the collected records."""
        result = RecordSet()
        # Copy the mapping so later add() calls do not change the built set.
        result._records = dict(self._records)
        return result

0 commit comments

Comments
 (0)