Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

PYTHON-5126 & PYTHON-5280 Addresses issues raised in DRIVERS-3097 and DRIVERS-3123 #2261

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions bson/binary.py
Original file line number Diff line number Diff line change
Expand Up @@ -462,6 +462,10 @@ def from_vector(
raise ValueError(f"{padding=}. It must be in [0,1, ..7].")
if padding and not vector:
raise ValueError("Empty vector with non-zero padding.")
if padding and not (vector[-1] & ((1 << padding) - 1)) == 0: # type: ignore
raise ValueError(
"If padding p is provided, all bits in the final byte lower than p must be 0."
)
elif dtype == BinaryVectorDtype.FLOAT32: # pack floats as float32
format_str = "f"
if padding:
Expand Down Expand Up @@ -490,6 +494,11 @@ def as_vector(self) -> BinaryVector:
dtype = BinaryVectorDtype(dtype)
n_values = len(self) - position

if padding and dtype != BinaryVectorDtype.PACKED_BIT:
raise ValueError(
f"Corrupt data. Padding ({padding}) must be 0 for all but PACKED_BIT dtypes. ({dtype=})"
)

if dtype == BinaryVectorDtype.INT8:
dtype_format = "b"
format_string = f"<{n_values}{dtype_format}"
Expand All @@ -513,6 +522,12 @@ def as_vector(self) -> BinaryVector:
dtype_format = "B"
format_string = f"<{n_values}{dtype_format}"
unpacked_uint8s = list(struct.unpack_from(format_string, self, position))
if padding and not n_values:
raise ValueError("Corrupt data. Vector has a padding P, but no data.")
if padding and n_values and not (unpacked_uint8s[-1] & ((1 << padding) - 1)) == 0:
raise ValueError(
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are these breaking changes for existing data?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

They are, yeah. But only for a case that vector search doesn't use. I feel confident that it won't affect anyone now.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please add a changelog entry to explain this because it's a breaking change.

"Corrupt data. Vector has a padding P, but bits in the final byte lower than P are non-zero."
)
return BinaryVector(unpacked_uint8s, dtype, padding)

else:
Expand Down
21 changes: 15 additions & 6 deletions test/bson_binary_vector/packed_bit.json
Original file line number Diff line number Diff line change
Expand Up @@ -21,23 +21,32 @@
"canonical_bson": "1600000005766563746F7200040000000910007F0700"
},
{
"description": "Empty Vector PACKED_BIT",
"description": "PACKED_BIT with padding",
"valid": true,
"vector": [],
"vector": [127, 8],
"dtype_hex": "0x10",
"dtype_alias": "PACKED_BIT",
"padding": 0,
"canonical_bson": "1400000005766563746F72000200000009100000"
"padding": 3,
"canonical_bson": "1600000005766563746F7200040000000910037F0800"
},
{
"description": "PACKED_BIT with padding",
"valid": true,
"description": "PACKED_BIT with inconsistent padding",
"valid": false,
"vector": [127, 7],
"dtype_hex": "0x10",
"dtype_alias": "PACKED_BIT",
"padding": 3,
"canonical_bson": "1600000005766563746F7200040000000910037F0700"
},
{
"description": "Empty Vector PACKED_BIT",
"valid": true,
"vector": [],
"dtype_hex": "0x10",
"dtype_alias": "PACKED_BIT",
"padding": 0,
"canonical_bson": "1400000005766563746F72000200000009100000"
},
{
"description": "Overflow Vector PACKED_BIT",
"valid": false,
Expand Down
8 changes: 4 additions & 4 deletions test/test_bson.py
Original file line number Diff line number Diff line change
Expand Up @@ -739,7 +739,7 @@ def test_vector(self):
"""Tests of subtype 9"""
# We start with valid cases, across the 3 dtypes implemented.
# Work with a simple vector that can be interpreted as int8, float32, or ubyte
list_vector = [127, 7]
list_vector = [127, 8]
# As INT8, vector has length 2
binary_vector = Binary.from_vector(list_vector, BinaryVectorDtype.INT8)
vector = binary_vector.as_vector()
Expand All @@ -764,18 +764,18 @@ def test_vector(self):
uncompressed = ""
for val in list_vector:
uncompressed += format(val, "08b")
assert uncompressed[:-padding] == "0111111100000"
assert uncompressed[:-padding] == "0111111100001"

# It is worthwhile explicitly showing the values encoded to BSON
padded_doc = {"padded_vec": padded_vec}
assert (
encode(padded_doc)
== b"\x1a\x00\x00\x00\x05padded_vec\x00\x04\x00\x00\x00\t\x10\x03\x7f\x07\x00"
== b"\x1a\x00\x00\x00\x05padded_vec\x00\x04\x00\x00\x00\t\x10\x03\x7f\x08\x00"
)
# and dumped to json
assert (
json_util.dumps(padded_doc)
== '{"padded_vec": {"$binary": {"base64": "EAN/Bw==", "subType": "09"}}}'
== '{"padded_vec": {"$binary": {"base64": "EAN/CA==", "subType": "09"}}}'
)

# FLOAT32 is also implemented
Expand Down
31 changes: 21 additions & 10 deletions test/test_bson_binary_vector.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,11 +48,11 @@ def create_test(case_spec):
def run_test(self):
for test_case in case_spec.get("tests", []):
description = test_case["description"]
vector_exp = test_case.get("vector", [])
vector_exp = test_case.get("vector", None)
dtype_hex_exp = test_case["dtype_hex"]
dtype_alias_exp = test_case.get("dtype_alias")
padding_exp = test_case.get("padding", 0)
canonical_bson_exp = test_case.get("canonical_bson")
canonical_bson_exp = test_case.get("canonical_bson", None)
# Convert dtype hex string into bytes
dtype_exp = BinaryVectorDtype(int(dtype_hex_exp, 16).to_bytes(1, byteorder="little"))

Expand Down Expand Up @@ -85,14 +85,25 @@ def run_test(self):
self.assertEqual(cB_obs, canonical_bson_exp, description)

else:
with self.assertRaises((struct.error, ValueError), msg=description):
# Tests Binary.from_vector
Binary.from_vector(vector_exp, dtype_exp, padding_exp)
# Tests Binary.as_vector
cB_exp = binascii.unhexlify(canonical_bson_exp.encode("utf8"))
decoded_doc = decode(cB_exp)
binary_obs = decoded_doc[test_key]
binary_obs.as_vector()
"""
#### To prove correct in an invalid case (`valid:false`), one MUST
- if the vector field is present, raise an exception when attempting to encode a document from the numeric values,
dtype, and padding.
- if the canonical_bson field is present, raise an exception when attempting to deserialize it into the corresponding
numeric values, as the field contains corrupted data.
"""
# Tests Binary.from_vector()
if vector_exp is not None:
with self.assertRaises((struct.error, ValueError), msg=description):
Binary.from_vector(vector_exp, dtype_exp, padding_exp)

# Tests Binary.as_vector()
if canonical_bson_exp is not None:
with self.assertRaises((struct.error, ValueError), msg=description):
cB_exp = binascii.unhexlify(canonical_bson_exp.encode("utf8"))
decoded_doc = decode(cB_exp)
binary_obs = decoded_doc[test_key]
binary_obs.as_vector()

return run_test

Expand Down
Loading