mongodb · caseyclements · Apr 9, 2025 · Apr 3, 2025 · Apr 3, 2025 · Apr 3, 2025
@@ -462,6 +462,10 @@ def from_vector(
                 raise ValueError(f"{padding=}. It must be in [0,1, ..7].")
             if padding and not vector:
                 raise ValueError("Empty vector with non-zero padding.")
+            if padding and not (vector[-1] & ((1 << padding) - 1)) == 0:  # type: ignore
+                raise ValueError(
+                    "If padding p is provided, all bits in the final byte lower than p must be 0."
+                )
         elif dtype == BinaryVectorDtype.FLOAT32:  # pack floats as float32
             format_str = "f"
             if padding:
@@ -490,6 +494,11 @@ def as_vector(self) -> BinaryVector:
         dtype = BinaryVectorDtype(dtype)
         n_values = len(self) - position
 
+        if padding and dtype != BinaryVectorDtype.PACKED_BIT:
+            raise ValueError(
+                f"Corrupt data. Padding ({padding}) must be 0 for all but PACKED_BIT dtypes. ({dtype=})"
+            )
+
         if dtype == BinaryVectorDtype.INT8:
             dtype_format = "b"
             format_string = f"<{n_values}{dtype_format}"
@@ -513,6 +522,12 @@ def as_vector(self) -> BinaryVector:
             dtype_format = "B"
             format_string = f"<{n_values}{dtype_format}"
             unpacked_uint8s = list(struct.unpack_from(format_string, self, position))
+            if padding and not n_values:
+                raise ValueError("Corrupt data. Vector has a padding P, but no data.")
+            if padding and n_values and not (unpacked_uint8s[-1] & ((1 << padding) - 1)) == 0:
+                raise ValueError(
+                    "Corrupt data. Vector has a padding P, but bits in the final byte lower than P are non-zero."
+                )
             return BinaryVector(unpacked_uint8s, dtype, padding)
 
         else:

@@ -21,23 +21,32 @@
       "canonical_bson": "1600000005766563746F7200040000000910007F0700"
     },
     {
-      "description": "Empty Vector PACKED_BIT",
+      "description": "PACKED_BIT with padding",
       "valid": true,
-      "vector": [],
+      "vector": [127, 8],
       "dtype_hex": "0x10",
       "dtype_alias": "PACKED_BIT",
-      "padding": 0,
-      "canonical_bson": "1400000005766563746F72000200000009100000"
+      "padding": 3,
+      "canonical_bson": "1600000005766563746F7200040000000910037F0800"
     },
     {
-      "description": "PACKED_BIT with padding",
-      "valid": true,
+      "description": "PACKED_BIT with inconsistent padding",
+      "valid": false,
       "vector": [127, 7],
       "dtype_hex": "0x10",
       "dtype_alias": "PACKED_BIT",
       "padding": 3,
       "canonical_bson": "1600000005766563746F7200040000000910037F0700"
     },
+    {
+      "description": "Empty Vector PACKED_BIT",
+      "valid": true,
+      "vector": [],
+      "dtype_hex": "0x10",
+      "dtype_alias": "PACKED_BIT",
+      "padding": 0,
+      "canonical_bson": "1400000005766563746F72000200000009100000"
+    },
     {
       "description": "Overflow Vector PACKED_BIT",
       "valid": false,

@@ -739,7 +739,7 @@ def test_vector(self):
         """Tests of subtype 9"""
         # We start with valid cases, across the 3 dtypes implemented.
         # Work with a simple vector that can be interpreted as int8, float32, or ubyte
-        list_vector = [127, 7]
+        list_vector = [127, 8]
         # As INT8, vector has length 2
         binary_vector = Binary.from_vector(list_vector, BinaryVectorDtype.INT8)
         vector = binary_vector.as_vector()
@@ -764,18 +764,18 @@ def test_vector(self):
         uncompressed = ""
         for val in list_vector:
             uncompressed += format(val, "08b")
-        assert uncompressed[:-padding] == "0111111100000"
+        assert uncompressed[:-padding] == "0111111100001"
 
         # It is worthwhile explicitly showing the values encoded to BSON
         padded_doc = {"padded_vec": padded_vec}
         assert (
             encode(padded_doc)
-            == b"\x1a\x00\x00\x00\x05padded_vec\x00\x04\x00\x00\x00\t\x10\x03\x7f\x07\x00"
+            == b"\x1a\x00\x00\x00\x05padded_vec\x00\x04\x00\x00\x00\t\x10\x03\x7f\x08\x00"
         )
         # and dumped to json
         assert (
             json_util.dumps(padded_doc)
-            == '{"padded_vec": {"$binary": {"base64": "EAN/Bw==", "subType": "09"}}}'
+            == '{"padded_vec": {"$binary": {"base64": "EAN/CA==", "subType": "09"}}}'
         )
 
         # FLOAT32 is also implemented

@@ -48,11 +48,11 @@ def create_test(case_spec):
     def run_test(self):
         for test_case in case_spec.get("tests", []):
             description = test_case["description"]
-            vector_exp = test_case.get("vector", [])
+            vector_exp = test_case.get("vector", None)
             dtype_hex_exp = test_case["dtype_hex"]
             dtype_alias_exp = test_case.get("dtype_alias")
             padding_exp = test_case.get("padding", 0)
-            canonical_bson_exp = test_case.get("canonical_bson")
+            canonical_bson_exp = test_case.get("canonical_bson", None)
             # Convert dtype hex string into bytes
             dtype_exp = BinaryVectorDtype(int(dtype_hex_exp, 16).to_bytes(1, byteorder="little"))
 
@@ -85,14 +85,25 @@ def run_test(self):
                 self.assertEqual(cB_obs, canonical_bson_exp, description)
 
             else:
-                with self.assertRaises((struct.error, ValueError), msg=description):
-                    # Tests Binary.from_vector
-                    Binary.from_vector(vector_exp, dtype_exp, padding_exp)
-                    # Tests Binary.as_vector
-                    cB_exp = binascii.unhexlify(canonical_bson_exp.encode("utf8"))
-                    decoded_doc = decode(cB_exp)
-                    binary_obs = decoded_doc[test_key]
-                    binary_obs.as_vector()
+                """
+                #### To prove correct in an invalid case (`valid:false`), one MUST
+                - if the vector field is present, raise an exception when attempting to encode a document from the numeric values,
+                dtype, and padding.
+                - if the canonical_bson field is present, raise an exception when attempting to deserialize it into the corresponding
+                    numeric values, as the field contains corrupted data.
+                """
+                # Tests Binary.from_vector()
+                if vector_exp is not None:
+                    with self.assertRaises((struct.error, ValueError), msg=description):
+                        Binary.from_vector(vector_exp, dtype_exp, padding_exp)
+
+                # Tests Binary.as_vector()
+                if canonical_bson_exp is not None:
+                    with self.assertRaises((struct.error, ValueError), msg=description):
+                        cB_exp = binascii.unhexlify(canonical_bson_exp.encode("utf8"))
+                        decoded_doc = decode(cB_exp)
+                        binary_obs = decoded_doc[test_key]
+                        binary_obs.as_vector()
 
     return run_test