Re-enable VLenBytes round-trip None test (#736)

jakirkham · web-flow · commit 3438e1619301 · 2025-04-10T21:41:05.000-07:00
diff --git a/numcodecs/tests/test_vlen_bytes.py b/numcodecs/tests/test_vlen_bytes.py
@@ -1,4 +1,3 @@
-import sys
 import unittest
 
 import numpy as np
@@ -85,9 +84,6 @@ def test_decode_errors():
         codec.decode(enc, out=np.zeros(10, dtype='i4'))
 
 
-# TODO: fix this test on GitHub actions somehow...
-# See https://github.com/zarr-developers/numcodecs/issues/683
-@pytest.mark.skipif(sys.platform == "darwin", reason="Test is failing on macOS on GitHub actions.")
 def test_encode_none():
     a = np.array([b'foo', None, b'bar'], dtype=object)
     codec = VLenBytes()
diff --git a/numcodecs/vlen.pyx b/numcodecs/vlen.pyx
@@ -10,23 +10,10 @@ cimport cython
 from libc.stdint cimport uint8_t, uint32_t
 from libc.string cimport memcpy
 
-from cpython.buffer cimport PyBuffer_IsContiguous
-from cpython.bytearray cimport (
-    PyByteArray_AS_STRING,
-    PyByteArray_FromStringAndSize,
-)
-from cpython.bytes cimport (
-    PyBytes_AS_STRING,
-    PyBytes_GET_SIZE,
-    PyBytes_Check,
-    PyBytes_FromStringAndSize,
-)
+from cpython.bytearray cimport PyByteArray_FromStringAndSize
+from cpython.bytes cimport PyBytes_FromStringAndSize
 from cpython.memoryview cimport PyMemoryView_GET_BUFFER
-from cpython.unicode cimport (
-    PyUnicode_AsUTF8String,
-    PyUnicode_Check,
-    PyUnicode_FromStringAndSize,
-)
+from cpython.unicode cimport PyUnicode_FromStringAndSize
 
 from numpy cimport ndarray
 
@@ -89,15 +76,15 @@ class VLenUTF8(Codec):
     @cython.boundscheck(False)
     def encode(self, buf):
         cdef:
-            Py_ssize_t i, l, n_items, data_length, total_length
+            Py_ssize_t i, L, n_items, data_length
             ndarray[object, ndim=1] input_values
             object[:] encoded_values
             int[:] encoded_lengths
-            char* encv
             bytes b
             bytearray out
             char* data
-            object u
+            object o
+            unicode u
 
         # normalise input
         input_values = np.asarray(buf, dtype=object).reshape(-1, order='A')
@@ -110,36 +97,33 @@ class VLenUTF8(Codec):
         encoded_lengths = np.empty(n_items, dtype=np.intc)
 
         # first iteration to convert to bytes
-        data_length = 0
+        data_length = HEADER_LENGTH
         for i in range(n_items):
-            u = input_values[i]
-            if u is None or u == 0:  # treat these as missing value, normalize
-                u = ''
-            elif not PyUnicode_Check(u):
-                raise TypeError('expected unicode string, found %r' % u)
-            b = PyUnicode_AsUTF8String(u)
-            l = PyBytes_GET_SIZE(b)
+            o = input_values[i]
+            # replace missing value and coerce to typed data
+            u = "" if o is None or o == 0 else o
+            b = u.encode("utf-8")
+            L = len(b)
             encoded_values[i] = b
-            data_length += l + HEADER_LENGTH
-            encoded_lengths[i] = l
+            data_length += L + HEADER_LENGTH
+            encoded_lengths[i] = L
 
         # setup output
-        total_length = HEADER_LENGTH + data_length
-        out = PyByteArray_FromStringAndSize(NULL, total_length)
+        out = PyByteArray_FromStringAndSize(NULL, data_length)
 
         # write header
-        data = PyByteArray_AS_STRING(out)
+        data = out
         store_le32(<uint8_t*>data, n_items)
 
         # second iteration, store data
         data += HEADER_LENGTH
         for i in range(n_items):
-            l = encoded_lengths[i]
-            store_le32(<uint8_t*>data, l)
+            L = encoded_lengths[i]
+            store_le32(<uint8_t*>data, L)
             data += HEADER_LENGTH
-            encv = PyBytes_AS_STRING(encoded_values[i])
-            memcpy(data, encv, l)
-            data += l
+            b = encoded_values[i]
+            memcpy(data, <const char*>b, L)
+            data += L
 
         return out
 
@@ -151,16 +135,14 @@ class VLenUTF8(Codec):
             const Py_buffer* buf_pb
             const char* data
             const char* data_end
-            Py_ssize_t i, l, n_items, data_length
+            Py_ssize_t i, L, n_items, data_length
 
         # obtain memoryview
         buf = ensure_contiguous_ndarray(buf)
-        buf_mv = memoryview(buf)
+        buf_mv = ensure_continguous_memoryview(buf)
         buf_pb = PyMemoryView_GET_BUFFER(buf_mv)
 
         # sanity checks
-        if not PyBuffer_IsContiguous(buf_pb, b'A'):
-            raise BufferError("`buf` must contain contiguous memory")
         if buf_pb.len < HEADER_LENGTH:
             raise ValueError('corrupt buffer, missing or truncated header')
 
@@ -184,12 +166,12 @@ class VLenUTF8(Codec):
         for i in range(n_items):
             if data + HEADER_LENGTH > data_end:
                 raise ValueError('corrupt buffer, data seem truncated')
-            l = load_le32(<uint8_t*>data)
+            L = load_le32(<uint8_t*>data)
             data += HEADER_LENGTH
-            if data + l > data_end:
+            if data + L > data_end:
                 raise ValueError('corrupt buffer, data seem truncated')
-            out[i] = PyUnicode_FromStringAndSize(data, l)
-            data += l
+            out[i] = PyUnicode_FromStringAndSize(data, L)
+            data += L
 
         return out
 
@@ -225,11 +207,12 @@ class VLenBytes(Codec):
     @cython.boundscheck(False)
     def encode(self, buf):
         cdef:
-            Py_ssize_t i, l, n_items, data_length, total_length
+            Py_ssize_t i, L, n_items, data_length
             object[:] values
+            object[:] normed_values
             int[:] lengths
-            char* encv
-            object b
+            object o
+            bytes b
             bytearray out
             char* data
 
@@ -240,37 +223,36 @@ class VLenBytes(Codec):
         n_items = values.shape[0]
 
         # setup intermediates
+        normed_values = np.empty(n_items, dtype=object)
         lengths = np.empty(n_items, dtype=np.intc)
 
         # first iteration to find lengths
-        data_length = 0
+        data_length = HEADER_LENGTH
         for i in range(n_items):
-            b = values[i]
-            if b is None or b == 0:  # treat these as missing value, normalize
-                b = b''
-            elif not PyBytes_Check(b):
-                raise TypeError('expected byte string, found %r' % b)
-            l = PyBytes_GET_SIZE(b)
-            data_length += l + HEADER_LENGTH
-            lengths[i] = l
+            o = values[i]
+            # replace missing value and coerce to typed data
+            b = b"" if o is None or o == 0 else o
+            normed_values[i] = b
+            L = len(b)
+            data_length += HEADER_LENGTH + L
+            lengths[i] = L
 
         # setup output
-        total_length = HEADER_LENGTH + data_length
-        out = PyByteArray_FromStringAndSize(NULL, total_length)
+        out = PyByteArray_FromStringAndSize(NULL, data_length)
 
         # write header
-        data = PyByteArray_AS_STRING(out)
+        data = out
         store_le32(<uint8_t*>data, n_items)
 
         # second iteration, store data
         data += HEADER_LENGTH
         for i in range(n_items):
-            l = lengths[i]
-            store_le32(<uint8_t*>data, l)
+            L = lengths[i]
+            store_le32(<uint8_t*>data, L)
             data += HEADER_LENGTH
-            encv = PyBytes_AS_STRING(values[i])
-            memcpy(data, encv, l)
-            data += l
+            b = normed_values[i]
+            memcpy(data, <const char*>b, L)
+            data += L
 
         return out
 
@@ -282,16 +264,14 @@ class VLenBytes(Codec):
             const Py_buffer* buf_pb
             const char* data
             const char* data_end
-            Py_ssize_t i, l, n_items, data_length
+            Py_ssize_t i, L, n_items, data_length
 
         # obtain memoryview
         buf = ensure_contiguous_ndarray(buf)
-        buf_mv = memoryview(buf)
+        buf_mv = ensure_continguous_memoryview(buf)
         buf_pb = PyMemoryView_GET_BUFFER(buf_mv)
 
         # sanity checks
-        if not PyBuffer_IsContiguous(buf_pb, b'A'):
-            raise BufferError("`buf` must contain contiguous memory")
         if buf_pb.len < HEADER_LENGTH:
             raise ValueError('corrupt buffer, missing or truncated header')
 
@@ -315,12 +295,12 @@ class VLenBytes(Codec):
         for i in range(n_items):
             if data + HEADER_LENGTH > data_end:
                 raise ValueError('corrupt buffer, data seem truncated')
-            l = load_le32(<uint8_t*>data)
+            L = load_le32(<uint8_t*>data)
             data += HEADER_LENGTH
-            if data + l > data_end:
+            if data + L > data_end:
                 raise ValueError('corrupt buffer, data seem truncated')
-            out[i] = PyBytes_FromStringAndSize(data, l)
-            data += l
+            out[i] = PyBytes_FromStringAndSize(data, L)
+            data += L
 
         return out
 
@@ -369,17 +349,16 @@ class VLenArray(Codec):
     @cython.boundscheck(False)
     def encode(self, buf):
         cdef:
-            Py_ssize_t i, l, n_items, data_length, total_length
+            Py_ssize_t i, L, n_items, data_length
             object[:] values
             object[:] normed_values
             int[:] lengths
-            const char* encv
             bytes b
             bytearray out
             char* data
             memoryview value_mv
             const Py_buffer* value_pb
-            object v
+            object o
 
         # normalise input
         values = np.asarray(buf, dtype=object).reshape(-1, order='A')
@@ -392,41 +371,41 @@ class VLenArray(Codec):
         lengths = np.empty(n_items, dtype=np.intc)
 
         # first iteration to convert to bytes
-        data_length = 0
+        data_length = HEADER_LENGTH
         for i in range(n_items):
-            v = values[i]
-            if v is None:
-                v = np.array([], dtype=self.dtype)
-            else:
-                v = np.ascontiguousarray(v, self.dtype)
-            if v.ndim != 1:
-                raise ValueError('only 1-dimensional arrays are supported')
-            l = v.nbytes
-            normed_values[i] = v
-            data_length += l + HEADER_LENGTH
-            lengths[i] = l
+            o = values[i]
+            # replace missing value and coerce to typed data
+            value_mv = ensure_continguous_memoryview(
+                np.array([], dtype=self.dtype) if o is None
+                else np.ascontiguousarray(o, self.dtype)
+            )
+            value_pb = PyMemoryView_GET_BUFFER(value_mv)
+            if value_pb.ndim != 1:
+                raise ValueError("only 1-dimensional arrays are supported")
+            L = value_pb.len
+            normed_values[i] = value_mv
+            data_length += HEADER_LENGTH + L
+            lengths[i] = L
 
         # setup output
-        total_length = HEADER_LENGTH + data_length
-        out = PyByteArray_FromStringAndSize(NULL, total_length)
+        out = PyByteArray_FromStringAndSize(NULL, data_length)
 
         # write header
-        data = PyByteArray_AS_STRING(out)
+        data = out
         store_le32(<uint8_t*>data, n_items)
 
         # second iteration, store data
         data += HEADER_LENGTH
         for i in range(n_items):
-            l = lengths[i]
-            store_le32(<uint8_t*>data, l)
+            L = lengths[i]
+            store_le32(<uint8_t*>data, L)
             data += HEADER_LENGTH
 
-            value_mv = ensure_continguous_memoryview(normed_values[i])
+            value_mv = normed_values[i]
             value_pb = PyMemoryView_GET_BUFFER(value_mv)
-            encv = <const char*>value_pb.buf
 
-            memcpy(data, encv, l)
-            data += l
+            memcpy(data, value_pb.buf, L)
+            data += L
 
         return out
 
@@ -441,16 +420,14 @@ class VLenArray(Codec):
             object v
             memoryview v_mv
             Py_buffer* v_pb
-            Py_ssize_t i, l, n_items, data_length
+            Py_ssize_t i, L, n_items, data_length
 
         # obtain memoryview
         buf = ensure_contiguous_ndarray(buf)
-        buf_mv = memoryview(buf)
+        buf_mv = ensure_continguous_memoryview(buf)
         buf_pb = PyMemoryView_GET_BUFFER(buf_mv)
 
         # sanity checks
-        if not PyBuffer_IsContiguous(buf_pb, b'A'):
-            raise BufferError("`buf` must contain contiguous memory")
         if buf_pb.len < HEADER_LENGTH:
             raise ValueError('corrupt buffer, missing or truncated header')
 
@@ -474,18 +451,18 @@ class VLenArray(Codec):
         for i in range(n_items):
             if data + HEADER_LENGTH > data_end:
                 raise ValueError('corrupt buffer, data seem truncated')
-            l = load_le32(<uint8_t*>data)
+            L = load_le32(<uint8_t*>data)
             data += HEADER_LENGTH
-            if data + l > data_end:
+            if data + L > data_end:
                 raise ValueError('corrupt buffer, data seem truncated')
 
             # Create & fill array value
-            v = np.empty((l,), dtype="uint8").view(self.dtype)
+            v = np.empty((L,), dtype="uint8").view(self.dtype)
             v_mv = memoryview(v)
             v_pb = PyMemoryView_GET_BUFFER(v_mv)
-            memcpy(v_pb.buf, data, l)
+            memcpy(v_pb.buf, data, L)
 
             out[i] = v
-            data += l
+            data += L
 
         return out