Skip to content

Commit 3438e16

Browse files
authored
Re-enable VLenBytes round-trip None test (#736)
1 parent 85eeed3 commit 3438e16

File tree

2 files changed

+83
-110
lines changed

2 files changed

+83
-110
lines changed

Diff for: numcodecs/tests/test_vlen_bytes.py

-4
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
import sys
21
import unittest
32

43
import numpy as np
@@ -85,9 +84,6 @@ def test_decode_errors():
8584
codec.decode(enc, out=np.zeros(10, dtype='i4'))
8685

8786

88-
# TODO: fix this test on GitHub actions somehow...
89-
# See https://github.com/zarr-developers/numcodecs/issues/683
90-
@pytest.mark.skipif(sys.platform == "darwin", reason="Test is failing on macOS on GitHub actions.")
9187
def test_encode_none():
9288
a = np.array([b'foo', None, b'bar'], dtype=object)
9389
codec = VLenBytes()

Diff for: numcodecs/vlen.pyx

+83-106
Original file line numberDiff line numberDiff line change
@@ -10,23 +10,10 @@ cimport cython
1010
from libc.stdint cimport uint8_t, uint32_t
1111
from libc.string cimport memcpy
1212

13-
from cpython.buffer cimport PyBuffer_IsContiguous
14-
from cpython.bytearray cimport (
15-
PyByteArray_AS_STRING,
16-
PyByteArray_FromStringAndSize,
17-
)
18-
from cpython.bytes cimport (
19-
PyBytes_AS_STRING,
20-
PyBytes_GET_SIZE,
21-
PyBytes_Check,
22-
PyBytes_FromStringAndSize,
23-
)
13+
from cpython.bytearray cimport PyByteArray_FromStringAndSize
14+
from cpython.bytes cimport PyBytes_FromStringAndSize
2415
from cpython.memoryview cimport PyMemoryView_GET_BUFFER
25-
from cpython.unicode cimport (
26-
PyUnicode_AsUTF8String,
27-
PyUnicode_Check,
28-
PyUnicode_FromStringAndSize,
29-
)
16+
from cpython.unicode cimport PyUnicode_FromStringAndSize
3017

3118
from numpy cimport ndarray
3219

@@ -89,15 +76,15 @@ class VLenUTF8(Codec):
8976
@cython.boundscheck(False)
9077
def encode(self, buf):
9178
cdef:
92-
Py_ssize_t i, l, n_items, data_length, total_length
79+
Py_ssize_t i, L, n_items, data_length
9380
ndarray[object, ndim=1] input_values
9481
object[:] encoded_values
9582
int[:] encoded_lengths
96-
char* encv
9783
bytes b
9884
bytearray out
9985
char* data
100-
object u
86+
object o
87+
unicode u
10188

10289
# normalise input
10390
input_values = np.asarray(buf, dtype=object).reshape(-1, order='A')
@@ -110,36 +97,33 @@ class VLenUTF8(Codec):
11097
encoded_lengths = np.empty(n_items, dtype=np.intc)
11198

11299
# first iteration to convert to bytes
113-
data_length = 0
100+
data_length = HEADER_LENGTH
114101
for i in range(n_items):
115-
u = input_values[i]
116-
if u is None or u == 0: # treat these as missing value, normalize
117-
u = ''
118-
elif not PyUnicode_Check(u):
119-
raise TypeError('expected unicode string, found %r' % u)
120-
b = PyUnicode_AsUTF8String(u)
121-
l = PyBytes_GET_SIZE(b)
102+
o = input_values[i]
103+
# replace missing value and coerce to typed data
104+
u = "" if o is None or o == 0 else o
105+
b = u.encode("utf-8")
106+
L = len(b)
122107
encoded_values[i] = b
123-
data_length += l + HEADER_LENGTH
124-
encoded_lengths[i] = l
108+
data_length += L + HEADER_LENGTH
109+
encoded_lengths[i] = L
125110

126111
# setup output
127-
total_length = HEADER_LENGTH + data_length
128-
out = PyByteArray_FromStringAndSize(NULL, total_length)
112+
out = PyByteArray_FromStringAndSize(NULL, data_length)
129113

130114
# write header
131-
data = PyByteArray_AS_STRING(out)
115+
data = out
132116
store_le32(<uint8_t*>data, n_items)
133117

134118
# second iteration, store data
135119
data += HEADER_LENGTH
136120
for i in range(n_items):
137-
l = encoded_lengths[i]
138-
store_le32(<uint8_t*>data, l)
121+
L = encoded_lengths[i]
122+
store_le32(<uint8_t*>data, L)
139123
data += HEADER_LENGTH
140-
encv = PyBytes_AS_STRING(encoded_values[i])
141-
memcpy(data, encv, l)
142-
data += l
124+
b = encoded_values[i]
125+
memcpy(data, <const char*>b, L)
126+
data += L
143127

144128
return out
145129

@@ -151,16 +135,14 @@ class VLenUTF8(Codec):
151135
const Py_buffer* buf_pb
152136
const char* data
153137
const char* data_end
154-
Py_ssize_t i, l, n_items, data_length
138+
Py_ssize_t i, L, n_items, data_length
155139

156140
# obtain memoryview
157141
buf = ensure_contiguous_ndarray(buf)
158-
buf_mv = memoryview(buf)
142+
buf_mv = ensure_continguous_memoryview(buf)
159143
buf_pb = PyMemoryView_GET_BUFFER(buf_mv)
160144

161145
# sanity checks
162-
if not PyBuffer_IsContiguous(buf_pb, b'A'):
163-
raise BufferError("`buf` must contain contiguous memory")
164146
if buf_pb.len < HEADER_LENGTH:
165147
raise ValueError('corrupt buffer, missing or truncated header')
166148

@@ -184,12 +166,12 @@ class VLenUTF8(Codec):
184166
for i in range(n_items):
185167
if data + HEADER_LENGTH > data_end:
186168
raise ValueError('corrupt buffer, data seem truncated')
187-
l = load_le32(<uint8_t*>data)
169+
L = load_le32(<uint8_t*>data)
188170
data += HEADER_LENGTH
189-
if data + l > data_end:
171+
if data + L > data_end:
190172
raise ValueError('corrupt buffer, data seem truncated')
191-
out[i] = PyUnicode_FromStringAndSize(data, l)
192-
data += l
173+
out[i] = PyUnicode_FromStringAndSize(data, L)
174+
data += L
193175

194176
return out
195177

@@ -225,11 +207,12 @@ class VLenBytes(Codec):
225207
@cython.boundscheck(False)
226208
def encode(self, buf):
227209
cdef:
228-
Py_ssize_t i, l, n_items, data_length, total_length
210+
Py_ssize_t i, L, n_items, data_length
229211
object[:] values
212+
object[:] normed_values
230213
int[:] lengths
231-
char* encv
232-
object b
214+
object o
215+
bytes b
233216
bytearray out
234217
char* data
235218

@@ -240,37 +223,36 @@ class VLenBytes(Codec):
240223
n_items = values.shape[0]
241224

242225
# setup intermediates
226+
normed_values = np.empty(n_items, dtype=object)
243227
lengths = np.empty(n_items, dtype=np.intc)
244228

245229
# first iteration to find lengths
246-
data_length = 0
230+
data_length = HEADER_LENGTH
247231
for i in range(n_items):
248-
b = values[i]
249-
if b is None or b == 0: # treat these as missing value, normalize
250-
b = b''
251-
elif not PyBytes_Check(b):
252-
raise TypeError('expected byte string, found %r' % b)
253-
l = PyBytes_GET_SIZE(b)
254-
data_length += l + HEADER_LENGTH
255-
lengths[i] = l
232+
o = values[i]
233+
# replace missing value and coerce to typed data
234+
b = b"" if o is None or o == 0 else o
235+
normed_values[i] = b
236+
L = len(b)
237+
data_length += HEADER_LENGTH + L
238+
lengths[i] = L
256239

257240
# setup output
258-
total_length = HEADER_LENGTH + data_length
259-
out = PyByteArray_FromStringAndSize(NULL, total_length)
241+
out = PyByteArray_FromStringAndSize(NULL, data_length)
260242

261243
# write header
262-
data = PyByteArray_AS_STRING(out)
244+
data = out
263245
store_le32(<uint8_t*>data, n_items)
264246

265247
# second iteration, store data
266248
data += HEADER_LENGTH
267249
for i in range(n_items):
268-
l = lengths[i]
269-
store_le32(<uint8_t*>data, l)
250+
L = lengths[i]
251+
store_le32(<uint8_t*>data, L)
270252
data += HEADER_LENGTH
271-
encv = PyBytes_AS_STRING(values[i])
272-
memcpy(data, encv, l)
273-
data += l
253+
b = normed_values[i]
254+
memcpy(data, <const char*>b, L)
255+
data += L
274256

275257
return out
276258

@@ -282,16 +264,14 @@ class VLenBytes(Codec):
282264
const Py_buffer* buf_pb
283265
const char* data
284266
const char* data_end
285-
Py_ssize_t i, l, n_items, data_length
267+
Py_ssize_t i, L, n_items, data_length
286268

287269
# obtain memoryview
288270
buf = ensure_contiguous_ndarray(buf)
289-
buf_mv = memoryview(buf)
271+
buf_mv = ensure_continguous_memoryview(buf)
290272
buf_pb = PyMemoryView_GET_BUFFER(buf_mv)
291273

292274
# sanity checks
293-
if not PyBuffer_IsContiguous(buf_pb, b'A'):
294-
raise BufferError("`buf` must contain contiguous memory")
295275
if buf_pb.len < HEADER_LENGTH:
296276
raise ValueError('corrupt buffer, missing or truncated header')
297277

@@ -315,12 +295,12 @@ class VLenBytes(Codec):
315295
for i in range(n_items):
316296
if data + HEADER_LENGTH > data_end:
317297
raise ValueError('corrupt buffer, data seem truncated')
318-
l = load_le32(<uint8_t*>data)
298+
L = load_le32(<uint8_t*>data)
319299
data += HEADER_LENGTH
320-
if data + l > data_end:
300+
if data + L > data_end:
321301
raise ValueError('corrupt buffer, data seem truncated')
322-
out[i] = PyBytes_FromStringAndSize(data, l)
323-
data += l
302+
out[i] = PyBytes_FromStringAndSize(data, L)
303+
data += L
324304

325305
return out
326306

@@ -369,17 +349,16 @@ class VLenArray(Codec):
369349
@cython.boundscheck(False)
370350
def encode(self, buf):
371351
cdef:
372-
Py_ssize_t i, l, n_items, data_length, total_length
352+
Py_ssize_t i, L, n_items, data_length
373353
object[:] values
374354
object[:] normed_values
375355
int[:] lengths
376-
const char* encv
377356
bytes b
378357
bytearray out
379358
char* data
380359
memoryview value_mv
381360
const Py_buffer* value_pb
382-
object v
361+
object o
383362

384363
# normalise input
385364
values = np.asarray(buf, dtype=object).reshape(-1, order='A')
@@ -392,41 +371,41 @@ class VLenArray(Codec):
392371
lengths = np.empty(n_items, dtype=np.intc)
393372

394373
# first iteration to convert to bytes
395-
data_length = 0
374+
data_length = HEADER_LENGTH
396375
for i in range(n_items):
397-
v = values[i]
398-
if v is None:
399-
v = np.array([], dtype=self.dtype)
400-
else:
401-
v = np.ascontiguousarray(v, self.dtype)
402-
if v.ndim != 1:
403-
raise ValueError('only 1-dimensional arrays are supported')
404-
l = v.nbytes
405-
normed_values[i] = v
406-
data_length += l + HEADER_LENGTH
407-
lengths[i] = l
376+
o = values[i]
377+
# replace missing value and coerce to typed data
378+
value_mv = ensure_continguous_memoryview(
379+
np.array([], dtype=self.dtype) if o is None
380+
else np.ascontiguousarray(o, self.dtype)
381+
)
382+
value_pb = PyMemoryView_GET_BUFFER(value_mv)
383+
if value_pb.ndim != 1:
384+
raise ValueError("only 1-dimensional arrays are supported")
385+
L = value_pb.len
386+
normed_values[i] = value_mv
387+
data_length += HEADER_LENGTH + L
388+
lengths[i] = L
408389

409390
# setup output
410-
total_length = HEADER_LENGTH + data_length
411-
out = PyByteArray_FromStringAndSize(NULL, total_length)
391+
out = PyByteArray_FromStringAndSize(NULL, data_length)
412392

413393
# write header
414-
data = PyByteArray_AS_STRING(out)
394+
data = out
415395
store_le32(<uint8_t*>data, n_items)
416396

417397
# second iteration, store data
418398
data += HEADER_LENGTH
419399
for i in range(n_items):
420-
l = lengths[i]
421-
store_le32(<uint8_t*>data, l)
400+
L = lengths[i]
401+
store_le32(<uint8_t*>data, L)
422402
data += HEADER_LENGTH
423403

424-
value_mv = ensure_continguous_memoryview(normed_values[i])
404+
value_mv = normed_values[i]
425405
value_pb = PyMemoryView_GET_BUFFER(value_mv)
426-
encv = <const char*>value_pb.buf
427406

428-
memcpy(data, encv, l)
429-
data += l
407+
memcpy(data, value_pb.buf, L)
408+
data += L
430409

431410
return out
432411

@@ -441,16 +420,14 @@ class VLenArray(Codec):
441420
object v
442421
memoryview v_mv
443422
Py_buffer* v_pb
444-
Py_ssize_t i, l, n_items, data_length
423+
Py_ssize_t i, L, n_items, data_length
445424

446425
# obtain memoryview
447426
buf = ensure_contiguous_ndarray(buf)
448-
buf_mv = memoryview(buf)
427+
buf_mv = ensure_continguous_memoryview(buf)
449428
buf_pb = PyMemoryView_GET_BUFFER(buf_mv)
450429

451430
# sanity checks
452-
if not PyBuffer_IsContiguous(buf_pb, b'A'):
453-
raise BufferError("`buf` must contain contiguous memory")
454431
if buf_pb.len < HEADER_LENGTH:
455432
raise ValueError('corrupt buffer, missing or truncated header')
456433

@@ -474,18 +451,18 @@ class VLenArray(Codec):
474451
for i in range(n_items):
475452
if data + HEADER_LENGTH > data_end:
476453
raise ValueError('corrupt buffer, data seem truncated')
477-
l = load_le32(<uint8_t*>data)
454+
L = load_le32(<uint8_t*>data)
478455
data += HEADER_LENGTH
479-
if data + l > data_end:
456+
if data + L > data_end:
480457
raise ValueError('corrupt buffer, data seem truncated')
481458

482459
# Create & fill array value
483-
v = np.empty((l,), dtype="uint8").view(self.dtype)
460+
v = np.empty((L,), dtype="uint8").view(self.dtype)
484461
v_mv = memoryview(v)
485462
v_pb = PyMemoryView_GET_BUFFER(v_mv)
486-
memcpy(v_pb.buf, data, l)
463+
memcpy(v_pb.buf, data, L)
487464

488465
out[i] = v
489-
data += l
466+
data += L
490467

491468
return out

0 commit comments

Comments
 (0)