@@ -10,23 +10,10 @@ cimport cython
10
10
from libc.stdint cimport uint8_t, uint32_t
11
11
from libc.string cimport memcpy
12
12
13
- from cpython.buffer cimport PyBuffer_IsContiguous
14
- from cpython.bytearray cimport (
15
- PyByteArray_AS_STRING,
16
- PyByteArray_FromStringAndSize,
17
- )
18
- from cpython.bytes cimport (
19
- PyBytes_AS_STRING,
20
- PyBytes_GET_SIZE,
21
- PyBytes_Check,
22
- PyBytes_FromStringAndSize,
23
- )
13
+ from cpython.bytearray cimport PyByteArray_FromStringAndSize
14
+ from cpython.bytes cimport PyBytes_FromStringAndSize
24
15
from cpython.memoryview cimport PyMemoryView_GET_BUFFER
25
- from cpython.unicode cimport (
26
- PyUnicode_AsUTF8String,
27
- PyUnicode_Check,
28
- PyUnicode_FromStringAndSize,
29
- )
16
+ from cpython.unicode cimport PyUnicode_FromStringAndSize
30
17
31
18
from numpy cimport ndarray
32
19
@@ -89,15 +76,15 @@ class VLenUTF8(Codec):
89
76
@ cython.boundscheck (False )
90
77
def encode (self , buf ):
91
78
cdef:
92
- Py_ssize_t i, l , n_items, data_length, total_length
79
+ Py_ssize_t i, L , n_items, data_length
93
80
ndarray[object , ndim= 1 ] input_values
94
81
object [:] encoded_values
95
82
int [:] encoded_lengths
96
- char * encv
97
83
bytes b
98
84
bytearray out
99
85
char * data
100
- object u
86
+ object o
87
+ unicode u
101
88
102
89
# normalise input
103
90
input_values = np.asarray(buf, dtype = object ).reshape(- 1 , order = ' A' )
@@ -110,36 +97,33 @@ class VLenUTF8(Codec):
110
97
encoded_lengths = np.empty(n_items, dtype = np.intc)
111
98
112
99
# first iteration to convert to bytes
113
- data_length = 0
100
+ data_length = HEADER_LENGTH
114
101
for i in range (n_items):
115
- u = input_values[i]
116
- if u is None or u == 0 : # treat these as missing value, normalize
117
- u = ' '
118
- elif not PyUnicode_Check(u):
119
- raise TypeError (' expected unicode string, found %r ' % u)
120
- b = PyUnicode_AsUTF8String(u)
121
- l = PyBytes_GET_SIZE(b)
102
+ o = input_values[i]
103
+ # replace missing value and coerce to typed data
104
+ u = " " if o is None or o == 0 else o
105
+ b = u.encode(" utf-8" )
106
+ L = len (b)
122
107
encoded_values[i] = b
123
- data_length += l + HEADER_LENGTH
124
- encoded_lengths[i] = l
108
+ data_length += L + HEADER_LENGTH
109
+ encoded_lengths[i] = L
125
110
126
111
# setup output
127
- total_length = HEADER_LENGTH + data_length
128
- out = PyByteArray_FromStringAndSize(NULL , total_length)
112
+ out = PyByteArray_FromStringAndSize(NULL , data_length)
129
113
130
114
# write header
131
- data = PyByteArray_AS_STRING( out)
115
+ data = out
132
116
store_le32(< uint8_t* > data, n_items)
133
117
134
118
# second iteration, store data
135
119
data += HEADER_LENGTH
136
120
for i in range (n_items):
137
- l = encoded_lengths[i]
138
- store_le32(< uint8_t* > data, l )
121
+ L = encoded_lengths[i]
122
+ store_le32(< uint8_t* > data, L )
139
123
data += HEADER_LENGTH
140
- encv = PyBytes_AS_STRING( encoded_values[i])
141
- memcpy(data, encv, l )
142
- data += l
124
+ b = encoded_values[i]
125
+ memcpy(data, < const char * > b, L )
126
+ data += L
143
127
144
128
return out
145
129
@@ -151,16 +135,14 @@ class VLenUTF8(Codec):
151
135
const Py_buffer* buf_pb
152
136
const char * data
153
137
const char * data_end
154
- Py_ssize_t i, l , n_items, data_length
138
+ Py_ssize_t i, L , n_items, data_length
155
139
156
140
# obtain memoryview
157
141
buf = ensure_contiguous_ndarray(buf)
158
- buf_mv = memoryview (buf)
142
+ buf_mv = ensure_continguous_memoryview (buf)
159
143
buf_pb = PyMemoryView_GET_BUFFER(buf_mv)
160
144
161
145
# sanity checks
162
- if not PyBuffer_IsContiguous(buf_pb, b' A' ):
163
- raise BufferError(" `buf` must contain contiguous memory" )
164
146
if buf_pb.len < HEADER_LENGTH:
165
147
raise ValueError (' corrupt buffer, missing or truncated header' )
166
148
@@ -184,12 +166,12 @@ class VLenUTF8(Codec):
184
166
for i in range (n_items):
185
167
if data + HEADER_LENGTH > data_end:
186
168
raise ValueError (' corrupt buffer, data seem truncated' )
187
- l = load_le32(< uint8_t* > data)
169
+ L = load_le32(< uint8_t* > data)
188
170
data += HEADER_LENGTH
189
- if data + l > data_end:
171
+ if data + L > data_end:
190
172
raise ValueError (' corrupt buffer, data seem truncated' )
191
- out[i] = PyUnicode_FromStringAndSize(data, l )
192
- data += l
173
+ out[i] = PyUnicode_FromStringAndSize(data, L )
174
+ data += L
193
175
194
176
return out
195
177
@@ -225,11 +207,12 @@ class VLenBytes(Codec):
225
207
@ cython.boundscheck (False )
226
208
def encode (self , buf ):
227
209
cdef:
228
- Py_ssize_t i, l , n_items, data_length, total_length
210
+ Py_ssize_t i, L , n_items, data_length
229
211
object [:] values
212
+ object [:] normed_values
230
213
int [:] lengths
231
- char * encv
232
- object b
214
+ object o
215
+ bytes b
233
216
bytearray out
234
217
char * data
235
218
@@ -240,37 +223,36 @@ class VLenBytes(Codec):
240
223
n_items = values.shape[0 ]
241
224
242
225
# setup intermediates
226
+ normed_values = np.empty(n_items, dtype = object )
243
227
lengths = np.empty(n_items, dtype = np.intc)
244
228
245
229
# first iteration to find lengths
246
- data_length = 0
230
+ data_length = HEADER_LENGTH
247
231
for i in range (n_items):
248
- b = values[i]
249
- if b is None or b == 0 : # treat these as missing value, normalize
250
- b = b' '
251
- elif not PyBytes_Check(b):
252
- raise TypeError (' expected byte string, found %r ' % b)
253
- l = PyBytes_GET_SIZE(b)
254
- data_length += l + HEADER_LENGTH
255
- lengths[i] = l
232
+ o = values[i]
233
+ # replace missing value and coerce to typed data
234
+ b = b" " if o is None or o == 0 else o
235
+ normed_values[i] = b
236
+ L = len (b)
237
+ data_length += HEADER_LENGTH + L
238
+ lengths[i] = L
256
239
257
240
# setup output
258
- total_length = HEADER_LENGTH + data_length
259
- out = PyByteArray_FromStringAndSize(NULL , total_length)
241
+ out = PyByteArray_FromStringAndSize(NULL , data_length)
260
242
261
243
# write header
262
- data = PyByteArray_AS_STRING( out)
244
+ data = out
263
245
store_le32(< uint8_t* > data, n_items)
264
246
265
247
# second iteration, store data
266
248
data += HEADER_LENGTH
267
249
for i in range (n_items):
268
- l = lengths[i]
269
- store_le32(< uint8_t* > data, l )
250
+ L = lengths[i]
251
+ store_le32(< uint8_t* > data, L )
270
252
data += HEADER_LENGTH
271
- encv = PyBytes_AS_STRING(values [i])
272
- memcpy(data, encv, l )
273
- data += l
253
+ b = normed_values [i]
254
+ memcpy(data, < const char * > b, L )
255
+ data += L
274
256
275
257
return out
276
258
@@ -282,16 +264,14 @@ class VLenBytes(Codec):
282
264
const Py_buffer* buf_pb
283
265
const char * data
284
266
const char * data_end
285
- Py_ssize_t i, l , n_items, data_length
267
+ Py_ssize_t i, L , n_items, data_length
286
268
287
269
# obtain memoryview
288
270
buf = ensure_contiguous_ndarray(buf)
289
- buf_mv = memoryview (buf)
271
+ buf_mv = ensure_continguous_memoryview (buf)
290
272
buf_pb = PyMemoryView_GET_BUFFER(buf_mv)
291
273
292
274
# sanity checks
293
- if not PyBuffer_IsContiguous(buf_pb, b' A' ):
294
- raise BufferError(" `buf` must contain contiguous memory" )
295
275
if buf_pb.len < HEADER_LENGTH:
296
276
raise ValueError (' corrupt buffer, missing or truncated header' )
297
277
@@ -315,12 +295,12 @@ class VLenBytes(Codec):
315
295
for i in range (n_items):
316
296
if data + HEADER_LENGTH > data_end:
317
297
raise ValueError (' corrupt buffer, data seem truncated' )
318
- l = load_le32(< uint8_t* > data)
298
+ L = load_le32(< uint8_t* > data)
319
299
data += HEADER_LENGTH
320
- if data + l > data_end:
300
+ if data + L > data_end:
321
301
raise ValueError (' corrupt buffer, data seem truncated' )
322
- out[i] = PyBytes_FromStringAndSize(data, l )
323
- data += l
302
+ out[i] = PyBytes_FromStringAndSize(data, L )
303
+ data += L
324
304
325
305
return out
326
306
@@ -369,17 +349,16 @@ class VLenArray(Codec):
369
349
@ cython.boundscheck (False )
370
350
def encode (self , buf ):
371
351
cdef:
372
- Py_ssize_t i, l , n_items, data_length, total_length
352
+ Py_ssize_t i, L , n_items, data_length
373
353
object [:] values
374
354
object [:] normed_values
375
355
int [:] lengths
376
- const char * encv
377
356
bytes b
378
357
bytearray out
379
358
char * data
380
359
memoryview value_mv
381
360
const Py_buffer* value_pb
382
- object v
361
+ object o
383
362
384
363
# normalise input
385
364
values = np.asarray(buf, dtype = object ).reshape(- 1 , order = ' A' )
@@ -392,41 +371,41 @@ class VLenArray(Codec):
392
371
lengths = np.empty(n_items, dtype = np.intc)
393
372
394
373
# first iteration to convert to bytes
395
- data_length = 0
374
+ data_length = HEADER_LENGTH
396
375
for i in range (n_items):
397
- v = values[i]
398
- if v is None :
399
- v = np.array([], dtype = self .dtype)
400
- else :
401
- v = np.ascontiguousarray(v, self .dtype)
402
- if v.ndim != 1 :
403
- raise ValueError (' only 1-dimensional arrays are supported' )
404
- l = v.nbytes
405
- normed_values[i] = v
406
- data_length += l + HEADER_LENGTH
407
- lengths[i] = l
376
+ o = values[i]
377
+ # replace missing value and coerce to typed data
378
+ value_mv = ensure_continguous_memoryview(
379
+ np.array([], dtype = self .dtype) if o is None
380
+ else np.ascontiguousarray(o, self .dtype)
381
+ )
382
+ value_pb = PyMemoryView_GET_BUFFER(value_mv)
383
+ if value_pb.ndim != 1 :
384
+ raise ValueError (" only 1-dimensional arrays are supported" )
385
+ L = value_pb.len
386
+ normed_values[i] = value_mv
387
+ data_length += HEADER_LENGTH + L
388
+ lengths[i] = L
408
389
409
390
# setup output
410
- total_length = HEADER_LENGTH + data_length
411
- out = PyByteArray_FromStringAndSize(NULL , total_length)
391
+ out = PyByteArray_FromStringAndSize(NULL , data_length)
412
392
413
393
# write header
414
- data = PyByteArray_AS_STRING( out)
394
+ data = out
415
395
store_le32(< uint8_t* > data, n_items)
416
396
417
397
# second iteration, store data
418
398
data += HEADER_LENGTH
419
399
for i in range (n_items):
420
- l = lengths[i]
421
- store_le32(< uint8_t* > data, l )
400
+ L = lengths[i]
401
+ store_le32(< uint8_t* > data, L )
422
402
data += HEADER_LENGTH
423
403
424
- value_mv = ensure_continguous_memoryview( normed_values[i])
404
+ value_mv = normed_values[i]
425
405
value_pb = PyMemoryView_GET_BUFFER(value_mv)
426
- encv = < const char * > value_pb.buf
427
406
428
- memcpy(data, encv, l )
429
- data += l
407
+ memcpy(data, value_pb.buf, L )
408
+ data += L
430
409
431
410
return out
432
411
@@ -441,16 +420,14 @@ class VLenArray(Codec):
441
420
object v
442
421
memoryview v_mv
443
422
Py_buffer* v_pb
444
- Py_ssize_t i, l , n_items, data_length
423
+ Py_ssize_t i, L , n_items, data_length
445
424
446
425
# obtain memoryview
447
426
buf = ensure_contiguous_ndarray(buf)
448
- buf_mv = memoryview (buf)
427
+ buf_mv = ensure_continguous_memoryview (buf)
449
428
buf_pb = PyMemoryView_GET_BUFFER(buf_mv)
450
429
451
430
# sanity checks
452
- if not PyBuffer_IsContiguous(buf_pb, b' A' ):
453
- raise BufferError(" `buf` must contain contiguous memory" )
454
431
if buf_pb.len < HEADER_LENGTH:
455
432
raise ValueError (' corrupt buffer, missing or truncated header' )
456
433
@@ -474,18 +451,18 @@ class VLenArray(Codec):
474
451
for i in range (n_items):
475
452
if data + HEADER_LENGTH > data_end:
476
453
raise ValueError (' corrupt buffer, data seem truncated' )
477
- l = load_le32(< uint8_t* > data)
454
+ L = load_le32(< uint8_t* > data)
478
455
data += HEADER_LENGTH
479
- if data + l > data_end:
456
+ if data + L > data_end:
480
457
raise ValueError (' corrupt buffer, data seem truncated' )
481
458
482
459
# Create & fill array value
483
- v = np.empty((l ,), dtype = " uint8" ).view(self .dtype)
460
+ v = np.empty((L ,), dtype = " uint8" ).view(self .dtype)
484
461
v_mv = memoryview(v)
485
462
v_pb = PyMemoryView_GET_BUFFER(v_mv)
486
- memcpy(v_pb.buf, data, l )
463
+ memcpy(v_pb.buf, data, L )
487
464
488
465
out[i] = v
489
- data += l
466
+ data += L
490
467
491
468
return out
0 commit comments