Skip to content

Commit a24b912

Browse files
authored
Merge pull request #173 from threatgrid/entity-ids
Merged block managers
2 parents b9578eb + 852b69d commit a24b912

15 files changed

+505
-114
lines changed

deps.edn

+1
Original file line numberDiff line numberDiff line change
@@ -5,4 +5,5 @@
55
org.clojure/core.cache {:mvn/version "1.0.207"}
66
org.clojars.quoll/zuko {:mvn/version "0.6.5"}
77
org.clojure/data.priority-map {:mvn/version "1.0.0"}
8+
cheshire/cheshire {:mvn/version "5.10.0"}
89
tailrecursion/cljs-priority-map {:mvn/version "1.2.1"}}}

src/asami/durable/codec.cljc

+2-1
Original file line numberDiff line numberDiff line change
@@ -49,8 +49,9 @@
4949
;; Byte 0
5050
;; 0xxxxxxx String type, length of up to 127.
5151
;; 10xxxxxx URI type, length of up to 63
52-
;; 110xxxxx Keyword type, length of up to 32
52+
;; 1100xxxx Keyword type, length of up to 15
5353
;; For these 3 types, all remaining bytes are the data body.
54+
;; 1101xxxx Long value. xxxx encodes the number of bytes
5455
;; 111ytttt Data is of type described in tttt.
5556
;; Length is run-length encoded as follows:
5657
;; When y=0

src/asami/durable/decoder.clj

+34-17
Original file line numberDiff line numberDiff line change
@@ -41,9 +41,9 @@
4141
[^bytes data]
4242
(let [b0 (aget data 0)]
4343
(cond ;; test for short format objects
44-
(zero? (bit-and 0x80 b0)) b0
45-
(zero? (bit-and 0x40 b0)) (bit-and 0x3F b0)
46-
(zero? (bit-and 0x20 b0)) (bit-and 0x1F b0)
44+
(zero? (bit-and 0x80 b0)) b0 ;; short string
45+
(zero? (bit-and 0x40 b0)) (bit-and 0x3F b0) ;; short URI
46+
(zero? (bit-and 0x20 b0)) (bit-and 0x0F b0) ;; short keyword OR number
4747
;; First byte contains only the type information. Give a large number = 63
4848
:default 0x3F)))
4949

@@ -61,6 +61,12 @@
6161
[paged-rdr ^long pos ^long len]
6262
(keyword (read-str paged-rdr pos len)))
6363

64+
(defn read-long
65+
"Raw reading of big-endian bytes into a long"
66+
^long [paged-rdr ^long pos ^long len]
67+
(let [^bytes b (read-bytes paged-rdr pos len)]
68+
(areduce b i ret 0 (bit-or (bit-shift-left ret Byte/SIZE) (bit-and 0xFF (aget b i))))))
69+
6470
;; decoders operate on the bytes following the initial type byte information
6571
;; if the data type has variable length, then this is decoded first
6672

@@ -151,26 +157,35 @@
151157
(defn seq-decoder
152158
"This is a decoder for sequences of data. Use a vector as the sequence."
153159
[ext paged-rdr ^long pos]
160+
;; read the length of the header and the length of the seq data
154161
(let [[i len] (decode-length ext paged-rdr pos)
155162
start (+ i pos)
156163
end (+ start len)
164+
;; get the 0 byte. This contains info about the types in the seq
157165
b0 (read-byte paged-rdr start)
158166
decoder (if (zero? b0)
159-
;; heterogeneous
167+
;; heterogeneous types. Full header on every element. Read objects with size.
160168
read-object-size
161-
;; homogeneous
162-
(if-let [tdecoder (typecode->decoder (bit-and 0x0F b0))]
163-
#(tdecoder true %1 %2)
164-
(throw (ex-info "Illegal datatype in array" {:type-code (bit-and 0x0F b0)}))))]
169+
;; homogeneous types. The header is only written once
170+
(if (= 0xD0 (bit-and 0xF0 b0)) ;; homogenous numbers
171+
(let [num-len (bit-and 0x0F b0)] ;; get the byte length of all the numbers
172+
;; return a function that deserializes the number and pairs it with the length
173+
#(vector (read-long %1 %2 num-len) num-len))
174+
(if-let [tdecoder (typecode->decoder (bit-and 0x0F b0))] ;; reader for type
175+
;; the standard decoder already returns a deserialized value/length pair
176+
#(tdecoder true %1 %2)
177+
(throw (ex-info "Illegal datatype in array" {:type-code (bit-and 0x0F b0)})))))]
178+
;; iterate over the buffer deserializing until the end is reached
165179
(loop [s [] offset (inc start)]
166180
(if (>= offset end)
167-
[s (+ i len)]
168-
(let [[o obj-len] (decoder paged-rdr offset)]
181+
[s (+ i len)] ;; end of the buffer, return the seq and the number of bytes read
182+
(let [[o obj-len] (decoder paged-rdr offset)] ;; deserialize, then step forward
169183
(recur (conj s o) (+ offset obj-len)))))))
170184

171185
(defn map-decoder
172-
"A decoder for maps"
186+
"A decoder for maps. Returns the map and the bytes read."
173187
[ext paged-rdr ^long pos]
188+
;; read the map as one long seq, then split into pairs
174189
(let [[s len] (seq-decoder ext paged-rdr pos)
175190
m (into {} (map vec (partition 2 s)))]
176191
[m len]))
@@ -313,7 +328,7 @@
313328
(or (first (drop-while zero? (map compare left-body (drop 1 right-bytes)))) 0)))
314329

315330
(defn read-object-size
316-
"Reads an object from a paged-reader, at id=pos"
331+
"Reads an object from a paged-reader, at id=pos. Returns both the object and its length."
317332
[paged-rdr ^long pos]
318333
(let [b0 (read-byte paged-rdr pos)
319334
ipos (inc pos)]
@@ -322,12 +337,14 @@
322337
(zero? (bit-and 0x80 b0)) [(read-str paged-rdr ipos b0) (inc b0)]
323338
(zero? (bit-and 0x40 b0)) (let [len (bit-and 0x3F b0)]
324339
[(read-uri paged-rdr ipos len) (inc len)])
325-
(zero? (bit-and 0x20 b0)) (let [len (bit-and 0x1F b0)]
326-
[(read-keyword paged-rdr ipos len) (inc len)])
327340
;; First byte contains only the type information. Increment the returned length to include b0
328-
:default (update ((typecode->decoder (bit-and 0x0F b0) default-decoder)
329-
(zero? (bit-and 0x10 b0)) paged-rdr ipos)
330-
1 inc))))
341+
(= 0xE0 (bit-and 0xE0 b0)) (update ((typecode->decoder (bit-and 0x0F b0) default-decoder)
342+
(zero? (bit-and 0x10 b0)) paged-rdr ipos)
343+
1 inc)
344+
;; high nybble is 1100 for keywords or 1101 for long number
345+
:default (let [read-fn (if (zero? (bit-and 0x30 b0)) read-keyword read-long)
346+
len (bit-and 0x0F b0)]
347+
[(read-fn paged-rdr ipos len) (inc len)]))))
331348

332349
(defn read-object
333350
"Reads an object from a paged-reader, at id=pos"

src/asami/durable/encoder.clj

+92-23
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,11 @@
2020

2121
;; (set! *warn-on-reflection* true)
2222

23+
(def ^:dynamic *entity-offsets* nil)
24+
(def ^:dynamic *current-offset* nil)
25+
26+
(def empty-bytes (byte-array 0))
27+
2328
(def type->code
2429
{Long (byte 0)
2530
Double (byte 1)
@@ -80,12 +85,12 @@
8085
(bit-and 0xFF (bit-shift-right len 8))
8186
(bit-and 0xFF len)])))
8287

83-
;; to-bytes is required by the recursive concattenation operation
84-
(declare to-bytes)
88+
;; to-counted-bytes is required by the recursive concatenation operation
89+
(declare to-counted-bytes)
8590

8691
(defn concat-bytes
8792
"Takes multiple byte arrays and returns an array with all of the bytes concatenated"
88-
[bas]
93+
^bytes [bas]
8994
(let [len (apply + (map alength bas))
9095
output (byte-array len)]
9196
(reduce (fn [offset arr]
@@ -133,6 +138,32 @@
133138
(when (and (< l max-short-long) (> l min-short-long))
134139
(bit-and data-mask l)))
135140

141+
(def ^:dynamic *number-bytes* nil)
142+
(def ^:dynamic *number-buffer* nil)
143+
144+
(defn n-byte-number
145+
"Returns an array of n bytes representing the number x.
146+
Must be initialized for the current thread."
147+
[^long n ^long x]
148+
(.putLong *number-buffer* 0 x)
149+
(let [ret (byte-array n)]
150+
(System/arraycopy ^bytes *number-bytes* (int (- Long/BYTES n)) ret 0 (int n))
151+
ret))
152+
153+
(defn num-bytes
154+
"Determines the number of bytes that can hold a value.
155+
Takes 2-4 comparisons, with small numbers needing the fewest."
156+
[^long n]
157+
(let [f (neg? n)
158+
nn (if f (dec (- n)) n)]
159+
(if (<= nn 0x7FFF)
160+
(if (<= nn 0x7F) 1 2)
161+
(if (<= nn 0x7FFFFFFF)
162+
(if (<= nn 0x7FFFFF) 3 4)
163+
(if (<= nn 0x7FFFFFFFFFFF)
164+
(if (<= nn 0x7FFFFFFFFF) 5 6)
165+
(if (<= nn 0x7FFFFFFFFFFFFF) 7 8))))))
166+
136167
(def constant-length?
137168
"The set of types that can be encoded in a constant number of bytes. Used for homogenous sequences."
138169
#{Long Double Date Instant UUID})
@@ -167,25 +198,21 @@
167198

168199
Keyword
169200
(header [this len]
170-
(if (< len 0x20)
201+
(if (< len 0x10)
171202
(byte-array [(bit-or 0xC0 len)])
172203
(general-header (type->code Keyword) len)))
173204
(body [this]
174-
(let [nms (namespace this)
175-
n (name this)]
176-
(.getBytes (subs (str this) 1) ^Charset utf8)))
205+
(.getBytes (subs (str this) 1) ^Charset utf8))
177206
(encapsulate-id [this]
178207
(encapsulate-sstr (subs (str this) 1) skey-type-mask))
179208

180209
Long
181210
(header [this len]
182-
(assert (= len Long/BYTES))
183-
(byte-array [(bit-or 0xE0 (type->code Long))]))
211+
(assert (<= len Long/BYTES))
212+
(byte-array [(bit-or 0xD0 len)]))
184213
(body [^long this]
185-
(let [b (byte-array Long/BYTES)
186-
bb (ByteBuffer/wrap b)]
187-
(.putLong bb 0 this)
188-
b))
214+
(let [n (num-bytes this)]
215+
(n-byte-number n this)))
189216
(encapsulate-id [this]
190217
(when-let [v (encapsulate-long this)]
191218
(bit-or long-type-mask v)))
@@ -220,7 +247,7 @@
220247
(assert (= len Long/BYTES))
221248
(byte-array [(bit-or 0xE0 (type->code Date))]))
222249
(body [^Date this]
223-
(body (.getTime this)))
250+
(n-byte-number Long/BYTES (.getTime this)))
224251
(encapsulate-id [this]
225252
(when-let [v (encapsulate-long (.getTime ^Date this))]
226253
(bit-or date-type-mask v)))
@@ -257,18 +284,47 @@
257284
(general-header (type->code ISeq) len))
258285
(body [this]
259286
(if-not (seq this)
260-
(byte-array 0)
287+
empty-bytes
261288
(let [fst (first this)
262289
t (type fst)
263290
homogeneous (and (constant-length? t) (every? #(instance? t %) this))
264291
[elt-fn prefix] (if homogeneous
265-
(let [hdr (byte-array [(bit-or 0xE0 (type->code t))])]
266-
[#(vector (body %)) hdr])
267-
[to-bytes zero-array])]
268-
(->> this
269-
(mapcat elt-fn)
270-
(cons prefix)
271-
concat-bytes))))
292+
(if (= t Long)
293+
(let [elt-len (apply max (map num-bytes this))
294+
arr-hdr (byte-array [(bit-or 0xD0 elt-len)])] ;; 0xDllll is the header byte for longs
295+
;; integer homogenous arrays store the number in the header, with nil bodies
296+
[#(vector (n-byte-number elt-len %)) arr-hdr])
297+
(let [arr-hdr (byte-array [(bit-or 0xE0 (type->code t))])] ;; 0xEtttt is the header byte for typed things
298+
;; simple homogenous arrays store everything in the object header, with nil bodies
299+
[#(vector (body %)) arr-hdr]))
300+
[to-counted-bytes zero-array])
301+
;; start counting the bytes that are going into the buffer
302+
starting-offset @*current-offset*
303+
_ (vswap! *current-offset* + 3) ;; 2 bytes for a short header + 1 byte for the prefix array
304+
result (->> this
305+
;; like a mapv but records the lengths of the data as it iterates through the seq
306+
(reduce (fn [arrays x]
307+
(let [offset @*current-offset* ;; save the start, as the embedded objects will update this
308+
[head body] (elt-fn x)]
309+
;; regardless of what embedded objects have updated the *current-offset* to, change it to the
310+
;; start of the current object, plus its total size
311+
(vreset! *current-offset* (+ offset (alength head) (if body (alength body) 0)))
312+
;; add the bytes of this object to the overall result of byte arrays
313+
(cond-> (conj! arrays head)
314+
body (conj! body)))) ;; only add the body if there is one
315+
(transient [prefix]))
316+
persistent!
317+
concat-bytes)
318+
update-lengths (fn [m u]
319+
(into {} (map (fn [[k v :as kv]]
320+
(if (> v starting-offset) [k (+ v u)] kv))
321+
m)))
322+
rlen (alength result)]
323+
;; correct offsets for longer headers
324+
(cond
325+
(> rlen 0x7FFF) (vswap! *entity-offsets* update-lengths 3) ;; total 5 after the 2 already added
326+
(> rlen 0xFF) (vswap! *entity-offsets* update-lengths 1)) ;; total 3 after the 2 already added
327+
result)))
272328

273329
IPersistentVector
274330
(header [this len] (header (or (seq this) '()) len))
@@ -278,6 +334,10 @@
278334
(header [this len]
279335
(general-header (type->code IPersistentMap) len))
280336
(body [this]
337+
;; If this is an identified object, then save its location
338+
(doseq [id-attr [:db/id :db/ident :id]]
339+
(when-let [id (id-attr this)]
340+
(vswap! *entity-offsets* assoc id @*current-offset*)))
281341
(body (apply concat (seq this))))
282342

283343
Object
@@ -300,8 +360,17 @@
300360
(encapsulate-id [^asami.graph.InternalNode this]
301361
(bit-or node-type-mask (bit-and data-mask (.id this)))))
302362

303-
(defn to-bytes
363+
(defn to-counted-bytes
304364
"Returns a tuple of byte arrays, representing the header and the body"
305365
[o]
306366
(let [^bytes b (body o)]
307367
[(header o (alength b)) b]))
368+
369+
(defn to-bytes
370+
"Returns a tuple of the header byte array, the body byte array, and a map of entity ids to the offsets recorded for them"
371+
[o]
372+
(binding [*entity-offsets* (volatile! {})
373+
*current-offset* (volatile! 0)
374+
*number-bytes* (byte-array Long/BYTES)]
375+
(binding [*number-buffer* (ByteBuffer/wrap *number-bytes*)]
376+
(conj (to-counted-bytes o) @*entity-offsets*))))

0 commit comments

Comments
 (0)