Commit b90f16b

Add json2 codec (#153)
* Implement json2 codec
* Update registry
* Clean up
* format
* Update
* Implement options and errors
* Format
* Test for ensure_ascii
* Fmt
* Update
* Changeset

1 parent: 4d2fe82 · commit: b90f16b
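
For orientation, a minimal usage sketch of the codec this commit adds (not part of the diff; the relative import path and the { data, shape, stride } chunk shape are taken from the test file below):

  // Hypothetical example; assumes only the JsonCodec API shown in this commit.
  import { JsonCodec } from "../src/codecs/json2.js";

  const codec = new JsonCodec({ encoding: "utf-8" });
  // Encode a 1-D object (string) chunk to numcodecs-compatible JSON bytes...
  const bytes = codec.encode({ data: ["a", "b"], shape: [2], stride: [1] });
  // ...and decode them back into a { data, shape, stride } chunk.
  const roundTripped = codec.decode(bytes);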

File tree

5 files changed: +378 −1 lines changed

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
---
"@zarrita/core": patch
---

Add json2 codec.
Lines changed: 184 additions & 0 deletions
@@ -0,0 +1,184 @@
import { describe, expect, test } from "vitest";

import { JsonCodec } from "../src/codecs/json2.js";

describe("JsonCodec", () => {
  test("can decode", () => {
    // from numcodecs.json import JSON
    // import numpy as np
    // json_codec = JSON()
    // json_codec.encode(np.array(['ASC1', 'ASC2', 'END', 'GABA1', 'GABA2', 'MG', 'NSC', 'ODC1', 'OPC', 'Unclassified', 'exCA1', 'exCA3', 'exDG', 'exPFC1', 'exPFC2'], dtype=object))
    const encodedStr =
      `["ASC1","ASC2","END","GABA1","GABA2","MG","NSC","ODC1","OPC","Unclassified","exCA1","exCA3","exDG","exPFC1","exPFC2","|O",[15]]`;
    const encodedBytes = new TextEncoder().encode(encodedStr);
    const jsonCodec = new JsonCodec({ encoding: "utf-8" });
    const decodedResult = jsonCodec.decode(encodedBytes);
    expect(decodedResult).toStrictEqual({
      data: [
        "ASC1",
        "ASC2",
        "END",
        "GABA1",
        "GABA2",
        "MG",
        "NSC",
        "ODC1",
        "OPC",
        "Unclassified",
        "exCA1",
        "exCA3",
        "exDG",
        "exPFC1",
        "exPFC2",
      ],
      shape: [15],
      stride: [1],
    });
  });
  test("can encode", () => {
    const encodedStr =
      `["ASC1","ASC2","END","GABA1","GABA2","MG","NSC","ODC1","OPC","Unclassified","exCA1","exCA3","exDG","exPFC1","exPFC2","|O",[15]]`;
    const encodedBytes = new TextEncoder().encode(encodedStr);

    const chunk = {
      data: [
        "ASC1",
        "ASC2",
        "END",
        "GABA1",
        "GABA2",
        "MG",
        "NSC",
        "ODC1",
        "OPC",
        "Unclassified",
        "exCA1",
        "exCA3",
        "exDG",
        "exPFC1",
        "exPFC2",
      ],
      shape: [15],
      stride: [1],
    };
    const jsonCodec = new JsonCodec({ encoding: "utf-8" });
    const encodedResult = jsonCodec.encode(chunk);
    expect(encodedResult).toStrictEqual(encodedBytes);
  });

  test("throws on decode when !strict", () => {
    const encodedStr = `["A","B","C","|O",[3]]`;
    const encodedBytes = new TextEncoder().encode(encodedStr);
    const jsonCodec = new JsonCodec({ strict: false });
    expect(() => jsonCodec.decode(encodedBytes)).toThrowError();
  });

  test("throws on encode with non-supported encoding", () => {
    const chunk = {
      data: ["A", "B", "C"],
      shape: [3],
      stride: [1],
    };
    const jsonCodec = new JsonCodec({ check_circular: false });
    expect(() => jsonCodec.encode(chunk)).toThrowError();
  });
  test("throws on encode with !check_circular", () => {
    const chunk = {
      data: ["A", "B", "C"],
      shape: [3],
      stride: [1],
    };
    const jsonCodec = new JsonCodec({ check_circular: false });
    expect(() => jsonCodec.encode(chunk)).toThrowError();
  });
  test("throws on encode with check_circular and circular reference", () => {
    let data: any[] = ["A", null];
    data[1] = data;
    const chunk = {
      data,
      shape: [2],
      stride: [1],
    };
    const jsonCodec = new JsonCodec({ check_circular: true });
    expect(() => jsonCodec.encode(chunk)).toThrowError();
  });
  test("supports !allow_nan", () => {
    const chunk = {
      data: [1, 2, NaN],
      shape: [3],
      stride: [1],
    };
    const jsonCodec = new JsonCodec({ allow_nan: false });
    expect(() => jsonCodec.encode(chunk)).toThrowError();
  });
  test("supports sort_keys", () => {
    const chunk = {
      data: [{ "1": 1, "3": 3, "2": 2 }],
      shape: [1],
      stride: [1],
    };
    const jsonCodec = new JsonCodec({ sort_keys: true });
    const decodedChunk = jsonCodec.decode(jsonCodec.encode(chunk));
    expect(Object.keys(decodedChunk.data[0])).toEqual(["1", "2", "3"]);
  });
  test("supports ensure_ascii", () => {
    const chunk = {
      data: ["£"],
      shape: [1],
      stride: [1],
    };
    const jsonCodec = new JsonCodec({ ensure_ascii: true });
    const encodedChunk = jsonCodec.encode(chunk);
    const decodedChunk = jsonCodec.decode(encodedChunk);
    expect(decodedChunk.data).toEqual(["£"]);
    expect(Array.from(encodedChunk)).toEqual([
      91, 34, 92, 117, 48, 48, 97, 51, 34, 44, 34, 124, 79, 34, 44, 91, 49, 93, 93,
    ]);
  });
  test("supports !ensure_ascii", () => {
    const chunk = {
      data: ["£"],
      shape: [1],
      stride: [1],
    };
    const jsonCodec = new JsonCodec({ ensure_ascii: false });
    const encodedChunk = jsonCodec.encode(chunk);
    const decodedChunk = jsonCodec.decode(encodedChunk);
    expect(decodedChunk.data).toEqual(["£"]);
    expect(Array.from(encodedChunk)).toEqual([
      91, 34, 194, 163, 34, 44, 34, 124, 79, 34, 44, 91, 49, 93, 93,
    ]);
  });
});

packages/core/src/codecs.ts

Lines changed: 3 additions & 1 deletion
@@ -5,6 +5,7 @@ import { TransposeCodec } from "./codecs/transpose.js";
 import { BytesCodec } from "./codecs/bytes.js";
 import { Crc32cCodec } from "./codecs/crc32c.js";
 import { VLenUTF8 } from "./codecs/vlen-utf8.js";
+import { JsonCodec } from "./codecs/json2.js";

 type ChunkMetadata<D extends DataType> = {
   data_type: D;
@@ -30,7 +31,8 @@ function create_default_registry(): Map<
     .set("transpose", () => TransposeCodec)
     .set("bytes", () => BytesCodec)
     .set("crc32c", () => Crc32cCodec)
-    .set("vlen-utf8", () => VLenUTF8);
+    .set("vlen-utf8", () => VLenUTF8)
+    .set("json2", () => JsonCodec);
 }

 export const registry = create_default_registry();
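
A sketch of how the updated registry might be consumed; only the Map entries above come from this commit, the lookup-and-fromConfig flow is illustrative:

  import { registry } from "./codecs.js"; // hypothetical consumer inside @zarrita/core

  const loadJson2 = registry.get("json2"); // () => JsonCodec, per the .set() call above
  if (loadJson2) {
    const codec = loadJson2().fromConfig({ encoding: "utf-8" });
    // codec.decode(bytes) now yields a { data, shape, stride } chunk.
  }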

packages/core/src/codecs/json2.ts

Lines changed: 185 additions & 0 deletions
@@ -0,0 +1,185 @@
// Adapted from https://github.com/hms-dbmi/vizarr/blob/5b0e3ea6fbb42d19d0e38e60e49bb73d1aca0693/src/utils.ts#L26
import type { Chunk, ObjectType } from "../metadata.js";
import { get_strides, json_decode_object } from "../util.js";

type EncoderConfig = {
  encoding?: "utf-8";
  skipkeys?: boolean;
  ensure_ascii?: boolean;
  check_circular?: boolean;
  allow_nan?: boolean;
  sort_keys?: boolean;
  indent?: number;
  separators?: [string, string];
};
type DecoderConfig = {
  strict?: boolean;
};

type JsonCodecConfig = EncoderConfig & DecoderConfig;

// Reference: https://stackoverflow.com/a/21897413
function throw_on_nan_replacer(_key: string | number, value: any): any {
  if (value !== value) {
    throw new Error(
      "JsonCodec allow_nan is false but NaN was encountered during encoding.",
    );
  }

  if (value === Infinity) {
    throw new Error(
      "JsonCodec allow_nan is false but Infinity was encountered during encoding.",
    );
  }

  if (value === -Infinity) {
    throw new Error(
      "JsonCodec allow_nan is false but -Infinity was encountered during encoding.",
    );
  }
  return value;
}

// Reference: https://gist.github.com/davidfurlong/463a83a33b70a3b6618e97ec9679e490
function sort_keys_replacer(_key: string | number, value: any): any {
  return value instanceof Object && !(value instanceof Array)
    ? Object.keys(value)
        .sort()
        .reduce((sorted: any, key: string | number) => {
          sorted[key] = value[key];
          return sorted;
        }, {})
    : value;
}

export class JsonCodec {
  kind = "array_to_bytes";

  #encoder_config: EncoderConfig;
  #decoder_config: DecoderConfig;

  constructor(
    public configuration: JsonCodecConfig,
  ) {
    // Reference: https://github.com/zarr-developers/numcodecs/blob/0878717a3613d91a453fe3d3716aa9c67c023a8b/numcodecs/json.py#L36
    const {
      encoding = "utf-8",
      skipkeys = false,
      ensure_ascii = true,
      check_circular = true,
      allow_nan = true,
      sort_keys = true,
      indent,
      strict = true,
    } = configuration;

    let separators = configuration.separators;
    if (!separators) {
      // Ensure separators are explicitly specified for consistent behaviour
      // across Python versions, and use the most compact representation
      // when indent is not set.
      if (!indent) {
        separators = [",", ":"];
      } else {
        separators = [", ", ": "];
      }
    }

    this.#encoder_config = {
      encoding,
      skipkeys,
      ensure_ascii,
      check_circular,
      allow_nan,
      indent,
      separators,
      sort_keys,
    };
    this.#decoder_config = { strict };
  }
  static fromConfig(
    configuration: JsonCodecConfig,
  ) {
    return new JsonCodec(configuration);
  }

  encode(buf: Chunk<ObjectType>): Uint8Array {
    const {
      indent,
      encoding,
      ensure_ascii,
      check_circular,
      allow_nan,
      sort_keys,
    } = this.#encoder_config;
    if (encoding !== "utf-8") {
      throw new Error("JsonCodec does not yet support non-utf-8 encoding.");
    }
    const replacer_functions: Function[] = [];
    if (!check_circular) {
      // By default, JSON.stringify throws a TypeError if one attempts to
      // encode an object with circular references.
      throw new Error(
        "JsonCodec does not yet support skipping the check for circular references during encoding.",
      );
    }
    if (!allow_nan) {
      // Throw if NaN/Infinity/-Infinity are encountered during encoding.
      replacer_functions.push(throw_on_nan_replacer);
    }
    if (sort_keys) {
      // We can ensure keys are sorted but not really the opposite since
      // there is no guarantee of key ordering in JS.
      replacer_functions.push(sort_keys_replacer);
    }

    const items = Array.from(buf.data);
    items.push("|O");
    items.push(buf.shape);

    let replacer = undefined;
    if (replacer_functions.length) {
      replacer = function (key: string | number, value: any): any {
        let new_value = value;
        replacer_functions.forEach((sub_replacer) => {
          new_value = sub_replacer(key, new_value);
        });
        return new_value;
      };
    }
    let json_str = JSON.stringify(items, replacer, indent);

    if (ensure_ascii) {
      // If ensure_ascii is true (the default), the output is guaranteed
      // to have all incoming non-ASCII characters escaped.
      // If ensure_ascii is false, these characters will be output as-is.
      // Reference: https://stackoverflow.com/a/31652607
      json_str = json_str.replace(/[\u007F-\uFFFF]/g, function (chr) {
        const full_str = "0000" + chr.charCodeAt(0).toString(16);
        const sub_str = full_str.substring(full_str.length - 4);
        return "\\u" + sub_str;
      });
    }
    return new TextEncoder().encode(json_str);
  }

  decode(bytes: Uint8Array): Chunk<ObjectType> {
    const { strict } = this.#decoder_config;
    if (!strict) {
      // (i.e., allowing control characters inside strings)
      throw new Error(
        "JsonCodec does not yet support non-strict decoding.",
      );
    }
    const items = json_decode_object(bytes);
    const shape = items.pop();
    items.pop(); // Pop off dtype (unused).
    if (!shape) {
      // 0-d case
      throw new Error("0D not implemented for JsonCodec.");
    } else {
      const stride = get_strides(shape, "C");
      const data = items;
      return { data, shape, stride };
    }
  }
}
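
To make the wire format concrete, a small sketch (not part of the commit) using the class defined directly above: encode() appends the numcodecs object-dtype marker "|O" and the chunk shape to the flattened data before serializing, and decode() pops them back off and recomputes C-order strides.

  const codec = new JsonCodec({ ensure_ascii: false });
  const bytes = codec.encode({ data: ["£"], shape: [1], stride: [1] });
  new TextDecoder().decode(bytes); // '["£","|O",[1]]'
  codec.decode(bytes);             // { data: ["£"], shape: [1], stride: [1] }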
