Skip to content

Commit 60fc4a0

Browse files
committed
Improve performance of writing raw UTF-8 encoded byte arrays
The output escape table covers just 7 bits, meaning that a raw UTF-8 byte cannot be used to index into the table without a branch that tests for negative bytes (i.e. bytes whose unsigned value is larger than 0x7F). This extra check occurs in a tight loop and can be avoided if the lookup table covers all 8-bit indices. This commit introduces ad-hoc logic in `UTF8JsonGenerator#writeUTF8String` to create an extended copy of `_outputEscapes` if necessary, writing the copy back into the field to avoid having to compute it again (unless the field is changed). This ad-hoc strategy was chosen because it is the least disruptive to existing code: a larger-scale change around `CharacterEscapes` would impact the public API or otherwise introduce subtle chances for breakage.
1 parent ec5259c commit 60fc4a0

File tree

2 files changed

+234
-15
lines changed

2 files changed

+234
-15
lines changed

src/main/java/tools/jackson/core/json/UTF8JsonGenerator.java

+31-15
Original file line numberDiff line numberDiff line change
@@ -663,11 +663,16 @@ public JsonGenerator writeUTF8String(byte[] text, int offset, int len) throws Ja
663663
_flushBuffer();
664664
}
665665
_outputBuffer[_outputTail++] = _quoteChar;
666+
667+
// When writing raw UTF-8 encoded bytes, it is beneficial if the escaping table can directly be indexed into
668+
// using the byte value.
669+
final int[] extendedOutputEscapes = _extendOutputEscapesTo8Bits();
670+
666671
// One or multiple segments?
667672
if (len <= _outputMaxContiguous) {
668-
_writeUTF8Segment(text, offset, len);
673+
_writeUTF8Segment(text, offset, len, extendedOutputEscapes);
669674
} else {
670-
_writeUTF8Segments(text, offset, len);
675+
_writeUTF8Segments(text, offset, len, extendedOutputEscapes);
671676
}
672677
if (_outputTail >= _outputEnd) {
673678
_flushBuffer();
@@ -1885,28 +1890,27 @@ private final int _handleLongCustomEscape(byte[] outputBuffer, int outputPtr, in
18851890
* to fit in the output buffer after escaping; as such, we just need to
18861891
* chunk writes.
18871892
*/
1888-
private final void _writeUTF8Segments(byte[] utf8, int offset, int totalLen)
1893+
private final void _writeUTF8Segments(byte[] utf8, int offset, int totalLen, final int[] extendedOutputEscapes)
18891894
throws JacksonException
18901895
{
18911896
do {
18921897
int len = Math.min(_outputMaxContiguous, totalLen);
1893-
_writeUTF8Segment(utf8, offset, len);
1898+
_writeUTF8Segment(utf8, offset, len, extendedOutputEscapes);
18941899
offset += len;
18951900
totalLen -= len;
18961901
} while (totalLen > 0);
18971902
}
18981903

1899-
private final void _writeUTF8Segment(byte[] utf8, final int offset, final int len)
1904+
private final void _writeUTF8Segment(byte[] utf8, final int offset, final int len,
1905+
final int[] extendedOutputEscapes)
19001906
throws JacksonException
19011907
{
19021908
// fast loop to see if escaping is needed; don't copy, just look
1903-
final int[] escCodes = _outputEscapes;
1904-
19051909
for (int ptr = offset, end = offset + len; ptr < end; ) {
19061910
// 28-Feb-2011, tatu: escape codes originally covered just the 7-bit range; the table passed in here spans 8 bits, so the byte is masked to an unsigned index:
1907-
int ch = utf8[ptr++];
1908-
if ((ch >= 0) && escCodes[ch] != 0) {
1909-
_writeUTF8Segment2(utf8, offset, len);
1911+
int ch = utf8[ptr++] & 0xFF;
1912+
if (extendedOutputEscapes[ch] != 0) {
1913+
_writeUTF8Segment2(utf8, offset, len, extendedOutputEscapes);
19101914
return;
19111915
}
19121916
}
@@ -1919,7 +1923,8 @@ private final void _writeUTF8Segment(byte[] utf8, final int offset, final int le
19191923
_outputTail += len;
19201924
}
19211925

1922-
private final void _writeUTF8Segment2(final byte[] utf8, int offset, int len)
1926+
private final void _writeUTF8Segment2(final byte[] utf8, int offset, int len,
1927+
final int[] extendedOutputEscapes)
19231928
throws JacksonException
19241929
{
19251930
int outputPtr = _outputTail;
@@ -1931,17 +1936,16 @@ private final void _writeUTF8Segment2(final byte[] utf8, int offset, int len)
19311936
}
19321937

19331938
final byte[] outputBuffer = _outputBuffer;
1934-
final int[] escCodes = _outputEscapes;
19351939
len += offset; // so 'len' becomes 'end'
19361940

19371941
while (offset < len) {
19381942
byte b = utf8[offset++];
1939-
int ch = b;
1940-
if (ch < 0 || escCodes[ch] == 0) {
1943+
int ch = b & 0xFF;
1944+
int escape = extendedOutputEscapes[ch];
1945+
if (escape == 0) {
19411946
outputBuffer[outputPtr++] = b;
19421947
continue;
19431948
}
1944-
int escape = escCodes[ch];
19451949
if (escape > 0) { // 2-char escape, fine
19461950
outputBuffer[outputPtr++] = BYTE_BACKSLASH;
19471951
outputBuffer[outputPtr++] = (byte) escape;
@@ -1953,6 +1957,18 @@ private final void _writeUTF8Segment2(final byte[] utf8, int offset, int len)
19531957
_outputTail = outputPtr;
19541958
}
19551959

1960+
private int[] _extendOutputEscapesTo8Bits() {
1961+
final int[] escapes = _outputEscapes;
1962+
if (escapes.length >= 0xFF) {
1963+
return escapes;
1964+
}
1965+
1966+
final int[] extended = new int[0xFF];
1967+
System.arraycopy(escapes, 0, extended, 0, escapes.length);
1968+
_outputEscapes = extended;
1969+
return extended;
1970+
}
1971+
19561972
/*
19571973
/**********************************************************************
19581974
/* Internal methods, low-level writing, base64 encoded
+203
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,203 @@
1+
package perf;
2+
3+
import tools.jackson.core.JsonGenerator;
4+
import tools.jackson.core.io.CharTypes;
5+
import tools.jackson.core.json.JsonFactory;
6+
7+
import java.io.ByteArrayOutputStream;
8+
import java.io.IOException;
9+
import java.util.ArrayList;
10+
import java.util.Arrays;
11+
import java.util.List;
12+
13+
/**
14+
* Benchmarks the performance of writing UTF-8 encoded bytes, in particular the difference between using a 7-bit wide
15+
* lookup table for escapes, versus a full 8-bit wide table. The latter is beneficial when processing encoded UTF-8
16+
* bytes, as the byte itself can directly be used as table index instead of needing an additional branch.
17+
* <p>
18+
* This benchmark implements the escaping UTF-8 write loops using both 7-bit and 8-bit tables to show their respective
19+
* differences, as well as testing {@link JsonGenerator#writeUTF8String} for benchmarking the production implementation.
20+
*
21+
* @see <a href="https://github.com/FasterXML/jackson-core/pull/1349">Github PR</a>
22+
*/
23+
public class ManualUtf8WriteTest
24+
{
25+
private String test(byte[] utf8) throws Exception
26+
{
27+
final byte[] OUTPUT = new byte[utf8.length * 2];
28+
ByteArrayOutputStream OUTPUT_STREAM = new ByteArrayOutputStream(utf8.length * 2);
29+
JsonGenerator generator = new JsonFactory().createGenerator(OUTPUT_STREAM);
30+
31+
// Let's try to guestimate suitable size, N megs of output
32+
final int REPS = (int) ((double) (80 * 1000 * 1000) / (double) utf8.length);
33+
System.out.printf("%d bytes to scan, will do %d repetitions\n",
34+
utf8.length, REPS);
35+
36+
int i = 0;
37+
int roundsDone = 0;
38+
final int TYPES = 3;
39+
final int WARMUP_ROUNDS = 5;
40+
final int ROUNDS = WARMUP_ROUNDS + 10;
41+
42+
final long[] times = new long[TYPES];
43+
44+
while (i < ROUNDS * TYPES) {
45+
int round = i++ % TYPES;
46+
47+
String msg;
48+
49+
long msecs;
50+
switch (round) {
51+
case 0:
52+
msg = "Write UTF-8 [7-bit escaping table]";
53+
msecs = writeUtf8_7BitEscapingTable(REPS, utf8, OUTPUT);
54+
break;
55+
case 1:
56+
msg = "Write UTF-8 [8-bit escaping table]";
57+
msecs = writeUtf8_8BitEscapingTable(REPS, utf8, OUTPUT);
58+
break;
59+
case 2:
60+
msg = "JsonGenerator.writeUTF8String ";
61+
msecs = writeUtf8_JsonGenerator(REPS, utf8, OUTPUT_STREAM, generator);
62+
break;
63+
default:
64+
throw new Error();
65+
}
66+
// skip first 5 rounds to let results stabilize
67+
if (roundsDone >= WARMUP_ROUNDS) {
68+
times[round] += msecs;
69+
}
70+
71+
System.out.printf("Test '%s' -> %3d msecs\n", msg, msecs);
72+
if (round == TYPES - 1) {
73+
++roundsDone;
74+
if ((roundsDone % 3) == 0) {
75+
System.out.println("[GC]");
76+
Thread.sleep(100L);
77+
System.gc();
78+
Thread.sleep(100L);
79+
}
80+
System.out.println();
81+
}
82+
}
83+
double den = roundsDone - WARMUP_ROUNDS;
84+
85+
return String.format("(7-bit, 8-bit, JsonGenerator): %5.1f / %5.1f / %5.1f msecs",
86+
times[0] / den, times[1] / den, times[2] / den);
87+
}
88+
89+
private final long writeUtf8_7BitEscapingTable(int REPS, byte[] input, byte[] output)
90+
{
91+
long start = System.currentTimeMillis();
92+
int[] outputEscapes = CharTypes.get7BitOutputEscapes();
93+
94+
while (--REPS >= 0) {
95+
int inOffset = 0;
96+
int outOffset = 0;
97+
int len = input.length;
98+
99+
while (inOffset < len) {
100+
byte b = input[inOffset++];
101+
int ch = b;
102+
if (ch < 0 || outputEscapes[ch] == 0) {
103+
output[outOffset++] = b;
104+
continue;
105+
}
106+
int escape = outputEscapes[ch];
107+
if (escape > 0) {
108+
output[outOffset++] = (byte) '\\';
109+
output[outOffset++] = (byte) escape;
110+
} else {
111+
throw new UnsupportedOperationException("ctrl character escapes are not covered in test");
112+
}
113+
}
114+
}
115+
long time = System.currentTimeMillis() - start;
116+
return time;
117+
}
118+
119+
private final long writeUtf8_8BitEscapingTable(int REPS, byte[] input, byte[] output)
120+
{
121+
long start = System.currentTimeMillis();
122+
123+
int[] outputEscapes = CharTypes.get7BitOutputEscapes();
124+
int[] extendedOutputEscapes = new int[0xFF];
125+
System.arraycopy(outputEscapes, 0, extendedOutputEscapes, 0, outputEscapes.length);
126+
127+
while (--REPS >= 0) {
128+
int inOffset = 0;
129+
int outOffset = 0;
130+
int len = input.length;
131+
132+
while (inOffset < len) {
133+
byte b = input[inOffset++];
134+
int ch = b & 0xFF;
135+
int escape = extendedOutputEscapes[ch];
136+
if (escape == 0) {
137+
output[outOffset++] = b;
138+
continue;
139+
}
140+
if (escape > 0) {
141+
output[outOffset++] = (byte) '\\';
142+
output[outOffset++] = (byte) escape;
143+
} else {
144+
throw new UnsupportedOperationException("ctrl character escapes are not covered in test");
145+
}
146+
}
147+
}
148+
149+
long time = System.currentTimeMillis() - start;
150+
return time;
151+
}
152+
153+
private final long writeUtf8_JsonGenerator(int REPS, byte[] input, ByteArrayOutputStream output, JsonGenerator generator) throws IOException {
154+
long start = System.currentTimeMillis();
155+
156+
while (--REPS >= 0) {
157+
output.reset();
158+
generator.writeUTF8String(input, 0, input.length);
159+
generator.flush();
160+
}
161+
162+
long time = System.currentTimeMillis() - start;
163+
return time;
164+
}
165+
166+
public static void main(String[] args) throws Exception
167+
{
168+
if (args.length != 0) {
169+
System.err.println("Usage: java ...");
170+
System.exit(1);
171+
}
172+
173+
final int[] LENGTHS = new int[]{8, 16, 32, 256, 512, 1024, 1024 * 8};
174+
final String[] ESCAPE_VARIANTS = new String[] {"none", "start", "end"};
175+
final List<String> results = new ArrayList<String>();
176+
for (int length : LENGTHS){
177+
final byte[] buffer = new byte[length];
178+
179+
for (int j = 0; j < ESCAPE_VARIANTS.length; j++) {
180+
Arrays.fill(buffer, (byte) 'a');
181+
182+
if (j == 1) {
183+
buffer[0] = '"';
184+
} else if (j == 2) {
185+
buffer[buffer.length - 1] = '"';
186+
}
187+
188+
String LABEL = String.format("Length %4d, %5s escape", length, ESCAPE_VARIANTS[j]);
189+
190+
System.out.printf("Starting %s %n", LABEL);
191+
String result = new ManualUtf8WriteTest().test(buffer);
192+
System.out.printf("Finished %s %n", LABEL);
193+
System.out.println("================================================================================");
194+
195+
results.add(String.format("%s: %s", LABEL, result));
196+
}
197+
}
198+
199+
for (String result : results) {
200+
System.out.println(result);
201+
}
202+
}
203+
}

0 commit comments

Comments
 (0)