Skip to content

Improve ASCII decoding performance for CBORParser #568

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
236 changes: 178 additions & 58 deletions cbor/src/main/java/com/fasterxml/jackson/dataformat/cbor/CBORParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,19 @@
import java.math.BigInteger;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Stack;

import com.fasterxml.jackson.core.*;
import com.fasterxml.jackson.core.base.ParserMinimalBase;
import com.fasterxml.jackson.core.io.IOContext;
import com.fasterxml.jackson.core.io.NumberInput;
import com.fasterxml.jackson.core.json.DupDetector;
import com.fasterxml.jackson.core.sym.ByteQuadsCanonicalizer;
import com.fasterxml.jackson.core.util.*;
import com.fasterxml.jackson.core.util.ByteArrayBuilder;
import com.fasterxml.jackson.core.util.JacksonFeatureSet;
import com.fasterxml.jackson.core.util.TextBuffer;

import static com.fasterxml.jackson.dataformat.cbor.CBORConstants.*;

Expand Down Expand Up @@ -2289,10 +2293,9 @@ protected void _finishToken() throws IOException

if ((available >= len)
// if not, could we read? NOTE: we do not require it, just attempt to read
|| ((_inputBuffer.length >= len)
&& _tryToLoadToHaveAtLeast(len))) {
_finishShortText(len);
return;
|| _tryToLoadToHaveAtLeast(len)) {
_finishShortText(len);
return;
}
// If not enough space, need handling similar to chunked
_finishLongText(len);
Expand Down Expand Up @@ -2331,11 +2334,9 @@ protected String _finishTextToken(int ch) throws IOException
// due to inputBuffer never being even close to that big).

final int available = _inputEnd - _inputPtr;

if ((available >= len)
// if not, could we read? NOTE: we do not require it, just attempt to read
|| ((_inputBuffer.length >= len)
&& _tryToLoadToHaveAtLeast(len))) {
|| _tryToLoadToHaveAtLeast(len)) {
return _finishShortText(len);
}
// If not enough space, need handling similar to chunked
Expand Down Expand Up @@ -2364,19 +2365,22 @@ private final String _finishShortText(int len) throws IOException

// Let's actually do a tight loop for ASCII first:
final int end = _inputPtr;

int i;
while ((i = inputBuf[inPtr]) >= 0) {
int i = 0;
while (inPtr < end && i >= 0) {
i = inputBuf[inPtr++];
outBuf[outPtr++] = (char) i;
if (++inPtr == end) {
String str = _textBuffer.setCurrentAndReturn(outPtr);
if (stringRefs != null) {
stringRefs.stringRefs.add(str);
_sharedString = str;
}
return str;
}
if (inPtr == end && i >= 0) {
String str = _textBuffer.setCurrentAndReturn(outPtr);
if (stringRefs != null) {
stringRefs.stringRefs.add(str);
_sharedString = str;
}
return str;
}
// Correct extra increments
outPtr -= 1;
inPtr -= 1;
final int[] codes = UTF8_UNIT_CODES;
do {
i = inputBuf[inPtr++] & 0xFF;
Expand Down Expand Up @@ -2443,10 +2447,17 @@ private final String _finishShortText(int len) throws IOException

private final String _finishLongText(int len) throws IOException
{
char[] outBuf = _textBuffer.emptyAndGetCurrentSegment();
int outPtr = 0;
final int[] codes = UTF8_UNIT_CODES;
StringRefList stringRefs = null;
if (!_stringRefs.empty() &&
shouldReferenceString(_stringRefs.peek().stringRefs.size(), len)) {
stringRefs = _stringRefs.peek();
}
// First a tight loop for ASCII.
len = _finishLongTextAscii(len);
char[] outBuf = _textBuffer.getBufferWithoutReset();
int outPtr = _textBuffer.getCurrentSegmentSize();
int outEnd = outBuf.length;
final int[] codes = UTF8_UNIT_CODES;

while (--len >= 0) {
int c = _nextByte() & 0xFF;
Expand Down Expand Up @@ -2500,14 +2511,51 @@ private final String _finishLongText(int len) throws IOException
outBuf[outPtr++] = (char) c;
}
String str = _textBuffer.setCurrentAndReturn(outPtr);
if (!_stringRefs.empty() &&
shouldReferenceString(_stringRefs.peek().stringRefs.size(), len)) {
_stringRefs.peek().stringRefs.add(str);
if (stringRefs != null) {
stringRefs.stringRefs.add(str);
_sharedString = str;
}
return str;
}

/**
* Consumes as many ascii chars as possible in a tight loop. Returns the amount of bytes remaining.
*/
private final int _finishLongTextAscii(int len) throws IOException
{
char[] outBuf = _textBuffer.emptyAndGetCurrentSegment();
final byte[] input = _inputBuffer;
while (len > 0) {
// load as much input as possible
int size = Math.min(len, Math.min(outBuf.length, input.length));
if (!_tryToLoadToHaveAtLeast(size)) {
return len;
}
int outEnd = size;
int outPtr = 0;
int inPtr = _inputPtr;
int i = 0;
// Tight loop to copy into the output buffer, bail if a non-ascii char is found
while (outPtr < outEnd && i >= 0) {
i = input[inPtr++];
outBuf[outPtr++] = (char) i;
}
// Found a non-ascii char, correct pointers and return to the caller.
if (i < 0) {
--outPtr;
_inputPtr = inPtr - 1;
_textBuffer.setCurrentLength(outPtr);
return len - outPtr;
}
_inputPtr = inPtr;
if (outPtr >= outBuf.length) {
outBuf = _textBuffer.finishCurrentSegment();
}
len -= size;
}
return len;
}

private final void _finishChunkedText() throws IOException
{
char[] outBuf = _textBuffer.emptyAndGetCurrentSegment();
Expand All @@ -2532,7 +2580,6 @@ private final void _finishChunkedText() throws IOException
}
break;
}
_chunkLeft = len;
int end = _inputPtr + len;
if (end <= _inputEnd) { // all within buffer
_chunkLeft = 0;
Expand All @@ -2541,19 +2588,22 @@ private final void _finishChunkedText() throws IOException
_chunkLeft = (end - _inputEnd);
_chunkEnd = _inputEnd;
}
}
// besides of which just need to ensure there's content
if (_inputPtr >= _inputEnd) { // end of buffer, but not necessarily chunk
loadMoreGuaranteed();
int end = _inputPtr + _chunkLeft;
if (end <= _inputEnd) { // all within buffer
_chunkLeft = 0;
_chunkEnd = end;
} else { // stretches beyond
_chunkLeft = (end - _inputEnd);
_chunkEnd = _inputEnd;
// start of a new chunk
// First a tight loop for ASCII.
_textBuffer.setCurrentLength(outPtr);
if (_finishChunkedTextAscii()) {
// chunk fully consumed, let's get the next one
outBuf = _textBuffer.getBufferWithoutReset();
outPtr = _textBuffer.getCurrentSegmentSize();
outEnd = outBuf.length;
continue;
}
outBuf = _textBuffer.getBufferWithoutReset();
outPtr = _textBuffer.getCurrentSegmentSize();
outEnd = outBuf.length;
}
// besides of which just need to ensure there's content
_loadMoreForChunkIfNeeded();
}
int c = input[_inputPtr++] & 0xFF;
int code = codes[c];
Expand All @@ -2563,9 +2613,9 @@ private final void _finishChunkedText() throws IOException
}

switch (code) {
case 0:
break;
case 1: // 2-byte UTF
case 0:
break;
case 1: // 2-byte UTF
{
int d = _nextChunkedByte();
if ((d & 0xC0) != 0x080) {
Expand All @@ -2574,24 +2624,24 @@ private final void _finishChunkedText() throws IOException
c = ((c & 0x1F) << 6) | (d & 0x3F);
}
break;
case 2: // 3-byte UTF
c = _decodeChunkedUTF8_3(c);
break;
case 3: // 4-byte UTF
c = _decodeChunkedUTF8_4(c);
// Let's add first part right away:
if (outPtr >= outBuf.length) {
outBuf = _textBuffer.finishCurrentSegment();
outPtr = 0;
outEnd = outBuf.length;
}
outBuf[outPtr++] = (char) (0xD800 | (c >> 10));
c = 0xDC00 | (c & 0x3FF);
// And let the other char output down below
break;
default:
// Is this good enough error message?
_reportInvalidInitial(c);
case 2: // 3-byte UTF
c = _decodeChunkedUTF8_3(c);
break;
case 3: // 4-byte UTF
c = _decodeChunkedUTF8_4(c);
// Let's add first part right away:
if (outPtr >= outBuf.length) {
outBuf = _textBuffer.finishCurrentSegment();
outPtr = 0;
outEnd = outBuf.length;
}
outBuf[outPtr++] = (char) (0xD800 | (c >> 10));
c = 0xDC00 | (c & 0x3FF);
// And let the other char output down below
break;
default:
// Is this good enough error message?
_reportInvalidInitial(c);
}
// Need more room?
if (outPtr >= outEnd) {
Expand All @@ -2602,9 +2652,75 @@ private final void _finishChunkedText() throws IOException
// Ok, let's add char to output:
outBuf[outPtr++] = (char) c;
}

_textBuffer.setCurrentLength(outPtr);
}

/**
* Reads in a tight loop ASCII text until a non-ASCII char is found. If any, then it returns false to signal the
* caller that the chunk wasn't finished. The caller will keep adding to the _outBuf at the _outPtr position to
* finish the current text buffer segment
*/
private final boolean _finishChunkedTextAscii() throws IOException
{
final byte[] input = _inputBuffer;
int outPtr = _textBuffer.getCurrentSegmentSize();
char[] outBuf = _textBuffer.getBufferWithoutReset();
int outEnd = outBuf.length;
while (true) {
// besides of which just need to ensure there's content
_loadMoreForChunkIfNeeded();

// Find the size of the loop
int inSize = _chunkEnd - _inputPtr;
int outSize = outEnd - outPtr;
int inputPtr = _inputPtr;
int inputPtrEnd = _inputPtr + Math.min(inSize, outSize);
int i = 0;
// loop with copying what we can.
while (inputPtr < inputPtrEnd && i >= 0) {
i = input[inputPtr++];
char val = (char) i;
outBuf[outPtr++] = val;
}
_inputPtr = inputPtr;

if (i < 0) {
// Found a non-ascii char, correct pointers and return to the caller.
_inputPtr -= 1;
_textBuffer.setCurrentLength(outPtr - 1);
// return false to signal this to the calling code to allow the multi-byte code-path to kick.
return false;
}
// Need more room?
if (outPtr >= outEnd) {
outBuf = _textBuffer.finishCurrentSegment();
outPtr = 0;
outEnd = outBuf.length;
}
if (_inputPtr < _chunkEnd || _chunkLeft > 0) {
continue;
}
_textBuffer.setCurrentLength(outPtr);
return true;
}
}

private final void _loadMoreForChunkIfNeeded() throws IOException
{
if (_inputPtr >= _inputEnd) { // end of buffer, but not necessarily chunk
loadMoreGuaranteed();
int end = _inputPtr + _chunkLeft;
if (end <= _inputEnd) { // all within buffer
_chunkLeft = 0;
_chunkEnd = end;
} else { // stretches beyond
_chunkLeft = (end - _inputEnd);
_chunkEnd = _inputEnd;
}
}
}

private final int _nextByte() throws IOException {
int inPtr = _inputPtr;
if (inPtr < _inputEnd) {
Expand Down Expand Up @@ -3716,6 +3832,10 @@ protected final boolean _tryToLoadToHaveAtLeast(int minAvailable) throws IOExcep
if (_inputStream == null) {
return false;
}
// The code below assumes this is true, so we check it here.
if (_inputBuffer.length < minAvailable) {
return false;
}
// Need to move remaining data in front?
int amount = _inputEnd - _inputPtr;
if (amount > 0 && _inputPtr > 0) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -216,6 +216,10 @@ protected static String generateUnicodeString(int length) {
return generateUnicodeString(length, new Random(length));
}

protected static String generateUnicodeStringWithAsciiPrefix(int asciiPrefixLen, int length) {
return generateUnicodeStringWithAsciiPrefix(asciiPrefixLen, length, new Random(length));
}

protected static String generateUnicodeString(int length, Random rnd)
{
StringBuilder sw = new StringBuilder(length+10);
Expand All @@ -241,6 +245,31 @@ protected static String generateUnicodeString(int length, Random rnd)
return sw.toString();
}

protected static String generateUnicodeStringWithAsciiPrefix(int asciiLength, int length, Random rnd)
{
StringBuilder sw = new StringBuilder(length+10);
// add a prefix of ascii chars
int num = asciiLength;
while (--num >= 0) {
sw.append((char) ('A' + (num % 32)));
}
do {
// Then a unicode char of 2, 3 or 4 bytes long
switch (rnd.nextInt() % 3) {
case 0:
sw.append((char) (256 + rnd.nextInt() & 511));
break;
case 1:
sw.append((char) (2048 + rnd.nextInt() & 4095));
break;
default:
sw.append((char) (65536 + rnd.nextInt() & 0x3FFF));
break;
}
} while (sw.length() < length);
return sw.toString();
}

protected static String generateLongAsciiString(int length) {
return generateLongAsciiString(length, new Random(length));
}
Expand Down
Loading