Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions release-notes/VERSION-2.x
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ a pure JSON library.

2.21.0 (not yet released)

#363: UTF-8 decoding should fail on Surrogate characters (0xD800 - 0xDFFF)
(fix by @cowtowncoder, w/ Claude code)
#1180: `JsonLocation` off for unrecognized tokens
(fix by @cowtowncoder, w/ Claude code)
#1470: Add method `copyCurrentStructureExact()` to `JsonGenerator`
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -767,6 +767,22 @@ protected <T> T _reportUnexpectedNumberChar(int ch, String comment) throws JsonP
protected void reportUnexpectedNumberChar(int ch, String comment) throws JsonParseException {
// Plain delegation: forwards both arguments unchanged to the underscore-prefixed
// variant (declared to return a generic T); any returned value is discarded here.
_reportUnexpectedNumberChar(ch, comment);
}

/**
 * Reports a surrogate code point (U+D800 through U+DFFF) that was obtained by
 * decoding raw UTF-8 bytes. Intended only for surrogates decoded from UTF-8
 * content, NOT for ones that originate from a JSON escape entity.
 *
 * @param ch Decoded character code (int) that falls in the surrogate range
 *
 * @throws JsonParseException Always thrown; message includes the offending
 *    code in hexadecimal
 *
 * @since 2.21
 */
protected void _reportInvalidUTF8Surrogate(int ch) throws JsonParseException {
    final String hexCode = Integer.toHexString(ch);
    final String msg = "Invalid UTF-8: Illegal surrogate character 0x" + hexCode;
    throw _constructReadException(msg);
}

protected void _throwInvalidSpace(int i) throws JsonParseException {
char c = (char) i;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1859,7 +1859,12 @@ private final String addName(int[] quads, int qlen, int lastQuadBytes)
_reportInvalidOther(ch2);
}
ch = (ch << 6) | (ch2 & 0x3F);
if (needed > 2) { // 4 bytes? (need surrogates on output)
// [jackson-core#363]: Surrogates (0xD800 - 0xDFFF) are illegal in UTF-8 for 3-byte sequences
if (needed == 2) {
if (ch >= 0xD800 && ch <= 0xDFFF) {
_reportInvalidUTF8Surrogate(ch);
}
} else { // 4 bytes? (need surrogates on output)
ch2 = quads[ix >> 2];
byteIx = (ix & 3);
ch2 = (ch2 >> ((3 - byteIx) << 3));
Expand Down Expand Up @@ -2678,6 +2683,10 @@ private final int _decodeUtf8_3(int c1) throws IOException
_reportInvalidOther(d & 0xFF);
}
c = (c << 6) | (d & 0x3F);
// [jackson-core#363]: Surrogates (0xD800 - 0xDFFF) are illegal in UTF-8
if (c >= 0xD800 && c <= 0xDFFF) {
_reportInvalidUTF8Surrogate(c);
}
return c;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2407,7 +2407,12 @@ private final String addName(int[] quads, int qlen, int lastQuadBytes)
_reportInvalidOther(ch2);
}
ch = (ch << 6) | (ch2 & 0x3F);
if (needed > 2) { // 4 bytes? (need surrogates on output)
// [jackson-core#363]: Surrogates (0xD800 - 0xDFFF) are illegal in UTF-8 for 3-byte sequences
if (needed == 2) {
if (ch >= 0xD800 && ch <= 0xDFFF) {
_reportInvalidUTF8Surrogate(ch);
}
} else { // 4 bytes? (need surrogates on output)
ch2 = quads[ix >> 2];
byteIx = (ix & 3);
ch2 = (ch2 >> ((3 - byteIx) << 3));
Expand Down Expand Up @@ -3481,6 +3486,10 @@ private final int _decodeUtf8_3(int c1) throws IOException
_reportInvalidOther(d & 0xFF, _inputPtr);
}
c = (c << 6) | (d & 0x3F);
// [jackson-core#363]: Surrogates (0xD800 - 0xDFFF) are illegal in UTF-8
if (c >= 0xD800 && c <= 0xDFFF) {
_reportInvalidUTF8Surrogate(c);
}
return c;
}

Expand All @@ -3497,6 +3506,10 @@ private final int _decodeUtf8_3fast(int c1) throws IOException
_reportInvalidOther(d & 0xFF, _inputPtr);
}
c = (c << 6) | (d & 0x3F);
// [jackson-core#363]: Surrogates (0xD800 - 0xDFFF) are illegal in UTF-8
if (c >= 0xD800 && c <= 0xDFFF) {
_reportInvalidUTF8Surrogate(c);
}
return c;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -768,7 +768,12 @@ protected final String _addName(int[] quads, int qlen, int lastQuadBytes)
_reportInvalidOther(ch2);
}
ch = (ch << 6) | (ch2 & 0x3F);
if (needed > 2) { // 4 bytes? (need surrogates on output)
// [jackson-core#363]: Surrogates (0xD800 - 0xDFFF) are illegal in UTF-8 for 3-byte sequences
if (needed == 2) {
if (ch >= 0xD800 && ch <= 0xDFFF) {
_reportInvalidUTF8Surrogate(ch);
}
} else { // 4 bytes? (need surrogates on output)
ch2 = quads[ix >> 2];
byteIx = (ix & 3);
ch2 = (ch2 >> ((3 - byteIx) << 3));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2828,7 +2828,12 @@ private final boolean _decodeSplitUTF8_3(int prev, int prevCount, int next)
if ((next & 0xC0) != 0x080) {
_reportInvalidOther(next & 0xFF, _inputPtr);
}
_textBuffer.append((char) ((prev << 6) | (next & 0x3F)));
int c = (prev << 6) | (next & 0x3F);
// [jackson-core#363]: Surrogates (0xD800 - 0xDFFF) are illegal in UTF-8
if (c >= 0xD800 && c <= 0xDFFF) {
_reportInvalidUTF8Surrogate(c);
}
_textBuffer.append((char) c);
return true;
}

Expand Down Expand Up @@ -2974,7 +2979,12 @@ private final int _decodeUTF8_3(int c, int d, int e) throws IOException
if ((e & 0xC0) != 0x080) {
_reportInvalidOther(e & 0xFF, _inputPtr);
}
return (c << 6) | (e & 0x3F);
c = (c << 6) | (e & 0x3F);
// [jackson-core#363]: Surrogates (0xD800 - 0xDFFF) are illegal in UTF-8
if (c >= 0xD800 && c <= 0xDFFF) {
_reportInvalidUTF8Surrogate(c);
}
return c;
}

// @return Character value <b>minus 0x10000</b>; this so that caller
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,194 @@
package com.fasterxml.jackson.core.read;

import org.junit.jupiter.api.Test;

import com.fasterxml.jackson.core.*;
import com.fasterxml.jackson.core.exc.StreamReadException;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.fail;

/**
 * Tests for [jackson-core#363]: UTF-8 parser should reject 3-byte UTF-8 sequences
 * that encode surrogate code points (U+D800 to U+DFFF), which are illegal in UTF-8.
 *<p>
 * All String-value tests use the document <code>{"value":"X"}</code> where
 * <code>X</code> is the 3-byte sequence under test; shared scaffolding lives in
 * the private helper methods at the end of the class.
 */
class UTF8SurrogateValidation363Test extends JUnit5TestBase
{
    // Surrogate-specific part of the failure message. Matching on this, rather
    // than only the generic "Invalid UTF-8" prefix, ensures the tests pass
    // because of the surrogate check and not some other UTF-8 decoding problem
    // that happens to share the prefix.
    private final static String MSG_ILLEGAL_SURROGATE =
            "Invalid UTF-8: Illegal surrogate character";

    private final JsonFactory FACTORY = newStreamFactory();

    /*
    /**********************************************************************
    /* Failing cases: surrogates encoded as 3-byte UTF-8
    /**********************************************************************
     */

    /**
     * Test that parser rejects 3-byte UTF-8 sequence encoding U+D800 (start of surrogate range).
     * In UTF-8, U+D800 would be encoded as: ED A0 80
     */
    @Test
    void rejectSurrogateD800InString() throws Exception
    {
        _verifySurrogateRejectedInString(0xED, 0xA0, 0x80);
    }

    /**
     * Test that parser rejects 3-byte UTF-8 sequence encoding U+DFFF (end of surrogate range).
     * In UTF-8, U+DFFF would be encoded as: ED BF BF
     */
    @Test
    void rejectSurrogateDFFFInString() throws Exception
    {
        _verifySurrogateRejectedInString(0xED, 0xBF, 0xBF);
    }

    /**
     * Test that parser rejects 3-byte UTF-8 sequence encoding U+DABC (middle of surrogate range).
     * In UTF-8, U+DABC would be encoded as: ED AA BC
     */
    @Test
    void rejectSurrogateMiddleInString() throws Exception
    {
        _verifySurrogateRejectedInString(0xED, 0xAA, 0xBC);
    }

    /**
     * Test that parser rejects surrogate in field name as well.
     */
    @Test
    void rejectSurrogateInFieldName() throws Exception
    {
        // JSON: {"X":"value"}
        // where X is the invalid 3-byte sequence ED A0 80 (U+D800)
        byte[] doc = new byte[] {
            '{', '"',
            (byte) 0xED, (byte) 0xA0, (byte) 0x80, // Invalid: U+D800 surrogate
            '"', ':', '"', 'v', 'a', 'l', 'u', 'e', '"',
            '}'
        };

        try (JsonParser p = FACTORY.createParser(doc)) {
            assertToken(JsonToken.START_OBJECT, p.nextToken());

            // This should fail when trying to read the field name
            // (no lazy parsing for names)
            assertToken(JsonToken.FIELD_NAME, p.nextToken());
            fail("Should have thrown an exception for surrogate code point in UTF-8");
        } catch (StreamReadException e) {
            verifyException(e, MSG_ILLEGAL_SURROGATE);
        }
    }

    /*
    /**********************************************************************
    /* Passing cases: code points adjacent to the surrogate range
    /**********************************************************************
     */

    /**
     * Sanity check: valid 3-byte UTF-8 sequences just before surrogate range should work.
     * U+D7FF is the last valid code point before the surrogate range.
     * In UTF-8: ED 9F BF
     */
    @Test
    void acceptValidBeforeSurrogateRange() throws Exception
    {
        _verifyAcceptedInString("\uD7FF", 0xED, 0x9F, 0xBF);
    }

    /**
     * Sanity check: valid 3-byte UTF-8 sequences just after surrogate range should work.
     * U+E000 is the first valid code point after the surrogate range.
     * In UTF-8: EE 80 80
     */
    @Test
    void acceptValidAfterSurrogateRange() throws Exception
    {
        _verifyAcceptedInString("\uE000", 0xEE, 0x80, 0x80);
    }

    /*
    /**********************************************************************
    /* Helper methods
    /**********************************************************************
     */

    /**
     * Builds the UTF-8 document <code>{"value":"X"}</code> where <code>X</code>
     * consists of the three given bytes, taken as-is (no validation).
     *
     * @param b1 First byte of the sequence (only low 8 bits are used)
     * @param b2 Second byte of the sequence (only low 8 bits are used)
     * @param b3 Third byte of the sequence (only low 8 bits are used)
     *
     * @return Encoded document
     */
    private byte[] _docWithStringValue(int b1, int b2, int b3)
    {
        return new byte[] {
            '{', '"', 'v', 'a', 'l', 'u', 'e', '"', ':',
            '"',
            (byte) b1, (byte) b2, (byte) b3,
            '"',
            '}'
        };
    }

    /**
     * Parses <code>{"value":"X"}</code> with <code>X</code> being the given
     * 3-byte sequence, and verifies that accessing the String value fails with
     * the surrogate-specific error message.
     */
    private void _verifySurrogateRejectedInString(int b1, int b2, int b3) throws Exception
    {
        try (JsonParser p = FACTORY.createParser(_docWithStringValue(b1, b2, b3))) {
            assertToken(JsonToken.START_OBJECT, p.nextToken());
            assertToken(JsonToken.FIELD_NAME, p.nextToken());
            assertEquals("value", p.currentName());

            // This should fail when trying to read the string value
            assertToken(JsonToken.VALUE_STRING, p.nextToken());
            p.getText(); // Actual parsing happens here (lazy parsing)
            fail("Should have thrown an exception for surrogate code point in UTF-8");
        } catch (StreamReadException e) {
            verifyException(e, MSG_ILLEGAL_SURROGATE);
        }
    }

    /**
     * Parses <code>{"value":"X"}</code> with <code>X</code> being the given
     * 3-byte sequence, and verifies that the String value decodes to
     * {@code expText} and that the document is then properly closed.
     */
    private void _verifyAcceptedInString(String expText, int b1, int b2, int b3) throws Exception
    {
        try (JsonParser p = FACTORY.createParser(_docWithStringValue(b1, b2, b3))) {
            assertToken(JsonToken.START_OBJECT, p.nextToken());
            assertToken(JsonToken.FIELD_NAME, p.nextToken());
            assertEquals("value", p.currentName());
            assertToken(JsonToken.VALUE_STRING, p.nextToken());
            assertEquals(expText, p.getText());
            assertToken(JsonToken.END_OBJECT, p.nextToken());
        }
    }
}