diff --git a/release-notes/VERSION-2.x b/release-notes/VERSION-2.x index 6e6a33c1af..fb1250819c 100644 --- a/release-notes/VERSION-2.x +++ b/release-notes/VERSION-2.x @@ -16,6 +16,8 @@ a pure JSON library. 2.21.0 (not yet released) +#363: UTF-8 decoding should fail on Surrogate characters (0xD800 - 0xDFFF) + (fix by @cowtowncoder, w/ Claude code) #1180: `JsonLocation` off for unrecognized tokens (fix by @cowtowncoder, w/ Claude code) #1470: Add method `copyCurrentStructureExact()` to `JsonGenerator` diff --git a/src/main/java/com/fasterxml/jackson/core/base/ParserMinimalBase.java b/src/main/java/com/fasterxml/jackson/core/base/ParserMinimalBase.java index b77030557c..d052248a99 100644 --- a/src/main/java/com/fasterxml/jackson/core/base/ParserMinimalBase.java +++ b/src/main/java/com/fasterxml/jackson/core/base/ParserMinimalBase.java @@ -767,6 +767,22 @@ protected T _reportUnexpectedNumberChar(int ch, String comment) throws JsonP protected void reportUnexpectedNumberChar(int ch, String comment) throws JsonParseException { _reportUnexpectedNumberChar(ch, comment); } + + /** + * Method called to throw an exception for an invalid UTF-8 surrogate character: the case + * where a surrogate code point (between U+D800 and U+DFFF, inclusive) is decoded from UTF-8 + * bytes (but NOT from JSON entity!) 
+ * + * @param ch Code point (int) in the surrogate range that was decoded; included in the exception message as hexadecimal + * + * @throws JsonParseException Always thrown; describes the problem with the UTF-8 surrogate + * + * @since 2.21 + */ + protected void _reportInvalidUTF8Surrogate(int ch) throws JsonParseException { + throw _constructReadException( + "Invalid UTF-8: Illegal surrogate character 0x"+Integer.toHexString(ch)); + } protected void _throwInvalidSpace(int i) throws JsonParseException { char c = (char) i; diff --git a/src/main/java/com/fasterxml/jackson/core/json/UTF8DataInputJsonParser.java b/src/main/java/com/fasterxml/jackson/core/json/UTF8DataInputJsonParser.java index fad4a661fc..76f64ca72c 100644 --- a/src/main/java/com/fasterxml/jackson/core/json/UTF8DataInputJsonParser.java +++ b/src/main/java/com/fasterxml/jackson/core/json/UTF8DataInputJsonParser.java @@ -1859,7 +1859,12 @@ private final String addName(int[] quads, int qlen, int lastQuadBytes) _reportInvalidOther(ch2); } ch = (ch << 6) | (ch2 & 0x3F); - if (needed > 2) { // 4 bytes? (need surrogates on output) + // [jackson-core#363]: Surrogates (0xD800 - 0xDFFF) are illegal in UTF-8 for 3-byte sequences + if (needed == 2) { + if (ch >= 0xD800 && ch <= 0xDFFF) { + _reportInvalidUTF8Surrogate(ch); + } + } else { // 4 bytes? 
(need surrogates on output) ch2 = quads[ix >> 2]; byteIx = (ix & 3); ch2 = (ch2 >> ((3 - byteIx) << 3)); @@ -2678,6 +2683,10 @@ private final int _decodeUtf8_3(int c1) throws IOException _reportInvalidOther(d & 0xFF); } c = (c << 6) | (d & 0x3F); + // [jackson-core#363]: Surrogates (0xD800 - 0xDFFF) are illegal in UTF-8 + if (c >= 0xD800 && c <= 0xDFFF) { + _reportInvalidUTF8Surrogate(c); + } return c; } diff --git a/src/main/java/com/fasterxml/jackson/core/json/UTF8StreamJsonParser.java b/src/main/java/com/fasterxml/jackson/core/json/UTF8StreamJsonParser.java index 70e78362e0..d1f9382d07 100644 --- a/src/main/java/com/fasterxml/jackson/core/json/UTF8StreamJsonParser.java +++ b/src/main/java/com/fasterxml/jackson/core/json/UTF8StreamJsonParser.java @@ -2407,7 +2407,12 @@ private final String addName(int[] quads, int qlen, int lastQuadBytes) _reportInvalidOther(ch2); } ch = (ch << 6) | (ch2 & 0x3F); - if (needed > 2) { // 4 bytes? (need surrogates on output) + // [jackson-core#363]: Surrogates (0xD800 - 0xDFFF) are illegal in UTF-8 for 3-byte sequences + if (needed == 2) { + if (ch >= 0xD800 && ch <= 0xDFFF) { + _reportInvalidUTF8Surrogate(ch); + } + } else { // 4 bytes? 
(need surrogates on output) ch2 = quads[ix >> 2]; byteIx = (ix & 3); ch2 = (ch2 >> ((3 - byteIx) << 3)); @@ -3481,6 +3486,10 @@ private final int _decodeUtf8_3(int c1) throws IOException _reportInvalidOther(d & 0xFF, _inputPtr); } c = (c << 6) | (d & 0x3F); + // [jackson-core#363]: Surrogates (0xD800 - 0xDFFF) are illegal in UTF-8 + if (c >= 0xD800 && c <= 0xDFFF) { + _reportInvalidUTF8Surrogate(c); + } return c; } @@ -3497,6 +3506,10 @@ private final int _decodeUtf8_3fast(int c1) throws IOException _reportInvalidOther(d & 0xFF, _inputPtr); } c = (c << 6) | (d & 0x3F); + // [jackson-core#363]: Surrogates (0xD800 - 0xDFFF) are illegal in UTF-8 + if (c >= 0xD800 && c <= 0xDFFF) { + _reportInvalidUTF8Surrogate(c); + } return c; } diff --git a/src/main/java/com/fasterxml/jackson/core/json/async/NonBlockingJsonParserBase.java b/src/main/java/com/fasterxml/jackson/core/json/async/NonBlockingJsonParserBase.java index e8cb680fd9..974008da20 100644 --- a/src/main/java/com/fasterxml/jackson/core/json/async/NonBlockingJsonParserBase.java +++ b/src/main/java/com/fasterxml/jackson/core/json/async/NonBlockingJsonParserBase.java @@ -768,7 +768,12 @@ protected final String _addName(int[] quads, int qlen, int lastQuadBytes) _reportInvalidOther(ch2); } ch = (ch << 6) | (ch2 & 0x3F); - if (needed > 2) { // 4 bytes? (need surrogates on output) + // [jackson-core#363]: Surrogates (0xD800 - 0xDFFF) are illegal in UTF-8 for 3-byte sequences + if (needed == 2) { + if (ch >= 0xD800 && ch <= 0xDFFF) { + _reportInvalidUTF8Surrogate(ch); + } + } else { // 4 bytes? 
(need surrogates on output) ch2 = quads[ix >> 2]; byteIx = (ix & 3); ch2 = (ch2 >> ((3 - byteIx) << 3)); diff --git a/src/main/java/com/fasterxml/jackson/core/json/async/NonBlockingUtf8JsonParserBase.java b/src/main/java/com/fasterxml/jackson/core/json/async/NonBlockingUtf8JsonParserBase.java index 00c9f27153..fceb3734bf 100644 --- a/src/main/java/com/fasterxml/jackson/core/json/async/NonBlockingUtf8JsonParserBase.java +++ b/src/main/java/com/fasterxml/jackson/core/json/async/NonBlockingUtf8JsonParserBase.java @@ -2828,7 +2828,12 @@ private final boolean _decodeSplitUTF8_3(int prev, int prevCount, int next) if ((next & 0xC0) != 0x080) { _reportInvalidOther(next & 0xFF, _inputPtr); } - _textBuffer.append((char) ((prev << 6) | (next & 0x3F))); + int c = (prev << 6) | (next & 0x3F); + // [jackson-core#363]: Surrogates (0xD800 - 0xDFFF) are illegal in UTF-8 + if (c >= 0xD800 && c <= 0xDFFF) { + _reportInvalidUTF8Surrogate(c); + } + _textBuffer.append((char) c); return true; } @@ -2974,7 +2979,12 @@ private final int _decodeUTF8_3(int c, int d, int e) throws IOException if ((e & 0xC0) != 0x080) { _reportInvalidOther(e & 0xFF, _inputPtr); } - return (c << 6) | (e & 0x3F); + c = (c << 6) | (e & 0x3F); + // [jackson-core#363]: Surrogates (0xD800 - 0xDFFF) are illegal in UTF-8 + if (c >= 0xD800 && c <= 0xDFFF) { + _reportInvalidUTF8Surrogate(c); + } + return c; } // @return Character value minus 0x10000; this so that caller diff --git a/src/test/java/com/fasterxml/jackson/core/read/UTF8SurrogateValidation363Test.java b/src/test/java/com/fasterxml/jackson/core/read/UTF8SurrogateValidation363Test.java new file mode 100644 index 0000000000..97256dd562 --- /dev/null +++ b/src/test/java/com/fasterxml/jackson/core/read/UTF8SurrogateValidation363Test.java @@ -0,0 +1,194 @@ +package com.fasterxml.jackson.core.read; + +import org.junit.jupiter.api.Test; + +import com.fasterxml.jackson.core.*; +import com.fasterxml.jackson.core.exc.StreamReadException; + +import static 
org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.fail; + +/** + * Tests for [jackson-core#363]: parsers created from {@code byte[]} input should reject 3-byte UTF-8 sequences + * that encode surrogate code points (U+D800 to U+DFFF), which are illegal in UTF-8, while still accepting U+D7FF and U+E000. + */ +class UTF8SurrogateValidation363Test extends JUnit5TestBase +{ + private final JsonFactory FACTORY = newStreamFactory(); + + /** + * Test that parser rejects 3-byte UTF-8 sequence encoding U+D800 (start of surrogate range). + * In UTF-8, U+D800 would be encoded as: ED A0 80 + */ + @Test + void rejectSurrogateD800InString() throws Exception + { + // JSON: {"value":"X"} + // where X is the invalid 3-byte sequence ED A0 80 (U+D800) + byte[] doc = new byte[] { + '{', '"', 'v', 'a', 'l', 'u', 'e', '"', ':', + '"', + (byte) 0xED, (byte) 0xA0, (byte) 0x80, // Invalid: U+D800 surrogate + '"', + '}' + }; + + try (JsonParser p = FACTORY.createParser(doc)) { + assertToken(JsonToken.START_OBJECT, p.nextToken()); + assertToken(JsonToken.FIELD_NAME, p.nextToken()); + assertEquals("value", p.currentName()); + + // Token itself is still reported as VALUE_STRING; failure is expected from getText() below + assertToken(JsonToken.VALUE_STRING, p.nextToken()); + p.getText(); // Actual parsing happens here (lazy parsing), so this call should throw + fail("Should have thrown an exception for surrogate code point in UTF-8"); + } catch (StreamReadException e) { + verifyException(e, "Invalid UTF-8"); + } + } + + /** + * Test that parser rejects 3-byte UTF-8 sequence encoding U+DFFF (end of surrogate range). 
+ * In UTF-8, U+DFFF would be encoded as: ED BF BF + */ + @Test + void rejectSurrogateDFFFInString() throws Exception + { + // JSON: {"value":"X"} + // where X is the invalid 3-byte sequence ED BF BF (U+DFFF) + byte[] doc = new byte[] { + '{', '"', 'v', 'a', 'l', 'u', 'e', '"', ':', + '"', + (byte) 0xED, (byte) 0xBF, (byte) 0xBF, // Invalid: U+DFFF surrogate + '"', + '}' + }; + + try (JsonParser p = FACTORY.createParser(doc)) { + assertToken(JsonToken.START_OBJECT, p.nextToken()); + assertToken(JsonToken.FIELD_NAME, p.nextToken()); + assertEquals("value", p.currentName()); + + // Token itself is still reported as VALUE_STRING; failure is expected from getText() below + assertToken(JsonToken.VALUE_STRING, p.nextToken()); + p.getText(); // Actual parsing happens here (lazy parsing), so this call should throw + fail("Should have thrown an exception for surrogate code point in UTF-8"); + } catch (StreamReadException e) { + verifyException(e, "Invalid UTF-8"); + } + } + + /** + * Test that parser rejects 3-byte UTF-8 sequence encoding U+DABC (middle of surrogate range). 
+ * In UTF-8, U+DABC would be encoded as: ED AA BC + */ + @Test + void rejectSurrogateMiddleInString() throws Exception + { + // JSON: {"value":"X"} + // where X is the invalid 3-byte sequence ED AA BC (U+DABC) + byte[] doc = new byte[] { + '{', '"', 'v', 'a', 'l', 'u', 'e', '"', ':', + '"', + (byte) 0xED, (byte) 0xAA, (byte) 0xBC, // Invalid: U+DABC surrogate + '"', + '}' + }; + + try (JsonParser p = FACTORY.createParser(doc)) { + assertToken(JsonToken.START_OBJECT, p.nextToken()); + assertToken(JsonToken.FIELD_NAME, p.nextToken()); + assertEquals("value", p.currentName()); + + // Token itself is still reported as VALUE_STRING; failure is expected from getText() below + assertToken(JsonToken.VALUE_STRING, p.nextToken()); + p.getText(); // Actual parsing happens here (lazy parsing), so this call should throw + fail("Should have thrown an exception for surrogate code point in UTF-8"); + } catch (StreamReadException e) { + verifyException(e, "Invalid UTF-8"); + } + } + + /** + * Test that parser rejects surrogate (U+D800, bytes ED A0 80) in field name as well. + */ + @Test + void rejectSurrogateInFieldName() throws Exception + { + // JSON: {"X":"value"} + // where X is the invalid 3-byte sequence ED A0 80 (U+D800) + byte[] doc = new byte[] { + '{', '"', + (byte) 0xED, (byte) 0xA0, (byte) 0x80, // Invalid: U+D800 surrogate + '"', ':', '"', 'v', 'a', 'l', 'u', 'e', '"', + '}' + }; + + try (JsonParser p = FACTORY.createParser(doc)) { + assertToken(JsonToken.START_OBJECT, p.nextToken()); + + // nextToken() itself should throw while reading the field name + // (no lazy parsing for names) + assertToken(JsonToken.FIELD_NAME, p.nextToken()); + fail("Should have thrown an exception for surrogate code point in UTF-8"); + } catch (StreamReadException e) { + verifyException(e, "Invalid UTF-8"); + } + } + + /** + * Sanity check: the valid 3-byte UTF-8 sequence just before the surrogate range should still decode. + * U+D7FF is the last valid code point before the surrogate range. 
+ * In UTF-8: ED 9F BF + */ + @Test + void acceptValidBeforeSurrogateRange() throws Exception + { + // JSON: {"value":"X"} + // where X is the valid 3-byte sequence ED 9F BF (U+D7FF) + byte[] doc = new byte[] { + '{', '"', 'v', 'a', 'l', 'u', 'e', '"', ':', + '"', + (byte) 0xED, (byte) 0x9F, (byte) 0xBF, // Valid: U+D7FF (just before surrogates) + '"', + '}' + }; + + try (JsonParser p = FACTORY.createParser(doc)) { + assertToken(JsonToken.START_OBJECT, p.nextToken()); + assertToken(JsonToken.FIELD_NAME, p.nextToken()); + assertEquals("value", p.currentName()); + assertToken(JsonToken.VALUE_STRING, p.nextToken()); + assertEquals("\uD7FF", p.getText()); + assertToken(JsonToken.END_OBJECT, p.nextToken()); + } + } + + /** + * Sanity check: the valid 3-byte UTF-8 sequence just after the surrogate range should still decode. + * U+E000 is the first valid code point after the surrogate range. + * In UTF-8: EE 80 80 + */ + @Test + void acceptValidAfterSurrogateRange() throws Exception + { + // JSON: {"value":"X"} + // where X is the valid 3-byte sequence EE 80 80 (U+E000) + byte[] doc = new byte[] { + '{', '"', 'v', 'a', 'l', 'u', 'e', '"', ':', + '"', + (byte) 0xEE, (byte) 0x80, (byte) 0x80, // Valid: U+E000 (just after surrogates) + '"', + '}' + }; + + try (JsonParser p = FACTORY.createParser(doc)) { + assertToken(JsonToken.START_OBJECT, p.nextToken()); + assertToken(JsonToken.FIELD_NAME, p.nextToken()); + assertEquals("value", p.currentName()); + assertToken(JsonToken.VALUE_STRING, p.nextToken()); + assertEquals("\uE000", p.getText()); + assertToken(JsonToken.END_OBJECT, p.nextToken()); + } + } +}