Skip to content

Commit 40cc9e5

Browse files
authored
Add failing tests for, fix issue #363 (#1494)
1 parent 873d410 commit 40cc9e5

File tree

7 files changed

+254
-5
lines changed

7 files changed

+254
-5
lines changed

release-notes/VERSION-2.x

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@ a pure JSON library.
1616

1717
2.21.0 (not yet released)
1818

19+
#363: UTF-8 decoding should fail on Surrogate characters (0xD800 - 0xDFFF)
20+
(fix by @cowtowncoder, w/ Claude code)
1921
#1180: `JsonLocation` off for unrecognized tokens
2022
(fix by @cowtowncoder, w/ Claude code)
2123
#1470: Add method `copyCurrentStructureExact()` to `JsonGenerator`

src/main/java/com/fasterxml/jackson/core/base/ParserMinimalBase.java

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -767,6 +767,22 @@ protected <T> T _reportUnexpectedNumberChar(int ch, String comment) throws JsonP
767767
protected void reportUnexpectedNumberChar(int ch, String comment) throws JsonParseException {
768768
_reportUnexpectedNumberChar(ch, comment);
769769
}
770+
771+
/**
772+
* Method called to throw an exception for invalid UTF-8 surrogate character: case
773+
* where a surrogate character (between U+D800 and U+DFFF) is decoded from UTF-8
774+
* bytes (but NOT from a JSON escape sequence!)
775+
*
776+
* @param ch Character code (int) that is an invalid surrogate
777+
*
778+
* @throws JsonParseException Exception that describes problem with UTF-8 surrogate
779+
*
780+
* @since 2.21
781+
*/
782+
protected void _reportInvalidUTF8Surrogate(int ch) throws JsonParseException {
783+
throw _constructReadException(
784+
"Invalid UTF-8: Illegal surrogate character 0x"+Integer.toHexString(ch));
785+
}
770786

771787
protected void _throwInvalidSpace(int i) throws JsonParseException {
772788
char c = (char) i;

src/main/java/com/fasterxml/jackson/core/json/UTF8DataInputJsonParser.java

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1859,7 +1859,12 @@ private final String addName(int[] quads, int qlen, int lastQuadBytes)
18591859
_reportInvalidOther(ch2);
18601860
}
18611861
ch = (ch << 6) | (ch2 & 0x3F);
1862-
if (needed > 2) { // 4 bytes? (need surrogates on output)
1862+
// [jackson-core#363]: Surrogates (0xD800 - 0xDFFF) are illegal in UTF-8 for 3-byte sequences
1863+
if (needed == 2) {
1864+
if (ch >= 0xD800 && ch <= 0xDFFF) {
1865+
_reportInvalidUTF8Surrogate(ch);
1866+
}
1867+
} else { // 4 bytes? (need surrogates on output)
18631868
ch2 = quads[ix >> 2];
18641869
byteIx = (ix & 3);
18651870
ch2 = (ch2 >> ((3 - byteIx) << 3));
@@ -2678,6 +2683,10 @@ private final int _decodeUtf8_3(int c1) throws IOException
26782683
_reportInvalidOther(d & 0xFF);
26792684
}
26802685
c = (c << 6) | (d & 0x3F);
2686+
// [jackson-core#363]: Surrogates (0xD800 - 0xDFFF) are illegal in UTF-8
2687+
if (c >= 0xD800 && c <= 0xDFFF) {
2688+
_reportInvalidUTF8Surrogate(c);
2689+
}
26812690
return c;
26822691
}
26832692

src/main/java/com/fasterxml/jackson/core/json/UTF8StreamJsonParser.java

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2407,7 +2407,12 @@ private final String addName(int[] quads, int qlen, int lastQuadBytes)
24072407
_reportInvalidOther(ch2);
24082408
}
24092409
ch = (ch << 6) | (ch2 & 0x3F);
2410-
if (needed > 2) { // 4 bytes? (need surrogates on output)
2410+
// [jackson-core#363]: Surrogates (0xD800 - 0xDFFF) are illegal in UTF-8 for 3-byte sequences
2411+
if (needed == 2) {
2412+
if (ch >= 0xD800 && ch <= 0xDFFF) {
2413+
_reportInvalidUTF8Surrogate(ch);
2414+
}
2415+
} else { // 4 bytes? (need surrogates on output)
24112416
ch2 = quads[ix >> 2];
24122417
byteIx = (ix & 3);
24132418
ch2 = (ch2 >> ((3 - byteIx) << 3));
@@ -3481,6 +3486,10 @@ private final int _decodeUtf8_3(int c1) throws IOException
34813486
_reportInvalidOther(d & 0xFF, _inputPtr);
34823487
}
34833488
c = (c << 6) | (d & 0x3F);
3489+
// [jackson-core#363]: Surrogates (0xD800 - 0xDFFF) are illegal in UTF-8
3490+
if (c >= 0xD800 && c <= 0xDFFF) {
3491+
_reportInvalidUTF8Surrogate(c);
3492+
}
34843493
return c;
34853494
}
34863495

@@ -3497,6 +3506,10 @@ private final int _decodeUtf8_3fast(int c1) throws IOException
34973506
_reportInvalidOther(d & 0xFF, _inputPtr);
34983507
}
34993508
c = (c << 6) | (d & 0x3F);
3509+
// [jackson-core#363]: Surrogates (0xD800 - 0xDFFF) are illegal in UTF-8
3510+
if (c >= 0xD800 && c <= 0xDFFF) {
3511+
_reportInvalidUTF8Surrogate(c);
3512+
}
35003513
return c;
35013514
}
35023515

src/main/java/com/fasterxml/jackson/core/json/async/NonBlockingJsonParserBase.java

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -768,7 +768,12 @@ protected final String _addName(int[] quads, int qlen, int lastQuadBytes)
768768
_reportInvalidOther(ch2);
769769
}
770770
ch = (ch << 6) | (ch2 & 0x3F);
771-
if (needed > 2) { // 4 bytes? (need surrogates on output)
771+
// [jackson-core#363]: Surrogates (0xD800 - 0xDFFF) are illegal in UTF-8 for 3-byte sequences
772+
if (needed == 2) {
773+
if (ch >= 0xD800 && ch <= 0xDFFF) {
774+
_reportInvalidUTF8Surrogate(ch);
775+
}
776+
} else { // 4 bytes? (need surrogates on output)
772777
ch2 = quads[ix >> 2];
773778
byteIx = (ix & 3);
774779
ch2 = (ch2 >> ((3 - byteIx) << 3));

src/main/java/com/fasterxml/jackson/core/json/async/NonBlockingUtf8JsonParserBase.java

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2828,7 +2828,12 @@ private final boolean _decodeSplitUTF8_3(int prev, int prevCount, int next)
28282828
if ((next & 0xC0) != 0x080) {
28292829
_reportInvalidOther(next & 0xFF, _inputPtr);
28302830
}
2831-
_textBuffer.append((char) ((prev << 6) | (next & 0x3F)));
2831+
int c = (prev << 6) | (next & 0x3F);
2832+
// [jackson-core#363]: Surrogates (0xD800 - 0xDFFF) are illegal in UTF-8
2833+
if (c >= 0xD800 && c <= 0xDFFF) {
2834+
_reportInvalidUTF8Surrogate(c);
2835+
}
2836+
_textBuffer.append((char) c);
28322837
return true;
28332838
}
28342839

@@ -2974,7 +2979,12 @@ private final int _decodeUTF8_3(int c, int d, int e) throws IOException
29742979
if ((e & 0xC0) != 0x080) {
29752980
_reportInvalidOther(e & 0xFF, _inputPtr);
29762981
}
2977-
return (c << 6) | (e & 0x3F);
2982+
c = (c << 6) | (e & 0x3F);
2983+
// [jackson-core#363]: Surrogates (0xD800 - 0xDFFF) are illegal in UTF-8
2984+
if (c >= 0xD800 && c <= 0xDFFF) {
2985+
_reportInvalidUTF8Surrogate(c);
2986+
}
2987+
return c;
29782988
}
29792989

29802990
// @return Character value <b>minus 0x10000</b>; this so that caller
Lines changed: 194 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,194 @@
1+
package com.fasterxml.jackson.core.read;
2+
3+
import org.junit.jupiter.api.Test;
4+
5+
import com.fasterxml.jackson.core.*;
6+
import com.fasterxml.jackson.core.exc.StreamReadException;
7+
8+
import static org.junit.jupiter.api.Assertions.assertEquals;
9+
import static org.junit.jupiter.api.Assertions.fail;
10+
11+
/**
12+
* Tests for [jackson-core#363]: UTF-8 parser should reject 3-byte UTF-8 sequences
13+
* that encode surrogate code points (U+D800 to U+DFFF), which are illegal in UTF-8.
14+
*/
15+
class UTF8SurrogateValidation363Test extends JUnit5TestBase
16+
{
17+
private final JsonFactory FACTORY = newStreamFactory();
18+
19+
/**
20+
* Test that parser rejects 3-byte UTF-8 sequence encoding U+D800 (start of surrogate range).
21+
* In UTF-8, U+D800 would be encoded as: ED A0 80
22+
*/
23+
@Test
24+
void rejectSurrogateD800InString() throws Exception
25+
{
26+
// JSON: {"value":"X"}
27+
// where X is the invalid 3-byte sequence ED A0 80 (U+D800)
28+
byte[] doc = new byte[] {
29+
'{', '"', 'v', 'a', 'l', 'u', 'e', '"', ':',
30+
'"',
31+
(byte) 0xED, (byte) 0xA0, (byte) 0x80, // Invalid: U+D800 surrogate
32+
'"',
33+
'}'
34+
};
35+
36+
try (JsonParser p = FACTORY.createParser(doc)) {
37+
assertToken(JsonToken.START_OBJECT, p.nextToken());
38+
assertToken(JsonToken.FIELD_NAME, p.nextToken());
39+
assertEquals("value", p.currentName());
40+
41+
// This should fail when trying to read the string value
42+
assertToken(JsonToken.VALUE_STRING, p.nextToken());
43+
p.getText(); // Actual parsing happens here (lazy parsing)
44+
fail("Should have thrown an exception for surrogate code point in UTF-8");
45+
} catch (StreamReadException e) {
46+
verifyException(e, "Invalid UTF-8");
47+
}
48+
}
49+
50+
/**
51+
* Test that parser rejects 3-byte UTF-8 sequence encoding U+DFFF (end of surrogate range).
52+
* In UTF-8, U+DFFF would be encoded as: ED BF BF
53+
*/
54+
@Test
55+
void rejectSurrogateDFFFInString() throws Exception
56+
{
57+
// JSON: {"value":"X"}
58+
// where X is the invalid 3-byte sequence ED BF BF (U+DFFF)
59+
byte[] doc = new byte[] {
60+
'{', '"', 'v', 'a', 'l', 'u', 'e', '"', ':',
61+
'"',
62+
(byte) 0xED, (byte) 0xBF, (byte) 0xBF, // Invalid: U+DFFF surrogate
63+
'"',
64+
'}'
65+
};
66+
67+
try (JsonParser p = FACTORY.createParser(doc)) {
68+
assertToken(JsonToken.START_OBJECT, p.nextToken());
69+
assertToken(JsonToken.FIELD_NAME, p.nextToken());
70+
assertEquals("value", p.currentName());
71+
72+
// This should fail when trying to read the string value
73+
assertToken(JsonToken.VALUE_STRING, p.nextToken());
74+
p.getText(); // Actual parsing happens here (lazy parsing)
75+
fail("Should have thrown an exception for surrogate code point in UTF-8");
76+
} catch (StreamReadException e) {
77+
verifyException(e, "Invalid UTF-8");
78+
}
79+
}
80+
81+
/**
82+
* Test that parser rejects 3-byte UTF-8 sequence encoding U+DABC (middle of surrogate range).
83+
* In UTF-8, U+DABC would be encoded as: ED AA BC
84+
*/
85+
@Test
86+
void rejectSurrogateMiddleInString() throws Exception
87+
{
88+
// JSON: {"value":"X"}
89+
// where X is the invalid 3-byte sequence ED AA BC (U+DABC)
90+
byte[] doc = new byte[] {
91+
'{', '"', 'v', 'a', 'l', 'u', 'e', '"', ':',
92+
'"',
93+
(byte) 0xED, (byte) 0xAA, (byte) 0xBC, // Invalid: U+DABC surrogate
94+
'"',
95+
'}'
96+
};
97+
98+
try (JsonParser p = FACTORY.createParser(doc)) {
99+
assertToken(JsonToken.START_OBJECT, p.nextToken());
100+
assertToken(JsonToken.FIELD_NAME, p.nextToken());
101+
assertEquals("value", p.currentName());
102+
103+
// This should fail when trying to read the string value
104+
assertToken(JsonToken.VALUE_STRING, p.nextToken());
105+
p.getText(); // Actual parsing happens here (lazy parsing)
106+
fail("Should have thrown an exception for surrogate code point in UTF-8");
107+
} catch (StreamReadException e) {
108+
verifyException(e, "Invalid UTF-8");
109+
}
110+
}
111+
112+
/**
113+
* Test that parser rejects surrogate in field name as well.
114+
*/
115+
@Test
116+
void rejectSurrogateInFieldName() throws Exception
117+
{
118+
// JSON: {"X":"value"}
119+
// where X is the invalid 3-byte sequence ED A0 80 (U+D800)
120+
byte[] doc = new byte[] {
121+
'{', '"',
122+
(byte) 0xED, (byte) 0xA0, (byte) 0x80, // Invalid: U+D800 surrogate
123+
'"', ':', '"', 'v', 'a', 'l', 'u', 'e', '"',
124+
'}'
125+
};
126+
127+
try (JsonParser p = FACTORY.createParser(doc)) {
128+
assertToken(JsonToken.START_OBJECT, p.nextToken());
129+
130+
// This should fail when trying to read the field name
131+
// (no lazy parsing for names)
132+
assertToken(JsonToken.FIELD_NAME, p.nextToken());
133+
fail("Should have thrown an exception for surrogate code point in UTF-8");
134+
} catch (StreamReadException e) {
135+
verifyException(e, "Invalid UTF-8");
136+
}
137+
}
138+
139+
/**
140+
* Sanity check: valid 3-byte UTF-8 sequences just before surrogate range should work.
141+
* U+D7FF is the last valid code point before the surrogate range.
142+
* In UTF-8: ED 9F BF
143+
*/
144+
@Test
145+
void acceptValidBeforeSurrogateRange() throws Exception
146+
{
147+
// JSON: {"value":"X"}
148+
// where X is the valid 3-byte sequence ED 9F BF (U+D7FF)
149+
byte[] doc = new byte[] {
150+
'{', '"', 'v', 'a', 'l', 'u', 'e', '"', ':',
151+
'"',
152+
(byte) 0xED, (byte) 0x9F, (byte) 0xBF, // Valid: U+D7FF (just before surrogates)
153+
'"',
154+
'}'
155+
};
156+
157+
try (JsonParser p = FACTORY.createParser(doc)) {
158+
assertToken(JsonToken.START_OBJECT, p.nextToken());
159+
assertToken(JsonToken.FIELD_NAME, p.nextToken());
160+
assertEquals("value", p.currentName());
161+
assertToken(JsonToken.VALUE_STRING, p.nextToken());
162+
assertEquals("\uD7FF", p.getText());
163+
assertToken(JsonToken.END_OBJECT, p.nextToken());
164+
}
165+
}
166+
167+
/**
168+
* Sanity check: valid 3-byte UTF-8 sequences just after surrogate range should work.
169+
* U+E000 is the first valid code point after the surrogate range.
170+
* In UTF-8: EE 80 80
171+
*/
172+
@Test
173+
void acceptValidAfterSurrogateRange() throws Exception
174+
{
175+
// JSON: {"value":"X"}
176+
// where X is the valid 3-byte sequence EE 80 80 (U+E000)
177+
byte[] doc = new byte[] {
178+
'{', '"', 'v', 'a', 'l', 'u', 'e', '"', ':',
179+
'"',
180+
(byte) 0xEE, (byte) 0x80, (byte) 0x80, // Valid: U+E000 (just after surrogates)
181+
'"',
182+
'}'
183+
};
184+
185+
try (JsonParser p = FACTORY.createParser(doc)) {
186+
assertToken(JsonToken.START_OBJECT, p.nextToken());
187+
assertToken(JsonToken.FIELD_NAME, p.nextToken());
188+
assertEquals("value", p.currentName());
189+
assertToken(JsonToken.VALUE_STRING, p.nextToken());
190+
assertEquals("\uE000", p.getText());
191+
assertToken(JsonToken.END_OBJECT, p.nextToken());
192+
}
193+
}
194+
}

0 commit comments

Comments
 (0)