Skip to content

Commit d32167d

Browse files
committed
Merge branch '3.0' into 3.x
2 parents 5a06682 + 7cbb3e9 commit d32167d

File tree

10 files changed

+272
-21
lines changed

10 files changed

+272
-21
lines changed

release-notes/VERSION

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,10 @@ JSON library.
2121

2222
3.0.2 (not yet released)
2323

24+
#363: UTF-8 decoding should fail on Surrogate characters (0xD800 - 0xDFFF)
25+
(fix by @cowtowncoder, w/ Claude code)
26+
#1180: `JsonLocation` off for unrecognized tokens
27+
(fix by @cowtowncoder, w/ Claude code)
2428
#1491: Mismatched property name for byte-backed `JsonParser.nextNameMatch(PropertyNameMatcher)`
2529
(fix by @cowtowncoder, w/ Claude code)
2630

release-notes/VERSION-2.x

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@ a pure JSON library.
1616

1717
2.21.0 (not yet released)
1818

19+
#363: UTF-8 decoding should fail on Surrogate characters (0xD800 - 0xDFFF)
20+
(fix by @cowtowncoder, w/ Claude code)
1921
#1180: `JsonLocation` off for unrecognized tokens
2022
(fix by @cowtowncoder, w/ Claude code)
2123
#1470: Add method `copyCurrentStructureExact()` to `JsonGenerator`

src/main/java/tools/jackson/core/base/ParserMinimalBase.java

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1057,6 +1057,20 @@ protected <T> T _reportUnexpectedNumberChar(int ch, String comment) throws Strea
10571057
throw _constructReadException(msg, _currentLocationMinusOne());
10581058
}
10591059

1060+
/**
1061+
* Method called to throw an exception for invalid UTF-8 surrogate character: case
1062+
* where a surrogate character (between U+D800 and U+DFFF) is decoded from UTF-8
1063+
* bytes (but NOT from a JSON escape sequence!)
1064+
*
1065+
* @param ch Character code (int) that is an invalid surrogate
1066+
*
1067+
* @throws StreamReadException Exception that describes problem with UTF-8 surrogate
1068+
*/
1069+
protected void _reportInvalidUTF8Surrogate(int ch) throws StreamReadException {
1070+
throw _constructReadException(
1071+
"Invalid UTF-8: Illegal surrogate character 0x"+Integer.toHexString(ch));
1072+
}
1073+
10601074
/**
10611075
* Factory method used to provide location for cases where we must read
10621076
* and consume a single "wrong" character (to possibly allow error recovery),

src/main/java/tools/jackson/core/json/UTF8DataInputJsonParser.java

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1874,7 +1874,12 @@ private final String addName(int[] quads, int qlen, int lastQuadBytes) throws St
18741874
_reportInvalidOther(ch2);
18751875
}
18761876
ch = (ch << 6) | (ch2 & 0x3F);
1877-
if (needed > 2) { // 4 bytes? (need surrogates on output)
1877+
// [jackson-core#363]: Surrogates (0xD800 - 0xDFFF) are illegal in UTF-8 for 3-byte sequences
1878+
if (needed == 2) {
1879+
if (ch >= 0xD800 && ch <= 0xDFFF) {
1880+
_reportInvalidUTF8Surrogate(ch);
1881+
}
1882+
} else { // 4 bytes? (need surrogates on output)
18781883
ch2 = quads[ix >> 2];
18791884
byteIx = (ix & 3);
18801885
ch2 = (ch2 >> ((3 - byteIx) << 3));
@@ -2703,6 +2708,10 @@ private final int _decodeUtf8_3(int c1) throws IOException
27032708
_reportInvalidOther(d & 0xFF);
27042709
}
27052710
c = (c << 6) | (d & 0x3F);
2711+
// [jackson-core#363]: Surrogates (0xD800 - 0xDFFF) are illegal in UTF-8
2712+
if (c >= 0xD800 && c <= 0xDFFF) {
2713+
_reportInvalidUTF8Surrogate(c);
2714+
}
27062715
return c;
27072716
}
27082717

src/main/java/tools/jackson/core/json/UTF8StreamJsonParser.java

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2758,7 +2758,12 @@ private final String addName(int[] quads, int qlen, int lastQuadBytes)
27582758
_reportInvalidOther(ch2);
27592759
}
27602760
ch = (ch << 6) | (ch2 & 0x3F);
2761-
if (needed > 2) { // 4 bytes? (need surrogates on output)
2761+
// [jackson-core#363]: Surrogates (0xD800 - 0xDFFF) are illegal in UTF-8 for 3-byte sequences
2762+
if (needed == 2) {
2763+
if (ch >= 0xD800 && ch <= 0xDFFF) {
2764+
_reportInvalidUTF8Surrogate(ch);
2765+
}
2766+
} else { // 4 bytes? (need surrogates on output)
27622767
ch2 = quads[ix >> 2];
27632768
byteIx = (ix & 3);
27642769
ch2 = (ch2 >> ((3 - byteIx) << 3));
@@ -3829,6 +3834,10 @@ private final int _decodeUtf8_3(int c1) throws JacksonException
38293834
_reportInvalidOther(d & 0xFF, _inputPtr);
38303835
}
38313836
c = (c << 6) | (d & 0x3F);
3837+
// [jackson-core#363]: Surrogates (0xD800 - 0xDFFF) are illegal in UTF-8
3838+
if (c >= 0xD800 && c <= 0xDFFF) {
3839+
_reportInvalidUTF8Surrogate(c);
3840+
}
38323841
return c;
38333842
}
38343843

@@ -3845,6 +3854,10 @@ private final int _decodeUtf8_3fast(int c1) throws JacksonException
38453854
_reportInvalidOther(d & 0xFF, _inputPtr);
38463855
}
38473856
c = (c << 6) | (d & 0x3F);
3857+
// [jackson-core#363]: Surrogates (0xD800 - 0xDFFF) are illegal in UTF-8
3858+
if (c >= 0xD800 && c <= 0xDFFF) {
3859+
_reportInvalidUTF8Surrogate(c);
3860+
}
38483861
return c;
38493862
}
38503863

src/main/java/tools/jackson/core/json/async/NonBlockingJsonParserBase.java

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -747,7 +747,12 @@ protected final String _addName(int[] quads, int qlen, int lastQuadBytes)
747747
_reportInvalidOther(ch2);
748748
}
749749
ch = (ch << 6) | (ch2 & 0x3F);
750-
if (needed > 2) { // 4 bytes? (need surrogates on output)
750+
// [jackson-core#363]: Surrogates (0xD800 - 0xDFFF) are illegal in UTF-8 for 3-byte sequences
751+
if (needed == 2) {
752+
if (ch >= 0xD800 && ch <= 0xDFFF) {
753+
_reportInvalidUTF8Surrogate(ch);
754+
}
755+
} else { // 4 bytes? (need surrogates on output)
751756
ch2 = quads[ix >> 2];
752757
byteIx = (ix & 3);
753758
ch2 = (ch2 >> ((3 - byteIx) << 3));

src/main/java/tools/jackson/core/json/async/NonBlockingUtf8JsonParserBase.java

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2827,7 +2827,12 @@ private final boolean _decodeSplitUTF8_3(int prev, int prevCount, int next)
28272827
if ((next & 0xC0) != 0x080) {
28282828
_reportInvalidOther(next & 0xFF, _inputPtr);
28292829
}
2830-
_textBuffer.append((char) ((prev << 6) | (next & 0x3F)));
2830+
int c = (prev << 6) | (next & 0x3F);
2831+
// [jackson-core#363]: Surrogates (0xD800 - 0xDFFF) are illegal in UTF-8
2832+
if (c >= 0xD800 && c <= 0xDFFF) {
2833+
_reportInvalidUTF8Surrogate(c);
2834+
}
2835+
_textBuffer.append((char) c);
28312836
return true;
28322837
}
28332838

@@ -2973,7 +2978,12 @@ private final int _decodeUTF8_3(int c, int d, int e) throws JacksonException
29732978
if ((e & 0xC0) != 0x080) {
29742979
_reportInvalidOther(e & 0xFF, _inputPtr);
29752980
}
2976-
return (c << 6) | (e & 0x3F);
2981+
c = (c << 6) | (e & 0x3F);
2982+
// [jackson-core#363]: Surrogates (0xD800 - 0xDFFF) are illegal in UTF-8
2983+
if (c >= 0xD800 && c <= 0xDFFF) {
2984+
_reportInvalidUTF8Surrogate(c);
2985+
}
2986+
return c;
29772987
}
29782988

29792989
// @return Character value <b>minus 0x10000</b>; this so that caller

src/test/java/tools/jackson/core/unittest/read/ParserErrorHandlingTest.java

Lines changed: 13 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -29,25 +29,23 @@ void invalidKeywordsChars() throws Exception {
2929

3030
// Tests for [core#105] ("eager number parsing misses errors")
3131
@Test
32-
void mangledIntsBytes() throws Exception {
33-
_testMangledNumbersInt(MODE_INPUT_STREAM);
34-
_testMangledNumbersInt(MODE_INPUT_STREAM_THROTTLED);
35-
_testMangledNumbersInt(MODE_DATA_INPUT);
32+
void mangledRootIntsBytes() throws Exception {
33+
_testMangledRootNumbersInt(MODE_INPUT_STREAM);
34+
_testMangledRootNumbersInt(MODE_INPUT_STREAM_THROTTLED);
35+
_testMangledRootNumbersInt(MODE_DATA_INPUT);
3636
}
3737

3838
@Test
39-
void mangledFloatsBytes() throws Exception {
40-
_testMangledNumbersFloat(MODE_INPUT_STREAM);
41-
_testMangledNumbersFloat(MODE_INPUT_STREAM_THROTTLED);
42-
43-
// 02-Jun-2017, tatu: Fails as expected, unlike int one. Bit puzzling...
44-
_testMangledNumbersFloat(MODE_DATA_INPUT);
39+
void mangledRootFloatsBytes() throws Exception {
40+
_testMangledRootNumbersFloat(MODE_INPUT_STREAM);
41+
_testMangledRootNumbersFloat(MODE_INPUT_STREAM_THROTTLED);
42+
_testMangledRootNumbersFloat(MODE_DATA_INPUT);
4543
}
4644

4745
@Test
48-
void mangledNumbersChars() throws Exception {
49-
_testMangledNumbersInt(MODE_READER);
50-
_testMangledNumbersFloat(MODE_READER);
46+
void mangledRootNumbersChars() throws Exception {
47+
_testMangledRootNumbersInt(MODE_READER);
48+
_testMangledRootNumbersFloat(MODE_READER);
5149
}
5250

5351
/*
@@ -104,7 +102,7 @@ private void doTestInvalidKeyword1(int mode, String value)
104102
}
105103
}
106104

107-
private void _testMangledNumbersInt(int mode)
105+
private void _testMangledRootNumbersInt(int mode)
108106
{
109107
JsonParser p = createParser(JSON_F, mode, "123true");
110108
try {
@@ -116,7 +114,7 @@ private void _testMangledNumbersInt(int mode)
116114
p.close();
117115
}
118116

119-
private void _testMangledNumbersFloat(int mode)
117+
private void _testMangledRootNumbersFloat(int mode)
120118
{
121119
// Also test with floats
122120
JsonParser p = createParser(JSON_F, mode, "1.5false");
Lines changed: 197 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,197 @@
1+
package tools.jackson.core.unittest.read;
2+
3+
import org.junit.jupiter.api.Test;
4+
5+
import tools.jackson.core.*;
6+
import tools.jackson.core.exc.StreamReadException;
7+
import tools.jackson.core.json.JsonFactory;
8+
import tools.jackson.core.unittest.JacksonCoreTestBase;
9+
10+
import static org.junit.jupiter.api.Assertions.assertEquals;
11+
import static org.junit.jupiter.api.Assertions.fail;
12+
13+
/**
14+
* Tests for [jackson-core#363]: UTF-8 parser should reject 3-byte UTF-8 sequences
15+
* that encode surrogate code points (U+D800 to U+DFFF), which are illegal in UTF-8.
16+
*/
17+
class UTF8SurrogateValidation363Test
18+
extends JacksonCoreTestBase
19+
{
20+
private final JsonFactory FACTORY = newStreamFactory();
21+
22+
/**
23+
* Test that parser rejects 3-byte UTF-8 sequence encoding U+D800 (start of surrogate range).
24+
* In UTF-8, U+D800 would be encoded as: ED A0 80
25+
*/
26+
@Test
27+
void rejectSurrogateD800InString() throws Exception
28+
{
29+
// JSON: {"value":"X"}
30+
// where X is the invalid 3-byte sequence ED A0 80 (U+D800)
31+
byte[] doc = new byte[] {
32+
'{', '"', 'v', 'a', 'l', 'u', 'e', '"', ':',
33+
'"',
34+
(byte) 0xED, (byte) 0xA0, (byte) 0x80, // Invalid: U+D800 surrogate
35+
'"',
36+
'}'
37+
};
38+
39+
try (JsonParser p = FACTORY.createParser(ObjectReadContext.empty(), doc)) {
40+
assertToken(JsonToken.START_OBJECT, p.nextToken());
41+
assertToken(JsonToken.PROPERTY_NAME, p.nextToken());
42+
assertEquals("value", p.currentName());
43+
44+
// This should fail when trying to read the string value
45+
assertToken(JsonToken.VALUE_STRING, p.nextToken());
46+
p.getString(); // Actual parsing happens here (lazy parsing)
47+
fail("Should have thrown an exception for surrogate code point in UTF-8");
48+
} catch (StreamReadException e) {
49+
verifyException(e, "Invalid UTF-8");
50+
}
51+
}
52+
53+
/**
54+
* Test that parser rejects 3-byte UTF-8 sequence encoding U+DFFF (end of surrogate range).
55+
* In UTF-8, U+DFFF would be encoded as: ED BF BF
56+
*/
57+
@Test
58+
void rejectSurrogateDFFFInString() throws Exception
59+
{
60+
// JSON: {"value":"X"}
61+
// where X is the invalid 3-byte sequence ED BF BF (U+DFFF)
62+
byte[] doc = new byte[] {
63+
'{', '"', 'v', 'a', 'l', 'u', 'e', '"', ':',
64+
'"',
65+
(byte) 0xED, (byte) 0xBF, (byte) 0xBF, // Invalid: U+DFFF surrogate
66+
'"',
67+
'}'
68+
};
69+
70+
try (JsonParser p = FACTORY.createParser(ObjectReadContext.empty(), doc)) {
71+
assertToken(JsonToken.START_OBJECT, p.nextToken());
72+
assertToken(JsonToken.PROPERTY_NAME, p.nextToken());
73+
assertEquals("value", p.currentName());
74+
75+
// This should fail when trying to read the string value
76+
assertToken(JsonToken.VALUE_STRING, p.nextToken());
77+
p.getString(); // Actual parsing happens here (lazy parsing)
78+
fail("Should have thrown an exception for surrogate code point in UTF-8");
79+
} catch (StreamReadException e) {
80+
verifyException(e, "Invalid UTF-8");
81+
}
82+
}
83+
84+
/**
85+
* Test that parser rejects 3-byte UTF-8 sequence encoding U+DABC (middle of surrogate range).
86+
* In UTF-8, U+DABC would be encoded as: ED AA BC
87+
*/
88+
@Test
89+
void rejectSurrogateMiddleInString() throws Exception
90+
{
91+
// JSON: {"value":"X"}
92+
// where X is the invalid 3-byte sequence ED AA BC (U+DABC)
93+
byte[] doc = new byte[] {
94+
'{', '"', 'v', 'a', 'l', 'u', 'e', '"', ':',
95+
'"',
96+
(byte) 0xED, (byte) 0xAA, (byte) 0xBC, // Invalid: U+DABC surrogate
97+
'"',
98+
'}'
99+
};
100+
101+
try (JsonParser p = FACTORY.createParser(ObjectReadContext.empty(), doc)) {
102+
assertToken(JsonToken.START_OBJECT, p.nextToken());
103+
assertToken(JsonToken.PROPERTY_NAME, p.nextToken());
104+
assertEquals("value", p.currentName());
105+
106+
// This should fail when trying to read the string value
107+
assertToken(JsonToken.VALUE_STRING, p.nextToken());
108+
p.getString(); // Actual parsing happens here (lazy parsing)
109+
fail("Should have thrown an exception for surrogate code point in UTF-8");
110+
} catch (StreamReadException e) {
111+
verifyException(e, "Invalid UTF-8");
112+
}
113+
}
114+
115+
/**
116+
* Test that parser rejects surrogate in field name as well.
117+
*/
118+
@Test
119+
void rejectSurrogateInFieldName() throws Exception
120+
{
121+
// JSON: {"X":"value"}
122+
// where X is the invalid 3-byte sequence ED A0 80 (U+D800)
123+
byte[] doc = new byte[] {
124+
'{', '"',
125+
(byte) 0xED, (byte) 0xA0, (byte) 0x80, // Invalid: U+D800 surrogate
126+
'"', ':', '"', 'v', 'a', 'l', 'u', 'e', '"',
127+
'}'
128+
};
129+
130+
try (JsonParser p = FACTORY.createParser(ObjectReadContext.empty(), doc)) {
131+
assertToken(JsonToken.START_OBJECT, p.nextToken());
132+
133+
// This should fail when trying to read the field name
134+
// (no lazy parsing for names)
135+
assertToken(JsonToken.PROPERTY_NAME, p.nextToken());
136+
fail("Should have thrown an exception for surrogate code point in UTF-8");
137+
} catch (StreamReadException e) {
138+
verifyException(e, "Invalid UTF-8");
139+
}
140+
}
141+
142+
/**
143+
* Sanity check: valid 3-byte UTF-8 sequences just before surrogate range should work.
144+
* U+D7FF is the last valid code point before the surrogate range.
145+
* In UTF-8: ED 9F BF
146+
*/
147+
@Test
148+
void acceptValidBeforeSurrogateRange() throws Exception
149+
{
150+
// JSON: {"value":"X"}
151+
// where X is the valid 3-byte sequence ED 9F BF (U+D7FF)
152+
byte[] doc = new byte[] {
153+
'{', '"', 'v', 'a', 'l', 'u', 'e', '"', ':',
154+
'"',
155+
(byte) 0xED, (byte) 0x9F, (byte) 0xBF, // Valid: U+D7FF (just before surrogates)
156+
'"',
157+
'}'
158+
};
159+
160+
try (JsonParser p = FACTORY.createParser(ObjectReadContext.empty(), doc)) {
161+
assertToken(JsonToken.START_OBJECT, p.nextToken());
162+
assertToken(JsonToken.PROPERTY_NAME, p.nextToken());
163+
assertEquals("value", p.currentName());
164+
assertToken(JsonToken.VALUE_STRING, p.nextToken());
165+
assertEquals("\uD7FF", p.getString());
166+
assertToken(JsonToken.END_OBJECT, p.nextToken());
167+
}
168+
}
169+
170+
/**
171+
* Sanity check: valid 3-byte UTF-8 sequences just after surrogate range should work.
172+
* U+E000 is the first valid code point after the surrogate range.
173+
* In UTF-8: EE 80 80
174+
*/
175+
@Test
176+
void acceptValidAfterSurrogateRange() throws Exception
177+
{
178+
// JSON: {"value":"X"}
179+
// where X is the valid 3-byte sequence EE 80 80 (U+E000)
180+
byte[] doc = new byte[] {
181+
'{', '"', 'v', 'a', 'l', 'u', 'e', '"', ':',
182+
'"',
183+
(byte) 0xEE, (byte) 0x80, (byte) 0x80, // Valid: U+E000 (just after surrogates)
184+
'"',
185+
'}'
186+
};
187+
188+
try (JsonParser p = FACTORY.createParser(ObjectReadContext.empty(), doc)) {
189+
assertToken(JsonToken.START_OBJECT, p.nextToken());
190+
assertToken(JsonToken.PROPERTY_NAME, p.nextToken());
191+
assertEquals("value", p.currentName());
192+
assertToken(JsonToken.VALUE_STRING, p.nextToken());
193+
assertEquals("\uE000", p.getString());
194+
assertToken(JsonToken.END_OBJECT, p.nextToken());
195+
}
196+
}
197+
}

0 commit comments

Comments
 (0)