FasterXML · cowtowncoder · Nov 2, 2025 · Nov 2, 2025 · Nov 2, 2025 · Nov 2, 2025
diff --git a/release-notes/VERSION-2.x b/release-notes/VERSION-2.x
@@ -16,6 +16,8 @@ a pure JSON library.
 
 2.21.0 (not yet released)
 
+#363: UTF-8 decoding should fail on Surrogate characters (0xD800 - 0xDFFF)
+ (fix by @cowtowncoder, w/ Claude code)
 #1180: `JsonLocation` off for unrecognized tokens
  (fix by @cowtowncoder, w/ Claude code)
 #1470: Add method `copyCurrentStructureExact()` to `JsonGenerator`

diff --git a/src/main/java/com/fasterxml/jackson/core/base/ParserMinimalBase.java b/src/main/java/com/fasterxml/jackson/core/base/ParserMinimalBase.java
@@ -767,6 +767,22 @@ protected <T> T _reportUnexpectedNumberChar(int ch, String comment) throws JsonP
     protected void reportUnexpectedNumberChar(int ch, String comment) throws JsonParseException {
         _reportUnexpectedNumberChar(ch, comment);
     }
+
+    /**
+     * Reports an illegal surrogate character (between U+D800 and U+DFFF) that was
+     * decoded directly from UTF-8 encoded bytes (but NOT from JSON entity!). Never returns
+     * normally: always throws, with the offending value included in hex in the message.
+     *
+     * @param ch Decoded character code (as {@code int}) that is an invalid surrogate
+     *
+     * @throws JsonParseException Always; describes the problem with the UTF-8 surrogate
+     *
+     * @since 2.21
+     */
+    protected void _reportInvalidUTF8Surrogate(int ch) throws JsonParseException {
+        throw _constructReadException(
+                "Invalid UTF-8: Illegal surrogate character 0x"+Integer.toHexString(ch));
+    }
 
     protected void _throwInvalidSpace(int i) throws JsonParseException {
         char c = (char) i;

diff --git a/src/main/java/com/fasterxml/jackson/core/json/UTF8DataInputJsonParser.java b/src/main/java/com/fasterxml/jackson/core/json/UTF8DataInputJsonParser.java
@@ -1859,7 +1859,12 @@ private final String addName(int[] quads, int qlen, int lastQuadBytes)
                         _reportInvalidOther(ch2);
                     }
                     ch = (ch << 6) | (ch2 & 0x3F);
-                    if (needed > 2) { // 4 bytes? (need surrogates on output)
+                    // [jackson-core#363]: Surrogates (0xD800 - 0xDFFF) are illegal in UTF-8 for 3-byte sequences
+                    if (needed == 2) {
+                        if (ch >= 0xD800 && ch <= 0xDFFF) {
+                            _reportInvalidUTF8Surrogate(ch);
+                        }
+                    } else { // 4 bytes? (need surrogates on output)
                         ch2 = quads[ix >> 2];
                         byteIx = (ix & 3);
                         ch2 = (ch2 >> ((3 - byteIx) << 3));
@@ -2678,6 +2683,10 @@ private final int _decodeUtf8_3(int c1) throws IOException
             _reportInvalidOther(d & 0xFF);
         }
         c = (c << 6) | (d & 0x3F);
+        // [jackson-core#363]: Surrogates (0xD800 - 0xDFFF) are illegal in UTF-8
+        if (c >= 0xD800 && c <= 0xDFFF) {
+            _reportInvalidUTF8Surrogate(c);
+        }
         return c;
     }
 

diff --git a/src/main/java/com/fasterxml/jackson/core/json/UTF8StreamJsonParser.java b/src/main/java/com/fasterxml/jackson/core/json/UTF8StreamJsonParser.java
@@ -2407,7 +2407,12 @@ private final String addName(int[] quads, int qlen, int lastQuadBytes)
                         _reportInvalidOther(ch2);
                     }
                     ch = (ch << 6) | (ch2 & 0x3F);
-                    if (needed > 2) { // 4 bytes? (need surrogates on output)
+                    // [jackson-core#363]: Surrogates (0xD800 - 0xDFFF) are illegal in UTF-8 for 3-byte sequences
+                    if (needed == 2) {
+                        if (ch >= 0xD800 && ch <= 0xDFFF) {
+                            _reportInvalidUTF8Surrogate(ch);
+                        }
+                    } else { // 4 bytes? (need surrogates on output)
                         ch2 = quads[ix >> 2];
                         byteIx = (ix & 3);
                         ch2 = (ch2 >> ((3 - byteIx) << 3));
@@ -3481,6 +3486,10 @@ private final int _decodeUtf8_3(int c1) throws IOException
             _reportInvalidOther(d & 0xFF, _inputPtr);
         }
         c = (c << 6) | (d & 0x3F);
+        // [jackson-core#363]: Surrogates (0xD800 - 0xDFFF) are illegal in UTF-8
+        if (c >= 0xD800 && c <= 0xDFFF) {
+            _reportInvalidUTF8Surrogate(c);
+        }
         return c;
     }
 
@@ -3497,6 +3506,10 @@ private final int _decodeUtf8_3fast(int c1) throws IOException
             _reportInvalidOther(d & 0xFF, _inputPtr);
         }
         c = (c << 6) | (d & 0x3F);
+        // [jackson-core#363]: Surrogates (0xD800 - 0xDFFF) are illegal in UTF-8
+        if (c >= 0xD800 && c <= 0xDFFF) {
+            _reportInvalidUTF8Surrogate(c);
+        }
         return c;
     }
 

diff --git a/src/main/java/com/fasterxml/jackson/core/json/async/NonBlockingJsonParserBase.java b/src/main/java/com/fasterxml/jackson/core/json/async/NonBlockingJsonParserBase.java
@@ -768,7 +768,12 @@ protected final String _addName(int[] quads, int qlen, int lastQuadBytes)
                         _reportInvalidOther(ch2);
                     }
                     ch = (ch << 6) | (ch2 & 0x3F);
-                    if (needed > 2) { // 4 bytes? (need surrogates on output)
+                    // [jackson-core#363]: Surrogates (0xD800 - 0xDFFF) are illegal in UTF-8 for 3-byte sequences
+                    if (needed == 2) {
+                        if (ch >= 0xD800 && ch <= 0xDFFF) {
+                            _reportInvalidUTF8Surrogate(ch);
+                        }
+                    } else { // 4 bytes? (need surrogates on output)
                         ch2 = quads[ix >> 2];
                         byteIx = (ix & 3);
                         ch2 = (ch2 >> ((3 - byteIx) << 3));

diff --git a/src/main/java/com/fasterxml/jackson/core/json/async/NonBlockingUtf8JsonParserBase.java b/src/main/java/com/fasterxml/jackson/core/json/async/NonBlockingUtf8JsonParserBase.java
@@ -2828,7 +2828,12 @@ private final boolean _decodeSplitUTF8_3(int prev, int prevCount, int next)
         if ((next & 0xC0) != 0x080) {
             _reportInvalidOther(next & 0xFF, _inputPtr);
         }
-        _textBuffer.append((char) ((prev << 6) | (next & 0x3F)));
+        int c = (prev << 6) | (next & 0x3F);
+        // [jackson-core#363]: Surrogates (0xD800 - 0xDFFF) are illegal in UTF-8
+        if (c >= 0xD800 && c <= 0xDFFF) {
+            _reportInvalidUTF8Surrogate(c);
+        }
+        _textBuffer.append((char) c);
         return true;
     }
 
@@ -2974,7 +2979,12 @@ private final int _decodeUTF8_3(int c, int d, int e) throws IOException
         if ((e & 0xC0) != 0x080) {
             _reportInvalidOther(e & 0xFF, _inputPtr);
         }
-        return (c << 6) | (e & 0x3F);
+        c = (c << 6) | (e & 0x3F);
+        // [jackson-core#363]: Surrogates (0xD800 - 0xDFFF) are illegal in UTF-8
+        if (c >= 0xD800 && c <= 0xDFFF) {
+            _reportInvalidUTF8Surrogate(c);
+        }
+        return c;
     }
 
     // @return Character value <b>minus 0x10000</c>; this so that caller

diff --git a/src/test/java/com/fasterxml/jackson/core/read/UTF8SurrogateValidation363Test.java b/src/test/java/com/fasterxml/jackson/core/read/UTF8SurrogateValidation363Test.java
@@ -0,0 +1,194 @@
+package com.fasterxml.jackson.core.read;
+
+import org.junit.jupiter.api.Test;
+
+import com.fasterxml.jackson.core.*;
+import com.fasterxml.jackson.core.exc.StreamReadException;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.fail;
+
+/**
+ * Tests for [jackson-core#363]: UTF-8 parser should reject 3-byte UTF-8 sequences
+ * that encode surrogate code points (U+D800 to U+DFFF), which are illegal in UTF-8.
+ */
+class UTF8SurrogateValidation363Test extends JUnit5TestBase
+{
+    private final JsonFactory FACTORY = newStreamFactory();
+
+    /**
+     * Test that parser rejects 3-byte UTF-8 sequence encoding U+D800 (start of surrogate range).
+     * In UTF-8, U+D800 would be encoded as: ED A0 80
+     */
+    @Test
+    void rejectSurrogateD800InString() throws Exception
+    {
+        // JSON: {"value":"X"}
+        // where X is the invalid 3-byte sequence ED A0 80 (U+D800)
+        byte[] doc = new byte[] {
+            '{', '"', 'v', 'a', 'l', 'u', 'e', '"', ':',
+            '"',
+            (byte) 0xED, (byte) 0xA0, (byte) 0x80, // Invalid: U+D800 surrogate
+            '"',
+            '}'
+        };
+
+        try (JsonParser p = FACTORY.createParser(doc)) {
+            assertToken(JsonToken.START_OBJECT, p.nextToken());
+            assertToken(JsonToken.FIELD_NAME, p.nextToken());
+            assertEquals("value", p.currentName());
+
+            // Token is reported first; the failure is expected from getText() below
+            assertToken(JsonToken.VALUE_STRING, p.nextToken());
+            p.getText(); // Actual parsing happens here (lazy parsing)
+            fail("Should have thrown an exception for surrogate code point in UTF-8");
+        } catch (StreamReadException e) {
+            verifyException(e, "Invalid UTF-8");
+        }
+    }
+
+    /**
+     * Test that parser rejects 3-byte UTF-8 sequence encoding U+DFFF (end of surrogate range).
+     * In UTF-8, U+DFFF would be encoded as: ED BF BF
+     */
+    @Test
+    void rejectSurrogateDFFFInString() throws Exception
+    {
+        // JSON: {"value":"X"}
+        // where X is the invalid 3-byte sequence ED BF BF (U+DFFF)
+        byte[] doc = new byte[] {
+            '{', '"', 'v', 'a', 'l', 'u', 'e', '"', ':',
+            '"',
+            (byte) 0xED, (byte) 0xBF, (byte) 0xBF, // Invalid: U+DFFF surrogate
+            '"',
+            '}'
+        };
+
+        try (JsonParser p = FACTORY.createParser(doc)) {
+            assertToken(JsonToken.START_OBJECT, p.nextToken());
+            assertToken(JsonToken.FIELD_NAME, p.nextToken());
+            assertEquals("value", p.currentName());
+
+            // Token is reported first; the failure is expected from getText() below
+            assertToken(JsonToken.VALUE_STRING, p.nextToken());
+            p.getText(); // Actual parsing happens here (lazy parsing)
+            fail("Should have thrown an exception for surrogate code point in UTF-8");
+        } catch (StreamReadException e) {
+            verifyException(e, "Invalid UTF-8");
+        }
+    }
+
+    /**
+     * Test that parser rejects 3-byte UTF-8 sequence encoding U+DABC (middle of surrogate range).
+     * In UTF-8, U+DABC would be encoded as: ED AA BC
+     */
+    @Test
+    void rejectSurrogateMiddleInString() throws Exception
+    {
+        // JSON: {"value":"X"}
+        // where X is the invalid 3-byte sequence ED AA BC (U+DABC)
+        byte[] doc = new byte[] {
+            '{', '"', 'v', 'a', 'l', 'u', 'e', '"', ':',
+            '"',
+            (byte) 0xED, (byte) 0xAA, (byte) 0xBC, // Invalid: U+DABC surrogate
+            '"',
+            '}'
+        };
+
+        try (JsonParser p = FACTORY.createParser(doc)) {
+            assertToken(JsonToken.START_OBJECT, p.nextToken());
+            assertToken(JsonToken.FIELD_NAME, p.nextToken());
+            assertEquals("value", p.currentName());
+
+            // Token is reported first; the failure is expected from getText() below
+            assertToken(JsonToken.VALUE_STRING, p.nextToken());
+            p.getText(); // Actual parsing happens here (lazy parsing)
+            fail("Should have thrown an exception for surrogate code point in UTF-8");
+        } catch (StreamReadException e) {
+            verifyException(e, "Invalid UTF-8");
+        }
+    }
+
+    /**
+     * Test that parser rejects surrogate (U+D800, bytes ED A0 80) in field name as well.
+     */
+    @Test
+    void rejectSurrogateInFieldName() throws Exception
+    {
+        // JSON: {"X":"value"}
+        // where X is the invalid 3-byte sequence ED A0 80 (U+D800)
+        byte[] doc = new byte[] {
+            '{', '"',
+            (byte) 0xED, (byte) 0xA0, (byte) 0x80, // Invalid: U+D800 surrogate
+            '"', ':', '"', 'v', 'a', 'l', 'u', 'e', '"',
+            '}'
+        };
+
+        try (JsonParser p = FACTORY.createParser(doc)) {
+            assertToken(JsonToken.START_OBJECT, p.nextToken());
+
+            // This should fail when trying to read the field name
+            // (no lazy parsing for names)
+            assertToken(JsonToken.FIELD_NAME, p.nextToken());
+            fail("Should have thrown an exception for surrogate code point in UTF-8");
+        } catch (StreamReadException e) {
+            verifyException(e, "Invalid UTF-8");
+        }
+    }
+
+    /**
+     * Sanity check: valid 3-byte UTF-8 sequences just before surrogate range should work.
+     * U+D7FF is the last valid code point before the surrogate range.
+     * In UTF-8: ED 9F BF
+     */
+    @Test
+    void acceptValidBeforeSurrogateRange() throws Exception
+    {
+        // JSON: {"value":"X"}
+        // where X is the valid 3-byte sequence ED 9F BF (U+D7FF)
+        byte[] doc = new byte[] {
+            '{', '"', 'v', 'a', 'l', 'u', 'e', '"', ':',
+            '"',
+            (byte) 0xED, (byte) 0x9F, (byte) 0xBF, // Valid: U+D7FF (just before surrogates)
+            '"',
+            '}'
+        };
+
+        try (JsonParser p = FACTORY.createParser(doc)) {
+            assertToken(JsonToken.START_OBJECT, p.nextToken());
+            assertToken(JsonToken.FIELD_NAME, p.nextToken());
+            assertEquals("value", p.currentName());
+            assertToken(JsonToken.VALUE_STRING, p.nextToken());
+            assertEquals("\uD7FF", p.getText());
+            assertToken(JsonToken.END_OBJECT, p.nextToken());
+        }
+    }
+
+    /**
+     * Sanity check: valid 3-byte UTF-8 sequences just after surrogate range should work.
+     * U+E000 is the first valid code point after the surrogate range.
+     * In UTF-8: EE 80 80
+     */
+    @Test
+    void acceptValidAfterSurrogateRange() throws Exception
+    {
+        // JSON: {"value":"X"}
+        // where X is the valid 3-byte sequence EE 80 80 (U+E000)
+        byte[] doc = new byte[] {
+            '{', '"', 'v', 'a', 'l', 'u', 'e', '"', ':',
+            '"',
+            (byte) 0xEE, (byte) 0x80, (byte) 0x80, // Valid: U+E000 (just after surrogates)
+            '"',
+            '}'
+        };
+
+        try (JsonParser p = FACTORY.createParser(doc)) {
+            assertToken(JsonToken.START_OBJECT, p.nextToken());
+            assertToken(JsonToken.FIELD_NAME, p.nextToken());
+            assertEquals("value", p.currentName());
+            assertToken(JsonToken.VALUE_STRING, p.nextToken());
+            assertEquals("\uE000", p.getText());
+            assertToken(JsonToken.END_OBJECT, p.nextToken());
+        }
+    }
+}