NVIDIA · wjxiz1992 · May 25, 2026 · May 28, 2026
diff --git a/integration_tests/src/main/python/regexp_test.py b/integration_tests/src/main/python/regexp_test.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2022-2025, NVIDIA CORPORATION.
+# Copyright (c) 2022-2026, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -617,22 +617,64 @@ def test_regexp_choice():
         conf=_regexp_conf)
 
 def test_regexp_hexadecimal_digits():
+    # NOTE: supplementary-codepoint hex escapes (cp > U+FFFF) are
+    # exercised by the `test_rlike_supplementary_codepoint_fallback_issue_14744`
+    # / `test_regexp_replace_supplementary_codepoint_fallback_issue_14744`
+    # tests below -- they now fall back to CPU because cuDF's regex JNI
+    # cannot represent them. Keeping them out of this projection avoids
+    # the mixed GPU/CPU `ProjectExec` execution path. See
+    # NVIDIA/spark-rapids#14744.
     gen = mk_str_gen(
-        '[abcd]\\\\x00\\\\x7f\\\\x80\\\\xff\\\\x{10ffff}\\\\x{00eeee}[\\\\xa0-\\\\xb0][abcd]')
+        '[abcd]\\\\x00\\\\x7f\\\\x80\\\\xff\\\\x{0000ffff}\\\\x{00eeee}[\\\\xa0-\\\\xb0][abcd]')
     assert_gpu_and_cpu_are_equal_collect(
             lambda spark: unary_op_df(spark, gen).selectExpr(
                 'rlike(a, "\\\\x7f")',
                 'rlike(a, "\\\\x80")',
                 'rlike(a, "[\\\\xa0-\\\\xf0]")',
                 'rlike(a, "\\\\x{00eeee}")',
+                'rlike(a, "\\\\x{0000ffff}")',
                 'regexp_extract(a, "([a-d]+)\\\\xa0([a-d]+)", 1)',
                 'regexp_extract(a, "([a-d]+)[\\\\xa0\nabcd]([a-d]+)", 1)',
                 'regexp_replace(a, "\\\\xff", "@")',
                 'regexp_replace(a, "[\\\\xa0-\\\\xb0]", "@")',
-                'regexp_replace(a, "\\\\x{10ffff}", "@")',
             ),
         conf=_regexp_conf)
 
+@allow_non_gpu('ProjectExec', 'RLike')
+@pytest.mark.parametrize('pattern', [
+    '\\\\x{1F600}',    # grinning face emoji
+    '\\\\x{10000}',    # lowest supplementary codepoint
+    '\\\\x{10FFFF}',   # highest valid Unicode codepoint
+    'a\\\\x{1F600}b',  # supplementary codepoint embedded in a literal
+])
+def test_rlike_supplementary_codepoint_fallback_issue_14744(pattern):
+    # Issue NVIDIA/spark-rapids#14744 regression coverage. Before the
+    # fix, hex escapes for supplementary codepoints (cp > U+FFFF) were
+    # silently truncated via `.toChar` on the GPU -- e.g. the pattern
+    # `\x{1F600}` (grinning face 😀) was rewritten as `U+F600` (a
+    # private-use BMP codepoint), so the GPU matched the wrong
+    # character. The transpiler now throws `RegexUnsupportedException`
+    # for these patterns, causing spark-rapids to fall back to the CPU
+    # regex engine -- which Java's `Pattern` handles correctly.
+    gen = mk_str_gen('[abcd]\\\\x{1F600}\\\\x{10000}\\\\x{10FFFF}[abcd]')
+    assert_gpu_fallback_collect(
+            lambda spark: unary_op_df(spark, gen).selectExpr(
+                'rlike(a, "{}")'.format(pattern)),
+            'RLike',
+        conf=_regexp_conf)
+
+@allow_non_gpu('ProjectExec', 'RegExpReplace')
+def test_regexp_replace_supplementary_codepoint_fallback_issue_14744():
+    # Companion to test_rlike_supplementary_codepoint_fallback_issue_14744 but
+    # for regexp_replace -- same root cause, same CPU fallback path,
+    # different meta. See NVIDIA/spark-rapids#14744.
+    gen = mk_str_gen('[abcd]\\\\x{1F600}\\\\x{10000}\\\\x{10FFFF}[abcd]')
+    assert_gpu_fallback_collect(
+            lambda spark: unary_op_df(spark, gen).selectExpr(
+                'regexp_replace(a, "\\\\x{1F600}", "@")'),
+            'RegExpReplace',
+        conf=_regexp_conf)
+
 def test_regexp_whitespace():
     gen = mk_str_gen('\u001e[abcd]\t\n{1,3} [0-9]\n {1,3}\x0b\t[abcd]\r\f[0-9]{0,10}')
     assert_gpu_and_cpu_are_equal_collect(

diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala
@@ -208,12 +208,24 @@ class RegexParser(pattern: String) {
           val hexChar = parseHexDigit
           hexChar.codePoint match {
             case 0 => hexChar
+            case codePoint if codePoint > 0xFFFF =>
+              // cuDF's regex JNI surface cannot represent supplementary
+              // codepoints (cp > U+FFFF); previously these were silently
+              // truncated to their low 16 bits via `.toChar`, producing
+              // wrong matches. See NVIDIA/spark-rapids#14744.
+              throw new RegexUnsupportedException(
+                "cuDF does not support supplementary codepoints (cp > U+FFFF) " +
+                "in hex escapes; falling back to CPU", hexChar.position)
             case codePoint => RegexChar(codePoint.toChar)
           }
         case Some('0') =>
           val octalChar = parseOctalDigit
           octalChar.codePoint match {
             case 0 => RegexHexDigit("00")
+            case codePoint if codePoint > 0xFFFF =>
+              throw new RegexUnsupportedException(
+                "cuDF does not support supplementary codepoints (cp > U+FFFF) " +
+                "in octal escapes; falling back to CPU", octalChar.position)
             case codePoint => RegexChar(codePoint.toChar)
           }
         case Some(ch) =>
@@ -1129,6 +1141,14 @@ class CudfRegexTranspiler(mode: RegexMode) {
 
         if (regexMetaChars.map(_.toInt).contains(r.codePoint)) {
           RegexEscaped(r.codePoint.toChar)
+        } else if (r.codePoint > 0xFFFF) {
+          // cuDF's regex JNI cannot represent supplementary codepoints
+          // (cp > U+FFFF); previously these were silently truncated via
+          // `.toChar`, producing wrong matches.
+          // See NVIDIA/spark-rapids#14744.
+          throw new RegexUnsupportedException(
+            "cuDF does not support supplementary codepoints (cp > U+FFFF) " +
+            "in octal escapes; falling back to CPU", r.position)
         } else if(r.codePoint >= 128) {
           RegexChar(r.codePoint.toChar)
         } else {
@@ -1138,6 +1158,13 @@ class CudfRegexTranspiler(mode: RegexMode) {
       case r @ RegexHexDigit(_) =>
         if (regexMetaChars.map(_.toInt).contains(r.codePoint)) {
           RegexEscaped(r.codePoint.toChar)
+        } else if (r.codePoint > 0xFFFF) {
+          // See NVIDIA/spark-rapids#14744 — supplementary codepoints
+          // cannot round-trip through cuDF's regex JNI, so we fall back
+          // to CPU rather than emit a truncated `.toChar` match.
+          throw new RegexUnsupportedException(
+            "cuDF does not support supplementary codepoints (cp > U+FFFF) " +
+            "in hex escapes; falling back to CPU", r.position)
         } else if (r.codePoint >= 128) {
           // cuDF only supports 0x00 to 0x7f hexidecimal chars
           RegexChar(r.codePoint.toChar)

diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala
@@ -236,11 +236,40 @@ class RegularExpressionTranspilerSuite extends AnyFunSuite {
 
   test("hex digits - find") {
     val patterns = Seq(raw"\x07", raw"\x3f", raw"\x7F", raw"\x7f", raw"\x{7}", raw"\x{0007f}",
-      raw"\x80", raw"\xff", raw"\x{0008f}", raw"\x{10FFFF}", raw"\x{00eeee}")
+      raw"\x80", raw"\xff", raw"\x{0008f}", raw"\x{00eeee}")
+    // NOTE: supplementary codepoints (cp > U+FFFF) such as `\x{1F600}` are
+    // covered by `issue-14744:` below -- they now throw and fall back to CPU
+    // because cuDF's regex JNI cannot represent them.
     assertCpuGpuMatchesRegexpFind(patterns, Seq("", "\u0007", "a\u0007b",
         "\u0007\u003f\u007f", "\u0080", "a\u00fe\u00ffb", "ab\ueeeecd"))
   }
 
+  test("issue-14744: supplementary codepoint hex/octal escapes fall back to CPU") {
+    // Before the fix, `\x{1F600}` (U+1F600 grinning-face emoji) was
+    // silently truncated to `RegexChar(0x1F600.toChar)` = `RegexChar(U+F600)`,
+    // making the GPU match U+F600 instead of the supplementary codepoint.
+    // The transpiler now throws RegexUnsupportedException for any hex/octal
+    // escape whose codepoint exceeds U+FFFF, so spark-rapids falls back to
+    // the CPU regex engine (which Java's Pattern handles correctly).
+    val supplementaryHexPatterns = Seq(
+      raw"\x{10000}",             // lowest supplementary codepoint
+      raw"\x{1F600}",             // grinning face emoji
+      raw"\x{10FFFF}",            // highest valid Unicode codepoint
+      raw"a\x{1F600}b",           // embedded in a longer pattern
+      raw"[\x{1F600}abc]",        // inside a character class
+      raw"[\x{10000}-\x{10FFFF}]" // range with supplementary endpoints
+    )
+    supplementaryHexPatterns.foreach { p =>
+      assertUnsupported(p, RegexFindMode,
+        "cuDF does not support supplementary codepoints")
+    }
+
+    // BMP boundary U+FFFF must still transpile and match U+FFFF inputs (regression guard).
+    assertCpuGpuMatchesRegexpFind(
+      Seq(raw"\x{FFFF}", raw"\xff"),
+      Seq("", "a", "x\uffffy", "xÿy", "\uffff", "ÿ"))
+  }
+
   test("hex digit character classes") {
     val patterns = Seq(raw"[\x02]", raw"[\x2c]", raw"[\x7f]", raw"[\x80]", raw"[\x01-\xff]",
       raw"[a-\xff]", raw"[\x20-z]")