From 388f79fb44ae4d332112a497ee24303e2c5df9f4 Mon Sep 17 00:00:00 2001 From: Allen Xu Date: Mon, 25 May 2026 12:20:39 +0800 Subject: [PATCH 1/2] [BUG] Fix regex transpiler truncating supplementary codepoints in \x{...} and \0NNN escapes (#14744) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Hex escapes (\x{NNNN}) and octal escapes (\0NNN) for supplementary codepoints (cp > U+FFFF) were silently truncated to their low 16 bits via `.toChar` in the RegexParser / CudfRegexTranspiler. For example, the pattern `\x{1F600}` (grinning face 😀) was rewritten to `RegexChar(0xF600.toChar)` — a BMP private-use codepoint — so the GPU matched the wrong character. cuDF's regex JNI cannot represent supplementary codepoints natively (see the deviation note below), so this PR makes the parser and transpiler throw `RegexUnsupportedException` for any hex/octal escape whose codepoint exceeds U+FFFF. spark-rapids then falls back to the CPU regex engine, which Java's `Pattern` handles correctly. This guarantees GPU == CPU parity (the supreme rule from CLAUDE.md) at the cost of one CPU fallback per affected pattern. Patterns containing only BMP codepoints (cp <= U+FFFF) are unaffected. Four throw sites in `RegexParser.scala`: 1. `parseHex` codepoint > 0xFFFF 2. `parseOctal` codepoint > 0xFFFF 3. `CudfRegexTranspiler.rewrite` — `RegexOctalChar` arm 4. `CudfRegexTranspiler.rewrite` — `RegexHexDigit` arm Deviation from suggested fix: the issue proposed encoding supplementary codepoints as multi-byte UTF-8 sequences at the AST level. That does not work end-to-end because cuDF's regex JNI consumes Unicode codepoints (not raw UTF-8 bytes), so a synthesized byte sequence still fails to match the actual supplementary codepoint in the data column. The truncation symptom would be replaced by a different wrong-match symptom (GPU matches nothing instead of matching U+F600 by accident). 
The CPU-fallback path used here is the same contract spark-rapids uses for every other unsupported regex feature. Tests: * Scala: `RegularExpressionTranspilerSuite` -> "issue-14744: supplementary codepoint hex/octal escapes fall back to CPU" asserts `RegexUnsupportedException` for six representative patterns (`\x{10000}`, `\x{1F600}`, `\x{10FFFF}`, embedded, in character class, in range) and adds a regression guard for the BMP boundary U+FFFF. * Python IT: `regexp_test.py` adds `test_rlike_supplementary_codepoint_fallback_issue_14744` (4 patterns parametrized) and `test_regexp_replace_supplementary_codepoint_fallback_issue_14744` using `assert_gpu_fallback_collect`. Updates `test_regexp_hexadecimal_digits` to use `\x{0000ffff}` so its projection still runs fully on GPU. Local validation: * mvn package -pl tests -am -Dbuildver=330 -DwildcardSuites=com.nvidia.spark.rapids.RegularExpressionTranspilerSuite -> Tests: succeeded 95, failed 0, canceled 6, ignored 0, pending 0 * run_pyspark_from_build.sh -k 'supplementary_codepoint_fallback_issue_14744 or test_regexp_hexadecimal_digits' -> 6 passed, 39867 deselected, 8 warnings in 13.34s * spark-shell end-to-end repro on the patched dist JAR: GPU == CPU == [true, false, true] for inputs ["😀", "a", "hello 😀 world"] matched against the pattern `\x{1F600}`. Closes #14744 Contributes to #14733 Co-Authored-By: Claude Opus 4.7 (1M context) Signed-off-by: Allen Xu --- .../src/main/python/regexp_test.py | 51 +++++++++++++++++-- .../com/nvidia/spark/rapids/RegexParser.scala | 27 ++++++++++ .../RegularExpressionTranspilerSuite.scala | 31 ++++++++++- 3 files changed, 105 insertions(+), 4 deletions(-) diff --git a/integration_tests/src/main/python/regexp_test.py b/integration_tests/src/main/python/regexp_test.py index 07864aca56a..ecb24628683 100644 --- a/integration_tests/src/main/python/regexp_test.py +++ b/integration_tests/src/main/python/regexp_test.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2025, NVIDIA CORPORATION. 
+# Copyright (c) 2022-2026, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -617,22 +617,67 @@ def test_regexp_choice(): conf=_regexp_conf) def test_regexp_hexadecimal_digits(): + # NOTE: supplementary-codepoint hex escapes (cp > U+FFFF) are + # exercised by the `test_rlike_supplementary_codepoint_fallback_issue_14744` + # / `test_regexp_replace_supplementary_codepoint_fallback_issue_14744` + # tests below -- they now fall back to CPU because cuDF's regex JNI + # cannot represent them. Keeping them out of this projection avoids + # the mixed GPU/CPU `ProjectExec` execution path. See + # NVIDIA/spark-rapids#14744. gen = mk_str_gen( - '[abcd]\\\\x00\\\\x7f\\\\x80\\\\xff\\\\x{10ffff}\\\\x{00eeee}[\\\\xa0-\\\\xb0][abcd]') + '[abcd]\\\\x00\\\\x7f\\\\x80\\\\xff\\\\x{0000ffff}\\\\x{00eeee}[\\\\xa0-\\\\xb0][abcd]') assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, gen).selectExpr( 'rlike(a, "\\\\x7f")', 'rlike(a, "\\\\x80")', 'rlike(a, "[\\\\xa0-\\\\xf0]")', 'rlike(a, "\\\\x{00eeee}")', + 'rlike(a, "\\\\x{0000ffff}")', 'regexp_extract(a, "([a-d]+)\\\\xa0([a-d]+)", 1)', 'regexp_extract(a, "([a-d]+)[\\\\xa0\nabcd]([a-d]+)", 1)', 'regexp_replace(a, "\\\\xff", "@")', 'regexp_replace(a, "[\\\\xa0-\\\\xb0]", "@")', - 'regexp_replace(a, "\\\\x{10ffff}", "@")', ), conf=_regexp_conf) +@allow_non_gpu('ProjectExec', 'RLike') +@pytest.mark.parametrize('pattern', [ + '\\\\x{1F600}', # grinning face emoji + '\\\\x{10000}', # lowest supplementary codepoint + '\\\\x{10FFFF}', # highest valid Unicode codepoint + 'a\\\\x{1F600}b', # supplementary codepoint embedded in a literal +]) +def test_rlike_supplementary_codepoint_fallback_issue_14744(pattern): + # Issue NVIDIA/spark-rapids#14744 regression coverage. Before the + # fix, hex escapes for supplementary codepoints (cp > U+FFFF) were + # silently truncated via `.toChar` on the GPU -- e.g. 
the pattern + # `\x{1F600}` (grinning face πŸ˜€) was rewritten as `ο˜€` (a + # private-use BMP codepoint), so the GPU matched the wrong + # character. The transpiler now throws `RegexUnsupportedException` + # for these patterns, causing spark-rapids to fall back to the CPU + # regex engine -- which Java's `Pattern` handles correctly. + # The data gen below seeds inputs that contain the actual + # supplementary codepoints we test for, so CPU matches are real + # rather than always-False. + gen = mk_str_gen('[abcd]\\\\x{1F600}\\\\x{10000}\\\\x{10FFFF}[abcd]') + assert_gpu_fallback_collect( + lambda spark: unary_op_df(spark, gen).selectExpr( + 'rlike(a, "{}")'.format(pattern)), + 'RLike', + conf=_regexp_conf) + +@allow_non_gpu('ProjectExec', 'RegExpReplace') +def test_regexp_replace_supplementary_codepoint_fallback_issue_14744(): + # Companion to test_rlike_supplementary_codepoint_fallback_issue_14744 but + # for regexp_replace -- same root cause, same CPU fallback path, + # different meta. See NVIDIA/spark-rapids#14744. 
+ gen = mk_str_gen('[abcd]\\\\x{1F600}\\\\x{10000}\\\\x{10FFFF}[abcd]') + assert_gpu_fallback_collect( + lambda spark: unary_op_df(spark, gen).selectExpr( + 'regexp_replace(a, "\\\\x{1F600}", "@")'), + 'RegExpReplace', + conf=_regexp_conf) + def test_regexp_whitespace(): gen = mk_str_gen('\u001e[abcd]\t\n{1,3} [0-9]\n {1,3}\x0b\t[abcd]\r\f[0-9]{0,10}') assert_gpu_and_cpu_are_equal_collect( diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala index ff1d858002b..1c1507ba981 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala @@ -208,12 +208,24 @@ class RegexParser(pattern: String) { val hexChar = parseHexDigit hexChar.codePoint match { case 0 => hexChar + case codePoint if codePoint > 0xFFFF => + // cuDF's regex JNI surface cannot represent supplementary + // codepoints (cp > U+FFFF); previously these were silently + // truncated to their low 16 bits via `.toChar`, producing + // wrong matches. See NVIDIA/spark-rapids#14744. 
+ throw new RegexUnsupportedException( + "cuDF does not support supplementary codepoints (cp > U+FFFF) " + + "in hex escapes; falling back to CPU", hexChar.position) case codePoint => RegexChar(codePoint.toChar) } case Some('0') => val octalChar = parseOctalDigit octalChar.codePoint match { case 0 => RegexHexDigit("00") + case codePoint if codePoint > 0xFFFF => + throw new RegexUnsupportedException( + "cuDF does not support supplementary codepoints (cp > U+FFFF) " + + "in octal escapes; falling back to CPU", octalChar.position) case codePoint => RegexChar(codePoint.toChar) } case Some(ch) => @@ -1129,6 +1141,14 @@ class CudfRegexTranspiler(mode: RegexMode) { if (regexMetaChars.map(_.toInt).contains(r.codePoint)) { RegexEscaped(r.codePoint.toChar) + } else if (r.codePoint > 0xFFFF) { + // cuDF's regex JNI cannot represent supplementary codepoints + // (cp > U+FFFF); previously these were silently truncated via + // `.toChar`, producing wrong matches. + // See NVIDIA/spark-rapids#14744. + throw new RegexUnsupportedException( + "cuDF does not support supplementary codepoints (cp > U+FFFF) " + + "in octal escapes; falling back to CPU", r.position) } else if(r.codePoint >= 128) { RegexChar(r.codePoint.toChar) } else { @@ -1138,6 +1158,13 @@ class CudfRegexTranspiler(mode: RegexMode) { case r @ RegexHexDigit(_) => if (regexMetaChars.map(_.toInt).contains(r.codePoint)) { RegexEscaped(r.codePoint.toChar) + } else if (r.codePoint > 0xFFFF) { + // See NVIDIA/spark-rapids#14744 — supplementary codepoints + // cannot round-trip through cuDF's regex JNI, so we fall back + // to CPU rather than emit a truncated `.toChar` match. 
+ throw new RegexUnsupportedException( + "cuDF does not support supplementary codepoints (cp > U+FFFF) " + + "in hex escapes; falling back to CPU", r.position) } else if (r.codePoint >= 128) { // cuDF only supports 0x00 to 0x7f hexidecimal chars RegexChar(r.codePoint.toChar) diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala index de78cf0c4b5..a5bec5ca5a9 100644 --- a/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala +++ b/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala @@ -236,11 +236,40 @@ class RegularExpressionTranspilerSuite extends AnyFunSuite { test("hex digits - find") { val patterns = Seq(raw"\x07", raw"\x3f", raw"\x7F", raw"\x7f", raw"\x{7}", raw"\x{0007f}", - raw"\x80", raw"\xff", raw"\x{0008f}", raw"\x{10FFFF}", raw"\x{00eeee}") + raw"\x80", raw"\xff", raw"\x{0008f}", raw"\x{00eeee}") + // NOTE: supplementary codepoints (cp > U+FFFF) such as `\x{1F600}` are + // covered by `issue-14744:` below -- they now throw and fall back to CPU + // because cuDF's regex JNI cannot represent them. assertCpuGpuMatchesRegexpFind(patterns, Seq("", "\u0007", "a\u0007b", "\u0007\u003f\u007f", "\u0080", "a\u00fe\u00ffb", "ab\ueeeecd")) } + test("issue-14744: supplementary codepoint hex/octal escapes fall back to CPU") { + // Before the fix, `\x{1F600}` (U+1F600 grinning-face emoji) was + // silently truncated to `RegexChar(0x1F600.toChar)` = `RegexChar('ο˜€')`, + // making the GPU match U+F600 instead of the supplementary codepoint. + // The transpiler now throws RegexUnsupportedException for any hex/octal + // escape whose codepoint exceeds U+FFFF, so spark-rapids falls back to + // the CPU regex engine (which Java's Pattern handles correctly). 
+ val supplementaryHexPatterns = Seq( + raw"\x{10000}", // lowest supplementary codepoint + raw"\x{1F600}", // grinning face emoji + raw"\x{10FFFF}", // highest valid Unicode codepoint + raw"a\x{1F600}b", // embedded in a longer pattern + raw"[\x{1F600}abc]", // inside a character class + raw"[\x{10000}-\x{10FFFF}]" // range with supplementary endpoints + ) + supplementaryHexPatterns.foreach { p => + assertUnsupported(p, RegexFindMode, + "cuDF does not support supplementary codepoints") + } + + // BMP boundary U+FFFF must continue to transpile (regression guard). + assertCpuGpuMatchesRegexpFind( + Seq(raw"\x{FFFF}", raw"\xff"), + Seq("", "a", "xοΏΏy", "xΓΏy", "οΏΏ", "ΓΏ")) + } + test("hex digit character classes") { val patterns = Seq(raw"[\x02]", raw"[\x2c]", raw"[\x7f]", raw"[\x80]", raw"[\x01-\xff]", raw"[a-\xff]", raw"[\x20-z]") From 835458394d166830814affe5beee3e84f74b1434 Mon Sep 17 00:00:00 2001 From: Allen Xu Date: Thu, 28 May 2026 17:45:22 +0800 Subject: [PATCH 2/2] docs(regexp_test): remove misleading data-gen comment (#14744) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The deleted comment claimed `mk_str_gen` seeds inputs containing actual supplementary codepoints. Python's `re` module (which `mk_str_gen` uses for the seed regex) does not support the `\x{HHHH}` syntax — only `\xHH` with exactly 2 hex digits — so the gen produces strings with literal backslash-x text rather than real codepoints. The test still validates the correct behavior (GPU falls back to CPU via RegexUnsupportedException and CPU returns the same RLike result the Java engine produces), but the "CPU matches are real" justification was factually wrong. 
Co-Authored-By: Claude Opus 4.7 (1M context) Signed-off-by: Allen Xu --- integration_tests/src/main/python/regexp_test.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/integration_tests/src/main/python/regexp_test.py b/integration_tests/src/main/python/regexp_test.py index ecb24628683..f0a4fcf5873 100644 --- a/integration_tests/src/main/python/regexp_test.py +++ b/integration_tests/src/main/python/regexp_test.py @@ -656,9 +656,6 @@ def test_rlike_supplementary_codepoint_fallback_issue_14744(pattern): # character. The transpiler now throws `RegexUnsupportedException` # for these patterns, causing spark-rapids to fall back to the CPU # regex engine -- which Java's `Pattern` handles correctly. - # The data gen below seeds inputs that contain the actual - # supplementary codepoints we test for, so CPU matches are real - # rather than always-False. gen = mk_str_gen('[abcd]\\\\x{1F600}\\\\x{10000}\\\\x{10FFFF}[abcd]') assert_gpu_fallback_collect( lambda spark: unary_op_df(spark, gen).selectExpr(