Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 45 additions & 3 deletions integration_tests/src/main/python/regexp_test.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2022-2025, NVIDIA CORPORATION.
# Copyright (c) 2022-2026, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -617,22 +617,64 @@ def test_regexp_choice():
conf=_regexp_conf)

def test_regexp_hexadecimal_digits():
    # Verifies that hex escapes whose codepoint fits in the BMP (cp <= U+FFFF)
    # produce identical results on CPU and GPU for rlike, regexp_extract and
    # regexp_replace.
    #
    # NOTE: supplementary-codepoint hex escapes (cp > U+FFFF) are
    # exercised by the `test_rlike_supplementary_codepoint_fallback_issue_14744`
    # / `test_regexp_replace_supplementary_codepoint_fallback_issue_14744`
    # tests below -- they now fall back to CPU because cuDF's regex JNI
    # cannot represent them. Keeping them out of this projection avoids
    # the mixed GPU/CPU `ProjectExec` execution path. See
    # NVIDIA/spark-rapids#14744.
    #
    # The generator pattern must be a single literal: two adjacent string
    # literals would be implicitly concatenated by Python into one (doubled)
    # pattern, so only the BMP-only variant is kept here.
    gen = mk_str_gen(
        '[abcd]\\\\x00\\\\x7f\\\\x80\\\\xff\\\\x{0000ffff}\\\\x{00eeee}[\\\\xa0-\\\\xb0][abcd]')
    assert_gpu_and_cpu_are_equal_collect(
        lambda spark: unary_op_df(spark, gen).selectExpr(
            'rlike(a, "\\\\x7f")',
            'rlike(a, "\\\\x80")',
            'rlike(a, "[\\\\xa0-\\\\xf0]")',
            'rlike(a, "\\\\x{00eeee}")',
            # Highest BMP codepoint: must still run on the GPU.
            'rlike(a, "\\\\x{0000ffff}")',
            'regexp_extract(a, "([a-d]+)\\\\xa0([a-d]+)", 1)',
            'regexp_extract(a, "([a-d]+)[\\\\xa0\nabcd]([a-d]+)", 1)',
            'regexp_replace(a, "\\\\xff", "@")',
            'regexp_replace(a, "[\\\\xa0-\\\\xb0]", "@")',
        ),
        conf=_regexp_conf)

@allow_non_gpu('ProjectExec', 'RLike')
@pytest.mark.parametrize('pattern', [
    '\\\\x{1F600}',    # grinning face emoji
    '\\\\x{10000}',    # lowest supplementary codepoint
    '\\\\x{10FFFF}',   # highest valid Unicode codepoint
    'a\\\\x{1F600}b',  # supplementary codepoint embedded in a literal
])
def test_rlike_supplementary_codepoint_fallback_issue_14744(pattern):
    # Regression coverage for NVIDIA/spark-rapids#14744.
    #
    # Hex escapes for supplementary codepoints (cp > U+FFFF) used to be
    # silently truncated via `.toChar` on the GPU: the pattern `\x{1F600}`
    # (U+1F600, grinning face) was rewritten as U+F600, a private-use BMP
    # codepoint, so the GPU matched the wrong character. The transpiler now
    # throws `RegexUnsupportedException` for these patterns, which makes
    # spark-rapids fall back to the CPU regex engine (Java's `Pattern`),
    # which handles them correctly.
    gen = mk_str_gen('[abcd]\\\\x{1F600}\\\\x{10000}\\\\x{10FFFF}[abcd]')
    rlike_expr = 'rlike(a, "{}")'.format(pattern)

    def project_rlike(spark):
        return unary_op_df(spark, gen).selectExpr(rlike_expr)

    assert_gpu_fallback_collect(project_rlike, 'RLike', conf=_regexp_conf)

@allow_non_gpu('ProjectExec', 'RegExpReplace')
def test_regexp_replace_supplementary_codepoint_fallback_issue_14744():
    # regexp_replace counterpart of
    # test_rlike_supplementary_codepoint_fallback_issue_14744: the root cause
    # and the CPU fallback path are the same, only the expression meta
    # differs. See NVIDIA/spark-rapids#14744.
    gen = mk_str_gen('[abcd]\\\\x{1F600}\\\\x{10000}\\\\x{10FFFF}[abcd]')
    replace_expr = 'regexp_replace(a, "\\\\x{1F600}", "@")'

    def project_replace(spark):
        return unary_op_df(spark, gen).selectExpr(replace_expr)

    assert_gpu_fallback_collect(project_replace, 'RegExpReplace', conf=_regexp_conf)

def test_regexp_whitespace():
gen = mk_str_gen('\u001e[abcd]\t\n{1,3} [0-9]\n {1,3}\x0b\t[abcd]\r\f[0-9]{0,10}')
assert_gpu_and_cpu_are_equal_collect(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -208,12 +208,24 @@ class RegexParser(pattern: String) {
val hexChar = parseHexDigit
hexChar.codePoint match {
case 0 => hexChar
case codePoint if codePoint > 0xFFFF =>
// cuDF's regex JNI surface cannot represent supplementary
// codepoints (cp > U+FFFF); previously these were silently
// truncated to their low 16 bits via `.toChar`, producing
// wrong matches. See NVIDIA/spark-rapids#14744.
throw new RegexUnsupportedException(
"cuDF does not support supplementary codepoints (cp > U+FFFF) " +
"in hex escapes; falling back to CPU", hexChar.position)
case codePoint => RegexChar(codePoint.toChar)
}
case Some('0') =>
val octalChar = parseOctalDigit
octalChar.codePoint match {
case 0 => RegexHexDigit("00")
case codePoint if codePoint > 0xFFFF =>
throw new RegexUnsupportedException(
"cuDF does not support supplementary codepoints (cp > U+FFFF) " +
"in octal escapes; falling back to CPU", octalChar.position)
case codePoint => RegexChar(codePoint.toChar)
}
case Some(ch) =>
Expand Down Expand Up @@ -1129,6 +1141,14 @@ class CudfRegexTranspiler(mode: RegexMode) {

if (regexMetaChars.map(_.toInt).contains(r.codePoint)) {
RegexEscaped(r.codePoint.toChar)
} else if (r.codePoint > 0xFFFF) {
// cuDF's regex JNI cannot represent supplementary codepoints
// (cp > U+FFFF); previously these were silently truncated via
// `.toChar`, producing wrong matches.
// See NVIDIA/spark-rapids#14744.
throw new RegexUnsupportedException(
"cuDF does not support supplementary codepoints (cp > U+FFFF) " +
"in octal escapes; falling back to CPU", r.position)
} else if(r.codePoint >= 128) {
RegexChar(r.codePoint.toChar)
} else {
Expand All @@ -1138,6 +1158,13 @@ class CudfRegexTranspiler(mode: RegexMode) {
case r @ RegexHexDigit(_) =>
if (regexMetaChars.map(_.toInt).contains(r.codePoint)) {
RegexEscaped(r.codePoint.toChar)
} else if (r.codePoint > 0xFFFF) {
// See NVIDIA/spark-rapids#14744 -- supplementary codepoints
// cannot round-trip through cuDF's regex JNI, so we fall back
// to CPU rather than emit a truncated `.toChar` match.
throw new RegexUnsupportedException(
"cuDF does not support supplementary codepoints (cp > U+FFFF) " +
"in hex escapes; falling back to CPU", r.position)
} else if (r.codePoint >= 128) {
// cuDF only supports 0x00 to 0x7f hexadecimal chars
RegexChar(r.codePoint.toChar)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -236,11 +236,40 @@ class RegularExpressionTranspilerSuite extends AnyFunSuite {

test("hex digits - find") {
  // Hex escapes whose codepoint fits in the BMP must give identical find
  // results on CPU and GPU.
  // NOTE: supplementary codepoints (cp > U+FFFF) such as `\x{1F600}` are
  // covered by `issue-14744:` below -- they now throw and fall back to CPU
  // because cuDF's regex JNI cannot represent them, so none may appear in
  // this pattern list.
  val patterns = Seq(raw"\x07", raw"\x3f", raw"\x7F", raw"\x7f", raw"\x{7}", raw"\x{0007f}",
    raw"\x80", raw"\xff", raw"\x{0008f}", raw"\x{00eeee}")
  assertCpuGpuMatchesRegexpFind(patterns, Seq("", "\u0007", "a\u0007b",
    "\u0007\u003f\u007f", "\u0080", "a\u00fe\u00ffb", "ab\ueeeecd"))
}

test("issue-14744: supplementary codepoint hex/octal escapes fall back to CPU") {
  // Before the fix, `\x{1F600}` (U+1F600, grinning-face emoji) was silently
  // truncated to `RegexChar(0x1F600.toChar)`, i.e. the private-use character
  // U+F600, making the GPU match U+F600 instead of the supplementary
  // codepoint. The transpiler now throws RegexUnsupportedException for any
  // hex/octal escape whose codepoint exceeds U+FFFF, so spark-rapids falls
  // back to the CPU regex engine (Java's Pattern handles these correctly).
  // NOTE(review): only hex patterns are exercised here; no octal pattern is
  // listed.
  val supplementaryHexPatterns = Seq(
    raw"\x{10000}",             // lowest supplementary codepoint
    raw"\x{1F600}",             // grinning face emoji
    raw"\x{10FFFF}",            // highest valid Unicode codepoint
    raw"a\x{1F600}b",           // embedded in a longer pattern
    raw"[\x{1F600}abc]",        // inside a character class
    raw"[\x{10000}-\x{10FFFF}]" // range with supplementary endpoints
  )
  supplementaryHexPatterns.foreach { p =>
    assertUnsupported(p, RegexFindMode,
      "cuDF does not support supplementary codepoints")
  }

  // BMP boundary U+FFFF must continue to transpile (regression guard).
  // The inputs are written as unicode escapes so that U+FFFF and U+00FF
  // survive any source-encoding round trip unchanged.
  assertCpuGpuMatchesRegexpFind(
    Seq(raw"\x{FFFF}", raw"\xff"),
    Seq("", "a", "x\uFFFFy", "x\u00ffy", "\uFFFF", "\u00ff"))
}

test("hex digit character classes") {
val patterns = Seq(raw"[\x02]", raw"[\x2c]", raw"[\x7f]", raw"[\x80]", raw"[\x01-\xff]",
raw"[a-\xff]", raw"[\x20-z]")
Expand Down