diff --git a/docs/additional-functionality/advanced_configs.md b/docs/additional-functionality/advanced_configs.md index 0ee27b9bc73..f9cca47acdc 100644 --- a/docs/additional-functionality/advanced_configs.md +++ b/docs/additional-functionality/advanced_configs.md @@ -160,7 +160,6 @@ Name | Description | Default Value | Applicable at spark.rapids.sql.reader.multithreaded.combine.waitTime|When using the multithreaded parquet or orc reader with combine mode, how long to wait, in milliseconds, for more files to finish if haven't met the size threshold. Note that this will wait this amount of time from when the last file was available, so total wait time could be larger then this.|200|Runtime spark.rapids.sql.reader.multithreaded.read.keepOrder|When using the MULTITHREADED reader, if this is set to true we read the files in the same order Spark does, otherwise the order may not be the same. Now it is supported only for parquet and orc.|true|Runtime spark.rapids.sql.regexp.enabled|Specifies whether supported regular expressions will be evaluated on the GPU. Unsupported expressions will fall back to CPU. However, there are some known edge cases that will still execute on GPU and produce incorrect results and these are documented in the compatibility guide. Setting this config to false will make all regular expressions run on the CPU instead.|true|Runtime -spark.rapids.sql.regexp.maxStateMemoryBytes|Specifies the maximum memory on GPU to be used for regular expressions.The memory usage is an estimate based on an upper-bound approximation on the complexity of the regular expression. Note that the actual memory usage may still be higher than this estimate depending on the number of rows in the datacolumn and the input strings themselves. It is recommended to not set this to more than 3 times spark.rapids.sql.batchSizeBytes|2147483647|Runtime spark.rapids.sql.replaceSortMergeJoin.enabled|Allow replacing sortMergeJoin with HashJoin|true|Runtime spark.rapids.sql.rowBasedUDF.enabled|When set to true, optimizes a row-based UDF in a GPU operation by transferring only the data it needs between GPU and CPU inside a query operation, instead of falling this operation back to CPU. This is an experimental feature, and this config might be removed in the future.|false|Runtime spark.rapids.sql.stableSort.enabled|Enable or disable stable sorting. Apache Spark's sorting is typically a stable sort, but sort stability cannot be guaranteed in distributed work loads because the order in which upstream data arrives to a task is not guaranteed. Sort stability then only matters when reading and sorting data from a file using a single task/partition. Because of limitations in the plugin when you enable stable sorting all of the data for a single task will be combined into a single batch before sorting. This currently disables spilling from GPU memory if the data size is too large.|false|Runtime diff --git a/integration_tests/run_pyspark_from_build.sh b/integration_tests/run_pyspark_from_build.sh index 2f1b96d901d..7cb63db82ac 100755 --- a/integration_tests/run_pyspark_from_build.sh +++ b/integration_tests/run_pyspark_from_build.sh @@ -331,7 +331,6 @@ else # Not the default 2G but should be large enough for a single batch for all data (we found # 200 MiB being allocated by a single test at most, and we typically have 4 tasks. export PYSP_TEST_spark_rapids_sql_batchSizeBytes='100m' - export PYSP_TEST_spark_rapids_sql_regexp_maxStateMemoryBytes='300m' export PYSP_TEST_spark_hadoop_hive_exec_scratchdir="$RUN_DIR/hive" diff --git a/integration_tests/src/main/python/regexp_test.py b/integration_tests/src/main/python/regexp_test.py index c285f708249..eb5d77b3cc2 100644 --- a/integration_tests/src/main/python/regexp_test.py +++ b/integration_tests/src/main/python/regexp_test.py @@ -598,13 +598,9 @@ def test_character_classes(): @datagen_overrides(seed=0, reason="https://github.com/NVIDIA/spark-rapids/issues/10641") def test_regexp_choice(): - # Pattern `(abc1a$|^ab2ab|a3abc)` below transpiles to 21 states; with the - # default `gpuTargetBatchSizeBytes = 1 GiB` and default - # `maxRegExpStateMemory = Integer.MAX_VALUE`, the post-#14849 estimator - # (correctly) computes ~2.25 GiB and falls back to CPU, which drags the - # whole Project to CPU and trips assertIsOnTheGpu under IT mode. - # 3 GiB matches the conf's documented "no more than 3x batchSizeBytes" - # guidance. Tracked by #14867; long-term plugin-level fix is #14887. + # These choice patterns transpile to many cuDF states (e.g. `(abc1a$|^ab2ab|a3abc)` + # is ~21 states). They run on the GPU directly now that the regex complexity gate + # has been removed (#14887). gen = mk_str_gen('[abcd]{1,3}[0-9]{1,3}[abcd]{1,3}[ \n\t\r]{0,2}') assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, gen).selectExpr( @@ -621,8 +617,7 @@ def test_regexp_choice(): 'regexp_replace(a, "[ab]$|[cd]$", "@")', 'regexp_replace(a, "[ab]+|^cd1", "@")' ), - conf={**_regexp_conf, - 'spark.rapids.sql.regexp.maxStateMemoryBytes': str(3 * 1024 * 1024 * 1024)}) + conf=_regexp_conf) def test_regexp_hexadecimal_digits(): gen = mk_str_gen( @@ -1064,52 +1059,6 @@ def test_regexp_split_unicode_support(): 'split(a, "[o]", -2)'), conf=_regexp_conf) -@allow_non_gpu('ProjectExec', 'RLike') -def test_regexp_memory_fallback(): - gen = StringGen('test') - assert_gpu_fallback_collect( - lambda spark: unary_op_df(spark, gen).selectExpr( - 'a rlike "a{6}"', - 'a rlike "a{6,}"', - 'a rlike "(?:ab){0,3}"', - 'a rlike "(?:12345)?"', - 'a rlike "(?:12345)+"', - 'a rlike "(?:123456)*"', - 'a rlike "a{1,6}"', - 'a rlike "abcdef"', - 'a rlike "(1)(2)(3)"', - 'a rlike "1|2|3|4|5|6"' - ), - cpu_fallback_class_name='RLike', - conf={ - 'spark.rapids.sql.regexp.enabled': True, - 'spark.rapids.sql.regexp.maxStateMemoryBytes': '10', - 'spark.rapids.sql.batchSizeBytes': '20' # 1 row in the batch - } - ) - -def test_regexp_memory_ok(): - gen = StringGen('test') - assert_gpu_and_cpu_are_equal_collect( - lambda spark: unary_op_df(spark, gen).selectExpr( - 'a rlike "a{6}"', - 'a rlike "a{6,}"', - 'a rlike "(?:ab){0,3}"', - 'a rlike "(?:12345)?"', - 'a rlike "(?:12345)+"', - 'a rlike "(?:123456)*"', - 'a rlike "a{1,6}"', - 'a rlike "abcdef"', - 'a rlike "(1)(2)(3)"', - 'a rlike "1|2|3|4|5|6"' - ), - conf={ - 'spark.rapids.sql.regexp.enabled': True, - 'spark.rapids.sql.regexp.maxStateMemoryBytes': '12', - 'spark.rapids.sql.batchSizeBytes': '20' # 1 row in the batch - } - ) - def test_illegal_regexp_exception(): gen = mk_str_gen('[abcdef]{0,5}') diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuRegExpReplaceMeta.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuRegExpReplaceMeta.scala index b71f4d2da79..d0645bde468 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuRegExpReplaceMeta.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuRegExpReplaceMeta.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2025, NVIDIA CORPORATION. + * Copyright (c) 2021-2026, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -69,7 +69,6 @@ class GpuRegExpReplaceMeta( replaceOpt = Some(GpuRegExpStringReplaceMulti) case _ => GpuRegExpUtils.tagForRegExpEnabled(this) - GpuRegExpUtils.validateRegExpComplexity(this, pat) cudfPattern = Some(pat.toRegexString) } } diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala index 6fec6d5f714..7021d62b18c 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala @@ -1985,16 +1985,6 @@ val GPU_COREDUMP_PIPE_PATTERN = conf("spark.rapids.gpu.coreDump.pipePattern") .booleanConf .createWithDefault(true) - val REGEXP_MAX_STATE_MEMORY_BYTES = conf("spark.rapids.sql.regexp.maxStateMemoryBytes") - .doc("Specifies the maximum memory on GPU to be used for regular expressions." + - "The memory usage is an estimate based on an upper-bound approximation on the " + - "complexity of the regular expression. Note that the actual memory usage may " + - "still be higher than this estimate depending on the number of rows in the data" + - "column and the input strings themselves. It is recommended to not set this to " + - s"more than 3 times ${GPU_BATCH_SIZE_BYTES.key}") - .bytesConf(ByteUnit.BYTE) - .createWithDefault(Integer.MAX_VALUE) - // INTERNAL TEST AND DEBUG CONFIGS val TEST_RETRY_OOM_INJECTION_MODE = conf("spark.rapids.sql.test.injectRetryOOM") @@ -3998,16 +3988,6 @@ class RapidsConf(conf: Map[String, String]) extends Logging { lazy val isRegExpEnabled: Boolean = get(ENABLE_REGEXP) - lazy val maxRegExpStateMemory: Long = { - val size = get(REGEXP_MAX_STATE_MEMORY_BYTES) - if (size > 3 * gpuTargetBatchSizeBytes) { - logWarning(s"${REGEXP_MAX_STATE_MEMORY_BYTES.key} is more than 3 times " + - s"${GPU_BATCH_SIZE_BYTES.key}. This may cause regular expression operations to " + - s"encounter GPU out of memory errors.") - } - size - } - lazy val getSparkGpuResourceName: String = get(SPARK_GPU_RESOURCE_NAME) lazy val isCpuBasedUDFEnabled: Boolean = get(ENABLE_CPU_BASED_UDF) diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexComplexityEstimator.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexComplexityEstimator.scala deleted file mode 100644 index 439ad14a254..00000000000 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexComplexityEstimator.scala +++ /dev/null @@ -1,100 +0,0 @@ -/* - * Copyright (c) 2022-2026, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.nvidia.spark.rapids - -import org.apache.spark.sql.types.DataTypes - -object RegexComplexityEstimator { - private def requireNonNegative(name: String, value: Long): Unit = { - require(value >= 0L, s"$name must be non-negative, got: $value") - } - - // Saturating arithmetic for non-negative state and memory estimates. - private def saturatedAdd(left: Long, right: Long): Long = { - requireNonNegative("left", left) - requireNonNegative("right", right) - if (Long.MaxValue - left < right) { - Long.MaxValue - } else { - left + right - } - } - - private def saturatedMultiply(left: Long, right: Long): Long = { - requireNonNegative("left", left) - requireNonNegative("right", right) - if (left == 0L || right == 0L) { - 0L - } else if (left > Long.MaxValue / right) { - Long.MaxValue - } else { - left * right - } - } - - private def countStates(regex: RegexAST): Long = { - regex match { - case RegexSequence(parts) => - parts.foldLeft(0L) { (total, part) => - saturatedAdd(total, countStates(part)) - } - case RegexGroup(true, term, _) => - saturatedAdd(1L, countStates(term)) - case RegexGroup(false, term, _) => - countStates(term) - case RegexCharacterClass(_, _) => - 1L - case RegexChoice(left, right) => - saturatedAdd(countStates(left), countStates(right)) - case RegexRepetition(term, QuantifierFixedLength(length)) => - saturatedMultiply(length.toLong, countStates(term)) - case RegexRepetition(term, SimpleQuantifier(ch)) => - ch match { - case '*' => - countStates(term) - case '+' => - saturatedAdd(1L, countStates(term)) - case '?' => - saturatedAdd(1L, countStates(term)) - } - case RegexRepetition(term, QuantifierVariableLength(minLength, maxLengthOption)) => - maxLengthOption match { - case Some(maxLength) => - saturatedMultiply(maxLength.toLong, countStates(term)) - case None => - saturatedMultiply(minLength.max(1).toLong, countStates(term)) - } - case RegexChar(_) | RegexEscaped(_) | RegexHexDigit(_) | RegexOctalChar(_) => - 1L - case _ => - 0L - } - } - - private def estimateGpuMemory(numStates: Long, desiredBatchSizeBytes: Long): Long = { - val numRows = GpuBatchUtils.estimateRowCount( - desiredBatchSizeBytes, DataTypes.StringType.defaultSize, 1) - - // cuDF requests num_instructions * num_threads * 2 when allocating the memory on the device - // (ignoring memory alignment). We are trying to reproduce that calculation here: - saturatedMultiply(saturatedMultiply(numStates, numRows.toLong), 2L) - } - - def isValid(conf: RapidsConf, regex: RegexAST): Boolean = { - val numStates = countStates(regex) - estimateGpuMemory(numStates, conf.gpuTargetBatchSizeBytes) <= conf.maxRegExpStateMemory - } -} diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala index 33644982cf1..ea30808f8aa 100644 --- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala +++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala @@ -1145,13 +1145,6 @@ object GpuRegExpUtils { } } - def validateRegExpComplexity(meta: ExprMeta[_], regex: RegexAST): Unit = { - if(!RegexComplexityEstimator.isValid(meta.conf, regex)) { - meta.willNotWorkOnGpu(s"estimated memory needed for regular expression exceeds the maximum." + - s" Set ${RapidsConf.REGEXP_MAX_STATE_MEMORY_BYTES} to change it.") - } - } - /** * Recursively check if pattern contains only zero-match repetitions * ?, *, {0,}, or {0,n} or any combination of them. @@ -1254,7 +1247,6 @@ class GpuRLikeMeta( } val (transpiledAST, _) = new CudfRegexTranspiler(RegexFindMode) .getTranspiledAST(regexAst, None, None) - GpuRegExpUtils.validateRegExpComplexity(this, transpiledAST) pattern = Some(transpiledAST.toRegexString) } catch { case e: RegexUnsupportedException => @@ -1516,7 +1508,6 @@ class GpuRegExpExtractMeta( val (transpiledAST, _) = new CudfRegexTranspiler(RegexFindMode).getTranspiledAST( javaRegexpPattern, groupIdx, None) - GpuRegExpUtils.validateRegExpComplexity(this, transpiledAST) pattern = Some(transpiledAST.toRegexString) numGroups = GpuRegExpUtils.countGroups(javaRegexpPattern) } catch { @@ -1645,7 +1636,6 @@ class GpuRegExpExtractAllMeta( val (transpiledAST, _) = new CudfRegexTranspiler(RegexFindMode).getTranspiledAST( javaRegexpPattern, groupIdx, None) - GpuRegExpUtils.validateRegExpComplexity(this, transpiledAST) pattern = Some(transpiledAST.toRegexString) numGroups = GpuRegExpUtils.countGroups(javaRegexpPattern) } catch { @@ -1900,7 +1890,6 @@ abstract class StringSplitRegExpMeta[INPUT <: TernaryExpression](expr: INPUT, case None => try { val (transpiledAST, _) = transpiler.getTranspiledAST(utf8Str.toString, None, None) - GpuRegExpUtils.validateRegExpComplexity(this, transpiledAST) pattern = transpiledAST.toRegexString isRegExp = true } catch { diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/RegexComplexityEstimatorSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/RegexComplexityEstimatorSuite.scala deleted file mode 100644 index 214eb7d04d5..00000000000 --- a/tests/src/test/scala/com/nvidia/spark/rapids/RegexComplexityEstimatorSuite.scala +++ /dev/null @@ -1,44 +0,0 @@ -/* - * Copyright (c) 2026, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.nvidia.spark.rapids - -import org.scalatest.funsuite.AnyFunSuite - -class RegexComplexityEstimatorSuite extends AnyFunSuite { - private val conf = new RapidsConf(Map.empty[String, String]) - - private def isValid(pattern: String): Boolean = { - val ast = new RegexParser(pattern).parse() - RegexComplexityEstimator.isValid(conf, ast) - } - - test("reject regex patterns whose state memory estimate requires saturating arithmetic") { - Seq( - "a{65536}{65536}", - "a{1073741824}", - "(.){65536}{32768}", - "a{2147483647}{2147483647}{2147483647}" - ).foreach { pattern => - withClue(s"$pattern should exceed the default regex state memory limit") { - assert(!isValid(pattern)) - } - } - } - - test("accept regex patterns below the default state memory limit") { - assert(isValid("a{2}")) - } -}