diff --git a/integration_tests/src/main/python/string_test.py b/integration_tests/src/main/python/string_test.py index aa6745d8c68..c73d45a53cc 100644 --- a/integration_tests/src/main/python/string_test.py +++ b/integration_tests/src/main/python/string_test.py @@ -547,6 +547,23 @@ def test_re_replace(): 'REGEXP_REPLACE(a, "TEST", NULL)'), conf=_regexp_conf) +# Note: regexp_replace on an empty input string will not match +# unless we are using Spark 3.1.4, 3.2.2, or 3.3.0 +# See https://issues.apache.org/jira/browse/SPARK-39107 +# See https://github.com/NVIDIA/spark-rapids/issues/5456 +def test_re_replace_repetition(): + gen = StringGen('.{0,5}TEST[\ud720 A]{0,5}') + assert_gpu_and_cpu_are_equal_collect( + lambda spark: unary_op_df(spark, gen).selectExpr( + 'REGEXP_REPLACE(a, "[E]+", "PROD")', + 'REGEXP_REPLACE(a, "[A]+", "PROD")', + 'REGEXP_REPLACE(a, "A{0,}", "PROD")', + 'REGEXP_REPLACE(a, "T?E?", "PROD")', + 'REGEXP_REPLACE(a, "A*", "PROD")', + 'REGEXP_REPLACE(a, "A{0,5}", "PROD")'), + conf=_regexp_conf) + + @allow_non_gpu('ProjectExec', 'RegExpReplace') def test_re_replace_issue_5492(): # https://github.com/NVIDIA/spark-rapids/issues/5492 diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala index d001d302d96..92f1784bf1d 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala @@ -1165,14 +1165,6 @@ class CudfRegexTranspiler(mode: RegexMode) { })._1) case RegexRepetition(base, quantifier) => (base, quantifier) match { - case (_, SimpleQuantifier(ch)) if mode == RegexReplaceMode && "?*".contains(ch) => - // example: pattern " ?", input "] b[", replace with "X": - // java: X]XXbX[X - // cuDF: XXXX] b[ - // see https://github.com/NVIDIA/spark-rapids/issues/4468 - throw new RegexUnsupportedException( - "regexp_replace on GPU does not support repetition with ? 
or *") - case (_, SimpleQuantifier(ch)) if mode == RegexSplitMode && "?*".contains(ch) => // example: pattern " ?", input "] b[", replace with "X": // java: X]XXbX[X @@ -1181,19 +1173,17 @@ class CudfRegexTranspiler(mode: RegexMode) { throw new RegexUnsupportedException( "regexp_split on GPU does not support repetition with ? or * consistently with Spark") - case (_, QuantifierVariableLength(0, _)) if mode == RegexReplaceMode => - // see https://github.com/NVIDIA/spark-rapids/issues/4468 - throw new RegexUnsupportedException( - "regexp_replace on GPU does not support repetition with {0,} or {0,n}") - case (_, QuantifierVariableLength(0, _)) if mode == RegexSplitMode => // see https://github.com/NVIDIA/spark-rapids/issues/4884 throw new RegexUnsupportedException( "regexp_split on GPU does not support repetition with {0,} or {0,n} " + "consistently with Spark") - case (_, QuantifierFixedLength(0)) - if mode != RegexFindMode => + case (_, QuantifierVariableLength(0, Some(0))) if mode != RegexFindMode => + throw new RegexUnsupportedException( + "regex_replace and regex_split on GPU do not support repetition with {0,0}") + + case (_, QuantifierFixedLength(0)) if mode != RegexFindMode => throw new RegexUnsupportedException( "regex_replace and regex_split on GPU do not support repetition with {0}") diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionSuite.scala index a4c45487eef..077a3b8fbec 100644 --- a/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionSuite.scala +++ b/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionSuite.scala @@ -59,10 +59,8 @@ class RegularExpressionSuite extends SparkQueryCompareTestSuite { frame => frame.selectExpr("regexp_replace(strings,'','D')") } - testGpuFallback("String regexp_replace regex 1 cpu fall back", - "RegExpReplace", - nullableStringsFromCsv, execsAllowedNonGpu = Seq("ProjectExec", "Alias", - "RegExpReplace", 
"AttributeReference", "Literal"), conf = conf) { + testSparkResultsAreEqual("String regexp_replace regex 1", + nullableStringsFromCsv, conf = conf) { frame => frame.selectExpr("regexp_replace(strings,'.*','D')") } diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala index 717b5f07780..b8c1fcb1f38 100644 --- a/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala +++ b/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala @@ -333,6 +333,20 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm { val pattern = Seq("a.") val inputs = Seq("abc", "a\n\rb", "a\n\u0085b", "a\u2029\u0085b", "a\u2082\rb") assertCpuGpuMatchesRegexpFind(pattern, inputs) + + } + + test("regexp_replace - ?, *, +, and {0, n} repetitions") { + val patterns = Seq("D?", "D*", "D+", "D{0,}", "D{0,1}", "D{0,5}", "[1a-zA-Z]{0,}", + "[1a-zA-Z]{0,2}", "A+") + val inputs = Seq("SS", "DD", "SDSDSDS", "DDDD", "DDDDDD", "ABCDEFG") + assertCpuGpuMatchesRegexpReplace(patterns, inputs) + } + + test("dot matches CR on GPU but not on CPU") { + // see https://github.com/rapidsai/cudf/issues/9619 + val pattern = "1." + assertCpuGpuMatchesRegexpFind(Seq(pattern), Seq("1\r2", "1\n2", "1\r\n2")) } test("character class with ranges") { @@ -431,26 +445,6 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm { assertCpuGpuMatchesRegexpReplace(patterns, inputs) } - test("regexp_replace - character class repetition - ? and * - fall back to CPU") { - // see https://github.com/NVIDIA/spark-rapids/issues/4468 - val patterns = Seq(raw"[1a-zA-Z]?", raw"[1a-zA-Z]*") - patterns.foreach(pattern => - assertUnsupported(pattern, RegexReplaceMode, - "regexp_replace on GPU does not support repetition with ? 
or *" - ) - ) - } - - test("regexp_replace - character class repetition - {0,} or {0,n} - fall back to CPU") { - // see https://github.com/NVIDIA/spark-rapids/issues/4468 - val patterns = Seq(raw"[1a-zA-Z]{0,}", raw"[1a-zA-Z]{0,2}") - patterns.foreach(pattern => - assertUnsupported(pattern, RegexReplaceMode, - "regexp_replace on GPU does not support repetition with {0,} or {0,n}" - ) - ) - } - test("regexp_split - character class repetition - ? and * - fall back to CPU") { // see https://github.com/NVIDIA/spark-rapids/issues/4884 val patterns = Seq(raw"[1a-zA-Z]?", raw"[1a-zA-Z]*")