Skip to content

Commit

Permalink
Handle ?, *, {0,} and {0,n} based repetitions in regexp_repla…
Browse files Browse the repository at this point in the history
…ce on the GPU (#5450)

* WIP: enable '*' and '?' in regexp_replace

Signed-off-by: Navin Kumar <[email protected]>

* Enabled "?", "*", "{0,}", and "{0,n}"

Signed-off-by: Navin Kumar <[email protected]>

* Update RegularExpressionSuite test

Signed-off-by: Navin Kumar <[email protected]>

* Add integration_test and updated unit test

Signed-off-by: Navin Kumar <[email protected]>

* Update tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala

* Fix transpile test

Signed-off-by: Navin Kumar <[email protected]>

Co-authored-by: Andy Grove <[email protected]>
  • Loading branch information
NVnavkumar and andygrove authored Jun 6, 2022
1 parent cbb3ccc commit 8f20914
Show file tree
Hide file tree
Showing 4 changed files with 38 additions and 39 deletions.
17 changes: 17 additions & 0 deletions integration_tests/src/main/python/string_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -547,6 +547,23 @@ def test_re_replace():
'REGEXP_REPLACE(a, "TEST", NULL)'),
conf=_regexp_conf)

# Note: regexp_replace with an empty string will not match
# unless we are using Spark 3.1.4, 3.2.2, or 3.3.0.
# See https://issues.apache.org/jira/browse/SPARK-39107
# See https://github.com/NVIDIA/spark-rapids/issues/5456
def test_re_replace_repetition():
    """Check that CPU and GPU agree on REGEXP_REPLACE with repetition quantifiers.

    Exercises `+`, `*`, `?`, `{0,}` and `{0,n}` patterns against generated strings.
    """
    gen = StringGen('.{0,5}TEST[\ud720 A]{0,5}')
    # One projected column per pattern; order is kept so result columns line up.
    replace_exprs = [
        'REGEXP_REPLACE(a, "[E]+", "PROD")',
        'REGEXP_REPLACE(a, "[A]+", "PROD")',
        'REGEXP_REPLACE(a, "A{0,}", "PROD")',
        'REGEXP_REPLACE(a, "T?E?", "PROD")',
        'REGEXP_REPLACE(a, "A*", "PROD")',
        'REGEXP_REPLACE(a, "A{0,5}", "PROD")',
    ]
    assert_gpu_and_cpu_are_equal_collect(
        lambda spark: unary_op_df(spark, gen).selectExpr(*replace_exprs),
        conf=_regexp_conf)


@allow_non_gpu('ProjectExec', 'RegExpReplace')
def test_re_replace_issue_5492():
# https://github.com/NVIDIA/spark-rapids/issues/5492
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1165,14 +1165,6 @@ class CudfRegexTranspiler(mode: RegexMode) {
})._1)

case RegexRepetition(base, quantifier) => (base, quantifier) match {
case (_, SimpleQuantifier(ch)) if mode == RegexReplaceMode && "?*".contains(ch) =>
// example: pattern " ?", input "] b[", replace with "X":
// java: X]XXbX[X
// cuDF: XXXX] b[
// see https://github.com/NVIDIA/spark-rapids/issues/4468
throw new RegexUnsupportedException(
"regexp_replace on GPU does not support repetition with ? or *")

case (_, SimpleQuantifier(ch)) if mode == RegexSplitMode && "?*".contains(ch) =>
// example: pattern " ?", input "] b[", replace with "X":
// java: X]XXbX[X
Expand All @@ -1181,19 +1173,17 @@ class CudfRegexTranspiler(mode: RegexMode) {
throw new RegexUnsupportedException(
"regexp_split on GPU does not support repetition with ? or * consistently with Spark")

case (_, QuantifierVariableLength(0, _)) if mode == RegexReplaceMode =>
// see https://github.com/NVIDIA/spark-rapids/issues/4468
throw new RegexUnsupportedException(
"regexp_replace on GPU does not support repetition with {0,} or {0,n}")

case (_, QuantifierVariableLength(0, _)) if mode == RegexSplitMode =>
// see https://github.com/NVIDIA/spark-rapids/issues/4884
throw new RegexUnsupportedException(
"regexp_split on GPU does not support repetition with {0,} or {0,n} " +
"consistently with Spark")

case (_, QuantifierFixedLength(0))
if mode != RegexFindMode =>
case (_, QuantifierVariableLength(0, Some(0))) if mode != RegexFindMode =>
throw new RegexUnsupportedException(
"regex_replace and regex_split on GPU do not support repetition with {0,0}")

case (_, QuantifierFixedLength(0)) if mode != RegexFindMode =>
throw new RegexUnsupportedException(
"regex_replace and regex_split on GPU do not support repetition with {0}")

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -59,10 +59,8 @@ class RegularExpressionSuite extends SparkQueryCompareTestSuite {
frame => frame.selectExpr("regexp_replace(strings,'','D')")
}

testGpuFallback("String regexp_replace regex 1 cpu fall back",
"RegExpReplace",
nullableStringsFromCsv, execsAllowedNonGpu = Seq("ProjectExec", "Alias",
"RegExpReplace", "AttributeReference", "Literal"), conf = conf) {
testSparkResultsAreEqual("String regexp_replace regex 1",
nullableStringsFromCsv, conf = conf) {
frame => frame.selectExpr("regexp_replace(strings,'.*','D')")
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -333,6 +333,20 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm {
val pattern = Seq("a.")
val inputs = Seq("abc", "a\n\rb", "a\n\u0085b", "a\u2029\u0085b", "a\u2082\rb")
assertCpuGpuMatchesRegexpFind(pattern, inputs)

}

test("regexp_replace - ?, *, +, and {0, n} repetitions") {
  // Runs each repetition pattern (including ones that can match the empty string,
  // such as `?`, `*`, `{0,}` and `{0,n}`) through the regexp_replace CPU/GPU
  // comparison helper.
  // see https://github.com/NVIDIA/spark-rapids/issues/4468
  val patterns = Seq("D?", "D*", "D+", "D{0,}", "D{0,1}", "D{0,5}", "[1a-zA-Z]{0,}",
    "[1a-zA-Z]{0,2}", "A+")
  val inputs = Seq("SS", "DD", "SDSDSDS", "DDDD", "DDDDDD", "ABCDEFG")
  assertCpuGpuMatchesRegexpReplace(patterns, inputs)
}

test("dot matches CR on GPU but not on CPU") {
  // see https://github.com/rapidsai/cudf/issues/9619
  // Inputs place CR, LF and CRLF immediately after the literal "1" so that the
  // `.` in the pattern is what has to consume (or refuse) the line terminator.
  val inputs = Seq("1\r2", "1\n2", "1\r\n2")
  assertCpuGpuMatchesRegexpFind(Seq("1."), inputs)
}

test("character class with ranges") {
Expand Down Expand Up @@ -431,26 +445,6 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm {
assertCpuGpuMatchesRegexpReplace(patterns, inputs)
}

test("regexp_replace - character class repetition - ? and * - fall back to CPU") {
  // see https://github.com/NVIDIA/spark-rapids/issues/4468
  // Each pattern is expected to be reported as unsupported in RegexReplaceMode
  // with exactly this message.
  val expectedMessage = "regexp_replace on GPU does not support repetition with ? or *"
  for (pattern <- Seq(raw"[1a-zA-Z]?", raw"[1a-zA-Z]*")) {
    assertUnsupported(pattern, RegexReplaceMode, expectedMessage)
  }
}

test("regexp_replace - character class repetition - {0,} or {0,n} - fall back to CPU") {
  // see https://github.com/NVIDIA/spark-rapids/issues/4468
  // Each pattern is expected to be reported as unsupported in RegexReplaceMode
  // with exactly this message.
  val expectedMessage =
    "regexp_replace on GPU does not support repetition with {0,} or {0,n}"
  for (pattern <- Seq(raw"[1a-zA-Z]{0,}", raw"[1a-zA-Z]{0,2}")) {
    assertUnsupported(pattern, RegexReplaceMode, expectedMessage)
  }
}

test("regexp_split - character class repetition - ? and * - fall back to CPU") {
// see https://github.com/NVIDIA/spark-rapids/issues/4884
val patterns = Seq(raw"[1a-zA-Z]?", raw"[1a-zA-Z]*")
Expand Down

0 comments on commit 8f20914

Please sign in to comment.