Skip to content

Commit

Permalink
support repetition of \d and \D in regexp functions (#4469)
Browse files (browse the repository at this point in the history)
Signed-off-by: Andy Grove <[email protected]>
  • Loading branch information
andygrove authored Jan 7, 2022
1 parent 657acd3 commit 54c0f94
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 3 deletions.
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION.
* Copyright (c) 2021-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -571,10 +571,11 @@ class CudfRegexTranspiler(replace: Boolean) {
// example: pattern " ?", input "] b[", replace with "X":
// java: X]XXbX[X
// cuDF: XXXX] b[
// see https://github.com/NVIDIA/spark-rapids/issues/4468
throw new RegexUnsupportedException(
"regexp_replace on GPU does not support repetition with ? or *")

case (RegexEscaped(_), _) =>
case (RegexEscaped(ch), _) if ch != 'd' && ch != 'D' =>
// example: "\B?"
throw new RegexUnsupportedException(nothingToRepeat)

Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION.
* Copyright (c) 2021-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -232,6 +232,21 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm {
assertCpuGpuMatchesRegexpFind(patterns, inputs)
}

test("compare CPU and GPU: find digits") {
  // The digit class `\d`, bare and with each of the `+`, `*`, `?` quantifiers.
  val digitPatterns = Seq("\\d", "\\d+", "\\d*", "\\d?")
  // The same four forms for the negated class `\D`.
  val nonDigitPatterns = Seq("\\D", "\\D+", "\\D*", "\\D?")
  // Inputs are: letters only, digits only, and digits and letters interleaved.
  val samples = Seq("a", "1", "12", "a12z", "1az2")
  // The helper is defined elsewhere in this suite; judging by its name it
  // checks that CPU and GPU regexp "find" agree for every pattern/input pair.
  assertCpuGpuMatchesRegexpFind(digitPatterns ++ nonDigitPatterns, samples)
}

test("compare CPU and GPU: replace digits") {
  // The `?` and `*` quantifiers are deliberately left out of this test because
  // of https://github.com/NVIDIA/spark-rapids/issues/4468
  val digitPatterns = Seq("\\d", "\\d+")
  val nonDigitPatterns = Seq("\\D", "\\D+")
  // Inputs are: letters only, digits only, and digits and letters interleaved.
  val samples = Seq("a", "1", "12", "a12z", "1az2")
  // The helper is defined elsewhere in this suite; judging by its name it
  // checks that CPU and GPU regexp "replace" agree for every pattern/input pair.
  assertCpuGpuMatchesRegexpReplace(digitPatterns ++ nonDigitPatterns, samples)
}

// Small alphabet of characters: regex metacharacters (`|()[]{},.^$*+?`), a few
// literal letters and digits, a backslash, a space, a tab, and letters that form
// escapes such as `\B`, `\s`, `\d`, `\w`, `\S`, `\D`, `\W` when preceded by a backslash.
// NOTE(review): presumably the character pool for the "limited chars" fuzz
// tests that follow -- confirm against its usages.
private val REGEXP_LIMITED_CHARS = "|()[]{},.^$*+?abc123x\\ \tBsdwSDW"

test("compare CPU and GPU: regexp find fuzz test with limited chars") {
Expand Down

0 comments on commit 54c0f94

Please sign in to comment.