From fc2b230cd7c2916801e7c9ca14bee857012ddb83 Mon Sep 17 00:00:00 2001
From: fejiang
Date: Fri, 5 Jul 2024 14:53:18 +0800
Subject: [PATCH 01/17] clear the regex logic
Signed-off-by: fejiang
---
.../spark/sql/rapids/stringFunctions.scala | 93 +++++++++++--------
.../suites/RapidsStringExpressionsSuite.scala | 1 +
2 files changed, 54 insertions(+), 40 deletions(-)
diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala
index dc2845e4461..0544fd45f28 100644
--- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala
+++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala
@@ -1584,63 +1584,63 @@ class SubstringIndexMeta(
override val parent: Option[RapidsMeta[_, _, _]],
rule: DataFromReplacementRule)
extends TernaryExprMeta[SubstringIndex](expr, conf, parent, rule) {
- private var regexp: String = _
+ //private var regexp: String = _
override def tagExprForGpu(): Unit = {
val delim = GpuOverrides.extractStringLit(expr.delimExpr).getOrElse("")
- if (delim == null || delim.length != 1) {
- willNotWorkOnGpu("only a single character deliminator is supported")
- }
+ // delim can be null in here just return "" them
+// if (delim == null) {
+// willNotWorkOnGpu("only a single character deliminator is supported")
+// }
val count = GpuOverrides.extractLit(expr.countExpr)
if (canThisBeReplaced) {
val c = count.get.value.asInstanceOf[Integer]
- this.regexp = GpuSubstringIndex.makeExtractRe(delim, c)
+ //this.regexp = GpuSubstringIndex.makeExtractRe(delim, c)
}
}
override def convertToGpu(
column: Expression,
delim: Expression,
- count: Expression): GpuExpression = GpuSubstringIndex(column, this.regexp, delim, count)
+ count: Expression): GpuExpression = GpuSubstringIndex(column, delim, count)
}
-object GpuSubstringIndex {
- def makeExtractRe(delim: String, count: Integer): String = {
- if (delim.length != 1) {
- throw new IllegalStateException("NOT SUPPORTED")
- }
- val quotedDelim = CudfRegexp.cudfQuote(delim.charAt(0))
- val notDelim = CudfRegexp.notCharSet(delim.charAt(0))
- // substring_index has a deliminator and a count.
If the count is positive then - // you get back a substring from 0 until the Nth deliminator is found - // If the count is negative it goes in reverse - if (count == 0) { - // Count is zero so return a null regexp as a special case - null - } else if (count == 1) { - // If the count is 1 we want to match everything from the beginning of the string until we - // find the first occurrence of the deliminator or the end of the string - "\\A(" + notDelim + "*)" - } else if (count > 0) { - // If the count is > 1 we first match 0 up to count - 1 occurrences of the patten - // `not the deliminator 0 or more times followed by the deliminator` - // After that we go back to matching everything until we find the deliminator or the end of - // the string - "\\A((?:" + notDelim + "*" + quotedDelim + "){0," + (count - 1) + "}" + notDelim + "*)" - } else if (count == -1) { - // A -1 looks like 1 but we start looking at the end of the string - "(" + notDelim + "*)\\Z" - } else { //count < 0 - // All others look like a positive count, but again we are matching starting at the end of - // the string instead of the beginning - "((?:" + notDelim + "*" + quotedDelim + "){0," + ((-count) - 1) + "}" + notDelim + "*)\\Z" - } - } -} +//object GpuSubstringIndex { +// def makeExtractRe(delim: String, count: Integer): String = { +// if (delim.length != 1) { +// throw new IllegalStateException("NOT SUPPORTED") +// } +// val quotedDelim = CudfRegexp.cudfQuote(delim.charAt(0)) +// val notDelim = CudfRegexp.notCharSet(delim.charAt(0)) +// // substring_index has a deliminator and a count. If the count is positive then +// // you get back a substring from 0 until the Nth deliminator is found +// // If the count is negative it goes in reverse +// if (count == 0) { +// // Count is zero so return a null regexp as a special case +// null +// } else if (count == 1) { +// // If the count is 1 we want to match everything from the beginning of the string until we +// // find the first occurrence of the deliminator or the end of the string +// "\\A(" + notDelim + "*)" +// } else if (count > 0) { +// // If the count is > 1 we first match 0 up to count - 1 occurrences of the patten +// // `not the deliminator 0 or more times followed by the deliminator` +// // After that we go back to matching everything until we find the deliminator or the end of +// // the string +// "\\A((?:" + notDelim + "*" + quotedDelim + "){0," + (count - 1) + "}" + notDelim + "*)" +// } else if (count == -1) { +// // A -1 looks like 1 but we start looking at the end of the string +// "(" + notDelim + "*)\\Z" +// } else { //count < 0 +// // All others look like a positive count, but again we are matching starting at the end of +// // the string instead of the beginning +// "((?:" + notDelim + "*" + quotedDelim + "){0," + ((-count) - 1) + "}" + notDelim + "*)\\Z" +// } +// } +//} case class GpuSubstringIndex(strExpr: Expression, - regexp: String, ignoredDelimExpr: Expression, ignoredCountExpr: Expression) extends GpuTernaryExpressionArgsAnyScalarScalar with ImplicitCastInputTypes { @@ -1660,6 +1660,16 @@ case class GpuSubstringIndex(strExpr: Expression, // spark-rapids plugin issue https://github.com/NVIDIA/spark-rapids/issues/8750 override def doColumnar(str: GpuColumnVector, delim: GpuScalar, count: GpuScalar): ColumnVector = { + // here I would put the logic to deal with substringindex using + // substringLocatemultiple and substring + // 1. 
substringLocateMultiple to get all the index + withResource(str.getBase.stringLocateMultiple(delim.getBase)) { strLocateRes => + + } + + // 2. and then substring to return the string + + if (regexp == null) { withResource(str.getBase.isNull) { isNull => withResource(Scalar.fromString("")) { emptyString => @@ -1671,6 +1681,9 @@ case class GpuSubstringIndex(strExpr: Expression, table.getColumn(0).incRefCount() } } + + + } override def doColumnar(numRows: Int, val0: GpuScalar, val1: GpuScalar, diff --git a/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsStringExpressionsSuite.scala b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsStringExpressionsSuite.scala index 164406fdf83..41eee634432 100644 --- a/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsStringExpressionsSuite.scala +++ b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsStringExpressionsSuite.scala @@ -23,3 +23,4 @@ import org.apache.spark.sql.catalyst.expressions.StringExpressionsSuite import org.apache.spark.sql.rapids.utils.RapidsTestsTrait class RapidsStringExpressionsSuite extends StringExpressionsSuite with RapidsTestsTrait {} +git \ No newline at end of file From c8772dc91783b721dd2befa57e6dd7b98440e638 Mon Sep 17 00:00:00 2001 From: fejiang Date: Sun, 7 Jul 2024 20:52:57 +0800 Subject: [PATCH 02/17] local change of substring index Signed-off-by: fejiang --- .../spark/sql/rapids/stringFunctions.scala | 29 +++++++------------ 1 file changed, 11 insertions(+), 18 deletions(-) diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala index 9cd0af047ff..2a8a82129c6 100644 --- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala +++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala @@ -1663,26 +1663,19 @@ case class GpuSubstringIndex(strExpr: Expression, // here I would put the logic to deal with substringindex using // substringLocatemultiple and substring // 1. substringLocateMultiple to get all the index - withResource(str.getBase.stringLocateMultiple(delim.getBase)) { strLocateRes => - - } // 2. 
and then substring to return the string - - - if (regexp == null) { - withResource(str.getBase.isNull) { isNull => - withResource(Scalar.fromString("")) { emptyString => - isNull.ifElse(str.getBase, emptyString) - } - } - } else { - withResource(str.getBase.extractRe(new RegexProgram(regexp))) { table: Table => - table.getColumn(0).incRefCount() - } - } - - +// if (regexp == null) { +// withResource(str.getBase.isNull) { isNull => +// withResource(Scalar.fromString("")) { emptyString => +// isNull.ifElse(str.getBase, emptyString) +// } +// } +// } else { +// withResource(str.getBase.extractRe(new RegexProgram(regexp))) { table: Table => +// table.getColumn(0).incRefCount() +// } +// } } From 62e16a08bbe42c1d96ccc11626804b1b40766c95 Mon Sep 17 00:00:00 2001 From: fejiang Date: Mon, 8 Jul 2024 00:02:34 +0800 Subject: [PATCH 03/17] stringFunctions scala Signed-off-by: fejiang --- .../org/apache/spark/sql/rapids/stringFunctions.scala | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala index 2a8a82129c6..59b678e7565 100644 --- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala +++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala @@ -28,6 +28,7 @@ import com.nvidia.spark.rapids.Arm._ import com.nvidia.spark.rapids.RapidsPluginImplicits._ import com.nvidia.spark.rapids.jni.CastStrings import com.nvidia.spark.rapids.jni.RegexRewriteUtils +import com.nvidia.spark.rapids.jni.GpuSubstringIndex import com.nvidia.spark.rapids.shims.{ShimExpression, SparkShimImpl} import org.apache.spark.sql.catalyst.expressions._ @@ -1585,6 +1586,8 @@ class SubstringIndexMeta( rule: DataFromReplacementRule) extends TernaryExprMeta[SubstringIndex](expr, conf, parent, rule) { //private var regexp: String = _ + private var delim: String = _ + private var c: Integer = _ override def tagExprForGpu(): Unit = { val delim = GpuOverrides.extractStringLit(expr.delimExpr).getOrElse("") @@ -1660,6 +1663,12 @@ case class GpuSubstringIndex(strExpr: Expression, // spark-rapids plugin issue https://github.com/NVIDIA/spark-rapids/issues/8750 override def doColumnar(str: GpuColumnVector, delim: GpuScalar, count: GpuScalar): ColumnVector = { + + withResource(str.getBase) { strV => + com.nvidia.spark.rapids.jni.GpuSubstringIndex.gpuSubstringIndex(strV, + delim.getValue.asInstanceOf[String], + count.getValue.asInstanceOf[Int]); + } // here I would put the logic to deal with substringindex using // substringLocatemultiple and substring // 1. 
substringLocateMultiple to get all the index From d30dbfa5b7d15dd6735778eee92a069111c7006c Mon Sep 17 00:00:00 2001 From: fejiang Date: Mon, 8 Jul 2024 12:02:34 +0800 Subject: [PATCH 04/17] stringFunctions import conflict resolved Signed-off-by: fejiang --- .../spark/sql/rapids/stringFunctions.scala | 36 +++++++++---------- .../suites/RapidsStringExpressionsSuite.scala | 3 +- 2 files changed, 19 insertions(+), 20 deletions(-) diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala index 59b678e7565..7e2da1608a6 100644 --- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala +++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala @@ -22,13 +22,13 @@ import java.util.{Locale, Optional} import scala.collection.mutable.ArrayBuffer -import ai.rapids.cudf.{BinaryOp, BinaryOperable, CaptureGroups, ColumnVector, ColumnView, DType, PadSide, RegexProgram, RoundMode, Scalar, Table} +import ai.rapids.cudf.{BinaryOp, BinaryOperable, CaptureGroups, ColumnVector, ColumnView, DType, PadSide, RegexProgram, RoundMode, Scalar} import com.nvidia.spark.rapids._ import com.nvidia.spark.rapids.Arm._ import com.nvidia.spark.rapids.RapidsPluginImplicits._ import com.nvidia.spark.rapids.jni.CastStrings +import com.nvidia.spark.rapids.jni.GpuSubstringIndexUtils import com.nvidia.spark.rapids.jni.RegexRewriteUtils -import com.nvidia.spark.rapids.jni.GpuSubstringIndex import com.nvidia.spark.rapids.shims.{ShimExpression, SparkShimImpl} import org.apache.spark.sql.catalyst.expressions._ @@ -1586,22 +1586,22 @@ class SubstringIndexMeta( rule: DataFromReplacementRule) extends TernaryExprMeta[SubstringIndex](expr, conf, parent, rule) { //private var regexp: String = _ - private var delim: String = _ - private var c: Integer = _ - - override def tagExprForGpu(): Unit = { - val delim = GpuOverrides.extractStringLit(expr.delimExpr).getOrElse("") - // delim can be null in here just return "" them -// if (delim == null) { -// willNotWorkOnGpu("only a single character deliminator is supported") +// private var delim: String = _ +// private var c: Integer = _ + +// override def tagExprForGpu(): Unit = { +// val delim = GpuOverrides.extractStringLit(expr.delimExpr).getOrElse("") +// // delim can be null in here just return "" them +//// if (delim == null) { +//// willNotWorkOnGpu("only a single character deliminator is supported") +//// } +// +// val count = GpuOverrides.extractLit(expr.countExpr) +// if (canThisBeReplaced) { +// val c = count.get.value.asInstanceOf[Integer] +// //this.regexp = GpuSubstringIndex.makeExtractRe(delim, c) // } - - val count = GpuOverrides.extractLit(expr.countExpr) - if (canThisBeReplaced) { - val c = count.get.value.asInstanceOf[Integer] - //this.regexp = GpuSubstringIndex.makeExtractRe(delim, c) - } - } +// } override def convertToGpu( column: Expression, @@ -1665,7 +1665,7 @@ case class GpuSubstringIndex(strExpr: Expression, count: GpuScalar): ColumnVector = { withResource(str.getBase) { strV => - com.nvidia.spark.rapids.jni.GpuSubstringIndex.gpuSubstringIndex(strV, + GpuSubstringIndexUtils.substringIndex(strV, delim.getValue.asInstanceOf[String], count.getValue.asInstanceOf[Int]); } diff --git a/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsStringExpressionsSuite.scala b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsStringExpressionsSuite.scala index 41eee634432..58ca447c0f7 100644 --- 
a/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsStringExpressionsSuite.scala +++ b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsStringExpressionsSuite.scala @@ -22,5 +22,4 @@ package org.apache.spark.sql.rapids.suites import org.apache.spark.sql.catalyst.expressions.StringExpressionsSuite import org.apache.spark.sql.rapids.utils.RapidsTestsTrait -class RapidsStringExpressionsSuite extends StringExpressionsSuite with RapidsTestsTrait {} -git \ No newline at end of file +class RapidsStringExpressionsSuite extends StringExpressionsSuite with RapidsTestsTrait {} \ No newline at end of file From 841829f1d9165cdeae4a7c83507adbee77ac4de7 Mon Sep 17 00:00:00 2001 From: fejiang Date: Tue, 9 Jul 2024 10:08:40 +0800 Subject: [PATCH 05/17] doColumnar calling Signed-off-by: fejiang --- .../spark/sql/rapids/stringFunctions.scala | 79 ++----------------- .../suites/RapidsStringExpressionsSuite.scala | 35 +++++++- .../sql/rapids/utils/RapidsTestSettings.scala | 2 +- 3 files changed, 42 insertions(+), 74 deletions(-) diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala index 7e2da1608a6..74ebba55091 100644 --- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala +++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala @@ -1585,23 +1585,7 @@ class SubstringIndexMeta( override val parent: Option[RapidsMeta[_, _, _]], rule: DataFromReplacementRule) extends TernaryExprMeta[SubstringIndex](expr, conf, parent, rule) { - //private var regexp: String = _ -// private var delim: String = _ -// private var c: Integer = _ - -// override def tagExprForGpu(): Unit = { -// val delim = GpuOverrides.extractStringLit(expr.delimExpr).getOrElse("") -// // delim can be null in here just return "" them -//// if (delim == null) { -//// willNotWorkOnGpu("only a single character deliminator is supported") -//// } -// -// val count = GpuOverrides.extractLit(expr.countExpr) -// if (canThisBeReplaced) { -// val c = count.get.value.asInstanceOf[Integer] -// //this.regexp = GpuSubstringIndex.makeExtractRe(delim, c) -// } -// } + override def convertToGpu( column: Expression, @@ -1609,39 +1593,6 @@ class SubstringIndexMeta( count: Expression): GpuExpression = GpuSubstringIndex(column, delim, count) } -//object GpuSubstringIndex { -// def makeExtractRe(delim: String, count: Integer): String = { -// if (delim.length != 1) { -// throw new IllegalStateException("NOT SUPPORTED") -// } -// val quotedDelim = CudfRegexp.cudfQuote(delim.charAt(0)) -// val notDelim = CudfRegexp.notCharSet(delim.charAt(0)) -// // substring_index has a deliminator and a count. 
If the count is positive then -// // you get back a substring from 0 until the Nth deliminator is found -// // If the count is negative it goes in reverse -// if (count == 0) { -// // Count is zero so return a null regexp as a special case -// null -// } else if (count == 1) { -// // If the count is 1 we want to match everything from the beginning of the string until we -// // find the first occurrence of the deliminator or the end of the string -// "\\A(" + notDelim + "*)" -// } else if (count > 0) { -// // If the count is > 1 we first match 0 up to count - 1 occurrences of the patten -// // `not the deliminator 0 or more times followed by the deliminator` -// // After that we go back to matching everything until we find the deliminator or the end of -// // the string -// "\\A((?:" + notDelim + "*" + quotedDelim + "){0," + (count - 1) + "}" + notDelim + "*)" -// } else if (count == -1) { -// // A -1 looks like 1 but we start looking at the end of the string -// "(" + notDelim + "*)\\Z" -// } else { //count < 0 -// // All others look like a positive count, but again we are matching starting at the end of -// // the string instead of the beginning -// "((?:" + notDelim + "*" + quotedDelim + "){0," + ((-count) - 1) + "}" + notDelim + "*)\\Z" -// } -// } -//} case class GpuSubstringIndex(strExpr: Expression, ignoredDelimExpr: Expression, @@ -1664,27 +1615,13 @@ case class GpuSubstringIndex(strExpr: Expression, override def doColumnar(str: GpuColumnVector, delim: GpuScalar, count: GpuScalar): ColumnVector = { - withResource(str.getBase) { strV => - GpuSubstringIndexUtils.substringIndex(strV, - delim.getValue.asInstanceOf[String], - count.getValue.asInstanceOf[Int]); - } - // here I would put the logic to deal with substringindex using - // substringLocatemultiple and substring - // 1. substringLocateMultiple to get all the index - - // 2. 
and then substring to return the string -// if (regexp == null) { -// withResource(str.getBase.isNull) { isNull => -// withResource(Scalar.fromString("")) { emptyString => -// isNull.ifElse(str.getBase, emptyString) -// } -// } -// } else { -// withResource(str.getBase.extractRe(new RegexProgram(regexp))) { table: Table => -// table.getColumn(0).incRefCount() -// } -// } + val strColumnView = str.getBase + val delimiter = delim.getValue.asInstanceOf[UTF8String].toString + val cnt = count.getValue.asInstanceOf[Int] + // Use withResource to manage the lifecycle of ColumnVector + withResource(GpuSubstringIndexUtils.substringIndex(strColumnView, delimiter, cnt)) { result => + result.incRefCount() // Increment reference count to ensure it is not prematurely deallocated + } } diff --git a/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsStringExpressionsSuite.scala b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsStringExpressionsSuite.scala index 58ca447c0f7..d2abb8e7623 100644 --- a/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsStringExpressionsSuite.scala +++ b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsStringExpressionsSuite.scala @@ -19,7 +19,38 @@ spark-rapids-shim-json-lines ***/ package org.apache.spark.sql.rapids.suites -import org.apache.spark.sql.catalyst.expressions.StringExpressionsSuite +import org.apache.spark.sql.catalyst.expressions.{Literal, StringExpressionsSuite, SubstringIndex} import org.apache.spark.sql.rapids.utils.RapidsTestsTrait +import org.apache.spark.sql.types.StringType -class RapidsStringExpressionsSuite extends StringExpressionsSuite with RapidsTestsTrait {} \ No newline at end of file +class RapidsStringExpressionsSuite extends StringExpressionsSuite with RapidsTestsTrait { + test("string substring_index function in rapids") { + checkEvaluation( + SubstringIndex(Literal("www.apache.org"), Literal("."), Literal(3)), "www.apache.org") + checkEvaluation( + SubstringIndex(Literal("www.apache.org"), Literal("."), Literal(2)), "www.apache") + checkEvaluation( + SubstringIndex(Literal("www.apache.org"), Literal("."), Literal(1)), "www") + checkEvaluation( + SubstringIndex(Literal("www.apache.org"), Literal("."), Literal(0)), "") + checkEvaluation( + SubstringIndex(Literal("www.apache.org"), Literal("."), Literal(-3)), "www.apache.org") + checkEvaluation( + SubstringIndex(Literal("www.apache.org"), Literal("."), Literal(-2)), "apache.org") + checkEvaluation( + SubstringIndex(Literal("www.apache.org"), Literal("."), Literal(-1)), "org") + checkEvaluation( + SubstringIndex(Literal(""), Literal("."), Literal(-2)), "") + checkEvaluation( + SubstringIndex(Literal.create(null, StringType), Literal("."), Literal(-2)), null) + checkEvaluation(SubstringIndex( + Literal("www.apache.org"), Literal.create(null, StringType), Literal(-2)), null) + // non ascii chars + // scalastyle:off + checkEvaluation( + SubstringIndex(Literal("大千世界大千世界"), Literal( "千"), Literal(2)), "大千世界大") + // scalastyle:on + checkEvaluation( + SubstringIndex(Literal("www||apache||org"), Literal( "||"), Literal(2)), "www||apache") + } +} \ No newline at end of file diff --git a/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/utils/RapidsTestSettings.scala b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/utils/RapidsTestSettings.scala index ba8dcbe6efe..8e1edda83ef 100644 --- a/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/utils/RapidsTestSettings.scala +++ 
b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/utils/RapidsTestSettings.scala @@ -66,7 +66,7 @@ class RapidsTestSettings extends BackendTestSettings { enableSuite[RapidsMathFunctionsSuite] enableSuite[RapidsRegexpExpressionsSuite] enableSuite[RapidsStringExpressionsSuite] - .exclude("string substring_index function", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/8750")) + //.exclude("string substring_index function", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/8750")) .exclude("SPARK-22550: Elt should not generate codes beyond 64KB", WONT_FIX_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10775")) .exclude("SPARK-22603: FormatString should not generate codes beyond 64KB", WONT_FIX_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10775")) enableSuite[RapidsStringFunctionsSuite] From f8bae930bd3f7b22bf703de0233e9ae63464f9ca Mon Sep 17 00:00:00 2001 From: fejiang Date: Thu, 11 Jul 2024 19:45:06 +0800 Subject: [PATCH 06/17] delimiter change to scalar type Signed-off-by: fejiang --- .../scala/org/apache/spark/sql/rapids/stringFunctions.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala index 74ebba55091..5f3ba3f98b6 100644 --- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala +++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala @@ -1616,7 +1616,7 @@ case class GpuSubstringIndex(strExpr: Expression, count: GpuScalar): ColumnVector = { val strColumnView = str.getBase - val delimiter = delim.getValue.asInstanceOf[UTF8String].toString + val delimiter = delim.getValue val cnt = count.getValue.asInstanceOf[Int] // Use withResource to manage the lifecycle of ColumnVector withResource(GpuSubstringIndexUtils.substringIndex(strColumnView, delimiter, cnt)) { result => From 0d8381722f4ba1be50bfe8d8a5b58e1c096eb9b7 Mon Sep 17 00:00:00 2001 From: fejiang Date: Thu, 11 Jul 2024 20:53:53 +0800 Subject: [PATCH 07/17] delimiter changed to scalar type Signed-off-by: fejiang --- .../scala/org/apache/spark/sql/rapids/stringFunctions.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala index 5f3ba3f98b6..59428f0d2ae 100644 --- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala +++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala @@ -1616,7 +1616,7 @@ case class GpuSubstringIndex(strExpr: Expression, count: GpuScalar): ColumnVector = { val strColumnView = str.getBase - val delimiter = delim.getValue + val delimiter = GpuScalar.from(delim, StringType) val cnt = count.getValue.asInstanceOf[Int] // Use withResource to manage the lifecycle of ColumnVector withResource(GpuSubstringIndexUtils.substringIndex(strColumnView, delimiter, cnt)) { result => From b2517c8cdf7a68a9c11c91fa93c2f0735b7a8e16 Mon Sep 17 00:00:00 2001 From: fejiang Date: Mon, 15 Jul 2024 10:15:13 +0800 Subject: [PATCH 08/17] changed delimiter type Signed-off-by: fejiang --- .../scala/org/apache/spark/sql/rapids/stringFunctions.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala 
b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala
index 59428f0d2ae..9f86f81b4d7 100644
--- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala
+++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala
@@ -1616,7 +1616,7 @@ case class GpuSubstringIndex(strExpr: Expression,
count: GpuScalar): ColumnVector = {
val strColumnView = str.getBase
- val delimiter = GpuScalar.from(delim, StringType)
+ val delimiter = delim.getBase
val cnt = count.getValue.asInstanceOf[Int]
// Use withResource to manage the lifecycle of ColumnVector
withResource(GpuSubstringIndexUtils.substringIndex(strColumnView, delimiter, cnt)) { result =>
result.incRefCount()
}
}
From 5a31e5c5489b02129635ddcffe2cff68cec60c3d Mon Sep 17 00:00:00 2001
From: fejiang
Date: Mon, 15 Jul 2024 10:35:30 +0800
Subject: [PATCH 09/17] comment removed
Signed-off-by: fejiang
---
.../scala/org/apache/spark/sql/rapids/stringFunctions.scala | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala
index 9f86f81b4d7..b8f18b4ec27 100644
--- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala
+++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala
@@ -1618,9 +1618,8 @@ case class GpuSubstringIndex(strExpr: Expression,
val strColumnView = str.getBase
val delimiter = delim.getBase
val cnt = count.getValue.asInstanceOf[Int]
- // Use withResource to manage the lifecycle of ColumnVector
withResource(GpuSubstringIndexUtils.substringIndex(strColumnView, delimiter, cnt)) { result =>
- result.incRefCount() // Increment reference count to ensure it is not prematurely deallocated
+ result.incRefCount()
}
}
From ec47fbd80f34ea603b8af0b5233315923282d1d6 Mon Sep 17 00:00:00 2001
From: fejiang
Date: Wed, 17 Jul 2024 16:20:20 +0800
Subject: [PATCH 10/17] unwanted test case
Signed-off-by: fejiang
---
.../scala/org/apache/spark/sql/rapids/stringFunctions.scala | 4 ----
.../apache/spark/sql/rapids/utils/RapidsTestSettings.scala | 1 -
2 files changed, 5 deletions(-)
diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala
index b8f18b4ec27..07e7acd20c5 100644
--- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala
+++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala
@@ -1608,10 +1608,6 @@ case class GpuSubstringIndex(strExpr: Expression,
override def prettyName: String = "substring_index"
- // This is a bit hacked up at the moment. We are going to use a regular expression to extract
- // a single value. It only works if the delim is a single character.
A full version of - // substring_index for the GPU has been requested at https://github.com/rapidsai/cudf/issues/5158 - // spark-rapids plugin issue https://github.com/NVIDIA/spark-rapids/issues/8750 override def doColumnar(str: GpuColumnVector, delim: GpuScalar, count: GpuScalar): ColumnVector = { diff --git a/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/utils/RapidsTestSettings.scala b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/utils/RapidsTestSettings.scala index 8e1edda83ef..8d45771877f 100644 --- a/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/utils/RapidsTestSettings.scala +++ b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/utils/RapidsTestSettings.scala @@ -66,7 +66,6 @@ class RapidsTestSettings extends BackendTestSettings { enableSuite[RapidsMathFunctionsSuite] enableSuite[RapidsRegexpExpressionsSuite] enableSuite[RapidsStringExpressionsSuite] - //.exclude("string substring_index function", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/8750")) .exclude("SPARK-22550: Elt should not generate codes beyond 64KB", WONT_FIX_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10775")) .exclude("SPARK-22603: FormatString should not generate codes beyond 64KB", WONT_FIX_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10775")) enableSuite[RapidsStringFunctionsSuite] From 502531ea4725f163105af024cb86a8d483476dc8 Mon Sep 17 00:00:00 2001 From: fejiang Date: Thu, 18 Jul 2024 17:05:32 +0800 Subject: [PATCH 11/17] IT test added Signed-off-by: fejiang --- integration_tests/src/main/python/string_test.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/integration_tests/src/main/python/string_test.py b/integration_tests/src/main/python/string_test.py index 6ca0e1a1967..0ed2c4cc41a 100644 --- a/integration_tests/src/main/python/string_test.py +++ b/integration_tests/src/main/python/string_test.py @@ -77,7 +77,9 @@ def test_split_positive_limit(): @pytest.mark.parametrize('data_gen,delim', [(mk_str_gen('([ABC]{0,3}_?){0,7}'), '_'), (mk_str_gen('([MNP_]{0,3}\\.?){0,5}'), '.'), - (mk_str_gen('([123]{0,3}\\^?){0,5}'), '^')], ids=idfn) + (mk_str_gen('([123]{0,3}\\^?){0,5}'), '^'), + (mk_str_gen('([XYZ]{0,3}XYZ?){0,5}'), 'XYZ'), + (mk_str_gen('([DEF]{0,3}DELIM?){0,5}'), 'DELIM')], ids=idfn) def test_substring_index(data_gen,delim): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).select( From ddd34043e90bdc3a364eee50c9d84f93215d3697 Mon Sep 17 00:00:00 2001 From: fejiang Date: Mon, 22 Jul 2024 14:33:31 +0800 Subject: [PATCH 12/17] remove RapidExpressionsSuite Signed-off-by: fejiang --- .../suites/RapidsStringExpressionsSuite.scala | 35 ++----------------- 1 file changed, 2 insertions(+), 33 deletions(-) diff --git a/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsStringExpressionsSuite.scala b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsStringExpressionsSuite.scala index d2abb8e7623..58ca447c0f7 100644 --- a/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsStringExpressionsSuite.scala +++ b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsStringExpressionsSuite.scala @@ -19,38 +19,7 @@ spark-rapids-shim-json-lines ***/ package org.apache.spark.sql.rapids.suites -import org.apache.spark.sql.catalyst.expressions.{Literal, StringExpressionsSuite, SubstringIndex} +import org.apache.spark.sql.catalyst.expressions.StringExpressionsSuite import 
org.apache.spark.sql.rapids.utils.RapidsTestsTrait -import org.apache.spark.sql.types.StringType -class RapidsStringExpressionsSuite extends StringExpressionsSuite with RapidsTestsTrait { - test("string substring_index function in rapids") { - checkEvaluation( - SubstringIndex(Literal("www.apache.org"), Literal("."), Literal(3)), "www.apache.org") - checkEvaluation( - SubstringIndex(Literal("www.apache.org"), Literal("."), Literal(2)), "www.apache") - checkEvaluation( - SubstringIndex(Literal("www.apache.org"), Literal("."), Literal(1)), "www") - checkEvaluation( - SubstringIndex(Literal("www.apache.org"), Literal("."), Literal(0)), "") - checkEvaluation( - SubstringIndex(Literal("www.apache.org"), Literal("."), Literal(-3)), "www.apache.org") - checkEvaluation( - SubstringIndex(Literal("www.apache.org"), Literal("."), Literal(-2)), "apache.org") - checkEvaluation( - SubstringIndex(Literal("www.apache.org"), Literal("."), Literal(-1)), "org") - checkEvaluation( - SubstringIndex(Literal(""), Literal("."), Literal(-2)), "") - checkEvaluation( - SubstringIndex(Literal.create(null, StringType), Literal("."), Literal(-2)), null) - checkEvaluation(SubstringIndex( - Literal("www.apache.org"), Literal.create(null, StringType), Literal(-2)), null) - // non ascii chars - // scalastyle:off - checkEvaluation( - SubstringIndex(Literal("大千世界大千世界"), Literal( "千"), Literal(2)), "大千世界大") - // scalastyle:on - checkEvaluation( - SubstringIndex(Literal("www||apache||org"), Literal( "||"), Literal(2)), "www||apache") - } -} \ No newline at end of file +class RapidsStringExpressionsSuite extends StringExpressionsSuite with RapidsTestsTrait {} \ No newline at end of file From f99af717c7eed61c9b974ea3a9653e388e44936c Mon Sep 17 00:00:00 2001 From: fejiang Date: Mon, 22 Jul 2024 16:45:09 +0800 Subject: [PATCH 13/17] adding evaluating logic when using GpuScalar Signed-off-by: fejiang --- .../apache/spark/sql/rapids/stringFunctions.scala | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala index 07e7acd20c5..63ab413a496 100644 --- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala +++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala @@ -1610,14 +1610,12 @@ case class GpuSubstringIndex(strExpr: Expression, override def doColumnar(str: GpuColumnVector, delim: GpuScalar, count: GpuScalar): ColumnVector = { - - val strColumnView = str.getBase - val delimiter = delim.getBase - val cnt = count.getValue.asInstanceOf[Int] - withResource(GpuSubstringIndexUtils.substringIndex(strColumnView, delimiter, cnt)) { result => - result.incRefCount() + if(delim.isValid && count.isValid){ + GpuSubstringIndexUtils.substringIndex(str.getBase, delim.getBase, + count.getValue.asInstanceOf[Int]) + }else{ + GpuColumnVector.columnVectorFromNull(str.getRowCount.toInt, StringType) } - } override def doColumnar(numRows: Int, val0: GpuScalar, val1: GpuScalar, From 849d1949f43ced6103bbfcaee2279e3c7221ee13 Mon Sep 17 00:00:00 2001 From: fejiang Date: Tue, 23 Jul 2024 10:25:00 +0800 Subject: [PATCH 14/17] formatting Signed-off-by: fejiang --- .../scala/org/apache/spark/sql/rapids/stringFunctions.scala | 6 +++--- .../sql/rapids/suites/RapidsStringExpressionsSuite.scala | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala 
b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala index 63ab413a496..59c428f9bf4 100644 --- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala +++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala @@ -1610,10 +1610,10 @@ case class GpuSubstringIndex(strExpr: Expression, override def doColumnar(str: GpuColumnVector, delim: GpuScalar, count: GpuScalar): ColumnVector = { - if(delim.isValid && count.isValid){ + if (delim.isValid && count.isValid) { GpuSubstringIndexUtils.substringIndex(str.getBase, delim.getBase, - count.getValue.asInstanceOf[Int]) - }else{ + count.getValue.asInstanceOf[Int]) + } else { GpuColumnVector.columnVectorFromNull(str.getRowCount.toInt, StringType) } } diff --git a/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsStringExpressionsSuite.scala b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsStringExpressionsSuite.scala index 58ca447c0f7..164406fdf83 100644 --- a/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsStringExpressionsSuite.scala +++ b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsStringExpressionsSuite.scala @@ -22,4 +22,4 @@ package org.apache.spark.sql.rapids.suites import org.apache.spark.sql.catalyst.expressions.StringExpressionsSuite import org.apache.spark.sql.rapids.utils.RapidsTestsTrait -class RapidsStringExpressionsSuite extends StringExpressionsSuite with RapidsTestsTrait {} \ No newline at end of file +class RapidsStringExpressionsSuite extends StringExpressionsSuite with RapidsTestsTrait {} From 7b9fc147e4da833634ac0b3942bcf13017ddecb7 Mon Sep 17 00:00:00 2001 From: fejiang Date: Tue, 23 Jul 2024 10:26:50 +0800 Subject: [PATCH 15/17] formatting Signed-off-by: fejiang --- .../scala/org/apache/spark/sql/rapids/stringFunctions.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala index 59c428f9bf4..4e243c79736 100644 --- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala +++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala @@ -1611,7 +1611,7 @@ case class GpuSubstringIndex(strExpr: Expression, override def doColumnar(str: GpuColumnVector, delim: GpuScalar, count: GpuScalar): ColumnVector = { if (delim.isValid && count.isValid) { - GpuSubstringIndexUtils.substringIndex(str.getBase, delim.getBase, + GpuSubstringIndexUtils.substringIndex(str.getBase, delim.getBase, count.getValue.asInstanceOf[Int]) } else { GpuColumnVector.columnVectorFromNull(str.getRowCount.toInt, StringType) From c7cdb014af965419bfb4ac1aef33b68f375f5c97 Mon Sep 17 00:00:00 2001 From: fejiang Date: Wed, 24 Jul 2024 14:35:40 +0800 Subject: [PATCH 16/17] remove the single delim note in gpuoverride Signed-off-by: fejiang --- .../src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala index 7fa18b2b782..9429df6b83b 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala @@ -2983,8 +2983,7 @@ object GpuOverrides extends Logging { "substring_index operator", 
ExprChecks.projectOnly(TypeSig.STRING, TypeSig.STRING, Seq(ParamCheck("str", TypeSig.STRING, TypeSig.STRING), - ParamCheck("delim", TypeSig.lit(TypeEnum.STRING) - .withPsNote(TypeEnum.STRING, "only a single character is allowed"), TypeSig.STRING), + ParamCheck("delim", TypeSig.lit(TypeEnum.STRING), TypeSig.STRING), ParamCheck("count", TypeSig.lit(TypeEnum.INT), TypeSig.INT))), (in, conf, p, r) => new SubstringIndexMeta(in, conf, p, r)), expr[StringRepeat]( From bf99263c7b015e53f706aaec4212ce371dd4a20c Mon Sep 17 00:00:00 2001 From: fejiang Date: Wed, 24 Jul 2024 22:39:35 +0800 Subject: [PATCH 17/17] doc generated Signed-off-by: fejiang --- docs/supported_ops.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/supported_ops.md b/docs/supported_ops.md index 6b6c1ebf828..0bbddb460fa 100644 --- a/docs/supported_ops.md +++ b/docs/supported_ops.md @@ -16525,7 +16525,7 @@ are limited. -PS
only a single character is allowed;
Literal value only
+PS
Literal value only