From 21ac6cfc9e5565d3bb15f1cfc5bfc042a4cdfe69 Mon Sep 17 00:00:00 2001 From: Liangcai Li Date: Fri, 11 Nov 2022 22:00:02 +0800 Subject: [PATCH] Fix a substring issue for a corner case (#7040) When "pos + len < 0 && len >= 0", GPU substring returns different strings than CPU. Signed-off-by: Firestarman --- .../src/main/python/string_test.py | 9 ++++ .../spark/sql/rapids/stringFunctions.scala | 43 ++++++++++++------- 2 files changed, 37 insertions(+), 15 deletions(-) diff --git a/integration_tests/src/main/python/string_test.py b/integration_tests/src/main/python/string_test.py index fd0cdba0149..05eee3c77cb 100644 --- a/integration_tests/src/main/python/string_test.py +++ b/integration_tests/src/main/python/string_test.py @@ -329,12 +329,21 @@ def test_substring(): assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, gen).selectExpr( 'SUBSTRING(a, 1, 5)', + 'SUBSTRING(a, 5, 2147483647)', + 'SUBSTRING(a, 5, -2147483648)', 'SUBSTRING(a, 1)', 'SUBSTRING(a, -3)', 'SUBSTRING(a, 3, -2)', 'SUBSTRING(a, 100)', + 'SUBSTRING(a, -100)', 'SUBSTRING(a, NULL)', 'SUBSTRING(a, 1, NULL)', + 'SUBSTRING(a, -5, 0)', + 'SUBSTRING(a, -5, 4)', + 'SUBSTRING(a, 10, 0)', + 'SUBSTRING(a, -50, 10)', + 'SUBSTRING(a, -10, -1)', + 'SUBSTRING(a, 0, 10)', 'SUBSTRING(a, 0, 0)')) def test_repeat_scalar_and_column(): diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala index 9b53337e259..7bfe15b67ed 100644 --- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala +++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala @@ -484,22 +484,35 @@ case class GpuSubstring(str: Expression, pos: Expression, len: Expression) val2: GpuColumnVector): ColumnVector = throw new UnsupportedOperationException(s"Cannot columnar evaluate expression: $this") - override def doColumnar(column: GpuColumnVector, - position: GpuScalar, - length: GpuScalar): ColumnVector = { - val substringPos = position.getValue.asInstanceOf[Int] - val substringLen = length.getValue.asInstanceOf[Int] - if (substringLen < 0) { // Spark returns empty string if length is negative - column.getBase.substring(0, 0) - } else if (substringPos >= 0) { // If position is non negative - if (substringPos == 0) { // calculate substring from first character to length - column.getBase.substring(substringPos, substringLen) - } else { // calculate substring from position to length - column.getBase.substring(substringPos - 1, substringPos + substringLen - 1) - } - } else { // If position is negative, evaluate from end. - column.getBase.substring(substringPos, Integer.MAX_VALUE) + override def doColumnar(column: GpuColumnVector, position: GpuScalar, + length: GpuScalar): ColumnVector = { + val pos = position.getValue.asInstanceOf[Int] + val len = length.getValue.asInstanceOf[Int] + val (start, endOpt) = if (len <= 0) { + // Spark returns empty string if length is negative or zero + (0, Some(0)) + } else if (pos > 0) { + // 1-based index, convert to 0-based index + val head = pos - 1 + val tail = if (head.toLong + len > Int.MaxValue) Int.MaxValue else head + len + (head, Some(tail)) + } else if (pos == 0) { + // 0-based index, calculate substring from 0 to length + (0, Some(len)) + } else if (pos + len < 0) { + // Drop the last "abs(substringPos + substringLen)" chars. + // e.g. + // >> substring("abc", -3, 1) + // >> "a" // dropping the last 2 [= abs(-3+1)] chars. + // `pos + len` does not overflow as `pos < 0 && len > 0` here. + (pos, Some(pos + len)) + } else { // pos + len >= 0 + // Read from start until the end. + // e.g. `substring("abc", -3, 4)` outputs "abc". + (pos, None) } + val col = column.getBase + endOpt.map(col.substring(start, _)).getOrElse(col.substring(start)) } override def doColumnar(numRows: Int, val0: GpuScalar, val1: GpuScalar,