Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix a substring issue for a corner case [databricks] #7040

Merged
merged 1 commit into from
Nov 11, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions integration_tests/src/main/python/string_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -329,12 +329,21 @@ def test_substring():
assert_gpu_and_cpu_are_equal_collect(
lambda spark: unary_op_df(spark, gen).selectExpr(
'SUBSTRING(a, 1, 5)',
'SUBSTRING(a, 5, 2147483647)',
'SUBSTRING(a, 5, -2147483648)',
'SUBSTRING(a, 1)',
'SUBSTRING(a, -3)',
'SUBSTRING(a, 3, -2)',
'SUBSTRING(a, 100)',
'SUBSTRING(a, -100)',
'SUBSTRING(a, NULL)',
'SUBSTRING(a, 1, NULL)',
'SUBSTRING(a, -5, 0)',
'SUBSTRING(a, -5, 4)',
'SUBSTRING(a, 10, 0)',
'SUBSTRING(a, -50, 10)',
'SUBSTRING(a, -10, -1)',
'SUBSTRING(a, 0, 10)',
'SUBSTRING(a, 0, 0)'))

def test_repeat_scalar_and_column():
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -484,22 +484,35 @@ case class GpuSubstring(str: Expression, pos: Expression, len: Expression)
val2: GpuColumnVector): ColumnVector =
throw new UnsupportedOperationException(s"Cannot columnar evaluate expression: $this")

override def doColumnar(column: GpuColumnVector,
position: GpuScalar,
length: GpuScalar): ColumnVector = {
val substringPos = position.getValue.asInstanceOf[Int]
val substringLen = length.getValue.asInstanceOf[Int]
if (substringLen < 0) { // Spark returns empty string if length is negative
column.getBase.substring(0, 0)
} else if (substringPos >= 0) { // If position is non negative
if (substringPos == 0) { // calculate substring from first character to length
column.getBase.substring(substringPos, substringLen)
} else { // calculate substring from position to length
column.getBase.substring(substringPos - 1, substringPos + substringLen - 1)
}
} else { // If position is negative, evaluate from end.
column.getBase.substring(substringPos, Integer.MAX_VALUE)
/**
 * Evaluates SUBSTRING for a string column with scalar position and length.
 *
 * The (position, length) pair is translated into the start/end arguments of
 * the base column's `substring` call:
 *  - length <= 0: every row becomes the empty string (Spark semantics).
 *  - position > 0: 1-based position, shifted to a 0-based start; the end is
 *    clamped to `Int.MaxValue` so `start + length` cannot overflow.
 *  - position == 0: treated as starting from the first character.
 *  - position < 0: the negative start is passed through; the end is either
 *    the (still negative) `position + length`, or omitted so the substring
 *    runs to the end of the string.
 *
 * @param column   the input string column
 * @param position scalar holding the Int start position
 * @param length   scalar holding the Int substring length
 * @return a new column holding the substring of each row
 */
override def doColumnar(column: GpuColumnVector, position: GpuScalar,
    length: GpuScalar): ColumnVector = {
  val startPos = position.getValue.asInstanceOf[Int]
  val subLen = length.getValue.asInstanceOf[Int]
  val base = column.getBase
  if (subLen <= 0) {
    // Spark returns an empty string when the length is negative or zero.
    base.substring(0, 0)
  } else if (startPos > 0) {
    // Convert the 1-based position to a 0-based start and clamp the end,
    // doing the addition in Long so it cannot wrap around.
    val begin = startPos - 1
    val end = math.min(begin.toLong + subLen, Int.MaxValue.toLong).toInt
    base.substring(begin, end)
  } else if (startPos == 0) {
    // Position 0 reads from the first character for `subLen` characters.
    base.substring(0, subLen)
  } else {
    // startPos < 0 and subLen > 0 here, so this sum cannot overflow.
    val end = startPos + subLen
    if (end < 0) {
      // The window stops before the end of the string: the last abs(end)
      // characters are dropped, e.g. substring("abc", -3, 1) yields "a"
      // (dropping the last 2 characters).
      base.substring(startPos, end)
    } else {
      // The window reaches or passes the end of the string, so read from
      // the start position to the end, e.g. substring("abc", -3, 4)
      // yields "abc".
      base.substring(startPos)
    }
  }
}

override def doColumnar(numRows: Int, val0: GpuScalar, val1: GpuScalar,
Expand Down