-
Notifications
You must be signed in to change notification settings - Fork 1.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fix: `substr()` on a StringView column behaves inconsistently with the old version
#12383
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -118,15 +118,37 @@ pub fn substr(args: &[ArrayRef]) -> Result<ArrayRef> { | |
} | ||
} | ||
|
||
// Return the exact byte index for [start, end), set count to -1 to ignore count | ||
fn get_true_start_end(input: &str, start: usize, count: i64) -> (usize, usize) { | ||
// Convert the given `start` and `count` to valid byte indices within `input` string | ||
// Input `start` and `count` are equivalent to PostgreSQL's `substr(s, start, count)` | ||
// `start` is 1-based; if `count` is not provided, count to the end of the string | ||
// Input indices are character-based, and return values are byte indices | ||
// The input bounds can be outside string bounds, this function will return | ||
// the intersection between input bounds and valid string bounds | ||
// | ||
// * Example | ||
// 'Hi🌏' in-mem (`[]` for one char, `x` for one byte): [x][x][xxxx] | ||
// `get_true_start_end('Hi🌏', 1, None) -> (0, 6)` | ||
// `get_true_start_end('Hi🌏', 1, 1) -> (0, 1)` | ||
// `get_true_start_end('Hi🌏', -10, 2) -> (0, 0)` | ||
fn get_true_start_end(input: &str, start: i64, count: Option<u64>) -> (usize, usize) { | ||
let start = start - 1; | ||
let end = match count { | ||
Some(count) => start + count as i64, | ||
None => input.len() as i64, | ||
}; | ||
let count_to_end = count.is_some(); | ||
|
||
let start = start.clamp(0, input.len() as i64) as usize; | ||
let end = end.clamp(0, input.len() as i64) as usize; | ||
let count = end - start; | ||
|
||
let (mut st, mut ed) = (input.len(), input.len()); | ||
let mut start_counting = false; | ||
let mut cnt = 0; | ||
for (char_cnt, (byte_cnt, _)) in input.char_indices().enumerate() { | ||
if char_cnt == start { | ||
st = byte_cnt; | ||
if count != -1 { | ||
if count_to_end { | ||
start_counting = true; | ||
} else { | ||
break; | ||
|
@@ -153,20 +175,15 @@ fn make_and_append_view( | |
start: u32, | ||
) { | ||
let substr_len = substr.len(); | ||
if substr_len == 0 { | ||
null_builder.append_null(); | ||
views_buffer.push(0); | ||
let sub_view = if substr_len > 12 { | ||
let view = ByteView::from(*raw); | ||
make_view(substr.as_bytes(), view.buffer_index, view.offset + start) | ||
} else { | ||
let sub_view = if substr_len > 12 { | ||
let view = ByteView::from(*raw); | ||
make_view(substr.as_bytes(), view.buffer_index, view.offset + start) | ||
} else { | ||
// inline value does not need block id or offset | ||
make_view(substr.as_bytes(), 0, 0) | ||
}; | ||
views_buffer.push(sub_view); | ||
null_builder.append_non_null(); | ||
} | ||
// inline value does not need block id or offset | ||
make_view(substr.as_bytes(), 0, 0) | ||
Comment on lines
+178
to
+183
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Seems the inline check here is duplicated with the one within `make_view`. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I would say the point of checking the size here is to help with the operation of "modifying the views directly". But I agree that maybe we could add more API upstream in arrow to hide the logic. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Maybe we can just let it run as same as the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Oh I see what you mean. I think
It might actually work since There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Ok, it is indeed clearer to distinguish inlined and not inlined. |
||
}; | ||
views_buffer.push(sub_view); | ||
null_builder.append_non_null(); | ||
} | ||
|
||
// The decoding process refs the trait at: arrow/arrow-data/src/byte_view.rs:44 | ||
|
@@ -180,32 +197,26 @@ fn string_view_substr( | |
|
||
let start_array = as_int64_array(&args[0])?; | ||
|
||
// In either case of `substr(s, i)` or `substr(s, i, cnt)` | ||
// If any input argument is `NULL`, the result is `NULL` | ||
match args.len() { | ||
1 => { | ||
for (idx, (raw, start)) in string_view_array | ||
.views() | ||
for ((str_opt, raw_view), start_opt) in string_view_array | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The old implementation will treat an empty string in the view as `NULL`. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Another potential thing that might be faster is to iterate on the null array directly (rather than the array iterator). There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Thanks for correcting my mistakes. I really appreciate it! I'll be more careful with my code next time. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Great point, there are also other potential optimizations like specializing not null columns, scalar literal arguments, and ASCII case. So I prefer only to make a simple and correct implementation in this PR, and evaluate those optimizations as a future task |
||
.iter() | ||
.zip(string_view_array.views().iter()) | ||
.zip(start_array.iter()) | ||
.enumerate() | ||
{ | ||
if let Some(start) = start { | ||
let start = (start - 1).max(0) as usize; | ||
|
||
// Safety: | ||
// idx is always smaller or equal to string_view_array.views.len() | ||
unsafe { | ||
let str = string_view_array.value_unchecked(idx); | ||
let (start, end) = get_true_start_end(str, start, -1); | ||
let substr = &str[start..end]; | ||
if let (Some(str), Some(start)) = (str_opt, start_opt) { | ||
let (start, end) = get_true_start_end(str, start, None); | ||
let substr = &str[start..end]; | ||
|
||
make_and_append_view( | ||
&mut views_buf, | ||
&mut null_builder, | ||
raw, | ||
substr, | ||
start as u32, | ||
); | ||
} | ||
make_and_append_view( | ||
&mut views_buf, | ||
&mut null_builder, | ||
raw_view, | ||
substr, | ||
start as u32, | ||
); | ||
} else { | ||
null_builder.append_null(); | ||
views_buf.push(0); | ||
|
@@ -214,35 +225,31 @@ fn string_view_substr( | |
} | ||
2 => { | ||
let count_array = as_int64_array(&args[1])?; | ||
for (idx, ((raw, start), count)) in string_view_array | ||
.views() | ||
for (((str_opt, raw_view), start_opt), count_opt) in string_view_array | ||
.iter() | ||
.zip(string_view_array.views().iter()) | ||
.zip(start_array.iter()) | ||
.zip(count_array.iter()) | ||
.enumerate() | ||
{ | ||
if let (Some(start), Some(count)) = (start, count) { | ||
let start = (start - 1).max(0) as usize; | ||
if let (Some(str), Some(start), Some(count)) = | ||
(str_opt, start_opt, count_opt) | ||
{ | ||
if count < 0 { | ||
return exec_err!( | ||
"negative substring length not allowed: substr(<str>, {start}, {count})" | ||
); | ||
} else { | ||
// Safety: | ||
// idx is always smaller or equal to string_view_array.views.len() | ||
unsafe { | ||
let str = string_view_array.value_unchecked(idx); | ||
let (start, end) = get_true_start_end(str, start, count); | ||
let substr = &str[start..end]; | ||
|
||
make_and_append_view( | ||
&mut views_buf, | ||
&mut null_builder, | ||
raw, | ||
substr, | ||
start as u32, | ||
); | ||
} | ||
let (start, end) = | ||
get_true_start_end(str, start, Some(count as u64)); | ||
let substr = &str[start..end]; | ||
|
||
make_and_append_view( | ||
&mut views_buf, | ||
&mut null_builder, | ||
raw_view, | ||
substr, | ||
start as u32, | ||
); | ||
} | ||
} else { | ||
null_builder.append_null(); | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
😍 for the comments