/Users/andrewlamb/Software/datafusion/datafusion/physical-plan/src/aggregates/group_values/group_column.rs
Line | Count | Source (jump to first uncovered line) |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | use arrow::array::make_view; |
19 | | use arrow::array::BufferBuilder; |
20 | | use arrow::array::ByteView; |
21 | | use arrow::array::GenericBinaryArray; |
22 | | use arrow::array::GenericStringArray; |
23 | | use arrow::array::OffsetSizeTrait; |
24 | | use arrow::array::PrimitiveArray; |
25 | | use arrow::array::{Array, ArrayRef, ArrowPrimitiveType, AsArray}; |
26 | | use arrow::buffer::OffsetBuffer; |
27 | | use arrow::buffer::ScalarBuffer; |
28 | | use arrow::datatypes::ByteArrayType; |
29 | | use arrow::datatypes::ByteViewType; |
30 | | use arrow::datatypes::DataType; |
31 | | use arrow::datatypes::GenericBinaryType; |
32 | | use arrow_array::GenericByteViewArray; |
33 | | use arrow_buffer::Buffer; |
34 | | use datafusion_common::utils::proxy::VecAllocExt; |
35 | | |
36 | | use crate::aggregates::group_values::null_builder::MaybeNullBufferBuilder; |
37 | | use arrow_array::types::GenericStringType; |
38 | | use datafusion_physical_expr_common::binary_map::{OutputType, INITIAL_BUFFER_CAPACITY}; |
39 | | use std::marker::PhantomData; |
40 | | use std::mem; |
41 | | use std::sync::Arc; |
42 | | use std::vec; |
43 | | |
/// Default maximum size, in bytes, of a single data block held by
/// `ByteViewGroupValueBuilder` (2 MiB). `ByteViewGroupValueBuilder::new`
/// uses it as the initial `max_block_size`.
const BYTE_VIEW_MAX_BLOCK_SIZE: usize = 2 * 1024 * 1024;
45 | | |
/// Trait for storing a single column of group values in [`GroupValuesColumn`]
///
/// Implementations of this trait store an in-progress collection of group values
/// (similar to various builders in Arrow-rs) that allow for quick comparison to
/// incoming rows.
///
/// [`GroupValuesColumn`]: crate::aggregates::group_values::GroupValuesColumn
pub trait GroupColumn: Send + Sync {
    /// Returns `true` if the row stored in this builder at `lhs_row` is equal to
    /// the row in `array` at `rhs_row`
    ///
    /// Note that this comparison returns `true` if both elements are NULL
    fn equal_to(&self, lhs_row: usize, array: &ArrayRef, rhs_row: usize) -> bool;
    /// Appends the row at `row` in `array` to this builder
    fn append_val(&mut self, array: &ArrayRef, row: usize);
    /// Returns the number of rows stored in this builder
    fn len(&self) -> usize;
    /// Returns the number of bytes used by this [`GroupColumn`]
    fn size(&self) -> usize;
    /// Consumes the builder and builds a new array from all of the stored rows
    fn build(self: Box<Self>) -> ArrayRef;
    /// Builds a new array from the first `n` stored rows, shifting the
    /// remaining rows to the start of the builder
    ///
    /// The implementations in this file expect `n <= self.len()`.
    fn take_n(&mut self, n: usize) -> ArrayRef;
}
71 | | |
/// An implementation of [`GroupColumn`] for primitive values
///
/// Optimized to skip null buffer construction if the input is known to be non nullable
///
/// # Template parameters
///
/// `T`: the native Rust type that stores the data
/// `NULLABLE`: if the data can contain any nulls
#[derive(Debug)]
pub struct PrimitiveGroupValueBuilder<T: ArrowPrimitiveType, const NULLABLE: bool> {
    /// One value per stored row. Rows appended as NULL hold
    /// `T::default_value()` as a placeholder.
    group_values: Vec<T::Native>,
    /// Null mask for the stored rows; only appended to when `NULLABLE` is true.
    nulls: MaybeNullBufferBuilder,
}
85 | | |
86 | | impl<T, const NULLABLE: bool> PrimitiveGroupValueBuilder<T, NULLABLE> |
87 | | where |
88 | | T: ArrowPrimitiveType, |
89 | | { |
90 | | /// Create a new `PrimitiveGroupValueBuilder` |
91 | 29 | pub fn new() -> Self { |
92 | 29 | Self { |
93 | 29 | group_values: vec![], |
94 | 29 | nulls: MaybeNullBufferBuilder::new(), |
95 | 29 | } |
96 | 29 | } |
97 | | } |
98 | | |
99 | | impl<T: ArrowPrimitiveType, const NULLABLE: bool> GroupColumn |
100 | | for PrimitiveGroupValueBuilder<T, NULLABLE> |
101 | | { |
102 | 295k | fn equal_to(&self, lhs_row: usize, array: &ArrayRef, rhs_row: usize) -> bool { |
103 | 295k | // Perf: skip null check (by short circuit) if input is not nullable |
104 | 295k | if NULLABLE { |
105 | 295k | let exist_null = self.nulls.is_null(lhs_row); |
106 | 295k | let input_null = array.is_null(rhs_row); |
107 | 295k | if let Some(result196k ) = nulls_equal_to(exist_null, input_null) { |
108 | 196k | return result; |
109 | 98.4k | } |
110 | | // Otherwise, we need to check their values |
111 | 2 | } |
112 | | |
113 | 98.4k | self.group_values[lhs_row] == array.as_primitive::<T>().value(rhs_row) |
114 | 295k | } |
115 | | |
116 | 337 | fn append_val(&mut self, array: &ArrayRef, row: usize) { |
117 | 337 | // Perf: skip null check if input can't have nulls |
118 | 337 | if NULLABLE { |
119 | 335 | if array.is_null(row) { |
120 | 107 | self.nulls.append(true); |
121 | 107 | self.group_values.push(T::default_value()); |
122 | 228 | } else { |
123 | 228 | self.nulls.append(false); |
124 | 228 | self.group_values.push(array.as_primitive::<T>().value(row)); |
125 | 228 | } |
126 | 2 | } else { |
127 | 2 | self.group_values.push(array.as_primitive::<T>().value(row)); |
128 | 2 | } |
129 | 337 | } |
130 | | |
131 | 664 | fn len(&self) -> usize { |
132 | 664 | self.group_values.len() |
133 | 664 | } |
134 | | |
135 | 92 | fn size(&self) -> usize { |
136 | 92 | self.group_values.allocated_size() + self.nulls.allocated_size() |
137 | 92 | } |
138 | | |
139 | 23 | fn build(self: Box<Self>) -> ArrayRef { |
140 | 23 | let Self { |
141 | 23 | group_values, |
142 | 23 | nulls, |
143 | 23 | } = *self; |
144 | 23 | |
145 | 23 | let nulls = nulls.build(); |
146 | 23 | if !NULLABLE { |
147 | 0 | assert!(nulls.is_none(), "unexpected nulls in non nullable input"); |
148 | 23 | } |
149 | | |
150 | 23 | Arc::new(PrimitiveArray::<T>::new( |
151 | 23 | ScalarBuffer::from(group_values), |
152 | 23 | nulls, |
153 | 23 | )) |
154 | 23 | } |
155 | | |
156 | 8 | fn take_n(&mut self, n: usize) -> ArrayRef { |
157 | 8 | let first_n = self.group_values.drain(0..n).collect::<Vec<_>>(); |
158 | | |
159 | 8 | let first_n_nulls = if NULLABLE { self.nulls.take_n(n) } else { None0 }; |
160 | | |
161 | 8 | Arc::new(PrimitiveArray::<T>::new( |
162 | 8 | ScalarBuffer::from(first_n), |
163 | 8 | first_n_nulls, |
164 | 8 | )) |
165 | 8 | } |
166 | | } |
167 | | |
/// An implementation of [`GroupColumn`] for binary and utf8 types.
///
/// Stores a collection of binary or utf8 group values in a single buffer
/// in a way that allows:
///
/// 1. Efficient comparison of incoming rows to existing rows
/// 2. Efficient construction of the final output array
pub struct ByteGroupValueBuilder<O>
where
    O: OffsetSizeTrait,
{
    /// Whether the output array is built as binary or utf8.
    output_type: OutputType,
    /// The bytes of all stored values, concatenated in row order.
    buffer: BufferBuilder<u8>,
    /// Offsets into `buffer` for each distinct value. These offsets are used
    /// directly to create the final `GenericBinaryArray`. The `i`th string is
    /// stored in the range `offsets[i]..offsets[i+1]` in `buffer`. Null values
    /// are stored as a zero length string.
    offsets: Vec<O>,
    /// Null mask for the stored rows
    nulls: MaybeNullBufferBuilder,
}
189 | | |
190 | | impl<O> ByteGroupValueBuilder<O> |
191 | | where |
192 | | O: OffsetSizeTrait, |
193 | | { |
194 | 2 | pub fn new(output_type: OutputType) -> Self { |
195 | 2 | Self { |
196 | 2 | output_type, |
197 | 2 | buffer: BufferBuilder::new(INITIAL_BUFFER_CAPACITY), |
198 | 2 | offsets: vec![O::default()], |
199 | 2 | nulls: MaybeNullBufferBuilder::new(), |
200 | 2 | } |
201 | 2 | } |
202 | | |
203 | 15 | fn append_val_inner<B>(&mut self, array: &ArrayRef, row: usize) |
204 | 15 | where |
205 | 15 | B: ByteArrayType, |
206 | 15 | { |
207 | 15 | let arr = array.as_bytes::<B>(); |
208 | 15 | if arr.is_null(row) { |
209 | 8 | self.nulls.append(true); |
210 | 8 | // nulls need a zero length in the offset buffer |
211 | 8 | let offset = self.buffer.len(); |
212 | 8 | self.offsets.push(O::usize_as(offset)); |
213 | 8 | } else { |
214 | 7 | self.nulls.append(false); |
215 | 7 | let value: &[u8] = arr.value(row).as_ref(); |
216 | 7 | self.buffer.append_slice(value); |
217 | 7 | self.offsets.push(O::usize_as(self.buffer.len())); |
218 | 7 | } |
219 | 15 | } |
220 | | |
221 | 6 | fn equal_to_inner<B>(&self, lhs_row: usize, array: &ArrayRef, rhs_row: usize) -> bool |
222 | 6 | where |
223 | 6 | B: ByteArrayType, |
224 | 6 | { |
225 | 6 | let array = array.as_bytes::<B>(); |
226 | 6 | let exist_null = self.nulls.is_null(lhs_row); |
227 | 6 | let input_null = array.is_null(rhs_row); |
228 | 6 | if let Some(result4 ) = nulls_equal_to(exist_null, input_null) { |
229 | 4 | return result; |
230 | 2 | } |
231 | 2 | // Otherwise, we need to check their values |
232 | 2 | self.value(lhs_row) == (array.value(rhs_row).as_ref() as &[u8]) |
233 | 6 | } |
234 | | |
235 | | /// return the current value of the specified row irrespective of null |
236 | 2 | pub fn value(&self, row: usize) -> &[u8] { |
237 | 2 | let l = self.offsets[row].as_usize(); |
238 | 2 | let r = self.offsets[row + 1].as_usize(); |
239 | 2 | // Safety: the offsets are constructed correctly and never decrease |
240 | 2 | unsafe { self.buffer.as_slice().get_unchecked(l..r) } |
241 | 2 | } |
242 | | } |
243 | | |
244 | | impl<O> GroupColumn for ByteGroupValueBuilder<O> |
245 | | where |
246 | | O: OffsetSizeTrait, |
247 | | { |
248 | 6 | fn equal_to(&self, lhs_row: usize, column: &ArrayRef, rhs_row: usize) -> bool { |
249 | 6 | // Sanity array type |
250 | 6 | match self.output_type { |
251 | | OutputType::Binary => { |
252 | 0 | debug_assert!(matches!( |
253 | 0 | column.data_type(), |
254 | | DataType::Binary | DataType::LargeBinary |
255 | | )); |
256 | 0 | self.equal_to_inner::<GenericBinaryType<O>>(lhs_row, column, rhs_row) |
257 | | } |
258 | | OutputType::Utf8 => { |
259 | 6 | debug_assert!(matches!0 ( |
260 | 6 | column.data_type(), |
261 | | DataType::Utf8 | DataType::LargeUtf8 |
262 | | )); |
263 | 6 | self.equal_to_inner::<GenericStringType<O>>(lhs_row, column, rhs_row) |
264 | | } |
265 | 0 | _ => unreachable!("View types should use `ArrowBytesViewMap`"), |
266 | | } |
267 | 6 | } |
268 | | |
269 | 15 | fn append_val(&mut self, column: &ArrayRef, row: usize) { |
270 | 15 | // Sanity array type |
271 | 15 | match self.output_type { |
272 | | OutputType::Binary => { |
273 | 0 | debug_assert!(matches!( |
274 | 0 | column.data_type(), |
275 | | DataType::Binary | DataType::LargeBinary |
276 | | )); |
277 | 0 | self.append_val_inner::<GenericBinaryType<O>>(column, row) |
278 | | } |
279 | | OutputType::Utf8 => { |
280 | 15 | debug_assert!(matches!0 ( |
281 | 15 | column.data_type(), |
282 | | DataType::Utf8 | DataType::LargeUtf8 |
283 | | )); |
284 | 15 | self.append_val_inner::<GenericStringType<O>>(column, row) |
285 | | } |
286 | 0 | _ => unreachable!("View types should use `ArrowBytesViewMap`"), |
287 | | }; |
288 | 15 | } |
289 | | |
290 | 3 | fn len(&self) -> usize { |
291 | 3 | self.offsets.len() - 1 |
292 | 3 | } |
293 | | |
294 | 0 | fn size(&self) -> usize { |
295 | 0 | self.buffer.capacity() * std::mem::size_of::<u8>() |
296 | 0 | + self.offsets.allocated_size() |
297 | 0 | + self.nulls.allocated_size() |
298 | 0 | } |
299 | | |
300 | 0 | fn build(self: Box<Self>) -> ArrayRef { |
301 | 0 | let Self { |
302 | 0 | output_type, |
303 | 0 | mut buffer, |
304 | 0 | offsets, |
305 | 0 | nulls, |
306 | 0 | } = *self; |
307 | 0 |
|
308 | 0 | let null_buffer = nulls.build(); |
309 | 0 |
|
310 | 0 | // SAFETY: the offsets were constructed correctly in `insert_if_new` -- |
311 | 0 | // monotonically increasing, overflows were checked. |
312 | 0 | let offsets = unsafe { OffsetBuffer::new_unchecked(ScalarBuffer::from(offsets)) }; |
313 | 0 | let values = buffer.finish(); |
314 | 0 | match output_type { |
315 | | OutputType::Binary => { |
316 | | // SAFETY: the offsets were constructed correctly |
317 | 0 | Arc::new(unsafe { |
318 | 0 | GenericBinaryArray::new_unchecked(offsets, values, null_buffer) |
319 | 0 | }) |
320 | | } |
321 | | OutputType::Utf8 => { |
322 | | // SAFETY: |
323 | | // 1. the offsets were constructed safely |
324 | | // |
325 | | // 2. the input arrays were all the correct type and thus since |
326 | | // all the values that went in were valid (e.g. utf8) so are all |
327 | | // the values that come out |
328 | 0 | Arc::new(unsafe { |
329 | 0 | GenericStringArray::new_unchecked(offsets, values, null_buffer) |
330 | 0 | }) |
331 | | } |
332 | 0 | _ => unreachable!("View types should use `ArrowBytesViewMap`"), |
333 | | } |
334 | 0 | } |
335 | | |
336 | 3 | fn take_n(&mut self, n: usize) -> ArrayRef { |
337 | 3 | debug_assert!(self.len() >= n); |
338 | 3 | let null_buffer = self.nulls.take_n(n); |
339 | 3 | let first_remaining_offset = O::as_usize(self.offsets[n]); |
340 | 3 | |
341 | 3 | // Given offests like [0, 2, 4, 5] and n = 1, we expect to get |
342 | 3 | // offsets [0, 2, 3]. We first create two offsets for first_n as [0, 2] and the remaining as [2, 4, 5]. |
343 | 3 | // And we shift the offset starting from 0 for the remaining one, [2, 4, 5] -> [0, 2, 3]. |
344 | 3 | let mut first_n_offsets = self.offsets.drain(0..n).collect::<Vec<_>>(); |
345 | 3 | let offset_n = *self.offsets.first().unwrap(); |
346 | 3 | self.offsets |
347 | 3 | .iter_mut() |
348 | 7 | .for_each(|offset| *offset = offset.sub(offset_n)); |
349 | 3 | first_n_offsets.push(offset_n); |
350 | 3 | |
351 | 3 | // SAFETY: the offsets were constructed correctly in `insert_if_new` -- |
352 | 3 | // monotonically increasing, overflows were checked. |
353 | 3 | let offsets = |
354 | 3 | unsafe { OffsetBuffer::new_unchecked(ScalarBuffer::from(first_n_offsets)) }; |
355 | 3 | |
356 | 3 | let mut remaining_buffer = |
357 | 3 | BufferBuilder::new(self.buffer.len() - first_remaining_offset); |
358 | 3 | // TODO: Current approach copy the remaining and truncate the original one |
359 | 3 | // Find out a way to avoid copying buffer but split the original one into two. |
360 | 3 | remaining_buffer.append_slice(&self.buffer.as_slice()[first_remaining_offset..]); |
361 | 3 | self.buffer.truncate(first_remaining_offset); |
362 | 3 | let values = self.buffer.finish(); |
363 | 3 | self.buffer = remaining_buffer; |
364 | 3 | |
365 | 3 | match self.output_type { |
366 | | OutputType::Binary => { |
367 | | // SAFETY: the offsets were constructed correctly |
368 | 0 | Arc::new(unsafe { |
369 | 0 | GenericBinaryArray::new_unchecked(offsets, values, null_buffer) |
370 | 0 | }) |
371 | | } |
372 | | OutputType::Utf8 => { |
373 | | // SAFETY: |
374 | | // 1. the offsets were constructed safely |
375 | | // |
376 | | // 2. we asserted the input arrays were all the correct type and |
377 | | // thus since all the values that went in were valid (e.g. utf8) |
378 | | // so are all the values that come out |
379 | 3 | Arc::new(unsafe { |
380 | 3 | GenericStringArray::new_unchecked(offsets, values, null_buffer) |
381 | 3 | }) |
382 | | } |
383 | 0 | _ => unreachable!("View types should use `ArrowBytesViewMap`"), |
384 | | } |
385 | 3 | } |
386 | | } |
387 | | |
/// An implementation of [`GroupColumn`] for binary view and utf8 view types.
///
/// Stores a collection of binary view or utf8 view group values in a buffer
/// whose structure is similar to `GenericByteViewArray`, which gives these benefits:
///
/// 1. Efficient comparison of incoming rows to existing rows
/// 2. Efficient construction of the final output array
/// 3. More efficient `take_n` compared to using `GenericByteViewBuilder`
pub struct ByteViewGroupValueBuilder<B: ByteViewType> {
    /// The views of string values
    ///
    /// If string len <= 12, the view's format will be:
    ///   string(12B) | len(4B)
    ///
    /// If string len > 12, its format will be:
    ///   offset(4B) | buffer_index(4B) | prefix(4B) | len(4B)
    views: Vec<u128>,

    /// The block currently being filled
    ///
    /// New values are appended to it until its remaining capacity is not
    /// enough (see `max_block_size` for details).
    in_progress: Vec<u8>,

    /// The completed blocks
    completed: Vec<Buffer>,

    /// The max size of `in_progress`
    ///
    /// `in_progress` is flushed into `completed`, and a new `in_progress` is
    /// created, when its remaining capacity (`max_block_size` - `len(in_progress)`)
    /// is not enough to store the appended value.
    ///
    /// Currently it is fixed at 2MB.
    max_block_size: usize,

    /// Null mask for the stored rows
    nulls: MaybeNullBufferBuilder,

    /// phantom data so the type requires `<B>`
    _phantom: PhantomData<B>,
}
430 | | |
431 | | impl<B: ByteViewType> ByteViewGroupValueBuilder<B> { |
432 | 7 | pub fn new() -> Self { |
433 | 7 | Self { |
434 | 7 | views: Vec::new(), |
435 | 7 | in_progress: Vec::new(), |
436 | 7 | completed: Vec::new(), |
437 | 7 | max_block_size: BYTE_VIEW_MAX_BLOCK_SIZE, |
438 | 7 | nulls: MaybeNullBufferBuilder::new(), |
439 | 7 | _phantom: PhantomData {}, |
440 | 7 | } |
441 | 7 | } |
442 | | |
443 | | /// Set the max block size |
444 | 6 | fn with_max_block_size(mut self, max_block_size: usize) -> Self { |
445 | 6 | self.max_block_size = max_block_size; |
446 | 6 | self |
447 | 6 | } |
448 | | |
449 | 53 | fn append_val_inner(&mut self, array: &ArrayRef, row: usize) |
450 | 53 | where |
451 | 53 | B: ByteViewType, |
452 | 53 | { |
453 | 53 | let arr = array.as_byte_view::<B>(); |
454 | 53 | |
455 | 53 | // Null row case, set and return |
456 | 53 | if arr.is_null(row) { |
457 | 19 | self.nulls.append(true); |
458 | 19 | self.views.push(0); |
459 | 19 | return; |
460 | 34 | } |
461 | 34 | |
462 | 34 | // Not null row case |
463 | 34 | self.nulls.append(false); |
464 | 34 | let value: &[u8] = arr.value(row).as_ref(); |
465 | 34 | |
466 | 34 | let value_len = value.len(); |
467 | 34 | let view = if value_len <= 12 { |
468 | 18 | make_view(value, 0, 0) |
469 | | } else { |
470 | | // Ensure big enough block to hold the value firstly |
471 | 16 | self.ensure_in_progress_big_enough(value_len); |
472 | 16 | |
473 | 16 | // Append value |
474 | 16 | let buffer_index = self.completed.len(); |
475 | 16 | let offset = self.in_progress.len(); |
476 | 16 | self.in_progress.extend_from_slice(value); |
477 | 16 | |
478 | 16 | make_view(value, buffer_index as u32, offset as u32) |
479 | | }; |
480 | | |
481 | | // Append view |
482 | 34 | self.views.push(view); |
483 | 53 | } |
484 | | |
485 | 16 | fn ensure_in_progress_big_enough(&mut self, value_len: usize) { |
486 | 16 | debug_assert!(value_len > 12); |
487 | 16 | let require_cap = self.in_progress.len() + value_len; |
488 | 16 | |
489 | 16 | // If current block isn't big enough, flush it and create a new in progress block |
490 | 16 | if require_cap > self.max_block_size { |
491 | 6 | let flushed_block = mem::replace( |
492 | 6 | &mut self.in_progress, |
493 | 6 | Vec::with_capacity(self.max_block_size), |
494 | 6 | ); |
495 | 6 | let buffer = Buffer::from_vec(flushed_block); |
496 | 6 | self.completed.push(buffer); |
497 | 10 | } |
498 | 16 | } |
499 | | |
500 | 7 | fn equal_to_inner(&self, lhs_row: usize, array: &ArrayRef, rhs_row: usize) -> bool { |
501 | 7 | let array = array.as_byte_view::<B>(); |
502 | 7 | |
503 | 7 | // Check if nulls equal firstly |
504 | 7 | let exist_null = self.nulls.is_null(lhs_row); |
505 | 7 | let input_null = array.is_null(rhs_row); |
506 | 7 | if let Some(result5 ) = nulls_equal_to(exist_null, input_null) { |
507 | 5 | return result; |
508 | 2 | } |
509 | 2 | |
510 | 2 | // Otherwise, we need to check their values |
511 | 2 | let exist_view = self.views[lhs_row]; |
512 | 2 | let exist_view_len = exist_view as u32; |
513 | 2 | |
514 | 2 | let input_view = array.views()[rhs_row]; |
515 | 2 | let input_view_len = input_view as u32; |
516 | 2 | |
517 | 2 | // The check logic |
518 | 2 | // - Check len equality |
519 | 2 | // - If inlined, check inlined value |
520 | 2 | // - If non-inlined, check prefix and then check value in buffer |
521 | 2 | // when needed |
522 | 2 | if exist_view_len != input_view_len { |
523 | 1 | return false; |
524 | 1 | } |
525 | 1 | |
526 | 1 | if exist_view_len <= 12 { |
527 | 1 | let exist_inline = unsafe { |
528 | 1 | GenericByteViewArray::<B>::inline_value( |
529 | 1 | &exist_view, |
530 | 1 | exist_view_len as usize, |
531 | 1 | ) |
532 | 1 | }; |
533 | 1 | let input_inline = unsafe { |
534 | 1 | GenericByteViewArray::<B>::inline_value( |
535 | 1 | &input_view, |
536 | 1 | input_view_len as usize, |
537 | 1 | ) |
538 | 1 | }; |
539 | 1 | exist_inline == input_inline |
540 | | } else { |
541 | 0 | let exist_prefix = |
542 | 0 | unsafe { GenericByteViewArray::<B>::inline_value(&exist_view, 4) }; |
543 | 0 | let input_prefix = |
544 | 0 | unsafe { GenericByteViewArray::<B>::inline_value(&input_view, 4) }; |
545 | 0 |
|
546 | 0 | if exist_prefix != input_prefix { |
547 | 0 | return false; |
548 | 0 | } |
549 | 0 |
|
550 | 0 | let exist_full = { |
551 | 0 | let byte_view = ByteView::from(exist_view); |
552 | 0 | self.value( |
553 | 0 | byte_view.buffer_index as usize, |
554 | 0 | byte_view.offset as usize, |
555 | 0 | byte_view.length as usize, |
556 | 0 | ) |
557 | 0 | }; |
558 | 0 | let input_full: &[u8] = unsafe { array.value_unchecked(rhs_row).as_ref() }; |
559 | 0 | exist_full == input_full |
560 | | } |
561 | 7 | } |
562 | | |
563 | 0 | fn value(&self, buffer_index: usize, offset: usize, length: usize) -> &[u8] { |
564 | 0 | debug_assert!(buffer_index <= self.completed.len()); |
565 | | |
566 | 0 | if buffer_index < self.completed.len() { |
567 | 0 | let block = &self.completed[buffer_index]; |
568 | 0 | &block[offset..offset + length] |
569 | | } else { |
570 | 0 | &self.in_progress[offset..offset + length] |
571 | | } |
572 | 0 | } |
573 | | |
574 | 4 | fn build_inner(self) -> ArrayRef { |
575 | 4 | let Self { |
576 | 4 | views, |
577 | 4 | in_progress, |
578 | 4 | mut completed, |
579 | 4 | nulls, |
580 | 4 | .. |
581 | 4 | } = self; |
582 | 4 | |
583 | 4 | // Build nulls |
584 | 4 | let null_buffer = nulls.build(); |
585 | 4 | |
586 | 4 | // Build values |
587 | 4 | // Flush `in_process` firstly |
588 | 4 | if !in_progress.is_empty() { |
589 | 4 | let buffer = Buffer::from(in_progress); |
590 | 4 | completed.push(buffer); |
591 | 4 | }0 |
592 | | |
593 | 4 | let views = ScalarBuffer::from(views); |
594 | 4 | |
595 | 4 | Arc::new(GenericByteViewArray::<B>::new( |
596 | 4 | views, |
597 | 4 | completed, |
598 | 4 | null_buffer, |
599 | 4 | )) |
600 | 4 | } |
601 | | |
602 | 9 | fn take_n_inner(&mut self, n: usize) -> ArrayRef { |
603 | 9 | debug_assert!(self.len() >= n); |
604 | | |
605 | | // The `n == len` case, we need to take all |
606 | 9 | if self.len() == n { |
607 | 3 | let new_builder = Self::new().with_max_block_size(self.max_block_size); |
608 | 3 | let cur_builder = std::mem::replace(self, new_builder); |
609 | 3 | return cur_builder.build_inner(); |
610 | 6 | } |
611 | 6 | |
612 | 6 | // The `n < len` case |
613 | 6 | // Take n for nulls |
614 | 6 | let null_buffer = self.nulls.take_n(n); |
615 | 6 | |
616 | 6 | // Take n for values: |
617 | 6 | // - Take first n `view`s from `views` |
618 | 6 | // |
619 | 6 | // - Find the last non-inlined `view`, if all inlined, |
620 | 6 | // we can build array and return happily, otherwise we |
621 | 6 | // we need to continue to process related buffers |
622 | 6 | // |
623 | 6 | // - Get the last related `buffer index`(let's name it `buffer index n`) |
624 | 6 | // from last non-inlined `view` |
625 | 6 | // |
626 | 6 | // - Take buffers, the key is that we need to know if we need to take |
627 | 6 | // the whole last related buffer. The logic is a bit complex, you can |
628 | 6 | // detail in `take_buffers_with_whole_last`, `take_buffers_with_partial_last` |
629 | 6 | // and other related steps in following |
630 | 6 | // |
631 | 6 | // - Shift the `buffer index` of remaining non-inlined `views` |
632 | 6 | // |
633 | 6 | let first_n_views = self.views.drain(0..n).collect::<Vec<_>>(); |
634 | 6 | |
635 | 6 | let last_non_inlined_view = first_n_views |
636 | 6 | .iter() |
637 | 6 | .rev() |
638 | 9 | .find(|view| ((**view) as u32) > 12)6 ; |
639 | | |
640 | 6 | if let Some(view4 ) = last_non_inlined_view { |
641 | 4 | let view = ByteView::from(*view); |
642 | 4 | let last_related_buffer_index = view.buffer_index as usize; |
643 | 4 | |
644 | 4 | // Check should we take the whole `last_related_buffer_index` buffer |
645 | 4 | let take_whole_last_buffer = self.should_take_whole_buffer( |
646 | 4 | last_related_buffer_index, |
647 | 4 | (view.offset + view.length) as usize, |
648 | 4 | ); |
649 | | |
650 | | // Take related buffers |
651 | 4 | let buffers = if take_whole_last_buffer { |
652 | 2 | self.take_buffers_with_whole_last(last_related_buffer_index) |
653 | | } else { |
654 | 2 | self.take_buffers_with_partial_last( |
655 | 2 | last_related_buffer_index, |
656 | 2 | (view.offset + view.length) as usize, |
657 | 2 | ) |
658 | | }; |
659 | | |
660 | | // Shift `buffer index`s finally |
661 | 4 | let shifts = if take_whole_last_buffer { |
662 | 2 | last_related_buffer_index + 1 |
663 | | } else { |
664 | 2 | last_related_buffer_index |
665 | | }; |
666 | | |
667 | 20 | self.views.iter_mut().for_each(4 |view| { |
668 | 20 | if (*view as u32) > 12 { |
669 | 10 | let mut byte_view = ByteView::from(*view); |
670 | 10 | byte_view.buffer_index -= shifts as u32; |
671 | 10 | *view = byte_view.as_u128(); |
672 | 10 | } |
673 | 20 | }); |
674 | 4 | |
675 | 4 | // Build array and return |
676 | 4 | let views = ScalarBuffer::from(first_n_views); |
677 | 4 | Arc::new(GenericByteViewArray::<B>::new(views, buffers, null_buffer)) |
678 | | } else { |
679 | 2 | let views = ScalarBuffer::from(first_n_views); |
680 | 2 | Arc::new(GenericByteViewArray::<B>::new( |
681 | 2 | views, |
682 | 2 | Vec::new(), |
683 | 2 | null_buffer, |
684 | 2 | )) |
685 | | } |
686 | 9 | } |
687 | | |
688 | 2 | fn take_buffers_with_whole_last( |
689 | 2 | &mut self, |
690 | 2 | last_related_buffer_index: usize, |
691 | 2 | ) -> Vec<Buffer> { |
692 | 2 | if last_related_buffer_index == self.completed.len() { |
693 | 0 | self.flush_in_progress(); |
694 | 2 | } |
695 | 2 | self.completed |
696 | 2 | .drain(0..last_related_buffer_index + 1) |
697 | 2 | .collect() |
698 | 2 | } |
699 | | |
700 | 2 | fn take_buffers_with_partial_last( |
701 | 2 | &mut self, |
702 | 2 | last_related_buffer_index: usize, |
703 | 2 | take_len: usize, |
704 | 2 | ) -> Vec<Buffer> { |
705 | 2 | let mut take_buffers = Vec::with_capacity(last_related_buffer_index + 1); |
706 | 2 | |
707 | 2 | // Take `0 ~ last_related_buffer_index - 1` buffers |
708 | 2 | if !self.completed.is_empty() || last_related_buffer_index == 01 { |
709 | 2 | take_buffers.extend(self.completed.drain(0..last_related_buffer_index)); |
710 | 2 | }0 |
711 | | |
712 | | // Process the `last_related_buffer_index` buffers |
713 | 2 | let last_buffer = if last_related_buffer_index < self.completed.len() { |
714 | | // If it is in `completed`, simply clone |
715 | 1 | self.completed[last_related_buffer_index].clone() |
716 | | } else { |
717 | | // If it is `in_progress`, copied `0 ~ offset` part |
718 | 1 | let taken_last_buffer = self.in_progress[0..take_len].to_vec(); |
719 | 1 | Buffer::from_vec(taken_last_buffer) |
720 | | }; |
721 | 2 | take_buffers.push(last_buffer); |
722 | 2 | |
723 | 2 | take_buffers |
724 | 2 | } |
725 | | |
726 | | #[inline] |
727 | 4 | fn should_take_whole_buffer(&self, buffer_index: usize, take_len: usize) -> bool { |
728 | 4 | if buffer_index < self.completed.len() { |
729 | 3 | take_len == self.completed[buffer_index].len() |
730 | | } else { |
731 | 1 | take_len == self.in_progress.len() |
732 | | } |
733 | 4 | } |
734 | | |
735 | 0 | fn flush_in_progress(&mut self) { |
736 | 0 | let flushed_block = mem::replace( |
737 | 0 | &mut self.in_progress, |
738 | 0 | Vec::with_capacity(self.max_block_size), |
739 | 0 | ); |
740 | 0 | let buffer = Buffer::from_vec(flushed_block); |
741 | 0 | self.completed.push(buffer); |
742 | 0 | } |
743 | | } |
744 | | |
745 | | impl<B: ByteViewType> GroupColumn for ByteViewGroupValueBuilder<B> { |
746 | 7 | fn equal_to(&self, lhs_row: usize, array: &ArrayRef, rhs_row: usize) -> bool { |
747 | 7 | self.equal_to_inner(lhs_row, array, rhs_row) |
748 | 7 | } |
749 | | |
750 | 53 | fn append_val(&mut self, array: &ArrayRef, row: usize) { |
751 | 53 | self.append_val_inner(array, row) |
752 | 53 | } |
753 | | |
754 | 18 | fn len(&self) -> usize { |
755 | 18 | self.views.len() |
756 | 18 | } |
757 | | |
758 | 0 | fn size(&self) -> usize { |
759 | 0 | let buffers_size = self |
760 | 0 | .completed |
761 | 0 | .iter() |
762 | 0 | .map(|buf| buf.capacity() * std::mem::size_of::<u8>()) |
763 | 0 | .sum::<usize>(); |
764 | 0 |
|
765 | 0 | self.nulls.allocated_size() |
766 | 0 | + self.views.capacity() * std::mem::size_of::<u128>() |
767 | 0 | + self.in_progress.capacity() * std::mem::size_of::<u8>() |
768 | 0 | + buffers_size |
769 | 0 | + std::mem::size_of::<Self>() |
770 | 0 | } |
771 | | |
772 | 1 | fn build(self: Box<Self>) -> ArrayRef { |
773 | 1 | Self::build_inner(*self) |
774 | 1 | } |
775 | | |
776 | 9 | fn take_n(&mut self, n: usize) -> ArrayRef { |
777 | 9 | self.take_n_inner(n) |
778 | 9 | } |
779 | | } |
780 | | |
/// Tries to decide a row comparison from the nullness of the two sides alone.
///
/// Returns `Some(true)` when both sides are null, `Some(false)` when exactly
/// one side is null, and `None` when neither is null, in which case the
/// caller has to compare the values themselves.
fn nulls_equal_to(lhs_null: bool, rhs_null: bool) -> Option<bool> {
    if lhs_null || rhs_null {
        // At least one null: the rows are equal only if both are null
        Some(lhs_null == rhs_null)
    } else {
        None
    }
}
794 | | |
795 | | #[cfg(test)] |
796 | | mod tests { |
797 | | use std::sync::Arc; |
798 | | |
799 | | use arrow::{ |
800 | | array::AsArray, |
801 | | datatypes::{Int64Type, StringViewType}, |
802 | | }; |
803 | | use arrow_array::{ArrayRef, Int64Array, StringArray, StringViewArray}; |
804 | | use arrow_buffer::{BooleanBufferBuilder, NullBuffer}; |
805 | | use datafusion_physical_expr::binary_map::OutputType; |
806 | | |
807 | | use crate::aggregates::group_values::group_column::{ |
808 | | ByteViewGroupValueBuilder, PrimitiveGroupValueBuilder, |
809 | | }; |
810 | | |
811 | | use super::{ByteGroupValueBuilder, GroupColumn}; |
812 | | |
813 | | #[test] |
814 | 1 | fn test_take_n() { |
815 | 1 | let mut builder = ByteGroupValueBuilder::<i32>::new(OutputType::Utf8); |
816 | 1 | let array = Arc::new(StringArray::from(vec![Some("a"), None])) as ArrayRef; |
817 | 1 | // a, null, null |
818 | 1 | builder.append_val(&array, 0); |
819 | 1 | builder.append_val(&array, 1); |
820 | 1 | builder.append_val(&array, 1); |
821 | 1 | |
822 | 1 | // (a, null) remaining: null |
823 | 1 | let output = builder.take_n(2); |
824 | 1 | assert_eq!(&output, &array); |
825 | | |
826 | | // null, a, null, a |
827 | 1 | builder.append_val(&array, 0); |
828 | 1 | builder.append_val(&array, 1); |
829 | 1 | builder.append_val(&array, 0); |
830 | 1 | |
831 | 1 | // (null, a) remaining: (null, a) |
832 | 1 | let output = builder.take_n(2); |
833 | 1 | let array = Arc::new(StringArray::from(vec![None, Some("a")])) as ArrayRef; |
834 | 1 | assert_eq!(&output, &array); |
835 | | |
836 | 1 | let array = Arc::new(StringArray::from(vec![ |
837 | 1 | Some("a"), |
838 | 1 | None, |
839 | 1 | Some("longstringfortest"), |
840 | 1 | ])) as ArrayRef; |
841 | 1 | |
842 | 1 | // null, a, longstringfortest, null, null |
843 | 1 | builder.append_val(&array, 2); |
844 | 1 | builder.append_val(&array, 1); |
845 | 1 | builder.append_val(&array, 1); |
846 | 1 | |
847 | 1 | // (null, a, longstringfortest, null) remaining: (null) |
848 | 1 | let output = builder.take_n(4); |
849 | 1 | let array = Arc::new(StringArray::from(vec![ |
850 | 1 | None, |
851 | 1 | Some("a"), |
852 | 1 | Some("longstringfortest"), |
853 | 1 | None, |
854 | 1 | ])) as ArrayRef; |
855 | 1 | assert_eq!(&output, &array); |
856 | 1 | } |
857 | | |
858 | | #[test] |
859 | 1 | fn test_nullable_primitive_equal_to() { |
860 | 1 | // Will cover such cases: |
861 | 1 | // - exist null, input not null |
862 | 1 | // - exist null, input null; values not equal |
863 | 1 | // - exist null, input null; values equal |
864 | 1 | // - exist not null, input null |
865 | 1 | // - exist not null, input not null; values not equal |
866 | 1 | // - exist not null, input not null; values equal |
867 | 1 | |
868 | 1 | // Define PrimitiveGroupValueBuilder |
869 | 1 | let mut builder = PrimitiveGroupValueBuilder::<Int64Type, true>::new(); |
870 | 1 | let builder_array = Arc::new(Int64Array::from(vec![ |
871 | 1 | None, |
872 | 1 | None, |
873 | 1 | None, |
874 | 1 | Some(1), |
875 | 1 | Some(2), |
876 | 1 | Some(3), |
877 | 1 | ])) as ArrayRef; |
878 | 1 | builder.append_val(&builder_array, 0); |
879 | 1 | builder.append_val(&builder_array, 1); |
880 | 1 | builder.append_val(&builder_array, 2); |
881 | 1 | builder.append_val(&builder_array, 3); |
882 | 1 | builder.append_val(&builder_array, 4); |
883 | 1 | builder.append_val(&builder_array, 5); |
884 | 1 | |
885 | 1 | // Define input array |
886 | 1 | let (_nulls, values, _) = |
887 | 1 | Int64Array::from(vec![Some(1), Some(2), None, None, Some(1), Some(3)]) |
888 | 1 | .into_parts(); |
889 | 1 | |
890 | 1 | // explicitly build a boolean buffer where one of the null values also happens to match |
891 | 1 | let mut boolean_buffer_builder = BooleanBufferBuilder::new(6); |
892 | 1 | boolean_buffer_builder.append(true); |
893 | 1 | boolean_buffer_builder.append(false); // this sets Some(2) to null above |
894 | 1 | boolean_buffer_builder.append(false); |
895 | 1 | boolean_buffer_builder.append(false); |
896 | 1 | boolean_buffer_builder.append(true); |
897 | 1 | boolean_buffer_builder.append(true); |
898 | 1 | let nulls = NullBuffer::new(boolean_buffer_builder.finish()); |
899 | 1 | let input_array = Arc::new(Int64Array::new(values, Some(nulls))) as ArrayRef; |
900 | 1 | |
901 | 1 | // Check |
902 | 1 | assert!(!builder.equal_to(0, &input_array, 0)); |
903 | 1 | assert!(builder.equal_to(1, &input_array, 1)); |
904 | 1 | assert!(builder.equal_to(2, &input_array, 2)); |
905 | 1 | assert!(!builder.equal_to(3, &input_array, 3)); |
906 | 1 | assert!(!builder.equal_to(4, &input_array, 4)); |
907 | 1 | assert!(builder.equal_to(5, &input_array, 5)); |
908 | 1 | } |
909 | | |
910 | | #[test] |
911 | 1 | fn test_not_nullable_primitive_equal_to() { |
912 | 1 | // Will cover such cases: |
913 | 1 | // - values equal |
914 | 1 | // - values not equal |
915 | 1 | |
916 | 1 | // Define PrimitiveGroupValueBuilder |
917 | 1 | let mut builder = PrimitiveGroupValueBuilder::<Int64Type, false>::new(); |
918 | 1 | let builder_array = |
919 | 1 | Arc::new(Int64Array::from(vec![Some(0), Some(1)])) as ArrayRef; |
920 | 1 | builder.append_val(&builder_array, 0); |
921 | 1 | builder.append_val(&builder_array, 1); |
922 | 1 | |
923 | 1 | // Define input array |
924 | 1 | let input_array = Arc::new(Int64Array::from(vec![Some(0), Some(2)])) as ArrayRef; |
925 | 1 | |
926 | 1 | // Check |
927 | 1 | assert!(builder.equal_to(0, &input_array, 0)); |
928 | 1 | assert!(!builder.equal_to(1, &input_array, 1)); |
929 | 1 | } |
930 | | |
931 | | #[test] |
932 | 1 | fn test_byte_array_equal_to() { |
933 | 1 | // Will cover such cases: |
934 | 1 | // - exist null, input not null |
935 | 1 | // - exist null, input null; values not equal |
936 | 1 | // - exist null, input null; values equal |
937 | 1 | // - exist not null, input null |
938 | 1 | // - exist not null, input not null; values not equal |
939 | 1 | // - exist not null, input not null; values equal |
940 | 1 | |
941 | 1 | // Define ByteGroupValueBuilder |
942 | 1 | let mut builder = ByteGroupValueBuilder::<i32>::new(OutputType::Utf8); |
943 | 1 | let builder_array = Arc::new(StringArray::from(vec![ |
944 | 1 | None, |
945 | 1 | None, |
946 | 1 | None, |
947 | 1 | Some("foo"), |
948 | 1 | Some("bar"), |
949 | 1 | Some("baz"), |
950 | 1 | ])) as ArrayRef; |
951 | 1 | builder.append_val(&builder_array, 0); |
952 | 1 | builder.append_val(&builder_array, 1); |
953 | 1 | builder.append_val(&builder_array, 2); |
954 | 1 | builder.append_val(&builder_array, 3); |
955 | 1 | builder.append_val(&builder_array, 4); |
956 | 1 | builder.append_val(&builder_array, 5); |
957 | 1 | |
958 | 1 | // Define input array |
959 | 1 | let (offsets, buffer, _nulls) = StringArray::from(vec![ |
960 | 1 | Some("foo"), |
961 | 1 | Some("bar"), |
962 | 1 | None, |
963 | 1 | None, |
964 | 1 | Some("foo"), |
965 | 1 | Some("baz"), |
966 | 1 | ]) |
967 | 1 | .into_parts(); |
968 | 1 | |
969 | 1 | // explicitly build a boolean buffer where one of the null values also happens to match |
970 | 1 | let mut boolean_buffer_builder = BooleanBufferBuilder::new(6); |
971 | 1 | boolean_buffer_builder.append(true); |
972 | 1 | boolean_buffer_builder.append(false); // this sets Some("bar") to null above |
973 | 1 | boolean_buffer_builder.append(false); |
974 | 1 | boolean_buffer_builder.append(false); |
975 | 1 | boolean_buffer_builder.append(true); |
976 | 1 | boolean_buffer_builder.append(true); |
977 | 1 | let nulls = NullBuffer::new(boolean_buffer_builder.finish()); |
978 | 1 | let input_array = |
979 | 1 | Arc::new(StringArray::new(offsets, buffer, Some(nulls))) as ArrayRef; |
980 | 1 | |
981 | 1 | // Check |
982 | 1 | assert!(!builder.equal_to(0, &input_array, 0)); |
983 | 1 | assert!(builder.equal_to(1, &input_array, 1)); |
984 | 1 | assert!(builder.equal_to(2, &input_array, 2)); |
985 | 1 | assert!(!builder.equal_to(3, &input_array, 3)); |
986 | 1 | assert!(!builder.equal_to(4, &input_array, 4)); |
987 | 1 | assert!(builder.equal_to(5, &input_array, 5)); |
988 | 1 | } |
989 | | |
990 | | #[test] |
991 | 1 | fn test_byte_view_append_val() { |
992 | 1 | let mut builder = |
993 | 1 | ByteViewGroupValueBuilder::<StringViewType>::new().with_max_block_size(60); |
994 | 1 | let builder_array = StringViewArray::from(vec![ |
995 | 1 | Some("this string is quite long"), // in buffer 0 |
996 | 1 | Some("foo"), |
997 | 1 | None, |
998 | 1 | Some("bar"), |
999 | 1 | Some("this string is also quite long"), // buffer 0 |
1000 | 1 | Some("this string is quite long"), // buffer 1 |
1001 | 1 | Some("bar"), |
1002 | 1 | ]); |
1003 | 1 | let builder_array: ArrayRef = Arc::new(builder_array); |
1004 | 7 | for row in 0..builder_array.len() { |
1005 | 7 | builder.append_val(&builder_array, row); |
1006 | 7 | } |
1007 | | |
1008 | 1 | let output = Box::new(builder).build(); |
1009 | 1 | // should be 2 output buffers to hold all the data |
1010 | 1 | assert_eq!(output.as_string_view().data_buffers().len(), 2,); |
1011 | 1 | assert_eq!(&output, &builder_array) |
1012 | 1 | } |
1013 | | |
1014 | | #[test] |
1015 | 1 | fn test_byte_view_equal_to() { |
1016 | 1 | // Will cover such cases: |
1017 | 1 | // - exist null, input not null |
1018 | 1 | // - exist null, input null; values not equal |
1019 | 1 | // - exist null, input null; values equal |
1020 | 1 | // - exist not null, input null |
1021 | 1 | // - exist not null, input not null; values not equal |
1022 | 1 | // - exist not null, input not null; values equal |
1023 | 1 | |
1024 | 1 | let mut builder = ByteViewGroupValueBuilder::<StringViewType>::new(); |
1025 | 1 | let builder_array = Arc::new(StringViewArray::from(vec![ |
1026 | 1 | None, |
1027 | 1 | None, |
1028 | 1 | None, |
1029 | 1 | Some("foo"), |
1030 | 1 | Some("bar"), |
1031 | 1 | Some("this string is quite long"), |
1032 | 1 | Some("baz"), |
1033 | 1 | ])) as ArrayRef; |
1034 | 1 | builder.append_val(&builder_array, 0); |
1035 | 1 | builder.append_val(&builder_array, 1); |
1036 | 1 | builder.append_val(&builder_array, 2); |
1037 | 1 | builder.append_val(&builder_array, 3); |
1038 | 1 | builder.append_val(&builder_array, 4); |
1039 | 1 | builder.append_val(&builder_array, 5); |
1040 | 1 | builder.append_val(&builder_array, 6); |
1041 | 1 | |
1042 | 1 | // Define input array |
1043 | 1 | let (views, buffer, _nulls) = StringViewArray::from(vec![ |
1044 | 1 | Some("foo"), |
1045 | 1 | Some("bar"), // set to null |
1046 | 1 | Some("this string is quite long"), // set to null |
1047 | 1 | None, |
1048 | 1 | None, |
1049 | 1 | Some("foo"), |
1050 | 1 | Some("baz"), |
1051 | 1 | ]) |
1052 | 1 | .into_parts(); |
1053 | 1 | |
1054 | 1 | // explicitly build a boolean buffer where one of the null values also happens to match |
1055 | 1 | let mut boolean_buffer_builder = BooleanBufferBuilder::new(6); |
1056 | 1 | boolean_buffer_builder.append(true); |
1057 | 1 | boolean_buffer_builder.append(false); // this sets Some("bar") to null above |
1058 | 1 | boolean_buffer_builder.append(false); // this sets Some("this string is quite long") to null above |
1059 | 1 | boolean_buffer_builder.append(false); |
1060 | 1 | boolean_buffer_builder.append(false); |
1061 | 1 | boolean_buffer_builder.append(true); |
1062 | 1 | boolean_buffer_builder.append(true); |
1063 | 1 | let nulls = NullBuffer::new(boolean_buffer_builder.finish()); |
1064 | 1 | let input_array = |
1065 | 1 | Arc::new(StringViewArray::new(views, buffer, Some(nulls))) as ArrayRef; |
1066 | 1 | |
1067 | 1 | // Check |
1068 | 1 | assert!(!builder.equal_to(0, &input_array, 0)); |
1069 | 1 | assert!(builder.equal_to(1, &input_array, 1)); |
1070 | 1 | assert!(builder.equal_to(2, &input_array, 2)); |
1071 | 1 | assert!(!builder.equal_to(3, &input_array, 3)); |
1072 | 1 | assert!(!builder.equal_to(4, &input_array, 4)); |
1073 | 1 | assert!(!builder.equal_to(5, &input_array, 5)); |
1074 | 1 | assert!(builder.equal_to(6, &input_array, 6)); |
1075 | 1 | } |
1076 | | |
1077 | | #[test] |
1078 | 1 | fn test_byte_view_take_n() { |
1079 | 1 | // ####### Define cases and init ####### |
1080 | 1 | |
1081 | 1 | // `take_n` is really complex; we should consider and test the following situations: |
1082 | 1 | // 1. Take nulls |
1083 | 1 | // 2. Take all `inlined`s |
1084 | 1 | // 3. Take non-inlined + partial last buffer in `completed` |
1085 | 1 | // 4. Take non-inlined + whole last buffer in `completed` |
1086 | 1 | // 5. Take non-inlined + partial last `in_progress` |
1087 | 1 | // 6. Take non-inlined + whole last buffer in `in_progress` |
1088 | 1 | // 7. Take all views at once |
1089 | 1 | |
1090 | 1 | let mut builder = |
1091 | 1 | ByteViewGroupValueBuilder::<StringViewType>::new().with_max_block_size(60); |
1092 | 1 | let input_array = StringViewArray::from(vec![ |
1093 | 1 | // Test situation 1 |
1094 | 1 | None, |
1095 | 1 | None, |
1096 | 1 | // Test situation 2 (also test take null together) |
1097 | 1 | None, |
1098 | 1 | Some("foo"), |
1099 | 1 | Some("bar"), |
1100 | 1 | // Test situation 3 (also test take null + inlined) |
1101 | 1 | None, |
1102 | 1 | Some("foo"), |
1103 | 1 | Some("this string is quite long"), |
1104 | 1 | Some("this string is also quite long"), |
1105 | 1 | // Test situation 4 (also test take null + inlined) |
1106 | 1 | None, |
1107 | 1 | Some("bar"), |
1108 | 1 | Some("this string is quite long"), |
1109 | 1 | // Test situation 5 (also test take null + inlined) |
1110 | 1 | None, |
1111 | 1 | Some("foo"), |
1112 | 1 | Some("another string that is is quite long"), |
1113 | 1 | Some("this string not so long"), |
1114 | 1 | // Test situation 6 (also test take null + inlined + insert again after taking) |
1115 | 1 | None, |
1116 | 1 | Some("bar"), |
1117 | 1 | Some("this string is quite long"), |
1118 | 1 | // Insert 4 and just take 3 to ensure it will go the path of situation 6 |
1119 | 1 | None, |
1120 | 1 | // Finally, we create a new builder, insert the whole array and then |
1121 | 1 | // take whole at once for testing situation 7 |
1122 | 1 | ]); |
1123 | 1 | |
1124 | 1 | let input_array: ArrayRef = Arc::new(input_array); |
1125 | 1 | let first_ones_to_append = 16; // For testing situation 1~5 |
1126 | 1 | let second_ones_to_append = 3; // For testing situation 6 |
1127 | 1 | let final_ones_to_append = input_array.len(); // For testing situation 7 |
1128 | | |
1129 | | // ####### Test situation 1~5 ####### |
1130 | 16 | for row in 0..first_ones_to_append { |
1131 | 16 | builder.append_val(&input_array, row); |
1132 | 16 | } |
1133 | | |
1134 | 1 | assert_eq!(builder.completed.len(), 2); |
1135 | 1 | assert_eq!(builder.in_progress.len(), 59); |
1136 | | |
1137 | | // Situation 1 |
1138 | 1 | let taken_array = builder.take_n(2); |
1139 | 1 | assert_eq!(&taken_array, &input_array.slice(0, 2)); |
1140 | | |
1141 | | // Situation 2 |
1142 | 1 | let taken_array = builder.take_n(3); |
1143 | 1 | assert_eq!(&taken_array, &input_array.slice(2, 3)); |
1144 | | |
1145 | | // Situation 3 |
1146 | 1 | let taken_array = builder.take_n(3); |
1147 | 1 | assert_eq!(&taken_array, &input_array.slice(5, 3)); |
1148 | | |
1149 | 1 | let taken_array = builder.take_n(1); |
1150 | 1 | assert_eq!(&taken_array, &input_array.slice(8, 1)); |
1151 | | |
1152 | | // Situation 4 |
1153 | 1 | let taken_array = builder.take_n(3); |
1154 | 1 | assert_eq!(&taken_array, &input_array.slice(9, 3)); |
1155 | | |
1156 | | // Situation 5 |
1157 | 1 | let taken_array = builder.take_n(3); |
1158 | 1 | assert_eq!(&taken_array, &input_array.slice(12, 3)); |
1159 | | |
1160 | 1 | let taken_array = builder.take_n(1); |
1161 | 1 | assert_eq!(&taken_array, &input_array.slice(15, 1)); |
1162 | | |
1163 | | // ####### Test situation 6 ####### |
1164 | 1 | assert!(builder.completed.is_empty()); |
1165 | 1 | assert!(builder.in_progress.is_empty()); |
1166 | 1 | assert!(builder.views.is_empty()); |
1167 | | |
1168 | 3 | for row in first_ones_to_append..first_ones_to_append + second_ones_to_append { |
1169 | 3 | builder.append_val(&input_array, row); |
1170 | 3 | } |
1171 | | |
1172 | 1 | assert!(builder.completed.is_empty()); |
1173 | 1 | assert_eq!(builder.in_progress.len(), 25); |
1174 | | |
1175 | 1 | let taken_array = builder.take_n(3); |
1176 | 1 | assert_eq!(&taken_array, &input_array.slice(16, 3)); |
1177 | | |
1178 | | // ####### Test situation 7 ####### |
1179 | | // Create a new builder |
1180 | 1 | let mut builder = |
1181 | 1 | ByteViewGroupValueBuilder::<StringViewType>::new().with_max_block_size(60); |
1182 | | |
1183 | 20 | for row in 0..final_ones_to_append { |
1184 | 20 | builder.append_val(&input_array, row); |
1185 | 20 | } |
1186 | | |
1187 | 1 | assert_eq!(builder.completed.len(), 3); |
1188 | 1 | assert_eq!(builder.in_progress.len(), 25); |
1189 | | |
1190 | 1 | let taken_array = builder.take_n(final_ones_to_append); |
1191 | 1 | assert_eq!(&taken_array, &input_array); |
1192 | 1 | } |
1193 | | } |