/Users/andrewlamb/Software/datafusion/datafusion/functions-aggregate/src/first_last.rs
Line | Count | Source (jump to first uncovered line) |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | //! Defines the FIRST_VALUE/LAST_VALUE aggregations. |
19 | | |
20 | | use std::any::Any; |
21 | | use std::fmt::Debug; |
22 | | use std::sync::Arc; |
23 | | |
24 | | use arrow::array::{ArrayRef, AsArray, BooleanArray}; |
25 | | use arrow::compute::{self, lexsort_to_indices, SortColumn}; |
26 | | use arrow::datatypes::{DataType, Field}; |
27 | | use datafusion_common::utils::{compare_rows, get_row_at_idx, take_arrays}; |
28 | | use datafusion_common::{ |
29 | | arrow_datafusion_err, internal_err, DataFusionError, Result, ScalarValue, |
30 | | }; |
31 | | use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs}; |
32 | | use datafusion_expr::utils::{format_state_name, AggregateOrderSensitivity}; |
33 | | use datafusion_expr::{ |
34 | | Accumulator, AggregateUDFImpl, ArrayFunctionSignature, Expr, ExprFunctionExt, |
35 | | Signature, SortExpr, TypeSignature, Volatility, |
36 | | }; |
37 | | use datafusion_functions_aggregate_common::utils::get_sort_options; |
38 | | use datafusion_physical_expr_common::sort_expr::{LexOrdering, PhysicalSortExpr}; |
39 | | |
// NOTE(review): `create_func!` is defined elsewhere in this crate; it appears
// to generate the shared singleton accessor `first_value_udaf()` used below —
// confirm against the macro definition.
create_func!(FirstValue, first_value_udaf);
41 | | |
42 | | /// Returns the first value in a group of values. |
43 | 0 | pub fn first_value(expression: Expr, order_by: Option<Vec<SortExpr>>) -> Expr { |
44 | 0 | if let Some(order_by) = order_by { |
45 | 0 | first_value_udaf() |
46 | 0 | .call(vec![expression]) |
47 | 0 | .order_by(order_by) |
48 | 0 | .build() |
49 | 0 | // guaranteed to be `Expr::AggregateFunction` |
50 | 0 | .unwrap() |
51 | | } else { |
52 | 0 | first_value_udaf().call(vec![expression]) |
53 | | } |
54 | 0 | } |
55 | | |
/// The `FIRST_VALUE` aggregate UDF implementation.
pub struct FirstValue {
    // Accepted argument type signature.
    signature: Signature,
    // True when the caller guarantees that incoming data already satisfies
    // the ordering requirement (set via `with_beneficial_ordering`).
    requirement_satisfied: bool,
}
60 | | |
61 | | impl Debug for FirstValue { |
62 | 0 | fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { |
63 | 0 | f.debug_struct("FirstValue") |
64 | 0 | .field("name", &self.name()) |
65 | 0 | .field("signature", &self.signature) |
66 | 0 | .field("accumulator", &"<FUNC>") |
67 | 0 | .finish() |
68 | 0 | } |
69 | | } |
70 | | |
71 | | impl Default for FirstValue { |
72 | 1 | fn default() -> Self { |
73 | 1 | Self::new() |
74 | 1 | } |
75 | | } |
76 | | |
77 | | impl FirstValue { |
78 | 1 | pub fn new() -> Self { |
79 | 1 | Self { |
80 | 1 | signature: Signature::one_of( |
81 | 1 | vec![ |
82 | 1 | // TODO: we can introduce more strict signature that only numeric of array types are allowed |
83 | 1 | TypeSignature::ArraySignature(ArrayFunctionSignature::Array), |
84 | 1 | TypeSignature::Numeric(1), |
85 | 1 | TypeSignature::Uniform(1, vec![DataType::Utf8]), |
86 | 1 | ], |
87 | 1 | Volatility::Immutable, |
88 | 1 | ), |
89 | 1 | requirement_satisfied: false, |
90 | 1 | } |
91 | 1 | } |
92 | | |
93 | 0 | fn with_requirement_satisfied(mut self, requirement_satisfied: bool) -> Self { |
94 | 0 | self.requirement_satisfied = requirement_satisfied; |
95 | 0 | self |
96 | 0 | } |
97 | | } |
98 | | |
99 | | impl AggregateUDFImpl for FirstValue { |
100 | 0 | fn as_any(&self) -> &dyn Any { |
101 | 0 | self |
102 | 0 | } |
103 | | |
104 | 5 | fn name(&self) -> &str { |
105 | 5 | "first_value" |
106 | 5 | } |
107 | | |
108 | 5 | fn signature(&self) -> &Signature { |
109 | 5 | &self.signature |
110 | 5 | } |
111 | | |
112 | 5 | fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> { |
113 | 5 | Ok(arg_types[0].clone()) |
114 | 5 | } |
115 | | |
116 | 65 | fn accumulator(&self, acc_args: AccumulatorArgs) -> Result<Box<dyn Accumulator>> { |
117 | 65 | let ordering_dtypes = acc_args |
118 | 65 | .ordering_req |
119 | 65 | .iter() |
120 | 65 | .map(|e| e.expr.data_type(acc_args.schema)) |
121 | 65 | .collect::<Result<Vec<_>>>()?0 ; |
122 | | |
123 | | // When requirement is empty, or it is signalled by outside caller that |
124 | | // the ordering requirement is/will be satisfied. |
125 | 65 | let requirement_satisfied = |
126 | 65 | acc_args.ordering_req.is_empty() || self.requirement_satisfied; |
127 | | |
128 | 65 | FirstValueAccumulator::try_new( |
129 | 65 | acc_args.return_type, |
130 | 65 | &ordering_dtypes, |
131 | 65 | acc_args.ordering_req.to_vec(), |
132 | 65 | acc_args.ignore_nulls, |
133 | 65 | ) |
134 | 65 | .map(|acc| Box::new(acc.with_requirement_satisfied(requirement_satisfied)) as _) |
135 | 65 | } |
136 | | |
137 | 29 | fn state_fields(&self, args: StateFieldsArgs) -> Result<Vec<Field>> { |
138 | 29 | let mut fields = vec![Field::new( |
139 | 29 | format_state_name(args.name, "first_value"), |
140 | 29 | args.return_type.clone(), |
141 | 29 | true, |
142 | 29 | )]; |
143 | 29 | fields.extend(args.ordering_fields.to_vec()); |
144 | 29 | fields.push(Field::new("is_set", DataType::Boolean, true)); |
145 | 29 | Ok(fields) |
146 | 29 | } |
147 | | |
148 | 0 | fn aliases(&self) -> &[String] { |
149 | 0 | &[] |
150 | 0 | } |
151 | | |
152 | 0 | fn with_beneficial_ordering( |
153 | 0 | self: Arc<Self>, |
154 | 0 | beneficial_ordering: bool, |
155 | 0 | ) -> Result<Option<Arc<dyn AggregateUDFImpl>>> { |
156 | 0 | Ok(Some(Arc::new( |
157 | 0 | FirstValue::new().with_requirement_satisfied(beneficial_ordering), |
158 | 0 | ))) |
159 | 0 | } |
160 | | |
161 | 26 | fn order_sensitivity(&self) -> AggregateOrderSensitivity { |
162 | 26 | AggregateOrderSensitivity::Beneficial |
163 | 26 | } |
164 | | |
165 | 0 | fn reverse_expr(&self) -> datafusion_expr::ReversedUDAF { |
166 | 0 | datafusion_expr::ReversedUDAF::Reversed(last_value_udaf()) |
167 | 0 | } |
168 | | } |
169 | | |
#[derive(Debug)]
pub struct FirstValueAccumulator {
    // Candidate "first" value seen so far (typed NULL until `is_set`).
    first: ScalarValue,
    // At the beginning, `is_set` is false, which means `first` is not seen yet.
    // Once we see the first value, we set the `is_set` flag and do not update `first` anymore.
    is_set: bool,
    // Stores ordering values, of the aggregator requirement corresponding to first value
    // of the aggregator. These values are used during merging of multiple partitions.
    orderings: Vec<ScalarValue>,
    // Stores the applicable ordering requirement.
    ordering_req: LexOrdering,
    // Stores whether incoming data already satisfies the ordering requirement.
    requirement_satisfied: bool,
    // Ignore null values.
    ignore_nulls: bool,
}
186 | | |
impl FirstValueAccumulator {
    /// Creates a new `FirstValueAccumulator` for the given `data_type`.
    ///
    /// `ordering_dtypes` are the data types of the ordering expressions;
    /// both `first` and the ordering state start out as typed NULL
    /// placeholders (`ScalarValue::try_from(DataType)` yields a NULL scalar).
    pub fn try_new(
        data_type: &DataType,
        ordering_dtypes: &[DataType],
        ordering_req: LexOrdering,
        ignore_nulls: bool,
    ) -> Result<Self> {
        let orderings = ordering_dtypes
            .iter()
            .map(ScalarValue::try_from)
            .collect::<Result<Vec<_>>>()?;
        // With no ordering requirement, any incoming order is acceptable.
        let requirement_satisfied = ordering_req.is_empty();
        ScalarValue::try_from(data_type).map(|first| Self {
            first,
            is_set: false,
            orderings,
            ordering_req,
            requirement_satisfied,
            ignore_nulls,
        })
    }

    /// Builder-style setter for `requirement_satisfied`.
    pub fn with_requirement_satisfied(mut self, requirement_satisfied: bool) -> Self {
        self.requirement_satisfied = requirement_satisfied;
        self
    }

    // Updates state with the values in the given row.
    // Layout: `row[0]` is the value, `row[1..]` are the ordering values.
    fn update_with_new_row(&mut self, row: &[ScalarValue]) {
        self.first = row[0].clone();
        self.orderings = row[1..].to_vec();
        self.is_set = true;
    }

    // Finds the index of the "first" row in `values` according to the
    // ordering requirement (and `ignore_nulls`), or `None` when no row
    // qualifies. `values[0]` is the value column; the rest are ordering
    // columns.
    fn get_first_idx(&self, values: &[ArrayRef]) -> Result<Option<usize>> {
        let [value, ordering_values @ ..] = values else {
            return internal_err!("Empty row in FIRST_VALUE");
        };
        if self.requirement_satisfied {
            // Get first entry according to the pre-existing ordering (0th index):
            if self.ignore_nulls {
                // If ignoring nulls, find the first non-null value.
                for i in 0..value.len() {
                    if !value.is_null(i) {
                        return Ok(Some(i));
                    }
                }
                return Ok(None);
            } else {
                // If not ignoring nulls, return the first value if it exists.
                return Ok((!value.is_empty()).then_some(0));
            }
        }
        // Input is not pre-ordered: sort the ordering columns to locate the
        // logically-first row.
        let sort_columns = ordering_values
            .iter()
            .zip(self.ordering_req.iter())
            .map(|(values, req)| SortColumn {
                values: Arc::clone(values),
                options: Some(req.options),
            })
            .collect::<Vec<_>>();

        if self.ignore_nulls {
            let indices = lexsort_to_indices(&sort_columns, None)?;
            // If ignoring nulls, find the first non-null value.
            for index in indices.iter().flatten() {
                if !value.is_null(index as usize) {
                    return Ok(Some(index as usize));
                }
            }
            Ok(None)
        } else {
            // Only the top-most row is needed, so fetch just one index.
            let indices = lexsort_to_indices(&sort_columns, Some(1))?;
            Ok((!indices.is_empty()).then_some(indices.value(0) as _))
        }
    }
}
265 | | |
impl Accumulator for FirstValueAccumulator {
    /// Serialized state: `[first, orderings..., is_set]`.
    fn state(&mut self) -> Result<Vec<ScalarValue>> {
        let mut result = vec![self.first.clone()];
        result.extend(self.orderings.iter().cloned());
        result.push(ScalarValue::Boolean(Some(self.is_set)));
        Ok(result)
    }

    fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> {
        if !self.is_set {
            // No value recorded yet: take the first qualifying row, if any.
            if let Some(first_idx) = self.get_first_idx(values)? {
                let row = get_row_at_idx(values, first_idx)?;
                self.update_with_new_row(&row);
            }
        } else if !self.requirement_satisfied {
            // A value is already recorded, but unordered input may still
            // contain an earlier row; compare ordering keys to find out.
            if let Some(first_idx) = self.get_first_idx(values)? {
                let row = get_row_at_idx(values, first_idx)?;
                let orderings = &row[1..];
                // `is_gt` means the stored ordering key sorts after the new
                // row's key, i.e. the new row comes first.
                if compare_rows(
                    &self.orderings,
                    orderings,
                    &get_sort_options(&self.ordering_req),
                )?
                .is_gt()
                {
                    self.update_with_new_row(&row);
                }
            }
        }
        Ok(())
    }

    fn merge_batch(&mut self, states: &[ArrayRef]) -> Result<()> {
        // FIRST_VALUE(first1, first2, first3, ...)
        // last index contains is_set flag.
        let is_set_idx = states.len() - 1;
        let flags = states[is_set_idx].as_boolean();
        // Drop rows coming from partitions that never saw a value.
        let filtered_states = filter_states_according_to_is_set(states, flags)?;
        // 1..is_set_idx range corresponds to ordering section
        let sort_cols =
            convert_to_sort_cols(&filtered_states[1..is_set_idx], &self.ordering_req);

        let ordered_states = if sort_cols.is_empty() {
            // When no ordering is given, use the existing state as is:
            filtered_states
        } else {
            let indices = lexsort_to_indices(&sort_cols, None)?;
            take_arrays(&filtered_states, &indices)?
        };
        if !ordered_states[0].is_empty() {
            let first_row = get_row_at_idx(&ordered_states, 0)?;
            // When collecting orderings, we exclude the is_set flag from the state.
            let first_ordering = &first_row[1..is_set_idx];
            let sort_options = get_sort_options(&self.ordering_req);
            // Either there is no existing value, or there is an earlier version in new data.
            if !self.is_set
                || compare_rows(&self.orderings, first_ordering, &sort_options)?.is_gt()
            {
                // Update with first value in the state. Note that we should exclude the
                // is_set flag from the state. Otherwise, we will end up with a state
                // containing two is_set flags.
                self.update_with_new_row(&first_row[0..is_set_idx]);
            }
        }
        Ok(())
    }

    fn evaluate(&mut self) -> Result<ScalarValue> {
        Ok(self.first.clone())
    }

    fn size(&self) -> usize {
        // Shallow size of `self`, with the inline sizes of `first` and
        // `orderings` replaced by their deep (heap-inclusive) sizes.
        std::mem::size_of_val(self) - std::mem::size_of_val(&self.first)
            + self.first.size()
            + ScalarValue::size_of_vec(&self.orderings)
            - std::mem::size_of_val(&self.orderings)
    }
}
344 | | |
// NOTE(review): macro defined elsewhere; presumably generates the `last_value`
// expression function and the `last_value_udaf()` singleton accessor —
// confirm against the macro definition.
make_udaf_expr_and_func!(
    LastValue,
    last_value,
    "Returns the last value in a group of values.",
    last_value_udaf
);
351 | | |
/// The `LAST_VALUE` aggregate UDF implementation.
pub struct LastValue {
    // Accepted argument type signature.
    signature: Signature,
    // True when the caller guarantees that incoming data already satisfies
    // the ordering requirement (set via `with_beneficial_ordering`).
    requirement_satisfied: bool,
}
356 | | |
357 | | impl Debug for LastValue { |
358 | 0 | fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { |
359 | 0 | f.debug_struct("LastValue") |
360 | 0 | .field("name", &self.name()) |
361 | 0 | .field("signature", &self.signature) |
362 | 0 | .field("accumulator", &"<FUNC>") |
363 | 0 | .finish() |
364 | 0 | } |
365 | | } |
366 | | |
367 | | impl Default for LastValue { |
368 | 1 | fn default() -> Self { |
369 | 1 | Self::new() |
370 | 1 | } |
371 | | } |
372 | | |
373 | | impl LastValue { |
374 | 1 | pub fn new() -> Self { |
375 | 1 | Self { |
376 | 1 | signature: Signature::one_of( |
377 | 1 | vec![ |
378 | 1 | // TODO: we can introduce more strict signature that only numeric of array types are allowed |
379 | 1 | TypeSignature::ArraySignature(ArrayFunctionSignature::Array), |
380 | 1 | TypeSignature::Numeric(1), |
381 | 1 | TypeSignature::Uniform(1, vec![DataType::Utf8]), |
382 | 1 | ], |
383 | 1 | Volatility::Immutable, |
384 | 1 | ), |
385 | 1 | requirement_satisfied: false, |
386 | 1 | } |
387 | 1 | } |
388 | | |
389 | 0 | fn with_requirement_satisfied(mut self, requirement_satisfied: bool) -> Self { |
390 | 0 | self.requirement_satisfied = requirement_satisfied; |
391 | 0 | self |
392 | 0 | } |
393 | | } |
394 | | |
395 | | impl AggregateUDFImpl for LastValue { |
396 | 0 | fn as_any(&self) -> &dyn Any { |
397 | 0 | self |
398 | 0 | } |
399 | | |
400 | 5 | fn name(&self) -> &str { |
401 | 5 | "last_value" |
402 | 5 | } |
403 | | |
404 | 5 | fn signature(&self) -> &Signature { |
405 | 5 | &self.signature |
406 | 5 | } |
407 | | |
408 | 5 | fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> { |
409 | 5 | Ok(arg_types[0].clone()) |
410 | 5 | } |
411 | | |
412 | 65 | fn accumulator(&self, acc_args: AccumulatorArgs) -> Result<Box<dyn Accumulator>> { |
413 | 65 | let ordering_dtypes = acc_args |
414 | 65 | .ordering_req |
415 | 65 | .iter() |
416 | 65 | .map(|e| e.expr.data_type(acc_args.schema)) |
417 | 65 | .collect::<Result<Vec<_>>>()?0 ; |
418 | | |
419 | 65 | let requirement_satisfied = |
420 | 65 | acc_args.ordering_req.is_empty() || self.requirement_satisfied; |
421 | | |
422 | 65 | LastValueAccumulator::try_new( |
423 | 65 | acc_args.return_type, |
424 | 65 | &ordering_dtypes, |
425 | 65 | acc_args.ordering_req.to_vec(), |
426 | 65 | acc_args.ignore_nulls, |
427 | 65 | ) |
428 | 65 | .map(|acc| Box::new(acc.with_requirement_satisfied(requirement_satisfied)) as _) |
429 | 65 | } |
430 | | |
431 | 29 | fn state_fields(&self, args: StateFieldsArgs) -> Result<Vec<Field>> { |
432 | 29 | let StateFieldsArgs { |
433 | 29 | name, |
434 | 29 | input_types, |
435 | 29 | return_type: _, |
436 | 29 | ordering_fields, |
437 | 29 | is_distinct: _, |
438 | 29 | } = args; |
439 | 29 | let mut fields = vec![Field::new( |
440 | 29 | format_state_name(name, "last_value"), |
441 | 29 | input_types[0].clone(), |
442 | 29 | true, |
443 | 29 | )]; |
444 | 29 | fields.extend(ordering_fields.to_vec()); |
445 | 29 | fields.push(Field::new("is_set", DataType::Boolean, true)); |
446 | 29 | Ok(fields) |
447 | 29 | } |
448 | | |
449 | 0 | fn aliases(&self) -> &[String] { |
450 | 0 | &[] |
451 | 0 | } |
452 | | |
453 | 0 | fn with_beneficial_ordering( |
454 | 0 | self: Arc<Self>, |
455 | 0 | beneficial_ordering: bool, |
456 | 0 | ) -> Result<Option<Arc<dyn AggregateUDFImpl>>> { |
457 | 0 | Ok(Some(Arc::new( |
458 | 0 | LastValue::new().with_requirement_satisfied(beneficial_ordering), |
459 | 0 | ))) |
460 | 0 | } |
461 | | |
462 | 26 | fn order_sensitivity(&self) -> AggregateOrderSensitivity { |
463 | 26 | AggregateOrderSensitivity::Beneficial |
464 | 26 | } |
465 | | |
466 | 0 | fn reverse_expr(&self) -> datafusion_expr::ReversedUDAF { |
467 | 0 | datafusion_expr::ReversedUDAF::Reversed(first_value_udaf()) |
468 | 0 | } |
469 | | } |
470 | | |
#[derive(Debug)]
struct LastValueAccumulator {
    // Candidate "last" value seen so far (typed NULL until `is_set`).
    last: ScalarValue,
    // The `is_set` flag keeps track of whether the last value is finalized.
    // This information is used to discriminate genuine NULLs and NULLS that
    // occur due to empty partitions.
    is_set: bool,
    // Ordering values corresponding to `last`, used when merging partitions.
    orderings: Vec<ScalarValue>,
    // Stores the applicable ordering requirement.
    ordering_req: LexOrdering,
    // Stores whether incoming data already satisfies the ordering requirement.
    requirement_satisfied: bool,
    // Ignore null values.
    ignore_nulls: bool,
}
486 | | |
impl LastValueAccumulator {
    /// Creates a new `LastValueAccumulator` for the given `data_type`.
    ///
    /// `ordering_dtypes` are the data types of the ordering expressions;
    /// both `last` and the ordering state start out as typed NULL
    /// placeholders (`ScalarValue::try_from(DataType)` yields a NULL scalar).
    pub fn try_new(
        data_type: &DataType,
        ordering_dtypes: &[DataType],
        ordering_req: LexOrdering,
        ignore_nulls: bool,
    ) -> Result<Self> {
        let orderings = ordering_dtypes
            .iter()
            .map(ScalarValue::try_from)
            .collect::<Result<Vec<_>>>()?;
        // With no ordering requirement, any incoming order is acceptable.
        let requirement_satisfied = ordering_req.is_empty();
        ScalarValue::try_from(data_type).map(|last| Self {
            last,
            is_set: false,
            orderings,
            ordering_req,
            requirement_satisfied,
            ignore_nulls,
        })
    }

    // Updates state with the values in the given row.
    // Layout: `row[0]` is the value, `row[1..]` are the ordering values.
    fn update_with_new_row(&mut self, row: &[ScalarValue]) {
        self.last = row[0].clone();
        self.orderings = row[1..].to_vec();
        self.is_set = true;
    }

    // Finds the index of the "last" row in `values` according to the
    // ordering requirement (and `ignore_nulls`), or `None` when no row
    // qualifies. `values[0]` is the value column; the rest are ordering
    // columns.
    fn get_last_idx(&self, values: &[ArrayRef]) -> Result<Option<usize>> {
        let [value, ordering_values @ ..] = values else {
            return internal_err!("Empty row in LAST_VALUE");
        };
        if self.requirement_satisfied {
            // Get last entry according to the order of data:
            if self.ignore_nulls {
                // If ignoring nulls, find the last non-null value.
                for i in (0..value.len()).rev() {
                    if !value.is_null(i) {
                        return Ok(Some(i));
                    }
                }
                return Ok(None);
            } else {
                return Ok((!value.is_empty()).then_some(value.len() - 1));
            }
        }
        let sort_columns = ordering_values
            .iter()
            .zip(self.ordering_req.iter())
            .map(|(values, req)| {
                // Take the reverse ordering requirement. This enables us to
                // use "fetch = 1" to get the last value.
                SortColumn {
                    values: Arc::clone(values),
                    options: Some(!req.options),
                }
            })
            .collect::<Vec<_>>();

        if self.ignore_nulls {
            let indices = lexsort_to_indices(&sort_columns, None)?;
            // If ignoring nulls, find the last non-null value.
            // (The sort above is reversed, so iterate front-to-back.)
            for index in indices.iter().flatten() {
                if !value.is_null(index as usize) {
                    return Ok(Some(index as usize));
                }
            }
            Ok(None)
        } else {
            // With reversed sort options, the top row is the logical last.
            let indices = lexsort_to_indices(&sort_columns, Some(1))?;
            Ok((!indices.is_empty()).then_some(indices.value(0) as _))
        }
    }

    /// Builder-style setter for `requirement_satisfied`.
    fn with_requirement_satisfied(mut self, requirement_satisfied: bool) -> Self {
        self.requirement_satisfied = requirement_satisfied;
        self
    }
}
568 | | |
impl Accumulator for LastValueAccumulator {
    /// Serialized state: `[last, orderings..., is_set]`.
    fn state(&mut self) -> Result<Vec<ScalarValue>> {
        let mut result = vec![self.last.clone()];
        result.extend(self.orderings.clone());
        result.push(ScalarValue::Boolean(Some(self.is_set)));
        Ok(result)
    }

    fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> {
        if !self.is_set || self.requirement_satisfied {
            // Either nothing is recorded yet, or data arrives in order — in
            // both cases the batch's last qualifying row wins outright.
            if let Some(last_idx) = self.get_last_idx(values)? {
                let row = get_row_at_idx(values, last_idx)?;
                self.update_with_new_row(&row);
            }
        } else if let Some(last_idx) = self.get_last_idx(values)? {
            let row = get_row_at_idx(values, last_idx)?;
            let orderings = &row[1..];
            // Update when there is a more recent entry
            if compare_rows(
                &self.orderings,
                orderings,
                &get_sort_options(&self.ordering_req),
            )?
            .is_lt()
            {
                self.update_with_new_row(&row);
            }
        }

        Ok(())
    }

    fn merge_batch(&mut self, states: &[ArrayRef]) -> Result<()> {
        // LAST_VALUE(last1, last2, last3, ...)
        // last index contains is_set flag.
        let is_set_idx = states.len() - 1;
        let flags = states[is_set_idx].as_boolean();
        // Drop rows coming from partitions that never saw a value.
        let filtered_states = filter_states_according_to_is_set(states, flags)?;
        // 1..is_set_idx range corresponds to ordering section
        let sort_cols =
            convert_to_sort_cols(&filtered_states[1..is_set_idx], &self.ordering_req);

        let ordered_states = if sort_cols.is_empty() {
            // When no ordering is given, use existing state as is:
            filtered_states
        } else {
            let indices = lexsort_to_indices(&sort_cols, None)?;
            take_arrays(&filtered_states, &indices)?
        };

        if !ordered_states[0].is_empty() {
            let last_idx = ordered_states[0].len() - 1;
            let last_row = get_row_at_idx(&ordered_states, last_idx)?;
            // When collecting orderings, we exclude the is_set flag from the state.
            let last_ordering = &last_row[1..is_set_idx];
            let sort_options = get_sort_options(&self.ordering_req);
            // Either there is no existing value, or there is a newer (latest)
            // version in the new data:
            if !self.is_set
                || compare_rows(&self.orderings, last_ordering, &sort_options)?.is_lt()
            {
                // Update with last value in the state. Note that we should exclude the
                // is_set flag from the state. Otherwise, we will end up with a state
                // containing two is_set flags.
                self.update_with_new_row(&last_row[0..is_set_idx]);
            }
        }
        Ok(())
    }

    fn evaluate(&mut self) -> Result<ScalarValue> {
        Ok(self.last.clone())
    }

    fn size(&self) -> usize {
        // Shallow size of `self`, with the inline sizes of `last` and
        // `orderings` replaced by their deep (heap-inclusive) sizes.
        std::mem::size_of_val(self) - std::mem::size_of_val(&self.last)
            + self.last.size()
            + ScalarValue::size_of_vec(&self.orderings)
            - std::mem::size_of_val(&self.orderings)
    }
}
650 | | |
651 | | /// Filters states according to the `is_set` flag at the last column and returns |
652 | | /// the resulting states. |
653 | 66 | fn filter_states_according_to_is_set( |
654 | 66 | states: &[ArrayRef], |
655 | 66 | flags: &BooleanArray, |
656 | 66 | ) -> Result<Vec<ArrayRef>> { |
657 | 66 | states |
658 | 66 | .iter() |
659 | 198 | .map(|state| compute::filter(state, flags).map_err(|e| arrow_datafusion_err!(e)0 )) |
660 | 66 | .collect::<Result<Vec<_>>>() |
661 | 66 | } |
662 | | |
663 | | /// Combines array refs and their corresponding orderings to construct `SortColumn`s. |
664 | 66 | fn convert_to_sort_cols( |
665 | 66 | arrs: &[ArrayRef], |
666 | 66 | sort_exprs: &[PhysicalSortExpr], |
667 | 66 | ) -> Vec<SortColumn> { |
668 | 66 | arrs.iter() |
669 | 66 | .zip(sort_exprs.iter()) |
670 | 66 | .map(|(item, sort_expr)| SortColumn { |
671 | 66 | values: Arc::clone(item), |
672 | 66 | options: Some(sort_expr.options), |
673 | 66 | }) |
674 | 66 | .collect::<Vec<_>>() |
675 | 66 | } |
676 | | |
#[cfg(test)]
mod tests {
    use arrow::array::Int64Array;

    use super::*;

    // Verifies that FIRST_VALUE keeps the very first value across batches
    // while LAST_VALUE tracks the most recent one.
    #[test]
    fn test_first_last_value_value() -> Result<()> {
        let mut first_accumulator =
            FirstValueAccumulator::try_new(&DataType::Int64, &[], vec![], false)?;
        let mut last_accumulator =
            LastValueAccumulator::try_new(&DataType::Int64, &[], vec![], false)?;
        // first value in the tuple is start of the range (inclusive),
        // second value in the tuple is end of the range (exclusive)
        let ranges: Vec<(i64, i64)> = vec![(0, 10), (1, 11), (2, 13)];
        // create 3 ArrayRefs between each interval e.g from 0 to 9, 1 to 10, 2 to 12
        let arrs = ranges
            .into_iter()
            .map(|(start, end)| {
                Arc::new(Int64Array::from((start..end).collect::<Vec<_>>())) as ArrayRef
            })
            .collect::<Vec<_>>();
        for arr in arrs {
            // Once first_value is set, accumulator should remember it.
            // It shouldn't update first_value for each new batch
            first_accumulator.update_batch(&[Arc::clone(&arr)])?;
            // last_value should be updated for each new batch.
            last_accumulator.update_batch(&[arr])?;
        }
        // First Value comes from the first value of the first batch which is 0
        assert_eq!(first_accumulator.evaluate()?, ScalarValue::Int64(Some(0)));
        // Last value comes from the last value of the last batch which is 12
        assert_eq!(last_accumulator.evaluate()?, ScalarValue::Int64(Some(12)));
        Ok(())
    }

    // Verifies that serialized states from two accumulators can be
    // concatenated and merged, and that the merged state keeps the
    // expected [value, is_set] layout.
    #[test]
    fn test_first_last_state_after_merge() -> Result<()> {
        let ranges: Vec<(i64, i64)> = vec![(0, 10), (1, 11), (2, 13)];
        // create 3 ArrayRefs between each interval e.g from 0 to 9, 1 to 10, 2 to 12
        let arrs = ranges
            .into_iter()
            .map(|(start, end)| {
                Arc::new((start..end).collect::<Int64Array>()) as ArrayRef
            })
            .collect::<Vec<_>>();

        // FirstValueAccumulator
        let mut first_accumulator =
            FirstValueAccumulator::try_new(&DataType::Int64, &[], vec![], false)?;

        first_accumulator.update_batch(&[Arc::clone(&arrs[0])])?;
        let state1 = first_accumulator.state()?;

        let mut first_accumulator =
            FirstValueAccumulator::try_new(&DataType::Int64, &[], vec![], false)?;
        first_accumulator.update_batch(&[Arc::clone(&arrs[1])])?;
        let state2 = first_accumulator.state()?;

        assert_eq!(state1.len(), state2.len());

        // Concatenate the two serialized states column-by-column to emulate
        // states arriving from two partitions.
        let mut states = vec![];

        for idx in 0..state1.len() {
            states.push(arrow::compute::concat(&[
                &state1[idx].to_array()?,
                &state2[idx].to_array()?,
            ])?);
        }

        let mut first_accumulator =
            FirstValueAccumulator::try_new(&DataType::Int64, &[], vec![], false)?;
        first_accumulator.merge_batch(&states)?;

        let merged_state = first_accumulator.state()?;
        assert_eq!(merged_state.len(), state1.len());

        // LastValueAccumulator
        let mut last_accumulator =
            LastValueAccumulator::try_new(&DataType::Int64, &[], vec![], false)?;

        last_accumulator.update_batch(&[Arc::clone(&arrs[0])])?;
        let state1 = last_accumulator.state()?;

        let mut last_accumulator =
            LastValueAccumulator::try_new(&DataType::Int64, &[], vec![], false)?;
        last_accumulator.update_batch(&[Arc::clone(&arrs[1])])?;
        let state2 = last_accumulator.state()?;

        assert_eq!(state1.len(), state2.len());

        let mut states = vec![];

        for idx in 0..state1.len() {
            states.push(arrow::compute::concat(&[
                &state1[idx].to_array()?,
                &state2[idx].to_array()?,
            ])?);
        }

        let mut last_accumulator =
            LastValueAccumulator::try_new(&DataType::Int64, &[], vec![], false)?;
        last_accumulator.merge_batch(&states)?;

        let merged_state = last_accumulator.state()?;
        assert_eq!(merged_state.len(), state1.len());

        Ok(())
    }
}