Coverage Report

Created: 2024-10-13 08:39

/Users/andrewlamb/Software/datafusion/datafusion/functions-aggregate/src/first_last.rs
Line
Count
Source (jump to first uncovered line)
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
//! Defines the FIRST_VALUE/LAST_VALUE aggregations.
19
20
use std::any::Any;
21
use std::fmt::Debug;
22
use std::sync::Arc;
23
24
use arrow::array::{ArrayRef, AsArray, BooleanArray};
25
use arrow::compute::{self, lexsort_to_indices, SortColumn};
26
use arrow::datatypes::{DataType, Field};
27
use datafusion_common::utils::{compare_rows, get_row_at_idx, take_arrays};
28
use datafusion_common::{
29
    arrow_datafusion_err, internal_err, DataFusionError, Result, ScalarValue,
30
};
31
use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs};
32
use datafusion_expr::utils::{format_state_name, AggregateOrderSensitivity};
33
use datafusion_expr::{
34
    Accumulator, AggregateUDFImpl, ArrayFunctionSignature, Expr, ExprFunctionExt,
35
    Signature, SortExpr, TypeSignature, Volatility,
36
};
37
use datafusion_functions_aggregate_common::utils::get_sort_options;
38
use datafusion_physical_expr_common::sort_expr::{LexOrdering, PhysicalSortExpr};
39
40
create_func!(FirstValue, first_value_udaf);
41
42
/// Returns the first value in a group of values.
43
0
pub fn first_value(expression: Expr, order_by: Option<Vec<SortExpr>>) -> Expr {
44
0
    if let Some(order_by) = order_by {
45
0
        first_value_udaf()
46
0
            .call(vec![expression])
47
0
            .order_by(order_by)
48
0
            .build()
49
0
            // guaranteed to be `Expr::AggregateFunction`
50
0
            .unwrap()
51
    } else {
52
0
        first_value_udaf().call(vec![expression])
53
    }
54
0
}
55
56
pub struct FirstValue {
57
    signature: Signature,
58
    requirement_satisfied: bool,
59
}
60
61
impl Debug for FirstValue {
62
0
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
63
0
        f.debug_struct("FirstValue")
64
0
            .field("name", &self.name())
65
0
            .field("signature", &self.signature)
66
0
            .field("accumulator", &"<FUNC>")
67
0
            .finish()
68
0
    }
69
}
70
71
impl Default for FirstValue {
72
1
    fn default() -> Self {
73
1
        Self::new()
74
1
    }
75
}
76
77
impl FirstValue {
78
1
    pub fn new() -> Self {
79
1
        Self {
80
1
            signature: Signature::one_of(
81
1
                vec![
82
1
                    // TODO: we can introduce more strict signature that only numeric of array types are allowed
83
1
                    TypeSignature::ArraySignature(ArrayFunctionSignature::Array),
84
1
                    TypeSignature::Numeric(1),
85
1
                    TypeSignature::Uniform(1, vec![DataType::Utf8]),
86
1
                ],
87
1
                Volatility::Immutable,
88
1
            ),
89
1
            requirement_satisfied: false,
90
1
        }
91
1
    }
92
93
0
    fn with_requirement_satisfied(mut self, requirement_satisfied: bool) -> Self {
94
0
        self.requirement_satisfied = requirement_satisfied;
95
0
        self
96
0
    }
97
}
98
99
impl AggregateUDFImpl for FirstValue {
100
0
    fn as_any(&self) -> &dyn Any {
101
0
        self
102
0
    }
103
104
5
    fn name(&self) -> &str {
105
5
        "first_value"
106
5
    }
107
108
5
    fn signature(&self) -> &Signature {
109
5
        &self.signature
110
5
    }
111
112
5
    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
113
5
        Ok(arg_types[0].clone())
114
5
    }
115
116
65
    fn accumulator(&self, acc_args: AccumulatorArgs) -> Result<Box<dyn Accumulator>> {
117
65
        let ordering_dtypes = acc_args
118
65
            .ordering_req
119
65
            .iter()
120
65
            .map(|e| e.expr.data_type(acc_args.schema))
121
65
            .collect::<Result<Vec<_>>>()
?0
;
122
123
        // When requirement is empty, or it is signalled by outside caller that
124
        // the ordering requirement is/will be satisfied.
125
65
        let requirement_satisfied =
126
65
            acc_args.ordering_req.is_empty() || self.requirement_satisfied;
127
128
65
        FirstValueAccumulator::try_new(
129
65
            acc_args.return_type,
130
65
            &ordering_dtypes,
131
65
            acc_args.ordering_req.to_vec(),
132
65
            acc_args.ignore_nulls,
133
65
        )
134
65
        .map(|acc| Box::new(acc.with_requirement_satisfied(requirement_satisfied)) as _)
135
65
    }
136
137
29
    fn state_fields(&self, args: StateFieldsArgs) -> Result<Vec<Field>> {
138
29
        let mut fields = vec![Field::new(
139
29
            format_state_name(args.name, "first_value"),
140
29
            args.return_type.clone(),
141
29
            true,
142
29
        )];
143
29
        fields.extend(args.ordering_fields.to_vec());
144
29
        fields.push(Field::new("is_set", DataType::Boolean, true));
145
29
        Ok(fields)
146
29
    }
147
148
0
    fn aliases(&self) -> &[String] {
149
0
        &[]
150
0
    }
151
152
0
    fn with_beneficial_ordering(
153
0
        self: Arc<Self>,
154
0
        beneficial_ordering: bool,
155
0
    ) -> Result<Option<Arc<dyn AggregateUDFImpl>>> {
156
0
        Ok(Some(Arc::new(
157
0
            FirstValue::new().with_requirement_satisfied(beneficial_ordering),
158
0
        )))
159
0
    }
160
161
26
    fn order_sensitivity(&self) -> AggregateOrderSensitivity {
162
26
        AggregateOrderSensitivity::Beneficial
163
26
    }
164
165
0
    fn reverse_expr(&self) -> datafusion_expr::ReversedUDAF {
166
0
        datafusion_expr::ReversedUDAF::Reversed(last_value_udaf())
167
0
    }
168
}
169
170
#[derive(Debug)]
171
pub struct FirstValueAccumulator {
172
    first: ScalarValue,
173
    // At the beginning, `is_set` is false, which means `first` is not seen yet.
174
    // Once we see the first value, we set the `is_set` flag and do not update `first` anymore.
175
    is_set: bool,
176
    // Stores ordering values, of the aggregator requirement corresponding to first value
177
    // of the aggregator. These values are used during merging of multiple partitions.
178
    orderings: Vec<ScalarValue>,
179
    // Stores the applicable ordering requirement.
180
    ordering_req: LexOrdering,
181
    // Stores whether incoming data already satisfies the ordering requirement.
182
    requirement_satisfied: bool,
183
    // Ignore null values.
184
    ignore_nulls: bool,
185
}
186
187
impl FirstValueAccumulator {
188
    /// Creates a new `FirstValueAccumulator` for the given `data_type`.
189
65
    pub fn try_new(
190
65
        data_type: &DataType,
191
65
        ordering_dtypes: &[DataType],
192
65
        ordering_req: LexOrdering,
193
65
        ignore_nulls: bool,
194
65
    ) -> Result<Self> {
195
65
        let orderings = ordering_dtypes
196
65
            .iter()
197
65
            .map(ScalarValue::try_from)
198
65
            .collect::<Result<Vec<_>>>()
?0
;
199
65
        let requirement_satisfied = ordering_req.is_empty();
200
65
        ScalarValue::try_from(data_type).map(|first| Self {
201
65
            first,
202
65
            is_set: false,
203
65
            orderings,
204
65
            ordering_req,
205
65
            requirement_satisfied,
206
65
            ignore_nulls,
207
65
        })
208
65
    }
209
210
65
    pub fn with_requirement_satisfied(mut self, requirement_satisfied: bool) -> Self {
211
65
        self.requirement_satisfied = requirement_satisfied;
212
65
        self
213
65
    }
214
215
    // Updates state with the values in the given row.
216
67
    fn update_with_new_row(&mut self, row: &[ScalarValue]) {
217
67
        self.first = row[0].clone();
218
67
        self.orderings = row[1..].to_vec();
219
67
        self.is_set = true;
220
67
    }
221
222
48
    fn get_first_idx(&self, values: &[ArrayRef]) -> Result<Option<usize>> {
223
48
        let [value, ordering_values @ ..] = values else {
224
0
            return internal_err!("Empty row in FIRST_VALUE");
225
        };
226
48
        if self.requirement_satisfied {
227
            // Get first entry according to the pre-existing ordering (0th index):
228
0
            if self.ignore_nulls {
229
                // If ignoring nulls, find the first non-null value.
230
0
                for i in 0..value.len() {
231
0
                    if !value.is_null(i) {
232
0
                        return Ok(Some(i));
233
0
                    }
234
                }
235
0
                return Ok(None);
236
            } else {
237
                // If not ignoring nulls, return the first value if it exists.
238
0
                return Ok((!value.is_empty()).then_some(0));
239
            }
240
48
        }
241
48
        let sort_columns = ordering_values
242
48
            .iter()
243
48
            .zip(self.ordering_req.iter())
244
48
            .map(|(values, req)| SortColumn {
245
48
                values: Arc::clone(values),
246
48
                options: Some(req.options),
247
48
            })
248
48
            .collect::<Vec<_>>();
249
48
250
48
        if self.ignore_nulls {
251
0
            let indices = lexsort_to_indices(&sort_columns, None)?;
252
            // If ignoring nulls, find the first non-null value.
253
0
            for index in indices.iter().flatten() {
254
0
                if !value.is_null(index as usize) {
255
0
                    return Ok(Some(index as usize));
256
0
                }
257
            }
258
0
            Ok(None)
259
        } else {
260
48
            let indices = lexsort_to_indices(&sort_columns, Some(1))
?0
;
261
48
            Ok((!indices.is_empty()).then_some(indices.value(0) as _))
262
        }
263
48
    }
264
}
265
266
impl Accumulator for FirstValueAccumulator {
267
53
    fn state(&mut self) -> Result<Vec<ScalarValue>> {
268
53
        let mut result = vec![self.first.clone()];
269
53
        result.extend(self.orderings.iter().cloned());
270
53
        result.push(ScalarValue::Boolean(Some(self.is_set)));
271
53
        Ok(result)
272
53
    }
273
274
48
    fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> {
275
48
        if !self.is_set {
276
48
            if let Some(first_idx) = self.get_first_idx(values)
?0
{
277
48
                let row = get_row_at_idx(values, first_idx)
?0
;
278
48
                self.update_with_new_row(&row);
279
0
            }
280
0
        } else if !self.requirement_satisfied {
281
0
            if let Some(first_idx) = self.get_first_idx(values)? {
282
0
                let row = get_row_at_idx(values, first_idx)?;
283
0
                let orderings = &row[1..];
284
0
                if compare_rows(
285
0
                    &self.orderings,
286
0
                    orderings,
287
0
                    &get_sort_options(&self.ordering_req),
288
0
                )?
289
0
                .is_gt()
290
0
                {
291
0
                    self.update_with_new_row(&row);
292
0
                }
293
0
            }
294
0
        }
295
48
        Ok(())
296
48
    }
297
298
33
    fn merge_batch(&mut self, states: &[ArrayRef]) -> Result<()> {
299
33
        // FIRST_VALUE(first1, first2, first3, ...)
300
33
        // last index contains is_set flag.
301
33
        let is_set_idx = states.len() - 1;
302
33
        let flags = states[is_set_idx].as_boolean();
303
33
        let filtered_states = filter_states_according_to_is_set(states, flags)
?0
;
304
        // 1..is_set_idx range corresponds to ordering section
305
33
        let sort_cols =
306
33
            convert_to_sort_cols(&filtered_states[1..is_set_idx], &self.ordering_req);
307
308
33
        let ordered_states = if sort_cols.is_empty() {
309
            // When no ordering is given, use the existing state as is:
310
0
            filtered_states
311
        } else {
312
33
            let indices = lexsort_to_indices(&sort_cols, None)
?0
;
313
33
            take_arrays(&filtered_states, &indices)
?0
314
        };
315
33
        if !ordered_states[0].is_empty() {
316
33
            let first_row = get_row_at_idx(&ordered_states, 0)
?0
;
317
            // When collecting orderings, we exclude the is_set flag from the state.
318
33
            let first_ordering = &first_row[1..is_set_idx];
319
33
            let sort_options = get_sort_options(&self.ordering_req);
320
33
            // Either there is no existing value, or there is an earlier version in new data.
321
33
            if !self.is_set
322
16
                || compare_rows(&self.orderings, first_ordering, &sort_options)
?0
.is_gt()
323
19
            {
324
19
                // Update with first value in the state. Note that we should exclude the
325
19
                // is_set flag from the state. Otherwise, we will end up with a state
326
19
                // containing two is_set flags.
327
19
                self.update_with_new_row(&first_row[0..is_set_idx]);
328
19
            }
14
329
0
        }
330
33
        Ok(())
331
33
    }
332
333
12
    fn evaluate(&mut self) -> Result<ScalarValue> {
334
12
        Ok(self.first.clone())
335
12
    }
336
337
292
    fn size(&self) -> usize {
338
292
        std::mem::size_of_val(self) - std::mem::size_of_val(&self.first)
339
292
            + self.first.size()
340
292
            + ScalarValue::size_of_vec(&self.orderings)
341
292
            - std::mem::size_of_val(&self.orderings)
342
292
    }
343
}
344
345
make_udaf_expr_and_func!(
346
    LastValue,
347
    last_value,
348
    "Returns the last value in a group of values.",
349
    last_value_udaf
350
);
351
352
pub struct LastValue {
353
    signature: Signature,
354
    requirement_satisfied: bool,
355
}
356
357
impl Debug for LastValue {
358
0
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
359
0
        f.debug_struct("LastValue")
360
0
            .field("name", &self.name())
361
0
            .field("signature", &self.signature)
362
0
            .field("accumulator", &"<FUNC>")
363
0
            .finish()
364
0
    }
365
}
366
367
impl Default for LastValue {
368
1
    fn default() -> Self {
369
1
        Self::new()
370
1
    }
371
}
372
373
impl LastValue {
374
1
    pub fn new() -> Self {
375
1
        Self {
376
1
            signature: Signature::one_of(
377
1
                vec![
378
1
                    // TODO: we can introduce more strict signature that only numeric of array types are allowed
379
1
                    TypeSignature::ArraySignature(ArrayFunctionSignature::Array),
380
1
                    TypeSignature::Numeric(1),
381
1
                    TypeSignature::Uniform(1, vec![DataType::Utf8]),
382
1
                ],
383
1
                Volatility::Immutable,
384
1
            ),
385
1
            requirement_satisfied: false,
386
1
        }
387
1
    }
388
389
0
    fn with_requirement_satisfied(mut self, requirement_satisfied: bool) -> Self {
390
0
        self.requirement_satisfied = requirement_satisfied;
391
0
        self
392
0
    }
393
}
394
395
impl AggregateUDFImpl for LastValue {
396
0
    fn as_any(&self) -> &dyn Any {
397
0
        self
398
0
    }
399
400
5
    fn name(&self) -> &str {
401
5
        "last_value"
402
5
    }
403
404
5
    fn signature(&self) -> &Signature {
405
5
        &self.signature
406
5
    }
407
408
5
    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
409
5
        Ok(arg_types[0].clone())
410
5
    }
411
412
65
    fn accumulator(&self, acc_args: AccumulatorArgs) -> Result<Box<dyn Accumulator>> {
413
65
        let ordering_dtypes = acc_args
414
65
            .ordering_req
415
65
            .iter()
416
65
            .map(|e| e.expr.data_type(acc_args.schema))
417
65
            .collect::<Result<Vec<_>>>()
?0
;
418
419
65
        let requirement_satisfied =
420
65
            acc_args.ordering_req.is_empty() || self.requirement_satisfied;
421
422
65
        LastValueAccumulator::try_new(
423
65
            acc_args.return_type,
424
65
            &ordering_dtypes,
425
65
            acc_args.ordering_req.to_vec(),
426
65
            acc_args.ignore_nulls,
427
65
        )
428
65
        .map(|acc| Box::new(acc.with_requirement_satisfied(requirement_satisfied)) as _)
429
65
    }
430
431
29
    fn state_fields(&self, args: StateFieldsArgs) -> Result<Vec<Field>> {
432
29
        let StateFieldsArgs {
433
29
            name,
434
29
            input_types,
435
29
            return_type: _,
436
29
            ordering_fields,
437
29
            is_distinct: _,
438
29
        } = args;
439
29
        let mut fields = vec![Field::new(
440
29
            format_state_name(name, "last_value"),
441
29
            input_types[0].clone(),
442
29
            true,
443
29
        )];
444
29
        fields.extend(ordering_fields.to_vec());
445
29
        fields.push(Field::new("is_set", DataType::Boolean, true));
446
29
        Ok(fields)
447
29
    }
448
449
0
    fn aliases(&self) -> &[String] {
450
0
        &[]
451
0
    }
452
453
0
    fn with_beneficial_ordering(
454
0
        self: Arc<Self>,
455
0
        beneficial_ordering: bool,
456
0
    ) -> Result<Option<Arc<dyn AggregateUDFImpl>>> {
457
0
        Ok(Some(Arc::new(
458
0
            LastValue::new().with_requirement_satisfied(beneficial_ordering),
459
0
        )))
460
0
    }
461
462
26
    fn order_sensitivity(&self) -> AggregateOrderSensitivity {
463
26
        AggregateOrderSensitivity::Beneficial
464
26
    }
465
466
0
    fn reverse_expr(&self) -> datafusion_expr::ReversedUDAF {
467
0
        datafusion_expr::ReversedUDAF::Reversed(first_value_udaf())
468
0
    }
469
}
470
471
#[derive(Debug)]
472
struct LastValueAccumulator {
473
    last: ScalarValue,
474
    // The `is_set` flag keeps track of whether the last value is finalized.
475
    // This information is used to discriminate genuine NULLs and NULLS that
476
    // occur due to empty partitions.
477
    is_set: bool,
478
    orderings: Vec<ScalarValue>,
479
    // Stores the applicable ordering requirement.
480
    ordering_req: LexOrdering,
481
    // Stores whether incoming data already satisfies the ordering requirement.
482
    requirement_satisfied: bool,
483
    // Ignore null values.
484
    ignore_nulls: bool,
485
}
486
487
impl LastValueAccumulator {
488
    /// Creates a new `LastValueAccumulator` for the given `data_type`.
489
65
    pub fn try_new(
490
65
        data_type: &DataType,
491
65
        ordering_dtypes: &[DataType],
492
65
        ordering_req: LexOrdering,
493
65
        ignore_nulls: bool,
494
65
    ) -> Result<Self> {
495
65
        let orderings = ordering_dtypes
496
65
            .iter()
497
65
            .map(ScalarValue::try_from)
498
65
            .collect::<Result<Vec<_>>>()
?0
;
499
65
        let requirement_satisfied = ordering_req.is_empty();
500
65
        ScalarValue::try_from(data_type).map(|last| Self {
501
65
            last,
502
65
            is_set: false,
503
65
            orderings,
504
65
            ordering_req,
505
65
            requirement_satisfied,
506
65
            ignore_nulls,
507
65
        })
508
65
    }
509
510
    // Updates state with the values in the given row.
511
71
    fn update_with_new_row(&mut self, row: &[ScalarValue]) {
512
71
        self.last = row[0].clone();
513
71
        self.orderings = row[1..].to_vec();
514
71
        self.is_set = true;
515
71
    }
516
517
48
    fn get_last_idx(&self, values: &[ArrayRef]) -> Result<Option<usize>> {
518
48
        let [value, ordering_values @ ..] = values else {
519
0
            return internal_err!("Empty row in LAST_VALUE");
520
        };
521
48
        if self.requirement_satisfied {
522
            // Get last entry according to the order of data:
523
0
            if self.ignore_nulls {
524
                // If ignoring nulls, find the last non-null value.
525
0
                for i in (0..value.len()).rev() {
526
0
                    if !value.is_null(i) {
527
0
                        return Ok(Some(i));
528
0
                    }
529
                }
530
0
                return Ok(None);
531
            } else {
532
0
                return Ok((!value.is_empty()).then_some(value.len() - 1));
533
            }
534
48
        }
535
48
        let sort_columns = ordering_values
536
48
            .iter()
537
48
            .zip(self.ordering_req.iter())
538
48
            .map(|(values, req)| {
539
48
                // Take the reverse ordering requirement. This enables us to
540
48
                // use "fetch = 1" to get the last value.
541
48
                SortColumn {
542
48
                    values: Arc::clone(values),
543
48
                    options: Some(!req.options),
544
48
                }
545
48
            })
546
48
            .collect::<Vec<_>>();
547
48
548
48
        if self.ignore_nulls {
549
0
            let indices = lexsort_to_indices(&sort_columns, None)?;
550
            // If ignoring nulls, find the last non-null value.
551
0
            for index in indices.iter().flatten() {
552
0
                if !value.is_null(index as usize) {
553
0
                    return Ok(Some(index as usize));
554
0
                }
555
            }
556
0
            Ok(None)
557
        } else {
558
48
            let indices = lexsort_to_indices(&sort_columns, Some(1))
?0
;
559
48
            Ok((!indices.is_empty()).then_some(indices.value(0) as _))
560
        }
561
48
    }
562
563
65
    fn with_requirement_satisfied(mut self, requirement_satisfied: bool) -> Self {
564
65
        self.requirement_satisfied = requirement_satisfied;
565
65
        self
566
65
    }
567
}
568
569
impl Accumulator for LastValueAccumulator {
570
53
    fn state(&mut self) -> Result<Vec<ScalarValue>> {
571
53
        let mut result = vec![self.last.clone()];
572
53
        result.extend(self.orderings.clone());
573
53
        result.push(ScalarValue::Boolean(Some(self.is_set)));
574
53
        Ok(result)
575
53
    }
576
577
48
    fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> {
578
48
        if !self.is_set || 
self.requirement_satisfied0
{
579
48
            if let Some(last_idx) = self.get_last_idx(values)
?0
{
580
48
                let row = get_row_at_idx(values, last_idx)
?0
;
581
48
                self.update_with_new_row(&row);
582
0
            }
583
0
        } else if let Some(last_idx) = self.get_last_idx(values)? {
584
0
            let row = get_row_at_idx(values, last_idx)?;
585
0
            let orderings = &row[1..];
586
0
            // Update when there is a more recent entry
587
0
            if compare_rows(
588
0
                &self.orderings,
589
0
                orderings,
590
0
                &get_sort_options(&self.ordering_req),
591
0
            )?
592
0
            .is_lt()
593
0
            {
594
0
                self.update_with_new_row(&row);
595
0
            }
596
0
        }
597
598
48
        Ok(())
599
48
    }
600
601
33
    fn merge_batch(&mut self, states: &[ArrayRef]) -> Result<()> {
602
33
        // LAST_VALUE(last1, last2, last3, ...)
603
33
        // last index contains is_set flag.
604
33
        let is_set_idx = states.len() - 1;
605
33
        let flags = states[is_set_idx].as_boolean();
606
33
        let filtered_states = filter_states_according_to_is_set(states, flags)
?0
;
607
        // 1..is_set_idx range corresponds to ordering section
608
33
        let sort_cols =
609
33
            convert_to_sort_cols(&filtered_states[1..is_set_idx], &self.ordering_req);
610
611
33
        let ordered_states = if sort_cols.is_empty() {
612
            // When no ordering is given, use existing state as is:
613
0
            filtered_states
614
        } else {
615
33
            let indices = lexsort_to_indices(&sort_cols, None)
?0
;
616
33
            take_arrays(&filtered_states, &indices)
?0
617
        };
618
619
33
        if !ordered_states[0].is_empty() {
620
33
            let last_idx = ordered_states[0].len() - 1;
621
33
            let last_row = get_row_at_idx(&ordered_states, last_idx)
?0
;
622
            // When collecting orderings, we exclude the is_set flag from the state.
623
33
            let last_ordering = &last_row[1..is_set_idx];
624
33
            let sort_options = get_sort_options(&self.ordering_req);
625
33
            // Either there is no existing value, or there is a newer (latest)
626
33
            // version in the new data:
627
33
            if !self.is_set
628
16
                || compare_rows(&self.orderings, last_ordering, &sort_options)
?0
.is_lt()
629
23
            {
630
23
                // Update with last value in the state. Note that we should exclude the
631
23
                // is_set flag from the state. Otherwise, we will end up with a state
632
23
                // containing two is_set flags.
633
23
                self.update_with_new_row(&last_row[0..is_set_idx]);
634
23
            }
10
635
0
        }
636
33
        Ok(())
637
33
    }
638
639
12
    fn evaluate(&mut self) -> Result<ScalarValue> {
640
12
        Ok(self.last.clone())
641
12
    }
642
643
292
    fn size(&self) -> usize {
644
292
        std::mem::size_of_val(self) - std::mem::size_of_val(&self.last)
645
292
            + self.last.size()
646
292
            + ScalarValue::size_of_vec(&self.orderings)
647
292
            - std::mem::size_of_val(&self.orderings)
648
292
    }
649
}
650
651
/// Filters states according to the `is_set` flag at the last column and returns
652
/// the resulting states.
653
66
fn filter_states_according_to_is_set(
654
66
    states: &[ArrayRef],
655
66
    flags: &BooleanArray,
656
66
) -> Result<Vec<ArrayRef>> {
657
66
    states
658
66
        .iter()
659
198
        .map(|state| compute::filter(state, flags).map_err(|e| 
arrow_datafusion_err!(e)0
))
660
66
        .collect::<Result<Vec<_>>>()
661
66
}
662
663
/// Combines array refs and their corresponding orderings to construct `SortColumn`s.
664
66
fn convert_to_sort_cols(
665
66
    arrs: &[ArrayRef],
666
66
    sort_exprs: &[PhysicalSortExpr],
667
66
) -> Vec<SortColumn> {
668
66
    arrs.iter()
669
66
        .zip(sort_exprs.iter())
670
66
        .map(|(item, sort_expr)| SortColumn {
671
66
            values: Arc::clone(item),
672
66
            options: Some(sort_expr.options),
673
66
        })
674
66
        .collect::<Vec<_>>()
675
66
}
676
677
#[cfg(test)]
mod tests {
    use arrow::array::Int64Array;

    use super::*;

    /// Builds three Int64 arrays holding 0 to 9, 1 to 10 and 2 to 12.
    fn sample_arrays() -> Vec<ArrayRef> {
        // The first value in each tuple is the start of the range (inclusive),
        // the second value is the end of the range (exclusive).
        let ranges: Vec<(i64, i64)> = vec![(0, 10), (1, 11), (2, 13)];
        ranges
            .into_iter()
            .map(|(start, end)| {
                Arc::new(Int64Array::from((start..end).collect::<Vec<_>>())) as ArrayRef
            })
            .collect()
    }

    /// Feeds `batch` to `accumulator` and returns the resulting state.
    fn state_after_update(
        mut accumulator: impl Accumulator,
        batch: &ArrayRef,
    ) -> Result<Vec<ScalarValue>> {
        accumulator.update_batch(&[Arc::clone(batch)])?;
        accumulator.state()
    }

    /// Concatenates two accumulator states column by column, producing the
    /// arrays expected by `merge_batch`.
    fn concat_states(
        left: &[ScalarValue],
        right: &[ScalarValue],
    ) -> Result<Vec<ArrayRef>> {
        let mut columns = Vec::with_capacity(left.len());
        for (l, r) in left.iter().zip(right) {
            columns.push(arrow::compute::concat(&[&l.to_array()?, &r.to_array()?])?);
        }
        Ok(columns)
    }

    #[test]
    fn test_first_last_value_value() -> Result<()> {
        let mut first_accumulator =
            FirstValueAccumulator::try_new(&DataType::Int64, &[], vec![], false)?;
        let mut last_accumulator =
            LastValueAccumulator::try_new(&DataType::Int64, &[], vec![], false)?;

        for arr in sample_arrays() {
            // Once first_value is set, the accumulator should remember it and
            // not update it for each new batch.
            first_accumulator.update_batch(&[Arc::clone(&arr)])?;
            // last_value should be updated for each new batch.
            last_accumulator.update_batch(&[arr])?;
        }

        // The first value comes from the first value of the first batch, which is 0.
        assert_eq!(first_accumulator.evaluate()?, ScalarValue::Int64(Some(0)));
        // The last value comes from the last value of the last batch, which is 12.
        assert_eq!(last_accumulator.evaluate()?, ScalarValue::Int64(Some(12)));
        Ok(())
    }

    #[test]
    fn test_first_last_state_after_merge() -> Result<()> {
        let arrs = sample_arrays();

        // FirstValueAccumulator
        let new_first =
            || FirstValueAccumulator::try_new(&DataType::Int64, &[], vec![], false);
        let state1 = state_after_update(new_first()?, &arrs[0])?;
        let state2 = state_after_update(new_first()?, &arrs[1])?;
        assert_eq!(state1.len(), state2.len());

        let states = concat_states(&state1, &state2)?;
        let mut first_accumulator = new_first()?;
        first_accumulator.merge_batch(&states)?;

        // Merging must not change the number of state columns.
        let merged_state = first_accumulator.state()?;
        assert_eq!(merged_state.len(), state1.len());

        // LastValueAccumulator
        let new_last =
            || LastValueAccumulator::try_new(&DataType::Int64, &[], vec![], false);
        let state1 = state_after_update(new_last()?, &arrs[0])?;
        let state2 = state_after_update(new_last()?, &arrs[1])?;
        assert_eq!(state1.len(), state2.len());

        let states = concat_states(&state1, &state2)?;
        let mut last_accumulator = new_last()?;
        last_accumulator.merge_batch(&states)?;

        // Merging must not change the number of state columns.
        let merged_state = last_accumulator.state()?;
        assert_eq!(merged_state.len(), state1.len());

        Ok(())
    }
}