/Users/andrewlamb/Software/datafusion/datafusion/physical-plan/src/aggregates/order/full.rs
Line | Count | Source (jump to first uncovered line) |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | use datafusion_expr::EmitTo; |
19 | | |
20 | | /// Tracks grouping state when the data is ordered entirely by its |
21 | | /// group keys |
22 | | /// |
23 | | /// When the group values are sorted, as soon as we see group `n+1` we |
24 | | /// know we will never see any rows for group `n` again and thus they |
25 | | /// can be emitted. |
26 | | /// |
27 | | /// For example, given `SUM(amt) GROUP BY id` if the input is sorted |
28 | | /// by `id` as soon as a new `id` value is seen all previous values |
29 | | /// can be emitted. |
30 | | /// |
31 | | /// The state is tracked like this: |
32 | | /// |
33 | | /// ```text |
34 | | /// ┌─────┐ ┌──────────────────┐ |
35 | | /// │┌───┐│ │ ┌──────────────┐ │ ┏━━━━━━━━━━━━━━┓ |
36 | | /// ││ 0 ││ │ │ 123 │ │ ┌─────┃ 13 ┃ |
37 | | /// │└───┘│ │ └──────────────┘ │ │ ┗━━━━━━━━━━━━━━┛ |
38 | | /// │ ... │ │ ... │ │ |
39 | | /// │┌───┐│ │ ┌──────────────┐ │ │ current |
40 | | /// ││12 ││ │ │ 234 │ │ │ |
41 | | /// │├───┤│ │ ├──────────────┤ │ │ |
42 | | /// ││12 ││ │ │ 234 │ │ │ |
43 | | /// │├───┤│ │ ├──────────────┤ │ │ |
44 | | /// ││13 ││ │ │ 456 │◀┼───┘ |
45 | | /// │└───┘│ │ └──────────────┘ │ |
46 | | /// └─────┘ └──────────────────┘ |
47 | | /// |
48 | | /// group indices group_values current tracks the most |
49 | | /// (in group value recent group index |
50 | | /// order) |
51 | | /// ``` |
52 | | /// |
53 | | /// In this diagram, the current group is `13`, and thus groups |
54 | | /// `0` through `12` can be emitted. Note that `13` cannot yet be emitted as |
55 | | /// there may be more values in the next batch with the same group_id. |
56 | | #[derive(Debug)] |
57 | | pub struct GroupOrderingFull { |
58 | | state: State, |
59 | | } |
60 | | |
61 | | #[derive(Debug)] |
62 | | enum State { |
63 | | /// Seen no input yet |
64 | | Start, |
65 | | |
66 | | /// Data is in progress. `current` is the current group for which |
67 | | /// values are being generated. All groups before `current` can be emitted |
68 | | InProgress { current: usize }, |
69 | | |
70 | | /// Seen end of input: all groups can be emitted |
71 | | Complete, |
72 | | } |
73 | | |
74 | | impl GroupOrderingFull { |
75 | 4 | pub fn new() -> Self { |
76 | 4 | Self { |
77 | 4 | state: State::Start, |
78 | 4 | } |
79 | 4 | } |
80 | | |
81 | | // How many groups can be emitted, or None if no data can be emitted |
82 | 12 | pub fn emit_to(&self) -> Option<EmitTo> { |
83 | 12 | match &self.state { |
84 | 0 | State::Start => None, |
85 | 12 | State::InProgress { current, .. } => { |
86 | 12 | if *current == 0 { |
87 | | // Cannot emit if still on the first group |
88 | 4 | None |
89 | | } else { |
90 | | // otherwise emit all groups prior to the current group |
91 | 8 | Some(EmitTo::First(*current)) |
92 | | } |
93 | | } |
94 | 0 | State::Complete { .. } => Some(EmitTo::All), |
95 | | } |
96 | 12 | } |
97 | | |
98 | | /// remove the first n groups from the internal state, shifting |
99 | | /// all existing indexes down by `n` |
100 | 8 | pub fn remove_groups(&mut self, n: usize) { |
101 | 8 | match &mut self.state { |
102 | 0 | State::Start => panic!("invalid state: start"), |
103 | 8 | State::InProgress { current } => { |
104 | 8 | // shift down by n |
105 | 8 | assert!(*current >= n); |
106 | 8 | *current -= n; |
107 | | } |
108 | 0 | State::Complete { .. } => panic!("invalid state: complete"), |
109 | | } |
110 | 8 | } |
111 | | |
112 | | /// Note that the input is complete so any outstanding groups are done as well |
113 | 4 | pub fn input_done(&mut self) { |
114 | 4 | self.state = State::Complete; |
115 | 4 | } |
116 | | |
117 | | /// Called when new groups are added in a batch. See documentation |
118 | | /// on [`super::GroupOrdering::new_groups`] |
119 | 12 | pub fn new_groups(&mut self, total_num_groups: usize) { |
120 | 12 | assert_ne!(total_num_groups, 0); |
121 | | |
122 | | // Update state |
123 | 12 | let max_group_index = total_num_groups - 1; |
124 | 12 | self.state = match self.state { |
125 | 4 | State::Start => State::InProgress { |
126 | 4 | current: max_group_index, |
127 | 4 | }, |
128 | 8 | State::InProgress { current } => { |
129 | 8 | // expect to see new group indexes when called again |
130 | 8 | assert!(current <= max_group_index, "{current} <= {max_group_index}");
131 | 8 | State::InProgress { |
132 | 8 | current: max_group_index, |
133 | 8 | } |
134 | | } |
135 | | State::Complete { .. } => { |
136 | 0 | panic!("Saw new group after input was complete"); |
137 | | } |
138 | | }; |
139 | 12 | } |
140 | | |
141 | 28 | pub(crate) fn size(&self) -> usize { |
142 | 28 | std::mem::size_of::<Self>() |
143 | 28 | } |
144 | | } |
145 | | |
146 | | impl Default for GroupOrderingFull { |
147 | 0 | fn default() -> Self { |
148 | 0 | Self::new() |
149 | 0 | } |
150 | | } |