/Users/andrewlamb/Software/datafusion/datafusion/physical-expr/src/window/rank.rs
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

//! Defines physical expressions for `rank`, `dense_rank`, and `percent_rank` that can be
//! evaluated at runtime during query execution

use crate::expressions::Column;
use crate::window::window_expr::RankState;
use crate::window::BuiltInWindowFunctionExpr;
use crate::{PhysicalExpr, PhysicalSortExpr};

use arrow::array::ArrayRef;
use arrow::array::{Float64Array, UInt64Array};
use arrow::datatypes::{DataType, Field};
use arrow_schema::{SchemaRef, SortOptions};
use datafusion_common::utils::get_row_at_idx;
use datafusion_common::{exec_err, Result, ScalarValue};
use datafusion_expr::PartitionEvaluator;

use std::any::Any;
use std::iter;
use std::ops::Range;
use std::sync::Arc;

/// Rank calculates the rank of a row within its window partition, according to
/// the window's ORDER BY expressions
#[derive(Debug)]
pub struct Rank {
    /// Output column name
    name: String,
    /// Which rank variant to compute
    rank_type: RankType,
    /// Output data type
    data_type: DataType,
}

impl Rank {
    /// Get the rank variant (`RANK`, `DENSE_RANK`, or `PERCENT_RANK`) computed
    /// by this expression
    pub fn get_type(&self) -> RankType {
        self.rank_type
    }
}

/// The rank variants supported by [`Rank`]
#[derive(Debug, Copy, Clone)]
pub enum RankType {
    /// `RANK`: tied rows get the same rank, leaving gaps afterwards
    Basic,
    /// `DENSE_RANK`: tied rows get the same rank, with no gaps
    Dense,
    /// `PERCENT_RANK`: `(rank - 1) / (partition rows - 1)`, ranging from 0 to 1
    Percent,
}

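// Illustrative example (added for exposition, not part of the original source):
// for ORDER BY values [10, 10, 20, 30, 30] within one partition, the three
// variants produce
//   Basic   (RANK):         1, 1, 3, 4, 4
//   Dense   (DENSE_RANK):   1, 1, 2, 3, 3
//   Percent (PERCENT_RANK): 0.0, 0.0, 0.5, 0.75, 0.75
// following the semantics documented at
// https://www.postgresql.org/docs/current/functions-window.html
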
/// Create a rank window function
pub fn rank(name: String, data_type: &DataType) -> Rank {
    Rank {
        name,
        rank_type: RankType::Basic,
        data_type: data_type.clone(),
    }
}

/// Create a dense rank window function
pub fn dense_rank(name: String, data_type: &DataType) -> Rank {
    Rank {
        name,
        rank_type: RankType::Dense,
        data_type: data_type.clone(),
    }
}

/// Create a percent rank window function
pub fn percent_rank(name: String, data_type: &DataType) -> Rank {
    Rank {
        name,
        rank_type: RankType::Percent,
        data_type: data_type.clone(),
    }
}

impl BuiltInWindowFunctionExpr for Rank {
    /// Return a reference to Any that can be used for downcasting
    fn as_any(&self) -> &dyn Any {
        self
    }

    fn field(&self) -> Result<Field> {
        let nullable = false;
        Ok(Field::new(self.name(), self.data_type.clone(), nullable))
    }

    fn expressions(&self) -> Vec<Arc<dyn PhysicalExpr>> {
        vec![]
    }

    fn name(&self) -> &str {
        &self.name
    }

    fn create_evaluator(&self) -> Result<Box<dyn PartitionEvaluator>> {
        Ok(Box::new(RankEvaluator {
            state: RankState::default(),
            rank_type: self.rank_type,
        }))
    }

    fn get_result_ordering(&self, schema: &SchemaRef) -> Option<PhysicalSortExpr> {
        // The built-in RANK window function (in all modes) introduces a new ordering:
        schema.column_with_name(self.name()).map(|(idx, field)| {
            let expr = Arc::new(Column::new(field.name(), idx));
            let options = SortOptions {
                descending: false,
                nulls_first: false,
            }; // ASC, NULLS LAST
            PhysicalSortExpr { expr, options }
        })
    }
}

/// Stateful partition evaluator for the `RANK` family of window functions
#[derive(Debug)]
pub(crate) struct RankEvaluator {
    state: RankState,
    rank_type: RankType,
}

impl PartitionEvaluator for RankEvaluator {
    fn is_causal(&self) -> bool {
        // RANK and DENSE_RANK only depend on rows seen so far; PERCENT_RANK also
        // needs the total partition row count, so it is not causal.
        matches!(self.rank_type, RankType::Basic | RankType::Dense)
    }

    /// Evaluates the window function inside the given range.
    fn evaluate(
        &mut self,
        values: &[ArrayRef],
        range: &Range<usize>,
    ) -> Result<ScalarValue> {
        let row_idx = range.start;
        // RANK-family functions take no arguments; `values` holds the ORDER BY
        // column values on which the rank is calculated
        let range_columns = values;
        let last_rank_data = get_row_at_idx(range_columns, row_idx)?;
        let new_rank_encountered =
            if let Some(state_last_rank_data) = &self.state.last_rank_data {
                // if the ORDER BY values change, a new rank group starts
                state_last_rank_data != &last_rank_data
            } else {
                // First rank seen
                true
            };
        if new_rank_encountered {
            self.state.last_rank_data = Some(last_rank_data);
            self.state.last_rank_boundary += self.state.current_group_count;
            self.state.current_group_count = 1;
            self.state.n_rank += 1;
        } else {
            // the row is still in the same rank group
            self.state.current_group_count += 1;
        }
        match self.rank_type {
            RankType::Basic => Ok(ScalarValue::UInt64(Some(
                self.state.last_rank_boundary as u64 + 1,
            ))),
            RankType::Dense => Ok(ScalarValue::UInt64(Some(self.state.n_rank as u64))),
            RankType::Percent => {
                exec_err!("Can not execute PERCENT_RANK in a streaming fashion")
            }
        }
    }
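
    // Illustrative trace (added for exposition): feeding ORDER BY values
    // [A, A, B] through `evaluate` one row at a time, the state evolves as
    //   (last_rank_boundary, n_rank): (0, 1) -> (0, 1) -> (2, 2)
    // so RANK returns 1, 1, 3 and DENSE_RANK returns 1, 1, 2.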

    fn evaluate_all_with_rank(
        &self,
        num_rows: usize,
        ranks_in_partition: &[Range<usize>],
    ) -> Result<ArrayRef> {
        // see https://www.postgresql.org/docs/current/functions-window.html
        let result: ArrayRef = match self.rank_type {
            RankType::Dense => Arc::new(UInt64Array::from_iter_values(
                ranks_in_partition
                    .iter()
                    .zip(1u64..)
                    .flat_map(|(range, rank)| {
                        let len = range.end - range.start;
                        iter::repeat(rank).take(len)
                    }),
            )),
            RankType::Percent => {
                // Returns the relative rank of the current row, that is
                // (rank - 1) / (total partition rows - 1). The value thus
                // ranges from 0 to 1 inclusive.
                let denominator = num_rows as f64;
                Arc::new(Float64Array::from_iter_values(
                    ranks_in_partition
                        .iter()
                        .scan(0_u64, |acc, range| {
                            let len = range.end - range.start;
                            // `.max(1.0)` avoids dividing by zero for single-row
                            // (or empty) partitions, yielding 0.0 in that case
                            let value = (*acc as f64) / (denominator - 1.0).max(1.0);
                            let result = iter::repeat(value).take(len);
                            *acc += len as u64;
                            Some(result)
                        })
                        .flatten(),
                ))
            }
            RankType::Basic => Arc::new(UInt64Array::from_iter_values(
                ranks_in_partition
                    .iter()
                    .scan(1_u64, |acc, range| {
                        let len = range.end - range.start;
                        let result = iter::repeat(*acc).take(len);
                        *acc += len as u64;
                        Some(result)
                    })
                    .flatten(),
            )),
        };
        Ok(result)
    }
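
    // Worked example (added for exposition): with `num_rows = 4` and peer-group
    // ranges `[0..1, 1..3, 3..4]`, the arms above produce
    //   Basic:   [1, 2, 2, 4]
    //   Dense:   [1, 2, 2, 3]
    //   Percent: [0.0, 1.0 / 3.0, 1.0 / 3.0, 1.0]  // (rank - 1) / (num_rows - 1)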

    fn supports_bounded_execution(&self) -> bool {
        // Only RANK and DENSE_RANK can be computed incrementally; PERCENT_RANK
        // requires the total number of rows in the partition.
        matches!(self.rank_type, RankType::Basic | RankType::Dense)
    }

    fn include_rank(&self) -> bool {
        true
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use datafusion_common::cast::{as_float64_array, as_uint64_array};

    fn test_with_rank(expr: &Rank, expected: Vec<u64>) -> Result<()> {
        test_i32_result(expr, vec![0..2, 2..3, 3..6, 6..7, 7..8], expected)
    }

    #[allow(clippy::single_range_in_vec_init)]
    fn test_without_rank(expr: &Rank, expected: Vec<u64>) -> Result<()> {
        test_i32_result(expr, vec![0..8], expected)
    }

    fn test_f64_result(
        expr: &Rank,
        num_rows: usize,
        ranks: Vec<Range<usize>>,
        expected: Vec<f64>,
    ) -> Result<()> {
        let result = expr
            .create_evaluator()?
            .evaluate_all_with_rank(num_rows, &ranks)?;
        let result = as_float64_array(&result)?;
        let result = result.values();
        assert_eq!(expected, *result);
        Ok(())
    }

    fn test_i32_result(
        expr: &Rank,
        ranks: Vec<Range<usize>>,
        expected: Vec<u64>,
    ) -> Result<()> {
        let result = expr.create_evaluator()?.evaluate_all_with_rank(8, &ranks)?;
        let result = as_uint64_array(&result)?;
        let result = result.values();
        assert_eq!(expected, *result);
        Ok(())
    }

    #[test]
    fn test_dense_rank() -> Result<()> {
        let r = dense_rank("arr".into(), &DataType::UInt64);
        test_without_rank(&r, vec![1; 8])?;
        test_with_rank(&r, vec![1, 1, 2, 3, 3, 3, 4, 5])?;
        Ok(())
    }

    #[test]
    fn test_rank() -> Result<()> {
        let r = rank("arr".into(), &DataType::UInt64);
        test_without_rank(&r, vec![1; 8])?;
        test_with_rank(&r, vec![1, 1, 3, 4, 4, 4, 7, 8])?;
        Ok(())
    }

    #[test]
    #[allow(clippy::single_range_in_vec_init)]
    fn test_percent_rank() -> Result<()> {
        let r = percent_rank("arr".into(), &DataType::Float64);

        // empty case
        let expected = vec![0.0; 0];
        test_f64_result(&r, 0, vec![0..0; 0], expected)?;

        // singleton case
        let expected = vec![0.0];
        test_f64_result(&r, 1, vec![0..1], expected)?;

        // uniform case
        let expected = vec![0.0; 7];
        test_f64_result(&r, 7, vec![0..7], expected)?;

        // non-trivial case
        let expected = vec![0.0, 0.0, 0.0, 0.5, 0.5, 0.5, 0.5];
        test_f64_result(&r, 7, vec![0..3, 3..7], expected)?;

        Ok(())
    }
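
    /// Extra usage sketch (added for exposition, not part of the original test
    /// suite): build a `rank` expression and evaluate a whole partition given
    /// its peer-group ranges directly.
    #[test]
    fn test_rank_usage_sketch() -> Result<()> {
        let r = rank("arr".into(), &DataType::UInt64);
        // Rows 0-1 tie, row 2 stands alone, rows 3-4 tie.
        let ranks = vec![0..2, 2..3, 3..5];
        let result = r.create_evaluator()?.evaluate_all_with_rank(5, &ranks)?;
        let result = as_uint64_array(&result)?;
        let result = result.values();
        assert_eq!(vec![1u64, 1, 3, 4, 4], *result);
        Ok(())
    }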
}