/Users/andrewlamb/Software/datafusion/datafusion/physical-expr/src/window/cume_dist.rs
Line | Count | Source (jump to first uncovered line) |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | //! Defines the physical expression for `cume_dist` that can be evaluated |
19 | | //! at runtime during query execution |
20 | | |
21 | | use crate::window::BuiltInWindowFunctionExpr; |
22 | | use crate::PhysicalExpr; |
23 | | use arrow::array::ArrayRef; |
24 | | use arrow::array::Float64Array; |
25 | | use arrow::datatypes::{DataType, Field}; |
26 | | use datafusion_common::Result; |
27 | | use datafusion_expr::PartitionEvaluator; |
28 | | use std::any::Any; |
29 | | use std::iter; |
30 | | use std::ops::Range; |
31 | | use std::sync::Arc; |
32 | | |
33 | | /// CumeDist is the physical expression for the `cume_dist` window function with order by; it stores the output name and data type, and `create_evaluator` hands out a `CumeDistEvaluator` that does the computation |
34 | | #[derive(Debug)] |
35 | | pub struct CumeDist { |
36 | | name: String, /* returned by `name()` and used as the output field name in `field()` */ |
37 | | /// Output data type, cloned into the field built by `field()` |
38 | | data_type: DataType, |
39 | | } |
40 | | |
41 | | /// Create a `cume_dist` window function expression: takes ownership of `name` and stores a clone of `data_type` in the returned `CumeDist` |
42 | 0 | pub fn cume_dist(name: String, data_type: &DataType) -> CumeDist { |
43 | 0 | CumeDist { |
44 | 0 | name, |
45 | 0 | data_type: data_type.clone(), |
46 | 0 | } |
47 | 0 | } |
48 | | |
49 | | impl BuiltInWindowFunctionExpr for CumeDist { |
50 | | /// Return a reference to `Any` that can be used for downcasting back to `CumeDist` |
51 | 0 | fn as_any(&self) -> &dyn Any { |
52 | 0 | self |
53 | 0 | } |
54 | | |
55 | 0 | fn field(&self) -> Result<Field> { /* output field built from this expression's name and data type */ |
56 | 0 | let nullable = false; /* the output field is always declared non-nullable */ |
57 | 0 | Ok(Field::new(self.name(), self.data_type.clone(), nullable)) |
58 | 0 | } |
59 | | |
60 | 0 | fn expressions(&self) -> Vec<Arc<dyn PhysicalExpr>> { /* returns an empty list: no argument expressions are exposed */ |
61 | 0 | vec![] |
62 | 0 | } |
63 | | |
64 | 0 | fn name(&self) -> &str { /* borrows the stored `name` field */ |
65 | 0 | &self.name |
66 | 0 | } |
67 | | |
68 | 0 | fn create_evaluator(&self) -> Result<Box<dyn PartitionEvaluator>> { /* always returns `Ok` with a new boxed `CumeDistEvaluator` */ |
69 | 0 | Ok(Box::new(CumeDistEvaluator {})) |
70 | 0 | } |
71 | | } |
72 | | |
73 | | #[derive(Debug)] |
74 | | pub(crate) struct CumeDistEvaluator; /* field-less evaluator; its `PartitionEvaluator` impl computes the `cume_dist` values */ |
75 | | |
76 | | impl PartitionEvaluator for CumeDistEvaluator { |
77 | 0 | fn evaluate_all_with_rank( /* returns a Float64Array with one value per row covered by `ranks_in_partition`: for each range, (summed length of all ranges up to and including it) / `num_rows`, repeated once per row of that range */ |
78 | 0 | &self, |
79 | 0 | num_rows: usize, /* divisor for every output value; NOTE(review): presumably the partition's total row count - it is not checked against the summed range lengths */ |
80 | 0 | ranks_in_partition: &[Range<usize>], /* presumably one range per group of tied rows, in order - confirm against the caller */ |
81 | 0 | ) -> Result<ArrayRef> { |
82 | 0 | let scalar = num_rows as f64; |
83 | 0 | let result = Float64Array::from_iter_values( |
84 | 0 | ranks_in_partition |
85 | 0 | .iter() |
86 | 0 | .scan(0_u64, |acc, range| { /* `acc` is the running count of rows covered so far */ |
87 | 0 | let len = range.end - range.start; |
88 | 0 | *acc += len as u64; |
89 | 0 | let value: f64 = (*acc as f64) / scalar; |
90 | 0 | let result = iter::repeat(value).take(len); /* every row of the range gets the same value */ |
91 | 0 | Some(result) /* always `Some`, so the scan never stops early */ |
92 | 0 | }) |
93 | 0 | .flatten(), |
94 | 0 | ); |
95 | 0 | Ok(Arc::new(result)) |
96 | 0 | } |
97 | | |
98 | 0 | fn include_rank(&self) -> bool { /* always true; presumably signals that `evaluate_all_with_rank` should be used - confirm against `PartitionEvaluator` */ |
99 | 0 | true |
100 | 0 | } |
101 | | } |
102 | | |
103 | | #[cfg(test)] |
104 | | mod tests { |
105 | | use super::*; |
106 | | use datafusion_common::cast::as_float64_array; |
107 | | |
108 | | fn test_i32_result( /* creates an evaluator from `expr`, runs `evaluate_all_with_rank(num_rows, &ranks)` and asserts the resulting values equal `expected`; NOTE(review): despite the `i32` in the name, the compared values are f64 */ |
109 | | expr: &CumeDist, |
110 | | num_rows: usize, |
111 | | ranks: Vec<Range<usize>>, |
112 | | expected: Vec<f64>, |
113 | | ) -> Result<()> { |
114 | | let result = expr |
115 | | .create_evaluator()? |
116 | | .evaluate_all_with_rank(num_rows, &ranks)?; |
117 | | let result = as_float64_array(&result)?; /* downcast the returned array so its raw f64 values can be compared */ |
118 | | let result = result.values(); |
119 | | assert_eq!(expected, *result); |
120 | | Ok(()) |
121 | | } |
122 | | |
123 | | #[test] |
124 | | #[allow(clippy::single_range_in_vec_init)] /* the cases below deliberately build one-element vectors of ranges such as `vec![0..1]` */ |
125 | | fn test_cume_dist() -> Result<()> { |
126 | | let r = cume_dist("arr".into(), &DataType::Float64); |
127 | | |
128 | | let expected = vec![0.0; 0]; /* zero rows and no ranges: empty output */ |
129 | | test_i32_result(&r, 0, vec![], expected)?; |
130 | | |
131 | | let expected = vec![1.0; 1]; /* a single one-row range */ |
132 | | test_i32_result(&r, 1, vec![0..1], expected)?; |
133 | | |
134 | | let expected = vec![1.0; 2]; /* one range covering both rows: both get 2/2 */ |
135 | | test_i32_result(&r, 2, vec![0..2], expected)?; |
136 | | |
137 | | let expected = vec![0.5, 0.5, 1.0, 1.0]; /* two ranges of two rows each: 2/4 then 4/4 */ |
138 | | test_i32_result(&r, 4, vec![0..2, 2..4], expected)?; |
139 | | |
140 | | let expected = vec![0.25, 0.5, 0.75, 1.0]; /* four single-row ranges: 1/4, 2/4, 3/4, 4/4 */ |
141 | | test_i32_result(&r, 4, vec![0..1, 1..2, 2..3, 3..4], expected)?; |
142 | | |
143 | | Ok(()) |
144 | | } |
145 | | } |