/Users/andrewlamb/Software/datafusion/datafusion/expr-common/src/columnar_value.rs
Line | Count | Source (jump to first uncovered line) |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | //! [`ColumnarValue`] represents the result of evaluating an expression. |
19 | | |
20 | | use arrow::array::ArrayRef; |
21 | | use arrow::array::NullArray; |
22 | | use arrow::compute::{kernels, CastOptions}; |
23 | | use arrow::datatypes::{DataType, TimeUnit}; |
24 | | use datafusion_common::format::DEFAULT_CAST_OPTIONS; |
25 | | use datafusion_common::{internal_err, Result, ScalarValue}; |
26 | | use std::sync::Arc; |
27 | | |
28 | | /// The result of evaluating an expression. |
29 | | /// |
30 | | /// [`ColumnarValue::Scalar`] represents a single value repeated any number of |
31 | | /// times. This is an important performance optimization for handling values |
32 | | /// that do not change across rows. |
33 | | /// |
34 | | /// [`ColumnarValue::Array`] represents a column of data, stored as an Arrow |
35 | | /// [`ArrayRef`] |
36 | | /// |
37 | | /// A slice of `ColumnarValue`s logically represents a table, with each column |
38 | | /// having the same number of rows. This means that all `Array`s are the same |
39 | | /// length. |
40 | | /// |
41 | | /// # Example |
42 | | /// |
43 | | /// A `ColumnarValue::Array` with an array of 5 elements and a |
44 | | /// `ColumnarValue::Scalar` with the value 100 |
45 | | /// |
46 | | /// ```text |
47 | | /// ┌──────────────┐ |
48 | | /// │ ┌──────────┐ │ |
49 | | /// │ │   "A"    │ │ |
50 | | /// │ ├──────────┤ │ |
51 | | /// │ │   "B"    │ │ |
52 | | /// │ ├──────────┤ │ |
53 | | /// │ │   "C"    │ │ |
54 | | /// │ ├──────────┤ │ |
55 | | /// │ │   "D"    │ │        ┌──────────────┐ |
56 | | /// │ ├──────────┤ │        │ ┌──────────┐ │ |
57 | | /// │ │   "E"    │ │        │ │   100    │ │ |
58 | | /// │ └──────────┘ │        │ └──────────┘ │ |
59 | | /// └──────────────┘        └──────────────┘ |
60 | | /// |
61 | | ///  ColumnarValue::        ColumnarValue:: |
62 | | ///       Array                  Scalar |
63 | | /// ``` |
64 | | /// |
65 | | /// Logically represents the following table: |
66 | | /// |
67 | | /// | Column 1| Column 2 | |
68 | | /// | ------- | -------- | |
69 | | /// | A | 100 | |
70 | | /// | B | 100 | |
71 | | /// | C | 100 | |
72 | | /// | D | 100 | |
73 | | /// | E | 100 | |
74 | | /// |
75 | | /// # Performance Notes |
76 | | /// |
77 | | /// When implementing functions or operators, it is important to consider the |
78 | | /// performance implications of handling scalar values. |
79 | | /// |
80 | | /// Because all functions must handle [`ArrayRef`], it is |
81 | | /// convenient to convert [`ColumnarValue::Scalar`]s using |
82 | | /// [`Self::into_array`]. For example, [`ColumnarValue::values_to_arrays`] |
83 | | /// converts multiple columnar values into arrays of the same length. |
84 | | /// |
85 | | /// However, it is often much more performant to provide a separate |
86 | | /// implementation that handles scalar values directly. |
87 | | #[derive(Clone, Debug)] |
88 | | pub enum ColumnarValue { |
89 | | /// A column of values, stored as an Arrow [`ArrayRef`] |
90 | | Array(ArrayRef), |
91 | | /// A single [`ScalarValue`], logically repeated for every row |
92 | | Scalar(ScalarValue), |
93 | | } |
94 | | |
95 | | impl From<ArrayRef> for ColumnarValue { |
96 | 0 | fn from(value: ArrayRef) -> Self { |
97 | 0 | ColumnarValue::Array(value) |
98 | 0 | } |
99 | | } |
100 | | |
101 | | impl From<ScalarValue> for ColumnarValue { |
102 | 0 | fn from(value: ScalarValue) -> Self { |
103 | 0 | ColumnarValue::Scalar(value) |
104 | 0 | } |
105 | | } |
106 | | |
107 | | impl ColumnarValue { |
108 | 264k | pub fn data_type(&self) -> DataType { |
109 | 264k | match self { |
110 | 175k | ColumnarValue::Array(array_value) => array_value.data_type().clone(), |
111 | 88.3k | ColumnarValue::Scalar(scalar_value) => scalar_value.data_type(), |
112 | | } |
113 | 264k | } |
114 | | |
115 | | /// Convert a columnar value into an Arrow [`ArrayRef`] with the specified |
116 | | /// number of rows. [`Self::Scalar`] is converted by repeating the same |
117 | | /// scalar multiple times which is not as efficient as handling the scalar |
118 | | /// directly. |
119 | | /// |
120 | | /// See [`Self::values_to_arrays`] to convert multiple columnar values into |
121 | | /// arrays of the same length. |
122 | | /// |
123 | | /// # Errors |
124 | | /// |
125 | | /// Errors if `self` is a Scalar that fails to be converted into an array of size |
126 | 129k | pub fn into_array(self, num_rows: usize) -> Result<ArrayRef> { |
127 | 129k | Ok(match self { |
128 | 129k | ColumnarValue::Array(array) => array, |
129 | 68 | ColumnarValue::Scalar(scalar) => scalar.to_array_of_size(num_rows)?0 , |
130 | | }) |
131 | 129k | } |
132 | | |
133 | | /// null columnar values are implemented as a null array in order to pass batch |
134 | | /// num_rows |
135 | 0 | pub fn create_null_array(num_rows: usize) -> Self { |
136 | 0 | ColumnarValue::Array(Arc::new(NullArray::new(num_rows))) |
137 | 0 | } |
138 | | |
139 | | /// Converts [`ColumnarValue`]s to [`ArrayRef`]s with the same length. |
140 | | /// |
141 | | /// # Performance Note |
142 | | /// |
143 | | /// This function expands any [`ScalarValue`] to an array. This expansion |
144 | | /// permits using a single function in terms of arrays, but it can be |
145 | | /// inefficient compared to handling the scalar value directly. |
146 | | /// |
147 | | /// Thus, it is recommended to provide specialized implementations for |
148 | | /// scalar values if performance is a concern. |
149 | | /// |
150 | | /// # Errors |
151 | | /// |
152 | | /// If there are multiple array arguments that have different lengths |
153 | 0 | pub fn values_to_arrays(args: &[ColumnarValue]) -> Result<Vec<ArrayRef>> { |
154 | 0 | if args.is_empty() { |
155 | 0 | return Ok(vec![]); |
156 | 0 | } |
157 | 0 |
|
158 | 0 | let mut array_len = None; |
159 | 0 | for arg in args { |
160 | 0 | array_len = match (arg, array_len) { |
161 | 0 | (ColumnarValue::Array(a), None) => Some(a.len()), |
162 | 0 | (ColumnarValue::Array(a), Some(array_len)) => { |
163 | 0 | if array_len == a.len() { |
164 | 0 | Some(array_len) |
165 | | } else { |
166 | 0 | return internal_err!( |
167 | 0 | "Arguments has mixed length. Expected length: {array_len}, found length: {}", a.len() |
168 | 0 | ); |
169 | | } |
170 | | } |
171 | 0 | (ColumnarValue::Scalar(_), array_len) => array_len, |
172 | | } |
173 | | } |
174 | | |
175 | | // If array_len is none, it means there are only scalars, so make a 1 element array |
176 | 0 | let inferred_length = array_len.unwrap_or(1); |
177 | | |
178 | 0 | let args = args |
179 | 0 | .iter() |
180 | 0 | .map(|arg| arg.clone().into_array(inferred_length)) |
181 | 0 | .collect::<Result<Vec<_>>>()?; |
182 | | |
183 | 0 | Ok(args) |
184 | 0 | } |
185 | | |
186 | | /// Cast's this [ColumnarValue] to the specified `DataType` |
187 | 2.38k | pub fn cast_to( |
188 | 2.38k | &self, |
189 | 2.38k | cast_type: &DataType, |
190 | 2.38k | cast_options: Option<&CastOptions<'static>>, |
191 | 2.38k | ) -> Result<ColumnarValue> { |
192 | 2.38k | let cast_options = cast_options.cloned().unwrap_or(DEFAULT_CAST_OPTIONS); |
193 | 2.38k | match self { |
194 | 2.38k | ColumnarValue::Array(array) => Ok(ColumnarValue::Array( |
195 | 2.38k | kernels::cast::cast_with_options(array, cast_type, &cast_options)?0 , |
196 | | )), |
197 | 0 | ColumnarValue::Scalar(scalar) => { |
198 | 0 | let scalar_array = |
199 | 0 | if cast_type == &DataType::Timestamp(TimeUnit::Nanosecond, None) { |
200 | 0 | if let ScalarValue::Float64(Some(float_ts)) = scalar { |
201 | 0 | ScalarValue::Int64(Some( |
202 | 0 | (float_ts * 1_000_000_000_f64).trunc() as i64, |
203 | 0 | )) |
204 | 0 | .to_array()? |
205 | | } else { |
206 | 0 | scalar.to_array()? |
207 | | } |
208 | | } else { |
209 | 0 | scalar.to_array()? |
210 | | }; |
211 | 0 | let cast_array = kernels::cast::cast_with_options( |
212 | 0 | &scalar_array, |
213 | 0 | cast_type, |
214 | 0 | &cast_options, |
215 | 0 | )?; |
216 | 0 | let cast_scalar = ScalarValue::try_from_array(&cast_array, 0)?; |
217 | 0 | Ok(ColumnarValue::Scalar(cast_scalar)) |
218 | | } |
219 | | } |
220 | 2.38k | } |
221 | | } |
222 | | |
223 | | #[cfg(test)] |
224 | | mod tests { |
225 | | use super::*; |
226 | | |
#[test]
fn values_to_arrays() {
    // Shorthands for building the inputs
    let array = |val, len| ColumnarValue::Array(make_array(val, len));
    let scalar = |val| ColumnarValue::Scalar(ScalarValue::Int32(Some(val)));

    // Each case pairs an input with the arrays it is expected to produce
    let cases = vec![
        // empty
        TestCase {
            input: vec![],
            expected: vec![],
        },
        // one array of length 3
        TestCase {
            input: vec![array(1, 3)],
            expected: vec![make_array(1, 3)],
        },
        // two arrays length 3
        TestCase {
            input: vec![array(1, 3), array(2, 3)],
            expected: vec![make_array(1, 3), make_array(2, 3)],
        },
        // array and scalar: the scalar is expanded
        TestCase {
            input: vec![array(1, 3), scalar(100)],
            expected: vec![make_array(1, 3), make_array(100, 3)],
        },
        // scalar and array: the scalar is expanded
        TestCase {
            input: vec![scalar(100), array(1, 3)],
            expected: vec![make_array(100, 3), make_array(1, 3)],
        },
        // multiple scalars and array: both scalars are expanded
        TestCase {
            input: vec![scalar(100), array(1, 3), scalar(200)],
            expected: vec![
                make_array(100, 3),
                make_array(1, 3),
                make_array(200, 3),
            ],
        },
    ];

    cases.into_iter().for_each(TestCase::run);
}
289 | | |
#[test]
#[should_panic(
    expected = "Arguments has mixed length. Expected length: 3, found length: 4"
)]
fn values_to_arrays_mixed_length() {
    // Two arrays whose lengths disagree
    let args = [
        ColumnarValue::Array(make_array(1, 3)),
        ColumnarValue::Array(make_array(2, 4)),
    ];

    // Unwrapping the returned error triggers the expected panic
    ColumnarValue::values_to_arrays(&args).unwrap();
}
301 | | |
#[test]
#[should_panic(
    expected = "Arguments has mixed length. Expected length: 3, found length: 7"
)]
fn values_to_arrays_mixed_length_and_scalar() {
    // A scalar between the two arrays must not hide the length mismatch
    let args = [
        ColumnarValue::Array(make_array(1, 3)),
        ColumnarValue::Scalar(ScalarValue::Int32(Some(100))),
        ColumnarValue::Array(make_array(2, 7)),
    ];

    // Unwrapping the returned error triggers the expected panic
    ColumnarValue::values_to_arrays(&args).unwrap();
}
314 | | |
315 | | struct TestCase { |
316 | | input: Vec<ColumnarValue>, |
317 | | expected: Vec<ArrayRef>, |
318 | | } |
319 | | |
320 | | impl TestCase { |
321 | | fn run(self) { |
322 | | let Self { input, expected } = self; |
323 | | |
324 | | assert_eq!( |
325 | | ColumnarValue::values_to_arrays(&input).unwrap(), |
326 | | expected, |
327 | | "\ninput: {input:?}\nexpected: {expected:?}" |
328 | | ); |
329 | | } |
330 | | } |
331 | | |
332 | | /// Makes an array of length `len` with all elements set to `val` |
333 | | fn make_array(val: i32, len: usize) -> ArrayRef { |
334 | | Arc::new(arrow::array::Int32Array::from(vec![val; len])) |
335 | | } |
336 | | } |