Coverage Report

Created: 2024-10-13 08:39

/Users/andrewlamb/Software/datafusion/datafusion/expr-common/src/columnar_value.rs
Line
Count
Source (jump to first uncovered line)
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
//! [`ColumnarValue`] represents the result of evaluating an expression.
19
20
use arrow::array::ArrayRef;
21
use arrow::array::NullArray;
22
use arrow::compute::{kernels, CastOptions};
23
use arrow::datatypes::{DataType, TimeUnit};
24
use datafusion_common::format::DEFAULT_CAST_OPTIONS;
25
use datafusion_common::{internal_err, Result, ScalarValue};
26
use std::sync::Arc;
27
28
/// The result of evaluating an expression.
29
///
30
/// [`ColumnarValue::Scalar`] represents a single value repeated any number of
31
/// times. This is an important performance optimization for handling values
32
/// that do not change across rows.
33
///
34
/// [`ColumnarValue::Array`] represents a column of data, stored as an Arrow
35
/// [`ArrayRef`]
36
///
37
/// A slice of `ColumnarValue`s logically represents a table, with each column
38
/// having the same number of rows. This means that all `Array`s are the same
39
/// length.
40
///
41
/// # Example
42
///
43
/// A `ColumnarValue::Array` with an array of 5 elements and a
44
/// `ColumnarValue::Scalar` with the value 100
45
///
46
/// ```text
47
/// ┌──────────────┐
48
/// │ ┌──────────┐ │
49
/// │ │   "A"    │ │
50
/// │ ├──────────┤ │
51
/// │ │   "B"    │ │
52
/// │ ├──────────┤ │
53
/// │ │   "C"    │ │
54
/// │ ├──────────┤ │
55
/// │ │   "D"    │ │        ┌──────────────┐
56
/// │ ├──────────┤ │        │ ┌──────────┐ │
57
/// │ │   "E"    │ │        │ │   100    │ │
58
/// │ └──────────┘ │        │ └──────────┘ │
59
/// └──────────────┘        └──────────────┘
60
///
61
///  ColumnarValue::        ColumnarValue::
62
///       Array                 Scalar
63
/// ```
64
///
65
/// Logically represents the following table:
66
///
67
/// | Column 1 | Column 2 |
68
/// | ------- | -------- |
69
/// | A | 100 |
70
/// | B | 100 |
71
/// | C | 100 |
72
/// | D | 100 |
73
/// | E | 100 |
74
///
75
/// # Performance Notes
76
///
77
/// When implementing functions or operators, it is important to consider the
78
/// performance implications of handling scalar values.
79
///
80
/// Because all functions must handle [`ArrayRef`], it is
81
/// convenient to convert [`ColumnarValue::Scalar`]s using
82
/// [`Self::into_array`]. For example, [`ColumnarValue::values_to_arrays`]
83
/// converts multiple columnar values into arrays of the same length.
84
///
85
/// However, it is often much more performant to provide a separate
86
/// implementation that handles scalar values directly.
87
#[derive(Clone, Debug)]
88
pub enum ColumnarValue {
89
    /// Array of values
90
    Array(ArrayRef),
91
    /// A single value
92
    Scalar(ScalarValue),
93
}
94
95
impl From<ArrayRef> for ColumnarValue {
96
0
    fn from(value: ArrayRef) -> Self {
97
0
        ColumnarValue::Array(value)
98
0
    }
99
}
100
101
impl From<ScalarValue> for ColumnarValue {
102
0
    fn from(value: ScalarValue) -> Self {
103
0
        ColumnarValue::Scalar(value)
104
0
    }
105
}
106
107
impl ColumnarValue {
108
264k
    pub fn data_type(&self) -> DataType {
109
264k
        match self {
110
175k
            ColumnarValue::Array(array_value) => array_value.data_type().clone(),
111
88.3k
            ColumnarValue::Scalar(scalar_value) => scalar_value.data_type(),
112
        }
113
264k
    }
114
115
    /// Convert a columnar value into an Arrow [`ArrayRef`] with the specified
116
    /// number of rows. [`Self::Scalar`] is converted by repeating the same
117
    /// scalar multiple times which is not as efficient as handling the scalar
118
    /// directly.
119
    ///
120
    /// See [`Self::values_to_arrays`] to convert multiple columnar values into
121
    /// arrays of the same length.
122
    ///
123
    /// # Errors
124
    ///
125
    /// Errors if `self` is a Scalar that fails to be converted into an array of size `num_rows`
126
129k
    pub fn into_array(self, num_rows: usize) -> Result<ArrayRef> {
127
129k
        Ok(match self {
128
129k
            ColumnarValue::Array(array) => array,
129
68
            ColumnarValue::Scalar(scalar) => scalar.to_array_of_size(num_rows)
?0
,
130
        })
131
129k
    }
132
133
    /// Null columnar values are implemented as a [`NullArray`] in order to carry the batch's
134
    /// `num_rows`
135
0
    pub fn create_null_array(num_rows: usize) -> Self {
136
0
        ColumnarValue::Array(Arc::new(NullArray::new(num_rows)))
137
0
    }
138
139
    /// Converts [`ColumnarValue`]s to [`ArrayRef`]s with the same length.
140
    ///
141
    /// # Performance Note
142
    ///
143
    /// This function expands any [`ScalarValue`] to an array. This expansion
144
    /// permits using a single function in terms of arrays, but it can be
145
    /// inefficient compared to handling the scalar value directly.
146
    ///
147
    /// Thus, it is recommended to provide specialized implementations for
148
    /// scalar values if performance is a concern.
149
    ///
150
    /// # Errors
151
    ///
152
    /// If there are multiple array arguments that have different lengths
153
0
    pub fn values_to_arrays(args: &[ColumnarValue]) -> Result<Vec<ArrayRef>> {
154
0
        if args.is_empty() {
155
0
            return Ok(vec![]);
156
0
        }
157
0
158
0
        let mut array_len = None;
159
0
        for arg in args {
160
0
            array_len = match (arg, array_len) {
161
0
                (ColumnarValue::Array(a), None) => Some(a.len()),
162
0
                (ColumnarValue::Array(a), Some(array_len)) => {
163
0
                    if array_len == a.len() {
164
0
                        Some(array_len)
165
                    } else {
166
0
                        return internal_err!(
167
0
                            "Arguments has mixed length. Expected length: {array_len}, found length: {}", a.len()
168
0
                        );
169
                    }
170
                }
171
0
                (ColumnarValue::Scalar(_), array_len) => array_len,
172
            }
173
        }
174
175
        // If array_len is none, it means there are only scalars, so make a 1 element array
176
0
        let inferred_length = array_len.unwrap_or(1);
177
178
0
        let args = args
179
0
            .iter()
180
0
            .map(|arg| arg.clone().into_array(inferred_length))
181
0
            .collect::<Result<Vec<_>>>()?;
182
183
0
        Ok(args)
184
0
    }
185
186
    /// Casts this [`ColumnarValue`] to the specified [`DataType`]
187
2.38k
    pub fn cast_to(
188
2.38k
        &self,
189
2.38k
        cast_type: &DataType,
190
2.38k
        cast_options: Option<&CastOptions<'static>>,
191
2.38k
    ) -> Result<ColumnarValue> {
192
2.38k
        let cast_options = cast_options.cloned().unwrap_or(DEFAULT_CAST_OPTIONS);
193
2.38k
        match self {
194
2.38k
            ColumnarValue::Array(array) => Ok(ColumnarValue::Array(
195
2.38k
                kernels::cast::cast_with_options(array, cast_type, &cast_options)
?0
,
196
            )),
197
0
            ColumnarValue::Scalar(scalar) => {
198
0
                let scalar_array =
199
0
                    if cast_type == &DataType::Timestamp(TimeUnit::Nanosecond, None) {
200
0
                        if let ScalarValue::Float64(Some(float_ts)) = scalar {
201
0
                            ScalarValue::Int64(Some(
202
0
                                (float_ts * 1_000_000_000_f64).trunc() as i64,
203
0
                            ))
204
0
                            .to_array()?
205
                        } else {
206
0
                            scalar.to_array()?
207
                        }
208
                    } else {
209
0
                        scalar.to_array()?
210
                    };
211
0
                let cast_array = kernels::cast::cast_with_options(
212
0
                    &scalar_array,
213
0
                    cast_type,
214
0
                    &cast_options,
215
0
                )?;
216
0
                let cast_scalar = ScalarValue::try_from_array(&cast_array, 0)?;
217
0
                Ok(ColumnarValue::Scalar(cast_scalar))
218
            }
219
        }
220
2.38k
    }
221
}
222
223
#[cfg(test)]
224
mod tests {
225
    use super::*;
226
227
    #[test]
228
    fn values_to_arrays() {
229
        // (input, expected)
230
        let cases = vec![
231
            // empty
232
            TestCase {
233
                input: vec![],
234
                expected: vec![],
235
            },
236
            // one array of length 3
237
            TestCase {
238
                input: vec![ColumnarValue::Array(make_array(1, 3))],
239
                expected: vec![make_array(1, 3)],
240
            },
241
            // two arrays length 3
242
            TestCase {
243
                input: vec![
244
                    ColumnarValue::Array(make_array(1, 3)),
245
                    ColumnarValue::Array(make_array(2, 3)),
246
                ],
247
                expected: vec![make_array(1, 3), make_array(2, 3)],
248
            },
249
            // array and scalar
250
            TestCase {
251
                input: vec![
252
                    ColumnarValue::Array(make_array(1, 3)),
253
                    ColumnarValue::Scalar(ScalarValue::Int32(Some(100))),
254
                ],
255
                expected: vec![
256
                    make_array(1, 3),
257
                    make_array(100, 3), // scalar is expanded
258
                ],
259
            },
260
            // scalar and array
261
            TestCase {
262
                input: vec![
263
                    ColumnarValue::Scalar(ScalarValue::Int32(Some(100))),
264
                    ColumnarValue::Array(make_array(1, 3)),
265
                ],
266
                expected: vec![
267
                    make_array(100, 3), // scalar is expanded
268
                    make_array(1, 3),
269
                ],
270
            },
271
            // multiple scalars and array
272
            TestCase {
273
                input: vec![
274
                    ColumnarValue::Scalar(ScalarValue::Int32(Some(100))),
275
                    ColumnarValue::Array(make_array(1, 3)),
276
                    ColumnarValue::Scalar(ScalarValue::Int32(Some(200))),
277
                ],
278
                expected: vec![
279
                    make_array(100, 3), // scalar is expanded
280
                    make_array(1, 3),
281
                    make_array(200, 3), // scalar is expanded
282
                ],
283
            },
284
        ];
285
        for case in cases {
286
            case.run();
287
        }
288
    }
289
290
    #[test]
291
    #[should_panic(
292
        expected = "Arguments has mixed length. Expected length: 3, found length: 4"
293
    )]
294
    fn values_to_arrays_mixed_length() {
295
        ColumnarValue::values_to_arrays(&[
296
            ColumnarValue::Array(make_array(1, 3)),
297
            ColumnarValue::Array(make_array(2, 4)),
298
        ])
299
        .unwrap();
300
    }
301
302
    #[test]
303
    #[should_panic(
304
        expected = "Arguments has mixed length. Expected length: 3, found length: 7"
305
    )]
306
    fn values_to_arrays_mixed_length_and_scalar() {
307
        ColumnarValue::values_to_arrays(&[
308
            ColumnarValue::Array(make_array(1, 3)),
309
            ColumnarValue::Scalar(ScalarValue::Int32(Some(100))),
310
            ColumnarValue::Array(make_array(2, 7)),
311
        ])
312
        .unwrap();
313
    }
314
315
    struct TestCase {
316
        input: Vec<ColumnarValue>,
317
        expected: Vec<ArrayRef>,
318
    }
319
320
    impl TestCase {
321
        fn run(self) {
322
            let Self { input, expected } = self;
323
324
            assert_eq!(
325
                ColumnarValue::values_to_arrays(&input).unwrap(),
326
                expected,
327
                "\ninput: {input:?}\nexpected: {expected:?}"
328
            );
329
        }
330
    }
331
332
    /// Makes an array of length `len` with all elements set to `val`
333
    fn make_array(val: i32, len: usize) -> ArrayRef {
334
        Arc::new(arrow::array::Int32Array::from(vec![val; len]))
335
    }
336
}