/Users/andrewlamb/Software/datafusion/datafusion/physical-expr/src/scalar_function.rs
Line | Count | Source (jump to first uncovered line) |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | //! Declaration of built-in (scalar) functions. |
19 | | //! This module contains built-in functions' enumeration and metadata. |
20 | | //! |
21 | | //! Generally, a function has: |
22 | | //! * a signature |
23 | | //! * a return type, that is a function of the incoming argument's types |
24 | | //! * the computation, that must accept each valid signature |
25 | | //! |
26 | | //! * Signature: see `Signature` |
27 | | //! * Return type: a function `(arg_types) -> return_type`. E.g. for sqrt, ([f32]) -> f32, ([f64]) -> f64. |
28 | | //! |
29 | | //! This module also has a set of coercion rules to improve user experience: if an argument i32 is passed |
30 | | //! to a function that supports f64, it is coerced to f64. |
31 | | |
32 | | use std::any::Any; |
33 | | use std::fmt::{self, Debug, Formatter}; |
34 | | use std::hash::{Hash, Hasher}; |
35 | | use std::sync::Arc; |
36 | | |
37 | | use crate::physical_expr::{down_cast_any_ref, physical_exprs_equal}; |
38 | | use crate::PhysicalExpr; |
39 | | |
40 | | use arrow::datatypes::{DataType, Schema}; |
41 | | use arrow::record_batch::RecordBatch; |
42 | | use datafusion_common::{internal_err, DFSchema, Result}; |
43 | | use datafusion_expr::interval_arithmetic::Interval; |
44 | | use datafusion_expr::sort_properties::ExprProperties; |
45 | | use datafusion_expr::type_coercion::functions::data_types_with_scalar_udf; |
46 | | use datafusion_expr::{expr_vec_fmt, ColumnarValue, Expr, ScalarUDF}; |
47 | | |
48 | | /// Physical expression of a scalar function |
49 | | pub struct ScalarFunctionExpr { |
50 | | fun: Arc<ScalarUDF>, |
51 | | name: String, |
52 | | args: Vec<Arc<dyn PhysicalExpr>>, |
53 | | return_type: DataType, |
54 | | nullable: bool, |
55 | | } |
56 | | |
57 | | impl Debug for ScalarFunctionExpr { |
58 | 0 | fn fmt(&self, f: &mut Formatter) -> fmt::Result { |
59 | 0 | f.debug_struct("ScalarFunctionExpr") |
60 | 0 | .field("fun", &"<FUNC>") |
61 | 0 | .field("name", &self.name) |
62 | 0 | .field("args", &self.args) |
63 | 0 | .field("return_type", &self.return_type) |
64 | 0 | .finish() |
65 | 0 | } |
66 | | } |
67 | | |
68 | | impl ScalarFunctionExpr { |
69 | | /// Create a new Scalar function |
70 | 0 | pub fn new( |
71 | 0 | name: &str, |
72 | 0 | fun: Arc<ScalarUDF>, |
73 | 0 | args: Vec<Arc<dyn PhysicalExpr>>, |
74 | 0 | return_type: DataType, |
75 | 0 | ) -> Self { |
76 | 0 | Self { |
77 | 0 | fun, |
78 | 0 | name: name.to_owned(), |
79 | 0 | args, |
80 | 0 | return_type, |
81 | 0 | nullable: true, |
82 | 0 | } |
83 | 0 | } |
84 | | |
85 | | /// Get the scalar function implementation |
86 | 0 | pub fn fun(&self) -> &ScalarUDF { |
87 | 0 | &self.fun |
88 | 0 | } |
89 | | |
90 | | /// The name for this expression |
91 | 0 | pub fn name(&self) -> &str { |
92 | 0 | &self.name |
93 | 0 | } |
94 | | |
95 | | /// Input arguments |
96 | 0 | pub fn args(&self) -> &[Arc<dyn PhysicalExpr>] { |
97 | 0 | &self.args |
98 | 0 | } |
99 | | |
100 | | /// Data type produced by this expression |
101 | 0 | pub fn return_type(&self) -> &DataType { |
102 | 0 | &self.return_type |
103 | 0 | } |
104 | | |
105 | 0 | pub fn with_nullable(mut self, nullable: bool) -> Self { |
106 | 0 | self.nullable = nullable; |
107 | 0 | self |
108 | 0 | } |
109 | | |
110 | 0 | pub fn nullable(&self) -> bool { |
111 | 0 | self.nullable |
112 | 0 | } |
113 | | } |
114 | | |
115 | | impl fmt::Display for ScalarFunctionExpr { |
116 | 0 | fn fmt(&self, f: &mut Formatter) -> fmt::Result { |
117 | 0 | write!(f, "{}({})", self.name, expr_vec_fmt!(self.args)) |
118 | 0 | } |
119 | | } |
120 | | |
121 | | impl PhysicalExpr for ScalarFunctionExpr { |
122 | | /// Return a reference to Any that can be used for downcasting |
123 | 0 | fn as_any(&self) -> &dyn Any { |
124 | 0 | self |
125 | 0 | } |
126 | | |
127 | 0 | fn data_type(&self, _input_schema: &Schema) -> Result<DataType> { |
128 | 0 | Ok(self.return_type.clone()) |
129 | 0 | } |
130 | | |
131 | 0 | fn nullable(&self, _input_schema: &Schema) -> Result<bool> { |
132 | 0 | Ok(self.nullable) |
133 | 0 | } |
134 | | |
135 | 0 | fn evaluate(&self, batch: &RecordBatch) -> Result<ColumnarValue> { |
136 | 0 | let inputs = self |
137 | 0 | .args |
138 | 0 | .iter() |
139 | 0 | .map(|e| e.evaluate(batch)) |
140 | 0 | .collect::<Result<Vec<_>>>()?; |
141 | | |
142 | | // evaluate the function |
143 | 0 | let output = match self.args.is_empty() { |
144 | 0 | true => self.fun.invoke_no_args(batch.num_rows()), |
145 | 0 | false => self.fun.invoke(&inputs), |
146 | 0 | }?; |
147 | | |
148 | 0 | if let ColumnarValue::Array(array) = &output { |
149 | 0 | if array.len() != batch.num_rows() { |
150 | 0 | return internal_err!("UDF returned a different number of rows than expected. Expected: {}, Got: {}", |
151 | 0 | batch.num_rows(), array.len()); |
152 | 0 | } |
153 | 0 | } |
154 | 0 | Ok(output) |
155 | 0 | } |
156 | | |
157 | 0 | fn children(&self) -> Vec<&Arc<dyn PhysicalExpr>> { |
158 | 0 | self.args.iter().collect() |
159 | 0 | } |
160 | | |
161 | 0 | fn with_new_children( |
162 | 0 | self: Arc<Self>, |
163 | 0 | children: Vec<Arc<dyn PhysicalExpr>>, |
164 | 0 | ) -> Result<Arc<dyn PhysicalExpr>> { |
165 | 0 | Ok(Arc::new( |
166 | 0 | ScalarFunctionExpr::new( |
167 | 0 | &self.name, |
168 | 0 | Arc::clone(&self.fun), |
169 | 0 | children, |
170 | 0 | self.return_type().clone(), |
171 | 0 | ) |
172 | 0 | .with_nullable(self.nullable), |
173 | 0 | )) |
174 | 0 | } |
175 | | |
176 | 0 | fn evaluate_bounds(&self, children: &[&Interval]) -> Result<Interval> { |
177 | 0 | self.fun.evaluate_bounds(children) |
178 | 0 | } |
179 | | |
180 | 0 | fn propagate_constraints( |
181 | 0 | &self, |
182 | 0 | interval: &Interval, |
183 | 0 | children: &[&Interval], |
184 | 0 | ) -> Result<Option<Vec<Interval>>> { |
185 | 0 | self.fun.propagate_constraints(interval, children) |
186 | 0 | } |
187 | | |
188 | 0 | fn dyn_hash(&self, state: &mut dyn Hasher) { |
189 | 0 | let mut s = state; |
190 | 0 | self.name.hash(&mut s); |
191 | 0 | self.args.hash(&mut s); |
192 | 0 | self.return_type.hash(&mut s); |
193 | 0 | // Add `self.fun` when hash is available |
194 | 0 | } |
195 | | |
196 | 0 | fn get_properties(&self, children: &[ExprProperties]) -> Result<ExprProperties> { |
197 | 0 | let sort_properties = self.fun.output_ordering(children)?; |
198 | 0 | let children_range = children |
199 | 0 | .iter() |
200 | 0 | .map(|props| &props.range) |
201 | 0 | .collect::<Vec<_>>(); |
202 | 0 | let range = self.fun().evaluate_bounds(&children_range)?; |
203 | | |
204 | 0 | Ok(ExprProperties { |
205 | 0 | sort_properties, |
206 | 0 | range, |
207 | 0 | }) |
208 | 0 | } |
209 | | } |
210 | | |
211 | | impl PartialEq<dyn Any> for ScalarFunctionExpr { |
212 | | /// Comparing name, args and return_type |
213 | 0 | fn eq(&self, other: &dyn Any) -> bool { |
214 | 0 | down_cast_any_ref(other) |
215 | 0 | .downcast_ref::<Self>() |
216 | 0 | .map(|x| { |
217 | 0 | self.name == x.name |
218 | 0 | && physical_exprs_equal(&self.args, &x.args) |
219 | 0 | && self.return_type == x.return_type |
220 | 0 | }) |
221 | 0 | .unwrap_or(false) |
222 | 0 | } |
223 | | } |
224 | | |
225 | | /// Create a physical expression for the UDF. |
226 | 0 | pub fn create_physical_expr( |
227 | 0 | fun: &ScalarUDF, |
228 | 0 | input_phy_exprs: &[Arc<dyn PhysicalExpr>], |
229 | 0 | input_schema: &Schema, |
230 | 0 | args: &[Expr], |
231 | 0 | input_dfschema: &DFSchema, |
232 | 0 | ) -> Result<Arc<dyn PhysicalExpr>> { |
233 | 0 | let input_expr_types = input_phy_exprs |
234 | 0 | .iter() |
235 | 0 | .map(|e| e.data_type(input_schema)) |
236 | 0 | .collect::<Result<Vec<_>>>()?; |
237 | | |
238 | | // verify that input data types is consistent with function's `TypeSignature` |
239 | 0 | data_types_with_scalar_udf(&input_expr_types, fun)?; |
240 | | |
241 | | // Since we have arg_types, we dont need args and schema. |
242 | 0 | let return_type = |
243 | 0 | fun.return_type_from_exprs(args, input_dfschema, &input_expr_types)?; |
244 | | |
245 | 0 | Ok(Arc::new( |
246 | 0 | ScalarFunctionExpr::new( |
247 | 0 | fun.name(), |
248 | 0 | Arc::new(fun.clone()), |
249 | 0 | input_phy_exprs.to_vec(), |
250 | 0 | return_type, |
251 | 0 | ) |
252 | 0 | .with_nullable(fun.is_nullable(args, input_dfschema)), |
253 | 0 | )) |
254 | 0 | } |