/Users/andrewlamb/Software/datafusion/datafusion/physical-expr/src/planner.rs
Line | Count | Source (jump to first uncovered line) |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | use std::sync::Arc; |
19 | | |
20 | | use crate::scalar_function; |
21 | | use crate::{ |
22 | | expressions::{self, binary, like, similar_to, Column, Literal}, |
23 | | PhysicalExpr, |
24 | | }; |
25 | | |
26 | | use arrow::datatypes::Schema; |
27 | | use datafusion_common::{ |
28 | | exec_err, not_impl_err, plan_err, DFSchema, Result, ScalarValue, ToDFSchema, |
29 | | }; |
30 | | use datafusion_expr::execution_props::ExecutionProps; |
31 | | use datafusion_expr::expr::{Alias, Cast, InList, ScalarFunction}; |
32 | | use datafusion_expr::var_provider::is_system_variables; |
33 | | use datafusion_expr::var_provider::VarType; |
34 | | use datafusion_expr::{ |
35 | | binary_expr, lit, Between, BinaryExpr, Expr, Like, Operator, TryCast, |
36 | | }; |
37 | | |
38 | | /// [PhysicalExpr] evaluate DataFusion expressions such as `A + 1`, or `CAST(c1 |
39 | | /// AS int)`. |
40 | | /// |
41 | | /// [PhysicalExpr] are the physical counterpart to [Expr] used in logical |
42 | | /// planning, and can be evaluated directly on a [RecordBatch]. They are |
43 | | /// normally created from [Expr] by a [PhysicalPlanner] and can be created |
44 | | /// directly using [create_physical_expr]. |
45 | | /// |
46 | | /// A Physical expression knows its type, nullability and how to evaluate itself. |
47 | | /// |
48 | | /// [PhysicalPlanner]: https://docs.rs/datafusion/latest/datafusion/physical_planner/trait.PhysicalPlanner.html |
49 | | /// [RecordBatch]: https://docs.rs/arrow/latest/arrow/record_batch/struct.RecordBatch.html |
50 | | /// |
51 | | /// # Example: Create `PhysicalExpr` from `Expr` |
52 | | /// ``` |
53 | | /// # use arrow::datatypes::{DataType, Field, Schema}; |
54 | | /// # use datafusion_common::DFSchema; |
55 | | /// # use datafusion_expr::{Expr, col, lit}; |
56 | | /// # use datafusion_physical_expr::create_physical_expr; |
57 | | /// # use datafusion_expr::execution_props::ExecutionProps; |
58 | | /// // For a logical expression `a = 1`, we can create a physical expression |
59 | | /// let expr = col("a").eq(lit(1)); |
60 | | /// // To create a PhysicalExpr we need 1. a schema |
61 | | /// let schema = Schema::new(vec![Field::new("a", DataType::Int32, true)]); |
62 | | /// let df_schema = DFSchema::try_from(schema).unwrap(); |
63 | | /// // 2. ExecutionProps |
64 | | /// let props = ExecutionProps::new(); |
65 | | /// // We can now create a PhysicalExpr: |
66 | | /// let physical_expr = create_physical_expr(&expr, &df_schema, &props).unwrap(); |
67 | | /// ``` |
68 | | /// |
69 | | /// # Example: Executing a PhysicalExpr to obtain [ColumnarValue] |
70 | | /// ``` |
71 | | /// # use std::sync::Arc; |
72 | | /// # use arrow::array::{cast::AsArray, BooleanArray, Int32Array, RecordBatch}; |
73 | | /// # use arrow::datatypes::{DataType, Field, Schema}; |
74 | | /// # use datafusion_common::{assert_batches_eq, DFSchema}; |
75 | | /// # use datafusion_expr::{Expr, col, lit, ColumnarValue}; |
76 | | /// # use datafusion_physical_expr::create_physical_expr; |
77 | | /// # use datafusion_expr::execution_props::ExecutionProps; |
78 | | /// # let expr = col("a").eq(lit(1)); |
79 | | /// # let schema = Schema::new(vec![Field::new("a", DataType::Int32, true)]); |
80 | | /// # let df_schema = DFSchema::try_from(schema.clone()).unwrap(); |
81 | | /// # let props = ExecutionProps::new(); |
82 | | /// // Given a PhysicalExpr, for `a = 1` we can evaluate it against a RecordBatch like this: |
83 | | /// let physical_expr = create_physical_expr(&expr, &df_schema, &props).unwrap(); |
84 | | /// // Input of [1,2,3] |
85 | | /// let input_batch = RecordBatch::try_from_iter(vec![ |
86 | | /// ("a", Arc::new(Int32Array::from(vec![1, 2, 3])) as _) |
87 | | /// ]).unwrap(); |
88 | | /// // The result is a ColumnarValue (either an Array or a Scalar) |
89 | | /// let result = physical_expr.evaluate(&input_batch).unwrap(); |
90 | | /// // In this case, a BooleanArray with the result of the comparison |
91 | | /// let ColumnarValue::Array(arr) = result else { |
92 | | /// panic!("Expected an array") |
93 | | /// }; |
94 | | /// assert_eq!(arr.as_boolean(), &BooleanArray::from(vec![true, false, false])); |
95 | | /// ``` |
96 | | /// |
97 | | /// [ColumnarValue]: datafusion_expr::ColumnarValue |
98 | | /// |
99 | | /// Create a physical expression from a logical expression ([Expr]). |
100 | | /// |
101 | | /// # Arguments |
102 | | /// |
103 | | /// * `e` - The logical expression |
104 | | /// * `input_dfschema` - The DataFusion schema for the input, used to resolve `Column` references |
105 | | /// to qualified or unqualified fields by name. |
106 | 0 | pub fn create_physical_expr( |
107 | 0 | e: &Expr, |
108 | 0 | input_dfschema: &DFSchema, |
109 | 0 | execution_props: &ExecutionProps, |
110 | 0 | ) -> Result<Arc<dyn PhysicalExpr>> { |
111 | 0 | let input_schema: &Schema = &input_dfschema.into(); |
112 | 0 |
|
113 | 0 | match e { |
114 | 0 | Expr::Alias(Alias { expr, .. }) => { |
115 | 0 | Ok(create_physical_expr(expr, input_dfschema, execution_props)?) |
116 | | } |
117 | 0 | Expr::Column(c) => { |
118 | 0 | let idx = input_dfschema.index_of_column(c)?; |
119 | 0 | Ok(Arc::new(Column::new(&c.name, idx))) |
120 | | } |
121 | 0 | Expr::Literal(value) => Ok(Arc::new(Literal::new(value.clone()))), |
122 | 0 | Expr::ScalarVariable(_, variable_names) => { |
123 | 0 | if is_system_variables(variable_names) { |
124 | 0 | match execution_props.get_var_provider(VarType::System) { |
125 | 0 | Some(provider) => { |
126 | 0 | let scalar_value = provider.get_value(variable_names.clone())?; |
127 | 0 | Ok(Arc::new(Literal::new(scalar_value))) |
128 | | } |
129 | 0 | _ => plan_err!("No system variable provider found"), |
130 | | } |
131 | | } else { |
132 | 0 | match execution_props.get_var_provider(VarType::UserDefined) { |
133 | 0 | Some(provider) => { |
134 | 0 | let scalar_value = provider.get_value(variable_names.clone())?; |
135 | 0 | Ok(Arc::new(Literal::new(scalar_value))) |
136 | | } |
137 | 0 | _ => plan_err!("No user defined variable provider found"), |
138 | | } |
139 | | } |
140 | | } |
141 | 0 | Expr::IsTrue(expr) => { |
142 | 0 | let binary_op = binary_expr( |
143 | 0 | expr.as_ref().clone(), |
144 | 0 | Operator::IsNotDistinctFrom, |
145 | 0 | lit(true), |
146 | 0 | ); |
147 | 0 | create_physical_expr(&binary_op, input_dfschema, execution_props) |
148 | | } |
149 | 0 | Expr::IsNotTrue(expr) => { |
150 | 0 | let binary_op = |
151 | 0 | binary_expr(expr.as_ref().clone(), Operator::IsDistinctFrom, lit(true)); |
152 | 0 | create_physical_expr(&binary_op, input_dfschema, execution_props) |
153 | | } |
154 | 0 | Expr::IsFalse(expr) => { |
155 | 0 | let binary_op = binary_expr( |
156 | 0 | expr.as_ref().clone(), |
157 | 0 | Operator::IsNotDistinctFrom, |
158 | 0 | lit(false), |
159 | 0 | ); |
160 | 0 | create_physical_expr(&binary_op, input_dfschema, execution_props) |
161 | | } |
162 | 0 | Expr::IsNotFalse(expr) => { |
163 | 0 | let binary_op = |
164 | 0 | binary_expr(expr.as_ref().clone(), Operator::IsDistinctFrom, lit(false)); |
165 | 0 | create_physical_expr(&binary_op, input_dfschema, execution_props) |
166 | | } |
167 | 0 | Expr::IsUnknown(expr) => { |
168 | 0 | let binary_op = binary_expr( |
169 | 0 | expr.as_ref().clone(), |
170 | 0 | Operator::IsNotDistinctFrom, |
171 | 0 | Expr::Literal(ScalarValue::Boolean(None)), |
172 | 0 | ); |
173 | 0 | create_physical_expr(&binary_op, input_dfschema, execution_props) |
174 | | } |
175 | 0 | Expr::IsNotUnknown(expr) => { |
176 | 0 | let binary_op = binary_expr( |
177 | 0 | expr.as_ref().clone(), |
178 | 0 | Operator::IsDistinctFrom, |
179 | 0 | Expr::Literal(ScalarValue::Boolean(None)), |
180 | 0 | ); |
181 | 0 | create_physical_expr(&binary_op, input_dfschema, execution_props) |
182 | | } |
183 | 0 | Expr::BinaryExpr(BinaryExpr { left, op, right }) => { |
184 | | // Create physical expressions for left and right operands |
185 | 0 | let lhs = create_physical_expr(left, input_dfschema, execution_props)?; |
186 | 0 | let rhs = create_physical_expr(right, input_dfschema, execution_props)?; |
187 | | // Note that the logical planner is responsible |
188 | | // for type coercion on the arguments (e.g. if one |
189 | | // argument was originally Int32 and one was |
190 | | // Int64 they will both be coerced to Int64). |
191 | | // |
192 | | // There should be no coercion during physical |
193 | | // planning. |
194 | 0 | binary(lhs, *op, rhs, input_schema) |
195 | | } |
196 | | Expr::Like(Like { |
197 | 0 | negated, |
198 | 0 | expr, |
199 | 0 | pattern, |
200 | 0 | escape_char, |
201 | 0 | case_insensitive, |
202 | 0 | }) => { |
203 | 0 | if escape_char.is_some() { |
204 | 0 | return exec_err!("LIKE does not support escape_char"); |
205 | 0 | } |
206 | 0 | let physical_expr = |
207 | 0 | create_physical_expr(expr, input_dfschema, execution_props)?; |
208 | 0 | let physical_pattern = |
209 | 0 | create_physical_expr(pattern, input_dfschema, execution_props)?; |
210 | 0 | like( |
211 | 0 | *negated, |
212 | 0 | *case_insensitive, |
213 | 0 | physical_expr, |
214 | 0 | physical_pattern, |
215 | 0 | input_schema, |
216 | 0 | ) |
217 | | } |
218 | | Expr::SimilarTo(Like { |
219 | 0 | negated, |
220 | 0 | expr, |
221 | 0 | pattern, |
222 | 0 | escape_char, |
223 | 0 | case_insensitive, |
224 | 0 | }) => { |
225 | 0 | if escape_char.is_some() { |
226 | 0 | return exec_err!("SIMILAR TO does not support escape_char yet"); |
227 | 0 | } |
228 | 0 | let physical_expr = |
229 | 0 | create_physical_expr(expr, input_dfschema, execution_props)?; |
230 | 0 | let physical_pattern = |
231 | 0 | create_physical_expr(pattern, input_dfschema, execution_props)?; |
232 | 0 | similar_to(*negated, *case_insensitive, physical_expr, physical_pattern) |
233 | | } |
234 | 0 | Expr::Case(case) => { |
235 | 0 | let expr: Option<Arc<dyn PhysicalExpr>> = if let Some(e) = &case.expr { |
236 | 0 | Some(create_physical_expr( |
237 | 0 | e.as_ref(), |
238 | 0 | input_dfschema, |
239 | 0 | execution_props, |
240 | 0 | )?) |
241 | | } else { |
242 | 0 | None |
243 | | }; |
244 | 0 | let (when_expr, then_expr): (Vec<&Expr>, Vec<&Expr>) = case |
245 | 0 | .when_then_expr |
246 | 0 | .iter() |
247 | 0 | .map(|(w, t)| (w.as_ref(), t.as_ref())) |
248 | 0 | .unzip(); |
249 | 0 | let when_expr = |
250 | 0 | create_physical_exprs(when_expr, input_dfschema, execution_props)?; |
251 | 0 | let then_expr = |
252 | 0 | create_physical_exprs(then_expr, input_dfschema, execution_props)?; |
253 | 0 | let when_then_expr: Vec<(Arc<dyn PhysicalExpr>, Arc<dyn PhysicalExpr>)> = |
254 | 0 | when_expr |
255 | 0 | .iter() |
256 | 0 | .zip(then_expr.iter()) |
257 | 0 | .map(|(w, t)| (Arc::clone(w), Arc::clone(t))) |
258 | 0 | .collect(); |
259 | 0 | let else_expr: Option<Arc<dyn PhysicalExpr>> = |
260 | 0 | if let Some(e) = &case.else_expr { |
261 | 0 | Some(create_physical_expr( |
262 | 0 | e.as_ref(), |
263 | 0 | input_dfschema, |
264 | 0 | execution_props, |
265 | 0 | )?) |
266 | | } else { |
267 | 0 | None |
268 | | }; |
269 | 0 | Ok(expressions::case(expr, when_then_expr, else_expr)?) |
270 | | } |
271 | 0 | Expr::Cast(Cast { expr, data_type }) => expressions::cast( |
272 | 0 | create_physical_expr(expr, input_dfschema, execution_props)?, |
273 | 0 | input_schema, |
274 | 0 | data_type.clone(), |
275 | | ), |
276 | 0 | Expr::TryCast(TryCast { expr, data_type }) => expressions::try_cast( |
277 | 0 | create_physical_expr(expr, input_dfschema, execution_props)?, |
278 | 0 | input_schema, |
279 | 0 | data_type.clone(), |
280 | | ), |
281 | 0 | Expr::Not(expr) => { |
282 | 0 | expressions::not(create_physical_expr(expr, input_dfschema, execution_props)?) |
283 | | } |
284 | 0 | Expr::Negative(expr) => expressions::negative( |
285 | 0 | create_physical_expr(expr, input_dfschema, execution_props)?, |
286 | 0 | input_schema, |
287 | | ), |
288 | 0 | Expr::IsNull(expr) => expressions::is_null(create_physical_expr( |
289 | 0 | expr, |
290 | 0 | input_dfschema, |
291 | 0 | execution_props, |
292 | 0 | )?), |
293 | 0 | Expr::IsNotNull(expr) => expressions::is_not_null(create_physical_expr( |
294 | 0 | expr, |
295 | 0 | input_dfschema, |
296 | 0 | execution_props, |
297 | 0 | )?), |
298 | 0 | Expr::ScalarFunction(ScalarFunction { func, args }) => { |
299 | 0 | let physical_args = |
300 | 0 | create_physical_exprs(args, input_dfschema, execution_props)?; |
301 | | |
302 | 0 | scalar_function::create_physical_expr( |
303 | 0 | Arc::clone(func).as_ref(), |
304 | 0 | &physical_args, |
305 | 0 | input_schema, |
306 | 0 | args, |
307 | 0 | input_dfschema, |
308 | 0 | ) |
309 | | } |
310 | | Expr::Between(Between { |
311 | 0 | expr, |
312 | 0 | negated, |
313 | 0 | low, |
314 | 0 | high, |
315 | | }) => { |
316 | 0 | let value_expr = create_physical_expr(expr, input_dfschema, execution_props)?; |
317 | 0 | let low_expr = create_physical_expr(low, input_dfschema, execution_props)?; |
318 | 0 | let high_expr = create_physical_expr(high, input_dfschema, execution_props)?; |
319 | | |
320 | | // rewrite the between into the two binary operators |
321 | 0 | let binary_expr = binary( |
322 | 0 | binary( |
323 | 0 | Arc::clone(&value_expr), |
324 | 0 | Operator::GtEq, |
325 | 0 | low_expr, |
326 | 0 | input_schema, |
327 | 0 | )?, |
328 | 0 | Operator::And, |
329 | 0 | binary( |
330 | 0 | Arc::clone(&value_expr), |
331 | 0 | Operator::LtEq, |
332 | 0 | high_expr, |
333 | 0 | input_schema, |
334 | 0 | )?, |
335 | 0 | input_schema, |
336 | 0 | ); |
337 | 0 |
|
338 | 0 | if *negated { |
339 | 0 | expressions::not(binary_expr?) |
340 | | } else { |
341 | 0 | binary_expr |
342 | | } |
343 | | } |
344 | | Expr::InList(InList { |
345 | 0 | expr, |
346 | 0 | list, |
347 | 0 | negated, |
348 | 0 | }) => match expr.as_ref() { |
349 | | Expr::Literal(ScalarValue::Utf8(None)) => { |
350 | 0 | Ok(expressions::lit(ScalarValue::Boolean(None))) |
351 | | } |
352 | | _ => { |
353 | 0 | let value_expr = |
354 | 0 | create_physical_expr(expr, input_dfschema, execution_props)?; |
355 | | |
356 | 0 | let list_exprs = |
357 | 0 | create_physical_exprs(list, input_dfschema, execution_props)?; |
358 | 0 | expressions::in_list(value_expr, list_exprs, negated, input_schema) |
359 | | } |
360 | | }, |
361 | 0 | other => { |
362 | 0 | not_impl_err!("Physical plan does not support logical expression {other:?}") |
363 | | } |
364 | | } |
365 | 0 | } |
366 | | |
367 | | /// Create vector of Physical Expression from a vector of logical expression |
368 | 0 | pub fn create_physical_exprs<'a, I>( |
369 | 0 | exprs: I, |
370 | 0 | input_dfschema: &DFSchema, |
371 | 0 | execution_props: &ExecutionProps, |
372 | 0 | ) -> Result<Vec<Arc<dyn PhysicalExpr>>> |
373 | 0 | where |
374 | 0 | I: IntoIterator<Item = &'a Expr>, |
375 | 0 | { |
376 | 0 | exprs |
377 | 0 | .into_iter() |
378 | 0 | .map(|expr| create_physical_expr(expr, input_dfschema, execution_props)) |
379 | 0 | .collect::<Result<Vec<_>>>() |
380 | 0 | } |
381 | | |
382 | | /// Convert a logical expression to a physical expression (without any simplification, etc) |
383 | 0 | pub fn logical2physical(expr: &Expr, schema: &Schema) -> Arc<dyn PhysicalExpr> { |
384 | 0 | let df_schema = schema.clone().to_dfschema().unwrap(); |
385 | 0 | let execution_props = ExecutionProps::new(); |
386 | 0 | create_physical_expr(expr, &df_schema, &execution_props).unwrap() |
387 | 0 | } |
388 | | |
#[cfg(test)]
mod tests {
    use arrow_array::{ArrayRef, BooleanArray, RecordBatch, StringArray};
    use arrow_schema::{DataType, Field};

    use datafusion_expr::{col, lit};

    use super::*;

    /// Plans `letter = 'A'` against a qualified schema and checks the
    /// comparison result for a four-row batch.
    #[test]
    fn test_create_physical_expr_scalar_input_output() -> Result<()> {
        // Only the first row ("A") is expected to match.
        let expected: ArrayRef =
            Arc::new(BooleanArray::from(vec![true, false, false, false]));

        let schema = Schema::new(vec![Field::new("letter", DataType::Utf8, false)]);
        let qualified_schema = DFSchema::try_from_qualified_schema("data", &schema)?;

        let predicate = col("letter").eq(lit("A"));
        let physical = create_physical_expr(
            &predicate,
            &qualified_schema,
            &ExecutionProps::new(),
        )?;

        let letters: ArrayRef =
            Arc::new(StringArray::from_iter_values(vec!["A", "B", "C", "D"]));
        let batch = RecordBatch::try_new(Arc::new(schema), vec![letters])?;

        let actual = physical
            .evaluate(&batch)?
            .into_array(4)
            .expect("Failed to convert to array");

        assert_eq!(&actual, &expected);

        Ok(())
    }
}