/Users/andrewlamb/Software/datafusion/datafusion/physical-plan/src/projection.rs
Line | Count | Source (jump to first uncovered line) |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | //! Defines the projection execution plan. A projection determines which columns or expressions |
19 | | //! are returned from a query. The SQL statement `SELECT a, b, a+b FROM t1` is an example |
20 | | //! of a projection on table `t1` where the expressions `a`, `b`, and `a+b` are the |
21 | | //! projection expressions. `SELECT` without `FROM` will only evaluate expressions. |
22 | | |
23 | | use std::any::Any; |
24 | | use std::collections::HashMap; |
25 | | use std::pin::Pin; |
26 | | use std::sync::Arc; |
27 | | use std::task::{Context, Poll}; |
28 | | |
29 | | use super::expressions::Column; |
30 | | use super::metrics::{BaselineMetrics, ExecutionPlanMetricsSet, MetricsSet}; |
31 | | use super::{ |
32 | | DisplayAs, ExecutionPlanProperties, PlanProperties, RecordBatchStream, |
33 | | SendableRecordBatchStream, Statistics, |
34 | | }; |
35 | | use crate::{ColumnStatistics, DisplayFormatType, ExecutionPlan, PhysicalExpr}; |
36 | | |
37 | | use arrow::datatypes::{Field, Schema, SchemaRef}; |
38 | | use arrow::record_batch::{RecordBatch, RecordBatchOptions}; |
39 | | use datafusion_common::stats::Precision; |
40 | | use datafusion_common::Result; |
41 | | use datafusion_execution::TaskContext; |
42 | | use datafusion_physical_expr::equivalence::ProjectionMapping; |
43 | | use datafusion_physical_expr::expressions::Literal; |
44 | | |
45 | | use futures::stream::{Stream, StreamExt}; |
46 | | use log::trace; |
47 | | |
48 | | /// Execution plan for a projection |
49 | | #[derive(Debug, Clone)] |
50 | | pub struct ProjectionExec { |
51 | | /// The projection expressions stored as tuples of (expression, output column name) |
52 | | pub(crate) expr: Vec<(Arc<dyn PhysicalExpr>, String)>, |
53 | | /// The schema once the projection has been applied to the input |
54 | | schema: SchemaRef, |
55 | | /// The input plan |
56 | | input: Arc<dyn ExecutionPlan>, |
57 | | /// Execution metrics |
58 | | metrics: ExecutionPlanMetricsSet, |
59 | | /// Cache holding plan properties like equivalences, output partitioning etc. |
60 | | cache: PlanProperties, |
61 | | } |
62 | | |
63 | | impl ProjectionExec { |
64 | | /// Create a projection on an input |
65 | 2 | pub fn try_new( |
66 | 2 | expr: Vec<(Arc<dyn PhysicalExpr>, String)>, |
67 | 2 | input: Arc<dyn ExecutionPlan>, |
68 | 2 | ) -> Result<Self> { |
69 | 2 | let input_schema = input.schema(); |
70 | 2 | |
71 | 2 | let fields: Result<Vec<Field>> = expr |
72 | 2 | .iter() |
73 | 3 | .map(|(e, name)| { |
74 | 3 | let mut field = Field::new( |
75 | 3 | name, |
76 | 3 | e.data_type(&input_schema)?0 , |
77 | 3 | e.nullable(&input_schema)?0 , |
78 | | ); |
79 | 3 | field.set_metadata( |
80 | 3 | get_field_metadata(e, &input_schema).unwrap_or_default(), |
81 | 3 | ); |
82 | 3 | |
83 | 3 | Ok(field) |
84 | 3 | }) |
85 | 2 | .collect(); |
86 | | |
87 | 2 | let schema = Arc::new(Schema::new_with_metadata( |
88 | 2 | fields?0 , |
89 | 2 | input_schema.metadata().clone(), |
90 | | )); |
91 | | |
92 | | // construct a map from the input expressions to the output expression of the Projection |
93 | 2 | let projection_mapping = ProjectionMapping::try_new(&expr, &input_schema)?0 ; |
94 | 2 | let cache = |
95 | 2 | Self::compute_properties(&input, &projection_mapping, Arc::clone(&schema))?0 ; |
96 | 2 | Ok(Self { |
97 | 2 | expr, |
98 | 2 | schema, |
99 | 2 | input, |
100 | 2 | metrics: ExecutionPlanMetricsSet::new(), |
101 | 2 | cache, |
102 | 2 | }) |
103 | 2 | } |
104 | | |
105 | | /// The projection expressions stored as tuples of (expression, output column name) |
106 | 0 | pub fn expr(&self) -> &[(Arc<dyn PhysicalExpr>, String)] { |
107 | 0 | &self.expr |
108 | 0 | } |
109 | | |
110 | | /// The input plan |
111 | 0 | pub fn input(&self) -> &Arc<dyn ExecutionPlan> { |
112 | 0 | &self.input |
113 | 0 | } |
114 | | |
115 | | /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. |
116 | 2 | fn compute_properties( |
117 | 2 | input: &Arc<dyn ExecutionPlan>, |
118 | 2 | projection_mapping: &ProjectionMapping, |
119 | 2 | schema: SchemaRef, |
120 | 2 | ) -> Result<PlanProperties> { |
121 | 2 | // Calculate equivalence properties: |
122 | 2 | let mut input_eq_properties = input.equivalence_properties().clone(); |
123 | 2 | input_eq_properties.substitute_oeq_class(projection_mapping)?0 ; |
124 | 2 | let eq_properties = input_eq_properties.project(projection_mapping, schema); |
125 | 2 | |
126 | 2 | // Calculate output partitioning, which needs to respect aliases: |
127 | 2 | let input_partition = input.output_partitioning(); |
128 | 2 | let output_partitioning = |
129 | 2 | input_partition.project(projection_mapping, &input_eq_properties); |
130 | 2 | |
131 | 2 | Ok(PlanProperties::new( |
132 | 2 | eq_properties, |
133 | 2 | output_partitioning, |
134 | 2 | input.execution_mode(), |
135 | 2 | )) |
136 | 2 | } |
137 | | } |
138 | | |
139 | | impl DisplayAs for ProjectionExec { |
140 | 1 | fn fmt_as( |
141 | 1 | &self, |
142 | 1 | t: DisplayFormatType, |
143 | 1 | f: &mut std::fmt::Formatter, |
144 | 1 | ) -> std::fmt::Result { |
145 | 1 | match t { |
146 | | DisplayFormatType::Default | DisplayFormatType::Verbose => { |
147 | 1 | let expr: Vec<String> = self |
148 | 1 | .expr |
149 | 1 | .iter() |
150 | 3 | .map(|(e, alias)| { |
151 | 3 | let e = e.to_string(); |
152 | 3 | if &e != alias { |
153 | 3 | format!("{e} as {alias}") |
154 | | } else { |
155 | 0 | e |
156 | | } |
157 | 3 | }) |
158 | 1 | .collect(); |
159 | 1 | |
160 | 1 | write!(f, "ProjectionExec: expr=[{}]", expr.join(", ")) |
161 | 1 | } |
162 | 1 | } |
163 | 1 | } |
164 | | } |
165 | | |
166 | | impl ExecutionPlan for ProjectionExec { |
167 | 0 | fn name(&self) -> &'static str { |
168 | 0 | "ProjectionExec" |
169 | 0 | } |
170 | | |
171 | | /// Return a reference to Any that can be used for downcasting |
172 | 0 | fn as_any(&self) -> &dyn Any { |
173 | 0 | self |
174 | 0 | } |
175 | | |
176 | 1 | fn properties(&self) -> &PlanProperties { |
177 | 1 | &self.cache |
178 | 1 | } |
179 | | |
180 | 1 | fn children(&self) -> Vec<&Arc<dyn ExecutionPlan>> { |
181 | 1 | vec![&self.input] |
182 | 1 | } |
183 | | |
184 | 0 | fn maintains_input_order(&self) -> Vec<bool> { |
185 | 0 | // tell optimizer this operator doesn't reorder its input |
186 | 0 | vec![true] |
187 | 0 | } |
188 | | |
189 | 0 | fn with_new_children( |
190 | 0 | self: Arc<Self>, |
191 | 0 | mut children: Vec<Arc<dyn ExecutionPlan>>, |
192 | 0 | ) -> Result<Arc<dyn ExecutionPlan>> { |
193 | 0 | ProjectionExec::try_new(self.expr.clone(), children.swap_remove(0)) |
194 | 0 | .map(|p| Arc::new(p) as _) |
195 | 0 | } |
196 | | |
197 | 0 | fn benefits_from_input_partitioning(&self) -> Vec<bool> { |
198 | 0 | let all_simple_exprs = self |
199 | 0 | .expr |
200 | 0 | .iter() |
201 | 0 | .all(|(e, _)| e.as_any().is::<Column>() || e.as_any().is::<Literal>()); |
202 | 0 | // If expressions are all either column_expr or Literal, then all computations in this projection are reorder or rename, |
203 | 0 | // and projection would not benefit from the repartition, benefits_from_input_partitioning will return false. |
204 | 0 | vec![!all_simple_exprs] |
205 | 0 | } |
206 | | |
207 | 2 | fn execute( |
208 | 2 | &self, |
209 | 2 | partition: usize, |
210 | 2 | context: Arc<TaskContext>, |
211 | 2 | ) -> Result<SendableRecordBatchStream> { |
212 | 2 | trace!("Start ProjectionExec::execute for partition {} of context session_id {} and task_id {:?}", partition, context.session_id(), context.task_id()0 ); |
213 | | Ok(Box::pin(ProjectionStream { |
214 | 2 | schema: Arc::clone(&self.schema), |
215 | 3 | expr: self.expr.iter().map(|x| Arc::clone(&x.0)).collect(), |
216 | 2 | input: self.input.execute(partition, context)?0 , |
217 | 2 | baseline_metrics: BaselineMetrics::new(&self.metrics, partition), |
218 | | })) |
219 | 2 | } |
220 | | |
221 | 0 | fn metrics(&self) -> Option<MetricsSet> { |
222 | 0 | Some(self.metrics.clone_inner()) |
223 | 0 | } |
224 | | |
225 | 0 | fn statistics(&self) -> Result<Statistics> { |
226 | 0 | Ok(stats_projection( |
227 | 0 | self.input.statistics()?, |
228 | 0 | self.expr.iter().map(|(e, _)| Arc::clone(e)), |
229 | 0 | Arc::clone(&self.schema), |
230 | 0 | )) |
231 | 0 | } |
232 | | |
233 | 0 | fn supports_limit_pushdown(&self) -> bool { |
234 | 0 | true |
235 | 0 | } |
236 | | } |
237 | | |
238 | | /// If e is a direct column reference, returns the field level |
239 | | /// metadata for that field, if any. Otherwise returns None |
240 | 62 | pub(crate) fn get_field_metadata( |
241 | 62 | e: &Arc<dyn PhysicalExpr>, |
242 | 62 | input_schema: &Schema, |
243 | 62 | ) -> Option<HashMap<String, String>> { |
244 | 62 | // Look up field by index in schema (not NAME as there can be more than one |
245 | 62 | // column with the same name) |
246 | 62 | e.as_any() |
247 | 62 | .downcast_ref::<Column>() |
248 | 62 | .map(|column| input_schema.field(column.index()).metadata()61 ) |
249 | 62 | .cloned() |
250 | 62 | } |
251 | | |
252 | 2 | fn stats_projection( |
253 | 2 | mut stats: Statistics, |
254 | 2 | exprs: impl Iterator<Item = Arc<dyn PhysicalExpr>>, |
255 | 2 | schema: SchemaRef, |
256 | 2 | ) -> Statistics { |
257 | 2 | let mut primitive_row_size = 0; |
258 | 2 | let mut primitive_row_size_possible = true; |
259 | 2 | let mut column_statistics = vec![]; |
260 | 6 | for expr4 in exprs { |
261 | 4 | let col_stats = if let Some(col) = expr.as_any().downcast_ref::<Column>() { |
262 | 4 | stats.column_statistics[col.index()].clone() |
263 | | } else { |
264 | | // TODO stats: estimate more statistics from expressions |
265 | | // (expressions should compute their statistics themselves) |
266 | 0 | ColumnStatistics::new_unknown() |
267 | | }; |
268 | 4 | column_statistics.push(col_stats); |
269 | 4 | if let Ok(data_type) = expr.data_type(&schema) { |
270 | 4 | if let Some(value3 ) = data_type.primitive_width() { |
271 | 3 | primitive_row_size += value; |
272 | 3 | continue; |
273 | 1 | } |
274 | 0 | } |
275 | 1 | primitive_row_size_possible = false; |
276 | | } |
277 | | |
278 | 2 | if primitive_row_size_possible { |
279 | 1 | stats.total_byte_size = |
280 | 1 | Precision::Exact(primitive_row_size).multiply(&stats.num_rows); |
281 | 1 | } |
282 | 2 | stats.column_statistics = column_statistics; |
283 | 2 | stats |
284 | 2 | } |
285 | | |
286 | | impl ProjectionStream { |
287 | 6 | fn batch_project(&self, batch: &RecordBatch) -> Result<RecordBatch> { |
288 | 6 | // records time on drop |
289 | 6 | let _timer = self.baseline_metrics.elapsed_compute().timer(); |
290 | 6 | let arrays = self |
291 | 6 | .expr |
292 | 6 | .iter() |
293 | 15 | .map(|expr| { |
294 | 15 | expr.evaluate(batch) |
295 | 15 | .and_then(|v| v.into_array(batch.num_rows())) |
296 | 15 | }) |
297 | 6 | .collect::<Result<Vec<_>>>()?0 ; |
298 | | |
299 | 6 | if arrays.is_empty() { |
300 | 1 | let options = |
301 | 1 | RecordBatchOptions::new().with_row_count(Some(batch.num_rows())); |
302 | 1 | RecordBatch::try_new_with_options(Arc::clone(&self.schema), arrays, &options) |
303 | 1 | .map_err(Into::into) |
304 | | } else { |
305 | 5 | RecordBatch::try_new(Arc::clone(&self.schema), arrays).map_err(Into::into) |
306 | | } |
307 | 6 | } |
308 | | } |
309 | | |
310 | | /// Projection iterator |
311 | | struct ProjectionStream { |
312 | | schema: SchemaRef, |
313 | | expr: Vec<Arc<dyn PhysicalExpr>>, |
314 | | input: SendableRecordBatchStream, |
315 | | baseline_metrics: BaselineMetrics, |
316 | | } |
317 | | |
318 | | impl Stream for ProjectionStream { |
319 | | type Item = Result<RecordBatch>; |
320 | | |
321 | 561 | fn poll_next( |
322 | 561 | mut self: Pin<&mut Self>, |
323 | 561 | cx: &mut Context<'_>, |
324 | 561 | ) -> Poll<Option<Self::Item>> { |
325 | 561 | let poll = self.input.poll_next_unpin(cx).map(|x| m7 atch x6 { |
326 | 6 | Some(Ok(batch)) => Some(self.batch_project(&batch)), |
327 | 1 | other => other, |
328 | 561 | }7 ); |
329 | 561 | |
330 | 561 | self.baseline_metrics.record_poll(poll) |
331 | 561 | } |
332 | | |
333 | 0 | fn size_hint(&self) -> (usize, Option<usize>) { |
334 | 0 | // same number of record batches |
335 | 0 | self.input.size_hint() |
336 | 0 | } |
337 | | } |
338 | | |
339 | | impl RecordBatchStream for ProjectionStream { |
340 | | /// Get the schema |
341 | 0 | fn schema(&self) -> SchemaRef { |
342 | 0 | Arc::clone(&self.schema) |
343 | 0 | } |
344 | | } |
345 | | |
#[cfg(test)]
mod tests {
    use super::*;
    use crate::common::collect;
    use crate::expressions;
    use crate::test;

    use arrow_schema::DataType;
    use datafusion_common::ScalarValue;

    /// A projection with no expressions must still yield one (column-less)
    /// batch per input batch and keep the row count of each batch.
    #[tokio::test]
    async fn project_no_column() -> Result<()> {
        let task_ctx = Arc::new(TaskContext::default());

        let exec = test::scan_partitioned(1);
        let expected = collect(exec.execute(0, Arc::clone(&task_ctx))?).await?;

        let projection = ProjectionExec::try_new(vec![], exec)?;
        let stream = projection.execute(0, Arc::clone(&task_ctx))?;
        let output = collect(stream).await?;
        assert_eq!(output.len(), expected.len());

        // The zero-column path sets the row count explicitly; make sure it
        // matches the input batch by batch.
        let expected_rows: Vec<usize> = expected.iter().map(|b| b.num_rows()).collect();
        let output_rows: Vec<usize> = output.iter().map(|b| b.num_rows()).collect();
        assert_eq!(output_rows, expected_rows);

        Ok(())
    }

    /// Input statistics for the three columns described by `get_schema`.
    fn get_stats() -> Statistics {
        Statistics {
            num_rows: Precision::Exact(5),
            total_byte_size: Precision::Exact(23),
            column_statistics: vec![
                ColumnStatistics {
                    distinct_count: Precision::Exact(5),
                    max_value: Precision::Exact(ScalarValue::Int64(Some(21))),
                    min_value: Precision::Exact(ScalarValue::Int64(Some(-4))),
                    null_count: Precision::Exact(0),
                },
                ColumnStatistics {
                    distinct_count: Precision::Exact(1),
                    max_value: Precision::Exact(ScalarValue::from("x")),
                    min_value: Precision::Exact(ScalarValue::from("a")),
                    null_count: Precision::Exact(3),
                },
                ColumnStatistics {
                    distinct_count: Precision::Absent,
                    max_value: Precision::Exact(ScalarValue::Float32(Some(1.1))),
                    min_value: Precision::Exact(ScalarValue::Float32(Some(0.1))),
                    null_count: Precision::Absent,
                },
            ],
        }
    }

    /// Input schema: `col0: Int64`, `col1: Utf8`, `col2: Float32`.
    fn get_schema() -> Schema {
        let field_0 = Field::new("col0", DataType::Int64, false);
        let field_1 = Field::new("col1", DataType::Utf8, false);
        let field_2 = Field::new("col2", DataType::Float32, false);
        Schema::new(vec![field_0, field_1, field_2])
    }

    /// Projecting a `Utf8` column (no primitive width) keeps the input's
    /// `total_byte_size` and reorders the column statistics.
    #[test]
    fn test_stats_projection_columns_only() {
        let source = get_stats();
        let schema = get_schema();

        let exprs: Vec<Arc<dyn PhysicalExpr>> = vec![
            Arc::new(expressions::Column::new("col1", 1)),
            Arc::new(expressions::Column::new("col0", 0)),
        ];

        let result = stats_projection(source, exprs.into_iter(), Arc::new(schema));

        let expected = Statistics {
            num_rows: Precision::Exact(5),
            total_byte_size: Precision::Exact(23),
            column_statistics: vec![
                ColumnStatistics {
                    distinct_count: Precision::Exact(1),
                    max_value: Precision::Exact(ScalarValue::from("x")),
                    min_value: Precision::Exact(ScalarValue::from("a")),
                    null_count: Precision::Exact(3),
                },
                ColumnStatistics {
                    distinct_count: Precision::Exact(5),
                    max_value: Precision::Exact(ScalarValue::Int64(Some(21))),
                    min_value: Precision::Exact(ScalarValue::Int64(Some(-4))),
                    null_count: Precision::Exact(0),
                },
            ],
        };

        assert_eq!(result, expected);
    }

    /// When every projected column has a primitive width (Float32 = 4 bytes,
    /// Int64 = 8 bytes), `total_byte_size` becomes (4 + 8) * 5 rows = 60.
    #[test]
    fn test_stats_projection_column_with_primitive_width_only() {
        let source = get_stats();
        let schema = get_schema();

        let exprs: Vec<Arc<dyn PhysicalExpr>> = vec![
            Arc::new(expressions::Column::new("col2", 2)),
            Arc::new(expressions::Column::new("col0", 0)),
        ];

        let result = stats_projection(source, exprs.into_iter(), Arc::new(schema));

        let expected = Statistics {
            num_rows: Precision::Exact(5),
            total_byte_size: Precision::Exact(60),
            column_statistics: vec![
                ColumnStatistics {
                    distinct_count: Precision::Absent,
                    max_value: Precision::Exact(ScalarValue::Float32(Some(1.1))),
                    min_value: Precision::Exact(ScalarValue::Float32(Some(0.1))),
                    null_count: Precision::Absent,
                },
                ColumnStatistics {
                    distinct_count: Precision::Exact(5),
                    max_value: Precision::Exact(ScalarValue::Int64(Some(21))),
                    min_value: Precision::Exact(ScalarValue::Int64(Some(-4))),
                    null_count: Precision::Exact(0),
                },
            ],
        };

        assert_eq!(result, expected);
    }
}