Coverage Report

Created: 2024-10-13 08:39

/Users/andrewlamb/Software/datafusion/datafusion/physical-plan/src/projection.rs
Line
Count
Source (jump to first uncovered line)
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
//! Defines the projection execution plan. A projection determines which columns or expressions
19
//! are returned from a query. The SQL statement `SELECT a, b, a+b FROM t1` is an example
20
//! of a projection on table `t1` where the expressions `a`, `b`, and `a+b` are the
21
//! projection expressions. `SELECT` without `FROM` will only evaluate expressions.
22
23
use std::any::Any;
24
use std::collections::HashMap;
25
use std::pin::Pin;
26
use std::sync::Arc;
27
use std::task::{Context, Poll};
28
29
use super::expressions::Column;
30
use super::metrics::{BaselineMetrics, ExecutionPlanMetricsSet, MetricsSet};
31
use super::{
32
    DisplayAs, ExecutionPlanProperties, PlanProperties, RecordBatchStream,
33
    SendableRecordBatchStream, Statistics,
34
};
35
use crate::{ColumnStatistics, DisplayFormatType, ExecutionPlan, PhysicalExpr};
36
37
use arrow::datatypes::{Field, Schema, SchemaRef};
38
use arrow::record_batch::{RecordBatch, RecordBatchOptions};
39
use datafusion_common::stats::Precision;
40
use datafusion_common::Result;
41
use datafusion_execution::TaskContext;
42
use datafusion_physical_expr::equivalence::ProjectionMapping;
43
use datafusion_physical_expr::expressions::Literal;
44
45
use futures::stream::{Stream, StreamExt};
46
use log::trace;
47
48
/// Execution plan for a projection
49
#[derive(Debug, Clone)]
50
pub struct ProjectionExec {
51
    /// The projection expressions stored as tuples of (expression, output column name)
52
    pub(crate) expr: Vec<(Arc<dyn PhysicalExpr>, String)>,
53
    /// The schema once the projection has been applied to the input
54
    schema: SchemaRef,
55
    /// The input plan
56
    input: Arc<dyn ExecutionPlan>,
57
    /// Execution metrics
58
    metrics: ExecutionPlanMetricsSet,
59
    /// Cache holding plan properties like equivalences, output partitioning etc.
60
    cache: PlanProperties,
61
}
62
63
impl ProjectionExec {
64
    /// Create a projection on an input
65
2
    pub fn try_new(
66
2
        expr: Vec<(Arc<dyn PhysicalExpr>, String)>,
67
2
        input: Arc<dyn ExecutionPlan>,
68
2
    ) -> Result<Self> {
69
2
        let input_schema = input.schema();
70
2
71
2
        let fields: Result<Vec<Field>> = expr
72
2
            .iter()
73
3
            .map(|(e, name)| {
74
3
                let mut field = Field::new(
75
3
                    name,
76
3
                    e.data_type(&input_schema)
?0
,
77
3
                    e.nullable(&input_schema)
?0
,
78
                );
79
3
                field.set_metadata(
80
3
                    get_field_metadata(e, &input_schema).unwrap_or_default(),
81
3
                );
82
3
83
3
                Ok(field)
84
3
            })
85
2
            .collect();
86
87
2
        let schema = Arc::new(Schema::new_with_metadata(
88
2
            fields
?0
,
89
2
            input_schema.metadata().clone(),
90
        ));
91
92
        // construct a map from the input expressions to the output expression of the Projection
93
2
        let projection_mapping = ProjectionMapping::try_new(&expr, &input_schema)
?0
;
94
2
        let cache =
95
2
            Self::compute_properties(&input, &projection_mapping, Arc::clone(&schema))
?0
;
96
2
        Ok(Self {
97
2
            expr,
98
2
            schema,
99
2
            input,
100
2
            metrics: ExecutionPlanMetricsSet::new(),
101
2
            cache,
102
2
        })
103
2
    }
104
105
    /// The projection expressions stored as tuples of (expression, output column name)
106
0
    pub fn expr(&self) -> &[(Arc<dyn PhysicalExpr>, String)] {
107
0
        &self.expr
108
0
    }
109
110
    /// The input plan
111
0
    pub fn input(&self) -> &Arc<dyn ExecutionPlan> {
112
0
        &self.input
113
0
    }
114
115
    /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc.
116
2
    fn compute_properties(
117
2
        input: &Arc<dyn ExecutionPlan>,
118
2
        projection_mapping: &ProjectionMapping,
119
2
        schema: SchemaRef,
120
2
    ) -> Result<PlanProperties> {
121
2
        // Calculate equivalence properties:
122
2
        let mut input_eq_properties = input.equivalence_properties().clone();
123
2
        input_eq_properties.substitute_oeq_class(projection_mapping)
?0
;
124
2
        let eq_properties = input_eq_properties.project(projection_mapping, schema);
125
2
126
2
        // Calculate output partitioning, which needs to respect aliases:
127
2
        let input_partition = input.output_partitioning();
128
2
        let output_partitioning =
129
2
            input_partition.project(projection_mapping, &input_eq_properties);
130
2
131
2
        Ok(PlanProperties::new(
132
2
            eq_properties,
133
2
            output_partitioning,
134
2
            input.execution_mode(),
135
2
        ))
136
2
    }
137
}
138
139
impl DisplayAs for ProjectionExec {
140
1
    fn fmt_as(
141
1
        &self,
142
1
        t: DisplayFormatType,
143
1
        f: &mut std::fmt::Formatter,
144
1
    ) -> std::fmt::Result {
145
1
        match t {
146
            DisplayFormatType::Default | DisplayFormatType::Verbose => {
147
1
                let expr: Vec<String> = self
148
1
                    .expr
149
1
                    .iter()
150
3
                    .map(|(e, alias)| {
151
3
                        let e = e.to_string();
152
3
                        if &e != alias {
153
3
                            format!("{e} as {alias}")
154
                        } else {
155
0
                            e
156
                        }
157
3
                    })
158
1
                    .collect();
159
1
160
1
                write!(f, "ProjectionExec: expr=[{}]", expr.join(", "))
161
1
            }
162
1
        }
163
1
    }
164
}
165
166
impl ExecutionPlan for ProjectionExec {
167
0
    fn name(&self) -> &'static str {
168
0
        "ProjectionExec"
169
0
    }
170
171
    /// Return a reference to Any that can be used for downcasting
172
0
    fn as_any(&self) -> &dyn Any {
173
0
        self
174
0
    }
175
176
1
    fn properties(&self) -> &PlanProperties {
177
1
        &self.cache
178
1
    }
179
180
1
    fn children(&self) -> Vec<&Arc<dyn ExecutionPlan>> {
181
1
        vec![&self.input]
182
1
    }
183
184
0
    fn maintains_input_order(&self) -> Vec<bool> {
185
0
        // tell optimizer this operator doesn't reorder its input
186
0
        vec![true]
187
0
    }
188
189
0
    fn with_new_children(
190
0
        self: Arc<Self>,
191
0
        mut children: Vec<Arc<dyn ExecutionPlan>>,
192
0
    ) -> Result<Arc<dyn ExecutionPlan>> {
193
0
        ProjectionExec::try_new(self.expr.clone(), children.swap_remove(0))
194
0
            .map(|p| Arc::new(p) as _)
195
0
    }
196
197
0
    fn benefits_from_input_partitioning(&self) -> Vec<bool> {
198
0
        let all_simple_exprs = self
199
0
            .expr
200
0
            .iter()
201
0
            .all(|(e, _)| e.as_any().is::<Column>() || e.as_any().is::<Literal>());
202
0
        // If expressions are all either column_expr or Literal, then all computations in this projection are reorder or rename,
203
0
        // and projection would not benefit from the repartition, benefits_from_input_partitioning will return false.
204
0
        vec![!all_simple_exprs]
205
0
    }
206
207
2
    fn execute(
208
2
        &self,
209
2
        partition: usize,
210
2
        context: Arc<TaskContext>,
211
2
    ) -> Result<SendableRecordBatchStream> {
212
2
        trace!(
"Start ProjectionExec::execute for partition {} of context session_id {} and task_id {:?}", partition, context.session_id(), context.task_id()0
);
213
        Ok(Box::pin(ProjectionStream {
214
2
            schema: Arc::clone(&self.schema),
215
3
            expr: self.expr.iter().map(|x| Arc::clone(&x.0)).collect(),
216
2
            input: self.input.execute(partition, context)
?0
,
217
2
            baseline_metrics: BaselineMetrics::new(&self.metrics, partition),
218
        }))
219
2
    }
220
221
0
    fn metrics(&self) -> Option<MetricsSet> {
222
0
        Some(self.metrics.clone_inner())
223
0
    }
224
225
0
    fn statistics(&self) -> Result<Statistics> {
226
0
        Ok(stats_projection(
227
0
            self.input.statistics()?,
228
0
            self.expr.iter().map(|(e, _)| Arc::clone(e)),
229
0
            Arc::clone(&self.schema),
230
0
        ))
231
0
    }
232
233
0
    fn supports_limit_pushdown(&self) -> bool {
234
0
        true
235
0
    }
236
}
237
238
/// If e is a direct column reference, returns the field level
239
/// metadata for that field, if any. Otherwise returns None
240
62
pub(crate) fn get_field_metadata(
241
62
    e: &Arc<dyn PhysicalExpr>,
242
62
    input_schema: &Schema,
243
62
) -> Option<HashMap<String, String>> {
244
62
    // Look up field by index in schema (not NAME as there can be more than one
245
62
    // column with the same name)
246
62
    e.as_any()
247
62
        .downcast_ref::<Column>()
248
62
        .map(|column| 
input_schema.field(column.index()).metadata()61
)
249
62
        .cloned()
250
62
}
251
252
2
fn stats_projection(
253
2
    mut stats: Statistics,
254
2
    exprs: impl Iterator<Item = Arc<dyn PhysicalExpr>>,
255
2
    schema: SchemaRef,
256
2
) -> Statistics {
257
2
    let mut primitive_row_size = 0;
258
2
    let mut primitive_row_size_possible = true;
259
2
    let mut column_statistics = vec![];
260
6
    for 
expr4
in exprs {
261
4
        let col_stats = if let Some(col) = expr.as_any().downcast_ref::<Column>() {
262
4
            stats.column_statistics[col.index()].clone()
263
        } else {
264
            // TODO stats: estimate more statistics from expressions
265
            // (expressions should compute their statistics themselves)
266
0
            ColumnStatistics::new_unknown()
267
        };
268
4
        column_statistics.push(col_stats);
269
4
        if let Ok(data_type) = expr.data_type(&schema) {
270
4
            if let Some(
value3
) = data_type.primitive_width() {
271
3
                primitive_row_size += value;
272
3
                continue;
273
1
            }
274
0
        }
275
1
        primitive_row_size_possible = false;
276
    }
277
278
2
    if primitive_row_size_possible {
279
1
        stats.total_byte_size =
280
1
            Precision::Exact(primitive_row_size).multiply(&stats.num_rows);
281
1
    }
282
2
    stats.column_statistics = column_statistics;
283
2
    stats
284
2
}
285
286
impl ProjectionStream {
287
6
    fn batch_project(&self, batch: &RecordBatch) -> Result<RecordBatch> {
288
6
        // records time on drop
289
6
        let _timer = self.baseline_metrics.elapsed_compute().timer();
290
6
        let arrays = self
291
6
            .expr
292
6
            .iter()
293
15
            .map(|expr| {
294
15
                expr.evaluate(batch)
295
15
                    .and_then(|v| v.into_array(batch.num_rows()))
296
15
            })
297
6
            .collect::<Result<Vec<_>>>()
?0
;
298
299
6
        if arrays.is_empty() {
300
1
            let options =
301
1
                RecordBatchOptions::new().with_row_count(Some(batch.num_rows()));
302
1
            RecordBatch::try_new_with_options(Arc::clone(&self.schema), arrays, &options)
303
1
                .map_err(Into::into)
304
        } else {
305
5
            RecordBatch::try_new(Arc::clone(&self.schema), arrays).map_err(Into::into)
306
        }
307
6
    }
308
}
309
310
/// Projection iterator
311
struct ProjectionStream {
312
    schema: SchemaRef,
313
    expr: Vec<Arc<dyn PhysicalExpr>>,
314
    input: SendableRecordBatchStream,
315
    baseline_metrics: BaselineMetrics,
316
}
317
318
impl Stream for ProjectionStream {
319
    type Item = Result<RecordBatch>;
320
321
561
    fn poll_next(
322
561
        mut self: Pin<&mut Self>,
323
561
        cx: &mut Context<'_>,
324
561
    ) -> Poll<Option<Self::Item>> {
325
561
        let poll = self.input.poll_next_unpin(cx).map(|x| 
m7
atch
x6
{
326
6
            Some(Ok(batch)) => Some(self.batch_project(&batch)),
327
1
            other => other,
328
561
        
}7
);
329
561
330
561
        self.baseline_metrics.record_poll(poll)
331
561
    }
332
333
0
    fn size_hint(&self) -> (usize, Option<usize>) {
334
0
        // same number of record batches
335
0
        self.input.size_hint()
336
0
    }
337
}
338
339
impl RecordBatchStream for ProjectionStream {
340
    /// Get the schema
341
0
    fn schema(&self) -> SchemaRef {
342
0
        Arc::clone(&self.schema)
343
0
    }
344
}
345
346
#[cfg(test)]
347
mod tests {
348
    use super::*;
349
    use crate::common::collect;
350
    use crate::expressions;
351
    use crate::test;
352
353
    use arrow_schema::DataType;
354
    use datafusion_common::ScalarValue;
355
356
    #[tokio::test]
357
1
    async fn project_no_column() -> Result<()> {
358
1
        let task_ctx = Arc::new(TaskContext::default());
359
1
360
1
        let exec = test::scan_partitioned(1);
361
1
        let expected = collect(exec.execute(0, Arc::clone(&task_ctx))
?0
)
362
1
            .
await0
363
1
            .unwrap();
364
1
365
1
        let projection = ProjectionExec::try_new(vec![], exec)
?0
;
366
1
        let stream = projection.execute(0, Arc::clone(&task_ctx))
?0
;
367
1
        let output = collect(stream).
await0
.unwrap();
368
1
        assert_eq!(output.len(), expected.len());
369
1
370
1
        Ok(())
371
1
    }
372
373
2
    fn get_stats() -> Statistics {
374
2
        Statistics {
375
2
            num_rows: Precision::Exact(5),
376
2
            total_byte_size: Precision::Exact(23),
377
2
            column_statistics: vec![
378
2
                ColumnStatistics {
379
2
                    distinct_count: Precision::Exact(5),
380
2
                    max_value: Precision::Exact(ScalarValue::Int64(Some(21))),
381
2
                    min_value: Precision::Exact(ScalarValue::Int64(Some(-4))),
382
2
                    null_count: Precision::Exact(0),
383
2
                },
384
2
                ColumnStatistics {
385
2
                    distinct_count: Precision::Exact(1),
386
2
                    max_value: Precision::Exact(ScalarValue::from("x")),
387
2
                    min_value: Precision::Exact(ScalarValue::from("a")),
388
2
                    null_count: Precision::Exact(3),
389
2
                },
390
2
                ColumnStatistics {
391
2
                    distinct_count: Precision::Absent,
392
2
                    max_value: Precision::Exact(ScalarValue::Float32(Some(1.1))),
393
2
                    min_value: Precision::Exact(ScalarValue::Float32(Some(0.1))),
394
2
                    null_count: Precision::Absent,
395
2
                },
396
2
            ],
397
2
        }
398
2
    }
399
400
2
    fn get_schema() -> Schema {
401
2
        let field_0 = Field::new("col0", DataType::Int64, false);
402
2
        let field_1 = Field::new("col1", DataType::Utf8, false);
403
2
        let field_2 = Field::new("col2", DataType::Float32, false);
404
2
        Schema::new(vec![field_0, field_1, field_2])
405
2
    }
406
    #[tokio::test]
407
1
    async fn test_stats_projection_columns_only() {
408
1
        let source = get_stats();
409
1
        let schema = get_schema();
410
1
411
1
        let exprs: Vec<Arc<dyn PhysicalExpr>> = vec![
412
1
            Arc::new(expressions::Column::new("col1", 1)),
413
1
            Arc::new(expressions::Column::new("col0", 0)),
414
1
        ];
415
1
416
1
        let result = stats_projection(source, exprs.into_iter(), Arc::new(schema));
417
1
418
1
        let expected = Statistics {
419
1
            num_rows: Precision::Exact(5),
420
1
            total_byte_size: Precision::Exact(23),
421
1
            column_statistics: vec![
422
1
                ColumnStatistics {
423
1
                    distinct_count: Precision::Exact(1),
424
1
                    max_value: Precision::Exact(ScalarValue::from("x")),
425
1
                    min_value: Precision::Exact(ScalarValue::from("a")),
426
1
                    null_count: Precision::Exact(3),
427
1
                },
428
1
                ColumnStatistics {
429
1
                    distinct_count: Precision::Exact(5),
430
1
                    max_value: Precision::Exact(ScalarValue::Int64(Some(21))),
431
1
                    min_value: Precision::Exact(ScalarValue::Int64(Some(-4))),
432
1
                    null_count: Precision::Exact(0),
433
1
                },
434
1
            ],
435
1
        };
436
1
437
1
        assert_eq!(result, expected);
438
1
    }
439
440
    #[tokio::test]
441
1
    async fn test_stats_projection_column_with_primitive_width_only() {
442
1
        let source = get_stats();
443
1
        let schema = get_schema();
444
1
445
1
        let exprs: Vec<Arc<dyn PhysicalExpr>> = vec![
446
1
            Arc::new(expressions::Column::new("col2", 2)),
447
1
            Arc::new(expressions::Column::new("col0", 0)),
448
1
        ];
449
1
450
1
        let result = stats_projection(source, exprs.into_iter(), Arc::new(schema));
451
1
452
1
        let expected = Statistics {
453
1
            num_rows: Precision::Exact(5),
454
1
            total_byte_size: Precision::Exact(60),
455
1
            column_statistics: vec![
456
1
                ColumnStatistics {
457
1
                    distinct_count: Precision::Absent,
458
1
                    max_value: Precision::Exact(ScalarValue::Float32(Some(1.1))),
459
1
                    min_value: Precision::Exact(ScalarValue::Float32(Some(0.1))),
460
1
                    null_count: Precision::Absent,
461
1
                },
462
1
                ColumnStatistics {
463
1
                    distinct_count: Precision::Exact(5),
464
1
                    max_value: Precision::Exact(ScalarValue::Int64(Some(21))),
465
1
                    min_value: Precision::Exact(ScalarValue::Int64(Some(-4))),
466
1
                    null_count: Precision::Exact(0),
467
1
                },
468
1
            ],
469
1
        };
470
1
471
1
        assert_eq!(result, expected);
472
1
    }
473
}