Coverage Report

Created: 2024-10-13 08:39

/Users/andrewlamb/Software/datafusion/datafusion/physical-plan/src/analyze.rs
Line
Count
Source (jump to first uncovered line)
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
//! Defines the ANALYZE operator
19
20
use std::any::Any;
21
use std::sync::Arc;
22
23
use super::stream::{RecordBatchReceiverStream, RecordBatchStreamAdapter};
24
use super::{
25
    DisplayAs, Distribution, ExecutionPlanProperties, PlanProperties,
26
    SendableRecordBatchStream,
27
};
28
use crate::display::DisplayableExecutionPlan;
29
use crate::{DisplayFormatType, ExecutionPlan, Partitioning};
30
31
use arrow::{array::StringBuilder, datatypes::SchemaRef, record_batch::RecordBatch};
32
use datafusion_common::instant::Instant;
33
use datafusion_common::{internal_err, DataFusionError, Result};
34
use datafusion_execution::TaskContext;
35
use datafusion_physical_expr::EquivalenceProperties;
36
37
use futures::StreamExt;
38
39
/// `EXPLAIN ANALYZE` execution plan operator. This operator runs its input,
40
/// discards the results, and then prints out an annotated plan with metrics
41
#[derive(Debug, Clone)]
42
pub struct AnalyzeExec {
43
    /// control how much extra to print
44
    verbose: bool,
45
    /// if statistics should be displayed
46
    show_statistics: bool,
47
    /// The input plan (the plan being analyzed)
48
    pub(crate) input: Arc<dyn ExecutionPlan>,
49
    /// The output schema for RecordBatches of this exec node
50
    schema: SchemaRef,
51
    cache: PlanProperties,
52
}
53
54
impl AnalyzeExec {
55
    /// Create a new AnalyzeExec
56
1
    pub fn new(
57
1
        verbose: bool,
58
1
        show_statistics: bool,
59
1
        input: Arc<dyn ExecutionPlan>,
60
1
        schema: SchemaRef,
61
1
    ) -> Self {
62
1
        let cache = Self::compute_properties(&input, Arc::clone(&schema));
63
1
        AnalyzeExec {
64
1
            verbose,
65
1
            show_statistics,
66
1
            input,
67
1
            schema,
68
1
            cache,
69
1
        }
70
1
    }
71
72
    /// access to verbose
73
0
    pub fn verbose(&self) -> bool {
74
0
        self.verbose
75
0
    }
76
77
    /// access to show_statistics
78
0
    pub fn show_statistics(&self) -> bool {
79
0
        self.show_statistics
80
0
    }
81
82
    /// The input plan
83
0
    pub fn input(&self) -> &Arc<dyn ExecutionPlan> {
84
0
        &self.input
85
0
    }
86
87
    /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc.
88
1
    fn compute_properties(
89
1
        input: &Arc<dyn ExecutionPlan>,
90
1
        schema: SchemaRef,
91
1
    ) -> PlanProperties {
92
1
        let eq_properties = EquivalenceProperties::new(schema);
93
1
        let output_partitioning = Partitioning::UnknownPartitioning(1);
94
1
        let exec_mode = input.execution_mode();
95
1
        PlanProperties::new(eq_properties, output_partitioning, exec_mode)
96
1
    }
97
}
98
99
impl DisplayAs for AnalyzeExec {
100
0
    fn fmt_as(
101
0
        &self,
102
0
        t: DisplayFormatType,
103
0
        f: &mut std::fmt::Formatter,
104
0
    ) -> std::fmt::Result {
105
0
        match t {
106
            DisplayFormatType::Default | DisplayFormatType::Verbose => {
107
0
                write!(f, "AnalyzeExec verbose={}", self.verbose)
108
0
            }
109
0
        }
110
0
    }
111
}
112
113
impl ExecutionPlan for AnalyzeExec {
114
0
    fn name(&self) -> &'static str {
115
0
        "AnalyzeExec"
116
0
    }
117
118
    /// Return a reference to Any that can be used for downcasting
119
0
    fn as_any(&self) -> &dyn Any {
120
0
        self
121
0
    }
122
123
2
    fn properties(&self) -> &PlanProperties {
124
2
        &self.cache
125
2
    }
126
127
0
    fn children(&self) -> Vec<&Arc<dyn ExecutionPlan>> {
128
0
        vec![&self.input]
129
0
    }
130
131
    /// AnalyzeExec is handled specially so this value is ignored
132
0
    fn required_input_distribution(&self) -> Vec<Distribution> {
133
0
        vec![]
134
0
    }
135
136
0
    fn with_new_children(
137
0
        self: Arc<Self>,
138
0
        mut children: Vec<Arc<dyn ExecutionPlan>>,
139
0
    ) -> Result<Arc<dyn ExecutionPlan>> {
140
0
        Ok(Arc::new(Self::new(
141
0
            self.verbose,
142
0
            self.show_statistics,
143
0
            children.pop().unwrap(),
144
0
            Arc::clone(&self.schema),
145
0
        )))
146
0
    }
147
148
1
    fn execute(
149
1
        &self,
150
1
        partition: usize,
151
1
        context: Arc<TaskContext>,
152
1
    ) -> Result<SendableRecordBatchStream> {
153
1
        if 0 != partition {
154
0
            return internal_err!(
155
0
                "AnalyzeExec invalid partition. Expected 0, got {partition}"
156
0
            );
157
1
        }
158
1
159
1
        // Gather futures that will run each input partition in
160
1
        // parallel (on a separate tokio task) using a JoinSet to
161
1
        // cancel outstanding futures on drop
162
1
        let num_input_partitions = self.input.output_partitioning().partition_count();
163
1
        let mut builder =
164
1
            RecordBatchReceiverStream::builder(self.schema(), num_input_partitions);
165
166
1
        for input_partition in 0..num_input_partitions {
167
1
            builder.run_input(
168
1
                Arc::clone(&self.input),
169
1
                input_partition,
170
1
                Arc::clone(&context),
171
1
            );
172
1
        }
173
174
        // Create future that computes thefinal output
175
1
        let start = Instant::now();
176
1
        let captured_input = Arc::clone(&self.input);
177
1
        let captured_schema = Arc::clone(&self.schema);
178
1
        let verbose = self.verbose;
179
1
        let show_statistics = self.show_statistics;
180
1
181
1
        // future that gathers the results from all the tasks in the
182
1
        // JoinSet that computes the overall row count and final
183
1
        // record batch
184
1
        let mut input_stream = builder.build();
185
1
        let output = async move {
186
1
            let mut total_rows = 0;
187
1
            while let Some(
batch0
) = input_stream.next().
await0
.
transpose()0
?0
{
188
0
                total_rows += batch.num_rows();
189
0
            }
190
191
0
            let duration = Instant::now() - start;
192
0
            create_output_batch(
193
0
                verbose,
194
0
                show_statistics,
195
0
                total_rows,
196
0
                duration,
197
0
                captured_input,
198
0
                captured_schema,
199
0
            )
200
0
        };
201
202
1
        Ok(Box::pin(RecordBatchStreamAdapter::new(
203
1
            Arc::clone(&self.schema),
204
1
            futures::stream::once(output),
205
1
        )))
206
1
    }
207
}
208
209
/// Creates the output of AnalyzeExec as a RecordBatch
210
0
fn create_output_batch(
211
0
    verbose: bool,
212
0
    show_statistics: bool,
213
0
    total_rows: usize,
214
0
    duration: std::time::Duration,
215
0
    input: Arc<dyn ExecutionPlan>,
216
0
    schema: SchemaRef,
217
0
) -> Result<RecordBatch> {
218
0
    let mut type_builder = StringBuilder::with_capacity(1, 1024);
219
0
    let mut plan_builder = StringBuilder::with_capacity(1, 1024);
220
0
221
0
    // TODO use some sort of enum rather than strings?
222
0
    type_builder.append_value("Plan with Metrics");
223
0
224
0
    let annotated_plan = DisplayableExecutionPlan::with_metrics(input.as_ref())
225
0
        .set_show_statistics(show_statistics)
226
0
        .indent(verbose)
227
0
        .to_string();
228
0
    plan_builder.append_value(annotated_plan);
229
0
230
0
    // Verbose output
231
0
    // TODO make this more sophisticated
232
0
    if verbose {
233
0
        type_builder.append_value("Plan with Full Metrics");
234
0
235
0
        let annotated_plan = DisplayableExecutionPlan::with_full_metrics(input.as_ref())
236
0
            .set_show_statistics(show_statistics)
237
0
            .indent(verbose)
238
0
            .to_string();
239
0
        plan_builder.append_value(annotated_plan);
240
0
241
0
        type_builder.append_value("Output Rows");
242
0
        plan_builder.append_value(total_rows.to_string());
243
0
244
0
        type_builder.append_value("Duration");
245
0
        plan_builder.append_value(format!("{duration:?}"));
246
0
    }
247
248
0
    RecordBatch::try_new(
249
0
        schema,
250
0
        vec![
251
0
            Arc::new(type_builder.finish()),
252
0
            Arc::new(plan_builder.finish()),
253
0
        ],
254
0
    )
255
0
    .map_err(DataFusionError::from)
256
0
}
257
258
#[cfg(test)]
mod tests {
    use super::*;
    use crate::{
        collect,
        test::{
            assert_is_pending,
            exec::{assert_strong_count_converges_to_zero, BlockingExec},
        },
    };

    use arrow::datatypes::{DataType, Field, Schema};
    use futures::FutureExt;

    /// Dropping a still-pending `collect` future over an `AnalyzeExec` must
    /// let the reference count tracked by the blocking input fall to zero.
    #[tokio::test]
    async fn test_drop_cancel() -> Result<()> {
        let ctx = Arc::new(TaskContext::default());
        let schema =
            Arc::new(Schema::new(vec![Field::new("a", DataType::Float32, true)]));

        let input = Arc::new(BlockingExec::new(Arc::clone(&schema), 1));
        // Grab the handle before `input` is moved into the plan.
        let input_refs = input.refs();
        let plan = Arc::new(AnalyzeExec::new(true, false, input, schema));

        let mut pending = collect(plan, ctx).boxed();

        assert_is_pending(&mut pending);
        drop(pending);
        assert_strong_count_converges_to_zero(input_refs).await;

        Ok(())
    }
}