/Users/andrewlamb/Software/datafusion/datafusion/physical-plan/src/values.rs
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | //! Values execution plan |
19 | | |
20 | | use std::any::Any; |
21 | | use std::sync::Arc; |
22 | | |
23 | | use super::{ |
24 | | common, DisplayAs, ExecutionMode, PlanProperties, SendableRecordBatchStream, |
25 | | Statistics, |
26 | | }; |
27 | | use crate::{ |
28 | | memory::MemoryStream, ColumnarValue, DisplayFormatType, ExecutionPlan, Partitioning, |
29 | | PhysicalExpr, |
30 | | }; |
31 | | |
32 | | use arrow::datatypes::{Schema, SchemaRef}; |
33 | | use arrow::record_batch::{RecordBatch, RecordBatchOptions}; |
34 | | use datafusion_common::{internal_err, plan_err, Result, ScalarValue}; |
35 | | use datafusion_execution::TaskContext; |
36 | | use datafusion_physical_expr::EquivalenceProperties; |
37 | | |
38 | | /// Execution plan for a values list based relation (produces constant rows) |
39 | | #[derive(Debug)] |
40 | | pub struct ValuesExec { |
41 | | /// The schema |
42 | | schema: SchemaRef, |
43 | | /// The data |
44 | | data: Vec<RecordBatch>, |
45 | | /// Cache holding plan properties like equivalences, output partitioning etc. |
46 | | cache: PlanProperties, |
47 | | } |
48 | | |
49 | | impl ValuesExec { |
50 | | /// Create a new values exec from rows of expressions |
51 | 3 | pub fn try_new( |
52 | 3 | schema: SchemaRef, |
53 | 3 | data: Vec<Vec<Arc<dyn PhysicalExpr>>>, |
54 | 3 | ) -> Result<Self> { |
55 | 3 | if data.is_empty() { |
56 | 1 | return plan_err!("Values list cannot be empty"); |
57 | 2 | } |
58 | 2 | let n_row = data.len(); |
59 | 2 | let n_col = schema.fields().len(); |
60 | | // This single-row placeholder batch satisfies the `evaluate` argument |
61 | | // and generates a single output value per expression |
62 | 2 | let batch = RecordBatch::try_new_with_options( |
63 | 2 | Arc::new(Schema::empty()), |
64 | 2 | vec![], |
65 | 2 | &RecordBatchOptions::new().with_row_count(Some(1)), |
66 | 2 | )?; |
67 | | |
68 | 2 | let arr = (0..n_col) |
69 | 2 | .map(|j| { |
70 | 2 | (0..n_row) |
71 | 2 | .map(|i| { |
72 | 2 | let r = data[i][j].evaluate(&batch); |
73 | | |
74 | 0 | match r { |
75 | 2 | Ok(ColumnarValue::Scalar(scalar)) => Ok(scalar), |
76 | 0 | Ok(ColumnarValue::Array(a)) if a.len() == 1 => { |
77 | 0 | ScalarValue::try_from_array(&a, 0) |
78 | | } |
79 | 0 | Ok(ColumnarValue::Array(a)) => { |
80 | 0 | plan_err!( |
81 | 0 | "Cannot have array values {a:?} in a values list" |
82 | 0 | ) |
83 | | } |
84 | 0 | Err(err) => Err(err), |
85 | | } |
86 | 2 | }) |
87 | 2 | .collect::<Result<Vec<_>>>() |
88 | 2 | .and_then(ScalarValue::iter_to_array) |
89 | 2 | }) |
90 | 2 | .collect::<Result<Vec<_>>>()?; |
91 | 2 | let batch = RecordBatch::try_new_with_options( |
92 | 2 | Arc::clone(&schema), |
93 | 2 | arr, |
94 | 2 | &RecordBatchOptions::new().with_row_count(Some(n_row)), |
95 | 2 | )?; |
96 | 1 | let data: Vec<RecordBatch> = vec![batch]; |
97 | 1 | Self::try_new_from_batches(schema, data) |
98 | 3 | } |
99 | | |
100 | | /// Create a new plan using the provided schema and batches. |
101 | | /// |
102 | | /// Errors if any of the batches don't match the provided schema, or if no |
103 | | /// batches are provided. |
104 | 4 | pub fn try_new_from_batches( |
105 | 4 | schema: SchemaRef, |
106 | 4 | batches: Vec<RecordBatch>, |
107 | 4 | ) -> Result<Self> { |
108 | 4 | if batches.is_empty() { |
109 | 1 | return plan_err!("Values list cannot be empty"); |
110 | 3 | } |
111 | | |
112 | 6 | for batch in &batches { |
113 | 4 | let batch_schema = batch.schema(); |
114 | 4 | if batch_schema != schema { |
115 | 1 | return plan_err!( |
116 | 1 | "Batch has invalid schema. Expected: {schema}, got: {batch_schema}" |
117 | 1 | ); |
118 | 3 | } |
119 | | } |
120 | | |
121 | 2 | let cache = Self::compute_properties(Arc::clone(&schema)); |
122 | 2 | Ok(ValuesExec { |
123 | 2 | schema, |
124 | 2 | data: batches, |
125 | 2 | cache, |
126 | 2 | }) |
127 | 4 | } |
128 | | |
129 | | /// Provides the data (the batches this plan will produce) |
130 | 0 | pub fn data(&self) -> Vec<RecordBatch> { |
131 | 0 | self.data.clone() |
132 | 0 | } |
133 | | |
134 | | /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. |
135 | 2 | fn compute_properties(schema: SchemaRef) -> PlanProperties { |
136 | 2 | let eq_properties = EquivalenceProperties::new(schema); |
137 | 2 | |
138 | 2 | PlanProperties::new( |
139 | 2 | eq_properties, |
140 | 2 | Partitioning::UnknownPartitioning(1), |
141 | 2 | ExecutionMode::Bounded, |
142 | 2 | ) |
143 | 2 | } |
144 | | } |
145 | | |
146 | | impl DisplayAs for ValuesExec { |
147 | 0 | fn fmt_as( |
148 | 0 | &self, |
149 | 0 | t: DisplayFormatType, |
150 | 0 | f: &mut std::fmt::Formatter, |
151 | 0 | ) -> std::fmt::Result { |
152 | 0 | match t { |
153 | | DisplayFormatType::Default | DisplayFormatType::Verbose => { |
154 | 0 | write!(f, "ValuesExec") |
155 | 0 | } |
156 | 0 | } |
157 | 0 | } |
158 | | } |
159 | | |
160 | | impl ExecutionPlan for ValuesExec { |
161 | 0 | fn name(&self) -> &'static str { |
162 | 0 | "ValuesExec" |
163 | 0 | } |
164 | | |
165 | | /// Return a reference to Any that can be used for downcasting |
166 | 0 | fn as_any(&self) -> &dyn Any { |
167 | 0 | self |
168 | 0 | } |
169 | | |
170 | 0 | fn properties(&self) -> &PlanProperties { |
171 | 0 | &self.cache |
172 | 0 | } |
173 | | |
174 | 0 | fn children(&self) -> Vec<&Arc<dyn ExecutionPlan>> { |
175 | 0 | vec![] |
176 | 0 | } |
177 | | |
178 | 0 | fn with_new_children( |
179 | 0 | self: Arc<Self>, |
180 | 0 | _: Vec<Arc<dyn ExecutionPlan>>, |
181 | 0 | ) -> Result<Arc<dyn ExecutionPlan>> { |
182 | 0 | ValuesExec::try_new_from_batches(Arc::clone(&self.schema), self.data.clone()) |
183 | 0 | .map(|e| Arc::new(e) as _) |
184 | 0 | } |
185 | | |
186 | 0 | fn execute( |
187 | 0 | &self, |
188 | 0 | partition: usize, |
189 | 0 | _context: Arc<TaskContext>, |
190 | 0 | ) -> Result<SendableRecordBatchStream> { |
191 | 0 | // ValuesExec has a single output partition |
192 | 0 | if 0 != partition { |
193 | 0 | return internal_err!( |
194 | 0 | "ValuesExec invalid partition {partition} (expected 0)" |
195 | 0 | ); |
196 | 0 | } |
197 | 0 | |
198 | 0 | Ok(Box::pin(MemoryStream::try_new( |
199 | 0 | self.data(), |
200 | 0 | Arc::clone(&self.schema), |
201 | 0 | None, |
202 | 0 | )?)) |
203 | 0 | } |
204 | | |
205 | 0 | fn statistics(&self) -> Result<Statistics> { |
206 | 0 | let batch = self.data(); |
207 | 0 | Ok(common::compute_record_batch_statistics( |
208 | 0 | &[batch], |
209 | 0 | &self.schema, |
210 | 0 | None, |
211 | 0 | )) |
212 | 0 | } |
213 | | } |
214 | | |
215 | | #[cfg(test)] |
216 | | mod tests { |
217 | | use super::*; |
218 | | use crate::expressions::lit; |
219 | | use crate::test::{self, make_partition}; |
220 | | |
221 | | use arrow_schema::{DataType, Field}; |
222 | | |
223 | | #[tokio::test] |
224 | 1 | async fn values_empty_case() -> Result<()> { |
225 | 1 | let schema = test::aggr_test_schema(); |
226 | 1 | let empty = ValuesExec::try_new(schema, vec![]); |
227 | 1 | assert!(empty.is_err()); |
228 | 1 | Ok(()) |
229 | 1 | } |
230 | | |
231 | | #[test] |
232 | 1 | fn new_exec_with_batches() { |
233 | 1 | let batch = make_partition(7); |
234 | 1 | let schema = batch.schema(); |
235 | 1 | let batches = vec![batch.clone(), batch]; |
236 | 1 | |
237 | 1 | let _exec = ValuesExec::try_new_from_batches(schema, batches).unwrap(); |
238 | 1 | } |
239 | | |
240 | | #[test] |
241 | 1 | fn new_exec_with_batches_empty() { |
242 | 1 | let batch = make_partition(7); |
243 | 1 | let schema = batch.schema(); |
244 | 1 | let _ = ValuesExec::try_new_from_batches(schema, Vec::new()).unwrap_err(); |
245 | 1 | } |
246 | | |
247 | | #[test] |
248 | 1 | fn new_exec_with_batches_invalid_schema() { |
249 | 1 | let batch = make_partition(7); |
250 | 1 | let batches = vec![batch.clone(), batch]; |
251 | 1 | |
252 | 1 | let invalid_schema = Arc::new(Schema::new(vec![ |
253 | 1 | Field::new("col0", DataType::UInt32, false), |
254 | 1 | Field::new("col1", DataType::Utf8, false), |
255 | 1 | ])); |
256 | 1 | let _ = ValuesExec::try_new_from_batches(invalid_schema, batches).unwrap_err(); |
257 | 1 | } |
258 | | |
259 | | // Test issue: https://github.com/apache/datafusion/issues/8763 |
260 | | #[test] |
261 | 1 | fn new_exec_with_non_nullable_schema() { |
262 | 1 | let schema = Arc::new(Schema::new(vec![Field::new( |
263 | 1 | "col0", |
264 | 1 | DataType::UInt32, |
265 | 1 | false, |
266 | 1 | )])); |
267 | 1 | let _ = ValuesExec::try_new(Arc::clone(&schema), vec![vec![lit(1u32)]]).unwrap(); |
268 | 1 | // Test that a null value is rejected |
269 | 1 | let _ = ValuesExec::try_new(schema, vec![vec![lit(ScalarValue::UInt32(None))]]) |
270 | 1 | .unwrap_err(); |
271 | 1 | } |
272 | | } |
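
For orientation, a minimal usage sketch of the constructors and execution path covered above. This is not part of values.rs or of the coverage run; it assumes the `datafusion` umbrella crate's re-exports (datafusion::physical_plan::values::ValuesExec, datafusion::physical_plan::collect, datafusion::physical_expr::expressions::lit), the SessionContext helper for building a TaskContext, and a tokio runtime, so paths may need adjusting for other crate layouts.

use std::sync::Arc;

use datafusion::arrow::datatypes::{DataType, Field, Schema};
use datafusion::error::Result;
use datafusion::physical_expr::expressions::lit;
use datafusion::physical_plan::{collect, values::ValuesExec};
use datafusion::prelude::SessionContext;

#[tokio::main]
async fn main() -> Result<()> {
    // Two constant rows for a single non-nullable UInt32 column: (1) and (2).
    let schema = Arc::new(Schema::new(vec![Field::new(
        "col0",
        DataType::UInt32,
        false,
    )]));
    let exec = ValuesExec::try_new(schema, vec![vec![lit(1u32)], vec![lit(2u32)]])?;

    // ValuesExec exposes a single output partition; `collect` drains it into RecordBatches.
    let task_ctx = SessionContext::new().task_ctx();
    let batches = collect(Arc::new(exec), task_ctx).await?;
    assert_eq!(batches.iter().map(|b| b.num_rows()).sum::<usize>(), 2);
    Ok(())
}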