/Users/andrewlamb/Software/datafusion/datafusion/physical-expr/src/expressions/column.rs
Line | Count | Source (jump to first uncovered line) |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | //! Physical column reference: [`Column`] |
19 | | |
20 | | use std::any::Any; |
21 | | use std::hash::{Hash, Hasher}; |
22 | | use std::sync::Arc; |
23 | | |
24 | | use arrow::{ |
25 | | datatypes::{DataType, Schema}, |
26 | | record_batch::RecordBatch, |
27 | | }; |
28 | | use arrow_schema::SchemaRef; |
29 | | use datafusion_common::tree_node::{Transformed, TreeNode}; |
30 | | use datafusion_common::{internal_err, plan_err, Result}; |
31 | | use datafusion_expr::ColumnarValue; |
32 | | |
33 | | use crate::physical_expr::{down_cast_any_ref, PhysicalExpr}; |
34 | | |
35 | | /// Represents the column at a given index in a RecordBatch |
36 | | /// |
37 | | /// This is a physical expression that represents a column at a given index in an |
38 | | /// arrow [`Schema`] / [`RecordBatch`]. |
39 | | /// |
40 | | /// Unlike the [logical `Expr::Column`], this expression is always resolved by schema index, |
41 | | /// even though it does have a name. This is because the physical plan is always |
42 | | /// resolved to a specific schema and there is no concept of "relation". |
43 | | /// |
44 | | /// # Example: |
45 | | /// If the schema is `a`, `b`, `c` the `Column` for `b` would be represented by |
46 | | /// index 1, since `b` is the second column in the schema. |
47 | | /// |
48 | | /// ``` |
49 | | /// # use datafusion_physical_expr::expressions::Column; |
50 | | /// # use arrow::datatypes::{DataType, Field, Schema}; |
51 | | /// // Schema with columns a, b, c |
52 | | /// let schema = Schema::new(vec![ |
53 | | /// Field::new("a", DataType::Int32, false), |
54 | | /// Field::new("b", DataType::Int32, false), |
55 | | /// Field::new("c", DataType::Int32, false), |
56 | | /// ]); |
57 | | /// |
58 | | /// // reference to column b is index 1 |
59 | | /// let column_b = Column::new_with_schema("b", &schema).unwrap(); |
60 | | /// assert_eq!(column_b.index(), 1); |
61 | | /// |
62 | | /// // reference to column c is index 2 |
63 | | /// let column_c = Column::new_with_schema("c", &schema).unwrap(); |
64 | | /// assert_eq!(column_c.index(), 2); |
65 | | /// ``` |
66 | | /// [logical `Expr::Column`]: https://docs.rs/datafusion/latest/datafusion/logical_expr/enum.Expr.html#variant.Column |
67 | | #[derive(Debug, Hash, PartialEq, Eq, Clone)] |
68 | | pub struct Column { |
69 | | /// The name of the column (used for debugging and display purposes) |
70 | | name: String, |
71 | | /// The index of the column in its schema |
72 | | index: usize, |
73 | | } |
74 | | |
75 | | impl Column { |
76 | | /// Create a new column expression which references the |
77 | | /// column with the given index in the schema. |
78 | 26.4k | pub fn new(name: &str, index: usize) -> Self { |
79 | 26.4k | Self { |
80 | 26.4k | name: name.to_owned(), |
81 | 26.4k | index, |
82 | 26.4k | } |
83 | 26.4k | } |
84 | | |
85 | | /// Create a new column expression which references the |
86 | | /// column with the given name in the schema |
87 | 5.14k | pub fn new_with_schema(name: &str, schema: &Schema) -> Result<Self> { |
88 | 5.14k | Ok(Column::new(name, schema.index_of(name)?0 )) |
89 | 5.14k | } |
90 | | |
91 | | /// Get the column's name |
92 | 808 | pub fn name(&self) -> &str { |
93 | 808 | &self.name |
94 | 808 | } |
95 | | |
96 | | /// Get the column's schema index |
97 | 945 | pub fn index(&self) -> usize { |
98 | 945 | self.index |
99 | 945 | } |
100 | | } |
101 | | |
102 | | impl std::fmt::Display for Column { |
103 | 30 | fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { |
104 | 30 | write!(f, "{}@{}", self.name, self.index) |
105 | 30 | } |
106 | | } |
107 | | |
108 | | impl PhysicalExpr for Column { |
109 | | /// Return a reference to Any that can be used for downcasting |
110 | 112k | fn as_any(&self) -> &dyn std::any::Any { |
111 | 112k | self |
112 | 112k | } |
113 | | |
114 | | /// Get the data type of this expression, given the schema of the input |
115 | 79.9k | fn data_type(&self, input_schema: &Schema) -> Result<DataType> { |
116 | 79.9k | self.bounds_check(input_schema)?0 ; |
117 | 79.9k | Ok(input_schema.field(self.index).data_type().clone()) |
118 | 79.9k | } |
119 | | |
120 | | /// Decide whether this expression is nullable, given the schema of the input |
121 | 138 | fn nullable(&self, input_schema: &Schema) -> Result<bool> { |
122 | 138 | self.bounds_check(input_schema)?0 ; |
123 | 138 | Ok(input_schema.field(self.index).is_nullable()) |
124 | 138 | } |
125 | | |
126 | | /// Evaluate the expression |
127 | 129k | fn evaluate(&self, batch: &RecordBatch) -> Result<ColumnarValue> { |
128 | 129k | self.bounds_check(batch.schema().as_ref())?0 ; |
129 | 129k | Ok(ColumnarValue::Array(Arc::clone(batch.column(self.index)))) |
130 | 129k | } |
131 | | |
132 | 21.3k | fn children(&self) -> Vec<&Arc<dyn PhysicalExpr>> { |
133 | 21.3k | vec![] |
134 | 21.3k | } |
135 | | |
136 | 0 | fn with_new_children( |
137 | 0 | self: Arc<Self>, |
138 | 0 | _children: Vec<Arc<dyn PhysicalExpr>>, |
139 | 0 | ) -> Result<Arc<dyn PhysicalExpr>> { |
140 | 0 | Ok(self) |
141 | 0 | } |
142 | | |
143 | 20 | fn dyn_hash(&self, state: &mut dyn Hasher) { |
144 | 20 | let mut s = state; |
145 | 20 | self.hash(&mut s); |
146 | 20 | } |
147 | | } |
148 | | |
149 | | impl PartialEq<dyn Any> for Column { |
150 | 39.0k | fn eq(&self, other: &dyn Any) -> bool { |
151 | 39.0k | down_cast_any_ref(other) |
152 | 39.0k | .downcast_ref::<Self>() |
153 | 39.0k | .map(|x| self == x32.0k ) |
154 | 39.0k | .unwrap_or(false) |
155 | 39.0k | } |
156 | | } |
157 | | |
158 | | impl Column { |
159 | 209k | fn bounds_check(&self, input_schema: &Schema) -> Result<()> { |
160 | 209k | if self.index < input_schema.fields.len() { |
161 | 209k | Ok(()) |
162 | | } else { |
163 | 0 | internal_err!( |
164 | 0 | "PhysicalExpr Column references column '{}' at index {} (zero-based) but input schema only has {} columns: {:?}", |
165 | 0 | self.name, |
166 | 0 | self.index, |
167 | 0 | input_schema.fields.len(), |
168 | 0 | input_schema.fields().iter().map(|f| f.name()).collect::<Vec<_>>() |
169 | 0 | ) |
170 | | } |
171 | 209k | } |
172 | | } |
173 | | |
174 | | /// Create a column expression |
175 | 1.82k | pub fn col(name: &str, schema: &Schema) -> Result<Arc<dyn PhysicalExpr>> { |
176 | 1.82k | Ok(Arc::new(Column::new_with_schema(name, schema)?0 )) |
177 | 1.82k | } |
178 | | |
179 | | /// Rewrites an expression according to new schema; i.e. changes the columns it |
180 | | /// refers to with the column at corresponding index in the new schema. Returns |
181 | | /// an error if the given schema has fewer columns than the original schema. |
182 | | /// Note that the resulting expression may not be valid if data types in the |
183 | | /// new schema are incompatible with the expression nodes. |
184 | 0 | pub fn with_new_schema( |
185 | 0 | expr: Arc<dyn PhysicalExpr>, |
186 | 0 | schema: &SchemaRef, |
187 | 0 | ) -> Result<Arc<dyn PhysicalExpr>> { |
188 | 0 | Ok(expr |
189 | 0 | .transform_up(|expr| { |
190 | 0 | if let Some(col) = expr.as_any().downcast_ref::<Column>() { |
191 | 0 | let idx = col.index(); |
192 | 0 | let Some(field) = schema.fields().get(idx) else { |
193 | 0 | return plan_err!( |
194 | 0 | "New schema has fewer columns than original schema" |
195 | 0 | ); |
196 | | }; |
197 | 0 | let new_col = Column::new(field.name(), idx); |
198 | 0 | Ok(Transformed::yes(Arc::new(new_col) as _)) |
199 | | } else { |
200 | 0 | Ok(Transformed::no(expr)) |
201 | | } |
202 | 0 | })? |
203 | | .data) |
204 | 0 | } |
205 | | |
206 | | #[cfg(test)] |
207 | | mod test { |
208 | | use super::Column; |
209 | | use crate::physical_expr::PhysicalExpr; |
210 | | |
211 | | use arrow::array::StringArray; |
212 | | use arrow::datatypes::{DataType, Field, Schema}; |
213 | | use arrow::record_batch::RecordBatch; |
214 | | use datafusion_common::Result; |
215 | | |
216 | | use std::sync::Arc; |
217 | | |
218 | | #[test] |
219 | | fn out_of_bounds_data_type() { |
220 | | let schema = Schema::new(vec![Field::new("foo", DataType::Utf8, true)]); |
221 | | let col = Column::new("id", 9); |
222 | | let error = col.data_type(&schema).expect_err("error").strip_backtrace(); |
223 | | assert!("Internal error: PhysicalExpr Column references column 'id' at index 9 (zero-based) \ |
224 | | but input schema only has 1 columns: [\"foo\"].\nThis was likely caused by a bug in \ |
225 | | DataFusion's code and we would welcome that you file an bug report in our issue tracker".starts_with(&error)) |
226 | | } |
227 | | |
228 | | #[test] |
229 | | fn out_of_bounds_nullable() { |
230 | | let schema = Schema::new(vec![Field::new("foo", DataType::Utf8, true)]); |
231 | | let col = Column::new("id", 9); |
232 | | let error = col.nullable(&schema).expect_err("error").strip_backtrace(); |
233 | | assert!("Internal error: PhysicalExpr Column references column 'id' at index 9 (zero-based) \ |
234 | | but input schema only has 1 columns: [\"foo\"].\nThis was likely caused by a bug in \ |
235 | | DataFusion's code and we would welcome that you file an bug report in our issue tracker".starts_with(&error)) |
236 | | } |
237 | | |
238 | | #[test] |
239 | | fn out_of_bounds_evaluate() -> Result<()> { |
240 | | let schema = Schema::new(vec![Field::new("foo", DataType::Utf8, true)]); |
241 | | let data: StringArray = vec!["data"].into(); |
242 | | let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(data)])?; |
243 | | let col = Column::new("id", 9); |
244 | | let error = col.evaluate(&batch).expect_err("error").strip_backtrace(); |
245 | | assert!("Internal error: PhysicalExpr Column references column 'id' at index 9 (zero-based) \ |
246 | | but input schema only has 1 columns: [\"foo\"].\nThis was likely caused by a bug in \ |
247 | | DataFusion's code and we would welcome that you file an bug report in our issue tracker".starts_with(&error)); |
248 | | Ok(()) |
249 | | } |
250 | | } |