Coverage Report

Created: 2024-10-13 08:39

/Users/andrewlamb/Software/datafusion/datafusion/physical-expr/src/expressions/column.rs
Line
Count
Source (jump to first uncovered line)
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
//! Physical column reference: [`Column`]
19
20
use std::any::Any;
21
use std::hash::{Hash, Hasher};
22
use std::sync::Arc;
23
24
use arrow::{
25
    datatypes::{DataType, Schema},
26
    record_batch::RecordBatch,
27
};
28
use arrow_schema::SchemaRef;
29
use datafusion_common::tree_node::{Transformed, TreeNode};
30
use datafusion_common::{internal_err, plan_err, Result};
31
use datafusion_expr::ColumnarValue;
32
33
use crate::physical_expr::{down_cast_any_ref, PhysicalExpr};
34
35
/// Represents the column at a given index in a RecordBatch
36
///
37
/// This is a physical expression that represents a column at a given index in an
38
/// arrow [`Schema`] / [`RecordBatch`].
39
///
40
/// Unlike the [logical `Expr::Column`], this expression is always resolved by schema index,
41
/// even though it does have a name. This is because the physical plan is always
42
/// resolved to a specific schema and there is no concept of "relation".
43
///
44
/// # Example:
45
///  If the schema is `a`, `b`, `c` the `Column` for `b` would be represented by
46
///  index 1, since `b` is the second column in the schema.
47
///
48
/// ```
49
/// # use datafusion_physical_expr::expressions::Column;
50
/// # use arrow::datatypes::{DataType, Field, Schema};
51
/// // Schema with columns a, b, c
52
/// let schema = Schema::new(vec![
53
///    Field::new("a", DataType::Int32, false),
54
///    Field::new("b", DataType::Int32, false),
55
///    Field::new("c", DataType::Int32, false),
56
/// ]);
57
///
58
/// // reference to column b is index 1
59
/// let column_b = Column::new_with_schema("b", &schema).unwrap();
60
/// assert_eq!(column_b.index(), 1);
61
///
62
/// // reference to column c is index 2
63
/// let column_c = Column::new_with_schema("c", &schema).unwrap();
64
/// assert_eq!(column_c.index(), 2);
65
/// ```
66
/// [logical `Expr::Column`]: https://docs.rs/datafusion/latest/datafusion/logical_expr/enum.Expr.html#variant.Column
67
#[derive(Debug, Hash, PartialEq, Eq, Clone)]
68
pub struct Column {
69
    /// The name of the column (used for debugging and display purposes)
70
    name: String,
71
    /// The index of the column in its schema
72
    index: usize,
73
}
74
75
impl Column {
76
    /// Create a new column expression which references the
77
    /// column with the given index in the schema.
78
26.4k
    pub fn new(name: &str, index: usize) -> Self {
79
26.4k
        Self {
80
26.4k
            name: name.to_owned(),
81
26.4k
            index,
82
26.4k
        }
83
26.4k
    }
84
85
    /// Create a new column expression which references the
86
    /// column with the given name in the schema
87
5.14k
    pub fn new_with_schema(name: &str, schema: &Schema) -> Result<Self> {
88
5.14k
        Ok(Column::new(name, schema.index_of(name)
?0
))
89
5.14k
    }
90
91
    /// Get the column's name
92
808
    pub fn name(&self) -> &str {
93
808
        &self.name
94
808
    }
95
96
    /// Get the column's schema index
97
945
    pub fn index(&self) -> usize {
98
945
        self.index
99
945
    }
100
}
101
102
impl std::fmt::Display for Column {
103
30
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
104
30
        write!(f, "{}@{}", self.name, self.index)
105
30
    }
106
}
107
108
impl PhysicalExpr for Column {
109
    /// Return a reference to Any that can be used for downcasting
110
112k
    fn as_any(&self) -> &dyn std::any::Any {
111
112k
        self
112
112k
    }
113
114
    /// Get the data type of this expression, given the schema of the input
115
79.9k
    fn data_type(&self, input_schema: &Schema) -> Result<DataType> {
116
79.9k
        self.bounds_check(input_schema)
?0
;
117
79.9k
        Ok(input_schema.field(self.index).data_type().clone())
118
79.9k
    }
119
120
    /// Decide whether this expression is nullable, given the schema of the input
121
138
    fn nullable(&self, input_schema: &Schema) -> Result<bool> {
122
138
        self.bounds_check(input_schema)
?0
;
123
138
        Ok(input_schema.field(self.index).is_nullable())
124
138
    }
125
126
    /// Evaluate the expression
127
129k
    fn evaluate(&self, batch: &RecordBatch) -> Result<ColumnarValue> {
128
129k
        self.bounds_check(batch.schema().as_ref())
?0
;
129
129k
        Ok(ColumnarValue::Array(Arc::clone(batch.column(self.index))))
130
129k
    }
131
132
21.3k
    fn children(&self) -> Vec<&Arc<dyn PhysicalExpr>> {
133
21.3k
        vec![]
134
21.3k
    }
135
136
0
    fn with_new_children(
137
0
        self: Arc<Self>,
138
0
        _children: Vec<Arc<dyn PhysicalExpr>>,
139
0
    ) -> Result<Arc<dyn PhysicalExpr>> {
140
0
        Ok(self)
141
0
    }
142
143
20
    fn dyn_hash(&self, state: &mut dyn Hasher) {
144
20
        let mut s = state;
145
20
        self.hash(&mut s);
146
20
    }
147
}
148
149
impl PartialEq<dyn Any> for Column {
150
39.0k
    fn eq(&self, other: &dyn Any) -> bool {
151
39.0k
        down_cast_any_ref(other)
152
39.0k
            .downcast_ref::<Self>()
153
39.0k
            .map(|x| 
self == x32.0k
)
154
39.0k
            .unwrap_or(false)
155
39.0k
    }
156
}
157
158
impl Column {
159
209k
    fn bounds_check(&self, input_schema: &Schema) -> Result<()> {
160
209k
        if self.index < input_schema.fields.len() {
161
209k
            Ok(())
162
        } else {
163
0
            internal_err!(
164
0
                "PhysicalExpr Column references column '{}' at index {} (zero-based) but input schema only has {} columns: {:?}",
165
0
                self.name,
166
0
                self.index,
167
0
                input_schema.fields.len(),
168
0
                input_schema.fields().iter().map(|f| f.name()).collect::<Vec<_>>()
169
0
            )
170
        }
171
209k
    }
172
}
173
174
/// Create a column expression
175
1.82k
pub fn col(name: &str, schema: &Schema) -> Result<Arc<dyn PhysicalExpr>> {
176
1.82k
    Ok(Arc::new(Column::new_with_schema(name, schema)
?0
))
177
1.82k
}
178
179
/// Rewrites an expression according to new schema; i.e. changes the columns it
180
/// refers to with the column at corresponding index in the new schema. Returns
181
/// an error if the given schema has fewer columns than the original schema.
182
/// Note that the resulting expression may not be valid if data types in the
183
/// new schema are incompatible with expression nodes.
184
0
pub fn with_new_schema(
185
0
    expr: Arc<dyn PhysicalExpr>,
186
0
    schema: &SchemaRef,
187
0
) -> Result<Arc<dyn PhysicalExpr>> {
188
0
    Ok(expr
189
0
        .transform_up(|expr| {
190
0
            if let Some(col) = expr.as_any().downcast_ref::<Column>() {
191
0
                let idx = col.index();
192
0
                let Some(field) = schema.fields().get(idx) else {
193
0
                    return plan_err!(
194
0
                        "New schema has fewer columns than original schema"
195
0
                    );
196
                };
197
0
                let new_col = Column::new(field.name(), idx);
198
0
                Ok(Transformed::yes(Arc::new(new_col) as _))
199
            } else {
200
0
                Ok(Transformed::no(expr))
201
            }
202
0
        })?
203
        .data)
204
0
}
205
206
#[cfg(test)]
207
mod test {
208
    use super::Column;
209
    use crate::physical_expr::PhysicalExpr;
210
211
    use arrow::array::StringArray;
212
    use arrow::datatypes::{DataType, Field, Schema};
213
    use arrow::record_batch::RecordBatch;
214
    use datafusion_common::Result;
215
216
    use std::sync::Arc;
217
218
    #[test]
219
    fn out_of_bounds_data_type() {
220
        let schema = Schema::new(vec![Field::new("foo", DataType::Utf8, true)]);
221
        let col = Column::new("id", 9);
222
        let error = col.data_type(&schema).expect_err("error").strip_backtrace();
223
        assert!("Internal error: PhysicalExpr Column references column 'id' at index 9 (zero-based) \
224
            but input schema only has 1 columns: [\"foo\"].\nThis was likely caused by a bug in \
225
            DataFusion's code and we would welcome that you file an bug report in our issue tracker".starts_with(&error))
226
    }
227
228
    #[test]
229
    fn out_of_bounds_nullable() {
230
        let schema = Schema::new(vec![Field::new("foo", DataType::Utf8, true)]);
231
        let col = Column::new("id", 9);
232
        let error = col.nullable(&schema).expect_err("error").strip_backtrace();
233
        assert!("Internal error: PhysicalExpr Column references column 'id' at index 9 (zero-based) \
234
            but input schema only has 1 columns: [\"foo\"].\nThis was likely caused by a bug in \
235
            DataFusion's code and we would welcome that you file an bug report in our issue tracker".starts_with(&error))
236
    }
237
238
    #[test]
239
    fn out_of_bounds_evaluate() -> Result<()> {
240
        let schema = Schema::new(vec![Field::new("foo", DataType::Utf8, true)]);
241
        let data: StringArray = vec!["data"].into();
242
        let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(data)])?;
243
        let col = Column::new("id", 9);
244
        let error = col.evaluate(&batch).expect_err("error").strip_backtrace();
245
        assert!("Internal error: PhysicalExpr Column references column 'id' at index 9 (zero-based) \
246
            but input schema only has 1 columns: [\"foo\"].\nThis was likely caused by a bug in \
247
            DataFusion's code and we would welcome that you file an bug report in our issue tracker".starts_with(&error));
248
        Ok(())
249
    }
250
}