Coverage Report

Created: 2024-10-13 08:39

/Users/andrewlamb/Software/datafusion/datafusion/common/src/column.rs
Line
Count
Source (jump to first uncovered line)
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
//! Column
19
20
use arrow_schema::{Field, FieldRef};
21
22
use crate::error::_schema_err;
23
use crate::utils::{parse_identifiers_normalized, quote_identifier};
24
use crate::{DFSchema, DataFusionError, Result, SchemaError, TableReference};
25
use std::collections::HashSet;
26
use std::convert::Infallible;
27
use std::fmt;
28
use std::str::FromStr;
29
30
/// A named reference to a qualified field in a schema.
31
#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)]
32
pub struct Column {
33
    /// relation/table reference.
34
    pub relation: Option<TableReference>,
35
    /// field/column name.
36
    pub name: String,
37
}
38
39
impl Column {
40
    /// Create Column from optional qualifier and name. The optional qualifier, if present,
41
    /// will be parsed and normalized by default.
42
    ///
43
    /// See full details on [`TableReference::parse_str`]
44
    ///
45
    /// [`TableReference::parse_str`]: crate::TableReference::parse_str
46
0
    pub fn new(
47
0
        relation: Option<impl Into<TableReference>>,
48
0
        name: impl Into<String>,
49
0
    ) -> Self {
50
0
        Self {
51
0
            relation: relation.map(|r| r.into()),
52
0
            name: name.into(),
53
0
        }
54
0
    }
55
56
    /// Convenience method for when there is no qualifier
57
0
    pub fn new_unqualified(name: impl Into<String>) -> Self {
58
0
        Self {
59
0
            relation: None,
60
0
            name: name.into(),
61
0
        }
62
0
    }
63
64
    /// Create Column from unqualified name.
65
    ///
66
    /// Alias for `Column::new_unqualified`
67
0
    pub fn from_name(name: impl Into<String>) -> Self {
68
0
        Self {
69
0
            relation: None,
70
0
            name: name.into(),
71
0
        }
72
0
    }
73
74
0
    fn from_idents(idents: &mut Vec<String>) -> Option<Self> {
75
0
        let (relation, name) = match idents.len() {
76
0
            1 => (None, idents.remove(0)),
77
0
            2 => (
78
0
                Some(TableReference::Bare {
79
0
                    table: idents.remove(0).into(),
80
0
                }),
81
0
                idents.remove(0),
82
0
            ),
83
0
            3 => (
84
0
                Some(TableReference::Partial {
85
0
                    schema: idents.remove(0).into(),
86
0
                    table: idents.remove(0).into(),
87
0
                }),
88
0
                idents.remove(0),
89
0
            ),
90
0
            4 => (
91
0
                Some(TableReference::Full {
92
0
                    catalog: idents.remove(0).into(),
93
0
                    schema: idents.remove(0).into(),
94
0
                    table: idents.remove(0).into(),
95
0
                }),
96
0
                idents.remove(0),
97
0
            ),
98
            // any expression that failed to parse or has more than 4 period delimited
99
            // identifiers will be treated as an unqualified column name
100
0
            _ => return None,
101
        };
102
0
        Some(Self { relation, name })
103
0
    }
104
105
    /// Deserialize a fully qualified name string into a column
106
    ///
107
    /// Treats the name as a SQL identifier. For example
108
    /// `foo.BAR` would be parsed to a reference to relation `foo`, column name `bar` (lower case)
109
    /// where `"foo.BAR"` would be parsed to a reference to column named `foo.BAR`
110
0
    pub fn from_qualified_name(flat_name: impl Into<String>) -> Self {
111
0
        let flat_name = flat_name.into();
112
0
        Self::from_idents(&mut parse_identifiers_normalized(&flat_name, false))
113
0
            .unwrap_or_else(|| Self {
114
0
                relation: None,
115
0
                name: flat_name,
116
0
            })
117
0
    }
118
119
    /// Deserialize a fully qualified name string into a column preserving column text case
120
0
    pub fn from_qualified_name_ignore_case(flat_name: impl Into<String>) -> Self {
121
0
        let flat_name = flat_name.into();
122
0
        Self::from_idents(&mut parse_identifiers_normalized(&flat_name, true))
123
0
            .unwrap_or_else(|| Self {
124
0
                relation: None,
125
0
                name: flat_name,
126
0
            })
127
0
    }
128
129
    /// return the column's name.
130
    ///
131
    /// Note: This ignores the relation and returns the column name only.
132
0
    pub fn name(&self) -> &str {
133
0
        &self.name
134
0
    }
135
136
    /// Serialize column into a flat name string
137
0
    pub fn flat_name(&self) -> String {
138
0
        match &self.relation {
139
0
            Some(r) => format!("{}.{}", r, self.name),
140
0
            None => self.name.clone(),
141
        }
142
0
    }
143
144
    /// Serialize column into a quoted flat name string
145
0
    pub fn quoted_flat_name(&self) -> String {
146
0
        match &self.relation {
147
0
            Some(r) => {
148
0
                format!(
149
0
                    "{}.{}",
150
0
                    r.to_quoted_string(),
151
0
                    quote_identifier(self.name.as_str())
152
0
                )
153
            }
154
0
            None => quote_identifier(&self.name).to_string(),
155
        }
156
0
    }
157
158
    /// Qualify column if not done yet.
159
    ///
160
    /// If this column already has a [relation](Self::relation), it will be returned as is and the given parameters are
161
    /// ignored. Otherwise this will search through the given schemas to find the column.
162
    ///
163
    /// Will check for ambiguity at each level of `schemas`.
164
    ///
165
    /// A schema matches if there is a single column that -- when unqualified -- matches this column. There is an
166
    /// exception for `USING` statements, see below.
167
    ///
168
    /// # Using columns
169
    /// Take the following SQL statement:
170
    ///
171
    /// ```sql
172
    /// SELECT id FROM t1 JOIN t2 USING(id)
173
    /// ```
174
    ///
175
    /// In this case, both `t1.id` and `t2.id` will match unqualified column `id`. To express this possibility, use
176
    /// `using_columns`. Each entry in this array is a set of columns that are bound together via a `USING` clause. So
177
    /// in this example this would be `[{t1.id, t2.id}]`.
178
    ///
179
    /// Regarding ambiguity check, `schemas` is structured to allow levels of schemas to be passed in.
180
    /// For example:
181
    ///
182
    /// ```text
183
    /// schemas = &[
184
    ///    &[schema1, schema2], // first level
185
    ///    &[schema3, schema4], // second level
186
    /// ]
187
    /// ```
188
    ///
189
    /// Will search for a matching field in all schemas in the first level. If a matching field according to above
190
    /// mentioned conditions is not found, then will check the next level. If found more than one matching column across
191
    /// all schemas in a level, that isn't a USING column, will return an error due to ambiguous column.
192
    ///
193
    /// If all levels have been checked and no matching field was found, a field-not-found error is returned.
194
0
    pub fn normalize_with_schemas_and_ambiguity_check(
195
0
        self,
196
0
        schemas: &[&[&DFSchema]],
197
0
        using_columns: &[HashSet<Column>],
198
0
    ) -> Result<Self> {
199
0
        if self.relation.is_some() {
200
0
            return Ok(self);
201
0
        }
202
203
0
        for schema_level in schemas {
204
0
            let qualified_fields = schema_level
205
0
                .iter()
206
0
                .flat_map(|s| s.qualified_fields_with_unqualified_name(&self.name))
207
0
                .collect::<Vec<_>>();
208
0
            match qualified_fields.len() {
209
0
                0 => continue,
210
0
                1 => return Ok(Column::from(qualified_fields[0])),
211
                _ => {
212
                    // More than one field in this schema has its name set to self.name.
213
                    //
214
                    // This should only happen when a JOIN query with USING constraint references
215
                    // join columns using unqualified column name. For example:
216
                    //
217
                    // ```sql
218
                    // SELECT id FROM t1 JOIN t2 USING(id)
219
                    // ```
220
                    //
221
                    // In this case, both `t1.id` and `t2.id` will match unqualified column `id`.
222
                    // We will use the relation from the first matched field to normalize self.
223
224
                    // Compare matched fields with one USING JOIN clause at a time
225
0
                    let columns = schema_level
226
0
                        .iter()
227
0
                        .flat_map(|s| s.columns_with_unqualified_name(&self.name))
228
0
                        .collect::<Vec<_>>();
229
0
                    for using_col in using_columns {
230
0
                        let all_matched = columns.iter().all(|c| using_col.contains(c));
231
0
                        // All matched fields belong to the same using column set, in other words
232
0
                        // the same join clause. We simply pick the qualifier from the first match.
233
0
                        if all_matched {
234
0
                            return Ok(columns[0].clone());
235
0
                        }
236
                    }
237
238
                    // If not due to USING columns then due to ambiguous column name
239
0
                    return _schema_err!(SchemaError::AmbiguousReference {
240
0
                        field: Column::new_unqualified(self.name),
241
0
                    });
242
                }
243
            }
244
        }
245
246
0
        _schema_err!(SchemaError::FieldNotFound {
247
0
            field: Box::new(self),
248
0
            valid_fields: schemas
249
0
                .iter()
250
0
                .flat_map(|s| s.iter())
251
0
                .flat_map(|s| s.columns())
252
0
                .collect(),
253
0
        })
254
0
    }
255
}
256
257
impl From<&str> for Column {
258
0
    fn from(c: &str) -> Self {
259
0
        Self::from_qualified_name(c)
260
0
    }
261
}
262
263
/// Create a column, cloning the string
264
impl From<&String> for Column {
265
0
    fn from(c: &String) -> Self {
266
0
        Self::from_qualified_name(c)
267
0
    }
268
}
269
270
/// Create a column, reusing the existing string
271
impl From<String> for Column {
272
0
    fn from(c: String) -> Self {
273
0
        Self::from_qualified_name(c)
274
0
    }
275
}
276
277
/// Create a column, use qualifier and field name
278
impl From<(Option<&TableReference>, &Field)> for Column {
279
0
    fn from((relation, field): (Option<&TableReference>, &Field)) -> Self {
280
0
        Self::new(relation.cloned(), field.name())
281
0
    }
282
}
283
284
/// Create a column, use qualifier and field name
285
impl From<(Option<&TableReference>, &FieldRef)> for Column {
286
0
    fn from((relation, field): (Option<&TableReference>, &FieldRef)) -> Self {
287
0
        Self::new(relation.cloned(), field.name())
288
0
    }
289
}
290
291
impl FromStr for Column {
292
    type Err = Infallible;
293
294
0
    fn from_str(s: &str) -> Result<Self, Self::Err> {
295
0
        Ok(s.into())
296
0
    }
297
}
298
299
impl fmt::Display for Column {
300
0
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
301
0
        write!(f, "{}", self.flat_name())
302
0
    }
303
}
304
305
#[cfg(test)]
306
mod tests {
307
    use super::*;
308
    use arrow::datatypes::DataType;
309
    use arrow_schema::SchemaBuilder;
310
    use std::sync::Arc;
311
312
    fn create_qualified_schema(qualifier: &str, names: Vec<&str>) -> Result<DFSchema> {
313
        let mut schema_builder = SchemaBuilder::new();
314
        schema_builder.extend(
315
            names
316
                .iter()
317
                .map(|f| Field::new(*f, DataType::Boolean, true)),
318
        );
319
        let schema = Arc::new(schema_builder.finish());
320
        DFSchema::try_from_qualified_schema(qualifier, &schema)
321
    }
322
323
    #[test]
324
    fn test_normalize_with_schemas_and_ambiguity_check() -> Result<()> {
325
        let schema1 = create_qualified_schema("t1", vec!["a", "b"])?;
326
        let schema2 = create_qualified_schema("t2", vec!["c", "d"])?;
327
        let schema3 = create_qualified_schema("t3", vec!["a", "b", "c", "d", "e"])?;
328
329
        // already normalized
330
        let col = Column::new(Some("t1"), "a");
331
        let col = col.normalize_with_schemas_and_ambiguity_check(&[], &[])?;
332
        assert_eq!(col, Column::new(Some("t1"), "a"));
333
334
        // should find in first level (schema1)
335
        let col = Column::from_name("a");
336
        let col = col.normalize_with_schemas_and_ambiguity_check(
337
            &[&[&schema1, &schema2], &[&schema3]],
338
            &[],
339
        )?;
340
        assert_eq!(col, Column::new(Some("t1"), "a"));
341
342
        // should find in second level (schema3)
343
        let col = Column::from_name("e");
344
        let col = col.normalize_with_schemas_and_ambiguity_check(
345
            &[&[&schema1, &schema2], &[&schema3]],
346
            &[],
347
        )?;
348
        assert_eq!(col, Column::new(Some("t3"), "e"));
349
350
        // using column in first level (pick schema1)
351
        let mut using_columns = HashSet::new();
352
        using_columns.insert(Column::new(Some("t1"), "a"));
353
        using_columns.insert(Column::new(Some("t3"), "a"));
354
        let col = Column::from_name("a");
355
        let col = col.normalize_with_schemas_and_ambiguity_check(
356
            &[&[&schema1, &schema3], &[&schema2]],
357
            &[using_columns],
358
        )?;
359
        assert_eq!(col, Column::new(Some("t1"), "a"));
360
361
        // not found in any level
362
        let col = Column::from_name("z");
363
        let err = col
364
            .normalize_with_schemas_and_ambiguity_check(
365
                &[&[&schema1, &schema2], &[&schema3]],
366
                &[],
367
            )
368
            .expect_err("should've failed to find field");
369
        let expected = r#"Schema error: No field named z. Valid fields are t1.a, t1.b, t2.c, t2.d, t3.a, t3.b, t3.c, t3.d, t3.e."#;
370
        assert_eq!(err.strip_backtrace(), expected);
371
372
        // ambiguous column reference
373
        let col = Column::from_name("a");
374
        let err = col
375
            .normalize_with_schemas_and_ambiguity_check(
376
                &[&[&schema1, &schema3], &[&schema2]],
377
                &[],
378
            )
379
            .expect_err("should've found ambiguous field");
380
        let expected = "Schema error: Ambiguous reference to unqualified field a";
381
        assert_eq!(err.strip_backtrace(), expected);
382
383
        Ok(())
384
    }
385
}