/Users/andrewlamb/Software/datafusion/datafusion/common/src/column.rs
Line | Count | Source (jump to first uncovered line) |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | //! Column |
19 | | |
20 | | use arrow_schema::{Field, FieldRef}; |
21 | | |
22 | | use crate::error::_schema_err; |
23 | | use crate::utils::{parse_identifiers_normalized, quote_identifier}; |
24 | | use crate::{DFSchema, DataFusionError, Result, SchemaError, TableReference}; |
25 | | use std::collections::HashSet; |
26 | | use std::convert::Infallible; |
27 | | use std::fmt; |
28 | | use std::str::FromStr; |
29 | | |
30 | | /// A named reference to a qualified field in a schema. |
31 | | #[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] |
32 | | pub struct Column { |
33 | | /// relation/table reference. |
34 | | pub relation: Option<TableReference>, |
35 | | /// field/column name. |
36 | | pub name: String, |
37 | | } |
38 | | |
39 | | impl Column { |
40 | | /// Create Column from optional qualifier and name. The optional qualifier, if present, |
41 | | /// will be parsed and normalized by default. |
42 | | /// |
43 | | /// See full details on [`TableReference::parse_str`] |
44 | | /// |
45 | | /// [`TableReference::parse_str`]: crate::TableReference::parse_str |
46 | 0 | pub fn new( |
47 | 0 | relation: Option<impl Into<TableReference>>, |
48 | 0 | name: impl Into<String>, |
49 | 0 | ) -> Self { |
50 | 0 | Self { |
51 | 0 | relation: relation.map(|r| r.into()), |
52 | 0 | name: name.into(), |
53 | 0 | } |
54 | 0 | } |
55 | | |
56 | | /// Convenience method for when there is no qualifier |
57 | 0 | pub fn new_unqualified(name: impl Into<String>) -> Self { |
58 | 0 | Self { |
59 | 0 | relation: None, |
60 | 0 | name: name.into(), |
61 | 0 | } |
62 | 0 | } |
63 | | |
64 | | /// Create Column from unqualified name. |
65 | | /// |
66 | | /// Alias for `Column::new_unqualified` |
67 | 0 | pub fn from_name(name: impl Into<String>) -> Self { |
68 | 0 | Self { |
69 | 0 | relation: None, |
70 | 0 | name: name.into(), |
71 | 0 | } |
72 | 0 | } |
73 | | |
74 | 0 | fn from_idents(idents: &mut Vec<String>) -> Option<Self> { |
75 | 0 | let (relation, name) = match idents.len() { |
76 | 0 | 1 => (None, idents.remove(0)), |
77 | 0 | 2 => ( |
78 | 0 | Some(TableReference::Bare { |
79 | 0 | table: idents.remove(0).into(), |
80 | 0 | }), |
81 | 0 | idents.remove(0), |
82 | 0 | ), |
83 | 0 | 3 => ( |
84 | 0 | Some(TableReference::Partial { |
85 | 0 | schema: idents.remove(0).into(), |
86 | 0 | table: idents.remove(0).into(), |
87 | 0 | }), |
88 | 0 | idents.remove(0), |
89 | 0 | ), |
90 | 0 | 4 => ( |
91 | 0 | Some(TableReference::Full { |
92 | 0 | catalog: idents.remove(0).into(), |
93 | 0 | schema: idents.remove(0).into(), |
94 | 0 | table: idents.remove(0).into(), |
95 | 0 | }), |
96 | 0 | idents.remove(0), |
97 | 0 | ), |
98 | | // any expression that failed to parse or has more than 4 period delimited |
99 | | // identifiers will be treated as an unqualified column name |
100 | 0 | _ => return None, |
101 | | }; |
102 | 0 | Some(Self { relation, name }) |
103 | 0 | } |
104 | | |
105 | | /// Deserialize a fully qualified name string into a column |
106 | | /// |
107 | | /// Treats the name as a SQL identifier. For example |
108 | | /// `foo.BAR` would be parsed to a reference to relation `foo`, column name `bar` (lower case) |
109 | | /// where `"foo.BAR"` would be parsed to a reference to column named `foo.BAR` |
110 | 0 | pub fn from_qualified_name(flat_name: impl Into<String>) -> Self { |
111 | 0 | let flat_name = flat_name.into(); |
112 | 0 | Self::from_idents(&mut parse_identifiers_normalized(&flat_name, false)) |
113 | 0 | .unwrap_or_else(|| Self { |
114 | 0 | relation: None, |
115 | 0 | name: flat_name, |
116 | 0 | }) |
117 | 0 | } |
118 | | |
119 | | /// Deserialize a fully qualified name string into a column preserving column text case |
120 | 0 | pub fn from_qualified_name_ignore_case(flat_name: impl Into<String>) -> Self { |
121 | 0 | let flat_name = flat_name.into(); |
122 | 0 | Self::from_idents(&mut parse_identifiers_normalized(&flat_name, true)) |
123 | 0 | .unwrap_or_else(|| Self { |
124 | 0 | relation: None, |
125 | 0 | name: flat_name, |
126 | 0 | }) |
127 | 0 | } |
128 | | |
129 | | /// return the column's name. |
130 | | /// |
131 | | /// Note: This ignores the relation and returns the column name only. |
132 | 0 | pub fn name(&self) -> &str { |
133 | 0 | &self.name |
134 | 0 | } |
135 | | |
136 | | /// Serialize column into a flat name string |
137 | 0 | pub fn flat_name(&self) -> String { |
138 | 0 | match &self.relation { |
139 | 0 | Some(r) => format!("{}.{}", r, self.name), |
140 | 0 | None => self.name.clone(), |
141 | | } |
142 | 0 | } |
143 | | |
144 | | /// Serialize column into a quoted flat name string |
145 | 0 | pub fn quoted_flat_name(&self) -> String { |
146 | 0 | match &self.relation { |
147 | 0 | Some(r) => { |
148 | 0 | format!( |
149 | 0 | "{}.{}", |
150 | 0 | r.to_quoted_string(), |
151 | 0 | quote_identifier(self.name.as_str()) |
152 | 0 | ) |
153 | | } |
154 | 0 | None => quote_identifier(&self.name).to_string(), |
155 | | } |
156 | 0 | } |
157 | | |
158 | | /// Qualify column if not done yet. |
159 | | /// |
160 | | /// If this column already has a [relation](Self::relation), it will be returned as is and the given parameters are |
161 | | /// ignored. Otherwise this will search through the given schemas to find the column. |
162 | | /// |
163 | | /// Will check for ambiguity at each level of `schemas`. |
164 | | /// |
165 | | /// A schema matches if there is a single column that -- when unqualified -- matches this column. There is an |
166 | | /// exception for `USING` statements, see below. |
167 | | /// |
168 | | /// # Using columns |
169 | | /// Take the following SQL statement: |
170 | | /// |
171 | | /// ```sql |
172 | | /// SELECT id FROM t1 JOIN t2 USING(id) |
173 | | /// ``` |
174 | | /// |
175 | | /// In this case, both `t1.id` and `t2.id` will match unqualified column `id`. To express this possibility, use |
176 | | /// `using_columns`. Each entry in this array is a set of columns that are bound together via a `USING` clause. So |
177 | | /// in this example this would be `[{t1.id, t2.id}]`. |
178 | | /// |
179 | | /// Regarding ambiguity check, `schemas` is structured to allow levels of schemas to be passed in. |
180 | | /// For example: |
181 | | /// |
182 | | /// ```text |
183 | | /// schemas = &[ |
184 | | /// &[schema1, schema2], // first level |
185 | | /// &[schema3, schema4], // second level |
186 | | /// ] |
187 | | /// ``` |
188 | | /// |
189 | | /// Will search for a matching field in all schemas in the first level. If a matching field according to above |
190 | | /// mentioned conditions is not found, then will check the next level. If found more than one matching column across |
191 | | /// all schemas in a level, that isn't a USING column, will return an error due to ambiguous column. |
192 | | /// |
193 | | /// If checked all levels and couldn't find field, will return field not found error. |
194 | 0 | pub fn normalize_with_schemas_and_ambiguity_check( |
195 | 0 | self, |
196 | 0 | schemas: &[&[&DFSchema]], |
197 | 0 | using_columns: &[HashSet<Column>], |
198 | 0 | ) -> Result<Self> { |
199 | 0 | if self.relation.is_some() { |
200 | 0 | return Ok(self); |
201 | 0 | } |
202 | | |
203 | 0 | for schema_level in schemas { |
204 | 0 | let qualified_fields = schema_level |
205 | 0 | .iter() |
206 | 0 | .flat_map(|s| s.qualified_fields_with_unqualified_name(&self.name)) |
207 | 0 | .collect::<Vec<_>>(); |
208 | 0 | match qualified_fields.len() { |
209 | 0 | 0 => continue, |
210 | 0 | 1 => return Ok(Column::from(qualified_fields[0])), |
211 | | _ => { |
212 | | // More than 1 fields in this schema have their names set to self.name. |
213 | | // |
214 | | // This should only happen when a JOIN query with USING constraint references |
215 | | // join columns using unqualified column name. For example: |
216 | | // |
217 | | // ```sql |
218 | | // SELECT id FROM t1 JOIN t2 USING(id) |
219 | | // ``` |
220 | | // |
221 | | // In this case, both `t1.id` and `t2.id` will match unqualified column `id`. |
222 | | // We will use the relation from the first matched field to normalize self. |
223 | | |
224 | | // Compare matched fields with one USING JOIN clause at a time |
225 | 0 | let columns = schema_level |
226 | 0 | .iter() |
227 | 0 | .flat_map(|s| s.columns_with_unqualified_name(&self.name)) |
228 | 0 | .collect::<Vec<_>>(); |
229 | 0 | for using_col in using_columns { |
230 | 0 | let all_matched = columns.iter().all(|c| using_col.contains(c)); |
231 | 0 | // All matched fields belong to the same using column set, in orther words |
232 | 0 | // the same join clause. We simply pick the qualifier from the first match. |
233 | 0 | if all_matched { |
234 | 0 | return Ok(columns[0].clone()); |
235 | 0 | } |
236 | | } |
237 | | |
238 | | // If not due to USING columns then due to ambiguous column name |
239 | 0 | return _schema_err!(SchemaError::AmbiguousReference { |
240 | 0 | field: Column::new_unqualified(self.name), |
241 | 0 | }); |
242 | | } |
243 | | } |
244 | | } |
245 | | |
246 | 0 | _schema_err!(SchemaError::FieldNotFound { |
247 | 0 | field: Box::new(self), |
248 | 0 | valid_fields: schemas |
249 | 0 | .iter() |
250 | 0 | .flat_map(|s| s.iter()) |
251 | 0 | .flat_map(|s| s.columns()) |
252 | 0 | .collect(), |
253 | 0 | }) |
254 | 0 | } |
255 | | } |
256 | | |
257 | | impl From<&str> for Column { |
258 | 0 | fn from(c: &str) -> Self { |
259 | 0 | Self::from_qualified_name(c) |
260 | 0 | } |
261 | | } |
262 | | |
263 | | /// Create a column, cloning the string |
264 | | impl From<&String> for Column { |
265 | 0 | fn from(c: &String) -> Self { |
266 | 0 | Self::from_qualified_name(c) |
267 | 0 | } |
268 | | } |
269 | | |
270 | | /// Create a column, reusing the existing string |
271 | | impl From<String> for Column { |
272 | 0 | fn from(c: String) -> Self { |
273 | 0 | Self::from_qualified_name(c) |
274 | 0 | } |
275 | | } |
276 | | |
277 | | /// Create a column, use qualifier and field name |
278 | | impl From<(Option<&TableReference>, &Field)> for Column { |
279 | 0 | fn from((relation, field): (Option<&TableReference>, &Field)) -> Self { |
280 | 0 | Self::new(relation.cloned(), field.name()) |
281 | 0 | } |
282 | | } |
283 | | |
284 | | /// Create a column, use qualifier and field name |
285 | | impl From<(Option<&TableReference>, &FieldRef)> for Column { |
286 | 0 | fn from((relation, field): (Option<&TableReference>, &FieldRef)) -> Self { |
287 | 0 | Self::new(relation.cloned(), field.name()) |
288 | 0 | } |
289 | | } |
290 | | |
291 | | impl FromStr for Column { |
292 | | type Err = Infallible; |
293 | | |
294 | 0 | fn from_str(s: &str) -> Result<Self, Self::Err> { |
295 | 0 | Ok(s.into()) |
296 | 0 | } |
297 | | } |
298 | | |
299 | | impl fmt::Display for Column { |
300 | 0 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
301 | 0 | write!(f, "{}", self.flat_name()) |
302 | 0 | } |
303 | | } |
304 | | |
#[cfg(test)]
mod tests {
    use super::*;
    use arrow::datatypes::DataType;
    use arrow_schema::SchemaBuilder;
    use std::sync::Arc;

    /// Builds a schema of nullable boolean fields named after `names`, every field
    /// qualified with `qualifier`.
    fn create_qualified_schema(qualifier: &str, names: Vec<&str>) -> Result<DFSchema> {
        let fields = names
            .iter()
            .map(|name| Field::new(*name, DataType::Boolean, true));
        let mut builder = SchemaBuilder::new();
        builder.extend(fields);
        let schema = Arc::new(builder.finish());
        DFSchema::try_from_qualified_schema(qualifier, &schema)
    }

    #[test]
    fn test_normalize_with_schemas_and_ambiguity_check() -> Result<()> {
        let schema1 = create_qualified_schema("t1", vec!["a", "b"])?;
        let schema2 = create_qualified_schema("t2", vec!["c", "d"])?;
        let schema3 = create_qualified_schema("t3", vec!["a", "b", "c", "d", "e"])?;

        // The two level layouts exercised below.
        let t1_t2 = [&schema1, &schema2];
        let t1_t3 = [&schema1, &schema3];
        let only_t2 = [&schema2];
        let only_t3 = [&schema3];
        let t1_t2_then_t3: [&[&DFSchema]; 2] = [&t1_t2, &only_t3];
        let t1_t3_then_t2: [&[&DFSchema]; 2] = [&t1_t3, &only_t2];

        // already normalized
        let normalized = Column::new(Some("t1"), "a")
            .normalize_with_schemas_and_ambiguity_check(&[], &[])?;
        assert_eq!(normalized, Column::new(Some("t1"), "a"));

        // should find in first level (schema1)
        let normalized = Column::from_name("a")
            .normalize_with_schemas_and_ambiguity_check(&t1_t2_then_t3, &[])?;
        assert_eq!(normalized, Column::new(Some("t1"), "a"));

        // should find in second level (schema3)
        let normalized = Column::from_name("e")
            .normalize_with_schemas_and_ambiguity_check(&t1_t2_then_t3, &[])?;
        assert_eq!(normalized, Column::new(Some("t3"), "e"));

        // using column in first level (pick schema1)
        let using_columns: HashSet<Column> =
            vec![Column::new(Some("t1"), "a"), Column::new(Some("t3"), "a")]
                .into_iter()
                .collect();
        let normalized = Column::from_name("a")
            .normalize_with_schemas_and_ambiguity_check(&t1_t3_then_t2, &[using_columns])?;
        assert_eq!(normalized, Column::new(Some("t1"), "a"));

        // not found in any level
        let not_found = Column::from_name("z")
            .normalize_with_schemas_and_ambiguity_check(&t1_t2_then_t3, &[])
            .expect_err("should've failed to find field");
        let expected = r#"Schema error: No field named z. Valid fields are t1.a, t1.b, t2.c, t2.d, t3.a, t3.b, t3.c, t3.d, t3.e."#;
        assert_eq!(not_found.strip_backtrace(), expected);

        // ambiguous column reference
        let ambiguous = Column::from_name("a")
            .normalize_with_schemas_and_ambiguity_check(&t1_t3_then_t2, &[])
            .expect_err("should've found ambiguous field");
        let expected = "Schema error: Ambiguous reference to unqualified field a";
        assert_eq!(ambiguous.strip_backtrace(), expected);

        Ok(())
    }
}