/Users/andrewlamb/Software/datafusion/datafusion/common/src/dfschema.rs
Line | Count | Source (jump to first uncovered line) |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | //! DFSchema is an extended schema struct that DataFusion uses to provide support for |
19 | | //! fields with optional relation names. |
20 | | |
21 | | use std::collections::{BTreeSet, HashMap, HashSet}; |
22 | | use std::fmt::{Display, Formatter}; |
23 | | use std::hash::Hash; |
24 | | use std::sync::Arc; |
25 | | |
26 | | use crate::error::{DataFusionError, Result, _plan_err, _schema_err}; |
27 | | use crate::{ |
28 | | field_not_found, unqualified_field_not_found, Column, FunctionalDependencies, |
29 | | SchemaError, TableReference, |
30 | | }; |
31 | | |
32 | | use arrow::compute::can_cast_types; |
33 | | use arrow::datatypes::{DataType, Field, FieldRef, Fields, Schema, SchemaRef}; |
34 | | use arrow_schema::SchemaBuilder; |
35 | | |
/// A shared, reference-counted pointer to a [DFSchema].
///
/// Cloning a `DFSchemaRef` is cheap (an `Arc` refcount bump), so plans can
/// share the same schema freely.
pub type DFSchemaRef = Arc<DFSchema>;
38 | | |
39 | | /// DFSchema wraps an Arrow schema and adds relation names. |
40 | | /// |
41 | | /// The schema may hold the fields across multiple tables. Some fields may be |
42 | | /// qualified and some unqualified. A qualified field is a field that has a |
43 | | /// relation name associated with it. |
44 | | /// |
45 | | /// Unqualified fields must be unique not only amongst themselves, but also must |
46 | | /// have a distinct name from any qualified field names. This allows finding a |
47 | | /// qualified field by name to be possible, so long as there aren't multiple |
48 | | /// qualified fields with the same name. |
49 | | /// |
50 | | /// There is an alias to `Arc<DFSchema>` named [DFSchemaRef]. |
51 | | /// |
52 | | /// # Creating qualified schemas |
53 | | /// |
54 | | /// Use [DFSchema::try_from_qualified_schema] to create a qualified schema from |
55 | | /// an Arrow schema. |
56 | | /// |
57 | | /// ```rust |
58 | | /// use datafusion_common::{DFSchema, Column}; |
59 | | /// use arrow_schema::{DataType, Field, Schema}; |
60 | | /// |
61 | | /// let arrow_schema = Schema::new(vec![ |
62 | | /// Field::new("c1", DataType::Int32, false), |
63 | | /// ]); |
64 | | /// |
65 | | /// let df_schema = DFSchema::try_from_qualified_schema("t1", &arrow_schema).unwrap(); |
66 | | /// let column = Column::from_qualified_name("t1.c1"); |
67 | | /// assert!(df_schema.has_column(&column)); |
68 | | /// |
69 | | /// // Can also access qualified fields with unqualified name, if it's unambiguous |
70 | | /// let column = Column::from_qualified_name("c1"); |
71 | | /// assert!(df_schema.has_column(&column)); |
72 | | /// ``` |
73 | | /// |
74 | | /// # Creating unqualified schemas |
75 | | /// |
76 | | /// Create an unqualified schema using TryFrom: |
77 | | /// |
78 | | /// ```rust |
79 | | /// use datafusion_common::{DFSchema, Column}; |
80 | | /// use arrow_schema::{DataType, Field, Schema}; |
81 | | /// |
82 | | /// let arrow_schema = Schema::new(vec![ |
83 | | /// Field::new("c1", DataType::Int32, false), |
84 | | /// ]); |
85 | | /// |
86 | | /// let df_schema = DFSchema::try_from(arrow_schema).unwrap(); |
87 | | /// let column = Column::new_unqualified("c1"); |
88 | | /// assert!(df_schema.has_column(&column)); |
89 | | /// ``` |
90 | | /// |
91 | | /// # Converting back to Arrow schema |
92 | | /// |
93 | | /// Use the `Into` trait to convert `DFSchema` into an Arrow schema: |
94 | | /// |
95 | | /// ```rust |
96 | | /// use datafusion_common::DFSchema; |
97 | | /// use arrow_schema::Schema; |
98 | | /// use arrow::datatypes::Field; |
99 | | /// use std::collections::HashMap; |
100 | | /// |
101 | | /// let df_schema = DFSchema::from_unqualified_fields(vec![ |
102 | | /// Field::new("c1", arrow::datatypes::DataType::Int32, false), |
103 | | /// ].into(),HashMap::new()).unwrap(); |
104 | | /// let schema = Schema::from(df_schema); |
105 | | /// assert_eq!(schema.fields().len(), 1); |
106 | | /// ``` |
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct DFSchema {
    /// Inner Arrow schema reference (fields, data types and metadata).
    inner: SchemaRef,
    /// Optional qualifier (relation name) for each column, kept in the same
    /// order as `self.inner.fields()`; `None` marks an unqualified field.
    field_qualifiers: Vec<Option<TableReference>>,
    /// Functional dependencies known to hold for this schema
    /// (see [FunctionalDependencies]).
    functional_dependencies: FunctionalDependencies,
}
117 | | |
118 | | impl DFSchema { |
119 | | /// Creates an empty `DFSchema` |
120 | 0 | pub fn empty() -> Self { |
121 | 0 | Self { |
122 | 0 | inner: Arc::new(Schema::new([])), |
123 | 0 | field_qualifiers: vec![], |
124 | 0 | functional_dependencies: FunctionalDependencies::empty(), |
125 | 0 | } |
126 | 0 | } |
127 | | |
128 | | /// Return a reference to the inner Arrow [`Schema`] |
129 | | /// |
130 | | /// Note this does not have the qualifier information |
131 | 0 | pub fn as_arrow(&self) -> &Schema { |
132 | 0 | self.inner.as_ref() |
133 | 0 | } |
134 | | |
    /// Return a reference to the inner Arrow [`SchemaRef`]
    ///
    /// Note this does not have the qualifier information. Use this when the
    /// caller needs the `Arc`'d schema (e.g. to clone it cheaply).
    pub fn inner(&self) -> &SchemaRef {
        &self.inner
    }
141 | | |
142 | | /// Create a `DFSchema` from an Arrow schema where all the fields have a given qualifier |
143 | 0 | pub fn new_with_metadata( |
144 | 0 | qualified_fields: Vec<(Option<TableReference>, Arc<Field>)>, |
145 | 0 | metadata: HashMap<String, String>, |
146 | 0 | ) -> Result<Self> { |
147 | 0 | let (qualifiers, fields): (Vec<Option<TableReference>>, Vec<Arc<Field>>) = |
148 | 0 | qualified_fields.into_iter().unzip(); |
149 | 0 |
|
150 | 0 | let schema = Arc::new(Schema::new_with_metadata(fields, metadata)); |
151 | 0 |
|
152 | 0 | let dfschema = Self { |
153 | 0 | inner: schema, |
154 | 0 | field_qualifiers: qualifiers, |
155 | 0 | functional_dependencies: FunctionalDependencies::empty(), |
156 | 0 | }; |
157 | 0 | dfschema.check_names()?; |
158 | 0 | Ok(dfschema) |
159 | 0 | } |
160 | | |
161 | | /// Create a new `DFSchema` from a list of Arrow [Field]s |
162 | | #[allow(deprecated)] |
163 | 0 | pub fn from_unqualified_fields( |
164 | 0 | fields: Fields, |
165 | 0 | metadata: HashMap<String, String>, |
166 | 0 | ) -> Result<Self> { |
167 | 0 | Self::from_unqualifed_fields(fields, metadata) |
168 | 0 | } |
169 | | |
    /// Create a new `DFSchema` from a list of Arrow [Field]s
    ///
    /// Note: the name contains a typo ("unqualifed") and is kept only for
    /// backwards compatibility; use [`Self::from_unqualified_fields`] instead.
    #[deprecated(
        since = "40.0.0",
        note = "Please use `from_unqualified_fields` instead (this one's name is a typo). This method is subject to be removed soon"
    )]
    pub fn from_unqualifed_fields(
        fields: Fields,
        metadata: HashMap<String, String>,
    ) -> Result<Self> {
        // All fields are unqualified (`None` qualifier per field).
        let field_count = fields.len();
        let schema = Arc::new(Schema::new_with_metadata(fields, metadata));
        let dfschema = Self {
            inner: schema,
            field_qualifiers: vec![None; field_count],
            functional_dependencies: FunctionalDependencies::empty(),
        };
        // Reject duplicate / ambiguous field names.
        dfschema.check_names()?;
        Ok(dfschema)
    }
189 | | |
190 | | /// Create a `DFSchema` from an Arrow schema and a given qualifier |
191 | | /// |
192 | | /// To create a schema from an Arrow schema without a qualifier, use |
193 | | /// `DFSchema::try_from`. |
194 | 0 | pub fn try_from_qualified_schema( |
195 | 0 | qualifier: impl Into<TableReference>, |
196 | 0 | schema: &Schema, |
197 | 0 | ) -> Result<Self> { |
198 | 0 | let qualifier = qualifier.into(); |
199 | 0 | let schema = DFSchema { |
200 | 0 | inner: schema.clone().into(), |
201 | 0 | field_qualifiers: vec![Some(qualifier); schema.fields.len()], |
202 | 0 | functional_dependencies: FunctionalDependencies::empty(), |
203 | 0 | }; |
204 | 0 | schema.check_names()?; |
205 | 0 | Ok(schema) |
206 | 0 | } |
207 | | |
    /// Create a `DFSchema` from an Arrow schema, using a caller-supplied
    /// qualifier for each individual field (one entry per field; `None`
    /// marks an unqualified field).
    ///
    /// Returns an error if the resulting names are duplicated or ambiguous
    /// (see [`Self::check_names`]).
    pub fn from_field_specific_qualified_schema(
        qualifiers: Vec<Option<TableReference>>,
        schema: &SchemaRef,
    ) -> Result<Self> {
        let dfschema = Self {
            inner: Arc::clone(schema),
            field_qualifiers: qualifiers,
            functional_dependencies: FunctionalDependencies::empty(),
        };
        dfschema.check_names()?;
        Ok(dfschema)
    }
221 | | |
    /// Verify that field names in this schema are unambiguous.
    ///
    /// Errors if:
    /// * two qualified fields share both qualifier and name
    ///   ([`SchemaError::DuplicateQualifiedField`]),
    /// * two unqualified fields share a name
    ///   ([`SchemaError::DuplicateUnqualifiedField`]), or
    /// * a qualified field name collides with an unqualified field name
    ///   ([`SchemaError::AmbiguousReference`]).
    pub fn check_names(&self) -> Result<()> {
        let mut qualified_names = BTreeSet::new();
        let mut unqualified_names = BTreeSet::new();

        for (field, qualifier) in self.inner.fields().iter().zip(&self.field_qualifiers) {
            if let Some(qualifier) = qualifier {
                // `insert` returns false when the (qualifier, name) pair was
                // already present, i.e. a duplicate.
                if !qualified_names.insert((qualifier, field.name())) {
                    return _schema_err!(SchemaError::DuplicateQualifiedField {
                        qualifier: Box::new(qualifier.clone()),
                        name: field.name().to_string(),
                    });
                }
            } else if !unqualified_names.insert(field.name()) {
                return _schema_err!(SchemaError::DuplicateUnqualifiedField {
                    name: field.name().to_string()
                });
            }
        }

        // A qualified name that matches an unqualified name would make an
        // unqualified lookup of that name ambiguous.
        for (qualifier, name) in qualified_names {
            if unqualified_names.contains(name) {
                return _schema_err!(SchemaError::AmbiguousReference {
                    field: Column::new(Some(qualifier.clone()), name)
                });
            }
        }
        Ok(())
    }
251 | | |
252 | | /// Assigns functional dependencies. |
253 | 0 | pub fn with_functional_dependencies( |
254 | 0 | mut self, |
255 | 0 | functional_dependencies: FunctionalDependencies, |
256 | 0 | ) -> Result<Self> { |
257 | 0 | if functional_dependencies.is_valid(self.inner.fields.len()) { |
258 | 0 | self.functional_dependencies = functional_dependencies; |
259 | 0 | Ok(self) |
260 | | } else { |
261 | 0 | _plan_err!( |
262 | 0 | "Invalid functional dependency: {:?}", |
263 | 0 | functional_dependencies |
264 | 0 | ) |
265 | | } |
266 | 0 | } |
267 | | |
    /// Create a new schema that contains the fields from this schema followed by the fields
    /// from the supplied schema. An error will be returned if there are duplicate field names.
    pub fn join(&self, schema: &DFSchema) -> Result<Self> {
        // Concatenate the two field lists, `self`'s fields first.
        let mut schema_builder = SchemaBuilder::new();
        schema_builder.extend(self.inner.fields().iter().cloned());
        schema_builder.extend(schema.fields().iter().cloned());
        let new_schema = schema_builder.finish();

        // Merge metadata; on key collision the right-hand schema's value wins
        // (`HashMap::extend` overwrites existing entries).
        let mut new_metadata = self.inner.metadata.clone();
        new_metadata.extend(schema.inner.metadata.clone());
        let new_schema_with_metadata = new_schema.with_metadata(new_metadata);

        // Qualifiers parallel the field order, so concatenate them the same way.
        let mut new_qualifiers = self.field_qualifiers.clone();
        new_qualifiers.extend_from_slice(schema.field_qualifiers.as_slice());

        // NOTE(review): functional dependencies of both inputs are dropped;
        // the joined schema starts with an empty set.
        let new_self = Self {
            inner: Arc::new(new_schema_with_metadata),
            field_qualifiers: new_qualifiers,
            functional_dependencies: FunctionalDependencies::empty(),
        };
        // Duplicate names across the two inputs surface here as an error.
        new_self.check_names()?;
        Ok(new_self)
    }
291 | | |
    /// Modify this schema by appending the fields from the supplied schema, ignoring any
    /// duplicate fields.
    ///
    /// A field of `other_schema` counts as a duplicate when:
    /// * it is qualified and `self` already contains the same
    ///   (qualifier, field) pair, or
    /// * it is unqualified and `self` already contains any field with that
    ///   name (qualified or not).
    pub fn merge(&mut self, other_schema: &DFSchema) {
        // Nothing to merge.
        if other_schema.inner.fields.is_empty() {
            return;
        }

        // Snapshot `self`'s fields for duplicate detection.
        let self_fields: HashSet<(Option<&TableReference>, &FieldRef)> =
            self.iter().collect();
        let self_unqualified_names: HashSet<&str> = self
            .inner
            .fields
            .iter()
            .map(|field| field.name().as_str())
            .collect();

        // Start from self's fields and append the non-duplicates.
        let mut schema_builder = SchemaBuilder::from(self.inner.fields.clone());
        let mut qualifiers = Vec::new();
        for (qualifier, field) in other_schema.iter() {
            // skip duplicate columns
            let duplicated_field = match qualifier {
                Some(q) => self_fields.contains(&(Some(q), field)),
                // for unqualified columns, check as unqualified name
                None => self_unqualified_names.contains(field.name().as_str()),
            };
            if !duplicated_field {
                schema_builder.push(Arc::clone(field));
                qualifiers.push(qualifier.cloned());
            }
        }
        // Merge metadata; colliding keys take `other_schema`'s value.
        let mut metadata = self.inner.metadata.clone();
        metadata.extend(other_schema.inner.metadata.clone());

        let finished = schema_builder.finish();
        let finished_with_metadata = finished.with_metadata(metadata);
        self.inner = finished_with_metadata.into();
        // Keep the qualifier vector in sync with the appended fields.
        self.field_qualifiers.extend(qualifiers);
    }
331 | | |
    /// Get a list of fields, in the same order as the qualifiers returned by
    /// [`Self::iter`].
    pub fn fields(&self) -> &Fields {
        &self.inner.fields
    }
336 | | |
    /// Returns an immutable reference of a specific `Field` instance selected using an
    /// offset within the internal `fields` vector
    ///
    /// # Panics
    /// Panics if `i` is out of bounds.
    pub fn field(&self, i: usize) -> &Field {
        &self.inner.fields[i]
    }
342 | | |
    /// Returns an immutable reference of a specific `Field` instance selected using an
    /// offset within the internal `fields` vector and its qualifier
    ///
    /// # Panics
    /// Panics if `i` is out of bounds.
    pub fn qualified_field(&self, i: usize) -> (Option<&TableReference>, &Field) {
        (self.field_qualifiers[i].as_ref(), self.field(i))
    }
348 | | |
349 | 0 | pub fn index_of_column_by_name( |
350 | 0 | &self, |
351 | 0 | qualifier: Option<&TableReference>, |
352 | 0 | name: &str, |
353 | 0 | ) -> Option<usize> { |
354 | 0 | let mut matches = self |
355 | 0 | .iter() |
356 | 0 | .enumerate() |
357 | 0 | .filter(|(_, (q, f))| match (qualifier, q) { |
358 | | // field to lookup is qualified. |
359 | | // current field is qualified and not shared between relations, compare both |
360 | | // qualifier and name. |
361 | 0 | (Some(q), Some(field_q)) => q.resolved_eq(field_q) && f.name() == name, |
362 | | // field to lookup is qualified but current field is unqualified. |
363 | 0 | (Some(_), None) => false, |
364 | | // field to lookup is unqualified, no need to compare qualifier |
365 | 0 | (None, Some(_)) | (None, None) => f.name() == name, |
366 | 0 | }) |
367 | 0 | .map(|(idx, _)| idx); |
368 | 0 | matches.next() |
369 | 0 | } |
370 | | |
    /// Find the index of the column with the given qualifier and name,
    /// returning `None` if not found
    ///
    /// See [Self::index_of_column] for a version that returns an error if the
    /// column is not found
    pub fn maybe_index_of_column(&self, col: &Column) -> Option<usize> {
        self.index_of_column_by_name(col.relation.as_ref(), &col.name)
    }
379 | | |
    /// Find the index of the column with the given qualifier and name,
    /// returning `Err` if not found
    ///
    /// See [Self::maybe_index_of_column] for a version that returns `None` if
    /// the column is not found
    pub fn index_of_column(&self, col: &Column) -> Result<usize> {
        self.maybe_index_of_column(col)
            // Build the "field not found" error lazily, only on the miss path.
            .ok_or_else(|| field_not_found(col.relation.clone(), &col.name, self))
    }
389 | | |
390 | | /// Check if the column is in the current schema |
391 | 0 | pub fn is_column_from_schema(&self, col: &Column) -> bool { |
392 | 0 | self.index_of_column_by_name(col.relation.as_ref(), &col.name) |
393 | 0 | .is_some() |
394 | 0 | } |
395 | | |
396 | | /// Find the field with the given name |
397 | 0 | pub fn field_with_name( |
398 | 0 | &self, |
399 | 0 | qualifier: Option<&TableReference>, |
400 | 0 | name: &str, |
401 | 0 | ) -> Result<&Field> { |
402 | 0 | if let Some(qualifier) = qualifier { |
403 | 0 | self.field_with_qualified_name(qualifier, name) |
404 | | } else { |
405 | 0 | self.field_with_unqualified_name(name) |
406 | | } |
407 | 0 | } |
408 | | |
409 | | /// Check whether the column reference is ambiguous |
410 | 0 | pub fn check_ambiguous_name( |
411 | 0 | &self, |
412 | 0 | qualifier: Option<&TableReference>, |
413 | 0 | name: &str, |
414 | 0 | ) -> Result<()> { |
415 | 0 | let count = self |
416 | 0 | .iter() |
417 | 0 | .filter(|(field_q, f)| match (field_q, qualifier) { |
418 | 0 | (Some(q1), Some(q2)) => q1.resolved_eq(q2) && f.name() == name, |
419 | 0 | (None, None) => f.name() == name, |
420 | 0 | _ => false, |
421 | 0 | }) |
422 | 0 | .take(2) |
423 | 0 | .count(); |
424 | 0 | if count > 1 { |
425 | 0 | _schema_err!(SchemaError::AmbiguousReference { |
426 | 0 | field: Column { |
427 | 0 | relation: None, |
428 | 0 | name: name.to_string(), |
429 | 0 | }, |
430 | 0 | }) |
431 | | } else { |
432 | 0 | Ok(()) |
433 | | } |
434 | 0 | } |
435 | | |
    /// Find the qualified field with the given name.
    ///
    /// With a qualifier, the lookup requires both qualifier and name to match;
    /// without one, it falls back to the (possibly ambiguous) unqualified
    /// lookup in [`Self::qualified_field_with_unqualified_name`].
    pub fn qualified_field_with_name(
        &self,
        qualifier: Option<&TableReference>,
        name: &str,
    ) -> Result<(Option<&TableReference>, &Field)> {
        if let Some(qualifier) = qualifier {
            let idx = self
                .index_of_column_by_name(Some(qualifier), name)
                .ok_or_else(|| field_not_found(Some(qualifier.clone()), name, self))?;
            Ok((self.field_qualifiers[idx].as_ref(), self.field(idx)))
        } else {
            self.qualified_field_with_unqualified_name(name)
        }
    }
451 | | |
452 | | /// Find all fields having the given qualifier |
453 | 0 | pub fn fields_with_qualified(&self, qualifier: &TableReference) -> Vec<&Field> { |
454 | 0 | self.iter() |
455 | 0 | .filter(|(q, _)| q.map(|q| q.eq(qualifier)).unwrap_or(false)) |
456 | 0 | .map(|(_, f)| f.as_ref()) |
457 | 0 | .collect() |
458 | 0 | } |
459 | | |
460 | | /// Find all fields indices having the given qualifier |
461 | 0 | pub fn fields_indices_with_qualified( |
462 | 0 | &self, |
463 | 0 | qualifier: &TableReference, |
464 | 0 | ) -> Vec<usize> { |
465 | 0 | self.iter() |
466 | 0 | .enumerate() |
467 | 0 | .filter_map(|(idx, (q, _))| q.and_then(|q| q.eq(qualifier).then_some(idx))) |
468 | 0 | .collect() |
469 | 0 | } |
470 | | |
471 | | /// Find all fields that match the given name |
472 | 0 | pub fn fields_with_unqualified_name(&self, name: &str) -> Vec<&Field> { |
473 | 0 | self.fields() |
474 | 0 | .iter() |
475 | 0 | .filter(|field| field.name() == name) |
476 | 0 | .map(|f| f.as_ref()) |
477 | 0 | .collect() |
478 | 0 | } |
479 | | |
480 | | /// Find all fields that match the given name and return them with their qualifier |
481 | 0 | pub fn qualified_fields_with_unqualified_name( |
482 | 0 | &self, |
483 | 0 | name: &str, |
484 | 0 | ) -> Vec<(Option<&TableReference>, &Field)> { |
485 | 0 | self.iter() |
486 | 0 | .filter(|(_, field)| field.name() == name) |
487 | 0 | .map(|(qualifier, field)| (qualifier, field.as_ref())) |
488 | 0 | .collect() |
489 | 0 | } |
490 | | |
491 | | /// Find all fields that match the given name and convert to column |
492 | 0 | pub fn columns_with_unqualified_name(&self, name: &str) -> Vec<Column> { |
493 | 0 | self.iter() |
494 | 0 | .filter(|(_, field)| field.name() == name) |
495 | 0 | .map(|(qualifier, field)| Column::new(qualifier.cloned(), field.name())) |
496 | 0 | .collect() |
497 | 0 | } |
498 | | |
499 | | /// Return all `Column`s for the schema |
500 | 0 | pub fn columns(&self) -> Vec<Column> { |
501 | 0 | self.iter() |
502 | 0 | .map(|(qualifier, field)| { |
503 | 0 | Column::new(qualifier.cloned(), field.name().clone()) |
504 | 0 | }) |
505 | 0 | .collect() |
506 | 0 | } |
507 | | |
    /// Find the qualified field with the given unqualified name.
    ///
    /// Returns an error if the name matches no field, or if the match is
    /// ambiguous (several matches and not exactly one of them unqualified).
    pub fn qualified_field_with_unqualified_name(
        &self,
        name: &str,
    ) -> Result<(Option<&TableReference>, &Field)> {
        let matches = self.qualified_fields_with_unqualified_name(name);
        match matches.len() {
            0 => Err(unqualified_field_not_found(name, self)),
            1 => Ok((matches[0].0, (matches[0].1))),
            _ => {
                // More than one match does not necessarily mean the name is
                // ambiguous: the name may come from an Alias (or similar) and
                // therefore carry no qualifier. For example:
                //     Join on id = b.id
                //     Project a.id as id   TableScan b id
                // Here there is no real ambiguity. When exactly one of the
                // matches is unqualified, return that one.
                let fields_without_qualifier = matches
                    .iter()
                    .filter(|(q, _)| q.is_none())
                    .collect::<Vec<_>>();
                if fields_without_qualifier.len() == 1 {
                    Ok((fields_without_qualifier[0].0, fields_without_qualifier[0].1))
                } else {
                    _schema_err!(SchemaError::AmbiguousReference {
                        field: Column {
                            relation: None,
                            name: name.to_string(),
                        },
                    })
                }
            }
        }
    }
542 | | |
    /// Find the field with the given name, dropping the qualifier from the
    /// result (see [`Self::qualified_field_with_unqualified_name`] for
    /// ambiguity handling).
    pub fn field_with_unqualified_name(&self, name: &str) -> Result<&Field> {
        self.qualified_field_with_unqualified_name(name)
            .map(|(_, field)| field)
    }
548 | | |
549 | | /// Find the field with the given qualified name |
550 | 0 | pub fn field_with_qualified_name( |
551 | 0 | &self, |
552 | 0 | qualifier: &TableReference, |
553 | 0 | name: &str, |
554 | 0 | ) -> Result<&Field> { |
555 | 0 | let idx = self |
556 | 0 | .index_of_column_by_name(Some(qualifier), name) |
557 | 0 | .ok_or_else(|| field_not_found(Some(qualifier.clone()), name, self))?; |
558 | | |
559 | 0 | Ok(self.field(idx)) |
560 | 0 | } |
561 | | |
    /// Find the field referenced by `column`, dispatching to the qualified or
    /// unqualified lookup depending on whether the column has a relation.
    pub fn field_from_column(&self, column: &Column) -> Result<&Field> {
        match &column.relation {
            Some(r) => self.field_with_qualified_name(r, &column.name),
            None => self.field_with_unqualified_name(&column.name),
        }
    }
569 | | |
    /// Find the field referenced by `column`, returning it together with its
    /// qualifier (if any).
    pub fn qualified_field_from_column(
        &self,
        column: &Column,
    ) -> Result<(Option<&TableReference>, &Field)> {
        self.qualified_field_with_name(column.relation.as_ref(), &column.name)
    }
577 | | |
578 | | /// Find if the field exists with the given name |
579 | 0 | pub fn has_column_with_unqualified_name(&self, name: &str) -> bool { |
580 | 0 | self.fields().iter().any(|field| field.name() == name) |
581 | 0 | } |
582 | | |
583 | | /// Find if the field exists with the given qualified name |
584 | 0 | pub fn has_column_with_qualified_name( |
585 | 0 | &self, |
586 | 0 | qualifier: &TableReference, |
587 | 0 | name: &str, |
588 | 0 | ) -> bool { |
589 | 0 | self.iter() |
590 | 0 | .any(|(q, f)| q.map(|q| q.eq(qualifier)).unwrap_or(false) && f.name() == name) |
591 | 0 | } |
592 | | |
    /// Return true if `column` resolves to a field in this schema, using the
    /// qualified lookup when the column carries a relation.
    pub fn has_column(&self, column: &Column) -> bool {
        match &column.relation {
            Some(r) => self.has_column_with_qualified_name(r, &column.name),
            None => self.has_column_with_unqualified_name(&column.name),
        }
    }
600 | | |
    /// Check to see if unqualified field names matches field names in Arrow schema
    ///
    /// NOTE(review): `zip` stops at the shorter of the two field lists, so
    /// schemas of different lengths can still "match" on their common prefix —
    /// confirm callers only pass equal-length schemas.
    pub fn matches_arrow_schema(&self, arrow_schema: &Schema) -> bool {
        self.inner
            .fields
            .iter()
            .zip(arrow_schema.fields().iter())
            .all(|(dffield, arrowfield)| dffield.name() == arrowfield.name())
    }
609 | | |
    /// Check to see if fields in 2 Arrow schemas are compatible
    ///
    /// "Compatible" means each field of `arrow_schema` can be cast to the
    /// corresponding field of this schema (`can_cast_types(right, left)`).
    /// Fields are paired positionally; like [`Self::matches_arrow_schema`],
    /// `zip` silently ignores trailing fields when lengths differ.
    pub fn check_arrow_schema_type_compatible(
        &self,
        arrow_schema: &Schema,
    ) -> Result<()> {
        let self_arrow_schema: Schema = self.into();
        self_arrow_schema
            .fields()
            .iter()
            .zip(arrow_schema.fields().iter())
            .try_for_each(|(l_field, r_field)| {
                if !can_cast_types(r_field.data_type(), l_field.data_type()) {
                    _plan_err!("Column {} (type: {}) is not compatible with column {} (type: {})",
                        r_field.name(),
                        r_field.data_type(),
                        l_field.name(),
                        l_field.data_type())
                } else {
                    Ok(())
                }
            })
    }
632 | | |
    /// Returns true if the two schemas have the same qualified named
    /// fields with logically equivalent data types. Returns false otherwise.
    ///
    /// Use [DFSchema]::equivalent_names_and_types for stricter semantic type
    /// equivalence checking.
    pub fn logically_equivalent_names_and_types(&self, other: &Self) -> bool {
        // Different field counts can never be equivalent.
        if self.fields().len() != other.fields().len() {
            return false;
        }
        let self_fields = self.iter();
        let other_fields = other.iter();
        // Pairwise: same qualifier, same name, logically equal type.
        self_fields.zip(other_fields).all(|((q1, f1), (q2, f2))| {
            q1 == q2
                && f1.name() == f2.name()
                && Self::datatype_is_logically_equal(f1.data_type(), f2.data_type())
        })
    }
650 | | |
    /// Returns true if the two schemas have the same qualified named
    /// fields with the same data types. Returns false otherwise.
    ///
    /// This is a specialized version of Eq that ignores differences
    /// in nullability and metadata.
    ///
    /// Use [DFSchema]::logically_equivalent_names_and_types for a weaker
    /// logical type checking, which for example would consider a dictionary
    /// encoded UTF8 array to be equivalent to a plain UTF8 array.
    pub fn equivalent_names_and_types(&self, other: &Self) -> bool {
        // Different field counts can never be equivalent.
        if self.fields().len() != other.fields().len() {
            return false;
        }
        let self_fields = self.iter();
        let other_fields = other.iter();
        // Pairwise: same qualifier, same name, semantically equal type.
        self_fields.zip(other_fields).all(|((q1, f1), (q2, f2))| {
            q1 == q2
                && f1.name() == f2.name()
                && Self::datatype_is_semantically_equal(f1.data_type(), f2.data_type())
        })
    }
672 | | |
    /// Checks if two [`DataType`]s are logically equal. This is a notably weaker constraint
    /// than datatype_is_semantically_equal in that a Dictionary<K,V> type is logically
    /// equal to a plain V type, but not semantically equal. Dictionary<K1, V1> is also
    /// logically equal to Dictionary<K2, V1>.
    pub fn datatype_is_logically_equal(dt1: &DataType, dt2: &DataType) -> bool {
        // check nested fields
        match (dt1, dt2) {
            // Dictionary vs Dictionary: only the value types must match
            // (key types are ignored).
            (DataType::Dictionary(_, v1), DataType::Dictionary(_, v2)) => {
                v1.as_ref() == v2.as_ref()
            }
            // Dictionary vs plain type (either side): compare the dictionary's
            // value type with the plain type.
            (DataType::Dictionary(_, v1), othertype) => v1.as_ref() == othertype,
            (othertype, DataType::Dictionary(_, v1)) => v1.as_ref() == othertype,
            // List-like and Map types: recurse into the element field.
            (DataType::List(f1), DataType::List(f2))
            | (DataType::LargeList(f1), DataType::LargeList(f2))
            | (DataType::FixedSizeList(f1, _), DataType::FixedSizeList(f2, _))
            | (DataType::Map(f1, _), DataType::Map(f2, _)) => {
                Self::field_is_logically_equal(f1, f2)
            }
            (DataType::Struct(fields1), DataType::Struct(fields2)) => {
                let iter1 = fields1.iter();
                let iter2 = fields2.iter();
                fields1.len() == fields2.len() &&
                        // all fields have to be the same
                    iter1
                    .zip(iter2)
                        .all(|(f1, f2)| Self::field_is_logically_equal(f1, f2))
            }
            (DataType::Union(fields1, _), DataType::Union(fields2, _)) => {
                let iter1 = fields1.iter();
                let iter2 = fields2.iter();
                fields1.len() == fields2.len() &&
                    // all fields have to be the same
                    iter1
                        .zip(iter2)
                        .all(|((t1, f1), (t2, f2))| t1 == t2 && Self::field_is_logically_equal(f1, f2))
            }
            // All other types: fall back to exact equality.
            _ => dt1 == dt2,
        }
    }
712 | | |
    /// Returns true of two [`DataType`]s are semantically equal (same
    /// name and type), ignoring both metadata and nullability.
    ///
    /// Note: two `Decimal128` (or two `Decimal256`) types compare equal here
    /// regardless of precision and scale — see the wildcard arms below.
    ///
    /// request to upstream: <https://github.com/apache/arrow-rs/issues/3199>
    fn datatype_is_semantically_equal(dt1: &DataType, dt2: &DataType) -> bool {
        // check nested fields
        match (dt1, dt2) {
            // Dictionary vs Dictionary: both key and value types must match
            // (stricter than the "logical" comparison).
            (DataType::Dictionary(k1, v1), DataType::Dictionary(k2, v2)) => {
                Self::datatype_is_semantically_equal(k1.as_ref(), k2.as_ref())
                    && Self::datatype_is_semantically_equal(v1.as_ref(), v2.as_ref())
            }
            // List-like and Map types: recurse into the element field.
            (DataType::List(f1), DataType::List(f2))
            | (DataType::LargeList(f1), DataType::LargeList(f2))
            | (DataType::FixedSizeList(f1, _), DataType::FixedSizeList(f2, _))
            | (DataType::Map(f1, _), DataType::Map(f2, _)) => {
                Self::field_is_semantically_equal(f1, f2)
            }
            (DataType::Struct(fields1), DataType::Struct(fields2)) => {
                let iter1 = fields1.iter();
                let iter2 = fields2.iter();
                fields1.len() == fields2.len() &&
                    // all fields have to be the same
                    iter1
                        .zip(iter2)
                        .all(|(f1, f2)| Self::field_is_semantically_equal(f1, f2))
            }
            (DataType::Union(fields1, _), DataType::Union(fields2, _)) => {
                let iter1 = fields1.iter();
                let iter2 = fields2.iter();
                fields1.len() == fields2.len() &&
                    // all fields have to be the same
                    iter1
                        .zip(iter2)
                        .all(|((t1, f1), (t2, f2))| t1 == t2 && Self::field_is_semantically_equal(f1, f2))
            }
            // Precision and scale are intentionally not compared.
            (
                DataType::Decimal128(_l_precision, _l_scale),
                DataType::Decimal128(_r_precision, _r_scale),
            ) => true,
            (
                DataType::Decimal256(_l_precision, _l_scale),
                DataType::Decimal256(_r_precision, _r_scale),
            ) => true,
            // All other types: fall back to exact equality.
            _ => dt1 == dt2,
        }
    }
759 | | |
    /// Compare two fields by name and logically-equal data type only;
    /// nullability and field metadata are not inspected.
    fn field_is_logically_equal(f1: &Field, f2: &Field) -> bool {
        f1.name() == f2.name()
            && Self::datatype_is_logically_equal(f1.data_type(), f2.data_type())
    }
764 | | |
765 | 0 | fn field_is_semantically_equal(f1: &Field, f2: &Field) -> bool { |
766 | 0 | f1.name() == f2.name() |
767 | 0 | && Self::datatype_is_semantically_equal(f1.data_type(), f2.data_type()) |
768 | 0 | } |
769 | | |
770 | | /// Strip all field qualifier in schema |
771 | 0 | pub fn strip_qualifiers(self) -> Self { |
772 | 0 | DFSchema { |
773 | 0 | field_qualifiers: vec![None; self.inner.fields.len()], |
774 | 0 | inner: self.inner, |
775 | 0 | functional_dependencies: self.functional_dependencies, |
776 | 0 | } |
777 | 0 | } |
778 | | |
779 | | /// Replace all field qualifier with new value in schema |
780 | 0 | pub fn replace_qualifier(self, qualifier: impl Into<TableReference>) -> Self { |
781 | 0 | let qualifier = qualifier.into(); |
782 | 0 | DFSchema { |
783 | 0 | field_qualifiers: vec![Some(qualifier); self.inner.fields.len()], |
784 | 0 | inner: self.inner, |
785 | 0 | functional_dependencies: self.functional_dependencies, |
786 | 0 | } |
787 | 0 | } |
788 | | |
789 | | /// Get list of fully-qualified field names in this schema |
790 | 0 | pub fn field_names(&self) -> Vec<String> { |
791 | 0 | self.iter() |
792 | 0 | .map(|(qualifier, field)| qualified_name(qualifier, field.name())) |
793 | 0 | .collect::<Vec<_>>() |
794 | 0 | } |
795 | | |
796 | | /// Get metadata of this schema |
797 | 0 | pub fn metadata(&self) -> &HashMap<String, String> { |
798 | 0 | &self.inner.metadata |
799 | 0 | } |
800 | | |
    /// Get functional dependencies
    ///
    /// Returns a reference to the [`FunctionalDependencies`] stored alongside
    /// this schema.
    pub fn functional_dependencies(&self) -> &FunctionalDependencies {
        &self.functional_dependencies
    }
805 | | |
806 | | /// Iterate over the qualifiers and fields in the DFSchema |
807 | 0 | pub fn iter(&self) -> impl Iterator<Item = (Option<&TableReference>, &FieldRef)> { |
808 | 0 | self.field_qualifiers |
809 | 0 | .iter() |
810 | 0 | .zip(self.inner.fields().iter()) |
811 | 0 | .map(|(qualifier, field)| (qualifier.as_ref(), field)) |
812 | 0 | } |
813 | | } |
814 | | |
815 | | impl From<DFSchema> for Schema { |
816 | | /// Convert DFSchema into a Schema |
817 | 0 | fn from(df_schema: DFSchema) -> Self { |
818 | 0 | let fields: Fields = df_schema.inner.fields.clone(); |
819 | 0 | Schema::new_with_metadata(fields, df_schema.inner.metadata.clone()) |
820 | 0 | } |
821 | | } |
822 | | |
823 | | impl From<&DFSchema> for Schema { |
824 | | /// Convert DFSchema reference into a Schema |
825 | 0 | fn from(df_schema: &DFSchema) -> Self { |
826 | 0 | let fields: Fields = df_schema.inner.fields.clone(); |
827 | 0 | Schema::new_with_metadata(fields, df_schema.inner.metadata.clone()) |
828 | 0 | } |
829 | | } |
830 | | |
/// Allow DFSchema to be converted into an Arrow `&Schema`
impl AsRef<Schema> for DFSchema {
    fn as_ref(&self) -> &Schema {
        // Delegates to `as_arrow`, which borrows the wrapped Arrow schema.
        self.as_arrow()
    }
}
837 | | |
/// Allow DFSchema to be converted into an Arrow `&SchemaRef` (to clone, for
/// example)
impl AsRef<SchemaRef> for DFSchema {
    fn as_ref(&self) -> &SchemaRef {
        // Borrows the shared `Arc<Schema>`; callers can cheaply clone it.
        self.inner()
    }
}
845 | | |
846 | | /// Create a `DFSchema` from an Arrow schema |
847 | | impl TryFrom<Schema> for DFSchema { |
848 | | type Error = DataFusionError; |
849 | 0 | fn try_from(schema: Schema) -> Result<Self, Self::Error> { |
850 | 0 | Self::try_from(Arc::new(schema)) |
851 | 0 | } |
852 | | } |
853 | | |
854 | | impl TryFrom<SchemaRef> for DFSchema { |
855 | | type Error = DataFusionError; |
856 | 0 | fn try_from(schema: SchemaRef) -> Result<Self, Self::Error> { |
857 | 0 | let field_count = schema.fields.len(); |
858 | 0 | let dfschema = Self { |
859 | 0 | inner: schema, |
860 | 0 | field_qualifiers: vec![None; field_count], |
861 | 0 | functional_dependencies: FunctionalDependencies::empty(), |
862 | 0 | }; |
863 | 0 | Ok(dfschema) |
864 | 0 | } |
865 | | } |
866 | | |
867 | | impl From<DFSchema> for SchemaRef { |
868 | 0 | fn from(df_schema: DFSchema) -> Self { |
869 | 0 | SchemaRef::new(df_schema.into()) |
870 | 0 | } |
871 | | } |
872 | | |
// Hashing refers to a subset of fields considered in PartialEq.
impl Hash for DFSchema {
    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
        // Field qualifiers and functional dependencies are deliberately not
        // hashed, per the note above about matching PartialEq's subset.
        self.inner.fields.hash(state);
        self.inner.metadata.len().hash(state); // HashMap is not hashable
    }
}
880 | | |
/// Convenience trait to convert Schema like things to DFSchema and DFSchemaRef with fewer keystrokes
pub trait ToDFSchema
where
    Self: Sized,
{
    /// Attempt to create a DFSchema
    fn to_dfschema(self) -> Result<DFSchema>;

    /// Attempt to create a DFSchemaRef
    fn to_dfschema_ref(self) -> Result<DFSchemaRef> {
        Ok(Arc::new(self.to_dfschema()?))
    }
}
894 | | |
895 | | impl ToDFSchema for Schema { |
896 | 0 | fn to_dfschema(self) -> Result<DFSchema> { |
897 | 0 | DFSchema::try_from(self) |
898 | 0 | } |
899 | | } |
900 | | |
901 | | impl ToDFSchema for SchemaRef { |
902 | 0 | fn to_dfschema(self) -> Result<DFSchema> { |
903 | 0 | DFSchema::try_from(self) |
904 | 0 | } |
905 | | } |
906 | | |
907 | | impl ToDFSchema for Vec<Field> { |
908 | | fn to_dfschema(self) -> Result<DFSchema> { |
909 | | let field_count = self.len(); |
910 | | let schema = Schema { |
911 | | fields: self.into(), |
912 | | metadata: HashMap::new(), |
913 | | }; |
914 | | let dfschema = DFSchema { |
915 | | inner: schema.into(), |
916 | | field_qualifiers: vec![None; field_count], |
917 | | functional_dependencies: FunctionalDependencies::empty(), |
918 | | }; |
919 | | Ok(dfschema) |
920 | | } |
921 | | } |
922 | | |
923 | | impl Display for DFSchema { |
924 | 0 | fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { |
925 | 0 | write!( |
926 | 0 | f, |
927 | 0 | "fields:[{}], metadata:{:?}", |
928 | 0 | self.iter() |
929 | 0 | .map(|(q, f)| qualified_name(q, f.name())) |
930 | 0 | .collect::<Vec<String>>() |
931 | 0 | .join(", "), |
932 | 0 | self.inner.metadata |
933 | 0 | ) |
934 | 0 | } |
935 | | } |
936 | | |
/// Provides schema information needed by certain methods of `Expr`
/// (defined in the datafusion-common crate).
///
/// Note that this trait is implemented for &[DFSchema] which is
/// widely used in the DataFusion codebase.
pub trait ExprSchema: std::fmt::Debug {
    /// Is this column reference nullable?
    fn nullable(&self, col: &Column) -> Result<bool>;

    /// What is the datatype of this column?
    fn data_type(&self, col: &Column) -> Result<&DataType>;

    /// Returns the column's optional metadata.
    fn metadata(&self, col: &Column) -> Result<&HashMap<String, String>>;

    /// Return the column's datatype and nullability
    fn data_type_and_nullable(&self, col: &Column) -> Result<(&DataType, bool)>;
}
955 | | |
956 | | // Implement `ExprSchema` for `Arc<DFSchema>` |
957 | | impl<P: AsRef<DFSchema> + std::fmt::Debug> ExprSchema for P { |
958 | 0 | fn nullable(&self, col: &Column) -> Result<bool> { |
959 | 0 | self.as_ref().nullable(col) |
960 | 0 | } |
961 | | |
962 | 0 | fn data_type(&self, col: &Column) -> Result<&DataType> { |
963 | 0 | self.as_ref().data_type(col) |
964 | 0 | } |
965 | | |
966 | 0 | fn metadata(&self, col: &Column) -> Result<&HashMap<String, String>> { |
967 | 0 | ExprSchema::metadata(self.as_ref(), col) |
968 | 0 | } |
969 | | |
970 | 0 | fn data_type_and_nullable(&self, col: &Column) -> Result<(&DataType, bool)> { |
971 | 0 | self.as_ref().data_type_and_nullable(col) |
972 | 0 | } |
973 | | } |
974 | | |
975 | | impl ExprSchema for DFSchema { |
976 | 0 | fn nullable(&self, col: &Column) -> Result<bool> { |
977 | 0 | Ok(self.field_from_column(col)?.is_nullable()) |
978 | 0 | } |
979 | | |
980 | 0 | fn data_type(&self, col: &Column) -> Result<&DataType> { |
981 | 0 | Ok(self.field_from_column(col)?.data_type()) |
982 | 0 | } |
983 | | |
984 | 0 | fn metadata(&self, col: &Column) -> Result<&HashMap<String, String>> { |
985 | 0 | Ok(self.field_from_column(col)?.metadata()) |
986 | 0 | } |
987 | | |
988 | 0 | fn data_type_and_nullable(&self, col: &Column) -> Result<(&DataType, bool)> { |
989 | 0 | let field = self.field_from_column(col)?; |
990 | 0 | Ok((field.data_type(), field.is_nullable())) |
991 | 0 | } |
992 | | } |
993 | | |
/// DataFusion-specific extensions to [`Schema`].
pub trait SchemaExt {
    /// This is a specialized version of Eq that ignores differences
    /// in nullability and metadata.
    ///
    /// It works the same as [`DFSchema::equivalent_names_and_types`].
    fn equivalent_names_and_types(&self, other: &Self) -> bool;

    /// Returns true if the two schemas have the same qualified named
    /// fields with logically equivalent data types. Returns false otherwise.
    ///
    /// Use [DFSchema]::equivalent_names_and_types for stricter semantic type
    /// equivalence checking.
    fn logically_equivalent_names_and_types(&self, other: &Self) -> bool;
}
1009 | | |
1010 | | impl SchemaExt for Schema { |
1011 | 0 | fn equivalent_names_and_types(&self, other: &Self) -> bool { |
1012 | 0 | if self.fields().len() != other.fields().len() { |
1013 | 0 | return false; |
1014 | 0 | } |
1015 | 0 |
|
1016 | 0 | self.fields() |
1017 | 0 | .iter() |
1018 | 0 | .zip(other.fields().iter()) |
1019 | 0 | .all(|(f1, f2)| { |
1020 | 0 | f1.name() == f2.name() |
1021 | 0 | && DFSchema::datatype_is_semantically_equal( |
1022 | 0 | f1.data_type(), |
1023 | 0 | f2.data_type(), |
1024 | 0 | ) |
1025 | 0 | }) |
1026 | 0 | } |
1027 | | |
1028 | 0 | fn logically_equivalent_names_and_types(&self, other: &Self) -> bool { |
1029 | 0 | if self.fields().len() != other.fields().len() { |
1030 | 0 | return false; |
1031 | 0 | } |
1032 | 0 |
|
1033 | 0 | self.fields() |
1034 | 0 | .iter() |
1035 | 0 | .zip(other.fields().iter()) |
1036 | 0 | .all(|(f1, f2)| { |
1037 | 0 | f1.name() == f2.name() |
1038 | 0 | && DFSchema::datatype_is_logically_equal( |
1039 | 0 | f1.data_type(), |
1040 | 0 | f2.data_type(), |
1041 | 0 | ) |
1042 | 0 | }) |
1043 | 0 | } |
1044 | | } |
1045 | | |
1046 | 0 | pub fn qualified_name(qualifier: Option<&TableReference>, name: &str) -> String { |
1047 | 0 | match qualifier { |
1048 | 0 | Some(q) => format!("{}.{}", q, name), |
1049 | 0 | None => name.to_string(), |
1050 | | } |
1051 | 0 | } |
1052 | | |
#[cfg(test)]
mod tests {
    use crate::assert_contains;

    use super::*;

    // A dot inside an unqualified name is part of the name, not a qualifier
    // separator: looking up "t1.c0" unqualified must fail.
    #[test]
    fn qualifier_in_name() -> Result<()> {
        let col = Column::from_name("t1.c0");
        let schema = DFSchema::try_from_qualified_schema("t1", &test_schema_1())?;
        // lookup with unqualified name "t1.c0"
        let err = schema.index_of_column(&col).unwrap_err();
        assert_eq!(
            err.strip_backtrace(),
            "Schema error: No field named \"t1.c0\". Valid fields are t1.c0, t1.c1."
        );
        Ok(())
    }

    #[test]
    fn quoted_qualifiers_in_name() -> Result<()> {
        let col = Column::from_name("t1.c0");
        let schema = DFSchema::try_from_qualified_schema(
            "t1",
            &Schema::new(vec![
                Field::new("CapitalColumn", DataType::Boolean, true),
                Field::new("field.with.period", DataType::Boolean, true),
            ]),
        )?;

        // lookup with unqualified name "t1.c0"
        let err = schema.index_of_column(&col).unwrap_err();
        assert_eq!(
            err.strip_backtrace(),
            "Schema error: No field named \"t1.c0\". Valid fields are t1.\"CapitalColumn\", t1.\"field.with.period\"."
        );
        Ok(())
    }

    #[test]
    fn from_unqualified_schema() -> Result<()> {
        let schema = DFSchema::try_from(test_schema_1())?;
        assert_eq!("fields:[c0, c1], metadata:{}", schema.to_string());
        Ok(())
    }

    #[test]
    fn from_qualified_schema() -> Result<()> {
        let schema = DFSchema::try_from_qualified_schema("t1", &test_schema_1())?;
        assert_eq!("fields:[t1.c0, t1.c1], metadata:{}", schema.to_string());
        Ok(())
    }

    #[test]
    fn test_from_field_specific_qualified_schema() -> Result<()> {
        let schema = DFSchema::from_field_specific_qualified_schema(
            vec![Some("t1".into()), None],
            &Arc::new(Schema::new(vec![
                Field::new("c0", DataType::Boolean, true),
                Field::new("c1", DataType::Boolean, true),
            ])),
        )?;
        assert_eq!("fields:[t1.c0, c1], metadata:{}", schema.to_string());
        Ok(())
    }

    #[test]
    fn test_from_qualified_fields() -> Result<()> {
        let schema = DFSchema::new_with_metadata(
            vec![
                (
                    Some("t0".into()),
                    Arc::new(Field::new("c0", DataType::Boolean, true)),
                ),
                (None, Arc::new(Field::new("c1", DataType::Boolean, true))),
            ],
            HashMap::new(),
        )?;
        assert_eq!("fields:[t0.c0, c1], metadata:{}", schema.to_string());
        Ok(())
    }

    #[test]
    fn from_qualified_schema_into_arrow_schema() -> Result<()> {
        let schema = DFSchema::try_from_qualified_schema("t1", &test_schema_1())?;
        let arrow_schema: Schema = schema.into();
        let expected = "Field { name: \"c0\", data_type: Boolean, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, \
        Field { name: \"c1\", data_type: Boolean, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }";
        assert_eq!(expected, arrow_schema.to_string());
        Ok(())
    }

    #[test]
    fn join_qualified() -> Result<()> {
        let left = DFSchema::try_from_qualified_schema("t1", &test_schema_1())?;
        let right = DFSchema::try_from_qualified_schema("t2", &test_schema_1())?;
        let join = left.join(&right)?;
        assert_eq!(
            "fields:[t1.c0, t1.c1, t2.c0, t2.c1], metadata:{}",
            join.to_string()
        );
        // test valid access
        assert!(join
            .field_with_qualified_name(&TableReference::bare("t1"), "c0")
            .is_ok());
        assert!(join
            .field_with_qualified_name(&TableReference::bare("t2"), "c0")
            .is_ok());
        // test invalid access
        assert!(join.field_with_unqualified_name("c0").is_err());
        assert!(join.field_with_unqualified_name("t1.c0").is_err());
        assert!(join.field_with_unqualified_name("t2.c0").is_err());
        Ok(())
    }

    #[test]
    fn join_qualified_duplicate() -> Result<()> {
        let left = DFSchema::try_from_qualified_schema("t1", &test_schema_1())?;
        let right = DFSchema::try_from_qualified_schema("t1", &test_schema_1())?;
        let join = left.join(&right);
        assert_eq!(
            join.unwrap_err().strip_backtrace(),
            "Schema error: Schema contains duplicate qualified field name t1.c0",
        );
        Ok(())
    }

    #[test]
    fn join_unqualified_duplicate() -> Result<()> {
        let left = DFSchema::try_from(test_schema_1())?;
        let right = DFSchema::try_from(test_schema_1())?;
        let join = left.join(&right);
        assert_eq!(
            join.unwrap_err().strip_backtrace(),
            "Schema error: Schema contains duplicate unqualified field name c0"
        );
        Ok(())
    }

    #[test]
    fn join_mixed() -> Result<()> {
        let left = DFSchema::try_from_qualified_schema("t1", &test_schema_1())?;
        let right = DFSchema::try_from(test_schema_2())?;
        let join = left.join(&right)?;
        assert_eq!(
            "fields:[t1.c0, t1.c1, c100, c101], metadata:{}",
            join.to_string()
        );
        // test valid access
        assert!(join
            .field_with_qualified_name(&TableReference::bare("t1"), "c0")
            .is_ok());
        assert!(join.field_with_unqualified_name("c0").is_ok());
        assert!(join.field_with_unqualified_name("c100").is_ok());
        assert!(join.field_with_name(None, "c100").is_ok());
        // test invalid access
        assert!(join.field_with_unqualified_name("t1.c0").is_err());
        assert!(join.field_with_unqualified_name("t1.c100").is_err());
        assert!(join
            .field_with_qualified_name(&TableReference::bare(""), "c100")
            .is_err());
        Ok(())
    }

    #[test]
    fn join_mixed_duplicate() -> Result<()> {
        let left = DFSchema::try_from_qualified_schema("t1", &test_schema_1())?;
        let right = DFSchema::try_from(test_schema_1())?;
        let join = left.join(&right);
        assert_contains!(join.unwrap_err().to_string(),
                         "Schema error: Schema contains qualified \
        field name t1.c0 and unqualified field name c0 which would be ambiguous");
        Ok(())
    }

    #[test]
    fn helpful_error_messages() -> Result<()> {
        let schema = DFSchema::try_from_qualified_schema("t1", &test_schema_1())?;
        let expected_help = "Valid fields are t1.c0, t1.c1.";
        assert_contains!(
            schema
                .field_with_qualified_name(&TableReference::bare("x"), "y")
                .unwrap_err()
                .to_string(),
            expected_help
        );
        assert_contains!(
            schema
                .field_with_unqualified_name("y")
                .unwrap_err()
                .to_string(),
            expected_help
        );
        assert!(schema.index_of_column_by_name(None, "y").is_none());
        assert!(schema.index_of_column_by_name(None, "t1.c0").is_none());

        Ok(())
    }

    #[test]
    fn select_without_valid_fields() {
        let schema = DFSchema::empty();

        let col = Column::from_qualified_name("t1.c0");
        let err = schema.index_of_column(&col).unwrap_err();
        assert_eq!(err.strip_backtrace(), "Schema error: No field named t1.c0.");

        // the same check without qualifier
        let col = Column::from_name("c0");
        let err = schema.index_of_column(&col).err().unwrap();
        assert_eq!(err.strip_backtrace(), "Schema error: No field named c0.");
    }

    #[test]
    fn into() {
        // Demonstrate how to convert back and forth between Schema, SchemaRef, DFSchema, and DFSchemaRef
        let arrow_schema = Schema::new_with_metadata(
            vec![Field::new("c0", DataType::Int64, true)],
            test_metadata(),
        );
        let arrow_schema_ref = Arc::new(arrow_schema.clone());

        let df_schema = DFSchema {
            inner: Arc::clone(&arrow_schema_ref),
            field_qualifiers: vec![None; arrow_schema_ref.fields.len()],
            functional_dependencies: FunctionalDependencies::empty(),
        };
        let df_schema_ref = Arc::new(df_schema.clone());

        {
            let arrow_schema = arrow_schema.clone();
            let arrow_schema_ref = Arc::clone(&arrow_schema_ref);

            assert_eq!(df_schema, arrow_schema.to_dfschema().unwrap());
            assert_eq!(df_schema, arrow_schema_ref.to_dfschema().unwrap());
        }

        {
            let arrow_schema = arrow_schema.clone();
            let arrow_schema_ref = Arc::clone(&arrow_schema_ref);

            assert_eq!(df_schema_ref, arrow_schema.to_dfschema_ref().unwrap());
            assert_eq!(df_schema_ref, arrow_schema_ref.to_dfschema_ref().unwrap());
        }

        // Now, consume the refs
        assert_eq!(df_schema_ref, arrow_schema.to_dfschema_ref().unwrap());
        assert_eq!(df_schema_ref, arrow_schema_ref.to_dfschema_ref().unwrap());
    }

    // Shared fixture: two boolean columns c0, c1.
    fn test_schema_1() -> Schema {
        Schema::new(vec![
            Field::new("c0", DataType::Boolean, true),
            Field::new("c1", DataType::Boolean, true),
        ])
    }
    #[test]
    fn test_dfschema_to_schema_conversion() {
        let mut a_metadata = HashMap::new();
        a_metadata.insert("key".to_string(), "value".to_string());
        let a_field = Field::new("a", DataType::Int64, false).with_metadata(a_metadata);

        let mut b_metadata = HashMap::new();
        b_metadata.insert("key".to_string(), "value".to_string());
        let b_field = Field::new("b", DataType::Int64, false).with_metadata(b_metadata);

        let schema = Arc::new(Schema::new(vec![a_field, b_field]));

        let df_schema = DFSchema {
            inner: Arc::clone(&schema),
            field_qualifiers: vec![None; schema.fields.len()],
            functional_dependencies: FunctionalDependencies::empty(),
        };

        assert_eq!(df_schema.inner.metadata(), schema.metadata())
    }

    #[test]
    fn test_contain_column() -> Result<()> {
        // qualified exists
        {
            let col = Column::from_qualified_name("t1.c0");
            let schema = DFSchema::try_from_qualified_schema("t1", &test_schema_1())?;
            assert!(schema.is_column_from_schema(&col));
        }

        // qualified not exists
        {
            let col = Column::from_qualified_name("t1.c2");
            let schema = DFSchema::try_from_qualified_schema("t1", &test_schema_1())?;
            assert!(!schema.is_column_from_schema(&col));
        }

        // unqualified exists
        {
            let col = Column::from_name("c0");
            let schema = DFSchema::try_from_qualified_schema("t1", &test_schema_1())?;
            assert!(schema.is_column_from_schema(&col));
        }

        // unqualified not exists
        {
            let col = Column::from_name("c2");
            let schema = DFSchema::try_from_qualified_schema("t1", &test_schema_1())?;
            assert!(!schema.is_column_from_schema(&col));
        }

        Ok(())
    }

    // Shared fixture: two boolean columns c100, c101.
    fn test_schema_2() -> Schema {
        Schema::new(vec![
            Field::new("c100", DataType::Boolean, true),
            Field::new("c101", DataType::Boolean, true),
        ])
    }

    fn test_metadata() -> HashMap<String, String> {
        test_metadata_n(2)
    }

    // Build n key/value pairs k0..k{n-1} -> v0..v{n-1}.
    fn test_metadata_n(n: usize) -> HashMap<String, String> {
        (0..n).map(|i| (format!("k{i}"), format!("v{i}"))).collect()
    }
}