From b40a298a3a8e7eb0546c06168ef19b44b28acf42 Mon Sep 17 00:00:00 2001 From: Filippo Rossi Date: Sun, 3 Nov 2024 04:23:53 +0100 Subject: [PATCH] feat(logical-types): add NativeType and LogicalType (#12853) * [logical-types] add NativeType and LogicalType * Add license header * Add NativeField and derivates * Support TypeSignatures * Fix doc * Add documentation * Fix doc tests * Remove dummy test * From NativeField to LogicalField * Add default_cast_for * Add type order with can_cast_types * Rename NativeType Utf8 to String * NativeType from &DataType * Add builtin types * From LazyLock to OnceLock --- datafusion/common/src/lib.rs | 1 + datafusion/common/src/types/builtin.rs | 49 +++ datafusion/common/src/types/field.rs | 114 +++++++ datafusion/common/src/types/logical.rs | 128 ++++++++ datafusion/common/src/types/mod.rs | 26 ++ datafusion/common/src/types/native.rs | 399 +++++++++++++++++++++++++ 6 files changed, 717 insertions(+) create mode 100644 datafusion/common/src/types/builtin.rs create mode 100644 datafusion/common/src/types/field.rs create mode 100644 datafusion/common/src/types/logical.rs create mode 100644 datafusion/common/src/types/mod.rs create mode 100644 datafusion/common/src/types/native.rs diff --git a/datafusion/common/src/lib.rs b/datafusion/common/src/lib.rs index e4575038ab98..08431a36e82f 100644 --- a/datafusion/common/src/lib.rs +++ b/datafusion/common/src/lib.rs @@ -44,6 +44,7 @@ pub mod scalar; pub mod stats; pub mod test_util; pub mod tree_node; +pub mod types; pub mod utils; /// Reexport arrow crate diff --git a/datafusion/common/src/types/builtin.rs b/datafusion/common/src/types/builtin.rs new file mode 100644 index 000000000000..c6105d37c3bd --- /dev/null +++ b/datafusion/common/src/types/builtin.rs @@ -0,0 +1,49 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::types::{LogicalTypeRef, NativeType}; +use std::sync::{Arc, OnceLock}; + +macro_rules! singleton { + ($name:ident, $getter:ident, $ty:ident) => { + // TODO: Use LazyLock instead of getter function when MSRV gets bumped + static $name: OnceLock = OnceLock::new(); + + #[doc = "Getter for singleton instance of a logical type representing"] + #[doc = concat!("[`NativeType::", stringify!($ty), "`].")] + pub fn $getter() -> LogicalTypeRef { + Arc::clone($name.get_or_init(|| Arc::new(NativeType::$ty))) + } + }; +} + +singleton!(LOGICAL_NULL, logical_null, Null); +singleton!(LOGICAL_BOOLEAN, logical_boolean, Boolean); +singleton!(LOGICAL_INT8, logical_int8, Int8); +singleton!(LOGICAL_INT16, logical_int16, Int16); +singleton!(LOGICAL_INT32, logical_int32, Int32); +singleton!(LOGICAL_INT64, logical_int64, Int64); +singleton!(LOGICAL_UINT8, logical_uint8, UInt8); +singleton!(LOGICAL_UINT16, logical_uint16, UInt16); +singleton!(LOGICAL_UINT32, logical_uint32, UInt32); +singleton!(LOGICAL_UINT64, logical_uint64, UInt64); +singleton!(LOGICAL_FLOAT16, logical_float16, Float16); +singleton!(LOGICAL_FLOAT32, logical_float32, Float32); +singleton!(LOGICAL_FLOAT64, logical_float64, Float64); +singleton!(LOGICAL_DATE, logical_date, Date); +singleton!(LOGICAL_BINARY, logical_binary, Binary); +singleton!(LOGICAL_STRING, logical_string, String); diff --git a/datafusion/common/src/types/field.rs 
b/datafusion/common/src/types/field.rs new file mode 100644 index 000000000000..85c7c157272a --- /dev/null +++ b/datafusion/common/src/types/field.rs @@ -0,0 +1,114 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow_schema::{Field, Fields, UnionFields}; +use std::hash::{Hash, Hasher}; +use std::{ops::Deref, sync::Arc}; + +use super::{LogicalTypeRef, NativeType}; + +/// A record of a logical type, its name and its nullability. +#[derive(Debug, Clone, Eq, PartialOrd, Ord)] +pub struct LogicalField { + pub name: String, + pub logical_type: LogicalTypeRef, + pub nullable: bool, +} + +impl PartialEq for LogicalField { + fn eq(&self, other: &Self) -> bool { + self.name == other.name + && self.logical_type.eq(&other.logical_type) + && self.nullable == other.nullable + } +} + +impl Hash for LogicalField { + fn hash(&self, state: &mut H) { + self.name.hash(state); + self.logical_type.hash(state); + self.nullable.hash(state); + } +} + +impl From<&Field> for LogicalField { + fn from(value: &Field) -> Self { + Self { + name: value.name().clone(), + logical_type: Arc::new(NativeType::from(value.data_type().clone())), + nullable: value.is_nullable(), + } + } +} + +/// A reference counted [`LogicalField`]. 
+pub type LogicalFieldRef = Arc; + +/// A cheaply cloneable, owned collection of [`LogicalFieldRef`]. +#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] +pub struct LogicalFields(Arc<[LogicalFieldRef]>); + +impl Deref for LogicalFields { + type Target = [LogicalFieldRef]; + + fn deref(&self) -> &Self::Target { + self.0.as_ref() + } +} + +impl From<&Fields> for LogicalFields { + fn from(value: &Fields) -> Self { + value + .iter() + .map(|field| Arc::new(LogicalField::from(field.as_ref()))) + .collect() + } +} + +impl FromIterator for LogicalFields { + fn from_iter>(iter: T) -> Self { + Self(iter.into_iter().collect()) + } +} + +/// A cheaply cloneable, owned collection of [`LogicalFieldRef`] and their +/// corresponding type ids. +#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] +pub struct LogicalUnionFields(Arc<[(i8, LogicalFieldRef)]>); + +impl Deref for LogicalUnionFields { + type Target = [(i8, LogicalFieldRef)]; + + fn deref(&self) -> &Self::Target { + self.0.as_ref() + } +} + +impl From<&UnionFields> for LogicalUnionFields { + fn from(value: &UnionFields) -> Self { + value + .iter() + .map(|(i, field)| (i, Arc::new(LogicalField::from(field.as_ref())))) + .collect() + } +} + +impl FromIterator<(i8, LogicalFieldRef)> for LogicalUnionFields { + fn from_iter>(iter: T) -> Self { + Self(iter.into_iter().collect()) + } +} diff --git a/datafusion/common/src/types/logical.rs b/datafusion/common/src/types/logical.rs new file mode 100644 index 000000000000..bde393992a0c --- /dev/null +++ b/datafusion/common/src/types/logical.rs @@ -0,0 +1,128 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use super::NativeType; +use crate::error::Result; +use arrow_schema::DataType; +use core::fmt; +use std::{cmp::Ordering, hash::Hash, sync::Arc}; + +/// Signature that uniquely identifies a type among other types. +#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] +pub enum TypeSignature<'a> { + /// Represents a built-in native type. + Native(&'a NativeType), + /// Represents an arrow-compatible extension type. + /// (<https://arrow.apache.org/docs/format/Columnar.html#extension-types>) + /// + /// The `name` should contain the same value as 'ARROW:extension:name'. + Extension { + name: &'a str, + parameters: &'a [TypeParameter<'a>], + }, +} + +#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] +pub enum TypeParameter<'a> { + Type(TypeSignature<'a>), + Number(i128), +} + +/// A reference counted [`LogicalType`]. +pub type LogicalTypeRef = Arc; + +/// Representation of a logical type with its signature and its native backing +/// type. +/// +/// The logical type is meant to be used during the DataFusion logical planning +/// phase in order to reason about logical types without worrying about their +/// underlying physical implementation. 
+/// +/// ### Extension types +/// +/// [`LogicalType`] is a trait in order to allow the possibility of declaring +/// extension types: +/// +/// ``` +/// use datafusion_common::types::{LogicalType, NativeType, TypeSignature}; +/// +/// struct JSON {} +/// +/// impl LogicalType for JSON { +/// fn native(&self) -> &NativeType { +/// &NativeType::String +/// } +/// +/// fn signature(&self) -> TypeSignature<'_> { +/// TypeSignature::Extension { +/// name: "JSON", +/// parameters: &[], +/// } +/// } +/// } +/// ``` +pub trait LogicalType: Sync + Send { + /// Get the native backing type of this logical type. + fn native(&self) -> &NativeType; + /// Get the unique type signature for this logical type. Logical types with identical + /// signatures are considered equal. + fn signature(&self) -> TypeSignature<'_>; + + /// Get the default physical type to cast `origin` to in order to obtain a physical type + /// that is logically compatible with this logical type. + fn default_cast_for(&self, origin: &DataType) -> Result { + self.native().default_cast_for(origin) + } +} + +impl fmt::Debug for dyn LogicalType { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_tuple("LogicalType") + .field(&self.signature()) + .field(&self.native()) + .finish() + } +} + +impl PartialEq for dyn LogicalType { + fn eq(&self, other: &Self) -> bool { + self.signature().eq(&other.signature()) + } +} + +impl Eq for dyn LogicalType {} + +impl PartialOrd for dyn LogicalType { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Ord for dyn LogicalType { + fn cmp(&self, other: &Self) -> Ordering { + self.signature() + .cmp(&other.signature()) + .then(self.native().cmp(other.native())) + } +} + +impl Hash for dyn LogicalType { + fn hash(&self, state: &mut H) { + self.signature().hash(state); + self.native().hash(state); + } +} diff --git a/datafusion/common/src/types/mod.rs b/datafusion/common/src/types/mod.rs new file mode 100644 index 
000000000000..2f9ce4ce0282 --- /dev/null +++ b/datafusion/common/src/types/mod.rs @@ -0,0 +1,26 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +mod builtin; +mod field; +mod logical; +mod native; + +pub use builtin::*; +pub use field::*; +pub use logical::*; +pub use native::*; diff --git a/datafusion/common/src/types/native.rs b/datafusion/common/src/types/native.rs new file mode 100644 index 000000000000..bfb546783ea2 --- /dev/null +++ b/datafusion/common/src/types/native.rs @@ -0,0 +1,399 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +use super::{ + LogicalField, LogicalFieldRef, LogicalFields, LogicalType, LogicalUnionFields, + TypeSignature, +}; +use crate::error::{Result, _internal_err}; +use arrow::compute::can_cast_types; +use arrow_schema::{ + DataType, Field, FieldRef, Fields, IntervalUnit, TimeUnit, UnionFields, +}; +use std::sync::Arc; + +/// Representation of a type that DataFusion can handle natively. It is a subset +/// of the physical variants in Arrow's native [`DataType`]. +#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] +pub enum NativeType { + /// Null type + Null, + /// A boolean type representing the values `true` and `false`. + Boolean, + /// A signed 8-bit integer. + Int8, + /// A signed 16-bit integer. + Int16, + /// A signed 32-bit integer. + Int32, + /// A signed 64-bit integer. + Int64, + /// An unsigned 8-bit integer. + UInt8, + /// An unsigned 16-bit integer. + UInt16, + /// An unsigned 32-bit integer. + UInt32, + /// An unsigned 64-bit integer. + UInt64, + /// A 16-bit floating point number. + Float16, + /// A 32-bit floating point number. + Float32, + /// A 64-bit floating point number. + Float64, + /// A timestamp with an optional timezone. + /// + /// Time is measured as a Unix epoch, counting the seconds from + /// 00:00:00.000 on 1 January 1970, excluding leap seconds, + /// as a signed 64-bit integer. 
+ /// + /// The time zone is a string indicating the name of a time zone, one of: + /// + /// * As used in the Olson time zone database (the "tz database" or + /// "tzdata"), such as "America/New_York" + /// * An absolute time zone offset of the form +XX:XX or -XX:XX, such as +07:30 + /// + /// Timestamps with a non-empty timezone + /// ------------------------------------ + /// + /// If a Timestamp column has a non-empty timezone value, its epoch is + /// 1970-01-01 00:00:00 (January 1st 1970, midnight) in the *UTC* timezone + /// (the Unix epoch), regardless of the Timestamp's own timezone. + /// + /// Therefore, timestamp values with a non-empty timezone correspond to + /// physical points in time together with some additional information about + /// how the data was obtained and/or how to display it (the timezone). + /// + /// For example, the timestamp value 0 with the timezone string "Europe/Paris" + /// corresponds to "January 1st 1970, 00h00" in the UTC timezone, but the + /// application may prefer to display it as "January 1st 1970, 01h00" in + /// the Europe/Paris timezone (which is the same physical point in time). + /// + /// One consequence is that timestamp values with a non-empty timezone + /// can be compared and ordered directly, since they all share the same + /// well-known point of reference (the Unix epoch). + /// + /// Timestamps with an unset / empty timezone + /// ----------------------------------------- + /// + /// If a Timestamp column has no timezone value, its epoch is + /// 1970-01-01 00:00:00 (January 1st 1970, midnight) in an *unknown* timezone. + /// + /// Therefore, timestamp values without a timezone cannot be meaningfully + /// interpreted as physical points in time, but only as calendar / clock + /// indications ("wall clock time") in an unspecified timezone. 
+ /// + /// For example, the timestamp value 0 with an empty timezone string + /// corresponds to "January 1st 1970, 00h00" in an unknown timezone: there + /// is not enough information to interpret it as a well-defined physical + /// point in time. + /// + /// One consequence is that timestamp values without a timezone cannot + /// be reliably compared or ordered, since they may have different points of + /// reference. In particular, it is *not* possible to interpret an unset + /// or empty timezone as the same as "UTC". + /// + /// Conversion between timezones + /// ---------------------------- + /// + /// If a Timestamp column has a non-empty timezone, changing the timezone + /// to a different non-empty value is a metadata-only operation: + /// the timestamp values need not change as their point of reference remains + /// the same (the Unix epoch). + /// + /// However, if a Timestamp column has no timezone value, changing it to a + /// non-empty value requires thinking about the desired semantics. + /// One possibility is to assume that the original timestamp values are + /// relative to the epoch of the timezone being set; timestamp values should + /// then be adjusted to the Unix epoch (for example, changing the timezone from + /// empty to "Europe/Paris" would require converting the timestamp values + /// from "Europe/Paris" to "UTC", which seems counter-intuitive but is + /// nevertheless correct). + /// + /// ``` + /// # use arrow_schema::{DataType, TimeUnit}; + /// DataType::Timestamp(TimeUnit::Second, None); + /// DataType::Timestamp(TimeUnit::Second, Some("literal".into())); + /// DataType::Timestamp(TimeUnit::Second, Some("string".to_string().into())); + /// ``` + Timestamp(TimeUnit, Option>), + /// A signed date representing the elapsed time since UNIX epoch (1970-01-01) + /// in days. + Date, + /// A signed time representing the elapsed time since midnight in the unit of `TimeUnit`. 
+ Time(TimeUnit), + /// Measure of elapsed time in either seconds, milliseconds, microseconds or nanoseconds. + Duration(TimeUnit), + /// A "calendar" interval which models types that don't necessarily + /// have a precise duration without the context of a base timestamp (e.g. + /// days can differ in length during daylight saving time transitions). + Interval(IntervalUnit), + /// Opaque binary data of variable length. + Binary, + /// Opaque binary data of fixed size. + /// Enum parameter specifies the number of bytes per value. + FixedSizeBinary(i32), + /// A variable-length string in Unicode with UTF-8 encoding. + String, + /// A list of some logical data type with variable length. + List(LogicalFieldRef), + /// A list of some logical data type with fixed length. + FixedSizeList(LogicalFieldRef, i32), + /// A nested type that contains a number of sub-fields. + Struct(LogicalFields), + /// A nested type that can represent slots of differing types. + Union(LogicalUnionFields), + /// Decimal value with precision and scale + /// + /// * precision is the total number of digits + /// * scale is the number of digits past the decimal + /// + /// For example the number 123.45 has precision 5 and scale 2. + /// + /// In certain situations, scale could be a negative number. For + /// negative scale, it is the number of padding zeros to the right + /// of the digits. + /// + /// For example the number 12300 could be treated as a decimal + /// with precision 3 and scale -2. + Decimal(u8, i8), + /// A Map is a type that represents an association between a key and a value. + /// + /// The key and value types are not constrained, but keys should be + /// hashable and unique. + /// + /// In a field with Map type, the field has a child Struct field, which then has two children: the key type and the value type. The names of the + /// child fields may be respectively "entries", "key", and "value", but this is + /// not enforced. 
+ Map(LogicalFieldRef), +} + +impl LogicalType for NativeType { + fn native(&self) -> &NativeType { + self + } + + fn signature(&self) -> TypeSignature<'_> { + TypeSignature::Native(self) + } + + fn default_cast_for(&self, origin: &DataType) -> Result { + use DataType::*; + + fn default_field_cast(to: &LogicalField, from: &Field) -> Result { + Ok(Arc::new(Field::new( + to.name.clone(), + to.logical_type.default_cast_for(from.data_type())?, + to.nullable, + ))) + } + + Ok(match (self, origin) { + (Self::Null, _) => Null, + (Self::Boolean, _) => Boolean, + (Self::Int8, _) => Int8, + (Self::Int16, _) => Int16, + (Self::Int32, _) => Int32, + (Self::Int64, _) => Int64, + (Self::UInt8, _) => UInt8, + (Self::UInt16, _) => UInt16, + (Self::UInt32, _) => UInt32, + (Self::UInt64, _) => UInt64, + (Self::Float16, _) => Float16, + (Self::Float32, _) => Float32, + (Self::Float64, _) => Float64, + (Self::Decimal(p, s), _) if p <= &38 => Decimal128(*p, *s), + (Self::Decimal(p, s), _) => Decimal256(*p, *s), + (Self::Timestamp(tu, tz), _) => Timestamp(*tu, tz.clone()), + (Self::Date, _) => Date32, + (Self::Time(tu), _) => match tu { + TimeUnit::Second | TimeUnit::Millisecond => Time32(*tu), + TimeUnit::Microsecond | TimeUnit::Nanosecond => Time64(*tu), + }, + (Self::Duration(tu), _) => Duration(*tu), + (Self::Interval(iu), _) => Interval(*iu), + (Self::Binary, LargeUtf8) => LargeBinary, + (Self::Binary, Utf8View) => BinaryView, + (Self::Binary, data_type) if can_cast_types(data_type, &BinaryView) => { + BinaryView + } + (Self::Binary, data_type) if can_cast_types(data_type, &LargeBinary) => { + LargeBinary + } + (Self::Binary, data_type) if can_cast_types(data_type, &Binary) => Binary, + (Self::FixedSizeBinary(size), _) => FixedSizeBinary(*size), + (Self::String, LargeBinary) => LargeUtf8, + (Self::String, BinaryView) => Utf8View, + (Self::String, data_type) if can_cast_types(data_type, &Utf8View) => Utf8View, + (Self::String, data_type) if can_cast_types(data_type, &LargeUtf8) => { 
+ LargeUtf8 + } + (Self::String, data_type) if can_cast_types(data_type, &Utf8) => Utf8, + (Self::List(to_field), List(from_field) | FixedSizeList(from_field, _)) => { + List(default_field_cast(to_field, from_field)?) + } + (Self::List(to_field), LargeList(from_field)) => { + LargeList(default_field_cast(to_field, from_field)?) + } + (Self::List(to_field), ListView(from_field)) => { + ListView(default_field_cast(to_field, from_field)?) + } + (Self::List(to_field), LargeListView(from_field)) => { + LargeListView(default_field_cast(to_field, from_field)?) + } + // List array where each element is a len 1 list of the origin type + (Self::List(field), _) => List(Arc::new(Field::new( + field.name.clone(), + field.logical_type.default_cast_for(origin)?, + field.nullable, + ))), + ( + Self::FixedSizeList(to_field, to_size), + FixedSizeList(from_field, from_size), + ) if from_size == to_size => { + FixedSizeList(default_field_cast(to_field, from_field)?, *to_size) + } + ( + Self::FixedSizeList(to_field, size), + List(from_field) + | LargeList(from_field) + | ListView(from_field) + | LargeListView(from_field), + ) => FixedSizeList(default_field_cast(to_field, from_field)?, *size), + // FixedSizeList array where each element is a len 1 list of the origin type + (Self::FixedSizeList(field, size), _) => FixedSizeList( + Arc::new(Field::new( + field.name.clone(), + field.logical_type.default_cast_for(origin)?, + field.nullable, + )), + *size, + ), + // From https://github.com/apache/arrow-rs/blob/56525efbd5f37b89d1b56aa51709cab9f81bc89e/arrow-cast/src/cast/mod.rs#L189-L196 + (Self::Struct(to_fields), Struct(from_fields)) + if from_fields.len() == to_fields.len() => + { + Struct( + from_fields + .iter() + .zip(to_fields.iter()) + .map(|(from, to)| default_field_cast(to, from)) + .collect::>()?, + ) + } + (Self::Struct(to_fields), Null) => Struct( + to_fields + .iter() + .map(|field| { + Ok(Arc::new(Field::new( + field.name.clone(), + field.logical_type.default_cast_for(&Null)?, 
+ field.nullable, + ))) + }) + .collect::>()?, + ), + (Self::Map(to_field), Map(from_field, sorted)) => { + Map(default_field_cast(to_field, from_field)?, *sorted) + } + (Self::Map(field), Null) => Map( + Arc::new(Field::new( + field.name.clone(), + field.logical_type.default_cast_for(&Null)?, + field.nullable, + )), + false, + ), + (Self::Union(to_fields), Union(from_fields, mode)) + if from_fields.len() == to_fields.len() => + { + Union( + from_fields + .iter() + .zip(to_fields.iter()) + .map(|((_, from), (i, to))| { + Ok((*i, default_field_cast(to, from)?)) + }) + .collect::>()?, + *mode, + ) + } + _ => { + return _internal_err!( + "Unavailable default cast for native type {:?} from physical type {:?}", + self, + origin + ) + } + }) + } +} + +// The following From<DataType>, From<&DataType>, ... implementations are temporary +// mapping solutions to provide backwards compatibility while transitioning from +// the purely physical system to a logical / physical system. + +impl From for NativeType { + fn from(value: DataType) -> Self { + use NativeType::*; + match value { + DataType::Null => Null, + DataType::Boolean => Boolean, + DataType::Int8 => Int8, + DataType::Int16 => Int16, + DataType::Int32 => Int32, + DataType::Int64 => Int64, + DataType::UInt8 => UInt8, + DataType::UInt16 => UInt16, + DataType::UInt32 => UInt32, + DataType::UInt64 => UInt64, + DataType::Float16 => Float16, + DataType::Float32 => Float32, + DataType::Float64 => Float64, + DataType::Timestamp(tu, tz) => Timestamp(tu, tz), + DataType::Date32 | DataType::Date64 => Date, + DataType::Time32(tu) | DataType::Time64(tu) => Time(tu), + DataType::Duration(tu) => Duration(tu), + DataType::Interval(iu) => Interval(iu), + DataType::Binary | DataType::LargeBinary | DataType::BinaryView => Binary, + DataType::FixedSizeBinary(size) => FixedSizeBinary(size), + DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View => String, + DataType::List(field) + | DataType::ListView(field) + | DataType::LargeList(field) + | 
DataType::LargeListView(field) => List(Arc::new(field.as_ref().into())), + DataType::FixedSizeList(field, size) => { + FixedSizeList(Arc::new(field.as_ref().into()), size) + } + DataType::Struct(fields) => Struct(LogicalFields::from(&fields)), + DataType::Union(union_fields, _) => { + Union(LogicalUnionFields::from(&union_fields)) + } + DataType::Decimal128(p, s) | DataType::Decimal256(p, s) => Decimal(p, s), + DataType::Map(field, _) => Map(Arc::new(field.as_ref().into())), + DataType::Dictionary(_, data_type) => data_type.as_ref().clone().into(), + DataType::RunEndEncoded(_, field) => field.data_type().clone().into(), + } + } +} + +impl From<&DataType> for NativeType { + fn from(value: &DataType) -> Self { + value.clone().into() + } +}