From 48511c0560e01ed6d073026301bfca23b39291b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=96zg=C3=BCr=20Akkurt?= Date: Thu, 22 Sep 2022 18:03:00 +0300 Subject: [PATCH 01/22] feat: add support for arrow binary type --- polars/polars-arrow/Cargo.toml | 3 +- polars/polars-arrow/src/prelude.rs | 3 +- polars/polars-core/Cargo.toml | 3 +- .../src/chunked_array/iterator/mod.rs | 83 +++++++++++++++++++ .../src/chunked_array/ops/any_value.rs | 12 +++ .../src/chunked_array/ops/downcast.rs | 25 ++++++ polars/polars-core/src/datatypes/_serde.rs | 4 + polars/polars-core/src/datatypes/dtype.rs | 5 ++ polars/polars-core/src/datatypes/field.rs | 4 +- polars/polars-core/src/datatypes/mod.rs | 43 +++++++++- polars/polars-core/src/fmt.rs | 7 ++ polars/polars-core/src/series/any_value.rs | 4 +- polars/polars-io/Cargo.toml | 3 +- 13 files changed, 188 insertions(+), 11 deletions(-) diff --git a/polars/polars-arrow/Cargo.toml b/polars/polars-arrow/Cargo.toml index 80351c76be61..90dda6a50013 100644 --- a/polars/polars-arrow/Cargo.toml +++ b/polars/polars-arrow/Cargo.toml @@ -12,7 +12,8 @@ description = "Arrow interfaces for Polars DataFrame library" # arrow = { package = "arrow2", git = "https://github.com/jorgecarleitao/arrow2", rev = "6c102a0c3e2dbeb185360dd3d5c3637b5e2028fd", features = ["compute_concatenate"], default-features = false } # arrow = { package = "arrow2", path = "../../../arrow2", features = ["compute_concatenate"], default-features = false } # arrow = { package = "arrow2", git = "https://github.com/ritchie46/arrow2", branch = "comparison_and_validity", features = ["compute_concatenate"], default-features = false } -arrow = { package = "arrow2", version = "0.14", default-features = false, features = ["compute_concatenate"] } +arrow = { package = "arrow2", git = "https://github.com/ozgrakkurt/arrow2", default-features = false, features = ["compute_concatenate"] } +# arrow = { package = "arrow2", version = "0.14", default-features = false, features = ["compute_concatenate"] } hashbrown = "0.12" num = "^0.4" serde = { version = "1", features = ["derive"], optional = true } diff --git a/polars/polars-arrow/src/prelude.rs b/polars/polars-arrow/src/prelude.rs index d9d7fb000ef3..7123f8e664dc 100644 --- a/polars/polars-arrow/src/prelude.rs +++ b/polars/polars-arrow/src/prelude.rs @@ -1,4 +1,4 @@ -use arrow::array::{ListArray, Utf8Array}; +use arrow::array::{ListArray, Utf8Array, BinaryArray}; pub use crate::array::default_arrays::*; pub use crate::array::*; @@ -8,4 +8,5 @@ pub use crate::index::*; pub use crate::kernels::rolling::no_nulls::QuantileInterpolOptions; pub type LargeStringArray = Utf8Array; +pub type LargeBinaryArray = BinaryArray; pub type LargeListArray = ListArray; diff --git a/polars/polars-core/Cargo.toml b/polars/polars-core/Cargo.toml index 742cf2a4af27..49dd589e00b8 100644 --- a/polars/polars-core/Cargo.toml +++ b/polars/polars-core/Cargo.toml @@ -175,10 +175,11 @@ thiserror = "^1.0" package = "arrow2" # git = "https://github.com/jorgecarleitao/arrow2" # git = "https://github.com/ritchie46/arrow2" +git = "https://github.com/ozgrakkurt/arrow2" # rev = "6c102a0c3e2dbeb185360dd3d5c3637b5e2028fd" # path = "../../../arrow2" # branch = "comparison_and_validity" -version = "0.14" +# version = "0.14" default-features = false features = [ "compute_aggregate", diff --git a/polars/polars-core/src/chunked_array/iterator/mod.rs b/polars/polars-core/src/chunked_array/iterator/mod.rs index ac5b36f0b439..71e59952e2ee 100644 --- a/polars/polars-core/src/chunked_array/iterator/mod.rs +++ b/polars/polars-core/src/chunked_array/iterator/mod.rs @@ -8,6 +8,7 @@ use crate::series::iterator::SeriesIter; use crate::utils::CustomIterTools; type LargeStringArray = Utf8Array; +type LargeBinaryArray = BinaryArray; type LargeListArray = ListArray; pub mod par; @@ -209,6 +210,88 @@ impl Utf8Chunked { } } +impl<'a> IntoIterator for &'a BinaryChunked { + type Item = Option<&'a [u8]>; + type IntoIter = Box + 'a>; + fn into_iter(self) -> Self::IntoIter { + // we know that we only iterate over length == self.len() + unsafe { Box::new(self.downcast_iter().flatten().trust_my_length(self.len())) } + } +} + +pub struct BinaryIterNoNull<'a> { + array: &'a LargeBinaryArray, + current: usize, + current_end: usize, +} + +impl<'a> BinaryIterNoNull<'a> { + /// create a new iterator + pub fn new(array: &'a LargeBinaryArray) -> Self { + BinaryIterNoNull { + array, + current: 0, + current_end: array.len(), + } + } +} + +impl<'a> Iterator for BinaryIterNoNull<'a> { + type Item = &'a [u8]; + + fn next(&mut self) -> Option { + if self.current == self.current_end { + None + } else { + let old = self.current; + self.current += 1; + unsafe { Some(self.array.value_unchecked(old)) } + } + } + + fn size_hint(&self) -> (usize, Option) { + ( + self.array.len() - self.current, + Some(self.array.len() - self.current), + ) + } +} + +impl<'a> DoubleEndedIterator for BinaryIterNoNull<'a> { + fn next_back(&mut self) -> Option { + if self.current_end == self.current { + None + } else { + self.current_end -= 1; + unsafe { Some(self.array.value_unchecked(self.current_end)) } + } + } +} + +/// all arrays have known size. +impl<'a> ExactSizeIterator for BinaryIterNoNull<'a> {} + +impl BinaryChunked { + #[allow(clippy::wrong_self_convention)] + #[doc(hidden)] + pub fn into_no_null_iter( + &self, + ) -> impl Iterator + + '_ + + Send + + Sync + + ExactSizeIterator + + DoubleEndedIterator + + TrustedLen { + // we know that we only iterate over length == self.len() + unsafe { + self.downcast_iter() + .flat_map(BinaryIterNoNull::new) + .trust_my_length(self.len()) + } + } +} + impl<'a> IntoIterator for &'a ListChunked { type Item = Option; type IntoIter = Box + 'a>; diff --git a/polars/polars-core/src/chunked_array/ops/any_value.rs b/polars/polars-core/src/chunked_array/ops/any_value.rs index 9006c19ef6e0..a47357783567 100644 --- a/polars/polars-core/src/chunked_array/ops/any_value.rs +++ b/polars/polars-core/src/chunked_array/ops/any_value.rs @@ -30,6 +30,7 @@ pub(crate) unsafe fn arr_to_any_value<'a>( // TODO: insert types match dtype { DataType::Utf8 => downcast_and_pack!(LargeStringArray, Utf8), + DataType::Binary => downcast_and_pack!(LargeBinaryArray, Binary), DataType::Boolean => downcast_and_pack!(BooleanArray, Boolean), DataType::UInt8 => downcast_and_pack!(UInt8Array, UInt8), DataType::UInt16 => downcast_and_pack!(UInt16Array, UInt16), @@ -166,6 +167,17 @@ impl ChunkAnyValue for Utf8Chunked { } } +impl ChunkAnyValue for BinaryChunked { + #[inline] + unsafe fn get_any_value_unchecked(&self, index: usize) -> AnyValue { + get_any_value_unchecked!(self, index) + } + + fn get_any_value(&self, index: usize) -> AnyValue { + get_any_value!(self, index) + } +} + impl ChunkAnyValue for ListChunked { #[inline] unsafe fn get_any_value_unchecked(&self, index: usize) -> AnyValue { diff --git a/polars/polars-core/src/chunked_array/ops/downcast.rs b/polars/polars-core/src/chunked_array/ops/downcast.rs index 41f715587ef1..a9744f225abb 100644 --- a/polars/polars-core/src/chunked_array/ops/downcast.rs +++ b/polars/polars-core/src/chunked_array/ops/downcast.rs @@ -129,6 +129,31 @@ impl Utf8Chunked { } } +#[doc(hidden)] +impl BinaryChunked { + pub fn downcast_iter(&self) -> impl Iterator> + DoubleEndedIterator { + // Safety: + // This is the array type that must be in a BinaryChunked + self.chunks.iter().map(|arr| { + // Safety: + // This should be the array type in BinaryChunked + let arr = &**arr; + unsafe { &*(arr as *const dyn Array as *const BinaryArray) } + }) + } + pub fn downcast_chunks(&self) -> Chunks<'_, BinaryArray> { + Chunks::new(&self.chunks) + } + + #[inline] + pub(crate) fn index_to_chunked_index(&self, index: usize) -> (usize, usize) { + if self.chunks.len() == 1 { + return (0, index); + } + index_to_chunked_index(self.downcast_iter().map(|arr| arr.len()), index) + } +} + #[doc(hidden)] impl ListChunked { pub fn downcast_iter(&self) -> impl Iterator> + DoubleEndedIterator { diff --git a/polars/polars-core/src/datatypes/_serde.rs b/polars/polars-core/src/datatypes/_serde.rs index 10bfd7ed4ac0..8d5b47bae4ca 100644 --- a/polars/polars-core/src/datatypes/_serde.rs +++ b/polars/polars-core/src/datatypes/_serde.rs @@ -42,6 +42,8 @@ pub enum SerializableDataType { Float64, /// String data Utf8, + /// Raw binary data + Binary, /// A 32-bit date representing the elapsed time since UNIX epoch (1970-01-01) /// in days (32 bits). Date, @@ -76,6 +78,7 @@ impl From<&DataType> for SerializableDataType { Float32 => Self::Float32, Float64 => Self::Float64, Utf8 => Self::Utf8, + Binary => Self::Binary, Date => Self::Date, Datetime(tu, tz) => Self::Datetime(*tu, tz.clone()), Duration(tu) => Self::Duration(*tu), @@ -105,6 +108,7 @@ impl From for DataType { Float32 => Self::Float32, Float64 => Self::Float64, Utf8 => Self::Utf8, + Binary => Self::Binary, Date => Self::Date, Datetime(tu, tz) => Self::Datetime(tu, tz), Duration(tu) => Self::Duration(tu), diff --git a/polars/polars-core/src/datatypes/dtype.rs b/polars/polars-core/src/datatypes/dtype.rs index 6fcf7d0eaef4..6ab38721fa6e 100644 --- a/polars/polars-core/src/datatypes/dtype.rs +++ b/polars/polars-core/src/datatypes/dtype.rs @@ -17,6 +17,8 @@ pub enum DataType { Float64, /// String data Utf8, + /// Raw binary data + Binary, /// A 32-bit date representing the elapsed time since UNIX epoch (1970-01-01) /// in days (32 bits). Date, @@ -132,6 +134,7 @@ impl DataType { #[allow(clippy::match_like_matches_macro)] match self { DataType::Utf8 + | DataType::Binary | DataType::List(_) | DataType::Date | DataType::Datetime(_, _) @@ -182,6 +185,7 @@ impl DataType { Float32 => ArrowDataType::Float32, Float64 => ArrowDataType::Float64, Utf8 => ArrowDataType::LargeUtf8, + Binary => ArrowDataType::LargeBinary, Date => ArrowDataType::Date32, Datetime(unit, tz) => ArrowDataType::Timestamp(unit.to_arrow(), tz.clone()), Duration(unit) => ArrowDataType::Duration(unit.to_arrow()), @@ -233,6 +237,7 @@ impl Display for DataType { DataType::Float32 => "f32", DataType::Float64 => "f64", DataType::Utf8 => "str", + DataType::Binary => "binary", DataType::Date => "date", DataType::Datetime(tu, tz) => { let s = match tz { diff --git a/polars/polars-core/src/datatypes/field.rs b/polars/polars-core/src/datatypes/field.rs index aeca75576cb6..fbac10eada4e 100644 --- a/polars/polars-core/src/datatypes/field.rs +++ b/polars/polars-core/src/datatypes/field.rs @@ -131,8 +131,8 @@ impl From<&ArrowDataType> for DataType { ArrowDataType::Timestamp(tu, tz) => DataType::Datetime(tu.into(), tz.clone()), ArrowDataType::Duration(tu) => DataType::Duration(tu.into()), ArrowDataType::Date64 => DataType::Datetime(TimeUnit::Milliseconds, None), - ArrowDataType::LargeUtf8 => DataType::Utf8, - ArrowDataType::Utf8 => DataType::Utf8, + ArrowDataType::LargeUtf8 | ArrowDataType::Utf8 => DataType::Utf8, + ArrowDataType::LargeBinary | ArrowDataType::Binary => DataType::Binary, ArrowDataType::Time64(_) | ArrowDataType::Time32(_) => DataType::Time, #[cfg(feature = "dtype-categorical")] ArrowDataType::Dictionary(_, _, _) => DataType::Categorical(None), diff --git a/polars/polars-core/src/datatypes/mod.rs b/polars/polars-core/src/datatypes/mod.rs index c4388b818512..8741bc0f3653 100644 --- a/polars/polars-core/src/datatypes/mod.rs +++ b/polars/polars-core/src/datatypes/mod.rs @@ -43,6 +43,8 @@ use crate::utils::Wrap; pub struct Utf8Type {} +pub struct BinaryType {} + #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] pub struct ListType {} @@ -86,6 +88,12 @@ impl PolarsDataType for Utf8Type { } } +impl PolarsDataType for BinaryType { + fn get_dtype() -> DataType { + DataType::Binary + } +} + pub struct BooleanType {} impl PolarsDataType for BooleanType { @@ -121,6 +129,7 @@ pub trait PolarsSingleType: PolarsDataType {} impl PolarsSingleType for T where T: NativeType + PolarsDataType {} impl PolarsSingleType for Utf8Type {} +impl PolarsSingleType for BinaryType {} pub type ListChunked = ChunkedArray; pub type BooleanChunked = ChunkedArray; @@ -135,6 +144,7 @@ pub type Int64Chunked = ChunkedArray; pub type Float32Chunked = ChunkedArray; pub type Float64Chunked = ChunkedArray; pub type Utf8Chunked = ChunkedArray; +pub type BinaryChunked = ChunkedArray; pub trait NumericNative: PartialOrd @@ -245,6 +255,8 @@ pub enum AnyValue<'a> { Boolean(bool), /// A UTF8 encoded string type. Utf8(&'a str), + /// A raw binary type + Binary(&'a [u8]), /// An unsigned 8-bit integer number. UInt8(u8), /// An unsigned 16-bit integer number. @@ -292,6 +304,8 @@ pub enum AnyValue<'a> { StructOwned(Box<(Vec>, Vec)>), /// A UTF8 encoded string type. Utf8Owned(String), + // A raw binary type + BinaryOwned(Vec), } #[cfg(feature = "serde")] @@ -320,6 +334,8 @@ impl Serialize for AnyValue<'_> { AnyValue::Utf8Owned(v) => { serializer.serialize_newtype_variant(name, 13, "Utf8Owned", v) } + AnyValue::Binary(v) => serializer.serialize_newtype_variant(name, 14, "BinaryOwned", v), + AnyValue::BinaryOwned(v) => serializer.serialize_newtype_variant(name, 14, "BinaryOwned", v), _ => todo!(), } } @@ -347,6 +363,7 @@ impl<'a> Deserialize<'a> for AnyValue<'static> { List, Bool, Utf8Owned, + BinaryOwned, } const VARIANTS: &[&str] = &[ "Null", @@ -363,8 +380,9 @@ impl<'a> Deserialize<'a> for AnyValue<'static> { "List", "Boolean", "Utf8Owned", + "BinaryOwned", ]; - const LAST: u8 = unsafe { std::mem::transmute::<_, u8>(AvField::Utf8Owned) }; + const LAST: u8 = unsafe { std::mem::transmute::<_, u8>(AvField::BinaryOwned) }; struct FieldVisitor; @@ -427,6 +445,7 @@ impl<'a> Deserialize<'a> for AnyValue<'static> { b"List" => AvField::List, b"Bool" => AvField::Bool, b"Utf8Owned" | b"Utf8" => AvField::Utf8Owned, + b"BinaryOwned" | b"Binary" => AvField::BinaryOwned, _ => { return Err(serde::de::Error::unknown_variant( &String::from_utf8_lossy(v), @@ -514,6 +533,10 @@ impl<'a> Deserialize<'a> for AnyValue<'static> { let value = variant.newtype_variant()?; AnyValue::Utf8Owned(value) } + (AvField::BinaryOwned, variant) => { + let value = variant.newtype_variant()?; + AnyValue::BinaryOwned(value) + } }; Ok(out) } @@ -571,7 +594,10 @@ impl<'a> Hash for AnyValue<'a> { UInt16(v) => state.write_u16(*v), UInt32(v) => state.write_u32(*v), UInt64(v) => state.write_u64(*v), - Utf8(s) => state.write(s.as_bytes()), + Utf8(v) => state.write(v.as_bytes()), + Utf8Owned(v) => state.write(v.as_bytes()), + Binary(v) => state.write(v), + BinaryOwned(v) => state.write(v), Boolean(v) => state.write_u8(*v as u8), List(v) => Hash::hash(&Wrap(v.clone()), state), _ => unimplemented!(), @@ -720,7 +746,10 @@ impl<'a> AnyValue<'a> { #[cfg(feature = "dtype-time")] Time(v) => AnyValue::Time(v), List(v) => AnyValue::List(v), - Utf8(s) => AnyValue::Utf8Owned(s.to_string()), + Utf8(v) => AnyValue::Utf8Owned(v.to_string()), + Utf8Owned(v) => AnyValue::Utf8Owned(v), + Binary(v) => AnyValue::BinaryOwned(v.to_vec()), + BinaryOwned(v) => AnyValue::BinaryOwned(v), dt => { return Err(PolarsError::ComputeError( format!("cannot get static AnyValue from {}", dt).into(), @@ -749,6 +778,9 @@ impl PartialEq for AnyValue<'_> { fn eq(&self, other: &Self) -> bool { use AnyValue::*; match (self, other) { + (BinaryOwned(l), BinaryOwned(r)) => l == r, + (Binary(l), Binary(r)) => l == r, + (Utf8Owned(l), Utf8Owned(r)) => l == r, (Utf8(l), Utf8(r)) => l == r, (UInt8(l), UInt8(r)) => l == r, (UInt16(l), UInt16(r)) => l == r, @@ -805,6 +837,9 @@ impl PartialOrd for AnyValue<'_> { (Float32(l), Float32(r)) => l.partial_cmp(r), (Float64(l), Float64(r)) => l.partial_cmp(r), (Utf8(l), Utf8(r)) => l.partial_cmp(r), + (Utf8Owned(l), Utf8Owned(r)) => l.partial_cmp(r), + (Binary(l), Binary(r)) => l.partial_cmp(r), + (BinaryOwned(l), BinaryOwned(r)) => l.partial_cmp(r), _ => None, } } @@ -938,6 +973,8 @@ mod test { ), (ArrowDataType::LargeUtf8, DataType::Utf8), (ArrowDataType::Utf8, DataType::Utf8), + (ArrowDataType::LargeBinary, DataType::Utf8), + (ArrowDataType::Binary, DataType::Utf8), ( ArrowDataType::Time64(ArrowTimeUnit::Nanosecond), DataType::Time, diff --git a/polars/polars-core/src/fmt.rs b/polars/polars-core/src/fmt.rs index fa14e1aab09b..099e51fb508e 100644 --- a/polars/polars-core/src/fmt.rs +++ b/polars/polars-core/src/fmt.rs @@ -651,6 +651,7 @@ impl Display for AnyValue<'_> { AnyValue::Boolean(v) => write!(f, "{}", *v), AnyValue::Utf8(v) => write!(f, "{}", format_args!("\"{}\"", v)), AnyValue::Utf8Owned(v) => write!(f, "{}", format_args!("\"{}\"", v)), + AnyValue::Binary(_) | AnyValue::BinaryOwned(_) => write!(f, "[binary data]"), #[cfg(feature = "dtype-date")] AnyValue::Date(v) => write!(f, "{}", date32_to_date(*v)), #[cfg(feature = "dtype-datetime")] @@ -764,6 +765,12 @@ impl FmtList for Utf8Chunked { } } +impl FmtList for BinaryChunked { + fn fmt_list(&self) -> String { + impl_fmt_list!(self) + } +} + impl FmtList for ListChunked { fn fmt_list(&self) -> String { impl_fmt_list!(self) diff --git a/polars/polars-core/src/series/any_value.rs b/polars/polars-core/src/series/any_value.rs index 9174b4ae6d36..ccb69109e81e 100644 --- a/polars/polars-core/src/series/any_value.rs +++ b/polars/polars-core/src/series/any_value.rs @@ -152,8 +152,8 @@ impl<'a> From<&AnyValue<'a>> for DataType { match val { Null => DataType::Null, Boolean(_) => DataType::Boolean, - Utf8(_) => DataType::Utf8, - Utf8Owned(_) => DataType::Utf8, + Utf8(_) | Utf8Owned(_) => DataType::Utf8, + Binary(_) | BinaryOwned(_) => DataType::Utf8, UInt32(_) => DataType::UInt32, UInt64(_) => DataType::UInt64, Int32(_) => DataType::Int32, diff --git a/polars/polars-io/Cargo.toml b/polars/polars-io/Cargo.toml index f25bdbd96cee..47dc933b7834 100644 --- a/polars/polars-io/Cargo.toml +++ b/polars/polars-io/Cargo.toml @@ -40,7 +40,8 @@ ahash = "0.7" anyhow = "1.0" # arrow = { package = "arrow2", git = "https://github.com/jorgecarleitao/arrow2", rev = "6c102a0c3e2dbeb185360dd3d5c3637b5e2028fd", default-features = false } # arrow = { package = "arrow2", git = "https://github.com/ritchie46/arrow2", branch = "comparison_and_validity", default-features = false } -arrow = { package = "arrow2", version = "0.14", default-features = false } +arrow = { package = "arrow2", git = "https://github.com/ozgrakkurt/arrow2", default-features = false, features = ["compute_concatenate"] } +# arrow = { package = "arrow2", version = "0.14", default-features = false } # arrow = { package = "arrow2", path = "../../../arrow2", default-features = false } csv-core = { version = "0.1.10", optional = true } dirs = "4.0" From fa87cd6f35fcc85480b7e9543b40e2f50de6706a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=96zg=C3=BCr=20Akkurt?= Date: Thu, 22 Sep 2022 18:05:32 +0300 Subject: [PATCH 02/22] fix DataType from AnyValue --- polars/polars-core/src/series/any_value.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/polars/polars-core/src/series/any_value.rs b/polars/polars-core/src/series/any_value.rs index ccb69109e81e..17e994da13af 100644 --- a/polars/polars-core/src/series/any_value.rs +++ b/polars/polars-core/src/series/any_value.rs @@ -153,7 +153,7 @@ impl<'a> From<&AnyValue<'a>> for DataType { Null => DataType::Null, Boolean(_) => DataType::Boolean, Utf8(_) | Utf8Owned(_) => DataType::Utf8, - Binary(_) | BinaryOwned(_) => DataType::Utf8, + Binary(_) | BinaryOwned(_) => DataType::Binary, UInt32(_) => DataType::UInt32, UInt64(_) => DataType::UInt64, Int32(_) => DataType::Int32, From 3ac6fca0cc7ae593905c573c45da6ef2c9e18528 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=96zg=C3=BCr=20Akkurt?= Date: Thu, 22 Sep 2022 18:12:46 +0300 Subject: [PATCH 03/22] fmt --- polars/polars-arrow/src/prelude.rs | 2 +- polars/polars-core/src/datatypes/mod.rs | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/polars/polars-arrow/src/prelude.rs b/polars/polars-arrow/src/prelude.rs index 7123f8e664dc..d72965a66e6e 100644 --- a/polars/polars-arrow/src/prelude.rs +++ b/polars/polars-arrow/src/prelude.rs @@ -1,4 +1,4 @@ -use arrow::array::{ListArray, Utf8Array, BinaryArray}; +use arrow::array::{BinaryArray, ListArray, Utf8Array}; pub use crate::array::default_arrays::*; pub use crate::array::*; diff --git a/polars/polars-core/src/datatypes/mod.rs b/polars/polars-core/src/datatypes/mod.rs index 8741bc0f3653..90fbb563c287 100644 --- a/polars/polars-core/src/datatypes/mod.rs +++ b/polars/polars-core/src/datatypes/mod.rs @@ -335,7 +335,9 @@ impl Serialize for AnyValue<'_> { serializer.serialize_newtype_variant(name, 13, "Utf8Owned", v) } AnyValue::Binary(v) => serializer.serialize_newtype_variant(name, 14, "BinaryOwned", v), - AnyValue::BinaryOwned(v) => serializer.serialize_newtype_variant(name, 14, "BinaryOwned", v), + AnyValue::BinaryOwned(v) => { + serializer.serialize_newtype_variant(name, 14, "BinaryOwned", v) + } _ => todo!(), } } From f3930feb77eeceb02191b3771809d919d73cea93 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=96zg=C3=BCr=20Akkurt?= Date: Thu, 22 Sep 2022 18:28:34 +0300 Subject: [PATCH 04/22] run fmt, fix test failure --- polars/polars-core/src/datatypes/mod.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/polars/polars-core/src/datatypes/mod.rs b/polars/polars-core/src/datatypes/mod.rs index 90fbb563c287..cb597ed97b9d 100644 --- a/polars/polars-core/src/datatypes/mod.rs +++ b/polars/polars-core/src/datatypes/mod.rs @@ -975,8 +975,8 @@ mod test { ), (ArrowDataType::LargeUtf8, DataType::Utf8), (ArrowDataType::Utf8, DataType::Utf8), - (ArrowDataType::LargeBinary, DataType::Utf8), - (ArrowDataType::Binary, DataType::Utf8), + (ArrowDataType::LargeBinary, DataType::Binary), + (ArrowDataType::Binary, DataType::Binary), ( ArrowDataType::Time64(ArrowTimeUnit::Nanosecond), DataType::Time, From e14bae47d933e9a081c0a86a7e6957f44a9ff998 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=96zg=C3=BCr=20Akkurt?= Date: Fri, 23 Sep 2022 15:57:54 +0300 Subject: [PATCH 05/22] impl execution for binary type --- .../polars-arrow/src/array/default_arrays.rs | 22 +- polars/polars-arrow/src/array/get.rs | 24 +- polars/polars-arrow/src/array/mod.rs | 54 ++- polars/polars-arrow/src/compute/take/mod.rs | 185 +++++++++ polars/polars-arrow/src/data_types.rs | 2 + polars/polars-arrow/src/is_valid.rs | 3 +- polars/polars-arrow/src/trusted_len/mod.rs | 1 + .../src/chunked_array/arithmetic.rs | 69 ++++ .../src/chunked_array/builder/binary.rs | 90 +++++ .../src/chunked_array/builder/from.rs | 9 + .../src/chunked_array/builder/list.rs | 81 ++++ .../src/chunked_array/builder/mod.rs | 52 +++ polars/polars-core/src/chunked_array/cast.rs | 10 + .../src/chunked_array/comparison.rs | 9 + polars/polars-core/src/chunked_array/mod.rs | 9 + .../src/chunked_array/ops/aggregate.rs | 49 +++ .../src/chunked_array/ops/append.rs | 10 + .../src/chunked_array/ops/apply.rs | 107 +++++ .../src/chunked_array/ops/compare_inner.rs | 47 ++- .../src/chunked_array/ops/explode.rs | 41 ++ .../src/chunked_array/ops/extend.rs | 37 ++ .../src/chunked_array/ops/fill_null.rs | 61 +++ .../src/chunked_array/ops/filter.rs | 22 ++ .../polars-core/src/chunked_array/ops/full.rs | 20 + .../src/chunked_array/ops/is_in.rs | 63 +++ .../polars-core/src/chunked_array/ops/mod.rs | 6 + .../src/chunked_array/ops/repeat_by.rs | 17 + .../src/chunked_array/ops/reverse.rs | 1 + .../polars-core/src/chunked_array/ops/set.rs | 74 ++++ .../src/chunked_array/ops/shift.rs | 12 + .../src/chunked_array/ops/sort/mod.rs | 130 +++++++ .../src/chunked_array/ops/take/mod.rs | 81 ++++ .../chunked_array/ops/take/take_chunked.rs | 32 ++ .../src/chunked_array/ops/take/take_every.rs | 12 + .../src/chunked_array/ops/take/take_random.rs | 64 +++ .../src/chunked_array/ops/take/take_single.rs | 11 + .../src/chunked_array/ops/unique/mod.rs | 71 ++++ .../polars-core/src/chunked_array/ops/zip.rs | 26 ++ .../src/chunked_array/trusted_len.rs | 20 + .../src/chunked_array/upstream_traits.rs | 28 ++ polars/polars-core/src/fmt.rs | 6 + .../frame/groupby/aggregations/agg_list.rs | 25 ++ .../src/frame/groupby/into_groups.rs | 52 +++ polars/polars-core/src/frame/hash_join/mod.rs | 1 + polars/polars-core/src/named_from.rs | 64 +++ polars/polars-core/src/prelude.rs | 6 +- polars/polars-core/src/series/any_value.rs | 11 + .../src/series/arithmetic/borrowed.rs | 8 + .../src/series/implementations/binary.rs | 367 ++++++++++++++++++ .../src/series/implementations/mod.rs | 2 + polars/polars-core/src/series/ops/downcast.rs | 12 + polars/polars-core/src/vector_hasher.rs | 28 ++ polars/polars-lazy/src/logical_plan/format.rs | 1 + polars/polars-lazy/src/logical_plan/lit.rs | 16 + .../src/physical_plan/expressions/literal.rs | 1 + 55 files changed, 2253 insertions(+), 9 deletions(-) create mode 100644 polars/polars-core/src/chunked_array/builder/binary.rs create mode 100644 polars/polars-core/src/series/implementations/binary.rs diff --git a/polars/polars-arrow/src/array/default_arrays.rs b/polars/polars-arrow/src/array/default_arrays.rs index 8345e3e79f7f..7ac32c0a8ad7 100644 --- a/polars/polars-arrow/src/array/default_arrays.rs +++ b/polars/polars-arrow/src/array/default_arrays.rs @@ -1,4 +1,4 @@ -use arrow::array::{BooleanArray, PrimitiveArray, Utf8Array}; +use arrow::array::{BinaryArray, BooleanArray, PrimitiveArray, Utf8Array}; use arrow::bitmap::Bitmap; use arrow::buffer::Buffer; use arrow::datatypes::DataType; @@ -40,3 +40,23 @@ impl FromDataUtf8 for Utf8Array { Utf8Array::from_data_unchecked(DataType::LargeUtf8, offsets, values, validity) } } + +pub trait FromDataBinary { + /// # Safety + /// `values` buffer must contain valid utf8 between every `offset` + unsafe fn from_data_unchecked_default( + offsets: Buffer, + values: Buffer, + validity: Option, + ) -> Self; +} + +impl FromDataBinary for BinaryArray { + unsafe fn from_data_unchecked_default( + offsets: Buffer, + values: Buffer, + validity: Option, + ) -> Self { + BinaryArray::from_data_unchecked(DataType::LargeBinary, offsets, values, validity) + } +} diff --git a/polars/polars-arrow/src/array/get.rs b/polars/polars-arrow/src/array/get.rs index 61b0ee324a19..0bf196194ae0 100644 --- a/polars/polars-arrow/src/array/get.rs +++ b/polars/polars-arrow/src/array/get.rs @@ -1,4 +1,4 @@ -use arrow::array::{Array, BooleanArray, ListArray, PrimitiveArray, Utf8Array}; +use arrow::array::{Array, BinaryArray, BooleanArray, ListArray, PrimitiveArray, Utf8Array}; use arrow::types::NativeType; use crate::is_valid::IsValid; @@ -79,6 +79,28 @@ impl<'a> ArrowGetItem for &'a Utf8Array { } } +impl<'a> ArrowGetItem for &'a BinaryArray { + type Item = &'a [u8]; + + #[inline] + fn get(&self, item: usize) -> Option { + if item >= self.len() { + None + } else { + unsafe { self.get_unchecked(item) } + } + } + + #[inline] + unsafe fn get_unchecked(&self, item: usize) -> Option { + if self.is_null_unchecked(item) { + None + } else { + Some(self.value_unchecked(item)) + } + } +} + impl ArrowGetItem for ListArray { type Item = Box; diff --git a/polars/polars-arrow/src/array/mod.rs b/polars/polars-arrow/src/array/mod.rs index 1c5c37015c00..e882b587db0c 100644 --- a/polars/polars-arrow/src/array/mod.rs +++ b/polars/polars-arrow/src/array/mod.rs @@ -1,4 +1,4 @@ -use arrow::array::{Array, BooleanArray, ListArray, PrimitiveArray, Utf8Array}; +use arrow::array::{Array, BinaryArray, BooleanArray, ListArray, PrimitiveArray, Utf8Array}; use arrow::bitmap::MutableBitmap; use arrow::datatypes::DataType; use arrow::types::NativeType; @@ -30,6 +30,12 @@ impl ValueSize for Utf8Array { } } +impl ValueSize for BinaryArray { + fn get_values_size(&self) -> usize { + self.values().len() + } +} + impl ValueSize for ArrayRef { fn get_values_size(&self) -> usize { match self.data_type() { @@ -179,6 +185,52 @@ pub trait ListFromIter { Some(validity.into()), ) } + + /// Create a list-array from an iterator. + /// Used in groupby agg-list + /// + /// # Safety + /// Will produce incorrect arrays if size hint is incorrect. + unsafe fn from_iter_binary_trusted_len(iter: I, n_elements: usize) -> ListArray + where + I: IntoIterator>, + P: IntoIterator>, + Ref: AsRef<[u8]>, + { + let iterator = iter.into_iter(); + let (lower, _) = iterator.size_hint(); + + let mut validity = MutableBitmap::with_capacity(lower); + let mut offsets = Vec::::with_capacity(lower + 1); + let mut length_so_far = 0i64; + offsets.push(length_so_far); + let values: BinaryArray = iterator + .filter_map(|opt_iter| match opt_iter { + Some(x) => { + let it = x.into_iter(); + length_so_far += it.size_hint().0 as i64; + validity.push(true); + offsets.push(length_so_far); + Some(it) + } + None => { + validity.push(false); + None + } + }) + .flatten() + .trust_my_length(n_elements) + .collect(); + + // Safety: + // offsets are monotonically increasing + ListArray::new_unchecked( + ListArray::::default_datatype(DataType::LargeBinary), + offsets.into(), + Box::new(values), + Some(validity.into()), + ) + } } impl ListFromIter for ListArray {} diff --git a/polars/polars-arrow/src/compute/take/mod.rs b/polars/polars-arrow/src/compute/take/mod.rs index 4b574414cc61..7e653b5f8f38 100644 --- a/polars/polars-arrow/src/compute/take/mod.rs +++ b/polars/polars-arrow/src/compute/take/mod.rs @@ -327,6 +327,21 @@ pub unsafe fn take_no_null_utf8_iter_unchecked>( Box::new(MutableUtf8Array::::from_trusted_len_values_iter_unchecked(iter).into()) } +/// # Safety +/// - no bounds checks +/// - iterator must be TrustedLen +#[inline] +pub unsafe fn take_no_null_binary_iter_unchecked>( + arr: &LargeBinaryArray, + indices: I, +) -> Box { + let iter = indices.into_iter().map(|idx| { + debug_assert!(idx < arr.len()); + arr.value_unchecked(idx) + }); + Box::new(MutableBinaryArray::::from_trusted_len_values_iter_unchecked(iter).into()) +} + /// # Safety /// - no bounds checks /// - iterator must be TrustedLen @@ -348,6 +363,27 @@ pub unsafe fn take_utf8_iter_unchecked>( Box::new(LargeStringArray::from_trusted_len_iter_unchecked(iter)) } +/// # Safety +/// - no bounds checks +/// - iterator must be TrustedLen +#[inline] +pub unsafe fn take_binary_iter_unchecked>( + arr: &LargeBinaryArray, + indices: I, +) -> Box { + let validity = arr.validity().expect("should have nulls"); + let iter = indices.into_iter().map(|idx| { + debug_assert!(idx < arr.len()); + if validity.get_bit_unchecked(idx) { + Some(arr.value_unchecked(idx)) + } else { + None + } + }); + + Box::new(LargeBinaryArray::from_trusted_len_iter_unchecked(iter)) +} + /// # Safety /// - no bounds checks /// - iterator must be TrustedLen @@ -363,6 +399,21 @@ pub unsafe fn take_no_null_utf8_opt_iter_unchecked>>( + arr: &LargeBinaryArray, + indices: I, +) -> Box { + let iter = indices + .into_iter() + .map(|opt_idx| opt_idx.map(|idx| arr.value_unchecked(idx))); + + Box::new(LargeBinaryArray::from_trusted_len_iter_unchecked(iter)) +} + /// # Safety /// - no bounds checks /// - iterator must be TrustedLen @@ -384,6 +435,27 @@ pub unsafe fn take_utf8_opt_iter_unchecked> Box::new(LargeStringArray::from_trusted_len_iter_unchecked(iter)) } +/// # Safety +/// - no bounds checks +/// - iterator must be TrustedLen +#[inline] +pub unsafe fn take_binary_opt_iter_unchecked>>( + arr: &LargeBinaryArray, + indices: I, +) -> Box { + let validity = arr.validity().expect("should have nulls"); + let iter = indices.into_iter().map(|opt_idx| { + opt_idx.and_then(|idx| { + if validity.get_bit_unchecked(idx) { + Some(arr.value_unchecked(idx)) + } else { + None + } + }) + }); + Box::new(LargeBinaryArray::from_trusted_len_iter_unchecked(iter)) +} + /// # Safety /// caller must ensure indices are in bounds pub unsafe fn take_utf8_unchecked( @@ -497,6 +569,119 @@ pub unsafe fn take_utf8_unchecked( )) } +/// # Safety +/// caller must ensure indices are in bounds +pub unsafe fn take_binary_unchecked( + arr: &LargeBinaryArray, + indices: &IdxArr, +) -> Box { + let data_len = indices.len(); + + let mut offset_buf = vec![0; data_len + 1]; + let offset_typed = offset_buf.as_mut_slice(); + + let mut length_so_far = 0; + offset_typed[0] = length_so_far; + + let validity; + + // The required size is yet unknown + // Allocate 2.0 times the expected size. + // where expected size is the length of bytes multiplied by the factor (take_len / current_len) + let mut values_capacity = if arr.len() > 0 { + ((arr.len() as f32 * 2.0) as usize) / arr.len() * indices.len() as usize + } else { + 0 + }; + + // 16 bytes per string as default alloc + let mut values_buf = Vec::::with_capacity(values_capacity); + + // both 0 nulls + if !arr.has_validity() && !indices.has_validity() { + offset_typed + .iter_mut() + .skip(1) + .enumerate() + .for_each(|(idx, offset)| { + let index = indices.value_unchecked(idx) as usize; + let s = arr.value_unchecked(index); + length_so_far += s.len() as i64; + *offset = length_so_far; + + if length_so_far as usize >= values_capacity { + values_buf.reserve(values_capacity); + values_capacity *= 2; + } + + values_buf.extend_from_slice(s) + }); + validity = None; + } else if !arr.has_validity() { + offset_typed + .iter_mut() + .skip(1) + .enumerate() + .for_each(|(idx, offset)| { + if indices.is_valid(idx) { + let index = indices.value_unchecked(idx) as usize; + let s = arr.value_unchecked(index); + length_so_far += s.len() as i64; + + if length_so_far as usize >= values_capacity { + values_buf.reserve(values_capacity); + values_capacity *= 2; + } + + values_buf.extend_from_slice(s) + } + *offset = length_so_far; + }); + validity = indices.validity().cloned(); + } else { + let mut builder = MutableBinaryArray::with_capacities(data_len, length_so_far as usize); + let validity_arr = arr.validity().expect("should have nulls"); + + if !indices.has_validity() { + (0..data_len).for_each(|idx| { + let index = indices.value_unchecked(idx) as usize; + builder.push(if validity_arr.get_bit_unchecked(index) { + let s = arr.value_unchecked(index); + Some(s) + } else { + None + }); + }); + } else { + let validity_indices = indices.validity().expect("should have nulls"); + (0..data_len).for_each(|idx| { + if validity_indices.get_bit_unchecked(idx) { + let index = indices.value_unchecked(idx) as usize; + + if validity_arr.get_bit_unchecked(index) { + let s = arr.value_unchecked(index); + builder.push(Some(s)); + } else { + builder.push_null(); + } + } else { + builder.push_null(); + } + }); + } + + let array: BinaryArray = builder.into(); + return Box::new(array); + } + + // Safety: all "values" are &str, and thus valid utf8 + Box::new(BinaryArray::::from_data_unchecked_default( + offset_buf.into(), + values_buf.into(), + validity, + )) +} + /// Forked and adapted from arrow-rs /// This is faster because it does no bounds checks and allocates directly into aligned memory /// diff --git a/polars/polars-arrow/src/data_types.rs b/polars/polars-arrow/src/data_types.rs index 3712a9d26dbe..1ea62a367865 100644 --- a/polars/polars-arrow/src/data_types.rs +++ b/polars/polars-arrow/src/data_types.rs @@ -25,6 +25,7 @@ unsafe impl IsFloat for u16 {} unsafe impl IsFloat for u32 {} unsafe impl IsFloat for u64 {} unsafe impl IsFloat for &str {} +unsafe impl IsFloat for &[u8] {} unsafe impl IsFloat for bool {} unsafe impl IsFloat for Option {} @@ -41,6 +42,7 @@ mod private { impl Sealed for f32 {} impl Sealed for f64 {} impl Sealed for &str {} + impl Sealed for &[u8] {} impl Sealed for bool {} impl Sealed for Option {} } diff --git a/polars/polars-arrow/src/is_valid.rs b/polars/polars-arrow/src/is_valid.rs index 75189d4f6d4d..1f10b6eaf0c1 100644 --- a/polars/polars-arrow/src/is_valid.rs +++ b/polars/polars-arrow/src/is_valid.rs @@ -1,4 +1,4 @@ -use arrow::array::{Array, BooleanArray, ListArray, PrimitiveArray, Utf8Array}; +use arrow::array::{Array, BinaryArray, BooleanArray, ListArray, PrimitiveArray, Utf8Array}; use arrow::types::NativeType; pub trait IsValid { @@ -13,6 +13,7 @@ pub trait IsValid { pub trait ArrowArray: Array {} +impl ArrowArray for BinaryArray {} impl ArrowArray for Utf8Array {} impl ArrowArray for PrimitiveArray {} impl ArrowArray for BooleanArray {} diff --git a/polars/polars-arrow/src/trusted_len/mod.rs b/polars/polars-arrow/src/trusted_len/mod.rs index 085b5f648502..c3c73cfca8cf 100644 --- a/polars/polars-arrow/src/trusted_len/mod.rs +++ b/polars/polars-arrow/src/trusted_len/mod.rs @@ -65,6 +65,7 @@ unsafe impl TrustedLen for std::iter::Rev, J> TrustedLen for TrustMyLength {} unsafe impl TrustedLen for std::ops::Range where std::ops::Range: Iterator {} unsafe impl TrustedLen for arrow::array::Utf8ValuesIter<'_, i64> {} +unsafe impl TrustedLen for arrow::array::BinaryValueIter<'_, i64> {} unsafe impl> TrustedLen for ZipValidity<'_, T, I> {} unsafe impl TrustedLen for BitmapIter<'_> {} unsafe impl TrustedLen for std::iter::StepBy {} diff --git a/polars/polars-core/src/chunked_array/arithmetic.rs b/polars/polars-core/src/chunked_array/arithmetic.rs index 289e68dd8e03..687aeb0048ff 100644 --- a/polars/polars-core/src/chunked_array/arithmetic.rs +++ b/polars/polars-core/src/chunked_array/arithmetic.rs @@ -430,6 +430,13 @@ fn concat_strings(l: &str, r: &str) -> String { s } +fn concat_binary_arrs(l: &[u8], r: &[u8]) -> Vec { + let mut v = Vec::with_capacity(l.len() + r.len()); + v.extend_from_slice(l); + v.extend_from_slice(r); + v +} + impl Add for &Utf8Chunked { type Output = Utf8Chunked; @@ -492,6 +499,68 @@ impl Add<&str> for &Utf8Chunked { } } +impl Add for &BinaryChunked { + type Output = BinaryChunked; + + fn add(self, rhs: Self) -> Self::Output { + // broadcasting path rhs + if rhs.len() == 1 { + let rhs = rhs.get(0); + return match rhs { + Some(rhs) => self.add(rhs), + None => BinaryChunked::full_null(self.name(), self.len()), + }; + } + // broadcasting path lhs + if self.len() == 1 { + let lhs = self.get(0); + return match lhs { + Some(lhs) => rhs.apply(|s| Cow::Owned(concat_binary_arrs(lhs, s))), + None => BinaryChunked::full_null(self.name(), rhs.len()), + }; + } + + // todo! add no_null variants. Need 4 paths. + let mut ca: Self::Output = self + .into_iter() + .zip(rhs.into_iter()) + .map(|(opt_l, opt_r)| match (opt_l, opt_r) { + (Some(l), Some(r)) => Some(concat_binary_arrs(l, r)), + _ => None, + }) + .collect_trusted(); + ca.rename(self.name()); + ca + } +} + +impl Add for BinaryChunked { + type Output = BinaryChunked; + + fn add(self, rhs: Self) -> Self::Output { + (&self).add(&rhs) + } +} + +impl Add<&[u8]> for &BinaryChunked { + type Output = BinaryChunked; + + fn add(self, rhs: &[u8]) -> Self::Output { + let mut ca: Self::Output = match self.has_validity() { + false => self + .into_no_null_iter() + .map(|l| concat_binary_arrs(l, rhs)) + .collect_trusted(), + _ => self + .into_iter() + .map(|opt_l| opt_l.map(|l| concat_binary_arrs(l, rhs))) + .collect_trusted(), + }; + ca.rename(self.name()); + ca + } +} + #[cfg(test)] pub(crate) mod test { use crate::prelude::*; diff --git a/polars/polars-core/src/chunked_array/builder/binary.rs b/polars/polars-core/src/chunked_array/builder/binary.rs new file mode 100644 index 000000000000..cb2170e72b9a --- /dev/null +++ b/polars/polars-core/src/chunked_array/builder/binary.rs @@ -0,0 +1,90 @@ +use super::*; + +pub struct BinaryChunkedBuilder { + pub(crate) builder: MutableBinaryArray, + pub capacity: usize, + field: Field, +} + +impl BinaryChunkedBuilder { + /// Create a new UtfChunkedBuilder + /// + /// # Arguments + /// + /// * `capacity` - Number of string elements in the final array. + /// * `bytes_capacity` - Number of bytes needed to store the string values. + pub fn new(name: &str, capacity: usize, bytes_capacity: usize) -> Self { + BinaryChunkedBuilder { + builder: MutableBinaryArray::::with_capacities(capacity, bytes_capacity), + capacity, + field: Field::new(name, DataType::Binary), + } + } + + /// Appends a value of type `T` into the builder + #[inline] + pub fn append_value>(&mut self, v: S) { + self.builder.push(Some(v.as_ref())); + } + + /// Appends a null slot into the builder + #[inline] + pub fn append_null(&mut self) { + self.builder.push::<&[u8]>(None); + } + + #[inline] + pub fn append_option>(&mut self, opt: Option) { + self.builder.push(opt); + } + + pub fn finish(mut self) -> BinaryChunked { + let arr = self.builder.as_box(); + let length = arr.len() as IdxSize; + + ChunkedArray { + field: Arc::new(self.field), + chunks: vec![arr], + phantom: PhantomData, + categorical_map: None, + bit_settings: Default::default(), + length, + } + } + + fn shrink_to_fit(&mut self) { + self.builder.shrink_to_fit() + } +} + +pub struct BinaryChunkedBuilderCow { + builder: BinaryChunkedBuilder, +} + +impl BinaryChunkedBuilderCow { + pub fn new(name: &str, capacity: usize) -> Self { + BinaryChunkedBuilderCow { + builder: BinaryChunkedBuilder::new(name, capacity, capacity), + } + } +} + +impl ChunkedBuilder, BinaryType> for BinaryChunkedBuilderCow { + #[inline] + fn append_value(&mut self, val: Cow<'_, [u8]>) { + self.builder.append_value(val.as_ref()) + } + + #[inline] + fn append_null(&mut self) { + self.builder.append_null() + } + + fn finish(self) -> ChunkedArray { + self.builder.finish() + } + + fn shrink_to_fit(&mut self) { + self.builder.shrink_to_fit() + } +} diff --git a/polars/polars-core/src/chunked_array/builder/from.rs b/polars/polars-core/src/chunked_array/builder/from.rs index 0b2a5e8b7567..2450d83da596 100644 --- a/polars/polars-core/src/chunked_array/builder/from.rs +++ b/polars/polars-core/src/chunked_array/builder/from.rs @@ -40,3 +40,12 @@ impl From<(&str, Utf8Array)> for Utf8Chunked { ChunkedArray::from_chunks(name, vec![Box::new(arr)]) } } + +impl From<(&str, BinaryArray)> for BinaryChunked { + fn from(tpl: (&str, BinaryArray)) -> Self { + let name = tpl.0; + let arr = tpl.1; + + ChunkedArray::from_chunks(name, vec![Box::new(arr)]) + } +} diff --git a/polars/polars-core/src/chunked_array/builder/list.rs b/polars/polars-core/src/chunked_array/builder/list.rs index 9c3363343ff3..8f53916330ae 100644 --- a/polars/polars-core/src/chunked_array/builder/list.rs +++ b/polars/polars-core/src/chunked_array/builder/list.rs @@ -179,6 +179,7 @@ where type LargePrimitiveBuilder = MutableListArray>; type LargeListUtf8Builder = MutableListArray>; +type LargeListBinaryBuilder = MutableListArray>; type LargeListBooleanBuilder = MutableListArray; pub struct ListUtf8ChunkedBuilder { @@ -261,6 +262,86 @@ impl ListBuilderTrait for ListUtf8ChunkedBuilder { } } +pub struct ListBinaryChunkedBuilder { + builder: LargeListBinaryBuilder, + field: Field, + fast_explode: bool, +} + +impl ListBinaryChunkedBuilder { + pub fn new(name: &str, capacity: usize, values_capacity: usize) -> Self { + let values = MutableBinaryArray::::with_capacity(values_capacity); + let builder = LargeListBinaryBuilder::new_with_capacity(values, capacity); + let field = Field::new(name, DataType::List(Box::new(DataType::Binary))); + + ListBinaryChunkedBuilder { + builder, + field, + fast_explode: true, + } + } + + pub fn append_trusted_len_iter<'a, I: Iterator> + TrustedLen>( + &mut self, + iter: I, + ) { + let values = self.builder.mut_values(); + + if iter.size_hint().0 == 0 { + self.fast_explode = false; + } + // Safety + // trusted len, trust the type system + unsafe { values.extend_trusted_len_unchecked(iter) }; + self.builder.try_push_valid().unwrap(); + } + + pub fn append_values_iter<'a, I: Iterator>(&mut self, iter: I) { + let values = self.builder.mut_values(); + + if iter.size_hint().0 == 0 { + self.fast_explode = false; + } + values.extend_values(iter); + self.builder.try_push_valid().unwrap(); + } + + pub(crate) fn append(&mut self, ca: &BinaryChunked) { + let value_builder = self.builder.mut_values(); + value_builder.try_extend(ca).unwrap(); + self.builder.try_push_valid().unwrap(); + } +} + +impl ListBuilderTrait for ListBinaryChunkedBuilder { + fn append_opt_series(&mut self, opt_s: Option<&Series>) { + match opt_s { + Some(s) => self.append_series(s), + None => { + self.append_null(); + } + } + } + + #[inline] + fn append_null(&mut self) { + self.fast_explode = false; + self.builder.push_null(); + } + + fn append_series(&mut self, s: &Series) { + if s.is_empty() { + self.fast_explode = false; + } + let ca = s.binary().unwrap(); + self.append(ca) + } + + fn finish(&mut self) -> ListChunked { + finish_list_builder!(self) + } +} + pub struct ListBooleanChunkedBuilder { builder: LargeListBooleanBuilder, field: Field, diff --git a/polars/polars-core/src/chunked_array/builder/mod.rs b/polars/polars-core/src/chunked_array/builder/mod.rs index ec6161f2fafb..da530aa42ab6 100644 --- a/polars/polars-core/src/chunked_array/builder/mod.rs +++ b/polars/polars-core/src/chunked_array/builder/mod.rs @@ -1,3 +1,4 @@ +mod binary; mod boolean; mod from; pub mod list; @@ -11,6 +12,7 @@ use std::sync::Arc; use arrow::array::*; use arrow::bitmap::Bitmap; +pub use binary::*; pub use boolean::*; pub use list::*; pub use primitive::*; @@ -159,6 +161,48 @@ where } } +impl NewChunkedArray for BinaryChunked +where + B: AsRef<[u8]>, +{ + fn from_slice(name: &str, v: &[B]) -> Self { + let values_size = v.iter().fold(0, |acc, s| acc + s.as_ref().len()); + + let mut builder = MutableBinaryArray::::with_capacities(v.len(), values_size); + builder.extend_trusted_len_values(v.iter().map(|s| s.as_ref())); + + let chunks = vec![builder.as_box()]; + ChunkedArray::from_chunks(name, chunks) + } + + fn from_slice_options(name: &str, opt_v: &[Option]) -> Self { + let values_size = opt_v.iter().fold(0, |acc, s| match s { + Some(s) => acc + s.as_ref().len(), + None => acc, + }); + let mut builder = MutableBinaryArray::::with_capacities(opt_v.len(), values_size); + builder.extend_trusted_len(opt_v.iter().map(|s| s.as_ref())); + + let chunks = vec![builder.as_box()]; + ChunkedArray::from_chunks(name, chunks) + } + + fn from_iter_options(name: &str, it: impl Iterator>) -> Self { + let cap = get_iter_capacity(&it); + let mut builder = BinaryChunkedBuilder::new(name, cap, cap * 5); + it.for_each(|opt| builder.append_option(opt)); + builder.finish() + } + + /// Create a new ChunkedArray from an iterator. + fn from_iter_values(name: &str, it: impl Iterator) -> Self { + let cap = get_iter_capacity(&it); + let mut builder = BinaryChunkedBuilder::new(name, cap, cap * 5); + it.for_each(|v| builder.append_value(v)); + builder.finish() + } +} + #[cfg(test)] mod test { use super::*; @@ -221,4 +265,12 @@ mod test { let ca = builder.finish(); dbg!(ca); } + + #[test] + fn test_list_binary_builder() { + let mut builder = ListBinaryChunkedBuilder::new("a", 10, 10); + builder.append_series(&Series::new("", &[b"foo", b"bar"])); + let ca = builder.finish(); + dbg!(ca); + } } diff --git a/polars/polars-core/src/chunked_array/cast.rs b/polars/polars-core/src/chunked_array/cast.rs index d2488ba84f65..ca7c0c7ac922 100644 --- a/polars/polars-core/src/chunked_array/cast.rs +++ b/polars/polars-core/src/chunked_array/cast.rs @@ -113,6 +113,16 @@ impl ChunkCast for Utf8Chunked { } } +impl ChunkCast for BinaryChunked { + fn cast(&self, data_type: &DataType) -> PolarsResult { + cast_impl(self.name(), &self.chunks, data_type) + } + + fn cast_unchecked(&self, data_type: &DataType) -> PolarsResult { + self.cast(data_type) + } +} + fn boolean_to_utf8(ca: &BooleanChunked) -> Utf8Chunked { ca.into_iter() .map(|opt_b| match opt_b { diff --git a/polars/polars-core/src/chunked_array/comparison.rs b/polars/polars-core/src/chunked_array/comparison.rs index 6b99d078c610..a64431fb331a 100644 --- a/polars/polars-core/src/chunked_array/comparison.rs +++ b/polars/polars-core/src/chunked_array/comparison.rs @@ -897,6 +897,15 @@ impl ChunkEqualElement for Utf8Chunked { } } +impl ChunkEqualElement for BinaryChunked { + unsafe fn equal_element(&self, idx_self: usize, idx_other: usize, other: &Series) -> bool { + let ca_other = other.as_ref().as_ref(); + debug_assert!(self.dtype() == other.dtype()); + let ca_other = &*(ca_other as *const BinaryChunked); + self.get(idx_self) == ca_other.get(idx_other) + } +} + impl ChunkEqualElement for ListChunked {} #[cfg(feature = "dtype-struct")] diff --git a/polars/polars-core/src/chunked_array/mod.rs b/polars/polars-core/src/chunked_array/mod.rs index d7e56b23a4e6..49f72d25edca 100644 --- a/polars/polars-core/src/chunked_array/mod.rs +++ b/polars/polars-core/src/chunked_array/mod.rs @@ -521,6 +521,7 @@ where impl AsSinglePtr for BooleanChunked {} impl AsSinglePtr for ListChunked {} impl AsSinglePtr for Utf8Chunked {} +impl AsSinglePtr for BinaryChunked {} #[cfg(feature = "object")] impl AsSinglePtr for ObjectChunked {} @@ -601,6 +602,14 @@ impl ValueSize for Utf8Chunked { } } +impl ValueSize for BinaryChunked { + fn get_values_size(&self) -> usize { + self.chunks + .iter() + .fold(0usize, |acc, arr| acc + arr.get_values_size()) + } +} + impl ListChunked { /// Get the inner data type of the list. pub fn inner_dtype(&self) -> DataType { diff --git a/polars/polars-core/src/chunked_array/ops/aggregate.rs b/polars/polars-core/src/chunked_array/ops/aggregate.rs index a26acd0da39a..861bec9d1df6 100644 --- a/polars/polars-core/src/chunked_array/ops/aggregate.rs +++ b/polars/polars-core/src/chunked_array/ops/aggregate.rs @@ -692,6 +692,16 @@ impl VarAggSeries for Utf8Chunked { } } +impl VarAggSeries for BinaryChunked { + fn var_as_series(&self, _ddof: u8) -> Series { + Self::full_null(self.name(), 1).into_series() + } + + fn std_as_series(&self, _ddof: u8) -> Series { + Self::full_null(self.name(), 1).into_series() + } +} + macro_rules! impl_quantile_as_series { ($self:expr, $agg:ident, $ty: ty, $qtl:expr, $opt:expr) => {{ let v = $self.$agg($qtl, $opt)?; @@ -804,6 +814,20 @@ impl QuantileAggSeries for Utf8Chunked { } } +impl QuantileAggSeries for BinaryChunked { + fn quantile_as_series( + &self, + _quantile: f64, + _interpol: QuantileInterpolOptions, + ) -> PolarsResult { + Ok(Self::full_null(self.name(), 1).into_series()) + } + + fn median_as_series(&self) -> Series { + Self::full_null(self.name(), 1).into_series() + } +} + impl ChunkAggSeries for BooleanChunked { fn sum_as_series(&self) -> Series { let v = ChunkAgg::sum(self); @@ -849,6 +873,30 @@ impl ChunkAggSeries for Utf8Chunked { } } +impl ChunkAggSeries for BinaryChunked { + fn sum_as_series(&self) -> Series { + BinaryChunked::full_null(self.name(), 1).into_series() + } + fn max_as_series(&self) -> Series { + Series::new( + self.name(), + &[self + .downcast_iter() + .filter_map(compute::aggregate::max_binary) + .fold_first_(|acc, v| if acc > v { acc } else { v })], + ) + } + fn min_as_series(&self) -> Series { + Series::new( + self.name(), + &[self + .downcast_iter() + .filter_map(compute::aggregate::min_binary) + .fold_first_(|acc, v| if acc < v { acc } else { v })], + ) + } +} + macro_rules! one_null_list { ($self:ident, $dtype: expr) => {{ let mut builder = get_list_builder(&$dtype, 0, 1, $self.name()).unwrap(); @@ -892,6 +940,7 @@ where impl ArgAgg for BooleanChunked {} impl ArgAgg for Utf8Chunked {} +impl ArgAgg for BinaryChunked {} impl ArgAgg for ListChunked {} #[cfg(feature = "object")] diff --git a/polars/polars-core/src/chunked_array/ops/append.rs b/polars/polars-core/src/chunked_array/ops/append.rs index fb35c1c4a035..26dc4ca14ac2 100644 --- a/polars/polars-core/src/chunked_array/ops/append.rs +++ b/polars/polars-core/src/chunked_array/ops/append.rs @@ -44,6 +44,16 @@ impl Utf8Chunked { } } +#[doc(hidden)] +impl BinaryChunked { + pub fn append(&mut self, other: &Self) { + let len = self.len(); + self.length += other.length; + new_chunks(&mut self.chunks, &other.chunks, len); + self.set_sorted2(IsSorted::Not); + } +} + #[doc(hidden)] impl ListChunked { pub fn append(&mut self, other: &Self) -> PolarsResult<()> { diff --git a/polars/polars-core/src/chunked_array/ops/apply.rs b/polars/polars-core/src/chunked_array/ops/apply.rs index d344af503144..c669d09de99f 100644 --- a/polars/polars-core/src/chunked_array/ops/apply.rs +++ b/polars/polars-core/src/chunked_array/ops/apply.rs @@ -443,6 +443,99 @@ impl<'a> ChunkApply<'a, &'a str, Cow<'a, str>> for Utf8Chunked { } } +impl<'a> ChunkApply<'a, &'a [u8], Cow<'a, [u8]>> for BinaryChunked { + fn apply_cast_numeric(&'a self, f: F) -> ChunkedArray + where + F: Fn(&'a [u8]) -> S::Native + Copy, + S: PolarsNumericType, + { + let chunks = self + .downcast_iter() + .into_iter() + .map(|array| { + let values = array.values_iter().map(f); + let values = Vec::<_>::from_trusted_len_iter(values); + to_array::(values, array.validity().cloned()) + }) + .collect(); + ChunkedArray::from_chunks(self.name(), chunks) + } + + fn branch_apply_cast_numeric_no_null(&'a self, f: F) -> ChunkedArray + where + F: Fn(Option<&'a [u8]>) -> S::Native + Copy, + S: PolarsNumericType, + { + let chunks = self + .downcast_iter() + .into_iter() + .map(|array| { + let values = array.into_iter().map(f); + let values = Vec::<_>::from_trusted_len_iter(values); + to_array::(values, array.validity().cloned()) + }) + .collect(); + ChunkedArray::from_chunks(self.name(), chunks) + } + + fn apply(&'a self, f: F) -> Self + where + F: Fn(&'a [u8]) -> Cow<'a, [u8]> + Copy, + { + apply!(self, f) + } + + fn try_apply(&'a self, f: F) -> PolarsResult + where + F: Fn(&'a [u8]) -> PolarsResult> + Copy, + { + try_apply!(self, f) + } + + fn apply_on_opt(&'a self, f: F) -> Self + where + F: Fn(Option<&'a [u8]>) -> Option> + Copy, + { + let mut ca: Self = self.into_iter().map(f).collect_trusted(); + ca.rename(self.name()); + ca + } + + fn apply_with_idx(&'a self, f: F) -> Self + where + F: Fn((usize, &'a [u8])) -> Cow<'a, [u8]> + Copy, + { + apply_enumerate!(self, f) + } + + fn apply_with_idx_on_opt(&'a self, f: F) -> Self + where + F: Fn((usize, Option<&'a [u8]>)) -> Option> + Copy, + { + let mut ca: Self = self.into_iter().enumerate().map(f).collect_trusted(); + ca.rename(self.name()); + ca + } + + fn apply_to_slice(&'a self, f: F, slice: &mut [T]) + where + F: Fn(Option<&'a [u8]>, &T) -> T, + { + assert!(slice.len() >= self.len()); + + let mut idx = 0; + self.downcast_iter().for_each(|arr| { + arr.into_iter().for_each(|opt_val| { + // Safety: + // length asserted above + let item = unsafe { slice.get_unchecked_mut(idx) }; + *item = f(opt_val, item); + idx += 1; + }) + }); + } +} + impl ChunkApplyKernel for BooleanChunked { fn apply_kernel(&self, f: &dyn Fn(&BooleanArray) -> ArrayRef) -> Self { let chunks = self.downcast_iter().into_iter().map(f).collect(); @@ -491,6 +584,20 @@ impl ChunkApplyKernel for Utf8Chunked { } } +impl ChunkApplyKernel for BinaryChunked { + fn apply_kernel(&self, f: &dyn Fn(&LargeBinaryArray) -> ArrayRef) -> Self { + self.apply_kernel_cast(&f) + } + + fn apply_kernel_cast(&self, f: &dyn Fn(&LargeBinaryArray) -> ArrayRef) -> ChunkedArray + where + S: PolarsDataType, + { + let chunks = self.downcast_iter().into_iter().map(f).collect(); + ChunkedArray::from_chunks(self.name(), chunks) + } +} + impl<'a> ChunkApply<'a, Series, Series> for ListChunked { fn apply_cast_numeric(&self, f: F) -> ChunkedArray where diff --git a/polars/polars-core/src/chunked_array/ops/compare_inner.rs b/polars/polars-core/src/chunked_array/ops/compare_inner.rs index 62edadeafaaf..06ff863c1daf 100644 --- a/polars/polars-core/src/chunked_array/ops/compare_inner.rs +++ b/polars/polars-core/src/chunked_array/ops/compare_inner.rs @@ -5,8 +5,9 @@ use std::cmp::{Ordering, PartialEq}; use crate::chunked_array::ops::take::take_random::{ - BoolTakeRandom, BoolTakeRandomSingleChunk, NumTakeRandomChunked, NumTakeRandomCont, - NumTakeRandomSingleChunk, Utf8TakeRandom, Utf8TakeRandomSingleChunk, + BinaryTakeRandom, BinaryTakeRandomSingleChunk, BoolTakeRandom, BoolTakeRandomSingleChunk, + NumTakeRandomChunked, NumTakeRandomCont, NumTakeRandomSingleChunk, Utf8TakeRandom, + Utf8TakeRandomSingleChunk, }; #[cfg(feature = "object")] use crate::chunked_array::ops::take::take_random::{ObjectTakeRandom, ObjectTakeRandomSingleChunk}; @@ -69,6 +70,8 @@ macro_rules! impl_traits { impl_traits!(Utf8TakeRandom<'_>); impl_traits!(Utf8TakeRandomSingleChunk<'_>); +impl_traits!(BinaryTakeRandom<'_>); +impl_traits!(BinaryTakeRandomSingleChunk<'_>); impl_traits!(BoolTakeRandom<'_>); impl_traits!(BoolTakeRandomSingleChunk<'_>); impl_traits!(NumTakeRandomSingleChunk<'_, T>, T); @@ -140,6 +143,26 @@ impl<'a> IntoPartialEqInner<'a> for &'a Utf8Chunked { } } +impl<'a> IntoPartialEqInner<'a> for &'a BinaryChunked { + fn into_partial_eq_inner(self) -> Box { + match self.chunks.len() { + 1 => { + let arr = self.downcast_iter().next().unwrap(); + let t = BinaryTakeRandomSingleChunk { arr }; + Box::new(t) + } + _ => { + let chunks = self.downcast_chunks(); + let t = BinaryTakeRandom { + chunks, + chunk_lens: self.chunks.iter().map(|a| a.len() as IdxSize).collect(), + }; + Box::new(t) + } + } + } +} + impl<'a> IntoPartialEqInner<'a> for &'a BooleanChunked { fn into_partial_eq_inner(self) -> Box { match self.chunks.len() { @@ -240,6 +263,26 @@ impl<'a> IntoPartialOrdInner<'a> for &'a Utf8Chunked { } } +impl<'a> IntoPartialOrdInner<'a> for &'a BinaryChunked { + fn into_partial_ord_inner(self) -> Box { + match self.chunks.len() { + 1 => { + let arr = self.downcast_iter().next().unwrap(); + let t = BinaryTakeRandomSingleChunk { arr }; + Box::new(t) + } + _ => { + let chunks = self.downcast_chunks(); + let t = BinaryTakeRandom { + chunks, + chunk_lens: self.chunks.iter().map(|a| a.len() as IdxSize).collect(), + }; + Box::new(t) + } + } + } +} + impl<'a> IntoPartialOrdInner<'a> for &'a BooleanChunked { fn into_partial_ord_inner(self) -> Box { match self.chunks.len() { diff --git a/polars/polars-core/src/chunked_array/ops/explode.rs b/polars/polars-core/src/chunked_array/ops/explode.rs index e28bc381a6a6..4c3fe81feaf5 100644 --- a/polars/polars-core/src/chunked_array/ops/explode.rs +++ b/polars/polars-core/src/chunked_array/ops/explode.rs @@ -280,6 +280,47 @@ impl ExplodeByOffsets for Utf8Chunked { } } +impl ExplodeByOffsets for BinaryChunked { + fn explode_by_offsets(&self, offsets: &[i64]) -> Series { + debug_assert_eq!(self.chunks.len(), 1); + let arr = self.downcast_iter().next().unwrap(); + + let cap = ((arr.len() as f32) * 1.5) as usize; + let bytes_size = self.get_values_size(); + let mut builder = BinaryChunkedBuilder::new(self.name(), cap, bytes_size); + + let mut start = offsets[0] as usize; + let mut last = start; + for &o in &offsets[1..] { + let o = o as usize; + if o == last { + if start != last { + let vals = arr.slice(start, last - start); + if vals.null_count() == 0 { + builder + .builder + .extend_trusted_len_values(vals.values_iter()) + } else { + builder.builder.extend_trusted_len(vals.into_iter()); + } + } + builder.append_null(); + start = o; + } + last = o; + } + let vals = arr.slice(start, last - start); + if vals.null_count() == 0 { + builder + .builder + .extend_trusted_len_values(vals.values_iter()) + } else { + builder.builder.extend_trusted_len(vals.into_iter()); + } + builder.finish().into() + } +} + /// Convert Arrow array offsets to indexes of the original list pub(crate) fn offsets_to_indexes(offsets: &[i64], capacity: usize) -> Vec { if offsets.is_empty() { diff --git a/polars/polars-core/src/chunked_array/ops/extend.rs b/polars/polars-core/src/chunked_array/ops/extend.rs index 9ca952949f08..65ac7e20b910 100644 --- a/polars/polars-core/src/chunked_array/ops/extend.rs +++ b/polars/polars-core/src/chunked_array/ops/extend.rs @@ -117,6 +117,43 @@ impl Utf8Chunked { } } +#[doc(hidden)] +impl BinaryChunked { + pub fn extend(&mut self, other: &Self) { + if self.chunks.len() > 1 { + self.append(other); + *self = self.rechunk(); + return; + } + let arr = self.downcast_iter().next().unwrap(); + + // increments 1 + let arr = arr.clone(); + + // now we drop our owned ArrayRefs so that + // decrements 1 + { + self.chunks.clear(); + } + + use Either::*; + + match arr.into_mut() { + Left(immutable) => { + extend_immutable(&immutable, &mut self.chunks, &other.chunks); + } + Right(mut mutable) => { + for arr in other.downcast_iter() { + mutable.extend_trusted_len(arr.into_iter()) + } + let arr: BinaryArray = mutable.into(); + self.chunks.push(Box::new(arr) as ArrayRef) + } + } + self.compute_len(); + } +} + #[doc(hidden)] impl BooleanChunked { pub fn extend(&mut self, other: &Self) { diff --git a/polars/polars-core/src/chunked_array/ops/fill_null.rs b/polars/polars-core/src/chunked_array/ops/fill_null.rs index a6bd4ca95105..18bbb144b616 100644 --- a/polars/polars-core/src/chunked_array/ops/fill_null.rs +++ b/polars/polars-core/src/chunked_array/ops/fill_null.rs @@ -107,6 +107,31 @@ fn fill_backward_limit_utf8(ca: &Utf8Chunked, limit: IdxSize) -> Utf8Chunked { out.into_iter().rev().collect_trusted() } +fn fill_backward_limit_binary(ca: &BinaryChunked, limit: IdxSize) -> BinaryChunked { + let mut cnt = 0; + let mut previous = None; + let out: BinaryChunked = ca + .into_iter() + .rev() + .map(|opt_v| match opt_v { + Some(v) => { + cnt = 0; + previous = Some(v); + Some(v) + } + None => { + if cnt < limit { + cnt += 1; + previous + } else { + None + } + } + }) + .collect_trusted(); + out.into_iter().rev().collect_trusted() +} + fn fill_forward(ca: &ChunkedArray) -> ChunkedArray where T: PolarsNumericType, @@ -345,6 +370,42 @@ impl ChunkFillNullValue<&str> for Utf8Chunked { } } +impl ChunkFillNull for BinaryChunked { + fn fill_null(&self, strategy: FillNullStrategy) -> PolarsResult { + // nothing to fill + if !self.has_validity() { + return Ok(self.clone()); + } + match strategy { + FillNullStrategy::Forward(limit) => { + let mut out: Self = match limit { + Some(limit) => impl_fill_forward_limit!(self, limit), + None => impl_fill_forward!(self), + }; + out.rename(self.name()); + Ok(out) + } + FillNullStrategy::Backward(limit) => { + let mut out = match limit { + None => impl_fill_backward!(self, BinaryChunked), + Some(limit) => fill_backward_limit_binary(self, limit), + }; + out.rename(self.name()); + Ok(out) + } + strat => Err(PolarsError::InvalidOperation( + format!("Strategy {:?} not supported", strat).into(), + )), + } + } +} + +impl ChunkFillNullValue<&[u8]> for BinaryChunked { + fn fill_null_with_values(&self, value: &[u8]) -> PolarsResult { + self.set(&self.is_null(), Some(value)) + } +} + impl ChunkFillNull for ListChunked { fn fill_null(&self, _strategy: FillNullStrategy) -> PolarsResult { Err(PolarsError::InvalidOperation( diff --git a/polars/polars-core/src/chunked_array/ops/filter.rs b/polars/polars-core/src/chunked_array/ops/filter.rs index b729513ddffa..d80038c76257 100644 --- a/polars/polars-core/src/chunked_array/ops/filter.rs +++ b/polars/polars-core/src/chunked_array/ops/filter.rs @@ -93,6 +93,28 @@ impl ChunkFilter for Utf8Chunked { } } +impl ChunkFilter for BinaryChunked { + fn filter(&self, filter: &BooleanChunked) -> PolarsResult> { + // broadcast + if filter.len() == 1 { + return match filter.get(0) { + Some(true) => Ok(self.clone()), + _ => Ok(BinaryChunked::full_null(self.name(), 0)), + }; + } + check_filter_len!(self, filter); + let (left, filter) = align_chunks_binary(self, filter); + + let chunks = left + .downcast_iter() + .zip(filter.downcast_iter()) + .map(|(left, mask)| filter_fn(left, mask).unwrap()) + .collect::>(); + + Ok(self.copy_with_chunks(chunks, true)) + } +} + impl ChunkFilter for ListChunked { fn filter(&self, filter: &BooleanChunked) -> PolarsResult { // broadcast diff --git a/polars/polars-core/src/chunked_array/ops/full.rs b/polars/polars-core/src/chunked_array/ops/full.rs index cb2840488f05..594e50c86b91 100644 --- a/polars/polars-core/src/chunked_array/ops/full.rs +++ b/polars/polars-core/src/chunked_array/ops/full.rs @@ -64,6 +64,26 @@ impl ChunkFullNull for Utf8Chunked { } } +impl<'a> ChunkFull<&'a [u8]> for BinaryChunked { + fn full(name: &str, value: &'a [u8], length: usize) -> Self { + let mut builder = BinaryChunkedBuilder::new(name, length, length * value.len()); + + for _ in 0..length { + builder.append_value(value); + } + let mut out = builder.finish(); + out.set_sorted2(IsSorted::Ascending); + out + } +} + +impl ChunkFullNull for BinaryChunked { + fn full_null(name: &str, length: usize) -> Self { + let arr = new_null_array(DataType::Binary.to_arrow(), length); + BinaryChunked::from_chunks(name, vec![arr]) + } +} + impl ChunkFull<&Series> for ListChunked { fn full(name: &str, value: &Series, length: usize) -> ListChunked { let mut builder = diff --git a/polars/polars-core/src/chunked_array/ops/is_in.rs b/polars/polars-core/src/chunked_array/ops/is_in.rs index c08bea016894..fa38a4e199b4 100644 --- a/polars/polars-core/src/chunked_array/ops/is_in.rs +++ b/polars/polars-core/src/chunked_array/ops/is_in.rs @@ -230,6 +230,69 @@ impl IsIn for Utf8Chunked { } } +impl IsIn for BinaryChunked { + fn is_in(&self, other: &Series) -> PolarsResult { + match other.dtype() { + DataType::List(dt) if DataType::Binary == **dt => { + let mut ca: BooleanChunked = if self.len() == 1 && other.len() != 1 { + let value = self.get(0); + other + .list()? + .amortized_iter() + .map(|opt_b| { + opt_b.map(|s| { + let ca = s.as_ref().unpack::().unwrap(); + ca.into_iter().any(|a| a == value) + }) == Some(true) + }) + .collect_trusted() + } else { + self.into_iter() + .zip(other.list()?.amortized_iter()) + .map(|(value, series)| match (value, series) { + (val, Some(series)) => { + let ca = series.as_ref().unpack::().unwrap(); + ca.into_iter().any(|a| a == val) + } + _ => false, + }) + .collect_trusted() + }; + ca.rename(self.name()); + Ok(ca) + } + DataType::Binary => { + let mut set = HashSet::with_capacity(other.len()); + + let other = other.binary()?; + other.downcast_iter().for_each(|iter| { + iter.into_iter().for_each(|opt_val| { + set.insert(opt_val); + }) + }); + let mut ca: BooleanChunked = self + .into_iter() + .map(|opt_val| set.contains(&opt_val)) + .collect_trusted(); + ca.rename(self.name()); + Ok(ca) + } + _ => Err(PolarsError::SchemaMisMatch( + format!( + "cannot do is_in operation with left a dtype: {:?} and right a dtype {:?}", + self.dtype(), + other.dtype() + ) + .into(), + )), + } + .map(|mut ca| { + ca.rename(self.name()); + ca + }) + } +} + impl IsIn for BooleanChunked { fn is_in(&self, other: &Series) -> PolarsResult { match other.dtype() { diff --git a/polars/polars-core/src/chunked_array/ops/mod.rs b/polars/polars-core/src/chunked_array/ops/mod.rs index 507f8987b51c..43c54c3b548a 100644 --- a/polars/polars-core/src/chunked_array/ops/mod.rs +++ b/polars/polars-core/src/chunked_array/ops/mod.rs @@ -639,6 +639,12 @@ impl ChunkExpandAtIndex for Utf8Chunked { } } +impl ChunkExpandAtIndex for BinaryChunked { + fn expand_at_index(&self, index: usize, length: usize) -> BinaryChunked { + impl_chunk_expand!(self, length, index) + } +} + impl ChunkExpandAtIndex for ListChunked { fn expand_at_index(&self, index: usize, length: usize) -> ListChunked { let opt_val = self.get(index); diff --git a/polars/polars-core/src/chunked_array/ops/repeat_by.rs b/polars/polars-core/src/chunked_array/ops/repeat_by.rs index 6e220a5dfdba..41100d68caa6 100644 --- a/polars/polars-core/src/chunked_array/ops/repeat_by.rs +++ b/polars/polars-core/src/chunked_array/ops/repeat_by.rs @@ -63,3 +63,20 @@ impl RepeatBy for Utf8Chunked { ) } } +impl RepeatBy for BinaryChunked { + fn repeat_by(&self, by: &IdxCa) -> ListChunked { + let iter = self + .into_iter() + .zip(by.into_iter()) + .map(|(opt_v, opt_by)| opt_by.map(|by| std::iter::repeat(opt_v).take(by as usize))); + + // Safety: + // Length of iter is trusted + ListChunked::from_chunks( + self.name(), + vec![Box::new(unsafe { + LargeListArray::from_iter_binary_trusted_len(iter, self.len()) + })], + ) + } +} diff --git a/polars/polars-core/src/chunked_array/ops/reverse.rs b/polars/polars-core/src/chunked_array/ops/reverse.rs index e435f71589c7..85a80758f06e 100644 --- a/polars/polars-core/src/chunked_array/ops/reverse.rs +++ b/polars/polars-core/src/chunked_array/ops/reverse.rs @@ -39,6 +39,7 @@ macro_rules! impl_reverse { impl_reverse!(BooleanType, BooleanChunked); impl_reverse!(Utf8Type, Utf8Chunked); +impl_reverse!(BinaryType, BinaryChunked); impl_reverse!(ListType, ListChunked); #[cfg(feature = "object")] diff --git a/polars/polars-core/src/chunked_array/ops/set.rs b/polars/polars-core/src/chunked_array/ops/set.rs index d4f7c831930e..d4e9b405b7aa 100644 --- a/polars/polars-core/src/chunked_array/ops/set.rs +++ b/polars/polars-core/src/chunked_array/ops/set.rs @@ -273,6 +273,80 @@ impl<'a> ChunkSet<'a, &'a str, String> for Utf8Chunked { } } +impl<'a> ChunkSet<'a, &'a [u8], Vec> for BinaryChunked { + fn set_at_idx>( + &'a self, + idx: I, + opt_value: Option<&'a [u8]>, + ) -> PolarsResult + where + Self: Sized, + { + let idx_iter = idx.into_iter(); + let mut ca_iter = self.into_iter().enumerate(); + let mut builder = + BinaryChunkedBuilder::new(self.name(), self.len(), self.get_values_size()); + + for current_idx in idx_iter { + if current_idx as usize > self.len() { + return Err(PolarsError::ComputeError( + format!( + "index: {} outside of ChunkedArray with length: {}", + current_idx, + self.len() + ) + .into(), + )); + } + for (cnt_idx, opt_val_self) in &mut ca_iter { + if cnt_idx == current_idx as usize { + builder.append_option(opt_value); + break; + } else { + builder.append_option(opt_val_self); + } + } + } + // the last idx is probably not the last value so we finish the iterator + for (_, opt_val_self) in ca_iter { + builder.append_option(opt_val_self); + } + + let ca = builder.finish(); + Ok(ca) + } + + fn set_at_idx_with, F>( + &'a self, + idx: I, + f: F, + ) -> PolarsResult + where + Self: Sized, + F: Fn(Option<&'a [u8]>) -> Option>, + { + let mut builder = + BinaryChunkedBuilder::new(self.name(), self.len(), self.get_values_size()); + impl_set_at_idx_with!(self, builder, idx, f) + } + + fn set(&'a self, mask: &BooleanChunked, value: Option<&'a [u8]>) -> PolarsResult + where + Self: Sized, + { + check_bounds!(self, mask); + let ca = mask + .into_iter() + .zip(self.into_iter()) + .map(|(mask_val, opt_val)| match mask_val { + Some(true) => value, + _ => opt_val, + }) + .collect_trusted(); + Ok(ca) + } +} + #[cfg(test)] mod test { use crate::prelude::*; diff --git a/polars/polars-core/src/chunked_array/ops/shift.rs b/polars/polars-core/src/chunked_array/ops/shift.rs index 1a2d1428c18a..a2a55cfa9b4d 100644 --- a/polars/polars-core/src/chunked_array/ops/shift.rs +++ b/polars/polars-core/src/chunked_array/ops/shift.rs @@ -60,12 +60,24 @@ impl ChunkShiftFill> for Utf8Chunked { } } +impl ChunkShiftFill> for BinaryChunked { + fn shift_and_fill(&self, periods: i64, fill_value: Option<&[u8]>) -> BinaryChunked { + impl_shift_fill!(self, periods, fill_value) + } +} + impl ChunkShift for Utf8Chunked { fn shift(&self, periods: i64) -> Self { self.shift_and_fill(periods, None) } } +impl ChunkShift for BinaryChunked { + fn shift(&self, periods: i64) -> Self { + self.shift_and_fill(periods, None) + } +} + impl ChunkShiftFill> for ListChunked { fn shift_and_fill(&self, periods: i64, fill_value: Option<&Series>) -> ListChunked { // This has its own implementation because a ListChunked cannot have a full-null without diff --git a/polars/polars-core/src/chunked_array/ops/sort/mod.rs b/polars/polars-core/src/chunked_array/ops/sort/mod.rs index 9563d1115eee..6a08a61ecfe3 100644 --- a/polars/polars-core/src/chunked_array/ops/sort/mod.rs +++ b/polars/polars-core/src/chunked_array/ops/sort/mod.rs @@ -508,6 +508,136 @@ impl ChunkSort for Utf8Chunked { } } +impl ChunkSort for BinaryChunked { + fn sort_with(&self, options: SortOptions) -> ChunkedArray { + sort_with_fast_path!(self, options); + let mut v: Vec<&[u8]> = if self.null_count() > 0 { + Vec::from_iter(self.into_iter().flatten()) + } else { + Vec::from_iter(self.into_no_null_iter()) + }; + + sort_branch( + v.as_mut_slice(), + options.descending, + order_default, + order_reverse, + ); + + let mut values = Vec::::with_capacity(self.get_values_size()); + let mut offsets = Vec::::with_capacity(self.len() + 1); + let mut length_so_far = 0i64; + offsets.push(length_so_far); + + let len = self.len(); + let null_count = self.null_count(); + let mut ca: Self = match (null_count, options.nulls_last) { + (0, _) => { + for val in v { + values.extend_from_slice(val); + length_so_far = values.len() as i64; + offsets.push(length_so_far); + } + // Safety: + // we pass valid utf8 + let ar = unsafe { + BinaryArray::from_data_unchecked_default(offsets.into(), values.into(), None) + }; + (self.name(), ar).into() + } + (_, true) => { + for val in v { + values.extend_from_slice(val); + length_so_far = values.len() as i64; + offsets.push(length_so_far); + } + let mut validity = MutableBitmap::with_capacity(len); + validity.extend_constant(len - null_count, true); + validity.extend_constant(null_count, false); + offsets.extend(std::iter::repeat(length_so_far).take(null_count)); + + // Safety: + // we pass valid utf8 + let ar = unsafe { + BinaryArray::from_data_unchecked_default( + offsets.into(), + values.into(), + Some(validity.into()), + ) + }; + (self.name(), ar).into() + } + (_, false) => { + let mut validity = MutableBitmap::with_capacity(len); + validity.extend_constant(null_count, false); + validity.extend_constant(len - null_count, true); + offsets.extend(std::iter::repeat(length_so_far).take(null_count)); + + for val in v { + values.extend_from_slice(val); + length_so_far = values.len() as i64; + offsets.push(length_so_far); + } + + // Safety: + // we pass valid utf8 + let ar = unsafe { + BinaryArray::from_data_unchecked_default( + offsets.into(), + values.into(), + Some(validity.into()), + ) + }; + (self.name(), ar).into() + } + }; + + ca.set_sorted(options.descending); + ca + } + + fn sort(&self, reverse: bool) -> BinaryChunked { + self.sort_with(SortOptions { + descending: reverse, + nulls_last: false, + }) + } + + fn argsort(&self, options: SortOptions) -> IdxCa { + argsort::argsort( + self.name(), + self.downcast_iter().map(|arr| arr.iter()), + options, + self.null_count(), + self.len(), + ) + } + + #[cfg(feature = "sort_multiple")] + /// # Panics + /// + /// This function is very opinionated. On the implementation of `ChunkedArray` for numeric types, + /// we assume that all numeric `Series` are of the same type. + /// + /// In this case we assume that all numeric `Series` are `f64` types. The caller needs to + /// uphold this contract. If not, it will panic. + /// + fn argsort_multiple(&self, other: &[Series], reverse: &[bool]) -> PolarsResult { + args_validate(self, other, reverse)?; + + let mut count: IdxSize = 0; + let vals: Vec<_> = self + .into_iter() + .map(|v| { + let i = count; + count += 1; + (i, v) + }) + .collect_trusted(); + argsort_multiple_impl(vals, other, reverse) + } +} + impl ChunkSort for BooleanChunked { fn sort_with(&self, options: SortOptions) -> ChunkedArray { sort_with_fast_path!(self, options); diff --git a/polars/polars-core/src/chunked_array/ops/take/mod.rs b/polars/polars-core/src/chunked_array/ops/take/mod.rs index 2edf6f96e410..94095a1e2d1c 100644 --- a/polars/polars-core/src/chunked_array/ops/take/mod.rs +++ b/polars/polars-core/src/chunked_array/ops/take/mod.rs @@ -323,6 +323,87 @@ impl ChunkTake for Utf8Chunked { } } +impl ChunkTake for BinaryChunked { + unsafe fn take_unchecked(&self, indices: TakeIdx) -> Self + where + Self: std::marker::Sized, + I: TakeIterator, + INulls: TakeIteratorNulls, + { + let mut chunks = self.downcast_iter(); + match indices { + TakeIdx::Array(array) => { + if array.null_count() == array.len() { + return Self::full_null(self.name(), array.len()); + } + let array = match self.chunks.len() { + 1 => take_binary_unchecked(chunks.next().unwrap(), array) as ArrayRef, + _ => { + return if !array.has_validity() { + let iter = array.values().iter().map(|i| *i as usize); + let mut ca: BinaryChunked = take_iter_n_chunks_unchecked!(self, iter); + ca.rename(self.name()); + ca + } else { + let iter = array + .into_iter() + .map(|opt_idx| opt_idx.map(|idx| *idx as usize)); + let mut ca: BinaryChunked = + take_opt_iter_n_chunks_unchecked!(self, iter); + ca.rename(self.name()); + ca + } + } + }; + self.copy_with_chunks(vec![array], false) + } + TakeIdx::Iter(iter) => { + let array = match (self.has_validity(), self.chunks.len()) { + (false, 1) => { + take_no_null_binary_iter_unchecked(chunks.next().unwrap(), iter) as ArrayRef + } + (_, 1) => take_binary_iter_unchecked(chunks.next().unwrap(), iter) as ArrayRef, + _ => { + let mut ca: BinaryChunked = take_iter_n_chunks_unchecked!(self, iter); + ca.rename(self.name()); + return ca; + } + }; + self.copy_with_chunks(vec![array], false) + } + TakeIdx::IterNulls(iter) => { + let array = match (self.has_validity(), self.chunks.len()) { + (false, 1) => { + take_no_null_binary_opt_iter_unchecked(chunks.next().unwrap(), iter) + as ArrayRef + } + (_, 1) => { + take_binary_opt_iter_unchecked(chunks.next().unwrap(), iter) as ArrayRef + } + _ => { + let mut ca: BinaryChunked = take_opt_iter_n_chunks_unchecked!(self, iter); + ca.rename(self.name()); + return ca; + } + }; + self.copy_with_chunks(vec![array], false) + } + } + } + + fn take(&self, indices: TakeIdx) -> PolarsResult + where + Self: std::marker::Sized, + I: TakeIterator, + INulls: TakeIteratorNulls, + { + indices.check_bounds(self.len())?; + // Safety: + // just checked bounds + Ok(unsafe { self.take_unchecked(indices) }) + } +} + impl ChunkTake for ListChunked { unsafe fn take_unchecked(&self, indices: TakeIdx) -> Self where diff --git a/polars/polars-core/src/chunked_array/ops/take/take_chunked.rs b/polars/polars-core/src/chunked_array/ops/take/take_chunked.rs index c079974a591f..cb8b6e3206b0 100644 --- a/polars/polars-core/src/chunked_array/ops/take/take_chunked.rs +++ b/polars/polars-core/src/chunked_array/ops/take/take_chunked.rs @@ -90,6 +90,38 @@ impl TakeChunked for Utf8Chunked { } } +impl TakeChunked for BinaryChunked { + unsafe fn take_chunked_unchecked(&self, by: &[ChunkId], sorted: IsSorted) -> Self { + let arrs = self.downcast_iter().collect::>(); + let mut ca: Self = by + .iter() + .map(|[chunk_idx, array_idx]| { + let arr = arrs.get_unchecked(*chunk_idx as usize); + arr.get_unchecked(*array_idx as usize) + }) + .collect_trusted(); + ca.rename(self.name()); + ca.set_sorted2(sorted); + ca + } + + unsafe fn take_opt_chunked_unchecked(&self, by: &[Option]) -> Self { + let arrs = self.downcast_iter().collect::>(); + let mut ca: Self = by + .iter() + .map(|opt_idx| { + opt_idx.and_then(|[chunk_idx, array_idx]| { + let arr = arrs.get_unchecked(chunk_idx as usize); + arr.get_unchecked(array_idx as usize) + }) + }) + .collect_trusted(); + + ca.rename(self.name()); + ca + } +} + impl TakeChunked for BooleanChunked { unsafe fn take_chunked_unchecked(&self, by: &[ChunkId], sorted: IsSorted) -> Self { let arrs = self.downcast_iter().collect::>(); diff --git a/polars/polars-core/src/chunked_array/ops/take/take_every.rs b/polars/polars-core/src/chunked_array/ops/take/take_every.rs index 0b092da0b190..09da5861de57 100644 --- a/polars/polars-core/src/chunked_array/ops/take/take_every.rs +++ b/polars/polars-core/src/chunked_array/ops/take/take_every.rs @@ -41,6 +41,18 @@ impl ChunkTakeEvery for Utf8Chunked { } } +impl ChunkTakeEvery for BinaryChunked { + fn take_every(&self, n: usize) -> BinaryChunked { + let mut ca: Self = if !self.has_validity() { + self.into_no_null_iter().step_by(n).collect() + } else { + self.into_iter().step_by(n).collect() + }; + ca.rename(self.name()); + ca + } +} + impl ChunkTakeEvery for ListChunked { fn take_every(&self, n: usize) -> ListChunked { let mut ca: Self = if !self.has_validity() { diff --git a/polars/polars-core/src/chunked_array/ops/take/take_random.rs b/polars/polars-core/src/chunked_array/ops/take/take_random.rs index 0b0c51f84de2..d284b9949c32 100644 --- a/polars/polars-core/src/chunked_array/ops/take/take_random.rs +++ b/polars/polars-core/src/chunked_array/ops/take/take_random.rs @@ -232,6 +232,70 @@ impl<'a> IntoTakeRandom<'a> for &'a Utf8Chunked { } } +pub struct BinaryTakeRandom<'a> { + pub(crate) chunks: Chunks<'a, BinaryArray>, + pub(crate) chunk_lens: Vec, +} + +impl<'a> TakeRandom for BinaryTakeRandom<'a> { + type Item = &'a [u8]; + + #[inline] + fn get(&self, index: usize) -> Option { + take_random_get!(self, index) + } + + #[inline] + unsafe fn get_unchecked(&self, index: usize) -> Option { + take_random_get_unchecked!(self, index) + } +} + +pub struct BinaryTakeRandomSingleChunk<'a> { + pub(crate) arr: &'a BinaryArray, +} + +impl<'a> TakeRandom for BinaryTakeRandomSingleChunk<'a> { + type Item = &'a [u8]; + + #[inline] + fn get(&self, index: usize) -> Option { + take_random_get_single!(self, index) + } + + #[inline] + unsafe fn get_unchecked(&self, index: usize) -> Option { + if self.arr.is_valid_unchecked(index) { + Some(self.arr.value_unchecked(index)) + } else { + None + } + } +} + +impl<'a> IntoTakeRandom<'a> for &'a BinaryChunked { + type Item = &'a [u8]; + type TakeRandom = TakeRandBranch2, BinaryTakeRandom<'a>>; + + fn take_rand(&self) -> Self::TakeRandom { + match self.chunks.len() { + 1 => { + let arr = self.downcast_iter().next().unwrap(); + let t = BinaryTakeRandomSingleChunk { arr }; + TakeRandBranch2::Single(t) + } + _ => { + let chunks = self.downcast_chunks(); + let t = BinaryTakeRandom { + chunks, + chunk_lens: self.chunks.iter().map(|a| a.len() as IdxSize).collect(), + }; + TakeRandBranch2::Multi(t) + } + } + } +} + impl<'a> IntoTakeRandom<'a> for &'a BooleanChunked { type Item = bool; type TakeRandom = TakeRandBranch2, BoolTakeRandom<'a>>; diff --git a/polars/polars-core/src/chunked_array/ops/take/take_single.rs b/polars/polars-core/src/chunked_array/ops/take/take_single.rs index b60be9e89360..9ee3cc25444d 100644 --- a/polars/polars-core/src/chunked_array/ops/take/take_single.rs +++ b/polars/polars-core/src/chunked_array/ops/take/take_single.rs @@ -113,6 +113,17 @@ impl<'a> TakeRandom for &'a Utf8Chunked { } } +impl<'a> TakeRandom for &'a BinaryChunked { + type Item = &'a [u8]; + + #[inline] + fn get(&self, index: usize) -> Option { + // Safety: + // Out of bounds is checked and downcast is of correct type + unsafe { impl_take_random_get!(self, index, LargeBinaryArray) } + } +} + // extra trait such that it also works without extra reference. // Autoref will insert the reference and impl<'a> TakeRandomUtf8 for &'a Utf8Chunked { diff --git a/polars/polars-core/src/chunked_array/ops/unique/mod.rs b/polars/polars-core/src/chunked_array/ops/unique/mod.rs index bc5f94871838..7663e4f57aa0 100644 --- a/polars/polars-core/src/chunked_array/ops/unique/mod.rs +++ b/polars/polars-core/src/chunked_array/ops/unique/mod.rs @@ -274,6 +274,59 @@ impl ChunkUnique for Utf8Chunked { } } +impl ChunkUnique for BinaryChunked { + fn unique(&self) -> PolarsResult { + match self.null_count() { + 0 => { + let mut set = + PlHashSet::with_capacity(std::cmp::min(HASHMAP_INIT_SIZE, self.len())); + for arr in self.downcast_iter() { + set.extend(arr.values_iter()) + } + Ok(BinaryChunked::from_iter_values( + self.name(), + set.iter().copied(), + )) + } + _ => { + let mut set = + PlHashSet::with_capacity(std::cmp::min(HASHMAP_INIT_SIZE, self.len())); + for arr in self.downcast_iter() { + set.extend(arr.iter()) + } + Ok(BinaryChunked::from_iter_options( + self.name(), + set.iter().copied(), + )) + } + } + } + + fn arg_unique(&self) -> PolarsResult { + Ok(IdxCa::from_vec(self.name(), arg_unique_ca!(self))) + } + + fn is_unique(&self) -> PolarsResult { + is_unique_duplicated!(self, false) + } + fn is_duplicated(&self) -> PolarsResult { + is_unique_duplicated!(self, true) + } + + fn n_unique(&self) -> PolarsResult { + if self.null_count() > 0 { + Ok(fill_set(self.into_iter().flatten()).len() + 1) + } else { + Ok(fill_set(self.into_no_null_iter()).len()) + } + } + + #[cfg(feature = "mode")] + fn mode(&self) -> PolarsResult { + Ok(mode(self)) + } +} + impl ChunkUnique for BooleanChunked { fn unique(&self) -> PolarsResult { // can be None, Some(true), Some(false) @@ -408,6 +461,24 @@ mod is_first { Ok(BooleanChunked::from_chunks(self.name(), chunks)) } } + + impl IsFirst for BinaryChunked { + fn is_first(&self) -> PolarsResult { + let mut unique = PlHashSet::new(); + let chunks = self + .downcast_iter() + .map(|arr| { + let mask: BooleanArray = arr + .into_iter() + .map(|opt_v| unique.insert(opt_v)) + .collect_trusted(); + Box::new(mask) as ArrayRef + }) + .collect(); + + Ok(BooleanChunked::from_chunks(self.name(), chunks)) + } + } } #[cfg(test)] diff --git a/polars/polars-core/src/chunked_array/ops/zip.rs b/polars/polars-core/src/chunked_array/ops/zip.rs index 59db6a73a5ac..ece5d4a74c49 100644 --- a/polars/polars-core/src/chunked_array/ops/zip.rs +++ b/polars/polars-core/src/chunked_array/ops/zip.rs @@ -142,6 +142,32 @@ impl ChunkZip for Utf8Chunked { } } } + +impl ChunkZip for BinaryChunked { + fn zip_with( + &self, + mask: &BooleanChunked, + other: &BinaryChunked, + ) -> PolarsResult { + if self.len() != mask.len() || other.len() != mask.len() { + impl_ternary_broadcast!(self, self.len(), other.len(), other, mask, BinaryType) + } else { + let (left, right, mask) = align_chunks_ternary(self, other, mask); + let chunks = left + .downcast_iter() + .zip(right.downcast_iter()) + .zip(mask.downcast_iter()) + .map(|((left_c, right_c), mask_c)| { + let mask_c = prepare_mask(mask_c); + let arr = if_then_else(&mask_c, left_c, right_c)?; + Ok(arr) + }) + .collect::>>()?; + Ok(ChunkedArray::from_chunks(self.name(), chunks)) + } + } +} + impl ChunkZip for ListChunked { fn zip_with( &self, diff --git a/polars/polars-core/src/chunked_array/trusted_len.rs b/polars/polars-core/src/chunked_array/trusted_len.rs index 1be2de8ff7c0..3b81a1a14fe5 100644 --- a/polars/polars-core/src/chunked_array/trusted_len.rs +++ b/polars/polars-core/src/chunked_array/trusted_len.rs @@ -203,6 +203,26 @@ where } } +impl FromTrustedLenIterator for BinaryChunked +where + Ptr: PolarsAsRef<[u8]>, +{ + fn from_iter_trusted_length>(iter: I) -> Self { + let iter = iter.into_iter(); + iter.collect() + } +} + +impl FromTrustedLenIterator> for BinaryChunked +where + Ptr: AsRef<[u8]>, +{ + fn from_iter_trusted_length>>(iter: I) -> Self { + let iter = iter.into_iter(); + iter.collect() + } +} + #[cfg(feature = "object")] impl FromTrustedLenIterator> for ObjectChunked { fn from_iter_trusted_length>>(iter: I) -> Self { diff --git a/polars/polars-core/src/chunked_array/upstream_traits.rs b/polars/polars-core/src/chunked_array/upstream_traits.rs index 679ba56a5f79..19cb0484dc30 100644 --- a/polars/polars-core/src/chunked_array/upstream_traits.rs +++ b/polars/polars-core/src/chunked_array/upstream_traits.rs @@ -133,6 +133,34 @@ where } } +// FromIterator for BinaryChunked variants. + +impl FromIterator> for BinaryChunked +where + Ptr: AsRef<[u8]>, +{ + fn from_iter>>(iter: I) -> Self { + let arr = BinaryArray::::from_iter(iter); + Self::from_chunks("", vec![Box::new(arr)]) + } +} + +impl PolarsAsRef<[u8]> for Vec {} +impl PolarsAsRef<[u8]> for &[u8] {} +// &["foo", "bar"] +impl PolarsAsRef<[u8]> for &&[u8] {} +impl<'a> PolarsAsRef<[u8]> for Cow<'a, [u8]> {} + +impl FromIterator for BinaryChunked +where + Ptr: PolarsAsRef<[u8]>, +{ + fn from_iter>(iter: I) -> Self { + let arr = BinaryArray::::from_iter_values(iter.into_iter()); + Self::from_chunks("", vec![Box::new(arr)]) + } +} + impl FromIterator for ListChunked where Ptr: Borrow, diff --git a/polars/polars-core/src/fmt.rs b/polars/polars-core/src/fmt.rs index 099e51fb508e..d59882e3ea5a 100644 --- a/polars/polars-core/src/fmt.rs +++ b/polars/polars-core/src/fmt.rs @@ -140,6 +140,12 @@ impl Debug for Utf8Chunked { } } +impl Debug for BinaryChunked { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + format_array!(f, self, "binary", self.name(), "ChunkedArray") + } +} + impl Debug for ListChunked { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { format_array!(f, self, "list", self.name(), "ChunkedArray") diff --git a/polars/polars-core/src/frame/groupby/aggregations/agg_list.rs b/polars/polars-core/src/frame/groupby/aggregations/agg_list.rs index dbb38010ecc8..696b974092a6 100644 --- a/polars/polars-core/src/frame/groupby/aggregations/agg_list.rs +++ b/polars/polars-core/src/frame/groupby/aggregations/agg_list.rs @@ -202,6 +202,31 @@ impl AggList for Utf8Chunked { } } +impl AggList for BinaryChunked { + unsafe fn agg_list(&self, groups: &GroupsProxy) -> Series { + match groups { + GroupsProxy::Idx(groups) => { + let mut builder = + ListBinaryChunkedBuilder::new(self.name(), groups.len(), self.len()); + for idx in groups.all().iter() { + let ca = { self.take_unchecked(idx.into()) }; + builder.append(&ca) + } + builder.finish().into_series() + } + GroupsProxy::Slice { groups, .. } => { + let mut builder = + ListBinaryChunkedBuilder::new(self.name(), groups.len(), self.len()); + for [first, len] in groups { + let ca = self.slice(*first as i64, *len as usize); + builder.append(&ca) + } + builder.finish().into_series() + } + } + } +} + fn agg_list_list, &mut i64, &mut Vec) -> bool>( ca: &ListChunked, groups_len: usize, diff --git a/polars/polars-core/src/frame/groupby/into_groups.rs b/polars/polars-core/src/frame/groupby/into_groups.rs index 1cf2a84425bf..9411e5b27ebf 100644 --- a/polars/polars-core/src/frame/groupby/into_groups.rs +++ b/polars/polars-core/src/frame/groupby/into_groups.rs @@ -285,6 +285,58 @@ impl IntoGroupsProxy for Utf8Chunked { } } +impl IntoGroupsProxy for BinaryChunked { + #[allow(clippy::needless_lifetimes)] + fn group_tuples<'a>(&'a self, multithreaded: bool, sorted: bool) -> PolarsResult { + let hb = RandomState::default(); + let null_h = get_null_hash_value(hb.clone()); + + let out = if multithreaded { + let n_partitions = set_partition_size(); + + let split = _split_offsets(self.len(), n_partitions); + + let byte_hashes = POOL.install(|| { + split + .into_par_iter() + .map(|(offset, len)| { + let ca = self.slice(offset as i64, len); + ca.into_iter() + .map(|opt_b| { + let hash = match opt_b { + Some(s) => <[u8]>::get_hash(s, &hb), + None => null_h, + }; + // Safety: + // the underlying data is tied to self + unsafe { + std::mem::transmute::, BytesHash<'a>>( + BytesHash::new(opt_b, hash), + ) + } + }) + .collect_trusted::>() + }) + .collect::>() + }); + groupby_threaded_num(byte_hashes, 0, n_partitions as u64, sorted) + } else { + let byte_hashes = self + .into_iter() + .map(|opt_b| { + let hash = match opt_b { + Some(s) => <[u8]>::get_hash(s, &hb), + None => null_h, + }; + BytesHash::new(opt_b, hash) + }) + .collect_trusted::>(); + groupby(byte_hashes.iter(), sorted) + }; + Ok(out) + } +} + impl IntoGroupsProxy for ListChunked { #[allow(clippy::needless_lifetimes)] #[allow(unused_variables)] diff --git a/polars/polars-core/src/frame/hash_join/mod.rs b/polars/polars-core/src/frame/hash_join/mod.rs index eacc89c69f15..2b5adcd1b1e1 100644 --- a/polars/polars-core/src/frame/hash_join/mod.rs +++ b/polars/polars-core/src/frame/hash_join/mod.rs @@ -233,6 +233,7 @@ macro_rules! impl_zip_outer_join { } impl_zip_outer_join!(BooleanChunked); impl_zip_outer_join!(Utf8Chunked); +impl_zip_outer_join!(BinaryChunked); impl ZipOuterJoinColumn for Float32Chunked { fn zip_outer_join_column( diff --git a/polars/polars-core/src/named_from.rs b/polars/polars-core/src/named_from.rs index 8f12bce54253..57ae747fb972 100644 --- a/polars/polars-core/src/named_from.rs +++ b/polars/polars-core/src/named_from.rs @@ -225,6 +225,70 @@ impl<'a, T: AsRef<[Option>]>> NamedFrom>]> } } +impl<'a, T: AsRef<[&'a [u8]]>> NamedFrom for Series { + fn new(name: &str, v: T) -> Self { + BinaryChunked::from_slice(name, v.as_ref()).into_series() + } +} + +impl NamedFrom<&Series, [u8]> for Series { + fn new(name: &str, s: &Series) -> Self { + let mut s = s.clone(); + s.rename(name); + s + } +} + +impl<'a, T: AsRef<[&'a [u8]]>> NamedFrom for BinaryChunked { + fn new(name: &str, v: T) -> Self { + BinaryChunked::from_slice(name, v.as_ref()) + } +} + +impl<'a, T: AsRef<[Option<&'a [u8]>]>> NamedFrom]> for Series { + fn new(name: &str, v: T) -> Self { + BinaryChunked::from_slice_options(name, v.as_ref()).into_series() + } +} + +impl<'a, T: AsRef<[Option<&'a [u8]>]>> NamedFrom]> for BinaryChunked { + fn new(name: &str, v: T) -> Self { + BinaryChunked::from_slice_options(name, v.as_ref()) + } +} + +impl<'a, T: AsRef<[Cow<'a, [u8]>]>> NamedFrom]> for Series { + fn new(name: &str, v: T) -> Self { + BinaryChunked::from_iter_values(name, v.as_ref().iter().map(|value| value.as_ref())) + .into_series() + } +} + +impl<'a, T: AsRef<[Cow<'a, [u8]>]>> NamedFrom]> for BinaryChunked { + fn new(name: &str, v: T) -> Self { + BinaryChunked::from_iter_values(name, v.as_ref().iter().map(|value| value.as_ref())) + } +} + +impl<'a, T: AsRef<[Option>]>> NamedFrom>]> for Series { + fn new(name: &str, v: T) -> Self { + BinaryChunked::new(name, v).into_series() + } +} + +impl<'a, T: AsRef<[Option>]>> NamedFrom>]> + for BinaryChunked +{ + fn new(name: &str, v: T) -> Self { + BinaryChunked::from_iter_options( + name, + v.as_ref() + .iter() + .map(|opt| opt.as_ref().map(|value| value.as_ref())), + ) + } +} + #[cfg(feature = "dtype-date")] impl> NamedFrom for DateChunked { fn new(name: &str, v: T) -> Self { diff --git a/polars/polars-core/src/prelude.rs b/polars/polars-core/src/prelude.rs index 43a54094fa45..2b787efe6cb6 100644 --- a/polars/polars-core/src/prelude.rs +++ b/polars/polars-core/src/prelude.rs @@ -10,9 +10,9 @@ pub use polars_arrow::prelude::*; pub(crate) use polars_arrow::trusted_len::TrustedLen; pub use crate::chunked_array::builder::{ - BooleanChunkedBuilder, ChunkedBuilder, ListBooleanChunkedBuilder, ListBuilderTrait, - ListPrimitiveChunkedBuilder, ListUtf8ChunkedBuilder, NewChunkedArray, PrimitiveChunkedBuilder, - Utf8ChunkedBuilder, + BinaryChunkedBuilder, BooleanChunkedBuilder, ChunkedBuilder, ListBinaryChunkedBuilder, + ListBooleanChunkedBuilder, ListBuilderTrait, ListPrimitiveChunkedBuilder, + ListUtf8ChunkedBuilder, NewChunkedArray, PrimitiveChunkedBuilder, Utf8ChunkedBuilder, }; pub use crate::chunked_array::iterator::PolarsIterator; #[cfg(feature = "dtype-categorical")] diff --git a/polars/polars-core/src/series/any_value.rs b/polars/polars-core/src/series/any_value.rs index 17e994da13af..1e26e0da698e 100644 --- a/polars/polars-core/src/series/any_value.rs +++ b/polars/polars-core/src/series/any_value.rs @@ -16,6 +16,16 @@ fn any_values_to_utf8(avs: &[AnyValue]) -> Utf8Chunked { .collect_trusted() } +fn any_values_to_binary(avs: &[AnyValue]) -> BinaryChunked { + avs.iter() + .map(|av| match av { + AnyValue::Binary(s) => Some(*s), + AnyValue::BinaryOwned(s) => Some(&**s), + _ => None, + }) + .collect_trusted() +} + fn any_values_to_bool(avs: &[AnyValue]) -> BooleanChunked { avs.iter() .map(|av| match av { @@ -81,6 +91,7 @@ impl Series { DataType::Float32 => any_values_to_primitive::(av).into_series(), DataType::Float64 => any_values_to_primitive::(av).into_series(), DataType::Utf8 => any_values_to_utf8(av).into_series(), + DataType::Binary => any_values_to_binary(av).into_series(), DataType::Boolean => any_values_to_bool(av).into_series(), #[cfg(feature = "dtype-date")] DataType::Date => any_values_to_primitive::(av) diff --git a/polars/polars-core/src/series/arithmetic/borrowed.rs b/polars/polars-core/src/series/arithmetic/borrowed.rs index 3a10622dd017..6df1227248ff 100644 --- a/polars/polars-core/src/series/arithmetic/borrowed.rs +++ b/polars/polars-core/src/series/arithmetic/borrowed.rs @@ -101,6 +101,14 @@ impl NumOpsDispatch for Utf8Chunked { } } +impl NumOpsDispatch for BinaryChunked { + fn add_to(&self, rhs: &Series) -> PolarsResult { + let rhs = self.unpack_series_matching_type(rhs)?; + let out = self + rhs; + Ok(out.into_series()) + } +} + #[cfg(feature = "checked_arithmetic")] pub mod checked { use num::{CheckedDiv, ToPrimitive, Zero}; diff --git a/polars/polars-core/src/series/implementations/binary.rs b/polars/polars-core/src/series/implementations/binary.rs new file mode 100644 index 000000000000..e96816b88867 --- /dev/null +++ b/polars/polars-core/src/series/implementations/binary.rs @@ -0,0 +1,367 @@ +use std::borrow::Cow; + +use ahash::RandomState; +use polars_arrow::prelude::QuantileInterpolOptions; + +use super::{private, IntoSeries, SeriesTrait, *}; +use crate::chunked_array::comparison::*; +use crate::chunked_array::ops::compare_inner::{ + IntoPartialEqInner, IntoPartialOrdInner, PartialEqInner, PartialOrdInner, +}; +use crate::chunked_array::ops::explode::ExplodeByOffsets; +use crate::chunked_array::AsSinglePtr; +use crate::fmt::FmtList; +use crate::frame::groupby::*; +use crate::frame::hash_join::ZipOuterJoinColumn; +use crate::prelude::*; +use crate::series::implementations::SeriesWrap; + +impl private::PrivateSeries for SeriesWrap { + fn compute_len(&mut self) { + self.0.compute_len() + } + fn _field(&self) -> Cow { + Cow::Borrowed(self.0.ref_field()) + } + fn _dtype(&self) -> &DataType { + self.0.ref_field().data_type() + } + fn explode_by_offsets(&self, offsets: &[i64]) -> Series { + self.0.explode_by_offsets(offsets) + } + + fn _set_sorted(&mut self, is_sorted: IsSorted) { + self.0.set_sorted2(is_sorted) + } + + unsafe fn equal_element(&self, idx_self: usize, idx_other: usize, other: &Series) -> bool { + self.0.equal_element(idx_self, idx_other, other) + } + + #[cfg(feature = "zip_with")] + fn zip_with_same_type(&self, mask: &BooleanChunked, other: &Series) -> PolarsResult { + ChunkZip::zip_with(&self.0, mask, other.as_ref().as_ref()).map(|ca| ca.into_series()) + } + fn into_partial_eq_inner<'a>(&'a self) -> Box { + (&self.0).into_partial_eq_inner() + } + fn into_partial_ord_inner<'a>(&'a self) -> Box { + (&self.0).into_partial_ord_inner() + } + + fn vec_hash(&self, random_state: RandomState) -> PolarsResult> { + Ok(self.0.vec_hash(random_state)) + } + + fn vec_hash_combine(&self, build_hasher: RandomState, hashes: &mut [u64]) -> PolarsResult<()> { + self.0.vec_hash_combine(build_hasher, hashes); + Ok(()) + } + + unsafe fn agg_list(&self, groups: &GroupsProxy) -> Series { + self.0.agg_list(groups) + } + + fn zip_outer_join_column( + &self, + right_column: &Series, + opt_join_tuples: &[(Option, Option)], + ) -> Series { + ZipOuterJoinColumn::zip_outer_join_column(&self.0, right_column, opt_join_tuples) + } + fn subtract(&self, rhs: &Series) -> PolarsResult { + NumOpsDispatch::subtract(&self.0, rhs) + } + fn add_to(&self, rhs: &Series) -> PolarsResult { + NumOpsDispatch::add_to(&self.0, rhs) + } + fn multiply(&self, rhs: &Series) -> PolarsResult { + NumOpsDispatch::multiply(&self.0, rhs) + } + fn divide(&self, rhs: &Series) -> PolarsResult { + NumOpsDispatch::divide(&self.0, rhs) + } + fn remainder(&self, rhs: &Series) -> PolarsResult { + NumOpsDispatch::remainder(&self.0, rhs) + } + fn group_tuples(&self, multithreaded: bool, sorted: bool) -> PolarsResult { + IntoGroupsProxy::group_tuples(&self.0, multithreaded, sorted) + } + + #[cfg(feature = "sort_multiple")] + fn argsort_multiple(&self, by: &[Series], reverse: &[bool]) -> PolarsResult { + self.0.argsort_multiple(by, reverse) + } +} + +impl SeriesTrait for SeriesWrap { + fn is_sorted(&self) -> IsSorted { + if self.0.is_sorted() { + IsSorted::Ascending + } else if self.0.is_sorted_reverse() { + IsSorted::Descending + } else { + IsSorted::Not + } + } + + #[cfg(feature = "interpolate")] + fn interpolate(&self) -> Series { + self.0.clone().into_series() + } + + fn rename(&mut self, name: &str) { + self.0.rename(name); + } + + fn chunk_lengths(&self) -> ChunkIdIter { + self.0.chunk_id() + } + fn name(&self) -> &str { + self.0.name() + } + + fn chunks(&self) -> &Vec { + self.0.chunks() + } + fn shrink_to_fit(&mut self) { + self.0.shrink_to_fit() + } + + fn append_array(&mut self, other: ArrayRef) -> PolarsResult<()> { + self.0.append_array(other) + } + + fn slice(&self, offset: i64, length: usize) -> Series { + self.0.slice(offset, length).into_series() + } + + fn append(&mut self, other: &Series) -> PolarsResult<()> { + if self.0.dtype() == other.dtype() { + // todo! add object + self.0.append(other.as_ref().as_ref()); + Ok(()) + } else { + Err(PolarsError::SchemaMisMatch( + "cannot append Series; data types don't match".into(), + )) + } + } + + fn extend(&mut self, other: &Series) -> PolarsResult<()> { + if self.0.dtype() == other.dtype() { + self.0.extend(other.as_ref().as_ref()); + Ok(()) + } else { + Err(PolarsError::SchemaMisMatch( + "cannot extend Series; data types don't match".into(), + )) + } + } + + fn filter(&self, filter: &BooleanChunked) -> PolarsResult { + ChunkFilter::filter(&self.0, filter).map(|ca| ca.into_series()) + } + + #[cfg(feature = "chunked_ids")] + unsafe fn _take_chunked_unchecked(&self, by: &[ChunkId], sorted: IsSorted) -> Series { + self.0.take_chunked_unchecked(by, sorted).into_series() + } + + #[cfg(feature = "chunked_ids")] + unsafe fn _take_opt_chunked_unchecked(&self, by: &[Option]) -> Series { + self.0.take_opt_chunked_unchecked(by).into_series() + } + + fn take(&self, indices: &IdxCa) -> PolarsResult { + let indices = if indices.chunks.len() > 1 { + Cow::Owned(indices.rechunk()) + } else { + Cow::Borrowed(indices) + }; + Ok(ChunkTake::take(&self.0, (&*indices).into())?.into_series()) + } + + fn take_iter(&self, iter: &mut dyn TakeIterator) -> PolarsResult { + Ok(ChunkTake::take(&self.0, iter.into())?.into_series()) + } + + fn take_every(&self, n: usize) -> Series { + self.0.take_every(n).into_series() + } + + unsafe fn take_iter_unchecked(&self, iter: &mut dyn TakeIterator) -> Series { + ChunkTake::take_unchecked(&self.0, iter.into()).into_series() + } + + unsafe fn take_unchecked(&self, idx: &IdxCa) -> PolarsResult { + let idx = if idx.chunks.len() > 1 { + Cow::Owned(idx.rechunk()) + } else { + Cow::Borrowed(idx) + }; + + let mut out = ChunkTake::take_unchecked(&self.0, (&*idx).into()); + + if self.0.is_sorted() && (idx.is_sorted() || idx.is_sorted_reverse()) { + out.set_sorted2(idx.is_sorted2()) + } + + Ok(out.into_series()) + } + + unsafe fn take_opt_iter_unchecked(&self, iter: &mut dyn TakeIteratorNulls) -> Series { + ChunkTake::take_unchecked(&self.0, iter.into()).into_series() + } + + #[cfg(feature = "take_opt_iter")] + fn take_opt_iter(&self, iter: &mut dyn TakeIteratorNulls) -> PolarsResult { + Ok(ChunkTake::take(&self.0, iter.into())?.into_series()) + } + + fn len(&self) -> usize { + self.0.len() + } + + fn rechunk(&self) -> Series { + self.0.rechunk().into_series() + } + + fn expand_at_index(&self, index: usize, length: usize) -> Series { + ChunkExpandAtIndex::expand_at_index(&self.0, index, length).into_series() + } + + fn cast(&self, data_type: &DataType) -> PolarsResult { + self.0.cast(data_type) + } + + fn get(&self, index: usize) -> AnyValue { + self.0.get_any_value(index) + } + + #[inline] + #[cfg(feature = "private")] + unsafe fn get_unchecked(&self, index: usize) -> AnyValue { + self.0.get_any_value_unchecked(index) + } + + fn sort_with(&self, options: SortOptions) -> Series { + ChunkSort::sort_with(&self.0, options).into_series() + } + + fn argsort(&self, options: SortOptions) -> IdxCa { + ChunkSort::argsort(&self.0, options) + } + + fn null_count(&self) -> usize { + self.0.null_count() + } + + fn has_validity(&self) -> bool { + self.0.has_validity() + } + + fn unique(&self) -> PolarsResult { + ChunkUnique::unique(&self.0).map(|ca| ca.into_series()) + } + + fn n_unique(&self) -> PolarsResult { + ChunkUnique::n_unique(&self.0) + } + + fn arg_unique(&self) -> PolarsResult { + ChunkUnique::arg_unique(&self.0) + } + + fn arg_min(&self) -> Option { + ArgAgg::arg_min(&self.0) + } + + fn arg_max(&self) -> Option { + ArgAgg::arg_max(&self.0) + } + + fn is_null(&self) -> BooleanChunked { + self.0.is_null() + } + + fn is_not_null(&self) -> BooleanChunked { + self.0.is_not_null() + } + + fn is_unique(&self) -> PolarsResult { + ChunkUnique::is_unique(&self.0) + } + + fn is_duplicated(&self) -> PolarsResult { + ChunkUnique::is_duplicated(&self.0) + } + + fn reverse(&self) -> Series { + ChunkReverse::reverse(&self.0).into_series() + } + + fn as_single_ptr(&mut self) -> PolarsResult { + self.0.as_single_ptr() + } + + fn shift(&self, periods: i64) -> Series { + ChunkShift::shift(&self.0, periods).into_series() + } + + fn fill_null(&self, strategy: FillNullStrategy) -> PolarsResult { + ChunkFillNull::fill_null(&self.0, strategy).map(|ca| ca.into_series()) + } + + fn _sum_as_series(&self) -> Series { + ChunkAggSeries::sum_as_series(&self.0) + } + fn max_as_series(&self) -> Series { + ChunkAggSeries::max_as_series(&self.0) + } + fn min_as_series(&self) -> Series { + ChunkAggSeries::min_as_series(&self.0) + } + fn median_as_series(&self) -> Series { + QuantileAggSeries::median_as_series(&self.0) + } + fn var_as_series(&self, ddof: u8) -> Series { + VarAggSeries::var_as_series(&self.0, ddof) + } + fn std_as_series(&self, ddof: u8) -> Series { + VarAggSeries::std_as_series(&self.0, ddof) + } + fn quantile_as_series( + &self, + quantile: f64, + interpol: QuantileInterpolOptions, + ) -> PolarsResult { + QuantileAggSeries::quantile_as_series(&self.0, quantile, interpol) + } + + fn fmt_list(&self) -> String { + FmtList::fmt_list(&self.0) + } + fn clone_inner(&self) -> Arc { + Arc::new(SeriesWrap(Clone::clone(&self.0))) + } + + #[cfg(feature = "is_in")] + fn is_in(&self, other: &Series) -> PolarsResult { + IsIn::is_in(&self.0, other) + } + #[cfg(feature = "repeat_by")] + fn repeat_by(&self, by: &IdxCa) -> ListChunked { + RepeatBy::repeat_by(&self.0, by) + } + + #[cfg(feature = "is_first")] + fn is_first(&self) -> PolarsResult { + self.0.is_first() + } + + #[cfg(feature = "mode")] + fn mode(&self) -> PolarsResult { + Ok(self.0.mode()?.into_series()) + } +} diff --git a/polars/polars-core/src/series/implementations/mod.rs b/polars/polars-core/src/series/implementations/mod.rs index 027efb371787..deb81ed07cc5 100644 --- a/polars/polars-core/src/series/implementations/mod.rs +++ b/polars/polars-core/src/series/implementations/mod.rs @@ -1,3 +1,4 @@ +mod binary; mod boolean; #[cfg(feature = "dtype-categorical")] mod categorical; @@ -579,6 +580,7 @@ impl private::PrivateSeriesNumeric for SeriesWrap {} +impl private::PrivateSeriesNumeric for SeriesWrap {} impl private::PrivateSeriesNumeric for SeriesWrap {} impl private::PrivateSeriesNumeric for SeriesWrap { fn bit_repr_is_large(&self) -> bool { diff --git a/polars/polars-core/src/series/ops/downcast.rs b/polars/polars-core/src/series/ops/downcast.rs index 2fed1d1d8829..77891414e7f8 100644 --- a/polars/polars-core/src/series/ops/downcast.rs +++ b/polars/polars-core/src/series/ops/downcast.rs @@ -158,6 +158,18 @@ impl Series { } } + /// Unpack to ChunkedArray of dtype binary + pub fn binary(&self) -> PolarsResult<&BinaryChunked> { + match self.dtype() { + DataType::Binary => unsafe { + Ok(&*(self.as_ref() as *const dyn SeriesTrait as *const BinaryChunked)) + }, + dt => Err(PolarsError::SchemaMisMatch( + format!("Series of dtype: {:?} != binary", dt).into(), + )), + } + } + /// Unpack to ChunkedArray of dtype Time #[cfg(feature = "dtype-time")] pub fn time(&self) -> PolarsResult<&TimeChunked> { diff --git a/polars/polars-core/src/vector_hasher.rs b/polars/polars-core/src/vector_hasher.rs index 0f16d30a7196..9db7b14958d7 100644 --- a/polars/polars-core/src/vector_hasher.rs +++ b/polars/polars-core/src/vector_hasher.rs @@ -147,6 +147,34 @@ impl VecHash for Utf8Chunked { } } +impl VecHash for BinaryChunked { + fn vec_hash(&self, random_state: RandomState) -> Vec { + let null_h = get_null_hash_value(random_state.clone()); + let mut av = Vec::with_capacity(self.len()); + self.downcast_iter().for_each(|arr| { + av.extend(arr.into_iter().map(|opt_v| match opt_v { + Some(v) => <[u8]>::get_hash(v, &random_state), + None => null_h, + })) + }); + av + } + + fn vec_hash_combine(&self, random_state: RandomState, hashes: &mut [u64]) { + let null_h = get_null_hash_value(random_state.clone()); + self.apply_to_slice( + |opt_v, h| { + let l = match opt_v { + Some(v) => <[u8]>::get_hash(v, &random_state), + None => null_h, + }; + _boost_hash_combine(l, *h) + }, + hashes, + ) + } +} + impl VecHash for BooleanChunked { fn vec_hash(&self, random_state: RandomState) -> Vec { let mut av = Vec::with_capacity(self.len()); diff --git a/polars/polars-lazy/src/logical_plan/format.rs b/polars/polars-lazy/src/logical_plan/format.rs index c1b0d543f96b..01e822ea94d0 100644 --- a/polars/polars-lazy/src/logical_plan/format.rs +++ b/polars/polars-lazy/src/logical_plan/format.rs @@ -386,6 +386,7 @@ impl Debug for LiteralValue { Null => write!(f, "null"), Boolean(b) => write!(f, "{}", b), Utf8(s) => write!(f, "{}", s), + Binary(_) => write!(f, "[binary value]"), #[cfg(feature = "dtype-u8")] UInt8(v) => write!(f, "{}u8", v), #[cfg(feature = "dtype-u16")] diff --git a/polars/polars-lazy/src/logical_plan/lit.rs b/polars/polars-lazy/src/logical_plan/lit.rs index 0bcb35333874..05a851df4e3a 100644 --- a/polars/polars-lazy/src/logical_plan/lit.rs +++ b/polars/polars-lazy/src/logical_plan/lit.rs @@ -14,6 +14,8 @@ pub enum LiteralValue { Boolean(bool), /// A UTF8 encoded string type. Utf8(String), + /// A raw binary array + Binary(Vec), /// An unsigned 8-bit integer number. #[cfg(feature = "dtype-u8")] UInt8(u8), @@ -97,6 +99,7 @@ impl LiteralValue { LiteralValue::Float32(_) => DataType::Float32, LiteralValue::Float64(_) => DataType::Float64, LiteralValue::Utf8(_) => DataType::Utf8, + LiteralValue::Binary(_) => DataType::Binary, LiteralValue::Range { data_type, .. } => data_type.clone(), #[cfg(all(feature = "temporal", feature = "dtype-datetime"))] LiteralValue::DateTime(_, tu) => DataType::Datetime(*tu, None), @@ -125,6 +128,18 @@ impl<'a> Literal for &'a str { } } +impl Literal for Vec { + fn lit(self) -> Expr { + Expr::Literal(LiteralValue::Binary(self)) + } +} + +impl<'a> Literal for &'a [u8] { + fn lit(self) -> Expr { + Expr::Literal(LiteralValue::Binary(self.to_vec())) + } +} + impl TryFrom> for LiteralValue { type Error = PolarsError; fn try_from(value: AnyValue) -> PolarsResult { @@ -132,6 +147,7 @@ impl TryFrom> for LiteralValue { AnyValue::Null => Ok(Self::Null), AnyValue::Boolean(b) => Ok(Self::Boolean(b)), AnyValue::Utf8(s) => Ok(Self::Utf8(s.to_string())), + AnyValue::Binary(b) => Ok(Self::Binary(b.to_vec())), #[cfg(feature = "dtype-u8")] AnyValue::UInt8(u) => Ok(Self::UInt8(u)), #[cfg(feature = "dtype-u16")] diff --git a/polars/polars-lazy/src/physical_plan/expressions/literal.rs b/polars/polars-lazy/src/physical_plan/expressions/literal.rs index 157cb3d7cdd5..c4c36b018f3b 100644 --- a/polars/polars-lazy/src/physical_plan/expressions/literal.rs +++ b/polars/polars-lazy/src/physical_plan/expressions/literal.rs @@ -74,6 +74,7 @@ impl PhysicalExpr for LiteralExpr { } }, Utf8(v) => Utf8Chunked::full("literal", v, 1).into_series(), + Binary(v) => BinaryChunked::full("literal", v, 1).into_series(), #[cfg(all(feature = "temporal", feature = "dtype-datetime"))] DateTime(ndt, tu) => { use polars_core::chunked_array::temporal::conversion::*; From 7dffa6a7e6554da62f6d3c21d8116556555fc659 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=96zg=C3=BCr=20Akkurt?= Date: Fri, 23 Sep 2022 17:19:48 +0300 Subject: [PATCH 06/22] impl_named_from! binary --- polars/polars-core/src/chunked_array/builder/mod.rs | 2 +- polars/polars-core/src/named_from.rs | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/polars/polars-core/src/chunked_array/builder/mod.rs b/polars/polars-core/src/chunked_array/builder/mod.rs index da530aa42ab6..0a49bb478947 100644 --- a/polars/polars-core/src/chunked_array/builder/mod.rs +++ b/polars/polars-core/src/chunked_array/builder/mod.rs @@ -269,7 +269,7 @@ mod test { #[test] fn test_list_binary_builder() { let mut builder = ListBinaryChunkedBuilder::new("a", 10, 10); - builder.append_series(&Series::new("", &[b"foo", b"bar"])); + builder.append_series(&Series::new("", &["foo".as_bytes(), "bar".as_bytes()])); let ca = builder.finish(); dbg!(ca); } diff --git a/polars/polars-core/src/named_from.rs b/polars/polars-core/src/named_from.rs index 57ae747fb972..92d4c45668ad 100644 --- a/polars/polars-core/src/named_from.rs +++ b/polars/polars-core/src/named_from.rs @@ -63,6 +63,7 @@ macro_rules! impl_named_from { } impl_named_from!([String], Utf8Type, from_slice); +impl_named_from!([Vec], BinaryType, from_slice); impl_named_from!([bool], BooleanType, from_slice); #[cfg(feature = "dtype-u8")] impl_named_from!([u8], UInt8Type, from_slice); @@ -79,6 +80,7 @@ impl_named_from!([i64], Int64Type, from_slice); impl_named_from!([f32], Float32Type, from_slice); impl_named_from!([f64], Float64Type, from_slice); impl_named_from!([Option], Utf8Type, from_slice_options); +impl_named_from!([Option>], BinaryType, from_slice_options); impl_named_from!([Option], BooleanType, from_slice_options); #[cfg(feature = "dtype-u8")] impl_named_from!([Option], UInt8Type, from_slice_options); From 98b044298917a648b6ace16f148990c7382ecccf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=96zg=C3=BCr=20Akkurt?= Date: Fri, 23 Sep 2022 17:50:07 +0300 Subject: [PATCH 07/22] impl binary comparison --- .../src/chunked_array/comparison.rs | 201 +++++++++++++++++- polars/polars-core/src/series/comparison.rs | 1 + 2 files changed, 201 insertions(+), 1 deletion(-) diff --git a/polars/polars-core/src/chunked_array/comparison.rs b/polars/polars-core/src/chunked_array/comparison.rs index a64431fb331a..b74ad11c20ee 100644 --- a/polars/polars-core/src/chunked_array/comparison.rs +++ b/polars/polars-core/src/chunked_array/comparison.rs @@ -3,7 +3,7 @@ use std::ops::Not; use arrow::array::{BooleanArray, PrimitiveArray, Utf8Array}; use arrow::compute; use arrow::compute::comparison; -use arrow::scalar::{PrimitiveScalar, Scalar, Utf8Scalar}; +use arrow::scalar::{BinaryScalar, PrimitiveScalar, Scalar, Utf8Scalar}; use num::{NumCast, ToPrimitive}; use polars_arrow::prelude::FromData; @@ -636,6 +636,164 @@ impl ChunkCompare<&Utf8Chunked> for Utf8Chunked { } } +impl BinaryChunked { + fn comparison( + &self, + rhs: &BinaryChunked, + f: impl Fn(&BinaryArray, &BinaryArray) -> BooleanArray, + ) -> BooleanChunked { + let chunks = self + .downcast_iter() + .zip(rhs.downcast_iter()) + .map(|(left, right)| { + let arr = f(left, right); + Box::new(arr) as ArrayRef + }) + .collect(); + BooleanChunked::from_chunks("", chunks) + } +} + +impl ChunkCompare<&BinaryChunked> for BinaryChunked { + type Item = BooleanChunked; + + fn eq_missing(&self, rhs: &BinaryChunked) -> BooleanChunked { + impl_eq_missing!(self, rhs) + } + + fn equal(&self, rhs: &BinaryChunked) -> BooleanChunked { + // broadcast + if rhs.len() == 1 { + if let Some(value) = rhs.get(0) { + self.equal(value) + } else { + BooleanChunked::full("", false, self.len()) + } + } else if self.len() == 1 { + if let Some(value) = self.get(0) { + rhs.equal(value) + } else { + BooleanChunked::full("", false, self.len()) + } + } else { + let (lhs, rhs) = align_chunks_binary(self, rhs); + lhs.comparison(&rhs, comparison::binary::eq_and_validity) + } + } + + fn not_equal(&self, rhs: &BinaryChunked) -> BooleanChunked { + // broadcast + if rhs.len() == 1 { + if let Some(value) = rhs.get(0) { + self.not_equal(value) + } else { + BooleanChunked::full("", false, self.len()) + } + } else if self.len() == 1 { + if let Some(value) = self.get(0) { + rhs.not_equal(value) + } else { + BooleanChunked::full("", false, self.len()) + } + } else { + let (lhs, rhs) = align_chunks_binary(self, rhs); + lhs.comparison(&rhs, comparison::binary::neq_and_validity) + } + } + + fn gt(&self, rhs: &BinaryChunked) -> BooleanChunked { + // broadcast + if rhs.len() == 1 { + if let Some(value) = rhs.get(0) { + self.gt(value) + } else { + BooleanChunked::full("", false, self.len()) + } + } else if self.len() == 1 { + if let Some(value) = self.get(0) { + rhs.lt(value) + } else { + BooleanChunked::full("", false, self.len()) + } + } + // same length + else if self.chunk_id().zip(rhs.chunk_id()).all(|(l, r)| l == r) { + self.comparison(rhs, |l, r| comparison::gt(l, r)) + } else { + apply_operand_on_chunkedarray_by_iter!(self, rhs, >) + } + } + + fn gt_eq(&self, rhs: &BinaryChunked) -> BooleanChunked { + // broadcast + if rhs.len() == 1 { + if let Some(value) = rhs.get(0) { + self.gt_eq(value) + } else { + BooleanChunked::full("", false, self.len()) + } + } else if self.len() == 1 { + if let Some(value) = self.get(0) { + rhs.lt_eq(value) + } else { + BooleanChunked::full("", false, self.len()) + } + } + // same length + else if self.chunk_id().zip(rhs.chunk_id()).all(|(l, r)| l == r) { + self.comparison(rhs, |l, r| comparison::gt_eq(l, r)) + } else { + apply_operand_on_chunkedarray_by_iter!(self, rhs, >=) + } + } + + fn lt(&self, rhs: &BinaryChunked) -> BooleanChunked { + // broadcast + if rhs.len() == 1 { + if let Some(value) = rhs.get(0) { + self.lt(value) + } else { + BooleanChunked::full("", false, self.len()) + } + } else if self.len() == 1 { + if let Some(value) = self.get(0) { + rhs.gt(value) + } else { + BooleanChunked::full("", false, self.len()) + } + } + // same length + else if self.chunk_id().zip(rhs.chunk_id()).all(|(l, r)| l == r) { + self.comparison(rhs, |l, r| comparison::lt(l, r)) + } else { + apply_operand_on_chunkedarray_by_iter!(self, rhs, <) + } + } + + fn lt_eq(&self, rhs: &BinaryChunked) -> BooleanChunked { + // broadcast + if rhs.len() == 1 { + if let Some(value) = rhs.get(0) { + self.lt_eq(value) + } else { + BooleanChunked::full("", false, self.len()) + } + } else if self.len() == 1 { + if let Some(value) = self.get(0) { + rhs.gt_eq(value) + } else { + BooleanChunked::full("", false, self.len()) + } + } + // same length + else if self.chunk_id().zip(rhs.chunk_id()).all(|(l, r)| l == r) { + self.comparison(rhs, |l, r| comparison::lt_eq(l, r)) + } else { + apply_operand_on_chunkedarray_by_iter!(self, rhs, <=) + } + } +} + impl ChunkedArray where T: PolarsNumericType, @@ -728,6 +886,47 @@ impl ChunkCompare<&str> for Utf8Chunked { } } +impl BinaryChunked { + fn binary_compare_scalar( + &self, + rhs: &[u8], + f: impl Fn(&BinaryArray, &dyn Scalar) -> BooleanArray, + ) -> BooleanChunked { + let scalar = BinaryScalar::::new(Some(rhs)); + self.apply_kernel_cast(&|arr| Box::new(f(arr, &scalar))) + } +} + +impl ChunkCompare<&[u8]> for BinaryChunked { + type Item = BooleanChunked; + fn eq_missing(&self, rhs: &[u8]) -> BooleanChunked { + self.equal(rhs) + } + + fn equal(&self, rhs: &[u8]) -> BooleanChunked { + self.binary_compare_scalar(rhs, |l, rhs| comparison::eq_scalar_and_validity(l, rhs)) + } + fn not_equal(&self, rhs: &[u8]) -> BooleanChunked { + self.binary_compare_scalar(rhs, |l, rhs| comparison::neq_scalar_and_validity(l, rhs)) + } + + fn gt(&self, rhs: &[u8]) -> BooleanChunked { + self.binary_compare_scalar(rhs, |l, rhs| comparison::gt_scalar(l, rhs)) + } + + fn gt_eq(&self, rhs: &[u8]) -> BooleanChunked { + self.binary_compare_scalar(rhs, |l, rhs| comparison::gt_eq_scalar(l, rhs)) + } + + fn lt(&self, rhs: &[u8]) -> BooleanChunked { + self.binary_compare_scalar(rhs, |l, rhs| comparison::lt_scalar(l, rhs)) + } + + fn lt_eq(&self, rhs: &[u8]) -> BooleanChunked { + self.binary_compare_scalar(rhs, |l, rhs| comparison::lt_eq_scalar(l, rhs)) + } +} + macro_rules! impl_cmp_list { ($self:ident, $rhs:ident, $cmp_method:ident) => {{ match ($self.has_validity(), $rhs.has_validity()) { diff --git a/polars/polars-core/src/series/comparison.rs b/polars/polars-core/src/series/comparison.rs index e10a9b0124c9..ab3e8db48d71 100644 --- a/polars/polars-core/src/series/comparison.rs +++ b/polars/polars-core/src/series/comparison.rs @@ -22,6 +22,7 @@ macro_rules! impl_compare { match lhs.dtype() { DataType::Boolean => lhs.bool().unwrap().$method(rhs.bool().unwrap()), DataType::Utf8 => lhs.utf8().unwrap().$method(rhs.utf8().unwrap()), + DataType::Binary => lhs.binary().unwrap().$method(rhs.binary().unwrap()), DataType::UInt8 => lhs.u8().unwrap().$method(rhs.u8().unwrap()), DataType::UInt16 => lhs.u16().unwrap().$method(rhs.u16().unwrap()), DataType::UInt32 => lhs.u32().unwrap().$method(rhs.u32().unwrap()), From 06da9b640f58c8a9df7abc2b701e309c6a98dee8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=96zg=C3=BCr=20Akkurt?= Date: Fri, 23 Sep 2022 18:46:41 +0300 Subject: [PATCH 08/22] read arrow binary as binary instead of List --- polars/polars-core/src/series/from.rs | 39 ++++------------------- polars/polars-core/src/utils/supertype.rs | 4 +-- 2 files changed, 9 insertions(+), 34 deletions(-) diff --git a/polars/polars-core/src/series/from.rs b/polars/polars-core/src/series/from.rs index e7c2dc301bdd..8186bae2b8d9 100644 --- a/polars/polars-core/src/series/from.rs +++ b/polars/polars-core/src/series/from.rs @@ -95,6 +95,13 @@ impl Series { let chunks = cast_chunks(&chunks, &DataType::Utf8, false).unwrap(); Ok(Utf8Chunked::from_chunks(name, chunks).into_series()) } + ArrowDataType::LargeBinary => { + Ok(BinaryChunked::from_chunks(name, chunks).into_series()) + } + ArrowDataType::Binary => { + let chunks = cast_chunks(&chunks, &DataType::Binary, false).unwrap(); + Ok(BinaryChunked::from_chunks(name, chunks).into_series()) + } ArrowDataType::List(_) | ArrowDataType::LargeList(_) => { let chunks = chunks.iter().map(convert_inner_types).collect(); Ok(ListChunked::from_chunks(name, chunks).into_series()) @@ -254,38 +261,6 @@ impl Series { // the invariants of an Arrow Dictionary guarantee the keys are in bounds Ok(CategoricalChunked::from_keys_and_values(name, keys, values).into_series()) } - #[cfg(not(feature = "dtype-u8"))] - ArrowDataType::LargeBinary | ArrowDataType::Binary => { - panic!("activate dtype-u8 to read binary data into polars List") - } - #[cfg(feature = "dtype-u8")] - ArrowDataType::LargeBinary | ArrowDataType::Binary => { - let chunks = chunks - .iter() - .map(|arr| { - let arr = cast(&**arr, &ArrowDataType::LargeBinary).unwrap(); - - let arr = arr.as_any().downcast_ref::>().unwrap(); - let values = arr.values().clone(); - let offsets = arr.offsets().clone(); - let validity = arr.validity().cloned(); - - let values = Box::new(PrimitiveArray::from_data( - ArrowDataType::UInt8, - values, - None, - )); - - let dtype = ListArray::::default_datatype(ArrowDataType::UInt8); - // Safety: - // offsets are monotonically increasing - Box::new(ListArray::::new_unchecked( - dtype, offsets, values, validity, - )) as ArrayRef - }) - .collect(); - Ok(ListChunked::from_chunks(name, chunks).into()) - } #[cfg(feature = "object")] ArrowDataType::Extension(s, _, Some(_)) if s == "POLARS_EXTENSION_TYPE" => { assert_eq!(chunks.len(), 1); diff --git a/polars/polars-core/src/utils/supertype.rs b/polars/polars-core/src/utils/supertype.rs index a535e0f4afe0..9e151dc4bb17 100644 --- a/polars/polars-core/src/utils/supertype.rs +++ b/polars/polars-core/src/utils/supertype.rs @@ -205,8 +205,8 @@ pub fn get_supertype(l: &DataType, r: &DataType) -> Option { #[cfg(all(feature = "dtype-date", feature = "dtype-time"))] (Date, Time) => Some(Int64), - // every known type can be casted to a string - (dt, Utf8) if dt != &DataType::Unknown => Some(Utf8), + // every known type can be casted to a string except binary + (dt, Utf8) if dt != &DataType::Unknown && dt != &DataType::Binary => Some(Utf8), (dt, Null) => Some(dt.clone()), From 14353efb8747c296681fba48fabd2f6c24123d03 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=96zg=C3=BCr=20Akkurt?= Date: Mon, 26 Sep 2022 19:56:06 +0300 Subject: [PATCH 09/22] add Binary to Series::from_chunks_and_dtype_unchecked --- polars/polars-core/src/series/from.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/polars/polars-core/src/series/from.rs b/polars/polars-core/src/series/from.rs index 8186bae2b8d9..0c0b50781129 100644 --- a/polars/polars-core/src/series/from.rs +++ b/polars/polars-core/src/series/from.rs @@ -61,6 +61,7 @@ impl Series { .into_series(), List(_) => ListChunked::from_chunks(name, chunks).cast(dtype).unwrap(), Utf8 => Utf8Chunked::from_chunks(name, chunks).into_series(), + Binary => BinaryChunked::from_chunks(name, chunks).into_series(), #[cfg(feature = "dtype-categorical")] Categorical(rev_map) => { let cats = UInt32Chunked::from_chunks(name, chunks); From 90a58cb187ad91e1095ea978b3903d2c0e7e6a74 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=96zg=C3=BCr=20Akkurt?= Date: Mon, 26 Sep 2022 21:42:54 +0300 Subject: [PATCH 10/22] add Binary to match_dtype_to_logical_apply_macro --- polars/polars-core/src/chunked_array/builder/list.rs | 8 ++++++++ polars/polars-core/src/series/ops/null.rs | 7 ++++++- polars/polars-core/src/utils/mod.rs | 3 ++- 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/polars/polars-core/src/chunked_array/builder/list.rs b/polars/polars-core/src/chunked_array/builder/list.rs index 8f53916330ae..6827273b9904 100644 --- a/polars/polars-core/src/chunked_array/builder/list.rs +++ b/polars/polars-core/src/chunked_array/builder/list.rs @@ -470,10 +470,18 @@ pub fn get_list_builder( Box::new(builder) }}; } + macro_rules! get_binary_builder { + () => {{ + let builder = + ListBinaryChunkedBuilder::new(&name, list_capacity, 5 * value_capacity); + Box::new(builder) + }}; + } Ok(match_dtype_to_logical_apply_macro!( physical_type, get_primitive_builder, get_utf8_builder, + get_binary_builder, get_bool_builder )) } diff --git a/polars/polars-core/src/series/ops/null.rs b/polars/polars-core/src/series/ops/null.rs index 39f33279ddf1..1741606648d2 100644 --- a/polars/polars-core/src/series/ops/null.rs +++ b/polars/polars-core/src/series/ops/null.rs @@ -50,7 +50,12 @@ impl Series { ChunkedArray::::full_null(name, size).into_series() }}; } - match_dtype_to_logical_apply_macro!(dtype, primitive, utf8, bool) + macro_rules! binary { + () => {{ + ChunkedArray::::full_null(name, size).into_series() + }}; + } + match_dtype_to_logical_apply_macro!(dtype, primitive, utf8, binary, bool) } } } diff --git a/polars/polars-core/src/utils/mod.rs b/polars/polars-core/src/utils/mod.rs index b6d84cbde9bb..3ba1c434b3d4 100644 --- a/polars/polars-core/src/utils/mod.rs +++ b/polars/polars-core/src/utils/mod.rs @@ -257,9 +257,10 @@ macro_rules! match_dtype_to_physical_apply_macro { /// Apply a macro on the Series #[macro_export] macro_rules! match_dtype_to_logical_apply_macro { - ($obj:expr, $macro:ident, $macro_utf8:ident, $macro_bool:ident $(, $opt_args:expr)*) => {{ + ($obj:expr, $macro:ident, $macro_utf8:ident, $macro_binary:ident, $macro_bool:ident $(, $opt_args:expr)*) => {{ match $obj { DataType::Utf8 => $macro_utf8!($($opt_args)*), + DataType::Binary => $macro_binary!($($opt_args)*), DataType::Boolean => $macro_bool!($($opt_args)*), #[cfg(feature = "dtype-u8")] DataType::UInt8 => $macro!(UInt8Type $(, $opt_args)*), From 69846bf6793abc6de71be2c822f7bc3dcf11c86c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=96zg=C3=BCr=20Akkurt?= Date: Tue, 27 Sep 2022 13:46:22 +0300 Subject: [PATCH 11/22] update toml files --- polars/polars-arrow/Cargo.toml | 6 ------ polars/polars-core/Cargo.toml | 22 ---------------------- polars/polars-io/Cargo.toml | 6 ------ 3 files changed, 34 deletions(-) diff --git a/polars/polars-arrow/Cargo.toml b/polars/polars-arrow/Cargo.toml index fb3196051564..91c8e66463c2 100644 --- a/polars/polars-arrow/Cargo.toml +++ b/polars/polars-arrow/Cargo.toml @@ -9,12 +9,6 @@ description = "Arrow interfaces for Polars DataFrame library" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -# arrow = { package = "arrow2", git = "https://github.com/jorgecarleitao/arrow2", rev = "6c102a0c3e2dbeb185360dd3d5c3637b5e2028fd", features = ["compute_concatenate"], default-features = false } -# arrow = { package = "arrow2", path = "../../../arrow2", features = ["compute_concatenate"], default-features = false } -# arrow = { package = "arrow2", git = "https://github.com/ritchie46/arrow2", branch = "comparison_and_validity", features = ["compute_concatenate"], default-features = false } -arrow = { package = "arrow2", version = "0.14", default-features = false, features = ["compute_concatenate"] } -hashbrown = "0.12" -num = "^0.4" arrow.workspace = true hashbrown.workspace = true num.workspace = true diff --git a/polars/polars-core/Cargo.toml b/polars/polars-core/Cargo.toml index 46aa56d976a8..0cfae570fe59 100644 --- a/polars/polars-core/Cargo.toml +++ b/polars/polars-core/Cargo.toml @@ -170,28 +170,6 @@ regex = { version = "1.5", optional = true } serde = { version = "1", features = ["derive"], optional = true } serde_json = { version = "1", optional = true } smartstring = { version = "1", optional = true } -thiserror = "^1.0" - -[dependencies.arrow] -package = "arrow2" -# git = "https://github.com/jorgecarleitao/arrow2" -# git = "https://github.com/ritchie46/arrow2" -# rev = "6c102a0c3e2dbeb185360dd3d5c3637b5e2028fd" -# path = "../../../arrow2" -# branch = "comparison_and_validity" -version = "0.14" -default-features = false -features = [ - "compute_aggregate", - "compute_arithmetics", - "compute_boolean", - "compute_boolean_kleene", - "compute_cast", - "compute_comparison", - "compute_concatenate", - "compute_filter", - "compute_if_then_else", -] thiserror.workspace = true [dev-dependencies] diff --git a/polars/polars-io/Cargo.toml b/polars/polars-io/Cargo.toml index 510b9a3fe555..23be147e26d0 100644 --- a/polars/polars-io/Cargo.toml +++ b/polars/polars-io/Cargo.toml @@ -36,12 +36,6 @@ temporal = ["dtype-datetime", "dtype-date", "dtype-time"] private = ["polars-time/private"] [dependencies] -ahash = "0.7" -anyhow = "1.0" -# arrow = { package = "arrow2", git = "https://github.com/jorgecarleitao/arrow2", rev = "6c102a0c3e2dbeb185360dd3d5c3637b5e2028fd", default-features = false } -# arrow = { package = "arrow2", git = "https://github.com/ritchie46/arrow2", branch = "comparison_and_validity", default-features = false } -arrow = { package = "arrow2", version = "0.14", default-features = false } -# arrow = { package = "arrow2", path = "../../../arrow2", default-features = false } ahash.workspace = true anyhow.workspace = true arrow.workspace = true From c5ac9576f1d6854f52f0500d33fdd24993611331 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=96zg=C3=BCr=20Akkurt?= Date: Wed, 28 Sep 2022 17:49:04 +0300 Subject: [PATCH 12/22] fix joins for binary --- .../frame/hash_join/single_keys_dispatch.rs | 134 ++++++++++++++++++ 1 file changed, 134 insertions(+) diff --git a/polars/polars-core/src/frame/hash_join/single_keys_dispatch.rs b/polars/polars-core/src/frame/hash_join/single_keys_dispatch.rs index 4b52106daad6..1016807267ce 100644 --- a/polars/polars-core/src/frame/hash_join/single_keys_dispatch.rs +++ b/polars/polars-core/src/frame/hash_join/single_keys_dispatch.rs @@ -15,6 +15,11 @@ impl Series { let rhs = rhs.utf8().unwrap(); lhs.hash_join_left(rhs) } + Binary => { + let lhs = lhs.binary().unwrap(); + let rhs = rhs.binary().unwrap(); + lhs.hash_join_left(rhs) + } _ => { if self.bit_repr_is_large() { let lhs = lhs.bit_repr_large(); @@ -40,6 +45,11 @@ impl Series { let rhs = rhs.utf8().unwrap(); lhs.hash_join_semi_anti(rhs, anti) } + Binary => { + let lhs = lhs.binary().unwrap(); + let rhs = rhs.binary().unwrap(); + lhs.hash_join_semi_anti(rhs, anti) + } _ => { if self.bit_repr_is_large() { let lhs = lhs.bit_repr_large(); @@ -65,6 +75,11 @@ impl Series { let rhs = rhs.utf8().unwrap(); lhs.hash_join_inner(rhs) } + Binary => { + let lhs = lhs.binary().unwrap(); + let rhs = rhs.binary().unwrap(); + lhs.hash_join_inner(rhs) + } _ => { if self.bit_repr_is_large() { let lhs = self.bit_repr_large(); @@ -92,6 +107,11 @@ impl Series { let rhs = rhs.utf8().unwrap(); lhs.hash_join_outer(rhs) } + Binary => { + let lhs = lhs.binary().unwrap(); + let rhs = rhs.binary().unwrap(); + lhs.hash_join_outer(rhs) + } _ => { if self.bit_repr_is_large() { let lhs = self.bit_repr_large(); @@ -409,6 +429,120 @@ impl Utf8Chunked { } } +pub(crate) fn prepare_bytes<'a>( + been_split: &'a [BinaryChunked], + hb: &RandomState, +) -> Vec>> { + POOL.install(|| { + been_split + .par_iter() + .map(|ca| { + ca.into_iter() + .map(|opt_b| { + let mut state = hb.build_hasher(); + opt_b.hash(&mut state); + let hash = state.finish(); + BytesHash::new(opt_b, hash) + }) + .collect::>() + }) + .collect() + }) +} + +impl BinaryChunked { + fn prepare( + &self, + other: &BinaryChunked, + swapped: bool, + ) -> (Vec, Vec, bool, RandomState) { + let n_threads = POOL.current_num_threads(); + + let (a, b, swap) = if swapped { + det_hash_prone_order!(self, other) + } else { + (self, other, false) + }; + + let hb = RandomState::default(); + let splitted_a = split_ca(a, n_threads).unwrap(); + let splitted_b = split_ca(b, n_threads).unwrap(); + + (splitted_a, splitted_b, swap, hb) + } + + // returns the join tuples and whether or not the lhs tuples are sorted + fn hash_join_inner(&self, other: &BinaryChunked) -> ((Vec, Vec), bool) { + let (splitted_a, splitted_b, swap, hb) = self.prepare(other, true); + let str_hashes_a = prepare_bytes(&splitted_a, &hb); + let str_hashes_b = prepare_bytes(&splitted_b, &hb); + ( + hash_join_tuples_inner(str_hashes_a, str_hashes_b, swap), + !swap, + ) + } + + fn hash_join_left(&self, other: &BinaryChunked) -> LeftJoinIds { + let (splitted_a, splitted_b, _, hb) = self.prepare(other, false); + let str_hashes_a = prepare_bytes(&splitted_a, &hb); + let str_hashes_b = prepare_bytes(&splitted_b, &hb); + + let (mapping_left, mapping_right) = + create_mappings(self.chunks(), other.chunks(), self.len(), other.len()); + hash_join_tuples_left( + str_hashes_a, + str_hashes_b, + mapping_left.as_deref(), + mapping_right.as_deref(), + ) + } + + #[cfg(feature = "semi_anti_join")] + fn hash_join_semi_anti(&self, other: &BinaryChunked, anti: bool) -> Vec { + let (splitted_a, splitted_b, _, hb) = self.prepare(other, false); + let str_hashes_a = prepare_bytes(&splitted_a, &hb); + let str_hashes_b = prepare_bytes(&splitted_b, &hb); + if anti { + hash_join_tuples_left_anti(str_hashes_a, str_hashes_b) + } else { + hash_join_tuples_left_semi(str_hashes_a, str_hashes_b) + } + } + + fn hash_join_outer(&self, other: &BinaryChunked) -> Vec<(Option, Option)> { + let (a, b, swap) = det_hash_prone_order!(self, other); + + let n_partitions = set_partition_size(); + let splitted_a = split_ca(a, n_partitions).unwrap(); + let splitted_b = split_ca(b, n_partitions).unwrap(); + + match (a.has_validity(), b.has_validity()) { + (false, false) => { + let iters_a = splitted_a + .iter() + .map(|ca| ca.into_no_null_iter()) + .collect::>(); + let iters_b = splitted_b + .iter() + .map(|ca| ca.into_no_null_iter()) + .collect::>(); + hash_join_tuples_outer(iters_a, iters_b, swap) + } + _ => { + let iters_a = splitted_a + .iter() + .map(|ca| ca.into_iter()) + .collect::>(); + let iters_b = splitted_b + .iter() + .map(|ca| ca.into_iter()) + .collect::>(); + hash_join_tuples_outer(iters_a, iters_b, swap) + } + } + } +} + #[cfg(feature = "semi_anti_join")] fn num_group_join_anti_semi( left: &ChunkedArray, From a668bd80edcb5f72275e17420ddb6693cc456001 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=96zg=C3=BCr=20Akkurt?= Date: Thu, 29 Sep 2022 00:30:48 +0300 Subject: [PATCH 13/22] fix predicate pushdown bug --- .../optimizer/predicate_pushdown/mod.rs | 4 +--- .../optimizer/predicate_pushdown/utils.rs | 15 --------------- 2 files changed, 1 insertion(+), 18 deletions(-) diff --git a/polars/polars-lazy/src/logical_plan/optimizer/predicate_pushdown/mod.rs b/polars/polars-lazy/src/logical_plan/optimizer/predicate_pushdown/mod.rs index cd5c6bf5a3d4..46c588b5f187 100644 --- a/polars/polars-lazy/src/logical_plan/optimizer/predicate_pushdown/mod.rs +++ b/polars/polars-lazy/src/logical_plan/optimizer/predicate_pushdown/mod.rs @@ -425,7 +425,7 @@ impl PredicatePushDown { let mut pushdown_right = optimizer::init_hashmap(Some(acc_predicates.len())); let mut local_predicates = Vec::with_capacity(acc_predicates.len()); - for (_, predicate) in acc_predicates { + for (name, predicate) in acc_predicates { // unique and duplicated can be caused by joins let matches = |e: &AExpr| matches!(e, AExpr::Function{function: FunctionExpr::IsDuplicated | FunctionExpr::IsUnique, ..}); @@ -453,7 +453,6 @@ impl PredicatePushDown { if !predicate_is_pushdown_boundary(predicate, expr_arena) { // no else if. predicate can be in both tables. if check_input_node(predicate, &schema_left, expr_arena) { - let name = get_insertion_name(expr_arena, predicate, &schema_left); insert_and_combine_predicate( &mut pushdown_left, name, @@ -467,7 +466,6 @@ impl PredicatePushDown { // in that case we should not push down as the user wants to filter on `x` // not on `x_rhs`. else if check_input_node(predicate, &schema_right, expr_arena) { - let name = get_insertion_name(expr_arena, predicate, &schema_right); insert_and_combine_predicate( &mut pushdown_right, name, diff --git a/polars/polars-lazy/src/logical_plan/optimizer/predicate_pushdown/utils.rs b/polars/polars-lazy/src/logical_plan/optimizer/predicate_pushdown/utils.rs index 6b23e5bddb0e..2415ebb0d71c 100644 --- a/polars/polars-lazy/src/logical_plan/optimizer/predicate_pushdown/utils.rs +++ b/polars/polars-lazy/src/logical_plan/optimizer/predicate_pushdown/utils.rs @@ -93,21 +93,6 @@ pub(super) fn roots_to_key(roots: &[Arc]) -> Arc { } } -pub(super) fn get_insertion_name( - expr_arena: &Arena, - predicate: Node, - schema: &Schema, -) -> Arc { - Arc::from( - expr_arena - .get(predicate) - .to_field(schema, Context::Default, expr_arena) - .unwrap() - .name() - .as_ref(), - ) -} - // this checks if a predicate from a node upstream can pass // the predicate in this filter // Cases where this cannot be the case: From e53b80c8fc948000672b3cc448100c651706d862 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=96zg=C3=BCr=20Akkurt?= Date: Fri, 30 Sep 2022 00:59:56 +0300 Subject: [PATCH 14/22] Revert "fix predicate pushdown bug" This reverts commit a668bd80edcb5f72275e17420ddb6693cc456001. --- .../optimizer/predicate_pushdown/mod.rs | 4 +++- .../optimizer/predicate_pushdown/utils.rs | 15 +++++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/polars/polars-lazy/src/logical_plan/optimizer/predicate_pushdown/mod.rs b/polars/polars-lazy/src/logical_plan/optimizer/predicate_pushdown/mod.rs index 46c588b5f187..cd5c6bf5a3d4 100644 --- a/polars/polars-lazy/src/logical_plan/optimizer/predicate_pushdown/mod.rs +++ b/polars/polars-lazy/src/logical_plan/optimizer/predicate_pushdown/mod.rs @@ -425,7 +425,7 @@ impl PredicatePushDown { let mut pushdown_right = optimizer::init_hashmap(Some(acc_predicates.len())); let mut local_predicates = Vec::with_capacity(acc_predicates.len()); - for (name, predicate) in acc_predicates { + for (_, predicate) in acc_predicates { // unique and duplicated can be caused by joins let matches = |e: &AExpr| matches!(e, AExpr::Function{function: FunctionExpr::IsDuplicated | FunctionExpr::IsUnique, ..}); @@ -453,6 +453,7 @@ impl PredicatePushDown { if !predicate_is_pushdown_boundary(predicate, expr_arena) { // no else if. predicate can be in both tables. if check_input_node(predicate, &schema_left, expr_arena) { + let name = get_insertion_name(expr_arena, predicate, &schema_left); insert_and_combine_predicate( &mut pushdown_left, name, @@ -466,6 +467,7 @@ impl PredicatePushDown { // in that case we should not push down as the user wants to filter on `x` // not on `x_rhs`. else if check_input_node(predicate, &schema_right, expr_arena) { + let name = get_insertion_name(expr_arena, predicate, &schema_right); insert_and_combine_predicate( &mut pushdown_right, name, diff --git a/polars/polars-lazy/src/logical_plan/optimizer/predicate_pushdown/utils.rs b/polars/polars-lazy/src/logical_plan/optimizer/predicate_pushdown/utils.rs index 2415ebb0d71c..6b23e5bddb0e 100644 --- a/polars/polars-lazy/src/logical_plan/optimizer/predicate_pushdown/utils.rs +++ b/polars/polars-lazy/src/logical_plan/optimizer/predicate_pushdown/utils.rs @@ -93,6 +93,21 @@ pub(super) fn roots_to_key(roots: &[Arc]) -> Arc { } } +pub(super) fn get_insertion_name( + expr_arena: &Arena, + predicate: Node, + schema: &Schema, +) -> Arc { + Arc::from( + expr_arena + .get(predicate) + .to_field(schema, Context::Default, expr_arena) + .unwrap() + .name() + .as_ref(), + ) +} + // this checks if a predicate from a node upstream can pass // the predicate in this filter // Cases where this cannot be the case: From 109b057fa0f047d03fdd6245493ec057cfe83f68 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=96zg=C3=BCr=20Akkurt?= Date: Fri, 30 Sep 2022 01:49:05 +0300 Subject: [PATCH 15/22] add dtype-binary feature gate --- polars/Cargo.toml | 7 +++++ polars/polars-core/Cargo.toml | 1 + .../src/chunked_array/arithmetic.rs | 4 +++ .../src/chunked_array/builder/from.rs | 1 + .../src/chunked_array/builder/list.rs | 11 +++++++ .../src/chunked_array/builder/mod.rs | 4 +++ polars/polars-core/src/chunked_array/cast.rs | 1 + .../src/chunked_array/comparison.rs | 9 +++++- .../src/chunked_array/iterator/mod.rs | 8 +++++ polars/polars-core/src/chunked_array/mod.rs | 2 ++ .../src/chunked_array/ops/aggregate.rs | 4 +++ .../src/chunked_array/ops/any_value.rs | 2 ++ .../src/chunked_array/ops/append.rs | 1 + .../src/chunked_array/ops/apply.rs | 2 ++ .../src/chunked_array/ops/compare_inner.rs | 11 +++++-- .../src/chunked_array/ops/downcast.rs | 1 + .../src/chunked_array/ops/explode.rs | 1 + .../src/chunked_array/ops/extend.rs | 1 + .../src/chunked_array/ops/fill_null.rs | 2 ++ .../src/chunked_array/ops/filter.rs | 1 + .../polars-core/src/chunked_array/ops/full.rs | 2 ++ .../src/chunked_array/ops/is_in.rs | 1 + .../polars-core/src/chunked_array/ops/mod.rs | 1 + .../src/chunked_array/ops/repeat_by.rs | 1 + .../src/chunked_array/ops/reverse.rs | 1 + .../polars-core/src/chunked_array/ops/set.rs | 1 + .../src/chunked_array/ops/shift.rs | 2 ++ .../src/chunked_array/ops/sort/mod.rs | 1 + .../src/chunked_array/ops/take/mod.rs | 1 + .../chunked_array/ops/take/take_chunked.rs | 1 + .../src/chunked_array/ops/take/take_every.rs | 1 + .../src/chunked_array/ops/take/take_random.rs | 5 +++ .../src/chunked_array/ops/take/take_single.rs | 1 + .../src/chunked_array/ops/unique/mod.rs | 2 ++ .../polars-core/src/chunked_array/ops/zip.rs | 1 + .../src/chunked_array/trusted_len.rs | 1 + .../src/chunked_array/upstream_traits.rs | 11 +++++-- polars/polars-core/src/datatypes/_serde.rs | 4 ++- polars/polars-core/src/datatypes/dtype.rs | 7 +++-- polars/polars-core/src/datatypes/field.rs | 1 + polars/polars-core/src/datatypes/mod.rs | 27 ++++++++++++++-- polars/polars-core/src/fmt.rs | 3 ++ .../frame/groupby/aggregations/agg_list.rs | 1 + .../src/frame/groupby/into_groups.rs | 1 + polars/polars-core/src/frame/hash_join/mod.rs | 1 + .../frame/hash_join/single_keys_dispatch.rs | 6 ++++ polars/polars-core/src/named_from.rs | 11 +++++++ polars/polars-core/src/prelude.rs | 7 +++-- polars/polars-core/src/series/any_value.rs | 3 ++ .../src/series/arithmetic/borrowed.rs | 1 + polars/polars-core/src/series/comparison.rs | 1 + polars/polars-core/src/series/from.rs | 31 +++++++++++++++++++ .../src/series/implementations/mod.rs | 2 ++ polars/polars-core/src/series/ops/downcast.rs | 1 + polars/polars-core/src/series/ops/null.rs | 7 +++++ polars/polars-core/src/utils/mod.rs | 1 + polars/polars-core/src/utils/supertype.rs | 4 +++ polars/polars-core/src/vector_hasher.rs | 1 + polars/polars-io/Cargo.toml | 1 + polars/polars-lazy/Cargo.toml | 1 + polars/polars-lazy/polars-plan/Cargo.toml | 1 + .../polars-plan/src/logical_plan/format.rs | 1 + .../polars-plan/src/logical_plan/lit.rs | 5 +++ .../src/physical_plan/expressions/literal.rs | 1 + polars/polars-ops/Cargo.toml | 1 + 65 files changed, 227 insertions(+), 13 deletions(-) diff --git a/polars/Cargo.toml b/polars/Cargo.toml index d05464928a54..b63b1fadc766 100644 --- a/polars/Cargo.toml +++ b/polars/Cargo.toml @@ -163,6 +163,7 @@ dtype-full = [ "dtype-u16", "dtype-categorical", "dtype-struct", + "dtype-binary", ] # sensible minimal set of opt-in datatypes @@ -210,6 +211,12 @@ dtype-struct = [ "polars-ops/dtype-struct", "polars-io/dtype-struct", ] +dtype-binary = [ + "polars-core/dtype-binary", + "polars-lazy/dtype-binary", + "polars-ops/dtype-binary", + "polars-io/dtype-binary", +] docs-selection = [ "csv-file", diff --git a/polars/polars-core/Cargo.toml b/polars/polars-core/Cargo.toml index 78dcd6a38acf..5208bc6af965 100644 --- a/polars/polars-core/Cargo.toml +++ b/polars/polars-core/Cargo.toml @@ -93,6 +93,7 @@ dtype-u8 = [] dtype-u16 = [] dtype-categorical = ["smartstring"] dtype-struct = [] +dtype-binary = [] parquet = ["arrow/io_parquet"] diff --git a/polars/polars-core/src/chunked_array/arithmetic.rs b/polars/polars-core/src/chunked_array/arithmetic.rs index 05cbdb2205d0..7a0210e11693 100644 --- a/polars/polars-core/src/chunked_array/arithmetic.rs +++ b/polars/polars-core/src/chunked_array/arithmetic.rs @@ -432,6 +432,7 @@ fn concat_strings(l: &str, r: &str) -> String { s } +#[cfg(feature = "dtype-binary")] fn concat_binary_arrs(l: &[u8], r: &[u8]) -> Vec { let mut v = Vec::with_capacity(l.len() + r.len()); v.extend_from_slice(l); @@ -501,6 +502,7 @@ impl Add<&str> for &Utf8Chunked { } } +#[cfg(feature = "dtype-binary")] impl Add for &BinaryChunked { type Output = BinaryChunked; @@ -536,6 +538,7 @@ impl Add for &BinaryChunked { } } +#[cfg(feature = "dtype-binary")] impl Add for BinaryChunked { type Output = BinaryChunked; @@ -544,6 +547,7 @@ impl Add for BinaryChunked { } } +#[cfg(feature = "dtype-binary")] impl Add<&[u8]> for &BinaryChunked { type Output = BinaryChunked; diff --git a/polars/polars-core/src/chunked_array/builder/from.rs b/polars/polars-core/src/chunked_array/builder/from.rs index 2450d83da596..f008a934d0e0 100644 --- a/polars/polars-core/src/chunked_array/builder/from.rs +++ b/polars/polars-core/src/chunked_array/builder/from.rs @@ -41,6 +41,7 @@ impl From<(&str, Utf8Array)> for Utf8Chunked { } } +#[cfg(feature = "dtype-binary")] impl From<(&str, BinaryArray)> for BinaryChunked { fn from(tpl: (&str, BinaryArray)) -> Self { let name = tpl.0; diff --git a/polars/polars-core/src/chunked_array/builder/list.rs b/polars/polars-core/src/chunked_array/builder/list.rs index 6827273b9904..07b964b4cf7f 100644 --- a/polars/polars-core/src/chunked_array/builder/list.rs +++ b/polars/polars-core/src/chunked_array/builder/list.rs @@ -179,6 +179,7 @@ where type LargePrimitiveBuilder = MutableListArray>; type LargeListUtf8Builder = MutableListArray>; +#[cfg(feature = "dtype-binary")] type LargeListBinaryBuilder = MutableListArray>; type LargeListBooleanBuilder = MutableListArray; @@ -262,12 +263,14 @@ impl ListBuilderTrait for ListUtf8ChunkedBuilder { } } +#[cfg(feature = "dtype-binary")] pub struct ListBinaryChunkedBuilder { builder: LargeListBinaryBuilder, field: Field, fast_explode: bool, } +#[cfg(feature = "dtype-binary")] impl ListBinaryChunkedBuilder { pub fn new(name: &str, capacity: usize, values_capacity: usize) -> Self { let values = MutableBinaryArray::::with_capacity(values_capacity); @@ -313,6 +316,7 @@ impl ListBinaryChunkedBuilder { } } +#[cfg(feature = "dtype-binary")] impl ListBuilderTrait for ListBinaryChunkedBuilder { fn append_opt_series(&mut self, opt_s: Option<&Series>) { match opt_s { @@ -470,6 +474,7 @@ pub fn get_list_builder( Box::new(builder) }}; } + #[cfg(feature = "dtype-binary")] macro_rules! get_binary_builder { () => {{ let builder = @@ -477,6 +482,12 @@ pub fn get_list_builder( Box::new(builder) }}; } + #[cfg(not(feature = "dtype-binary"))] + macro_rules! get_binary_builder { + () => {{ + unreachable!(); + }}; + } Ok(match_dtype_to_logical_apply_macro!( physical_type, get_primitive_builder, diff --git a/polars/polars-core/src/chunked_array/builder/mod.rs b/polars/polars-core/src/chunked_array/builder/mod.rs index 0a49bb478947..d2f53d815d52 100644 --- a/polars/polars-core/src/chunked_array/builder/mod.rs +++ b/polars/polars-core/src/chunked_array/builder/mod.rs @@ -1,3 +1,4 @@ +#[cfg(feature = "dtype-binary")] mod binary; mod boolean; mod from; @@ -12,6 +13,7 @@ use std::sync::Arc; use arrow::array::*; use arrow::bitmap::Bitmap; +#[cfg(feature = "dtype-binary")] pub use binary::*; pub use boolean::*; pub use list::*; @@ -161,6 +163,7 @@ where } } +#[cfg(feature = "dtype-binary")] impl NewChunkedArray for BinaryChunked where B: AsRef<[u8]>, @@ -266,6 +269,7 @@ mod test { dbg!(ca); } + #[cfg(feature = "dtype-binary")] #[test] fn test_list_binary_builder() { let mut builder = ListBinaryChunkedBuilder::new("a", 10, 10); diff --git a/polars/polars-core/src/chunked_array/cast.rs b/polars/polars-core/src/chunked_array/cast.rs index ca7c0c7ac922..49673e3e474b 100644 --- a/polars/polars-core/src/chunked_array/cast.rs +++ b/polars/polars-core/src/chunked_array/cast.rs @@ -113,6 +113,7 @@ impl ChunkCast for Utf8Chunked { } } +#[cfg(feature = "dtype-binary")] impl ChunkCast for BinaryChunked { fn cast(&self, data_type: &DataType) -> PolarsResult { cast_impl(self.name(), &self.chunks, data_type) diff --git a/polars/polars-core/src/chunked_array/comparison.rs b/polars/polars-core/src/chunked_array/comparison.rs index b74ad11c20ee..b97ad98e4ffa 100644 --- a/polars/polars-core/src/chunked_array/comparison.rs +++ b/polars/polars-core/src/chunked_array/comparison.rs @@ -3,7 +3,9 @@ use std::ops::Not; use arrow::array::{BooleanArray, PrimitiveArray, Utf8Array}; use arrow::compute; use arrow::compute::comparison; -use arrow::scalar::{BinaryScalar, PrimitiveScalar, Scalar, Utf8Scalar}; +#[cfg(feature = "dtype-binary")] +use arrow::scalar::BinaryScalar; +use arrow::scalar::{PrimitiveScalar, Scalar, Utf8Scalar}; use num::{NumCast, ToPrimitive}; use polars_arrow::prelude::FromData; @@ -636,6 +638,7 @@ impl ChunkCompare<&Utf8Chunked> for Utf8Chunked { } } +#[cfg(feature = "dtype-binary")] impl BinaryChunked { fn comparison( &self, @@ -654,6 +657,7 @@ impl BinaryChunked { } } +#[cfg(feature = "dtype-binary")] impl ChunkCompare<&BinaryChunked> for BinaryChunked { type Item = BooleanChunked; @@ -886,6 +890,7 @@ impl ChunkCompare<&str> for Utf8Chunked { } } +#[cfg(feature = "dtype-binary")] impl BinaryChunked { fn binary_compare_scalar( &self, @@ -897,6 +902,7 @@ impl BinaryChunked { } } +#[cfg(feature = "dtype-binary")] impl ChunkCompare<&[u8]> for BinaryChunked { type Item = BooleanChunked; fn eq_missing(&self, rhs: &[u8]) -> BooleanChunked { @@ -1096,6 +1102,7 @@ impl ChunkEqualElement for Utf8Chunked { } } +#[cfg(feature = "dtype-binary")] impl ChunkEqualElement for BinaryChunked { unsafe fn equal_element(&self, idx_self: usize, idx_other: usize, other: &Series) -> bool { let ca_other = other.as_ref().as_ref(); diff --git a/polars/polars-core/src/chunked_array/iterator/mod.rs b/polars/polars-core/src/chunked_array/iterator/mod.rs index 71e59952e2ee..8f7fa9d34ba3 100644 --- a/polars/polars-core/src/chunked_array/iterator/mod.rs +++ b/polars/polars-core/src/chunked_array/iterator/mod.rs @@ -8,6 +8,7 @@ use crate::series::iterator::SeriesIter; use crate::utils::CustomIterTools; type LargeStringArray = Utf8Array; +#[cfg(feature = "dtype-binary")] type LargeBinaryArray = BinaryArray; type LargeListArray = ListArray; pub mod par; @@ -210,6 +211,7 @@ impl Utf8Chunked { } } +#[cfg(feature = "dtype-binary")] impl<'a> IntoIterator for &'a BinaryChunked { type Item = Option<&'a [u8]>; type IntoIter = Box + 'a>; @@ -219,12 +221,14 @@ impl<'a> IntoIterator for &'a BinaryChunked { } } +#[cfg(feature = "dtype-binary")] pub struct BinaryIterNoNull<'a> { array: &'a LargeBinaryArray, current: usize, current_end: usize, } +#[cfg(feature = "dtype-binary")] impl<'a> BinaryIterNoNull<'a> { /// create a new iterator pub fn new(array: &'a LargeBinaryArray) -> Self { @@ -236,6 +240,7 @@ impl<'a> BinaryIterNoNull<'a> { } } +#[cfg(feature = "dtype-binary")] impl<'a> Iterator for BinaryIterNoNull<'a> { type Item = &'a [u8]; @@ -257,6 +262,7 @@ impl<'a> Iterator for BinaryIterNoNull<'a> { } } +#[cfg(feature = "dtype-binary")] impl<'a> DoubleEndedIterator for BinaryIterNoNull<'a> { fn next_back(&mut self) -> Option { if self.current_end == self.current { @@ -268,9 +274,11 @@ impl<'a> DoubleEndedIterator for BinaryIterNoNull<'a> { } } +#[cfg(feature = "dtype-binary")] /// all arrays have known size. impl<'a> ExactSizeIterator for BinaryIterNoNull<'a> {} +#[cfg(feature = "dtype-binary")] impl BinaryChunked { #[allow(clippy::wrong_self_convention)] #[doc(hidden)] diff --git a/polars/polars-core/src/chunked_array/mod.rs b/polars/polars-core/src/chunked_array/mod.rs index 49f72d25edca..ce94cd3c5506 100644 --- a/polars/polars-core/src/chunked_array/mod.rs +++ b/polars/polars-core/src/chunked_array/mod.rs @@ -521,6 +521,7 @@ where impl AsSinglePtr for BooleanChunked {} impl AsSinglePtr for ListChunked {} impl AsSinglePtr for Utf8Chunked {} +#[cfg(feature = "dtype-binary")] impl AsSinglePtr for BinaryChunked {} #[cfg(feature = "object")] impl AsSinglePtr for ObjectChunked {} @@ -602,6 +603,7 @@ impl ValueSize for Utf8Chunked { } } +#[cfg(feature = "dtype-binary")] impl ValueSize for BinaryChunked { fn get_values_size(&self) -> usize { self.chunks diff --git a/polars/polars-core/src/chunked_array/ops/aggregate.rs b/polars/polars-core/src/chunked_array/ops/aggregate.rs index 861bec9d1df6..4c7f39bc193b 100644 --- a/polars/polars-core/src/chunked_array/ops/aggregate.rs +++ b/polars/polars-core/src/chunked_array/ops/aggregate.rs @@ -692,6 +692,7 @@ impl VarAggSeries for Utf8Chunked { } } +#[cfg(feature = "dtype-binary")] impl VarAggSeries for BinaryChunked { fn var_as_series(&self, _ddof: u8) -> Series { Self::full_null(self.name(), 1).into_series() @@ -814,6 +815,7 @@ impl QuantileAggSeries for Utf8Chunked { } } +#[cfg(feature = "dtype-binary")] impl QuantileAggSeries for BinaryChunked { fn quantile_as_series( &self, @@ -873,6 +875,7 @@ impl ChunkAggSeries for Utf8Chunked { } } +#[cfg(feature = "dtype-binary")] impl ChunkAggSeries for BinaryChunked { fn sum_as_series(&self) -> Series { BinaryChunked::full_null(self.name(), 1).into_series() @@ -940,6 +943,7 @@ where impl ArgAgg for BooleanChunked {} impl ArgAgg for Utf8Chunked {} +#[cfg(feature = "dtype-binary")] impl ArgAgg for BinaryChunked {} impl ArgAgg for ListChunked {} diff --git a/polars/polars-core/src/chunked_array/ops/any_value.rs b/polars/polars-core/src/chunked_array/ops/any_value.rs index a47357783567..3f891d34ed7d 100644 --- a/polars/polars-core/src/chunked_array/ops/any_value.rs +++ b/polars/polars-core/src/chunked_array/ops/any_value.rs @@ -30,6 +30,7 @@ pub(crate) unsafe fn arr_to_any_value<'a>( // TODO: insert types match dtype { DataType::Utf8 => downcast_and_pack!(LargeStringArray, Utf8), + #[cfg(feature = "dtype-binary")] DataType::Binary => downcast_and_pack!(LargeBinaryArray, Binary), DataType::Boolean => downcast_and_pack!(BooleanArray, Boolean), DataType::UInt8 => downcast_and_pack!(UInt8Array, UInt8), @@ -167,6 +168,7 @@ impl ChunkAnyValue for Utf8Chunked { } } +#[cfg(feature = "dtype-binary")] impl ChunkAnyValue for BinaryChunked { #[inline] unsafe fn get_any_value_unchecked(&self, index: usize) -> AnyValue { diff --git a/polars/polars-core/src/chunked_array/ops/append.rs b/polars/polars-core/src/chunked_array/ops/append.rs index 5be9d4ca0b60..e6018de0614d 100644 --- a/polars/polars-core/src/chunked_array/ops/append.rs +++ b/polars/polars-core/src/chunked_array/ops/append.rs @@ -44,6 +44,7 @@ impl Utf8Chunked { } } +#[cfg(feature = "dtype-binary")] #[doc(hidden)] impl BinaryChunked { pub fn append(&mut self, other: &Self) { diff --git a/polars/polars-core/src/chunked_array/ops/apply.rs b/polars/polars-core/src/chunked_array/ops/apply.rs index 71b4f89170d2..271445fc497c 100644 --- a/polars/polars-core/src/chunked_array/ops/apply.rs +++ b/polars/polars-core/src/chunked_array/ops/apply.rs @@ -446,6 +446,7 @@ impl<'a> ChunkApply<'a, &'a str, Cow<'a, str>> for Utf8Chunked { } } +#[cfg(feature = "dtype-binary")] impl<'a> ChunkApply<'a, &'a [u8], Cow<'a, [u8]>> for BinaryChunked { fn apply_cast_numeric(&'a self, f: F) -> ChunkedArray where @@ -587,6 +588,7 @@ impl ChunkApplyKernel for Utf8Chunked { } } +#[cfg(feature = "dtype-binary")] impl ChunkApplyKernel for BinaryChunked { fn apply_kernel(&self, f: &dyn Fn(&LargeBinaryArray) -> ArrayRef) -> Self { self.apply_kernel_cast(&f) diff --git a/polars/polars-core/src/chunked_array/ops/compare_inner.rs b/polars/polars-core/src/chunked_array/ops/compare_inner.rs index 06ff863c1daf..21a2609e3eba 100644 --- a/polars/polars-core/src/chunked_array/ops/compare_inner.rs +++ b/polars/polars-core/src/chunked_array/ops/compare_inner.rs @@ -4,10 +4,11 @@ use std::cmp::{Ordering, PartialEq}; +#[cfg(feature = "dtype-binary")] +use crate::chunked_array::ops::take::take_random::{BinaryTakeRandom, BinaryTakeRandomSingleChunk}; use crate::chunked_array::ops::take::take_random::{ - BinaryTakeRandom, BinaryTakeRandomSingleChunk, BoolTakeRandom, BoolTakeRandomSingleChunk, - NumTakeRandomChunked, NumTakeRandomCont, NumTakeRandomSingleChunk, Utf8TakeRandom, - Utf8TakeRandomSingleChunk, + BoolTakeRandom, BoolTakeRandomSingleChunk, NumTakeRandomChunked, NumTakeRandomCont, + NumTakeRandomSingleChunk, Utf8TakeRandom, Utf8TakeRandomSingleChunk, }; #[cfg(feature = "object")] use crate::chunked_array::ops::take::take_random::{ObjectTakeRandom, ObjectTakeRandomSingleChunk}; @@ -70,7 +71,9 @@ macro_rules! impl_traits { impl_traits!(Utf8TakeRandom<'_>); impl_traits!(Utf8TakeRandomSingleChunk<'_>); +#[cfg(feature = "dtype-binary")] impl_traits!(BinaryTakeRandom<'_>); +#[cfg(feature = "dtype-binary")] impl_traits!(BinaryTakeRandomSingleChunk<'_>); impl_traits!(BoolTakeRandom<'_>); impl_traits!(BoolTakeRandomSingleChunk<'_>); @@ -143,6 +146,7 @@ impl<'a> IntoPartialEqInner<'a> for &'a Utf8Chunked { } } +#[cfg(feature = "dtype-binary")] impl<'a> IntoPartialEqInner<'a> for &'a BinaryChunked { fn into_partial_eq_inner(self) -> Box { match self.chunks.len() { @@ -263,6 +267,7 @@ impl<'a> IntoPartialOrdInner<'a> for &'a Utf8Chunked { } } +#[cfg(feature = "dtype-binary")] impl<'a> IntoPartialOrdInner<'a> for &'a BinaryChunked { fn into_partial_ord_inner(self) -> Box { match self.chunks.len() { diff --git a/polars/polars-core/src/chunked_array/ops/downcast.rs b/polars/polars-core/src/chunked_array/ops/downcast.rs index 56a4afd0bb0a..a75ccced1144 100644 --- a/polars/polars-core/src/chunked_array/ops/downcast.rs +++ b/polars/polars-core/src/chunked_array/ops/downcast.rs @@ -131,6 +131,7 @@ impl Utf8Chunked { } } +#[cfg(feature = "dtype-binary")] #[doc(hidden)] impl BinaryChunked { pub fn downcast_iter(&self) -> impl Iterator> + DoubleEndedIterator { diff --git a/polars/polars-core/src/chunked_array/ops/explode.rs b/polars/polars-core/src/chunked_array/ops/explode.rs index 4c3fe81feaf5..e380ed3d153e 100644 --- a/polars/polars-core/src/chunked_array/ops/explode.rs +++ b/polars/polars-core/src/chunked_array/ops/explode.rs @@ -280,6 +280,7 @@ impl ExplodeByOffsets for Utf8Chunked { } } +#[cfg(feature = "dtype-binary")] impl ExplodeByOffsets for BinaryChunked { fn explode_by_offsets(&self, offsets: &[i64]) -> Series { debug_assert_eq!(self.chunks.len(), 1); diff --git a/polars/polars-core/src/chunked_array/ops/extend.rs b/polars/polars-core/src/chunked_array/ops/extend.rs index 334be24d1aca..2fa8330c9075 100644 --- a/polars/polars-core/src/chunked_array/ops/extend.rs +++ b/polars/polars-core/src/chunked_array/ops/extend.rs @@ -120,6 +120,7 @@ impl Utf8Chunked { } } +#[cfg(feature = "dtype-binary")] #[doc(hidden)] impl BinaryChunked { pub fn extend(&mut self, other: &Self) { diff --git a/polars/polars-core/src/chunked_array/ops/fill_null.rs b/polars/polars-core/src/chunked_array/ops/fill_null.rs index 18bbb144b616..c4470f4354f4 100644 --- a/polars/polars-core/src/chunked_array/ops/fill_null.rs +++ b/polars/polars-core/src/chunked_array/ops/fill_null.rs @@ -107,6 +107,7 @@ fn fill_backward_limit_utf8(ca: &Utf8Chunked, limit: IdxSize) -> Utf8Chunked { out.into_iter().rev().collect_trusted() } +#[cfg(feature = "dtype-binary")] fn fill_backward_limit_binary(ca: &BinaryChunked, limit: IdxSize) -> BinaryChunked { let mut cnt = 0; let mut previous = None; @@ -370,6 +371,7 @@ impl ChunkFillNullValue<&str> for Utf8Chunked { } } +#[cfg(feature = "dtype-binary")] impl ChunkFillNull for BinaryChunked { fn fill_null(&self, strategy: FillNullStrategy) -> PolarsResult { // nothing to fill diff --git a/polars/polars-core/src/chunked_array/ops/filter.rs b/polars/polars-core/src/chunked_array/ops/filter.rs index d80038c76257..a4d76571b534 100644 --- a/polars/polars-core/src/chunked_array/ops/filter.rs +++ b/polars/polars-core/src/chunked_array/ops/filter.rs @@ -93,6 +93,7 @@ impl ChunkFilter for Utf8Chunked { } } +#[cfg(feature = "dtype-binary")] impl ChunkFilter for BinaryChunked { fn filter(&self, filter: &BooleanChunked) -> PolarsResult> { // broadcast diff --git a/polars/polars-core/src/chunked_array/ops/full.rs b/polars/polars-core/src/chunked_array/ops/full.rs index 594e50c86b91..4e1e88c2cd08 100644 --- a/polars/polars-core/src/chunked_array/ops/full.rs +++ b/polars/polars-core/src/chunked_array/ops/full.rs @@ -64,6 +64,7 @@ impl ChunkFullNull for Utf8Chunked { } } +#[cfg(feature = "dtype-binary")] impl<'a> ChunkFull<&'a [u8]> for BinaryChunked { fn full(name: &str, value: &'a [u8], length: usize) -> Self { let mut builder = BinaryChunkedBuilder::new(name, length, length * value.len()); @@ -77,6 +78,7 @@ impl<'a> ChunkFull<&'a [u8]> for BinaryChunked { } } +#[cfg(feature = "dtype-binary")] impl ChunkFullNull for BinaryChunked { fn full_null(name: &str, length: usize) -> Self { let arr = new_null_array(DataType::Binary.to_arrow(), length); diff --git a/polars/polars-core/src/chunked_array/ops/is_in.rs b/polars/polars-core/src/chunked_array/ops/is_in.rs index fa38a4e199b4..932cd298685e 100644 --- a/polars/polars-core/src/chunked_array/ops/is_in.rs +++ b/polars/polars-core/src/chunked_array/ops/is_in.rs @@ -230,6 +230,7 @@ impl IsIn for Utf8Chunked { } } +#[cfg(feature = "dtype-binary")] impl IsIn for BinaryChunked { fn is_in(&self, other: &Series) -> PolarsResult { match other.dtype() { diff --git a/polars/polars-core/src/chunked_array/ops/mod.rs b/polars/polars-core/src/chunked_array/ops/mod.rs index 43c54c3b548a..3f441c509011 100644 --- a/polars/polars-core/src/chunked_array/ops/mod.rs +++ b/polars/polars-core/src/chunked_array/ops/mod.rs @@ -639,6 +639,7 @@ impl ChunkExpandAtIndex for Utf8Chunked { } } +#[cfg(feature = "dtype-binary")] impl ChunkExpandAtIndex for BinaryChunked { fn expand_at_index(&self, index: usize, length: usize) -> BinaryChunked { impl_chunk_expand!(self, length, index) diff --git a/polars/polars-core/src/chunked_array/ops/repeat_by.rs b/polars/polars-core/src/chunked_array/ops/repeat_by.rs index 41100d68caa6..2d31d30e24b6 100644 --- a/polars/polars-core/src/chunked_array/ops/repeat_by.rs +++ b/polars/polars-core/src/chunked_array/ops/repeat_by.rs @@ -63,6 +63,7 @@ impl RepeatBy for Utf8Chunked { ) } } +#[cfg(feature = "dtype-binary")] impl RepeatBy for BinaryChunked { fn repeat_by(&self, by: &IdxCa) -> ListChunked { let iter = self diff --git a/polars/polars-core/src/chunked_array/ops/reverse.rs b/polars/polars-core/src/chunked_array/ops/reverse.rs index 85a80758f06e..f3ec1f0e2c66 100644 --- a/polars/polars-core/src/chunked_array/ops/reverse.rs +++ b/polars/polars-core/src/chunked_array/ops/reverse.rs @@ -39,6 +39,7 @@ macro_rules! impl_reverse { impl_reverse!(BooleanType, BooleanChunked); impl_reverse!(Utf8Type, Utf8Chunked); +#[cfg(feature = "dtype-binary")] impl_reverse!(BinaryType, BinaryChunked); impl_reverse!(ListType, ListChunked); diff --git a/polars/polars-core/src/chunked_array/ops/set.rs b/polars/polars-core/src/chunked_array/ops/set.rs index d4e9b405b7aa..edd4001ae706 100644 --- a/polars/polars-core/src/chunked_array/ops/set.rs +++ b/polars/polars-core/src/chunked_array/ops/set.rs @@ -273,6 +273,7 @@ impl<'a> ChunkSet<'a, &'a str, String> for Utf8Chunked { } } +#[cfg(feature = "dtype-binary")] impl<'a> ChunkSet<'a, &'a [u8], Vec> for BinaryChunked { fn set_at_idx>( &'a self, diff --git a/polars/polars-core/src/chunked_array/ops/shift.rs b/polars/polars-core/src/chunked_array/ops/shift.rs index a2a55cfa9b4d..450f62b3d322 100644 --- a/polars/polars-core/src/chunked_array/ops/shift.rs +++ b/polars/polars-core/src/chunked_array/ops/shift.rs @@ -60,6 +60,7 @@ impl ChunkShiftFill> for Utf8Chunked { } } +#[cfg(feature = "dtype-binary")] impl ChunkShiftFill> for BinaryChunked { fn shift_and_fill(&self, periods: i64, fill_value: Option<&[u8]>) -> BinaryChunked { impl_shift_fill!(self, periods, fill_value) @@ -72,6 +73,7 @@ impl ChunkShift for Utf8Chunked { } } +#[cfg(feature = "dtype-binary")] impl ChunkShift for BinaryChunked { fn shift(&self, periods: i64) -> Self { self.shift_and_fill(periods, None) diff --git a/polars/polars-core/src/chunked_array/ops/sort/mod.rs b/polars/polars-core/src/chunked_array/ops/sort/mod.rs index 6a08a61ecfe3..f6be933758e0 100644 --- a/polars/polars-core/src/chunked_array/ops/sort/mod.rs +++ b/polars/polars-core/src/chunked_array/ops/sort/mod.rs @@ -508,6 +508,7 @@ impl ChunkSort for Utf8Chunked { } } +#[cfg(feature = "dtype-binary")] impl ChunkSort for BinaryChunked { fn sort_with(&self, options: SortOptions) -> ChunkedArray { sort_with_fast_path!(self, options); diff --git a/polars/polars-core/src/chunked_array/ops/take/mod.rs b/polars/polars-core/src/chunked_array/ops/take/mod.rs index 94095a1e2d1c..78584b835462 100644 --- a/polars/polars-core/src/chunked_array/ops/take/mod.rs +++ b/polars/polars-core/src/chunked_array/ops/take/mod.rs @@ -323,6 +323,7 @@ impl ChunkTake for Utf8Chunked { } } +#[cfg(feature = "dtype-binary")] impl ChunkTake for BinaryChunked { unsafe fn take_unchecked(&self, indices: TakeIdx) -> Self where diff --git a/polars/polars-core/src/chunked_array/ops/take/take_chunked.rs b/polars/polars-core/src/chunked_array/ops/take/take_chunked.rs index cb8b6e3206b0..778bb596ac65 100644 --- a/polars/polars-core/src/chunked_array/ops/take/take_chunked.rs +++ b/polars/polars-core/src/chunked_array/ops/take/take_chunked.rs @@ -90,6 +90,7 @@ impl TakeChunked for Utf8Chunked { } } +#[cfg(feature = "dtype-binary")] impl TakeChunked for BinaryChunked { unsafe fn take_chunked_unchecked(&self, by: &[ChunkId], sorted: IsSorted) -> Self { let arrs = self.downcast_iter().collect::>(); diff --git a/polars/polars-core/src/chunked_array/ops/take/take_every.rs b/polars/polars-core/src/chunked_array/ops/take/take_every.rs index 09da5861de57..be5c361943b0 100644 --- a/polars/polars-core/src/chunked_array/ops/take/take_every.rs +++ b/polars/polars-core/src/chunked_array/ops/take/take_every.rs @@ -41,6 +41,7 @@ impl ChunkTakeEvery for Utf8Chunked { } } +#[cfg(feature = "dtype-binary")] impl ChunkTakeEvery for BinaryChunked { fn take_every(&self, n: usize) -> BinaryChunked { let mut ca: Self = if !self.has_validity() { diff --git a/polars/polars-core/src/chunked_array/ops/take/take_random.rs b/polars/polars-core/src/chunked_array/ops/take/take_random.rs index d284b9949c32..dfcabe358691 100644 --- a/polars/polars-core/src/chunked_array/ops/take/take_random.rs +++ b/polars/polars-core/src/chunked_array/ops/take/take_random.rs @@ -232,11 +232,13 @@ impl<'a> IntoTakeRandom<'a> for &'a Utf8Chunked { } } +#[cfg(feature = "dtype-binary")] pub struct BinaryTakeRandom<'a> { pub(crate) chunks: Chunks<'a, BinaryArray>, pub(crate) chunk_lens: Vec, } +#[cfg(feature = "dtype-binary")] impl<'a> TakeRandom for BinaryTakeRandom<'a> { type Item = &'a [u8]; @@ -251,10 +253,12 @@ impl<'a> TakeRandom for BinaryTakeRandom<'a> { } } +#[cfg(feature = "dtype-binary")] pub struct BinaryTakeRandomSingleChunk<'a> { pub(crate) arr: &'a BinaryArray, } +#[cfg(feature = "dtype-binary")] impl<'a> TakeRandom for BinaryTakeRandomSingleChunk<'a> { type Item = &'a [u8]; @@ -273,6 +277,7 @@ impl<'a> TakeRandom for BinaryTakeRandomSingleChunk<'a> { } } +#[cfg(feature = "dtype-binary")] impl<'a> IntoTakeRandom<'a> for &'a BinaryChunked { type Item = &'a [u8]; type TakeRandom = TakeRandBranch2, BinaryTakeRandom<'a>>; diff --git a/polars/polars-core/src/chunked_array/ops/take/take_single.rs b/polars/polars-core/src/chunked_array/ops/take/take_single.rs index 9ee3cc25444d..7764b7fcff06 100644 --- a/polars/polars-core/src/chunked_array/ops/take/take_single.rs +++ b/polars/polars-core/src/chunked_array/ops/take/take_single.rs @@ -113,6 +113,7 @@ impl<'a> TakeRandom for &'a Utf8Chunked { } } +#[cfg(feature = "dtype-binary")] impl<'a> TakeRandom for &'a BinaryChunked { type Item = &'a [u8]; diff --git a/polars/polars-core/src/chunked_array/ops/unique/mod.rs b/polars/polars-core/src/chunked_array/ops/unique/mod.rs index 7663e4f57aa0..156f5fb15c2d 100644 --- a/polars/polars-core/src/chunked_array/ops/unique/mod.rs +++ b/polars/polars-core/src/chunked_array/ops/unique/mod.rs @@ -274,6 +274,7 @@ impl ChunkUnique for Utf8Chunked { } } +#[cfg(feature = "dtype-binary")] impl ChunkUnique for BinaryChunked { fn unique(&self) -> PolarsResult { match self.null_count() { @@ -462,6 +463,7 @@ mod is_first { } } + #[cfg(feature = "dtype-binary")] impl IsFirst for BinaryChunked { fn is_first(&self) -> PolarsResult { let mut unique = PlHashSet::new(); diff --git a/polars/polars-core/src/chunked_array/ops/zip.rs b/polars/polars-core/src/chunked_array/ops/zip.rs index ece5d4a74c49..7a498741fcc7 100644 --- a/polars/polars-core/src/chunked_array/ops/zip.rs +++ b/polars/polars-core/src/chunked_array/ops/zip.rs @@ -143,6 +143,7 @@ impl ChunkZip for Utf8Chunked { } } +#[cfg(feature = "dtype-binary")] impl ChunkZip for BinaryChunked { fn zip_with( &self, diff --git a/polars/polars-core/src/chunked_array/trusted_len.rs b/polars/polars-core/src/chunked_array/trusted_len.rs index 3b81a1a14fe5..134e5eba2170 100644 --- a/polars/polars-core/src/chunked_array/trusted_len.rs +++ b/polars/polars-core/src/chunked_array/trusted_len.rs @@ -203,6 +203,7 @@ where } } +#[cfg(feature = "dtype-binary")] impl FromTrustedLenIterator for BinaryChunked where Ptr: PolarsAsRef<[u8]>, diff --git a/polars/polars-core/src/chunked_array/upstream_traits.rs b/polars/polars-core/src/chunked_array/upstream_traits.rs index 19cb0484dc30..63ac8aead8bb 100644 --- a/polars/polars-core/src/chunked_array/upstream_traits.rs +++ b/polars/polars-core/src/chunked_array/upstream_traits.rs @@ -134,7 +134,7 @@ where } // FromIterator for BinaryChunked variants. - +#[cfg(feature = "dtype-binary")] impl FromIterator> for BinaryChunked where Ptr: AsRef<[u8]>, @@ -145,12 +145,19 @@ where } } +#[cfg(feature = "dtype-binary")] impl PolarsAsRef<[u8]> for Vec {} + +#[cfg(feature = "dtype-binary")] impl PolarsAsRef<[u8]> for &[u8] {} -// &["foo", "bar"] + +#[cfg(feature = "dtype-binary")] impl PolarsAsRef<[u8]> for &&[u8] {} + +#[cfg(feature = "dtype-binary")] impl<'a> PolarsAsRef<[u8]> for Cow<'a, [u8]> {} +#[cfg(feature = "dtype-binary")] impl FromIterator for BinaryChunked where Ptr: PolarsAsRef<[u8]>, diff --git a/polars/polars-core/src/datatypes/_serde.rs b/polars/polars-core/src/datatypes/_serde.rs index 8d5b47bae4ca..af984f1f8380 100644 --- a/polars/polars-core/src/datatypes/_serde.rs +++ b/polars/polars-core/src/datatypes/_serde.rs @@ -42,7 +42,7 @@ pub enum SerializableDataType { Float64, /// String data Utf8, - /// Raw binary data + #[cfg(feature = "dtype-binary")] Binary, /// A 32-bit date representing the elapsed time since UNIX epoch (1970-01-01) /// in days (32 bits). @@ -78,6 +78,7 @@ impl From<&DataType> for SerializableDataType { Float32 => Self::Float32, Float64 => Self::Float64, Utf8 => Self::Utf8, + #[cfg(feature = "dtype-binary")] Binary => Self::Binary, Date => Self::Date, Datetime(tu, tz) => Self::Datetime(*tu, tz.clone()), @@ -108,6 +109,7 @@ impl From for DataType { Float32 => Self::Float32, Float64 => Self::Float64, Utf8 => Self::Utf8, + #[cfg(feature = "dtype-binary")] Binary => Self::Binary, Date => Self::Date, Datetime(tu, tz) => Self::Datetime(tu, tz), diff --git a/polars/polars-core/src/datatypes/dtype.rs b/polars/polars-core/src/datatypes/dtype.rs index 52a4b9913d64..5c4b144c7da4 100644 --- a/polars/polars-core/src/datatypes/dtype.rs +++ b/polars/polars-core/src/datatypes/dtype.rs @@ -17,7 +17,7 @@ pub enum DataType { Float64, /// String data Utf8, - /// Raw binary data + #[cfg(feature = "dtype-binary")] Binary, /// A 32-bit date representing the elapsed time since UNIX epoch (1970-01-01) /// in days (32 bits). @@ -140,13 +140,14 @@ impl DataType { #[allow(clippy::match_like_matches_macro)] match self { DataType::Utf8 - | DataType::Binary | DataType::List(_) | DataType::Date | DataType::Datetime(_, _) | DataType::Duration(_) | DataType::Boolean | DataType::Null => false, + #[cfg(feature = "dtype-binary")] + DataType::Binary => false, #[cfg(feature = "object")] DataType::Object(_) => false, #[cfg(feature = "dtype-categorical")] @@ -191,6 +192,7 @@ impl DataType { Float32 => ArrowDataType::Float32, Float64 => ArrowDataType::Float64, Utf8 => ArrowDataType::LargeUtf8, + #[cfg(feature = "dtype-binary")] Binary => ArrowDataType::LargeBinary, Date => ArrowDataType::Date32, Datetime(unit, tz) => ArrowDataType::Timestamp(unit.to_arrow(), tz.clone()), @@ -243,6 +245,7 @@ impl Display for DataType { DataType::Float32 => "f32", DataType::Float64 => "f64", DataType::Utf8 => "str", + #[cfg(feature = "dtype-binary")] DataType::Binary => "binary", DataType::Date => "date", DataType::Datetime(tu, tz) => { diff --git a/polars/polars-core/src/datatypes/field.rs b/polars/polars-core/src/datatypes/field.rs index fbac10eada4e..014472124c69 100644 --- a/polars/polars-core/src/datatypes/field.rs +++ b/polars/polars-core/src/datatypes/field.rs @@ -132,6 +132,7 @@ impl From<&ArrowDataType> for DataType { ArrowDataType::Duration(tu) => DataType::Duration(tu.into()), ArrowDataType::Date64 => DataType::Datetime(TimeUnit::Milliseconds, None), ArrowDataType::LargeUtf8 | ArrowDataType::Utf8 => DataType::Utf8, + #[cfg(feature = "dtype-binary")] ArrowDataType::LargeBinary | ArrowDataType::Binary => DataType::Binary, ArrowDataType::Time64(_) | ArrowDataType::Time32(_) => DataType::Time, #[cfg(feature = "dtype-categorical")] diff --git a/polars/polars-core/src/datatypes/mod.rs b/polars/polars-core/src/datatypes/mod.rs index cb597ed97b9d..38ce033aea10 100644 --- a/polars/polars-core/src/datatypes/mod.rs +++ b/polars/polars-core/src/datatypes/mod.rs @@ -43,6 +43,7 @@ use crate::utils::Wrap; pub struct Utf8Type {} +#[cfg(feature = "dtype-binary")] pub struct BinaryType {} #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] @@ -88,6 +89,7 @@ impl PolarsDataType for Utf8Type { } } +#[cfg(feature = "dtype-binary")] impl PolarsDataType for BinaryType { fn get_dtype() -> DataType { DataType::Binary @@ -129,6 +131,8 @@ pub trait PolarsSingleType: PolarsDataType {} impl PolarsSingleType for T where T: NativeType + PolarsDataType {} impl PolarsSingleType for Utf8Type {} + +#[cfg(feature = "dtype-binary")] impl PolarsSingleType for BinaryType {} pub type ListChunked = ChunkedArray; @@ -144,6 +148,7 @@ pub type Int64Chunked = ChunkedArray; pub type Float32Chunked = ChunkedArray; pub type Float64Chunked = ChunkedArray; pub type Utf8Chunked = ChunkedArray; +#[cfg(feature = "dtype-binary")] pub type BinaryChunked = ChunkedArray; pub trait NumericNative: @@ -255,7 +260,7 @@ pub enum AnyValue<'a> { Boolean(bool), /// A UTF8 encoded string type. Utf8(&'a str), - /// A raw binary type + #[cfg(feature = "dtype-binary")] Binary(&'a [u8]), /// An unsigned 8-bit integer number. UInt8(u8), @@ -304,7 +309,7 @@ pub enum AnyValue<'a> { StructOwned(Box<(Vec>, Vec)>), /// A UTF8 encoded string type. Utf8Owned(String), - // A raw binary type + #[cfg(feature = "dtype-binary")] BinaryOwned(Vec), } @@ -334,7 +339,9 @@ impl Serialize for AnyValue<'_> { AnyValue::Utf8Owned(v) => { serializer.serialize_newtype_variant(name, 13, "Utf8Owned", v) } + #[cfg(feature = "dtype-binary")] AnyValue::Binary(v) => serializer.serialize_newtype_variant(name, 14, "BinaryOwned", v), + #[cfg(feature = "dtype-binary")] AnyValue::BinaryOwned(v) => { serializer.serialize_newtype_variant(name, 14, "BinaryOwned", v) } @@ -365,6 +372,7 @@ impl<'a> Deserialize<'a> for AnyValue<'static> { List, Bool, Utf8Owned, + #[cfg(feature = "dtype-binary")] BinaryOwned, } const VARIANTS: &[&str] = &[ @@ -384,7 +392,10 @@ impl<'a> Deserialize<'a> for AnyValue<'static> { "Utf8Owned", "BinaryOwned", ]; + #[cfg(feature = "dtype-binary")] const LAST: u8 = unsafe { std::mem::transmute::<_, u8>(AvField::BinaryOwned) }; + #[cfg(not(feature = "dtype-binary"))] + const LAST: u8 = unsafe { std::mem::transmute::<_, u8>(AvField::Utf8Owned) }; struct FieldVisitor; @@ -447,6 +458,7 @@ impl<'a> Deserialize<'a> for AnyValue<'static> { b"List" => AvField::List, b"Bool" => AvField::Bool, b"Utf8Owned" | b"Utf8" => AvField::Utf8Owned, + #[cfg(feature = "dtype-binary")] b"BinaryOwned" | b"Binary" => AvField::BinaryOwned, _ => { return Err(serde::de::Error::unknown_variant( @@ -535,6 +547,7 @@ impl<'a> Deserialize<'a> for AnyValue<'static> { let value = variant.newtype_variant()?; AnyValue::Utf8Owned(value) } + #[cfg(feature = "dtype-binary")] (AvField::BinaryOwned, variant) => { let value = variant.newtype_variant()?; AnyValue::BinaryOwned(value) @@ -598,7 +611,9 @@ impl<'a> Hash for AnyValue<'a> { UInt64(v) => state.write_u64(*v), Utf8(v) => state.write(v.as_bytes()), Utf8Owned(v) => state.write(v.as_bytes()), + #[cfg(feature = "dtype-binary")] Binary(v) => state.write(v), + #[cfg(feature = "dtype-binary")] BinaryOwned(v) => state.write(v), Boolean(v) => state.write_u8(*v as u8), List(v) => Hash::hash(&Wrap(v.clone()), state), @@ -750,7 +765,9 @@ impl<'a> AnyValue<'a> { List(v) => AnyValue::List(v), Utf8(v) => AnyValue::Utf8Owned(v.to_string()), Utf8Owned(v) => AnyValue::Utf8Owned(v), + #[cfg(feature = "dtype-binary")] Binary(v) => AnyValue::BinaryOwned(v.to_vec()), + #[cfg(feature = "dtype-binary")] BinaryOwned(v) => AnyValue::BinaryOwned(v), dt => { return Err(PolarsError::ComputeError( @@ -780,7 +797,9 @@ impl PartialEq for AnyValue<'_> { fn eq(&self, other: &Self) -> bool { use AnyValue::*; match (self, other) { + #[cfg(feature = "dtype-binary")] (BinaryOwned(l), BinaryOwned(r)) => l == r, + #[cfg(feature = "dtype-binary")] (Binary(l), Binary(r)) => l == r, (Utf8Owned(l), Utf8Owned(r)) => l == r, (Utf8(l), Utf8(r)) => l == r, @@ -840,7 +859,9 @@ impl PartialOrd for AnyValue<'_> { (Float64(l), Float64(r)) => l.partial_cmp(r), (Utf8(l), Utf8(r)) => l.partial_cmp(r), (Utf8Owned(l), Utf8Owned(r)) => l.partial_cmp(r), + #[cfg(feature = "dtype-binary")] (Binary(l), Binary(r)) => l.partial_cmp(r), + #[cfg(feature = "dtype-binary")] (BinaryOwned(l), BinaryOwned(r)) => l.partial_cmp(r), _ => None, } @@ -975,7 +996,9 @@ mod test { ), (ArrowDataType::LargeUtf8, DataType::Utf8), (ArrowDataType::Utf8, DataType::Utf8), + #[cfg(feature = "dtype-binary")] (ArrowDataType::LargeBinary, DataType::Binary), + #[cfg(feature = "dtype-binary")] (ArrowDataType::Binary, DataType::Binary), ( ArrowDataType::Time64(ArrowTimeUnit::Nanosecond), diff --git a/polars/polars-core/src/fmt.rs b/polars/polars-core/src/fmt.rs index d59882e3ea5a..5f3c8db37753 100644 --- a/polars/polars-core/src/fmt.rs +++ b/polars/polars-core/src/fmt.rs @@ -140,6 +140,7 @@ impl Debug for Utf8Chunked { } } +#[cfg(feature = "dtype-binary")] impl Debug for BinaryChunked { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { format_array!(f, self, "binary", self.name(), "ChunkedArray") @@ -657,6 +658,7 @@ impl Display for AnyValue<'_> { AnyValue::Boolean(v) => write!(f, "{}", *v), AnyValue::Utf8(v) => write!(f, "{}", format_args!("\"{}\"", v)), AnyValue::Utf8Owned(v) => write!(f, "{}", format_args!("\"{}\"", v)), + #[cfg(feature = "dtype-binary")] AnyValue::Binary(_) | AnyValue::BinaryOwned(_) => write!(f, "[binary data]"), #[cfg(feature = "dtype-date")] AnyValue::Date(v) => write!(f, "{}", date32_to_date(*v)), @@ -771,6 +773,7 @@ impl FmtList for Utf8Chunked { } } +#[cfg(feature = "dtype-binary")] impl FmtList for BinaryChunked { fn fmt_list(&self) -> String { impl_fmt_list!(self) diff --git a/polars/polars-core/src/frame/groupby/aggregations/agg_list.rs b/polars/polars-core/src/frame/groupby/aggregations/agg_list.rs index 696b974092a6..13d7959b356d 100644 --- a/polars/polars-core/src/frame/groupby/aggregations/agg_list.rs +++ b/polars/polars-core/src/frame/groupby/aggregations/agg_list.rs @@ -202,6 +202,7 @@ impl AggList for Utf8Chunked { } } +#[cfg(feature = "dtype-binary")] impl AggList for BinaryChunked { unsafe fn agg_list(&self, groups: &GroupsProxy) -> Series { match groups { diff --git a/polars/polars-core/src/frame/groupby/into_groups.rs b/polars/polars-core/src/frame/groupby/into_groups.rs index 9411e5b27ebf..2a78526a6994 100644 --- a/polars/polars-core/src/frame/groupby/into_groups.rs +++ b/polars/polars-core/src/frame/groupby/into_groups.rs @@ -285,6 +285,7 @@ impl IntoGroupsProxy for Utf8Chunked { } } +#[cfg(feature = "dtype-binary")] impl IntoGroupsProxy for BinaryChunked { #[allow(clippy::needless_lifetimes)] fn group_tuples<'a>(&'a self, multithreaded: bool, sorted: bool) -> PolarsResult { diff --git a/polars/polars-core/src/frame/hash_join/mod.rs b/polars/polars-core/src/frame/hash_join/mod.rs index ea61997461ea..71630d4d129c 100644 --- a/polars/polars-core/src/frame/hash_join/mod.rs +++ b/polars/polars-core/src/frame/hash_join/mod.rs @@ -233,6 +233,7 @@ macro_rules! impl_zip_outer_join { } impl_zip_outer_join!(BooleanChunked); impl_zip_outer_join!(Utf8Chunked); +#[cfg(feature = "dtype-binary")] impl_zip_outer_join!(BinaryChunked); impl ZipOuterJoinColumn for Float32Chunked { diff --git a/polars/polars-core/src/frame/hash_join/single_keys_dispatch.rs b/polars/polars-core/src/frame/hash_join/single_keys_dispatch.rs index 1016807267ce..4f4e3a368ade 100644 --- a/polars/polars-core/src/frame/hash_join/single_keys_dispatch.rs +++ b/polars/polars-core/src/frame/hash_join/single_keys_dispatch.rs @@ -15,6 +15,7 @@ impl Series { let rhs = rhs.utf8().unwrap(); lhs.hash_join_left(rhs) } + #[cfg(feature = "dtype-binary")] Binary => { let lhs = lhs.binary().unwrap(); let rhs = rhs.binary().unwrap(); @@ -45,6 +46,7 @@ impl Series { let rhs = rhs.utf8().unwrap(); lhs.hash_join_semi_anti(rhs, anti) } + #[cfg(feature = "dtype-binary")] Binary => { let lhs = lhs.binary().unwrap(); let rhs = rhs.binary().unwrap(); @@ -75,6 +77,7 @@ impl Series { let rhs = rhs.utf8().unwrap(); lhs.hash_join_inner(rhs) } + #[cfg(feature = "dtype-binary")] Binary => { let lhs = lhs.binary().unwrap(); let rhs = rhs.binary().unwrap(); @@ -107,6 +110,7 @@ impl Series { let rhs = rhs.utf8().unwrap(); lhs.hash_join_outer(rhs) } + #[cfg(feature = "dtype-binary")] Binary => { let lhs = lhs.binary().unwrap(); let rhs = rhs.binary().unwrap(); @@ -429,6 +433,7 @@ impl Utf8Chunked { } } +#[cfg(feature = "dtype-binary")] pub(crate) fn prepare_bytes<'a>( been_split: &'a [BinaryChunked], hb: &RandomState, @@ -450,6 +455,7 @@ pub(crate) fn prepare_bytes<'a>( }) } +#[cfg(feature = "dtype-binary")] impl BinaryChunked { fn prepare( &self, diff --git a/polars/polars-core/src/named_from.rs b/polars/polars-core/src/named_from.rs index 92d4c45668ad..d843275f1d05 100644 --- a/polars/polars-core/src/named_from.rs +++ b/polars/polars-core/src/named_from.rs @@ -63,6 +63,7 @@ macro_rules! impl_named_from { } impl_named_from!([String], Utf8Type, from_slice); +#[cfg(feature = "dtype-binary")] impl_named_from!([Vec], BinaryType, from_slice); impl_named_from!([bool], BooleanType, from_slice); #[cfg(feature = "dtype-u8")] @@ -80,6 +81,7 @@ impl_named_from!([i64], Int64Type, from_slice); impl_named_from!([f32], Float32Type, from_slice); impl_named_from!([f64], Float64Type, from_slice); impl_named_from!([Option], Utf8Type, from_slice_options); +#[cfg(feature = "dtype-binary")] impl_named_from!([Option>], BinaryType, from_slice_options); impl_named_from!([Option], BooleanType, from_slice_options); #[cfg(feature = "dtype-u8")] @@ -227,12 +229,14 @@ impl<'a, T: AsRef<[Option>]>> NamedFrom>]> } } +#[cfg(feature = "dtype-binary")] impl<'a, T: AsRef<[&'a [u8]]>> NamedFrom for Series { fn new(name: &str, v: T) -> Self { BinaryChunked::from_slice(name, v.as_ref()).into_series() } } +#[cfg(feature = "dtype-binary")] impl NamedFrom<&Series, [u8]> for Series { fn new(name: &str, s: &Series) -> Self { let mut s = s.clone(); @@ -241,24 +245,28 @@ impl NamedFrom<&Series, [u8]> for Series { } } +#[cfg(feature = "dtype-binary")] impl<'a, T: AsRef<[&'a [u8]]>> NamedFrom for BinaryChunked { fn new(name: &str, v: T) -> Self { BinaryChunked::from_slice(name, v.as_ref()) } } +#[cfg(feature = "dtype-binary")] impl<'a, T: AsRef<[Option<&'a [u8]>]>> NamedFrom]> for Series { fn new(name: &str, v: T) -> Self { BinaryChunked::from_slice_options(name, v.as_ref()).into_series() } } +#[cfg(feature = "dtype-binary")] impl<'a, T: AsRef<[Option<&'a [u8]>]>> NamedFrom]> for BinaryChunked { fn new(name: &str, v: T) -> Self { BinaryChunked::from_slice_options(name, v.as_ref()) } } +#[cfg(feature = "dtype-binary")] impl<'a, T: AsRef<[Cow<'a, [u8]>]>> NamedFrom]> for Series { fn new(name: &str, v: T) -> Self { BinaryChunked::from_iter_values(name, v.as_ref().iter().map(|value| value.as_ref())) @@ -266,18 +274,21 @@ impl<'a, T: AsRef<[Cow<'a, [u8]>]>> NamedFrom]> for Series { } } +#[cfg(feature = "dtype-binary")] impl<'a, T: AsRef<[Cow<'a, [u8]>]>> NamedFrom]> for BinaryChunked { fn new(name: &str, v: T) -> Self { BinaryChunked::from_iter_values(name, v.as_ref().iter().map(|value| value.as_ref())) } } +#[cfg(feature = "dtype-binary")] impl<'a, T: AsRef<[Option>]>> NamedFrom>]> for Series { fn new(name: &str, v: T) -> Self { BinaryChunked::new(name, v).into_series() } } +#[cfg(feature = "dtype-binary")] impl<'a, T: AsRef<[Option>]>> NamedFrom>]> for BinaryChunked { diff --git a/polars/polars-core/src/prelude.rs b/polars/polars-core/src/prelude.rs index 2b787efe6cb6..97c7dd48b2ae 100644 --- a/polars/polars-core/src/prelude.rs +++ b/polars/polars-core/src/prelude.rs @@ -9,9 +9,12 @@ pub use polars_arrow::kernels::ewm::EWMOptions; pub use polars_arrow::prelude::*; pub(crate) use polars_arrow::trusted_len::TrustedLen; +#[cfg(feature = "dtype-binary")] pub use crate::chunked_array::builder::{ - BinaryChunkedBuilder, BooleanChunkedBuilder, ChunkedBuilder, ListBinaryChunkedBuilder, - ListBooleanChunkedBuilder, ListBuilderTrait, ListPrimitiveChunkedBuilder, + BinaryChunkedBuilder, BooleanChunkedBuilder, ListBinaryChunkedBuilder, +}; +pub use crate::chunked_array::builder::{ + ChunkedBuilder, ListBooleanChunkedBuilder, ListBuilderTrait, ListPrimitiveChunkedBuilder, ListUtf8ChunkedBuilder, NewChunkedArray, PrimitiveChunkedBuilder, Utf8ChunkedBuilder, }; pub use crate::chunked_array::iterator::PolarsIterator; diff --git a/polars/polars-core/src/series/any_value.rs b/polars/polars-core/src/series/any_value.rs index 1e26e0da698e..e2a470acb4ab 100644 --- a/polars/polars-core/src/series/any_value.rs +++ b/polars/polars-core/src/series/any_value.rs @@ -16,6 +16,7 @@ fn any_values_to_utf8(avs: &[AnyValue]) -> Utf8Chunked { .collect_trusted() } +#[cfg(feature = "dtype-binary")] fn any_values_to_binary(avs: &[AnyValue]) -> BinaryChunked { avs.iter() .map(|av| match av { @@ -91,6 +92,7 @@ impl Series { DataType::Float32 => any_values_to_primitive::(av).into_series(), DataType::Float64 => any_values_to_primitive::(av).into_series(), DataType::Utf8 => any_values_to_utf8(av).into_series(), + #[cfg(feature = "dtype-binary")] DataType::Binary => any_values_to_binary(av).into_series(), DataType::Boolean => any_values_to_bool(av).into_series(), #[cfg(feature = "dtype-date")] @@ -164,6 +166,7 @@ impl<'a> From<&AnyValue<'a>> for DataType { Null => DataType::Null, Boolean(_) => DataType::Boolean, Utf8(_) | Utf8Owned(_) => DataType::Utf8, + #[cfg(feature = "dtype-binary")] Binary(_) | BinaryOwned(_) => DataType::Binary, UInt32(_) => DataType::UInt32, UInt64(_) => DataType::UInt64, diff --git a/polars/polars-core/src/series/arithmetic/borrowed.rs b/polars/polars-core/src/series/arithmetic/borrowed.rs index 6df1227248ff..41b442edbb12 100644 --- a/polars/polars-core/src/series/arithmetic/borrowed.rs +++ b/polars/polars-core/src/series/arithmetic/borrowed.rs @@ -101,6 +101,7 @@ impl NumOpsDispatch for Utf8Chunked { } } +#[cfg(feature = "dtype-binary")] impl NumOpsDispatch for BinaryChunked { fn add_to(&self, rhs: &Series) -> PolarsResult { let rhs = self.unpack_series_matching_type(rhs)?; diff --git a/polars/polars-core/src/series/comparison.rs b/polars/polars-core/src/series/comparison.rs index ab3e8db48d71..84abe446231a 100644 --- a/polars/polars-core/src/series/comparison.rs +++ b/polars/polars-core/src/series/comparison.rs @@ -22,6 +22,7 @@ macro_rules! impl_compare { match lhs.dtype() { DataType::Boolean => lhs.bool().unwrap().$method(rhs.bool().unwrap()), DataType::Utf8 => lhs.utf8().unwrap().$method(rhs.utf8().unwrap()), + #[cfg(feature = "dtype-binary")] DataType::Binary => lhs.binary().unwrap().$method(rhs.binary().unwrap()), DataType::UInt8 => lhs.u8().unwrap().$method(rhs.u8().unwrap()), DataType::UInt16 => lhs.u16().unwrap().$method(rhs.u16().unwrap()), diff --git a/polars/polars-core/src/series/from.rs b/polars/polars-core/src/series/from.rs index 0c0b50781129..82813165aeb2 100644 --- a/polars/polars-core/src/series/from.rs +++ b/polars/polars-core/src/series/from.rs @@ -61,6 +61,7 @@ impl Series { .into_series(), List(_) => ListChunked::from_chunks(name, chunks).cast(dtype).unwrap(), Utf8 => Utf8Chunked::from_chunks(name, chunks).into_series(), + #[cfg(feature = "dtype-binary")] Binary => BinaryChunked::from_chunks(name, chunks).into_series(), #[cfg(feature = "dtype-categorical")] Categorical(rev_map) => { @@ -96,13 +97,43 @@ impl Series { let chunks = cast_chunks(&chunks, &DataType::Utf8, false).unwrap(); Ok(Utf8Chunked::from_chunks(name, chunks).into_series()) } + #[cfg(feature = "dtype-binary")] ArrowDataType::LargeBinary => { Ok(BinaryChunked::from_chunks(name, chunks).into_series()) } + #[cfg(feature = "dtype-binary")] ArrowDataType::Binary => { let chunks = cast_chunks(&chunks, &DataType::Binary, false).unwrap(); Ok(BinaryChunked::from_chunks(name, chunks).into_series()) } + #[cfg(all(feature = "dtype-u8", not(feature = "dtype-binary")))] + ArrowDataType::LargeBinary | ArrowDataType::Binary => { + let chunks = chunks + .iter() + .map(|arr| { + let arr = cast(&**arr, &ArrowDataType::LargeBinary).unwrap(); + + let arr = arr.as_any().downcast_ref::>().unwrap(); + let values = arr.values().clone(); + let offsets = arr.offsets().clone(); + let validity = arr.validity().cloned(); + + let values = Box::new(PrimitiveArray::from_data( + ArrowDataType::UInt8, + values, + None, + )); + + let dtype = ListArray::::default_datatype(ArrowDataType::UInt8); + // Safety: + // offsets are monotonically increasing + Box::new(ListArray::::new_unchecked( + dtype, offsets, values, validity, + )) as ArrayRef + }) + .collect(); + Ok(ListChunked::from_chunks(name, chunks).into()) + } ArrowDataType::List(_) | ArrowDataType::LargeList(_) => { let chunks = chunks.iter().map(convert_inner_types).collect(); Ok(ListChunked::from_chunks(name, chunks).into_series()) diff --git a/polars/polars-core/src/series/implementations/mod.rs b/polars/polars-core/src/series/implementations/mod.rs index a653ce365368..3a272773b6b4 100644 --- a/polars/polars-core/src/series/implementations/mod.rs +++ b/polars/polars-core/src/series/implementations/mod.rs @@ -1,3 +1,4 @@ +#[cfg(feature = "dtype-binary")] mod binary; mod boolean; #[cfg(feature = "dtype-categorical")] @@ -568,6 +569,7 @@ impl private::PrivateSeriesNumeric for SeriesWrap {} +#[cfg(feature = "dtype-binary")] impl private::PrivateSeriesNumeric for SeriesWrap {} impl private::PrivateSeriesNumeric for SeriesWrap {} impl private::PrivateSeriesNumeric for SeriesWrap { diff --git a/polars/polars-core/src/series/ops/downcast.rs b/polars/polars-core/src/series/ops/downcast.rs index 77891414e7f8..b295523c097e 100644 --- a/polars/polars-core/src/series/ops/downcast.rs +++ b/polars/polars-core/src/series/ops/downcast.rs @@ -159,6 +159,7 @@ impl Series { } /// Unpack to ChunkedArray of dtype binary + #[cfg(feature = "dtype-binary")] pub fn binary(&self) -> PolarsResult<&BinaryChunked> { match self.dtype() { DataType::Binary => unsafe { diff --git a/polars/polars-core/src/series/ops/null.rs b/polars/polars-core/src/series/ops/null.rs index 1741606648d2..8c5b13aeddfc 100644 --- a/polars/polars-core/src/series/ops/null.rs +++ b/polars/polars-core/src/series/ops/null.rs @@ -50,11 +50,18 @@ impl Series { ChunkedArray::::full_null(name, size).into_series() }}; } + #[cfg(feature = "dtype-binary")] macro_rules! binary { () => {{ ChunkedArray::::full_null(name, size).into_series() }}; } + #[cfg(not(feature = "dtype-binary"))] + macro_rules! binary { + () => {{ + unreachable!(); + }}; + } match_dtype_to_logical_apply_macro!(dtype, primitive, utf8, binary, bool) } } diff --git a/polars/polars-core/src/utils/mod.rs b/polars/polars-core/src/utils/mod.rs index 3ba1c434b3d4..a011201e7bb3 100644 --- a/polars/polars-core/src/utils/mod.rs +++ b/polars/polars-core/src/utils/mod.rs @@ -260,6 +260,7 @@ macro_rules! match_dtype_to_logical_apply_macro { ($obj:expr, $macro:ident, $macro_utf8:ident, $macro_binary:ident, $macro_bool:ident $(, $opt_args:expr)*) => {{ match $obj { DataType::Utf8 => $macro_utf8!($($opt_args)*), + #[cfg(feature = "dtype-binary")] DataType::Binary => $macro_binary!($($opt_args)*), DataType::Boolean => $macro_bool!($($opt_args)*), #[cfg(feature = "dtype-u8")] diff --git a/polars/polars-core/src/utils/supertype.rs b/polars/polars-core/src/utils/supertype.rs index 9e151dc4bb17..9867646786bd 100644 --- a/polars/polars-core/src/utils/supertype.rs +++ b/polars/polars-core/src/utils/supertype.rs @@ -206,8 +206,12 @@ pub fn get_supertype(l: &DataType, r: &DataType) -> Option { (Date, Time) => Some(Int64), // every known type can be casted to a string except binary + #[cfg(feature = "dtype-binary")] (dt, Utf8) if dt != &DataType::Unknown && dt != &DataType::Binary => Some(Utf8), + #[cfg(not(feature = "dtype-binary"))] + (dt, Utf8) if dt != &DataType::Unknown => Some(Utf8), + (dt, Null) => Some(dt.clone()), #[cfg(all(feature = "dtype-duration", feature = "dtype-datetime"))] diff --git a/polars/polars-core/src/vector_hasher.rs b/polars/polars-core/src/vector_hasher.rs index 9db7b14958d7..75e28d994d59 100644 --- a/polars/polars-core/src/vector_hasher.rs +++ b/polars/polars-core/src/vector_hasher.rs @@ -147,6 +147,7 @@ impl VecHash for Utf8Chunked { } } +#[cfg(feature = "dtype-binary")] impl VecHash for BinaryChunked { fn vec_hash(&self, random_state: RandomState) -> Vec { let null_h = get_null_hash_value(random_state.clone()); diff --git a/polars/polars-io/Cargo.toml b/polars/polars-io/Cargo.toml index 4829cb7286fc..0e130625f319 100644 --- a/polars/polars-io/Cargo.toml +++ b/polars/polars-io/Cargo.toml @@ -27,6 +27,7 @@ dtype-date = ["polars-core/dtype-date", "polars-time/dtype-date"] dtype-datetime = ["polars-core/dtype-datetime", "polars-core/temporal", "polars-time/dtype-datetime"] dtype-time = ["polars-core/dtype-time", "polars-core/temporal", "polars-time/dtype-time"] dtype-struct = ["polars-core/dtype-struct"] +dtype-binary = ["polars-core/dtype-binary"] fmt = ["polars-core/fmt"] lazy = [] parquet = ["polars-core/parquet", "arrow/io_parquet", "arrow/io_parquet_compression", "memmap"] diff --git a/polars/polars-lazy/Cargo.toml b/polars/polars-lazy/Cargo.toml index 60025e80b447..5217d65701e2 100644 --- a/polars/polars-lazy/Cargo.toml +++ b/polars/polars-lazy/Cargo.toml @@ -45,6 +45,7 @@ dtype-duration = ["polars-plan/dtype-duration", "polars-time/dtype-duration", "t dtype-time = ["polars-core/dtype-time", "temporal"] dtype-categorical = ["polars-plan/dtype-categorical"] dtype-struct = ["polars-plan/dtype-struct"] +dtype-binary = ["polars-plan/dtype-binary"] object = ["polars-plan/object"] date_offset = ["polars-plan/date_offset"] trigonometry = ["polars-plan/trigonometry"] diff --git a/polars/polars-lazy/polars-plan/Cargo.toml b/polars/polars-lazy/polars-plan/Cargo.toml index 362956f24e9a..364c253e2ad4 100644 --- a/polars/polars-lazy/polars-plan/Cargo.toml +++ b/polars/polars-lazy/polars-plan/Cargo.toml @@ -42,6 +42,7 @@ dtype-datetime = ["polars-core/dtype-datetime", "polars-time/dtype-datetime", "t dtype-duration = ["polars-core/dtype-duration", "polars-time/dtype-duration", "temporal"] dtype-categorical = ["polars-core/dtype-categorical"] dtype-struct = ["polars-core/dtype-struct"] +dtype-binary = ["polars-core/dtype-binary"] object = ["polars-core/object"] date_offset = ["polars-time"] trigonometry = [] diff --git a/polars/polars-lazy/polars-plan/src/logical_plan/format.rs b/polars/polars-lazy/polars-plan/src/logical_plan/format.rs index 01e822ea94d0..ccfb26ad2d28 100644 --- a/polars/polars-lazy/polars-plan/src/logical_plan/format.rs +++ b/polars/polars-lazy/polars-plan/src/logical_plan/format.rs @@ -386,6 +386,7 @@ impl Debug for LiteralValue { Null => write!(f, "null"), Boolean(b) => write!(f, "{}", b), Utf8(s) => write!(f, "{}", s), + #[cfg(feature = "dtype-binary")] Binary(_) => write!(f, "[binary value]"), #[cfg(feature = "dtype-u8")] UInt8(v) => write!(f, "{}u8", v), diff --git a/polars/polars-lazy/polars-plan/src/logical_plan/lit.rs b/polars/polars-lazy/polars-plan/src/logical_plan/lit.rs index 50b73fd0f078..29df32fddbdc 100644 --- a/polars/polars-lazy/polars-plan/src/logical_plan/lit.rs +++ b/polars/polars-lazy/polars-plan/src/logical_plan/lit.rs @@ -15,6 +15,7 @@ pub enum LiteralValue { /// A UTF8 encoded string type. Utf8(String), /// A raw binary array + #[cfg(feature = "dtype-binary")] Binary(Vec), /// An unsigned 8-bit integer number. #[cfg(feature = "dtype-u8")] @@ -99,6 +100,7 @@ impl LiteralValue { LiteralValue::Float32(_) => DataType::Float32, LiteralValue::Float64(_) => DataType::Float64, LiteralValue::Utf8(_) => DataType::Utf8, + #[cfg(feature = "dtype-binary")] LiteralValue::Binary(_) => DataType::Binary, LiteralValue::Range { data_type, .. } => data_type.clone(), #[cfg(all(feature = "temporal", feature = "dtype-datetime"))] @@ -128,12 +130,14 @@ impl<'a> Literal for &'a str { } } +#[cfg(feature = "dtype-binary")] impl Literal for Vec { fn lit(self) -> Expr { Expr::Literal(LiteralValue::Binary(self)) } } +#[cfg(feature = "dtype-binary")] impl<'a> Literal for &'a [u8] { fn lit(self) -> Expr { Expr::Literal(LiteralValue::Binary(self.to_vec())) @@ -147,6 +151,7 @@ impl TryFrom> for LiteralValue { AnyValue::Null => Ok(Self::Null), AnyValue::Boolean(b) => Ok(Self::Boolean(b)), AnyValue::Utf8(s) => Ok(Self::Utf8(s.to_string())), + #[cfg(feature = "dtype-binary")] AnyValue::Binary(b) => Ok(Self::Binary(b.to_vec())), #[cfg(feature = "dtype-u8")] AnyValue::UInt8(u) => Ok(Self::UInt8(u)), diff --git a/polars/polars-lazy/src/physical_plan/expressions/literal.rs b/polars/polars-lazy/src/physical_plan/expressions/literal.rs index 55d61ae54fbe..9d16fd69deda 100644 --- a/polars/polars-lazy/src/physical_plan/expressions/literal.rs +++ b/polars/polars-lazy/src/physical_plan/expressions/literal.rs @@ -74,6 +74,7 @@ impl PhysicalExpr for LiteralExpr { } }, Utf8(v) => Utf8Chunked::full("literal", v, 1).into_series(), + #[cfg(feature = "dtype-binary")] Binary(v) => BinaryChunked::full("literal", v, 1).into_series(), #[cfg(feature = "temporal")] DateTime(ndt, tu) => { diff --git a/polars/polars-ops/Cargo.toml b/polars/polars-ops/Cargo.toml index 8fdc0ff1f133..92de47c33829 100644 --- a/polars/polars-ops/Cargo.toml +++ b/polars/polars-ops/Cargo.toml @@ -20,6 +20,7 @@ dtype-datetime = ["polars-core/dtype-datetime", "polars-core/temporal"] dtype-time = ["polars-core/dtype-time", "polars-core/temporal"] dtype-duration = ["polars-core/dtype-duration", "polars-core/temporal"] dtype-struct = ["polars-core/dtype-struct", "polars-core/temporal"] +dtype-binary = ["polars-core/dtype-binary", "polars-core/dtype-binary"] dtype-u8 = ["polars-core/dtype-u8"] object = ["polars-core/object"] propagate_nans = [] From 62fa394a5555b29058e86423db30d0c6f4fbb6bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=96zg=C3=BCr=20Akkurt?= Date: Fri, 30 Sep 2022 01:55:34 +0300 Subject: [PATCH 16/22] fix compilation --- polars/polars-core/src/chunked_array/ops/fill_null.rs | 1 + polars/polars-core/src/chunked_array/trusted_len.rs | 1 + polars/polars-core/src/prelude.rs | 9 ++++----- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/polars/polars-core/src/chunked_array/ops/fill_null.rs b/polars/polars-core/src/chunked_array/ops/fill_null.rs index c4470f4354f4..2ed7723c3895 100644 --- a/polars/polars-core/src/chunked_array/ops/fill_null.rs +++ b/polars/polars-core/src/chunked_array/ops/fill_null.rs @@ -402,6 +402,7 @@ impl ChunkFillNull for BinaryChunked { } } +#[cfg(feature = "dtype-binary")] impl ChunkFillNullValue<&[u8]> for BinaryChunked { fn fill_null_with_values(&self, value: &[u8]) -> PolarsResult { self.set(&self.is_null(), Some(value)) diff --git a/polars/polars-core/src/chunked_array/trusted_len.rs b/polars/polars-core/src/chunked_array/trusted_len.rs index 134e5eba2170..5ae068743f13 100644 --- a/polars/polars-core/src/chunked_array/trusted_len.rs +++ b/polars/polars-core/src/chunked_array/trusted_len.rs @@ -214,6 +214,7 @@ where } } +#[cfg(feature = "dtype-binary")] impl FromTrustedLenIterator> for BinaryChunked where Ptr: AsRef<[u8]>, diff --git a/polars/polars-core/src/prelude.rs b/polars/polars-core/src/prelude.rs index 97c7dd48b2ae..2d302e2b4224 100644 --- a/polars/polars-core/src/prelude.rs +++ b/polars/polars-core/src/prelude.rs @@ -10,12 +10,11 @@ pub use polars_arrow::prelude::*; pub(crate) use polars_arrow::trusted_len::TrustedLen; #[cfg(feature = "dtype-binary")] +pub use crate::chunked_array::builder::{BinaryChunkedBuilder, ListBinaryChunkedBuilder}; pub use crate::chunked_array::builder::{ - BinaryChunkedBuilder, BooleanChunkedBuilder, ListBinaryChunkedBuilder, -}; -pub use crate::chunked_array::builder::{ - ChunkedBuilder, ListBooleanChunkedBuilder, ListBuilderTrait, ListPrimitiveChunkedBuilder, - ListUtf8ChunkedBuilder, NewChunkedArray, PrimitiveChunkedBuilder, Utf8ChunkedBuilder, + BooleanChunkedBuilder, ChunkedBuilder, ListBooleanChunkedBuilder, ListBuilderTrait, + ListPrimitiveChunkedBuilder, ListUtf8ChunkedBuilder, NewChunkedArray, PrimitiveChunkedBuilder, + Utf8ChunkedBuilder, }; pub use crate::chunked_array::iterator::PolarsIterator; #[cfg(feature = "dtype-categorical")] From 59522e842a43284c3d4326aa75097e03faecd937 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=96zg=C3=BCr=20Akkurt?= Date: Fri, 30 Sep 2022 02:13:56 +0300 Subject: [PATCH 17/22] fix test compilation --- .../src/chunked_array/ops/sort/categorical.rs | 6 ++++-- polars/polars-core/src/named_from.rs | 9 --------- polars/polars-core/src/series/from.rs | 4 ++-- polars/src/lib.rs | 1 + 4 files changed, 7 insertions(+), 13 deletions(-) diff --git a/polars/polars-core/src/chunked_array/ops/sort/categorical.rs b/polars/polars-core/src/chunked_array/ops/sort/categorical.rs index 137008f2cf89..bbf56ef8f816 100644 --- a/polars/polars-core/src/chunked_array/ops/sort/categorical.rs +++ b/polars/polars-core/src/chunked_array/ops/sort/categorical.rs @@ -188,11 +188,13 @@ mod test { toggle_string_cache(toggle); let s = Series::new("", init).cast(&DataType::Categorical(None))?; let ca = s.categorical()?; - let mut ca_lexical = ca.clone(); + let mut ca_lexical: CategoricalChunked = ca.clone(); ca_lexical.set_lexical_sorted(true); + let series = ca_lexical.into_series(); + let df = df![ - "cat" => &ca_lexical.into_series(), + "cat" => &series, "vals" => [1, 1, 2, 2] ]?; diff --git a/polars/polars-core/src/named_from.rs b/polars/polars-core/src/named_from.rs index d843275f1d05..d19ee237d649 100644 --- a/polars/polars-core/src/named_from.rs +++ b/polars/polars-core/src/named_from.rs @@ -236,15 +236,6 @@ impl<'a, T: AsRef<[&'a [u8]]>> NamedFrom for Series { } } -#[cfg(feature = "dtype-binary")] -impl NamedFrom<&Series, [u8]> for Series { - fn new(name: &str, s: &Series) -> Self { - let mut s = s.clone(); - s.rename(name); - s - } -} - #[cfg(feature = "dtype-binary")] impl<'a, T: AsRef<[&'a [u8]]>> NamedFrom for BinaryChunked { fn new(name: &str, v: T) -> Self { diff --git a/polars/polars-core/src/series/from.rs b/polars/polars-core/src/series/from.rs index 82813165aeb2..76e6b9dc33c5 100644 --- a/polars/polars-core/src/series/from.rs +++ b/polars/polars-core/src/series/from.rs @@ -536,11 +536,11 @@ impl IntoSeries for Series { #[cfg(test)] mod test { - #[cfg(feature = "dtype-u8")] + #[cfg(all(feature = "dtype-u8", not(feature = "dtype-binary")))] use super::*; #[test] - #[cfg(feature = "dtype-u8")] + #[cfg(all(feature = "dtype-u8", not(feature = "dtype-binary")))] fn test_binary_to_list() { let iter = std::iter::repeat(b"hello").take(2).map(Some); let a = Box::new(iter.collect::>()) as ArrayRef; diff --git a/polars/src/lib.rs b/polars/src/lib.rs index 9371e4689120..9f055f08d483 100644 --- a/polars/src/lib.rs +++ b/polars/src/lib.rs @@ -275,6 +275,7 @@ //! | UInt16 | dtype-u16 | //! | Categorical | dtype-categorical | //! | Struct | dtype-struct | +//! | Binary | dtype-binary | //! //! //! Or you can choose on of the preconfigured pre-sets. From 6791e84f3b4f83fc8f07761d49f7b772b2811569 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=96zg=C3=BCr=20Akkurt?= Date: Fri, 30 Sep 2022 02:26:19 +0300 Subject: [PATCH 18/22] fix clippy warnings --- polars/polars-core/src/chunked_array/builder/list.rs | 6 ------ polars/polars-core/src/series/ops/null.rs | 6 ------ 2 files changed, 12 deletions(-) diff --git a/polars/polars-core/src/chunked_array/builder/list.rs b/polars/polars-core/src/chunked_array/builder/list.rs index 07b964b4cf7f..8d68dd7cb909 100644 --- a/polars/polars-core/src/chunked_array/builder/list.rs +++ b/polars/polars-core/src/chunked_array/builder/list.rs @@ -482,12 +482,6 @@ pub fn get_list_builder( Box::new(builder) }}; } - #[cfg(not(feature = "dtype-binary"))] - macro_rules! get_binary_builder { - () => {{ - unreachable!(); - }}; - } Ok(match_dtype_to_logical_apply_macro!( physical_type, get_primitive_builder, diff --git a/polars/polars-core/src/series/ops/null.rs b/polars/polars-core/src/series/ops/null.rs index 8c5b13aeddfc..c9e41a409ee1 100644 --- a/polars/polars-core/src/series/ops/null.rs +++ b/polars/polars-core/src/series/ops/null.rs @@ -56,12 +56,6 @@ impl Series { ChunkedArray::::full_null(name, size).into_series() }}; } - #[cfg(not(feature = "dtype-binary"))] - macro_rules! binary { - () => {{ - unreachable!(); - }}; - } match_dtype_to_logical_apply_macro!(dtype, primitive, utf8, binary, bool) } } From 5c9afe9e6232775c9b7cf66073979fbad4979d7c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=96zg=C3=BCr=20Akkurt?= Date: Fri, 30 Sep 2022 02:36:10 +0300 Subject: [PATCH 19/22] fix clippy warnings --- py-polars/src/conversion.rs | 29 +++++++++++++++++++++++++++++ py-polars/src/datatypes.rs | 3 +++ 2 files changed, 32 insertions(+) diff --git a/py-polars/src/conversion.rs b/py-polars/src/conversion.rs index dfb63716f481..940b6d013689 100644 --- a/py-polars/src/conversion.rs +++ b/py-polars/src/conversion.rs @@ -136,6 +136,22 @@ impl<'a> FromPyObject<'a> for Wrap { } } +impl<'a> FromPyObject<'a> for Wrap { + fn extract(obj: &'a PyAny) -> PyResult { + let (seq, len) = get_pyseq(obj)?; + let mut builder = BinaryChunkedBuilder::new("", len, len * 25); + + for res in seq.iter()? { + let item = res?; + match item.extract::<&str>() { + Ok(val) => builder.append_value(val), + Err(_) => builder.append_null(), + } + } + Ok(Wrap(builder.finish())) + } +} + impl<'a> FromPyObject<'a> for Wrap { fn extract(ob: &'a PyAny) -> PyResult { if let Ok(s) = ob.extract::() { @@ -178,6 +194,8 @@ impl IntoPy for Wrap> { AnyValue::Boolean(v) => v.into_py(py), AnyValue::Utf8(v) => v.into_py(py), AnyValue::Utf8Owned(v) => v.into_py(py), + AnyValue::Binary(v) => v.into_py(py), + AnyValue::BinaryOwned(v) => v.into_py(py), AnyValue::Categorical(idx, rev) => { let s = rev.get(idx); s.into_py(py) @@ -244,6 +262,7 @@ impl ToPyObject for Wrap { DataType::Float64 => pl.getattr("Float64").unwrap().into(), DataType::Boolean => pl.getattr("Boolean").unwrap().into(), DataType::Utf8 => pl.getattr("Utf8").unwrap().into(), + DataType::Binary => pl.getattr("Binary").unwrap().into(), DataType::List(inner) => { let inner = Wrap(*inner.clone()).to_object(py); let list_class = pl.getattr("List").unwrap(); @@ -308,6 +327,7 @@ impl FromPyObject<'_> for Wrap { "Int32" => DataType::Int32, "Int64" => DataType::Int64, "Utf8" => DataType::Utf8, + "Binary" => DataType::Binary, "Boolean" => DataType::Boolean, "Categorical" => DataType::Categorical(None), "Date" => DataType::Date, @@ -391,6 +411,13 @@ impl ToPyObject for Wrap<&Utf8Chunked> { } } +impl ToPyObject for Wrap<&BinaryChunked> { + fn to_object(&self, py: Python) -> PyObject { + let iter = self.0.into_iter(); + PyList::new(py, iter).into_py(py) + } +} + impl ToPyObject for Wrap<&StructChunked> { fn to_object(&self, py: Python) -> PyObject { let s = self.0.clone().into_series(); @@ -480,6 +507,8 @@ impl<'s> FromPyObject<'s> for Wrap> { Ok(AnyValue::Float64(v).into()) } else if let Ok(v) = ob.extract::<&'s str>() { Ok(AnyValue::Utf8(v).into()) + } else if let Ok(v) = ob.extract::<&'s [u8]>() { + Ok(AnyValue::Binary(v).into()) } else if ob.get_type().name()?.contains("datetime") { Python::with_gil(|py| { // windows diff --git a/py-polars/src/datatypes.rs b/py-polars/src/datatypes.rs index 94c0643f5bad..c00c4708860c 100644 --- a/py-polars/src/datatypes.rs +++ b/py-polars/src/datatypes.rs @@ -27,6 +27,7 @@ pub(crate) enum PyDataType { Object, Categorical, Struct, + Binary, } impl From<&DataType> for PyDataType { @@ -45,6 +46,7 @@ impl From<&DataType> for PyDataType { DataType::Float64 => Float64, DataType::Boolean => Bool, DataType::Utf8 => Utf8, + DataType::Binary => Binary, DataType::List(_) => List, DataType::Date => Date, DataType::Datetime(tu, tz) => Datetime(*tu, tz.clone()), @@ -83,6 +85,7 @@ impl From for DataType { PyDataType::Float64 => Float64, PyDataType::Bool => Boolean, PyDataType::Utf8 => Utf8, + PyDataType::Binary => Binary, PyDataType::List => List(DataType::Null.into()), PyDataType::Date => Date, PyDataType::Datetime(tu, tz) => Datetime(tu, tz), From 1943b7bb5440737122b1dffe5571e7b33e60b828 Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Thu, 6 Oct 2022 08:44:36 +0200 Subject: [PATCH 20/22] remove unneeded trait impls --- .../src/chunked_array/ops/aggregate.rs | 118 ------------------ polars/polars-core/src/series/from.rs | 11 +- .../src/series/implementations/binary.rs | 18 --- .../src/series/implementations/boolean.rs | 18 --- .../src/series/implementations/list.rs | 18 --- .../src/series/implementations/utf8.rs | 18 --- polars/polars-core/src/series/series_trait.rs | 8 +- py-polars/Cargo.toml | 1 + 8 files changed, 15 insertions(+), 195 deletions(-) diff --git a/polars/polars-core/src/chunked_array/ops/aggregate.rs b/polars/polars-core/src/chunked_array/ops/aggregate.rs index 4c7f39bc193b..5344760a578b 100644 --- a/polars/polars-core/src/chunked_array/ops/aggregate.rs +++ b/polars/polars-core/src/chunked_array/ops/aggregate.rs @@ -654,55 +654,6 @@ impl VarAggSeries for Float64Chunked { } } -impl VarAggSeries for BooleanChunked { - fn var_as_series(&self, _ddof: u8) -> Series { - Self::full_null(self.name(), 1).into_series() - } - - fn std_as_series(&self, _ddof: u8) -> Series { - Self::full_null(self.name(), 1).into_series() - } -} -impl VarAggSeries for ListChunked { - fn var_as_series(&self, _ddof: u8) -> Series { - Self::full_null(self.name(), 1).into_series() - } - - fn std_as_series(&self, _ddof: u8) -> Series { - Self::full_null(self.name(), 1).into_series() - } -} -#[cfg(feature = "object")] -impl VarAggSeries for ObjectChunked { - fn var_as_series(&self, _ddof: u8) -> Series { - unimplemented!() - } - - fn std_as_series(&self, _ddof: u8) -> Series { - unimplemented!() - } -} -impl VarAggSeries for Utf8Chunked { - fn var_as_series(&self, _ddof: u8) -> Series { - Self::full_null(self.name(), 1).into_series() - } - - fn std_as_series(&self, _ddof: u8) -> Series { - Self::full_null(self.name(), 1).into_series() - } -} - -#[cfg(feature = "dtype-binary")] -impl VarAggSeries for BinaryChunked { - fn var_as_series(&self, _ddof: u8) -> Series { - Self::full_null(self.name(), 1).into_series() - } - - fn std_as_series(&self, _ddof: u8) -> Series { - Self::full_null(self.name(), 1).into_series() - } -} - macro_rules! impl_quantile_as_series { ($self:expr, $agg:ident, $ty: ty, $qtl:expr, $opt:expr) => {{ let v = $self.$agg($qtl, $opt)?; @@ -761,75 +712,6 @@ impl QuantileAggSeries for Float64Chunked { } } -impl QuantileAggSeries for BooleanChunked { - fn quantile_as_series( - &self, - _quantile: f64, - _interpol: QuantileInterpolOptions, - ) -> PolarsResult { - Ok(Self::full_null(self.name(), 1).into_series()) - } - - fn median_as_series(&self) -> Series { - Self::full_null(self.name(), 1).into_series() - } -} -impl QuantileAggSeries for ListChunked { - fn quantile_as_series( - &self, - _quantile: f64, - _interpol: QuantileInterpolOptions, - ) -> PolarsResult { - Ok(Self::full_null(self.name(), 1).into_series()) - } - - fn median_as_series(&self) -> Series { - Self::full_null(self.name(), 1).into_series() - } -} -#[cfg(feature = "object")] -impl QuantileAggSeries for ObjectChunked { - fn quantile_as_series( - &self, - _quantile: f64, - _interpol: QuantileInterpolOptions, - ) -> PolarsResult { - unimplemented!() - } - - fn median_as_series(&self) -> Series { - unimplemented!() - } -} -impl QuantileAggSeries for Utf8Chunked { - fn quantile_as_series( - &self, - _quantile: f64, - _interpol: QuantileInterpolOptions, - ) -> PolarsResult { - Ok(Self::full_null(self.name(), 1).into_series()) - } - - fn median_as_series(&self) -> Series { - Self::full_null(self.name(), 1).into_series() - } -} - -#[cfg(feature = "dtype-binary")] -impl QuantileAggSeries for BinaryChunked { - fn quantile_as_series( - &self, - _quantile: f64, - _interpol: QuantileInterpolOptions, - ) -> PolarsResult { - Ok(Self::full_null(self.name(), 1).into_series()) - } - - fn median_as_series(&self) -> Series { - Self::full_null(self.name(), 1).into_series() - } -} - impl ChunkAggSeries for BooleanChunked { fn sum_as_series(&self) -> Series { let v = ChunkAgg::sum(self); diff --git a/polars/polars-core/src/series/from.rs b/polars/polars-core/src/series/from.rs index 76e6b9dc33c5..f2703aa34024 100644 --- a/polars/polars-core/src/series/from.rs +++ b/polars/polars-core/src/series/from.rs @@ -62,7 +62,16 @@ impl Series { List(_) => ListChunked::from_chunks(name, chunks).cast(dtype).unwrap(), Utf8 => Utf8Chunked::from_chunks(name, chunks).into_series(), #[cfg(feature = "dtype-binary")] - Binary => BinaryChunked::from_chunks(name, chunks).into_series(), + Binary => { + #[cfg(feature = "dtype-binary")] + { + BinaryChunked::from_chunks(name, chunks).into_series() + } + #[cfg(not(feature = "dtype-binary"))] + { + panic!("activate feature 'dtype-binary'") + } + }, #[cfg(feature = "dtype-categorical")] Categorical(rev_map) => { let cats = UInt32Chunked::from_chunks(name, chunks); diff --git a/polars/polars-core/src/series/implementations/binary.rs b/polars/polars-core/src/series/implementations/binary.rs index e96816b88867..a3be49cfb144 100644 --- a/polars/polars-core/src/series/implementations/binary.rs +++ b/polars/polars-core/src/series/implementations/binary.rs @@ -1,7 +1,6 @@ use std::borrow::Cow; use ahash::RandomState; -use polars_arrow::prelude::QuantileInterpolOptions; use super::{private, IntoSeries, SeriesTrait, *}; use crate::chunked_array::comparison::*; @@ -322,23 +321,6 @@ impl SeriesTrait for SeriesWrap { fn min_as_series(&self) -> Series { ChunkAggSeries::min_as_series(&self.0) } - fn median_as_series(&self) -> Series { - QuantileAggSeries::median_as_series(&self.0) - } - fn var_as_series(&self, ddof: u8) -> Series { - VarAggSeries::var_as_series(&self.0, ddof) - } - fn std_as_series(&self, ddof: u8) -> Series { - VarAggSeries::std_as_series(&self.0, ddof) - } - fn quantile_as_series( - &self, - quantile: f64, - interpol: QuantileInterpolOptions, - ) -> PolarsResult { - QuantileAggSeries::quantile_as_series(&self.0, quantile, interpol) - } - fn fmt_list(&self) -> String { FmtList::fmt_list(&self.0) } diff --git a/polars/polars-core/src/series/implementations/boolean.rs b/polars/polars-core/src/series/implementations/boolean.rs index bab64ab264eb..b76178a9331e 100644 --- a/polars/polars-core/src/series/implementations/boolean.rs +++ b/polars/polars-core/src/series/implementations/boolean.rs @@ -2,7 +2,6 @@ use std::borrow::Cow; use std::ops::{BitAnd, BitOr, BitXor}; use ahash::RandomState; -use polars_arrow::prelude::QuantileInterpolOptions; use super::{private, IntoSeries, SeriesTrait, *}; use crate::chunked_array::comparison::*; @@ -332,23 +331,6 @@ impl SeriesTrait for SeriesWrap { fn min_as_series(&self) -> Series { ChunkAggSeries::min_as_series(&self.0) } - fn median_as_series(&self) -> Series { - QuantileAggSeries::median_as_series(&self.0) - } - fn var_as_series(&self, ddof: u8) -> Series { - VarAggSeries::var_as_series(&self.0, ddof) - } - fn std_as_series(&self, ddof: u8) -> Series { - VarAggSeries::std_as_series(&self.0, ddof) - } - fn quantile_as_series( - &self, - quantile: f64, - interpol: QuantileInterpolOptions, - ) -> PolarsResult { - QuantileAggSeries::quantile_as_series(&self.0, quantile, interpol) - } - fn fmt_list(&self) -> String { FmtList::fmt_list(&self.0) } diff --git a/polars/polars-core/src/series/implementations/list.rs b/polars/polars-core/src/series/implementations/list.rs index 4d5a9bf7cdcf..8050268dd5af 100644 --- a/polars/polars-core/src/series/implementations/list.rs +++ b/polars/polars-core/src/series/implementations/list.rs @@ -1,7 +1,6 @@ use std::any::Any; use std::borrow::Cow; -use polars_arrow::prelude::QuantileInterpolOptions; use super::{private, IntoSeries, SeriesTrait}; use crate::chunked_array::comparison::*; @@ -220,23 +219,6 @@ impl SeriesTrait for SeriesWrap { fn min_as_series(&self) -> Series { ChunkAggSeries::min_as_series(&self.0) } - fn median_as_series(&self) -> Series { - QuantileAggSeries::median_as_series(&self.0) - } - fn var_as_series(&self, ddof: u8) -> Series { - VarAggSeries::var_as_series(&self.0, ddof) - } - fn std_as_series(&self, ddof: u8) -> Series { - VarAggSeries::std_as_series(&self.0, ddof) - } - fn quantile_as_series( - &self, - quantile: f64, - interpol: QuantileInterpolOptions, - ) -> PolarsResult { - QuantileAggSeries::quantile_as_series(&self.0, quantile, interpol) - } - fn fmt_list(&self) -> String { FmtList::fmt_list(&self.0) } diff --git a/polars/polars-core/src/series/implementations/utf8.rs b/polars/polars-core/src/series/implementations/utf8.rs index 481eff691ad9..c3a28f60f46a 100644 --- a/polars/polars-core/src/series/implementations/utf8.rs +++ b/polars/polars-core/src/series/implementations/utf8.rs @@ -1,7 +1,6 @@ use std::borrow::Cow; use ahash::RandomState; -use polars_arrow::prelude::QuantileInterpolOptions; use super::{private, IntoSeries, SeriesTrait, *}; use crate::chunked_array::comparison::*; @@ -322,23 +321,6 @@ impl SeriesTrait for SeriesWrap { fn min_as_series(&self) -> Series { ChunkAggSeries::min_as_series(&self.0) } - fn median_as_series(&self) -> Series { - QuantileAggSeries::median_as_series(&self.0) - } - fn var_as_series(&self, ddof: u8) -> Series { - VarAggSeries::var_as_series(&self.0, ddof) - } - fn std_as_series(&self, ddof: u8) -> Series { - VarAggSeries::std_as_series(&self.0, ddof) - } - fn quantile_as_series( - &self, - quantile: f64, - interpol: QuantileInterpolOptions, - ) -> PolarsResult { - QuantileAggSeries::quantile_as_series(&self.0, quantile, interpol) - } - fn fmt_list(&self) -> String { FmtList::fmt_list(&self.0) } diff --git a/polars/polars-core/src/series/series_trait.rs b/polars/polars-core/src/series/series_trait.rs index e803a7aed4a9..3a865a68bae6 100644 --- a/polars/polars-core/src/series/series_trait.rs +++ b/polars/polars-core/src/series/series_trait.rs @@ -581,15 +581,15 @@ pub trait SeriesTrait: } /// Get the median of the Series as a new Series of length 1. fn median_as_series(&self) -> Series { - invalid_operation_panic!(self) + Series::full_null(self.name(), 1, self.dtype()) } /// Get the variance of the Series as a new Series of length 1. fn var_as_series(&self, _ddof: u8) -> Series { - invalid_operation_panic!(self) + Series::full_null(self.name(), 1, self.dtype()) } /// Get the standard deviation of the Series as a new Series of length 1. fn std_as_series(&self, _ddof: u8) -> Series { - invalid_operation_panic!(self) + Series::full_null(self.name(), 1, self.dtype()) } /// Get the quantile of the ChunkedArray as a new Series of length 1. fn quantile_as_series( @@ -597,7 +597,7 @@ pub trait SeriesTrait: _quantile: f64, _interpol: QuantileInterpolOptions, ) -> PolarsResult { - invalid_operation_panic!(self) + Ok(Series::full_null(self.name(), 1, self.dtype())) } fn fmt_list(&self) -> String { diff --git a/py-polars/Cargo.toml b/py-polars/Cargo.toml index 886e10cdfbab..6702d0185534 100644 --- a/py-polars/Cargo.toml +++ b/py-polars/Cargo.toml @@ -90,6 +90,7 @@ all = [ "propagate_nans", "polars/groupby_list", "polars-sql", + "polars/dtype-binary" ] # we cannot conditionaly activate simd From 896586a906794b78da8e6262cb58ddfe10bd2812 Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Thu, 6 Oct 2022 09:28:14 +0200 Subject: [PATCH 21/22] expose to python --- polars/polars-core/src/datatypes/mod.rs | 4 ++-- polars/polars-core/src/fmt.rs | 4 ++++ polars/polars-core/src/series/from.rs | 2 +- .../src/series/implementations/list.rs | 1 - py-polars/Cargo.toml | 2 +- py-polars/polars/__init__.py | 2 ++ py-polars/polars/datatypes.py | 7 +++++++ py-polars/src/conversion.rs | 17 +++++++++-------- py-polars/src/series.rs | 6 +++++- py-polars/tests/unit/test_binary.py | 15 +++++++++++++++ 10 files changed, 46 insertions(+), 14 deletions(-) create mode 100644 py-polars/tests/unit/test_binary.py diff --git a/polars/polars-core/src/datatypes/mod.rs b/polars/polars-core/src/datatypes/mod.rs index 38ce033aea10..f101a30b5fa2 100644 --- a/polars/polars-core/src/datatypes/mod.rs +++ b/polars/polars-core/src/datatypes/mod.rs @@ -260,8 +260,6 @@ pub enum AnyValue<'a> { Boolean(bool), /// A UTF8 encoded string type. Utf8(&'a str), - #[cfg(feature = "dtype-binary")] - Binary(&'a [u8]), /// An unsigned 8-bit integer number. UInt8(u8), /// An unsigned 16-bit integer number. @@ -310,6 +308,8 @@ pub enum AnyValue<'a> { /// A UTF8 encoded string type. Utf8Owned(String), #[cfg(feature = "dtype-binary")] + Binary(&'a [u8]), + #[cfg(feature = "dtype-binary")] BinaryOwned(Vec), } diff --git a/polars/polars-core/src/fmt.rs b/polars/polars-core/src/fmt.rs index a40c2a3fabfd..4c7c426a3e33 100644 --- a/polars/polars-core/src/fmt.rs +++ b/polars/polars-core/src/fmt.rs @@ -283,6 +283,10 @@ impl Debug for Series { DataType::Null => { writeln!(f, "nullarray") } + #[cfg(feature = "dtype-binary")] + DataType::Binary => { + format_array!(f, self.binary().unwrap(), "binary", self.name(), "Series") + } dt => panic!("{:?} not impl", dt), } } diff --git a/polars/polars-core/src/series/from.rs b/polars/polars-core/src/series/from.rs index f2703aa34024..76e966e5bcf6 100644 --- a/polars/polars-core/src/series/from.rs +++ b/polars/polars-core/src/series/from.rs @@ -71,7 +71,7 @@ impl Series { { panic!("activate feature 'dtype-binary'") } - }, + } #[cfg(feature = "dtype-categorical")] Categorical(rev_map) => { let cats = UInt32Chunked::from_chunks(name, chunks); diff --git a/polars/polars-core/src/series/implementations/list.rs b/polars/polars-core/src/series/implementations/list.rs index 8050268dd5af..a6ad67f606b3 100644 --- a/polars/polars-core/src/series/implementations/list.rs +++ b/polars/polars-core/src/series/implementations/list.rs @@ -1,7 +1,6 @@ use std::any::Any; use std::borrow::Cow; - use super::{private, IntoSeries, SeriesTrait}; use crate::chunked_array::comparison::*; use crate::chunked_array::ops::explode::ExplodeByOffsets; diff --git a/py-polars/Cargo.toml b/py-polars/Cargo.toml index 6702d0185534..1b02cb70fc68 100644 --- a/py-polars/Cargo.toml +++ b/py-polars/Cargo.toml @@ -90,7 +90,7 @@ all = [ "propagate_nans", "polars/groupby_list", "polars-sql", - "polars/dtype-binary" + "polars/dtype-binary", ] # we cannot conditionaly activate simd diff --git a/py-polars/polars/__init__.py b/py-polars/polars/__init__.py index 0a3c2616f9e0..df9280070ea3 100644 --- a/py-polars/polars/__init__.py +++ b/py-polars/polars/__init__.py @@ -21,6 +21,7 @@ def version() -> str: from_records, ) from polars.datatypes import ( + Binary, Boolean, Categorical, DataType, @@ -172,6 +173,7 @@ def version() -> str: "Float32", "Float64", "Boolean", + "Binary", "Utf8", "List", "Date", diff --git a/py-polars/polars/datatypes.py b/py-polars/polars/datatypes.py index f90c03cfa4af..372bf7064894 100644 --- a/py-polars/polars/datatypes.py +++ b/py-polars/polars/datatypes.py @@ -160,6 +160,10 @@ class Utf8(DataType): """UTF-8 encoded string type.""" +class Binary(DataType): + """Binary type.""" + + class Null(DataType): """Type representing Null / None values.""" @@ -373,6 +377,7 @@ def __hash__(self) -> int: Object: "object", Categorical: "categorical", Struct: "struct", + Binary: "binary", } for tu in DTYPE_TEMPORAL_UNITS: _DTYPE_TO_FFINAME[Datetime(tu)] = "datetime" @@ -411,6 +416,7 @@ def __hash__(self) -> int: list: List, tuple: List, Decimal: Float64, + bytes: Binary, } _PY_STR_TO_DTYPE: dict[str, PolarsDataType] = { @@ -434,6 +440,7 @@ def __hash__(self) -> int: Datetime: datetime, Date: date, Time: time, + Binary: bytes, } for tu in DTYPE_TEMPORAL_UNITS: _DTYPE_TO_PY_TYPE[Datetime(tu)] = datetime diff --git a/py-polars/src/conversion.rs b/py-polars/src/conversion.rs index 940b6d013689..6ba1238865e3 100644 --- a/py-polars/src/conversion.rs +++ b/py-polars/src/conversion.rs @@ -17,7 +17,7 @@ use pyo3::basic::CompareOp; use pyo3::conversion::{FromPyObject, IntoPy}; use pyo3::exceptions::PyValueError; use pyo3::prelude::*; -use pyo3::types::{PyBool, PyDict, PyList, PySequence}; +use pyo3::types::{PyBool, PyBytes, PyDict, PyList, PySequence}; use pyo3::{PyAny, PyResult}; use crate::dataframe::PyDataFrame; @@ -194,8 +194,6 @@ impl IntoPy for Wrap> { AnyValue::Boolean(v) => v.into_py(py), AnyValue::Utf8(v) => v.into_py(py), AnyValue::Utf8Owned(v) => v.into_py(py), - AnyValue::Binary(v) => v.into_py(py), - AnyValue::BinaryOwned(v) => v.into_py(py), AnyValue::Categorical(idx, rev) => { let s = rev.get(idx); s.into_py(py) @@ -241,6 +239,8 @@ impl IntoPy for Wrap> { let s = format!("{}", v); s.into_py(py) } + AnyValue::Binary(v) => v.into_py(py), + AnyValue::BinaryOwned(v) => v.into_py(py), } } } @@ -413,7 +413,7 @@ impl ToPyObject for Wrap<&Utf8Chunked> { impl ToPyObject for Wrap<&BinaryChunked> { fn to_object(&self, py: Python) -> PyObject { - let iter = self.0.into_iter(); + let iter = self.0.into_iter().map(|opt_bytes| opt_bytes.map(|bytes| PyBytes::new(py, bytes))); PyList::new(py, iter).into_py(py) } } @@ -507,9 +507,7 @@ impl<'s> FromPyObject<'s> for Wrap> { Ok(AnyValue::Float64(v).into()) } else if let Ok(v) = ob.extract::<&'s str>() { Ok(AnyValue::Utf8(v).into()) - } else if let Ok(v) = ob.extract::<&'s [u8]>() { - Ok(AnyValue::Binary(v).into()) - } else if ob.get_type().name()?.contains("datetime") { + } else if ob.get_type().name()?.contains("datetime") { Python::with_gil(|py| { // windows #[cfg(target_arch = "windows")] @@ -593,7 +591,10 @@ impl<'s> FromPyObject<'s> for Wrap> { let v = td.extract::(py).unwrap(); Ok(Wrap(AnyValue::Duration(v, TimeUnit::Microseconds))) }) - } else { + } else if let Ok(v) = ob.extract::<&'s [u8]>() { + Ok(AnyValue::Binary(v).into()) + } + else { Err(PyErr::from(PyPolarsErr::Other(format!( "row type not supported {:?}", ob diff --git a/py-polars/src/series.rs b/py-polars/src/series.rs index af98857e08b7..b57b724e7fd5 100644 --- a/py-polars/src/series.rs +++ b/py-polars/src/series.rs @@ -713,7 +713,11 @@ impl PySeries { let ca = series.duration().unwrap(); return Wrap(ca).to_object(py); } - dt => panic!("to_list() not implemented for {:?}", dt), + DataType::Binary => { + let ca = series.binary().unwrap(); + return Wrap(ca).to_object(py); + } + DataType::Null | DataType::Unknown => {panic!("to_list not implemented for null/unknown")} }; pylist.to_object(py) } diff --git a/py-polars/tests/unit/test_binary.py b/py-polars/tests/unit/test_binary.py new file mode 100644 index 000000000000..343a27f68b6d --- /dev/null +++ b/py-polars/tests/unit/test_binary.py @@ -0,0 +1,15 @@ +import polars as pl + + +def test_binary_conversions() -> None: + df = pl.DataFrame({"blob": [b"abc", None, b"cde"]}).with_column( + pl.col("blob").cast(pl.Utf8).alias("decoded_blob") + ) + + assert df.to_dict(False) == { + "blob": [b"abc", None, b"cde"], + "decoded_blob": ["abc", None, "cde"], + } + assert df[0, 0] == b"abc" + assert df[1, 0] is None + assert df.dtypes == [pl.Binary, pl.Utf8] From 8f595c4792bbb40d463f207272fd25420f14bea5 Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Thu, 6 Oct 2022 09:45:47 +0200 Subject: [PATCH 22/22] fix lint --- py-polars/src/conversion.rs | 10 ++++++---- py-polars/src/series.rs | 4 +++- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/py-polars/src/conversion.rs b/py-polars/src/conversion.rs index 6ba1238865e3..78cee09e20c5 100644 --- a/py-polars/src/conversion.rs +++ b/py-polars/src/conversion.rs @@ -413,7 +413,10 @@ impl ToPyObject for Wrap<&Utf8Chunked> { impl ToPyObject for Wrap<&BinaryChunked> { fn to_object(&self, py: Python) -> PyObject { - let iter = self.0.into_iter().map(|opt_bytes| opt_bytes.map(|bytes| PyBytes::new(py, bytes))); + let iter = self + .0 + .into_iter() + .map(|opt_bytes| opt_bytes.map(|bytes| PyBytes::new(py, bytes))); PyList::new(py, iter).into_py(py) } } @@ -507,7 +510,7 @@ impl<'s> FromPyObject<'s> for Wrap> { Ok(AnyValue::Float64(v).into()) } else if let Ok(v) = ob.extract::<&'s str>() { Ok(AnyValue::Utf8(v).into()) - } else if ob.get_type().name()?.contains("datetime") { + } else if ob.get_type().name()?.contains("datetime") { Python::with_gil(|py| { // windows #[cfg(target_arch = "windows")] @@ -593,8 +596,7 @@ impl<'s> FromPyObject<'s> for Wrap> { }) } else if let Ok(v) = ob.extract::<&'s [u8]>() { Ok(AnyValue::Binary(v).into()) - } - else { + } else { Err(PyErr::from(PyPolarsErr::Other(format!( "row type not supported {:?}", ob diff --git a/py-polars/src/series.rs b/py-polars/src/series.rs index b57b724e7fd5..22fd8d3d7bf9 100644 --- a/py-polars/src/series.rs +++ b/py-polars/src/series.rs @@ -717,7 +717,9 @@ impl PySeries { let ca = series.binary().unwrap(); return Wrap(ca).to_object(py); } - DataType::Null | DataType::Unknown => {panic!("to_list not implemented for null/unknown")} + DataType::Null | DataType::Unknown => { + panic!("to_list not implemented for null/unknown") + } }; pylist.to_object(py) }