diff --git a/polars/Cargo.toml b/polars/Cargo.toml index d05464928a54..b63b1fadc766 100644 --- a/polars/Cargo.toml +++ b/polars/Cargo.toml @@ -163,6 +163,7 @@ dtype-full = [ "dtype-u16", "dtype-categorical", "dtype-struct", + "dtype-binary", ] # sensible minimal set of opt-in datatypes @@ -210,6 +211,12 @@ dtype-struct = [ "polars-ops/dtype-struct", "polars-io/dtype-struct", ] +dtype-binary = [ + "polars-core/dtype-binary", + "polars-lazy/dtype-binary", + "polars-ops/dtype-binary", + "polars-io/dtype-binary", +] docs-selection = [ "csv-file", diff --git a/polars/polars-arrow/src/array/default_arrays.rs b/polars/polars-arrow/src/array/default_arrays.rs index 8345e3e79f7f..7ac32c0a8ad7 100644 --- a/polars/polars-arrow/src/array/default_arrays.rs +++ b/polars/polars-arrow/src/array/default_arrays.rs @@ -1,4 +1,4 @@ -use arrow::array::{BooleanArray, PrimitiveArray, Utf8Array}; +use arrow::array::{BinaryArray, BooleanArray, PrimitiveArray, Utf8Array}; use arrow::bitmap::Bitmap; use arrow::buffer::Buffer; use arrow::datatypes::DataType; @@ -40,3 +40,23 @@ impl FromDataUtf8 for Utf8Array { Utf8Array::from_data_unchecked(DataType::LargeUtf8, offsets, values, validity) } } + +pub trait FromDataBinary { + /// # Safety + /// `offsets` must be valid for the `values` buffer; binary values are arbitrary bytes, so no utf8 requirement applies (NOTE(review): confirm the exact invariants against arrow2 `BinaryArray::from_data_unchecked`) + unsafe fn from_data_unchecked_default( + offsets: Buffer, + values: Buffer, + validity: Option, + ) -> Self; +} + +impl FromDataBinary for BinaryArray { + unsafe fn from_data_unchecked_default( + offsets: Buffer, + values: Buffer, + validity: Option, + ) -> Self { + BinaryArray::from_data_unchecked(DataType::LargeBinary, offsets, values, validity) + } +} diff --git a/polars/polars-arrow/src/array/get.rs b/polars/polars-arrow/src/array/get.rs index 61b0ee324a19..0bf196194ae0 100644 --- a/polars/polars-arrow/src/array/get.rs +++ b/polars/polars-arrow/src/array/get.rs @@ -1,4 +1,4 @@ -use arrow::array::{Array, BooleanArray, ListArray, PrimitiveArray, Utf8Array}; +use arrow::array::{Array, 
BinaryArray, BooleanArray, ListArray, PrimitiveArray, Utf8Array}; use arrow::types::NativeType; use crate::is_valid::IsValid; @@ -79,6 +79,28 @@ impl<'a> ArrowGetItem for &'a Utf8Array { } } +impl<'a> ArrowGetItem for &'a BinaryArray { + type Item = &'a [u8]; + + #[inline] + fn get(&self, item: usize) -> Option { + if item >= self.len() { + None + } else { + unsafe { self.get_unchecked(item) } + } + } + + #[inline] + unsafe fn get_unchecked(&self, item: usize) -> Option { + if self.is_null_unchecked(item) { + None + } else { + Some(self.value_unchecked(item)) + } + } +} + impl ArrowGetItem for ListArray { type Item = Box; diff --git a/polars/polars-arrow/src/array/mod.rs b/polars/polars-arrow/src/array/mod.rs index 1c5c37015c00..e882b587db0c 100644 --- a/polars/polars-arrow/src/array/mod.rs +++ b/polars/polars-arrow/src/array/mod.rs @@ -1,4 +1,4 @@ -use arrow::array::{Array, BooleanArray, ListArray, PrimitiveArray, Utf8Array}; +use arrow::array::{Array, BinaryArray, BooleanArray, ListArray, PrimitiveArray, Utf8Array}; use arrow::bitmap::MutableBitmap; use arrow::datatypes::DataType; use arrow::types::NativeType; @@ -30,6 +30,12 @@ impl ValueSize for Utf8Array { } } +impl ValueSize for BinaryArray { + fn get_values_size(&self) -> usize { + self.values().len() + } +} + impl ValueSize for ArrayRef { fn get_values_size(&self) -> usize { match self.data_type() { @@ -179,6 +185,52 @@ pub trait ListFromIter { Some(validity.into()), ) } + + /// Create a list-array from an iterator. + /// Used in groupby agg-list + /// + /// # Safety + /// Will produce incorrect arrays if size hint is incorrect. 
+ unsafe fn from_iter_binary_trusted_len(iter: I, n_elements: usize) -> ListArray + where + I: IntoIterator>, + P: IntoIterator>, + Ref: AsRef<[u8]>, + { + let iterator = iter.into_iter(); + let (lower, _) = iterator.size_hint(); + + let mut validity = MutableBitmap::with_capacity(lower); + let mut offsets = Vec::::with_capacity(lower + 1); + let mut length_so_far = 0i64; + offsets.push(length_so_far); + let values: BinaryArray = iterator + .filter_map(|opt_iter| match opt_iter { + Some(x) => { + let it = x.into_iter(); + length_so_far += it.size_hint().0 as i64; + validity.push(true); + offsets.push(length_so_far); + Some(it) + } + None => { + validity.push(false); + None + } + }) + .flatten() + .trust_my_length(n_elements) + .collect(); + + // Safety: + // offsets are monotonically increasing + ListArray::new_unchecked( + ListArray::::default_datatype(DataType::LargeBinary), + offsets.into(), + Box::new(values), + Some(validity.into()), + ) + } } impl ListFromIter for ListArray {} diff --git a/polars/polars-arrow/src/compute/take/mod.rs b/polars/polars-arrow/src/compute/take/mod.rs index 4b574414cc61..7e653b5f8f38 100644 --- a/polars/polars-arrow/src/compute/take/mod.rs +++ b/polars/polars-arrow/src/compute/take/mod.rs @@ -327,6 +327,21 @@ pub unsafe fn take_no_null_utf8_iter_unchecked>( Box::new(MutableUtf8Array::::from_trusted_len_values_iter_unchecked(iter).into()) } +/// # Safety +/// - no bounds checks +/// - iterator must be TrustedLen +#[inline] +pub unsafe fn take_no_null_binary_iter_unchecked>( + arr: &LargeBinaryArray, + indices: I, +) -> Box { + let iter = indices.into_iter().map(|idx| { + debug_assert!(idx < arr.len()); + arr.value_unchecked(idx) + }); + Box::new(MutableBinaryArray::::from_trusted_len_values_iter_unchecked(iter).into()) +} + /// # Safety /// - no bounds checks /// - iterator must be TrustedLen @@ -348,6 +363,27 @@ pub unsafe fn take_utf8_iter_unchecked>( Box::new(LargeStringArray::from_trusted_len_iter_unchecked(iter)) } +/// # Safety 
+/// - no bounds checks +/// - iterator must be TrustedLen +#[inline] +pub unsafe fn take_binary_iter_unchecked>( + arr: &LargeBinaryArray, + indices: I, +) -> Box { + let validity = arr.validity().expect("should have nulls"); + let iter = indices.into_iter().map(|idx| { + debug_assert!(idx < arr.len()); + if validity.get_bit_unchecked(idx) { + Some(arr.value_unchecked(idx)) + } else { + None + } + }); + + Box::new(LargeBinaryArray::from_trusted_len_iter_unchecked(iter)) +} + /// # Safety /// - no bounds checks /// - iterator must be TrustedLen @@ -363,6 +399,21 @@ pub unsafe fn take_no_null_utf8_opt_iter_unchecked>>( + arr: &LargeBinaryArray, + indices: I, +) -> Box { + let iter = indices + .into_iter() + .map(|opt_idx| opt_idx.map(|idx| arr.value_unchecked(idx))); + + Box::new(LargeBinaryArray::from_trusted_len_iter_unchecked(iter)) +} + /// # Safety /// - no bounds checks /// - iterator must be TrustedLen @@ -384,6 +435,27 @@ pub unsafe fn take_utf8_opt_iter_unchecked> Box::new(LargeStringArray::from_trusted_len_iter_unchecked(iter)) } +/// # Safety +/// - no bounds checks +/// - iterator must be TrustedLen +#[inline] +pub unsafe fn take_binary_opt_iter_unchecked>>( + arr: &LargeBinaryArray, + indices: I, +) -> Box { + let validity = arr.validity().expect("should have nulls"); + let iter = indices.into_iter().map(|opt_idx| { + opt_idx.and_then(|idx| { + if validity.get_bit_unchecked(idx) { + Some(arr.value_unchecked(idx)) + } else { + None + } + }) + }); + Box::new(LargeBinaryArray::from_trusted_len_iter_unchecked(iter)) +} + /// # Safety /// caller must ensure indices are in bounds pub unsafe fn take_utf8_unchecked( @@ -497,6 +569,119 @@ pub unsafe fn take_utf8_unchecked( )) } +/// # Safety +/// caller must ensure indices are in bounds +pub unsafe fn take_binary_unchecked( + arr: &LargeBinaryArray, + indices: &IdxArr, +) -> Box { + let data_len = indices.len(); + + let mut offset_buf = vec![0; data_len + 1]; + let offset_typed = offset_buf.as_mut_slice(); + + 
let mut length_so_far = 0; + offset_typed[0] = length_so_far; + + let validity; + + // The required size is yet unknown + // Start from a rough guess; the loops below reserve more whenever it is exceeded. + // NOTE(review): the expression below reduces to about `2 * indices.len()` and never looks at the byte length of `arr` - confirm intent + let mut values_capacity = if arr.len() > 0 { + ((arr.len() as f32 * 2.0) as usize) / arr.len() * indices.len() as usize + } else { + 0 + }; + + // byte buffer for the taken values, pre-sized with the guess above + let mut values_buf = Vec::::with_capacity(values_capacity); + + // both 0 nulls + if !arr.has_validity() && !indices.has_validity() { + offset_typed + .iter_mut() + .skip(1) + .enumerate() + .for_each(|(idx, offset)| { + let index = indices.value_unchecked(idx) as usize; + let s = arr.value_unchecked(index); + length_so_far += s.len() as i64; + *offset = length_so_far; + + if length_so_far as usize >= values_capacity { + values_buf.reserve(values_capacity); + values_capacity *= 2; + } + + values_buf.extend_from_slice(s) + }); + validity = None; + } else if !arr.has_validity() { + offset_typed + .iter_mut() + .skip(1) + .enumerate() + .for_each(|(idx, offset)| { + if indices.is_valid(idx) { + let index = indices.value_unchecked(idx) as usize; + let s = arr.value_unchecked(index); + length_so_far += s.len() as i64; + + if length_so_far as usize >= values_capacity { + values_buf.reserve(values_capacity); + values_capacity *= 2; + } + + values_buf.extend_from_slice(s) + } + *offset = length_so_far; + }); + validity = indices.validity().cloned(); + } else { + let mut builder = MutableBinaryArray::with_capacities(data_len, length_so_far as usize); + let validity_arr = arr.validity().expect("should have nulls"); + + if !indices.has_validity() { + (0..data_len).for_each(|idx| { + let index = indices.value_unchecked(idx) as usize; + builder.push(if validity_arr.get_bit_unchecked(index) { + let s = arr.value_unchecked(index); + Some(s) + } 
else { + None + }); + }); + } else { + let validity_indices = indices.validity().expect("should 
have nulls"); + (0..data_len).for_each(|idx| { + if validity_indices.get_bit_unchecked(idx) { + let index = indices.value_unchecked(idx) as usize; + + if validity_arr.get_bit_unchecked(index) { + let s = arr.value_unchecked(index); + builder.push(Some(s)); + } else { + builder.push_null(); + } + } else { + builder.push_null(); + } + }); + } + + let array: BinaryArray = builder.into(); + return Box::new(array); + } + + // Safety: every offset is the running sum of the slice lengths appended to "values", so offsets are non-decreasing and in bounds + Box::new(BinaryArray::::from_data_unchecked_default( + offset_buf.into(), + values_buf.into(), + validity, + )) +} + /// Forked and adapted from arrow-rs /// This is faster because it does no bounds checks and allocates directly into aligned memory /// diff --git a/polars/polars-arrow/src/data_types.rs b/polars/polars-arrow/src/data_types.rs index 3712a9d26dbe..1ea62a367865 100644 --- a/polars/polars-arrow/src/data_types.rs +++ b/polars/polars-arrow/src/data_types.rs @@ -25,6 +25,7 @@ unsafe impl IsFloat for u16 {} unsafe impl IsFloat for u32 {} unsafe impl IsFloat for u64 {} unsafe impl IsFloat for &str {} +unsafe impl IsFloat for &[u8] {} unsafe impl IsFloat for bool {} unsafe impl IsFloat for Option {} @@ -41,6 +42,7 @@ mod private { impl Sealed for f32 {} impl Sealed for f64 {} impl Sealed for &str {} + impl Sealed for &[u8] {} impl Sealed for bool {} impl Sealed for Option {} } diff --git a/polars/polars-arrow/src/is_valid.rs b/polars/polars-arrow/src/is_valid.rs index 75189d4f6d4d..1f10b6eaf0c1 100644 --- a/polars/polars-arrow/src/is_valid.rs +++ b/polars/polars-arrow/src/is_valid.rs @@ -1,4 +1,4 @@ -use arrow::array::{Array, BooleanArray, ListArray, PrimitiveArray, Utf8Array}; +use arrow::array::{Array, BinaryArray, BooleanArray, ListArray, PrimitiveArray, Utf8Array}; use arrow::types::NativeType; pub trait IsValid { @@ -13,6 +13,7 @@ pub trait IsValid { pub trait ArrowArray: Array {} +impl ArrowArray for BinaryArray {} impl ArrowArray for Utf8Array {} impl ArrowArray for PrimitiveArray {} 
impl ArrowArray for BooleanArray {} diff --git a/polars/polars-arrow/src/prelude.rs b/polars/polars-arrow/src/prelude.rs index d9d7fb000ef3..d72965a66e6e 100644 --- a/polars/polars-arrow/src/prelude.rs +++ b/polars/polars-arrow/src/prelude.rs @@ -1,4 +1,4 @@ -use arrow::array::{ListArray, Utf8Array}; +use arrow::array::{BinaryArray, ListArray, Utf8Array}; pub use crate::array::default_arrays::*; pub use crate::array::*; @@ -8,4 +8,5 @@ pub use crate::index::*; pub use crate::kernels::rolling::no_nulls::QuantileInterpolOptions; pub type LargeStringArray = Utf8Array; +pub type LargeBinaryArray = BinaryArray; pub type LargeListArray = ListArray; diff --git a/polars/polars-arrow/src/trusted_len/mod.rs b/polars/polars-arrow/src/trusted_len/mod.rs index 085b5f648502..c3c73cfca8cf 100644 --- a/polars/polars-arrow/src/trusted_len/mod.rs +++ b/polars/polars-arrow/src/trusted_len/mod.rs @@ -65,6 +65,7 @@ unsafe impl TrustedLen for std::iter::Rev, J> TrustedLen for TrustMyLength {} unsafe impl TrustedLen for std::ops::Range where std::ops::Range: Iterator {} unsafe impl TrustedLen for arrow::array::Utf8ValuesIter<'_, i64> {} +unsafe impl TrustedLen for arrow::array::BinaryValueIter<'_, i64> {} unsafe impl> TrustedLen for ZipValidity<'_, T, I> {} unsafe impl TrustedLen for BitmapIter<'_> {} unsafe impl TrustedLen for std::iter::StepBy {} diff --git a/polars/polars-core/Cargo.toml b/polars/polars-core/Cargo.toml index 78dcd6a38acf..5208bc6af965 100644 --- a/polars/polars-core/Cargo.toml +++ b/polars/polars-core/Cargo.toml @@ -93,6 +93,7 @@ dtype-u8 = [] dtype-u16 = [] dtype-categorical = ["smartstring"] dtype-struct = [] +dtype-binary = [] parquet = ["arrow/io_parquet"] diff --git a/polars/polars-core/src/chunked_array/arithmetic.rs b/polars/polars-core/src/chunked_array/arithmetic.rs index daf71f22c5ba..7a0210e11693 100644 --- a/polars/polars-core/src/chunked_array/arithmetic.rs +++ b/polars/polars-core/src/chunked_array/arithmetic.rs @@ -432,6 +432,14 @@ fn concat_strings(l: 
&str, r: &str) -> String { s } +#[cfg(feature = "dtype-binary")] +fn concat_binary_arrs(l: &[u8], r: &[u8]) -> Vec { + let mut v = Vec::with_capacity(l.len() + r.len()); + v.extend_from_slice(l); + v.extend_from_slice(r); + v +} + impl Add for &Utf8Chunked { type Output = Utf8Chunked; @@ -494,6 +502,71 @@ impl Add<&str> for &Utf8Chunked { } } +#[cfg(feature = "dtype-binary")] +impl Add for &BinaryChunked { + type Output = BinaryChunked; + + fn add(self, rhs: Self) -> Self::Output { + // broadcasting path rhs + if rhs.len() == 1 { + let rhs = rhs.get(0); + return match rhs { + Some(rhs) => self.add(rhs), + None => BinaryChunked::full_null(self.name(), self.len()), + }; + } + // broadcasting path lhs + if self.len() == 1 { + let lhs = self.get(0); + return match lhs { + Some(lhs) => rhs.apply(|s| Cow::Owned(concat_binary_arrs(lhs, s))), + None => BinaryChunked::full_null(self.name(), rhs.len()), + }; + } + + // TODO: add no_null variants (presumably one fast path per lhs/rhs validity combination, i.e. 4 paths). + let mut ca: Self::Output = self + .into_iter() + .zip(rhs.into_iter()) + .map(|(opt_l, opt_r)| match (opt_l, opt_r) { + (Some(l), Some(r)) => Some(concat_binary_arrs(l, r)), + _ => None, + }) + .collect_trusted(); + ca.rename(self.name()); + ca + } +} + +#[cfg(feature = "dtype-binary")] +impl Add for BinaryChunked { + type Output = BinaryChunked; + + fn add(self, rhs: Self) -> Self::Output { + (&self).add(&rhs) + } +} + +#[cfg(feature = "dtype-binary")] +impl Add<&[u8]> for &BinaryChunked { + type Output = BinaryChunked; + + fn add(self, rhs: &[u8]) -> Self::Output { + let mut ca: Self::Output = match self.has_validity() { + false => self + .into_no_null_iter() + .map(|l| concat_binary_arrs(l, rhs)) + .collect_trusted(), + _ => self + .into_iter() + .map(|opt_l| opt_l.map(|l| concat_binary_arrs(l, rhs))) + .collect_trusted(), + }; + ca.rename(self.name()); + ca + } +} + #[cfg(test)] pub(crate) mod test { use crate::prelude::*; diff --git a/polars/polars-core/src/chunked_array/builder/binary.rs 
b/polars/polars-core/src/chunked_array/builder/binary.rs new file mode 100644 index 000000000000..cb2170e72b9a --- /dev/null +++ b/polars/polars-core/src/chunked_array/builder/binary.rs @@ -0,0 +1,90 @@ +use super::*; + +pub struct BinaryChunkedBuilder { + pub(crate) builder: MutableBinaryArray, + pub capacity: usize, + field: Field, +} + +impl BinaryChunkedBuilder { + /// Create a new BinaryChunkedBuilder + /// + /// # Arguments + /// + /// * `capacity` - Number of binary elements in the final array. + /// * `bytes_capacity` - Number of bytes needed to store the binary values. + pub fn new(name: &str, capacity: usize, bytes_capacity: usize) -> Self { + BinaryChunkedBuilder { + builder: MutableBinaryArray::::with_capacities(capacity, bytes_capacity), + capacity, + field: Field::new(name, DataType::Binary), + } + } + + /// Appends a byte-slice value into the builder + #[inline] + pub fn append_value>(&mut self, v: S) { + self.builder.push(Some(v.as_ref())); + } + + /// Appends a null slot into the builder + #[inline] + pub fn append_null(&mut self) { + self.builder.push::<&[u8]>(None); + } + + #[inline] + pub fn append_option>(&mut self, opt: Option) { + self.builder.push(opt); + } + + pub fn finish(mut self) -> BinaryChunked { + let arr = self.builder.as_box(); + let length = arr.len() as IdxSize; + + ChunkedArray { + field: Arc::new(self.field), + chunks: vec![arr], + phantom: PhantomData, + categorical_map: None, + bit_settings: Default::default(), + length, + } + } + + fn shrink_to_fit(&mut self) { + self.builder.shrink_to_fit() + } +} + +pub struct BinaryChunkedBuilderCow { + builder: BinaryChunkedBuilder, +} + +impl BinaryChunkedBuilderCow { + pub fn new(name: &str, capacity: usize) -> Self { + BinaryChunkedBuilderCow { + builder: BinaryChunkedBuilder::new(name, capacity, capacity), + } + } +} + +impl ChunkedBuilder, BinaryType> for BinaryChunkedBuilderCow { + #[inline] + fn append_value(&mut self, val: Cow<'_, [u8]>) { + self.builder.append_value(val.as_ref()) 
+ } + + #[inline] + fn append_null(&mut self) { + self.builder.append_null() + } + + fn finish(self) -> ChunkedArray { + self.builder.finish() + } + + fn shrink_to_fit(&mut self) { + self.builder.shrink_to_fit() + } +} diff --git a/polars/polars-core/src/chunked_array/builder/from.rs b/polars/polars-core/src/chunked_array/builder/from.rs index 0b2a5e8b7567..f008a934d0e0 100644 --- a/polars/polars-core/src/chunked_array/builder/from.rs +++ b/polars/polars-core/src/chunked_array/builder/from.rs @@ -40,3 +40,13 @@ impl From<(&str, Utf8Array)> for Utf8Chunked { ChunkedArray::from_chunks(name, vec![Box::new(arr)]) } } + +#[cfg(feature = "dtype-binary")] +impl From<(&str, BinaryArray)> for BinaryChunked { + fn from(tpl: (&str, BinaryArray)) -> Self { + let name = tpl.0; + let arr = tpl.1; + + ChunkedArray::from_chunks(name, vec![Box::new(arr)]) + } +} diff --git a/polars/polars-core/src/chunked_array/builder/list.rs b/polars/polars-core/src/chunked_array/builder/list.rs index 9c3363343ff3..8d68dd7cb909 100644 --- a/polars/polars-core/src/chunked_array/builder/list.rs +++ b/polars/polars-core/src/chunked_array/builder/list.rs @@ -179,6 +179,8 @@ where type LargePrimitiveBuilder = MutableListArray>; type LargeListUtf8Builder = MutableListArray>; +#[cfg(feature = "dtype-binary")] +type LargeListBinaryBuilder = MutableListArray>; type LargeListBooleanBuilder = MutableListArray; pub struct ListUtf8ChunkedBuilder { @@ -261,6 +263,89 @@ impl ListBuilderTrait for ListUtf8ChunkedBuilder { } } +#[cfg(feature = "dtype-binary")] +pub struct ListBinaryChunkedBuilder { + builder: LargeListBinaryBuilder, + field: Field, + fast_explode: bool, +} + +#[cfg(feature = "dtype-binary")] +impl ListBinaryChunkedBuilder { + pub fn new(name: &str, capacity: usize, values_capacity: usize) -> Self { + let values = MutableBinaryArray::::with_capacity(values_capacity); + let builder = LargeListBinaryBuilder::new_with_capacity(values, capacity); + let field = Field::new(name, 
DataType::List(Box::new(DataType::Binary))); + + ListBinaryChunkedBuilder { + builder, + field, + fast_explode: true, + } + } + + pub fn append_trusted_len_iter<'a, I: Iterator> + TrustedLen>( + &mut self, + iter: I, + ) { + let values = self.builder.mut_values(); + + if iter.size_hint().0 == 0 { + self.fast_explode = false; + } + // Safety + // trusted len, trust the type system + unsafe { values.extend_trusted_len_unchecked(iter) }; + self.builder.try_push_valid().unwrap(); + } + + pub fn append_values_iter<'a, I: Iterator>(&mut self, iter: I) { + let values = self.builder.mut_values(); + + if iter.size_hint().0 == 0 { + self.fast_explode = false; + } + values.extend_values(iter); + self.builder.try_push_valid().unwrap(); + } + + pub(crate) fn append(&mut self, ca: &BinaryChunked) { + let value_builder = self.builder.mut_values(); + value_builder.try_extend(ca).unwrap(); + self.builder.try_push_valid().unwrap(); + } +} + +#[cfg(feature = "dtype-binary")] +impl ListBuilderTrait for ListBinaryChunkedBuilder { + fn append_opt_series(&mut self, opt_s: Option<&Series>) { + match opt_s { + Some(s) => self.append_series(s), + None => { + self.append_null(); + } + } + } + + #[inline] + fn append_null(&mut self) { + self.fast_explode = false; + self.builder.push_null(); + } + + fn append_series(&mut self, s: &Series) { + if s.is_empty() { + self.fast_explode = false; + } + let ca = s.binary().unwrap(); + self.append(ca) + } + + fn finish(&mut self) -> ListChunked { + finish_list_builder!(self) + } +} + pub struct ListBooleanChunkedBuilder { builder: LargeListBooleanBuilder, field: Field, @@ -389,10 +474,19 @@ pub fn get_list_builder( Box::new(builder) }}; } + #[cfg(feature = "dtype-binary")] + macro_rules! 
get_binary_builder { + () => {{ + let builder = + ListBinaryChunkedBuilder::new(&name, list_capacity, 5 * value_capacity); + Box::new(builder) + }}; + } Ok(match_dtype_to_logical_apply_macro!( physical_type, get_primitive_builder, get_utf8_builder, + get_binary_builder, get_bool_builder )) } diff --git a/polars/polars-core/src/chunked_array/builder/mod.rs b/polars/polars-core/src/chunked_array/builder/mod.rs index ec6161f2fafb..d2f53d815d52 100644 --- a/polars/polars-core/src/chunked_array/builder/mod.rs +++ b/polars/polars-core/src/chunked_array/builder/mod.rs @@ -1,3 +1,5 @@ +#[cfg(feature = "dtype-binary")] +mod binary; mod boolean; mod from; pub mod list; @@ -11,6 +13,8 @@ use std::sync::Arc; use arrow::array::*; use arrow::bitmap::Bitmap; +#[cfg(feature = "dtype-binary")] +pub use binary::*; pub use boolean::*; pub use list::*; pub use primitive::*; @@ -159,6 +163,49 @@ where } } +#[cfg(feature = "dtype-binary")] +impl NewChunkedArray for BinaryChunked +where + B: AsRef<[u8]>, +{ + fn from_slice(name: &str, v: &[B]) -> Self { + let values_size = v.iter().fold(0, |acc, s| acc + s.as_ref().len()); + + let mut builder = MutableBinaryArray::::with_capacities(v.len(), values_size); + builder.extend_trusted_len_values(v.iter().map(|s| s.as_ref())); + + let chunks = vec![builder.as_box()]; + ChunkedArray::from_chunks(name, chunks) + } + + fn from_slice_options(name: &str, opt_v: &[Option]) -> Self { + let values_size = opt_v.iter().fold(0, |acc, s| match s { + Some(s) => acc + s.as_ref().len(), + None => acc, + }); + let mut builder = MutableBinaryArray::::with_capacities(opt_v.len(), values_size); + builder.extend_trusted_len(opt_v.iter().map(|s| s.as_ref())); + + let chunks = vec![builder.as_box()]; + ChunkedArray::from_chunks(name, chunks) + } + + fn from_iter_options(name: &str, it: impl Iterator>) -> Self { + let cap = get_iter_capacity(&it); + let mut builder = BinaryChunkedBuilder::new(name, cap, cap * 5); + it.for_each(|opt| builder.append_option(opt)); + 
builder.finish() + } + + /// Create a new ChunkedArray from an iterator. + fn from_iter_values(name: &str, it: impl Iterator) -> Self { + let cap = get_iter_capacity(&it); + let mut builder = BinaryChunkedBuilder::new(name, cap, cap * 5); + it.for_each(|v| builder.append_value(v)); + builder.finish() + } +} + #[cfg(test)] mod test { use super::*; @@ -221,4 +268,13 @@ mod test { let ca = builder.finish(); dbg!(ca); } + + #[cfg(feature = "dtype-binary")] + #[test] + fn test_list_binary_builder() { + let mut builder = ListBinaryChunkedBuilder::new("a", 10, 10); + builder.append_series(&Series::new("", &["foo".as_bytes(), "bar".as_bytes()])); + let ca = builder.finish(); + dbg!(ca); + } } diff --git a/polars/polars-core/src/chunked_array/cast.rs b/polars/polars-core/src/chunked_array/cast.rs index 9c2bbb850b01..567e08f90d8c 100644 --- a/polars/polars-core/src/chunked_array/cast.rs +++ b/polars/polars-core/src/chunked_array/cast.rs @@ -120,6 +120,17 @@ impl ChunkCast for Utf8Chunked { } } +#[cfg(feature = "dtype-binary")] +impl ChunkCast for BinaryChunked { + fn cast(&self, data_type: &DataType) -> PolarsResult { + cast_impl(self.name(), &self.chunks, data_type) + } + + fn cast_unchecked(&self, data_type: &DataType) -> PolarsResult { + self.cast(data_type) + } +} + fn boolean_to_utf8(ca: &BooleanChunked) -> Utf8Chunked { ca.into_iter() .map(|opt_b| match opt_b { diff --git a/polars/polars-core/src/chunked_array/comparison.rs b/polars/polars-core/src/chunked_array/comparison.rs index 6b99d078c610..b97ad98e4ffa 100644 --- a/polars/polars-core/src/chunked_array/comparison.rs +++ b/polars/polars-core/src/chunked_array/comparison.rs @@ -3,6 +3,8 @@ use std::ops::Not; use arrow::array::{BooleanArray, PrimitiveArray, Utf8Array}; use arrow::compute; use arrow::compute::comparison; +#[cfg(feature = "dtype-binary")] +use arrow::scalar::BinaryScalar; use arrow::scalar::{PrimitiveScalar, Scalar, Utf8Scalar}; use num::{NumCast, ToPrimitive}; use polars_arrow::prelude::FromData; @@ 
-636,6 +638,166 @@ impl ChunkCompare<&Utf8Chunked> for Utf8Chunked { } } +#[cfg(feature = "dtype-binary")] +impl BinaryChunked { + fn comparison( + &self, + rhs: &BinaryChunked, + f: impl Fn(&BinaryArray, &BinaryArray) -> BooleanArray, + ) -> BooleanChunked { + let chunks = self + .downcast_iter() + .zip(rhs.downcast_iter()) + .map(|(left, right)| { + let arr = f(left, right); + Box::new(arr) as ArrayRef + }) + .collect(); + BooleanChunked::from_chunks("", chunks) + } +} + +#[cfg(feature = "dtype-binary")] +impl ChunkCompare<&BinaryChunked> for BinaryChunked { + type Item = BooleanChunked; + + fn eq_missing(&self, rhs: &BinaryChunked) -> BooleanChunked { + impl_eq_missing!(self, rhs) + } + + fn equal(&self, rhs: &BinaryChunked) -> BooleanChunked { + // broadcast: a single-element side is compared against every element of the other side + if rhs.len() == 1 { + if let Some(value) = rhs.get(0) { + self.equal(value) + } else { + BooleanChunked::full("", false, self.len()) + } + } else if self.len() == 1 { + if let Some(value) = self.get(0) { + rhs.equal(value) + } else { + BooleanChunked::full("", false, rhs.len()) + } + } else { + let (lhs, rhs) = align_chunks_binary(self, rhs); + lhs.comparison(&rhs, comparison::binary::eq_and_validity) + } + } + + fn not_equal(&self, rhs: &BinaryChunked) -> BooleanChunked { + // broadcast: a single-element side is compared against every element of the other side + if rhs.len() == 1 { + if let Some(value) = rhs.get(0) { + self.not_equal(value) + } else { + BooleanChunked::full("", false, self.len()) + } + } else if self.len() == 1 { + if let Some(value) = self.get(0) { + rhs.not_equal(value) + } else { + BooleanChunked::full("", false, rhs.len()) + } + } else { + let (lhs, rhs) = align_chunks_binary(self, rhs); + lhs.comparison(&rhs, comparison::binary::neq_and_validity) + } + } + + fn gt(&self, rhs: &BinaryChunked) -> BooleanChunked { + // broadcast: a single-element side is compared against every element of the other side + if rhs.len() == 1 { + if let Some(value) = rhs.get(0) { + self.gt(value) + } else { + BooleanChunked::full("", false, self.len()) + } + } else if self.len() == 1 { + if let Some(value) = self.get(0) { + rhs.lt(value) + } else { + 
BooleanChunked::full("", false, rhs.len()) + } + } + // chunk lengths line up pairwise: compare chunk by chunk + else if self.chunk_id().zip(rhs.chunk_id()).all(|(l, r)| l == r) { + self.comparison(rhs, |l, r| comparison::gt(l, r)) + } else { + apply_operand_on_chunkedarray_by_iter!(self, rhs, >) + } + } + + fn gt_eq(&self, rhs: &BinaryChunked) -> BooleanChunked { + // broadcast: a single-element side is compared against every element of the other side + if rhs.len() == 1 { + if let Some(value) = rhs.get(0) { + self.gt_eq(value) + } else { + BooleanChunked::full("", false, self.len()) + } + } else if self.len() == 1 { + if let Some(value) = self.get(0) { + rhs.lt_eq(value) + } else { + BooleanChunked::full("", false, rhs.len()) + } + } + // chunk lengths line up pairwise: compare chunk by chunk + else if self.chunk_id().zip(rhs.chunk_id()).all(|(l, r)| l == r) { + self.comparison(rhs, |l, r| comparison::gt_eq(l, r)) + } else { + apply_operand_on_chunkedarray_by_iter!(self, rhs, >=) + } + } + + fn lt(&self, rhs: &BinaryChunked) -> BooleanChunked { + // broadcast: a single-element side is compared against every element of the other side + if rhs.len() == 1 { + if let Some(value) = rhs.get(0) { + self.lt(value) + } else { + BooleanChunked::full("", false, self.len()) + } + } else if self.len() == 1 { + if let Some(value) = self.get(0) { + rhs.gt(value) + } else { + BooleanChunked::full("", false, rhs.len()) + } + } + // chunk lengths line up pairwise: compare chunk by chunk + else if self.chunk_id().zip(rhs.chunk_id()).all(|(l, r)| l == r) { + self.comparison(rhs, |l, r| comparison::lt(l, r)) + } else { + apply_operand_on_chunkedarray_by_iter!(self, rhs, <) + } + } + + fn lt_eq(&self, rhs: &BinaryChunked) -> BooleanChunked { + // broadcast: a single-element side is compared against every element of the other side + if rhs.len() == 1 { + if let Some(value) = rhs.get(0) { + self.lt_eq(value) + } else { + BooleanChunked::full("", false, self.len()) + } + } else if self.len() == 1 { + if let Some(value) = self.get(0) { + rhs.gt_eq(value) + } else { + BooleanChunked::full("", false, rhs.len()) + } + } + // chunk lengths line up pairwise: compare chunk by chunk + else if self.chunk_id().zip(rhs.chunk_id()).all(|(l, r)| l == r) { + self.comparison(rhs, |l, r| comparison::lt_eq(l, r)) + } else { + apply_operand_on_chunkedarray_by_iter!(self, rhs, <=) + } + } +} 
+ impl ChunkedArray where T: PolarsNumericType, @@ -728,6 +890,49 @@ impl ChunkCompare<&str> for Utf8Chunked { } } +#[cfg(feature = "dtype-binary")] +impl BinaryChunked { + fn binary_compare_scalar( + &self, + rhs: &[u8], + f: impl Fn(&BinaryArray, &dyn Scalar) -> BooleanArray, + ) -> BooleanChunked { + let scalar = BinaryScalar::::new(Some(rhs)); + self.apply_kernel_cast(&|arr| Box::new(f(arr, &scalar))) + } +} + +#[cfg(feature = "dtype-binary")] +impl ChunkCompare<&[u8]> for BinaryChunked { + type Item = BooleanChunked; + fn eq_missing(&self, rhs: &[u8]) -> BooleanChunked { + self.equal(rhs) + } + + fn equal(&self, rhs: &[u8]) -> BooleanChunked { + self.binary_compare_scalar(rhs, |l, rhs| comparison::eq_scalar_and_validity(l, rhs)) + } + fn not_equal(&self, rhs: &[u8]) -> BooleanChunked { + self.binary_compare_scalar(rhs, |l, rhs| comparison::neq_scalar_and_validity(l, rhs)) + } + + fn gt(&self, rhs: &[u8]) -> BooleanChunked { + self.binary_compare_scalar(rhs, |l, rhs| comparison::gt_scalar(l, rhs)) + } + + fn gt_eq(&self, rhs: &[u8]) -> BooleanChunked { + self.binary_compare_scalar(rhs, |l, rhs| comparison::gt_eq_scalar(l, rhs)) + } + + fn lt(&self, rhs: &[u8]) -> BooleanChunked { + self.binary_compare_scalar(rhs, |l, rhs| comparison::lt_scalar(l, rhs)) + } + + fn lt_eq(&self, rhs: &[u8]) -> BooleanChunked { + self.binary_compare_scalar(rhs, |l, rhs| comparison::lt_eq_scalar(l, rhs)) + } +} + macro_rules! 
impl_cmp_list { ($self:ident, $rhs:ident, $cmp_method:ident) => {{ match ($self.has_validity(), $rhs.has_validity()) { @@ -897,6 +1102,16 @@ impl ChunkEqualElement for Utf8Chunked { } } +#[cfg(feature = "dtype-binary")] +impl ChunkEqualElement for BinaryChunked { + unsafe fn equal_element(&self, idx_self: usize, idx_other: usize, other: &Series) -> bool { + let ca_other = other.as_ref().as_ref(); + debug_assert!(self.dtype() == other.dtype()); + let ca_other = &*(ca_other as *const BinaryChunked); + self.get(idx_self) == ca_other.get(idx_other) + } +} + impl ChunkEqualElement for ListChunked {} #[cfg(feature = "dtype-struct")] diff --git a/polars/polars-core/src/chunked_array/iterator/mod.rs b/polars/polars-core/src/chunked_array/iterator/mod.rs index ac5b36f0b439..8f7fa9d34ba3 100644 --- a/polars/polars-core/src/chunked_array/iterator/mod.rs +++ b/polars/polars-core/src/chunked_array/iterator/mod.rs @@ -8,6 +8,8 @@ use crate::series::iterator::SeriesIter; use crate::utils::CustomIterTools; type LargeStringArray = Utf8Array; +#[cfg(feature = "dtype-binary")] +type LargeBinaryArray = BinaryArray; type LargeListArray = ListArray; pub mod par; @@ -209,6 +211,95 @@ impl Utf8Chunked { } } +#[cfg(feature = "dtype-binary")] +impl<'a> IntoIterator for &'a BinaryChunked { + type Item = Option<&'a [u8]>; + type IntoIter = Box + 'a>; + fn into_iter(self) -> Self::IntoIter { + // we know that we only iterate over length == self.len() + unsafe { Box::new(self.downcast_iter().flatten().trust_my_length(self.len())) } + } +} + +#[cfg(feature = "dtype-binary")] +pub struct BinaryIterNoNull<'a> { + array: &'a LargeBinaryArray, + current: usize, + current_end: usize, +} + +#[cfg(feature = "dtype-binary")] +impl<'a> BinaryIterNoNull<'a> { + /// create a new iterator + pub fn new(array: &'a LargeBinaryArray) -> Self { + BinaryIterNoNull { + array, + current: 0, + current_end: array.len(), + } + } +} + +#[cfg(feature = "dtype-binary")] +impl<'a> Iterator for BinaryIterNoNull<'a> { + 
type Item = &'a [u8]; + + fn next(&mut self) -> Option { + if self.current == self.current_end { + None + } else { + let old = self.current; + self.current += 1; + unsafe { Some(self.array.value_unchecked(old)) } + } + } + + fn size_hint(&self) -> (usize, Option) { + ( + self.current_end - self.current, + Some(self.current_end - self.current), + ) + } +} + +#[cfg(feature = "dtype-binary")] +impl<'a> DoubleEndedIterator for BinaryIterNoNull<'a> { + fn next_back(&mut self) -> Option { + if self.current_end == self.current { + None + } else { + self.current_end -= 1; + unsafe { Some(self.array.value_unchecked(self.current_end)) } + } + } +} + +#[cfg(feature = "dtype-binary")] +/// the remaining length is exactly `current_end - current`, as reported by `size_hint`. +impl<'a> ExactSizeIterator for BinaryIterNoNull<'a> {} + +#[cfg(feature = "dtype-binary")] +impl BinaryChunked { + #[allow(clippy::wrong_self_convention)] + #[doc(hidden)] + pub fn into_no_null_iter( + &self, + ) -> impl Iterator + + '_ + + Send + + Sync + + ExactSizeIterator + + DoubleEndedIterator + + TrustedLen { + // we know that we only iterate over length == self.len() + unsafe { + self.downcast_iter() + .flat_map(BinaryIterNoNull::new) + .trust_my_length(self.len()) + } + } +} + impl<'a> IntoIterator for &'a ListChunked { type Item = Option; type IntoIter = Box + 'a>; diff --git a/polars/polars-core/src/chunked_array/mod.rs b/polars/polars-core/src/chunked_array/mod.rs index d7e56b23a4e6..ce94cd3c5506 100644 --- a/polars/polars-core/src/chunked_array/mod.rs +++ b/polars/polars-core/src/chunked_array/mod.rs @@ -521,6 +521,8 @@ where impl AsSinglePtr for BooleanChunked {} impl AsSinglePtr for ListChunked {} impl AsSinglePtr for Utf8Chunked {} +#[cfg(feature = "dtype-binary")] +impl AsSinglePtr for BinaryChunked {} #[cfg(feature = "object")] impl AsSinglePtr for ObjectChunked {} @@ -601,6 +603,15 @@ impl ValueSize for Utf8Chunked { } } +#[cfg(feature = "dtype-binary")] +impl ValueSize for BinaryChunked { + fn get_values_size(&self) -> usize { + self.chunks + 
.iter() + .fold(0usize, |acc, arr| acc + arr.get_values_size()) + } +} + impl ListChunked { /// Get the inner data type of the list. pub fn inner_dtype(&self) -> DataType { diff --git a/polars/polars-core/src/chunked_array/ops/aggregate.rs b/polars/polars-core/src/chunked_array/ops/aggregate.rs index a26acd0da39a..5344760a578b 100644 --- a/polars/polars-core/src/chunked_array/ops/aggregate.rs +++ b/polars/polars-core/src/chunked_array/ops/aggregate.rs @@ -654,44 +654,6 @@ impl VarAggSeries for Float64Chunked { } } -impl VarAggSeries for BooleanChunked { - fn var_as_series(&self, _ddof: u8) -> Series { - Self::full_null(self.name(), 1).into_series() - } - - fn std_as_series(&self, _ddof: u8) -> Series { - Self::full_null(self.name(), 1).into_series() - } -} -impl VarAggSeries for ListChunked { - fn var_as_series(&self, _ddof: u8) -> Series { - Self::full_null(self.name(), 1).into_series() - } - - fn std_as_series(&self, _ddof: u8) -> Series { - Self::full_null(self.name(), 1).into_series() - } -} -#[cfg(feature = "object")] -impl VarAggSeries for ObjectChunked { - fn var_as_series(&self, _ddof: u8) -> Series { - unimplemented!() - } - - fn std_as_series(&self, _ddof: u8) -> Series { - unimplemented!() - } -} -impl VarAggSeries for Utf8Chunked { - fn var_as_series(&self, _ddof: u8) -> Series { - Self::full_null(self.name(), 1).into_series() - } - - fn std_as_series(&self, _ddof: u8) -> Series { - Self::full_null(self.name(), 1).into_series() - } -} - macro_rules! 
impl_quantile_as_series { ($self:expr, $agg:ident, $ty: ty, $qtl:expr, $opt:expr) => {{ let v = $self.$agg($qtl, $opt)?; @@ -750,60 +712,6 @@ impl QuantileAggSeries for Float64Chunked { } } -impl QuantileAggSeries for BooleanChunked { - fn quantile_as_series( - &self, - _quantile: f64, - _interpol: QuantileInterpolOptions, - ) -> PolarsResult { - Ok(Self::full_null(self.name(), 1).into_series()) - } - - fn median_as_series(&self) -> Series { - Self::full_null(self.name(), 1).into_series() - } -} -impl QuantileAggSeries for ListChunked { - fn quantile_as_series( - &self, - _quantile: f64, - _interpol: QuantileInterpolOptions, - ) -> PolarsResult { - Ok(Self::full_null(self.name(), 1).into_series()) - } - - fn median_as_series(&self) -> Series { - Self::full_null(self.name(), 1).into_series() - } -} -#[cfg(feature = "object")] -impl QuantileAggSeries for ObjectChunked { - fn quantile_as_series( - &self, - _quantile: f64, - _interpol: QuantileInterpolOptions, - ) -> PolarsResult { - unimplemented!() - } - - fn median_as_series(&self) -> Series { - unimplemented!() - } -} -impl QuantileAggSeries for Utf8Chunked { - fn quantile_as_series( - &self, - _quantile: f64, - _interpol: QuantileInterpolOptions, - ) -> PolarsResult { - Ok(Self::full_null(self.name(), 1).into_series()) - } - - fn median_as_series(&self) -> Series { - Self::full_null(self.name(), 1).into_series() - } -} - impl ChunkAggSeries for BooleanChunked { fn sum_as_series(&self) -> Series { let v = ChunkAgg::sum(self); @@ -849,6 +757,31 @@ impl ChunkAggSeries for Utf8Chunked { } } +#[cfg(feature = "dtype-binary")] +impl ChunkAggSeries for BinaryChunked { + fn sum_as_series(&self) -> Series { + BinaryChunked::full_null(self.name(), 1).into_series() + } + fn max_as_series(&self) -> Series { + Series::new( + self.name(), + &[self + .downcast_iter() + .filter_map(compute::aggregate::max_binary) + .fold_first_(|acc, v| if acc > v { acc } else { v })], + ) + } + fn min_as_series(&self) -> Series { + Series::new( + 
self.name(), + &[self + .downcast_iter() + .filter_map(compute::aggregate::min_binary) + .fold_first_(|acc, v| if acc < v { acc } else { v })], + ) + } +} + macro_rules! one_null_list { ($self:ident, $dtype: expr) => {{ let mut builder = get_list_builder(&$dtype, 0, 1, $self.name()).unwrap(); @@ -892,6 +825,8 @@ where impl ArgAgg for BooleanChunked {} impl ArgAgg for Utf8Chunked {} +#[cfg(feature = "dtype-binary")] +impl ArgAgg for BinaryChunked {} impl ArgAgg for ListChunked {} #[cfg(feature = "object")] diff --git a/polars/polars-core/src/chunked_array/ops/any_value.rs b/polars/polars-core/src/chunked_array/ops/any_value.rs index 9006c19ef6e0..3f891d34ed7d 100644 --- a/polars/polars-core/src/chunked_array/ops/any_value.rs +++ b/polars/polars-core/src/chunked_array/ops/any_value.rs @@ -30,6 +30,8 @@ pub(crate) unsafe fn arr_to_any_value<'a>( // TODO: insert types match dtype { DataType::Utf8 => downcast_and_pack!(LargeStringArray, Utf8), + #[cfg(feature = "dtype-binary")] + DataType::Binary => downcast_and_pack!(LargeBinaryArray, Binary), DataType::Boolean => downcast_and_pack!(BooleanArray, Boolean), DataType::UInt8 => downcast_and_pack!(UInt8Array, UInt8), DataType::UInt16 => downcast_and_pack!(UInt16Array, UInt16), @@ -166,6 +168,18 @@ impl ChunkAnyValue for Utf8Chunked { } } +#[cfg(feature = "dtype-binary")] +impl ChunkAnyValue for BinaryChunked { + #[inline] + unsafe fn get_any_value_unchecked(&self, index: usize) -> AnyValue { + get_any_value_unchecked!(self, index) + } + + fn get_any_value(&self, index: usize) -> AnyValue { + get_any_value!(self, index) + } +} + impl ChunkAnyValue for ListChunked { #[inline] unsafe fn get_any_value_unchecked(&self, index: usize) -> AnyValue { diff --git a/polars/polars-core/src/chunked_array/ops/append.rs b/polars/polars-core/src/chunked_array/ops/append.rs index 5c5b9886cf25..e6018de0614d 100644 --- a/polars/polars-core/src/chunked_array/ops/append.rs +++ b/polars/polars-core/src/chunked_array/ops/append.rs @@ -44,6 +44,17 
@@ impl Utf8Chunked { } } +#[cfg(feature = "dtype-binary")] +#[doc(hidden)] +impl BinaryChunked { + pub fn append(&mut self, other: &Self) { + let len = self.len(); + self.length += other.length; + new_chunks(&mut self.chunks, &other.chunks, len); + self.set_sorted2(IsSorted::Not); + } +} + #[doc(hidden)] impl ListChunked { pub fn append(&mut self, other: &Self) -> PolarsResult<()> { diff --git a/polars/polars-core/src/chunked_array/ops/apply.rs b/polars/polars-core/src/chunked_array/ops/apply.rs index 686ae46b57d2..271445fc497c 100644 --- a/polars/polars-core/src/chunked_array/ops/apply.rs +++ b/polars/polars-core/src/chunked_array/ops/apply.rs @@ -446,6 +446,100 @@ impl<'a> ChunkApply<'a, &'a str, Cow<'a, str>> for Utf8Chunked { } } +#[cfg(feature = "dtype-binary")] +impl<'a> ChunkApply<'a, &'a [u8], Cow<'a, [u8]>> for BinaryChunked { + fn apply_cast_numeric(&'a self, f: F) -> ChunkedArray + where + F: Fn(&'a [u8]) -> S::Native + Copy, + S: PolarsNumericType, + { + let chunks = self + .downcast_iter() + .into_iter() + .map(|array| { + let values = array.values_iter().map(f); + let values = Vec::<_>::from_trusted_len_iter(values); + to_array::(values, array.validity().cloned()) + }) + .collect(); + ChunkedArray::from_chunks(self.name(), chunks) + } + + fn branch_apply_cast_numeric_no_null(&'a self, f: F) -> ChunkedArray + where + F: Fn(Option<&'a [u8]>) -> S::Native + Copy, + S: PolarsNumericType, + { + let chunks = self + .downcast_iter() + .into_iter() + .map(|array| { + let values = array.into_iter().map(f); + let values = Vec::<_>::from_trusted_len_iter(values); + to_array::(values, array.validity().cloned()) + }) + .collect(); + ChunkedArray::from_chunks(self.name(), chunks) + } + + fn apply(&'a self, f: F) -> Self + where + F: Fn(&'a [u8]) -> Cow<'a, [u8]> + Copy, + { + apply!(self, f) + } + + fn try_apply(&'a self, f: F) -> PolarsResult + where + F: Fn(&'a [u8]) -> PolarsResult> + Copy, + { + try_apply!(self, f) + } + + fn apply_on_opt(&'a self, f: F) -> 
Self + where + F: Fn(Option<&'a [u8]>) -> Option> + Copy, + { + let mut ca: Self = self.into_iter().map(f).collect_trusted(); + ca.rename(self.name()); + ca + } + + fn apply_with_idx(&'a self, f: F) -> Self + where + F: Fn((usize, &'a [u8])) -> Cow<'a, [u8]> + Copy, + { + apply_enumerate!(self, f) + } + + fn apply_with_idx_on_opt(&'a self, f: F) -> Self + where + F: Fn((usize, Option<&'a [u8]>)) -> Option> + Copy, + { + let mut ca: Self = self.into_iter().enumerate().map(f).collect_trusted(); + ca.rename(self.name()); + ca + } + + fn apply_to_slice(&'a self, f: F, slice: &mut [T]) + where + F: Fn(Option<&'a [u8]>, &T) -> T, + { + assert!(slice.len() >= self.len()); + + let mut idx = 0; + self.downcast_iter().for_each(|arr| { + arr.into_iter().for_each(|opt_val| { + // Safety: + // length asserted above + let item = unsafe { slice.get_unchecked_mut(idx) }; + *item = f(opt_val, item); + idx += 1; + }) + }); + } +} + impl ChunkApplyKernel for BooleanChunked { fn apply_kernel(&self, f: &dyn Fn(&BooleanArray) -> ArrayRef) -> Self { let chunks = self.downcast_iter().into_iter().map(f).collect(); @@ -494,6 +588,21 @@ impl ChunkApplyKernel for Utf8Chunked { } } +#[cfg(feature = "dtype-binary")] +impl ChunkApplyKernel for BinaryChunked { + fn apply_kernel(&self, f: &dyn Fn(&LargeBinaryArray) -> ArrayRef) -> Self { + self.apply_kernel_cast(&f) + } + + fn apply_kernel_cast(&self, f: &dyn Fn(&LargeBinaryArray) -> ArrayRef) -> ChunkedArray + where + S: PolarsDataType, + { + let chunks = self.downcast_iter().into_iter().map(f).collect(); + ChunkedArray::from_chunks(self.name(), chunks) + } +} + impl<'a> ChunkApply<'a, Series, Series> for ListChunked { fn apply_cast_numeric(&self, f: F) -> ChunkedArray where diff --git a/polars/polars-core/src/chunked_array/ops/compare_inner.rs b/polars/polars-core/src/chunked_array/ops/compare_inner.rs index 62edadeafaaf..21a2609e3eba 100644 --- a/polars/polars-core/src/chunked_array/ops/compare_inner.rs +++ 
b/polars/polars-core/src/chunked_array/ops/compare_inner.rs @@ -4,6 +4,8 @@ use std::cmp::{Ordering, PartialEq}; +#[cfg(feature = "dtype-binary")] +use crate::chunked_array::ops::take::take_random::{BinaryTakeRandom, BinaryTakeRandomSingleChunk}; use crate::chunked_array::ops::take::take_random::{ BoolTakeRandom, BoolTakeRandomSingleChunk, NumTakeRandomChunked, NumTakeRandomCont, NumTakeRandomSingleChunk, Utf8TakeRandom, Utf8TakeRandomSingleChunk, @@ -69,6 +71,10 @@ macro_rules! impl_traits { impl_traits!(Utf8TakeRandom<'_>); impl_traits!(Utf8TakeRandomSingleChunk<'_>); +#[cfg(feature = "dtype-binary")] +impl_traits!(BinaryTakeRandom<'_>); +#[cfg(feature = "dtype-binary")] +impl_traits!(BinaryTakeRandomSingleChunk<'_>); impl_traits!(BoolTakeRandom<'_>); impl_traits!(BoolTakeRandomSingleChunk<'_>); impl_traits!(NumTakeRandomSingleChunk<'_, T>, T); @@ -140,6 +146,27 @@ impl<'a> IntoPartialEqInner<'a> for &'a Utf8Chunked { } } +#[cfg(feature = "dtype-binary")] +impl<'a> IntoPartialEqInner<'a> for &'a BinaryChunked { + fn into_partial_eq_inner(self) -> Box { + match self.chunks.len() { + 1 => { + let arr = self.downcast_iter().next().unwrap(); + let t = BinaryTakeRandomSingleChunk { arr }; + Box::new(t) + } + _ => { + let chunks = self.downcast_chunks(); + let t = BinaryTakeRandom { + chunks, + chunk_lens: self.chunks.iter().map(|a| a.len() as IdxSize).collect(), + }; + Box::new(t) + } + } + } +} + impl<'a> IntoPartialEqInner<'a> for &'a BooleanChunked { fn into_partial_eq_inner(self) -> Box { match self.chunks.len() { @@ -240,6 +267,27 @@ impl<'a> IntoPartialOrdInner<'a> for &'a Utf8Chunked { } } +#[cfg(feature = "dtype-binary")] +impl<'a> IntoPartialOrdInner<'a> for &'a BinaryChunked { + fn into_partial_ord_inner(self) -> Box { + match self.chunks.len() { + 1 => { + let arr = self.downcast_iter().next().unwrap(); + let t = BinaryTakeRandomSingleChunk { arr }; + Box::new(t) + } + _ => { + let chunks = self.downcast_chunks(); + let t = BinaryTakeRandom { + chunks, + 
chunk_lens: self.chunks.iter().map(|a| a.len() as IdxSize).collect(), + }; + Box::new(t) + } + } + } +} + impl<'a> IntoPartialOrdInner<'a> for &'a BooleanChunked { fn into_partial_ord_inner(self) -> Box { match self.chunks.len() { diff --git a/polars/polars-core/src/chunked_array/ops/downcast.rs b/polars/polars-core/src/chunked_array/ops/downcast.rs index 520844b03e9b..a75ccced1144 100644 --- a/polars/polars-core/src/chunked_array/ops/downcast.rs +++ b/polars/polars-core/src/chunked_array/ops/downcast.rs @@ -131,6 +131,32 @@ impl Utf8Chunked { } } +#[cfg(feature = "dtype-binary")] +#[doc(hidden)] +impl BinaryChunked { + pub fn downcast_iter(&self) -> impl Iterator> + DoubleEndedIterator { + // Safety: + // This is the array type that must be in a BinaryChunked + self.chunks.iter().map(|arr| { + // Safety: + // This should be the array type in BinaryChunked + let arr = &**arr; + unsafe { &*(arr as *const dyn Array as *const BinaryArray) } + }) + } + pub fn downcast_chunks(&self) -> Chunks<'_, BinaryArray> { + Chunks::new(&self.chunks) + } + + #[inline] + pub(crate) fn index_to_chunked_index(&self, index: usize) -> (usize, usize) { + if self.chunks.len() == 1 { + return (0, index); + } + index_to_chunked_index(self.downcast_iter().map(|arr| arr.len()), index) + } +} + #[doc(hidden)] impl ListChunked { pub fn downcast_iter(&self) -> impl Iterator> + DoubleEndedIterator { diff --git a/polars/polars-core/src/chunked_array/ops/explode.rs b/polars/polars-core/src/chunked_array/ops/explode.rs index e28bc381a6a6..e380ed3d153e 100644 --- a/polars/polars-core/src/chunked_array/ops/explode.rs +++ b/polars/polars-core/src/chunked_array/ops/explode.rs @@ -280,6 +280,48 @@ impl ExplodeByOffsets for Utf8Chunked { } } +#[cfg(feature = "dtype-binary")] +impl ExplodeByOffsets for BinaryChunked { + fn explode_by_offsets(&self, offsets: &[i64]) -> Series { + debug_assert_eq!(self.chunks.len(), 1); + let arr = self.downcast_iter().next().unwrap(); + + let cap = ((arr.len() as f32) * 
1.5) as usize; + let bytes_size = self.get_values_size(); + let mut builder = BinaryChunkedBuilder::new(self.name(), cap, bytes_size); + + let mut start = offsets[0] as usize; + let mut last = start; + for &o in &offsets[1..] { + let o = o as usize; + if o == last { + if start != last { + let vals = arr.slice(start, last - start); + if vals.null_count() == 0 { + builder + .builder + .extend_trusted_len_values(vals.values_iter()) + } else { + builder.builder.extend_trusted_len(vals.into_iter()); + } + } + builder.append_null(); + start = o; + } + last = o; + } + let vals = arr.slice(start, last - start); + if vals.null_count() == 0 { + builder + .builder + .extend_trusted_len_values(vals.values_iter()) + } else { + builder.builder.extend_trusted_len(vals.into_iter()); + } + builder.finish().into() + } +} + /// Convert Arrow array offsets to indexes of the original list pub(crate) fn offsets_to_indexes(offsets: &[i64], capacity: usize) -> Vec { if offsets.is_empty() { diff --git a/polars/polars-core/src/chunked_array/ops/extend.rs b/polars/polars-core/src/chunked_array/ops/extend.rs index 18233565d1db..2fa8330c9075 100644 --- a/polars/polars-core/src/chunked_array/ops/extend.rs +++ b/polars/polars-core/src/chunked_array/ops/extend.rs @@ -120,6 +120,44 @@ impl Utf8Chunked { } } +#[cfg(feature = "dtype-binary")] +#[doc(hidden)] +impl BinaryChunked { + pub fn extend(&mut self, other: &Self) { + if self.chunks.len() > 1 { + self.append(other); + *self = self.rechunk(); + return; + } + let arr = self.downcast_iter().next().unwrap(); + + // increments 1 + let arr = arr.clone(); + + // now we drop our owned ArrayRefs so that + // decrements 1 + { + self.chunks.clear(); + } + + use Either::*; + + match arr.into_mut() { + Left(immutable) => { + extend_immutable(&immutable, &mut self.chunks, &other.chunks); + } + Right(mut mutable) => { + for arr in other.downcast_iter() { + mutable.extend_trusted_len(arr.into_iter()) + } + let arr: BinaryArray = mutable.into(); + 
self.chunks.push(Box::new(arr) as ArrayRef) + } + } + self.compute_len(); + } +} + #[doc(hidden)] impl BooleanChunked { pub fn extend(&mut self, other: &Self) { diff --git a/polars/polars-core/src/chunked_array/ops/fill_null.rs b/polars/polars-core/src/chunked_array/ops/fill_null.rs index a6bd4ca95105..2ed7723c3895 100644 --- a/polars/polars-core/src/chunked_array/ops/fill_null.rs +++ b/polars/polars-core/src/chunked_array/ops/fill_null.rs @@ -107,6 +107,32 @@ fn fill_backward_limit_utf8(ca: &Utf8Chunked, limit: IdxSize) -> Utf8Chunked { out.into_iter().rev().collect_trusted() } +#[cfg(feature = "dtype-binary")] +fn fill_backward_limit_binary(ca: &BinaryChunked, limit: IdxSize) -> BinaryChunked { + let mut cnt = 0; + let mut previous = None; + let out: BinaryChunked = ca + .into_iter() + .rev() + .map(|opt_v| match opt_v { + Some(v) => { + cnt = 0; + previous = Some(v); + Some(v) + } + None => { + if cnt < limit { + cnt += 1; + previous + } else { + None + } + } + }) + .collect_trusted(); + out.into_iter().rev().collect_trusted() +} + fn fill_forward(ca: &ChunkedArray) -> ChunkedArray where T: PolarsNumericType, @@ -345,6 +371,44 @@ impl ChunkFillNullValue<&str> for Utf8Chunked { } } +#[cfg(feature = "dtype-binary")] +impl ChunkFillNull for BinaryChunked { + fn fill_null(&self, strategy: FillNullStrategy) -> PolarsResult { + // nothing to fill + if !self.has_validity() { + return Ok(self.clone()); + } + match strategy { + FillNullStrategy::Forward(limit) => { + let mut out: Self = match limit { + Some(limit) => impl_fill_forward_limit!(self, limit), + None => impl_fill_forward!(self), + }; + out.rename(self.name()); + Ok(out) + } + FillNullStrategy::Backward(limit) => { + let mut out = match limit { + None => impl_fill_backward!(self, BinaryChunked), + Some(limit) => fill_backward_limit_binary(self, limit), + }; + out.rename(self.name()); + Ok(out) + } + strat => Err(PolarsError::InvalidOperation( + format!("Strategy {:?} not supported", strat).into(), + )), + } + 
} +} + +#[cfg(feature = "dtype-binary")] +impl ChunkFillNullValue<&[u8]> for BinaryChunked { + fn fill_null_with_values(&self, value: &[u8]) -> PolarsResult { + self.set(&self.is_null(), Some(value)) + } +} + impl ChunkFillNull for ListChunked { fn fill_null(&self, _strategy: FillNullStrategy) -> PolarsResult { Err(PolarsError::InvalidOperation( diff --git a/polars/polars-core/src/chunked_array/ops/filter.rs b/polars/polars-core/src/chunked_array/ops/filter.rs index b729513ddffa..a4d76571b534 100644 --- a/polars/polars-core/src/chunked_array/ops/filter.rs +++ b/polars/polars-core/src/chunked_array/ops/filter.rs @@ -93,6 +93,29 @@ impl ChunkFilter for Utf8Chunked { } } +#[cfg(feature = "dtype-binary")] +impl ChunkFilter for BinaryChunked { + fn filter(&self, filter: &BooleanChunked) -> PolarsResult> { + // broadcast + if filter.len() == 1 { + return match filter.get(0) { + Some(true) => Ok(self.clone()), + _ => Ok(BinaryChunked::full_null(self.name(), 0)), + }; + } + check_filter_len!(self, filter); + let (left, filter) = align_chunks_binary(self, filter); + + let chunks = left + .downcast_iter() + .zip(filter.downcast_iter()) + .map(|(left, mask)| filter_fn(left, mask).unwrap()) + .collect::>(); + + Ok(self.copy_with_chunks(chunks, true)) + } +} + impl ChunkFilter for ListChunked { fn filter(&self, filter: &BooleanChunked) -> PolarsResult { // broadcast diff --git a/polars/polars-core/src/chunked_array/ops/full.rs b/polars/polars-core/src/chunked_array/ops/full.rs index cb2840488f05..4e1e88c2cd08 100644 --- a/polars/polars-core/src/chunked_array/ops/full.rs +++ b/polars/polars-core/src/chunked_array/ops/full.rs @@ -64,6 +64,28 @@ impl ChunkFullNull for Utf8Chunked { } } +#[cfg(feature = "dtype-binary")] +impl<'a> ChunkFull<&'a [u8]> for BinaryChunked { + fn full(name: &str, value: &'a [u8], length: usize) -> Self { + let mut builder = BinaryChunkedBuilder::new(name, length, length * value.len()); + + for _ in 0..length { + builder.append_value(value); + } + let mut 
out = builder.finish(); + out.set_sorted2(IsSorted::Ascending); + out + } +} + +#[cfg(feature = "dtype-binary")] +impl ChunkFullNull for BinaryChunked { + fn full_null(name: &str, length: usize) -> Self { + let arr = new_null_array(DataType::Binary.to_arrow(), length); + BinaryChunked::from_chunks(name, vec![arr]) + } +} + impl ChunkFull<&Series> for ListChunked { fn full(name: &str, value: &Series, length: usize) -> ListChunked { let mut builder = diff --git a/polars/polars-core/src/chunked_array/ops/is_in.rs b/polars/polars-core/src/chunked_array/ops/is_in.rs index c08bea016894..932cd298685e 100644 --- a/polars/polars-core/src/chunked_array/ops/is_in.rs +++ b/polars/polars-core/src/chunked_array/ops/is_in.rs @@ -230,6 +230,70 @@ impl IsIn for Utf8Chunked { } } +#[cfg(feature = "dtype-binary")] +impl IsIn for BinaryChunked { + fn is_in(&self, other: &Series) -> PolarsResult { + match other.dtype() { + DataType::List(dt) if DataType::Binary == **dt => { + let mut ca: BooleanChunked = if self.len() == 1 && other.len() != 1 { + let value = self.get(0); + other + .list()? 
+ .amortized_iter() + .map(|opt_b| { + opt_b.map(|s| { + let ca = s.as_ref().unpack::().unwrap(); + ca.into_iter().any(|a| a == value) + }) == Some(true) + }) + .collect_trusted() + } else { + self.into_iter() + .zip(other.list()?.amortized_iter()) + .map(|(value, series)| match (value, series) { + (val, Some(series)) => { + let ca = series.as_ref().unpack::().unwrap(); + ca.into_iter().any(|a| a == val) + } + _ => false, + }) + .collect_trusted() + }; + ca.rename(self.name()); + Ok(ca) + } + DataType::Binary => { + let mut set = HashSet::with_capacity(other.len()); + + let other = other.binary()?; + other.downcast_iter().for_each(|iter| { + iter.into_iter().for_each(|opt_val| { + set.insert(opt_val); + }) + }); + let mut ca: BooleanChunked = self + .into_iter() + .map(|opt_val| set.contains(&opt_val)) + .collect_trusted(); + ca.rename(self.name()); + Ok(ca) + } + _ => Err(PolarsError::SchemaMisMatch( + format!( + "cannot do is_in operation with left a dtype: {:?} and right a dtype {:?}", + self.dtype(), + other.dtype() + ) + .into(), + )), + } + .map(|mut ca| { + ca.rename(self.name()); + ca + }) + } +} + impl IsIn for BooleanChunked { fn is_in(&self, other: &Series) -> PolarsResult { match other.dtype() { diff --git a/polars/polars-core/src/chunked_array/ops/mod.rs b/polars/polars-core/src/chunked_array/ops/mod.rs index 507f8987b51c..3f441c509011 100644 --- a/polars/polars-core/src/chunked_array/ops/mod.rs +++ b/polars/polars-core/src/chunked_array/ops/mod.rs @@ -639,6 +639,13 @@ impl ChunkExpandAtIndex for Utf8Chunked { } } +#[cfg(feature = "dtype-binary")] +impl ChunkExpandAtIndex for BinaryChunked { + fn expand_at_index(&self, index: usize, length: usize) -> BinaryChunked { + impl_chunk_expand!(self, length, index) + } +} + impl ChunkExpandAtIndex for ListChunked { fn expand_at_index(&self, index: usize, length: usize) -> ListChunked { let opt_val = self.get(index); diff --git a/polars/polars-core/src/chunked_array/ops/repeat_by.rs 
b/polars/polars-core/src/chunked_array/ops/repeat_by.rs index 6e220a5dfdba..2d31d30e24b6 100644 --- a/polars/polars-core/src/chunked_array/ops/repeat_by.rs +++ b/polars/polars-core/src/chunked_array/ops/repeat_by.rs @@ -63,3 +63,21 @@ impl RepeatBy for Utf8Chunked { ) } } +#[cfg(feature = "dtype-binary")] +impl RepeatBy for BinaryChunked { + fn repeat_by(&self, by: &IdxCa) -> ListChunked { + let iter = self + .into_iter() + .zip(by.into_iter()) + .map(|(opt_v, opt_by)| opt_by.map(|by| std::iter::repeat(opt_v).take(by as usize))); + + // Safety: + // Length of iter is trusted + ListChunked::from_chunks( + self.name(), + vec![Box::new(unsafe { + LargeListArray::from_iter_binary_trusted_len(iter, self.len()) + })], + ) + } +} diff --git a/polars/polars-core/src/chunked_array/ops/reverse.rs b/polars/polars-core/src/chunked_array/ops/reverse.rs index e435f71589c7..f3ec1f0e2c66 100644 --- a/polars/polars-core/src/chunked_array/ops/reverse.rs +++ b/polars/polars-core/src/chunked_array/ops/reverse.rs @@ -39,6 +39,8 @@ macro_rules! 
impl_reverse { impl_reverse!(BooleanType, BooleanChunked); impl_reverse!(Utf8Type, Utf8Chunked); +#[cfg(feature = "dtype-binary")] +impl_reverse!(BinaryType, BinaryChunked); impl_reverse!(ListType, ListChunked); #[cfg(feature = "object")] diff --git a/polars/polars-core/src/chunked_array/ops/set.rs b/polars/polars-core/src/chunked_array/ops/set.rs index d4f7c831930e..edd4001ae706 100644 --- a/polars/polars-core/src/chunked_array/ops/set.rs +++ b/polars/polars-core/src/chunked_array/ops/set.rs @@ -273,6 +273,81 @@ impl<'a> ChunkSet<'a, &'a str, String> for Utf8Chunked { } } +#[cfg(feature = "dtype-binary")] +impl<'a> ChunkSet<'a, &'a [u8], Vec> for BinaryChunked { + fn set_at_idx>( + &'a self, + idx: I, + opt_value: Option<&'a [u8]>, + ) -> PolarsResult + where + Self: Sized, + { + let idx_iter = idx.into_iter(); + let mut ca_iter = self.into_iter().enumerate(); + let mut builder = + BinaryChunkedBuilder::new(self.name(), self.len(), self.get_values_size()); + + for current_idx in idx_iter { + if current_idx as usize > self.len() { + return Err(PolarsError::ComputeError( + format!( + "index: {} outside of ChunkedArray with length: {}", + current_idx, + self.len() + ) + .into(), + )); + } + for (cnt_idx, opt_val_self) in &mut ca_iter { + if cnt_idx == current_idx as usize { + builder.append_option(opt_value); + break; + } else { + builder.append_option(opt_val_self); + } + } + } + // the last idx is probably not the last value so we finish the iterator + for (_, opt_val_self) in ca_iter { + builder.append_option(opt_val_self); + } + + let ca = builder.finish(); + Ok(ca) + } + + fn set_at_idx_with, F>( + &'a self, + idx: I, + f: F, + ) -> PolarsResult + where + Self: Sized, + F: Fn(Option<&'a [u8]>) -> Option>, + { + let mut builder = + BinaryChunkedBuilder::new(self.name(), self.len(), self.get_values_size()); + impl_set_at_idx_with!(self, builder, idx, f) + } + + fn set(&'a self, mask: &BooleanChunked, value: Option<&'a [u8]>) -> PolarsResult + where + Self: Sized, 
+ { + check_bounds!(self, mask); + let ca = mask + .into_iter() + .zip(self.into_iter()) + .map(|(mask_val, opt_val)| match mask_val { + Some(true) => value, + _ => opt_val, + }) + .collect_trusted(); + Ok(ca) + } +} + #[cfg(test)] mod test { use crate::prelude::*; diff --git a/polars/polars-core/src/chunked_array/ops/shift.rs b/polars/polars-core/src/chunked_array/ops/shift.rs index 1a2d1428c18a..450f62b3d322 100644 --- a/polars/polars-core/src/chunked_array/ops/shift.rs +++ b/polars/polars-core/src/chunked_array/ops/shift.rs @@ -60,12 +60,26 @@ impl ChunkShiftFill> for Utf8Chunked { } } +#[cfg(feature = "dtype-binary")] +impl ChunkShiftFill> for BinaryChunked { + fn shift_and_fill(&self, periods: i64, fill_value: Option<&[u8]>) -> BinaryChunked { + impl_shift_fill!(self, periods, fill_value) + } +} + impl ChunkShift for Utf8Chunked { fn shift(&self, periods: i64) -> Self { self.shift_and_fill(periods, None) } } +#[cfg(feature = "dtype-binary")] +impl ChunkShift for BinaryChunked { + fn shift(&self, periods: i64) -> Self { + self.shift_and_fill(periods, None) + } +} + impl ChunkShiftFill> for ListChunked { fn shift_and_fill(&self, periods: i64, fill_value: Option<&Series>) -> ListChunked { // This has its own implementation because a ListChunked cannot have a full-null without diff --git a/polars/polars-core/src/chunked_array/ops/sort/categorical.rs b/polars/polars-core/src/chunked_array/ops/sort/categorical.rs index 137008f2cf89..bbf56ef8f816 100644 --- a/polars/polars-core/src/chunked_array/ops/sort/categorical.rs +++ b/polars/polars-core/src/chunked_array/ops/sort/categorical.rs @@ -188,11 +188,13 @@ mod test { toggle_string_cache(toggle); let s = Series::new("", init).cast(&DataType::Categorical(None))?; let ca = s.categorical()?; - let mut ca_lexical = ca.clone(); + let mut ca_lexical: CategoricalChunked = ca.clone(); ca_lexical.set_lexical_sorted(true); + let series = ca_lexical.into_series(); + let df = df![ - "cat" => &ca_lexical.into_series(), + "cat" => 
&series, "vals" => [1, 1, 2, 2] ]?; diff --git a/polars/polars-core/src/chunked_array/ops/sort/mod.rs b/polars/polars-core/src/chunked_array/ops/sort/mod.rs index 9563d1115eee..f6be933758e0 100644 --- a/polars/polars-core/src/chunked_array/ops/sort/mod.rs +++ b/polars/polars-core/src/chunked_array/ops/sort/mod.rs @@ -508,6 +508,137 @@ impl ChunkSort for Utf8Chunked { } } +#[cfg(feature = "dtype-binary")] +impl ChunkSort for BinaryChunked { + fn sort_with(&self, options: SortOptions) -> ChunkedArray { + sort_with_fast_path!(self, options); + let mut v: Vec<&[u8]> = if self.null_count() > 0 { + Vec::from_iter(self.into_iter().flatten()) + } else { + Vec::from_iter(self.into_no_null_iter()) + }; + + sort_branch( + v.as_mut_slice(), + options.descending, + order_default, + order_reverse, + ); + + let mut values = Vec::::with_capacity(self.get_values_size()); + let mut offsets = Vec::::with_capacity(self.len() + 1); + let mut length_so_far = 0i64; + offsets.push(length_so_far); + + let len = self.len(); + let null_count = self.null_count(); + let mut ca: Self = match (null_count, options.nulls_last) { + (0, _) => { + for val in v { + values.extend_from_slice(val); + length_so_far = values.len() as i64; + offsets.push(length_so_far); + } + // Safety: + // we pass valid utf8 + let ar = unsafe { + BinaryArray::from_data_unchecked_default(offsets.into(), values.into(), None) + }; + (self.name(), ar).into() + } + (_, true) => { + for val in v { + values.extend_from_slice(val); + length_so_far = values.len() as i64; + offsets.push(length_so_far); + } + let mut validity = MutableBitmap::with_capacity(len); + validity.extend_constant(len - null_count, true); + validity.extend_constant(null_count, false); + offsets.extend(std::iter::repeat(length_so_far).take(null_count)); + + // Safety: + // we pass valid utf8 + let ar = unsafe { + BinaryArray::from_data_unchecked_default( + offsets.into(), + values.into(), + Some(validity.into()), + ) + }; + (self.name(), ar).into() + } + (_, 
false) => { + let mut validity = MutableBitmap::with_capacity(len); + validity.extend_constant(null_count, false); + validity.extend_constant(len - null_count, true); + offsets.extend(std::iter::repeat(length_so_far).take(null_count)); + + for val in v { + values.extend_from_slice(val); + length_so_far = values.len() as i64; + offsets.push(length_so_far); + } + + // Safety: + // we pass valid utf8 + let ar = unsafe { + BinaryArray::from_data_unchecked_default( + offsets.into(), + values.into(), + Some(validity.into()), + ) + }; + (self.name(), ar).into() + } + }; + + ca.set_sorted(options.descending); + ca + } + + fn sort(&self, reverse: bool) -> BinaryChunked { + self.sort_with(SortOptions { + descending: reverse, + nulls_last: false, + }) + } + + fn argsort(&self, options: SortOptions) -> IdxCa { + argsort::argsort( + self.name(), + self.downcast_iter().map(|arr| arr.iter()), + options, + self.null_count(), + self.len(), + ) + } + + #[cfg(feature = "sort_multiple")] + /// # Panics + /// + /// This function is very opinionated. On the implementation of `ChunkedArray` for numeric types, + /// we assume that all numeric `Series` are of the same type. + /// + /// In this case we assume that all numeric `Series` are `f64` types. The caller needs to + /// uphold this contract. If not, it will panic. 
+ /// + fn argsort_multiple(&self, other: &[Series], reverse: &[bool]) -> PolarsResult { + args_validate(self, other, reverse)?; + + let mut count: IdxSize = 0; + let vals: Vec<_> = self + .into_iter() + .map(|v| { + let i = count; + count += 1; + (i, v) + }) + .collect_trusted(); + argsort_multiple_impl(vals, other, reverse) + } +} + impl ChunkSort for BooleanChunked { fn sort_with(&self, options: SortOptions) -> ChunkedArray { sort_with_fast_path!(self, options); diff --git a/polars/polars-core/src/chunked_array/ops/take/mod.rs b/polars/polars-core/src/chunked_array/ops/take/mod.rs index 2edf6f96e410..78584b835462 100644 --- a/polars/polars-core/src/chunked_array/ops/take/mod.rs +++ b/polars/polars-core/src/chunked_array/ops/take/mod.rs @@ -323,6 +323,88 @@ impl ChunkTake for Utf8Chunked { } } +#[cfg(feature = "dtype-binary")] +impl ChunkTake for BinaryChunked { + unsafe fn take_unchecked(&self, indices: TakeIdx) -> Self + where + Self: std::marker::Sized, + I: TakeIterator, + INulls: TakeIteratorNulls, + { + let mut chunks = self.downcast_iter(); + match indices { + TakeIdx::Array(array) => { + if array.null_count() == array.len() { + return Self::full_null(self.name(), array.len()); + } + let array = match self.chunks.len() { + 1 => take_binary_unchecked(chunks.next().unwrap(), array) as ArrayRef, + _ => { + return if !array.has_validity() { + let iter = array.values().iter().map(|i| *i as usize); + let mut ca: BinaryChunked = take_iter_n_chunks_unchecked!(self, iter); + ca.rename(self.name()); + ca + } else { + let iter = array + .into_iter() + .map(|opt_idx| opt_idx.map(|idx| *idx as usize)); + let mut ca: BinaryChunked = + take_opt_iter_n_chunks_unchecked!(self, iter); + ca.rename(self.name()); + ca + } + } + }; + self.copy_with_chunks(vec![array], false) + } + TakeIdx::Iter(iter) => { + let array = match (self.has_validity(), self.chunks.len()) { + (false, 1) => { + take_no_null_binary_iter_unchecked(chunks.next().unwrap(), iter) as ArrayRef + } + (_, 1) => 
take_binary_iter_unchecked(chunks.next().unwrap(), iter) as ArrayRef, + _ => { + let mut ca: BinaryChunked = take_iter_n_chunks_unchecked!(self, iter); + ca.rename(self.name()); + return ca; + } + }; + self.copy_with_chunks(vec![array], false) + } + TakeIdx::IterNulls(iter) => { + let array = match (self.has_validity(), self.chunks.len()) { + (false, 1) => { + take_no_null_binary_opt_iter_unchecked(chunks.next().unwrap(), iter) + as ArrayRef + } + (_, 1) => { + take_binary_opt_iter_unchecked(chunks.next().unwrap(), iter) as ArrayRef + } + _ => { + let mut ca: BinaryChunked = take_opt_iter_n_chunks_unchecked!(self, iter); + ca.rename(self.name()); + return ca; + } + }; + self.copy_with_chunks(vec![array], false) + } + } + } + + fn take(&self, indices: TakeIdx) -> PolarsResult + where + Self: std::marker::Sized, + I: TakeIterator, + INulls: TakeIteratorNulls, + { + indices.check_bounds(self.len())?; + // Safety: + // just checked bounds + Ok(unsafe { self.take_unchecked(indices) }) + } +} + impl ChunkTake for ListChunked { unsafe fn take_unchecked(&self, indices: TakeIdx) -> Self where diff --git a/polars/polars-core/src/chunked_array/ops/take/take_chunked.rs b/polars/polars-core/src/chunked_array/ops/take/take_chunked.rs index c079974a591f..778bb596ac65 100644 --- a/polars/polars-core/src/chunked_array/ops/take/take_chunked.rs +++ b/polars/polars-core/src/chunked_array/ops/take/take_chunked.rs @@ -90,6 +90,39 @@ impl TakeChunked for Utf8Chunked { } } +#[cfg(feature = "dtype-binary")] +impl TakeChunked for BinaryChunked { + unsafe fn take_chunked_unchecked(&self, by: &[ChunkId], sorted: IsSorted) -> Self { + let arrs = self.downcast_iter().collect::>(); + let mut ca: Self = by + .iter() + .map(|[chunk_idx, array_idx]| { + let arr = arrs.get_unchecked(*chunk_idx as usize); + arr.get_unchecked(*array_idx as usize) + }) + .collect_trusted(); + ca.rename(self.name()); + ca.set_sorted2(sorted); + ca + } + + unsafe fn take_opt_chunked_unchecked(&self, by: &[Option]) -> 
Self { + let arrs = self.downcast_iter().collect::>(); + let mut ca: Self = by + .iter() + .map(|opt_idx| { + opt_idx.and_then(|[chunk_idx, array_idx]| { + let arr = arrs.get_unchecked(chunk_idx as usize); + arr.get_unchecked(array_idx as usize) + }) + }) + .collect_trusted(); + + ca.rename(self.name()); + ca + } +} + impl TakeChunked for BooleanChunked { unsafe fn take_chunked_unchecked(&self, by: &[ChunkId], sorted: IsSorted) -> Self { let arrs = self.downcast_iter().collect::>(); diff --git a/polars/polars-core/src/chunked_array/ops/take/take_every.rs b/polars/polars-core/src/chunked_array/ops/take/take_every.rs index 0b092da0b190..be5c361943b0 100644 --- a/polars/polars-core/src/chunked_array/ops/take/take_every.rs +++ b/polars/polars-core/src/chunked_array/ops/take/take_every.rs @@ -41,6 +41,19 @@ impl ChunkTakeEvery for Utf8Chunked { } } +#[cfg(feature = "dtype-binary")] +impl ChunkTakeEvery for BinaryChunked { + fn take_every(&self, n: usize) -> BinaryChunked { + let mut ca: Self = if !self.has_validity() { + self.into_no_null_iter().step_by(n).collect() + } else { + self.into_iter().step_by(n).collect() + }; + ca.rename(self.name()); + ca + } +} + impl ChunkTakeEvery for ListChunked { fn take_every(&self, n: usize) -> ListChunked { let mut ca: Self = if !self.has_validity() { diff --git a/polars/polars-core/src/chunked_array/ops/take/take_random.rs b/polars/polars-core/src/chunked_array/ops/take/take_random.rs index 0b0c51f84de2..dfcabe358691 100644 --- a/polars/polars-core/src/chunked_array/ops/take/take_random.rs +++ b/polars/polars-core/src/chunked_array/ops/take/take_random.rs @@ -232,6 +232,75 @@ impl<'a> IntoTakeRandom<'a> for &'a Utf8Chunked { } } +#[cfg(feature = "dtype-binary")] +pub struct BinaryTakeRandom<'a> { + pub(crate) chunks: Chunks<'a, BinaryArray>, + pub(crate) chunk_lens: Vec, +} + +#[cfg(feature = "dtype-binary")] +impl<'a> TakeRandom for BinaryTakeRandom<'a> { + type Item = &'a [u8]; + + #[inline] + fn get(&self, index: usize) -> 
Option { + take_random_get!(self, index) + } + + #[inline] + unsafe fn get_unchecked(&self, index: usize) -> Option { + take_random_get_unchecked!(self, index) + } +} + +#[cfg(feature = "dtype-binary")] +pub struct BinaryTakeRandomSingleChunk<'a> { + pub(crate) arr: &'a BinaryArray, +} + +#[cfg(feature = "dtype-binary")] +impl<'a> TakeRandom for BinaryTakeRandomSingleChunk<'a> { + type Item = &'a [u8]; + + #[inline] + fn get(&self, index: usize) -> Option { + take_random_get_single!(self, index) + } + + #[inline] + unsafe fn get_unchecked(&self, index: usize) -> Option { + if self.arr.is_valid_unchecked(index) { + Some(self.arr.value_unchecked(index)) + } else { + None + } + } +} + +#[cfg(feature = "dtype-binary")] +impl<'a> IntoTakeRandom<'a> for &'a BinaryChunked { + type Item = &'a [u8]; + type TakeRandom = TakeRandBranch2, BinaryTakeRandom<'a>>; + + fn take_rand(&self) -> Self::TakeRandom { + match self.chunks.len() { + 1 => { + let arr = self.downcast_iter().next().unwrap(); + let t = BinaryTakeRandomSingleChunk { arr }; + TakeRandBranch2::Single(t) + } + _ => { + let chunks = self.downcast_chunks(); + let t = BinaryTakeRandom { + chunks, + chunk_lens: self.chunks.iter().map(|a| a.len() as IdxSize).collect(), + }; + TakeRandBranch2::Multi(t) + } + } + } +} + impl<'a> IntoTakeRandom<'a> for &'a BooleanChunked { type Item = bool; type TakeRandom = TakeRandBranch2, BoolTakeRandom<'a>>; diff --git a/polars/polars-core/src/chunked_array/ops/take/take_single.rs b/polars/polars-core/src/chunked_array/ops/take/take_single.rs index b60be9e89360..7764b7fcff06 100644 --- a/polars/polars-core/src/chunked_array/ops/take/take_single.rs +++ b/polars/polars-core/src/chunked_array/ops/take/take_single.rs @@ -113,6 +113,18 @@ impl<'a> TakeRandom for &'a Utf8Chunked { } } +#[cfg(feature = "dtype-binary")] +impl<'a> TakeRandom for &'a BinaryChunked { + type Item = &'a [u8]; + + #[inline] + fn get(&self, index: usize) -> Option { + // Safety: + // Out of bounds is checked and 
downcast is of correct type + unsafe { impl_take_random_get!(self, index, LargeBinaryArray) } + } +} + // extra trait such that it also works without extra reference. // Autoref will insert the reference and impl<'a> TakeRandomUtf8 for &'a Utf8Chunked { diff --git a/polars/polars-core/src/chunked_array/ops/unique/mod.rs b/polars/polars-core/src/chunked_array/ops/unique/mod.rs index bf532be9f2a2..57631a1f939c 100644 --- a/polars/polars-core/src/chunked_array/ops/unique/mod.rs +++ b/polars/polars-core/src/chunked_array/ops/unique/mod.rs @@ -304,6 +304,60 @@ impl ChunkUnique for Utf8Chunked { } } +#[cfg(feature = "dtype-binary")] +impl ChunkUnique for BinaryChunked { + fn unique(&self) -> PolarsResult { + match self.null_count() { + 0 => { + let mut set = + PlHashSet::with_capacity(std::cmp::min(HASHMAP_INIT_SIZE, self.len())); + for arr in self.downcast_iter() { + set.extend(arr.values_iter()) + } + Ok(BinaryChunked::from_iter_values( + self.name(), + set.iter().copied(), + )) + } + _ => { + let mut set = + PlHashSet::with_capacity(std::cmp::min(HASHMAP_INIT_SIZE, self.len())); + for arr in self.downcast_iter() { + set.extend(arr.iter()) + } + Ok(BinaryChunked::from_iter_options( + self.name(), + set.iter().copied(), + )) + } + } + } + + fn arg_unique(&self) -> PolarsResult { + Ok(IdxCa::from_vec(self.name(), arg_unique_ca!(self))) + } + + fn is_unique(&self) -> PolarsResult { + is_unique_duplicated!(self, false) + } + fn is_duplicated(&self) -> PolarsResult { + is_unique_duplicated!(self, true) + } + + fn n_unique(&self) -> PolarsResult { + if self.null_count() > 0 { + Ok(fill_set(self.into_iter().flatten()).len() + 1) + } else { + Ok(fill_set(self.into_no_null_iter()).len()) + } + } + + #[cfg(feature = "mode")] + fn mode(&self) -> PolarsResult { + Ok(mode(self)) + } +} + impl ChunkUnique for BooleanChunked { fn unique(&self) -> PolarsResult { // can be None, Some(true), Some(false) @@ -438,6 +492,25 @@ mod is_first { Ok(BooleanChunked::from_chunks(self.name(), 
chunks)) } } + + #[cfg(feature = "dtype-binary")] + impl IsFirst for BinaryChunked { + fn is_first(&self) -> PolarsResult { + let mut unique = PlHashSet::new(); + let chunks = self + .downcast_iter() + .map(|arr| { + let mask: BooleanArray = arr + .into_iter() + .map(|opt_v| unique.insert(opt_v)) + .collect_trusted(); + Box::new(mask) as ArrayRef + }) + .collect(); + + Ok(BooleanChunked::from_chunks(self.name(), chunks)) + } + } } #[cfg(test)] diff --git a/polars/polars-core/src/chunked_array/ops/zip.rs b/polars/polars-core/src/chunked_array/ops/zip.rs index 59db6a73a5ac..7a498741fcc7 100644 --- a/polars/polars-core/src/chunked_array/ops/zip.rs +++ b/polars/polars-core/src/chunked_array/ops/zip.rs @@ -142,6 +142,33 @@ impl ChunkZip for Utf8Chunked { } } } + +#[cfg(feature = "dtype-binary")] +impl ChunkZip for BinaryChunked { + fn zip_with( + &self, + mask: &BooleanChunked, + other: &BinaryChunked, + ) -> PolarsResult { + if self.len() != mask.len() || other.len() != mask.len() { + impl_ternary_broadcast!(self, self.len(), other.len(), other, mask, BinaryType) + } else { + let (left, right, mask) = align_chunks_ternary(self, other, mask); + let chunks = left + .downcast_iter() + .zip(right.downcast_iter()) + .zip(mask.downcast_iter()) + .map(|((left_c, right_c), mask_c)| { + let mask_c = prepare_mask(mask_c); + let arr = if_then_else(&mask_c, left_c, right_c)?; + Ok(arr) + }) + .collect::>>()?; + Ok(ChunkedArray::from_chunks(self.name(), chunks)) + } + } +} + impl ChunkZip for ListChunked { fn zip_with( &self, diff --git a/polars/polars-core/src/chunked_array/trusted_len.rs b/polars/polars-core/src/chunked_array/trusted_len.rs index 1be2de8ff7c0..5ae068743f13 100644 --- a/polars/polars-core/src/chunked_array/trusted_len.rs +++ b/polars/polars-core/src/chunked_array/trusted_len.rs @@ -203,6 +203,28 @@ where } } +#[cfg(feature = "dtype-binary")] +impl FromTrustedLenIterator for BinaryChunked +where + Ptr: PolarsAsRef<[u8]>, +{ + fn from_iter_trusted_length>(iter: I) 
-> Self { + let iter = iter.into_iter(); + iter.collect() + } +} + +#[cfg(feature = "dtype-binary")] +impl FromTrustedLenIterator> for BinaryChunked +where + Ptr: AsRef<[u8]>, +{ + fn from_iter_trusted_length>>(iter: I) -> Self { + let iter = iter.into_iter(); + iter.collect() + } +} + #[cfg(feature = "object")] impl FromTrustedLenIterator> for ObjectChunked { fn from_iter_trusted_length>>(iter: I) -> Self { diff --git a/polars/polars-core/src/chunked_array/upstream_traits.rs b/polars/polars-core/src/chunked_array/upstream_traits.rs index 679ba56a5f79..63ac8aead8bb 100644 --- a/polars/polars-core/src/chunked_array/upstream_traits.rs +++ b/polars/polars-core/src/chunked_array/upstream_traits.rs @@ -133,6 +133,41 @@ where } } +// FromIterator for BinaryChunked variants. +#[cfg(feature = "dtype-binary")] +impl FromIterator> for BinaryChunked +where + Ptr: AsRef<[u8]>, +{ + fn from_iter>>(iter: I) -> Self { + let arr = BinaryArray::::from_iter(iter); + Self::from_chunks("", vec![Box::new(arr)]) + } +} + +#[cfg(feature = "dtype-binary")] +impl PolarsAsRef<[u8]> for Vec {} + +#[cfg(feature = "dtype-binary")] +impl PolarsAsRef<[u8]> for &[u8] {} + +#[cfg(feature = "dtype-binary")] +impl PolarsAsRef<[u8]> for &&[u8] {} + +#[cfg(feature = "dtype-binary")] +impl<'a> PolarsAsRef<[u8]> for Cow<'a, [u8]> {} + +#[cfg(feature = "dtype-binary")] +impl FromIterator for BinaryChunked +where + Ptr: PolarsAsRef<[u8]>, +{ + fn from_iter>(iter: I) -> Self { + let arr = BinaryArray::::from_iter_values(iter.into_iter()); + Self::from_chunks("", vec![Box::new(arr)]) + } +} + impl FromIterator for ListChunked where Ptr: Borrow, diff --git a/polars/polars-core/src/datatypes/_serde.rs b/polars/polars-core/src/datatypes/_serde.rs index 10bfd7ed4ac0..af984f1f8380 100644 --- a/polars/polars-core/src/datatypes/_serde.rs +++ b/polars/polars-core/src/datatypes/_serde.rs @@ -42,6 +42,8 @@ pub enum SerializableDataType { Float64, /// String data Utf8, + #[cfg(feature = "dtype-binary")] + Binary, /// 
A 32-bit date representing the elapsed time since UNIX epoch (1970-01-01) /// in days (32 bits). Date, @@ -76,6 +78,8 @@ impl From<&DataType> for SerializableDataType { Float32 => Self::Float32, Float64 => Self::Float64, Utf8 => Self::Utf8, + #[cfg(feature = "dtype-binary")] + Binary => Self::Binary, Date => Self::Date, Datetime(tu, tz) => Self::Datetime(*tu, tz.clone()), Duration(tu) => Self::Duration(*tu), @@ -105,6 +109,8 @@ impl From for DataType { Float32 => Self::Float32, Float64 => Self::Float64, Utf8 => Self::Utf8, + #[cfg(feature = "dtype-binary")] + Binary => Self::Binary, Date => Self::Date, Datetime(tu, tz) => Self::Datetime(tu, tz), Duration(tu) => Self::Duration(tu), diff --git a/polars/polars-core/src/datatypes/dtype.rs b/polars/polars-core/src/datatypes/dtype.rs index 7c236a7f83a3..b1bf7f60030a 100644 --- a/polars/polars-core/src/datatypes/dtype.rs +++ b/polars/polars-core/src/datatypes/dtype.rs @@ -17,6 +17,8 @@ pub enum DataType { Float64, /// String data Utf8, + #[cfg(feature = "dtype-binary")] + Binary, /// A 32-bit date representing the elapsed time since UNIX epoch (1970-01-01) /// in days (32 bits). 
Date, @@ -144,6 +146,8 @@ impl DataType { | DataType::Duration(_) | DataType::Boolean | DataType::Null => false, + #[cfg(feature = "dtype-binary")] + DataType::Binary => false, #[cfg(feature = "object")] DataType::Object(_) => false, #[cfg(feature = "dtype-categorical")] @@ -190,6 +194,8 @@ impl DataType { Float32 => ArrowDataType::Float32, Float64 => ArrowDataType::Float64, Utf8 => ArrowDataType::LargeUtf8, + #[cfg(feature = "dtype-binary")] + Binary => ArrowDataType::LargeBinary, Date => ArrowDataType::Date32, Datetime(unit, tz) => ArrowDataType::Timestamp(unit.to_arrow(), tz.clone()), Duration(unit) => ArrowDataType::Duration(unit.to_arrow()), @@ -241,6 +247,8 @@ impl Display for DataType { DataType::Float32 => "f32", DataType::Float64 => "f64", DataType::Utf8 => "str", + #[cfg(feature = "dtype-binary")] + DataType::Binary => "binary", DataType::Date => "date", DataType::Datetime(tu, tz) => { let s = match tz { diff --git a/polars/polars-core/src/datatypes/field.rs b/polars/polars-core/src/datatypes/field.rs index aeca75576cb6..014472124c69 100644 --- a/polars/polars-core/src/datatypes/field.rs +++ b/polars/polars-core/src/datatypes/field.rs @@ -131,8 +131,9 @@ impl From<&ArrowDataType> for DataType { ArrowDataType::Timestamp(tu, tz) => DataType::Datetime(tu.into(), tz.clone()), ArrowDataType::Duration(tu) => DataType::Duration(tu.into()), ArrowDataType::Date64 => DataType::Datetime(TimeUnit::Milliseconds, None), - ArrowDataType::LargeUtf8 => DataType::Utf8, - ArrowDataType::Utf8 => DataType::Utf8, + ArrowDataType::LargeUtf8 | ArrowDataType::Utf8 => DataType::Utf8, + #[cfg(feature = "dtype-binary")] + ArrowDataType::LargeBinary | ArrowDataType::Binary => DataType::Binary, ArrowDataType::Time64(_) | ArrowDataType::Time32(_) => DataType::Time, #[cfg(feature = "dtype-categorical")] ArrowDataType::Dictionary(_, _, _) => DataType::Categorical(None), diff --git a/polars/polars-core/src/datatypes/mod.rs b/polars/polars-core/src/datatypes/mod.rs index 
c4388b818512..f101a30b5fa2 100644 --- a/polars/polars-core/src/datatypes/mod.rs +++ b/polars/polars-core/src/datatypes/mod.rs @@ -43,6 +43,9 @@ use crate::utils::Wrap; pub struct Utf8Type {} +#[cfg(feature = "dtype-binary")] +pub struct BinaryType {} + #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] pub struct ListType {} @@ -86,6 +89,13 @@ impl PolarsDataType for Utf8Type { } } +#[cfg(feature = "dtype-binary")] +impl PolarsDataType for BinaryType { + fn get_dtype() -> DataType { + DataType::Binary + } +} + pub struct BooleanType {} impl PolarsDataType for BooleanType { @@ -122,6 +132,9 @@ impl PolarsSingleType for T where T: NativeType + PolarsDataType {} impl PolarsSingleType for Utf8Type {} +#[cfg(feature = "dtype-binary")] +impl PolarsSingleType for BinaryType {} + pub type ListChunked = ChunkedArray; pub type BooleanChunked = ChunkedArray; pub type UInt8Chunked = ChunkedArray; @@ -135,6 +148,8 @@ pub type Int64Chunked = ChunkedArray; pub type Float32Chunked = ChunkedArray; pub type Float64Chunked = ChunkedArray; pub type Utf8Chunked = ChunkedArray; +#[cfg(feature = "dtype-binary")] +pub type BinaryChunked = ChunkedArray; pub trait NumericNative: PartialOrd @@ -292,6 +307,10 @@ pub enum AnyValue<'a> { StructOwned(Box<(Vec>, Vec)>), /// A UTF8 encoded string type. 
Utf8Owned(String), + #[cfg(feature = "dtype-binary")] + Binary(&'a [u8]), + #[cfg(feature = "dtype-binary")] + BinaryOwned(Vec), } #[cfg(feature = "serde")] @@ -320,6 +339,12 @@ impl Serialize for AnyValue<'_> { AnyValue::Utf8Owned(v) => { serializer.serialize_newtype_variant(name, 13, "Utf8Owned", v) } + #[cfg(feature = "dtype-binary")] + AnyValue::Binary(v) => serializer.serialize_newtype_variant(name, 14, "BinaryOwned", v), + #[cfg(feature = "dtype-binary")] + AnyValue::BinaryOwned(v) => { + serializer.serialize_newtype_variant(name, 14, "BinaryOwned", v) + } _ => todo!(), } } @@ -347,6 +372,8 @@ impl<'a> Deserialize<'a> for AnyValue<'static> { List, Bool, Utf8Owned, + #[cfg(feature = "dtype-binary")] + BinaryOwned, } const VARIANTS: &[&str] = &[ "Null", @@ -363,7 +390,11 @@ impl<'a> Deserialize<'a> for AnyValue<'static> { "List", "Boolean", "Utf8Owned", + "BinaryOwned", ]; + #[cfg(feature = "dtype-binary")] + const LAST: u8 = unsafe { std::mem::transmute::<_, u8>(AvField::BinaryOwned) }; + #[cfg(not(feature = "dtype-binary"))] const LAST: u8 = unsafe { std::mem::transmute::<_, u8>(AvField::Utf8Owned) }; struct FieldVisitor; @@ -427,6 +458,8 @@ impl<'a> Deserialize<'a> for AnyValue<'static> { b"List" => AvField::List, b"Bool" => AvField::Bool, b"Utf8Owned" | b"Utf8" => AvField::Utf8Owned, + #[cfg(feature = "dtype-binary")] + b"BinaryOwned" | b"Binary" => AvField::BinaryOwned, _ => { return Err(serde::de::Error::unknown_variant( &String::from_utf8_lossy(v), @@ -514,6 +547,11 @@ impl<'a> Deserialize<'a> for AnyValue<'static> { let value = variant.newtype_variant()?; AnyValue::Utf8Owned(value) } + #[cfg(feature = "dtype-binary")] + (AvField::BinaryOwned, variant) => { + let value = variant.newtype_variant()?; + AnyValue::BinaryOwned(value) + } }; Ok(out) } @@ -571,7 +609,12 @@ impl<'a> Hash for AnyValue<'a> { UInt16(v) => state.write_u16(*v), UInt32(v) => state.write_u32(*v), UInt64(v) => state.write_u64(*v), - Utf8(s) => state.write(s.as_bytes()), + Utf8(v) => 
state.write(v.as_bytes()), + Utf8Owned(v) => state.write(v.as_bytes()), + #[cfg(feature = "dtype-binary")] + Binary(v) => state.write(v), + #[cfg(feature = "dtype-binary")] + BinaryOwned(v) => state.write(v), Boolean(v) => state.write_u8(*v as u8), List(v) => Hash::hash(&Wrap(v.clone()), state), _ => unimplemented!(), @@ -720,7 +763,12 @@ impl<'a> AnyValue<'a> { #[cfg(feature = "dtype-time")] Time(v) => AnyValue::Time(v), List(v) => AnyValue::List(v), - Utf8(s) => AnyValue::Utf8Owned(s.to_string()), + Utf8(v) => AnyValue::Utf8Owned(v.to_string()), + Utf8Owned(v) => AnyValue::Utf8Owned(v), + #[cfg(feature = "dtype-binary")] + Binary(v) => AnyValue::BinaryOwned(v.to_vec()), + #[cfg(feature = "dtype-binary")] + BinaryOwned(v) => AnyValue::BinaryOwned(v), dt => { return Err(PolarsError::ComputeError( format!("cannot get static AnyValue from {}", dt).into(), @@ -749,6 +797,11 @@ impl PartialEq for AnyValue<'_> { fn eq(&self, other: &Self) -> bool { use AnyValue::*; match (self, other) { + #[cfg(feature = "dtype-binary")] + (BinaryOwned(l), BinaryOwned(r)) => l == r, + #[cfg(feature = "dtype-binary")] + (Binary(l), Binary(r)) => l == r, + (Utf8Owned(l), Utf8Owned(r)) => l == r, (Utf8(l), Utf8(r)) => l == r, (UInt8(l), UInt8(r)) => l == r, (UInt16(l), UInt16(r)) => l == r, @@ -805,6 +858,11 @@ impl PartialOrd for AnyValue<'_> { (Float32(l), Float32(r)) => l.partial_cmp(r), (Float64(l), Float64(r)) => l.partial_cmp(r), (Utf8(l), Utf8(r)) => l.partial_cmp(r), + (Utf8Owned(l), Utf8Owned(r)) => l.partial_cmp(r), + #[cfg(feature = "dtype-binary")] + (Binary(l), Binary(r)) => l.partial_cmp(r), + #[cfg(feature = "dtype-binary")] + (BinaryOwned(l), BinaryOwned(r)) => l.partial_cmp(r), _ => None, } } @@ -938,6 +996,10 @@ mod test { ), (ArrowDataType::LargeUtf8, DataType::Utf8), (ArrowDataType::Utf8, DataType::Utf8), + #[cfg(feature = "dtype-binary")] + (ArrowDataType::LargeBinary, DataType::Binary), + #[cfg(feature = "dtype-binary")] + (ArrowDataType::Binary, DataType::Binary), ( 
ArrowDataType::Time64(ArrowTimeUnit::Nanosecond), DataType::Time, diff --git a/polars/polars-core/src/fmt.rs b/polars/polars-core/src/fmt.rs index 31ef4013fdea..4c7c426a3e33 100644 --- a/polars/polars-core/src/fmt.rs +++ b/polars/polars-core/src/fmt.rs @@ -155,6 +155,13 @@ impl Debug for Utf8Chunked { } } +#[cfg(feature = "dtype-binary")] +impl Debug for BinaryChunked { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + format_array!(f, self, "binary", self.name(), "ChunkedArray") + } +} + impl Debug for ListChunked { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { format_array!(f, self, "list", self.name(), "ChunkedArray") @@ -276,6 +283,10 @@ impl Debug for Series { DataType::Null => { writeln!(f, "nullarray") } + #[cfg(feature = "dtype-binary")] + DataType::Binary => { + format_array!(f, self.binary().unwrap(), "binary", self.name(), "Series") + } dt => panic!("{:?} not impl", dt), } } @@ -662,6 +673,8 @@ impl Display for AnyValue<'_> { AnyValue::Boolean(v) => write!(f, "{}", *v), AnyValue::Utf8(v) => write!(f, "{}", format_args!("\"{}\"", v)), AnyValue::Utf8Owned(v) => write!(f, "{}", format_args!("\"{}\"", v)), + #[cfg(feature = "dtype-binary")] + AnyValue::Binary(_) | AnyValue::BinaryOwned(_) => write!(f, "[binary data]"), #[cfg(feature = "dtype-date")] AnyValue::Date(v) => write!(f, "{}", date32_to_date(*v)), #[cfg(feature = "dtype-datetime")] @@ -775,6 +788,13 @@ impl FmtList for Utf8Chunked { } } +#[cfg(feature = "dtype-binary")] +impl FmtList for BinaryChunked { + fn fmt_list(&self) -> String { + impl_fmt_list!(self) + } +} + impl FmtList for ListChunked { fn fmt_list(&self) -> String { impl_fmt_list!(self) diff --git a/polars/polars-core/src/frame/groupby/aggregations/agg_list.rs b/polars/polars-core/src/frame/groupby/aggregations/agg_list.rs index dbb38010ecc8..13d7959b356d 100644 --- a/polars/polars-core/src/frame/groupby/aggregations/agg_list.rs +++ b/polars/polars-core/src/frame/groupby/aggregations/agg_list.rs @@ -202,6 +202,32 @@ impl 
AggList for Utf8Chunked { } } +#[cfg(feature = "dtype-binary")] +impl AggList for BinaryChunked { + unsafe fn agg_list(&self, groups: &GroupsProxy) -> Series { + match groups { + GroupsProxy::Idx(groups) => { + let mut builder = + ListBinaryChunkedBuilder::new(self.name(), groups.len(), self.len()); + for idx in groups.all().iter() { + let ca = { self.take_unchecked(idx.into()) }; + builder.append(&ca) + } + builder.finish().into_series() + } + GroupsProxy::Slice { groups, .. } => { + let mut builder = + ListBinaryChunkedBuilder::new(self.name(), groups.len(), self.len()); + for [first, len] in groups { + let ca = self.slice(*first as i64, *len as usize); + builder.append(&ca) + } + builder.finish().into_series() + } + } + } +} + fn agg_list_list, &mut i64, &mut Vec) -> bool>( ca: &ListChunked, groups_len: usize, diff --git a/polars/polars-core/src/frame/groupby/into_groups.rs b/polars/polars-core/src/frame/groupby/into_groups.rs index 1cf2a84425bf..2a78526a6994 100644 --- a/polars/polars-core/src/frame/groupby/into_groups.rs +++ b/polars/polars-core/src/frame/groupby/into_groups.rs @@ -285,6 +285,59 @@ impl IntoGroupsProxy for Utf8Chunked { } } +#[cfg(feature = "dtype-binary")] +impl IntoGroupsProxy for BinaryChunked { + #[allow(clippy::needless_lifetimes)] + fn group_tuples<'a>(&'a self, multithreaded: bool, sorted: bool) -> PolarsResult { + let hb = RandomState::default(); + let null_h = get_null_hash_value(hb.clone()); + + let out = if multithreaded { + let n_partitions = set_partition_size(); + + let split = _split_offsets(self.len(), n_partitions); + + let byte_hashes = POOL.install(|| { + split + .into_par_iter() + .map(|(offset, len)| { + let ca = self.slice(offset as i64, len); + ca.into_iter() + .map(|opt_b| { + let hash = match opt_b { + Some(s) => <[u8]>::get_hash(s, &hb), + None => null_h, + }; + // Safety: + // the underlying data is tied to self + unsafe { + std::mem::transmute::, BytesHash<'a>>( + BytesHash::new(opt_b, hash), + ) + } + }) + 
.collect_trusted::>() + }) + .collect::>() + }); + groupby_threaded_num(byte_hashes, 0, n_partitions as u64, sorted) + } else { + let byte_hashes = self + .into_iter() + .map(|opt_b| { + let hash = match opt_b { + Some(s) => <[u8]>::get_hash(s, &hb), + None => null_h, + }; + BytesHash::new(opt_b, hash) + }) + .collect_trusted::>(); + groupby(byte_hashes.iter(), sorted) + }; + Ok(out) + } +} + impl IntoGroupsProxy for ListChunked { #[allow(clippy::needless_lifetimes)] #[allow(unused_variables)] diff --git a/polars/polars-core/src/frame/hash_join/mod.rs b/polars/polars-core/src/frame/hash_join/mod.rs index 0f31cace8b65..9fc539fd551d 100644 --- a/polars/polars-core/src/frame/hash_join/mod.rs +++ b/polars/polars-core/src/frame/hash_join/mod.rs @@ -235,6 +235,8 @@ macro_rules! impl_zip_outer_join { } impl_zip_outer_join!(BooleanChunked); impl_zip_outer_join!(Utf8Chunked); +#[cfg(feature = "dtype-binary")] +impl_zip_outer_join!(BinaryChunked); impl ZipOuterJoinColumn for Float32Chunked { fn zip_outer_join_column( diff --git a/polars/polars-core/src/frame/hash_join/single_keys_dispatch.rs b/polars/polars-core/src/frame/hash_join/single_keys_dispatch.rs index 4b52106daad6..4f4e3a368ade 100644 --- a/polars/polars-core/src/frame/hash_join/single_keys_dispatch.rs +++ b/polars/polars-core/src/frame/hash_join/single_keys_dispatch.rs @@ -15,6 +15,12 @@ impl Series { let rhs = rhs.utf8().unwrap(); lhs.hash_join_left(rhs) } + #[cfg(feature = "dtype-binary")] + Binary => { + let lhs = lhs.binary().unwrap(); + let rhs = rhs.binary().unwrap(); + lhs.hash_join_left(rhs) + } _ => { if self.bit_repr_is_large() { let lhs = lhs.bit_repr_large(); @@ -40,6 +46,12 @@ impl Series { let rhs = rhs.utf8().unwrap(); lhs.hash_join_semi_anti(rhs, anti) } + #[cfg(feature = "dtype-binary")] + Binary => { + let lhs = lhs.binary().unwrap(); + let rhs = rhs.binary().unwrap(); + lhs.hash_join_semi_anti(rhs, anti) + } _ => { if self.bit_repr_is_large() { let lhs = lhs.bit_repr_large(); @@ -65,6 +77,12 @@ 
impl Series { let rhs = rhs.utf8().unwrap(); lhs.hash_join_inner(rhs) } + #[cfg(feature = "dtype-binary")] + Binary => { + let lhs = lhs.binary().unwrap(); + let rhs = rhs.binary().unwrap(); + lhs.hash_join_inner(rhs) + } _ => { if self.bit_repr_is_large() { let lhs = self.bit_repr_large(); @@ -92,6 +110,12 @@ impl Series { let rhs = rhs.utf8().unwrap(); lhs.hash_join_outer(rhs) } + #[cfg(feature = "dtype-binary")] + Binary => { + let lhs = lhs.binary().unwrap(); + let rhs = rhs.binary().unwrap(); + lhs.hash_join_outer(rhs) + } _ => { if self.bit_repr_is_large() { let lhs = self.bit_repr_large(); @@ -409,6 +433,122 @@ impl Utf8Chunked { } } +#[cfg(feature = "dtype-binary")] +pub(crate) fn prepare_bytes<'a>( + been_split: &'a [BinaryChunked], + hb: &RandomState, +) -> Vec>> { + POOL.install(|| { + been_split + .par_iter() + .map(|ca| { + ca.into_iter() + .map(|opt_b| { + let mut state = hb.build_hasher(); + opt_b.hash(&mut state); + let hash = state.finish(); + BytesHash::new(opt_b, hash) + }) + .collect::>() + }) + .collect() + }) +} + +#[cfg(feature = "dtype-binary")] +impl BinaryChunked { + fn prepare( + &self, + other: &BinaryChunked, + swapped: bool, + ) -> (Vec, Vec, bool, RandomState) { + let n_threads = POOL.current_num_threads(); + + let (a, b, swap) = if swapped { + det_hash_prone_order!(self, other) + } else { + (self, other, false) + }; + + let hb = RandomState::default(); + let splitted_a = split_ca(a, n_threads).unwrap(); + let splitted_b = split_ca(b, n_threads).unwrap(); + + (splitted_a, splitted_b, swap, hb) + } + + // returns the join tuples and whether or not the lhs tuples are sorted + fn hash_join_inner(&self, other: &BinaryChunked) -> ((Vec, Vec), bool) { + let (splitted_a, splitted_b, swap, hb) = self.prepare(other, true); + let str_hashes_a = prepare_bytes(&splitted_a, &hb); + let str_hashes_b = prepare_bytes(&splitted_b, &hb); + ( + hash_join_tuples_inner(str_hashes_a, str_hashes_b, swap), + !swap, + ) + } + + fn hash_join_left(&self, other: 
&BinaryChunked) -> LeftJoinIds { + let (splitted_a, splitted_b, _, hb) = self.prepare(other, false); + let str_hashes_a = prepare_bytes(&splitted_a, &hb); + let str_hashes_b = prepare_bytes(&splitted_b, &hb); + + let (mapping_left, mapping_right) = + create_mappings(self.chunks(), other.chunks(), self.len(), other.len()); + hash_join_tuples_left( + str_hashes_a, + str_hashes_b, + mapping_left.as_deref(), + mapping_right.as_deref(), + ) + } + + #[cfg(feature = "semi_anti_join")] + fn hash_join_semi_anti(&self, other: &BinaryChunked, anti: bool) -> Vec { + let (splitted_a, splitted_b, _, hb) = self.prepare(other, false); + let str_hashes_a = prepare_bytes(&splitted_a, &hb); + let str_hashes_b = prepare_bytes(&splitted_b, &hb); + if anti { + hash_join_tuples_left_anti(str_hashes_a, str_hashes_b) + } else { + hash_join_tuples_left_semi(str_hashes_a, str_hashes_b) + } + } + + fn hash_join_outer(&self, other: &BinaryChunked) -> Vec<(Option, Option)> { + let (a, b, swap) = det_hash_prone_order!(self, other); + + let n_partitions = set_partition_size(); + let splitted_a = split_ca(a, n_partitions).unwrap(); + let splitted_b = split_ca(b, n_partitions).unwrap(); + + match (a.has_validity(), b.has_validity()) { + (false, false) => { + let iters_a = splitted_a + .iter() + .map(|ca| ca.into_no_null_iter()) + .collect::>(); + let iters_b = splitted_b + .iter() + .map(|ca| ca.into_no_null_iter()) + .collect::>(); + hash_join_tuples_outer(iters_a, iters_b, swap) + } + _ => { + let iters_a = splitted_a + .iter() + .map(|ca| ca.into_iter()) + .collect::>(); + let iters_b = splitted_b + .iter() + .map(|ca| ca.into_iter()) + .collect::>(); + hash_join_tuples_outer(iters_a, iters_b, swap) + } + } + } +} + #[cfg(feature = "semi_anti_join")] fn num_group_join_anti_semi( left: &ChunkedArray, diff --git a/polars/polars-core/src/named_from.rs b/polars/polars-core/src/named_from.rs index 8f12bce54253..d19ee237d649 100644 --- a/polars/polars-core/src/named_from.rs +++ 
b/polars/polars-core/src/named_from.rs @@ -63,6 +63,8 @@ macro_rules! impl_named_from { } impl_named_from!([String], Utf8Type, from_slice); +#[cfg(feature = "dtype-binary")] +impl_named_from!([Vec], BinaryType, from_slice); impl_named_from!([bool], BooleanType, from_slice); #[cfg(feature = "dtype-u8")] impl_named_from!([u8], UInt8Type, from_slice); @@ -79,6 +81,8 @@ impl_named_from!([i64], Int64Type, from_slice); impl_named_from!([f32], Float32Type, from_slice); impl_named_from!([f64], Float64Type, from_slice); impl_named_from!([Option], Utf8Type, from_slice_options); +#[cfg(feature = "dtype-binary")] +impl_named_from!([Option>], BinaryType, from_slice_options); impl_named_from!([Option], BooleanType, from_slice_options); #[cfg(feature = "dtype-u8")] impl_named_from!([Option], UInt8Type, from_slice_options); @@ -225,6 +229,70 @@ impl<'a, T: AsRef<[Option>]>> NamedFrom>]> } } +#[cfg(feature = "dtype-binary")] +impl<'a, T: AsRef<[&'a [u8]]>> NamedFrom for Series { + fn new(name: &str, v: T) -> Self { + BinaryChunked::from_slice(name, v.as_ref()).into_series() + } +} + +#[cfg(feature = "dtype-binary")] +impl<'a, T: AsRef<[&'a [u8]]>> NamedFrom for BinaryChunked { + fn new(name: &str, v: T) -> Self { + BinaryChunked::from_slice(name, v.as_ref()) + } +} + +#[cfg(feature = "dtype-binary")] +impl<'a, T: AsRef<[Option<&'a [u8]>]>> NamedFrom]> for Series { + fn new(name: &str, v: T) -> Self { + BinaryChunked::from_slice_options(name, v.as_ref()).into_series() + } +} + +#[cfg(feature = "dtype-binary")] +impl<'a, T: AsRef<[Option<&'a [u8]>]>> NamedFrom]> for BinaryChunked { + fn new(name: &str, v: T) -> Self { + BinaryChunked::from_slice_options(name, v.as_ref()) + } +} + +#[cfg(feature = "dtype-binary")] +impl<'a, T: AsRef<[Cow<'a, [u8]>]>> NamedFrom]> for Series { + fn new(name: &str, v: T) -> Self { + BinaryChunked::from_iter_values(name, v.as_ref().iter().map(|value| value.as_ref())) + .into_series() + } +} + +#[cfg(feature = "dtype-binary")] +impl<'a, T: AsRef<[Cow<'a, 
[u8]>]>> NamedFrom]> for BinaryChunked { + fn new(name: &str, v: T) -> Self { + BinaryChunked::from_iter_values(name, v.as_ref().iter().map(|value| value.as_ref())) + } +} + +#[cfg(feature = "dtype-binary")] +impl<'a, T: AsRef<[Option>]>> NamedFrom>]> for Series { + fn new(name: &str, v: T) -> Self { + BinaryChunked::new(name, v).into_series() + } +} + +#[cfg(feature = "dtype-binary")] +impl<'a, T: AsRef<[Option>]>> NamedFrom>]> + for BinaryChunked +{ + fn new(name: &str, v: T) -> Self { + BinaryChunked::from_iter_options( + name, + v.as_ref() + .iter() + .map(|opt| opt.as_ref().map(|value| value.as_ref())), + ) + } +} + #[cfg(feature = "dtype-date")] impl> NamedFrom for DateChunked { fn new(name: &str, v: T) -> Self { diff --git a/polars/polars-core/src/prelude.rs b/polars/polars-core/src/prelude.rs index 43a54094fa45..2d302e2b4224 100644 --- a/polars/polars-core/src/prelude.rs +++ b/polars/polars-core/src/prelude.rs @@ -9,6 +9,8 @@ pub use polars_arrow::kernels::ewm::EWMOptions; pub use polars_arrow::prelude::*; pub(crate) use polars_arrow::trusted_len::TrustedLen; +#[cfg(feature = "dtype-binary")] +pub use crate::chunked_array::builder::{BinaryChunkedBuilder, ListBinaryChunkedBuilder}; pub use crate::chunked_array::builder::{ BooleanChunkedBuilder, ChunkedBuilder, ListBooleanChunkedBuilder, ListBuilderTrait, ListPrimitiveChunkedBuilder, ListUtf8ChunkedBuilder, NewChunkedArray, PrimitiveChunkedBuilder, diff --git a/polars/polars-core/src/series/any_value.rs b/polars/polars-core/src/series/any_value.rs index 9174b4ae6d36..e2a470acb4ab 100644 --- a/polars/polars-core/src/series/any_value.rs +++ b/polars/polars-core/src/series/any_value.rs @@ -16,6 +16,17 @@ fn any_values_to_utf8(avs: &[AnyValue]) -> Utf8Chunked { .collect_trusted() } +#[cfg(feature = "dtype-binary")] +fn any_values_to_binary(avs: &[AnyValue]) -> BinaryChunked { + avs.iter() + .map(|av| match av { + AnyValue::Binary(s) => Some(*s), + AnyValue::BinaryOwned(s) => Some(&**s), + _ => None, + }) + 
.collect_trusted() +} + fn any_values_to_bool(avs: &[AnyValue]) -> BooleanChunked { avs.iter() .map(|av| match av { @@ -81,6 +92,8 @@ impl Series { DataType::Float32 => any_values_to_primitive::(av).into_series(), DataType::Float64 => any_values_to_primitive::(av).into_series(), DataType::Utf8 => any_values_to_utf8(av).into_series(), + #[cfg(feature = "dtype-binary")] + DataType::Binary => any_values_to_binary(av).into_series(), DataType::Boolean => any_values_to_bool(av).into_series(), #[cfg(feature = "dtype-date")] DataType::Date => any_values_to_primitive::(av) @@ -152,8 +165,9 @@ impl<'a> From<&AnyValue<'a>> for DataType { match val { Null => DataType::Null, Boolean(_) => DataType::Boolean, - Utf8(_) => DataType::Utf8, - Utf8Owned(_) => DataType::Utf8, + Utf8(_) | Utf8Owned(_) => DataType::Utf8, + #[cfg(feature = "dtype-binary")] + Binary(_) | BinaryOwned(_) => DataType::Binary, UInt32(_) => DataType::UInt32, UInt64(_) => DataType::UInt64, Int32(_) => DataType::Int32, diff --git a/polars/polars-core/src/series/arithmetic/borrowed.rs b/polars/polars-core/src/series/arithmetic/borrowed.rs index 60dcdde38644..02634081decf 100644 --- a/polars/polars-core/src/series/arithmetic/borrowed.rs +++ b/polars/polars-core/src/series/arithmetic/borrowed.rs @@ -101,6 +101,15 @@ impl NumOpsDispatch for Utf8Chunked { } } +#[cfg(feature = "dtype-binary")] +impl NumOpsDispatch for BinaryChunked { + fn add_to(&self, rhs: &Series) -> PolarsResult { + let rhs = self.unpack_series_matching_type(rhs)?; + let out = self + rhs; + Ok(out.into_series()) + } +} + #[cfg(feature = "checked_arithmetic")] pub mod checked { use num::{CheckedDiv, ToPrimitive, Zero}; diff --git a/polars/polars-core/src/series/comparison.rs b/polars/polars-core/src/series/comparison.rs index e10a9b0124c9..84abe446231a 100644 --- a/polars/polars-core/src/series/comparison.rs +++ b/polars/polars-core/src/series/comparison.rs @@ -22,6 +22,8 @@ macro_rules! 
impl_compare { match lhs.dtype() { DataType::Boolean => lhs.bool().unwrap().$method(rhs.bool().unwrap()), DataType::Utf8 => lhs.utf8().unwrap().$method(rhs.utf8().unwrap()), + #[cfg(feature = "dtype-binary")] + DataType::Binary => lhs.binary().unwrap().$method(rhs.binary().unwrap()), DataType::UInt8 => lhs.u8().unwrap().$method(rhs.u8().unwrap()), DataType::UInt16 => lhs.u16().unwrap().$method(rhs.u16().unwrap()), DataType::UInt32 => lhs.u32().unwrap().$method(rhs.u32().unwrap()), diff --git a/polars/polars-core/src/series/from.rs b/polars/polars-core/src/series/from.rs index e7c2dc301bdd..76e966e5bcf6 100644 --- a/polars/polars-core/src/series/from.rs +++ b/polars/polars-core/src/series/from.rs @@ -61,6 +61,17 @@ impl Series { .into_series(), List(_) => ListChunked::from_chunks(name, chunks).cast(dtype).unwrap(), Utf8 => Utf8Chunked::from_chunks(name, chunks).into_series(), + #[cfg(feature = "dtype-binary")] + Binary => { + #[cfg(feature = "dtype-binary")] + { + BinaryChunked::from_chunks(name, chunks).into_series() + } + #[cfg(not(feature = "dtype-binary"))] + { + panic!("activate feature 'dtype-binary'") + } + } #[cfg(feature = "dtype-categorical")] Categorical(rev_map) => { let cats = UInt32Chunked::from_chunks(name, chunks); @@ -95,6 +106,43 @@ impl Series { let chunks = cast_chunks(&chunks, &DataType::Utf8, false).unwrap(); Ok(Utf8Chunked::from_chunks(name, chunks).into_series()) } + #[cfg(feature = "dtype-binary")] + ArrowDataType::LargeBinary => { + Ok(BinaryChunked::from_chunks(name, chunks).into_series()) + } + #[cfg(feature = "dtype-binary")] + ArrowDataType::Binary => { + let chunks = cast_chunks(&chunks, &DataType::Binary, false).unwrap(); + Ok(BinaryChunked::from_chunks(name, chunks).into_series()) + } + #[cfg(all(feature = "dtype-u8", not(feature = "dtype-binary")))] + ArrowDataType::LargeBinary | ArrowDataType::Binary => { + let chunks = chunks + .iter() + .map(|arr| { + let arr = cast(&**arr, &ArrowDataType::LargeBinary).unwrap(); + + let arr = 
arr.as_any().downcast_ref::>().unwrap(); + let values = arr.values().clone(); + let offsets = arr.offsets().clone(); + let validity = arr.validity().cloned(); + + let values = Box::new(PrimitiveArray::from_data( + ArrowDataType::UInt8, + values, + None, + )); + + let dtype = ListArray::::default_datatype(ArrowDataType::UInt8); + // Safety: + // offsets are monotonically increasing + Box::new(ListArray::::new_unchecked( + dtype, offsets, values, validity, + )) as ArrayRef + }) + .collect(); + Ok(ListChunked::from_chunks(name, chunks).into()) + } ArrowDataType::List(_) | ArrowDataType::LargeList(_) => { let chunks = chunks.iter().map(convert_inner_types).collect(); Ok(ListChunked::from_chunks(name, chunks).into_series()) @@ -254,38 +302,6 @@ impl Series { // the invariants of an Arrow Dictionary guarantee the keys are in bounds Ok(CategoricalChunked::from_keys_and_values(name, keys, values).into_series()) } - #[cfg(not(feature = "dtype-u8"))] - ArrowDataType::LargeBinary | ArrowDataType::Binary => { - panic!("activate dtype-u8 to read binary data into polars List") - } - #[cfg(feature = "dtype-u8")] - ArrowDataType::LargeBinary | ArrowDataType::Binary => { - let chunks = chunks - .iter() - .map(|arr| { - let arr = cast(&**arr, &ArrowDataType::LargeBinary).unwrap(); - - let arr = arr.as_any().downcast_ref::>().unwrap(); - let values = arr.values().clone(); - let offsets = arr.offsets().clone(); - let validity = arr.validity().cloned(); - - let values = Box::new(PrimitiveArray::from_data( - ArrowDataType::UInt8, - values, - None, - )); - - let dtype = ListArray::::default_datatype(ArrowDataType::UInt8); - // Safety: - // offsets are monotonically increasing - Box::new(ListArray::::new_unchecked( - dtype, offsets, values, validity, - )) as ArrayRef - }) - .collect(); - Ok(ListChunked::from_chunks(name, chunks).into()) - } #[cfg(feature = "object")] ArrowDataType::Extension(s, _, Some(_)) if s == "POLARS_EXTENSION_TYPE" => { assert_eq!(chunks.len(), 1); @@ -529,11 
+545,11 @@ impl IntoSeries for Series { #[cfg(test)] mod test { - #[cfg(feature = "dtype-u8")] + #[cfg(all(feature = "dtype-u8", not(feature = "dtype-binary")))] use super::*; #[test] - #[cfg(feature = "dtype-u8")] + #[cfg(all(feature = "dtype-u8", not(feature = "dtype-binary")))] fn test_binary_to_list() { let iter = std::iter::repeat(b"hello").take(2).map(Some); let a = Box::new(iter.collect::>()) as ArrayRef; diff --git a/polars/polars-core/src/series/implementations/binary.rs b/polars/polars-core/src/series/implementations/binary.rs new file mode 100644 index 000000000000..a3be49cfb144 --- /dev/null +++ b/polars/polars-core/src/series/implementations/binary.rs @@ -0,0 +1,349 @@ +use std::borrow::Cow; + +use ahash::RandomState; + +use super::{private, IntoSeries, SeriesTrait, *}; +use crate::chunked_array::comparison::*; +use crate::chunked_array::ops::compare_inner::{ + IntoPartialEqInner, IntoPartialOrdInner, PartialEqInner, PartialOrdInner, +}; +use crate::chunked_array::ops::explode::ExplodeByOffsets; +use crate::chunked_array::AsSinglePtr; +use crate::fmt::FmtList; +use crate::frame::groupby::*; +use crate::frame::hash_join::ZipOuterJoinColumn; +use crate::prelude::*; +use crate::series::implementations::SeriesWrap; + +impl private::PrivateSeries for SeriesWrap { + fn compute_len(&mut self) { + self.0.compute_len() + } + fn _field(&self) -> Cow { + Cow::Borrowed(self.0.ref_field()) + } + fn _dtype(&self) -> &DataType { + self.0.ref_field().data_type() + } + fn explode_by_offsets(&self, offsets: &[i64]) -> Series { + self.0.explode_by_offsets(offsets) + } + + fn _set_sorted(&mut self, is_sorted: IsSorted) { + self.0.set_sorted2(is_sorted) + } + + unsafe fn equal_element(&self, idx_self: usize, idx_other: usize, other: &Series) -> bool { + self.0.equal_element(idx_self, idx_other, other) + } + + #[cfg(feature = "zip_with")] + fn zip_with_same_type(&self, mask: &BooleanChunked, other: &Series) -> PolarsResult { + ChunkZip::zip_with(&self.0, mask, 
other.as_ref().as_ref()).map(|ca| ca.into_series()) + } + fn into_partial_eq_inner<'a>(&'a self) -> Box { + (&self.0).into_partial_eq_inner() + } + fn into_partial_ord_inner<'a>(&'a self) -> Box { + (&self.0).into_partial_ord_inner() + } + + fn vec_hash(&self, random_state: RandomState) -> PolarsResult> { + Ok(self.0.vec_hash(random_state)) + } + + fn vec_hash_combine(&self, build_hasher: RandomState, hashes: &mut [u64]) -> PolarsResult<()> { + self.0.vec_hash_combine(build_hasher, hashes); + Ok(()) + } + + unsafe fn agg_list(&self, groups: &GroupsProxy) -> Series { + self.0.agg_list(groups) + } + + fn zip_outer_join_column( + &self, + right_column: &Series, + opt_join_tuples: &[(Option, Option)], + ) -> Series { + ZipOuterJoinColumn::zip_outer_join_column(&self.0, right_column, opt_join_tuples) + } + fn subtract(&self, rhs: &Series) -> PolarsResult { + NumOpsDispatch::subtract(&self.0, rhs) + } + fn add_to(&self, rhs: &Series) -> PolarsResult { + NumOpsDispatch::add_to(&self.0, rhs) + } + fn multiply(&self, rhs: &Series) -> PolarsResult { + NumOpsDispatch::multiply(&self.0, rhs) + } + fn divide(&self, rhs: &Series) -> PolarsResult { + NumOpsDispatch::divide(&self.0, rhs) + } + fn remainder(&self, rhs: &Series) -> PolarsResult { + NumOpsDispatch::remainder(&self.0, rhs) + } + fn group_tuples(&self, multithreaded: bool, sorted: bool) -> PolarsResult { + IntoGroupsProxy::group_tuples(&self.0, multithreaded, sorted) + } + + #[cfg(feature = "sort_multiple")] + fn argsort_multiple(&self, by: &[Series], reverse: &[bool]) -> PolarsResult { + self.0.argsort_multiple(by, reverse) + } +} + +impl SeriesTrait for SeriesWrap { + fn is_sorted(&self) -> IsSorted { + if self.0.is_sorted() { + IsSorted::Ascending + } else if self.0.is_sorted_reverse() { + IsSorted::Descending + } else { + IsSorted::Not + } + } + + #[cfg(feature = "interpolate")] + fn interpolate(&self) -> Series { + self.0.clone().into_series() + } + + fn rename(&mut self, name: &str) { + self.0.rename(name); + } + 
+ fn chunk_lengths(&self) -> ChunkIdIter { + self.0.chunk_id() + } + fn name(&self) -> &str { + self.0.name() + } + + fn chunks(&self) -> &Vec { + self.0.chunks() + } + fn shrink_to_fit(&mut self) { + self.0.shrink_to_fit() + } + + fn append_array(&mut self, other: ArrayRef) -> PolarsResult<()> { + self.0.append_array(other) + } + + fn slice(&self, offset: i64, length: usize) -> Series { + self.0.slice(offset, length).into_series() + } + + fn append(&mut self, other: &Series) -> PolarsResult<()> { + if self.0.dtype() == other.dtype() { + // todo! add object + self.0.append(other.as_ref().as_ref()); + Ok(()) + } else { + Err(PolarsError::SchemaMisMatch( + "cannot append Series; data types don't match".into(), + )) + } + } + + fn extend(&mut self, other: &Series) -> PolarsResult<()> { + if self.0.dtype() == other.dtype() { + self.0.extend(other.as_ref().as_ref()); + Ok(()) + } else { + Err(PolarsError::SchemaMisMatch( + "cannot extend Series; data types don't match".into(), + )) + } + } + + fn filter(&self, filter: &BooleanChunked) -> PolarsResult { + ChunkFilter::filter(&self.0, filter).map(|ca| ca.into_series()) + } + + #[cfg(feature = "chunked_ids")] + unsafe fn _take_chunked_unchecked(&self, by: &[ChunkId], sorted: IsSorted) -> Series { + self.0.take_chunked_unchecked(by, sorted).into_series() + } + + #[cfg(feature = "chunked_ids")] + unsafe fn _take_opt_chunked_unchecked(&self, by: &[Option]) -> Series { + self.0.take_opt_chunked_unchecked(by).into_series() + } + + fn take(&self, indices: &IdxCa) -> PolarsResult { + let indices = if indices.chunks.len() > 1 { + Cow::Owned(indices.rechunk()) + } else { + Cow::Borrowed(indices) + }; + Ok(ChunkTake::take(&self.0, (&*indices).into())?.into_series()) + } + + fn take_iter(&self, iter: &mut dyn TakeIterator) -> PolarsResult { + Ok(ChunkTake::take(&self.0, iter.into())?.into_series()) + } + + fn take_every(&self, n: usize) -> Series { + self.0.take_every(n).into_series() + } + + unsafe fn take_iter_unchecked(&self, 
iter: &mut dyn TakeIterator) -> Series { + ChunkTake::take_unchecked(&self.0, iter.into()).into_series() + } + + unsafe fn take_unchecked(&self, idx: &IdxCa) -> PolarsResult { + let idx = if idx.chunks.len() > 1 { + Cow::Owned(idx.rechunk()) + } else { + Cow::Borrowed(idx) + }; + + let mut out = ChunkTake::take_unchecked(&self.0, (&*idx).into()); + + if self.0.is_sorted() && (idx.is_sorted() || idx.is_sorted_reverse()) { + out.set_sorted2(idx.is_sorted2()) + } + + Ok(out.into_series()) + } + + unsafe fn take_opt_iter_unchecked(&self, iter: &mut dyn TakeIteratorNulls) -> Series { + ChunkTake::take_unchecked(&self.0, iter.into()).into_series() + } + + #[cfg(feature = "take_opt_iter")] + fn take_opt_iter(&self, iter: &mut dyn TakeIteratorNulls) -> PolarsResult { + Ok(ChunkTake::take(&self.0, iter.into())?.into_series()) + } + + fn len(&self) -> usize { + self.0.len() + } + + fn rechunk(&self) -> Series { + self.0.rechunk().into_series() + } + + fn expand_at_index(&self, index: usize, length: usize) -> Series { + ChunkExpandAtIndex::expand_at_index(&self.0, index, length).into_series() + } + + fn cast(&self, data_type: &DataType) -> PolarsResult { + self.0.cast(data_type) + } + + fn get(&self, index: usize) -> AnyValue { + self.0.get_any_value(index) + } + + #[inline] + #[cfg(feature = "private")] + unsafe fn get_unchecked(&self, index: usize) -> AnyValue { + self.0.get_any_value_unchecked(index) + } + + fn sort_with(&self, options: SortOptions) -> Series { + ChunkSort::sort_with(&self.0, options).into_series() + } + + fn argsort(&self, options: SortOptions) -> IdxCa { + ChunkSort::argsort(&self.0, options) + } + + fn null_count(&self) -> usize { + self.0.null_count() + } + + fn has_validity(&self) -> bool { + self.0.has_validity() + } + + fn unique(&self) -> PolarsResult { + ChunkUnique::unique(&self.0).map(|ca| ca.into_series()) + } + + fn n_unique(&self) -> PolarsResult { + ChunkUnique::n_unique(&self.0) + } + + fn arg_unique(&self) -> PolarsResult { + 
ChunkUnique::arg_unique(&self.0) + } + + fn arg_min(&self) -> Option { + ArgAgg::arg_min(&self.0) + } + + fn arg_max(&self) -> Option { + ArgAgg::arg_max(&self.0) + } + + fn is_null(&self) -> BooleanChunked { + self.0.is_null() + } + + fn is_not_null(&self) -> BooleanChunked { + self.0.is_not_null() + } + + fn is_unique(&self) -> PolarsResult { + ChunkUnique::is_unique(&self.0) + } + + fn is_duplicated(&self) -> PolarsResult { + ChunkUnique::is_duplicated(&self.0) + } + + fn reverse(&self) -> Series { + ChunkReverse::reverse(&self.0).into_series() + } + + fn as_single_ptr(&mut self) -> PolarsResult { + self.0.as_single_ptr() + } + + fn shift(&self, periods: i64) -> Series { + ChunkShift::shift(&self.0, periods).into_series() + } + + fn fill_null(&self, strategy: FillNullStrategy) -> PolarsResult { + ChunkFillNull::fill_null(&self.0, strategy).map(|ca| ca.into_series()) + } + + fn _sum_as_series(&self) -> Series { + ChunkAggSeries::sum_as_series(&self.0) + } + fn max_as_series(&self) -> Series { + ChunkAggSeries::max_as_series(&self.0) + } + fn min_as_series(&self) -> Series { + ChunkAggSeries::min_as_series(&self.0) + } + fn fmt_list(&self) -> String { + FmtList::fmt_list(&self.0) + } + fn clone_inner(&self) -> Arc { + Arc::new(SeriesWrap(Clone::clone(&self.0))) + } + + #[cfg(feature = "is_in")] + fn is_in(&self, other: &Series) -> PolarsResult { + IsIn::is_in(&self.0, other) + } + #[cfg(feature = "repeat_by")] + fn repeat_by(&self, by: &IdxCa) -> ListChunked { + RepeatBy::repeat_by(&self.0, by) + } + + #[cfg(feature = "is_first")] + fn is_first(&self) -> PolarsResult { + self.0.is_first() + } + + #[cfg(feature = "mode")] + fn mode(&self) -> PolarsResult { + Ok(self.0.mode()?.into_series()) + } +} diff --git a/polars/polars-core/src/series/implementations/boolean.rs b/polars/polars-core/src/series/implementations/boolean.rs index bab64ab264eb..b76178a9331e 100644 --- a/polars/polars-core/src/series/implementations/boolean.rs +++ 
b/polars/polars-core/src/series/implementations/boolean.rs @@ -2,7 +2,6 @@ use std::borrow::Cow; use std::ops::{BitAnd, BitOr, BitXor}; use ahash::RandomState; -use polars_arrow::prelude::QuantileInterpolOptions; use super::{private, IntoSeries, SeriesTrait, *}; use crate::chunked_array::comparison::*; @@ -332,23 +331,6 @@ impl SeriesTrait for SeriesWrap { fn min_as_series(&self) -> Series { ChunkAggSeries::min_as_series(&self.0) } - fn median_as_series(&self) -> Series { - QuantileAggSeries::median_as_series(&self.0) - } - fn var_as_series(&self, ddof: u8) -> Series { - VarAggSeries::var_as_series(&self.0, ddof) - } - fn std_as_series(&self, ddof: u8) -> Series { - VarAggSeries::std_as_series(&self.0, ddof) - } - fn quantile_as_series( - &self, - quantile: f64, - interpol: QuantileInterpolOptions, - ) -> PolarsResult { - QuantileAggSeries::quantile_as_series(&self.0, quantile, interpol) - } - fn fmt_list(&self) -> String { FmtList::fmt_list(&self.0) } diff --git a/polars/polars-core/src/series/implementations/list.rs b/polars/polars-core/src/series/implementations/list.rs index 4d5a9bf7cdcf..a6ad67f606b3 100644 --- a/polars/polars-core/src/series/implementations/list.rs +++ b/polars/polars-core/src/series/implementations/list.rs @@ -1,8 +1,6 @@ use std::any::Any; use std::borrow::Cow; -use polars_arrow::prelude::QuantileInterpolOptions; - use super::{private, IntoSeries, SeriesTrait}; use crate::chunked_array::comparison::*; use crate::chunked_array::ops::explode::ExplodeByOffsets; @@ -220,23 +218,6 @@ impl SeriesTrait for SeriesWrap { fn min_as_series(&self) -> Series { ChunkAggSeries::min_as_series(&self.0) } - fn median_as_series(&self) -> Series { - QuantileAggSeries::median_as_series(&self.0) - } - fn var_as_series(&self, ddof: u8) -> Series { - VarAggSeries::var_as_series(&self.0, ddof) - } - fn std_as_series(&self, ddof: u8) -> Series { - VarAggSeries::std_as_series(&self.0, ddof) - } - fn quantile_as_series( - &self, - quantile: f64, - interpol: 
QuantileInterpolOptions, - ) -> PolarsResult { - QuantileAggSeries::quantile_as_series(&self.0, quantile, interpol) - } - fn fmt_list(&self) -> String { FmtList::fmt_list(&self.0) } diff --git a/polars/polars-core/src/series/implementations/mod.rs b/polars/polars-core/src/series/implementations/mod.rs index 7696c49567df..3a272773b6b4 100644 --- a/polars/polars-core/src/series/implementations/mod.rs +++ b/polars/polars-core/src/series/implementations/mod.rs @@ -1,3 +1,5 @@ +#[cfg(feature = "dtype-binary")] +mod binary; mod boolean; #[cfg(feature = "dtype-categorical")] mod categorical; @@ -567,6 +569,8 @@ impl private::PrivateSeriesNumeric for SeriesWrap {} +#[cfg(feature = "dtype-binary")] +impl private::PrivateSeriesNumeric for SeriesWrap {} impl private::PrivateSeriesNumeric for SeriesWrap {} impl private::PrivateSeriesNumeric for SeriesWrap { fn bit_repr_is_large(&self) -> bool { diff --git a/polars/polars-core/src/series/implementations/utf8.rs b/polars/polars-core/src/series/implementations/utf8.rs index 481eff691ad9..c3a28f60f46a 100644 --- a/polars/polars-core/src/series/implementations/utf8.rs +++ b/polars/polars-core/src/series/implementations/utf8.rs @@ -1,7 +1,6 @@ use std::borrow::Cow; use ahash::RandomState; -use polars_arrow::prelude::QuantileInterpolOptions; use super::{private, IntoSeries, SeriesTrait, *}; use crate::chunked_array::comparison::*; @@ -322,23 +321,6 @@ impl SeriesTrait for SeriesWrap { fn min_as_series(&self) -> Series { ChunkAggSeries::min_as_series(&self.0) } - fn median_as_series(&self) -> Series { - QuantileAggSeries::median_as_series(&self.0) - } - fn var_as_series(&self, ddof: u8) -> Series { - VarAggSeries::var_as_series(&self.0, ddof) - } - fn std_as_series(&self, ddof: u8) -> Series { - VarAggSeries::std_as_series(&self.0, ddof) - } - fn quantile_as_series( - &self, - quantile: f64, - interpol: QuantileInterpolOptions, - ) -> PolarsResult { - QuantileAggSeries::quantile_as_series(&self.0, quantile, interpol) - } - fn 
fmt_list(&self) -> String { FmtList::fmt_list(&self.0) } diff --git a/polars/polars-core/src/series/ops/downcast.rs b/polars/polars-core/src/series/ops/downcast.rs index 2fed1d1d8829..b295523c097e 100644 --- a/polars/polars-core/src/series/ops/downcast.rs +++ b/polars/polars-core/src/series/ops/downcast.rs @@ -158,6 +158,19 @@ impl Series { } } + /// Unpack to ChunkedArray of dtype binary + #[cfg(feature = "dtype-binary")] + pub fn binary(&self) -> PolarsResult<&BinaryChunked> { + match self.dtype() { + DataType::Binary => unsafe { + Ok(&*(self.as_ref() as *const dyn SeriesTrait as *const BinaryChunked)) + }, + dt => Err(PolarsError::SchemaMisMatch( + format!("Series of dtype: {:?} != binary", dt).into(), + )), + } + } + /// Unpack to ChunkedArray of dtype Time #[cfg(feature = "dtype-time")] pub fn time(&self) -> PolarsResult<&TimeChunked> { diff --git a/polars/polars-core/src/series/ops/null.rs b/polars/polars-core/src/series/ops/null.rs index 39f33279ddf1..c9e41a409ee1 100644 --- a/polars/polars-core/src/series/ops/null.rs +++ b/polars/polars-core/src/series/ops/null.rs @@ -50,7 +50,13 @@ impl Series { ChunkedArray::::full_null(name, size).into_series() }}; } - match_dtype_to_logical_apply_macro!(dtype, primitive, utf8, bool) + #[cfg(feature = "dtype-binary")] + macro_rules! binary { + () => {{ + ChunkedArray::::full_null(name, size).into_series() + }}; + } + match_dtype_to_logical_apply_macro!(dtype, primitive, utf8, binary, bool) } } } diff --git a/polars/polars-core/src/series/series_trait.rs b/polars/polars-core/src/series/series_trait.rs index e803a7aed4a9..3a865a68bae6 100644 --- a/polars/polars-core/src/series/series_trait.rs +++ b/polars/polars-core/src/series/series_trait.rs @@ -581,15 +581,15 @@ pub trait SeriesTrait: } /// Get the median of the Series as a new Series of length 1. 
fn median_as_series(&self) -> Series { - invalid_operation_panic!(self) + Series::full_null(self.name(), 1, self.dtype()) } /// Get the variance of the Series as a new Series of length 1. fn var_as_series(&self, _ddof: u8) -> Series { - invalid_operation_panic!(self) + Series::full_null(self.name(), 1, self.dtype()) } /// Get the standard deviation of the Series as a new Series of length 1. fn std_as_series(&self, _ddof: u8) -> Series { - invalid_operation_panic!(self) + Series::full_null(self.name(), 1, self.dtype()) } /// Get the quantile of the ChunkedArray as a new Series of length 1. fn quantile_as_series( @@ -597,7 +597,7 @@ pub trait SeriesTrait: _quantile: f64, _interpol: QuantileInterpolOptions, ) -> PolarsResult { - invalid_operation_panic!(self) + Ok(Series::full_null(self.name(), 1, self.dtype())) } fn fmt_list(&self) -> String { diff --git a/polars/polars-core/src/utils/mod.rs b/polars/polars-core/src/utils/mod.rs index b6d84cbde9bb..a011201e7bb3 100644 --- a/polars/polars-core/src/utils/mod.rs +++ b/polars/polars-core/src/utils/mod.rs @@ -257,9 +257,11 @@ macro_rules! match_dtype_to_physical_apply_macro { /// Apply a macro on the Series #[macro_export] macro_rules! 
match_dtype_to_logical_apply_macro { - ($obj:expr, $macro:ident, $macro_utf8:ident, $macro_bool:ident $(, $opt_args:expr)*) => {{ + ($obj:expr, $macro:ident, $macro_utf8:ident, $macro_binary:ident, $macro_bool:ident $(, $opt_args:expr)*) => {{ match $obj { DataType::Utf8 => $macro_utf8!($($opt_args)*), + #[cfg(feature = "dtype-binary")] + DataType::Binary => $macro_binary!($($opt_args)*), DataType::Boolean => $macro_bool!($($opt_args)*), #[cfg(feature = "dtype-u8")] DataType::UInt8 => $macro!(UInt8Type $(, $opt_args)*), diff --git a/polars/polars-core/src/utils/supertype.rs b/polars/polars-core/src/utils/supertype.rs index 86f5de6467b4..11197bf20e91 100644 --- a/polars/polars-core/src/utils/supertype.rs +++ b/polars/polars-core/src/utils/supertype.rs @@ -205,7 +205,11 @@ pub fn get_supertype(l: &DataType, r: &DataType) -> Option { #[cfg(all(feature = "dtype-date", feature = "dtype-time"))] (Date, Time) => Some(Int64), - // every known type can be casted to a string + // every known type can be casted to a string except binary + #[cfg(feature = "dtype-binary")] + (dt, Utf8) if dt != &DataType::Unknown && dt != &DataType::Binary => Some(Utf8), + + #[cfg(not(feature = "dtype-binary"))] (dt, Utf8) if dt != &DataType::Unknown => Some(Utf8), (dt, Null) => Some(dt.clone()), diff --git a/polars/polars-core/src/vector_hasher.rs b/polars/polars-core/src/vector_hasher.rs index 0f16d30a7196..75e28d994d59 100644 --- a/polars/polars-core/src/vector_hasher.rs +++ b/polars/polars-core/src/vector_hasher.rs @@ -147,6 +147,35 @@ impl VecHash for Utf8Chunked { } } +#[cfg(feature = "dtype-binary")] +impl VecHash for BinaryChunked { + fn vec_hash(&self, random_state: RandomState) -> Vec { + let null_h = get_null_hash_value(random_state.clone()); + let mut av = Vec::with_capacity(self.len()); + self.downcast_iter().for_each(|arr| { + av.extend(arr.into_iter().map(|opt_v| match opt_v { + Some(v) => <[u8]>::get_hash(v, &random_state), + None => null_h, + })) + }); + av + } + + fn 
vec_hash_combine(&self, random_state: RandomState, hashes: &mut [u64]) { + let null_h = get_null_hash_value(random_state.clone()); + self.apply_to_slice( + |opt_v, h| { + let l = match opt_v { + Some(v) => <[u8]>::get_hash(v, &random_state), + None => null_h, + }; + _boost_hash_combine(l, *h) + }, + hashes, + ) + } +} + impl VecHash for BooleanChunked { fn vec_hash(&self, random_state: RandomState) -> Vec { let mut av = Vec::with_capacity(self.len()); diff --git a/polars/polars-io/Cargo.toml b/polars/polars-io/Cargo.toml index 4829cb7286fc..0e130625f319 100644 --- a/polars/polars-io/Cargo.toml +++ b/polars/polars-io/Cargo.toml @@ -27,6 +27,7 @@ dtype-date = ["polars-core/dtype-date", "polars-time/dtype-date"] dtype-datetime = ["polars-core/dtype-datetime", "polars-core/temporal", "polars-time/dtype-datetime"] dtype-time = ["polars-core/dtype-time", "polars-core/temporal", "polars-time/dtype-time"] dtype-struct = ["polars-core/dtype-struct"] +dtype-binary = ["polars-core/dtype-binary"] fmt = ["polars-core/fmt"] lazy = [] parquet = ["polars-core/parquet", "arrow/io_parquet", "arrow/io_parquet_compression", "memmap"] diff --git a/polars/polars-lazy/Cargo.toml b/polars/polars-lazy/Cargo.toml index 60025e80b447..5217d65701e2 100644 --- a/polars/polars-lazy/Cargo.toml +++ b/polars/polars-lazy/Cargo.toml @@ -45,6 +45,7 @@ dtype-duration = ["polars-plan/dtype-duration", "polars-time/dtype-duration", "t dtype-time = ["polars-core/dtype-time", "temporal"] dtype-categorical = ["polars-plan/dtype-categorical"] dtype-struct = ["polars-plan/dtype-struct"] +dtype-binary = ["polars-plan/dtype-binary"] object = ["polars-plan/object"] date_offset = ["polars-plan/date_offset"] trigonometry = ["polars-plan/trigonometry"] diff --git a/polars/polars-lazy/polars-plan/Cargo.toml b/polars/polars-lazy/polars-plan/Cargo.toml index 04821f227c92..186f74366e34 100644 --- a/polars/polars-lazy/polars-plan/Cargo.toml +++ b/polars/polars-lazy/polars-plan/Cargo.toml @@ -43,6 +43,7 @@ dtype-duration 
= ["polars-core/dtype-duration", "polars-time/dtype-duration", "t dtype-time = ["polars-core/dtype-time", "polars-time/dtype-time"] dtype-categorical = ["polars-core/dtype-categorical"] dtype-struct = ["polars-core/dtype-struct"] +dtype-binary = ["polars-core/dtype-binary"] object = ["polars-core/object"] date_offset = ["polars-time"] trigonometry = [] diff --git a/polars/polars-lazy/polars-plan/src/logical_plan/format.rs b/polars/polars-lazy/polars-plan/src/logical_plan/format.rs index b712e76e6c42..f9bb792fbba1 100644 --- a/polars/polars-lazy/polars-plan/src/logical_plan/format.rs +++ b/polars/polars-lazy/polars-plan/src/logical_plan/format.rs @@ -397,6 +397,8 @@ impl Debug for LiteralValue { Null => write!(f, "null"), Boolean(b) => write!(f, "{}", b), Utf8(s) => write!(f, "{}", s), + #[cfg(feature = "dtype-binary")] + Binary(_) => write!(f, "[binary value]"), #[cfg(feature = "dtype-u8")] UInt8(v) => write!(f, "{}u8", v), #[cfg(feature = "dtype-u16")] diff --git a/polars/polars-lazy/polars-plan/src/logical_plan/lit.rs b/polars/polars-lazy/polars-plan/src/logical_plan/lit.rs index 8d7a9d186dd8..29df32fddbdc 100644 --- a/polars/polars-lazy/polars-plan/src/logical_plan/lit.rs +++ b/polars/polars-lazy/polars-plan/src/logical_plan/lit.rs @@ -14,6 +14,9 @@ pub enum LiteralValue { Boolean(bool), /// A UTF8 encoded string type. Utf8(String), + /// A raw binary array + #[cfg(feature = "dtype-binary")] + Binary(Vec), /// An unsigned 8-bit integer number. #[cfg(feature = "dtype-u8")] UInt8(u8), @@ -97,6 +100,8 @@ impl LiteralValue { LiteralValue::Float32(_) => DataType::Float32, LiteralValue::Float64(_) => DataType::Float64, LiteralValue::Utf8(_) => DataType::Utf8, + #[cfg(feature = "dtype-binary")] + LiteralValue::Binary(_) => DataType::Binary, LiteralValue::Range { data_type, .. 
} => data_type.clone(), #[cfg(all(feature = "temporal", feature = "dtype-datetime"))] LiteralValue::DateTime(_, tu) => DataType::Datetime(*tu, None), @@ -125,6 +130,20 @@ impl<'a> Literal for &'a str { } } +#[cfg(feature = "dtype-binary")] +impl Literal for Vec { + fn lit(self) -> Expr { + Expr::Literal(LiteralValue::Binary(self)) + } +} + +#[cfg(feature = "dtype-binary")] +impl<'a> Literal for &'a [u8] { + fn lit(self) -> Expr { + Expr::Literal(LiteralValue::Binary(self.to_vec())) + } +} + impl TryFrom> for LiteralValue { type Error = PolarsError; fn try_from(value: AnyValue) -> PolarsResult { @@ -132,6 +151,8 @@ impl TryFrom> for LiteralValue { AnyValue::Null => Ok(Self::Null), AnyValue::Boolean(b) => Ok(Self::Boolean(b)), AnyValue::Utf8(s) => Ok(Self::Utf8(s.to_string())), + #[cfg(feature = "dtype-binary")] + AnyValue::Binary(b) => Ok(Self::Binary(b.to_vec())), #[cfg(feature = "dtype-u8")] AnyValue::UInt8(u) => Ok(Self::UInt8(u)), #[cfg(feature = "dtype-u16")] diff --git a/polars/polars-lazy/src/physical_plan/expressions/literal.rs b/polars/polars-lazy/src/physical_plan/expressions/literal.rs index e217dfa7c888..9d16fd69deda 100644 --- a/polars/polars-lazy/src/physical_plan/expressions/literal.rs +++ b/polars/polars-lazy/src/physical_plan/expressions/literal.rs @@ -74,6 +74,8 @@ impl PhysicalExpr for LiteralExpr { } }, Utf8(v) => Utf8Chunked::full("literal", v, 1).into_series(), + #[cfg(feature = "dtype-binary")] + Binary(v) => BinaryChunked::full("literal", v, 1).into_series(), #[cfg(feature = "temporal")] DateTime(ndt, tu) => { use polars_core::chunked_array::temporal::conversion::*; diff --git a/polars/polars-ops/Cargo.toml b/polars/polars-ops/Cargo.toml index 8fdc0ff1f133..92de47c33829 100644 --- a/polars/polars-ops/Cargo.toml +++ b/polars/polars-ops/Cargo.toml @@ -20,6 +20,7 @@ dtype-datetime = ["polars-core/dtype-datetime", "polars-core/temporal"] dtype-time = ["polars-core/dtype-time", "polars-core/temporal"] dtype-duration = 
["polars-core/dtype-duration", "polars-core/temporal"] dtype-struct = ["polars-core/dtype-struct", "polars-core/temporal"] +dtype-binary = ["polars-core/dtype-binary", "polars-core/dtype-binary"] dtype-u8 = ["polars-core/dtype-u8"] object = ["polars-core/object"] propagate_nans = [] diff --git a/polars/src/lib.rs b/polars/src/lib.rs index 269b27c19ee9..b121df9b2e70 100644 --- a/polars/src/lib.rs +++ b/polars/src/lib.rs @@ -275,6 +275,7 @@ //! | UInt16 | dtype-u16 | //! | Categorical | dtype-categorical | //! | Struct | dtype-struct | +//! | Binary | dtype-binary | //! //! //! Or you can choose on of the preconfigured pre-sets. diff --git a/py-polars/Cargo.toml b/py-polars/Cargo.toml index 886e10cdfbab..1b02cb70fc68 100644 --- a/py-polars/Cargo.toml +++ b/py-polars/Cargo.toml @@ -90,6 +90,7 @@ all = [ "propagate_nans", "polars/groupby_list", "polars-sql", + "polars/dtype-binary", ] # we cannot conditionaly activate simd diff --git a/py-polars/polars/__init__.py b/py-polars/polars/__init__.py index 0a3c2616f9e0..df9280070ea3 100644 --- a/py-polars/polars/__init__.py +++ b/py-polars/polars/__init__.py @@ -21,6 +21,7 @@ def version() -> str: from_records, ) from polars.datatypes import ( + Binary, Boolean, Categorical, DataType, @@ -172,6 +173,7 @@ def version() -> str: "Float32", "Float64", "Boolean", + "Binary", "Utf8", "List", "Date", diff --git a/py-polars/polars/datatypes.py b/py-polars/polars/datatypes.py index f90c03cfa4af..372bf7064894 100644 --- a/py-polars/polars/datatypes.py +++ b/py-polars/polars/datatypes.py @@ -160,6 +160,10 @@ class Utf8(DataType): """UTF-8 encoded string type.""" +class Binary(DataType): + """Binary type.""" + + class Null(DataType): """Type representing Null / None values.""" @@ -373,6 +377,7 @@ def __hash__(self) -> int: Object: "object", Categorical: "categorical", Struct: "struct", + Binary: "binary", } for tu in DTYPE_TEMPORAL_UNITS: _DTYPE_TO_FFINAME[Datetime(tu)] = "datetime" @@ -411,6 +416,7 @@ def __hash__(self) -> int: list: 
List, tuple: List, Decimal: Float64, + bytes: Binary, } _PY_STR_TO_DTYPE: dict[str, PolarsDataType] = { @@ -434,6 +440,7 @@ def __hash__(self) -> int: Datetime: datetime, Date: date, Time: time, + Binary: bytes, } for tu in DTYPE_TEMPORAL_UNITS: _DTYPE_TO_PY_TYPE[Datetime(tu)] = datetime diff --git a/py-polars/src/conversion.rs b/py-polars/src/conversion.rs index dfb63716f481..78cee09e20c5 100644 --- a/py-polars/src/conversion.rs +++ b/py-polars/src/conversion.rs @@ -17,7 +17,7 @@ use pyo3::basic::CompareOp; use pyo3::conversion::{FromPyObject, IntoPy}; use pyo3::exceptions::PyValueError; use pyo3::prelude::*; -use pyo3::types::{PyBool, PyDict, PyList, PySequence}; +use pyo3::types::{PyBool, PyBytes, PyDict, PyList, PySequence}; use pyo3::{PyAny, PyResult}; use crate::dataframe::PyDataFrame; @@ -136,6 +136,22 @@ impl<'a> FromPyObject<'a> for Wrap { } } +impl<'a> FromPyObject<'a> for Wrap { + fn extract(obj: &'a PyAny) -> PyResult { + let (seq, len) = get_pyseq(obj)?; + let mut builder = BinaryChunkedBuilder::new("", len, len * 25); + + for res in seq.iter()? 
{ + let item = res?; + match item.extract::<&str>() { + Ok(val) => builder.append_value(val), + Err(_) => builder.append_null(), + } + } + Ok(Wrap(builder.finish())) + } +} + impl<'a> FromPyObject<'a> for Wrap { fn extract(ob: &'a PyAny) -> PyResult { if let Ok(s) = ob.extract::() { @@ -223,6 +239,8 @@ impl IntoPy for Wrap> { let s = format!("{}", v); s.into_py(py) } + AnyValue::Binary(v) => v.into_py(py), + AnyValue::BinaryOwned(v) => v.into_py(py), } } } @@ -244,6 +262,7 @@ impl ToPyObject for Wrap { DataType::Float64 => pl.getattr("Float64").unwrap().into(), DataType::Boolean => pl.getattr("Boolean").unwrap().into(), DataType::Utf8 => pl.getattr("Utf8").unwrap().into(), + DataType::Binary => pl.getattr("Binary").unwrap().into(), DataType::List(inner) => { let inner = Wrap(*inner.clone()).to_object(py); let list_class = pl.getattr("List").unwrap(); @@ -308,6 +327,7 @@ impl FromPyObject<'_> for Wrap { "Int32" => DataType::Int32, "Int64" => DataType::Int64, "Utf8" => DataType::Utf8, + "Binary" => DataType::Binary, "Boolean" => DataType::Boolean, "Categorical" => DataType::Categorical(None), "Date" => DataType::Date, @@ -391,6 +411,16 @@ impl ToPyObject for Wrap<&Utf8Chunked> { } } +impl ToPyObject for Wrap<&BinaryChunked> { + fn to_object(&self, py: Python) -> PyObject { + let iter = self + .0 + .into_iter() + .map(|opt_bytes| opt_bytes.map(|bytes| PyBytes::new(py, bytes))); + PyList::new(py, iter).into_py(py) + } +} + impl ToPyObject for Wrap<&StructChunked> { fn to_object(&self, py: Python) -> PyObject { let s = self.0.clone().into_series(); @@ -564,6 +594,8 @@ impl<'s> FromPyObject<'s> for Wrap> { let v = td.extract::(py).unwrap(); Ok(Wrap(AnyValue::Duration(v, TimeUnit::Microseconds))) }) + } else if let Ok(v) = ob.extract::<&'s [u8]>() { + Ok(AnyValue::Binary(v).into()) } else { Err(PyErr::from(PyPolarsErr::Other(format!( "row type not supported {:?}", diff --git a/py-polars/src/datatypes.rs b/py-polars/src/datatypes.rs index 94c0643f5bad..c00c4708860c 100644 
--- a/py-polars/src/datatypes.rs +++ b/py-polars/src/datatypes.rs @@ -27,6 +27,7 @@ pub(crate) enum PyDataType { Object, Categorical, Struct, + Binary, } impl From<&DataType> for PyDataType { @@ -45,6 +46,7 @@ impl From<&DataType> for PyDataType { DataType::Float64 => Float64, DataType::Boolean => Bool, DataType::Utf8 => Utf8, + DataType::Binary => Binary, DataType::List(_) => List, DataType::Date => Date, DataType::Datetime(tu, tz) => Datetime(*tu, tz.clone()), @@ -83,6 +85,7 @@ impl From for DataType { PyDataType::Float64 => Float64, PyDataType::Bool => Boolean, PyDataType::Utf8 => Utf8, + PyDataType::Binary => Binary, PyDataType::List => List(DataType::Null.into()), PyDataType::Date => Date, PyDataType::Datetime(tu, tz) => Datetime(tu, tz), diff --git a/py-polars/src/series.rs b/py-polars/src/series.rs index af98857e08b7..22fd8d3d7bf9 100644 --- a/py-polars/src/series.rs +++ b/py-polars/src/series.rs @@ -713,7 +713,13 @@ impl PySeries { let ca = series.duration().unwrap(); return Wrap(ca).to_object(py); } - dt => panic!("to_list() not implemented for {:?}", dt), + DataType::Binary => { + let ca = series.binary().unwrap(); + return Wrap(ca).to_object(py); + } + DataType::Null | DataType::Unknown => { + panic!("to_list not implemented for null/unknown") + } }; pylist.to_object(py) } diff --git a/py-polars/tests/unit/test_binary.py b/py-polars/tests/unit/test_binary.py new file mode 100644 index 000000000000..343a27f68b6d --- /dev/null +++ b/py-polars/tests/unit/test_binary.py @@ -0,0 +1,15 @@ +import polars as pl + + +def test_binary_conversions() -> None: + df = pl.DataFrame({"blob": [b"abc", None, b"cde"]}).with_column( + pl.col("blob").cast(pl.Utf8).alias("decoded_blob") + ) + + assert df.to_dict(False) == { + "blob": [b"abc", None, b"cde"], + "decoded_blob": ["abc", None, "cde"], + } + assert df[0, 0] == b"abc" + assert df[1, 0] is None + assert df.dtypes == [pl.Binary, pl.Utf8]