Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(rust, python): add binary dtype #5122

Merged
merged 32 commits into from
Oct 6, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
48511c0
feat: add support for arrow binary type
ozgrakkurt Sep 22, 2022
fa87cd6
fix DataType from AnyValue
ozgrakkurt Sep 22, 2022
3ac6fca
fmt
ozgrakkurt Sep 22, 2022
f3930fe
run fmt, fix test failure
ozgrakkurt Sep 22, 2022
e14bae4
impl execution for binary type
ozgrakkurt Sep 23, 2022
7dffa6a
impl_named_from! binary
ozgrakkurt Sep 23, 2022
98b0442
impl binary comparison
ozgrakkurt Sep 23, 2022
06da9b6
read arrow binary as binary instead of List<u8>
ozgrakkurt Sep 23, 2022
14353ef
add Binary to Series::from_chunks_and_dtype_unchecked
ozgrakkurt Sep 26, 2022
90a58cb
add Binary to match_dtype_to_logical_apply_macro
ozgrakkurt Sep 26, 2022
342218d
Merge branch 'master' into master
ozgrakkurt Sep 27, 2022
69846bf
update toml files
ozgrakkurt Sep 27, 2022
c5ac957
fix joins for binary
ozgrakkurt Sep 28, 2022
a668bd8
fix predicate pushdown bug
ozgrakkurt Sep 28, 2022
6000e4c
Merge branch 'pola-rs:master' into master
ozgrakkurt Sep 28, 2022
e53b80c
Revert "fix predicate pushdown bug"
ozgrakkurt Sep 29, 2022
a675e4b
Merge branch 'master' of github.com:ozgrakkurt/polars
ozgrakkurt Sep 29, 2022
922fb5e
merge upstream
ozgrakkurt Sep 29, 2022
109b057
add dtype-binary feature gate
ozgrakkurt Sep 29, 2022
62fa394
fix compilation
ozgrakkurt Sep 29, 2022
59522e8
fix test compilation
ozgrakkurt Sep 29, 2022
6791e84
fix clippy warnings
ozgrakkurt Sep 29, 2022
5c9afe9
fix clippy warnings
ozgrakkurt Sep 29, 2022
406c56f
Merge branch 'pola-rs:master' into master
ozgrakkurt Sep 30, 2022
5efcdd0
Merge branch 'pola-rs:master' into master
ozgrakkurt Sep 30, 2022
1893c47
Merge branch 'pola-rs:master' into master
ozgrakkurt Oct 3, 2022
9ca192c
Merge branch 'pola-rs:master' into master
ozgrakkurt Oct 5, 2022
6f88264
Merge branch 'master' of github.com:ozgrakkurt/polars into ozgrakkurt…
ritchie46 Oct 6, 2022
48d99a6
Merge branch 'master' into ozgrakkurt-master
ritchie46 Oct 6, 2022
1943b7b
remove unneeded trait impls
ritchie46 Oct 6, 2022
896586a
expose to python
ritchie46 Oct 6, 2022
8f595c4
fix lint
ritchie46 Oct 6, 2022
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions polars/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,7 @@ dtype-full = [
"dtype-u16",
"dtype-categorical",
"dtype-struct",
"dtype-binary",
]

# sensible minimal set of opt-in datatypes
Expand Down Expand Up @@ -210,6 +211,12 @@ dtype-struct = [
"polars-ops/dtype-struct",
"polars-io/dtype-struct",
]
dtype-binary = [
"polars-core/dtype-binary",
"polars-lazy/dtype-binary",
"polars-ops/dtype-binary",
"polars-io/dtype-binary",
]

docs-selection = [
"csv-file",
Expand Down
22 changes: 21 additions & 1 deletion polars/polars-arrow/src/array/default_arrays.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use arrow::array::{BooleanArray, PrimitiveArray, Utf8Array};
use arrow::array::{BinaryArray, BooleanArray, PrimitiveArray, Utf8Array};
use arrow::bitmap::Bitmap;
use arrow::buffer::Buffer;
use arrow::datatypes::DataType;
Expand Down Expand Up @@ -40,3 +40,23 @@ impl FromDataUtf8 for Utf8Array<i64> {
Utf8Array::from_data_unchecked(DataType::LargeUtf8, offsets, values, validity)
}
}

/// Construction of a binary array straight from its raw `offsets`, `values`
/// and optional `validity` buffers.
pub trait FromDataBinary {
/// Builds `Self` from the given buffers.
///
/// # Safety
/// `values` holds arbitrary bytes; unlike the UTF-8 counterpart it does not
/// need to contain valid UTF-8.
/// NOTE(review): presumably `offsets` must be monotonically increasing and
/// must not point past the end of `values`, since nothing is validated here —
/// confirm against the contract of `BinaryArray::from_data_unchecked`.
unsafe fn from_data_unchecked_default(
offsets: Buffer<i64>,
values: Buffer<u8>,
validity: Option<Bitmap>,
) -> Self;
}

impl FromDataBinary for BinaryArray<i64> {
    /// Assembles a `BinaryArray<i64>` tagged as `DataType::LargeBinary` from raw buffers.
    ///
    /// # Safety
    /// Forwards directly to `BinaryArray::from_data_unchecked`; the caller must uphold
    /// that function's requirements on `offsets`, `values` and `validity`.
    unsafe fn from_data_unchecked_default(
        offsets: Buffer<i64>,
        values: Buffer<u8>,
        validity: Option<Bitmap>,
    ) -> Self {
        let dtype = DataType::LargeBinary;
        Self::from_data_unchecked(dtype, offsets, values, validity)
    }
}
24 changes: 23 additions & 1 deletion polars/polars-arrow/src/array/get.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use arrow::array::{Array, BooleanArray, ListArray, PrimitiveArray, Utf8Array};
use arrow::array::{Array, BinaryArray, BooleanArray, ListArray, PrimitiveArray, Utf8Array};
use arrow::types::NativeType;

use crate::is_valid::IsValid;
Expand Down Expand Up @@ -79,6 +79,28 @@ impl<'a> ArrowGetItem for &'a Utf8Array<i64> {
}
}

impl<'a> ArrowGetItem for &'a BinaryArray<i64> {
    type Item = &'a [u8];

    /// Bounds-checked lookup: `None` when `item` is past the end or the slot is null.
    #[inline]
    fn get(&self, item: usize) -> Option<Self::Item> {
        if item < self.len() {
            // SAFETY: `item` was just checked against `self.len()`.
            unsafe { self.get_unchecked(item) }
        } else {
            None
        }
    }

    /// Lookup without a bounds check: `None` for a null slot, otherwise the slot's bytes.
    ///
    /// # Safety
    /// `item` must be smaller than the length of the array.
    #[inline]
    unsafe fn get_unchecked(&self, item: usize) -> Option<Self::Item> {
        if self.is_null_unchecked(item) {
            return None;
        }
        Some(self.value_unchecked(item))
    }
}

impl ArrowGetItem for ListArray<i64> {
type Item = Box<dyn Array>;

Expand Down
54 changes: 53 additions & 1 deletion polars/polars-arrow/src/array/mod.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use arrow::array::{Array, BooleanArray, ListArray, PrimitiveArray, Utf8Array};
use arrow::array::{Array, BinaryArray, BooleanArray, ListArray, PrimitiveArray, Utf8Array};
use arrow::bitmap::MutableBitmap;
use arrow::datatypes::DataType;
use arrow::types::NativeType;
Expand Down Expand Up @@ -30,6 +30,12 @@ impl ValueSize for Utf8Array<i64> {
}
}

impl ValueSize for BinaryArray<i64> {
    /// Reports the length of the buffer returned by `values()`.
    fn get_values_size(&self) -> usize {
        let bytes = self.values();
        bytes.len()
    }
}

impl ValueSize for ArrayRef {
fn get_values_size(&self) -> usize {
match self.data_type() {
Expand Down Expand Up @@ -179,6 +185,52 @@ pub trait ListFromIter {
Some(validity.into()),
)
}

/// Create a list-array of binary values from an iterator of optional sub-iterators.
/// Used in groupby agg-list
///
/// Every outer `Some(inner)` becomes one valid list slot containing the items yielded by
/// `inner`; every outer `None` becomes a null (empty) list slot.
///
/// # Safety
/// Will produce incorrect arrays if size hint is incorrect: the offsets are computed from
/// the lower bound of each inner iterator's `size_hint`, and `n_elements` is handed to
/// `trust_my_length` as the total number of inner items.
unsafe fn from_iter_binary_trusted_len<I, P, Ref>(iter: I, n_elements: usize) -> ListArray<i64>
where
    I: IntoIterator<Item = Option<P>>,
    P: IntoIterator<Item = Option<Ref>>,
    Ref: AsRef<[u8]>,
{
    let iterator = iter.into_iter();
    let (lower, _) = iterator.size_hint();

    let mut validity = MutableBitmap::with_capacity(lower);
    let mut offsets = Vec::<i64>::with_capacity(lower + 1);
    let mut length_so_far = 0i64;
    offsets.push(length_so_far);
    let values: BinaryArray<i64> = iterator
        .filter_map(|opt_iter| match opt_iter {
            Some(x) => {
                let it = x.into_iter();
                length_so_far += it.size_hint().0 as i64;
                validity.push(true);
                offsets.push(length_so_far);
                Some(it)
            }
            None => {
                validity.push(false);
                // A null slot still occupies one entry in the offsets buffer (an empty
                // range); without it `offsets` would end up shorter than
                // `validity.len() + 1` and the list length would disagree with the
                // validity bitmap.
                offsets.push(length_so_far);
                None
            }
        })
        .flatten()
        .trust_my_length(n_elements)
        .collect();

    // Safety:
    // offsets are monotonically increasing
    ListArray::new_unchecked(
        ListArray::<i64>::default_datatype(DataType::LargeBinary),
        offsets.into(),
        Box::new(values),
        Some(validity.into()),
    )
}
}
impl ListFromIter for ListArray<i64> {}

Expand Down
185 changes: 185 additions & 0 deletions polars/polars-arrow/src/compute/take/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -327,6 +327,21 @@ pub unsafe fn take_no_null_utf8_iter_unchecked<I: IntoIterator<Item = usize>>(
Box::new(MutableUtf8Array::<i64>::from_trusted_len_values_iter_unchecked(iter).into())
}

/// Gathers the values of `arr` at the positions yielded by `indices` into a new array
/// built from values only (no validity is consulted).
///
/// # Safety
/// - no bounds checks (positions are only verified by a `debug_assert!`)
/// - iterator must be TrustedLen
#[inline]
pub unsafe fn take_no_null_binary_iter_unchecked<I: IntoIterator<Item = usize>>(
    arr: &LargeBinaryArray,
    indices: I,
) -> Box<LargeBinaryArray> {
    let gathered = indices.into_iter().map(|position| {
        debug_assert!(position < arr.len());
        arr.value_unchecked(position)
    });
    let builder = MutableBinaryArray::<i64>::from_trusted_len_values_iter_unchecked(gathered);
    Box::new(builder.into())
}

/// # Safety
/// - no bounds checks
/// - iterator must be TrustedLen
Expand All @@ -348,6 +363,27 @@ pub unsafe fn take_utf8_iter_unchecked<I: IntoIterator<Item = usize>>(
Box::new(LargeStringArray::from_trusted_len_iter_unchecked(iter))
}

/// Gathers the values of `arr` at the positions yielded by `indices`, emitting a null
/// wherever the validity bitmap of `arr` marks the position as unset.
///
/// # Panics
/// Panics with "should have nulls" when `arr.validity()` is `None`.
///
/// # Safety
/// - no bounds checks (positions are only verified by a `debug_assert!`)
/// - iterator must be TrustedLen
#[inline]
pub unsafe fn take_binary_iter_unchecked<I: IntoIterator<Item = usize>>(
    arr: &LargeBinaryArray,
    indices: I,
) -> Box<LargeBinaryArray> {
    let null_mask = arr.validity().expect("should have nulls");
    let picked = indices.into_iter().map(|position| {
        debug_assert!(position < arr.len());
        if !null_mask.get_bit_unchecked(position) {
            return None;
        }
        Some(arr.value_unchecked(position))
    });

    Box::new(LargeBinaryArray::from_trusted_len_iter_unchecked(picked))
}

/// # Safety
/// - no bounds checks
/// - iterator must be TrustedLen
Expand All @@ -363,6 +399,21 @@ pub unsafe fn take_no_null_utf8_opt_iter_unchecked<I: IntoIterator<Item = Option
Box::new(LargeStringArray::from_trusted_len_iter_unchecked(iter))
}

/// Gathers the values of `arr` at the optional positions yielded by `indices`; a `None`
/// index produces a null output slot. The validity of `arr` itself is not consulted.
///
/// # Safety
/// - no bounds checks
/// - iterator must be TrustedLen
#[inline]
pub unsafe fn take_no_null_binary_opt_iter_unchecked<I: IntoIterator<Item = Option<usize>>>(
    arr: &LargeBinaryArray,
    indices: I,
) -> Box<LargeBinaryArray> {
    let slots = indices.into_iter();
    let picked = slots.map(|slot| slot.map(|position| arr.value_unchecked(position)));

    Box::new(LargeBinaryArray::from_trusted_len_iter_unchecked(picked))
}

/// # Safety
/// - no bounds checks
/// - iterator must be TrustedLen
Expand All @@ -384,6 +435,27 @@ pub unsafe fn take_utf8_opt_iter_unchecked<I: IntoIterator<Item = Option<usize>>
Box::new(LargeStringArray::from_trusted_len_iter_unchecked(iter))
}

/// Gathers the values of `arr` at the optional positions yielded by `indices`. The output
/// slot is null when the index is `None` or when the validity bitmap of `arr` marks the
/// position as unset.
///
/// # Panics
/// Panics with "should have nulls" when `arr.validity()` is `None`.
///
/// # Safety
/// - no bounds checks
/// - iterator must be TrustedLen
#[inline]
pub unsafe fn take_binary_opt_iter_unchecked<I: IntoIterator<Item = Option<usize>>>(
    arr: &LargeBinaryArray,
    indices: I,
) -> Box<LargeBinaryArray> {
    let null_mask = arr.validity().expect("should have nulls");
    let picked = indices.into_iter().map(|slot| match slot {
        Some(position) if null_mask.get_bit_unchecked(position) => {
            Some(arr.value_unchecked(position))
        }
        _ => None,
    });
    Box::new(LargeBinaryArray::from_trusted_len_iter_unchecked(picked))
}

/// # Safety
/// caller must ensure indices are in bounds
pub unsafe fn take_utf8_unchecked(
Expand Down Expand Up @@ -497,6 +569,119 @@ pub unsafe fn take_utf8_unchecked(
))
}

/// Takes the values of `arr` at the positions stored in `indices` and returns them as a
/// new binary array with one slot per index.
///
/// A null index, or an index that points at a null value of `arr`, yields a null slot.
///
/// # Safety
/// caller must ensure indices are in bounds
pub unsafe fn take_binary_unchecked(
arr: &LargeBinaryArray,
indices: &IdxArr,
) -> Box<LargeBinaryArray> {
let data_len = indices.len();

// One offset per output slot plus the leading 0.
let mut offset_buf = vec![0; data_len + 1];
let offset_typed = offset_buf.as_mut_slice();

// Running total of bytes written to the output values buffer.
let mut length_so_far = 0;
offset_typed[0] = length_so_far;

let validity;

// The required size is yet unknown
// Allocate 2.0 times the expected size.
// where expected size is the length of bytes multiplied by the factor (take_len / current_len)
// NOTE(review): the expression below divides `2 * arr.len()` by `arr.len()`, so it
// evaluates to roughly `2 * indices.len()` and never looks at the byte length of `arr` —
// confirm whether the size of the values buffer was meant to be used here.
let mut values_capacity = if arr.len() > 0 {
((arr.len() as f32 * 2.0) as usize) / arr.len() * indices.len() as usize
} else {
0
};

// Output byte buffer, pre-sized with the estimate computed above.
let mut values_buf = Vec::<u8>::with_capacity(values_capacity);

// Neither `arr` nor `indices` has a validity bitmap: every output slot is valid.
if !arr.has_validity() && !indices.has_validity() {
offset_typed
.iter_mut()
.skip(1)
.enumerate()
.for_each(|(idx, offset)| {
let index = indices.value_unchecked(idx) as usize;
let s = arr.value_unchecked(index);
length_so_far += s.len() as i64;
*offset = length_so_far;

// Reserve more room once the running length reaches the tracked capacity.
if length_so_far as usize >= values_capacity {
values_buf.reserve(values_capacity);
values_capacity *= 2;
}

values_buf.extend_from_slice(s)
});
validity = None;
} else if !arr.has_validity() {
// Only `indices` can be null here: a null index copies no bytes, so its offset
// repeats the previous one, and the output reuses the validity of `indices`.
offset_typed
.iter_mut()
.skip(1)
.enumerate()
.for_each(|(idx, offset)| {
if indices.is_valid(idx) {
let index = indices.value_unchecked(idx) as usize;
let s = arr.value_unchecked(index);
length_so_far += s.len() as i64;

// Reserve more room once the running length reaches the tracked capacity.
if length_so_far as usize >= values_capacity {
values_buf.reserve(values_capacity);
values_capacity *= 2;
}

values_buf.extend_from_slice(s)
}
*offset = length_so_far;
});
validity = indices.validity().cloned();
} else {
// `arr` has a validity bitmap: build the result slot by slot through a
// `MutableBinaryArray` and return early.
// NOTE(review): `length_so_far` is still 0 at this point, so no capacity is
// requested for the value bytes.
let mut builder = MutableBinaryArray::with_capacities(data_len, length_so_far as usize);
let validity_arr = arr.validity().expect("should have nulls");

if !indices.has_validity() {
// All indices are valid; nullness depends only on the source value.
(0..data_len).for_each(|idx| {
let index = indices.value_unchecked(idx) as usize;
builder.push(if validity_arr.get_bit_unchecked(index) {
let s = arr.value_unchecked(index);
Some(s)
} else {
None
});
});
} else {
// Both sides can be null: a slot is valid only if the index and the value
// it points at are both valid.
let validity_indices = indices.validity().expect("should have nulls");
(0..data_len).for_each(|idx| {
if validity_indices.get_bit_unchecked(idx) {
let index = indices.value_unchecked(idx) as usize;

if validity_arr.get_bit_unchecked(index) {
let s = arr.value_unchecked(index);
builder.push(Some(s));
} else {
builder.push_null();
}
} else {
builder.push_null();
}
});
}

let array: BinaryArray<i64> = builder.into();
return Box::new(array);
}

// Safety: the offsets were accumulated from the lengths of the slices appended to
// `values_buf`, so they never decrease and the last one equals `values_buf.len()`.
Box::new(BinaryArray::<i64>::from_data_unchecked_default(
offset_buf.into(),
values_buf.into(),
validity,
))
}

/// Forked and adapted from arrow-rs
/// This is faster because it does no bounds checks and allocates directly into aligned memory
///
Expand Down
2 changes: 2 additions & 0 deletions polars/polars-arrow/src/data_types.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ unsafe impl IsFloat for u16 {}
// Empty `IsFloat` impls: each of these types takes whatever defaults the trait declares.
// NOTE(review): presumably those defaults mean "not a float" — confirm at the trait definition.
unsafe impl IsFloat for u32 {}
unsafe impl IsFloat for u64 {}
unsafe impl IsFloat for &str {}
unsafe impl IsFloat for &[u8] {}
unsafe impl IsFloat for bool {}
unsafe impl<T: IsFloat> IsFloat for Option<T> {}

Expand All @@ -41,6 +42,7 @@ mod private {
// NOTE(review): `Sealed` is implemented inside this private module, presumably so that only
// the types listed here can satisfy traits bounded by it — confirm where `Sealed` is used.
impl Sealed for f32 {}
impl Sealed for f64 {}
impl Sealed for &str {}
impl Sealed for &[u8] {}
impl Sealed for bool {}
impl<T: Sealed> Sealed for Option<T> {}
}
Expand Down
Loading