From 57bb533c21609e789bb671863f828c349a6861c1 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sat, 4 Jan 2025 08:05:26 -0500 Subject: [PATCH 1/5] Improve Bytes documentation --- arrow-buffer/src/bytes.rs | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/arrow-buffer/src/bytes.rs b/arrow-buffer/src/bytes.rs index 77724137aef7..b811bd2c6b40 100644 --- a/arrow-buffer/src/bytes.rs +++ b/arrow-buffer/src/bytes.rs @@ -28,14 +28,18 @@ use crate::buffer::dangling_ptr; /// A continuous, fixed-size, immutable memory region that knows how to de-allocate itself. /// -/// This structs' API is inspired by the `bytes::Bytes`, but it is not limited to using rust's -/// global allocator nor u8 alignment. +/// Note that this structure is an internal implementation detail of the +/// arrow-rs crate. While it has the same name and similar API as +/// [`bytes::Bytes`] it is not limited to rust's global allocator nor u8 +/// alignment. It is possible to create a `Bytes` from `bytes::Bytes` using the +/// `From` implementation. /// /// In the most common case, this buffer is allocated using [`alloc`](std::alloc::alloc) /// with an alignment of [`ALIGNMENT`](crate::alloc::ALIGNMENT) /// /// When the region is allocated by a different allocator, [Deallocation::Custom], this calls the /// custom deallocator to deallocate the region when it is no longer needed. +/// pub struct Bytes { /// The raw pointer to be beginning of the region ptr: NonNull, From cae4a7cd86af47704082cd98cbbedae3efa444f6 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sat, 4 Jan 2025 09:20:41 -0500 Subject: [PATCH 2/5] Improve Buffer documentation, add From and From impls --- arrow-buffer/src/buffer/immutable.rs | 101 ++++++++++++++---- .../src/arrow/array_reader/byte_view_array.rs | 10 +- 2 files changed, 86 insertions(+), 25 deletions(-) diff --git a/arrow-buffer/src/buffer/immutable.rs b/arrow-buffer/src/buffer/immutable.rs index cf1d6f366751..53f6aa280e8b 100644 --- a/arrow-buffer/src/buffer/immutable.rs +++ b/arrow-buffer/src/buffer/immutable.rs @@ -28,8 +28,43 @@ use crate::{bit_util, bytes::Bytes, native::ArrowNativeType}; use super::ops::bitwise_unary_op_helper; use super::{MutableBuffer, ScalarBuffer}; -/// Buffer represents a contiguous memory region that can be shared with other buffers and across -/// thread boundaries. +/// A contiguous memory region that can be shared with other buffers and across +/// thread boundaries that stores Arrow data. +/// +/// `Buffer`s can be sliced and cloned without copying the underlying data and can +/// be created from memory allocated by non-Rust sources such as C/C++. +/// +/// # Example: Create a `Buffer` from a `Vec` (without copying) +/// ``` +/// # use arrow_buffer::Buffer; +/// let vec: Vec = vec![1, 2, 3]; +/// let buffer = Buffer::from(vec); +/// ``` +/// +/// # Example: Convert a `Buffer` to a `Vec` (without copying) +/// +/// Use [`Self::into_vec`] to convert a `Buffer` back into a `Vec` if there are +/// no other references and the types are aligned correctly. +/// ``` +/// # use arrow_buffer::Buffer; +/// # let vec: Vec = vec![1, 2, 3]; +/// # let buffer = Buffer::from(vec); +/// // convert the buffer back into a Vec of u32 +/// // note this will fail if the buffer is shared or not aligned correctly +/// let vec: Vec = buffer.into_vec().unwrap(); +/// ``` +/// +/// # Example: Create a `Buffer` from a [`bytes::Bytes`] (without copying) +/// +/// [`bytes::Bytes`] is a common type in the Rust ecosystem for shared memory +/// regions. You can create a buffer from a `Bytes` instance using the `From` +/// implementation, also without copying. +/// +/// ``` +/// # use arrow_buffer::Buffer; +/// let bytes = bytes::Bytes::from("hello"); +/// let buffer = Buffer::from(bytes); +///``` #[derive(Clone, Debug)] pub struct Buffer { /// the internal byte buffer. @@ -59,15 +94,12 @@ unsafe impl Send for Buffer where Bytes: Send {} unsafe impl Sync for Buffer where Bytes: Sync {} impl Buffer { - /// Auxiliary method to create a new Buffer + /// Create a new Buffer from a (internal) [`Bytes`]. /// - /// This can be used with a [`bytes::Bytes`] via `into()`: + /// NOTE despite the same name, [`Bytes`] is an internal struct in arrow-rs + /// and is different than [`bytes::Bytes`]. /// - /// ``` - /// # use arrow_buffer::Buffer; - /// let bytes = bytes::Bytes::from_static(b"foo"); - /// let buffer = Buffer::from_bytes(bytes.into()); - /// ``` + /// See examples on [`Buffer`] for ways to create a buffer from a [`bytes::Bytes`]. #[inline] pub fn from_bytes(bytes: Bytes) -> Self { let length = bytes.len(); @@ -107,8 +139,11 @@ impl Buffer { buffer.into() } - /// Creates a buffer from an existing memory region. Ownership of the memory is tracked via reference counting - /// and the memory will be freed using the `drop` method of [crate::alloc::Allocation] when the reference count reaches zero. + /// Creates a buffer from an existing memory region. + /// + /// Ownership of the memory is tracked via reference counting + /// and the memory will be freed using the `drop` method of + /// [crate::alloc::Allocation] when the reference count reaches zero. /// /// # Arguments /// @@ -155,7 +190,7 @@ impl Buffer { self.data.capacity() } - /// Tried to shrink the capacity of the buffer as much as possible, freeing unused memory. + /// Tries to shrink the capacity of the buffer as much as possible, freeing unused memory. /// /// If the buffer is shared, this is a no-op. /// @@ -190,7 +225,7 @@ impl Buffer { } } - /// Returns whether the buffer is empty. + /// Returns true if the buffer is empty. #[inline] pub fn is_empty(&self) -> bool { self.length == 0 @@ -206,7 +241,9 @@ impl Buffer { } /// Returns a new [Buffer] that is a slice of this buffer starting at `offset`. - /// Doing so allows the same memory region to be shared between buffers. + /// + /// This function is `O(1)` and does not copy any data, and allows the + /// same memory region to be shared between buffers. /// /// # Panics /// @@ -240,7 +277,10 @@ impl Buffer { /// Returns a new [Buffer] that is a slice of this buffer starting at `offset`, /// with `length` bytes. - /// Doing so allows the same memory region to be shared between buffers. + /// + /// This function is `O(1)` and does not copy any data and allows the same + /// memory region to be shared between buffers. + /// /// # Panics /// Panics iff `(offset + length)` is larger than the existing length. pub fn slice_with_length(&self, offset: usize, length: usize) -> Self { @@ -328,10 +368,16 @@ impl Buffer { }) } - /// Returns `Vec` for mutating the buffer + /// Converts self into a `Vec`, if possible. + /// + /// This can be used to reuse / mutate the underlying data. /// - /// Returns `Err(self)` if this buffer does not have the same [`Layout`] as - /// the destination Vec or contains a non-zero offset + /// # Errors + /// + /// Returns `Err(self)` if + /// 1. this buffer does not have the same [`Layout`] as the destination Vec + /// 2. contains a non-zero offset + /// 3. The buffer is shared pub fn into_vec(self) -> Result, Self> { let layout = match self.data.deallocation() { Deallocation::Standard(l) => l, @@ -414,7 +460,22 @@ impl From> for Buffer { } } -/// Creating a `Buffer` instance by storing the boolean values into the buffer +/// Convert from internal [`Bytes`], not [`bytes::Bytes`] to `Buffer` +impl From for Buffer { + fn from(bytes: Bytes) -> Self { + Self::from_bytes(bytes) + } +} + +/// Convert from [`bytes::Bytes`], not internal [`Bytes`] to `Buffer` +impl From for Buffer { + fn from(bytes: bytes::Bytes) -> Self { + let bytes: Bytes = bytes.into(); + Self::from(bytes) + } +} + +/// Create a `Buffer` instance by storing the boolean values into the buffer impl FromIterator for Buffer { fn from_iter(iter: I) -> Self where @@ -447,7 +508,9 @@ impl From> for Buffer { impl Buffer { /// Creates a [`Buffer`] from an [`Iterator`] with a trusted (upper) length. + /// /// Prefer this to `collect` whenever possible, as it is ~60% faster. + /// /// # Example /// ``` /// # use arrow_buffer::buffer::Buffer; diff --git a/parquet/src/arrow/array_reader/byte_view_array.rs b/parquet/src/arrow/array_reader/byte_view_array.rs index 5845e2c08cec..8d848982e656 100644 --- a/parquet/src/arrow/array_reader/byte_view_array.rs +++ b/parquet/src/arrow/array_reader/byte_view_array.rs @@ -316,9 +316,8 @@ impl ByteViewArrayDecoderPlain { } pub fn read(&mut self, output: &mut ViewBuffer, len: usize) -> Result { - // Here we convert `bytes::Bytes` into `arrow_buffer::Bytes`, which is zero copy - // Then we convert `arrow_buffer::Bytes` into `arrow_buffer:Buffer`, which is also zero copy - let buf = arrow_buffer::Buffer::from_bytes(self.buf.clone().into()); + // Zero copy convert `bytes::Bytes` into `arrow_buffer:Buffer` + let buf = arrow_buffer::Buffer::from(self.buf.clone()); let block_id = output.append_block(buf); let to_read = len.min(self.max_remaining_values); @@ -549,9 +548,8 @@ impl ByteViewArrayDecoderDeltaLength { let src_lengths = &self.lengths[self.length_offset..self.length_offset + to_read]; - // Here we convert `bytes::Bytes` into `arrow_buffer::Bytes`, which is zero copy - // Then we convert `arrow_buffer::Bytes` into `arrow_buffer:Buffer`, which is also zero copy - let bytes = arrow_buffer::Buffer::from_bytes(self.data.clone().into()); + // Zero copy convert `bytes::Bytes` into `arrow_buffer:Buffer` + let bytes = Buffer::from(self.data.clone()); let block_id = output.append_block(bytes); let mut current_offset = self.data_offset; From 247ce3fd1abaa1be9f191b764e23cf8fded88733 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sat, 4 Jan 2025 09:40:11 -0500 Subject: [PATCH 3/5] avoid linking to private docs --- arrow-buffer/src/buffer/immutable.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arrow-buffer/src/buffer/immutable.rs b/arrow-buffer/src/buffer/immutable.rs index 53f6aa280e8b..b22e0b80d104 100644 --- a/arrow-buffer/src/buffer/immutable.rs +++ b/arrow-buffer/src/buffer/immutable.rs @@ -94,9 +94,9 @@ unsafe impl Send for Buffer where Bytes: Send {} unsafe impl Sync for Buffer where Bytes: Sync {} impl Buffer { - /// Create a new Buffer from a (internal) [`Bytes`]. + /// Create a new Buffer from a (internal) `Bytes` /// - /// NOTE despite the same name, [`Bytes`] is an internal struct in arrow-rs + /// NOTE despite the same name, `Bytes` is an internal struct in arrow-rs /// and is different than [`bytes::Bytes`]. /// /// See examples on [`Buffer`] for ways to create a buffer from a [`bytes::Bytes`]. @@ -460,14 +460,14 @@ impl From> for Buffer { } } -/// Convert from internal [`Bytes`], not [`bytes::Bytes`] to `Buffer` +/// Convert from internal `Bytes`, not [`bytes::Bytes`] to `Buffer` impl From for Buffer { fn from(bytes: Bytes) -> Self { Self::from_bytes(bytes) } } -/// Convert from [`bytes::Bytes`], not internal [`Bytes`] to `Buffer` +/// Convert from [`bytes::Bytes`], not internal `Bytes` to `Buffer` impl From for Buffer { fn from(bytes: bytes::Bytes) -> Self { let bytes: Bytes = bytes.into(); From e8a796a5ce196a6e3425181880c506f335a02827 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sat, 4 Jan 2025 12:19:16 -0500 Subject: [PATCH 4/5] Deprecate `Buffer::from_bytes` --- arrow-buffer/src/buffer/immutable.rs | 19 ++++++++++--------- arrow-buffer/src/buffer/mutable.rs | 2 +- arrow-flight/src/decode.rs | 2 +- arrow-flight/src/sql/client.rs | 2 +- 4 files changed, 13 insertions(+), 12 deletions(-) diff --git a/arrow-buffer/src/buffer/immutable.rs b/arrow-buffer/src/buffer/immutable.rs index b22e0b80d104..5f35559c84eb 100644 --- a/arrow-buffer/src/buffer/immutable.rs +++ b/arrow-buffer/src/buffer/immutable.rs @@ -100,15 +100,9 @@ impl Buffer { /// and is different than [`bytes::Bytes`]. /// /// See examples on [`Buffer`] for ways to create a buffer from a [`bytes::Bytes`]. - #[inline] + #[deprecated(since = "54.1.0", note = "Use Buffer::from instead")] pub fn from_bytes(bytes: Bytes) -> Self { - let length = bytes.len(); - let ptr = bytes.as_ptr(); - Buffer { - data: Arc::new(bytes), - ptr, - length, - } + Self::from(bytes) } /// Returns the offset, in bytes, of `Self::ptr` to `Self::data` @@ -462,8 +456,15 @@ impl From> for Buffer { /// Convert from internal `Bytes`, not [`bytes::Bytes`] to `Buffer` impl From for Buffer { + #[inline] fn from(bytes: Bytes) -> Self { - Self::from_bytes(bytes) + let length = bytes.len(); + let ptr = bytes.as_ptr(); + Self { + data: Arc::new(bytes), + ptr, + length, + } } } diff --git a/arrow-buffer/src/buffer/mutable.rs b/arrow-buffer/src/buffer/mutable.rs index c4315a1d64cd..5ad55e306e2a 100644 --- a/arrow-buffer/src/buffer/mutable.rs +++ b/arrow-buffer/src/buffer/mutable.rs @@ -328,7 +328,7 @@ impl MutableBuffer { pub(super) fn into_buffer(self) -> Buffer { let bytes = unsafe { Bytes::new(self.data, self.len, Deallocation::Standard(self.layout)) }; std::mem::forget(self); - Buffer::from_bytes(bytes) + Buffer::from(bytes) } /// View this buffer as a mutable slice of a specific type. diff --git a/arrow-flight/src/decode.rs b/arrow-flight/src/decode.rs index 7bafc384306b..760fc926fca6 100644 --- a/arrow-flight/src/decode.rs +++ b/arrow-flight/src/decode.rs @@ -295,7 +295,7 @@ impl FlightDataDecoder { )); }; - let buffer = Buffer::from_bytes(data.data_body.into()); + let buffer = Buffer::from(data.data_body); let dictionary_batch = message.header_as_dictionary_batch().ok_or_else(|| { FlightError::protocol( "Could not get dictionary batch from DictionaryBatch message", diff --git a/arrow-flight/src/sql/client.rs b/arrow-flight/src/sql/client.rs index a6e228737b3f..6d3ac3dbe610 100644 --- a/arrow-flight/src/sql/client.rs +++ b/arrow-flight/src/sql/client.rs @@ -721,7 +721,7 @@ pub fn arrow_data_from_flight_data( let dictionaries_by_field = HashMap::new(); let record_batch = read_record_batch( - &Buffer::from_bytes(flight_data.data_body.into()), + &Buffer::from(flight_data.data_body), ipc_record_batch, arrow_schema_ref.clone(), &dictionaries_by_field, From 1d488653d475859ef8b3f3d06b1db381ca5e1880 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sun, 5 Jan 2025 10:09:48 -0500 Subject: [PATCH 5/5] Apply suggestions from code review Co-authored-by: Jeffrey Vo --- arrow-buffer/src/buffer/immutable.rs | 6 +++--- parquet/src/arrow/array_reader/byte_view_array.rs | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/arrow-buffer/src/buffer/immutable.rs b/arrow-buffer/src/buffer/immutable.rs index 5f35559c84eb..fd145ce2306e 100644 --- a/arrow-buffer/src/buffer/immutable.rs +++ b/arrow-buffer/src/buffer/immutable.rs @@ -236,7 +236,7 @@ impl Buffer { /// Returns a new [Buffer] that is a slice of this buffer starting at `offset`. /// - /// This function is `O(1)` and does not copy any data, and allows the + /// This function is `O(1)` and does not copy any data, allowing the /// same memory region to be shared between buffers. /// /// # Panics @@ -272,7 +272,7 @@ impl Buffer { /// Returns a new [Buffer] that is a slice of this buffer starting at `offset`, /// with `length` bytes. /// - /// This function is `O(1)` and does not copy any data and allows the same + /// This function is `O(1)` and does not copy any data, allowing the same /// memory region to be shared between buffers. /// /// # Panics @@ -454,7 +454,7 @@ impl From> for Buffer { } } -/// Convert from internal `Bytes`, not [`bytes::Bytes`] to `Buffer` +/// Convert from internal `Bytes` (not [`bytes::Bytes`]) to `Buffer` impl From for Buffer { #[inline] fn from(bytes: Bytes) -> Self { diff --git a/parquet/src/arrow/array_reader/byte_view_array.rs b/parquet/src/arrow/array_reader/byte_view_array.rs index 8d848982e656..92a8b0592d0d 100644 --- a/parquet/src/arrow/array_reader/byte_view_array.rs +++ b/parquet/src/arrow/array_reader/byte_view_array.rs @@ -316,7 +316,7 @@ impl ByteViewArrayDecoderPlain { } pub fn read(&mut self, output: &mut ViewBuffer, len: usize) -> Result { - // Zero copy convert `bytes::Bytes` into `arrow_buffer:Buffer` + // Zero copy convert `bytes::Bytes` into `arrow_buffer::Buffer` let buf = arrow_buffer::Buffer::from(self.buf.clone()); let block_id = output.append_block(buf); @@ -548,7 +548,7 @@ impl ByteViewArrayDecoderDeltaLength { let src_lengths = &self.lengths[self.length_offset..self.length_offset + to_read]; - // Zero copy convert `bytes::Bytes` into `arrow_buffer:Buffer` + // Zero copy convert `bytes::Bytes` into `arrow_buffer::Buffer` let bytes = Buffer::from(self.data.clone()); let block_id = output.append_block(bytes);