apache · tustvold · Feb 10, 2023 · Feb 10, 2023 · Feb 10, 2023 · Feb 10, 2023
diff --git a/arrow-data/Cargo.toml b/arrow-data/Cargo.toml
@@ -42,6 +42,11 @@ bench = false
 # this is not enabled by default as it is too computationally expensive
 # but is run as part of our CI checks
 force_validate = []
+# Enable ffi support
+ffi = ["arrow-schema/ffi"]
+
+[package.metadata.docs.rs]
+features = ["ffi"]
 
 [dependencies]
 

diff --git a/arrow-data/src/ffi.rs b/arrow-data/src/ffi.rs
@@ -0,0 +1,280 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Contains declarations to bind to the [C Data Interface](https://arrow.apache.org/docs/format/CDataInterface.html).
+
+use crate::{layout, ArrayData};
+use arrow_buffer::Buffer;
+use arrow_schema::DataType;
+use std::ffi::c_void;
+
+/// ABI-compatible struct for ArrowArray from C Data Interface
+/// See <https://arrow.apache.org/docs/format/CDataInterface.html#structure-definitions>
+///
+/// ```
+/// # use arrow_data::ArrayData;
+/// # use arrow_data::ffi::FFI_ArrowArray;
+/// fn export_array(array: &ArrayData) -> FFI_ArrowArray {
+///     FFI_ArrowArray::new(array)
+/// }
+/// ```
+#[repr(C)]
+#[derive(Debug)]
+pub struct FFI_ArrowArray {
+    pub(crate) length: i64,
+    pub(crate) null_count: i64,
+    pub(crate) offset: i64,
+    pub(crate) n_buffers: i64,
+    pub(crate) n_children: i64,
+    pub(crate) buffers: *mut *const c_void,
+    pub(crate) children: *mut *mut FFI_ArrowArray,
+    pub(crate) dictionary: *mut FFI_ArrowArray,
+    pub(crate) release: Option<unsafe extern "C" fn(arg1: *mut FFI_ArrowArray)>,
+    // When exported, this MUST contain everything that is owned by this array.
+    // for example, any buffer pointed to in `buffers` must be here, as well
+    // as the `buffers` pointer itself.
+    // In other words, everything in [FFI_ArrowArray] must be owned by
+    // `private_data` and can assume that they do not outlive `private_data`.
+    pub(crate) private_data: *mut c_void,
+}
+
+impl Drop for FFI_ArrowArray {
+    fn drop(&mut self) {
+        match self.release {
+            None => (),
+            Some(release) => unsafe { release(self) },
+        };
+    }
+}
+
+unsafe impl Send for FFI_ArrowArray {}
+unsafe impl Sync for FFI_ArrowArray {}
+
+// callback used to drop [FFI_ArrowArray] when it is exported
+unsafe extern "C" fn release_array(array: *mut FFI_ArrowArray) {
+    if array.is_null() {
+        return;
+    }
+    let array = &mut *array;
+
+    // take ownership of `private_data`, therefore dropping it
+    let private = Box::from_raw(array.private_data as *mut ArrayPrivateData);
+    for child in private.children.iter() {
+        let _ = Box::from_raw(*child);
+    }
+    if !private.dictionary.is_null() {
+        let _ = Box::from_raw(private.dictionary);
+    }
+
+    array.release = None;
+}
+
+struct ArrayPrivateData {
+    #[allow(dead_code)]
+    buffers: Vec<Option<Buffer>>,
+    buffers_ptr: Box<[*const c_void]>,
+    children: Box<[*mut FFI_ArrowArray]>,
+    dictionary: *mut FFI_ArrowArray,
+}
+
+impl FFI_ArrowArray {
+    /// creates a new `FFI_ArrowArray` from existing data.
+    /// # Memory Leaks
+    /// This method stores clones of the array's buffers in `private_data`. They are freed only
+    /// when `release` is called (`Drop` does this); otherwise the contents in `buffers` leak.
+    pub fn new(data: &ArrayData) -> Self {
+        let data_layout = layout(data.data_type());
+
+        let buffers = if data_layout.can_contain_null_mask {
+            // * insert the null buffer at the start
+            // * make all others `Option<Buffer>`.
+            std::iter::once(data.null_buffer().cloned())
+                .chain(data.buffers().iter().map(|b| Some(b.clone())))
+                .collect::<Vec<_>>()
+        } else {
+            data.buffers().iter().map(|b| Some(b.clone())).collect()
+        };
+
+        // `n_buffers` is the number of buffers by the spec.
+        let n_buffers = {
+            data_layout.buffers.len() + {
+                // If the layout has a null buffer by Arrow spec.
+                // Note that even if the array doesn't have a null buffer because it has
+                // no null values, we still need to count 1 here to follow the spec.
+                usize::from(data_layout.can_contain_null_mask)
+            }
+        } as i64;
+
+        let buffers_ptr = buffers
+            .iter()
+            .flat_map(|maybe_buffer| match maybe_buffer {
+                // note that `as_ptr` takes into account the buffer's offset
+                Some(b) => Some(b.as_ptr() as *const c_void),
+                // This is for null buffer. We only put a null pointer for
+                // null buffer if by spec it can contain null mask.
+                None if data_layout.can_contain_null_mask => Some(std::ptr::null()),
+                None => None,
+            })
+            .collect::<Box<[_]>>();
+
+        let empty = vec![];
+        let (child_data, dictionary) = match data.data_type() {
+            DataType::Dictionary(_, _) => (
+                empty.as_slice(),
+                Box::into_raw(Box::new(FFI_ArrowArray::new(&data.child_data()[0]))),
+            ),
+            _ => (data.child_data(), std::ptr::null_mut()),
+        };
+
+        let children = child_data
+            .iter()
+            .map(|child| Box::into_raw(Box::new(FFI_ArrowArray::new(child))))
+            .collect::<Box<_>>();
+        let n_children = children.len() as i64;
+
+        // create the private data owning everything.
+        // any other data must be added here, e.g. via a struct, to track lifetime.
+        let mut private_data = Box::new(ArrayPrivateData {
+            buffers,
+            buffers_ptr,
+            children,
+            dictionary,
+        });
+
+        Self {
+            length: data.len() as i64,
+            null_count: data.null_count() as i64,
+            offset: data.offset() as i64,
+            n_buffers,
+            n_children,
+            buffers: private_data.buffers_ptr.as_mut_ptr(),
+            children: private_data.children.as_mut_ptr(),
+            dictionary,
+            release: Some(release_array),
+            private_data: Box::into_raw(private_data) as *mut c_void,
+        }
+    }
+
+    /// create an empty `FFI_ArrowArray`, which can be used to import data into
+    pub fn empty() -> Self {
+        Self {
+            length: 0,
+            null_count: 0,
+            offset: 0,
+            n_buffers: 0,
+            n_children: 0,
+            buffers: std::ptr::null_mut(),
+            children: std::ptr::null_mut(),
+            dictionary: std::ptr::null_mut(),
+            release: None,
+            private_data: std::ptr::null_mut(),
+        }
+    }
+
+    /// the length of the array
+    #[inline]
+    pub fn len(&self) -> usize {
+        self.length as usize
+    }
+
+    /// whether the array is empty
+    #[inline]
+    pub fn is_empty(&self) -> bool {
+        self.length == 0
+    }
+
+    /// Whether the array has been released
+    #[inline]
+    pub fn is_released(&self) -> bool {
+        self.release.is_none()
+    }
+
+    /// the offset of the array
+    #[inline]
+    pub fn offset(&self) -> usize {
+        self.offset as usize
+    }
+
+    /// the null count of the array
+    #[inline]
+    pub fn null_count(&self) -> usize {
+        self.null_count as usize
+    }
+
+    /// Returns the buffer at the provided index
+    ///
+    /// # Panics
+    /// Panics if `buffers` is null or if `index` is not less than the number of buffers
+    pub fn buffer(&self, index: usize) -> *const u8 {
+        assert!(!self.buffers.is_null());
+        assert!(index < self.num_buffers());
+        // SAFETY:
+        // If `buffers` is not null, it must be valid for reads of up to `num_buffers` pointers
+        unsafe { std::ptr::read_unaligned((self.buffers as *mut *const u8).add(index)) }
+    }
+
+    /// Returns the number of buffers
+    pub fn num_buffers(&self) -> usize {
+        self.n_buffers as _
+    }
+
+    /// Returns the child at the provided index
+    pub fn child(&self, index: usize) -> &FFI_ArrowArray {
+        assert!(!self.children.is_null());
+        assert!(index < self.num_children());
+        // Safety:
+        // If `children` is not null, it must be valid for reads of up to `num_children` pointers
+        unsafe {
+            let child = std::ptr::read_unaligned(self.children.add(index));
+            child.as_ref().unwrap()
+        }
+    }
+
+    /// Returns the number of children
+    pub fn num_children(&self) -> usize {
+        self.n_children as _
+    }
+
+    /// Returns the dictionary if any
+    pub fn dictionary(&self) -> Option<&Self> {
+        // Safety:
+        // If dictionary is not null should be valid for reads of `Self`
+        unsafe { self.dictionary.as_ref() }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    // More tests located in top-level arrow crate
+
+    #[test]
+    fn null_array_n_buffers() {
+        let data = ArrayData::new_null(&DataType::Null, 10);
+
+        let ffi_array = FFI_ArrowArray::new(&data);
+        assert_eq!(0, ffi_array.n_buffers);
+
+        let private_data =
+            unsafe { Box::from_raw(ffi_array.private_data as *mut ArrayPrivateData) };
+
+        assert_eq!(0, private_data.buffers_ptr.len());
+
+        Box::into_raw(private_data);
+    }
+}
diff --git a/arrow-data/src/lib.rs b/arrow-data/src/lib.rs
@@ -28,3 +28,6 @@ pub mod transform;
 pub mod bit_iterator;
 pub mod bit_mask;
 pub mod decimal;
+
+#[cfg(feature = "ffi")]
+pub mod ffi;
diff --git a/arrow-schema/Cargo.toml b/arrow-schema/Cargo.toml
@@ -39,9 +39,14 @@ bench = false
 
 [dependencies]
 serde = { version = "1.0", default-features = false, features = ["derive", "std"], optional = true }
+bitflags = { version = "1.2.1", default-features = false, optional = true }
 
 [features]
-default = []
+# Enable ffi support
+ffi = ["bitflags"]
+
+[package.metadata.docs.rs]
+features = ["ffi"]
 
 [dev-dependencies]
 serde_json = "1.0"