fix: copy arrays when placing them in the v2 writer's accumulation queue #2249

Merged: 4 commits, Apr 25, 2024
16 changes: 14 additions & 2 deletions python/python/lance/file.py
@@ -124,7 +124,14 @@ class LanceFileWriter:
Lance datasets then you should use the LanceDataset class instead.
"""

def __init__(self, path: str, schema: pa.Schema, **kwargs):
def __init__(
self,
path: str,
schema: pa.Schema,
*,
data_cache_bytes: int = None,
**kwargs,
):
"""
Create a new LanceFileWriter to write to the given path

@@ -135,8 +142,13 @@ def __init__(self, path: str, schema: pa.Schema, **kwargs):
or a URI for remote storage.
schema: pa.Schema
The schema of data that will be written
data_cache_bytes: int
How many bytes (per column) to cache before writing a page. The
default is an appropriate value based on the filesystem.
"""
self._writer = _LanceFileWriter(path, schema, **kwargs)
self._writer = _LanceFileWriter(
path, schema, data_cache_bytes=data_cache_bytes, **kwargs
)
self.closed = False

def write_batch(self, batch: Union[pa.RecordBatch, pa.Table]) -> None:
8 changes: 7 additions & 1 deletion python/python/lance/lance/__init__.pyi
@@ -34,7 +34,13 @@ class CompactionMetrics:
files_added: int

class LanceFileWriter:
def __init__(self, path: str, schema: pa.Schema): ...
def __init__(
self,
path: str,
schema: pa.Schema,
data_cache_bytes: int,
keep_original_array: bool,
): ...
def write_batch(self, batch: pa.RecordBatch) -> None: ...
def finish(self) -> int: ...

26 changes: 22 additions & 4 deletions python/src/file.rs
@@ -168,15 +168,23 @@ pub struct LanceFileWriter {
}

impl LanceFileWriter {
async fn open(uri_or_path: String, schema: PyArrowType<ArrowSchema>) -> PyResult<Self> {
async fn open(
uri_or_path: String,
schema: PyArrowType<ArrowSchema>,
data_cache_bytes: Option<u64>,
keep_original_array: Option<bool>,
) -> PyResult<Self> {
let (object_store, path) = object_store_from_uri_or_path(uri_or_path).await?;
let object_writer = object_store.create(&path).await.infer_error()?;
let lance_schema = lance_core::datatypes::Schema::try_from(&schema.0).infer_error()?;
let inner = FileWriter::try_new(
object_writer,
path.to_string(),
lance_schema,
FileWriterOptions::default(),
FileWriterOptions {
data_cache_bytes,
keep_original_array,
},
)
.infer_error()?;
Ok(Self {
@@ -188,8 +196,18 @@ impl LanceFileWriter {
#[pymethods]
impl LanceFileWriter {
#[new]
pub fn new(path: String, schema: PyArrowType<ArrowSchema>) -> PyResult<Self> {
RT.runtime.block_on(Self::open(path, schema))
pub fn new(
path: String,
schema: PyArrowType<ArrowSchema>,
data_cache_bytes: Option<u64>,
keep_original_array: Option<bool>,
) -> PyResult<Self> {
RT.runtime.block_on(Self::open(
path,
schema,
data_cache_bytes,
keep_original_array,
))
}

pub fn write_batch(&mut self, batch: PyArrowType<RecordBatch>) -> PyResult<()> {
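For orientation, here is a minimal sketch (not part of this PR) of the options struct that the bindings above populate and forward into FileWriter::try_new. The two field names and their Option types are taken from the diff; the module path and the example values are assumptions.

// Sketch only; `lance_file::v2::writer` is an assumed module path.
use lance_file::v2::writer::FileWriterOptions;

fn example_options() -> FileWriterOptions {
    FileWriterOptions {
        // Cache roughly 8 MiB of unencoded data per column before flushing a page;
        // None falls back to a filesystem-appropriate default (see the Python docstring above).
        data_cache_bytes: Some(8 * 1024 * 1024),
        // false asks the writer to deep-copy incoming arrays into its accumulation
        // queue rather than keep the caller's (possibly FFI-backed) buffers alive.
        keep_original_array: Some(false),
    }
}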
60 changes: 60 additions & 0 deletions rust/lance-arrow/src/deepcopy.rs
@@ -0,0 +1,60 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The Lance Authors

use std::sync::Arc;

use arrow_array::{make_array, Array, RecordBatch};
use arrow_buffer::{Buffer, NullBuffer};
use arrow_data::ArrayData;

pub fn deep_copy_buffer(buffer: &Buffer) -> Buffer {
Buffer::from(Vec::from(buffer.as_slice()))
Review comment (Contributor): Do we need to copy data here? Can we just do buffer.data.clone()?
https://docs.rs/arrow-buffer/51.0.0/src/arrow_buffer/buffer/immutable.rs.html#34

Reply (Contributor): We cannot. When the buffer is imported over FFI, the refcount of the buffer is tied to the entire batch, so it can't be freed until all other columns in the same batch are also ready to be freed. By copying the buffer into Rust, we gain control over the lifetime and can free it earlier. As Weston said in the PR description, we could also solve this by importing each column individually over FFI. (A usage sketch of the resulting deep-copy helpers follows this file's diff.)

}

fn deep_copy_nulls(nulls: &NullBuffer) -> Buffer {
deep_copy_buffer(nulls.inner().inner())
}

pub fn deep_copy_array_data(data: &ArrayData) -> ArrayData {
let data_type = data.data_type().clone();
let len = data.len();
let null_count = data.null_count();
let null_bit_buffer = data.nulls().map(deep_copy_nulls);
let offset = data.offset();
let buffers = data
.buffers()
.iter()
.map(deep_copy_buffer)
.collect::<Vec<_>>();
let child_data = data
.child_data()
.iter()
.map(deep_copy_array_data)
.collect::<Vec<_>>();
unsafe {
ArrayData::new_unchecked(
data_type,
len,
Some(null_count),
null_bit_buffer,
offset,
buffers,
child_data,
)
}
}

pub fn deep_copy_array(array: &dyn Array) -> Arc<dyn Array> {
let data = array.to_data();
let data = deep_copy_array_data(&data);
make_array(data)
}

pub fn deep_copy_batch(batch: &RecordBatch) -> crate::Result<RecordBatch> {
let arrays = batch
.columns()
.iter()
.map(|array| deep_copy_array(array))
.collect::<Vec<_>>();
RecordBatch::try_new(batch.schema().clone(), arrays)
}
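
To make the lifetime argument from the review thread above concrete, here is a small, hypothetical usage sketch of the new helpers (not part of the PR); it assumes lance-arrow is available as a dependency:

use std::sync::Arc;

use arrow_array::{Array, ArrayRef, Int32Array};
use lance_arrow::deepcopy::deep_copy_array;

fn main() {
    // Imagine `original` arrived over FFI as one column of a larger batch: its
    // buffers cannot be freed until the whole batch is released.
    let original: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3]));

    // The deep copy owns freshly allocated Rust buffers, so its lifetime is
    // independent of the original allocation.
    let copied = deep_copy_array(original.as_ref());
    assert_eq!(original.to_data(), copied.to_data());

    // Dropping the original (releasing the imported allocation) leaves the copy valid.
    drop(original);
    assert_eq!(copied.len(), 3);
}

This mirrors what PrimitiveFieldEncoder::maybe_encode does further down when keep_original_array is false.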
1 change: 1 addition & 0 deletions rust/lance-arrow/src/lib.rs
@@ -17,6 +17,7 @@ use arrow_schema::{ArrowError, DataType, Field, FieldRef, Fields, IntervalUnit,
use arrow_select::take::take;
use rand::prelude::*;

pub mod deepcopy;
pub mod schema;
pub use schema::*;
pub mod bfloat16;
19 changes: 17 additions & 2 deletions rust/lance-encoding/src/encoder.rs
@@ -165,6 +165,7 @@ impl BatchEncoder {
pub(crate) fn get_encoder_for_field(
field: &Field,
cache_bytes_per_column: u64,
keep_original_array: bool,
col_idx: &mut u32,
field_col_mapping: &mut Vec<(i32, i32)>,
) -> Result<Box<dyn FieldEncoder>> {
@@ -198,6 +199,7 @@ impl BatchEncoder {
field_col_mapping.push((field.id, my_col_idx as i32));
Ok(Box::new(PrimitiveFieldEncoder::try_new(
cache_bytes_per_column,
keep_original_array,
&field.data_type(),
my_col_idx,
)?))
@@ -209,12 +211,14 @@ impl BatchEncoder {
let inner_encoding = Self::get_encoder_for_field(
&field.children[0],
cache_bytes_per_column,
keep_original_array,
col_idx,
field_col_mapping,
)?;
Ok(Box::new(ListFieldEncoder::new(
inner_encoding,
cache_bytes_per_column,
keep_original_array,
my_col_idx,
)))
}
@@ -229,6 +233,7 @@ impl BatchEncoder {
Self::get_encoder_for_field(
field,
cache_bytes_per_column,
keep_original_array,
col_idx,
field_col_mapping,
)
@@ -245,14 +250,19 @@ impl BatchEncoder {
*col_idx += 2;
Ok(Box::new(BinaryFieldEncoder::new(
cache_bytes_per_column,
keep_original_array,
my_col_idx,
)))
}
_ => todo!("Implement encoding for data type {}", field.data_type()),
}
}

pub fn try_new(schema: &Schema, cache_bytes_per_column: u64) -> Result<Self> {
pub fn try_new(
schema: &Schema,
cache_bytes_per_column: u64,
keep_original_array: bool,
) -> Result<Self> {
let mut col_idx = 0;
let mut field_col_mapping = Vec::new();
let field_encoders = schema
@@ -262,6 +272,7 @@ impl BatchEncoder {
Self::get_encoder_for_field(
field,
cache_bytes_per_column,
keep_original_array,
&mut col_idx,
&mut field_col_mapping,
)
@@ -301,7 +312,11 @@ pub async fn encode_batch(
) -> Result<EncodedBatch> {
let mut data_buffer = BytesMut::new();
let lance_schema = Schema::try_from(batch.schema().as_ref())?;
let batch_encoder = BatchEncoder::try_new(&lance_schema, cache_bytes_per_column)?;
// At this point, this is just a test utility, and there is no point in copying allocations
// This could become configurable in the future if needed.
let keep_original_array = true;
let batch_encoder =
BatchEncoder::try_new(&lance_schema, cache_bytes_per_column, keep_original_array)?;
let mut page_table = Vec::new();
for (arr, mut encoder) in batch.columns().iter().zip(batch_encoder.field_encoders) {
let mut tasks = encoder.maybe_encode(arr.clone())?;
4 changes: 3 additions & 1 deletion rust/lance-encoding/src/encodings/logical/binary.rs
@@ -166,10 +166,11 @@ pub struct BinaryFieldEncoder {
}

impl BinaryFieldEncoder {
pub fn new(cache_bytes_per_column: u64, column_index: u32) -> Self {
pub fn new(cache_bytes_per_column: u64, keep_original_array: bool, column_index: u32) -> Self {
let items_encoder = Box::new(
PrimitiveFieldEncoder::try_new(
cache_bytes_per_column,
keep_original_array,
&DataType::UInt8,
column_index + 1,
)
@@ -179,6 +180,7 @@ impl BinaryFieldEncoder {
varbin_encoder: Box::new(ListFieldEncoder::new(
items_encoder,
cache_bytes_per_column,
keep_original_array,
column_index,
)),
}
4 changes: 2 additions & 2 deletions rust/lance-encoding/src/encodings/logical/list.rs
@@ -447,7 +447,6 @@ impl ArrayEncoder for ListOffsetsEncoder {
// Nothing to patch, don't incur a copy
return self.inner.encode(arrays, buffer_index);
}
println!("Stitching offsets {:?}", arrays);
let num_offsets =
arrays.iter().map(|array| array.len()).sum::<usize>() - (arrays.len() - 1);
let mut offsets = Vec::with_capacity(num_offsets);
@@ -472,7 +471,6 @@
.map(|&v| v + last_prev_offset - first_curr_offset),
);
}
println!("Stitched offsets {:?}", offsets);
self.inner
.encode(&[Arc::new(Int32Array::from(offsets))], buffer_index)
}
@@ -487,6 +485,7 @@ impl ListFieldEncoder {
pub fn new(
items_encoder: Box<dyn FieldEncoder>,
cache_bytes_per_columns: u64,
keep_original_array: bool,
column_index: u32,
) -> Self {
let inner_encoder =
@@ -497,6 +496,7 @@
Self {
offsets_encoder: PrimitiveFieldEncoder::new_with_encoder(
cache_bytes_per_columns,
keep_original_array,
column_index,
offsets_encoder,
),
20 changes: 18 additions & 2 deletions rust/lance-encoding/src/encodings/logical/primitive.rs
@@ -20,6 +20,7 @@ use arrow_buffer::{BooleanBuffer, Buffer, NullBuffer, ScalarBuffer};
use arrow_schema::{DataType, IntervalUnit, TimeUnit};
use bytes::BytesMut;
use futures::{future::BoxFuture, FutureExt};
use lance_arrow::deepcopy::deep_copy_array;
use log::{debug, trace};
use snafu::{location, Location};

@@ -430,6 +431,7 @@ impl LogicalPageDecoder for PrimitiveFieldDecoder {

pub struct PrimitiveFieldEncoder {
cache_bytes: u64,
keep_original_array: bool,
buffered_arrays: Vec<ArrayRef>,
current_bytes: u64,
encoder: Arc<dyn ArrayEncoder>,
@@ -451,9 +453,15 @@ impl PrimitiveFieldEncoder {
}
}

pub fn try_new(cache_bytes: u64, data_type: &DataType, column_index: u32) -> Result<Self> {
pub fn try_new(
cache_bytes: u64,
keep_original_array: bool,
data_type: &DataType,
column_index: u32,
) -> Result<Self> {
Ok(Self {
cache_bytes,
keep_original_array,
column_index,
buffered_arrays: Vec::new(),
current_bytes: 0,
@@ -463,11 +471,13 @@

pub fn new_with_encoder(
cache_bytes: u64,
keep_original_array: bool,
column_index: u32,
encoder: Arc<dyn ArrayEncoder>,
) -> Self {
Self {
cache_bytes,
keep_original_array,
column_index,
buffered_arrays: Vec::new(),
current_bytes: 0,
@@ -502,14 +512,20 @@ impl FieldEncoder for PrimitiveFieldEncoder {
// Buffers data, if there is enough to write a page then we create an encode task
fn maybe_encode(&mut self, array: ArrayRef) -> Result<Vec<EncodeTask>> {
self.current_bytes += array.get_array_memory_size() as u64;
self.buffered_arrays.push(array);
if self.current_bytes > self.cache_bytes {
// Push into buffered_arrays without copy since we are about to flush anyways
self.buffered_arrays.push(array);
Review comment (Contributor) on lines +516 to +517: Nice 👍

debug!(
"Flushing column {} page of size {} bytes (unencoded)",
self.column_index, self.current_bytes
);
Ok(vec![self.do_flush()])
} else {
if self.keep_original_array {
self.buffered_arrays.push(array);
} else {
self.buffered_arrays.push(deep_copy_array(array.as_ref()))
}
trace!(
"Accumulating data for column {}. Now at {} bytes",
self.column_index,
2 changes: 2 additions & 0 deletions rust/lance-encoding/src/testing.rs
@@ -96,6 +96,7 @@ pub async fn check_round_trip_encoding_random(field: Field) {
BatchEncoder::get_encoder_for_field(
&lance_field,
page_size,
/*keep_original_array=*/ true,
&mut col_idx,
&mut field_id_to_col_index,
)
@@ -149,6 +150,7 @@ pub async fn check_round_trip_encoding_of_data(data: Vec<Arc<dyn Array>>, test_c
let encoder = BatchEncoder::get_encoder_for_field(
&lance_field,
page_size,
/*keep_original=*/ true,
&mut col_idx,
&mut field_id_to_col_index,
)
1 change: 1 addition & 0 deletions rust/lance-file/Cargo.toml
@@ -19,6 +19,7 @@ lance-io.workspace = true
arrow-arith.workspace = true
arrow-array.workspace = true
arrow-buffer.workspace = true
arrow-data.workspace = true
arrow-schema.workspace = true
arrow-select.workspace = true
async-recursion.workspace = true