From 8f879f1038b5496a6b390c1422acd8d1d620b5c8 Mon Sep 17 00:00:00 2001 From: BubbleCal Date: Fri, 12 Jul 2024 17:44:21 +0800 Subject: [PATCH 1/2] chore: upgrade arrow to v52.1.0 - this fixes `FixedSizeListArray` overflow bug Signed-off-by: BubbleCal --- Cargo.toml | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 691a6b13a1..2575312439 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -59,17 +59,17 @@ lance-test-macros = { version = "=0.14.2", path = "./rust/lance-test-macros" } lance-testing = { version = "=0.14.2", path = "./rust/lance-testing" } approx = "0.5.1" # Note that this one does not include pyarrow -arrow = { version = "51.0.0", optional = false, features = ["prettyprint"] } -arrow-arith = "51.0" -arrow-array = "51.0" -arrow-buffer = "51.0" -arrow-cast = "51.0" -arrow-data = "51.0" -arrow-ipc = { version = "51.0", features = ["zstd"] } -arrow-ord = "51.0" -arrow-row = "51.0" -arrow-schema = "51.0" -arrow-select = "51.0" +arrow = { version = "52.1.0", optional = false, features = ["prettyprint"] } +arrow-arith = "52.1" +arrow-array = "52.1" +arrow-buffer = "52.1" +arrow-cast = "52.1" +arrow-data = "52.1" +arrow-ipc = { version = "52.1", features = ["zstd"] } +arrow-ord = "52.1" +arrow-row = "52.1" +arrow-schema = "52.1" +arrow-select = "52.1" async-recursion = "1.0" async-trait = "0.1" aws-config = "0.57" @@ -120,7 +120,7 @@ moka = "0.11" num-traits = "0.2" num_cpus = "1.0" object_store = { version = "0.9.0" } -parquet = "51.0" +parquet = "52.1" pin-project = "1.0" path_abs = "0.5" pprof = { version = "0.13", features = ["flamegraph", "criterion"] } From fe2c45990b79821a9e6d87b0b57d2c4048c1ada2 Mon Sep 17 00:00:00 2001 From: BubbleCal Date: Fri, 12 Jul 2024 21:06:31 +0800 Subject: [PATCH 2/2] fix interval array generator Signed-off-by: BubbleCal --- rust/lance-datagen/src/generator.rs | 66 ++++++++++++++++++++++++++--- 1 file changed, 60 insertions(+), 6 deletions(-) diff --git 
a/rust/lance-datagen/src/generator.rs b/rust/lance-datagen/src/generator.rs index 76119202ea..cf4cad374b 100644 --- a/rust/lance-datagen/src/generator.rs +++ b/rust/lance-datagen/src/generator.rs @@ -3,6 +3,7 @@ use std::{iter, marker::PhantomData, sync::Arc}; +use arrow::datatypes::{IntervalDayTime, IntervalMonthDayNano}; use arrow::{ array::{ArrayData, AsArray}, buffer::{BooleanBuffer, Buffer, OffsetBuffer, ScalarBuffer}, @@ -14,7 +15,7 @@ use arrow_array::{ Array, FixedSizeBinaryArray, FixedSizeListArray, ListArray, PrimitiveArray, RecordBatch, RecordBatchOptions, RecordBatchReader, StringArray, StructArray, }; -use arrow_schema::{ArrowError, DataType, Field, Fields, Schema, SchemaRef}; +use arrow_schema::{ArrowError, DataType, Field, Fields, IntervalUnit, Schema, SchemaRef}; use futures::{stream::BoxStream, StreamExt}; use rand::{distributions::Uniform, Rng, RngCore, SeedableRng}; @@ -596,6 +597,59 @@ impl ArrayGenerator for RandomFixedSizeBinaryGenerator { } } +pub struct RandomIntervalGenerator { + unit: IntervalUnit, + data_type: DataType, +} + +impl RandomIntervalGenerator { + pub fn new(unit: IntervalUnit) -> Self { + Self { + unit, + data_type: DataType::Interval(unit), + } + } +} + +impl ArrayGenerator for RandomIntervalGenerator { + fn generate( + &mut self, + length: RowCount, + rng: &mut rand_xoshiro::Xoshiro256PlusPlus, + ) -> Result<Arc<dyn arrow_array::Array>, ArrowError> { + match self.unit { + IntervalUnit::YearMonth => { + let months = (0..length.0).map(|_| rng.gen::<i32>()).collect::<Vec<_>>(); + Ok(Arc::new(arrow_array::IntervalYearMonthArray::from(months))) + } + IntervalUnit::MonthDayNano => { + let day_time_array = (0..length.0) + .map(|_| IntervalMonthDayNano::new(rng.gen(), rng.gen(), rng.gen())) + .collect::<Vec<_>>(); + Ok(Arc::new(arrow_array::IntervalMonthDayNanoArray::from( + day_time_array, + ))) + } + IntervalUnit::DayTime => { + let day_time_array = (0..length.0) + .map(|_| IntervalDayTime::new(rng.gen(), rng.gen())) + .collect::<Vec<_>>(); 
Ok(Arc::new(arrow_array::IntervalDayTimeArray::from( + day_time_array, + ))) + } + } + } + + fn data_type(&self) -> &DataType { + &self.data_type + } + + fn element_size_bytes(&self) -> Option<ByteCount> { + Some(ByteCount::from(12)) + } +} + pub struct RandomBinaryGenerator { bytes_per_element: ByteCount, scale_to_utf8: bool, @@ -1461,6 +1515,10 @@ pub mod array { Box::new(RandomFixedSizeBinaryGenerator::new(size)) } + pub fn rand_interval(unit: IntervalUnit) -> Box<dyn ArrayGenerator> { + Box::new(RandomIntervalGenerator::new(unit)) + } + /// Create a generator of randomly sampled date32 values /// /// Instead of sampling the entire range, all values will be drawn from the last year as this @@ -1663,11 +1721,7 @@ pub mod array { TimeUnit::Microsecond => rand::<DurationMicrosecondType>(), TimeUnit::Nanosecond => rand::<DurationNanosecondType>(), }, - DataType::Interval(unit) => match unit { - IntervalUnit::DayTime => rand::<IntervalDayTimeType>(), - IntervalUnit::MonthDayNano => rand::<IntervalMonthDayNanoType>(), - IntervalUnit::YearMonth => rand::<IntervalYearMonthType>(), - }, + DataType::Interval(unit) => rand_interval(*unit), DataType::Date32 => rand_date32(), DataType::Date64 => rand_date64(), DataType::Time32(resolution) => rand_time32(resolution),