diff --git a/CHANGELOG.md b/CHANGELOG.md index 00c66b4ad132..30cd0ec5200a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,6 +19,79 @@ # Changelog +## [10.0.0](https://github.com/apache/arrow-rs/tree/10.0.0) (2022-03-04) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/9.1.0...10.0.0) + +**Breaking changes:** + +- Remove existing has\_ methods for optional fields in `ColumnChunkMetaData` [\#1346](https://github.com/apache/arrow-rs/pull/1346) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([shanisolomon](https://github.com/shanisolomon)) +- Remove redundant `has_` methods in `ColumnChunkMetaData` [\#1345](https://github.com/apache/arrow-rs/pull/1345) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([shanisolomon](https://github.com/shanisolomon)) + +**Implemented enhancements:** + +- Add extract month and day in temporal.rs [\#1387](https://github.com/apache/arrow-rs/issues/1387) +- Add clone to `IpcWriteOptions` [\#1381](https://github.com/apache/arrow-rs/issues/1381) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support `MapArray` in `filter` kernel [\#1378](https://github.com/apache/arrow-rs/issues/1378) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add `week` temporal kernel [\#1375](https://github.com/apache/arrow-rs/issues/1375) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Improve performance of `compare_dict_op` [\#1371](https://github.com/apache/arrow-rs/issues/1371) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add support for LargeUtf8 in json writer [\#1357](https://github.com/apache/arrow-rs/issues/1357) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Make `arrow::array::builder::MapBuilder` public [\#1354](https://github.com/apache/arrow-rs/issues/1354) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Refactor `StructArray::from` [\#1351](https://github.com/apache/arrow-rs/issues/1351) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Refactor `RecordBatch::validate_new_batch` [\#1350](https://github.com/apache/arrow-rs/issues/1350) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Remove redundant has\_ methods for optional column metadata fields [\#1344](https://github.com/apache/arrow-rs/issues/1344) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Add `write` method to JsonWriter [\#1340](https://github.com/apache/arrow-rs/issues/1340) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Refactor the code of `Bitmap::new` [\#1337](https://github.com/apache/arrow-rs/issues/1337) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Use DictionaryArray's iterator in `compare_dict_op` [\#1329](https://github.com/apache/arrow-rs/issues/1329) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add `as_decimal_array(arr: &dyn Array) -> &DecimalArray` [\#1312](https://github.com/apache/arrow-rs/issues/1312) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- More ergonomic / idiomatic primitive array creation from iterators [\#1298](https://github.com/apache/arrow-rs/issues/1298) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Implement DictionaryArray support in `eq_dyn`, `neq_dyn`, `lt_dyn`, `lt_eq_dyn`, `gt_dyn`, `gt_eq_dyn` [\#1201](https://github.com/apache/arrow-rs/issues/1201) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] + +**Fixed bugs:** + +- `cargo clippy` fails on the `master` branch [\#1362](https://github.com/apache/arrow-rs/issues/1362) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- `ArrowArray::try_from_raw` should not assume the pointers are from Arc [\#1333](https://github.com/apache/arrow-rs/issues/1333) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Fix CSV Writer::new to accept delimiter and make WriterBuilder::build use it [\#1328](https://github.com/apache/arrow-rs/issues/1328) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Make bounds configurable via builder when reading CSV [\#1327](https://github.com/apache/arrow-rs/issues/1327) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add `with_datetime_format()` to CSV WriterBuilder [\#1272](https://github.com/apache/arrow-rs/issues/1272) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] + +**Performance improvements:** + +- Improve performance of `min` and `max` aggregation kernels without nulls [\#1373](https://github.com/apache/arrow-rs/issues/1373) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] + +**Closed issues:** + +- Consider removing redundant has\_XXX metadata functions in `ColumnChunkMetadata` [\#1332](https://github.com/apache/arrow-rs/issues/1332) + +**Merged pull requests:** + +- Support extract `day` and `month` in temporal.rs [\#1388](https://github.com/apache/arrow-rs/pull/1388) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Ted-Jiang](https://github.com/Ted-Jiang)) +- Add write method to Json Writer [\#1383](https://github.com/apache/arrow-rs/pull/1383) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([matthewmturner](https://github.com/matthewmturner)) +- Derive `Clone` for `IpcWriteOptions` [\#1382](https://github.com/apache/arrow-rs/pull/1382) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([matthewmturner](https://github.com/matthewmturner)) +- feat: support maps in MutableArrayData [\#1379](https://github.com/apache/arrow-rs/pull/1379) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([helgikrs](https://github.com/helgikrs)) +- Support extract `week` in temporal.rs [\#1376](https://github.com/apache/arrow-rs/pull/1376) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Ted-Jiang](https://github.com/Ted-Jiang)) +- Speed up the function `min_max_string` [\#1374](https://github.com/apache/arrow-rs/pull/1374) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] 
([HaoYang670](https://github.com/HaoYang670)) +- Improve performance of dictionary kernels, add benchmark and add `take_iter_unchecked` [\#1372](https://github.com/apache/arrow-rs/pull/1372) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Update pyo3 requirement from 0.15 to 0.16 [\#1369](https://github.com/apache/arrow-rs/pull/1369) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Update contributing guide [\#1368](https://github.com/apache/arrow-rs/pull/1368) ([HaoYang670](https://github.com/HaoYang670)) +- Allow primitive array creation from iterators of PrimitiveTypes \(as well as `Option`\) [\#1367](https://github.com/apache/arrow-rs/pull/1367) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Update flatbuffers requirement from =2.1.0 to =2.1.1 [\#1364](https://github.com/apache/arrow-rs/pull/1364) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Fix clippy lints [\#1363](https://github.com/apache/arrow-rs/pull/1363) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) +- Refactor `RecordBatch::validate_new_batch` [\#1361](https://github.com/apache/arrow-rs/pull/1361) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) +- Refactor `StructArray::from` [\#1360](https://github.com/apache/arrow-rs/pull/1360) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) +- Update flatbuffers requirement from =2.0.0 to =2.1.0 [\#1359](https://github.com/apache/arrow-rs/pull/1359) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- fix: add LargeUtf8 support 
in json writer [\#1358](https://github.com/apache/arrow-rs/pull/1358) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tiphaineruy](https://github.com/tiphaineruy)) +- Add `as_decimal_array` function [\#1356](https://github.com/apache/arrow-rs/pull/1356) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([liukun4515](https://github.com/liukun4515)) +- Publicly export arrow::array::MapBuilder [\#1355](https://github.com/apache/arrow-rs/pull/1355) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tjwilson90](https://github.com/tjwilson90)) +- Add with\_datetime\_format to csv WriterBuilder [\#1347](https://github.com/apache/arrow-rs/pull/1347) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([gsserge](https://github.com/gsserge)) +- Refactor `Bitmap::new` [\#1343](https://github.com/apache/arrow-rs/pull/1343) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) +- Remove delimiter from csv Writer [\#1342](https://github.com/apache/arrow-rs/pull/1342) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([gsserge](https://github.com/gsserge)) +- Make bounds configurable in csv ReaderBuilder [\#1341](https://github.com/apache/arrow-rs/pull/1341) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([gsserge](https://github.com/gsserge)) +- `ArrowArray::try_from_raw` should not assume the pointers are from Arc [\#1334](https://github.com/apache/arrow-rs/pull/1334) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Use DictionaryArray's iterator in `compare_dict_op` [\#1330](https://github.com/apache/arrow-rs/pull/1330) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Implement DictionaryArray support in neq\_dyn, lt\_dyn, lt\_eq\_dyn, gt\_dyn, gt\_eq\_dyn [\#1326](https://github.com/apache/arrow-rs/pull/1326) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] 
([viirya](https://github.com/viirya)) +- Arrow Rust + Conbench Integration [\#1289](https://github.com/apache/arrow-rs/pull/1289) ([dianaclarke](https://github.com/dianaclarke)) + ## [9.1.0](https://github.com/apache/arrow-rs/tree/9.1.0) (2022-02-19) [Full Changelog](https://github.com/apache/arrow-rs/compare/9.0.2...9.1.0) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 1ac9a7b30587..4e4c53e5e2bd 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -17,6 +17,16 @@ under the License. --> +## Introduction + +We welcome and encourage contributions of all kinds, such as: + +1. Tickets with issue reports or feature requests +2. Documentation improvements +3. Code (PR or PR Review) + +In addition to submitting new PRs, we have a healthy tradition of community members helping review each other's PRs. Doing so is a great way to help the community as well as get more familiar with Rust and the relevant codebases. + ## Developer's guide to Arrow Rust ### Setting Up Your Build Environment diff --git a/README.md b/README.md index 54dcbe74ff07..08c79bac35ff 100644 --- a/README.md +++ b/README.md @@ -58,8 +58,11 @@ and bug fixes and this plays a critical role in the release process. For design discussions we generally collaborate on Google documents and file a GitHub issue linking to the document. +There is more information in the [contributing] guide. 
+ [rust]: https://www.rust-lang.org/ [arrow-readme]: arrow/README.md +[contributing]: CONTRIBUTING.md [parquet-readme]: parquet/README.md [flight-readme]: arrow-flight/README.md [datafusion-readme]: https://github.com/apache/arrow-datafusion/blob/master/README.md diff --git a/arrow-flight/Cargo.toml b/arrow-flight/Cargo.toml index 79b67a0fbd7d..edb4090fb8c4 100644 --- a/arrow-flight/Cargo.toml +++ b/arrow-flight/Cargo.toml @@ -18,7 +18,7 @@ [package] name = "arrow-flight" description = "Apache Arrow Flight" -version = "9.1.0" +version = "10.0.0" edition = "2021" rust-version = "1.57" authors = ["Apache Arrow "] @@ -27,7 +27,7 @@ repository = "https://github.com/apache/arrow-rs" license = "Apache-2.0" [dependencies] -arrow = { path = "../arrow", version = "9.1.0" } +arrow = { path = "../arrow", version = "10.0.0" } base64 = "0.13" tonic = "0.6" bytes = "1" diff --git a/arrow-pyarrow-integration-testing/Cargo.toml b/arrow-pyarrow-integration-testing/Cargo.toml index 34297bada7e6..31915c7f2a39 100644 --- a/arrow-pyarrow-integration-testing/Cargo.toml +++ b/arrow-pyarrow-integration-testing/Cargo.toml @@ -18,7 +18,7 @@ [package] name = "arrow-pyarrow-integration-testing" description = "" -version = "9.1.0" +version = "10.0.0" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" authors = ["Apache Arrow "] @@ -32,7 +32,7 @@ name = "arrow_pyarrow_integration_testing" crate-type = ["cdylib"] [dependencies] -arrow = { path = "../arrow", version = "9.1.0", features = ["pyarrow"] } +arrow = { path = "../arrow", version = "10.0.0", features = ["pyarrow"] } pyo3 = { version = "0.16", features = ["extension-module"] } [package.metadata.maturin] diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index d4df7ad87724..bb7b718e7b76 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow" -version = "9.1.0" +version = "10.0.0" description = "Rust implementation of Apache Arrow" homepage = 
"https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" diff --git a/arrow/README.md b/arrow/README.md index 94e98d99aeac..171489901611 100644 --- a/arrow/README.md +++ b/arrow/README.md @@ -31,7 +31,7 @@ This crate is tested with the latest stable version of Rust. We do not currently The arrow crate follows the [SemVer standard](https://doc.rust-lang.org/cargo/reference/semver.html) defined by Cargo and works well within the Rust crate ecosystem. -However, for historical reasons, this crate uses versions with major numbers greater than `0.x` (e.g. `9.1.0`), unlike many other crates in the Rust ecosystem which spend extended time releasing versions `0.x` to signal planned ongoing API changes. Minor arrow releases contain only compatible changes, while major releases may contain breaking API changes. +However, for historical reasons, this crate uses versions with major numbers greater than `0.x` (e.g. `10.0.0`), unlike many other crates in the Rust ecosystem which spend extended time releasing versions `0.x` to signal planned ongoing API changes. Minor arrow releases contain only compatible changes, while major releases may contain breaking API changes. 
## Features diff --git a/arrow/test/dependency/default-features/Cargo.toml b/arrow/test/dependency/default-features/Cargo.toml index 1e037e0ce94d..244a4f199837 100644 --- a/arrow/test/dependency/default-features/Cargo.toml +++ b/arrow/test/dependency/default-features/Cargo.toml @@ -25,6 +25,6 @@ rust-version = "1.57" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -arrow = { path = "../../../../arrow", version = "9.1.0" } +arrow = { path = "../../../../arrow", version = "10.0.0" } [workspace] diff --git a/arrow/test/dependency/no-default-features/Cargo.toml b/arrow/test/dependency/no-default-features/Cargo.toml index b66544f39e08..165475ca3c93 100644 --- a/arrow/test/dependency/no-default-features/Cargo.toml +++ b/arrow/test/dependency/no-default-features/Cargo.toml @@ -25,6 +25,6 @@ rust-version = "1.57" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -arrow = { path = "../../../../arrow", version = "9.1.0", default-features = false } +arrow = { path = "../../../../arrow", version = "10.0.0", default-features = false } [workspace] diff --git a/arrow/test/dependency/simd/Cargo.toml b/arrow/test/dependency/simd/Cargo.toml index 5aa4ca83bdf8..236cc7be61cc 100644 --- a/arrow/test/dependency/simd/Cargo.toml +++ b/arrow/test/dependency/simd/Cargo.toml @@ -25,6 +25,6 @@ rust-version = "1.57" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -arrow = { path = "../../../../arrow", version = "9.1.0", features = ["simd"]} +arrow = { path = "../../../../arrow", version = "10.0.0", features = ["simd"]} [workspace] diff --git a/dev/release/update_change_log.sh b/dev/release/update_change_log.sh index 3856abab3f98..0d2a818994cc 100755 --- a/dev/release/update_change_log.sh +++ b/dev/release/update_change_log.sh @@ -40,5 +40,5 @@ docker run -it --rm -e 
CHANGELOG_GITHUB_TOKEN=$CHANGELOG_GITHUB_TOKEN -v "$(pwd) --cache-log=.githubchangeloggenerator.cache.log \ --http-cache \ --max-issues=300 \ - --since-tag 9.0.2 \ - --future-release 9.1.0 + --since-tag 9.1.0 \ + --future-release 10.0.0 diff --git a/integration-testing/Cargo.toml b/integration-testing/Cargo.toml index 382eb1538c7b..9ae9c841c22b 100644 --- a/integration-testing/Cargo.toml +++ b/integration-testing/Cargo.toml @@ -18,7 +18,7 @@ [package] name = "arrow-integration-testing" description = "Binaries used in the Arrow integration tests" -version = "9.1.0" +version = "10.0.0" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" authors = ["Apache Arrow "] diff --git a/integration-testing/src/flight_client_scenarios/integration_test.rs b/integration-testing/src/flight_client_scenarios/integration_test.rs index c021020ca37b..7b32c571383b 100644 --- a/integration-testing/src/flight_client_scenarios/integration_test.rs +++ b/integration-testing/src/flight_client_scenarios/integration_test.rs @@ -185,7 +185,8 @@ async fn consume_flight_location( let mut location = location; // The other Flight implementations use the `grpc+tcp` scheme, but the Rust http libs // don't recognize this as valid. 
- location.uri = location.uri.replace("grpc+tcp://", "grpc://"); + // more details: https://github.com/apache/arrow-rs/issues/1398 + location.uri = location.uri.replace("grpc+tcp://", "http://"); let mut client = FlightServiceClient::connect(location.uri).await?; let resp = client.do_get(ticket).await?; diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml index 3cad64cf8caa..12eb74d65197 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "parquet" -version = "9.1.0" +version = "10.0.0" license = "Apache-2.0" description = "Apache Parquet implementation in Rust" homepage = "https://github.com/apache/arrow-rs" @@ -41,7 +41,7 @@ zstd = { version = "0.10", optional = true } chrono = { version = "0.4", default-features = false } num = "0.4" num-bigint = "0.4" -arrow = { path = "../arrow", version = "9.1.0", optional = true, default-features = false, features = ["ipc"] } +arrow = { path = "../arrow", version = "10.0.0", optional = true, default-features = false, features = ["ipc"] } base64 = { version = "0.13", optional = true } clap = { version = "3", optional = true, features = ["derive", "env"] } serde_json = { version = "1.0", features = ["preserve_order"], optional = true } @@ -58,7 +58,7 @@ brotli = "3.3" flate2 = "1.0" lz4 = "1.23" serde_json = { version = "1.0", features = ["preserve_order"] } -arrow = { path = "../arrow", version = "9.1.0", default-features = false, features = ["test_utils", "prettyprint"] } +arrow = { path = "../arrow", version = "10.0.0", default-features = false, features = ["test_utils", "prettyprint"] } [features] default = ["arrow", "snap", "brotli", "flate2", "lz4", "zstd", "base64"] diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index c91e83275245..4c10d26fabfd 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -127,6 +127,57 @@ pub struct SerializedFileReader { metadata: ParquetMetaData, } +/// A builder 
for [`ReadOptions`]. +/// For the predicates that are added to the builder, +/// they will be chained using 'AND' to filter the row groups. +pub struct ReadOptionsBuilder { + predicates: Vec bool>>, +} + +impl ReadOptionsBuilder { + /// New builder + pub fn new() -> Self { + ReadOptionsBuilder { predicates: vec![] } + } + + /// Add a predicate on row group metadata to the reading option, + /// Filter only row groups that match the predicate criteria + pub fn with_predicate( + mut self, + predicate: Box bool>, + ) -> Self { + self.predicates.push(predicate); + self + } + + /// Add a range predicate on filtering row groups if their midpoints are within + /// the Closed-Open range `[start..end) {x | start <= x < end}` + pub fn with_range(mut self, start: i64, end: i64) -> Self { + assert!(start < end); + let predicate = move |rg: &RowGroupMetaData, _: usize| { + let mid = get_midpoint_offset(rg); + mid >= start && mid < end + }; + self.predicates.push(Box::new(predicate)); + self + } + + /// Seal the builder and return the read options + pub fn build(self) -> ReadOptions { + ReadOptions { + predicates: self.predicates, + } + } +} + +/// A collection of options for reading a Parquet file. +/// +/// Currently, only predicates on row group metadata are supported. +/// All predicates will be chained using 'AND' to filter the row groups. +pub struct ReadOptions { + predicates: Vec bool>>, +} + impl SerializedFileReader { /// Creates file reader from a Parquet file. /// Returns error if Parquet file does not exist or is corrupt. @@ -138,25 +189,48 @@ impl SerializedFileReader { }) } - /// Filters row group metadata to only those row groups, - /// for which the predicate function returns true - pub fn filter_row_groups( - &mut self, - predicate: &dyn Fn(&RowGroupMetaData, usize) -> bool, - ) { + /// Creates file reader from a Parquet file with read options. + /// Returns error if Parquet file does not exist or is corrupt. 
+ pub fn new_with_options(chunk_reader: R, options: ReadOptions) -> Result { + let metadata = footer::parse_metadata(&chunk_reader)?; + let mut predicates = options.predicates; + let row_groups = metadata.row_groups().to_vec(); let mut filtered_row_groups = Vec::::new(); - for (i, row_group_metadata) in self.metadata.row_groups().iter().enumerate() { - if predicate(row_group_metadata, i) { - filtered_row_groups.push(row_group_metadata.clone()); + for (i, rg_meta) in row_groups.into_iter().enumerate() { + let mut keep = true; + for predicate in &mut predicates { + if !predicate(&rg_meta, i) { + keep = false; + break; + } + } + if keep { + filtered_row_groups.push(rg_meta); } } - self.metadata = ParquetMetaData::new( - self.metadata.file_metadata().clone(), - filtered_row_groups, - ); + + Ok(Self { + chunk_reader: Arc::new(chunk_reader), + metadata: ParquetMetaData::new( + metadata.file_metadata().clone(), + filtered_row_groups, + ), + }) } } +/// Get midpoint offset for a row group +fn get_midpoint_offset(meta: &RowGroupMetaData) -> i64 { + let col = meta.column(0); + let mut offset = col.data_page_offset(); + if let Some(dic_offset) = col.dictionary_page_offset() { + if offset > dic_offset { + offset = dic_offset + } + }; + offset + meta.compressed_size() / 2 +} + impl FileReader for SerializedFileReader { fn metadata(&self) -> &ParquetMetaData { &self.metadata @@ -790,19 +864,96 @@ mod tests { } #[test] - fn test_file_reader_filter_row_groups() -> Result<()> { + fn test_file_reader_with_no_filter() -> Result<()> { + let test_file = get_test_file("alltypes_plain.parquet"); + let origin_reader = SerializedFileReader::new(test_file)?; + // test initial number of row groups + let metadata = origin_reader.metadata(); + assert_eq!(metadata.num_row_groups(), 1); + Ok(()) + } + + #[test] + fn test_file_reader_filter_row_groups_with_predicate() -> Result<()> { let test_file = get_test_file("alltypes_plain.parquet"); - let mut reader = SerializedFileReader::new(test_file)?; 
+ let read_options = ReadOptionsBuilder::new() + .with_predicate(Box::new(|_, _| false)) + .build(); + let reader = SerializedFileReader::new_with_options(test_file, read_options)?; + let metadata = reader.metadata(); + assert_eq!(metadata.num_row_groups(), 0); + Ok(()) + } + #[test] + fn test_file_reader_filter_row_groups_with_range() -> Result<()> { + let test_file = get_test_file("alltypes_plain.parquet"); + let origin_reader = SerializedFileReader::new(test_file)?; // test initial number of row groups + let metadata = origin_reader.metadata(); + assert_eq!(metadata.num_row_groups(), 1); + let mid = get_midpoint_offset(metadata.row_group(0)); + + let test_file = get_test_file("alltypes_plain.parquet"); + let read_options = ReadOptionsBuilder::new().with_range(0, mid + 1).build(); + let reader = SerializedFileReader::new_with_options(test_file, read_options)?; + let metadata = reader.metadata(); + assert_eq!(metadata.num_row_groups(), 1); + + let test_file = get_test_file("alltypes_plain.parquet"); + let read_options = ReadOptionsBuilder::new().with_range(0, mid).build(); + let reader = SerializedFileReader::new_with_options(test_file, read_options)?; + let metadata = reader.metadata(); + assert_eq!(metadata.num_row_groups(), 0); + Ok(()) + } + + #[test] + fn test_file_reader_filter_row_groups_and_range() -> Result<()> { + let test_file = get_test_file("alltypes_plain.parquet"); + let origin_reader = SerializedFileReader::new(test_file)?; + let metadata = origin_reader.metadata(); + let mid = get_midpoint_offset(metadata.row_group(0)); + + // true, true predicate + let test_file = get_test_file("alltypes_plain.parquet"); + let read_options = ReadOptionsBuilder::new() + .with_predicate(Box::new(|_, _| true)) + .with_range(mid, mid + 1) + .build(); + let reader = SerializedFileReader::new_with_options(test_file, read_options)?; let metadata = reader.metadata(); assert_eq!(metadata.num_row_groups(), 1); - // test filtering out all row groups - 
reader.filter_row_groups(&|_, _| false); + // true, false predicate + let test_file = get_test_file("alltypes_plain.parquet"); + let read_options = ReadOptionsBuilder::new() + .with_predicate(Box::new(|_, _| true)) + .with_range(0, mid) + .build(); + let reader = SerializedFileReader::new_with_options(test_file, read_options)?; let metadata = reader.metadata(); assert_eq!(metadata.num_row_groups(), 0); + // false, true predicate + let test_file = get_test_file("alltypes_plain.parquet"); + let read_options = ReadOptionsBuilder::new() + .with_predicate(Box::new(|_, _| false)) + .with_range(mid, mid + 1) + .build(); + let reader = SerializedFileReader::new_with_options(test_file, read_options)?; + let metadata = reader.metadata(); + assert_eq!(metadata.num_row_groups(), 0); + + // false, false predicate + let test_file = get_test_file("alltypes_plain.parquet"); + let read_options = ReadOptionsBuilder::new() + .with_predicate(Box::new(|_, _| false)) + .with_range(0, mid) + .build(); + let reader = SerializedFileReader::new_with_options(test_file, read_options)?; + let metadata = reader.metadata(); + assert_eq!(metadata.num_row_groups(), 0); Ok(()) } } diff --git a/parquet_derive/Cargo.toml b/parquet_derive/Cargo.toml index 3aeaadc36504..60c0926229ae 100644 --- a/parquet_derive/Cargo.toml +++ b/parquet_derive/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "parquet_derive" -version = "9.1.0" +version = "10.0.0" license = "Apache-2.0" description = "Derive macros for the Rust implementation of Apache Parquet" homepage = "https://github.com/apache/arrow-rs" @@ -35,4 +35,4 @@ proc-macro = true proc-macro2 = "1.0" quote = "1.0" syn = { version = "1.0", features = ["full", "extra-traits"] } -parquet = { path = "../parquet", version = "9.1.0" } +parquet = { path = "../parquet", version = "10.0.0" } diff --git a/parquet_derive/README.md b/parquet_derive/README.md index e4c84926e8f2..0f5ac955d231 100644 --- a/parquet_derive/README.md +++ b/parquet_derive/README.md @@ -32,8 +32,8 
@@ Add this to your Cargo.toml: ```toml [dependencies] -parquet = "9.1.0" -parquet_derive = "9.1.0" +parquet = "10.0.0" +parquet_derive = "10.0.0" ``` and this to your crate root: diff --git a/parquet_derive/test/dependency/default-features/Cargo.toml b/parquet_derive/test/dependency/default-features/Cargo.toml index 7552498add87..d9e544a432d0 100644 --- a/parquet_derive/test/dependency/default-features/Cargo.toml +++ b/parquet_derive/test/dependency/default-features/Cargo.toml @@ -25,7 +25,7 @@ rust-version = "1.57" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -parquet_derive = { path = "../../../../parquet_derive", version = "9.1.0" } +parquet_derive = { path = "../../../../parquet_derive", version = "10.0.0" } # Keep this out of the default workspace [workspace] diff --git a/parquet_derive_test/Cargo.toml b/parquet_derive_test/Cargo.toml index c8f21e440349..060ff0125e32 100644 --- a/parquet_derive_test/Cargo.toml +++ b/parquet_derive_test/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "parquet_derive_test" -version = "9.1.0" +version = "10.0.0" license = "Apache-2.0" description = "Integration test package for parquet-derive" homepage = "https://github.com/apache/arrow-rs" @@ -29,6 +29,6 @@ publish = false rust-version = "1.57" [dependencies] -parquet = { path = "../parquet", version = "9.1.0" } -parquet_derive = { path = "../parquet_derive", version = "9.1.0" } +parquet = { path = "../parquet", version = "10.0.0" } +parquet_derive = { path = "../parquet_derive", version = "10.0.0" } chrono = "0.4.19"