-
Notifications
You must be signed in to change notification settings - Fork 818
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add async
arrow parquet reader
#1154
Merged
Merged
Changes from all commits
Commits
Show all changes
7 commits
Select commit
Hold shift + click to select a range
078b37c
Async parquet reader (#111)
tustvold ce083cf
Remove Sync from DataType
tustvold c8069a5
Review feedback
tustvold 819913a
Add basic test
tustvold 92b7cb9
Fix lints
tustvold 38e2225
Review feedback
tustvold cbe6bb4
Tweak CI
tustvold File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -23,7 +23,7 @@ description = "Apache Parquet implementation in Rust" | |
homepage = "https://github.com/apache/arrow-rs" | ||
repository = "https://github.com/apache/arrow-rs" | ||
authors = ["Apache Arrow <[email protected]>"] | ||
keywords = [ "arrow", "parquet", "hadoop" ] | ||
keywords = ["arrow", "parquet", "hadoop"] | ||
readme = "README.md" | ||
build = "build.rs" | ||
edition = "2021" | ||
|
@@ -45,6 +45,8 @@ base64 = { version = "0.13", optional = true } | |
clap = { version = "2.33.3", optional = true } | ||
serde_json = { version = "1.0", features = ["preserve_order"], optional = true } | ||
rand = "0.8" | ||
futures = { version = "0.3", optional = true } | ||
tustvold marked this conversation as resolved.
Show resolved
Hide resolved
|
||
tokio = { version = "1.0", optional = true, default-features = false, features = ["macros", "fs", "rt", "io-util"] } | ||
|
||
[dev-dependencies] | ||
criterion = "0.3" | ||
|
@@ -55,24 +57,26 @@ brotli = "3.3" | |
flate2 = "1.0" | ||
lz4 = "1.23" | ||
serde_json = { version = "1.0", features = ["preserve_order"] } | ||
arrow = { path = "../arrow", version = "8.0.0", default-features = false, features = ["test_utils"] } | ||
arrow = { path = "../arrow", version = "8.0.0", default-features = false, features = ["test_utils", "prettyprint"] } | ||
|
||
[features] | ||
default = ["arrow", "snap", "brotli", "flate2", "lz4", "zstd", "base64"] | ||
cli = ["serde_json", "base64", "clap"] | ||
test_common = [] | ||
# Experimental, unstable functionality primarily used for testing | ||
experimental = [] | ||
# Enable async API | ||
async = ["futures", "tokio"] | ||
|
||
[[ bin ]] | ||
[[bin]] | ||
name = "parquet-read" | ||
required-features = ["cli"] | ||
|
||
[[ bin ]] | ||
[[bin]] | ||
name = "parquet-schema" | ||
required-features = ["cli"] | ||
|
||
[[ bin ]] | ||
[[bin]] | ||
name = "parquet-rowcount" | ||
required-features = ["cli"] | ||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -42,7 +42,7 @@ use arrow::datatypes::{ | |
Float32Type as ArrowFloat32Type, Float64Type as ArrowFloat64Type, | ||
Int16Type as ArrowInt16Type, Int32Type as ArrowInt32Type, | ||
Int64Type as ArrowInt64Type, Int8Type as ArrowInt8Type, IntervalUnit, Schema, | ||
Time32MillisecondType as ArrowTime32MillisecondType, | ||
SchemaRef, Time32MillisecondType as ArrowTime32MillisecondType, | ||
Time32SecondType as ArrowTime32SecondType, | ||
Time64MicrosecondType as ArrowTime64MicrosecondType, | ||
Time64NanosecondType as ArrowTime64NanosecondType, TimeUnit as ArrowTimeUnit, | ||
|
@@ -91,7 +91,7 @@ pub use byte_array::make_byte_array_reader; | |
pub use byte_array_dictionary::make_byte_array_dictionary_reader; | ||
|
||
/// Array reader reads parquet data into arrow array. | ||
pub trait ArrayReader { | ||
pub trait ArrayReader: Send { | ||
fn as_any(&self) -> &dyn Any; | ||
|
||
/// Returns the arrow type of this array reader. | ||
|
@@ -117,6 +117,26 @@ pub trait ArrayReader { | |
fn get_rep_levels(&self) -> Option<&[i16]>; | ||
} | ||
|
||
/// A collection of row groups | ||
pub trait RowGroupCollection { | ||
/// Get schema of parquet file. | ||
fn schema(&self) -> Result<SchemaDescPtr>; | ||
|
||
/// Returns an iterator over the column chunks for particular column | ||
fn column_chunks(&self, i: usize) -> Result<Box<dyn PageIterator>>; | ||
} | ||
|
||
impl RowGroupCollection for Arc<dyn FileReader> { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This does mean we have double dynamic dispatch, given these methods are called a couple of times per-file I'm inclined to consider this largely irrelevant |
||
fn schema(&self) -> Result<SchemaDescPtr> { | ||
Ok(self.metadata().file_metadata().schema_descr_ptr()) | ||
} | ||
|
||
fn column_chunks(&self, column_index: usize) -> Result<Box<dyn PageIterator>> { | ||
let iterator = FilePageIterator::new(column_index, Arc::clone(self))?; | ||
Ok(Box::new(iterator)) | ||
} | ||
} | ||
|
||
/// Uses `record_reader` to read up to `batch_size` records from `pages` | ||
/// | ||
/// Returns the number of records read, which can be less than batch_size if | ||
|
@@ -478,7 +498,7 @@ where | |
impl<T, C> ArrayReader for ComplexObjectArrayReader<T, C> | ||
where | ||
T: DataType, | ||
C: Converter<Vec<Option<T::T>>, ArrayRef> + 'static, | ||
C: Converter<Vec<Option<T::T>>, ArrayRef> + Send + 'static, | ||
{ | ||
fn as_any(&self) -> &dyn Any { | ||
self | ||
|
@@ -1311,9 +1331,9 @@ impl ArrayReader for StructArrayReader { | |
/// Create array reader from parquet schema, column indices, and parquet file reader. | ||
pub fn build_array_reader<T>( | ||
parquet_schema: SchemaDescPtr, | ||
arrow_schema: Schema, | ||
arrow_schema: SchemaRef, | ||
column_indices: T, | ||
file_reader: Arc<dyn FileReader>, | ||
row_groups: Box<dyn RowGroupCollection>, | ||
) -> Result<Box<dyn ArrayReader>> | ||
where | ||
T: IntoIterator<Item = usize>, | ||
|
@@ -1351,13 +1371,8 @@ where | |
fields: filtered_root_fields, | ||
}; | ||
|
||
ArrayReaderBuilder::new( | ||
Arc::new(proj), | ||
Arc::new(arrow_schema), | ||
Arc::new(leaves), | ||
file_reader, | ||
) | ||
.build_array_reader() | ||
ArrayReaderBuilder::new(Arc::new(proj), arrow_schema, Arc::new(leaves), row_groups) | ||
.build_array_reader() | ||
} | ||
|
||
/// Used to build array reader. | ||
|
@@ -1367,7 +1382,7 @@ struct ArrayReaderBuilder { | |
// Key: columns that need to be included in final array builder | ||
// Value: column index in schema | ||
columns_included: Arc<HashMap<*const Type, usize>>, | ||
file_reader: Arc<dyn FileReader>, | ||
row_groups: Box<dyn RowGroupCollection>, | ||
} | ||
|
||
/// Used in type visitor. | ||
|
@@ -1667,13 +1682,13 @@ impl<'a> ArrayReaderBuilder { | |
root_schema: TypePtr, | ||
arrow_schema: Arc<Schema>, | ||
columns_included: Arc<HashMap<*const Type, usize>>, | ||
file_reader: Arc<dyn FileReader>, | ||
file_reader: Box<dyn RowGroupCollection>, | ||
) -> Self { | ||
Self { | ||
root_schema, | ||
arrow_schema, | ||
columns_included, | ||
file_reader, | ||
row_groups: file_reader, | ||
} | ||
} | ||
|
||
|
@@ -1707,10 +1722,10 @@ impl<'a> ArrayReaderBuilder { | |
context.rep_level, | ||
context.path.clone(), | ||
)); | ||
let page_iterator = Box::new(FilePageIterator::new( | ||
self.columns_included[&(cur_type.as_ref() as *const Type)], | ||
self.file_reader.clone(), | ||
)?); | ||
|
||
let page_iterator = self | ||
.row_groups | ||
.column_chunks(self.columns_included[&(cur_type.as_ref() as *const Type)])?; | ||
|
||
let arrow_type: Option<ArrowType> = self | ||
.get_arrow_field(&cur_type, context) | ||
|
@@ -2823,7 +2838,8 @@ mod tests { | |
#[test] | ||
fn test_create_array_reader() { | ||
let file = get_test_file("nulls.snappy.parquet"); | ||
let file_reader = Arc::new(SerializedFileReader::new(file).unwrap()); | ||
let file_reader: Arc<dyn FileReader> = | ||
Arc::new(SerializedFileReader::new(file).unwrap()); | ||
|
||
let file_metadata = file_reader.metadata().file_metadata(); | ||
let arrow_schema = parquet_to_arrow_schema( | ||
|
@@ -2834,9 +2850,9 @@ mod tests { | |
|
||
let array_reader = build_array_reader( | ||
file_reader.metadata().file_metadata().schema_descr_ptr(), | ||
arrow_schema, | ||
Arc::new(arrow_schema), | ||
vec![0usize].into_iter(), | ||
file_reader, | ||
Box::new(file_reader), | ||
) | ||
.unwrap(); | ||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Activating pretty_print in parquet appears to have made clippy find a load of new stuff in arrow 😅