This repository has been archived by the owner on Feb 18, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 224
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Added async read of avro files. (#620)
- Loading branch information
1 parent
a8c55f9
commit bfa2d43
Showing
16 changed files
with
254 additions
and
70 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
use std::sync::Arc; | ||
|
||
use futures::pin_mut; | ||
use futures::StreamExt; | ||
use tokio::fs::File; | ||
use tokio_util::compat::*; | ||
|
||
use arrow2::error::Result; | ||
use arrow2::io::avro::read::{decompress_block, deserialize}; | ||
use arrow2::io::avro::read_async::*; | ||
|
||
#[tokio::main(flavor = "current_thread")] | ||
async fn main() -> Result<()> { | ||
use std::env; | ||
let args: Vec<String> = env::args().collect(); | ||
|
||
let file_path = &args[1]; | ||
|
||
let mut reader = File::open(file_path).await?.compat(); | ||
|
||
let (avro_schemas, schema, compression, marker) = read_metadata(&mut reader).await?; | ||
let schema = Arc::new(schema); | ||
let avro_schemas = Arc::new(avro_schemas); | ||
|
||
let blocks = block_stream(&mut reader, marker).await; | ||
|
||
pin_mut!(blocks); | ||
while let Some((mut block, rows)) = blocks.next().await.transpose()? { | ||
// the content here is blocking. In general this should run on spawn_blocking | ||
let schema = schema.clone(); | ||
let avro_schemas = avro_schemas.clone(); | ||
let handle = tokio::task::spawn_blocking(move || { | ||
let mut decompressed = vec![]; | ||
decompress_block(&mut block, &mut decompressed, compression)?; | ||
deserialize(&decompressed, rows, schema, &avro_schemas) | ||
}); | ||
let batch = handle.await.unwrap()?; | ||
assert!(batch.num_rows() > 0); | ||
} | ||
|
||
Ok(()) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
# Avro read | ||
|
||
When compiled with feature `io_avro_async`, you can use this crate to read Avro files | ||
asynchronously. | ||
|
||
```rust | ||
{{#include ../../../examples/avro_read_async.rs}} | ||
``` | ||
|
||
Note how both decompression and deserialization is performed on a separate thread pool to not | ||
block (see also [here](https://ryhl.io/blog/async-what-is-blocking/)). |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
//! APIs to read from Avro format to arrow. | ||
use async_stream::try_stream; | ||
use futures::AsyncRead; | ||
use futures::AsyncReadExt; | ||
use futures::Stream; | ||
|
||
use crate::error::{ArrowError, Result}; | ||
|
||
use super::utils::zigzag_i64; | ||
|
||
async fn read_size<R: AsyncRead + Unpin + Send>(reader: &mut R) -> Result<(usize, usize)> { | ||
let rows = match zigzag_i64(reader).await { | ||
Ok(a) => a, | ||
Err(ArrowError::Io(io_err)) => { | ||
if let std::io::ErrorKind::UnexpectedEof = io_err.kind() { | ||
// end | ||
return Ok((0, 0)); | ||
} else { | ||
return Err(ArrowError::Io(io_err)); | ||
} | ||
} | ||
Err(other) => return Err(other), | ||
}; | ||
let bytes = zigzag_i64(reader).await?; | ||
Ok((rows as usize, bytes as usize)) | ||
} | ||
|
||
/// Reads a block from the file into `buf`. | ||
/// # Panic | ||
/// Panics iff the block marker does not equal to the file's marker | ||
async fn read_block<R: AsyncRead + Unpin + Send>( | ||
reader: &mut R, | ||
buf: &mut Vec<u8>, | ||
file_marker: [u8; 16], | ||
) -> Result<usize> { | ||
let (rows, bytes) = read_size(reader).await?; | ||
if rows == 0 { | ||
return Ok(0); | ||
}; | ||
|
||
buf.clear(); | ||
buf.resize(bytes, 0); | ||
reader.read_exact(buf).await?; | ||
|
||
let mut marker = [0u8; 16]; | ||
reader.read_exact(&mut marker).await?; | ||
|
||
assert!(!(marker != file_marker)); | ||
Ok(rows) | ||
} | ||
|
||
/// Returns a fallible [`Stream`] of Avro blocks bound to `reader` | ||
pub async fn block_stream<R: AsyncRead + Unpin + Send>( | ||
reader: &mut R, | ||
file_marker: [u8; 16], | ||
) -> impl Stream<Item = Result<(Vec<u8>, usize)>> + '_ { | ||
try_stream! { | ||
loop { | ||
let mut buffer = vec![]; | ||
let rows = read_block(reader, &mut buffer, file_marker).await?; | ||
if rows == 0 { | ||
break | ||
} | ||
yield (buffer, rows) | ||
} | ||
} | ||
} |
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
//! Async Avro | ||
use std::collections::HashMap; | ||
|
||
use avro_rs::Schema as AvroSchema; | ||
use futures::AsyncRead; | ||
use futures::AsyncReadExt; | ||
|
||
use crate::datatypes::Schema; | ||
use crate::error::{ArrowError, Result}; | ||
|
||
use super::super::read::convert_schema; | ||
use super::super::read::deserialize_header; | ||
use super::super::read::Compression; | ||
use super::super::{read_header, read_metadata}; | ||
use super::utils::zigzag_i64; | ||
|
||
/// Reads Avro's metadata from `reader` into a [`AvroSchema`], [`Compression`] and magic marker. | ||
#[allow(clippy::type_complexity)] | ||
async fn read_metadata_async<R: AsyncRead + Unpin + Send>( | ||
reader: &mut R, | ||
) -> Result<(AvroSchema, Option<Compression>, [u8; 16])> { | ||
read_metadata!(reader.await) | ||
} | ||
|
||
/// Reads the avro metadata from `reader` into a [`AvroSchema`], [`Compression`] and magic marker. | ||
#[allow(clippy::type_complexity)] | ||
pub async fn read_metadata<R: AsyncRead + Unpin + Send>( | ||
reader: &mut R, | ||
) -> Result<(Vec<AvroSchema>, Schema, Option<Compression>, [u8; 16])> { | ||
let (avro_schema, codec, marker) = read_metadata_async(reader).await?; | ||
let schema = convert_schema(&avro_schema)?; | ||
|
||
let avro_schema = if let AvroSchema::Record { fields, .. } = avro_schema { | ||
fields.into_iter().map(|x| x.schema).collect() | ||
} else { | ||
panic!() | ||
}; | ||
|
||
Ok((avro_schema, schema, codec, marker)) | ||
} | ||
|
||
/// Reads the file marker asynchronously | ||
async fn read_file_marker<R: AsyncRead + Unpin + Send>(reader: &mut R) -> Result<[u8; 16]> { | ||
let mut marker = [0u8; 16]; | ||
reader.read_exact(&mut marker).await?; | ||
Ok(marker) | ||
} | ||
|
||
async fn _read_binary<R: AsyncRead + Unpin + Send>(reader: &mut R) -> Result<Vec<u8>> { | ||
let len: usize = zigzag_i64(reader).await? as usize; | ||
let mut buf = vec![0u8; len]; | ||
reader.read_exact(&mut buf).await?; | ||
Ok(buf) | ||
} | ||
|
||
async fn read_header<R: AsyncRead + Unpin + Send>( | ||
reader: &mut R, | ||
) -> Result<HashMap<String, Vec<u8>>> { | ||
read_header!(reader.await) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,53 +1,8 @@ | ||
//! Async Avro | ||
use std::collections::HashMap; | ||
use avro_rs::Schema; | ||
use futures::AsyncRead; | ||
use futures::AsyncReadExt; | ||
mod block; | ||
mod metadata; | ||
pub(self) mod utils; | ||
|
||
use crate::error::{ArrowError, Result}; | ||
|
||
use super::read::deserialize_header; | ||
use super::read::Compression; | ||
use super::{avro_decode, read_header, read_metadata}; | ||
|
||
/// Reads Avro's metadata from `reader` into a [`Schema`], [`Compression`] and magic marker. | ||
#[allow(clippy::type_complexity)] | ||
pub async fn read_metadata_async<R: AsyncRead + Unpin + Send>( | ||
reader: &mut R, | ||
) -> Result<(Schema, Option<Compression>, [u8; 16])> { | ||
read_metadata!(reader.await) | ||
} | ||
|
||
/// Reads the file marker asynchronously | ||
async fn read_file_marker<R: AsyncRead + Unpin + Send>(reader: &mut R) -> Result<[u8; 16]> { | ||
let mut marker = [0u8; 16]; | ||
reader.read_exact(&mut marker).await?; | ||
Ok(marker) | ||
} | ||
|
||
async fn zigzag_i64<R: AsyncRead + Unpin + Send>(reader: &mut R) -> Result<i64> { | ||
let z = decode_variable(reader).await?; | ||
Ok(if z & 0x1 == 0 { | ||
(z >> 1) as i64 | ||
} else { | ||
!(z >> 1) as i64 | ||
}) | ||
} | ||
|
||
async fn decode_variable<R: AsyncRead + Unpin + Send>(reader: &mut R) -> Result<u64> { | ||
avro_decode!(reader.await) | ||
} | ||
|
||
async fn _read_binary<R: AsyncRead + Unpin + Send>(reader: &mut R) -> Result<Vec<u8>> { | ||
let len: usize = zigzag_i64(reader).await? as usize; | ||
let mut buf = vec![0u8; len]; | ||
reader.read_exact(&mut buf).await?; | ||
Ok(buf) | ||
} | ||
|
||
async fn read_header<R: AsyncRead + Unpin + Send>( | ||
reader: &mut R, | ||
) -> Result<HashMap<String, Vec<u8>>> { | ||
read_header!(reader.await) | ||
} | ||
pub use block::block_stream; | ||
pub use metadata::read_metadata; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
use futures::AsyncRead; | ||
use futures::AsyncReadExt; | ||
|
||
use crate::error::{ArrowError, Result}; | ||
|
||
use super::super::avro_decode; | ||
|
||
pub async fn zigzag_i64<R: AsyncRead + Unpin + Send>(reader: &mut R) -> Result<i64> { | ||
let z = decode_variable(reader).await?; | ||
Ok(if z & 0x1 == 0 { | ||
(z >> 1) as i64 | ||
} else { | ||
!(z >> 1) as i64 | ||
}) | ||
} | ||
|
||
async fn decode_variable<R: AsyncRead + Unpin + Send>(reader: &mut R) -> Result<u64> { | ||
avro_decode!(reader.await) | ||
} |
Oops, something went wrong.