-
Notifications
You must be signed in to change notification settings - Fork 867
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fix Parquet reader for null lists #1448
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -26,10 +26,10 @@ use std::vec::Vec; | |
|
||
use arrow::array::{ | ||
new_empty_array, Array, ArrayData, ArrayDataBuilder, ArrayRef, BinaryArray, | ||
BinaryBuilder, BooleanArray, BooleanBufferBuilder, BooleanBuilder, | ||
BinaryBuilder, BooleanArray, BooleanBufferBuilder, BooleanBuilder, DecimalArray, | ||
FixedSizeBinaryArray, FixedSizeBinaryBuilder, GenericListArray, Int16BufferBuilder, | ||
Int32Array, Int64Array, MapArray, OffsetSizeTrait, PrimitiveArray, PrimitiveBuilder, | ||
StringArray, StringBuilder, StructArray, DecimalArray, | ||
Int32Array, Int64Array, MapArray, NullArray, OffsetSizeTrait, PrimitiveArray, | ||
PrimitiveBuilder, StringArray, StringBuilder, StructArray, | ||
}; | ||
use arrow::buffer::{Buffer, MutableBuffer}; | ||
use arrow::datatypes::{ | ||
|
@@ -430,14 +430,16 @@ where | |
} | ||
ArrowType::Decimal(p, s) => { | ||
let array = match array.data_type() { | ||
ArrowType::Int32 => array.as_any() | ||
ArrowType::Int32 => array | ||
.as_any() | ||
.downcast_ref::<Int32Array>() | ||
.unwrap() | ||
.iter() | ||
.map(|v| v.map(|v| v.into())) | ||
.collect::<DecimalArray>(), | ||
|
||
ArrowType::Int64 => array.as_any() | ||
ArrowType::Int64 => array | ||
.as_any() | ||
.downcast_ref::<Int64Array>() | ||
.unwrap() | ||
.iter() | ||
|
@@ -885,6 +887,7 @@ fn remove_indices( | |
Ok(Arc::new(StructArray::from((new_columns, valid.finish())))) | ||
} | ||
} | ||
ArrowType::Null => Ok(Arc::new(NullArray::new(arr.len()))), | ||
_ => Err(ParquetError::General(format!( | ||
"ListArray of type List({:?}) is not supported by array_reader", | ||
item_type | ||
|
@@ -924,7 +927,7 @@ impl<OffsetSize: OffsetSizeTrait> ArrayReader for ListArrayReader<OffsetSize> { | |
&& (rep_levels.len() == next_batch_array.len())) | ||
{ | ||
return Err(ArrowError( | ||
"Expected item_reader def_levels and rep_levels to be same length as batch".to_string(), | ||
format!("Expected item_reader def_levels {} and rep_levels {} to be same length as batch {}", def_levels.len(), rep_levels.len(), next_batch_array.len()), | ||
)); | ||
} | ||
|
||
|
@@ -964,6 +967,7 @@ impl<OffsetSize: OffsetSizeTrait> ArrayReader for ListArrayReader<OffsetSize> { | |
cur_offset += OffsetSize::one(); | ||
} | ||
}); | ||
|
||
offsets.push(cur_offset); | ||
|
||
let num_bytes = bit_util::ceil(offsets.len(), 8); | ||
|
@@ -1767,15 +1771,13 @@ impl<'a> ArrayReaderBuilder { | |
)), | ||
PhysicalType::INT96 => { | ||
// get the optional timezone information from arrow type | ||
let timezone = arrow_type | ||
.as_ref() | ||
.and_then(|data_type| { | ||
if let ArrowType::Timestamp(_, tz) = data_type { | ||
tz.clone() | ||
} else { | ||
None | ||
} | ||
}); | ||
let timezone = arrow_type.as_ref().and_then(|data_type| { | ||
if let ArrowType::Timestamp(_, tz) = data_type { | ||
tz.clone() | ||
} else { | ||
None | ||
} | ||
}); | ||
let converter = Int96Converter::new(Int96ArrayConverter { timezone }); | ||
Ok(Box::new(ComplexObjectArrayReader::< | ||
Int96Type, | ||
|
@@ -1983,13 +1985,15 @@ impl<'a> ArrayReaderBuilder { | |
if i == 1 { | ||
field = self.arrow_schema.field_with_name(part).ok(); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Not part of this PR, but I wonder why this doesn't just initialize |
||
} else if let Some(f) = field { | ||
if let ArrowType::Struct(fields) = f.data_type() { | ||
field = fields.iter().find(|f| f.name() == part) | ||
} else { | ||
field = None | ||
match f.data_type() { | ||
ArrowType::Struct(fields) => { | ||
field = fields.iter().find(|f| f.name() == part) | ||
} | ||
ArrowType::List(list_field) => field = Some(list_field.as_ref()), | ||
_ => field = None, | ||
} | ||
} else { | ||
field = None | ||
field = None; | ||
} | ||
} | ||
field | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1210,4 +1210,32 @@ mod tests { | |
assert_eq!(get_dict(&batches[3]), get_dict(&batches[4])); | ||
assert_eq!(get_dict(&batches[4]), get_dict(&batches[5])); | ||
} | ||
|
||
#[test] | ||
fn test_read_null_list() { | ||
let testdata = arrow::util::test_util::parquet_test_data(); | ||
let path = format!("{}/null_list.parquet", testdata); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I downloaded the test parquet file from @novemberkilo's There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. You could point to my parquet-testing fork like I have done here novemberkilo@e5952ae#diff-fe7afb5c9c916e521401d3fcfb4277d5071798c3baf83baf11d6071742823584 There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Okay, I think it works for CI purposes in the PR. Not sure if we can merge into master like that. Anyway, let me point to your fork first to test it. Thanks. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @novemberkilo could you open a PR against There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yeah, I think it is a good idea. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @viirya fwiw I did something like:
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Oh, I see. Thanks. I added the file manually to test it before. And seems updating There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I will have a play with creating a test that generates a parquet file, so that we can get this PR in without waiting for parquet-testing. I will also file a ticket to start a discussion on a faster way to get parquet test files checked in, without relying on an upstream repo. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Thanks @tustvold. The parquet file was merged. It'd be great to get parquet test files checked in faster. |
||
let parquet_file_reader = | ||
SerializedFileReader::try_from(File::open(&path).unwrap()).unwrap(); | ||
let mut arrow_reader = ParquetFileArrowReader::new(Arc::new(parquet_file_reader)); | ||
let mut record_batch_reader = arrow_reader | ||
.get_record_reader(60) | ||
.expect("Failed to read into array!"); | ||
|
||
let batch = record_batch_reader.next().unwrap().unwrap(); | ||
assert_eq!(batch.num_rows(), 1); | ||
assert_eq!(batch.num_columns(), 1); | ||
assert_eq!(batch.column(0).len(), 1); | ||
|
||
let list = batch | ||
.column(0) | ||
.as_any() | ||
.downcast_ref::<ListArray>() | ||
.unwrap(); | ||
assert_eq!(list.len(), 1); | ||
assert!(list.is_valid(0)); | ||
|
||
let val = list.value(0); | ||
assert_eq!(val.len(), 0); | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Purely for debugging purposes. I feel it is clear.