-
Notifications
You must be signed in to change notification settings - Fork 850
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Allow to read parquet binary column as UTF8 type #6539
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -3077,6 +3077,119 @@ mod tests { | |||||
); | ||||||
} | ||||||
|
||||||
/// Binary, LargeBinary and BinaryView columns written to Parquet are read back as
/// `Utf8`, `LargeUtf8` and `Utf8View` when the supplied schema asks for those types.
#[test]
fn test_read_binary_as_utf8() {
    // Every column stores the same three byte strings.
    let raw: Vec<&[u8]> = vec![b"one".as_ref(), b"two".as_ref(), b"three".as_ref()];
    // What each column must decode to once it is read as a string type.
    let expected = vec![Some("one"), Some("two"), Some("three")];

    let binary = Arc::new(BinaryArray::from(raw.clone())) as ArrayRef;
    let large_binary = Arc::new(LargeBinaryArray::from(raw.clone())) as ArrayRef;
    let binary_view = Arc::new(BinaryViewArray::from(raw)) as ArrayRef;

    let file = write_parquet_from_iter(vec![
        ("binary_to_utf8", binary),
        ("large_binary_to_large_utf8", large_binary),
        ("binary_view_to_utf8_view", binary_view),
    ]);

    // Supplied schema: same column names, but string types instead of binary ones.
    let supplied_fields = Fields::from(vec![
        Field::new("binary_to_utf8", ArrowDataType::Utf8, false),
        Field::new(
            "large_binary_to_large_utf8",
            ArrowDataType::LargeUtf8,
            false,
        ),
        Field::new("binary_view_to_utf8_view", ArrowDataType::Utf8View, false),
    ]);
    let schema = Arc::new(Schema::new(supplied_fields));
    let options = ArrowReaderOptions::new().with_schema(schema);

    let builder = ParquetRecordBatchReaderBuilder::try_new_with_options(
        file.try_clone().unwrap(),
        options,
    )
    .expect("reader builder with schema");
    let mut arrow_reader = builder.build().expect("reader with schema");

    let batch = arrow_reader.next().unwrap().unwrap();
    assert_eq!(batch.num_columns(), 3);
    assert_eq!(batch.num_rows(), 3);

    // Column 0: Binary -> Utf8
    let strings = batch
        .column(0)
        .as_any()
        .downcast_ref::<StringArray>()
        .expect("downcast to string");
    assert_eq!(strings.iter().collect::<Vec<_>>(), expected);

    // Column 1: LargeBinary -> LargeUtf8
    let large_strings = batch
        .column(1)
        .as_any()
        .downcast_ref::<LargeStringArray>()
        .expect("downcast to large string");
    assert_eq!(large_strings.iter().collect::<Vec<_>>(), expected);

    // Column 2: BinaryView -> Utf8View
    let string_views = batch
        .column(2)
        .as_any()
        .downcast_ref::<StringViewArray>()
        .expect("downcast to string view");
    assert_eq!(string_views.iter().collect::<Vec<_>>(), expected);
}
|
||||||
/// Writes a Binary column whose values are not valid UTF-8, then reads it back through a
/// supplied schema that declares the column as `Utf8`. The test passes only if a panic
/// whose message contains "Invalid UTF8 sequence at" occurs.
#[test] | ||||||
#[should_panic(expected = "Invalid UTF8 sequence at")] | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ❤️ |
||||||
fn test_read_non_utf8_binary_as_utf8() { | ||||||
let file = write_parquet_from_iter(vec![( | ||||||
"non_utf8_binary", | ||||||
Arc::new(BinaryArray::from(vec![ | ||||||
// None of these values is valid UTF-8: 0xDE is never followed by a continuation byte.
b"\xDE\x00\xFF".as_ref(), | ||||||
b"\xDE\x01\xAA".as_ref(), | ||||||
b"\xDE\x02\xFF".as_ref(), | ||||||
])) as ArrayRef, | ||||||
)]); | ||||||
// Supplied schema declares the same column as non-nullable Utf8.
let supplied_fields = Fields::from(vec![Field::new( | ||||||
"non_utf8_binary", | ||||||
ArrowDataType::Utf8, | ||||||
false, | ||||||
)]); | ||||||
|
||||||
let options = ArrowReaderOptions::new().with_schema(Arc::new(Schema::new(supplied_fields))); | ||||||
let mut arrow_reader = ParquetRecordBatchReaderBuilder::try_new_with_options( | ||||||
file.try_clone().unwrap(), | ||||||
options, | ||||||
) | ||||||
.expect("reader builder with schema") | ||||||
.build() | ||||||
.expect("reader with schema"); | ||||||
|
||||||
// NOTE(review): the item returned by `next()` is discarded, so an `Err` returned here
// would go unnoticed; the test can only pass if a panic is raised — confirm this is intended.
arrow_reader.next(); | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
As this should error, given the data isn't actually UTF-8 |
||||||
} | ||||||
|
||||||
#[test] | ||||||
fn test_with_schema() { | ||||||
let nested_fields = Fields::from(vec![ | ||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -57,6 +57,11 @@ fn apply_hint(parquet: DataType, hint: DataType) -> DataType { | |
(DataType::Utf8, DataType::LargeUtf8) => hint, | ||
(DataType::Binary, DataType::LargeBinary) => hint, | ||
|
||
// Read as Utf8 | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. ❤️ |
||
(DataType::Binary, DataType::Utf8) => hint, | ||
(DataType::Binary, DataType::LargeUtf8) => hint, | ||
(DataType::Binary, DataType::Utf8View) => hint, | ||
|
||
// Determine view type | ||
(DataType::Utf8, DataType::Utf8View) => hint, | ||
(DataType::Binary, DataType::BinaryView) => hint, | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
And the same below