Skip to content
This repository has been archived by the owner on Feb 18, 2024. It is now read-only.

Commit

Permalink
Fixed reading parquet with timezone (#862)
Browse files Browse the repository at this point in the history
  • Loading branch information
jorgecarleitao authored Feb 24, 2022
1 parent 196e6fb commit fb5d433
Show file tree
Hide file tree
Showing 4 changed files with 29 additions and 6 deletions.
2 changes: 2 additions & 0 deletions parquet_integration/write_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ def case_basic_nullable(size=1):
pa.field("timestamp_us", pa.timestamp("us")),
pa.field("timestamp_s", pa.timestamp("s")),
pa.field("emoji", pa.utf8()),
pa.field("timestamp_s_utc", pa.timestamp("s", "UTC")),
]
schema = pa.schema(fields)

Expand All @@ -50,6 +51,7 @@ def case_basic_nullable(size=1):
"timestamp_us": int64 * size,
"timestamp_s": int64 * size,
"emoji": emoji * size,
"timestamp_s_utc": int64 * size,
},
schema,
f"basic_nullable_{size*10}.parquet",
Expand Down
15 changes: 9 additions & 6 deletions src/io/parquet/read/deserialize/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ pub fn page_iter_to_arrays<'a, I: 'a + DataPages>(
primitive::Iter::new(pages, data_type, chunk_size, |x: i32| x as i32),
)),

Timestamp(time_unit, None) => {
Timestamp(time_unit, _) => {
let time_unit = *time_unit;
return timestamp(
pages,
Expand Down Expand Up @@ -150,9 +150,12 @@ pub fn page_iter_to_arrays<'a, I: 'a + DataPages>(
},

// INT64
Int64 | Date64 | Time64(_) | Duration(_) | Timestamp(_, _) => dyn_iter(iden(
primitive::Iter::new(pages, data_type, chunk_size, |x: i64| x as i64),
)),
Int64 | Date64 | Time64(_) | Duration(_) => dyn_iter(iden(primitive::Iter::new(
pages,
data_type,
chunk_size,
|x: i64| x as i64,
))),
UInt64 => dyn_iter(iden(primitive::Iter::new(
pages,
data_type,
Expand Down Expand Up @@ -444,7 +447,7 @@ fn dict_read<'a, K: DictionaryKey, I: 'a + DataPages>(
}),
),

Timestamp(time_unit, None) => {
Timestamp(time_unit, _) => {
let time_unit = *time_unit;
return timestamp_dict::<K, _>(
iter,
Expand All @@ -456,7 +459,7 @@ fn dict_read<'a, K: DictionaryKey, I: 'a + DataPages>(
);
}

Int64 | Date64 | Time64(_) | Duration(_) | Timestamp(_, _) => dyn_iter(
Int64 | Date64 | Time64(_) | Duration(_) => dyn_iter(
primitive::DictIter::<K, _, _, _, _>::new(iter, data_type, chunk_size, |x: i64| x),
),
Float32 => dyn_iter(primitive::DictIter::<K, _, _, _, _>::new(
Expand Down
13 changes: 13 additions & 0 deletions tests/it/io/parquet/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -330,6 +330,12 @@ pub fn pyarrow_nullable(column: usize) -> Box<dyn Array> {
11 => Box::new(
PrimitiveArray::<i64>::from(i64_values).to(DataType::Timestamp(TimeUnit::Second, None)),
),
13 => Box::new(
PrimitiveArray::<i64>::from(i64_values).to(DataType::Timestamp(
TimeUnit::Second,
Some("UTC".to_string()),
)),
),
_ => unreachable!(),
}
}
Expand Down Expand Up @@ -415,6 +421,13 @@ pub fn pyarrow_nullable_statistics(column: usize) -> Option<Box<dyn Statistics>>
min_value: Some(0),
max_value: Some(9),
}),
13 => Box::new(PrimitiveStatistics::<i64> {
data_type: DataType::Timestamp(TimeUnit::Second, Some("UTC".to_string())),
distinct_count: None,
null_count: Some(3),
min_value: Some(0),
max_value: Some(9),
}),
_ => unreachable!(),
})
}
Expand Down
5 changes: 5 additions & 0 deletions tests/it/io/parquet/read.rs
Original file line number Diff line number Diff line change
Expand Up @@ -355,6 +355,11 @@ fn v1_timestamp_s_nullable_dict() -> Result<()> {
test_pyarrow_integration(11, 1, "basic", true, false, None)
}

#[test]
fn v1_timestamp_s_utc_nullable() -> Result<()> {
test_pyarrow_integration(13, 1, "basic", false, false, None)
}

#[test]
fn v2_decimal_26_required() -> Result<()> {
test_pyarrow_integration(8, 2, "basic", false, true, None)
Expand Down

0 comments on commit fb5d433

Please sign in to comment.