From 39f7ff413cef2e946b03c9ee5ea908bedc6591ee Mon Sep 17 00:00:00 2001
From: galipremsagar
Date: Tue, 18 Jan 2022 08:37:14 -0800
Subject: [PATCH 1/2] fix columns ordering issue

---
 python/cudf/cudf/_lib/parquet.pyx      | 13 ++++++++++---
 python/cudf/cudf/tests/test_parquet.py | 18 ++++++++++++++++++
 2 files changed, 28 insertions(+), 3 deletions(-)

diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx
index 16873435e1d..b9d4e07ec07 100644
--- a/python/cudf/cudf/_lib/parquet.pyx
+++ b/python/cudf/cudf/_lib/parquet.pyx
@@ -200,12 +200,19 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None,
 
     update_struct_field_names(df, c_out_table.metadata.schema_info)
 
-    # update the decimal precision of each column
     if meta is not None:
-        for col, col_meta in zip(column_names, meta["columns"]):
+        # Book keep each column metadata as the order
+        # of `meta["columns"]` and `column_names` are not
+        # guaranteed to be deterministic and same always.
+        meta_data_per_column = {}
+        for col_meta in meta["columns"]:
+            meta_data_per_column[col_meta['name']] = col_meta
+
+        # update the decimal precision of each column
+        for col in column_names:
             if is_decimal_dtype(df._data[col].dtype):
                 df._data[col].dtype.precision = (
-                    col_meta["metadata"]["precision"]
+                    meta_data_per_column[col]["metadata"]["precision"]
                 )
 
     # Set the index column
diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py
index 016ed1229f1..2b6ddea447b 100644
--- a/python/cudf/cudf/tests/test_parquet.py
+++ b/python/cudf/cudf/tests/test_parquet.py
@@ -2373,3 +2373,21 @@ def test_parquet_writer_row_group_size(
         math.ceil(num_rows / size_rows), math.ceil(8 * num_rows / size_bytes)
     )
     assert expected_num_rows == row_groups
+
+
+def test_parquet_reader_decimal_columns():
+    df = cudf.DataFrame(
+        {
+            "col1": cudf.Series([1, 2, 3], dtype=cudf.Decimal64Dtype(10, 2)),
+            "col2": [10, 11, 12],
+            "col3": [12, 13, 14],
+            "col4": ["a", "b", "c"],
+        }
+    )
+    buffer = BytesIO()
+    df.to_parquet(buffer)
+
+    actual = cudf.read_parquet(buffer, columns=["col3", "col2", "col1"])
+    expected = pd.read_parquet(buffer, columns=["col3", "col2", "col1"])
+
+    assert_eq(actual, expected)

From 478942e8442238ae5946146b1d57412007cf04df Mon Sep 17 00:00:00 2001
From: galipremsagar
Date: Tue, 18 Jan 2022 08:59:31 -0800
Subject: [PATCH 2/2] simplify

---
 python/cudf/cudf/_lib/parquet.pyx | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx
index b9d4e07ec07..8cb7dd942c1 100644
--- a/python/cudf/cudf/_lib/parquet.pyx
+++ b/python/cudf/cudf/_lib/parquet.pyx
@@ -204,9 +204,9 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None,
         # Book keep each column metadata as the order
         # of `meta["columns"]` and `column_names` are not
         # guaranteed to be deterministic and same always.
-        meta_data_per_column = {}
-        for col_meta in meta["columns"]:
-            meta_data_per_column[col_meta['name']] = col_meta
+        meta_data_per_column = {
+            col_meta['name']: col_meta for col_meta in meta["columns"]
+        }
 
         # update the decimal precision of each column
         for col in column_names: