From 9cb33c273be2dcb1d176af998757981965b73965 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Sat, 13 Aug 2022 01:10:40 +0000 Subject: [PATCH 1/2] fixing nested write issue --- cpp/src/io/utilities/column_utils.cuh | 16 ++++++++++------ cpp/tests/io/parquet_test.cpp | 3 +++ 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/cpp/src/io/utilities/column_utils.cuh b/cpp/src/io/utilities/column_utils.cuh index 1faab805811..c6d50028813 100644 --- a/cpp/src/io/utilities/column_utils.cuh +++ b/cpp/src/io/utilities/column_utils.cuh @@ -67,13 +67,17 @@ rmm::device_uvector create_leaf_column_device_views( size_type index) mutable { col_desc[index].parent_column = parent_col_view.begin() + index; column_device_view col = parent_col_view.column(index); - if (col_desc[index].stats_dtype != dtype_byte_array) { - // traverse till leaf column - while (col.type().id() == type_id::LIST or col.type().id() == type_id::STRUCT) { - col = (col.type().id() == type_id::LIST) - ? col.child(lists_column_view::child_column_index) - : col.child(0); + // traverse till leaf column + while (col.type().id() == type_id::LIST or col.type().id() == type_id::STRUCT) { + auto child = (col.type().id() == type_id::LIST) + ? col.child(lists_column_view::child_column_index) + : col.child(0); + // stop early if writing a byte array, it needs to be a list not the int8 column + if (col_desc[index].stats_dtype == dtype_byte_array && + (child.type().id() == type_id::INT8 || child.type().id() == type_id::UINT8)) { + break; } + col = child; } // Store leaf_column to device storage column_device_view* leaf_col_ptr = leaf_columns.begin() + index; diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp index 0350bfe2981..774c58f1ecf 100644 --- a/cpp/tests/io/parquet_test.cpp +++ b/cpp/tests/io/parquet_test.cpp @@ -4250,6 +4250,9 @@ TEST_F(ParquetWriterTest, ByteArrayStats) read_footer(source, &fmd); + EXPECT_EQ(fmd.schema[1].type, cudf::io::parquet::Type::BYTE_ARRAY); + EXPECT_EQ(fmd.schema[2].type, cudf::io::parquet::Type::BYTE_ARRAY); + auto const stats0 = parse_statistics(fmd.row_groups[0].columns[0]); auto const stats1 = parse_statistics(fmd.row_groups[0].columns[1]); From 5bcfc5945e09db3ad5ca0c1f0753356c6af6ae05 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Mon, 15 Aug 2022 16:29:44 +0000 Subject: [PATCH 2/2] review comments update --- cpp/src/io/utilities/column_utils.cuh | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/src/io/utilities/column_utils.cuh b/cpp/src/io/utilities/column_utils.cuh index c6d50028813..ecb74173a46 100644 --- a/cpp/src/io/utilities/column_utils.cuh +++ b/cpp/src/io/utilities/column_utils.cuh @@ -68,11 +68,11 @@ rmm::device_uvector create_leaf_column_device_views( col_desc[index].parent_column = parent_col_view.begin() + index; column_device_view col = parent_col_view.column(index); // traverse till leaf column - while (col.type().id() == type_id::LIST or col.type().id() == type_id::STRUCT) { - auto child = (col.type().id() == type_id::LIST) - ? col.child(lists_column_view::child_column_index) - : col.child(0); - // stop early if writing a byte array, it needs to be a list not the int8 column + while (cudf::is_nested(col.type())) { + auto const child = (col.type().id() == type_id::LIST) + ? col.child(lists_column_view::child_column_index) + : col.child(0); + // stop early if writing a byte array if (col_desc[index].stats_dtype == dtype_byte_array && (child.type().id() == type_id::INT8 || child.type().id() == type_id::UINT8)) { break;