From 6bc31fdaf4245b43951bd3c811ddbdef5cbae815 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Wed, 14 Feb 2024 16:58:04 -0800 Subject: [PATCH 1/5] fix :D --- cpp/src/io/parquet/reader_impl.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index 24d46d91dbb..9e3c0e9575c 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -156,7 +156,8 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) valids[idx] = out_buf.null_mask(); data[idx] = out_buf.data(); // only do string buffer for leaf - if (out_buf.string_size() == 0 && col_sizes[pass.chunks[c].src_col_index] > 0) { + if (idx == max_depth - 1 and out_buf.string_size() == 0 and + col_sizes[pass.chunks[c].src_col_index] > 0) { out_buf.create_string_data(col_sizes[pass.chunks[c].src_col_index], _stream); } if (has_strings) { str_data[idx] = out_buf.string_data(); } From 10b52e80494b6ff9fdb62854910fb2140452af89 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Mon, 26 Feb 2024 10:37:42 -0800 Subject: [PATCH 2/5] rename --- cpp/src/io/parquet/reader_impl.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index c2b7e2ecffe..efbd81616f1 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -60,7 +60,7 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) // TODO: This step is somewhat redundant if size info has already been calculated (nested schema, // chunked reader). 
auto const has_strings = (kernel_mask & STRINGS_MASK) != 0; - std::vector<size_t> col_sizes(_input_columns.size(), 0L); + std::vector<size_t> col_string_sizes(_input_columns.size(), 0L); if (has_strings) { ComputePageStringSizes(subpass.pages, pass.chunks, @@ -71,10 +71,10 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) kernel_mask, _stream); - col_sizes = calculate_page_string_offsets(); + col_string_sizes = calculate_page_string_offsets(); // check for overflow - if (std::any_of(col_sizes.cbegin(), col_sizes.cend(), [](size_t sz) { + if (std::any_of(col_string_sizes.cbegin(), col_string_sizes.cend(), [](size_t sz) { return sz > std::numeric_limits<size_type>::max(); })) { CUDF_FAIL("String column exceeds the column size limit", std::overflow_error); @@ -158,8 +158,8 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) data[idx] = out_buf.data(); // only do string buffer for leaf if (idx == max_depth - 1 and out_buf.string_size() == 0 and - col_sizes[pass.chunks[c].src_col_index] > 0) { - out_buf.create_string_data(col_sizes[pass.chunks[c].src_col_index], _stream); + col_string_sizes[pass.chunks[c].src_col_index] > 0) { + out_buf.create_string_data(col_string_sizes[pass.chunks[c].src_col_index], _stream); } if (has_strings) { str_data[idx] = out_buf.string_data(); } out_buf.user_data |= @@ -282,7 +282,7 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) out_buf.user_data |= PARQUET_COLUMN_BUFFER_FLAG_LIST_TERMINATED; } else if (out_buf.type.id() == type_id::STRING) { // need to cap off the string offsets column - size_type const sz = static_cast<size_type>(col_sizes[idx]); + size_type const sz = static_cast<size_type>(col_string_sizes[idx]); cudaMemcpyAsync(static_cast(out_buf.data()) + out_buf.size, &sz, sizeof(size_type), From 7bac5f736e1bbf0f4201087fef10c611ad60784b Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Mon, 26 Feb 2024 11:40:28 -0800 Subject: [PATCH 3/5] std size_t Co-authored-by: Nghia Truong 
<7416935+ttnghia@users.noreply.github.com> --- cpp/src/io/parquet/reader_impl.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index efbd81616f1..0da8e4aa326 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -74,7 +74,7 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) col_string_sizes = calculate_page_string_offsets(); // check for overflow - if (std::any_of(col_string_sizes.cbegin(), col_string_sizes.cend(), [](size_t sz) { + if (std::any_of(col_string_sizes.cbegin(), col_string_sizes.cend(), [](std::size_t sz) { return sz > std::numeric_limits<size_type>::max(); })) { CUDF_FAIL("String column exceeds the column size limit", std::overflow_error); From d2b53b517ddbd476a26692be962976b45b0874e3 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Mon, 26 Feb 2024 11:49:31 -0800 Subject: [PATCH 4/5] iron out types --- cpp/src/io/parquet/reader_impl.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index 0da8e4aa326..2dfd8a66a6b 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -273,17 +273,17 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) auto const& child = (*cols)[input_col.nesting[l_idx + 1]]; // the final offset for a list at level N is the size of it's child - int const offset = child.type.id() == type_id::LIST ? child.size - 1 : child.size; - CUDF_CUDA_TRY(cudaMemcpyAsync(static_cast(out_buf.data()) + (out_buf.size - 1), + size_type const offset = child.type.id() == type_id::LIST ? 
child.size - 1 : child.size; + CUDF_CUDA_TRY(cudaMemcpyAsync(static_cast<size_type*>(out_buf.data()) + (out_buf.size - 1), &offset, - sizeof(offset), + sizeof(size_type), cudaMemcpyDefault, _stream.value())); out_buf.user_data |= PARQUET_COLUMN_BUFFER_FLAG_LIST_TERMINATED; } else if (out_buf.type.id() == type_id::STRING) { // need to cap off the string offsets column - size_type const sz = static_cast<size_type>(col_string_sizes[idx]); - cudaMemcpyAsync(static_cast(out_buf.data()) + out_buf.size, + auto const sz = static_cast<size_type>(col_string_sizes[idx]); + cudaMemcpyAsync(static_cast<size_type*>(out_buf.data()) + out_buf.size, &sz, sizeof(size_type), cudaMemcpyDefault, From e9e723646c8c8a329409a747a558aeb5f0a56bb7 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Mon, 26 Feb 2024 12:58:19 -0800 Subject: [PATCH 5/5] try, despite what Yoda might say --- cpp/src/io/parquet/reader_impl.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index 2dfd8a66a6b..93fc6bd6bb5 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -283,11 +283,11 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) } else if (out_buf.type.id() == type_id::STRING) { // need to cap off the string offsets column auto const sz = static_cast<size_type>(col_string_sizes[idx]); - cudaMemcpyAsync(static_cast<size_type*>(out_buf.data()) + out_buf.size, - &sz, - sizeof(size_type), - cudaMemcpyDefault, - _stream.value()); + CUDF_CUDA_TRY(cudaMemcpyAsync(static_cast<size_type*>(out_buf.data()) + out_buf.size, + &sz, + sizeof(size_type), + cudaMemcpyDefault, + _stream.value())); } } }