From fc31aa3c4f99d6348e7c32a3e3c52c68b26ca700 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Tue, 4 Jun 2024 10:19:30 -0400 Subject: [PATCH] Add overflow check when converting large strings to lists columns (#15887) Fixes a couple of places where strings columns are converted to lists columns as binary -- chars are represented as UINT8. Since lists columns only support offsets of type `size_type`, this change throws an error if the size of the chars exceeds the maximum `size_type` value. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Mike Wilson (https://github.com/hyperbolic2346) - MithunR (https://github.com/mythrocks) URL: https://github.com/rapidsai/cudf/pull/15887 --- cpp/src/io/utilities/column_buffer.cpp | 4 ++++ cpp/src/reshape/byte_cast.cu | 11 ++++++++--- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/cpp/src/io/utilities/column_buffer.cpp b/cpp/src/io/utilities/column_buffer.cpp index e5d4e1a360f..27fc53fbc9e 100644 --- a/cpp/src/io/utilities/column_buffer.cpp +++ b/cpp/src/io/utilities/column_buffer.cpp @@ -191,6 +191,10 @@ std::unique_ptr make_column(column_buffer_base& buffer, auto data = col_content.data.release(); auto char_size = data->size(); + CUDF_EXPECTS(char_size < static_cast(std::numeric_limits::max()), + "Cannot convert strings column to lists column due to size_type limit", + std::overflow_error); + auto uint8_col = std::make_unique( data_type{type_id::UINT8}, char_size, std::move(*data), rmm::device_buffer{}, 0); diff --git a/cpp/src/reshape/byte_cast.cu b/cpp/src/reshape/byte_cast.cu index 1b05a9744fa..3dfa0b65814 100644 --- a/cpp/src/reshape/byte_cast.cu +++ b/cpp/src/reshape/byte_cast.cu @@ -135,9 +135,14 @@ struct byte_list_conversion_fn(input, stream, mr)->release(); - auto const num_chars = col_content.data->size(); - auto uint8_col = std::make_unique( + auto const num_chars = strings_column_view(input).chars_size(stream); + CUDF_EXPECTS(num_chars < 
static_cast(std::numeric_limits::max()), + "Cannot convert strings column to lists column due to size_type limit", + std::overflow_error); + + auto col_content = std::make_unique(input, stream, mr)->release(); + + auto uint8_col = std::make_unique( output_type, num_chars, std::move(*(col_content.data)), rmm::device_buffer{}, 0); auto result = make_lists_column(