From 265743d750aca030dc852418cdd1fca921031739 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Wed, 24 May 2023 19:49:23 -0400 Subject: [PATCH] Use std::overflow_error when output would exceed column size limit (#13323) Replaces generic `cudf::logic_error` exception with `std::overflow_error` where appropriate in libcudf. Since this changes what is thrown in certain APIs, I think this technically is a breaking change. Closes #12925 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Divye Gala (https://github.com/divyegala) - Nghia Truong (https://github.com/ttnghia) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/13323 --- cpp/include/cudf/column/column.hpp | 3 ++- cpp/include/cudf/column/column_view.hpp | 5 +++-- cpp/include/cudf/detail/join.hpp | 2 -- .../cudf/detail/sizes_to_offsets_iterator.cuh | 2 +- cpp/include/cudf/filling.hpp | 8 +++----- cpp/include/cudf/join.hpp | 12 ++++-------- cpp/include/cudf/strings/detail/gather.cuh | 5 +++-- .../cudf/strings/detail/strings_children.cuh | 4 ++-- cpp/include/cudf/strings/repeat_strings.hpp | 7 ++----- cpp/include/nvtext/minhash.hpp | 1 + cpp/include/nvtext/subword_tokenize.hpp | 4 ++-- cpp/src/copying/concatenate.cu | 12 +++++------- cpp/src/copying/gather.cu | 3 ++- cpp/src/copying/scatter.cu | 3 ++- cpp/src/filling/repeat.cu | 14 ++++++++------ cpp/src/interop/dlpack.cpp | 12 +++++++----- cpp/src/io/utilities/row_selection.cpp | 3 ++- cpp/src/io/utilities/row_selection.hpp | 3 +-- cpp/src/join/hash_join.cu | 4 ---- cpp/src/join/join_common_utils.hpp | 1 - cpp/src/lists/sequences.cu | 4 ++-- cpp/src/strings/case.cu | 4 ++-- cpp/src/strings/copying/concatenate.cu | 6 ++++-- cpp/src/strings/regex/utilities.cuh | 4 ++-- cpp/src/strings/repeat_strings.cu | 3 ++- cpp/src/text/minhash.cu | 9 ++++----- cpp/src/text/ngrams_tokenize.cu | 2 +- cpp/src/text/normalize.cu | 5 +++-- cpp/src/text/subword/subword_tokenize.cu | 8 
+++++--- cpp/tests/copying/concatenate_tests.cpp | 4 ++-- cpp/tests/filling/repeat_tests.cpp | 19 +++++++++++++++++++ cpp/tests/interop/dlpack_test.cpp | 4 ++-- cpp/tests/io/row_selection_test.cpp | 2 +- cpp/tests/strings/array_tests.cpp | 2 +- cpp/tests/strings/repeat_strings_tests.cpp | 2 +- cpp/tests/text/minhash_tests.cpp | 6 ++---- cpp/tests/text/subword_tests.cpp | 2 +- 37 files changed, 104 insertions(+), 90 deletions(-) diff --git a/cpp/include/cudf/column/column.hpp b/cpp/include/cudf/column/column.hpp index a28bf82962b..a38186458c4 100644 --- a/cpp/include/cudf/column/column.hpp +++ b/cpp/include/cudf/column/column.hpp @@ -89,7 +89,8 @@ class column { _size{[&]() { CUDF_EXPECTS( other.size() <= static_cast(std::numeric_limits::max()), - "The device_uvector size exceeds the maximum size_type."); + "The device_uvector size exceeds the column size limit", + std::overflow_error); return static_cast(other.size()); }()}, _data{other.release()}, diff --git a/cpp/include/cudf/column/column_view.hpp b/cpp/include/cudf/column/column_view.hpp index 703131053f9..d80c720a255 100644 --- a/cpp/include/cudf/column/column_view.hpp +++ b/cpp/include/cudf/column/column_view.hpp @@ -422,8 +422,9 @@ class column_view : public detail::column_view_base { cudf::data_type{cudf::type_to_id()}, data.size(), data.data(), nullptr, 0, 0, {}) { CUDF_EXPECTS( - data.size() < static_cast(std::numeric_limits::max()), - "Data exceeds the maximum size of a column view."); + data.size() <= static_cast(std::numeric_limits::max()), + "Data exceeds the column size limit", + std::overflow_error); } /** diff --git a/cpp/include/cudf/detail/join.hpp b/cpp/include/cudf/detail/join.hpp index c424c20d7c7..4a34eb6b328 100644 --- a/cpp/include/cudf/detail/join.hpp +++ b/cpp/include/cudf/detail/join.hpp @@ -86,7 +86,6 @@ struct hash_join { * @brief Constructor that internally builds the hash table based on the given `build` table. 
* * @throw cudf::logic_error if the number of columns in `build` table is 0. - * @throw cudf::logic_error if the number of rows in `build` table exceeds MAX_JOIN_SIZE. * * @param build The build table, from which the hash table is built. * @param has_nulls Flag to indicate if the there exists any nulls in the `build` table or @@ -177,7 +176,6 @@ struct hash_join { * @copydoc cudf::detail::hash_join::probe_join_indices * * @throw cudf::logic_error if probe table is empty. - * @throw cudf::logic_error if the size of probe table exceeds `MAX_JOIN_SIZE`. * @throw cudf::logic_error if the number of columns in build table and probe table do not match. * @throw cudf::logic_error if the column data types in build table and probe table do not match. */ diff --git a/cpp/include/cudf/detail/sizes_to_offsets_iterator.cuh b/cpp/include/cudf/detail/sizes_to_offsets_iterator.cuh index 0017ddb305d..7395c2692be 100644 --- a/cpp/include/cudf/detail/sizes_to_offsets_iterator.cuh +++ b/cpp/include/cudf/detail/sizes_to_offsets_iterator.cuh @@ -319,7 +319,7 @@ std::pair, size_type> make_offsets_child_column( auto const total_elements = sizes_to_offsets(input_itr, input_itr + count + 1, d_offsets, stream); CUDF_EXPECTS( total_elements <= static_cast(std::numeric_limits::max()), - "Size of output exceeds column size limit", + "Size of output exceeds the column size limit", std::overflow_error); offsets_column->set_null_count(0); diff --git a/cpp/include/cudf/filling.hpp b/cpp/include/cudf/filling.hpp index 8688e97ab7e..a82bb9d1a48 100644 --- a/cpp/include/cudf/filling.hpp +++ b/cpp/include/cudf/filling.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -131,10 +131,8 @@ std::unique_ptr repeat( * count = 2 * return = [4,4,5,5,6,6] * ``` - * @throws cudf::logic_error if the data type of @p count is not size_type. - * @throws cudf::logic_error if @p count is invalid or @p count is negative. - * @throws cudf::logic_error if @p input_table.num_rows() * @p count overflows - * size_type. + * @throws cudf::logic_error if @p count is negative. + * @throws std::overflow_error if @p input_table.num_rows() * @p count overflows size_type. * * @param input_table Input table * @param count Number of repetitions diff --git a/cpp/include/cudf/join.hpp b/cpp/include/cudf/join.hpp index 11d1bbf9fc8..314a1bbfad7 100644 --- a/cpp/include/cudf/join.hpp +++ b/cpp/include/cudf/join.hpp @@ -167,7 +167,7 @@ full_join(cudf::table_view const& left_keys, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Returns a vector of row indices corresponding to a left semi join + * @brief Returns a vector of row indices corresponding to a left semi-join * between the specified tables. * * The returned vector contains the row indices from the left table @@ -179,13 +179,9 @@ full_join(cudf::table_view const& left_keys, * Result: {1, 2} * @endcode * - * @throw cudf::logic_error if number of columns in either - * `left_keys` or `right_keys` table is 0 or exceeds MAX_JOIN_SIZE - * - * @param[in] left_keys The left table - * @param[in] right_keys The right table - * @param[in] compare_nulls controls whether null join-key values - * should match or not. 
+ * @param left_keys The left table + * @param right_keys The right table + * @param compare_nulls Controls whether null join-key values should match or not * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A vector `left_indices` that can be used to construct diff --git a/cpp/include/cudf/strings/detail/gather.cuh b/cpp/include/cudf/strings/detail/gather.cuh index 28b98eac3b5..908871774ad 100644 --- a/cpp/include/cudf/strings/detail/gather.cuh +++ b/cpp/include/cudf/strings/detail/gather.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -324,7 +324,8 @@ std::unique_ptr gather(strings_column_view const& strings, size_t{0}, thrust::plus{}); CUDF_EXPECTS(total_bytes < static_cast(std::numeric_limits::max()), - "total size of output strings is too large for a cudf column"); + "total size of output strings exceeds the column limit", + std::overflow_error); // In-place convert output sizes into offsets thrust::exclusive_scan(rmm::exec_policy_nosync(stream), diff --git a/cpp/include/cudf/strings/detail/strings_children.cuh b/cpp/include/cudf/strings/detail/strings_children.cuh index 02a65c01178..5f8a2a34606 100644 --- a/cpp/include/cudf/strings/detail/strings_children.cuh +++ b/cpp/include/cudf/strings/detail/strings_children.cuh @@ -79,8 +79,8 @@ auto make_strings_children(SizeAndExecuteFunction size_and_exec_fn, // Convert the sizes to offsets auto const bytes = cudf::detail::sizes_to_offsets(d_offsets, d_offsets + strings_count + 1, d_offsets, stream); - CUDF_EXPECTS(bytes <= static_cast(std::numeric_limits::max()), - "Size of output exceeds column size limit", + CUDF_EXPECTS(bytes <= std::numeric_limits::max(), + "Size of output exceeds the column size limit", std::overflow_error); // Now build the 
chars column diff --git a/cpp/include/cudf/strings/repeat_strings.hpp b/cpp/include/cudf/strings/repeat_strings.hpp index 26fe5f95983..2b6575f80d0 100644 --- a/cpp/include/cudf/strings/repeat_strings.hpp +++ b/cpp/include/cudf/strings/repeat_strings.hpp @@ -20,8 +20,6 @@ #include -#include - namespace cudf { namespace strings { /** @@ -49,9 +47,8 @@ namespace strings { * out is '123XYZ-123XYZ-123XYZ-' * @endcode * - * @throw cudf::logic_error if the size of the output string scalar exceeds the maximum value that - * can be stored by the index type: - * `input.size() * repeat_times > max of size_type` + * @throw std::overflow_error if the size of the output string scalar exceeds the maximum value that + * can be stored by the scalar: `input.size() * repeat_times > max of size_type` * * @param input The scalar containing the string to repeat * @param repeat_times The number of times the input string is repeated diff --git a/cpp/include/nvtext/minhash.hpp b/cpp/include/nvtext/minhash.hpp index 9fdaeda0959..60116e389a3 100644 --- a/cpp/include/nvtext/minhash.hpp +++ b/cpp/include/nvtext/minhash.hpp @@ -69,6 +69,7 @@ std::unique_ptr minhash( * @throw std::invalid_argument if the width < 2 * @throw std::invalid_argument if hash_function is not HASH_MURMUR3 * @throw std::invalid_argument if seeds is empty + * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit * * @param input Strings column to compute minhash * @param seeds Seed values used for the MurmurHash3_32 algorithm diff --git a/cpp/include/nvtext/subword_tokenize.hpp b/cpp/include/nvtext/subword_tokenize.hpp index 164ec7a603e..d266923187f 100644 --- a/cpp/include/nvtext/subword_tokenize.hpp +++ b/cpp/include/nvtext/subword_tokenize.hpp @@ -130,8 +130,8 @@ struct tokenizer_result { * strings column as working memory. 
* * @throw cudf::logic_error if `stride > max_sequence_length` - * @throw cudf::logic_error if `max_sequence_length * max_rows_tensor` is - * larger than the max value for cudf::size_type + * @throw std::overflow_error if `max_sequence_length * max_rows_tensor` + * exceeds the column size limit * * @param strings The input strings to tokenize. * @param vocabulary_table The vocabulary table pre-loaded into this object. diff --git a/cpp/src/copying/concatenate.cu b/cpp/src/copying/concatenate.cu index b17475cb877..11c363d14e0 100644 --- a/cpp/src/copying/concatenate.cu +++ b/cpp/src/copying/concatenate.cu @@ -250,7 +250,7 @@ std::unique_ptr fused_concatenate(host_span views, auto const output_size = std::get<3>(device_views); CUDF_EXPECTS(output_size <= static_cast(std::numeric_limits::max()), - "Total number of concatenated rows exceeds size_type range", + "Total number of concatenated rows exceeds the column size limit", std::overflow_error); // Allocate output @@ -388,9 +388,9 @@ class traverse_children { std::size_t{}, [](size_t a, auto const& b) -> size_t { return a + b.size(); }) + 1; - // note: output text must include "exceeds size_type range" for python error handling CUDF_EXPECTS(total_offset_count <= static_cast(std::numeric_limits::max()), - "Total number of concatenated offsets exceeds size_type range"); + "Total number of concatenated offsets exceeds the column size limit", + std::overflow_error); } }; @@ -418,9 +418,8 @@ void traverse_children::operator()(host_span(scv.offsets(), scv.size(), stream)); }); - // note: output text must include "exceeds size_type range" for python error handling CUDF_EXPECTS(total_char_count <= static_cast(std::numeric_limits::max()), - "Total number of concatenated chars exceeds size_type range", + "Total number of concatenated chars exceeds the column size limit", std::overflow_error); } @@ -490,9 +489,8 @@ void bounds_and_type_check(host_span cols, rmm::cuda_stream_v std::accumulate(cols.begin(), cols.end(), 
std::size_t{}, [](size_t a, auto const& b) { return a + static_cast(b.size()); }); - // note: output text must include "exceeds size_type range" for python error handling CUDF_EXPECTS(total_row_count <= static_cast(std::numeric_limits::max()), - "Total number of concatenated rows exceeds size_type range", + "Total number of concatenated rows exceeds the column size limit", std::overflow_error); // traverse children diff --git a/cpp/src/copying/gather.cu b/cpp/src/copying/gather.cu index 35ecf180c66..eb8ea92c7b8 100644 --- a/cpp/src/copying/gather.cu +++ b/cpp/src/copying/gather.cu @@ -65,7 +65,8 @@ std::unique_ptr
gather(table_view const& source_table, rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(gather_map.size() <= static_cast(std::numeric_limits::max()), - "invalid gather map size"); + "gather map size exceeds the column size limit", + std::overflow_error); auto map_col = column_view(data_type{type_to_id()}, static_cast(gather_map.size()), gather_map.data(), diff --git a/cpp/src/copying/scatter.cu b/cpp/src/copying/scatter.cu index 6e275ae1c78..860bda1abac 100644 --- a/cpp/src/copying/scatter.cu +++ b/cpp/src/copying/scatter.cu @@ -321,7 +321,8 @@ std::unique_ptr
scatter(table_view const& source, rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(scatter_map.size() <= static_cast(std::numeric_limits::max()), - "invalid scatter map size"); + "scatter map size exceeds the column size limit", + std::overflow_error); auto map_col = column_view(data_type{type_to_id()}, static_cast(scatter_map.size()), scatter_map.data(), diff --git a/cpp/src/filling/repeat.cu b/cpp/src/filling/repeat.cu index 736c96e0915..9c14ccca1f9 100644 --- a/cpp/src/filling/repeat.cu +++ b/cpp/src/filling/repeat.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -63,7 +63,8 @@ struct count_accessor { auto count = p_count->value(stream); // static_cast is necessary due to bool CUDF_EXPECTS(static_cast(count) <= std::numeric_limits::max(), - "count should not exceed size_type's limit."); + "count should not exceed the column size limit", + std::overflow_error); return static_cast(count); } @@ -86,7 +87,8 @@ struct count_checker { auto max = thrust::reduce( rmm::exec_policy(stream), count.begin(), count.end(), 0, thrust::maximum()); CUDF_EXPECTS(max <= std::numeric_limits::max(), - "count should not have values larger than size_type maximum."); + "count exceeds the column size limit", + std::overflow_error); } } @@ -136,9 +138,9 @@ std::unique_ptr
repeat(table_view const& input_table, rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(count >= 0, "count value should be non-negative"); - CUDF_EXPECTS( - static_cast(input_table.num_rows()) * count <= std::numeric_limits::max(), - "The resulting table has more rows than size_type's limit."); + CUDF_EXPECTS(input_table.num_rows() <= std::numeric_limits::max() / count, + "The resulting table exceeds the column size limit", + std::overflow_error); if ((input_table.num_rows() == 0) || (count == 0)) { return cudf::empty_like(input_table); } diff --git a/cpp/src/interop/dlpack.cpp b/cpp/src/interop/dlpack.cpp index 58afc8e9015..1759c998c75 100644 --- a/cpp/src/interop/dlpack.cpp +++ b/cpp/src/interop/dlpack.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -173,13 +173,15 @@ std::unique_ptr
from_dlpack(DLManagedTensor const* managed_tensor, } CUDF_EXPECTS(tensor.shape[0] >= 0, "DLTensor first dim should be of shape greater than or equal to 0."); - CUDF_EXPECTS(tensor.shape[0] < std::numeric_limits::max(), - "DLTensor first dim exceeds size supported by cudf"); + CUDF_EXPECTS(tensor.shape[0] <= std::numeric_limits::max(), + "DLTensor first dim exceeds the column size limit", + std::overflow_error); if (tensor.ndim > 1) { CUDF_EXPECTS(tensor.shape[1] >= 0, "DLTensor second dim should be of shape greater than or equal to 0."); - CUDF_EXPECTS(tensor.shape[1] < std::numeric_limits::max(), - "DLTensor second dim exceeds size supported by cudf"); + CUDF_EXPECTS(tensor.shape[1] <= std::numeric_limits::max(), + "DLTensor second dim exceeds the column size limit", + std::overflow_error); } size_t const num_columns = (tensor.ndim == 2) ? static_cast(tensor.shape[1]) : 1; diff --git a/cpp/src/io/utilities/row_selection.cpp b/cpp/src/io/utilities/row_selection.cpp index 1b79a59aa9e..69432f917e0 100644 --- a/cpp/src/io/utilities/row_selection.cpp +++ b/cpp/src/io/utilities/row_selection.cpp @@ -29,7 +29,8 @@ std::pair skip_rows_num_rows_from_options( auto const rows_to_skip = std::min(skip_rows_opt, num_source_rows); if (not num_rows_opt.has_value()) { CUDF_EXPECTS(num_source_rows - rows_to_skip <= std::numeric_limits::max(), - "The requested number of rows to read exceeds the largest cudf column size"); + "The requested number of rows exceeds the column size limit", + std::overflow_error); return {rows_to_skip, num_source_rows - rows_to_skip}; } // Limit the number of rows to the end of the input diff --git a/cpp/src/io/utilities/row_selection.hpp b/cpp/src/io/utilities/row_selection.hpp index 66a3a83a61e..4f37ce55c20 100644 --- a/cpp/src/io/utilities/row_selection.hpp +++ b/cpp/src/io/utilities/row_selection.hpp @@ -32,8 +32,7 @@ namespace cudf::io::detail { * @param num_source_rows number of rows in the ORC file(s) * @return A std::pair containing the number of 
rows to skip and the number of rows to read * - * @throw cudf::logic_error when the requested number of rows to read exceeds the largest cudf - * column size + * @throw std::overflow_error The requested number of rows exceeds the column size limit */ std::pair skip_rows_num_rows_from_options( uint64_t skip_rows_opt, std::optional const& num_rows_opt, uint64_t num_source_rows); diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu index 15c1d21e74c..76aab1e502a 100644 --- a/cpp/src/join/hash_join.cu +++ b/cpp/src/join/hash_join.cu @@ -375,8 +375,6 @@ hash_join::hash_join(cudf::table_view const& build, { CUDF_FUNC_RANGE(); CUDF_EXPECTS(0 != build.num_columns(), "Hash join build table is empty"); - CUDF_EXPECTS(build.num_rows() < cudf::detail::MAX_JOIN_SIZE, - "Build column size is too big for hash join"); if (_is_empty) { return; } @@ -557,8 +555,6 @@ hash_join::compute_hash_join(cudf::table_view const& probe, rmm::mr::device_memory_resource* mr) const { CUDF_EXPECTS(0 != probe.num_columns(), "Hash join probe table is empty"); - CUDF_EXPECTS(probe.num_rows() < cudf::detail::MAX_JOIN_SIZE, - "Probe column size is too big for hash join"); CUDF_EXPECTS(_build.num_columns() == probe.num_columns(), "Mismatch in number of columns to be joined on"); diff --git a/cpp/src/join/join_common_utils.hpp b/cpp/src/join/join_common_utils.hpp index 39ec8884ba4..45a8b124ea3 100644 --- a/cpp/src/join/join_common_utils.hpp +++ b/cpp/src/join/join_common_utils.hpp @@ -34,7 +34,6 @@ namespace cudf { namespace detail { -constexpr size_type MAX_JOIN_SIZE{std::numeric_limits::max()}; constexpr int DEFAULT_JOIN_BLOCK_SIZE = 128; constexpr int DEFAULT_JOIN_CACHE_SIZE = 128; diff --git a/cpp/src/lists/sequences.cu b/cpp/src/lists/sequences.cu index 895bc9de816..d1d9c9524f2 100644 --- a/cpp/src/lists/sequences.cu +++ b/cpp/src/lists/sequences.cu @@ -160,8 +160,8 @@ std::unique_ptr sequences(column_view const& starts, auto const n_elements = cudf::detail::sizes_to_offsets( 
sizes_input_it, sizes_input_it + n_lists + 1, offsets_begin, stream); - CUDF_EXPECTS(n_elements <= static_cast(std::numeric_limits::max()), - "Size of output exceeds column size limit", + CUDF_EXPECTS(n_elements <= std::numeric_limits::max(), + "Size of output exceeds the column size limit", std::overflow_error); auto child = type_dispatcher(starts.type(), diff --git a/cpp/src/strings/case.cu b/cpp/src/strings/case.cu index 0997983c95e..02660c46c63 100644 --- a/cpp/src/strings/case.cu +++ b/cpp/src/strings/case.cu @@ -255,8 +255,8 @@ std::unique_ptr convert_case(strings_column_view const& input, // convert sizes to offsets auto const bytes = cudf::detail::sizes_to_offsets(d_offsets, d_offsets + input.size() + 1, d_offsets, stream); - CUDF_EXPECTS(bytes <= static_cast(std::numeric_limits::max()), - "Size of output exceeds column size limit", + CUDF_EXPECTS(bytes <= std::numeric_limits::max(), + "Size of output exceeds the column size limit", std::overflow_error); auto chars = create_chars_child_column(static_cast(bytes), stream, mr); diff --git a/cpp/src/strings/copying/concatenate.cu b/cpp/src/strings/copying/concatenate.cu index 92b71d128e1..c5dfd4a8b93 100644 --- a/cpp/src/strings/copying/concatenate.cu +++ b/cpp/src/strings/copying/concatenate.cu @@ -216,9 +216,11 @@ std::unique_ptr concatenate(host_span columns, if (strings_count == 0) { return make_empty_column(type_id::STRING); } CUDF_EXPECTS(offsets_count <= static_cast(std::numeric_limits::max()), - "total number of strings is too large for cudf column"); + "total number of strings exceeds the column size limit", + std::overflow_error); CUDF_EXPECTS(total_bytes <= static_cast(std::numeric_limits::max()), - "total size of strings is too large for cudf column"); + "total size of strings exceeds the column size limit", + std::overflow_error); bool const has_nulls = std::any_of(columns.begin(), columns.end(), [](auto const& col) { return col.has_nulls(); }); diff --git a/cpp/src/strings/regex/utilities.cuh 
b/cpp/src/strings/regex/utilities.cuh index 6bbd79166a8..23b53062bf3 100644 --- a/cpp/src/strings/regex/utilities.cuh +++ b/cpp/src/strings/regex/utilities.cuh @@ -135,8 +135,8 @@ auto make_strings_children(SizeAndExecuteFunction size_and_exec_fn, auto const char_bytes = cudf::detail::sizes_to_offsets(d_offsets, d_offsets + strings_count + 1, d_offsets, stream); - CUDF_EXPECTS(char_bytes <= static_cast(std::numeric_limits::max()), - "Size of output exceeds column size limit", + CUDF_EXPECTS(char_bytes <= std::numeric_limits::max(), + "Size of output exceeds the column size limit", std::overflow_error); // Now build the chars column diff --git a/cpp/src/strings/repeat_strings.cu b/cpp/src/strings/repeat_strings.cu index 3784b535a5b..8b5b71c097d 100644 --- a/cpp/src/strings/repeat_strings.cu +++ b/cpp/src/strings/repeat_strings.cu @@ -51,7 +51,8 @@ std::unique_ptr repeat_string(string_scalar const& input, if (repeat_times == 1) { return std::make_unique(input, stream, mr); } CUDF_EXPECTS(input.size() <= std::numeric_limits::max() / repeat_times, - "The output string has size that exceeds the maximum allowed size."); + "The output size exceeds the column size limit", + std::overflow_error); auto const str_size = input.size(); auto const iter = thrust::make_counting_iterator(0); diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu index e9aa6c2693c..d2cc90bb971 100644 --- a/cpp/src/text/minhash.cu +++ b/cpp/src/text/minhash.cu @@ -114,11 +114,10 @@ std::unique_ptr minhash(cudf::strings_column_view const& input, CUDF_EXPECTS(hash_function == cudf::hash_id::HASH_MURMUR3, "Only murmur3 hash algorithm supported", std::invalid_argument); - CUDF_EXPECTS( - (static_cast(input.size()) * seeds.size()) < - static_cast(std::numeric_limits::max()), - "The number of seeds times the number of input rows must not exceed maximum of size_type", - std::invalid_argument); + CUDF_EXPECTS((static_cast(input.size()) * seeds.size()) < + static_cast(std::numeric_limits::max()), + 
"The number of seeds times the number of input rows exceeds the column size limit", + std::overflow_error); auto output_type = cudf::data_type{cudf::type_to_id()}; if (input.is_empty()) { return cudf::make_empty_column(output_type); } diff --git a/cpp/src/text/ngrams_tokenize.cu b/cpp/src/text/ngrams_tokenize.cu index 93757fa37e4..fd1cbf99221 100644 --- a/cpp/src/text/ngrams_tokenize.cu +++ b/cpp/src/text/ngrams_tokenize.cu @@ -222,7 +222,7 @@ std::unique_ptr ngrams_tokenize(cudf::strings_column_view const& s chars_offsets.begin(), chars_offsets.end(), chars_offsets.begin(), stream); CUDF_EXPECTS( output_chars_size <= static_cast(std::numeric_limits::max()), - "Size of output exceeds column size limit", + "Size of output exceeds the column size limit", std::overflow_error); // This will contain the size in bytes of each ngram to generate diff --git a/cpp/src/text/normalize.cu b/cpp/src/text/normalize.cu index 3ef251611eb..73d01c9f3ec 100644 --- a/cpp/src/text/normalize.cu +++ b/cpp/src/text/normalize.cu @@ -213,8 +213,9 @@ std::unique_ptr normalize_characters(cudf::strings_column_view con }(); CUDF_EXPECTS( - result.first->size() <= static_cast(std::numeric_limits::max()), - "output too large for strings column"); + result.first->size() < static_cast(std::numeric_limits::max()), + "output exceeds the column size limit", + std::overflow_error); // convert the result into a strings column // - the cp_chars are the new 4-byte code-point values for all the characters in the output diff --git a/cpp/src/text/subword/subword_tokenize.cu b/cpp/src/text/subword/subword_tokenize.cu index e34aa4054da..a689fcc7dc3 100644 --- a/cpp/src/text/subword/subword_tokenize.cu +++ b/cpp/src/text/subword/subword_tokenize.cu @@ -165,9 +165,11 @@ tokenizer_result subword_tokenize(cudf::strings_column_view const& strings, { CUDF_EXPECTS(stride <= max_sequence_length, "stride must be less than or equal to max_sequence_length"); - CUDF_EXPECTS(max_sequence_length * max_rows_tensor < - 
static_cast(std::numeric_limits::max()), - "max_sequence_length x max_rows_tensor is too large for cudf output column size"); + CUDF_EXPECTS( + max_sequence_length <= + (static_cast(std::numeric_limits::max()) / max_rows_tensor), + "max_sequence_length times max_rows_tensor exceeds the column size limit", + std::overflow_error); auto const strings_count = strings.size(); if (strings_count == strings.null_count()) { // empty or all-null returns empty return tokenizer_result{0, diff --git a/cpp/tests/copying/concatenate_tests.cpp b/cpp/tests/copying/concatenate_tests.cpp index a36b018bc78..a71c8a22af9 100644 --- a/cpp/tests/copying/concatenate_tests.cpp +++ b/cpp/tests/copying/concatenate_tests.cpp @@ -557,7 +557,7 @@ TEST_F(OverflowTest, Presliced) cudf::table_view tb({b[1]}); EXPECT_THROW(cudf::concatenate(std::vector({ta, ta, ta, tb})), - cudf::logic_error); + std::overflow_error); } } @@ -631,7 +631,7 @@ TEST_F(OverflowTest, Presliced) cudf::table_view tb({b[1]}); EXPECT_THROW(cudf::concatenate(std::vector({ta, ta, ta, tb})), - cudf::logic_error); + std::overflow_error); } } diff --git a/cpp/tests/filling/repeat_tests.cpp b/cpp/tests/filling/repeat_tests.cpp index c38422af688..8fb28fb3390 100644 --- a/cpp/tests/filling/repeat_tests.cpp +++ b/cpp/tests/filling/repeat_tests.cpp @@ -271,3 +271,22 @@ TEST_F(RepeatErrorTestFixture, CountHasNulls) // input_table.has_nulls() == true EXPECT_THROW(auto ret = cudf::repeat(input_table, count), cudf::logic_error); } + +TEST_F(RepeatErrorTestFixture, Overflow) +{ + auto input = cudf::test::fixed_width_column_wrapper( + thrust::make_counting_iterator(0), thrust::make_counting_iterator(0) + 100); + cudf::table_view input_table{{input}}; + // set the count such that (count * num_rows) > max(size_type); + // the extra divide by 2 ensures the max is exceeded despite truncation in integer division + auto count = std::numeric_limits::max() / (input_table.num_rows() / 2); + EXPECT_THROW(cudf::repeat(input_table, count), 
std::overflow_error); +} + +TEST_F(RepeatErrorTestFixture, NegativeCount) +{ + auto input = cudf::test::fixed_width_column_wrapper( + thrust::make_counting_iterator(0), thrust::make_counting_iterator(0) + 100); + cudf::table_view input_table{{input}}; + EXPECT_THROW(cudf::repeat(input_table, -1), cudf::logic_error); +} diff --git a/cpp/tests/interop/dlpack_test.cpp b/cpp/tests/interop/dlpack_test.cpp index 65fce62e965..ed44727b712 100644 --- a/cpp/tests/interop/dlpack_test.cpp +++ b/cpp/tests/interop/dlpack_test.cpp @@ -157,7 +157,7 @@ TEST_F(DLPackUntypedTests, TooManyRowsFromDlpack) // Spoof too many rows constexpr int64_t max_size_type{std::numeric_limits::max()}; tensor->dl_tensor.shape[0] = max_size_type + 1; - EXPECT_THROW(cudf::from_dlpack(tensor.get()), cudf::logic_error); + EXPECT_THROW(cudf::from_dlpack(tensor.get()), std::overflow_error); } TEST_F(DLPackUntypedTests, TooManyColsFromDlpack) @@ -170,7 +170,7 @@ TEST_F(DLPackUntypedTests, TooManyColsFromDlpack) // Spoof too many cols constexpr int64_t max_size_type{std::numeric_limits::max()}; tensor->dl_tensor.shape[1] = max_size_type + 1; - EXPECT_THROW(cudf::from_dlpack(tensor.get()), cudf::logic_error); + EXPECT_THROW(cudf::from_dlpack(tensor.get()), std::overflow_error); } TEST_F(DLPackUntypedTests, InvalidTypeFromDlpack) diff --git a/cpp/tests/io/row_selection_test.cpp b/cpp/tests/io/row_selection_test.cpp index 984d9425a33..b4583ac4f17 100644 --- a/cpp/tests/io/row_selection_test.cpp +++ b/cpp/tests/io/row_selection_test.cpp @@ -127,7 +127,7 @@ TEST_F(FromOptsTest, OverFlowDetection) // Too many rows to read until the end of the file EXPECT_THROW(skip_rows_num_rows_from_options(0, std::nullopt, too_large_for_32bit), - cudf::logic_error); + std::overflow_error); // Should work fine with num_rows EXPECT_NO_THROW( diff --git a/cpp/tests/strings/array_tests.cpp b/cpp/tests/strings/array_tests.cpp index 1bc45aaf573..74dc447f85f 100644 --- a/cpp/tests/strings/array_tests.cpp +++ 
b/cpp/tests/strings/array_tests.cpp @@ -157,7 +157,7 @@ TEST_F(StringsColumnTest, GatherTooBig) cudf::data_type{cudf::type_id::STRING}, 1, nullptr, nullptr, 0, 0, {offsets, chars}); auto map = thrust::constant_iterator(0); cudf::test::fixed_width_column_wrapper gather_map(map, map + 1000); - EXPECT_THROW(cudf::gather(cudf::table_view{{input}}, gather_map), cudf::logic_error); + EXPECT_THROW(cudf::gather(cudf::table_view{{input}}, gather_map), std::overflow_error); } TEST_F(StringsColumnTest, Scatter) diff --git a/cpp/tests/strings/repeat_strings_tests.cpp b/cpp/tests/strings/repeat_strings_tests.cpp index b7bfad36817..9d08ac9c00c 100644 --- a/cpp/tests/strings/repeat_strings_tests.cpp +++ b/cpp/tests/strings/repeat_strings_tests.cpp @@ -90,7 +90,7 @@ TYPED_TEST(RepeatStringsTypedTest, ValidStringScalar) // Repeat too many times. { EXPECT_THROW(cudf::strings::repeat_string(str, std::numeric_limits::max() / 2), - cudf::logic_error); + std::overflow_error); } } diff --git a/cpp/tests/text/minhash_tests.cpp b/cpp/tests/text/minhash_tests.cpp index 9572ccd1baf..fa4e2a91600 100644 --- a/cpp/tests/text/minhash_tests.cpp +++ b/cpp/tests/text/minhash_tests.cpp @@ -118,9 +118,7 @@ TEST_F(MinHashTest, ErrorsTest) auto view = cudf::strings_column_view(input); EXPECT_THROW(nvtext::minhash(view, 0, 0), std::invalid_argument); EXPECT_THROW(nvtext::minhash(view, 0, 0, cudf::hash_id::HASH_MD5), std::invalid_argument); - auto seeds = cudf::test::fixed_width_column_wrapper< - cudf::hash_value_type>(); // cudf::device_span{}; + auto seeds = cudf::test::fixed_width_column_wrapper(); EXPECT_THROW(nvtext::minhash(view, cudf::column_view(seeds)), std::invalid_argument); std::vector h_input(50000, ""); @@ -129,5 +127,5 @@ TEST_F(MinHashTest, ErrorsTest) auto const zeroes = thrust::constant_iterator(0); seeds = cudf::test::fixed_width_column_wrapper(zeroes, zeroes + 50000); - EXPECT_THROW(nvtext::minhash(view, cudf::column_view(seeds)), std::invalid_argument); + 
EXPECT_THROW(nvtext::minhash(view, cudf::column_view(seeds)), std::overflow_error); } diff --git a/cpp/tests/text/subword_tests.cpp b/cpp/tests/text/subword_tests.cpp index 0cf223f4a99..806d768e303 100644 --- a/cpp/tests/text/subword_tests.cpp +++ b/cpp/tests/text/subword_tests.cpp @@ -238,7 +238,7 @@ TEST(TextSubwordTest, ParameterErrors) true, // do_lower_case true, // do_truncate 858993459), - cudf::logic_error); + std::overflow_error); } TEST(TextSubwordTest, EmptyStrings)