Skip to content

Commit

Permalink
Use std::overflow_error when output would exceed column size limit (#…
Browse files Browse the repository at this point in the history
…13323)

Replaces generic `cudf::logic_error` exception with `std::overflow_error` where appropriate in libcudf.
Since this changes what is thrown in certain APIs, I think this technically is a breaking change.

Closes #12925

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Divye Gala (https://github.com/divyegala)
  - Nghia Truong (https://github.com/ttnghia)
  - Bradley Dice (https://github.com/bdice)

URL: #13323
  • Loading branch information
davidwendt authored May 24, 2023
1 parent 19554a1 commit 265743d
Show file tree
Hide file tree
Showing 37 changed files with 104 additions and 90 deletions.
3 changes: 2 additions & 1 deletion cpp/include/cudf/column/column.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,8 @@ class column {
_size{[&]() {
CUDF_EXPECTS(
other.size() <= static_cast<std::size_t>(std::numeric_limits<size_type>::max()),
"The device_uvector size exceeds the maximum size_type.");
"The device_uvector size exceeds the column size limit",
std::overflow_error);
return static_cast<size_type>(other.size());
}()},
_data{other.release()},
Expand Down
5 changes: 3 additions & 2 deletions cpp/include/cudf/column/column_view.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -422,8 +422,9 @@ class column_view : public detail::column_view_base {
cudf::data_type{cudf::type_to_id<T>()}, data.size(), data.data(), nullptr, 0, 0, {})
{
CUDF_EXPECTS(
data.size() < static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max()),
"Data exceeds the maximum size of a column view.");
data.size() <= static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max()),
"Data exceeds the column size limit",
std::overflow_error);
}

/**
Expand Down
2 changes: 0 additions & 2 deletions cpp/include/cudf/detail/join.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,6 @@ struct hash_join {
* @brief Constructor that internally builds the hash table based on the given `build` table.
*
* @throw cudf::logic_error if the number of columns in `build` table is 0.
* @throw cudf::logic_error if the number of rows in `build` table exceeds MAX_JOIN_SIZE.
*
* @param build The build table, from which the hash table is built.
* @param has_nulls Flag to indicate if there exist any nulls in the `build` table or
Expand Down Expand Up @@ -177,7 +176,6 @@ struct hash_join {
* @copydoc cudf::detail::hash_join::probe_join_indices
*
* @throw cudf::logic_error if probe table is empty.
* @throw cudf::logic_error if the size of probe table exceeds `MAX_JOIN_SIZE`.
* @throw cudf::logic_error if the number of columns in build table and probe table do not match.
* @throw cudf::logic_error if the column data types in build table and probe table do not match.
*/
Expand Down
2 changes: 1 addition & 1 deletion cpp/include/cudf/detail/sizes_to_offsets_iterator.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -319,7 +319,7 @@ std::pair<std::unique_ptr<column>, size_type> make_offsets_child_column(
auto const total_elements = sizes_to_offsets(input_itr, input_itr + count + 1, d_offsets, stream);
CUDF_EXPECTS(
total_elements <= static_cast<decltype(total_elements)>(std::numeric_limits<size_type>::max()),
"Size of output exceeds column size limit",
"Size of output exceeds the column size limit",
std::overflow_error);

offsets_column->set_null_count(0);
Expand Down
8 changes: 3 additions & 5 deletions cpp/include/cudf/filling.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2022, NVIDIA CORPORATION.
* Copyright (c) 2019-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -131,10 +131,8 @@ std::unique_ptr<table> repeat(
* count = 2
* return = [4,4,5,5,6,6]
* ```
* @throws cudf::logic_error if the data type of @p count is not size_type.
* @throws cudf::logic_error if @p count is invalid or @p count is negative.
* @throws cudf::logic_error if @p input_table.num_rows() * @p count overflows
* size_type.
* @throws cudf::logic_error if @p count is negative.
* @throws std::overflow_error if @p input_table.num_rows() * @p count overflows size_type.
*
* @param input_table Input table
* @param count Number of repetitions
Expand Down
12 changes: 4 additions & 8 deletions cpp/include/cudf/join.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,7 @@ full_join(cudf::table_view const& left_keys,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Returns a vector of row indices corresponding to a left semi join
* @brief Returns a vector of row indices corresponding to a left semi-join
* between the specified tables.
*
* The returned vector contains the row indices from the left table
Expand All @@ -179,13 +179,9 @@ full_join(cudf::table_view const& left_keys,
* Result: {1, 2}
* @endcode
*
* @throw cudf::logic_error if number of columns in either
* `left_keys` or `right_keys` table is 0 or exceeds MAX_JOIN_SIZE
*
* @param[in] left_keys The left table
* @param[in] right_keys The right table
* @param[in] compare_nulls controls whether null join-key values
* should match or not.
* @param left_keys The left table
* @param right_keys The right table
* @param compare_nulls Controls whether null join-key values should match or not
* @param mr Device memory resource used to allocate the returned table and columns' device memory
*
* @return A vector `left_indices` that can be used to construct
Expand Down
5 changes: 3 additions & 2 deletions cpp/include/cudf/strings/detail/gather.cuh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2022, NVIDIA CORPORATION.
* Copyright (c) 2019-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -324,7 +324,8 @@ std::unique_ptr<cudf::column> gather(strings_column_view const& strings,
size_t{0},
thrust::plus{});
CUDF_EXPECTS(total_bytes < static_cast<std::size_t>(std::numeric_limits<size_type>::max()),
"total size of output strings is too large for a cudf column");
"total size of output strings exceeds the column limit",
std::overflow_error);
// In-place convert output sizes into offsets
thrust::exclusive_scan(rmm::exec_policy_nosync(stream),
Expand Down
4 changes: 2 additions & 2 deletions cpp/include/cudf/strings/detail/strings_children.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -79,8 +79,8 @@ auto make_strings_children(SizeAndExecuteFunction size_and_exec_fn,
// Convert the sizes to offsets
auto const bytes =
cudf::detail::sizes_to_offsets(d_offsets, d_offsets + strings_count + 1, d_offsets, stream);
CUDF_EXPECTS(bytes <= static_cast<int64_t>(std::numeric_limits<size_type>::max()),
"Size of output exceeds column size limit",
CUDF_EXPECTS(bytes <= std::numeric_limits<size_type>::max(),
"Size of output exceeds the column size limit",
std::overflow_error);

// Now build the chars column
Expand Down
7 changes: 2 additions & 5 deletions cpp/include/cudf/strings/repeat_strings.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,6 @@

#include <rmm/mr/device/per_device_resource.hpp>

#include <optional>

namespace cudf {
namespace strings {
/**
Expand Down Expand Up @@ -49,9 +47,8 @@ namespace strings {
* out is '123XYZ-123XYZ-123XYZ-'
* @endcode
*
* @throw cudf::logic_error if the size of the output string scalar exceeds the maximum value that
* can be stored by the index type:
* `input.size() * repeat_times > max of size_type`
* @throw std::overflow_error if the size of the output string scalar exceeds the maximum value that
* can be stored by the scalar: `input.size() * repeat_times > max of size_type`
*
* @param input The scalar containing the string to repeat
* @param repeat_times The number of times the input string is repeated
Expand Down
1 change: 1 addition & 0 deletions cpp/include/nvtext/minhash.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ std::unique_ptr<cudf::column> minhash(
* @throw std::invalid_argument if the width < 2
* @throw std::invalid_argument if hash_function is not HASH_MURMUR3
* @throw std::invalid_argument if seeds is empty
* @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit
*
* @param input Strings column to compute minhash
* @param seeds Seed values used for the MurmurHash3_32 algorithm
Expand Down
4 changes: 2 additions & 2 deletions cpp/include/nvtext/subword_tokenize.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -130,8 +130,8 @@ struct tokenizer_result {
* strings column as working memory.
*
* @throw cudf::logic_error if `stride > max_sequence_length`
* @throw cudf::logic_error if `max_sequence_length * max_rows_tensor` is
* larger than the max value for cudf::size_type
* @throw std::overflow_error if `max_sequence_length * max_rows_tensor`
* exceeds the column size limit
*
* @param strings The input strings to tokenize.
* @param vocabulary_table The vocabulary table pre-loaded into this object.
Expand Down
12 changes: 5 additions & 7 deletions cpp/src/copying/concatenate.cu
Original file line number Diff line number Diff line change
Expand Up @@ -250,7 +250,7 @@ std::unique_ptr<column> fused_concatenate(host_span<column_view const> views,
auto const output_size = std::get<3>(device_views);

CUDF_EXPECTS(output_size <= static_cast<std::size_t>(std::numeric_limits<size_type>::max()),
"Total number of concatenated rows exceeds size_type range",
"Total number of concatenated rows exceeds the column size limit",
std::overflow_error);

// Allocate output
Expand Down Expand Up @@ -388,9 +388,9 @@ class traverse_children {
std::size_t{},
[](size_t a, auto const& b) -> size_t { return a + b.size(); }) +
1;
// note: output text must include "exceeds size_type range" for python error handling
CUDF_EXPECTS(total_offset_count <= static_cast<size_t>(std::numeric_limits<size_type>::max()),
"Total number of concatenated offsets exceeds size_type range");
"Total number of concatenated offsets exceeds the column size limit",
std::overflow_error);
}
};

Expand Down Expand Up @@ -418,9 +418,8 @@ void traverse_children::operator()<cudf::string_view>(host_span<column_view cons
? scv.chars_size()
: cudf::detail::get_value<offset_type>(scv.offsets(), scv.size(), stream));
});
// note: output text must include "exceeds size_type range" for python error handling
CUDF_EXPECTS(total_char_count <= static_cast<size_t>(std::numeric_limits<size_type>::max()),
"Total number of concatenated chars exceeds size_type range",
"Total number of concatenated chars exceeds the column size limit",
std::overflow_error);
}

Expand Down Expand Up @@ -490,9 +489,8 @@ void bounds_and_type_check(host_span<column_view const> cols, rmm::cuda_stream_v
std::accumulate(cols.begin(), cols.end(), std::size_t{}, [](size_t a, auto const& b) {
return a + static_cast<size_t>(b.size());
});
// note: output text must include "exceeds size_type range" for python error handling
CUDF_EXPECTS(total_row_count <= static_cast<size_t>(std::numeric_limits<size_type>::max()),
"Total number of concatenated rows exceeds size_type range",
"Total number of concatenated rows exceeds the column size limit",
std::overflow_error);

// traverse children
Expand Down
3 changes: 2 additions & 1 deletion cpp/src/copying/gather.cu
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,8 @@ std::unique_ptr<table> gather(table_view const& source_table,
rmm::mr::device_memory_resource* mr)
{
CUDF_EXPECTS(gather_map.size() <= static_cast<size_t>(std::numeric_limits<size_type>::max()),
"invalid gather map size");
"gather map size exceeds the column size limit",
std::overflow_error);
auto map_col = column_view(data_type{type_to_id<size_type>()},
static_cast<size_type>(gather_map.size()),
gather_map.data(),
Expand Down
3 changes: 2 additions & 1 deletion cpp/src/copying/scatter.cu
Original file line number Diff line number Diff line change
Expand Up @@ -321,7 +321,8 @@ std::unique_ptr<table> scatter(table_view const& source,
rmm::mr::device_memory_resource* mr)
{
CUDF_EXPECTS(scatter_map.size() <= static_cast<size_t>(std::numeric_limits<size_type>::max()),
"invalid scatter map size");
"scatter map size exceeds the column size limit",
std::overflow_error);
auto map_col = column_view(data_type{type_to_id<size_type>()},
static_cast<size_type>(scatter_map.size()),
scatter_map.data(),
Expand Down
14 changes: 8 additions & 6 deletions cpp/src/filling/repeat.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2022, NVIDIA CORPORATION.
* Copyright (c) 2019-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -63,7 +63,8 @@ struct count_accessor {
auto count = p_count->value(stream);
// static_cast is necessary due to bool
CUDF_EXPECTS(static_cast<int64_t>(count) <= std::numeric_limits<cudf::size_type>::max(),
"count should not exceed size_type's limit.");
"count should not exceed the column size limit",
std::overflow_error);
return static_cast<cudf::size_type>(count);
}

Expand All @@ -86,7 +87,8 @@ struct count_checker {
auto max = thrust::reduce(
rmm::exec_policy(stream), count.begin<T>(), count.end<T>(), 0, thrust::maximum<T>());
CUDF_EXPECTS(max <= std::numeric_limits<cudf::size_type>::max(),
"count should not have values larger than size_type maximum.");
"count exceeds the column size limit",
std::overflow_error);
}
}

Expand Down Expand Up @@ -136,9 +138,9 @@ std::unique_ptr<table> repeat(table_view const& input_table,
rmm::mr::device_memory_resource* mr)
{
CUDF_EXPECTS(count >= 0, "count value should be non-negative");
CUDF_EXPECTS(
static_cast<int64_t>(input_table.num_rows()) * count <= std::numeric_limits<size_type>::max(),
"The resulting table has more rows than size_type's limit.");
CUDF_EXPECTS(input_table.num_rows() <= std::numeric_limits<size_type>::max() / count,
"The resulting table exceeds the column size limit",
std::overflow_error);

if ((input_table.num_rows() == 0) || (count == 0)) { return cudf::empty_like(input_table); }

Expand Down
12 changes: 7 additions & 5 deletions cpp/src/interop/dlpack.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2022, NVIDIA CORPORATION.
* Copyright (c) 2019-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -173,13 +173,15 @@ std::unique_ptr<table> from_dlpack(DLManagedTensor const* managed_tensor,
}
CUDF_EXPECTS(tensor.shape[0] >= 0,
"DLTensor first dim should be of shape greater than or equal to 0.");
CUDF_EXPECTS(tensor.shape[0] < std::numeric_limits<size_type>::max(),
"DLTensor first dim exceeds size supported by cudf");
CUDF_EXPECTS(tensor.shape[0] <= std::numeric_limits<size_type>::max(),
"DLTensor first dim exceeds the column size limit",
std::overflow_error);
if (tensor.ndim > 1) {
CUDF_EXPECTS(tensor.shape[1] >= 0,
"DLTensor second dim should be of shape greater than or equal to 0.");
CUDF_EXPECTS(tensor.shape[1] < std::numeric_limits<size_type>::max(),
"DLTensor second dim exceeds size supported by cudf");
CUDF_EXPECTS(tensor.shape[1] <= std::numeric_limits<size_type>::max(),
"DLTensor second dim exceeds the column size limit",
std::overflow_error);
}
size_t const num_columns = (tensor.ndim == 2) ? static_cast<size_t>(tensor.shape[1]) : 1;

Expand Down
3 changes: 2 additions & 1 deletion cpp/src/io/utilities/row_selection.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,8 @@ std::pair<uint64_t, size_type> skip_rows_num_rows_from_options(
auto const rows_to_skip = std::min(skip_rows_opt, num_source_rows);
if (not num_rows_opt.has_value()) {
CUDF_EXPECTS(num_source_rows - rows_to_skip <= std::numeric_limits<size_type>::max(),
"The requested number of rows to read exceeds the largest cudf column size");
"The requested number of rows exceeds the column size limit",
std::overflow_error);
return {rows_to_skip, num_source_rows - rows_to_skip};
}
// Limit the number of rows to the end of the input
Expand Down
3 changes: 1 addition & 2 deletions cpp/src/io/utilities/row_selection.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,7 @@ namespace cudf::io::detail {
* @param num_source_rows number of rows in the ORC file(s)
* @return A std::pair containing the number of rows to skip and the number of rows to read
*
* @throw cudf::logic_error when the requested number of rows to read exceeds the largest cudf
* column size
* @throw std::overflow_error if the requested number of rows exceeds the column size limit
*/
std::pair<uint64_t, size_type> skip_rows_num_rows_from_options(
uint64_t skip_rows_opt, std::optional<size_type> const& num_rows_opt, uint64_t num_source_rows);
Expand Down
4 changes: 0 additions & 4 deletions cpp/src/join/hash_join.cu
Original file line number Diff line number Diff line change
Expand Up @@ -375,8 +375,6 @@ hash_join<Hasher>::hash_join(cudf::table_view const& build,
{
CUDF_FUNC_RANGE();
CUDF_EXPECTS(0 != build.num_columns(), "Hash join build table is empty");
CUDF_EXPECTS(build.num_rows() < cudf::detail::MAX_JOIN_SIZE,
"Build column size is too big for hash join");

if (_is_empty) { return; }

Expand Down Expand Up @@ -557,8 +555,6 @@ hash_join<Hasher>::compute_hash_join(cudf::table_view const& probe,
rmm::mr::device_memory_resource* mr) const
{
CUDF_EXPECTS(0 != probe.num_columns(), "Hash join probe table is empty");
CUDF_EXPECTS(probe.num_rows() < cudf::detail::MAX_JOIN_SIZE,
"Probe column size is too big for hash join");

CUDF_EXPECTS(_build.num_columns() == probe.num_columns(),
"Mismatch in number of columns to be joined on");
Expand Down
1 change: 0 additions & 1 deletion cpp/src/join/join_common_utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,6 @@

namespace cudf {
namespace detail {
constexpr size_type MAX_JOIN_SIZE{std::numeric_limits<size_type>::max()};

constexpr int DEFAULT_JOIN_BLOCK_SIZE = 128;
constexpr int DEFAULT_JOIN_CACHE_SIZE = 128;
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/lists/sequences.cu
Original file line number Diff line number Diff line change
Expand Up @@ -160,8 +160,8 @@ std::unique_ptr<column> sequences(column_view const& starts,

auto const n_elements = cudf::detail::sizes_to_offsets(
sizes_input_it, sizes_input_it + n_lists + 1, offsets_begin, stream);
CUDF_EXPECTS(n_elements <= static_cast<int64_t>(std::numeric_limits<size_type>::max()),
"Size of output exceeds column size limit",
CUDF_EXPECTS(n_elements <= std::numeric_limits<size_type>::max(),
"Size of output exceeds the column size limit",
std::overflow_error);

auto child = type_dispatcher(starts.type(),
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/strings/case.cu
Original file line number Diff line number Diff line change
Expand Up @@ -255,8 +255,8 @@ std::unique_ptr<column> convert_case(strings_column_view const& input,
// convert sizes to offsets
auto const bytes =
cudf::detail::sizes_to_offsets(d_offsets, d_offsets + input.size() + 1, d_offsets, stream);
CUDF_EXPECTS(bytes <= static_cast<int64_t>(std::numeric_limits<size_type>::max()),
"Size of output exceeds column size limit",
CUDF_EXPECTS(bytes <= std::numeric_limits<size_type>::max(),
"Size of output exceeds the column size limit",
std::overflow_error);

auto chars = create_chars_child_column(static_cast<size_type>(bytes), stream, mr);
Expand Down
6 changes: 4 additions & 2 deletions cpp/src/strings/copying/concatenate.cu
Original file line number Diff line number Diff line change
Expand Up @@ -216,9 +216,11 @@ std::unique_ptr<column> concatenate(host_span<column_view const> columns,
if (strings_count == 0) { return make_empty_column(type_id::STRING); }

CUDF_EXPECTS(offsets_count <= static_cast<std::size_t>(std::numeric_limits<size_type>::max()),
"total number of strings is too large for cudf column");
"total number of strings exceeds the column size limit",
std::overflow_error);
CUDF_EXPECTS(total_bytes <= static_cast<std::size_t>(std::numeric_limits<size_type>::max()),
"total size of strings is too large for cudf column");
"total size of strings exceeds the column size limit",
std::overflow_error);

bool const has_nulls =
std::any_of(columns.begin(), columns.end(), [](auto const& col) { return col.has_nulls(); });
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/strings/regex/utilities.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -135,8 +135,8 @@ auto make_strings_children(SizeAndExecuteFunction size_and_exec_fn,

auto const char_bytes =
cudf::detail::sizes_to_offsets(d_offsets, d_offsets + strings_count + 1, d_offsets, stream);
CUDF_EXPECTS(char_bytes <= static_cast<int64_t>(std::numeric_limits<size_type>::max()),
"Size of output exceeds column size limit",
CUDF_EXPECTS(char_bytes <= std::numeric_limits<size_type>::max(),
"Size of output exceeds the column size limit",
std::overflow_error);

// Now build the chars column
Expand Down
3 changes: 2 additions & 1 deletion cpp/src/strings/repeat_strings.cu
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,8 @@ std::unique_ptr<string_scalar> repeat_string(string_scalar const& input,
if (repeat_times == 1) { return std::make_unique<string_scalar>(input, stream, mr); }

CUDF_EXPECTS(input.size() <= std::numeric_limits<size_type>::max() / repeat_times,
"The output string has size that exceeds the maximum allowed size.");
"The output size exceeds the column size limit",
std::overflow_error);

auto const str_size = input.size();
auto const iter = thrust::make_counting_iterator(0);
Expand Down
Loading

0 comments on commit 265743d

Please sign in to comment.