From 265743d750aca030dc852418cdd1fca921031739 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Wed, 24 May 2023 19:49:23 -0400
Subject: [PATCH] Use std::overflow_error when output would exceed column size
 limit (#13323)

Replaces generic `cudf::logic_error` exception with `std::overflow_error` where appropriate in libcudf.
Since this changes what is thrown in certain APIs, I think this technically is a breaking change.

Closes #12925

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Divye Gala (https://github.com/divyegala)
  - Nghia Truong (https://github.com/ttnghia)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/13323
---
 cpp/include/cudf/column/column.hpp            |  3 ++-
 cpp/include/cudf/column/column_view.hpp       |  5 +++--
 cpp/include/cudf/detail/join.hpp              |  2 --
 .../cudf/detail/sizes_to_offsets_iterator.cuh |  2 +-
 cpp/include/cudf/filling.hpp                  |  8 +++-----
 cpp/include/cudf/join.hpp                     | 12 ++++--------
 cpp/include/cudf/strings/detail/gather.cuh    |  5 +++--
 .../cudf/strings/detail/strings_children.cuh  |  4 ++--
 cpp/include/cudf/strings/repeat_strings.hpp   |  7 ++-----
 cpp/include/nvtext/minhash.hpp                |  1 +
 cpp/include/nvtext/subword_tokenize.hpp       |  4 ++--
 cpp/src/copying/concatenate.cu                | 12 +++++-------
 cpp/src/copying/gather.cu                     |  3 ++-
 cpp/src/copying/scatter.cu                    |  3 ++-
 cpp/src/filling/repeat.cu                     | 14 ++++++++------
 cpp/src/interop/dlpack.cpp                    | 12 +++++++-----
 cpp/src/io/utilities/row_selection.cpp        |  3 ++-
 cpp/src/io/utilities/row_selection.hpp        |  3 +--
 cpp/src/join/hash_join.cu                     |  4 ----
 cpp/src/join/join_common_utils.hpp            |  1 -
 cpp/src/lists/sequences.cu                    |  4 ++--
 cpp/src/strings/case.cu                       |  4 ++--
 cpp/src/strings/copying/concatenate.cu        |  6 ++++--
 cpp/src/strings/regex/utilities.cuh           |  4 ++--
 cpp/src/strings/repeat_strings.cu             |  3 ++-
 cpp/src/text/minhash.cu                       |  9 ++++-----
 cpp/src/text/ngrams_tokenize.cu               |  2 +-
 cpp/src/text/normalize.cu                     |  5 +++--
 cpp/src/text/subword/subword_tokenize.cu      |  8 +++++---
 cpp/tests/copying/concatenate_tests.cpp       |  4 ++--
 cpp/tests/filling/repeat_tests.cpp            | 19 +++++++++++++++++++
 cpp/tests/interop/dlpack_test.cpp             |  4 ++--
 cpp/tests/io/row_selection_test.cpp           |  2 +-
 cpp/tests/strings/array_tests.cpp             |  2 +-
 cpp/tests/strings/repeat_strings_tests.cpp    |  2 +-
 cpp/tests/text/minhash_tests.cpp              |  6 ++----
 cpp/tests/text/subword_tests.cpp              |  2 +-
 37 files changed, 104 insertions(+), 90 deletions(-)
diff --git a/cpp/include/cudf/column/column.hpp b/cpp/include/cudf/column/column.hpp
index a28bf82962b..a38186458c4 100644
--- a/cpp/include/cudf/column/column.hpp
+++ b/cpp/include/cudf/column/column.hpp
@@ -89,7 +89,8 @@ class column {
       _size{[&]() {
         CUDF_EXPECTS(
           other.size() <= static_cast<std::size_t>(std::numeric_limits<size_type>::max()),
-          "The device_uvector size exceeds the maximum size_type.");
+          "The device_uvector size exceeds the column size limit",
+          std::overflow_error);
         return static_cast<size_type>(other.size());
       }()},
       _data{other.release()},
diff --git a/cpp/include/cudf/column/column_view.hpp b/cpp/include/cudf/column/column_view.hpp
index 703131053f9..d80c720a255 100644
--- a/cpp/include/cudf/column/column_view.hpp
+++ b/cpp/include/cudf/column/column_view.hpp
@@ -422,8 +422,9 @@ class column_view : public detail::column_view_base {
         cudf::data_type{cudf::type_to_id<T>()}, data.size(), data.data(), nullptr, 0, 0, {})
   {
     CUDF_EXPECTS(
-      data.size() < static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max()),
-      "Data exceeds the maximum size of a column view.");
+      data.size() <= static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max()),
+      "Data exceeds the column size limit",
+      std::overflow_error);
   }
 
   /**
diff --git a/cpp/include/cudf/detail/join.hpp b/cpp/include/cudf/detail/join.hpp
index c424c20d7c7..4a34eb6b328 100644
--- a/cpp/include/cudf/detail/join.hpp
+++ b/cpp/include/cudf/detail/join.hpp
@@ -86,7 +86,6 @@ struct hash_join {
    * @brief Constructor that internally builds the hash table based on the given `build` table.
    *
    * @throw cudf::logic_error if the number of columns in `build` table is 0.
-   * @throw cudf::logic_error if the number of rows in `build` table exceeds MAX_JOIN_SIZE.
    *
    * @param build The build table, from which the hash table is built.
    * @param has_nulls Flag to indicate if the there exists any nulls in the `build` table or
@@ -177,7 +176,6 @@ struct hash_join {
    * @copydoc cudf::detail::hash_join::probe_join_indices
    *
    * @throw cudf::logic_error if probe table is empty.
-   * @throw cudf::logic_error if the size of probe table exceeds `MAX_JOIN_SIZE`.
    * @throw cudf::logic_error if the number of columns in build table and probe table do not match.
    * @throw cudf::logic_error if the column data types in build table and probe table do not match.
    */
diff --git a/cpp/include/cudf/detail/sizes_to_offsets_iterator.cuh b/cpp/include/cudf/detail/sizes_to_offsets_iterator.cuh
index 0017ddb305d..7395c2692be 100644
--- a/cpp/include/cudf/detail/sizes_to_offsets_iterator.cuh
+++ b/cpp/include/cudf/detail/sizes_to_offsets_iterator.cuh
@@ -319,7 +319,7 @@ std::pair<std::unique_ptr<column>, size_type> make_offsets_child_column(
   auto const total_elements = sizes_to_offsets(input_itr, input_itr + count + 1, d_offsets, stream);
   CUDF_EXPECTS(
     total_elements <= static_cast<decltype(total_elements)>(std::numeric_limits<size_type>::max()),
-    "Size of output exceeds column size limit",
+    "Size of output exceeds the column size limit",
     std::overflow_error);
 
   offsets_column->set_null_count(0);
diff --git a/cpp/include/cudf/filling.hpp b/cpp/include/cudf/filling.hpp
index 8688e97ab7e..a82bb9d1a48 100644
--- a/cpp/include/cudf/filling.hpp
+++ b/cpp/include/cudf/filling.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -131,10 +131,8 @@ std::unique_ptr<table> repeat(
  * count = 2
  * return = [4,4,5,5,6,6]
  * ```
- * @throws cudf::logic_error if the data type of @p count is not size_type.
- * @throws cudf::logic_error if @p count is invalid or @p count is negative.
- * @throws cudf::logic_error if @p input_table.num_rows() * @p count overflows
- * size_type.
+ * @throws cudf::logic_error if @p count is negative.
+ * @throws std::overflow_error if @p input_table.num_rows() * @p count overflows size_type.
  *
  * @param input_table Input table
  * @param count Number of repetitions
diff --git a/cpp/include/cudf/join.hpp b/cpp/include/cudf/join.hpp
index 11d1bbf9fc8..314a1bbfad7 100644
--- a/cpp/include/cudf/join.hpp
+++ b/cpp/include/cudf/join.hpp
@@ -167,7 +167,7 @@ full_join(cudf::table_view const& left_keys,
           rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /**
- * @brief Returns a vector of row indices corresponding to a left semi join
+ * @brief Returns a vector of row indices corresponding to a left semi-join
  * between the specified tables.
  *
  * The returned vector contains the row indices from the left table
@@ -179,13 +179,9 @@ full_join(cudf::table_view const& left_keys,
  * Result: {1, 2}
  * @endcode
  *
- * @throw cudf::logic_error if number of columns in either
- * `left_keys` or `right_keys` table is 0 or exceeds MAX_JOIN_SIZE
- *
- * @param[in] left_keys The left table
- * @param[in] right_keys The right table
- * @param[in] compare_nulls controls whether null join-key values
- * should match or not.
+ * @param left_keys The left table
+ * @param right_keys The right table
+ * @param compare_nulls Controls whether null join-key values should match or not
  * @param mr Device memory resource used to allocate the returned table and columns' device memory
  *
  * @return A vector `left_indices` that can be used to construct
diff --git a/cpp/include/cudf/strings/detail/gather.cuh b/cpp/include/cudf/strings/detail/gather.cuh
index 28b98eac3b5..908871774ad 100644
--- a/cpp/include/cudf/strings/detail/gather.cuh
+++ b/cpp/include/cudf/strings/detail/gather.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -324,7 +324,8 @@ std::unique_ptr<cudf::column> gather(strings_column_view const& strings,
     size_t{0},
     thrust::plus{});
   CUDF_EXPECTS(total_bytes < static_cast<std::size_t>(std::numeric_limits<size_type>::max()),
-               "total size of output strings is too large for a cudf column");
+               "total size of output strings exceeds the column limit",
+               std::overflow_error);
 
   // In-place convert output sizes into offsets
   thrust::exclusive_scan(rmm::exec_policy_nosync(stream),
diff --git a/cpp/include/cudf/strings/detail/strings_children.cuh b/cpp/include/cudf/strings/detail/strings_children.cuh
index 02a65c01178..5f8a2a34606 100644
--- a/cpp/include/cudf/strings/detail/strings_children.cuh
+++ b/cpp/include/cudf/strings/detail/strings_children.cuh
@@ -79,8 +79,8 @@ auto make_strings_children(SizeAndExecuteFunction size_and_exec_fn,
   // Convert the sizes to offsets
   auto const bytes =
     cudf::detail::sizes_to_offsets(d_offsets, d_offsets + strings_count + 1, d_offsets, stream);
-  CUDF_EXPECTS(bytes <= static_cast<int64_t>(std::numeric_limits<size_type>::max()),
-               "Size of output exceeds column size limit",
+  CUDF_EXPECTS(bytes <= std::numeric_limits<size_type>::max(),
+               "Size of output exceeds the column size limit",
                std::overflow_error);
 
   // Now build the chars column
diff --git a/cpp/include/cudf/strings/repeat_strings.hpp b/cpp/include/cudf/strings/repeat_strings.hpp
index 26fe5f95983..2b6575f80d0 100644
--- a/cpp/include/cudf/strings/repeat_strings.hpp
+++ b/cpp/include/cudf/strings/repeat_strings.hpp
@@ -20,8 +20,6 @@
 
 #include <rmm/mr/device/per_device_resource.hpp>
 
-#include <optional>
-
 namespace cudf {
 namespace strings {
 /**
@@ -49,9 +47,8 @@ namespace strings {
  * out is '123XYZ-123XYZ-123XYZ-'
  * @endcode
  *
- * @throw cudf::logic_error if the size of the output string scalar exceeds the maximum value that
- *        can be stored by the index type:
- *        `input.size() * repeat_times > max of size_type`
+ * @throw std::overflow_error if the size of the output string scalar exceeds the maximum value that
+ *        can be stored by the scalar: `input.size() * repeat_times > max of size_type`
  *
  * @param input The scalar containing the string to repeat
  * @param repeat_times The number of times the input string is repeated
diff --git a/cpp/include/nvtext/minhash.hpp b/cpp/include/nvtext/minhash.hpp
index 9fdaeda0959..60116e389a3 100644
--- a/cpp/include/nvtext/minhash.hpp
+++ b/cpp/include/nvtext/minhash.hpp
@@ -69,6 +69,7 @@ std::unique_ptr<cudf::column> minhash(
  * @throw std::invalid_argument if the width < 2
  * @throw std::invalid_argument if hash_function is not HASH_MURMUR3
  * @throw std::invalid_argument if seeds is empty
+ * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit
  *
  * @param input Strings column to compute minhash
  * @param seeds Seed values used for the MurmurHash3_32 algorithm
diff --git a/cpp/include/nvtext/subword_tokenize.hpp b/cpp/include/nvtext/subword_tokenize.hpp
index 164ec7a603e..d266923187f 100644
--- a/cpp/include/nvtext/subword_tokenize.hpp
+++ b/cpp/include/nvtext/subword_tokenize.hpp
@@ -130,8 +130,8 @@ struct tokenizer_result {
  * strings column as working memory.
  *
  * @throw cudf::logic_error if `stride > max_sequence_length`
- * @throw cudf::logic_error if `max_sequence_length * max_rows_tensor` is
- *        larger than the max value for cudf::size_type
+ * @throw std::overflow_error if `max_sequence_length * max_rows_tensor`
+ *        exceeds the column size limit
  *
  * @param strings The input strings to tokenize.
  * @param vocabulary_table The vocabulary table pre-loaded into this object.
diff --git a/cpp/src/copying/concatenate.cu b/cpp/src/copying/concatenate.cu
index b17475cb877..11c363d14e0 100644
--- a/cpp/src/copying/concatenate.cu
+++ b/cpp/src/copying/concatenate.cu
@@ -250,7 +250,7 @@ std::unique_ptr<column> fused_concatenate(host_span<column_view const> views,
   auto const output_size  = std::get<3>(device_views);
 
   CUDF_EXPECTS(output_size <= static_cast<std::size_t>(std::numeric_limits<size_type>::max()),
-               "Total number of concatenated rows exceeds size_type range",
+               "Total number of concatenated rows exceeds the column size limit",
                std::overflow_error);
 
   // Allocate output
@@ -388,9 +388,9 @@ class traverse_children {
                       std::size_t{},
                       [](size_t a, auto const& b) -> size_t { return a + b.size(); }) +
       1;
-    // note:  output text must include "exceeds size_type range" for python error handling
     CUDF_EXPECTS(total_offset_count <= static_cast<size_t>(std::numeric_limits<size_type>::max()),
-                 "Total number of concatenated offsets exceeds size_type range");
+                 "Total number of concatenated offsets exceeds the column size limit",
+                 std::overflow_error);
   }
 };
 
@@ -418,9 +418,8 @@ void traverse_children::operator()<cudf::string_view>(host_span<column_view cons
                     ? scv.chars_size()
                     : cudf::detail::get_value<offset_type>(scv.offsets(), scv.size(), stream));
     });
-  // note:  output text must include "exceeds size_type range" for python error handling
   CUDF_EXPECTS(total_char_count <= static_cast<size_t>(std::numeric_limits<size_type>::max()),
-               "Total number of concatenated chars exceeds size_type range",
+               "Total number of concatenated chars exceeds the column size limit",
                std::overflow_error);
 }
 
@@ -490,9 +489,8 @@ void bounds_and_type_check(host_span<column_view const> cols, rmm::cuda_stream_v
     std::accumulate(cols.begin(), cols.end(), std::size_t{}, [](size_t a, auto const& b) {
       return a + static_cast<size_t>(b.size());
     });
-  // note:  output text must include "exceeds size_type range" for python error handling
   CUDF_EXPECTS(total_row_count <= static_cast<size_t>(std::numeric_limits<size_type>::max()),
-               "Total number of concatenated rows exceeds size_type range",
+               "Total number of concatenated rows exceeds the column size limit",
                std::overflow_error);
 
   // traverse children
diff --git a/cpp/src/copying/gather.cu b/cpp/src/copying/gather.cu
index 35ecf180c66..eb8ea92c7b8 100644
--- a/cpp/src/copying/gather.cu
+++ b/cpp/src/copying/gather.cu
@@ -65,7 +65,8 @@ std::unique_ptr<table> gather(table_view const& source_table,
                               rmm::mr::device_memory_resource* mr)
 {
   CUDF_EXPECTS(gather_map.size() <= static_cast<size_t>(std::numeric_limits<size_type>::max()),
-               "invalid gather map size");
+               "gather map size exceeds the column size limit",
+               std::overflow_error);
   auto map_col = column_view(data_type{type_to_id<size_type>()},
                              static_cast<size_type>(gather_map.size()),
                              gather_map.data(),
diff --git a/cpp/src/copying/scatter.cu b/cpp/src/copying/scatter.cu
index 6e275ae1c78..860bda1abac 100644
--- a/cpp/src/copying/scatter.cu
+++ b/cpp/src/copying/scatter.cu
@@ -321,7 +321,8 @@ std::unique_ptr<table> scatter(table_view const& source,
                                rmm::mr::device_memory_resource* mr)
 {
   CUDF_EXPECTS(scatter_map.size() <= static_cast<size_t>(std::numeric_limits<size_type>::max()),
-               "invalid scatter map size");
+               "scatter map size exceeds the column size limit",
+               std::overflow_error);
   auto map_col = column_view(data_type{type_to_id<size_type>()},
                              static_cast<size_type>(scatter_map.size()),
                              scatter_map.data(),
diff --git a/cpp/src/filling/repeat.cu b/cpp/src/filling/repeat.cu
index 736c96e0915..9c14ccca1f9 100644
--- a/cpp/src/filling/repeat.cu
+++ b/cpp/src/filling/repeat.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -63,7 +63,8 @@ struct count_accessor {
     auto count = p_count->value(stream);
     // static_cast is necessary due to bool
     CUDF_EXPECTS(static_cast<int64_t>(count) <= std::numeric_limits<cudf::size_type>::max(),
-                 "count should not exceed size_type's limit.");
+                 "count should not exceed the column size limit",
+                 std::overflow_error);
     return static_cast<cudf::size_type>(count);
   }
 
@@ -86,7 +87,8 @@ struct count_checker {
       auto max = thrust::reduce(
         rmm::exec_policy(stream), count.begin<T>(), count.end<T>(), 0, thrust::maximum<T>());
       CUDF_EXPECTS(max <= std::numeric_limits<cudf::size_type>::max(),
-                   "count should not have values larger than size_type maximum.");
+                   "count exceeds the column size limit",
+                   std::overflow_error);
     }
   }
 
@@ -136,9 +138,9 @@ std::unique_ptr<table> repeat(table_view const& input_table,
                               rmm::mr::device_memory_resource* mr)
 {
   CUDF_EXPECTS(count >= 0, "count value should be non-negative");
-  CUDF_EXPECTS(
-    static_cast<int64_t>(input_table.num_rows()) * count <= std::numeric_limits<size_type>::max(),
-    "The resulting table has more rows than size_type's limit.");
+  CUDF_EXPECTS(input_table.num_rows() <= std::numeric_limits<size_type>::max() / count,
+               "The resulting table exceeds the column size limit",
+               std::overflow_error);
 
   if ((input_table.num_rows() == 0) || (count == 0)) { return cudf::empty_like(input_table); }
 
diff --git a/cpp/src/interop/dlpack.cpp b/cpp/src/interop/dlpack.cpp
index 58afc8e9015..1759c998c75 100644
--- a/cpp/src/interop/dlpack.cpp
+++ b/cpp/src/interop/dlpack.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -173,13 +173,15 @@ std::unique_ptr<table> from_dlpack(DLManagedTensor const* managed_tensor,
   }
   CUDF_EXPECTS(tensor.shape[0] >= 0,
                "DLTensor first dim should be of shape greater than or equal to 0.");
-  CUDF_EXPECTS(tensor.shape[0] < std::numeric_limits<size_type>::max(),
-               "DLTensor first dim exceeds size supported by cudf");
+  CUDF_EXPECTS(tensor.shape[0] <= std::numeric_limits<size_type>::max(),
+               "DLTensor first dim exceeds the column size limit",
+               std::overflow_error);
   if (tensor.ndim > 1) {
     CUDF_EXPECTS(tensor.shape[1] >= 0,
                  "DLTensor second dim should be of shape greater than or equal to 0.");
-    CUDF_EXPECTS(tensor.shape[1] < std::numeric_limits<size_type>::max(),
-                 "DLTensor second dim exceeds size supported by cudf");
+    CUDF_EXPECTS(tensor.shape[1] <= std::numeric_limits<size_type>::max(),
+                 "DLTensor second dim exceeds the column size limit",
+                 std::overflow_error);
   }
   size_t const num_columns = (tensor.ndim == 2) ? static_cast<size_t>(tensor.shape[1]) : 1;
 
diff --git a/cpp/src/io/utilities/row_selection.cpp b/cpp/src/io/utilities/row_selection.cpp
index 1b79a59aa9e..69432f917e0 100644
--- a/cpp/src/io/utilities/row_selection.cpp
+++ b/cpp/src/io/utilities/row_selection.cpp
@@ -29,7 +29,8 @@ std::pair<uint64_t, size_type> skip_rows_num_rows_from_options(
   auto const rows_to_skip = std::min(skip_rows_opt, num_source_rows);
   if (not num_rows_opt.has_value()) {
     CUDF_EXPECTS(num_source_rows - rows_to_skip <= std::numeric_limits<size_type>::max(),
-                 "The requested number of rows to read exceeds the largest cudf column size");
+                 "The requested number of rows exceeds the column size limit",
+                 std::overflow_error);
     return {rows_to_skip, num_source_rows - rows_to_skip};
   }
   // Limit the number of rows to the end of the input
diff --git a/cpp/src/io/utilities/row_selection.hpp b/cpp/src/io/utilities/row_selection.hpp
index 66a3a83a61e..4f37ce55c20 100644
--- a/cpp/src/io/utilities/row_selection.hpp
+++ b/cpp/src/io/utilities/row_selection.hpp
@@ -32,8 +32,7 @@ namespace cudf::io::detail {
  * @param num_source_rows number of rows in the ORC file(s)
  * @return A std::pair containing the number of rows to skip and the number of rows to read
  *
- * @throw cudf::logic_error when the requested number of rows to read exceeds the largest cudf
- * column size
+ * @throw std::overflow_error if the requested number of rows exceeds the column size limit
  */
 std::pair<uint64_t, size_type> skip_rows_num_rows_from_options(
   uint64_t skip_rows_opt, std::optional<size_type> const& num_rows_opt, uint64_t num_source_rows);
diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu
index 15c1d21e74c..76aab1e502a 100644
--- a/cpp/src/join/hash_join.cu
+++ b/cpp/src/join/hash_join.cu
@@ -375,8 +375,6 @@ hash_join<Hasher>::hash_join(cudf::table_view const& build,
 {
   CUDF_FUNC_RANGE();
   CUDF_EXPECTS(0 != build.num_columns(), "Hash join build table is empty");
-  CUDF_EXPECTS(build.num_rows() < cudf::detail::MAX_JOIN_SIZE,
-               "Build column size is too big for hash join");
 
   if (_is_empty) { return; }
 
@@ -557,8 +555,6 @@ hash_join<Hasher>::compute_hash_join(cudf::table_view const& probe,
                                      rmm::mr::device_memory_resource* mr) const
 {
   CUDF_EXPECTS(0 != probe.num_columns(), "Hash join probe table is empty");
-  CUDF_EXPECTS(probe.num_rows() < cudf::detail::MAX_JOIN_SIZE,
-               "Probe column size is too big for hash join");
 
   CUDF_EXPECTS(_build.num_columns() == probe.num_columns(),
                "Mismatch in number of columns to be joined on");
diff --git a/cpp/src/join/join_common_utils.hpp b/cpp/src/join/join_common_utils.hpp
index 39ec8884ba4..45a8b124ea3 100644
--- a/cpp/src/join/join_common_utils.hpp
+++ b/cpp/src/join/join_common_utils.hpp
@@ -34,7 +34,6 @@
 
 namespace cudf {
 namespace detail {
-constexpr size_type MAX_JOIN_SIZE{std::numeric_limits<size_type>::max()};
 
 constexpr int DEFAULT_JOIN_BLOCK_SIZE = 128;
 constexpr int DEFAULT_JOIN_CACHE_SIZE = 128;
diff --git a/cpp/src/lists/sequences.cu b/cpp/src/lists/sequences.cu
index 895bc9de816..d1d9c9524f2 100644
--- a/cpp/src/lists/sequences.cu
+++ b/cpp/src/lists/sequences.cu
@@ -160,8 +160,8 @@ std::unique_ptr<column> sequences(column_view const& starts,
 
   auto const n_elements = cudf::detail::sizes_to_offsets(
     sizes_input_it, sizes_input_it + n_lists + 1, offsets_begin, stream);
-  CUDF_EXPECTS(n_elements <= static_cast<int64_t>(std::numeric_limits<size_type>::max()),
-               "Size of output exceeds column size limit",
+  CUDF_EXPECTS(n_elements <= std::numeric_limits<size_type>::max(),
+               "Size of output exceeds the column size limit",
                std::overflow_error);
 
   auto child = type_dispatcher(starts.type(),
diff --git a/cpp/src/strings/case.cu b/cpp/src/strings/case.cu
index 0997983c95e..02660c46c63 100644
--- a/cpp/src/strings/case.cu
+++ b/cpp/src/strings/case.cu
@@ -255,8 +255,8 @@ std::unique_ptr<column> convert_case(strings_column_view const& input,
   // convert sizes to offsets
   auto const bytes =
     cudf::detail::sizes_to_offsets(d_offsets, d_offsets + input.size() + 1, d_offsets, stream);
-  CUDF_EXPECTS(bytes <= static_cast<int64_t>(std::numeric_limits<size_type>::max()),
-               "Size of output exceeds column size limit",
+  CUDF_EXPECTS(bytes <= std::numeric_limits<size_type>::max(),
+               "Size of output exceeds the column size limit",
                std::overflow_error);
 
   auto chars = create_chars_child_column(static_cast<size_type>(bytes), stream, mr);
diff --git a/cpp/src/strings/copying/concatenate.cu b/cpp/src/strings/copying/concatenate.cu
index 92b71d128e1..c5dfd4a8b93 100644
--- a/cpp/src/strings/copying/concatenate.cu
+++ b/cpp/src/strings/copying/concatenate.cu
@@ -216,9 +216,11 @@ std::unique_ptr<column> concatenate(host_span<column_view const> columns,
   if (strings_count == 0) { return make_empty_column(type_id::STRING); }
 
   CUDF_EXPECTS(offsets_count <= static_cast<std::size_t>(std::numeric_limits<size_type>::max()),
-               "total number of strings is too large for cudf column");
+               "total number of strings exceeds the column size limit",
+               std::overflow_error);
   CUDF_EXPECTS(total_bytes <= static_cast<std::size_t>(std::numeric_limits<size_type>::max()),
-               "total size of strings is too large for cudf column");
+               "total size of strings exceeds the column size limit",
+               std::overflow_error);
 
   bool const has_nulls =
     std::any_of(columns.begin(), columns.end(), [](auto const& col) { return col.has_nulls(); });
diff --git a/cpp/src/strings/regex/utilities.cuh b/cpp/src/strings/regex/utilities.cuh
index 6bbd79166a8..23b53062bf3 100644
--- a/cpp/src/strings/regex/utilities.cuh
+++ b/cpp/src/strings/regex/utilities.cuh
@@ -135,8 +135,8 @@ auto make_strings_children(SizeAndExecuteFunction size_and_exec_fn,
 
   auto const char_bytes =
     cudf::detail::sizes_to_offsets(d_offsets, d_offsets + strings_count + 1, d_offsets, stream);
-  CUDF_EXPECTS(char_bytes <= static_cast<int64_t>(std::numeric_limits<size_type>::max()),
-               "Size of output exceeds column size limit",
+  CUDF_EXPECTS(char_bytes <= std::numeric_limits<size_type>::max(),
+               "Size of output exceeds the column size limit",
                std::overflow_error);
 
   // Now build the chars column
diff --git a/cpp/src/strings/repeat_strings.cu b/cpp/src/strings/repeat_strings.cu
index 3784b535a5b..8b5b71c097d 100644
--- a/cpp/src/strings/repeat_strings.cu
+++ b/cpp/src/strings/repeat_strings.cu
@@ -51,7 +51,8 @@ std::unique_ptr<string_scalar> repeat_string(string_scalar const& input,
   if (repeat_times == 1) { return std::make_unique<string_scalar>(input, stream, mr); }
 
   CUDF_EXPECTS(input.size() <= std::numeric_limits<size_type>::max() / repeat_times,
-               "The output string has size that exceeds the maximum allowed size.");
+               "The output size exceeds the column size limit",
+               std::overflow_error);
 
   auto const str_size = input.size();
   auto const iter     = thrust::make_counting_iterator(0);
diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu
index e9aa6c2693c..d2cc90bb971 100644
--- a/cpp/src/text/minhash.cu
+++ b/cpp/src/text/minhash.cu
@@ -114,11 +114,10 @@ std::unique_ptr<cudf::column> minhash(cudf::strings_column_view const& input,
   CUDF_EXPECTS(hash_function == cudf::hash_id::HASH_MURMUR3,
                "Only murmur3 hash algorithm supported",
                std::invalid_argument);
-  CUDF_EXPECTS(
-    (static_cast<std::size_t>(input.size()) * seeds.size()) <
-      static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max()),
-    "The number of seeds times the number of input rows must not exceed maximum of size_type",
-    std::invalid_argument);
+  CUDF_EXPECTS((static_cast<std::size_t>(input.size()) * seeds.size()) <
+                 static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max()),
+               "The number of seeds times the number of input rows exceeds the column size limit",
+               std::overflow_error);
 
   auto output_type = cudf::data_type{cudf::type_to_id<cudf::hash_value_type>()};
   if (input.is_empty()) { return cudf::make_empty_column(output_type); }
diff --git a/cpp/src/text/ngrams_tokenize.cu b/cpp/src/text/ngrams_tokenize.cu
index 93757fa37e4..fd1cbf99221 100644
--- a/cpp/src/text/ngrams_tokenize.cu
+++ b/cpp/src/text/ngrams_tokenize.cu
@@ -222,7 +222,7 @@ std::unique_ptr<cudf::column> ngrams_tokenize(cudf::strings_column_view const& s
     chars_offsets.begin(), chars_offsets.end(), chars_offsets.begin(), stream);
   CUDF_EXPECTS(
     output_chars_size <= static_cast<int64_t>(std::numeric_limits<cudf::size_type>::max()),
-    "Size of output exceeds column size limit",
+    "Size of output exceeds the column size limit",
     std::overflow_error);
 
   // This will contain the size in bytes of each ngram to generate
diff --git a/cpp/src/text/normalize.cu b/cpp/src/text/normalize.cu
index 3ef251611eb..73d01c9f3ec 100644
--- a/cpp/src/text/normalize.cu
+++ b/cpp/src/text/normalize.cu
@@ -213,8 +213,9 @@ std::unique_ptr<cudf::column> normalize_characters(cudf::strings_column_view con
   }();
 
   CUDF_EXPECTS(
-    result.first->size() <= static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max()),
-    "output too large for strings column");
+    result.first->size() < static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max()),
+    "output exceeds the column size limit",
+    std::overflow_error);
 
   // convert the result into a strings column
   // - the cp_chars are the new 4-byte code-point values for all the characters in the output
diff --git a/cpp/src/text/subword/subword_tokenize.cu b/cpp/src/text/subword/subword_tokenize.cu
index e34aa4054da..a689fcc7dc3 100644
--- a/cpp/src/text/subword/subword_tokenize.cu
+++ b/cpp/src/text/subword/subword_tokenize.cu
@@ -165,9 +165,11 @@ tokenizer_result subword_tokenize(cudf::strings_column_view const& strings,
 {
   CUDF_EXPECTS(stride <= max_sequence_length,
                "stride must be less than or equal to max_sequence_length");
-  CUDF_EXPECTS(max_sequence_length * max_rows_tensor <
-                 static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max()),
-               "max_sequence_length x max_rows_tensor is too large for cudf output column size");
+  CUDF_EXPECTS(
+    max_sequence_length <=
+      (static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max()) / max_rows_tensor),
+    "max_sequence_length times max_rows_tensor exceeds the column size limit",
+    std::overflow_error);
   auto const strings_count = strings.size();
   if (strings_count == strings.null_count()) {  // empty or all-null returns empty
     return tokenizer_result{0,
diff --git a/cpp/tests/copying/concatenate_tests.cpp b/cpp/tests/copying/concatenate_tests.cpp
index a36b018bc78..a71c8a22af9 100644
--- a/cpp/tests/copying/concatenate_tests.cpp
+++ b/cpp/tests/copying/concatenate_tests.cpp
@@ -557,7 +557,7 @@ TEST_F(OverflowTest, Presliced)
       cudf::table_view tb({b[1]});
 
       EXPECT_THROW(cudf::concatenate(std::vector<cudf::table_view>({ta, ta, ta, tb})),
-                   cudf::logic_error);
+                   std::overflow_error);
     }
   }
 
@@ -631,7 +631,7 @@ TEST_F(OverflowTest, Presliced)
       cudf::table_view tb({b[1]});
 
       EXPECT_THROW(cudf::concatenate(std::vector<cudf::table_view>({ta, ta, ta, tb})),
-                   cudf::logic_error);
+                   std::overflow_error);
     }
   }
 
diff --git a/cpp/tests/filling/repeat_tests.cpp b/cpp/tests/filling/repeat_tests.cpp
index c38422af688..8fb28fb3390 100644
--- a/cpp/tests/filling/repeat_tests.cpp
+++ b/cpp/tests/filling/repeat_tests.cpp
@@ -271,3 +271,22 @@ TEST_F(RepeatErrorTestFixture, CountHasNulls)
   // input_table.has_nulls() == true
   EXPECT_THROW(auto ret = cudf::repeat(input_table, count), cudf::logic_error);
 }
+
+TEST_F(RepeatErrorTestFixture, Overflow)
+{
+  auto const first = thrust::make_counting_iterator(0);
+  cudf::test::fixed_width_column_wrapper<int32_t> input(first, first + 100);
+  cudf::table_view input_table{{input}};
+  // Dividing by half the row count roughly doubles the repeat count, so (count * num_rows)
+  // overshoots max(size_type) even though the integer division truncates.
+  auto const count = std::numeric_limits<cudf::size_type>::max() / (input_table.num_rows() / 2);
+  EXPECT_THROW(cudf::repeat(input_table, count), std::overflow_error);
+}
+
+TEST_F(RepeatErrorTestFixture, NegativeCount)
+{
+  auto const first = thrust::make_counting_iterator(0);
+  cudf::test::fixed_width_column_wrapper<int32_t> input(first, first + 100);
+  cudf::table_view input_table{{input}};
+  EXPECT_THROW(cudf::repeat(input_table, -1), cudf::logic_error);  // negative count is invalid
+}
diff --git a/cpp/tests/interop/dlpack_test.cpp b/cpp/tests/interop/dlpack_test.cpp
index 65fce62e965..ed44727b712 100644
--- a/cpp/tests/interop/dlpack_test.cpp
+++ b/cpp/tests/interop/dlpack_test.cpp
@@ -157,7 +157,7 @@ TEST_F(DLPackUntypedTests, TooManyRowsFromDlpack)
   // Spoof too many rows
   constexpr int64_t max_size_type{std::numeric_limits<int32_t>::max()};
   tensor->dl_tensor.shape[0] = max_size_type + 1;
-  EXPECT_THROW(cudf::from_dlpack(tensor.get()), cudf::logic_error);
+  EXPECT_THROW(cudf::from_dlpack(tensor.get()), std::overflow_error);
 }
 
 TEST_F(DLPackUntypedTests, TooManyColsFromDlpack)
@@ -170,7 +170,7 @@ TEST_F(DLPackUntypedTests, TooManyColsFromDlpack)
   // Spoof too many cols
   constexpr int64_t max_size_type{std::numeric_limits<int32_t>::max()};
   tensor->dl_tensor.shape[1] = max_size_type + 1;
-  EXPECT_THROW(cudf::from_dlpack(tensor.get()), cudf::logic_error);
+  EXPECT_THROW(cudf::from_dlpack(tensor.get()), std::overflow_error);
 }
 
 TEST_F(DLPackUntypedTests, InvalidTypeFromDlpack)
diff --git a/cpp/tests/io/row_selection_test.cpp b/cpp/tests/io/row_selection_test.cpp
index 984d9425a33..b4583ac4f17 100644
--- a/cpp/tests/io/row_selection_test.cpp
+++ b/cpp/tests/io/row_selection_test.cpp
@@ -127,7 +127,7 @@ TEST_F(FromOptsTest, OverFlowDetection)
 
   // Too many rows to read until the end of the file
   EXPECT_THROW(skip_rows_num_rows_from_options(0, std::nullopt, too_large_for_32bit),
-               cudf::logic_error);
+               std::overflow_error);
 
   // Should work fine with num_rows
   EXPECT_NO_THROW(
diff --git a/cpp/tests/strings/array_tests.cpp b/cpp/tests/strings/array_tests.cpp
index 1bc45aaf573..74dc447f85f 100644
--- a/cpp/tests/strings/array_tests.cpp
+++ b/cpp/tests/strings/array_tests.cpp
@@ -157,7 +157,7 @@ TEST_F(StringsColumnTest, GatherTooBig)
     cudf::data_type{cudf::type_id::STRING}, 1, nullptr, nullptr, 0, 0, {offsets, chars});
   auto map = thrust::constant_iterator<int8_t>(0);
   cudf::test::fixed_width_column_wrapper<int8_t> gather_map(map, map + 1000);
-  EXPECT_THROW(cudf::gather(cudf::table_view{{input}}, gather_map), cudf::logic_error);
+  EXPECT_THROW(cudf::gather(cudf::table_view{{input}}, gather_map), std::overflow_error);
 }
 
 TEST_F(StringsColumnTest, Scatter)
diff --git a/cpp/tests/strings/repeat_strings_tests.cpp b/cpp/tests/strings/repeat_strings_tests.cpp
index b7bfad36817..9d08ac9c00c 100644
--- a/cpp/tests/strings/repeat_strings_tests.cpp
+++ b/cpp/tests/strings/repeat_strings_tests.cpp
@@ -90,7 +90,7 @@ TYPED_TEST(RepeatStringsTypedTest, ValidStringScalar)
   // Repeat too many times.
   {
     EXPECT_THROW(cudf::strings::repeat_string(str, std::numeric_limits<int32_t>::max() / 2),
-                 cudf::logic_error);
+                 std::overflow_error);
   }
 }
 
diff --git a/cpp/tests/text/minhash_tests.cpp b/cpp/tests/text/minhash_tests.cpp
index 9572ccd1baf..fa4e2a91600 100644
--- a/cpp/tests/text/minhash_tests.cpp
+++ b/cpp/tests/text/minhash_tests.cpp
@@ -118,9 +118,7 @@ TEST_F(MinHashTest, ErrorsTest)
   auto view  = cudf::strings_column_view(input);
   EXPECT_THROW(nvtext::minhash(view, 0, 0), std::invalid_argument);
   EXPECT_THROW(nvtext::minhash(view, 0, 0, cudf::hash_id::HASH_MD5), std::invalid_argument);
-  auto seeds = cudf::test::fixed_width_column_wrapper<
-    cudf::hash_value_type>();  // cudf::device_span<cudf::hash_value_type
-                               // const>{};
+  auto seeds = cudf::test::fixed_width_column_wrapper<cudf::hash_value_type>();
   EXPECT_THROW(nvtext::minhash(view, cudf::column_view(seeds)), std::invalid_argument);
 
   std::vector<std::string> h_input(50000, "");
@@ -129,5 +127,5 @@ TEST_F(MinHashTest, ErrorsTest)
 
   auto const zeroes = thrust::constant_iterator<cudf::hash_value_type>(0);
   seeds = cudf::test::fixed_width_column_wrapper<cudf::hash_value_type>(zeroes, zeroes + 50000);
-  EXPECT_THROW(nvtext::minhash(view, cudf::column_view(seeds)), std::invalid_argument);
+  EXPECT_THROW(nvtext::minhash(view, cudf::column_view(seeds)), std::overflow_error);
 }
diff --git a/cpp/tests/text/subword_tests.cpp b/cpp/tests/text/subword_tests.cpp
index 0cf223f4a99..806d768e303 100644
--- a/cpp/tests/text/subword_tests.cpp
+++ b/cpp/tests/text/subword_tests.cpp
@@ -238,7 +238,7 @@ TEST(TextSubwordTest, ParameterErrors)
                                         true,  // do_lower_case
                                         true,  // do_truncate
                                         858993459),
-               cudf::logic_error);
+               std::overflow_error);
 }
 
 TEST(TextSubwordTest, EmptyStrings)