From 2969b241c0654a11d1a61e29664bcaecd7bc4a15 Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lmitchell@nvidia.com>
Date: Fri, 17 Feb 2023 13:23:15 +0000
Subject: [PATCH] Produce useful guidance on overflow error in `to_csv`
 (#12705)

Since writing to CSV files is implemented by converting all columns in
a dataframe to strings, and then concatenating those columns, when we
attempt to write a large dataframe to CSV without specifying a chunk
size, we can easily overflow the maximum column size.

Currently the error message is rather inscrutable: that the requested
size of a string column exceeds the column size limit. To help the
user, catch this error and provide a useful error message that points
them towards setting the `chunksize` argument.

So that we don't produce false positive advice, tighten the scope by
only catching `OverflowError`, to do this, make partial progress
towards resolving #10200 by throwing `std::overflow_error` when
checking for overflow of string column lengths.

Closes #12690.

Authors:
  - Lawrence Mitchell (https://github.com/wence-)
  - Karthikeyan (https://github.com/karthikeyann)

Approvers:
  - David Wendt (https://github.com/davidwendt)
  - Ashwin Srinath (https://github.com/shwina)
  - Nghia Truong (https://github.com/ttnghia)
  - Karthikeyan (https://github.com/karthikeyann)

URL: https://github.com/rapidsai/cudf/pull/12705
---
 cpp/include/cudf/detail/sizes_to_offsets_iterator.cuh | 11 +++++++----
 cpp/include/cudf/lists/filling.hpp                    |  4 +++-
 cpp/include/cudf/strings/detail/strings_children.cuh  |  7 +++++--
 cpp/src/lists/sequences.cu                            |  4 +++-
 cpp/src/strings/regex/utilities.cuh                   |  7 +++++--
 cpp/src/text/ngrams_tokenize.cu                       |  5 ++++-
 cpp/tests/strings/repeat_strings_tests.cpp            |  2 +-
 python/cudf/cudf/_lib/csv.pyx                         | 10 ++++++++--
 python/cudf/cudf/_lib/json.pyx                        | 10 ++++++++--
 python/cudf/cudf/utils/ioutils.py                     |  5 ++++-
 10 files changed, 48 insertions(+), 17 deletions(-)
diff --git a/cpp/include/cudf/detail/sizes_to_offsets_iterator.cuh b/cpp/include/cudf/detail/sizes_to_offsets_iterator.cuh
index 013e74ff18c..eefc5718617 100644
--- a/cpp/include/cudf/detail/sizes_to_offsets_iterator.cuh
+++ b/cpp/include/cudf/detail/sizes_to_offsets_iterator.cuh
@@ -27,6 +27,8 @@
 #include <thrust/distance.h>
 #include <thrust/scan.h>
 
+#include <stdexcept>
+
 namespace cudf {
 namespace detail {
 
@@ -242,7 +244,7 @@ static sizes_to_offsets_iterator<ScanIterator, LastType> make_sizes_to_offsets_i
  *   auto const bytes = cudf::detail::sizes_to_offsets(
  *     d_offsets, d_offsets + strings_count + 1, d_offsets, stream);
  *   CUDF_EXPECTS(bytes <= static_cast<int64_t>(std::numeric_limits<size_type>::max()),
- *               "Size of output exceeds column size limit");
+ *               "Size of output exceeds column size limit", std::overflow_error);
  * @endcode
  *
  * @tparam SizesIterator Iterator type for input of the scan using addition operation
@@ -282,8 +284,8 @@ auto sizes_to_offsets(SizesIterator begin,
  * The return also includes the total number of elements -- the last element value from the
  * scan.
  *
- * @throw cudf::logic_error if the total size of the scan (last element) greater than maximum value
- * of `size_type`
+ * @throw std::overflow_error if the total size of the scan (last element) greater than maximum
+ * value of `size_type`
  *
  * @tparam InputIterator Used as input to scan to set the offset values
  * @param begin The beginning of the input sequence
@@ -317,7 +319,8 @@ std::pair<std::unique_ptr<column>, size_type> make_offsets_child_column(
   auto const total_elements = sizes_to_offsets(input_itr, input_itr + count + 1, d_offsets, stream);
   CUDF_EXPECTS(
     total_elements <= static_cast<decltype(total_elements)>(std::numeric_limits<size_type>::max()),
-    "Size of output exceeds column size limit");
+    "Size of output exceeds column size limit",
+    std::overflow_error);
 
   offsets_column->set_null_count(0);
   return std::pair(std::move(offsets_column), static_cast<size_type>(total_elements));
diff --git a/cpp/include/cudf/lists/filling.hpp b/cpp/include/cudf/lists/filling.hpp
index 4a071fc467f..059ed5ffd33 100644
--- a/cpp/include/cudf/lists/filling.hpp
+++ b/cpp/include/cudf/lists/filling.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -53,6 +53,7 @@ namespace cudf::lists {
  * @throws cudf::logic_error if @p sizes column is not of integer types.
  * @throws cudf::logic_error if any input column has nulls.
  * @throws cudf::logic_error if @p starts and @p sizes columns do not have the same size.
+ * @throws std::overflow_error if the output column would exceed the column size limit.
  *
  * @param starts First values in the result sequences.
  * @param sizes Numbers of values in the result sequences.
@@ -90,6 +91,7 @@ std::unique_ptr<column> sequences(
  * @throws cudf::logic_error if any input column has nulls.
  * @throws cudf::logic_error if @p starts and @p steps columns have different types.
  * @throws cudf::logic_error if @p starts, @p steps, and @p sizes columns do not have the same size.
+ * @throws std::overflow_error if the output column would exceed the column size limit.
  *
  * @param starts First values in the result sequences.
  * @param steps Increment values for the result sequences.
diff --git a/cpp/include/cudf/strings/detail/strings_children.cuh b/cpp/include/cudf/strings/detail/strings_children.cuh
index 7f57984e88d..09e0f3bb079 100644
--- a/cpp/include/cudf/strings/detail/strings_children.cuh
+++ b/cpp/include/cudf/strings/detail/strings_children.cuh
@@ -27,6 +27,8 @@
 #include <thrust/for_each.h>
 #include <thrust/iterator/counting_iterator.h>
 
+#include <stdexcept>
+
 namespace cudf {
 namespace strings {
 namespace detail {
@@ -35,7 +37,7 @@ namespace detail {
  * @brief Creates child offsets and chars columns by applying the template function that
  * can be used for computing the output size of each string as well as create the output
  *
- * @throws cudf::logic_error if the output strings column exceeds the column size limit
+ * @throws std::overflow_error if the output strings column exceeds the column size limit
  *
  * @tparam SizeAndExecuteFunction Function must accept an index and return a size.
  *         It must also have members d_offsets and d_chars which are set to
@@ -78,7 +80,8 @@ auto make_strings_children(SizeAndExecuteFunction size_and_exec_fn,
   auto const bytes =
     cudf::detail::sizes_to_offsets(d_offsets, d_offsets + strings_count + 1, d_offsets, stream);
   CUDF_EXPECTS(bytes <= static_cast<int64_t>(std::numeric_limits<size_type>::max()),
-               "Size of output exceeds column size limit");
+               "Size of output exceeds column size limit",
+               std::overflow_error);
 
   // Now build the chars column
   std::unique_ptr<column> chars_column =
diff --git a/cpp/src/lists/sequences.cu b/cpp/src/lists/sequences.cu
index b3db8e6090b..ecdf81b9158 100644
--- a/cpp/src/lists/sequences.cu
+++ b/cpp/src/lists/sequences.cu
@@ -34,6 +34,7 @@
 
 #include <limits>
 #include <optional>
+#include <stdexcept>
 
 namespace cudf::lists {
 namespace detail {
@@ -169,7 +170,8 @@ std::unique_ptr<column> sequences(column_view const& starts,
   auto const n_elements = cudf::detail::sizes_to_offsets(
     sizes_input_it, sizes_input_it + n_lists + 1, offsets_begin, stream);
   CUDF_EXPECTS(n_elements <= static_cast<int64_t>(std::numeric_limits<size_type>::max()),
-               "Size of output exceeds column size limit");
+               "Size of output exceeds column size limit",
+               std::overflow_error);
 
   auto child = type_dispatcher(starts.type(),
                                sequences_dispatcher{},
diff --git a/cpp/src/strings/regex/utilities.cuh b/cpp/src/strings/regex/utilities.cuh
index bc8e85bf99a..6bbd79166a8 100644
--- a/cpp/src/strings/regex/utilities.cuh
+++ b/cpp/src/strings/regex/utilities.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -28,6 +28,8 @@
 
 #include <thrust/scan.h>
 
+#include <stdexcept>
+
 namespace cudf {
 namespace strings {
 namespace detail {
@@ -134,7 +136,8 @@ auto make_strings_children(SizeAndExecuteFunction size_and_exec_fn,
   auto const char_bytes =
     cudf::detail::sizes_to_offsets(d_offsets, d_offsets + strings_count + 1, d_offsets, stream);
   CUDF_EXPECTS(char_bytes <= static_cast<int64_t>(std::numeric_limits<size_type>::max()),
-               "Size of output exceeds column size limit");
+               "Size of output exceeds column size limit",
+               std::overflow_error);
 
   // Now build the chars column
   std::unique_ptr<column> chars =
diff --git a/cpp/src/text/ngrams_tokenize.cu b/cpp/src/text/ngrams_tokenize.cu
index cf911f13a37..93757fa37e4 100644
--- a/cpp/src/text/ngrams_tokenize.cu
+++ b/cpp/src/text/ngrams_tokenize.cu
@@ -39,6 +39,8 @@
 #include <thrust/transform.h>
 #include <thrust/transform_scan.h>
 
+#include <stdexcept>
+
 namespace nvtext {
 namespace detail {
 namespace {
@@ -220,7 +222,8 @@ std::unique_ptr<cudf::column> ngrams_tokenize(cudf::strings_column_view const& s
     chars_offsets.begin(), chars_offsets.end(), chars_offsets.begin(), stream);
   CUDF_EXPECTS(
     output_chars_size <= static_cast<int64_t>(std::numeric_limits<cudf::size_type>::max()),
-    "Size of output exceeds column size limit");
+    "Size of output exceeds column size limit",
+    std::overflow_error);
 
   // This will contain the size in bytes of each ngram to generate
   rmm::device_uvector<cudf::size_type> ngram_sizes(total_ngrams, stream);
diff --git a/cpp/tests/strings/repeat_strings_tests.cpp b/cpp/tests/strings/repeat_strings_tests.cpp
index e75409d9f39..73009d167e8 100644
--- a/cpp/tests/strings/repeat_strings_tests.cpp
+++ b/cpp/tests/strings/repeat_strings_tests.cpp
@@ -229,7 +229,7 @@ TEST_F(RepeatStringsTest, StringsColumnWithColumnRepeatTimesOverflowOutput)
   auto const repeat_times =
     int32s_col{half_max, half_max, half_max, half_max, half_max, half_max, half_max};
 
-  EXPECT_THROW(cudf::strings::repeat_strings(strs_cv, repeat_times), cudf::logic_error);
+  EXPECT_THROW(cudf::strings::repeat_strings(strs_cv, repeat_times), std::overflow_error);
 }
 
 TYPED_TEST(RepeatStringsTypedTest, StringsColumnNoNullWithScalarRepeatTimes)
diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx
index eb6683aed31..09de1f1724e 100644
--- a/python/cudf/cudf/_lib/csv.pyx
+++ b/python/cudf/cudf/_lib/csv.pyx
@@ -533,8 +533,14 @@ def write_csv(
         .build()
     )
 
-    with nogil:
-        cpp_write_csv(options)
+    try:
+        with nogil:
+            cpp_write_csv(options)
+    except OverflowError:
+        raise OverflowError(
+            f"Writing CSV file with chunksize={rows_per_chunk} failed. "
+            "Consider providing a smaller chunksize argument."
+        )
 
 
 cdef data_type _get_cudf_data_type_from_dtype(object dtype) except +:
diff --git a/python/cudf/cudf/_lib/json.pyx b/python/cudf/cudf/_lib/json.pyx
index 2339b874ea0..21752062201 100644
--- a/python/cudf/cudf/_lib/json.pyx
+++ b/python/cudf/cudf/_lib/json.pyx
@@ -201,8 +201,14 @@ def write_json(
         .build()
     )
 
-    with nogil:
-        libcudf_write_json(options)
+    try:
+        with nogil:
+            libcudf_write_json(options)
+    except OverflowError:
+        raise OverflowError(
+            f"Writing JSON file with rows_per_chunk={rows_per_chunk} failed. "
+            "Consider providing a smaller rows_per_chunk argument."
+        )
 
 
 cdef schema_element _get_cudf_schema_element_from_dtype(object dtype) except +:
diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py
index 56e2e539e01..924cc62fb15 100644
--- a/python/cudf/cudf/utils/ioutils.py
+++ b/python/cudf/cudf/utils/ioutils.py
@@ -1245,7 +1245,10 @@
 Notes
 -----
 - Follows the standard of Pandas csv.QUOTE_NONNUMERIC for all output.
-- If `to_csv` leads to memory errors consider setting the `chunksize` argument.
+- The default behaviour is to write all rows of the dataframe at once.
+  This can lead to memory or overflow errors for large tables. If this
+  happens, consider setting the ``chunksize`` argument to some
+  reasonable fraction of the total rows in the dataframe.
 
 Examples
 --------