Skip to content

Commit

Permalink
Produce useful guidance on overflow error in to_csv (#12705)
Browse files Browse the repository at this point in the history
Since writing to CSV files is implemented by converting all columns in
a dataframe to strings, and then concatenating those columns, when we
attempt to write a large dataframe to CSV without specifying a chunk
size, we can easily overflow the maximum column size.

Currently the error message is rather inscrutable: it states only that the
requested size of a string column exceeds the column size limit. To help
the user, catch this error and provide a useful error message that points
them towards setting the `chunksize` argument.

So that we don't produce false-positive advice, tighten the scope by
only catching `OverflowError`. To do this, make partial progress
towards resolving #10200 by throwing `std::overflow_error` when
checking for overflow of string column lengths.

Closes #12690.

Authors:
  - Lawrence Mitchell (https://github.com/wence-)
  - Karthikeyan (https://github.com/karthikeyann)

Approvers:
  - David Wendt (https://github.com/davidwendt)
  - Ashwin Srinath (https://github.com/shwina)
  - Nghia Truong (https://github.com/ttnghia)
  - Karthikeyan (https://github.com/karthikeyann)

URL: #12705
  • Loading branch information
wence- authored Feb 17, 2023
1 parent 79a924a commit 2969b24
Show file tree
Hide file tree
Showing 10 changed files with 48 additions and 17 deletions.
11 changes: 7 additions & 4 deletions cpp/include/cudf/detail/sizes_to_offsets_iterator.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@
#include <thrust/distance.h>
#include <thrust/scan.h>

#include <stdexcept>

namespace cudf {
namespace detail {

Expand Down Expand Up @@ -242,7 +244,7 @@ static sizes_to_offsets_iterator<ScanIterator, LastType> make_sizes_to_offsets_i
* auto const bytes = cudf::detail::sizes_to_offsets(
* d_offsets, d_offsets + strings_count + 1, d_offsets, stream);
* CUDF_EXPECTS(bytes <= static_cast<int64_t>(std::numeric_limits<size_type>::max()),
* "Size of output exceeds column size limit");
* "Size of output exceeds column size limit", std::overflow_error);
* @endcode
*
* @tparam SizesIterator Iterator type for input of the scan using addition operation
Expand Down Expand Up @@ -282,8 +284,8 @@ auto sizes_to_offsets(SizesIterator begin,
* The return also includes the total number of elements -- the last element value from the
* scan.
*
* @throw cudf::logic_error if the total size of the scan (last element) greater than maximum value
* of `size_type`
* @throw std::overflow_error if the total size of the scan (last element) greater than maximum
* value of `size_type`
*
* @tparam InputIterator Used as input to scan to set the offset values
* @param begin The beginning of the input sequence
Expand Down Expand Up @@ -317,7 +319,8 @@ std::pair<std::unique_ptr<column>, size_type> make_offsets_child_column(
auto const total_elements = sizes_to_offsets(input_itr, input_itr + count + 1, d_offsets, stream);
CUDF_EXPECTS(
total_elements <= static_cast<decltype(total_elements)>(std::numeric_limits<size_type>::max()),
"Size of output exceeds column size limit");
"Size of output exceeds column size limit",
std::overflow_error);

offsets_column->set_null_count(0);
return std::pair(std::move(offsets_column), static_cast<size_type>(total_elements));
Expand Down
4 changes: 3 additions & 1 deletion cpp/include/cudf/lists/filling.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021-2022, NVIDIA CORPORATION.
* Copyright (c) 2021-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -53,6 +53,7 @@ namespace cudf::lists {
* @throws cudf::logic_error if @p sizes column is not of integer types.
* @throws cudf::logic_error if any input column has nulls.
* @throws cudf::logic_error if @p starts and @p sizes columns do not have the same size.
* @throws std::overflow_error if the output column would exceed the column size limit.
*
* @param starts First values in the result sequences.
* @param sizes Numbers of values in the result sequences.
Expand Down Expand Up @@ -90,6 +91,7 @@ std::unique_ptr<column> sequences(
* @throws cudf::logic_error if any input column has nulls.
* @throws cudf::logic_error if @p starts and @p steps columns have different types.
* @throws cudf::logic_error if @p starts, @p steps, and @p sizes columns do not have the same size.
* @throws std::overflow_error if the output column would exceed the column size limit.
*
* @param starts First values in the result sequences.
* @param steps Increment values for the result sequences.
Expand Down
7 changes: 5 additions & 2 deletions cpp/include/cudf/strings/detail/strings_children.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@
#include <thrust/for_each.h>
#include <thrust/iterator/counting_iterator.h>

#include <stdexcept>

namespace cudf {
namespace strings {
namespace detail {
Expand All @@ -35,7 +37,7 @@ namespace detail {
* @brief Creates child offsets and chars columns by applying the template function that
* can be used for computing the output size of each string as well as create the output
*
* @throws cudf::logic_error if the output strings column exceeds the column size limit
* @throws std::overflow_error if the output strings column exceeds the column size limit
*
* @tparam SizeAndExecuteFunction Function must accept an index and return a size.
* It must also have members d_offsets and d_chars which are set to
Expand Down Expand Up @@ -78,7 +80,8 @@ auto make_strings_children(SizeAndExecuteFunction size_and_exec_fn,
auto const bytes =
cudf::detail::sizes_to_offsets(d_offsets, d_offsets + strings_count + 1, d_offsets, stream);
CUDF_EXPECTS(bytes <= static_cast<int64_t>(std::numeric_limits<size_type>::max()),
"Size of output exceeds column size limit");
"Size of output exceeds column size limit",
std::overflow_error);

// Now build the chars column
std::unique_ptr<column> chars_column =
Expand Down
4 changes: 3 additions & 1 deletion cpp/src/lists/sequences.cu
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@

#include <limits>
#include <optional>
#include <stdexcept>

namespace cudf::lists {
namespace detail {
Expand Down Expand Up @@ -169,7 +170,8 @@ std::unique_ptr<column> sequences(column_view const& starts,
auto const n_elements = cudf::detail::sizes_to_offsets(
sizes_input_it, sizes_input_it + n_lists + 1, offsets_begin, stream);
CUDF_EXPECTS(n_elements <= static_cast<int64_t>(std::numeric_limits<size_type>::max()),
"Size of output exceeds column size limit");
"Size of output exceeds column size limit",
std::overflow_error);

auto child = type_dispatcher(starts.type(),
sequences_dispatcher{},
Expand Down
7 changes: 5 additions & 2 deletions cpp/src/strings/regex/utilities.cuh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
* Copyright (c) 2022-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -28,6 +28,8 @@

#include <thrust/scan.h>

#include <stdexcept>

namespace cudf {
namespace strings {
namespace detail {
Expand Down Expand Up @@ -134,7 +136,8 @@ auto make_strings_children(SizeAndExecuteFunction size_and_exec_fn,
auto const char_bytes =
cudf::detail::sizes_to_offsets(d_offsets, d_offsets + strings_count + 1, d_offsets, stream);
CUDF_EXPECTS(char_bytes <= static_cast<int64_t>(std::numeric_limits<size_type>::max()),
"Size of output exceeds column size limit");
"Size of output exceeds column size limit",
std::overflow_error);

// Now build the chars column
std::unique_ptr<column> chars =
Expand Down
5 changes: 4 additions & 1 deletion cpp/src/text/ngrams_tokenize.cu
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@
#include <thrust/transform.h>
#include <thrust/transform_scan.h>

#include <stdexcept>

namespace nvtext {
namespace detail {
namespace {
Expand Down Expand Up @@ -220,7 +222,8 @@ std::unique_ptr<cudf::column> ngrams_tokenize(cudf::strings_column_view const& s
chars_offsets.begin(), chars_offsets.end(), chars_offsets.begin(), stream);
CUDF_EXPECTS(
output_chars_size <= static_cast<int64_t>(std::numeric_limits<cudf::size_type>::max()),
"Size of output exceeds column size limit");
"Size of output exceeds column size limit",
std::overflow_error);

// This will contain the size in bytes of each ngram to generate
rmm::device_uvector<cudf::size_type> ngram_sizes(total_ngrams, stream);
Expand Down
2 changes: 1 addition & 1 deletion cpp/tests/strings/repeat_strings_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -229,7 +229,7 @@ TEST_F(RepeatStringsTest, StringsColumnWithColumnRepeatTimesOverflowOutput)
auto const repeat_times =
int32s_col{half_max, half_max, half_max, half_max, half_max, half_max, half_max};

EXPECT_THROW(cudf::strings::repeat_strings(strs_cv, repeat_times), cudf::logic_error);
EXPECT_THROW(cudf::strings::repeat_strings(strs_cv, repeat_times), std::overflow_error);
}

TYPED_TEST(RepeatStringsTypedTest, StringsColumnNoNullWithScalarRepeatTimes)
Expand Down
10 changes: 8 additions & 2 deletions python/cudf/cudf/_lib/csv.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -533,8 +533,14 @@ def write_csv(
.build()
)

with nogil:
cpp_write_csv(options)
try:
with nogil:
cpp_write_csv(options)
except OverflowError:
raise OverflowError(
f"Writing CSV file with chunksize={rows_per_chunk} failed. "
"Consider providing a smaller chunksize argument."
)


cdef data_type _get_cudf_data_type_from_dtype(object dtype) except +:
Expand Down
10 changes: 8 additions & 2 deletions python/cudf/cudf/_lib/json.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -201,8 +201,14 @@ def write_json(
.build()
)

with nogil:
libcudf_write_json(options)
try:
with nogil:
libcudf_write_json(options)
except OverflowError:
raise OverflowError(
f"Writing JSON file with rows_per_chunk={rows_per_chunk} failed. "
"Consider providing a smaller rows_per_chunk argument."
)


cdef schema_element _get_cudf_schema_element_from_dtype(object dtype) except +:
Expand Down
5 changes: 4 additions & 1 deletion python/cudf/cudf/utils/ioutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1245,7 +1245,10 @@
Notes
-----
- Follows the standard of Pandas csv.QUOTE_NONNUMERIC for all output.
- If `to_csv` leads to memory errors consider setting the `chunksize` argument.
- The default behaviour is to write all rows of the dataframe at once.
This can lead to memory or overflow errors for large tables. If this
happens, consider setting the ``chunksize`` argument to some
reasonable fraction of the total rows in the dataframe.
Examples
--------
Expand Down

0 comments on commit 2969b24

Please sign in to comment.