Skip to content

Commit

Permalink
Produce useful guidance on overflow error in to_csv (#12705)
Browse files Browse the repository at this point in the history
Since writing to CSV files is implemented by converting all columns in
a dataframe to strings, and then concatenating those columns, when we
attempt to write a large dataframe to CSV without specifying a chunk
size, we can easily overflow the maximum column size.

Currently the error message is rather inscrutable: it states only that the
requested size of a string column exceeds the column size limit. To help
the user, catch this error and provide a useful error message that points
them towards setting the `chunksize` argument.

So that we don't produce false-positive advice, tighten the scope by
only catching `OverflowError`. To do this, make partial progress
towards resolving #10200 by throwing `std::overflow_error` when
checking for overflow of string column lengths.

Closes #12690.

Authors:
  - Lawrence Mitchell (https://github.com/wence-)
  - Karthikeyan (https://github.com/karthikeyann)

Approvers:
  - David Wendt (https://github.com/davidwendt)
  - Ashwin Srinath (https://github.com/shwina)
  - Nghia Truong (https://github.com/ttnghia)
  - Karthikeyan (https://github.com/karthikeyann)

URL: #12705
  • Loading branch information
wence- authored Feb 17, 2023
1 parent 79a924a commit 2969b24
Show file tree
Hide file tree
Showing 10 changed files with 48 additions and 17 deletions.
11 changes: 7 additions & 4 deletions cpp/include/cudf/detail/sizes_to_offsets_iterator.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@
#include <thrust/distance.h>
#include <thrust/scan.h>

#include <stdexcept>

namespace cudf {
namespace detail {

Expand Down Expand Up @@ -242,7 +244,7 @@ static sizes_to_offsets_iterator<ScanIterator, LastType> make_sizes_to_offsets_i
* auto const bytes = cudf::detail::sizes_to_offsets(
* d_offsets, d_offsets + strings_count + 1, d_offsets, stream);
* CUDF_EXPECTS(bytes <= static_cast<int64_t>(std::numeric_limits<size_type>::max()),
* "Size of output exceeds column size limit");
* "Size of output exceeds column size limit", std::overflow_error);
* @endcode
*
* @tparam SizesIterator Iterator type for input of the scan using addition operation
Expand Down Expand Up @@ -282,8 +284,8 @@ auto sizes_to_offsets(SizesIterator begin,
* The return also includes the total number of elements -- the last element value from the
* scan.
*
* @throw cudf::logic_error if the total size of the scan (last element) greater than maximum value
* of `size_type`
* @throw std::overflow_error if the total size of the scan (last element) greater than maximum
* value of `size_type`
*
* @tparam InputIterator Used as input to scan to set the offset values
* @param begin The beginning of the input sequence
Expand Down Expand Up @@ -317,7 +319,8 @@ std::pair<std::unique_ptr<column>, size_type> make_offsets_child_column(
auto const total_elements = sizes_to_offsets(input_itr, input_itr + count + 1, d_offsets, stream);
CUDF_EXPECTS(
total_elements <= static_cast<decltype(total_elements)>(std::numeric_limits<size_type>::max()),
"Size of output exceeds column size limit");
"Size of output exceeds column size limit",
std::overflow_error);

offsets_column->set_null_count(0);
return std::pair(std::move(offsets_column), static_cast<size_type>(total_elements));
Expand Down
4 changes: 3 additions & 1 deletion cpp/include/cudf/lists/filling.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021-2022, NVIDIA CORPORATION.
* Copyright (c) 2021-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -53,6 +53,7 @@ namespace cudf::lists {
* @throws cudf::logic_error if @p sizes column is not of integer types.
* @throws cudf::logic_error if any input column has nulls.
* @throws cudf::logic_error if @p starts and @p sizes columns do not have the same size.
* @throws std::overflow_error if the output column would exceed the column size limit.
*
* @param starts First values in the result sequences.
* @param sizes Numbers of values in the result sequences.
Expand Down Expand Up @@ -90,6 +91,7 @@ std::unique_ptr<column> sequences(
* @throws cudf::logic_error if any input column has nulls.
* @throws cudf::logic_error if @p starts and @p steps columns have different types.
* @throws cudf::logic_error if @p starts, @p steps, and @p sizes columns do not have the same size.
* @throws std::overflow_error if the output column would exceed the column size limit.
*
* @param starts First values in the result sequences.
* @param steps Increment values for the result sequences.
Expand Down
7 changes: 5 additions & 2 deletions cpp/include/cudf/strings/detail/strings_children.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@
#include <thrust/for_each.h>
#include <thrust/iterator/counting_iterator.h>

#include <stdexcept>

namespace cudf {
namespace strings {
namespace detail {
Expand All @@ -35,7 +37,7 @@ namespace detail {
* @brief Creates child offsets and chars columns by applying the template function that
* can be used for computing the output size of each string as well as create the output
*
* @throws cudf::logic_error if the output strings column exceeds the column size limit
* @throws std::overflow_error if the output strings column exceeds the column size limit
*
* @tparam SizeAndExecuteFunction Function must accept an index and return a size.
* It must also have members d_offsets and d_chars which are set to
Expand Down Expand Up @@ -78,7 +80,8 @@ auto make_strings_children(SizeAndExecuteFunction size_and_exec_fn,
auto const bytes =
cudf::detail::sizes_to_offsets(d_offsets, d_offsets + strings_count + 1, d_offsets, stream);
CUDF_EXPECTS(bytes <= static_cast<int64_t>(std::numeric_limits<size_type>::max()),
"Size of output exceeds column size limit");
"Size of output exceeds column size limit",
std::overflow_error);

// Now build the chars column
std::unique_ptr<column> chars_column =
Expand Down
4 changes: 3 additions & 1 deletion cpp/src/lists/sequences.cu
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@

#include <limits>
#include <optional>
#include <stdexcept>

namespace cudf::lists {
namespace detail {
Expand Down Expand Up @@ -169,7 +170,8 @@ std::unique_ptr<column> sequences(column_view const& starts,
auto const n_elements = cudf::detail::sizes_to_offsets(
sizes_input_it, sizes_input_it + n_lists + 1, offsets_begin, stream);
CUDF_EXPECTS(n_elements <= static_cast<int64_t>(std::numeric_limits<size_type>::max()),
"Size of output exceeds column size limit");
"Size of output exceeds column size limit",
std::overflow_error);

auto child = type_dispatcher(starts.type(),
sequences_dispatcher{},
Expand Down
7 changes: 5 additions & 2 deletions cpp/src/strings/regex/utilities.cuh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
* Copyright (c) 2022-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -28,6 +28,8 @@

#include <thrust/scan.h>

#include <stdexcept>

namespace cudf {
namespace strings {
namespace detail {
Expand Down Expand Up @@ -134,7 +136,8 @@ auto make_strings_children(SizeAndExecuteFunction size_and_exec_fn,
auto const char_bytes =
cudf::detail::sizes_to_offsets(d_offsets, d_offsets + strings_count + 1, d_offsets, stream);
CUDF_EXPECTS(char_bytes <= static_cast<int64_t>(std::numeric_limits<size_type>::max()),
"Size of output exceeds column size limit");
"Size of output exceeds column size limit",
std::overflow_error);

// Now build the chars column
std::unique_ptr<column> chars =
Expand Down
5 changes: 4 additions & 1 deletion cpp/src/text/ngrams_tokenize.cu
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@
#include <thrust/transform.h>
#include <thrust/transform_scan.h>

#include <stdexcept>

namespace nvtext {
namespace detail {
namespace {
Expand Down Expand Up @@ -220,7 +222,8 @@ std::unique_ptr<cudf::column> ngrams_tokenize(cudf::strings_column_view const& s
chars_offsets.begin(), chars_offsets.end(), chars_offsets.begin(), stream);
CUDF_EXPECTS(
output_chars_size <= static_cast<int64_t>(std::numeric_limits<cudf::size_type>::max()),
"Size of output exceeds column size limit");
"Size of output exceeds column size limit",
std::overflow_error);

// This will contain the size in bytes of each ngram to generate
rmm::device_uvector<cudf::size_type> ngram_sizes(total_ngrams, stream);
Expand Down
2 changes: 1 addition & 1 deletion cpp/tests/strings/repeat_strings_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -229,7 +229,7 @@ TEST_F(RepeatStringsTest, StringsColumnWithColumnRepeatTimesOverflowOutput)
auto const repeat_times =
int32s_col{half_max, half_max, half_max, half_max, half_max, half_max, half_max};

EXPECT_THROW(cudf::strings::repeat_strings(strs_cv, repeat_times), cudf::logic_error);
EXPECT_THROW(cudf::strings::repeat_strings(strs_cv, repeat_times), std::overflow_error);
}

TYPED_TEST(RepeatStringsTypedTest, StringsColumnNoNullWithScalarRepeatTimes)
Expand Down
10 changes: 8 additions & 2 deletions python/cudf/cudf/_lib/csv.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -533,8 +533,14 @@ def write_csv(
.build()
)

with nogil:
cpp_write_csv(options)
try:
with nogil:
cpp_write_csv(options)
except OverflowError:
raise OverflowError(
f"Writing CSV file with chunksize={rows_per_chunk} failed. "
"Consider providing a smaller chunksize argument."
)


cdef data_type _get_cudf_data_type_from_dtype(object dtype) except +:
Expand Down
10 changes: 8 additions & 2 deletions python/cudf/cudf/_lib/json.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -201,8 +201,14 @@ def write_json(
.build()
)

with nogil:
libcudf_write_json(options)
try:
with nogil:
libcudf_write_json(options)
except OverflowError:
raise OverflowError(
f"Writing JSON file with rows_per_chunk={rows_per_chunk} failed. "
"Consider providing a smaller rows_per_chunk argument."
)


cdef schema_element _get_cudf_schema_element_from_dtype(object dtype) except +:
Expand Down
5 changes: 4 additions & 1 deletion python/cudf/cudf/utils/ioutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1245,7 +1245,10 @@
Notes
-----
- Follows the standard of Pandas csv.QUOTE_NONNUMERIC for all output.
- If `to_csv` leads to memory errors consider setting the `chunksize` argument.
- The default behaviour is to write all rows of the dataframe at once.
This can lead to memory or overflow errors for large tables. If this
happens, consider setting the ``chunksize`` argument to some
reasonable fraction of the total rows in the dataframe.
Examples
--------
Expand Down

0 comments on commit 2969b24

Please sign in to comment.