Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Produce useful guidance on overflow error in to_csv #12705

Merged
merged 10 commits into from
Feb 17, 2023
11 changes: 7 additions & 4 deletions cpp/include/cudf/detail/sizes_to_offsets_iterator.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@
#include <thrust/distance.h>
#include <thrust/scan.h>

#include <stdexcept>

namespace cudf {
namespace detail {

Expand Down Expand Up @@ -242,7 +244,7 @@ static sizes_to_offsets_iterator<ScanIterator, LastType> make_sizes_to_offsets_i
* auto const bytes = cudf::detail::sizes_to_offsets(
* d_offsets, d_offsets + strings_count + 1, d_offsets, stream);
* CUDF_EXPECTS(bytes <= static_cast<int64_t>(std::numeric_limits<size_type>::max()),
* "Size of output exceeds column size limit");
* "Size of output exceeds column size limit", std::overflow_error);
* @endcode
*
* @tparam SizesIterator Iterator type for input of the scan using addition operation
Expand Down Expand Up @@ -282,8 +284,8 @@ auto sizes_to_offsets(SizesIterator begin,
* The return also includes the total number of elements -- the last element value from the
* scan.
*
* @throw cudf::logic_error if the total size of the scan (last element) greater than maximum value
* of `size_type`
 * @throw std::overflow_error if the total size of the scan (last element) is greater than the
 * maximum value of `size_type`
*
* @tparam InputIterator Used as input to scan to set the offset values
* @param begin The beginning of the input sequence
Expand Down Expand Up @@ -317,7 +319,8 @@ std::pair<std::unique_ptr<column>, size_type> make_offsets_child_column(
auto const total_elements = sizes_to_offsets(input_itr, input_itr + count + 1, d_offsets, stream);
CUDF_EXPECTS(
total_elements <= static_cast<decltype(total_elements)>(std::numeric_limits<size_type>::max()),
wence- marked this conversation as resolved.
Show resolved Hide resolved
"Size of output exceeds column size limit");
"Size of output exceeds column size limit",
std::overflow_error);

offsets_column->set_null_count(0);
return std::pair(std::move(offsets_column), static_cast<size_type>(total_elements));
Expand Down
4 changes: 3 additions & 1 deletion cpp/include/cudf/lists/filling.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021-2022, NVIDIA CORPORATION.
* Copyright (c) 2021-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -53,6 +53,7 @@ namespace cudf::lists {
 * @throws cudf::logic_error if @p sizes column is not of an integer type.
* @throws cudf::logic_error if any input column has nulls.
* @throws cudf::logic_error if @p starts and @p sizes columns do not have the same size.
* @throws std::overflow_error if the output column would exceed the column size limit.
*
* @param starts First values in the result sequences.
* @param sizes Numbers of values in the result sequences.
Expand Down Expand Up @@ -90,6 +91,7 @@ std::unique_ptr<column> sequences(
* @throws cudf::logic_error if any input column has nulls.
* @throws cudf::logic_error if @p starts and @p steps columns have different types.
* @throws cudf::logic_error if @p starts, @p steps, and @p sizes columns do not have the same size.
* @throws std::overflow_error if the output column would exceed the column size limit.
*
* @param starts First values in the result sequences.
* @param steps Increment values for the result sequences.
Expand Down
7 changes: 5 additions & 2 deletions cpp/include/cudf/strings/detail/strings_children.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@
#include <thrust/for_each.h>
#include <thrust/iterator/counting_iterator.h>

#include <stdexcept>

namespace cudf {
namespace strings {
namespace detail {
Expand All @@ -35,7 +37,7 @@ namespace detail {
* @brief Creates child offsets and chars columns by applying the template function that
* can be used for computing the output size of each string as well as create the output
*
* @throws cudf::logic_error if the output strings column exceeds the column size limit
* @throws std::overflow_error if the output strings column exceeds the column size limit
*
* @tparam SizeAndExecuteFunction Function must accept an index and return a size.
* It must also have members d_offsets and d_chars which are set to
Expand Down Expand Up @@ -78,7 +80,8 @@ auto make_strings_children(SizeAndExecuteFunction size_and_exec_fn,
auto const bytes =
cudf::detail::sizes_to_offsets(d_offsets, d_offsets + strings_count + 1, d_offsets, stream);
CUDF_EXPECTS(bytes <= static_cast<int64_t>(std::numeric_limits<size_type>::max()),
"Size of output exceeds column size limit");
"Size of output exceeds column size limit",
std::overflow_error);

// Now build the chars column
std::unique_ptr<column> chars_column =
Expand Down
4 changes: 3 additions & 1 deletion cpp/src/lists/sequences.cu
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@

#include <limits>
#include <optional>
#include <stdexcept>

namespace cudf::lists {
namespace detail {
Expand Down Expand Up @@ -169,7 +170,8 @@ std::unique_ptr<column> sequences(column_view const& starts,
auto const n_elements = cudf::detail::sizes_to_offsets(
sizes_input_it, sizes_input_it + n_lists + 1, offsets_begin, stream);
CUDF_EXPECTS(n_elements <= static_cast<int64_t>(std::numeric_limits<size_type>::max()),
"Size of output exceeds column size limit");
"Size of output exceeds column size limit",
std::overflow_error);

auto child = type_dispatcher(starts.type(),
sequences_dispatcher{},
Expand Down
7 changes: 5 additions & 2 deletions cpp/src/strings/regex/utilities.cuh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
* Copyright (c) 2022-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -28,6 +28,8 @@

#include <thrust/scan.h>

#include <stdexcept>

namespace cudf {
namespace strings {
namespace detail {
Expand Down Expand Up @@ -134,7 +136,8 @@ auto make_strings_children(SizeAndExecuteFunction size_and_exec_fn,
auto const char_bytes =
cudf::detail::sizes_to_offsets(d_offsets, d_offsets + strings_count + 1, d_offsets, stream);
CUDF_EXPECTS(char_bytes <= static_cast<int64_t>(std::numeric_limits<size_type>::max()),
"Size of output exceeds column size limit");
"Size of output exceeds column size limit",
std::overflow_error);

// Now build the chars column
std::unique_ptr<column> chars =
Expand Down
5 changes: 4 additions & 1 deletion cpp/src/text/ngrams_tokenize.cu
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@
#include <thrust/transform.h>
#include <thrust/transform_scan.h>

#include <stdexcept>

namespace nvtext {
namespace detail {
namespace {
Expand Down Expand Up @@ -220,7 +222,8 @@ std::unique_ptr<cudf::column> ngrams_tokenize(cudf::strings_column_view const& s
chars_offsets.begin(), chars_offsets.end(), chars_offsets.begin(), stream);
CUDF_EXPECTS(
output_chars_size <= static_cast<int64_t>(std::numeric_limits<cudf::size_type>::max()),
"Size of output exceeds column size limit");
"Size of output exceeds column size limit",
std::overflow_error);

// This will contain the size in bytes of each ngram to generate
rmm::device_uvector<cudf::size_type> ngram_sizes(total_ngrams, stream);
Expand Down
10 changes: 8 additions & 2 deletions python/cudf/cudf/_lib/csv.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -533,8 +533,14 @@ def write_csv(
.build()
)

with nogil:
cpp_write_csv(options)
try:
with nogil:
cpp_write_csv(options)
except OverflowError:
raise OverflowError(
f"Writing CSV file with chunksize={rows_per_chunk} failed. "
"Consider providing a smaller chunksize argument."
)
karthikeyann marked this conversation as resolved.
Show resolved Hide resolved


cdef data_type _get_cudf_data_type_from_dtype(object dtype) except +:
Expand Down
10 changes: 8 additions & 2 deletions python/cudf/cudf/_lib/json.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -201,8 +201,14 @@ def write_json(
.build()
)

with nogil:
libcudf_write_json(options)
try:
with nogil:
libcudf_write_json(options)
except OverflowError:
raise OverflowError(
f"Writing JSON file with rows_per_chunk={rows_per_chunk} failed. "
"Consider providing a smaller rows_per_chunk argument."
)


cdef schema_element _get_cudf_schema_element_from_dtype(object dtype) except +:
Expand Down
5 changes: 4 additions & 1 deletion python/cudf/cudf/utils/ioutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1245,7 +1245,10 @@
Notes
-----
- Follows the standard of Pandas csv.QUOTE_NONNUMERIC for all output.
- If `to_csv` leads to memory errors consider setting the `chunksize` argument.
- The default behaviour is to write all rows of the dataframe at once.
This can lead to memory or overflow errors for large tables. If this
happens, consider setting the ``chunksize`` argument to some
reasonable fraction of the total rows in the dataframe.

Examples
--------
Expand Down