From 2969b241c0654a11d1a61e29664bcaecd7bc4a15 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Fri, 17 Feb 2023 13:23:15 +0000 Subject: [PATCH] Produce useful guidance on overflow error in `to_csv` (#12705) Since writing to CSV files is implemented by converting all columns in a dataframe to strings, and then concatenating those columns, when we attempt to write a large dataframe to CSV without specifying a chunk size, we can easily overflow the maximum column size. Currently the error message is rather inscrutable: it only says that the requested size of a string column exceeds the column size limit. To help the user, catch this error and provide a useful error message that points them towards setting the `chunksize` argument. So that we don't produce false positive advice, tighten the scope by only catching `OverflowError`. To do this, make partial progress towards resolving #10200 by throwing `std::overflow_error` when checking for overflow of string column lengths. Closes #12690. Authors: - Lawrence Mitchell (https://github.com/wence-) - Karthikeyan (https://github.com/karthikeyann) Approvers: - David Wendt (https://github.com/davidwendt) - Ashwin Srinath (https://github.com/shwina) - Nghia Truong (https://github.com/ttnghia) - Karthikeyan (https://github.com/karthikeyann) URL: https://github.com/rapidsai/cudf/pull/12705 --- cpp/include/cudf/detail/sizes_to_offsets_iterator.cuh | 11 +++++++---- cpp/include/cudf/lists/filling.hpp | 4 +++- cpp/include/cudf/strings/detail/strings_children.cuh | 7 +++++-- cpp/src/lists/sequences.cu | 4 +++- cpp/src/strings/regex/utilities.cuh | 7 +++++-- cpp/src/text/ngrams_tokenize.cu | 5 ++++- cpp/tests/strings/repeat_strings_tests.cpp | 2 +- python/cudf/cudf/_lib/csv.pyx | 10 ++++++++-- python/cudf/cudf/_lib/json.pyx | 10 ++++++++-- python/cudf/cudf/utils/ioutils.py | 5 ++++- 10 files changed, 48 insertions(+), 17 deletions(-) diff --git a/cpp/include/cudf/detail/sizes_to_offsets_iterator.cuh 
b/cpp/include/cudf/detail/sizes_to_offsets_iterator.cuh index 013e74ff18c..eefc5718617 100644 --- a/cpp/include/cudf/detail/sizes_to_offsets_iterator.cuh +++ b/cpp/include/cudf/detail/sizes_to_offsets_iterator.cuh @@ -27,6 +27,8 @@ #include #include +#include + namespace cudf { namespace detail { @@ -242,7 +244,7 @@ static sizes_to_offsets_iterator make_sizes_to_offsets_i * auto const bytes = cudf::detail::sizes_to_offsets( * d_offsets, d_offsets + strings_count + 1, d_offsets, stream); * CUDF_EXPECTS(bytes <= static_cast(std::numeric_limits::max()), - * "Size of output exceeds column size limit"); + * "Size of output exceeds column size limit", std::overflow_error); * @endcode * * @tparam SizesIterator Iterator type for input of the scan using addition operation @@ -282,8 +284,8 @@ auto sizes_to_offsets(SizesIterator begin, * The return also includes the total number of elements -- the last element value from the * scan. * - * @throw cudf::logic_error if the total size of the scan (last element) greater than maximum value - * of `size_type` + * @throw std::overflow_error if the total size of the scan (last element) is greater than maximum + * value of `size_type` * * @tparam InputIterator Used as input to scan to set the offset values * @param begin The beginning of the input sequence @@ -317,7 +319,8 @@ std::pair, size_type> make_offsets_child_column( auto const total_elements = sizes_to_offsets(input_itr, input_itr + count + 1, d_offsets, stream); CUDF_EXPECTS( total_elements <= static_cast(std::numeric_limits::max()), - "Size of output exceeds column size limit"); + "Size of output exceeds column size limit", + std::overflow_error); offsets_column->set_null_count(0); return std::pair(std::move(offsets_column), static_cast(total_elements)); diff --git a/cpp/include/cudf/lists/filling.hpp b/cpp/include/cudf/lists/filling.hpp index 4a071fc467f..059ed5ffd33 100644 --- a/cpp/include/cudf/lists/filling.hpp +++ b/cpp/include/cudf/lists/filling.hpp @@ -1,5 +1,5 @@ /* - * 
Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -53,6 +53,7 @@ namespace cudf::lists { * @throws cudf::logic_error if @p sizes column is not of integer types. * @throws cudf::logic_error if any input column has nulls. * @throws cudf::logic_error if @p starts and @p sizes columns do not have the same size. + * @throws std::overflow_error if the output column would exceed the column size limit. * * @param starts First values in the result sequences. * @param sizes Numbers of values in the result sequences. @@ -90,6 +91,7 @@ std::unique_ptr sequences( * @throws cudf::logic_error if any input column has nulls. * @throws cudf::logic_error if @p starts and @p steps columns have different types. * @throws cudf::logic_error if @p starts, @p steps, and @p sizes columns do not have the same size. + * @throws std::overflow_error if the output column would exceed the column size limit. * * @param starts First values in the result sequences. * @param steps Increment values for the result sequences. 
diff --git a/cpp/include/cudf/strings/detail/strings_children.cuh b/cpp/include/cudf/strings/detail/strings_children.cuh index 7f57984e88d..09e0f3bb079 100644 --- a/cpp/include/cudf/strings/detail/strings_children.cuh +++ b/cpp/include/cudf/strings/detail/strings_children.cuh @@ -27,6 +27,8 @@ #include #include +#include + namespace cudf { namespace strings { namespace detail { @@ -35,7 +37,7 @@ namespace detail { * @brief Creates child offsets and chars columns by applying the template function that * can be used for computing the output size of each string as well as create the output * - * @throws cudf::logic_error if the output strings column exceeds the column size limit + * @throws std::overflow_error if the output strings column exceeds the column size limit * * @tparam SizeAndExecuteFunction Function must accept an index and return a size. * It must also have members d_offsets and d_chars which are set to @@ -78,7 +80,8 @@ auto make_strings_children(SizeAndExecuteFunction size_and_exec_fn, auto const bytes = cudf::detail::sizes_to_offsets(d_offsets, d_offsets + strings_count + 1, d_offsets, stream); CUDF_EXPECTS(bytes <= static_cast(std::numeric_limits::max()), - "Size of output exceeds column size limit"); + "Size of output exceeds column size limit", + std::overflow_error); // Now build the chars column std::unique_ptr chars_column = diff --git a/cpp/src/lists/sequences.cu b/cpp/src/lists/sequences.cu index b3db8e6090b..ecdf81b9158 100644 --- a/cpp/src/lists/sequences.cu +++ b/cpp/src/lists/sequences.cu @@ -34,6 +34,7 @@ #include #include +#include namespace cudf::lists { namespace detail { @@ -169,7 +170,8 @@ std::unique_ptr sequences(column_view const& starts, auto const n_elements = cudf::detail::sizes_to_offsets( sizes_input_it, sizes_input_it + n_lists + 1, offsets_begin, stream); CUDF_EXPECTS(n_elements <= static_cast(std::numeric_limits::max()), - "Size of output exceeds column size limit"); + "Size of output exceeds column size limit", + 
std::overflow_error); auto child = type_dispatcher(starts.type(), sequences_dispatcher{}, diff --git a/cpp/src/strings/regex/utilities.cuh b/cpp/src/strings/regex/utilities.cuh index bc8e85bf99a..6bbd79166a8 100644 --- a/cpp/src/strings/regex/utilities.cuh +++ b/cpp/src/strings/regex/utilities.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -28,6 +28,8 @@ #include +#include + namespace cudf { namespace strings { namespace detail { @@ -134,7 +136,8 @@ auto make_strings_children(SizeAndExecuteFunction size_and_exec_fn, auto const char_bytes = cudf::detail::sizes_to_offsets(d_offsets, d_offsets + strings_count + 1, d_offsets, stream); CUDF_EXPECTS(char_bytes <= static_cast(std::numeric_limits::max()), - "Size of output exceeds column size limit"); + "Size of output exceeds column size limit", + std::overflow_error); // Now build the chars column std::unique_ptr chars = diff --git a/cpp/src/text/ngrams_tokenize.cu b/cpp/src/text/ngrams_tokenize.cu index cf911f13a37..93757fa37e4 100644 --- a/cpp/src/text/ngrams_tokenize.cu +++ b/cpp/src/text/ngrams_tokenize.cu @@ -39,6 +39,8 @@ #include #include +#include + namespace nvtext { namespace detail { namespace { @@ -220,7 +222,8 @@ std::unique_ptr ngrams_tokenize(cudf::strings_column_view const& s chars_offsets.begin(), chars_offsets.end(), chars_offsets.begin(), stream); CUDF_EXPECTS( output_chars_size <= static_cast(std::numeric_limits::max()), - "Size of output exceeds column size limit"); + "Size of output exceeds column size limit", + std::overflow_error); // This will contain the size in bytes of each ngram to generate rmm::device_uvector ngram_sizes(total_ngrams, stream); diff --git a/cpp/tests/strings/repeat_strings_tests.cpp b/cpp/tests/strings/repeat_strings_tests.cpp index e75409d9f39..73009d167e8 100644 --- 
a/cpp/tests/strings/repeat_strings_tests.cpp +++ b/cpp/tests/strings/repeat_strings_tests.cpp @@ -229,7 +229,7 @@ TEST_F(RepeatStringsTest, StringsColumnWithColumnRepeatTimesOverflowOutput) auto const repeat_times = int32s_col{half_max, half_max, half_max, half_max, half_max, half_max, half_max}; - EXPECT_THROW(cudf::strings::repeat_strings(strs_cv, repeat_times), cudf::logic_error); + EXPECT_THROW(cudf::strings::repeat_strings(strs_cv, repeat_times), std::overflow_error); } TYPED_TEST(RepeatStringsTypedTest, StringsColumnNoNullWithScalarRepeatTimes) diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx index eb6683aed31..09de1f1724e 100644 --- a/python/cudf/cudf/_lib/csv.pyx +++ b/python/cudf/cudf/_lib/csv.pyx @@ -533,8 +533,14 @@ def write_csv( .build() ) - with nogil: - cpp_write_csv(options) + try: + with nogil: + cpp_write_csv(options) + except OverflowError: + raise OverflowError( + f"Writing CSV file with chunksize={rows_per_chunk} failed. " + "Consider providing a smaller chunksize argument." + ) cdef data_type _get_cudf_data_type_from_dtype(object dtype) except +: diff --git a/python/cudf/cudf/_lib/json.pyx b/python/cudf/cudf/_lib/json.pyx index 2339b874ea0..21752062201 100644 --- a/python/cudf/cudf/_lib/json.pyx +++ b/python/cudf/cudf/_lib/json.pyx @@ -201,8 +201,14 @@ def write_json( .build() ) - with nogil: - libcudf_write_json(options) + try: + with nogil: + libcudf_write_json(options) + except OverflowError: + raise OverflowError( + f"Writing JSON file with rows_per_chunk={rows_per_chunk} failed. " + "Consider providing a smaller rows_per_chunk argument." 
+ ) cdef schema_element _get_cudf_schema_element_from_dtype(object dtype) except +: diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index 56e2e539e01..924cc62fb15 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -1245,7 +1245,10 @@ Notes ----- - Follows the standard of Pandas csv.QUOTE_NONNUMERIC for all output. -- If `to_csv` leads to memory errors consider setting the `chunksize` argument. +- The default behaviour is to write all rows of the dataframe at once. + This can lead to memory or overflow errors for large tables. If this + happens, consider setting the ``chunksize`` argument to some + reasonable fraction of the total rows in the dataframe. Examples --------