From 6a6cba5066fd7af03e77ddfdf3d24f24cc5c6957 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Thu, 7 Dec 2023 13:10:38 -0500 Subject: [PATCH] Remove extra total chars size calculation from cudf::concatenate (#14540) Changes bounds check logic in `cudf::concatenate` to remove the upfront chars bytes calculation on strings columns. The calculation was thrown away and redone later (and more efficiently) during the call to `cudf::strings::detail::concatenate`. There does not seem to be a need to check this twice. A gtest was added to make sure the appropriate exception is thrown as expected. Reference #14202 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Karthikeyan (https://github.com/karthikeyann) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/14540 --- cpp/src/copying/concatenate.cu | 21 +-------------- cpp/tests/copying/concatenate_tests.cpp | 34 +++++++++++-------------- 2 files changed, 16 insertions(+), 39 deletions(-) diff --git a/cpp/src/copying/concatenate.cu b/cpp/src/copying/concatenate.cu index 9b9e780965a..ddf39e21685 100644 --- a/cpp/src/copying/concatenate.cu +++ b/cpp/src/copying/concatenate.cu @@ -403,26 +403,7 @@ void traverse_children::operator()(host_span size_t { - strings_column_view scv(b); - return a + (scv.is_empty() ? 0 - // if the column is unsliced, skip the offset retrieval. - : scv.offset() > 0 - ? cudf::detail::get_value( - scv.offsets(), scv.offset() + scv.size(), stream) - - cudf::detail::get_value(scv.offsets(), scv.offset(), stream) - // if the offset() is 0, it can still be sliced to a shorter length. in this case - // we only need to read a single offset. otherwise just return the full length - // (chars_size()) - : scv.size() + 1 == scv.offsets().size() - ? scv.chars_size() - : cudf::detail::get_value(scv.offsets(), scv.size(), stream)); - }); - CUDF_EXPECTS(total_char_count <= static_cast(std::numeric_limits::max()), - "Total number of concatenated chars exceeds the column size limit", - std::overflow_error); + // chars -- checked in call to cudf::strings::detail::concatenate } template <> diff --git a/cpp/tests/copying/concatenate_tests.cpp b/cpp/tests/copying/concatenate_tests.cpp index c81f1772d10..b8faa0bd081 100644 --- a/cpp/tests/copying/concatenate_tests.cpp +++ b/cpp/tests/copying/concatenate_tests.cpp @@ -193,26 +193,10 @@ TEST_F(StringColumnTest, ConcatenateColumnViewLarge) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } -TEST_F(StringColumnTest, ConcatenateTooManyColumns) +TEST_F(StringColumnTest, ConcatenateManyColumns) { - std::vector h_strings{"aaa", - "bb", - "", - "cccc", - "d", - "ééé", - "ff", - "gggg", - "", - "h", - "iiii", - "jjj", - "k", - "lllllll", - "mmmmm", - "n", - "oo", - "ppp"}; + std::vector h_strings{ + "aaa", "bb", "", "cccc", "d", "ééé", "ff", "gggg", "", "h", "iiii", "jjj"}; std::vector expected_strings; std::vector wrappers; @@ -228,6 +212,18 @@ TEST_F(StringColumnTest, ConcatenateTooManyColumns) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } +TEST_F(StringColumnTest, ConcatenateTooLarge) +{ + std::string big_str(1000000, 'a'); // 1 million bytes x 5 = 5 million bytes + cudf::test::strings_column_wrapper input{big_str, big_str, big_str, big_str, big_str}; + std::vector input_cols; + // 5 millions bytes x 500 = 2.5GB > std::numeric_limits::max() + for (int i = 0; i < 500; ++i) { + input_cols.push_back(input); + } + EXPECT_THROW(cudf::concatenate(input_cols), std::overflow_error); +} + struct TableTest : public cudf::test::BaseFixture {}; TEST_F(TableTest, ConcatenateTables)