Skip to content

Commit

Permalink
Remove extra total chars size calculation from cudf::concatenate (#14540
Browse files Browse the repository at this point in the history
)

Changes bounds check logic in `cudf::concatenate` to remove the upfront chars bytes calculation on strings columns. 
The calculation was thrown away and redone later (and more efficiently) during the call to `cudf::strings::detail::concatenate`. There does not seem to be a need to check this twice. A gtest was added to make sure the appropriate exception is thrown as expected.
Reference #14202

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Vukasin Milovanovic (https://github.com/vuule)
  - Karthikeyan (https://github.com/karthikeyann)
  - Nghia Truong (https://github.com/ttnghia)

URL: #14540
  • Loading branch information
davidwendt authored Dec 7, 2023
1 parent b02e82f commit 6a6cba5
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 39 deletions.
21 changes: 1 addition & 20 deletions cpp/src/copying/concatenate.cu
Original file line number Diff line number Diff line change
Expand Up @@ -403,26 +403,7 @@ void traverse_children::operator()<cudf::string_view>(host_span<column_view cons
// verify offsets
check_offsets_size(cols);

// chars
size_t const total_char_count = std::accumulate(
cols.begin(), cols.end(), std::size_t{}, [stream](size_t a, auto const& b) -> size_t {
strings_column_view scv(b);
return a + (scv.is_empty() ? 0
// if the column is unsliced, skip the offset retrieval.
: scv.offset() > 0
? cudf::detail::get_value<size_type>(
scv.offsets(), scv.offset() + scv.size(), stream) -
cudf::detail::get_value<size_type>(scv.offsets(), scv.offset(), stream)
// if the offset() is 0, it can still be sliced to a shorter length. in this case
// we only need to read a single offset. otherwise just return the full length
// (chars_size())
: scv.size() + 1 == scv.offsets().size()
? scv.chars_size()
: cudf::detail::get_value<size_type>(scv.offsets(), scv.size(), stream));
});
CUDF_EXPECTS(total_char_count <= static_cast<size_t>(std::numeric_limits<size_type>::max()),
"Total number of concatenated chars exceeds the column size limit",
std::overflow_error);
// chars -- checked in call to cudf::strings::detail::concatenate
}

template <>
Expand Down
34 changes: 15 additions & 19 deletions cpp/tests/copying/concatenate_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -193,26 +193,10 @@ TEST_F(StringColumnTest, ConcatenateColumnViewLarge)
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
}

TEST_F(StringColumnTest, ConcatenateTooManyColumns)
TEST_F(StringColumnTest, ConcatenateManyColumns)
{
std::vector<char const*> h_strings{"aaa",
"bb",
"",
"cccc",
"d",
"ééé",
"ff",
"gggg",
"",
"h",
"iiii",
"jjj",
"k",
"lllllll",
"mmmmm",
"n",
"oo",
"ppp"};
std::vector<char const*> h_strings{
"aaa", "bb", "", "cccc", "d", "ééé", "ff", "gggg", "", "h", "iiii", "jjj"};

std::vector<char const*> expected_strings;
std::vector<cudf::test::strings_column_wrapper> wrappers;
Expand All @@ -228,6 +212,18 @@ TEST_F(StringColumnTest, ConcatenateTooManyColumns)
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
}

TEST_F(StringColumnTest, ConcatenateTooLarge)
{
std::string big_str(1000000, 'a'); // 1 million bytes x 5 = 5 million bytes
cudf::test::strings_column_wrapper input{big_str, big_str, big_str, big_str, big_str};
std::vector<cudf::column_view> input_cols;
// 5 millions bytes x 500 = 2.5GB > std::numeric_limits<size_type>::max()
for (int i = 0; i < 500; ++i) {
input_cols.push_back(input);
}
EXPECT_THROW(cudf::concatenate(input_cols), std::overflow_error);
}

struct TableTest : public cudf::test::BaseFixture {};

TEST_F(TableTest, ConcatenateTables)
Expand Down

0 comments on commit 6a6cba5

Please sign in to comment.