Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove extra total chars size calculation from cudf::concatenate #14540

Merged
merged 7 commits into from
Dec 7, 2023
21 changes: 1 addition & 20 deletions cpp/src/copying/concatenate.cu
Original file line number Diff line number Diff line change
Expand Up @@ -403,26 +403,7 @@ void traverse_children::operator()<cudf::string_view>(host_span<column_view cons
// verify offsets
check_offsets_size(cols);

// chars
size_t const total_char_count = std::accumulate(
cols.begin(), cols.end(), std::size_t{}, [stream](size_t a, auto const& b) -> size_t {
strings_column_view scv(b);
return a + (scv.is_empty() ? 0
// if the column is unsliced, skip the offset retrieval.
: scv.offset() > 0
? cudf::detail::get_value<size_type>(
scv.offsets(), scv.offset() + scv.size(), stream) -
cudf::detail::get_value<size_type>(scv.offsets(), scv.offset(), stream)
// if the offset() is 0, it can still be sliced to a shorter length. in this case
// we only need to read a single offset. otherwise just return the full length
// (chars_size())
: scv.size() + 1 == scv.offsets().size()
? scv.chars_size()
: cudf::detail::get_value<size_type>(scv.offsets(), scv.size(), stream));
});
CUDF_EXPECTS(total_char_count <= static_cast<size_t>(std::numeric_limits<size_type>::max()),
"Total number of concatenated chars exceeds the column size limit",
std::overflow_error);
// chars -- checked in call to cudf::strings::detail::concatenate
}

template <>
Expand Down
34 changes: 15 additions & 19 deletions cpp/tests/copying/concatenate_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -193,26 +193,10 @@ TEST_F(StringColumnTest, ConcatenateColumnViewLarge)
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
}

TEST_F(StringColumnTest, ConcatenateTooManyColumns)
TEST_F(StringColumnTest, ConcatenateManyColumns)
{
std::vector<char const*> h_strings{"aaa",
"bb",
"",
"cccc",
"d",
"ééé",
"ff",
"gggg",
"",
"h",
"iiii",
"jjj",
"k",
"lllllll",
"mmmmm",
"n",
"oo",
"ppp"};
std::vector<char const*> h_strings{
"aaa", "bb", "", "cccc", "d", "ééé", "ff", "gggg", "", "h", "iiii", "jjj"};

std::vector<char const*> expected_strings;
std::vector<cudf::test::strings_column_wrapper> wrappers;
Expand All @@ -228,6 +212,18 @@ TEST_F(StringColumnTest, ConcatenateTooManyColumns)
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
}

TEST_F(StringColumnTest, ConcatenateTooLarge)
{
std::string big_str(1000000, 'a'); // 1 million bytes x 5 = 5 million bytes
cudf::test::strings_column_wrapper input{big_str, big_str, big_str, big_str, big_str};
std::vector<cudf::column_view> input_cols;
// 5 millions bytes x 500 = 2.5GB > std::numeric_limits<size_type>::max()
for (int i = 0; i < 500; ++i) {
davidwendt marked this conversation as resolved.
Show resolved Hide resolved
input_cols.push_back(input);
}
EXPECT_THROW(cudf::concatenate(input_cols), std::overflow_error);
}

struct TableTest : public cudf::test::BaseFixture {};

TEST_F(TableTest, ConcatenateTables)
Expand Down
Loading