Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Large strings support in cudf::concatenate #15195

Merged
merged 33 commits into from
Apr 4, 2024
Merged
Show file tree
Hide file tree
Changes from 29 commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
3ab5b7e
Large strings support in cudf::concatenate
davidwendt Feb 29, 2024
eb76f5a
update test logic
davidwendt Feb 29, 2024
524c7fa
Merge branch 'branch-24.04' into concat-large-strings
davidwendt Feb 29, 2024
4fe659b
Merge branch 'branch-24.04' into concat-large-strings
davidwendt Feb 29, 2024
48ed48b
improve gtest
davidwendt Mar 1, 2024
ad64a11
Merge branch 'branch-24.04' into concat-large-strings
davidwendt Mar 1, 2024
c78f21a
Merge branch 'branch-24.04' into concat-large-strings
davidwendt Mar 1, 2024
c5549ea
fix style violation
davidwendt Mar 1, 2024
3469d5e
add LIBCUDF_LARGE_STRINGS_ENABLED env var
davidwendt Mar 2, 2024
8db694d
Merge branch 'branch-24.04' into concat-large-strings
davidwendt Mar 2, 2024
20cae7b
Merge branch 'branch-24.04' into concat-large-strings
davidwendt Mar 4, 2024
3806fe8
fix merge conflict
davidwendt Mar 5, 2024
a2ef2ac
Merge branch 'branch-24.04' into concat-large-strings
davidwendt Mar 6, 2024
6224d92
Merge branch 'branch-24.04' into concat-large-strings
davidwendt Mar 6, 2024
c2fe121
Merge branch 'branch-24.04' into concat-large-strings
davidwendt Mar 6, 2024
1ff9665
fix merge conflict
davidwendt Mar 8, 2024
12f0aad
Merge branch 'branch-24.04' into concat-large-strings
davidwendt Mar 11, 2024
3c0ccf6
Merge branch 'branch-24.04' into concat-large-strings
davidwendt Mar 12, 2024
3c16b83
Merge branch 'branch-24.04' into concat-large-strings
davidwendt Mar 13, 2024
51ccd60
Merge branch 'branch-24.04' into concat-large-strings
davidwendt Mar 13, 2024
0a03987
add large_strings_enabler test utility class
davidwendt Mar 13, 2024
22cb4cd
Merge branch 'branch-24.04' into concat-large-strings
davidwendt Mar 13, 2024
2189ce2
Merge branch 'branch-24.04' into concat-large-strings
davidwendt Mar 14, 2024
d8b4b7a
remove unneeded changes and unneeded large(slow) test
davidwendt Mar 14, 2024
f02fa8e
Merge branch 'branch-24.06' into concat-large-strings
davidwendt Mar 18, 2024
12f6ead
Merge branch 'branch-24.06' into concat-large-strings
davidwendt Mar 19, 2024
295e6f8
Merge branch 'branch-24.06' into concat-large-strings
davidwendt Mar 20, 2024
19effd3
fix doxygen comments
davidwendt Mar 20, 2024
d373083
Merge branch 'branch-24.06' into concat-large-strings
davidwendt Mar 20, 2024
55bbccf
Merge branch 'branch-24.06' into concat-large-strings
davidwendt Mar 22, 2024
f868b32
Merge branch 'branch-24.06' into concat-large-strings
davidwendt Mar 29, 2024
97e6608
Merge branch 'branch-24.06' into concat-large-strings
davidwendt Apr 3, 2024
54df945
populate splits in for-loop in gtest
davidwendt Apr 3, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions cpp/include/cudf/strings/detail/utilities.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,23 @@ namespace cudf {
namespace strings {
namespace detail {

/**
* @brief Create an offsets column to be a child of a strings column
*
* This will return the properly typed column to be filled in by the caller
* given the number of bytes to address.
*
* @param chars_bytes Number of bytes for the chars in the strings column
* @param count Number of elements for the offsets column
davidwendt marked this conversation as resolved.
Show resolved Hide resolved
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
* @return The offsets child column for a strings column
*/
std::unique_ptr<column> create_offsets_child_column(int64_t chars_bytes,
size_type count,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr);

/**
* @brief Creates a string_view vector from a strings column.
*
Expand All @@ -52,6 +69,15 @@ rmm::device_uvector<string_view> create_string_vector_from_column(
*/
int64_t get_offset64_threshold();

/**
* @brief Checks if large strings is enabled
*
* This checks the setting in the environment variable LIBCUDF_LARGE_STRINGS_ENABLED.
*
* @return true if large strings are supported
*/
bool is_large_strings_enabled();

/**
* @brief Return a normalized offset value from a strings offsets column
*
Expand Down
25 changes: 25 additions & 0 deletions cpp/include/cudf_test/column_utilities.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,29 @@ template <>
std::pair<thrust::host_vector<std::string>, std::vector<bitmask_type>> to_host(column_view c);
//! @endcond

/**
* @brief For enabling large strings testing in specific tests
*/
struct large_strings_enabler {
/**
* @brief Create large strings enable object
*
* @param default_enable Default enables large strings support
*/
large_strings_enabler(bool default_enable = true);
~large_strings_enabler();

/**
* @brief Enable large strings support
*/
void enable();

/**
* @brief Disable large strings support
*/
void disable();
};

} // namespace cudf::test

// Macros for showing line of failure.
Expand Down Expand Up @@ -242,3 +265,5 @@ std::pair<thrust::host_vector<std::string>, std::vector<bitmask_type>> to_host(c
SCOPED_TRACE(" <-- line of failure\n"); \
cudf::test::detail::expect_equal_buffers(lhs, rhs, size_bytes); \
} while (0)

#define CUDF_TEST_ENABLE_LARGE_STRINGS() cudf::test::large_strings_enabler ls___
6 changes: 1 addition & 5 deletions cpp/src/strings/copying/concatenate.cu
Original file line number Diff line number Diff line change
Expand Up @@ -220,9 +220,6 @@ std::unique_ptr<column> concatenate(host_span<column_view const> columns,
CUDF_EXPECTS(offsets_count <= static_cast<std::size_t>(std::numeric_limits<size_type>::max()),
"total number of strings exceeds the column size limit",
std::overflow_error);
CUDF_EXPECTS(total_bytes <= static_cast<std::size_t>(std::numeric_limits<size_type>::max()),
"total size of strings exceeds the column size limit",
std::overflow_error);

bool const has_nulls =
std::any_of(columns.begin(), columns.end(), [](auto const& col) { return col.has_nulls(); });
Expand All @@ -232,8 +229,7 @@ std::unique_ptr<column> concatenate(host_span<column_view const> columns,
auto d_new_chars = output_chars.data();

// create output offsets column
auto offsets_column = make_numeric_column(
data_type{type_id::INT32}, offsets_count, mask_state::UNALLOCATED, stream, mr);
auto offsets_column = create_offsets_child_column(total_bytes, offsets_count, stream, mr);
auto itr_new_offsets =
cudf::detail::offsetalator_factory::make_output_iterator(offsets_column->mutable_view());

Expand Down
32 changes: 30 additions & 2 deletions cpp/src/strings/utilities.cu
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#include <cudf/detail/get_value.cuh>
#include <cudf/strings/detail/char_tables.hpp>
#include <cudf/strings/detail/utilities.cuh>
#include <cudf/strings/detail/utilities.hpp>
#include <cudf/utilities/error.hpp>

#include <rmm/cuda_stream_view.hpp>
Expand All @@ -31,6 +32,9 @@
#include <thrust/iterator/counting_iterator.h>
#include <thrust/transform.h>

#include <cstdlib>
#include <string>

namespace cudf {
namespace strings {
namespace detail {
Expand Down Expand Up @@ -65,6 +69,24 @@ rmm::device_uvector<string_view> create_string_vector_from_column(
return strings_vector;
}

std::unique_ptr<column> create_offsets_child_column(int64_t chars_bytes,
davidwendt marked this conversation as resolved.
Show resolved Hide resolved
size_type count,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
auto const threshold = get_offset64_threshold();
if (!is_large_strings_enabled()) {
CUDF_EXPECTS(
chars_bytes < threshold, "Size of output exceeds the column size limit", std::overflow_error);
}
return make_numeric_column(
chars_bytes < threshold ? data_type{type_id::INT32} : data_type{type_id::INT64},
count,
mask_state::UNALLOCATED,
stream,
mr);
}

namespace {
// The device variables are created here to avoid using a singleton that may cause issues
// with RMM initialize/finalize. See PR #3159 for details on this approach.
Expand Down Expand Up @@ -123,13 +145,19 @@ special_case_mapping const* get_special_case_mapping_table()

int64_t get_offset64_threshold()
{
auto const threshold = std::getenv("LIBCUDF_LARGE_STRINGS_THRESHOLD");
std::size_t const rtn = threshold != nullptr ? std::atol(threshold) : 0;
auto const threshold = std::getenv("LIBCUDF_LARGE_STRINGS_THRESHOLD");
int64_t const rtn = threshold != nullptr ? std::atol(threshold) : 0L;
return (rtn > 0 && rtn < std::numeric_limits<int32_t>::max())
? rtn
: std::numeric_limits<int32_t>::max();
}

bool is_large_strings_enabled()
{
auto const env = std::getenv("LIBCUDF_LARGE_STRINGS_ENABLED");
return env != nullptr && std::string(env) == "1";
}

int64_t get_offset_value(cudf::column_view const& offsets,
size_type index,
rmm::cuda_stream_view stream)
Expand Down
80 changes: 49 additions & 31 deletions cpp/tests/copying/concatenate_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@
#include <cudf/table/table.hpp>
#include <cudf/utilities/default_stream.hpp>

#include <thrust/iterator/constant_iterator.h>

#include <numeric>
#include <stdexcept>
#include <string>
Expand Down Expand Up @@ -164,37 +166,6 @@ TEST_F(StringColumnTest, ConcatenateColumnView)
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
}

TEST_F(StringColumnTest, ConcatenateColumnViewLarge)
{
// Test large concatenate, causes out of bound device memory errors if kernel
// indexing is not int64_t.
// 1.5GB bytes, 5k columns
constexpr size_t num_strings = 10000;
constexpr size_t string_length = 150000;
constexpr size_t strings_per_column = 2;
constexpr size_t num_columns = num_strings / strings_per_column;

std::vector<std::string> strings;
std::vector<char const*> h_strings;
std::vector<cudf::test::strings_column_wrapper> strings_column_wrappers;
std::vector<cudf::column_view> strings_columns;

std::string s(string_length, 'a');
for (size_t i = 0; i < num_strings; ++i)
h_strings.push_back(s.data());

for (size_t i = 0; i < num_columns; ++i)
strings_column_wrappers.push_back(cudf::test::strings_column_wrapper(
h_strings.data() + i * strings_per_column, h_strings.data() + (i + 1) * strings_per_column));
for (auto& wrapper : strings_column_wrappers)
strings_columns.push_back(wrapper);

auto results = cudf::concatenate(strings_columns);

cudf::test::strings_column_wrapper expected(h_strings.begin(), h_strings.end());
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
}

TEST_F(StringColumnTest, ConcatenateManyColumns)
{
std::vector<char const*> h_strings{
Expand Down Expand Up @@ -226,6 +197,53 @@ TEST_F(StringColumnTest, ConcatenateTooLarge)
EXPECT_THROW(cudf::concatenate(input_cols), std::overflow_error);
}

TEST_F(StringColumnTest, ConcatenateLargeStrings)
{
CUDF_TEST_ENABLE_LARGE_STRINGS();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What's the long term plan for CUDF_TEST_ENABLE_LARGE_STRINGS? Will we need this forever or is it only until we turn on long strings by default?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This will be temporary until long strings are turned on by default.
It will be useful until then in case specific tests require them.
Also, a future PR will introduce a new test fixture where this feature is turned on by default.
And then this test may be moved to use that fixture at that time.

auto itr = thrust::constant_iterator<std::string_view>(
"abcdefghijklmnopqrstuvwxyABCDEFGHIJKLMNOPQRSTUVWXY"); // 50 bytes
auto input = cudf::test::strings_column_wrapper(itr, itr + 5'000'000); // 250MB
std::vector<cudf::column_view> input_cols;
for (int i = 0; i < 10; ++i) { // 2500MB > 2GB
input_cols.push_back(input);
}
auto result = cudf::concatenate(input_cols);
auto sv = cudf::strings_column_view(result->view());
EXPECT_EQ(sv.size(), 50'000'000);
EXPECT_EQ(sv.offsets().type(), cudf::data_type{cudf::type_id::INT64});

// verify results in sections
auto splits = std::vector<cudf::size_type>({5'000'000,
davidwendt marked this conversation as resolved.
Show resolved Hide resolved
10'000'000,
15'000'000,
20'000'000,
25'000'000,
30'000'000,
35'000'000,
40'000'000,
45'000'000});
auto sliced = cudf::split(result->view(), splits);
for (auto c : sliced) {
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(c, input);
}

// also test with large strings column as input
{
input_cols.clear();
input_cols.push_back(input); // regular column
input_cols.push_back(result->view()); // large column
result = cudf::concatenate(input_cols);
sv = cudf::strings_column_view(result->view());
EXPECT_EQ(sv.size(), 55'000'000);
EXPECT_EQ(sv.offsets().type(), cudf::data_type{cudf::type_id::INT64});
splits.push_back(50'000'000);
sliced = cudf::split(result->view(), splits);
for (auto c : sliced) {
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(c, input);
}
}
}

struct TableTest : public cudf::test::BaseFixture {};

TEST_F(TableTest, ConcatenateTables)
Expand Down
11 changes: 11 additions & 0 deletions cpp/tests/utilities/column_utilities.cu
Original file line number Diff line number Diff line change
Expand Up @@ -1011,5 +1011,16 @@ std::pair<thrust::host_vector<std::string>, std::vector<bitmask_type>> to_host(c
return {std::move(host_data), bitmask_to_host(c)};
}

large_strings_enabler::large_strings_enabler(bool default_enable)
{
default_enable ? enable() : disable();
}

large_strings_enabler::~large_strings_enabler() { disable(); }

void large_strings_enabler::enable() { setenv("LIBCUDF_LARGE_STRINGS_ENABLED", "1", 1); }

void large_strings_enabler::disable() { setenv("LIBCUDF_LARGE_STRINGS_ENABLED", "0", 1); }

} // namespace test
} // namespace cudf
Loading