From 3ab5b7e93580e4f043ad4aad0635c743362ece78 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 29 Feb 2024 13:53:06 -0500 Subject: [PATCH 1/9] Large strings support in cudf::concatenate --- cpp/include/cudf/strings/detail/utilities.hpp | 16 ++++++++++ cpp/src/strings/copying/concatenate.cu | 15 ++++++--- cpp/src/strings/strings_column_view.cpp | 3 +- cpp/src/strings/utilities.cu | 14 +++++++++ cpp/tests/copying/concatenate_tests.cpp | 31 +++++++++++++++++++ 5 files changed, 73 insertions(+), 6 deletions(-) diff --git a/cpp/include/cudf/strings/detail/utilities.hpp b/cpp/include/cudf/strings/detail/utilities.hpp index 3cf2850548d..ed0252baa22 100644 --- a/cpp/include/cudf/strings/detail/utilities.hpp +++ b/cpp/include/cudf/strings/detail/utilities.hpp @@ -40,6 +40,22 @@ std::unique_ptr create_chars_child_column(size_type bytes, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); +/** + * @brief Create an offsets column to be a child of a strings column + * + * This will return the properly typed column to be filled in by the caller. + * + * @param chars_bytes Number of bytes for the chars in the strings column + * @param count Number of elements for the offsets column + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return The offsets child column for a strings column + */ +std::unique_ptr create_offsets_child_column(int64_t chars_bytes, + size_type count, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + /** * @brief Creates a string_view vector from a strings column. * diff --git a/cpp/src/strings/copying/concatenate.cu b/cpp/src/strings/copying/concatenate.cu index c4564b1105b..3935aa12d0a 100644 --- a/cpp/src/strings/copying/concatenate.cu +++ b/cpp/src/strings/copying/concatenate.cu @@ -220,9 +220,9 @@ std::unique_ptr concatenate(host_span columns, CUDF_EXPECTS(offsets_count <= static_cast(std::numeric_limits::max()), "total number of strings exceeds the column size limit", std::overflow_error); - CUDF_EXPECTS(total_bytes <= static_cast(std::numeric_limits::max()), - "total size of strings exceeds the column size limit", - std::overflow_error); + // CUDF_EXPECTS(total_bytes <= static_cast(std::numeric_limits::max()), + // "total size of strings exceeds the column size limit", + // std::overflow_error); bool const has_nulls = std::any_of(columns.begin(), columns.end(), [](auto const& col) { return col.has_nulls(); }); @@ -232,8 +232,11 @@ std::unique_ptr concatenate(host_span columns, auto d_new_chars = output_chars.data(); // create output offsets column - auto offsets_column = make_numeric_column( - data_type{type_id::INT32}, offsets_count, mask_state::UNALLOCATED, stream, mr); + // auto offsets_column = make_numeric_column( + // data_type{type_id::INT32}, offsets_count, mask_state::UNALLOCATED, stream, mr); + auto offsets_column = create_offsets_child_column(total_bytes, offsets_count, stream, mr); + // std::cout << total_bytes << "\n"; + // std::cout << (int)offsets_column->type().id() << "\n"; auto itr_new_offsets = cudf::detail::offsetalator_factory::make_output_iterator(offsets_column->mutable_view()); @@ -260,6 +263,8 @@ std::unique_ptr concatenate(host_span columns, itr_new_offsets, reinterpret_cast(null_mask.data()), d_valid_count.data()); + // auto err = cudaStreamSynchronize(stream.value()); + // std::cout << "fused_concatenate_string_offset_kernel = " << err << "\n"; if (has_nulls) { null_count = strings_count - d_valid_count.value(stream); } } diff --git a/cpp/src/strings/strings_column_view.cpp b/cpp/src/strings/strings_column_view.cpp index 6be22d8e729..629b56c7e90 100644 --- a/cpp/src/strings/strings_column_view.cpp +++ b/cpp/src/strings/strings_column_view.cpp @@ -15,6 +15,7 @@ */ #include +#include #include #include @@ -48,7 +49,7 @@ strings_column_view::offset_iterator strings_column_view::offsets_end() const size_type strings_column_view::chars_size(rmm::cuda_stream_view stream) const noexcept { if (size() == 0) return 0; - return detail::get_value(offsets(), offsets().size() - 1, stream); + return cudf::strings::detail::get_offset_value(offsets(), offsets().size() - 1, stream); } strings_column_view::chars_iterator strings_column_view::chars_begin(rmm::cuda_stream_view) const diff --git a/cpp/src/strings/utilities.cu b/cpp/src/strings/utilities.cu index 72c3ccf4ac5..54ccaf5e7d3 100644 --- a/cpp/src/strings/utilities.cu +++ b/cpp/src/strings/utilities.cu @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -73,6 +74,19 @@ std::unique_ptr create_chars_child_column(cudf::size_type total_bytes, data_type{type_id::INT8}, total_bytes, mask_state::UNALLOCATED, stream, mr); } +std::unique_ptr create_offsets_child_column(int64_t chars_bytes, + size_type count, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return make_numeric_column( + chars_bytes < get_offset64_threshold() ? data_type{type_id::INT32} : data_type{type_id::INT64}, + count, + mask_state::UNALLOCATED, + stream, + mr); +} + namespace { // The device variables are created here to avoid using a singleton that may cause issues // with RMM initialize/finalize. See PR #3159 for details on this approach. diff --git a/cpp/tests/copying/concatenate_tests.cpp b/cpp/tests/copying/concatenate_tests.cpp index 0f7c1053adf..956f178118f 100644 --- a/cpp/tests/copying/concatenate_tests.cpp +++ b/cpp/tests/copying/concatenate_tests.cpp @@ -32,6 +32,8 @@ #include #include +#include + #include #include #include @@ -226,6 +228,35 @@ TEST_F(StringColumnTest, ConcatenateTooLarge) EXPECT_THROW(cudf::concatenate(input_cols), std::overflow_error); } +TEST_F(StringColumnTest, ConcatenateLargeStrings) +{ + auto itr = thrust::constant_iterator("abcdefghijklmnopqrstuvwxy"); // 25 bytes + auto input = cudf::test::strings_column_wrapper(itr, itr + 10'000'000); // 250MB + std::vector input_cols; + for (int i = 0; i < 10; ++i) { // 2500MB > 2GB + input_cols.push_back(input); + } + auto result = cudf::concatenate(input_cols); + std::cout << result->view().size() << "\n"; + auto sv = cudf::strings_column_view(result->view()); + std::cout << sv.chars_size(cudf::get_default_stream()) << "\n"; + std::cout << (int)sv.offsets().type().id() << "\n"; + auto sliced = cudf::split(result->view(), + {10'000'000, + 20'000'000, + 30'000'000, + 40'000'000, + 50'000'000, + 60'000'000, + 70'000'000, + 80'000'000, + 90'000'000}); + std::cout << sliced.size() << "\n"; + for (auto c : sliced) { + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(c, input); + } +} + struct TableTest : public cudf::test::BaseFixture {}; TEST_F(TableTest, ConcatenateTables) From eb76f5a018ad180aaa7ee5e18ed7ee763cc82976 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 29 Feb 2024 14:34:37 -0500 Subject: [PATCH 2/9] update test logic --- cpp/tests/copying/concatenate_tests.cpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/cpp/tests/copying/concatenate_tests.cpp b/cpp/tests/copying/concatenate_tests.cpp index 956f178118f..c989b78dbf1 100644 --- a/cpp/tests/copying/concatenate_tests.cpp +++ b/cpp/tests/copying/concatenate_tests.cpp @@ -237,10 +237,12 @@ TEST_F(StringColumnTest, ConcatenateLargeStrings) input_cols.push_back(input); } auto result = cudf::concatenate(input_cols); - std::cout << result->view().size() << "\n"; - auto sv = cudf::strings_column_view(result->view()); - std::cout << sv.chars_size(cudf::get_default_stream()) << "\n"; - std::cout << (int)sv.offsets().type().id() << "\n"; + auto sv = cudf::strings_column_view(result->view()); + EXPECT_EQ(sv.size(), 100'000'000); + // std::cout << sv.chars_size(cudf::get_default_stream()) << "\n"; + // std::cout << (int)sv.offsets().type().id() << "\n"; + EXPECT_EQ(sv.offsets().type(), cudf::data_type{cudf::type_id::INT64}); + auto sliced = cudf::split(result->view(), {10'000'000, 20'000'000, @@ -251,7 +253,6 @@ TEST_F(StringColumnTest, ConcatenateLargeStrings) 70'000'000, 80'000'000, 90'000'000}); - std::cout << sliced.size() << "\n"; for (auto c : sliced) { CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(c, input); } From 48ed48b4a421055a788bcad94d8d39b65b25d9a4 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Fri, 1 Mar 2024 13:13:28 -0500 Subject: [PATCH 3/9] improve gtest --- cpp/tests/copying/concatenate_tests.cpp | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/cpp/tests/copying/concatenate_tests.cpp b/cpp/tests/copying/concatenate_tests.cpp index c989b78dbf1..2bea5290b1a 100644 --- a/cpp/tests/copying/concatenate_tests.cpp +++ b/cpp/tests/copying/concatenate_tests.cpp @@ -230,29 +230,31 @@ TEST_F(StringColumnTest, ConcatenateTooLarge) TEST_F(StringColumnTest, ConcatenateLargeStrings) { - auto itr = thrust::constant_iterator("abcdefghijklmnopqrstuvwxy"); // 25 bytes - auto input = cudf::test::strings_column_wrapper(itr, itr + 10'000'000); // 250MB + auto itr = thrust::constant_iterator( + "abcdefghijklmnopqrstuvwxyABCDEFGHIJKLMNOPQRSTUVWXY"); // 50 bytes + auto input = cudf::test::strings_column_wrapper(itr, itr + 5'000'000); // 250MB std::vector input_cols; for (int i = 0; i < 10; ++i) { // 2500MB > 2GB input_cols.push_back(input); } auto result = cudf::concatenate(input_cols); auto sv = cudf::strings_column_view(result->view()); - EXPECT_EQ(sv.size(), 100'000'000); + EXPECT_EQ(sv.size(), 50'000'000); // std::cout << sv.chars_size(cudf::get_default_stream()) << "\n"; // std::cout << (int)sv.offsets().type().id() << "\n"; EXPECT_EQ(sv.offsets().type(), cudf::data_type{cudf::type_id::INT64}); auto sliced = cudf::split(result->view(), - {10'000'000, + {5'000'000, + 10'000'000, + 15'000'000, 20'000'000, + 25'000'000, 30'000'000, + 35'000'000, 40'000'000, - 50'000'000, - 60'000'000, - 70'000'000, - 80'000'000, - 90'000'000}); + 45'000'000 + }); for (auto c : sliced) { CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(c, input); } From c5549ea5ca0a071e7d32dea78e50b34486918cbb Mon Sep 17 00:00:00 2001 From: David Wendt Date: Fri, 1 Mar 2024 14:29:25 -0500 Subject: [PATCH 4/9] fix style violation --- cpp/tests/copying/concatenate_tests.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/cpp/tests/copying/concatenate_tests.cpp b/cpp/tests/copying/concatenate_tests.cpp index 2bea5290b1a..6a527f210d7 100644 --- a/cpp/tests/copying/concatenate_tests.cpp +++ b/cpp/tests/copying/concatenate_tests.cpp @@ -246,15 +246,14 @@ TEST_F(StringColumnTest, ConcatenateLargeStrings) auto sliced = cudf::split(result->view(), {5'000'000, - 10'000'000, - 15'000'000, + 10'000'000, + 15'000'000, 20'000'000, 25'000'000, 30'000'000, 35'000'000, 40'000'000, - 45'000'000 - }); + 45'000'000}); for (auto c : sliced) { CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(c, input); } From 3469d5e4a9748c8203032366f41abe9c4ad02cb1 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Sat, 2 Mar 2024 11:35:32 -0500 Subject: [PATCH 5/9] add LIBCUDF_LARGE_STRINGS_ENABLED env var --- .../cudf/strings/detail/strings_children.cuh | 29 ++++++++++--------- cpp/include/cudf/strings/detail/utilities.hpp | 7 +++++ cpp/src/strings/utilities.cu | 13 +++++++-- cpp/tests/copying/concatenate_tests.cpp | 2 ++ 4 files changed, 34 insertions(+), 17 deletions(-) diff --git a/cpp/include/cudf/strings/detail/strings_children.cuh b/cpp/include/cudf/strings/detail/strings_children.cuh index 8e2b6055a5c..f8d4b2d736e 100644 --- a/cpp/include/cudf/strings/detail/strings_children.cuh +++ b/cpp/include/cudf/strings/detail/strings_children.cuh @@ -164,22 +164,23 @@ std::pair, int64_t> make_offsets_child_column( }); auto input_itr = cudf::detail::make_counting_transform_iterator(0, map_fn); // Use the sizes-to-offsets iterator to compute the total number of elements - auto const total_elements = + auto const total_bytes = cudf::detail::sizes_to_offsets(input_itr, input_itr + strings_count + 1, d_offsets, stream); - // TODO: replace exception with if-statement when enabling creating INT64 offsets - CUDF_EXPECTS(total_elements <= size_type_max, - "Size of output exceeds the character size limit", - std::overflow_error); - // if (total_elements >= get_offset64_threshold()) { - // // recompute as int64 offsets when above the threshold - // offsets_column = make_numeric_column( - // data_type{type_id::INT64}, strings_count + 1, mask_state::UNALLOCATED, stream, mr); - // auto d_offsets64 = offsets_column->mutable_view().template data(); - // sizes_to_offsets(input_itr, input_itr + strings_count + 1, d_offsets64, stream); - // } - - return std::pair(std::move(offsets_column), total_elements); + auto const threshold = get_offset64_threshold(); + if (!is_large_strings_enabled()) { + CUDF_EXPECTS( + total_bytes < threshold, "Size of output exceeds the column size limit", std::overflow_error); + } + if (total_bytes >= threshold) { + // recompute as int64 offsets when above the threshold + offsets_column = make_numeric_column( + data_type{type_id::INT64}, strings_count + 1, mask_state::UNALLOCATED, stream, mr); + auto d_offsets64 = offsets_column->mutable_view().template data(); + cudf::detail::sizes_to_offsets(input_itr, input_itr + strings_count + 1, d_offsets64, stream); + } + + return std::pair(std::move(offsets_column), total_bytes); } } // namespace detail diff --git a/cpp/include/cudf/strings/detail/utilities.hpp b/cpp/include/cudf/strings/detail/utilities.hpp index ed0252baa22..c54148cb414 100644 --- a/cpp/include/cudf/strings/detail/utilities.hpp +++ b/cpp/include/cudf/strings/detail/utilities.hpp @@ -81,6 +81,13 @@ rmm::device_uvector create_string_vector_from_column( */ int64_t get_offset64_threshold(); +/** + * @brief Return if large strings are supported + * + * This checks the setting in the environment variable LIBCUDF_LARGE_STRINGS_ENABLED. + */ +bool is_large_strings_enabled(); + /** * @brief Return a normalized offset value from a strings offsets column * diff --git a/cpp/src/strings/utilities.cu b/cpp/src/strings/utilities.cu index 54ccaf5e7d3..f641ada60a3 100644 --- a/cpp/src/strings/utilities.cu +++ b/cpp/src/strings/utilities.cu @@ -79,8 +79,13 @@ std::unique_ptr create_offsets_child_column(int64_t chars_bytes, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { + auto const threshold = get_offset64_threshold(); + if (!is_large_strings_enabled()) { + CUDF_EXPECTS( + chars_bytes < threshold, "Size of output exceeds the column size limit", std::overflow_error); + } return make_numeric_column( - chars_bytes < get_offset64_threshold() ? data_type{type_id::INT32} : data_type{type_id::INT64}, + chars_bytes < threshold ? data_type{type_id::INT32} : data_type{type_id::INT64}, count, mask_state::UNALLOCATED, stream, @@ -145,13 +150,15 @@ special_case_mapping const* get_special_case_mapping_table() int64_t get_offset64_threshold() { - auto const threshold = std::getenv("LIBCUDF_LARGE_STRINGS_THRESHOLD"); - std::size_t const rtn = threshold != nullptr ? std::atol(threshold) : 0; + auto const threshold = std::getenv("LIBCUDF_LARGE_STRINGS_THRESHOLD"); + int64_t const rtn = threshold != nullptr ? std::atol(threshold) : 0L; return (rtn > 0 && rtn < std::numeric_limits::max()) ? rtn : std::numeric_limits::max(); } +bool is_large_strings_enabled() { return std::getenv("LIBCUDF_LARGE_STRINGS_ENABLED") != nullptr; } + int64_t get_offset_value(cudf::column_view const& offsets, size_type index, rmm::cuda_stream_view stream) diff --git a/cpp/tests/copying/concatenate_tests.cpp b/cpp/tests/copying/concatenate_tests.cpp index 6a527f210d7..4461a593779 100644 --- a/cpp/tests/copying/concatenate_tests.cpp +++ b/cpp/tests/copying/concatenate_tests.cpp @@ -230,6 +230,7 @@ TEST_F(StringColumnTest, ConcatenateTooLarge) TEST_F(StringColumnTest, ConcatenateLargeStrings) { + setenv("LIBCUDF_LARGE_STRINGS_ENABLED", "1", 1); auto itr = thrust::constant_iterator( "abcdefghijklmnopqrstuvwxyABCDEFGHIJKLMNOPQRSTUVWXY"); // 50 bytes auto input = cudf::test::strings_column_wrapper(itr, itr + 5'000'000); // 250MB @@ -257,6 +258,7 @@ TEST_F(StringColumnTest, ConcatenateLargeStrings) for (auto c : sliced) { CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(c, input); } + unsetenv("LIBCUDF_LARGE_STRINGS_ENABLED"); } struct TableTest : public cudf::test::BaseFixture {}; From 0a039871cb58b1664564001e050ad08a727c8b68 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Wed, 13 Mar 2024 14:20:54 -0400 Subject: [PATCH 6/9] add large_strings_enabler test utility class --- cpp/include/cudf_test/column_utilities.hpp | 25 +++++++++++++ cpp/src/strings/utilities.cu | 8 ++++- cpp/tests/copying/concatenate_tests.cpp | 42 ++++++++++++++-------- cpp/tests/utilities/column_utilities.cu | 11 ++++++ 4 files changed, 71 insertions(+), 15 deletions(-) diff --git a/cpp/include/cudf_test/column_utilities.hpp b/cpp/include/cudf_test/column_utilities.hpp index a8957473175..c83599a8072 100644 --- a/cpp/include/cudf_test/column_utilities.hpp +++ b/cpp/include/cudf_test/column_utilities.hpp @@ -210,6 +210,29 @@ template <> std::pair, std::vector> to_host(column_view c); //! @endcond +/** + * @brief For enabling large strings testing in specific tests + */ +struct large_strings_enabler { + /** + * @brief Create large strings enable object + * + * @param default_enable Default enables large strings support + */ + large_strings_enabler(bool default_enable = true); + ~large_strings_enabler(); + + /** + * @brief Enable large strings support + */ + void enable(); + + /** + * @brief Disable large strings support + */ + void disable(); +}; + } // namespace cudf::test // Macros for showing line of failure. @@ -242,3 +265,5 @@ std::pair, std::vector> to_host(c SCOPED_TRACE(" <-- line of failure\n"); \ cudf::test::detail::expect_equal_buffers(lhs, rhs, size_bytes); \ } while (0) + +#define CUDF_TEST_ENABLE_LARGE_STRINGS() cudf::test::large_strings_enabler ls___ diff --git a/cpp/src/strings/utilities.cu b/cpp/src/strings/utilities.cu index 5ce025354a6..578113d3611 100644 --- a/cpp/src/strings/utilities.cu +++ b/cpp/src/strings/utilities.cu @@ -32,6 +32,8 @@ #include #include +#include + namespace cudf { namespace strings { namespace detail { @@ -149,7 +151,11 @@ int64_t get_offset64_threshold() : std::numeric_limits::max(); } -bool is_large_strings_enabled() { return std::getenv("LIBCUDF_LARGE_STRINGS_ENABLED") != nullptr; } +bool is_large_strings_enabled() +{ + auto const env = std::getenv("LIBCUDF_LARGE_STRINGS_ENABLED"); + return env != nullptr && std::string(env) == "1"; +} int64_t get_offset_value(cudf::column_view const& offsets, size_type index, diff --git a/cpp/tests/copying/concatenate_tests.cpp b/cpp/tests/copying/concatenate_tests.cpp index 4461a593779..40ff9d76fd5 100644 --- a/cpp/tests/copying/concatenate_tests.cpp +++ b/cpp/tests/copying/concatenate_tests.cpp @@ -230,7 +230,7 @@ TEST_F(StringColumnTest, ConcatenateTooLarge) TEST_F(StringColumnTest, ConcatenateLargeStrings) { - setenv("LIBCUDF_LARGE_STRINGS_ENABLED", "1", 1); + CUDF_TEST_ENABLE_LARGE_STRINGS(); auto itr = thrust::constant_iterator( "abcdefghijklmnopqrstuvwxyABCDEFGHIJKLMNOPQRSTUVWXY"); // 50 bytes auto input = cudf::test::strings_column_wrapper(itr, itr + 5'000'000); // 250MB @@ -241,24 +241,38 @@ TEST_F(StringColumnTest, ConcatenateLargeStrings) auto result = cudf::concatenate(input_cols); auto sv = cudf::strings_column_view(result->view()); EXPECT_EQ(sv.size(), 50'000'000); - // std::cout << sv.chars_size(cudf::get_default_stream()) << "\n"; - // std::cout << (int)sv.offsets().type().id() << "\n"; EXPECT_EQ(sv.offsets().type(), cudf::data_type{cudf::type_id::INT64}); - auto sliced = cudf::split(result->view(), - {5'000'000, - 10'000'000, - 15'000'000, - 20'000'000, - 25'000'000, - 30'000'000, - 35'000'000, - 40'000'000, - 45'000'000}); + // verify results in sections + auto splits = std::vector({5'000'000, + 10'000'000, + 15'000'000, + 20'000'000, + 25'000'000, + 30'000'000, + 35'000'000, + 40'000'000, + 45'000'000}); + auto sliced = cudf::split(result->view(), splits); for (auto c : sliced) { CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(c, input); } - unsetenv("LIBCUDF_LARGE_STRINGS_ENABLED"); + + // also test with large strings column as input + { + input_cols.clear(); + input_cols.push_back(input); // regular column + input_cols.push_back(result->view()); // large column + result = cudf::concatenate(input_cols); + sv = cudf::strings_column_view(result->view()); + EXPECT_EQ(sv.size(), 55'000'000); + EXPECT_EQ(sv.offsets().type(), cudf::data_type{cudf::type_id::INT64}); + splits.push_back(50'000'000); + sliced = cudf::split(result->view(), splits); + for (auto c : sliced) { + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(c, input); + } + } } struct TableTest : public cudf::test::BaseFixture {}; diff --git a/cpp/tests/utilities/column_utilities.cu b/cpp/tests/utilities/column_utilities.cu index 2cd7dc1574c..047b096a283 100644 --- a/cpp/tests/utilities/column_utilities.cu +++ b/cpp/tests/utilities/column_utilities.cu @@ -1011,5 +1011,16 @@ std::pair, std::vector> to_host(c return {std::move(host_data), bitmask_to_host(c)}; } +large_strings_enabler::large_strings_enabler(bool default_enable) +{ + default_enable ? enable() : disable(); +} + +large_strings_enabler::~large_strings_enabler() { disable(); } + +void large_strings_enabler::enable() { setenv("LIBCUDF_LARGE_STRINGS_ENABLED", "1", 1); } + +void large_strings_enabler::disable() { setenv("LIBCUDF_LARGE_STRINGS_ENABLED", "0", 1); } + } // namespace test } // namespace cudf From d8b4b7a137e844f03b4b44adb0d7afaf7bb43abf Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 14 Mar 2024 11:53:50 -0400 Subject: [PATCH 7/9] remove unneeded changes and unneeded large(slow) test --- .../cudf/strings/detail/strings_children.cuh | 29 +++++++++-------- cpp/src/strings/copying/concatenate.cu | 9 ------ cpp/tests/copying/concatenate_tests.cpp | 31 ------------------- 3 files changed, 14 insertions(+), 55 deletions(-) diff --git a/cpp/include/cudf/strings/detail/strings_children.cuh b/cpp/include/cudf/strings/detail/strings_children.cuh index fc0b2bd5bab..49c4be88ca5 100644 --- a/cpp/include/cudf/strings/detail/strings_children.cuh +++ b/cpp/include/cudf/strings/detail/strings_children.cuh @@ -163,23 +163,22 @@ std::pair, int64_t> make_offsets_child_column( }); auto input_itr = cudf::detail::make_counting_transform_iterator(0, map_fn); // Use the sizes-to-offsets iterator to compute the total number of elements - auto const total_bytes = + auto const total_elements = cudf::detail::sizes_to_offsets(input_itr, input_itr + strings_count + 1, d_offsets, stream); - auto const threshold = get_offset64_threshold(); - if (!is_large_strings_enabled()) { - CUDF_EXPECTS( - total_bytes < threshold, "Size of output exceeds the column size limit", std::overflow_error); - } - if (total_bytes >= threshold) { - // recompute as int64 offsets when above the threshold - offsets_column = make_numeric_column( - data_type{type_id::INT64}, strings_count + 1, mask_state::UNALLOCATED, stream, mr); - auto d_offsets64 = offsets_column->mutable_view().template data(); - cudf::detail::sizes_to_offsets(input_itr, input_itr + strings_count + 1, d_offsets64, stream); - } - - return std::pair(std::move(offsets_column), total_bytes); + // TODO: replace exception with if-statement when enabling creating INT64 offsets + CUDF_EXPECTS(total_elements <= size_type_max, + "Size of output exceeds the character size limit", + std::overflow_error); + // if (total_elements >= get_offset64_threshold()) { + // // recompute as int64 offsets when above the threshold + // offsets_column = make_numeric_column( + // data_type{type_id::INT64}, strings_count + 1, mask_state::UNALLOCATED, stream, mr); + // auto d_offsets64 = offsets_column->mutable_view().template data(); + // sizes_to_offsets(input_itr, input_itr + strings_count + 1, d_offsets64, stream); + // } + + return std::pair(std::move(offsets_column), total_elements); } } // namespace detail diff --git a/cpp/src/strings/copying/concatenate.cu b/cpp/src/strings/copying/concatenate.cu index 3935aa12d0a..de7067f0bed 100644 --- a/cpp/src/strings/copying/concatenate.cu +++ b/cpp/src/strings/copying/concatenate.cu @@ -220,9 +220,6 @@ std::unique_ptr concatenate(host_span columns, CUDF_EXPECTS(offsets_count <= static_cast(std::numeric_limits::max()), "total number of strings exceeds the column size limit", std::overflow_error); - // CUDF_EXPECTS(total_bytes <= static_cast(std::numeric_limits::max()), - // "total size of strings exceeds the column size limit", - // std::overflow_error); bool const has_nulls = std::any_of(columns.begin(), columns.end(), [](auto const& col) { return col.has_nulls(); }); @@ -232,11 +229,7 @@ std::unique_ptr concatenate(host_span columns, auto d_new_chars = output_chars.data(); // create output offsets column - // auto offsets_column = make_numeric_column( - // data_type{type_id::INT32}, offsets_count, mask_state::UNALLOCATED, stream, mr); auto offsets_column = create_offsets_child_column(total_bytes, offsets_count, stream, mr); - // std::cout << total_bytes << "\n"; - // std::cout << (int)offsets_column->type().id() << "\n"; auto itr_new_offsets = cudf::detail::offsetalator_factory::make_output_iterator(offsets_column->mutable_view()); @@ -263,8 +256,6 @@ std::unique_ptr concatenate(host_span columns, itr_new_offsets, reinterpret_cast(null_mask.data()), d_valid_count.data()); - // auto err = cudaStreamSynchronize(stream.value()); - // std::cout << "fused_concatenate_string_offset_kernel = " << err << "\n"; if (has_nulls) { null_count = strings_count - d_valid_count.value(stream); } } diff --git a/cpp/tests/copying/concatenate_tests.cpp b/cpp/tests/copying/concatenate_tests.cpp index 40ff9d76fd5..058eff59208 100644 --- a/cpp/tests/copying/concatenate_tests.cpp +++ b/cpp/tests/copying/concatenate_tests.cpp @@ -166,37 +166,6 @@ TEST_F(StringColumnTest, ConcatenateColumnView) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } -TEST_F(StringColumnTest, ConcatenateColumnViewLarge) -{ - // Test large concatenate, causes out of bound device memory errors if kernel - // indexing is not int64_t. - // 1.5GB bytes, 5k columns - constexpr size_t num_strings = 10000; - constexpr size_t string_length = 150000; - constexpr size_t strings_per_column = 2; - constexpr size_t num_columns = num_strings / strings_per_column; - - std::vector strings; - std::vector h_strings; - std::vector strings_column_wrappers; - std::vector strings_columns; - - std::string s(string_length, 'a'); - for (size_t i = 0; i < num_strings; ++i) - h_strings.push_back(s.data()); - - for (size_t i = 0; i < num_columns; ++i) - strings_column_wrappers.push_back(cudf::test::strings_column_wrapper( - h_strings.data() + i * strings_per_column, h_strings.data() + (i + 1) * strings_per_column)); - for (auto& wrapper : strings_column_wrappers) - strings_columns.push_back(wrapper); - - auto results = cudf::concatenate(strings_columns); - - cudf::test::strings_column_wrapper expected(h_strings.begin(), h_strings.end()); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); -} - TEST_F(StringColumnTest, ConcatenateManyColumns) { std::vector h_strings{ From 19effd3196052a41e395e373134ca17df62ceb6a Mon Sep 17 00:00:00 2001 From: David Wendt Date: Wed, 20 Mar 2024 14:27:37 -0400 Subject: [PATCH 8/9] fix doxygen comments --- cpp/include/cudf/strings/detail/utilities.hpp | 7 +++++-- cpp/src/strings/utilities.cu | 1 + 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/cpp/include/cudf/strings/detail/utilities.hpp b/cpp/include/cudf/strings/detail/utilities.hpp index d8c8c0baa78..588ceac4ec0 100644 --- a/cpp/include/cudf/strings/detail/utilities.hpp +++ b/cpp/include/cudf/strings/detail/utilities.hpp @@ -30,7 +30,8 @@ namespace detail { /** * @brief Create an offsets column to be a child of a strings column * - * This will return the properly typed column to be filled in by the caller. + * This will return the properly typed column to be filled in by the caller + * given the number of bytes to address. * * @param chars_bytes Number of bytes for the chars in the strings column * @param count Number of elements for the offsets column @@ -69,9 +70,11 @@ rmm::device_uvector create_string_vector_from_column( int64_t get_offset64_threshold(); /** - * @brief Return if large strings are supported + * @brief Checks if large strings is enabled * * This checks the setting in the environment variable LIBCUDF_LARGE_STRINGS_ENABLED. + * + * @return true if large strings are supported */ bool is_large_strings_enabled(); diff --git a/cpp/src/strings/utilities.cu b/cpp/src/strings/utilities.cu index 578113d3611..bc87776f0a6 100644 --- a/cpp/src/strings/utilities.cu +++ b/cpp/src/strings/utilities.cu @@ -32,6 +32,7 @@ #include #include +#include #include namespace cudf { From 54df945135128b3ace4d18aa9b8ec3fca423d098 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Wed, 3 Apr 2024 10:49:27 -0400 Subject: [PATCH 9/9] populate splits in for-loop in gtest --- cpp/include/cudf/strings/detail/utilities.hpp | 3 ++- cpp/src/strings/utilities.cu | 3 +++ cpp/tests/copying/concatenate_tests.cpp | 24 ++++++++----------- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/cpp/include/cudf/strings/detail/utilities.hpp b/cpp/include/cudf/strings/detail/utilities.hpp index 588ceac4ec0..cf9a13e9742 100644 --- a/cpp/include/cudf/strings/detail/utilities.hpp +++ b/cpp/include/cudf/strings/detail/utilities.hpp @@ -34,7 +34,8 @@ namespace detail { * given the number of bytes to address. * * @param chars_bytes Number of bytes for the chars in the strings column - * @param count Number of elements for the offsets column + * @param count Number of elements for the offsets column. + * This is the number of rows in the parent strings column +1. * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return The offsets child column for a strings column diff --git a/cpp/src/strings/utilities.cu b/cpp/src/strings/utilities.cu index bc87776f0a6..c83f827f290 100644 --- a/cpp/src/strings/utilities.cu +++ b/cpp/src/strings/utilities.cu @@ -69,6 +69,9 @@ rmm::device_uvector create_string_vector_from_column( return strings_vector; } +/** + * @copydoc cudf::strings::detail::create_offsets_child_column + */ std::unique_ptr create_offsets_child_column(int64_t chars_bytes, size_type count, rmm::cuda_stream_view stream, diff --git a/cpp/tests/copying/concatenate_tests.cpp b/cpp/tests/copying/concatenate_tests.cpp index 058eff59208..3e2e332936e 100644 --- a/cpp/tests/copying/concatenate_tests.cpp +++ b/cpp/tests/copying/concatenate_tests.cpp @@ -203,25 +203,21 @@ TEST_F(StringColumnTest, ConcatenateLargeStrings) auto itr = thrust::constant_iterator( "abcdefghijklmnopqrstuvwxyABCDEFGHIJKLMNOPQRSTUVWXY"); // 50 bytes auto input = cudf::test::strings_column_wrapper(itr, itr + 5'000'000); // 250MB + auto view = cudf::column_view(input); std::vector input_cols; - for (int i = 0; i < 10; ++i) { // 2500MB > 2GB - input_cols.push_back(input); + std::vector splits; + int const multiplier = 10; + for (int i = 0; i < multiplier; ++i) { // 2500MB > 2GB + input_cols.push_back(view); + splits.push_back(view.size() * (i + 1)); } + splits.pop_back(); // remove last entry auto result = cudf::concatenate(input_cols); auto sv = cudf::strings_column_view(result->view()); - EXPECT_EQ(sv.size(), 50'000'000); + EXPECT_EQ(sv.size(), view.size() * multiplier); EXPECT_EQ(sv.offsets().type(), cudf::data_type{cudf::type_id::INT64}); // verify results in sections - auto splits = std::vector({5'000'000, - 10'000'000, - 15'000'000, - 20'000'000, - 25'000'000, - 30'000'000, - 35'000'000, - 40'000'000, - 45'000'000}); auto sliced = cudf::split(result->view(), splits); for (auto c : sliced) { CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(c, input); @@ -234,9 +230,9 @@ TEST_F(StringColumnTest, ConcatenateLargeStrings) input_cols.push_back(result->view()); // large column result = cudf::concatenate(input_cols); sv = cudf::strings_column_view(result->view()); - EXPECT_EQ(sv.size(), 55'000'000); + EXPECT_EQ(sv.size(), view.size() * (multiplier + 1)); EXPECT_EQ(sv.offsets().type(), cudf::data_type{cudf::type_id::INT64}); - splits.push_back(50'000'000); + splits.push_back(view.size() * multiplier); sliced = cudf::split(result->view(), splits); for (auto c : sliced) { CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(c, input);