diff --git a/cpp/include/cudf_test/testing_main.hpp b/cpp/include/cudf_test/testing_main.hpp index ecac761f7cb..66b831b917f 100644 --- a/cpp/include/cudf_test/testing_main.hpp +++ b/cpp/include/cudf_test/testing_main.hpp @@ -145,6 +145,25 @@ inline auto parse_cudf_test_opts(int argc, char** argv) } } +/** + * @brief Sets up the memory resource selected by the rmm_mode command line option + * + * Creates the resource with cudf::test::create_memory_resource and always sets it + * as the current device resource. + * + * The caller must keep the return object alive for the life of the test runs. + * + * @param cmd_opts Command line options returned by parse_cudf_test_opts + * @return The created memory resource + */ +inline auto make_memory_resource_adaptor(cxxopts::ParseResult const& cmd_opts) +{ + auto const rmm_mode = cmd_opts["rmm_mode"].as(); + auto resource = cudf::test::create_memory_resource(rmm_mode); + rmm::mr::set_current_device_resource(resource.get()); + return resource; +} + /** * @brief Sets up stream mode memory resource adaptor * @@ -181,14 +200,12 @@ inline auto make_stream_mode_adaptor(cxxopts::ParseResult const& cmd_opts) * function parses the command line to customize test behavior, like the * allocation mode used for creating the default memory resource. 
 */ -#define CUDF_TEST_PROGRAM_MAIN() \ - int main(int argc, char** argv) \ - { \ - ::testing::InitGoogleTest(&argc, argv); \ - auto const cmd_opts = parse_cudf_test_opts(argc, argv); \ - auto const rmm_mode = cmd_opts["rmm_mode"].as(); \ - auto resource = cudf::test::create_memory_resource(rmm_mode); \ - rmm::mr::set_current_device_resource(resource.get()); \ - auto adaptor = make_stream_mode_adaptor(cmd_opts); \ - return RUN_ALL_TESTS(); \ +#define CUDF_TEST_PROGRAM_MAIN() \ + int main(int argc, char** argv) \ + { \ + ::testing::InitGoogleTest(&argc, argv); \ + auto const cmd_opts = parse_cudf_test_opts(argc, argv); \ + [[maybe_unused]] auto mr = make_memory_resource_adaptor(cmd_opts); \ + [[maybe_unused]] auto adaptor = make_stream_mode_adaptor(cmd_opts); \ + return RUN_ALL_TESTS(); \ } diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index f59e675e1d5..6c56d82007a 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -568,6 +568,15 @@ ConfigureTest( strings/urls_tests.cpp ) +# ################################################################################################## +# * large strings test ---------------------------------------------------------------------------- +ConfigureTest( + LARGE_STRINGS_TEST large_strings/large_strings_fixture.cpp large_strings/merge_tests.cpp + large_strings/concatenate_tests.cpp + GPUS 1 + PERCENT 100 +) + # ################################################################################################## # * json path test -------------------------------------------------------------------------------- ConfigureTest(JSON_PATH_TEST json/json_tests.cpp) diff --git a/cpp/tests/copying/concatenate_tests.cpp b/cpp/tests/copying/concatenate_tests.cpp index 3e2e332936e..c2d1e1d9f4f 100644 --- a/cpp/tests/copying/concatenate_tests.cpp +++ b/cpp/tests/copying/concatenate_tests.cpp @@ -197,49 +197,6 @@ TEST_F(StringColumnTest, ConcatenateTooLarge) EXPECT_THROW(cudf::concatenate(input_cols), 
std::overflow_error); } -TEST_F(StringColumnTest, ConcatenateLargeStrings) -{ - CUDF_TEST_ENABLE_LARGE_STRINGS(); - auto itr = thrust::constant_iterator( - "abcdefghijklmnopqrstuvwxyABCDEFGHIJKLMNOPQRSTUVWXY"); // 50 bytes - auto input = cudf::test::strings_column_wrapper(itr, itr + 5'000'000); // 250MB - auto view = cudf::column_view(input); - std::vector input_cols; - std::vector splits; - int const multiplier = 10; - for (int i = 0; i < multiplier; ++i) { // 2500MB > 2GB - input_cols.push_back(view); - splits.push_back(view.size() * (i + 1)); - } - splits.pop_back(); // remove last entry - auto result = cudf::concatenate(input_cols); - auto sv = cudf::strings_column_view(result->view()); - EXPECT_EQ(sv.size(), view.size() * multiplier); - EXPECT_EQ(sv.offsets().type(), cudf::data_type{cudf::type_id::INT64}); - - // verify results in sections - auto sliced = cudf::split(result->view(), splits); - for (auto c : sliced) { - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(c, input); - } - - // also test with large strings column as input - { - input_cols.clear(); - input_cols.push_back(input); // regular column - input_cols.push_back(result->view()); // large column - result = cudf::concatenate(input_cols); - sv = cudf::strings_column_view(result->view()); - EXPECT_EQ(sv.size(), view.size() * (multiplier + 1)); - EXPECT_EQ(sv.offsets().type(), cudf::data_type{cudf::type_id::INT64}); - splits.push_back(view.size() * multiplier); - sliced = cudf::split(result->view(), splits); - for (auto c : sliced) { - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(c, input); - } - } -} - struct TableTest : public cudf::test::BaseFixture {}; TEST_F(TableTest, ConcatenateTables) diff --git a/cpp/tests/large_strings/concatenate_tests.cpp b/cpp/tests/large_strings/concatenate_tests.cpp new file mode 100644 index 00000000000..aa445bf761b --- /dev/null +++ b/cpp/tests/large_strings/concatenate_tests.cpp @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "large_strings_fixture.hpp" + +#include + +#include +#include +#include + +#include + +struct ConcatenateTest : public cudf::test::StringsLargeTest {}; + +TEST_F(ConcatenateTest, ConcatenateVertical) +{ + auto input = this->long_column(); + auto view = cudf::column_view(input); + std::vector input_cols; + std::vector splits; + int const multiplier = 10; + for (int i = 0; i < multiplier; ++i) { // 2500MB > 2GB + input_cols.push_back(view); + splits.push_back(view.size() * (i + 1)); + } + splits.pop_back(); // remove last entry + auto result = cudf::concatenate(input_cols); + auto sv = cudf::strings_column_view(result->view()); + EXPECT_EQ(sv.size(), view.size() * multiplier); + EXPECT_EQ(sv.offsets().type(), cudf::data_type{cudf::type_id::INT64}); + + // verify results in sections + auto sliced = cudf::split(result->view(), splits); + for (auto c : sliced) { + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(c, input); + } + + // also test with large strings column as input + input_cols.clear(); + input_cols.push_back(input); // regular column + input_cols.push_back(result->view()); // large column + result = cudf::concatenate(input_cols); + sv = cudf::strings_column_view(result->view()); + EXPECT_EQ(sv.size(), view.size() * (multiplier + 1)); + EXPECT_EQ(sv.offsets().type(), cudf::data_type{cudf::type_id::INT64}); + splits.push_back(view.size() * multiplier); + sliced = cudf::split(result->view(), splits); 
+ for (auto c : sliced) { + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(c, input); + } +} diff --git a/cpp/tests/large_strings/large_strings_fixture.cpp b/cpp/tests/large_strings/large_strings_fixture.cpp new file mode 100644 index 00000000000..59e0cd43d05 --- /dev/null +++ b/cpp/tests/large_strings/large_strings_fixture.cpp @@ -0,0 +1,122 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "large_strings_fixture.hpp" + +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include + +namespace cudf::test { +class LargeStringsData { + public: + using DataPointer = std::unique_ptr; + + virtual ~LargeStringsData() {} + + void add_table(std::string_view name, std::unique_ptr&& data) + { + _data[std::string(name)] = std::move(data); + } + + cudf::table_view get_table(std::string_view name) const + { + std::string key{name}; + return _data.find(key) != _data.end() ? _data.at(key)->view() : cudf::table_view{}; + } + + void add_column(std::string_view name, std::unique_ptr&& data) + { + std::vector> cols; + cols.emplace_back(std::move(data)); + _data[std::string(name)] = std::make_unique(std::move(cols)); + } + + cudf::column_view get_column(std::string_view name) const + { + std::string key{name}; + return _data.find(key) != _data.end() ? 
_data.at(key)->view().column(0) : cudf::column_view{}; + } + + bool has_key(std::string_view name) const { return _data.find(std::string(name)) != _data.end(); } + + protected: + std::map _data; +}; + +cudf::column_view StringsLargeTest::wide_column() +{ + std::string name{"wide1"}; + if (!g_ls_data->has_key(name)) { + auto input = + cudf::test::strings_column_wrapper({"the quick brown fox jumps over the lazy dog", + "the fat cat lays next to the other accénted cat", + "a slow moving turtlé cannot catch the bird", + "which can be composéd together to form a more complete", + "The result does not include the value in the sum in"}); + auto counts = cudf::test::fixed_width_column_wrapper({8, 8, 8, 8, 8}); + auto result = cudf::strings::repeat_strings(cudf::strings_column_view(input), counts); + g_ls_data->add_column(name, std::move(result)); + } + return g_ls_data->get_column(name); +} + +cudf::column_view StringsLargeTest::long_column() +{ + std::string name("long1"); + if (!g_ls_data->has_key(name)) { + auto itr = thrust::constant_iterator( + "abcdefghijklmnopqrstuvwxyABCDEFGHIJKLMNOPQRSTUVWXY"); // 50 bytes + auto input = cudf::test::strings_column_wrapper(itr, itr + 5'000'000); // 250MB + g_ls_data->add_column(name, input.release()); + } + return g_ls_data->get_column(name); +} + +std::unique_ptr StringsLargeTest::get_ls_data() +{ + CUDF_EXPECTS(g_ls_data == nullptr, "invalid call to get_ls_data"); + auto lsd_data = std::make_unique(); + g_ls_data = lsd_data.get(); + return lsd_data; +} + +LargeStringsData* StringsLargeTest::g_ls_data = nullptr; +} // namespace cudf::test + +int main(int argc, char** argv) +{ + ::testing::InitGoogleTest(&argc, argv); + auto const cmd_opts = parse_cudf_test_opts(argc, argv); + // hardcoding the CUDA memory resource to keep from exceeding the pool + auto mr = cudf::test::make_cuda(); + rmm::mr::set_current_device_resource(mr.get()); + auto adaptor = make_stream_mode_adaptor(cmd_opts); + + // create object to automatically be 
destroyed at the end of main() + auto lsd = cudf::test::StringsLargeTest::get_ls_data(); + + return RUN_ALL_TESTS(); +} diff --git a/cpp/tests/large_strings/large_strings_fixture.hpp b/cpp/tests/large_strings/large_strings_fixture.hpp new file mode 100644 index 00000000000..8827b65f1ce --- /dev/null +++ b/cpp/tests/large_strings/large_strings_fixture.hpp @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include + +namespace cudf::test { +class LargeStringsData; + +/** + * @brief Fixture for creating large strings tests + * + * Stores test strings columns for reuse by specific tests. + * Creating the test input only once helps speed up the overall tests. + * + * Also automatically enables appropriate large strings environment variables. 
+ */ +struct StringsLargeTest : public cudf::test::BaseFixture { + /** + * @brief Returns a column with a few rows of long strings (built once and cached) + */ + cudf::column_view wide_column(); + + /** + * @brief Returns a column with many rows of short strings (built once and cached) + */ + cudf::column_view long_column(); + + large_strings_enabler g_ls_enabler; + static LargeStringsData* g_ls_data; + + static std::unique_ptr get_ls_data(); +}; +} // namespace cudf::test diff --git a/cpp/tests/large_strings/merge_tests.cpp b/cpp/tests/large_strings/merge_tests.cpp new file mode 100644 index 00000000000..afe6e424371 --- /dev/null +++ b/cpp/tests/large_strings/merge_tests.cpp @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "large_strings_fixture.hpp" + +#include + +#include +#include +#include + +#include + +struct MergeTest : public cudf::test::StringsLargeTest {}; + +TEST_F(MergeTest, MergeLargeStrings) +{ + auto const input = this->long_column(); + auto input_views = std::vector(); + auto const view = cudf::table_view({input}); + std::vector splits; + int const multiplier = 10; + for (int i = 0; i < multiplier; ++i) { // 2500MB > 2GB + input_views.push_back(view); + splits.push_back(view.num_rows() * (i + 1)); + } + splits.pop_back(); // remove last entry + auto const column_order = std::vector{cudf::order::ASCENDING}; + auto const null_precedence = std::vector{cudf::null_order::AFTER}; + + auto result = cudf::merge(input_views, {0}, column_order, null_precedence); + auto sv = cudf::strings_column_view(result->view().column(0)); + EXPECT_EQ(sv.size(), view.num_rows() * multiplier); + EXPECT_EQ(sv.offsets().type(), cudf::data_type{cudf::type_id::INT64}); + + auto sliced = cudf::split(sv.parent(), splits); + for (auto c : sliced) { + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(c, input); + } + + // also test with large strings column as input + input_views.clear(); + input_views.push_back(view); // regular column + input_views.push_back(result->view()); // large column + result = cudf::merge(input_views, {0}, column_order, null_precedence); + sv = cudf::strings_column_view(result->view().column(0)); + EXPECT_EQ(sv.size(), view.num_rows() * (multiplier + 1)); + EXPECT_EQ(sv.offsets().type(), cudf::data_type{cudf::type_id::INT64}); + splits.push_back(view.num_rows() * multiplier); + sliced = cudf::split(sv.parent(), splits); + for (auto c : sliced) { + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(c, input); + } + + // also check merge still returns 32-bit offsets for regular columns + input_views.clear(); + input_views.push_back(view); + input_views.push_back(view); + result = cudf::merge(input_views, {0}, column_order, null_precedence); + sv = 
cudf::strings_column_view(result->view().column(0)); + EXPECT_EQ(sv.size(), view.num_rows() * 2); + EXPECT_EQ(sv.offsets().type(), cudf::data_type{cudf::type_id::INT32}); + sliced = cudf::split(sv.parent(), {view.num_rows()}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(sliced[0], input); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(sliced[1], input); +} diff --git a/cpp/tests/merge/merge_string_test.cpp b/cpp/tests/merge/merge_string_test.cpp index d7368d31944..28179a7341c 100644 --- a/cpp/tests/merge/merge_string_test.cpp +++ b/cpp/tests/merge/merge_string_test.cpp @@ -411,60 +411,3 @@ TYPED_TEST(MergeStringTest, Merge2StringKeyNullColumns) CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_column_view2, output_column_view2); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_column_view3, output_column_view3); } - -class MergeLargeStringsTest : public cudf::test::BaseFixture {}; - -TEST_F(MergeLargeStringsTest, MergeLargeStrings) -{ - CUDF_TEST_ENABLE_LARGE_STRINGS(); - auto itr = thrust::constant_iterator( - "abcdefghijklmnopqrstuvwxyABCDEFGHIJKLMNOPQRSTUVWXY"); // 50 bytes - auto const input = cudf::test::strings_column_wrapper(itr, itr + 5'000'000); // 250MB - auto input_views = std::vector(); - auto const view = cudf::table_view({input}); - std::vector splits; - int const multiplier = 10; - for (int i = 0; i < multiplier; ++i) { // 2500MB > 2GB - input_views.push_back(view); - splits.push_back(view.num_rows() * (i + 1)); - } - splits.pop_back(); // remove last entry - auto const column_order = std::vector{cudf::order::ASCENDING}; - auto const null_precedence = std::vector{cudf::null_order::AFTER}; - - auto result = cudf::merge(input_views, {0}, column_order, null_precedence); - auto sv = cudf::strings_column_view(result->view().column(0)); - EXPECT_EQ(sv.size(), view.num_rows() * multiplier); - EXPECT_EQ(sv.offsets().type(), cudf::data_type{cudf::type_id::INT64}); - - auto sliced = cudf::split(sv.parent(), splits); - for (auto c : sliced) { - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(c, input); - } 
- - // also test with large strings column as input - input_views.clear(); - input_views.push_back(view); // regular column - input_views.push_back(result->view()); // large column - result = cudf::merge(input_views, {0}, column_order, null_precedence); - sv = cudf::strings_column_view(result->view().column(0)); - EXPECT_EQ(sv.size(), view.num_rows() * (multiplier + 1)); - EXPECT_EQ(sv.offsets().type(), cudf::data_type{cudf::type_id::INT64}); - splits.push_back(view.num_rows() * multiplier); - sliced = cudf::split(sv.parent(), splits); - for (auto c : sliced) { - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(c, input); - } - - // also check merge still returns 32-bit offsets for regular columns - input_views.clear(); - input_views.push_back(view); - input_views.push_back(view); - result = cudf::merge(input_views, {0}, column_order, null_precedence); - sv = cudf::strings_column_view(result->view().column(0)); - EXPECT_EQ(sv.size(), view.num_rows() * 2); - EXPECT_EQ(sv.offsets().type(), cudf::data_type{cudf::type_id::INT32}); - sliced = cudf::split(sv.parent(), {view.num_rows()}); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(sliced[0], input); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(sliced[1], input); -}