From 2eb71b28d9607e3dfa5b891cbc40ce53a5d27bc6 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Wed, 24 Apr 2024 16:05:34 -0400 Subject: [PATCH] Large strings gtest fixture and utilities (#15513) Creates the base class and utilities for testing APIs to produce large strings. The main purpose of the fixture is to enable the large strings environment variable(s) and to setup large test data that can be reused by multiple tests. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - MithunR (https://github.com/mythrocks) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/15513 --- cpp/include/cudf_test/testing_main.hpp | 37 ++++-- cpp/tests/CMakeLists.txt | 9 ++ cpp/tests/copying/concatenate_tests.cpp | 43 ------ cpp/tests/large_strings/concatenate_tests.cpp | 65 ++++++++++ .../large_strings/large_strings_fixture.cpp | 122 ++++++++++++++++++ .../large_strings/large_strings_fixture.hpp | 49 +++++++ cpp/tests/large_strings/merge_tests.cpp | 79 ++++++++++++ cpp/tests/merge/merge_string_test.cpp | 57 -------- 8 files changed, 351 insertions(+), 110 deletions(-) create mode 100644 cpp/tests/large_strings/concatenate_tests.cpp create mode 100644 cpp/tests/large_strings/large_strings_fixture.cpp create mode 100644 cpp/tests/large_strings/large_strings_fixture.hpp create mode 100644 cpp/tests/large_strings/merge_tests.cpp diff --git a/cpp/include/cudf_test/testing_main.hpp b/cpp/include/cudf_test/testing_main.hpp index ecac761f7cb..66b831b917f 100644 --- a/cpp/include/cudf_test/testing_main.hpp +++ b/cpp/include/cudf_test/testing_main.hpp @@ -145,6 +145,25 @@ inline auto parse_cudf_test_opts(int argc, char** argv) } } +/** + * @brief Sets up the memory resource selected by the `rmm_mode` option + * + * The resource is created by cudf::test::create_memory_resource and is + * always set as the current device resource. + * + * The caller must keep the return object alive for the life of the test runs. 
+ * + * @param cmd_opts Command line options returned by parse_cudf_test_opts + * @return Memory resource adaptor + */ +inline auto make_memory_resource_adaptor(cxxopts::ParseResult const& cmd_opts) +{ + auto const rmm_mode = cmd_opts["rmm_mode"].as(); + auto resource = cudf::test::create_memory_resource(rmm_mode); + rmm::mr::set_current_device_resource(resource.get()); + return resource; +} + /** * @brief Sets up stream mode memory resource adaptor * @@ -181,14 +200,12 @@ inline auto make_stream_mode_adaptor(cxxopts::ParseResult const& cmd_opts) * function parses the command line to customize test behavior, like the * allocation mode used for creating the default memory resource. */ -#define CUDF_TEST_PROGRAM_MAIN() \ - int main(int argc, char** argv) \ - { \ - ::testing::InitGoogleTest(&argc, argv); \ - auto const cmd_opts = parse_cudf_test_opts(argc, argv); \ - auto const rmm_mode = cmd_opts["rmm_mode"].as(); \ - auto resource = cudf::test::create_memory_resource(rmm_mode); \ - rmm::mr::set_current_device_resource(resource.get()); \ - auto adaptor = make_stream_mode_adaptor(cmd_opts); \ - return RUN_ALL_TESTS(); \ +#define CUDF_TEST_PROGRAM_MAIN() \ + int main(int argc, char** argv) \ + { \ + ::testing::InitGoogleTest(&argc, argv); \ + auto const cmd_opts = parse_cudf_test_opts(argc, argv); \ + [[maybe_unused]] auto mr = make_memory_resource_adaptor(cmd_opts); \ + [[maybe_unused]] auto adaptor = make_stream_mode_adaptor(cmd_opts); \ + return RUN_ALL_TESTS(); \ } diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index f59e675e1d5..6c56d82007a 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -568,6 +568,15 @@ ConfigureTest( strings/urls_tests.cpp ) +# ################################################################################################## +# * large strings test ---------------------------------------------------------------------------- +ConfigureTest( + LARGE_STRINGS_TEST large_strings/large_strings_fixture.cpp 
large_strings/merge_tests.cpp + large_strings/concatenate_tests.cpp + GPUS 1 + PERCENT 100 +) + # ################################################################################################## # * json path test -------------------------------------------------------------------------------- ConfigureTest(JSON_PATH_TEST json/json_tests.cpp) diff --git a/cpp/tests/copying/concatenate_tests.cpp b/cpp/tests/copying/concatenate_tests.cpp index 3e2e332936e..c2d1e1d9f4f 100644 --- a/cpp/tests/copying/concatenate_tests.cpp +++ b/cpp/tests/copying/concatenate_tests.cpp @@ -197,49 +197,6 @@ TEST_F(StringColumnTest, ConcatenateTooLarge) EXPECT_THROW(cudf::concatenate(input_cols), std::overflow_error); } -TEST_F(StringColumnTest, ConcatenateLargeStrings) -{ - CUDF_TEST_ENABLE_LARGE_STRINGS(); - auto itr = thrust::constant_iterator( - "abcdefghijklmnopqrstuvwxyABCDEFGHIJKLMNOPQRSTUVWXY"); // 50 bytes - auto input = cudf::test::strings_column_wrapper(itr, itr + 5'000'000); // 250MB - auto view = cudf::column_view(input); - std::vector input_cols; - std::vector splits; - int const multiplier = 10; - for (int i = 0; i < multiplier; ++i) { // 2500MB > 2GB - input_cols.push_back(view); - splits.push_back(view.size() * (i + 1)); - } - splits.pop_back(); // remove last entry - auto result = cudf::concatenate(input_cols); - auto sv = cudf::strings_column_view(result->view()); - EXPECT_EQ(sv.size(), view.size() * multiplier); - EXPECT_EQ(sv.offsets().type(), cudf::data_type{cudf::type_id::INT64}); - - // verify results in sections - auto sliced = cudf::split(result->view(), splits); - for (auto c : sliced) { - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(c, input); - } - - // also test with large strings column as input - { - input_cols.clear(); - input_cols.push_back(input); // regular column - input_cols.push_back(result->view()); // large column - result = cudf::concatenate(input_cols); - sv = cudf::strings_column_view(result->view()); - EXPECT_EQ(sv.size(), view.size() * (multiplier + 
1)); - EXPECT_EQ(sv.offsets().type(), cudf::data_type{cudf::type_id::INT64}); - splits.push_back(view.size() * multiplier); - sliced = cudf::split(result->view(), splits); - for (auto c : sliced) { - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(c, input); - } - } -} - struct TableTest : public cudf::test::BaseFixture {}; TEST_F(TableTest, ConcatenateTables) diff --git a/cpp/tests/large_strings/concatenate_tests.cpp b/cpp/tests/large_strings/concatenate_tests.cpp new file mode 100644 index 00000000000..aa445bf761b --- /dev/null +++ b/cpp/tests/large_strings/concatenate_tests.cpp @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "large_strings_fixture.hpp" + +#include + +#include +#include +#include + +#include + +struct ConcatenateTest : public cudf::test::StringsLargeTest {}; + +TEST_F(ConcatenateTest, ConcatenateVertical) +{ + auto input = this->long_column(); + auto view = cudf::column_view(input); + std::vector input_cols; + std::vector splits; + int const multiplier = 10; + for (int i = 0; i < multiplier; ++i) { // 10 x 250MB = 2500MB > 2GB + input_cols.push_back(view); + splits.push_back(view.size() * (i + 1)); + } + splits.pop_back(); // remove last entry (equals the total row count) + auto result = cudf::concatenate(input_cols); + auto sv = cudf::strings_column_view(result->view()); + EXPECT_EQ(sv.size(), view.size() * multiplier); + EXPECT_EQ(sv.offsets().type(), cudf::data_type{cudf::type_id::INT64}); + + // verify results in sections: each slice must match the input column + auto sliced = cudf::split(result->view(), splits); + for (auto c : sliced) { + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(c, input); + } + + // also test with large strings column as input + input_cols.clear(); + input_cols.push_back(input); // regular column + input_cols.push_back(result->view()); // large column + result = cudf::concatenate(input_cols); + sv = cudf::strings_column_view(result->view()); + EXPECT_EQ(sv.size(), view.size() * (multiplier + 1)); + EXPECT_EQ(sv.offsets().type(), cudf::data_type{cudf::type_id::INT64}); + splits.push_back(view.size() * multiplier); + sliced = cudf::split(result->view(), splits); + for (auto c : sliced) { + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(c, input); + } +} diff --git a/cpp/tests/large_strings/large_strings_fixture.cpp b/cpp/tests/large_strings/large_strings_fixture.cpp new file mode 100644 index 00000000000..59e0cd43d05 --- /dev/null +++ b/cpp/tests/large_strings/large_strings_fixture.cpp @@ -0,0 +1,122 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "large_strings_fixture.hpp" + +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include + +namespace cudf::test { +class LargeStringsData { + public: + using DataPointer = std::unique_ptr; + + virtual ~LargeStringsData() {} + + void add_table(std::string_view name, std::unique_ptr&& data) + { + _data[std::string(name)] = std::move(data); + } + + cudf::table_view get_table(std::string_view name) const + { + std::string key{name}; + return _data.find(key) != _data.end() ? _data.at(key)->view() : cudf::table_view{}; + } + + void add_column(std::string_view name, std::unique_ptr&& data) + { + std::vector> cols; + cols.emplace_back(std::move(data)); + _data[std::string(name)] = std::make_unique(std::move(cols)); + } + + cudf::column_view get_column(std::string_view name) const + { + std::string key{name}; + return _data.find(key) != _data.end() ? 
_data.at(key)->view().column(0) : cudf::column_view{}; + } + + bool has_key(std::string_view name) const { return _data.find(std::string(name)) != _data.end(); } + + protected: + std::map _data; +}; + +cudf::column_view StringsLargeTest::wide_column() +{ + std::string name{"wide1"}; + if (!g_ls_data->has_key(name)) { + auto input = + cudf::test::strings_column_wrapper({"the quick brown fox jumps over the lazy dog", + "the fat cat lays next to the other accénted cat", + "a slow moving turtlé cannot catch the bird", + "which can be composéd together to form a more complete", + "The result does not include the value in the sum in"}); + auto counts = cudf::test::fixed_width_column_wrapper({8, 8, 8, 8, 8}); + auto result = cudf::strings::repeat_strings(cudf::strings_column_view(input), counts); + g_ls_data->add_column(name, std::move(result)); + } + return g_ls_data->get_column(name); +} + +cudf::column_view StringsLargeTest::long_column() +{ + std::string name("long1"); + if (!g_ls_data->has_key(name)) { + auto itr = thrust::constant_iterator( + "abcdefghijklmnopqrstuvwxyABCDEFGHIJKLMNOPQRSTUVWXY"); // 50 bytes + auto input = cudf::test::strings_column_wrapper(itr, itr + 5'000'000); // 250MB + g_ls_data->add_column(name, input.release()); + } + return g_ls_data->get_column(name); +} + +std::unique_ptr StringsLargeTest::get_ls_data() +{ + CUDF_EXPECTS(g_ls_data == nullptr, "invalid call to get_ls_data"); + auto lsd_data = std::make_unique(); + g_ls_data = lsd_data.get(); + return lsd_data; +} + +LargeStringsData* StringsLargeTest::g_ls_data = nullptr; +} // namespace cudf::test + +int main(int argc, char** argv) +{ + ::testing::InitGoogleTest(&argc, argv); + auto const cmd_opts = parse_cudf_test_opts(argc, argv); + // always use the CUDA memory resource (ignores rmm_mode) to keep from exceeding the pool + auto mr = cudf::test::make_cuda(); + rmm::mr::set_current_device_resource(mr.get()); + auto adaptor = make_stream_mode_adaptor(cmd_opts); + + // create object to automatically be 
destroyed at the end of main() + auto lsd = cudf::test::StringsLargeTest::get_ls_data(); + + return RUN_ALL_TESTS(); +} diff --git a/cpp/tests/large_strings/large_strings_fixture.hpp b/cpp/tests/large_strings/large_strings_fixture.hpp new file mode 100644 index 00000000000..8827b65f1ce --- /dev/null +++ b/cpp/tests/large_strings/large_strings_fixture.hpp @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include + +namespace cudf::test { +class LargeStringsData; + +/** + * @brief Fixture for creating large strings tests + * + * Stores test strings columns for reuse by specific tests. + * Creating the test input only once helps speed up the overall tests. + * + * Also automatically enables appropriate large strings environment variables. 
+ */ +struct StringsLargeTest : public cudf::test::BaseFixture { + /** + * @brief Returns a cached 5-row column of long strings (each base string repeated 8 times) + */ + cudf::column_view wide_column(); + + /** + * @brief Returns a cached column of 5,000,000 rows that all hold the same 50-byte string + */ + cudf::column_view long_column(); + + large_strings_enabler g_ls_enabler; + static LargeStringsData* g_ls_data; + + static std::unique_ptr get_ls_data(); +}; +} // namespace cudf::test diff --git a/cpp/tests/large_strings/merge_tests.cpp b/cpp/tests/large_strings/merge_tests.cpp new file mode 100644 index 00000000000..afe6e424371 --- /dev/null +++ b/cpp/tests/large_strings/merge_tests.cpp @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "large_strings_fixture.hpp" + +#include + +#include +#include +#include + +#include + +struct MergeTest : public cudf::test::StringsLargeTest {}; + +TEST_F(MergeTest, MergeLargeStrings) +{ + auto const input = this->long_column(); + auto input_views = std::vector(); + auto const view = cudf::table_view({input}); + std::vector splits; + int const multiplier = 10; + for (int i = 0; i < multiplier; ++i) { // 10 x 250MB = 2500MB > 2GB + input_views.push_back(view); + splits.push_back(view.num_rows() * (i + 1)); + } + splits.pop_back(); // remove last entry (equals the total row count) + auto const column_order = std::vector{cudf::order::ASCENDING}; + auto const null_precedence = std::vector{cudf::null_order::AFTER}; + + auto result = cudf::merge(input_views, {0}, column_order, null_precedence); + auto sv = cudf::strings_column_view(result->view().column(0)); + EXPECT_EQ(sv.size(), view.num_rows() * multiplier); + EXPECT_EQ(sv.offsets().type(), cudf::data_type{cudf::type_id::INT64}); + + auto sliced = cudf::split(sv.parent(), splits); + for (auto c : sliced) { + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(c, input); + } + + // also test with large strings column as input + input_views.clear(); + input_views.push_back(view); // regular column + input_views.push_back(result->view()); // large column + result = cudf::merge(input_views, {0}, column_order, null_precedence); + sv = cudf::strings_column_view(result->view().column(0)); + EXPECT_EQ(sv.size(), view.num_rows() * (multiplier + 1)); + EXPECT_EQ(sv.offsets().type(), cudf::data_type{cudf::type_id::INT64}); + splits.push_back(view.num_rows() * multiplier); + sliced = cudf::split(sv.parent(), splits); + for (auto c : sliced) { + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(c, input); + } + + // also check merge still returns 32-bit offsets for regular columns + input_views.clear(); + input_views.push_back(view); + input_views.push_back(view); + result = cudf::merge(input_views, {0}, column_order, null_precedence); + sv = 
cudf::strings_column_view(result->view().column(0)); + EXPECT_EQ(sv.size(), view.num_rows() * 2); + EXPECT_EQ(sv.offsets().type(), cudf::data_type{cudf::type_id::INT32}); + sliced = cudf::split(sv.parent(), {view.num_rows()}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(sliced[0], input); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(sliced[1], input); +} diff --git a/cpp/tests/merge/merge_string_test.cpp b/cpp/tests/merge/merge_string_test.cpp index d7368d31944..28179a7341c 100644 --- a/cpp/tests/merge/merge_string_test.cpp +++ b/cpp/tests/merge/merge_string_test.cpp @@ -411,60 +411,3 @@ TYPED_TEST(MergeStringTest, Merge2StringKeyNullColumns) CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_column_view2, output_column_view2); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_column_view3, output_column_view3); } - -class MergeLargeStringsTest : public cudf::test::BaseFixture {}; - -TEST_F(MergeLargeStringsTest, MergeLargeStrings) -{ - CUDF_TEST_ENABLE_LARGE_STRINGS(); - auto itr = thrust::constant_iterator( - "abcdefghijklmnopqrstuvwxyABCDEFGHIJKLMNOPQRSTUVWXY"); // 50 bytes - auto const input = cudf::test::strings_column_wrapper(itr, itr + 5'000'000); // 250MB - auto input_views = std::vector(); - auto const view = cudf::table_view({input}); - std::vector splits; - int const multiplier = 10; - for (int i = 0; i < multiplier; ++i) { // 2500MB > 2GB - input_views.push_back(view); - splits.push_back(view.num_rows() * (i + 1)); - } - splits.pop_back(); // remove last entry - auto const column_order = std::vector{cudf::order::ASCENDING}; - auto const null_precedence = std::vector{cudf::null_order::AFTER}; - - auto result = cudf::merge(input_views, {0}, column_order, null_precedence); - auto sv = cudf::strings_column_view(result->view().column(0)); - EXPECT_EQ(sv.size(), view.num_rows() * multiplier); - EXPECT_EQ(sv.offsets().type(), cudf::data_type{cudf::type_id::INT64}); - - auto sliced = cudf::split(sv.parent(), splits); - for (auto c : sliced) { - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(c, input); - } 
- - // also test with large strings column as input - input_views.clear(); - input_views.push_back(view); // regular column - input_views.push_back(result->view()); // large column - result = cudf::merge(input_views, {0}, column_order, null_precedence); - sv = cudf::strings_column_view(result->view().column(0)); - EXPECT_EQ(sv.size(), view.num_rows() * (multiplier + 1)); - EXPECT_EQ(sv.offsets().type(), cudf::data_type{cudf::type_id::INT64}); - splits.push_back(view.num_rows() * multiplier); - sliced = cudf::split(sv.parent(), splits); - for (auto c : sliced) { - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(c, input); - } - - // also check merge still returns 32-bit offsets for regular columns - input_views.clear(); - input_views.push_back(view); - input_views.push_back(view); - result = cudf::merge(input_views, {0}, column_order, null_precedence); - sv = cudf::strings_column_view(result->view().column(0)); - EXPECT_EQ(sv.size(), view.num_rows() * 2); - EXPECT_EQ(sv.offsets().type(), cudf::data_type{cudf::type_id::INT32}); - sliced = cudf::split(sv.parent(), {view.num_rows()}); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(sliced[0], input); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(sliced[1], input); -}