Skip to content

Commit

Permalink
Large strings gtest fixture and utilities (#15513)
Browse files Browse the repository at this point in the history
Creates the base class and utilities for testing APIs that produce large strings.
The main purpose of the fixture is to enable the large strings environment variable(s) and to set up large test data that can be reused by multiple tests.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - MithunR (https://github.com/mythrocks)
  - Nghia Truong (https://github.com/ttnghia)

URL: #15513
  • Loading branch information
davidwendt authored Apr 24, 2024
1 parent 117eff6 commit 2eb71b2
Show file tree
Hide file tree
Showing 8 changed files with 351 additions and 110 deletions.
37 changes: 27 additions & 10 deletions cpp/include/cudf_test/testing_main.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,25 @@ inline auto parse_cudf_test_opts(int argc, char** argv)
}
}

/**
* @brief Creates the memory resource selected by the `rmm_mode` option and
* installs it as the current device resource
*
* The `rmm_mode` string is read from the parsed command line and passed to
* `cudf::test::create_memory_resource`. The resulting resource is always set
* as the current device resource (there is no condition on the stream mode).
*
* The caller must keep the return object alive for the life of the test runs
* since only a raw pointer to it is registered as the current device resource.
*
* @param cmd_opts Command line options returned by parse_cudf_test_opts
* @return Memory resource created for the requested `rmm_mode`
*/
inline auto make_memory_resource_adaptor(cxxopts::ParseResult const& cmd_opts)
{
auto const rmm_mode = cmd_opts["rmm_mode"].as<std::string>();
auto resource = cudf::test::create_memory_resource(rmm_mode);
rmm::mr::set_current_device_resource(resource.get());
return resource;
}

/**
* @brief Sets up stream mode memory resource adaptor
*
Expand Down Expand Up @@ -181,14 +200,12 @@ inline auto make_stream_mode_adaptor(cxxopts::ParseResult const& cmd_opts)
* function parses the command line to customize test behavior, like the
* allocation mode used for creating the default memory resource.
*/
#define CUDF_TEST_PROGRAM_MAIN() \
int main(int argc, char** argv) \
{ \
::testing::InitGoogleTest(&argc, argv); \
auto const cmd_opts = parse_cudf_test_opts(argc, argv); \
auto const rmm_mode = cmd_opts["rmm_mode"].as<std::string>(); \
auto resource = cudf::test::create_memory_resource(rmm_mode); \
rmm::mr::set_current_device_resource(resource.get()); \
auto adaptor = make_stream_mode_adaptor(cmd_opts); \
return RUN_ALL_TESTS(); \
#define CUDF_TEST_PROGRAM_MAIN() \
int main(int argc, char** argv) \
{ \
::testing::InitGoogleTest(&argc, argv); \
auto const cmd_opts = parse_cudf_test_opts(argc, argv); \
[[maybe_unused]] auto mr = make_memory_resource_adaptor(cmd_opts); \
[[maybe_unused]] auto adaptor = make_stream_mode_adaptor(cmd_opts); \
return RUN_ALL_TESTS(); \
}
9 changes: 9 additions & 0 deletions cpp/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -568,6 +568,15 @@ ConfigureTest(
strings/urls_tests.cpp
)

# ##################################################################################################
# * large strings test ----------------------------------------------------------------------------
# large_strings_fixture.cpp provides main() for this executable along with the shared test data,
# so it must be listed together with the individual test sources.
# NOTE(review): GPUS 1 / PERCENT 100 presumably reserves a whole GPU for this test because the
# inputs exceed 2GB -- confirm against the ConfigureTest definition.
ConfigureTest(
LARGE_STRINGS_TEST large_strings/large_strings_fixture.cpp large_strings/merge_tests.cpp
large_strings/concatenate_tests.cpp
GPUS 1
PERCENT 100
)

# ##################################################################################################
# * json path test --------------------------------------------------------------------------------
ConfigureTest(JSON_PATH_TEST json/json_tests.cpp)
Expand Down
43 changes: 0 additions & 43 deletions cpp/tests/copying/concatenate_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -197,49 +197,6 @@ TEST_F(StringColumnTest, ConcatenateTooLarge)
EXPECT_THROW(cudf::concatenate(input_cols), std::overflow_error);
}

// Concatenates a 250MB strings column with itself until the result passes 2GB and
// checks that the output switches to 64-bit offsets. The large result is then used
// as a concatenate input alongside a regular column.
TEST_F(StringColumnTest, ConcatenateLargeStrings)
{
  CUDF_TEST_ENABLE_LARGE_STRINGS();

  constexpr int multiplier = 10;  // 10 x 250MB = 2500MB > 2GB

  auto pattern = thrust::constant_iterator<std::string_view>(
    "abcdefghijklmnopqrstuvwxyABCDEFGHIJKLMNOPQRSTUVWXY");                       // 50 bytes
  auto input      = cudf::test::strings_column_wrapper(pattern, pattern + 5'000'000);  // 250MB
  auto const view = cudf::column_view(input);

  // the same view repeated `multiplier` times
  std::vector<cudf::column_view> input_cols(multiplier, view);

  // split points at each multiple of the input size, excluding the end of the result
  std::vector<cudf::size_type> splits;
  for (int section = 1; section < multiplier; ++section) {
    splits.push_back(view.size() * section);
  }

  auto result = cudf::concatenate(input_cols);
  {
    auto const large_sv = cudf::strings_column_view(result->view());
    EXPECT_EQ(large_sv.size(), view.size() * multiplier);
    EXPECT_EQ(large_sv.offsets().type(), cudf::data_type{cudf::type_id::INT64});
  }

  // verify results in sections
  for (auto const& section : cudf::split(result->view(), splits)) {
    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(section, view);
  }

  // also test with large strings column as input: regular column first, large column second
  input_cols = {view, result->view()};
  // assigning to the same variable releases the previous result once the new one exists
  result = cudf::concatenate(input_cols);
  {
    auto const larger_sv = cudf::strings_column_view(result->view());
    EXPECT_EQ(larger_sv.size(), view.size() * (multiplier + 1));
    EXPECT_EQ(larger_sv.offsets().type(), cudf::data_type{cudf::type_id::INT64});
  }

  splits.push_back(view.size() * multiplier);
  for (auto const& section : cudf::split(result->view(), splits)) {
    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(section, view);
  }
}

// Fixture for the table concatenation tests that follow
struct TableTest : public cudf::test::BaseFixture {};

TEST_F(TableTest, ConcatenateTables)
Expand Down
65 changes: 65 additions & 0 deletions cpp/tests/large_strings/concatenate_tests.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "large_strings_fixture.hpp"

#include <cudf_test/column_utilities.hpp>

#include <cudf/concatenate.hpp>
#include <cudf/copying.hpp>
#include <cudf/strings/strings_column_view.hpp>

#include <vector>

// Fixture for concatenate tests on large strings; the shared input columns
// come from the StringsLargeTest base class
struct ConcatenateTest : public cudf::test::StringsLargeTest {};

// Concatenates the shared long column with itself until the result passes 2GB and
// checks that the output uses 64-bit offsets. The large result is then used as a
// concatenate input alongside the regular column.
TEST_F(ConcatenateTest, ConcatenateVertical)
{
  constexpr int multiplier = 10;  // 2500MB > 2GB

  auto const input = this->long_column();

  // the same view repeated `multiplier` times
  std::vector<cudf::column_view> input_cols(multiplier, input);

  // split points at each multiple of the input size, excluding the end of the result
  std::vector<cudf::size_type> splits;
  for (int section = 1; section < multiplier; ++section) {
    splits.push_back(input.size() * section);
  }

  auto result = cudf::concatenate(input_cols);
  {
    auto const large_sv = cudf::strings_column_view(result->view());
    EXPECT_EQ(large_sv.size(), input.size() * multiplier);
    EXPECT_EQ(large_sv.offsets().type(), cudf::data_type{cudf::type_id::INT64});
  }

  // verify results in sections
  for (auto const& section : cudf::split(result->view(), splits)) {
    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(section, input);
  }

  // also test with large strings column as input: regular column first, large column second
  input_cols = {input, result->view()};
  // assigning to the same variable releases the previous result once the new one exists
  result = cudf::concatenate(input_cols);
  {
    auto const larger_sv = cudf::strings_column_view(result->view());
    EXPECT_EQ(larger_sv.size(), input.size() * (multiplier + 1));
    EXPECT_EQ(larger_sv.offsets().type(), cudf::data_type{cudf::type_id::INT64});
  }

  splits.push_back(input.size() * multiplier);
  for (auto const& section : cudf::split(result->view(), splits)) {
    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(section, input);
  }
}
122 changes: 122 additions & 0 deletions cpp/tests/large_strings/large_strings_fixture.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "large_strings_fixture.hpp"

#include <cudf_test/column_utilities.hpp>
#include <cudf_test/column_wrapper.hpp>
#include <cudf_test/testing_main.hpp>

#include <cudf/column/column.hpp>
#include <cudf/strings/combine.hpp>
#include <cudf/strings/repeat_strings.hpp>
#include <cudf/strings/strings_column_view.hpp>

#include <map>
#include <memory>
#include <vector>

namespace cudf::test {
class LargeStringsData {
public:
using DataPointer = std::unique_ptr<cudf::table>;

virtual ~LargeStringsData() {}

void add_table(std::string_view name, std::unique_ptr<cudf::table>&& data)
{
_data[std::string(name)] = std::move(data);
}

cudf::table_view get_table(std::string_view name) const
{
std::string key{name};
return _data.find(key) != _data.end() ? _data.at(key)->view() : cudf::table_view{};
}

void add_column(std::string_view name, std::unique_ptr<cudf::column>&& data)
{
std::vector<std::unique_ptr<cudf::column>> cols;
cols.emplace_back(std::move(data));
_data[std::string(name)] = std::make_unique<cudf::table>(std::move(cols));
}

cudf::column_view get_column(std::string_view name) const
{
std::string key{name};
return _data.find(key) != _data.end() ? _data.at(key)->view().column(0) : cudf::column_view{};
}

bool has_key(std::string_view name) const { return _data.find(std::string(name)) != _data.end(); }

protected:
std::map<std::string, DataPointer> _data;
};

// Returns the cached "wide1" column, building it on the first call.
// The column has 5 rows; each row is one of the sentences below repeated 8 times.
cudf::column_view StringsLargeTest::wide_column()
{
  std::string const key{"wide1"};
  if (g_ls_data->has_key(key)) { return g_ls_data->get_column(key); }

  auto sentences =
    cudf::test::strings_column_wrapper({"the quick brown fox jumps over the lazy dog",
                                        "the fat cat lays next to the other accénted cat",
                                        "a slow moving turtlé cannot catch the bird",
                                        "which can be composéd together to form a more complete",
                                        "The result does not include the value in the sum in"});
  auto repeat_counts = cudf::test::fixed_width_column_wrapper<int>({8, 8, 8, 8, 8});
  auto widened =
    cudf::strings::repeat_strings(cudf::strings_column_view(sentences), repeat_counts);
  g_ls_data->add_column(key, std::move(widened));

  return g_ls_data->get_column(key);
}

// Returns the cached "long1" column, building it on the first call.
// The column holds 5,000,000 copies of a 50-byte string (250MB of character data).
cudf::column_view StringsLargeTest::long_column()
{
  std::string const key("long1");
  if (g_ls_data->has_key(key)) { return g_ls_data->get_column(key); }

  auto pattern = thrust::constant_iterator<std::string_view>(
    "abcdefghijklmnopqrstuvwxyABCDEFGHIJKLMNOPQRSTUVWXY");                       // 50 bytes
  auto rows = cudf::test::strings_column_wrapper(pattern, pattern + 5'000'000);  // 250MB
  g_ls_data->add_column(key, rows.release());

  return g_ls_data->get_column(key);
}

// Creates the shared data container and registers it in g_ls_data.
// Ownership is returned to the caller; g_ls_data only keeps a raw pointer, so the
// returned object must outlive every test that uses the fixture.
// Fails the CUDF_EXPECTS check if a container has already been registered.
std::unique_ptr<LargeStringsData> StringsLargeTest::get_ls_data()
{
  CUDF_EXPECTS(g_ls_data == nullptr, "invalid call to get_ls_data");
  auto owner = std::make_unique<LargeStringsData>();
  g_ls_data  = owner.get();
  return owner;
}

// Non-owning pointer to the shared test data; null until get_ls_data() is called
LargeStringsData* StringsLargeTest::g_ls_data = nullptr;
} // namespace cudf::test

// Entry point for the large strings tests.
// Unlike the rmm_mode-driven setup, the memory resource here is fixed; the parsed
// options are only used to set up the stream mode adaptor.
int main(int argc, char** argv)
{
  ::testing::InitGoogleTest(&argc, argv);
  auto const cmd_opts = parse_cudf_test_opts(argc, argv);

  // hardcoding the CUDA memory resource to keep from exceeding the pool
  auto cuda_mr = cudf::test::make_cuda();
  rmm::mr::set_current_device_resource(cuda_mr.get());

  // kept alive for the duration of the test run
  [[maybe_unused]] auto stream_adaptor = make_stream_mode_adaptor(cmd_opts);

  // create object to automatically be destroyed at the end of main()
  [[maybe_unused]] auto shared_data = cudf::test::StringsLargeTest::get_ls_data();

  return RUN_ALL_TESTS();
}
49 changes: 49 additions & 0 deletions cpp/tests/large_strings/large_strings_fixture.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <cudf_test/base_fixture.hpp>
#include <cudf_test/column_utilities.hpp>

#include <cudf/column/column_view.hpp>

namespace cudf::test {
// Container for the shared test data; only forward declared here
class LargeStringsData;

/**
* @brief Fixture for creating large strings tests
*
* Stores test strings columns for reuse by specific tests.
* Creating the test input only once helps speed up the overall tests.
*
* Also automatically enables appropriate large strings environment variables.
*/
struct StringsLargeTest : public cudf::test::BaseFixture {
/**
* @brief Returns a column of long strings
*
* @return View of the shared column
*/
cudf::column_view wide_column();

/**
* @brief Returns a long column of strings
*
* @return View of the shared column
*/
cudf::column_view long_column();

// Per-fixture member (not static despite the g_ prefix).
// NOTE(review): presumably this is what enables the large strings environment
// variables for the lifetime of the fixture object -- confirm against large_strings_enabler.
large_strings_enabler g_ls_enabler;
// Shared test data used by the column accessors above; not owned by the fixture
static LargeStringsData* g_ls_data;

/**
* @brief Creates the shared test data object and registers it in g_ls_data
*
* @return Owning pointer to the shared test data
*/
static std::unique_ptr<LargeStringsData> get_ls_data();
};
} // namespace cudf::test
Loading

0 comments on commit 2eb71b2

Please sign in to comment.