Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Large strings gtest fixture and utilities #15513

Merged
merged 13 commits into from
Apr 24, 2024
37 changes: 27 additions & 10 deletions cpp/include/cudf_test/testing_main.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,25 @@ inline auto parse_cudf_test_opts(int argc, char** argv)
}
}

/**
 * @brief Creates the memory resource selected on the command line and installs it
 *
 * Reads the `rmm_mode` option, creates the matching resource with
 * cudf::test::create_memory_resource and sets it as the current device
 * resource. The resource is always installed; it does not depend on the
 * stream mode.
 *
 * The caller must keep the return object alive for the life of the test runs:
 * only the raw pointer obtained from it is registered as the current device
 * resource.
 *
 * @param cmd_opts Command line options returned by parse_cudf_test_opts
 * @return The memory resource that was set as the current device resource
 */
[[nodiscard]] inline auto make_memory_resource_adaptor(cxxopts::ParseResult const& cmd_opts)
{
  auto const rmm_mode = cmd_opts["rmm_mode"].as<std::string>();
  auto resource       = cudf::test::create_memory_resource(rmm_mode);
  rmm::mr::set_current_device_resource(resource.get());
  return resource;
}

/**
* @brief Sets up stream mode memory resource adaptor
*
Expand Down Expand Up @@ -181,14 +200,12 @@ inline auto make_stream_mode_adaptor(cxxopts::ParseResult const& cmd_opts)
* function parses the command line to customize test behavior, like the
* allocation mode used for creating the default memory resource.
*/
#define CUDF_TEST_PROGRAM_MAIN() \
int main(int argc, char** argv) \
{ \
::testing::InitGoogleTest(&argc, argv); \
auto const cmd_opts = parse_cudf_test_opts(argc, argv); \
auto const rmm_mode = cmd_opts["rmm_mode"].as<std::string>(); \
auto resource = cudf::test::create_memory_resource(rmm_mode); \
rmm::mr::set_current_device_resource(resource.get()); \
auto adaptor = make_stream_mode_adaptor(cmd_opts); \
return RUN_ALL_TESTS(); \
#define CUDF_TEST_PROGRAM_MAIN() \
int main(int argc, char** argv) \
{ \
::testing::InitGoogleTest(&argc, argv); \
auto const cmd_opts = parse_cudf_test_opts(argc, argv); \
auto mr = make_memory_resource_adaptor(cmd_opts); \
auto adaptor = make_stream_mode_adaptor(cmd_opts); \
davidwendt marked this conversation as resolved.
Show resolved Hide resolved
return RUN_ALL_TESTS(); \
}
9 changes: 9 additions & 0 deletions cpp/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -568,6 +568,15 @@ ConfigureTest(
strings/urls_tests.cpp
)

# ##################################################################################################
# * large strings test ----------------------------------------------------------------------------
# Builds the LARGE_STRINGS_TEST executable from the sources under large_strings/.
# NOTE(review): GPUS 1 / PERCENT 100 presumably reserve a whole GPU for this test
# because of its memory footprint -- confirm against the ConfigureTest definition.
ConfigureTest(
LARGE_STRINGS_TEST large_strings/large_strings_tests.cpp large_strings/merge_tests.cpp
large_strings/concatenate_tests.cpp
GPUS 1
PERCENT 100
)

# ##################################################################################################
# * json path test --------------------------------------------------------------------------------
ConfigureTest(JSON_PATH_TEST json/json_tests.cpp)
Expand Down
43 changes: 0 additions & 43 deletions cpp/tests/copying/concatenate_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -197,49 +197,6 @@ TEST_F(StringColumnTest, ConcatenateTooLarge)
EXPECT_THROW(cudf::concatenate(input_cols), std::overflow_error);
}

// Concatenates ten copies of a 250MB strings column (2500MB in total) and checks
// that the result uses INT64 offsets and that every section equals the input.
TEST_F(StringColumnTest, ConcatenateLargeStrings)
{
  CUDF_TEST_ENABLE_LARGE_STRINGS();
  auto pattern = thrust::constant_iterator<std::string_view>(
    "abcdefghijklmnopqrstuvwxyABCDEFGHIJKLMNOPQRSTUVWXY");                      // 50 bytes
  auto input = cudf::test::strings_column_wrapper(pattern, pattern + 5'000'000);  // 250MB
  auto view  = cudf::column_view(input);

  int const multiplier = 10;  // 2500MB > 2GB
  std::vector<cudf::column_view> input_cols(multiplier, view);
  // split points between consecutive copies; the end of the last copy is not a split
  std::vector<cudf::size_type> splits;
  for (int i = 1; i < multiplier; ++i) {
    splits.push_back(view.size() * i);
  }

  auto result = cudf::concatenate(input_cols);
  auto sv     = cudf::strings_column_view(result->view());
  EXPECT_EQ(sv.size(), view.size() * multiplier);
  EXPECT_EQ(sv.offsets().type(), cudf::data_type{cudf::type_id::INT64});

  // each section of the result must match the original input
  auto sliced = cudf::split(result->view(), splits);
  for (auto c : sliced) {
    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(c, input);
  }

  // repeat with the large strings result itself as one of the inputs
  {
    input_cols.clear();
    input_cols.push_back(input);           // regular column
    input_cols.push_back(result->view());  // large column
    result = cudf::concatenate(input_cols);
    sv     = cudf::strings_column_view(result->view());
    EXPECT_EQ(sv.size(), view.size() * (multiplier + 1));
    EXPECT_EQ(sv.offsets().type(), cudf::data_type{cudf::type_id::INT64});
    splits.push_back(view.size() * multiplier);
    sliced = cudf::split(result->view(), splits);
    for (auto c : sliced) {
      CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(c, input);
    }
  }
}

// Fixture for the table concatenation tests; adds nothing to the base fixture.
struct TableTest : cudf::test::BaseFixture {};

TEST_F(TableTest, ConcatenateTables)
Expand Down
65 changes: 65 additions & 0 deletions cpp/tests/large_strings/concatenate_tests.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "large_strings_tests.hpp"

#include <cudf_test/column_utilities.hpp>

#include <cudf/concatenate.hpp>
#include <cudf/copying.hpp>
#include <cudf/strings/strings_column_view.hpp>

#include <vector>

// Fixture for the large strings concatenate tests; adds nothing to StringsLargeTest.
struct ConcatenateTest : cudf::test::StringsLargeTest {};

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Moved from copying/concatenate_tests.cpp

// Concatenates enough copies of the fixture's long column to exceed 2GB and
// checks that the result uses INT64 offsets and that every section of the
// result equals the original input column.
TEST_F(ConcatenateTest, ConcatenateVertical)
{
  auto const input = this->long_column();

  // Splits `column` at `splits` and compares every section with the input.
  // Sections are taken by const reference to avoid copying each column_view.
  auto const expect_sections_match_input = [&input](cudf::column_view const& column,
                                                    std::vector<cudf::size_type> const& splits) {
    auto const sliced = cudf::split(column, splits);
    for (auto const& section : sliced) {
      CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(section, input);
    }
  };

  std::vector<cudf::column_view> input_cols;
  std::vector<cudf::size_type> splits;
  int const multiplier = 10;
  for (int i = 0; i < multiplier; ++i) {  // 2500MB > 2GB
    input_cols.push_back(input);
    splits.push_back(input.size() * (i + 1));
  }
  splits.pop_back();  // remove last entry
  auto result = cudf::concatenate(input_cols);
  auto sv     = cudf::strings_column_view(result->view());
  EXPECT_EQ(sv.size(), input.size() * multiplier);
  EXPECT_EQ(sv.offsets().type(), cudf::data_type{cudf::type_id::INT64});

  // verify results in sections
  expect_sections_match_input(result->view(), splits);

  // also test with large strings column as input
  input_cols.clear();
  input_cols.push_back(input);           // regular column
  input_cols.push_back(result->view());  // large column
  result = cudf::concatenate(input_cols);
  sv     = cudf::strings_column_view(result->view());
  EXPECT_EQ(sv.size(), input.size() * (multiplier + 1));
  EXPECT_EQ(sv.offsets().type(), cudf::data_type{cudf::type_id::INT64});
  splits.push_back(input.size() * multiplier);
  expect_sections_match_input(result->view(), splits);
}
114 changes: 114 additions & 0 deletions cpp/tests/large_strings/large_strings_tests.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
/*
davidwendt marked this conversation as resolved.
Show resolved Hide resolved
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "large_strings_tests.hpp"

#include <cudf_test/column_utilities.hpp>
#include <cudf_test/column_wrapper.hpp>
#include <cudf_test/testing_main.hpp>

#include <cudf/column/column.hpp>
#include <cudf/strings/combine.hpp>
#include <cudf/strings/repeat_strings.hpp>
#include <cudf/strings/strings_column_view.hpp>

#include <map>
#include <memory>
#include <vector>

namespace cudf::test {
class LargeStringsData {
public:
using DataPointer = std::unique_ptr<cudf::table>;

~LargeStringsData() {}
davidwendt marked this conversation as resolved.
Show resolved Hide resolved

void add_table(std::string_view name, std::unique_ptr<cudf::table>&& data)
{
_data[std::string(name)] = std::move(data);
}

cudf::table_view get_table(std::string_view name) const
{
std::string key{name};
return _data.find(key) != _data.end() ? _data.at(key)->view() : cudf::table_view{};
}

void add_column(std::string_view name, std::unique_ptr<cudf::column>&& data)
{
std::vector<std::unique_ptr<cudf::column>> cols;
cols.emplace_back(std::move(data));
_data[std::string(name)] = std::make_unique<cudf::table>(std::move(cols));
}

cudf::column_view get_column(std::string_view name) const
{
std::string key{name};
return _data.find(key) != _data.end() ? _data.at(key)->view().column(0) : cudf::column_view{};
}

bool has_key(std::string_view name) { return _data.find(std::string(name)) != _data.end(); }
davidwendt marked this conversation as resolved.
Show resolved Hide resolved

protected:
std::map<std::string, DataPointer> _data;
};

// Returns the cached "wide1" column, building it on first use from five
// sentences that are each repeated 8 times.
cudf::column_view StringsLargeTest::wide_column()
{
  std::string key{"wide1"};
  if (g_ls_data->has_key(key)) { return g_ls_data->get_column(key); }

  auto sentences =
    cudf::test::strings_column_wrapper({"the quick brown fox jumps over the lazy dog",
                                        "the fat cat lays next to the other accénted cat",
                                        "a slow moving turtlé cannot catch the bird",
                                        "which can be composéd together to form a more complete",
                                        "The result does not include the value in the sum in"});
  auto repeats = cudf::test::fixed_width_column_wrapper<int>({8, 8, 8, 8, 8});
  g_ls_data->add_column(
    key, cudf::strings::repeat_strings(cudf::strings_column_view(sentences), repeats));
  return g_ls_data->get_column(key);
}

// Returns the cached "long1" column, building it on first use from
// 5,000,000 copies of a 50-byte string.
cudf::column_view StringsLargeTest::long_column()
{
  std::string key("long1");
  if (g_ls_data->has_key(key)) { return g_ls_data->get_column(key); }

  auto pattern = thrust::constant_iterator<std::string_view>(
    "abcdefghijklmnopqrstuvwxyABCDEFGHIJKLMNOPQRSTUVWXY");                        // 50 bytes
  auto wrapper = cudf::test::strings_column_wrapper(pattern, pattern + 5'000'000);  // 250MB
  g_ls_data->add_column(key, wrapper.release());
  return g_ls_data->get_column(key);
}

// Non-owning pointer to the shared test data; stays null until main() points it
// at the LargeStringsData object it creates.
LargeStringsData* StringsLargeTest::g_ls_data = nullptr;
} // namespace cudf::test

// Test program entry point. Parses the cudf test options, installs the memory
// resource and stream adaptor, and creates the shared large strings data
// object as a local of main() before the tests are run.
int main(int argc, char** argv)
{
::testing::InitGoogleTest(&argc, argv);
auto const cmd_opts = parse_cudf_test_opts(argc, argv);
// both objects are held in locals so they stay alive until main() returns
auto mr = make_memory_resource_adaptor(cmd_opts);
auto adaptor = make_stream_mode_adaptor(cmd_opts);

// create object to automatically be destroyed at the end of main()
// (declared after mr and adaptor, so it is destroyed before them)
auto lsd = cudf::test::LargeStringsData();
// set object pointer into static variable
cudf::test::StringsLargeTest::g_ls_data = &lsd;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh no, manually assigning static variable like this is not a good practice. Can we initialize it automatically?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The issue is that the variable (or at least a pointer to it) needs to be accessible globally, but its lifetime must be confined to main(). So lsd must be created and destroyed within the scope of main() while also acting as a singleton for the entire process.

Copy link
Contributor

@ttnghia ttnghia Apr 24, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How about create-on-first-access?

struct StringsLargeTest : public cudf::test::BaseFixture {
  public:
  // Creates the shared data object on the first call only and returns the same
  // pointer on every later call.
  // NOTE: the object is allocated with `new` and is never deleted here.
  static auto get_ls_data() {
    if (g_ls_data == nullptr) { g_ls_data = new cudf::test::LargeStringsData; }
    return g_ls_data;
  }
  private:
  static LargeStringsData* g_ls_data;
};

get_ls_data() then should be called within main() scope.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This won't work because it returns a raw pointer to main(), and nothing will automatically destroy the object it points to.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there value in making this a function static? Its construction is guaranteed to be thread safe, and will be destroyed in reverse order of construction.

static auto get_ls_data() {
  // Function-local static: constructed once, on the first call, so the returned
  // pointer stays valid after this function returns.
  static auto the_instance = cudf::test::LargeStringsData{};
  return &the_instance;
}

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We could use a smart pointer here. This seems to work.

    // Creates the data object, publishes a non-owning pointer to it in
    // g_ls_data, and hands ownership to the caller.
    static auto get_lsd_data(int v) {
        auto owner = std::make_unique<LargeStringsData>(v);
        g_ls_data  = owner.get();
        return owner;
    }

When the smart pointer goes out of scope, it will delete the object.
This is more in line with how RMM handles its memory resource objects.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If the object (not the pointer) is a static variable anywhere there is a chance it could be destroyed outside of main().

Sorry, I don't fully understand the concern. The function static object is guaranteed to be alive until main() exits. Does that not suit?

Do we have a dependency somewhere in the global static destruction sequence, or something?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The getter needs to check and throw if g_ls_data is not yet initialized.

@ttnghia: With a function static, the object is guaranteed to be initialized once, on the first call.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Initializing part is not too challenging. Global static destruction is not good since the object holds device memory.
Here is a godbolt which I hope will explain some of this: https://godbolt.org/z/rTa9ceEKf

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Global static destruction is not good since the object holds device memory.

Hmm. Thank you, I'll try to bear this in mind.


return RUN_ALL_TESTS();
}
47 changes: 47 additions & 0 deletions cpp/tests/large_strings/large_strings_tests.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <cudf_test/base_fixture.hpp>
#include <cudf_test/column_utilities.hpp>

#include <cudf/column/column_view.hpp>

namespace cudf::test {
class LargeStringsData;

/**
 * @brief Test fixture shared by the large strings tests
 *
 * Hands out strings columns that are stored for reuse by specific tests.
 * Creating the test input only once helps speed up the overall tests.
 *
 * Also automatically enables appropriate large strings environment variables.
 */
struct StringsLargeTest : cudf::test::BaseFixture {
  /// @brief Returns a column of long strings
  cudf::column_view wide_column();

  /// @brief Returns a long column of strings
  cudf::column_view long_column();

  /// Per-fixture object that enables large strings
  large_strings_enabler g_ls_enabler;
  /// Non-owning pointer to the data store shared by all tests
  static LargeStringsData* g_ls_data;
};
} // namespace cudf::test
Loading
Loading