Fix quantile tests running on multi-gpus #8775

Merged (8 commits) on Feb 13, 2023
5 changes: 5 additions & 0 deletions tests/buildkite/pipeline-mgpu.yml
@@ -36,6 +36,11 @@ steps:
       queue: linux-amd64-mgpu
   - wait
   #### -------- TEST --------
+  - label: ":console: Run Google Tests"
+    command: "tests/buildkite/test-cpp-mgpu.sh"
+    key: test-cpp-mgpu
+    agents:
+      queue: linux-amd64-mgpu
   - label: ":console: Test Python package, 4 GPUs"
     command: "tests/buildkite/test-python-gpu.sh mgpu"
     key: test-python-mgpu
16 changes: 16 additions & 0 deletions tests/buildkite/test-cpp-mgpu.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+
+set -euo pipefail
+
+source tests/buildkite/conftest.sh
+
+# Allocate extra space in /dev/shm to enable NCCL
+export CI_DOCKER_EXTRA_PARAMS_INIT='--shm-size=4g'
+
+echo "--- Run Google Tests with CUDA, using multiple GPUs"
+buildkite-agent artifact download "build/testxgboost" . --step build-cuda
+chmod +x build/testxgboost
+tests/ci_build/ci_build.sh gpu nvidia-docker \
+  --build-arg CUDA_VERSION_ARG=$CUDA_VERSION \
+  --build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION \
+  build/testxgboost --gtest_filter=*MGPU*
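
For context on the last line: gtest matches --gtest_filter against each test's full "Suite.Name" string, which is why the tests in the diff below are renamed to carry an MGPU prefix. A minimal, self-contained sketch of the convention (the test names here are illustrative, not part of this PR):

    #include <gtest/gtest.h>

    // Selected by --gtest_filter=*MGPU*: the full name
    // "GPUQuantile.MGPUExample" contains "MGPU".
    TEST(GPUQuantile, MGPUExample) { SUCCEED(); }

    // Filtered out: "GPUQuantile.SingleGpuExample" does not match.
    TEST(GPUQuantile, SingleGpuExample) { SUCCEED(); }

    int main(int argc, char** argv) {
      ::testing::InitGoogleTest(&argc, argv);  // parses --gtest_filter
      return RUN_ALL_TESTS();
    }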
61 changes: 29 additions & 32 deletions tests/cpp/common/test_quantile.cu
@@ -341,30 +341,26 @@ TEST(GPUQuantile, MultiMerge) {
 namespace {
 void TestAllReduceBasic(int32_t n_gpus) {
   auto const world = collective::GetWorldSize();
-  if (world != 1) {
-    ASSERT_EQ(world, n_gpus);
-  } else {
-    return;
-  }
-
   constexpr size_t kRows = 1000, kCols = 100;
   RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins, MetaInfo const& info) {
+    auto const device = collective::GetRank();
+
     // Set up single node version;
-    HostDeviceVector<FeatureType> ft;
-    SketchContainer sketch_on_single_node(ft, n_bins, kCols, kRows, 0);
+    HostDeviceVector<FeatureType> ft({}, device);
+    SketchContainer sketch_on_single_node(ft, n_bins, kCols, kRows, device);

     size_t intermediate_num_cuts = std::min(
         kRows * world, static_cast<size_t>(n_bins * WQSketch::kFactor));
     std::vector<SketchContainer> containers;
     for (auto rank = 0; rank < world; ++rank) {
-      HostDeviceVector<float> storage;
+      HostDeviceVector<float> storage({}, device);
       std::string interface_str = RandomDataGenerator{kRows, kCols, 0}
-                                      .Device(0)
+                                      .Device(device)
                                       .Seed(rank + seed)
                                       .GenerateArrayInterface(&storage);
       data::CupyAdapter adapter(interface_str);
-      HostDeviceVector<FeatureType> ft;
-      containers.emplace_back(ft, n_bins, kCols, kRows, 0);
+      HostDeviceVector<FeatureType> ft({}, device);
+      containers.emplace_back(ft, n_bins, kCols, kRows, device);
       AdapterDeviceSketch(adapter.Value(), n_bins, info,
                           std::numeric_limits<float>::quiet_NaN(),
                           &containers.back());
@@ -375,16 +371,16 @@ void TestAllReduceBasic(int32_t n_gpus) {
       sketch_on_single_node.FixError();
     }
     sketch_on_single_node.Unique();
-    TestQuantileElemRank(0, sketch_on_single_node.Data(),
+    TestQuantileElemRank(device, sketch_on_single_node.Data(),
                          sketch_on_single_node.ColumnsPtr(), true);

     // Set up distributed version. We rely on using rank as seed to generate
     // the exact same copy of data.
     auto rank = collective::GetRank();
-    SketchContainer sketch_distributed(ft, n_bins, kCols, kRows, 0);
-    HostDeviceVector<float> storage;
+    SketchContainer sketch_distributed(ft, n_bins, kCols, kRows, device);
+    HostDeviceVector<float> storage({}, device);
     std::string interface_str = RandomDataGenerator{kRows, kCols, 0}
-                                    .Device(0)
+                                    .Device(device)
                                     .Seed(rank + seed)
                                     .GenerateArrayInterface(&storage);
     data::CupyAdapter adapter(interface_str);
@@ -399,7 +395,7 @@ void TestAllReduceBasic(int32_t n_gpus) {
     ASSERT_EQ(sketch_distributed.Data().size(),
               sketch_on_single_node.Data().size());

-    TestQuantileElemRank(0, sketch_distributed.Data(),
+    TestQuantileElemRank(device, sketch_distributed.Data(),
                          sketch_distributed.ColumnsPtr(), true);

     std::vector<SketchEntry> single_node_data(
@@ -420,29 +416,27 @@ void TestAllReduceBasic(int32_t n_gpus) {
 }
 } // anonymous namespace

-TEST(GPUQuantile, AllReduceBasic) {
+TEST(GPUQuantile, MGPUAllReduceBasic) {
   auto const n_gpus = AllVisibleGPUs();
+  if (n_gpus <= 1) {
+    GTEST_SKIP() << "Skipping MGPUAllReduceBasic test with # GPUs = " << n_gpus;
+  }
   RunWithInMemoryCommunicator(n_gpus, TestAllReduceBasic, n_gpus);
 }

 namespace {
 void TestSameOnAllWorkers(int32_t n_gpus) {
   auto world = collective::GetWorldSize();
-  if (world != 1) {
-    ASSERT_EQ(world, n_gpus);
-  } else {
-    return;
-  }
-
   constexpr size_t kRows = 1000, kCols = 100;
   RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins,
                                  MetaInfo const &info) {
-    auto rank = collective::GetRank();
-    HostDeviceVector<FeatureType> ft;
-    SketchContainer sketch_distributed(ft, n_bins, kCols, kRows, 0);
-    HostDeviceVector<float> storage;
+    auto const rank = collective::GetRank();
+    auto const device = rank;
+    HostDeviceVector<FeatureType> ft({}, device);
+    SketchContainer sketch_distributed(ft, n_bins, kCols, kRows, device);
+    HostDeviceVector<float> storage({}, device);
     std::string interface_str = RandomDataGenerator{kRows, kCols, 0}
-                                    .Device(0)
+                                    .Device(device)
                                     .Seed(rank + seed)
                                     .GenerateArrayInterface(&storage);
     data::CupyAdapter adapter(interface_str);
@@ -451,7 +445,7 @@ void TestSameOnAllWorkers(int32_t n_gpus) {
                         &sketch_distributed);
     sketch_distributed.AllReduce();
     sketch_distributed.Unique();
-    TestQuantileElemRank(0, sketch_distributed.Data(), sketch_distributed.ColumnsPtr(), true);
+    TestQuantileElemRank(device, sketch_distributed.Data(), sketch_distributed.ColumnsPtr(), true);

     // Test for all workers having the same sketch.
     size_t n_data = sketch_distributed.Data().size();
@@ -468,7 +462,7 @@ void TestSameOnAllWorkers(int32_t n_gpus) {
     thrust::copy(thrust::device, local_data.data(),
                  local_data.data() + local_data.size(),
                  all_workers.begin() + local_data.size() * rank);
-    collective::DeviceCommunicator* communicator = collective::Communicator::GetDevice(0);
+    collective::DeviceCommunicator* communicator = collective::Communicator::GetDevice(device);

     communicator->AllReduceSum(all_workers.data().get(), all_workers.size());
     communicator->Synchronize();
@@ -492,8 +486,11 @@ void TestSameOnAllWorkers(int32_t n_gpus) {
 }
 } // anonymous namespace

-TEST(GPUQuantile, SameOnAllWorkers) {
+TEST(GPUQuantile, MGPUSameOnAllWorkers) {
   auto const n_gpus = AllVisibleGPUs();
+  if (n_gpus <= 1) {
+    GTEST_SKIP() << "Skipping MGPUSameOnAllWorkers test with # GPUs = " << n_gpus;
+  }
   RunWithInMemoryCommunicator(n_gpus, TestSameOnAllWorkers, n_gpus);
 }
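
The recurring pattern in this diff: each worker spawned by RunWithInMemoryCommunicator treats its collective rank as its CUDA device ordinal, so per-worker sketches land on distinct GPUs instead of all sharing device 0. A minimal sketch of that mapping, assuming one worker per visible GPU; RunOnOwnGpu is an illustrative name, not part of the PR, and dh::safe_cuda is xgboost's CUDA error-checking wrapper:

    void RunOnOwnGpu() {
      auto const rank = collective::GetRank();  // worker id within the communicator
      auto const device = rank;                 // rank 0 -> GPU 0, rank 1 -> GPU 1, ...
      dh::safe_cuda(cudaSetDevice(device));     // bind this worker's CUDA calls to its GPU
      // From here on, allocate per-worker state on `device`, e.g.
      // HostDeviceVector<FeatureType> ft({}, device);
    }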
