Fix quantile tests running on multi-gpus (#8775)
* Fix quantile tests running on multi-gpus

* Run some gtests with multiple GPUs

* fix mgpu test naming

* Instruct NCCL to print extra logs

* Allocate extra space in /dev/shm to enable NCCL

* use gtest_skip to skip mgpu tests

---------

Co-authored-by: Hyunsu Philip Cho <[email protected]>
rongou and hcho3 authored Feb 13, 2023
1 parent 225b315 commit ed91e77
Showing 3 changed files with 50 additions and 32 deletions.
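The last bullet of the commit message ("use gtest_skip to skip mgpu tests") is the behavioural core of the change: a multi-GPU test now reports itself as skipped on machines with fewer than two GPUs instead of returning early and counting as passed. A minimal, self-contained sketch of that guard, not taken from the repository: CountVisibleGpus() here is a CUDA-free stand-in for xgboost's AllVisibleGPUs(), and the test links against gtest_main.

#include <gtest/gtest.h>

#include <cstdlib>

// Stand-in for xgboost's AllVisibleGPUs(); reads an env var so that the
// sketch builds and runs without CUDA.
static int CountVisibleGpus() {
  const char* n = std::getenv("N_GPUS");
  return n != nullptr ? std::atoi(n) : 1;
}

TEST(Example, MGPUSkipGuard) {
  auto const n_gpus = CountVisibleGpus();
  if (n_gpus <= 1) {
    // GTEST_SKIP() records the test as skipped (not passed) and returns.
    GTEST_SKIP() << "Skipping: needs at least 2 GPUs, found " << n_gpus;
  }
  EXPECT_GE(n_gpus, 2);
}

The same pattern, with AllVisibleGPUs() and the real test bodies, appears in the test_quantile.cu hunks below.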
5 changes: 5 additions & 0 deletions tests/buildkite/pipeline-mgpu.yml
@@ -36,6 +36,11 @@ steps:
       queue: linux-amd64-mgpu
   - wait
   #### -------- TEST --------
+  - label: ":console: Run Google Tests"
+    command: "tests/buildkite/test-cpp-mgpu.sh"
+    key: test-cpp-mgpu
+    agents:
+      queue: linux-amd64-mgpu
   - label: ":console: Test Python package, 4 GPUs"
     command: "tests/buildkite/test-python-gpu.sh mgpu"
     key: test-python-mgpu
16 changes: 16 additions & 0 deletions tests/buildkite/test-cpp-mgpu.sh
@@ -0,0 +1,16 @@
#!/bin/bash

set -euo pipefail

source tests/buildkite/conftest.sh

# Allocate extra space in /dev/shm to enable NCCL
export CI_DOCKER_EXTRA_PARAMS_INIT='--shm-size=4g'

echo "--- Run Google Tests with CUDA, using multiple GPUs"
buildkite-agent artifact download "build/testxgboost" . --step build-cuda
chmod +x build/testxgboost
tests/ci_build/ci_build.sh gpu nvidia-docker \
  --build-arg CUDA_VERSION_ARG=$CUDA_VERSION \
  --build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION \
  build/testxgboost --gtest_filter=*MGPU*
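The script downloads the testxgboost binary produced by the build-cuda step and runs it inside the CI container with the two multi-GPU accommodations named in the commit message: /dev/shm is enlarged via --shm-size because Docker's default shared-memory allocation is typically too small for NCCL's shared-memory transport, and --gtest_filter=*MGPU* limits the run to the renamed multi-GPU tests. The filter matches googletest's full test names (Suite.TestName) against the wildcard pattern, which is why the MGPU prefix added in test_quantile.cu below is what routes a test into this job. A hypothetical illustration (ExampleSuite and both test names are invented):

#include <gtest/gtest.h>

// Selected by --gtest_filter=*MGPU*: the full name "ExampleSuite.MGPUAllReduce"
// contains the substring "MGPU".
TEST(ExampleSuite, MGPUAllReduce) { SUCCEED(); }

// Not selected: "ExampleSuite.SingleGpuHistogram" does not match "*MGPU*".
TEST(ExampleSuite, SingleGpuHistogram) { SUCCEED(); }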
61 changes: 29 additions & 32 deletions tests/cpp/common/test_quantile.cu
@@ -341,30 +341,26 @@ TEST(GPUQuantile, MultiMerge) {
 namespace {
 void TestAllReduceBasic(int32_t n_gpus) {
   auto const world = collective::GetWorldSize();
-  if (world != 1) {
-    ASSERT_EQ(world, n_gpus);
-  } else {
-    return;
-  }

   constexpr size_t kRows = 1000, kCols = 100;
   RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins, MetaInfo const& info) {
+    auto const device = collective::GetRank();

     // Set up single node version;
-    HostDeviceVector<FeatureType> ft;
-    SketchContainer sketch_on_single_node(ft, n_bins, kCols, kRows, 0);
+    HostDeviceVector<FeatureType> ft({}, device);
+    SketchContainer sketch_on_single_node(ft, n_bins, kCols, kRows, device);

     size_t intermediate_num_cuts = std::min(
         kRows * world, static_cast<size_t>(n_bins * WQSketch::kFactor));
     std::vector<SketchContainer> containers;
     for (auto rank = 0; rank < world; ++rank) {
-      HostDeviceVector<float> storage;
+      HostDeviceVector<float> storage({}, device);
       std::string interface_str = RandomDataGenerator{kRows, kCols, 0}
-                                      .Device(0)
+                                      .Device(device)
                                       .Seed(rank + seed)
                                       .GenerateArrayInterface(&storage);
       data::CupyAdapter adapter(interface_str);
-      HostDeviceVector<FeatureType> ft;
-      containers.emplace_back(ft, n_bins, kCols, kRows, 0);
+      HostDeviceVector<FeatureType> ft({}, device);
+      containers.emplace_back(ft, n_bins, kCols, kRows, device);
       AdapterDeviceSketch(adapter.Value(), n_bins, info,
                           std::numeric_limits<float>::quiet_NaN(),
                           &containers.back());
@@ -375,16 +371,16 @@ void TestAllReduceBasic(int32_t n_gpus) {
       sketch_on_single_node.FixError();
     }
     sketch_on_single_node.Unique();
-    TestQuantileElemRank(0, sketch_on_single_node.Data(),
+    TestQuantileElemRank(device, sketch_on_single_node.Data(),
                          sketch_on_single_node.ColumnsPtr(), true);

     // Set up distributed version. We rely on using rank as seed to generate
     // the exact same copy of data.
     auto rank = collective::GetRank();
-    SketchContainer sketch_distributed(ft, n_bins, kCols, kRows, 0);
-    HostDeviceVector<float> storage;
+    SketchContainer sketch_distributed(ft, n_bins, kCols, kRows, device);
+    HostDeviceVector<float> storage({}, device);
     std::string interface_str = RandomDataGenerator{kRows, kCols, 0}
-                                    .Device(0)
+                                    .Device(device)
                                     .Seed(rank + seed)
                                     .GenerateArrayInterface(&storage);
     data::CupyAdapter adapter(interface_str);
@@ -399,7 +395,7 @@ void TestAllReduceBasic(int32_t n_gpus) {
     ASSERT_EQ(sketch_distributed.Data().size(),
               sketch_on_single_node.Data().size());

-    TestQuantileElemRank(0, sketch_distributed.Data(),
+    TestQuantileElemRank(device, sketch_distributed.Data(),
                          sketch_distributed.ColumnsPtr(), true);

     std::vector<SketchEntry> single_node_data(
@@ -420,29 +416,27 @@ void TestAllReduceBasic(int32_t n_gpus) {
 }
 }  // anonymous namespace

-TEST(GPUQuantile, AllReduceBasic) {
+TEST(GPUQuantile, MGPUAllReduceBasic) {
   auto const n_gpus = AllVisibleGPUs();
+  if (n_gpus <= 1) {
+    GTEST_SKIP() << "Skipping MGPUAllReduceBasic test with # GPUs = " << n_gpus;
+  }
   RunWithInMemoryCommunicator(n_gpus, TestAllReduceBasic, n_gpus);
 }

 namespace {
 void TestSameOnAllWorkers(int32_t n_gpus) {
   auto world = collective::GetWorldSize();
-  if (world != 1) {
-    ASSERT_EQ(world, n_gpus);
-  } else {
-    return;
-  }

   constexpr size_t kRows = 1000, kCols = 100;
   RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins,
                                  MetaInfo const &info) {
-    auto rank = collective::GetRank();
-    HostDeviceVector<FeatureType> ft;
-    SketchContainer sketch_distributed(ft, n_bins, kCols, kRows, 0);
-    HostDeviceVector<float> storage;
+    auto const rank = collective::GetRank();
+    auto const device = rank;
+    HostDeviceVector<FeatureType> ft({}, device);
+    SketchContainer sketch_distributed(ft, n_bins, kCols, kRows, device);
+    HostDeviceVector<float> storage({}, device);
     std::string interface_str = RandomDataGenerator{kRows, kCols, 0}
-                                    .Device(0)
+                                    .Device(device)
                                     .Seed(rank + seed)
                                     .GenerateArrayInterface(&storage);
     data::CupyAdapter adapter(interface_str);
@@ -451,7 +445,7 @@ void TestSameOnAllWorkers(int32_t n_gpus) {
                         &sketch_distributed);
     sketch_distributed.AllReduce();
     sketch_distributed.Unique();
-    TestQuantileElemRank(0, sketch_distributed.Data(), sketch_distributed.ColumnsPtr(), true);
+    TestQuantileElemRank(device, sketch_distributed.Data(), sketch_distributed.ColumnsPtr(), true);

     // Test for all workers having the same sketch.
     size_t n_data = sketch_distributed.Data().size();
@@ -468,7 +462,7 @@ void TestSameOnAllWorkers(int32_t n_gpus) {
     thrust::copy(thrust::device, local_data.data(),
                  local_data.data() + local_data.size(),
                  all_workers.begin() + local_data.size() * rank);
-    collective::DeviceCommunicator* communicator = collective::Communicator::GetDevice(0);
+    collective::DeviceCommunicator* communicator = collective::Communicator::GetDevice(device);

     communicator->AllReduceSum(all_workers.data().get(), all_workers.size());
     communicator->Synchronize();
@@ -492,8 +486,11 @@ void TestSameOnAllWorkers(int32_t n_gpus) {
 }
 }  // anonymous namespace

-TEST(GPUQuantile, SameOnAllWorkers) {
+TEST(GPUQuantile, MGPUSameOnAllWorkers) {
   auto const n_gpus = AllVisibleGPUs();
+  if (n_gpus <= 1) {
+    GTEST_SKIP() << "Skipping MGPUSameOnAllWorkers test with # GPUs = " << n_gpus;
+  }
   RunWithInMemoryCommunicator(n_gpus, TestSameOnAllWorkers, n_gpus);
 }
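Two conventions recur through the hunks above. First, every hard-coded device ordinal 0 becomes the worker's device (device = collective::GetRank(), or device = rank in TestSameOnAllWorkers), so the in-memory workers spread across the visible GPUs rather than all allocating on GPU 0. Second, as the comment in TestAllReduceBasic says, the distributed path relies on seeding data generation with rank + seed so that each worker regenerates exactly the shard the single-node path built for it. A self-contained sketch of that second point, with std::mt19937 standing in for xgboost's RandomDataGenerator:

#include <cassert>
#include <cstdint>
#include <random>
#include <vector>

// Shard r depends only on (rank + seed), so the single-node path (which loops
// over every rank) and a distributed worker (which knows only its own rank)
// generate identical data for the same rank.
std::vector<double> MakeShard(std::int32_t rank, std::int32_t seed, std::size_t n) {
  std::mt19937 rng(static_cast<std::uint32_t>(rank + seed));
  std::uniform_real_distribution<double> dist(0.0, 1.0);
  std::vector<double> shard(n);
  for (auto& v : shard) {
    v = dist(rng);
  }
  return shard;
}

int main() {
  std::int32_t const seed = 3;
  std::int32_t const world = 4;  // pretend there are four workers
  std::size_t const shard_size = 16;

  // Single-node path: one process recreates every worker's shard locally.
  std::vector<std::vector<double>> single_node;
  for (std::int32_t rank = 0; rank < world; ++rank) {
    single_node.push_back(MakeShard(rank, seed, shard_size));
  }

  // Distributed path: worker `rank` generates only its own shard, yet it is
  // identical to the copy the single-node path produced for that rank.
  for (std::int32_t rank = 0; rank < world; ++rank) {
    assert(MakeShard(rank, seed, shard_size) == single_node[rank]);
  }
  return 0;
}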

