From ed91e775ec20cf8fd984963a13b2614ad7ba8bb9 Mon Sep 17 00:00:00 2001 From: Rong Ou Date: Sun, 12 Feb 2023 17:00:26 -0800 Subject: [PATCH] Fix quantile tests running on multi-gpus (#8775) * Fix quantile tests running on multi-gpus * Run some gtests with multiple GPUs * fix mgpu test naming * Instruct NCCL to print extra logs * Allocate extra space in /dev/shm to enable NCCL * use gtest_skip to skip mgpu tests --------- Co-authored-by: Hyunsu Philip Cho --- tests/buildkite/pipeline-mgpu.yml | 5 +++ tests/buildkite/test-cpp-mgpu.sh | 16 ++++++++ tests/cpp/common/test_quantile.cu | 61 +++++++++++++++---------------- 3 files changed, 50 insertions(+), 32 deletions(-) create mode 100755 tests/buildkite/test-cpp-mgpu.sh diff --git a/tests/buildkite/pipeline-mgpu.yml b/tests/buildkite/pipeline-mgpu.yml index 75d7855b6dc9..316e78ad8033 100644 --- a/tests/buildkite/pipeline-mgpu.yml +++ b/tests/buildkite/pipeline-mgpu.yml @@ -36,6 +36,11 @@ steps: queue: linux-amd64-mgpu - wait #### -------- TEST -------- + - label: ":console: Run Google Tests" + command: "tests/buildkite/test-cpp-mgpu.sh" + key: test-cpp-mgpu + agents: + queue: linux-amd64-mgpu - label: ":console: Test Python package, 4 GPUs" command: "tests/buildkite/test-python-gpu.sh mgpu" key: test-python-mgpu diff --git a/tests/buildkite/test-cpp-mgpu.sh b/tests/buildkite/test-cpp-mgpu.sh new file mode 100755 index 000000000000..935a301a66f1 --- /dev/null +++ b/tests/buildkite/test-cpp-mgpu.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +set -euo pipefail + +source tests/buildkite/conftest.sh + +# Allocate extra space in /dev/shm to enable NCCL +export CI_DOCKER_EXTRA_PARAMS_INIT='--shm-size=4g' + +echo "--- Run Google Tests with CUDA, using multiple GPUs" +buildkite-agent artifact download "build/testxgboost" . --step build-cuda +chmod +x build/testxgboost +tests/ci_build/ci_build.sh gpu nvidia-docker \ + --build-arg CUDA_VERSION_ARG=$CUDA_VERSION \ + --build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION \ + build/testxgboost --gtest_filter=*MGPU* diff --git a/tests/cpp/common/test_quantile.cu b/tests/cpp/common/test_quantile.cu index d3f7dbed0f4b..cb24f8bb4140 100644 --- a/tests/cpp/common/test_quantile.cu +++ b/tests/cpp/common/test_quantile.cu @@ -341,30 +341,26 @@ TEST(GPUQuantile, MultiMerge) { namespace { void TestAllReduceBasic(int32_t n_gpus) { auto const world = collective::GetWorldSize(); - if (world != 1) { - ASSERT_EQ(world, n_gpus); - } else { - return; - } - constexpr size_t kRows = 1000, kCols = 100; RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins, MetaInfo const& info) { + auto const device = collective::GetRank(); + // Set up single node version; - HostDeviceVector ft; - SketchContainer sketch_on_single_node(ft, n_bins, kCols, kRows, 0); + HostDeviceVector ft({}, device); + SketchContainer sketch_on_single_node(ft, n_bins, kCols, kRows, device); size_t intermediate_num_cuts = std::min( kRows * world, static_cast(n_bins * WQSketch::kFactor)); std::vector containers; for (auto rank = 0; rank < world; ++rank) { - HostDeviceVector storage; + HostDeviceVector storage({}, device); std::string interface_str = RandomDataGenerator{kRows, kCols, 0} - .Device(0) + .Device(device) .Seed(rank + seed) .GenerateArrayInterface(&storage); data::CupyAdapter adapter(interface_str); - HostDeviceVector ft; - containers.emplace_back(ft, n_bins, kCols, kRows, 0); + HostDeviceVector ft({}, device); + containers.emplace_back(ft, n_bins, kCols, kRows, device); AdapterDeviceSketch(adapter.Value(), n_bins, info, std::numeric_limits::quiet_NaN(), &containers.back()); @@ -375,16 +371,16 @@ void TestAllReduceBasic(int32_t n_gpus) { sketch_on_single_node.FixError(); } sketch_on_single_node.Unique(); - TestQuantileElemRank(0, sketch_on_single_node.Data(), + TestQuantileElemRank(device, sketch_on_single_node.Data(), sketch_on_single_node.ColumnsPtr(), true); // Set up distributed version. We rely on using rank as seed to generate // the exact same copy of data. auto rank = collective::GetRank(); - SketchContainer sketch_distributed(ft, n_bins, kCols, kRows, 0); - HostDeviceVector storage; + SketchContainer sketch_distributed(ft, n_bins, kCols, kRows, device); + HostDeviceVector storage({}, device); std::string interface_str = RandomDataGenerator{kRows, kCols, 0} - .Device(0) + .Device(device) .Seed(rank + seed) .GenerateArrayInterface(&storage); data::CupyAdapter adapter(interface_str); @@ -399,7 +395,7 @@ void TestAllReduceBasic(int32_t n_gpus) { ASSERT_EQ(sketch_distributed.Data().size(), sketch_on_single_node.Data().size()); - TestQuantileElemRank(0, sketch_distributed.Data(), + TestQuantileElemRank(device, sketch_distributed.Data(), sketch_distributed.ColumnsPtr(), true); std::vector single_node_data( @@ -420,29 +416,27 @@ void TestAllReduceBasic(int32_t n_gpus) { } } // anonymous namespace -TEST(GPUQuantile, AllReduceBasic) { +TEST(GPUQuantile, MGPUAllReduceBasic) { auto const n_gpus = AllVisibleGPUs(); + if (n_gpus <= 1) { + GTEST_SKIP() << "Skipping MGPUAllReduceBasic test with # GPUs = " << n_gpus; + } RunWithInMemoryCommunicator(n_gpus, TestAllReduceBasic, n_gpus); } namespace { void TestSameOnAllWorkers(int32_t n_gpus) { auto world = collective::GetWorldSize(); - if (world != 1) { - ASSERT_EQ(world, n_gpus); - } else { - return; - } - constexpr size_t kRows = 1000, kCols = 100; RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins, MetaInfo const &info) { - auto rank = collective::GetRank(); - HostDeviceVector ft; - SketchContainer sketch_distributed(ft, n_bins, kCols, kRows, 0); - HostDeviceVector storage; + auto const rank = collective::GetRank(); + auto const device = rank; + HostDeviceVector ft({}, device); + SketchContainer sketch_distributed(ft, n_bins, kCols, kRows, device); + HostDeviceVector storage({}, device); std::string interface_str = RandomDataGenerator{kRows, kCols, 0} - .Device(0) + .Device(device) .Seed(rank + seed) .GenerateArrayInterface(&storage); data::CupyAdapter adapter(interface_str); @@ -451,7 +445,7 @@ void TestSameOnAllWorkers(int32_t n_gpus) { &sketch_distributed); sketch_distributed.AllReduce(); sketch_distributed.Unique(); - TestQuantileElemRank(0, sketch_distributed.Data(), sketch_distributed.ColumnsPtr(), true); + TestQuantileElemRank(device, sketch_distributed.Data(), sketch_distributed.ColumnsPtr(), true); // Test for all workers having the same sketch. size_t n_data = sketch_distributed.Data().size(); @@ -468,7 +462,7 @@ void TestSameOnAllWorkers(int32_t n_gpus) { thrust::copy(thrust::device, local_data.data(), local_data.data() + local_data.size(), all_workers.begin() + local_data.size() * rank); - collective::DeviceCommunicator* communicator = collective::Communicator::GetDevice(0); + collective::DeviceCommunicator* communicator = collective::Communicator::GetDevice(device); communicator->AllReduceSum(all_workers.data().get(), all_workers.size()); communicator->Synchronize(); @@ -492,8 +486,11 @@ void TestSameOnAllWorkers(int32_t n_gpus) { } } // anonymous namespace -TEST(GPUQuantile, SameOnAllWorkers) { +TEST(GPUQuantile, MGPUSameOnAllWorkers) { auto const n_gpus = AllVisibleGPUs(); + if (n_gpus <= 1) { + GTEST_SKIP() << "Skipping MGPUSameOnAllWorkers test with # GPUs = " << n_gpus; + } RunWithInMemoryCommunicator(n_gpus, TestSameOnAllWorkers, n_gpus); }