From ddd9c19f41d91be5e0cd223c5214b51803590365 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Thu, 27 Jun 2024 07:07:23 -0700 Subject: [PATCH 1/2] remove openmpi ceiling (#4496) fixes #4474 #4496 and related PRs introduced a ceiling on `openmpi`, a dependency that's only pulled in at test time, because `cugraph`'s builds were struggling to find it. This proposes removing that pin, as the fixes in https://github.com/conda-forge/openmpi-feedstock/pull/159 should allow the package to again be found by e.g. `find_package(MPI)` in CMake scripts. Authors: - James Lamb (https://github.com/jameslamb) Approvers: - Chuck Hastings (https://github.com/ChuckHastings) - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/cugraph/pull/4496 --- conda/environments/all_cuda-118_arch-x86_64.yaml | 3 ++- conda/environments/all_cuda-122_arch-x86_64.yaml | 3 ++- dependencies.yaml | 5 ++++- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 4a235eac7c4..40aaef5b6ed 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -42,7 +42,7 @@ dependencies: - numpy>=1.23,<2.0a0 - numpydoc - nvcc_linux-64=11.8 -- openmpi<5.0.3 +- openmpi - packaging>=21 - pandas - pre-commit @@ -70,6 +70,7 @@ dependencies: - sphinx-markdown-tables - sphinx<6 - sphinxcontrib-websupport +- thriftpy2<=0.5.0 - ucx-proc=*=gpu - ucx-py==0.39.* - wget diff --git a/conda/environments/all_cuda-122_arch-x86_64.yaml b/conda/environments/all_cuda-122_arch-x86_64.yaml index 8275634e55b..1c42ad39fb1 100644 --- a/conda/environments/all_cuda-122_arch-x86_64.yaml +++ b/conda/environments/all_cuda-122_arch-x86_64.yaml @@ -47,7 +47,7 @@ dependencies: - numba>=0.57 - numpy>=1.23,<2.0a0 - numpydoc -- openmpi<5.0.3 +- openmpi - packaging>=21 - pandas - pre-commit @@ -75,6 +75,7 @@ dependencies: - sphinx-markdown-tables - sphinx<6 - 
sphinxcontrib-websupport +- thriftpy2<=0.5.0 - ucx-proc=*=gpu - ucx-py==0.39.* - wget diff --git a/dependencies.yaml b/dependencies.yaml index 91593bf9168..c37d2080771 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -366,7 +366,7 @@ dependencies: - libraft-headers==24.8.* - libraft==24.8.* - librmm==24.8.* - - openmpi<5.0.3 # Required for building cpp-mgtests (multi-GPU tests) + - openmpi # Required for building cpp-mgtests (multi-GPU tests) specific: - output_types: [conda] matrices: @@ -545,6 +545,9 @@ dependencies: - output_types: [conda] packages: - pylibwholegraph==24.8.* + # this thriftpy2 entry can be removed entirely (or switched to a '!=') + # once a new release of that project resolves https://github.com/Thriftpy/thriftpy2/issues/281 + - thriftpy2<=0.5.0 test_python_pylibcugraph: common: - output_types: [conda, pyproject] From ece789dd27a4e745ff41242206248fd0b6072e31 Mon Sep 17 00:00:00 2001 From: Chuck Hastings <45364586+ChuckHastings@users.noreply.github.com> Date: Thu, 27 Jun 2024 16:47:23 -0400 Subject: [PATCH 2/2] Tweak rmm configuration for C++ unit tests (#4503) We are seeing intermittent failures in CI from having trouble allocating the RMM pool allocator. Dropping the memory usage by default from 1/6 to 1/10. Added an option `maxpool` that will use 1/2 of the available memory, since we use the unit tests in larger configurations to do scale testing of algorithms. 
Authors: - Chuck Hastings (https://github.com/ChuckHastings) Approvers: - Seunghwa Kang (https://github.com/seunghwak) - James Lamb (https://github.com/jameslamb) URL: https://github.com/rapidsai/cugraph/pull/4503 --- cpp/tests/utilities/base_fixture.hpp | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/cpp/tests/utilities/base_fixture.hpp b/cpp/tests/utilities/base_fixture.hpp index cb302674a25..25011c0c97a 100644 --- a/cpp/tests/utilities/base_fixture.hpp +++ b/cpp/tests/utilities/base_fixture.hpp @@ -68,14 +68,18 @@ inline auto make_cuda() { return std::make_shared inline auto make_managed() { return std::make_shared(); } -inline auto make_pool() +// use_max set to true will use half of available GPU memory for RMM, +// otherwise we'll use 1/10. +inline auto make_pool(bool use_max = false) { - // Reduce the default pool allocation to 1/6th of the GPU memory so that we can + // Reduce the default pool allocation to 1/10 of GPU memory so that we can // run more than 2 tests in parallel at the same time. Changes to this value could // effect the maximum amount of parallel tests, and therefore `tests/CMakeLists.txt` // `_CUGRAPH_TEST_PERCENT` default value will need to be audited. auto const [free, total] = rmm::available_device_memory(); - auto const min_alloc = rmm::align_down(std::min(free, total / 6), rmm::CUDA_ALLOCATION_ALIGNMENT); + auto const min_alloc = + use_max ? rmm::align_down(std::min(free, total / 2), rmm::CUDA_ALLOCATION_ALIGNMENT) + : rmm::align_down(std::min(free, total / 10), rmm::CUDA_ALLOCATION_ALIGNMENT); return rmm::mr::make_owning_wrapper(make_cuda(), min_alloc); } @@ -99,7 +103,8 @@ inline auto make_binning() * @throw cugraph::logic_error if the `allocation_mode` is unsupported. * * @param allocation_mode String identifies which resource type. - * Accepted types are "pool", "cuda", and "managed" only. + * Accepted types are "binning", "cuda", "pool", "maxpool" + * and "managed" only. 
* @return Memory resource instance */ inline std::shared_ptr create_memory_resource( @@ -108,6 +113,7 @@ inline std::shared_ptr create_memory_resource( if (allocation_mode == "binning") return make_binning(); if (allocation_mode == "cuda") return make_cuda(); if (allocation_mode == "pool") return make_pool(); + if (allocation_mode == "maxpool") return make_pool(true); if (allocation_mode == "managed") return make_managed(); CUGRAPH_FAIL("Invalid RMM allocation mode"); }