Skip to content

Commit

Permalink
Dynamically loading libnvidia-ml.so.1 instead of directly linking (#…
Browse files Browse the repository at this point in the history
…313)

Currently, we install the driver into one of the CI images to allow for stub generation during compilation. This was needed because `device_info.cpp` linked directly against `CUDA::nvml`, which created a link-time dependency on a driver library. That dependency is problematic when building with CPU-only Docker images.

Instead, we dynamically load `libnvidia-ml.so.1` at runtime (using the versioned `.so.1` name to avoid collisions with the stub file `libnvidia-ml.so`) and load the necessary functions from it. If the library is not found, GPU usage is disabled. This allows the library to be loaded for stub generation without needing a GPU.

Authors:
  - Michael Demoret (https://github.com/mdemoret-nv)

Approvers:
  - David Gardner (https://github.com/dagardner-nv)
  - Devin Robison (https://github.com/drobison00)

URL: #313
  • Loading branch information
mdemoret-nv authored Apr 13, 2023
1 parent c3f67c0 commit 25d9ca8
Show file tree
Hide file tree
Showing 9 changed files with 251 additions and 93 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/ci_pipe.yml
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ jobs:
test:
name: Test
needs: [build]
runs-on: [self-hosted, linux, amd64, gpu-v100-525-1]
runs-on: [self-hosted, linux, amd64, gpu-v100-latest-1]
timeout-minutes: 60
container:
credentials:
Expand Down Expand Up @@ -171,7 +171,7 @@ jobs:

codecov:
name: Code Coverage
runs-on: [self-hosted, linux, amd64, gpu-v100-525-1]
runs-on: [self-hosted, linux, amd64, gpu-v100-latest-1]
timeout-minutes: 60
container:
credentials:
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/pull_request.yml
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,8 @@ jobs:
with:
run_check: ${{ startsWith(github.ref_name, 'pull-request/') }}
run_package_conda: ${{ !startsWith(github.ref_name, 'pull-request/') }}
container: nvcr.io/ea-nvidia-morpheus/morpheus:mrc-ci-driver-230410
test_container: nvcr.io/ea-nvidia-morpheus/morpheus:mrc-ci-test-230410
container: nvcr.io/ea-nvidia-morpheus/morpheus:mrc-ci-build-230412
test_container: nvcr.io/ea-nvidia-morpheus/morpheus:mrc-ci-test-230412
secrets:
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
CONDA_TOKEN: ${{ secrets.CONDA_TOKEN }}
Expand Down
13 changes: 4 additions & 9 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -55,16 +55,11 @@ ENV CMAKE_CUDA_COMPILER_LAUNCHER=
ENV CMAKE_CXX_COMPILER_LAUNCHER=
ENV CMAKE_C_COMPILER_LAUNCHER=

# ============ driver ==================
FROM base as driver
# ============ build ==================
FROM base as build

RUN --mount=type=cache,target=/var/cache/apt \
apt update && \
DEBIAN_FRONTEND=noninteractive TZ=Etc/UTC \
apt install --no-install-recommends -y \
libnvidia-compute-525 \
&& \
rm -rf /var/lib/apt/lists/*
# Add any build-only dependencies here. For now there are none, but we need the
# target to get the CI runner build scripts to work.

# ============ test ==================
FROM base as test
Expand Down
8 changes: 1 addition & 7 deletions cpp/mrc/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -166,7 +166,6 @@ target_link_libraries(libmrc
mrc_protos
mrc_architect_protos
rmm::rmm
CUDA::nvml
CUDA::cudart
rxcpp::rxcpp
glog::glog
Expand All @@ -180,8 +179,8 @@ target_link_libraries(libmrc
PRIVATE
hwloc::hwloc
prometheus-cpp::core # private in MR !199
ucx::ucs
ucx::ucp
ucx::ucs
)

target_include_directories(libmrc
Expand All @@ -206,10 +205,6 @@ target_compile_features(libmrc PUBLIC cxx_std_20)

set_target_properties(libmrc PROPERTIES OUTPUT_NAME ${PROJECT_NAME})

# Finally, set the install RPATH to include the stubs folder for CUDA::nvml. If that's made private, this can be removed
set_target_properties(libmrc PROPERTIES INSTALL_RPATH
"${CMAKE_INSTALL_PREFIX}/lib:\$ORIGIN:${CMAKE_INSTALL_PREFIX}/lib/stubs")

# ##################################################################################################
# - install targets --------------------------------------------------------------------------------

Expand Down Expand Up @@ -277,4 +272,3 @@ rapids_export(BUILD ${PROJECT_NAME}
DOCUMENTATION doc_string
FINAL_CODE_BLOCK code_string
)

Loading

0 comments on commit 25d9ca8

Please sign in to comment.