From e19d9e3b650052ebe9663d79061b84f0f9da716e Mon Sep 17 00:00:00 2001 From: Kevin Pedro Date: Fri, 17 Sep 2021 13:12:04 -0500 Subject: [PATCH 1/8] gpu singularity-in-singularity workaround --- HeterogeneousCore/SonicTriton/scripts/cmsTriton | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/HeterogeneousCore/SonicTriton/scripts/cmsTriton b/HeterogeneousCore/SonicTriton/scripts/cmsTriton index 93e132340eb56..7a578f873327e 100755 --- a/HeterogeneousCore/SonicTriton/scripts/cmsTriton +++ b/HeterogeneousCore/SonicTriton/scripts/cmsTriton @@ -194,6 +194,7 @@ start_singularity(){ # triton server image may need to modify contents of opt/tritonserver/lib/ # but cvmfs is read-only # -> make a writable local directory with the same contents + # this is no longer needed as of triton 2.11.0, but kept for compatibility w/ older server versions $DRYRUN mkdir ${LIB} $DRYRUN ln -s ${SANDBOX}/opt/tritonserver/lib/* ${LIB}/ @@ -205,6 +206,17 @@ start_singularity(){ REPOARGS="$REPOARGS --model-repository=${REPO}" done + # workaround for nvidia libs w/ singularity-in-singularity + # from https://github.com/hpcng/singularity/issues/5759#issuecomment-919523970 + if [ -d /.singularity.d/libs ]; then + TMPD=`mktemp -d` + (echo '#!/bin/bash'; echo 'exec /usr/sbin/ldconfig -C '"$TMPD"'/ld.so.cache "$@"') > $TMPD/ldconfig + chmod +x $TMPD/ldconfig + PATH=$TMPD:$PATH + # this does not work with LD_LIBRARY_PATH from cmsenv + ldconfig /.singularity.d/libs + fi + # start instance # need to bind /cvmfs for above symlinks to work inside container $DRYRUN singularity instance start \ From dcf080bd60b3f06dbf87095b16d2beb642a77cbf Mon Sep 17 00:00:00 2001 From: Kevin Pedro Date: Fri, 17 Sep 2021 18:04:58 -0500 Subject: [PATCH 2/8] check nvidia driver version --- HeterogeneousCore/SonicTriton/scripts/cmsTriton | 7 +++++-- HeterogeneousCore/SonicTriton/test/unittest.sh | 13 +++++++++++++ 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/HeterogeneousCore/SonicTriton/scripts/cmsTriton b/HeterogeneousCore/SonicTriton/scripts/cmsTriton index 7a578f873327e..0959f77085539 100755 --- a/HeterogeneousCore/SonicTriton/scripts/cmsTriton +++ b/HeterogeneousCore/SonicTriton/scripts/cmsTriton @@ -55,6 +55,7 @@ usage() { $ECHO "Operations:" $ECHO "start \t start server" $ECHO "stop \t stop server" + $ECHO "info \t print server info" exit $1 } @@ -106,7 +107,7 @@ done shift $(($OPTIND - 1)) OP=$1 -if [ "$OP" != start ] && [ "$OP" != stop ]; then +if [ "$OP" != start ] && [ "$OP" != stop ] && [ "$OP" != info ]; then usage 1 fi @@ -389,7 +390,9 @@ else PROG_NAME=Singularity fi -if [ "$OP" == start ]; then +if [ "$OP" == info ]; then + echo $SANDBOX +elif [ "$OP" == start ]; then # handle cleaning up if [ -n "$FORCE" ]; then auto_stop diff --git a/HeterogeneousCore/SonicTriton/test/unittest.sh b/HeterogeneousCore/SonicTriton/test/unittest.sh index 7ae19df174da1..604ee4db4ebea 100755 --- a/HeterogeneousCore/SonicTriton/test/unittest.sh +++ b/HeterogeneousCore/SonicTriton/test/unittest.sh @@ -5,6 +5,7 @@ DEVICE=$2 # the test is not possible if: # 1. GPU not available (only if GPU test requested) / avx instructions not supported (needed for singularity on CPU) +# 1b. Nvidia driver version too low # 2. singularity not found or not usable # 3. inside singularity container w/o unprivileged user namespace enabled (needed for singularity-in-singularity) # so just return true in those cases @@ -16,6 +17,18 @@ if [ "$DEVICE" = "GPU" ]; then echo "missing GPU" exit 0 fi + + SANDBOX=$(cmsTriton info) + # get sandbox env vars in subshell + CUDA_DRIVER_VERSION=$(source ${SANDBOX}/.singularity.d/env/10-docker2singularity.sh && echo $CUDA_DRIVER_VERSION) + # copied from https://github.com/triton-inference-server/server/blob/v2.11.0/nvidia_entrypoint.sh + DRIVER_VERSION=$(sed -n 's/^NVRM.*Kernel Module *\([0-9.]*\).*$/\1/p' /proc/driver/nvidia/version 2>/dev/null || true) + if [[ "${DRIVER_VERSION%%.*}" -ge "${CUDA_DRIVER_VERSION%%.*}" ]]; then + echo "has NVIDIA driver" + else + echo "missing (or too old) NVIDIA driver" + exit 0 + fi else if grep -q avx /proc/cpuinfo; then echo "has avx" From 2f06e8035e60cc583e74bccd0fffc2aa7b7ab0cb Mon Sep 17 00:00:00 2001 From: Kevin Pedro Date: Thu, 16 Sep 2021 17:42:13 -0500 Subject: [PATCH 3/8] disable tests for non-amd64 archs --- HeterogeneousCore/SonicTriton/test/unittest.sh | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/HeterogeneousCore/SonicTriton/test/unittest.sh b/HeterogeneousCore/SonicTriton/test/unittest.sh index 604ee4db4ebea..51d6ac089c64b 100755 --- a/HeterogeneousCore/SonicTriton/test/unittest.sh +++ b/HeterogeneousCore/SonicTriton/test/unittest.sh @@ -6,8 +6,9 @@ DEVICE=$2 # the test is not possible if: # 1. GPU not available (only if GPU test requested) / avx instructions not supported (needed for singularity on CPU) # 1b. Nvidia driver version too low -# 2. singularity not found or not usable -# 3. inside singularity container w/o unprivileged user namespace enabled (needed for singularity-in-singularity) +# 2. wrong architecture (not amd64) +# 3. singularity not found or not usable +# 4. inside singularity container w/o unprivileged user namespace enabled (needed for singularity-in-singularity) # so just return true in those cases if [ "$DEVICE" = "GPU" ]; then @@ -38,6 +39,14 @@ else fi fi +THIS_ARCH=$(echo $SCRAM_ARCH | cut -d'_' -f2) +if [ "$THIS_ARCH" == "amd64" ]; then + echo "has amd64" +else + echo "missing amd64" + exit 0 +fi + if type singularity >& /dev/null; then echo "has singularity" else From 0fe705f43ca7ea8cc6faed2c57e02466dab4645a Mon Sep 17 00:00:00 2001 From: Kevin Pedro Date: Mon, 20 Sep 2021 11:07:59 -0500 Subject: [PATCH 4/8] try to find compatibility drivers --- .../SonicTriton/scripts/cmsTriton | 75 +++++++++++++++++-- .../SonicTriton/test/unittest.sh | 11 +-- 2 files changed, 71 insertions(+), 15 deletions(-) diff --git a/HeterogeneousCore/SonicTriton/scripts/cmsTriton b/HeterogeneousCore/SonicTriton/scripts/cmsTriton index 0959f77085539..3e9b3c696bb32 100755 --- a/HeterogeneousCore/SonicTriton/scripts/cmsTriton +++ b/HeterogeneousCore/SonicTriton/scripts/cmsTriton @@ -20,6 +20,7 @@ AUTOPORT="" NPORTS=3 IMAGE=fastml/triton-torchgeo:21.06-py3-geometric SANDBOX="" +COMPAT_USR="" get_sandbox(){ if [ -z "$SANDBOX" ]; then @@ -35,6 +36,7 @@ usage() { $ECHO $ECHO "Options:" $ECHO "-c \t don't cleanup temporary dir (for debugging)" + $ECHO "-C [dir] \t directory containing Nvidia compatibility drivers (checks CMSSW_BASE by default if available)" $ECHO "-D \t dry run: print container commands rather than executing them" $ECHO "-d \t use Docker instead of Singularity" $ECHO "-f \t force reuse of (possibly) existing container instance" @@ -55,7 +57,7 @@ usage() { $ECHO "Operations:" $ECHO "start \t start server" $ECHO "stop \t stop server" - $ECHO "info \t print server info" + $ECHO "check \t check if server can run on this system" exit $1 } @@ -65,10 +67,12 @@ if [ -e /run/shm ]; then SHM=/run/shm fi -while getopts "cDdfgi:M:m:n:P:p:r:s:t:vw:h" opt; do +while getopts "cC:Ddfgi:M:m:n:P:p:r:s:t:vw:h" opt; do case "$opt" in c) CLEANUP="" ;; + C) COMPAT_USR="$OPTARG" + ;; D) DRYRUN=echo ;; d) USEDOCKER=true @@ -107,7 +111,7 @@ done shift $(($OPTIND - 1)) OP=$1 -if [ "$OP" != start ] && [ "$OP" != stop ] && [ "$OP" != info ]; then +if [ "$OP" != start ] && [ "$OP" != stop ] && [ "$OP" != check ]; then usage 1 fi @@ -185,9 +189,17 @@ start_docker(){ REPOARGS="$REPOARGS --model-repository=${REPO}" done + # compatibility driver environment + ENVARGS="" + if [ -n "$COMPAT" ]; then + ENVARGS="--env _CUDA_COMPAT_PATH=$COMPAT" + # make sure drivers are available inside container + MOUNTARGS="$MOUNTARGS -v$COMPAT:$COMPAT" + fi + $DRYRUN $DOCKER run -d --name ${SERVER} \ --shm-size=1g --ulimit memlock=-1 --ulimit stack=67108864 --ipc="host" --cap-add=IPC_OWNER \ - -p${HTTPPORT}:${HTTPPORT} -p${GRPCPORT}:${GRPCPORT} -p${METRPORT}:${METRPORT} $EXTRA $MOUNTARGS \ + -p${HTTPPORT}:${HTTPPORT} -p${GRPCPORT}:${GRPCPORT} -p${METRPORT}:${METRPORT} $EXTRA $ENVARGS $MOUNTARGS \ ${IMAGE} tritonserver $PORTARGS $REPOARGS $VERBOSE } @@ -207,6 +219,14 @@ start_singularity(){ REPOARGS="$REPOARGS --model-repository=${REPO}" done + # compatibility driver environment + ENVARGS="" + if [ -n "$COMPAT" ]; then + ENVARGS="--env _CUDA_COMPAT_PATH=$COMPAT" + # make sure drivers are available inside container + MOUNTARGS="$MOUNTARGS -B $COMPAT" + fi + # workaround for nvidia libs w/ singularity-in-singularity # from https://github.com/hpcng/singularity/issues/5759#issuecomment-919523970 if [ -d /.singularity.d/libs ]; then @@ -221,7 +241,7 @@ start_singularity(){ # start instance # need to bind /cvmfs for above symlinks to work inside container $DRYRUN singularity instance start \ - -B ${SHM}:/run/shm -B ${LIB}:/opt/tritonserver/lib -B ${SANDBOX} $MOUNTARGS $EXTRA \ + -B ${SHM}:/run/shm -B ${LIB}:/opt/tritonserver/lib -B ${SANDBOX} $ENVARGS $MOUNTARGS $EXTRA \ ${SANDBOX} ${SERVER} START_EXIT=$? @@ -372,6 +392,42 @@ make_tmp(){ $DRYRUN cd "$TMPDIR" } +check_drivers(){ + # get sandbox env vars in subshell + CUDA_DRIVER_VERSION=$(source ${SANDBOX}/.singularity.d/env/10-docker2singularity.sh && echo $CUDA_DRIVER_VERSION) + # copied from https://github.com/triton-inference-server/server/blob/v2.11.0/nvidia_entrypoint.sh + DRIVER_VERSION=$(sed -n 's/^NVRM.*Kernel Module *\([0-9.]*\).*$/\1/p' /proc/driver/nvidia/version 2>/dev/null || true) + if [[ "${DRIVER_VERSION%%.*}" -ge "${CUDA_DRIVER_VERSION%%.*}" ]]; then + return 0 + fi + + export COMPAT="" + + # 1. check for user-specified compatibility drivers + if [ -n "$COMPAT_USR" ] && [ -d "$COMPAT_USR" ]; then + COMPAT="$COMPAT_USR" + fi + + # 2. check for compatibility drivers from CMSSW if available + if [ -z "$COMPAT" ] && [ -n "$CMSSW_BASE" ]; then + # stubs are not useful here + COMPAT=$($CMSSW_BASE/config/SCRAM/hooks/runtime/00-nvidia-drivers | grep -v "/lib64/stubs") + fi + + # 3. finally, check expected system location + COMPAT_SYS=/usr/local/cuda/compat + if [ -z "$COMPAT" ] && [ -d "$COMPAT_SYS" ]; then + COMPAT="$COMPAT_SYS" + fi + + if [ -n "$COMPAT" ]; then + export COMPAT + return 0 + else + return 1 + fi +} + if [ -n "$USEDOCKER" ]; then if [ -n "$GPU" ]; then EXTRA="--gpus all" @@ -390,8 +446,9 @@ else PROG_NAME=Singularity fi -if [ "$OP" == info ]; then - echo $SANDBOX +if [ "$OP" == check ]; then + check_drivers + exit $? elif [ "$OP" == start ]; then # handle cleaning up if [ -n "$FORCE" ]; then @@ -401,6 +458,10 @@ elif [ "$OP" == start ]; then exit 1 fi + check_drivers + DRIVER_EXIT=$? + if [ "$DRIVER_EXIT" -ne 0 ]; then exit $DRIVER_EXIT; fi + handle_ports PORT_EXIT=$? if [ "$PORT_EXIT" -ne 0 ]; then exit $PORT_EXIT; fi diff --git a/HeterogeneousCore/SonicTriton/test/unittest.sh b/HeterogeneousCore/SonicTriton/test/unittest.sh index 51d6ac089c64b..38989cc598e42 100755 --- a/HeterogeneousCore/SonicTriton/test/unittest.sh +++ b/HeterogeneousCore/SonicTriton/test/unittest.sh @@ -5,7 +5,7 @@ DEVICE=$2 # the test is not possible if: # 1. GPU not available (only if GPU test requested) / avx instructions not supported (needed for singularity on CPU) -# 1b. Nvidia driver version too low +# 1b. Nvidia drivers not available # 2. wrong architecture (not amd64) # 3. singularity not found or not usable # 4. inside singularity container w/o unprivileged user namespace enabled (needed for singularity-in-singularity) @@ -19,15 +19,10 @@ if [ "$DEVICE" = "GPU" ]; then exit 0 fi - SANDBOX=$(cmsTriton info) - # get sandbox env vars in subshell - CUDA_DRIVER_VERSION=$(source ${SANDBOX}/.singularity.d/env/10-docker2singularity.sh && echo $CUDA_DRIVER_VERSION) - # copied from https://github.com/triton-inference-server/server/blob/v2.11.0/nvidia_entrypoint.sh - DRIVER_VERSION=$(sed -n 's/^NVRM.*Kernel Module *\([0-9.]*\).*$/\1/p' /proc/driver/nvidia/version 2>/dev/null || true) - if [[ "${DRIVER_VERSION%%.*}" -ge "${CUDA_DRIVER_VERSION%%.*}" ]]; then + if cmsTriton check; then echo "has NVIDIA driver" else - echo "missing (or too old) NVIDIA driver" + echo "missing current or compatible NVIDIA driver" exit 0 fi else From 03d9f28b9b22e25fc497f695903b4ac5484ae8e9 Mon Sep 17 00:00:00 2001 From: Kevin Pedro Date: Mon, 20 Sep 2021 11:32:37 -0500 Subject: [PATCH 5/8] simpler check --- .../SonicTriton/scripts/cmsTriton | 31 ++++++++++++------- 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/HeterogeneousCore/SonicTriton/scripts/cmsTriton b/HeterogeneousCore/SonicTriton/scripts/cmsTriton index 3e9b3c696bb32..a990472915587 100755 --- a/HeterogeneousCore/SonicTriton/scripts/cmsTriton +++ b/HeterogeneousCore/SonicTriton/scripts/cmsTriton @@ -190,16 +190,13 @@ start_docker(){ done # compatibility driver environment - ENVARGS="" if [ -n "$COMPAT" ]; then - ENVARGS="--env _CUDA_COMPAT_PATH=$COMPAT" - # make sure drivers are available inside container - MOUNTARGS="$MOUNTARGS -v$COMPAT:$COMPAT" + MOUNTARGS="$MOUNTARGS -v$COMPAT" fi $DRYRUN $DOCKER run -d --name ${SERVER} \ --shm-size=1g --ulimit memlock=-1 --ulimit stack=67108864 --ipc="host" --cap-add=IPC_OWNER \ - -p${HTTPPORT}:${HTTPPORT} -p${GRPCPORT}:${GRPCPORT} -p${METRPORT}:${METRPORT} $EXTRA $ENVARGS $MOUNTARGS \ + -p${HTTPPORT}:${HTTPPORT} -p${GRPCPORT}:${GRPCPORT} -p${METRPORT}:${METRPORT} $EXTRA $MOUNTARGS \ ${IMAGE} tritonserver $PORTARGS $REPOARGS $VERBOSE } @@ -220,10 +217,7 @@ start_singularity(){ done # compatibility driver environment - ENVARGS="" if [ -n "$COMPAT" ]; then - ENVARGS="--env _CUDA_COMPAT_PATH=$COMPAT" - # make sure drivers are available inside container MOUNTARGS="$MOUNTARGS -B $COMPAT" fi @@ -241,7 +235,7 @@ start_singularity(){ # start instance # need to bind /cvmfs for above symlinks to work inside container $DRYRUN singularity instance start \ - -B ${SHM}:/run/shm -B ${LIB}:/opt/tritonserver/lib -B ${SANDBOX} $ENVARGS $MOUNTARGS $EXTRA \ + -B ${SHM}:/run/shm -B ${LIB}:/opt/tritonserver/lib -B ${SANDBOX} $MOUNTARGS $EXTRA \ ${SANDBOX} ${SERVER} START_EXIT=$? @@ -392,6 +386,12 @@ make_tmp(){ $DRYRUN cd "$TMPDIR" } +scram_tag(){ + TOOL="$1" + TAG="$2" + scram tool tag $TOOL $TAG 2> /dev/null || true +} + check_drivers(){ # get sandbox env vars in subshell CUDA_DRIVER_VERSION=$(source ${SANDBOX}/.singularity.d/env/10-docker2singularity.sh && echo $CUDA_DRIVER_VERSION) @@ -409,9 +409,15 @@ check_drivers(){ fi # 2. check for compatibility drivers from CMSSW if available + # based on $CMSSW_BASE/config/SCRAM/hooks/runtime/00-nvidia-drivers + # but need to check drivers, not runtime if [ -z "$COMPAT" ] && [ -n "$CMSSW_BASE" ]; then - # stubs are not useful here - COMPAT=$($CMSSW_BASE/config/SCRAM/hooks/runtime/00-nvidia-drivers | grep -v "/lib64/stubs") + CUDA_BASE=$(scram_tag cuda CUDA_BASE) + COMPAT_CMSSW=${CUDA_BASE}/drivers + + if [ -n "$CUDA_BASE" ] && [ -d "$COMPAT_CMSSW" ]; then + COMPAT="$COMPAT_CMSSW" + fi fi # 3. finally, check expected system location @@ -421,7 +427,8 @@ check_drivers(){ fi if [ -n "$COMPAT" ]; then - export COMPAT + # in order to mount drivers where Triton expects them + export COMPAT="${COMPAT}:${COMPAT_SYS}/lib" return 0 else return 1 From 502c0213dd44bec88a7285af5bfbd84a0d48fd11 Mon Sep 17 00:00:00 2001 From: Kevin Pedro Date: Mon, 20 Sep 2021 17:25:14 -0500 Subject: [PATCH 6/8] workaround for bug in Triton/Nvidia compatibility script --- .../SonicTriton/scripts/cmsTriton | 47 ++++++++++++++++--- 1 file changed, 41 insertions(+), 6 deletions(-) diff --git a/HeterogeneousCore/SonicTriton/scripts/cmsTriton b/HeterogeneousCore/SonicTriton/scripts/cmsTriton index a990472915587..d4bad2dccf830 100755 --- a/HeterogeneousCore/SonicTriton/scripts/cmsTriton +++ b/HeterogeneousCore/SonicTriton/scripts/cmsTriton @@ -135,6 +135,7 @@ LIB=lib STARTED_INDICATOR="Started GRPCInferenceService" SEGFAULT_INDICATOR="Address already in use" EXTRA="" +COMPAT_SCRIPT=/etc/shinit_v2 compute_ports(){ # compute derived port numbers @@ -192,6 +193,9 @@ start_docker(){ # compatibility driver environment if [ -n "$COMPAT" ]; then MOUNTARGS="$MOUNTARGS -v$COMPAT" + if [ -n "$COMPAT_SCRIPT_MOUNT" ]; then + MOUNTARGS="$MOUNTARGS -v$COMPAT_SCRIPT_MOUNT" + fi fi $DRYRUN $DOCKER run -d --name ${SERVER} \ @@ -219,6 +223,9 @@ start_singularity(){ # compatibility driver environment if [ -n "$COMPAT" ]; then MOUNTARGS="$MOUNTARGS -B $COMPAT" + if [ -n "$COMPAT_SCRIPT_MOUNT" ]; then + MOUNTARGS="$MOUNTARGS -B $COMPAT_SCRIPT_MOUNT" + fi fi # workaround for nvidia libs w/ singularity-in-singularity @@ -392,9 +399,25 @@ scram_tag(){ scram tool tag $TOOL $TAG 2> /dev/null || true } +driver_docker(){ + docker run --rm --entrypoint env ${IMAGE} | grep "CUDA_DRIVER_VERSION=" +} + +driver_singularity(){ + source ${SANDBOX}/.singularity.d/env/10-docker2singularity.sh && echo $CUDA_DRIVER_VERSION +} + +compat_docker(){ + docker cp $(docker create --rm ${IMAGE}):${COMPAT_SCRIPT} . +} + +compat_singularity(){ + cp ${SANDBOX}/${COMPAT_SCRIPT} . +} + check_drivers(){ # get sandbox env vars in subshell - CUDA_DRIVER_VERSION=$(source ${SANDBOX}/.singularity.d/env/10-docker2singularity.sh && echo $CUDA_DRIVER_VERSION) + CUDA_DRIVER_VERSION=$($DRIVER_FN) # copied from https://github.com/triton-inference-server/server/blob/v2.11.0/nvidia_entrypoint.sh DRIVER_VERSION=$(sed -n 's/^NVRM.*Kernel Module *\([0-9.]*\).*$/\1/p' /proc/driver/nvidia/version 2>/dev/null || true) if [[ "${DRIVER_VERSION%%.*}" -ge "${CUDA_DRIVER_VERSION%%.*}" ]]; then @@ -428,7 +451,14 @@ check_drivers(){ if [ -n "$COMPAT" ]; then # in order to mount drivers where Triton expects them - export COMPAT="${COMPAT}:${COMPAT_SYS}/lib" + export COMPAT="${COMPAT}:${COMPAT_SYS}/lib.real" + # workaround for bug in Triton/Nvidia compatibility check + if [ "$PWD" == "$TMPDIR" ]; then + $DRYRUN $COMPAT_FN + COMPAT_SCRIPT_BASE=$(basename $COMPAT_SCRIPT) + $DRYRUN sed -i 's/LD_LIBRARY_PATH="${_CUDA_COMPAT_REALLIB}"/&; LD_PRELOAD=""/' $COMPAT_SCRIPT_BASE + COMPAT_SCRIPT_MOUNT="$COMPAT_SCRIPT_BASE:$COMPAT_SCRIPT" + fi return 0 else return 1 @@ -442,6 +472,8 @@ if [ -n "$USEDOCKER" ]; then START_FN=start_docker TEST_FN=test_docker STOP_FN=stop_docker + DRIVER_FN=driver_docker + COMPAT_FN=compat_docker PROG_NAME=Docker else if [ -n "$GPU" ]; then @@ -450,6 +482,8 @@ else START_FN=start_singularity TEST_FN=test_singularity STOP_FN=stop_singularity + DRIVER_FN=driver_singularity + COMPAT_FN=compat_singularity PROG_NAME=Singularity fi @@ -465,10 +499,6 @@ elif [ "$OP" == start ]; then exit 1 fi - check_drivers - DRIVER_EXIT=$? - if [ "$DRIVER_EXIT" -ne 0 ]; then exit $DRIVER_EXIT; fi - handle_ports PORT_EXIT=$? if [ "$PORT_EXIT" -ne 0 ]; then exit $PORT_EXIT; fi @@ -477,6 +507,11 @@ elif [ "$OP" == start ]; then make_tmp + # after make_tmp because this may create file in tmp dir + check_drivers + DRIVER_EXIT=$? + if [ "$DRIVER_EXIT" -ne 0 ]; then exit $DRIVER_EXIT; fi + # if parent PID is provided, automatically stop server when finished # do this before actually trying to start the server in case of ctrl+c if [ -n "$PARENTPID" ]; then From b05f1483cd3f618335d3037b26694b7f4b2ec69b Mon Sep 17 00:00:00 2001 From: Kevin Pedro Date: Tue, 21 Sep 2021 10:31:36 -0500 Subject: [PATCH 7/8] fix scram call --- HeterogeneousCore/SonicTriton/scripts/cmsTriton | 1 + 1 file changed, 1 insertion(+) diff --git a/HeterogeneousCore/SonicTriton/scripts/cmsTriton b/HeterogeneousCore/SonicTriton/scripts/cmsTriton index d4bad2dccf830..ac16eb3989fd8 100755 --- a/HeterogeneousCore/SonicTriton/scripts/cmsTriton +++ b/HeterogeneousCore/SonicTriton/scripts/cmsTriton @@ -394,6 +394,7 @@ make_tmp(){ } scram_tag(){ + cd $CMSSW_BASE TOOL="$1" TAG="$2" scram tool tag $TOOL $TAG 2> /dev/null || true From b96778b56e663db6724f22c00e3dfe397fd163e1 Mon Sep 17 00:00:00 2001 From: Kevin Pedro Date: Tue, 21 Sep 2021 11:00:26 -0500 Subject: [PATCH 8/8] more control over fallback server name for tests --- HeterogeneousCore/SonicTriton/interface/TritonService.h | 9 ++++++++- HeterogeneousCore/SonicTriton/src/TritonService.cc | 7 +------ HeterogeneousCore/SonicTriton/test/tritonTest_cfg.py | 3 +++ HeterogeneousCore/SonicTriton/test/unittest.sh | 7 +++++-- 4 files changed, 17 insertions(+), 9 deletions(-) diff --git a/HeterogeneousCore/SonicTriton/interface/TritonService.h b/HeterogeneousCore/SonicTriton/interface/TritonService.h index 255d1855f86c7..ea441088d2d4a 100644 --- a/HeterogeneousCore/SonicTriton/interface/TritonService.h +++ b/HeterogeneousCore/SonicTriton/interface/TritonService.h @@ -2,6 +2,7 @@ #define HeterogeneousCore_SonicTriton_TritonService #include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/Utilities/interface/GlobalIdentifier.h" #include #include @@ -38,7 +39,13 @@ class TritonService { instanceName(pset.getUntrackedParameter("instanceName")), tempDir(pset.getUntrackedParameter("tempDir")), imageName(pset.getUntrackedParameter("imageName")), - sandboxName(pset.getUntrackedParameter("sandboxName")) {} + sandboxName(pset.getUntrackedParameter("sandboxName")) { + //randomize instance name + if (instanceName.empty()) { + instanceName = + pset.getUntrackedParameter("instanceBaseName") + "_" + edm::createGlobalIdentifier(); + } + } bool enable; bool debug; diff --git a/HeterogeneousCore/SonicTriton/src/TritonService.cc b/HeterogeneousCore/SonicTriton/src/TritonService.cc index 82ee6fa2d9658..f6b8f1914dc3a 100644 --- a/HeterogeneousCore/SonicTriton/src/TritonService.cc +++ b/HeterogeneousCore/SonicTriton/src/TritonService.cc @@ -8,7 +8,6 @@ #include "FWCore/ServiceRegistry/interface/ActivityRegistry.h" #include "FWCore/ServiceRegistry/interface/ProcessContext.h" #include "FWCore/Utilities/interface/Exception.h" -#include "FWCore/Utilities/interface/GlobalIdentifier.h" #include "grpc_client.h" #include "grpc_service.pb.h" @@ -217,11 +216,6 @@ void TritonService::preBeginJob(edm::PathsAndConsumesOfModulesBase const&, edm:: if (verbose_) edm::LogInfo("TritonService") << msg; - //randomize instance name - if (fallbackOpts_.instanceName.empty()) { - fallbackOpts_.instanceName = "triton_server_instance_" + edm::createGlobalIdentifier(); - } - //assemble server start command std::string command("cmsTriton -P -1 -p " + pid_); if (fallbackOpts_.debug) @@ -308,6 +302,7 @@ void TritonService::fillDescriptions(edm::ConfigurationDescriptions& description fallbackDesc.addUntracked("useGPU", false); fallbackDesc.addUntracked("retries", -1); fallbackDesc.addUntracked("wait", -1); + fallbackDesc.addUntracked("instanceBaseName", "triton_server_instance"); fallbackDesc.addUntracked("instanceName", ""); fallbackDesc.addUntracked("tempDir", ""); fallbackDesc.addUntracked("imageName", ""); diff --git a/HeterogeneousCore/SonicTriton/test/tritonTest_cfg.py b/HeterogeneousCore/SonicTriton/test/tritonTest_cfg.py index d9ba2be799cc5..fafb6346eaaaf 100644 --- a/HeterogeneousCore/SonicTriton/test/tritonTest_cfg.py +++ b/HeterogeneousCore/SonicTriton/test/tritonTest_cfg.py @@ -29,6 +29,7 @@ options.register("mode","Async", VarParsing.multiplicity.singleton, VarParsing.varType.string, "mode for client (choices: {})".format(', '.join(allowed_modes))) options.register("verbose", False, VarParsing.multiplicity.singleton, VarParsing.varType.bool, "enable verbose output") options.register("brief", False, VarParsing.multiplicity.singleton, VarParsing.varType.bool, "briefer output for graph modules") +options.register("fallbackName", "", VarParsing.multiplicity.singleton, VarParsing.varType.string, "name for fallback server") options.register("unittest", False, VarParsing.multiplicity.singleton, VarParsing.varType.bool, "unit test mode: reduce input sizes") options.register("testother", False, VarParsing.multiplicity.singleton, VarParsing.varType.bool, "also test gRPC communication if shared memory enabled, or vice versa") options.register("shm", True, VarParsing.multiplicity.singleton, VarParsing.varType.bool, "enable shared memory") @@ -83,6 +84,8 @@ process.TritonService.verbose = options.verbose process.TritonService.fallback.verbose = options.verbose process.TritonService.fallback.useDocker = options.docker +if len(options.fallbackName)>0: + process.TritonService.fallback.instanceBaseName = options.fallbackName if options.device != "auto": process.TritonService.fallback.useGPU = options.device=="gpu" if len(options.address)>0: diff --git a/HeterogeneousCore/SonicTriton/test/unittest.sh b/HeterogeneousCore/SonicTriton/test/unittest.sh index 38989cc598e42..410a34991e4f1 100755 --- a/HeterogeneousCore/SonicTriton/test/unittest.sh +++ b/HeterogeneousCore/SonicTriton/test/unittest.sh @@ -58,14 +58,16 @@ if [ -n "$SINGULARITY_CONTAINER" ]; then fi fi +fallbackName=triton_server_instance_${DEVICE} tmpFile=$(mktemp -p ${LOCALTOP} SonicTritonTestXXXXXXXX.log) -cmsRun ${LOCALTOP}/src/HeterogeneousCore/SonicTriton/test/tritonTest_cfg.py modules=TritonGraphProducer,TritonGraphFilter,TritonGraphAnalyzer maxEvents=2 unittest=1 verbose=1 device=${DEVICE} testother=1 >& $tmpFile +cmsRun ${LOCALTOP}/src/HeterogeneousCore/SonicTriton/test/tritonTest_cfg.py modules=TritonGraphProducer,TritonGraphFilter,TritonGraphAnalyzer maxEvents=2 unittest=1 verbose=1 device=${DEVICE} testother=1 fallbackName=${fallbackName} >& $tmpFile CMSEXIT=$? cat $tmpFile +sleep 15 STOP_COUNTER=0 -while ! LOGFILE="$(ls -rt ${LOCALTOP}/log_triton_server_instance*.log 2>/dev/null | tail -n 1)" && [ "$STOP_COUNTER" -lt 5 ]; do +while ! LOGFILE="$(ls -rt ${LOCALTOP}/log_${fallbackName}_*.log 2>/dev/null | tail -n 1)" && [ "$STOP_COUNTER" -lt 5 ]; do STOP_COUNTER=$((STOP_COUNTER+1)) sleep 5 done @@ -73,6 +75,7 @@ done if [ -n "$LOGFILE" ]; then echo -e '\n=====\nContents of '$LOGFILE':\n=====\n' cat "$LOGFILE" + rm $LOGFILE fi if grep -q "Socket closed" $tmpFile; then