Skip to content

Commit

Permalink
Merge pull request #35328 from kpedro88/TritonTestFixes
Browse files Browse the repository at this point in the history
Triton test fixes
  • Loading branch information
cmsbuild authored Sep 24, 2021
2 parents 0bf749a + b96778b commit 7c3f222
Show file tree
Hide file tree
Showing 5 changed files with 158 additions and 14 deletions.
9 changes: 8 additions & 1 deletion HeterogeneousCore/SonicTriton/interface/TritonService.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
#define HeterogeneousCore_SonicTriton_TritonService

#include "FWCore/ParameterSet/interface/ParameterSet.h"
#include "FWCore/Utilities/interface/GlobalIdentifier.h"

#include <vector>
#include <unordered_set>
Expand Down Expand Up @@ -38,7 +39,13 @@ class TritonService {
instanceName(pset.getUntrackedParameter<std::string>("instanceName")),
tempDir(pset.getUntrackedParameter<std::string>("tempDir")),
imageName(pset.getUntrackedParameter<std::string>("imageName")),
sandboxName(pset.getUntrackedParameter<std::string>("sandboxName")) {}
sandboxName(pset.getUntrackedParameter<std::string>("sandboxName")) {
//randomize instance name
if (instanceName.empty()) {
instanceName =
pset.getUntrackedParameter<std::string>("instanceBaseName") + "_" + edm::createGlobalIdentifier();
}
}

bool enable;
bool debug;
Expand Down
125 changes: 122 additions & 3 deletions HeterogeneousCore/SonicTriton/scripts/cmsTriton
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ AUTOPORT=""
NPORTS=3
IMAGE=fastml/triton-torchgeo:21.06-py3-geometric
SANDBOX=""
COMPAT_USR=""

get_sandbox(){
if [ -z "$SANDBOX" ]; then
Expand All @@ -35,6 +36,7 @@ usage() {
$ECHO
$ECHO "Options:"
$ECHO "-c \t don't cleanup temporary dir (for debugging)"
$ECHO "-C [dir] \t directory containing Nvidia compatibility drivers (checks CMSSW_BASE by default if available)"
$ECHO "-D \t dry run: print container commands rather than executing them"
$ECHO "-d \t use Docker instead of Singularity"
$ECHO "-f \t force reuse of (possibly) existing container instance"
Expand All @@ -55,6 +57,7 @@ usage() {
$ECHO "Operations:"
$ECHO "start \t start server"
$ECHO "stop \t stop server"
$ECHO "check \t check if server can run on this system"
exit $1
}

Expand All @@ -64,10 +67,12 @@ if [ -e /run/shm ]; then
SHM=/run/shm
fi

while getopts "cDdfgi:M:m:n:P:p:r:s:t:vw:h" opt; do
while getopts "cC:Ddfgi:M:m:n:P:p:r:s:t:vw:h" opt; do
case "$opt" in
c) CLEANUP=""
;;
C) COMPAT_USR="$OPTARG"
;;
D) DRYRUN=echo
;;
d) USEDOCKER=true
Expand Down Expand Up @@ -106,7 +111,7 @@ done
shift $(($OPTIND - 1))
OP=$1

if [ "$OP" != start ] && [ "$OP" != stop ]; then
if [ "$OP" != start ] && [ "$OP" != stop ] && [ "$OP" != check ]; then
usage 1
fi

Expand All @@ -130,6 +135,7 @@ LIB=lib
STARTED_INDICATOR="Started GRPCInferenceService"
SEGFAULT_INDICATOR="Address already in use"
EXTRA=""
COMPAT_SCRIPT=/etc/shinit_v2

compute_ports(){
# compute derived port numbers
Expand Down Expand Up @@ -184,6 +190,14 @@ start_docker(){
REPOARGS="$REPOARGS --model-repository=${REPO}"
done

# compatibility driver environment
if [ -n "$COMPAT" ]; then
MOUNTARGS="$MOUNTARGS -v$COMPAT"
if [ -n "$COMPAT_SCRIPT_MOUNT" ]; then
MOUNTARGS="$MOUNTARGS -v$COMPAT_SCRIPT_MOUNT"
fi
fi

$DRYRUN $DOCKER run -d --name ${SERVER} \
--shm-size=1g --ulimit memlock=-1 --ulimit stack=67108864 --ipc="host" --cap-add=IPC_OWNER \
-p${HTTPPORT}:${HTTPPORT} -p${GRPCPORT}:${GRPCPORT} -p${METRPORT}:${METRPORT} $EXTRA $MOUNTARGS \
Expand All @@ -194,6 +208,7 @@ start_singularity(){
# triton server image may need to modify contents of opt/tritonserver/lib/
# but cvmfs is read-only
# -> make a writable local directory with the same contents
# this is no longer needed as of triton 2.11.0, but kept for compatibility w/ older server versions
$DRYRUN mkdir ${LIB}
$DRYRUN ln -s ${SANDBOX}/opt/tritonserver/lib/* ${LIB}/

Expand All @@ -205,6 +220,25 @@ start_singularity(){
REPOARGS="$REPOARGS --model-repository=${REPO}"
done

# compatibility driver environment
if [ -n "$COMPAT" ]; then
MOUNTARGS="$MOUNTARGS -B $COMPAT"
if [ -n "$COMPAT_SCRIPT_MOUNT" ]; then
MOUNTARGS="$MOUNTARGS -B $COMPAT_SCRIPT_MOUNT"
fi
fi

# workaround for nvidia libs w/ singularity-in-singularity
# from https://github.com/hpcng/singularity/issues/5759#issuecomment-919523970
if [ -d /.singularity.d/libs ]; then
TMPD=`mktemp -d`
(echo '#!/bin/bash'; echo 'exec /usr/sbin/ldconfig -C '"$TMPD"'/ld.so.cache "$@"') > $TMPD/ldconfig
chmod +x $TMPD/ldconfig
PATH=$TMPD:$PATH
# this does not work with LD_LIBRARY_PATH from cmsenv
ldconfig /.singularity.d/libs
fi

# start instance
# need to bind /cvmfs for above symlinks to work inside container
$DRYRUN singularity instance start \
Expand Down Expand Up @@ -359,13 +393,88 @@ make_tmp(){
$DRYRUN cd "$TMPDIR"
}

# Print the value of one tag of a scram tool.
# Arguments:
#   $1: scram tool name
#   $2: tag to query for that tool
# Output: whatever `scram tool tag` prints on stdout (scram's stderr is discarded).
# Returns: always 0, because a failing scram call is deliberately ignored.
# Note: this changes directory to $CMSSW_BASE and sets the globals TOOL and TAG,
# so call it in a subshell (e.g. command substitution) to avoid side effects.
scram_tag(){
	# quoted so that a path containing whitespace or glob characters is not split
	cd "$CMSSW_BASE"
	TOOL="$1"
	TAG="$2"
	scram tool tag "$TOOL" "$TAG" 2> /dev/null || true
}

# Print the CUDA driver version declared in the environment of the Docker image ${IMAGE}.
# Runs `env` as the container entrypoint and prints only the value of
# CUDA_DRIVER_VERSION (nothing if the variable is not set in the image),
# i.e. the same output format as driver_singularity.
driver_docker(){
	# use the same Docker command as start_docker, falling back to plain docker if unset
	# (left unquoted on purpose so that a multi-word command is split into words)
	# the match is anchored so that only the variable itself is selected
	${DOCKER:-docker} run --rm --entrypoint env "${IMAGE}" | sed -n 's/^CUDA_DRIVER_VERSION=//p'
}

# Print the CUDA driver version declared by the Singularity sandbox ${SANDBOX}.
# Sources the sandbox's 10-docker2singularity.sh environment file and echoes
# CUDA_DRIVER_VERSION if sourcing succeeded.
# Note: sourcing modifies the current shell's environment, so call this in a
# subshell (e.g. command substitution).
driver_singularity(){
	# quoted so that a sandbox path containing whitespace is not split
	source "${SANDBOX}/.singularity.d/env/10-docker2singularity.sh" && echo "$CUDA_DRIVER_VERSION"
}

# Copy the compatibility script ${COMPAT_SCRIPT} out of the Docker image ${IMAGE}
# into the current directory.
# Returns: the exit status of the container creation if it fails,
# otherwise the exit status of the copy.
compat_docker(){
	local container status
	# use the same Docker command as start_docker, falling back to plain docker if unset
	# (left unquoted on purpose so that a multi-word command is split into words)
	container=$(${DOCKER:-docker} create "${IMAGE}") || return $?
	${DOCKER:-docker} cp "${container}:${COMPAT_SCRIPT}" .
	status=$?
	# the container is only created, never started, so it has to be removed explicitly
	${DOCKER:-docker} rm "$container" > /dev/null
	return $status
}

# Copy the compatibility script ${COMPAT_SCRIPT} out of the Singularity sandbox
# ${SANDBOX} into the current directory.
compat_singularity(){
	# quoted so that a sandbox path containing whitespace is not split
	cp "${SANDBOX}/${COMPAT_SCRIPT}" .
}

# Check whether the server image can run with the Nvidia drivers on this system.
#
# Compares the major version of the kernel driver (from /proc/driver/nvidia/version)
# with the major version of the CUDA driver reported by $DRIVER_FN for the image.
# If the system driver is older, looks for a directory of compatibility drivers:
#   1. the user-specified directory $COMPAT_USR
#   2. the "drivers" directory under the CUDA_BASE of the scram cuda tool (if $CMSSW_BASE is set)
#   3. /usr/local/cuda/compat
#
# Globals set:
#   COMPAT (exported): empty if no compatibility drivers are needed or found,
#       otherwise a "source:destination" mount specification
#   COMPAT_SCRIPT_MOUNT: mount specification for the patched copy of $COMPAT_SCRIPT;
#       only set when the current directory is $TMPDIR, because the copy is written there
# Returns: 0 if the system driver is recent enough or compatibility drivers were found,
#   1 otherwise.
check_drivers(){
	# reset before anything else, so that a value inherited from the environment
	# is never mounted when the system driver turns out to be sufficient
	export COMPAT=""

	# get sandbox env vars in subshell
	CUDA_DRIVER_VERSION=$($DRIVER_FN)
	# copied from https://github.com/triton-inference-server/server/blob/v2.11.0/nvidia_entrypoint.sh
	DRIVER_VERSION=$(sed -n 's/^NVRM.*Kernel Module *\([0-9.]*\).*$/\1/p' /proc/driver/nvidia/version 2>/dev/null || true)
	# compare major versions only (everything before the first dot)
	if [[ "${DRIVER_VERSION%%.*}" -ge "${CUDA_DRIVER_VERSION%%.*}" ]]; then
		return 0
	fi

	# 1. check for user-specified compatibility drivers
	if [ -n "$COMPAT_USR" ] && [ -d "$COMPAT_USR" ]; then
		COMPAT="$COMPAT_USR"
	fi

	# 2. check for compatibility drivers from CMSSW if available
	# based on $CMSSW_BASE/config/SCRAM/hooks/runtime/00-nvidia-drivers
	# but need to check drivers, not runtime
	if [ -z "$COMPAT" ] && [ -n "$CMSSW_BASE" ]; then
		CUDA_BASE=$(scram_tag cuda CUDA_BASE)
		COMPAT_CMSSW=${CUDA_BASE}/drivers

		if [ -n "$CUDA_BASE" ] && [ -d "$COMPAT_CMSSW" ]; then
			COMPAT="$COMPAT_CMSSW"
		fi
	fi

	# 3. finally, check expected system location
	COMPAT_SYS=/usr/local/cuda/compat
	if [ -z "$COMPAT" ] && [ -d "$COMPAT_SYS" ]; then
		COMPAT="$COMPAT_SYS"
	fi

	if [ -n "$COMPAT" ]; then
		# in order to mount drivers where Triton expects them
		export COMPAT="${COMPAT}:${COMPAT_SYS}/lib.real"
		# workaround for bug in Triton/Nvidia compatibility check
		# (only possible inside the temporary directory, where the script copy can be written)
		if [ "$PWD" == "$TMPDIR" ]; then
			$DRYRUN $COMPAT_FN
			COMPAT_SCRIPT_BASE=$(basename "$COMPAT_SCRIPT")
			$DRYRUN sed -i 's/LD_LIBRARY_PATH="${_CUDA_COMPAT_REALLIB}"/&; LD_PRELOAD=""/' "$COMPAT_SCRIPT_BASE"
			# the host side must be an absolute path:
			# Docker treats a relative name in -v as a named volume, not a bind mount
			COMPAT_SCRIPT_MOUNT="$PWD/$COMPAT_SCRIPT_BASE:$COMPAT_SCRIPT"
		fi
		return 0
	else
		return 1
	fi
}

if [ -n "$USEDOCKER" ]; then
if [ -n "$GPU" ]; then
EXTRA="--gpus all"
fi
START_FN=start_docker
TEST_FN=test_docker
STOP_FN=stop_docker
DRIVER_FN=driver_docker
COMPAT_FN=compat_docker
PROG_NAME=Docker
else
if [ -n "$GPU" ]; then
Expand All @@ -374,10 +483,15 @@ else
START_FN=start_singularity
TEST_FN=test_singularity
STOP_FN=stop_singularity
DRIVER_FN=driver_singularity
COMPAT_FN=compat_singularity
PROG_NAME=Singularity
fi

if [ "$OP" == start ]; then
if [ "$OP" == check ]; then
check_drivers
exit $?
elif [ "$OP" == start ]; then
# handle cleaning up
if [ -n "$FORCE" ]; then
auto_stop
Expand All @@ -394,6 +508,11 @@ if [ "$OP" == start ]; then

make_tmp

# after make_tmp because this may create file in tmp dir
check_drivers
DRIVER_EXIT=$?
if [ "$DRIVER_EXIT" -ne 0 ]; then exit $DRIVER_EXIT; fi

# if parent PID is provided, automatically stop server when finished
# do this before actually trying to start the server in case of ctrl+c
if [ -n "$PARENTPID" ]; then
Expand Down
7 changes: 1 addition & 6 deletions HeterogeneousCore/SonicTriton/src/TritonService.cc
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
#include "FWCore/ServiceRegistry/interface/ActivityRegistry.h"
#include "FWCore/ServiceRegistry/interface/ProcessContext.h"
#include "FWCore/Utilities/interface/Exception.h"
#include "FWCore/Utilities/interface/GlobalIdentifier.h"

#include "grpc_client.h"
#include "grpc_service.pb.h"
Expand Down Expand Up @@ -217,11 +216,6 @@ void TritonService::preBeginJob(edm::PathsAndConsumesOfModulesBase const&, edm::
if (verbose_)
edm::LogInfo("TritonService") << msg;

//randomize instance name
if (fallbackOpts_.instanceName.empty()) {
fallbackOpts_.instanceName = "triton_server_instance_" + edm::createGlobalIdentifier();
}

//assemble server start command
std::string command("cmsTriton -P -1 -p " + pid_);
if (fallbackOpts_.debug)
Expand Down Expand Up @@ -308,6 +302,7 @@ void TritonService::fillDescriptions(edm::ConfigurationDescriptions& description
fallbackDesc.addUntracked<bool>("useGPU", false);
fallbackDesc.addUntracked<int>("retries", -1);
fallbackDesc.addUntracked<int>("wait", -1);
fallbackDesc.addUntracked<std::string>("instanceBaseName", "triton_server_instance");
fallbackDesc.addUntracked<std::string>("instanceName", "");
fallbackDesc.addUntracked<std::string>("tempDir", "");
fallbackDesc.addUntracked<std::string>("imageName", "");
Expand Down
3 changes: 3 additions & 0 deletions HeterogeneousCore/SonicTriton/test/tritonTest_cfg.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
options.register("mode","Async", VarParsing.multiplicity.singleton, VarParsing.varType.string, "mode for client (choices: {})".format(', '.join(allowed_modes)))
options.register("verbose", False, VarParsing.multiplicity.singleton, VarParsing.varType.bool, "enable verbose output")
options.register("brief", False, VarParsing.multiplicity.singleton, VarParsing.varType.bool, "briefer output for graph modules")
options.register("fallbackName", "", VarParsing.multiplicity.singleton, VarParsing.varType.string, "name for fallback server")
options.register("unittest", False, VarParsing.multiplicity.singleton, VarParsing.varType.bool, "unit test mode: reduce input sizes")
options.register("testother", False, VarParsing.multiplicity.singleton, VarParsing.varType.bool, "also test gRPC communication if shared memory enabled, or vice versa")
options.register("shm", True, VarParsing.multiplicity.singleton, VarParsing.varType.bool, "enable shared memory")
Expand Down Expand Up @@ -83,6 +84,8 @@
process.TritonService.verbose = options.verbose
process.TritonService.fallback.verbose = options.verbose
process.TritonService.fallback.useDocker = options.docker
if len(options.fallbackName)>0:
process.TritonService.fallback.instanceBaseName = options.fallbackName
if options.device != "auto":
process.TritonService.fallback.useGPU = options.device=="gpu"
if len(options.address)>0:
Expand Down
28 changes: 24 additions & 4 deletions HeterogeneousCore/SonicTriton/test/unittest.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,10 @@ DEVICE=$2

# the test is not possible if:
# 1. GPU not available (only if GPU test requested) / avx instructions not supported (needed for singularity on CPU)
# 2. singularity not found or not usable
# 3. inside singularity container w/o unprivileged user namespace enabled (needed for singularity-in-singularity)
# 1b. Nvidia drivers not available
# 2. wrong architecture (not amd64)
# 3. singularity not found or not usable
# 4. inside singularity container w/o unprivileged user namespace enabled (needed for singularity-in-singularity)
# so just return true in those cases

if [ "$DEVICE" = "GPU" ]; then
Expand All @@ -16,6 +18,13 @@ if [ "$DEVICE" = "GPU" ]; then
echo "missing GPU"
exit 0
fi

if cmsTriton check; then
echo "has NVIDIA driver"
else
echo "missing current or compatible NVIDIA driver"
exit 0
fi
else
if grep -q avx /proc/cpuinfo; then
echo "has avx"
Expand All @@ -25,6 +34,14 @@ else
fi
fi

THIS_ARCH=$(echo $SCRAM_ARCH | cut -d'_' -f2)
if [ "$THIS_ARCH" == "amd64" ]; then
echo "has amd64"
else
echo "missing amd64"
exit 0
fi

if type singularity >& /dev/null; then
echo "has singularity"
else
Expand All @@ -41,21 +58,24 @@ if [ -n "$SINGULARITY_CONTAINER" ]; then
fi
fi

fallbackName=triton_server_instance_${DEVICE}
tmpFile=$(mktemp -p ${LOCALTOP} SonicTritonTestXXXXXXXX.log)
cmsRun ${LOCALTOP}/src/HeterogeneousCore/SonicTriton/test/tritonTest_cfg.py modules=TritonGraphProducer,TritonGraphFilter,TritonGraphAnalyzer maxEvents=2 unittest=1 verbose=1 device=${DEVICE} testother=1 >& $tmpFile
cmsRun ${LOCALTOP}/src/HeterogeneousCore/SonicTriton/test/tritonTest_cfg.py modules=TritonGraphProducer,TritonGraphFilter,TritonGraphAnalyzer maxEvents=2 unittest=1 verbose=1 device=${DEVICE} testother=1 fallbackName=${fallbackName} >& $tmpFile
CMSEXIT=$?

cat $tmpFile
sleep 15

STOP_COUNTER=0
while ! LOGFILE="$(ls -rt ${LOCALTOP}/log_triton_server_instance*.log 2>/dev/null | tail -n 1)" && [ "$STOP_COUNTER" -lt 5 ]; do
while ! LOGFILE="$(ls -rt ${LOCALTOP}/log_${fallbackName}_*.log 2>/dev/null | tail -n 1)" && [ "$STOP_COUNTER" -lt 5 ]; do
STOP_COUNTER=$((STOP_COUNTER+1))
sleep 5
done

if [ -n "$LOGFILE" ]; then
echo -e '\n=====\nContents of '$LOGFILE':\n=====\n'
cat "$LOGFILE"
rm $LOGFILE
fi

if grep -q "Socket closed" $tmpFile; then
Expand Down

0 comments on commit 7c3f222

Please sign in to comment.