From ae5949e8f9796d88c8874222dc66c46afa83da06 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Thu, 15 Sep 2022 13:02:53 +0200 Subject: [PATCH] Fix the NVProfilerService Use a ProcessCallGraph to get the highest possible module id, instead of relying on the modules count. --- .../CUDAServices/plugins/BuildFile.xml | 1 + .../CUDAServices/plugins/NVProfilerService.cc | 17 ++++++++++++----- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/HeterogeneousCore/CUDAServices/plugins/BuildFile.xml b/HeterogeneousCore/CUDAServices/plugins/BuildFile.xml index b9bd22319cc8c..f6b5f0a63fb12 100644 --- a/HeterogeneousCore/CUDAServices/plugins/BuildFile.xml +++ b/HeterogeneousCore/CUDAServices/plugins/BuildFile.xml @@ -7,6 +7,7 @@ + diff --git a/HeterogeneousCore/CUDAServices/plugins/NVProfilerService.cc b/HeterogeneousCore/CUDAServices/plugins/NVProfilerService.cc index 99960a719dab5..5cbf1819618b4 100644 --- a/HeterogeneousCore/CUDAServices/plugins/NVProfilerService.cc +++ b/HeterogeneousCore/CUDAServices/plugins/NVProfilerService.cc @@ -41,6 +41,7 @@ #include "FWCore/Utilities/interface/ProductKindOfType.h" #include "FWCore/Utilities/interface/TimeOfDay.h" #include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" +#include "HLTrigger/Timer/interface/ProcessCallGraph.h" using namespace std::string_literals; @@ -287,6 +288,9 @@ class NVProfilerService { return highlight(label) ? 
nvtxLightAmber : nvtxLightGreen; } + // build a complete representation of the modules in the whole job + ProcessCallGraph callgraph_; + std::vector highlightModules_; const bool showModulePrefetching_; const bool skipFirstEvent_; @@ -502,7 +506,7 @@ void NVProfilerService::preallocate(edm::service::SystemBounds const& bounds) { std::stringstream out; out << "preallocate: " << bounds.maxNumberOfConcurrentRuns() << " concurrent runs, " << bounds.maxNumberOfConcurrentLuminosityBlocks() << " luminosity sections, " << bounds.maxNumberOfStreams() - << " streams\nrunning on" << bounds.maxNumberOfThreads() << " threads"; + << " streams\nrunning on " << bounds.maxNumberOfThreads() << " threads"; nvtxDomainMark(global_domain_, out.str().c_str()); auto concurrentStreams = bounds.maxNumberOfStreams(); @@ -524,12 +528,13 @@ void NVProfilerService::preallocate(edm::service::SystemBounds const& bounds) { } void NVProfilerService::preBeginJob(edm::PathsAndConsumesOfModulesBase const& pathsAndConsumes, - edm::ProcessContext const& pc) { + edm::ProcessContext const& context) { + callgraph_.preBeginJob(pathsAndConsumes, context); nvtxDomainMark(global_domain_, "preBeginJob"); - // FIXME this probably works only in the absence of subprocesses - // size() + 1 because pathsAndConsumes.allModules() does not include the source - unsigned int modules = pathsAndConsumes.allModules().size() + 1; + // this assumes that preBeginJob is not called concurrently with the modules' beginJob method + // or the preBeginJob for a subprocess + unsigned int modules = callgraph_.size(); global_modules_.resize(modules, nvtxInvalidRangeId); for (unsigned int sid = 0; sid < stream_modules_.size(); ++sid) { stream_modules_[sid].resize(modules, nvtxInvalidRangeId); @@ -1115,6 +1120,8 @@ void NVProfilerService::postModuleGlobalEndLumi(edm::GlobalContext const& gc, ed } void NVProfilerService::preSourceConstruction(edm::ModuleDescription const& desc) { + callgraph_.preSourceConstruction(desc); + if (not 
skipFirstEvent_) { auto mid = desc.id(); global_modules_.grow_to_at_least(mid + 1);