diff --git a/DQMServices/Core/README.md b/DQMServices/Core/README.md index fba63c8830d92..821e661b94568 100644 --- a/DQMServices/Core/README.md +++ b/DQMServices/Core/README.md @@ -137,7 +137,7 @@ When a ME is booked, internally _global_ and _local_ MEs are created. This shou - In the DQM API, we face the conflict that `MonitorElement` objects are held in the modules (so their life cycle has to match that of the module) but also represent histograms whose life cycle depends the data processed (run and lumi transitions). This caused conflicts since the introduction of multi-threading. - The `DQMStore` resolves this conflict by representing each monitor element using (at least) two objects: A _local_ `MonitorElement`, that follows the module life cycle but does not own data, and a _global_ `MonitorElement` that owns histogram data but does not belong to any module. There may be multiple _local_ MEs for one _global_ ME if multiple modules fill the same histogram (`edm::stream` or even independent modules). There may be multiple _global_ MEs for the same histogram if there are concurrent lumisections. - The live cycle of _local_ MEs is driven by callbacks from each of the module base classes (`enterLumi`, `leaveLumi`). For legacy `edm::EDAnalyzer`s, global begin/end run/lumi hooks are used, which only work as long as there are no concurrent lumisections. The _local_ MEs are kept in a set of containers indexed by the `moduleID`, with special value `0` for _all_ legacy modules and special values for `DQMGlobalEDAnalyzer`s, where the local MEs need to match the life cycle of the `runCache` (module id + run number), and `DQMEDAnalyzer`s, where the `streamID` is combined with the `moduleID` to get a unique identifier for each stream instance. -- The live cycle of _global_ MEs is driven by the enter/leave lumi calls (indirectly) and the `cleanupLumi` hook called via the edm service interface. They are kept in a set of containers indexed by run and lumi number. For `RUN` MEs, the lumi number is 0; for `JOB` MEs, run and lumi are zero. The special pair `(0,0)` is also used for _prototypes_: Global MEs that are not currently associated to any run or lumi, but can (and _have to_, for the legacy guarantees) be recycled once a run or lumi starts. +- The live cycle of _global_ MEs is driven by the `initLumi/cleanupLumi` hooks called via the edm service interface. They are kept in a set of containers indexed by run and lumi number. For `RUN` MEs, the lumi number is 0; for `JOB` MEs, run and lumi are zero. The special pair `(0,0)` is also used for _prototypes_: Global MEs that are not currently associated to any run or lumi, but can (and _have to_, for the legacy guarantees) be recycled once a run or lumi starts. - If there are no concurrent lumisections, both _local_ and _global_ MEs live for the entire job and are always connected in the same way, which means all legacy interactions continue to work. `assertLegacySafe` (enabled by default) checks for this condition and crashes the job if it is violated. diff --git a/DQMServices/Core/interface/DQMEDAnalyzer.h b/DQMServices/Core/interface/DQMEDAnalyzer.h index 7499716d4e683..909ae0012b174 100644 --- a/DQMServices/Core/interface/DQMEDAnalyzer.h +++ b/DQMServices/Core/interface/DQMEDAnalyzer.h @@ -75,6 +75,11 @@ class DQMEDAnalyzer : public edm::stream::EDProducer()->initLumi(run.run(), /* lumi */ 0, this->moduleDescription().id()); + edm::Service()->enterLumi(run.run(), /* lumi */ 0, this->moduleDescription().id()); dqmBeginRun(run, setup); edm::Service()->bookTransaction( [this, &run, &setup](DQMStore::IBooker& booker) { @@ -83,6 +88,7 @@ class DQMEDAnalyzer : public edm::stream::EDProducergetCanSaveByLumi()); + edm::Service()->initLumi(run.run(), /* lumi */ 0, meId()); edm::Service()->enterLumi(run.run(), /* lumi */ 0, meId()); } diff --git a/DQMServices/Core/interface/DQMGlobalEDAnalyzer.h b/DQMServices/Core/interface/DQMGlobalEDAnalyzer.h index 2ea0937676119..75bfec42370fa 100644 --- a/DQMServices/Core/interface/DQMGlobalEDAnalyzer.h +++ b/DQMServices/Core/interface/DQMGlobalEDAnalyzer.h @@ -45,6 +45,7 @@ class DQMGlobalEDAnalyzer // local MEs for each run cache. meId(run), /* canSaveByLumi */ false); + dqmstore_->initLumi(run.run(), /* lumi */ 0, meId(run)); dqmstore_->enterLumi(run.run(), /* lumi */ 0, meId(run)); return h; } diff --git a/DQMServices/Core/interface/DQMOneEDAnalyzer.h b/DQMServices/Core/interface/DQMOneEDAnalyzer.h index 3ea7468f3b916..6eea9026b9b12 100644 --- a/DQMServices/Core/interface/DQMOneEDAnalyzer.h +++ b/DQMServices/Core/interface/DQMOneEDAnalyzer.h @@ -32,6 +32,10 @@ class DQMOneEDAnalyzer } void beginRun(edm::Run const& run, edm::EventSetup const& setup) final { + // if we run booking multiple times because there are multiple runs in a + // job, this is needed to make sure all existing MEs are in a valid state + // before the booking code runs. + edm::Service()->initLumi(run.run(), /* lumi */ 0, this->moduleDescription().id()); edm::Service()->enterLumi(run.run(), /* lumi */ 0, this->moduleDescription().id()); dqmBeginRun(run, setup); edm::Service()->bookTransaction( @@ -41,6 +45,7 @@ class DQMOneEDAnalyzer }, this->moduleDescription().id(), this->getCanSaveByLumi()); + edm::Service()->initLumi(run.run(), /* lumi */ 0, this->moduleDescription().id()); edm::Service()->enterLumi(run.run(), /* lumi */ 0, this->moduleDescription().id()); } diff --git a/DQMServices/Core/interface/DQMStore.h b/DQMServices/Core/interface/DQMStore.h index 67d84b711804c..f2e74be85990d 100644 --- a/DQMServices/Core/interface/DQMStore.h +++ b/DQMServices/Core/interface/DQMStore.h @@ -642,9 +642,16 @@ namespace dqm { // For input modules: trigger recycling without local ME/enterLumi/moduleID. MonitorElement* findOrRecycle(MonitorElementData::Key const&); + // this creates local all needed global MEs for the given run/lumi (and + // module), potentially cloning them if there are concurrent runs/lumis. + // Symmetrical to cleanupLumi, this is called from a framwork hook, to + // make sure it also runs when the module does not call anything. + void initLumi(edm::RunNumber_t run, edm::LuminosityBlockNumber_t lumi); + void initLumi(edm::RunNumber_t run, edm::LuminosityBlockNumber_t lumi, uint64_t moduleID); + // modules are expected to call these callbacks when they change run/lumi. - // The DQMStore then updates the module's MEs, potentially cloning them - // if there are concurrent runs/lumis. + // The DQMStore then updates the module's MEs local MEs to point to the + // new run/lumi. void enterLumi(edm::RunNumber_t run, edm::LuminosityBlockNumber_t lumi, uint64_t moduleID); void leaveLumi(edm::RunNumber_t run, edm::LuminosityBlockNumber_t lumi, uint64_t moduleID); diff --git a/DQMServices/Core/src/DQMStore.cc b/DQMServices/Core/src/DQMStore.cc index 095094b7c3820..f61a7b8e67e36 100644 --- a/DQMServices/Core/src/DQMStore.cc +++ b/DQMServices/Core/src/DQMStore.cc @@ -357,15 +357,23 @@ namespace dqm::implementation { return nullptr; } - void DQMStore::enterLumi(edm::RunNumber_t run, edm::LuminosityBlockNumber_t lumi, uint64_t moduleID) { - // Make sure global MEs for the run/lumi exist (depending on scope), and - // point the local MEs for this module to these global MEs. + void DQMStore::initLumi(edm::RunNumber_t run, edm::LuminosityBlockNumber_t lumi) { + // Call initLumi for all modules, as a global operation. + auto lock = std::scoped_lock(this->booking_mutex_); + for (auto& kv : this->localMEs_) { + initLumi(run, lumi, kv.first); + } + } + + void DQMStore::initLumi(edm::RunNumber_t run, edm::LuminosityBlockNumber_t lumi, uint64_t moduleID) { + // Make sure global MEs for the run/lumi exist (depending on scope) auto lock = std::scoped_lock(this->booking_mutex_); // these are the MEs we need to update. auto& localset = this->localMEs_[moduleID]; // this is where they need to point to. + // This could be a per-run or per-lumi set (depending on lumi == 0) auto& targetset = this->globalMEs_[edm::LuminosityBlockID(run, lumi)]; // this is where we can get MEs to reuse. auto& prototypes = this->globalMEs_[edm::LuminosityBlockID()]; @@ -386,7 +394,7 @@ namespace dqm::implementation { auto target = targetset.find(me); // lookup by path, thanks to MEComparison if (target != targetset.end()) { // we already have a ME, just use it! - debugTrackME("enterLumi (existing)", nullptr, *target); + debugTrackME("initLumi (existing)", nullptr, *target); } else { // look for a prototype to reuse. auto proto = prototypes.find(me); @@ -410,7 +418,7 @@ namespace dqm::implementation { auto result = targetset.insert(oldme); assert(result.second); // was new insertion target = result.first; // iterator to new ME - debugTrackME("enterLumi (reused)", nullptr, *target); + debugTrackME("initLumi (reused)", nullptr, *target); } else { // no prototype available. That means we have concurrent Lumis/Runs, // and need to make a clone now. @@ -431,9 +439,48 @@ namespace dqm::implementation { auto result = targetset.insert(newme); assert(result.second); // was new insertion target = result.first; // iterator to new ME - debugTrackME("enterLumi (allocated)", nullptr, *target); + debugTrackME("initLumi (allocated)", nullptr, *target); } } + } + } + + void DQMStore::enterLumi(edm::RunNumber_t run, edm::LuminosityBlockNumber_t lumi, uint64_t moduleID) { + // point the local MEs for this module to these global MEs. + + // This needs to happen before we can use the global MEs for this run/lumi here. + // We could do it lazyly here, or eagerly globally in global begin lumi. + //initLumi(run, lumi, moduleID); + + auto lock = std::scoped_lock(this->booking_mutex_); + + // these are the MEs we need to update. + auto& localset = this->localMEs_[moduleID]; + // this is where they need to point to. + auto& targetset = this->globalMEs_[edm::LuminosityBlockID(run, lumi)]; + + // only for a sanity check + auto checkScope = [run, lumi](MonitorElementData::Scope scope) { + if (scope == MonitorElementData::Scope::JOB) { + return (run == 0 && lumi == 0); + } else if (scope == MonitorElementData::Scope::RUN) { + return (run != 0 && lumi == 0); + } else if (scope == MonitorElementData::Scope::LUMI) { + return (lumi != 0); + } + assert(!"Impossible Scope."); + return false; + }; + + for (MonitorElement* me : localset) { + auto target = targetset.find(me); // lookup by path, thanks to MEComparison + if (target == targetset.end()) { + auto anyme = this->findME(me); + debugTrackME("enterLumi (nothingtodo)", me, nullptr); + assert(anyme && checkScope(anyme->getScope()) == false); + continue; + } + assert(target != targetset.end()); // initLumi should have taken care of this. // now we have the proper global ME in the right place, point the local there. // This is only safe if the name is exactly the same -- else it might corrupt // the tree structure of the set! @@ -675,15 +722,19 @@ namespace dqm::implementation { // Set lumi and run for legacy booking. // This is no more than a guess with concurrent runs/lumis, but should be // correct for purely sequential legacy stuff. - // These transitions should only affect non-DQM*EDAnalyzer based code. // Also reset Scope, such that legacy modules can expect it to be JOB. + // initLumi and leaveLumi are needed for all module types: these handle + // creating and deleting global MEs as needed, which has to happen even if + // a module does not see lumi transitions. ar.watchPreGlobalBeginRun([this](edm::GlobalContext const& gc) { this->setRunLumi(gc.luminosityBlockID()); + this->initLumi(gc.luminosityBlockID().run(), /* lumi */ 0); this->enterLumi(gc.luminosityBlockID().run(), /* lumi */ 0, /* moduleID */ 0); this->setScope(MonitorElementData::Scope::JOB); }); ar.watchPreGlobalBeginLumi([this](edm::GlobalContext const& gc) { this->setRunLumi(gc.luminosityBlockID()); + this->initLumi(gc.luminosityBlockID().run(), gc.luminosityBlockID().luminosityBlock()); this->enterLumi(gc.luminosityBlockID().run(), gc.luminosityBlockID().luminosityBlock(), /* moduleID */ 0); }); ar.watchPostGlobalEndRun([this](edm::GlobalContext const& gc) { diff --git a/DQMServices/Demo/test/run_analyzers_cfg.py b/DQMServices/Demo/test/run_analyzers_cfg.py index 4912f8c190e7f..aeb56505ef65a 100644 --- a/DQMServices/Demo/test/run_analyzers_cfg.py +++ b/DQMServices/Demo/test/run_analyzers_cfg.py @@ -37,6 +37,7 @@ parser.register('firstRun', 1, one, int, "See EmptySource.") parser.register('numberEventsInRun', 100, one, int, "See EmptySource.") parser.register('numberEventsInLuminosityBlock', 20, one, int, "See EmptySource.") +parser.register('processingMode', 'RunsLumisAndEvents', one, string, "See EmptySource.") parser.register('nEvents', 100, one, int, "Total number of events.") parser.register('nThreads', 1, one, int, "Number of threads and streams.") parser.register('nConcurrent', 1, one, int, "Number of concurrent runs/lumis.") @@ -50,7 +51,8 @@ numberEventsInLuminosityBlock = cms.untracked.uint32(args.numberEventsInLuminosityBlock), firstLuminosityBlock = cms.untracked.uint32(args.firstLuminosityBlock), firstEvent = cms.untracked.uint32(args.firstEvent), - firstRun = cms.untracked.uint32(args.firstRun)) + firstRun = cms.untracked.uint32(args.firstRun), + processingMode = cms.untracked.string(args.processingMode)) process.maxEvents = cms.untracked.PSet( input = cms.untracked.int32(args.nEvents) ) diff --git a/DQMServices/Demo/test/runtests.sh b/DQMServices/Demo/test/runtests.sh index c27429a3d4165..f9b80cb001bc7 100755 --- a/DQMServices/Demo/test/runtests.sh +++ b/DQMServices/Demo/test/runtests.sh @@ -155,4 +155,15 @@ cmsRun $LOCAL_TEST_DIR/run_analyzers_cfg.py outfile=empty.root nEvents=0 cmsRun $LOCAL_TEST_DIR/run_analyzers_cfg.py outfile=empty.root howmany=0 cmsRun $LOCAL_TEST_DIR/run_analyzers_cfg.py outfile=empty.root howmany=0 legacyoutput=True cmsRun $LOCAL_TEST_DIR/run_analyzers_cfg.py outfile=empty.root howmany=0 protobufoutput=True +# also try empty lumisections. EmptySource does not really support 'no events' mode (never terminates), so, a bit of a hack here. +cmsRun $LOCAL_TEST_DIR/run_analyzers_cfg.py outfile=noevents.root processingMode='RunsAndLumis' & +PID=$! +sleep 5 +kill -INT $PID +wait +[ 66 = $(dqmiolistmes.py noevents.root -r 1 | wc -l) ] +[ 66 = $(dqmiolistmes.py noevents.root -r 1 -l 1 | wc -l) ] +[ 66 = $(dqmiolistmes.py noevents.root -r 2 | wc -l) ] +[ 66 = $(dqmiolistmes.py noevents.root -r 2 -l 2 | wc -l) ] +