From ca17fd7a505df17f7ad7278173337e3c30bb374b Mon Sep 17 00:00:00 2001 From: Marcel Andre Schneider Date: Tue, 5 May 2020 14:22:16 +0200 Subject: [PATCH 1/4] Intorduce initLumi as a counterpart to cleanupLumi. As reported in #29605, it can happen that a DQMOneEDAnalyzer does not produce it's per lumi MEs, because there were no events in the Lumisection and it only calls enterLumi as needed per event. To prevent this, we need to make sure that lumi ME are always created for every ME, but we can't have lumitransitions in DQMOneEDAnalyzer, so it needs to happen in a global callback. But there we can't safely use enterLumi, since that would corrupt the module's local MEs. The solution is to have a new method, initLumi, dedicated to initializing global MEs, but not touching local MEs. This is actually a nice thing in general, since now initLumi/cleanupLumi form a symmetrical pair (create/destroy global MEs), as well as enterLumi/leavLumi (update local MEs). All of these are idempotent, as before. There are a bunch of corner cases aorund booking: initLumi *must* be called before enterLumi, but the global callback that triggers it globally might (will) happen before booking has happened for the module. So we also need to do it after booking. There is a race related to lumi MEs if globalBeginLumi can happen *before* beginRun finishes for all plugins. This should not happen for now. --- DQMServices/Core/interface/DQMEDAnalyzer.h | 6 ++ .../Core/interface/DQMGlobalEDAnalyzer.h | 1 + DQMServices/Core/interface/DQMOneEDAnalyzer.h | 5 ++ DQMServices/Core/interface/DQMStore.h | 11 +++- DQMServices/Core/src/DQMStore.cc | 65 +++++++++++++++++-- 5 files changed, 79 insertions(+), 9 deletions(-) diff --git a/DQMServices/Core/interface/DQMEDAnalyzer.h b/DQMServices/Core/interface/DQMEDAnalyzer.h index 7499716d4e683..909ae0012b174 100644 --- a/DQMServices/Core/interface/DQMEDAnalyzer.h +++ b/DQMServices/Core/interface/DQMEDAnalyzer.h @@ -75,6 +75,11 @@ class DQMEDAnalyzer : public edm::stream::EDProducer()->initLumi(run.run(), /* lumi */ 0, this->moduleDescription().id()); + edm::Service()->enterLumi(run.run(), /* lumi */ 0, this->moduleDescription().id()); dqmBeginRun(run, setup); edm::Service()->bookTransaction( [this, &run, &setup](DQMStore::IBooker& booker) { @@ -83,6 +88,7 @@ class DQMEDAnalyzer : public edm::stream::EDProducergetCanSaveByLumi()); + edm::Service()->initLumi(run.run(), /* lumi */ 0, meId()); edm::Service()->enterLumi(run.run(), /* lumi */ 0, meId()); } diff --git a/DQMServices/Core/interface/DQMGlobalEDAnalyzer.h b/DQMServices/Core/interface/DQMGlobalEDAnalyzer.h index 2ea0937676119..75bfec42370fa 100644 --- a/DQMServices/Core/interface/DQMGlobalEDAnalyzer.h +++ b/DQMServices/Core/interface/DQMGlobalEDAnalyzer.h @@ -45,6 +45,7 @@ class DQMGlobalEDAnalyzer // local MEs for each run cache. meId(run), /* canSaveByLumi */ false); + dqmstore_->initLumi(run.run(), /* lumi */ 0, meId(run)); dqmstore_->enterLumi(run.run(), /* lumi */ 0, meId(run)); return h; } diff --git a/DQMServices/Core/interface/DQMOneEDAnalyzer.h b/DQMServices/Core/interface/DQMOneEDAnalyzer.h index 3ea7468f3b916..6eea9026b9b12 100644 --- a/DQMServices/Core/interface/DQMOneEDAnalyzer.h +++ b/DQMServices/Core/interface/DQMOneEDAnalyzer.h @@ -32,6 +32,10 @@ class DQMOneEDAnalyzer } void beginRun(edm::Run const& run, edm::EventSetup const& setup) final { + // if we run booking multiple times because there are multiple runs in a + // job, this is needed to make sure all existing MEs are in a valid state + // before the booking code runs. + edm::Service()->initLumi(run.run(), /* lumi */ 0, this->moduleDescription().id()); edm::Service()->enterLumi(run.run(), /* lumi */ 0, this->moduleDescription().id()); dqmBeginRun(run, setup); edm::Service()->bookTransaction( @@ -41,6 +45,7 @@ class DQMOneEDAnalyzer }, this->moduleDescription().id(), this->getCanSaveByLumi()); + edm::Service()->initLumi(run.run(), /* lumi */ 0, this->moduleDescription().id()); edm::Service()->enterLumi(run.run(), /* lumi */ 0, this->moduleDescription().id()); } diff --git a/DQMServices/Core/interface/DQMStore.h b/DQMServices/Core/interface/DQMStore.h index 67d84b711804c..e27262db68950 100644 --- a/DQMServices/Core/interface/DQMStore.h +++ b/DQMServices/Core/interface/DQMStore.h @@ -642,9 +642,16 @@ namespace dqm { // For input modules: trigger recycling without local ME/enterLumi/moduleID. MonitorElement* findOrRecycle(MonitorElementData::Key const&); + // this creates local all needed global MEs for the given run/lumi (and + // module), potentially cloning them if there are concurrent runs/lumis. + // Symmetrical to cleanupLumi, this is called from a framwork hook, to + // make sure it also runs when the module does not call anything. + void initLumi(edm::RunNumber_t run, edm::LuminosityBlockNumber_t lumi); + void initLumi(edm::RunNumber_t run, edm::LuminosityBlockNumber_t lumi, uint64_t moduleID); + // modules are expected to call these callbacks when they change run/lumi. - // The DQMStore then updates the module's MEs, potentially cloning them - // if there are concurrent runs/lumis. + // The DQMStore then updates the module's MEs local MEs to point to the + // new run/lumi. void enterLumi(edm::RunNumber_t run, edm::LuminosityBlockNumber_t lumi, uint64_t moduleID); void leaveLumi(edm::RunNumber_t run, edm::LuminosityBlockNumber_t lumi, uint64_t moduleID); diff --git a/DQMServices/Core/src/DQMStore.cc b/DQMServices/Core/src/DQMStore.cc index 095094b7c3820..bbc632a1e2187 100644 --- a/DQMServices/Core/src/DQMStore.cc +++ b/DQMServices/Core/src/DQMStore.cc @@ -357,15 +357,23 @@ namespace dqm::implementation { return nullptr; } - void DQMStore::enterLumi(edm::RunNumber_t run, edm::LuminosityBlockNumber_t lumi, uint64_t moduleID) { - // Make sure global MEs for the run/lumi exist (depending on scope), and - // point the local MEs for this module to these global MEs. + void DQMStore::initLumi(edm::RunNumber_t run, edm::LuminosityBlockNumber_t lumi) { + // Call initLumi for all modules, as a global operation. + auto lock = std::scoped_lock(this->booking_mutex_); + for (auto& kv : this->localMEs_) { + initLumi(run, lumi, kv.first); + } + } + + void DQMStore::initLumi(edm::RunNumber_t run, edm::LuminosityBlockNumber_t lumi, uint64_t moduleID) { + // Make sure global MEs for the run/lumi exist (depending on scope) auto lock = std::scoped_lock(this->booking_mutex_); // these are the MEs we need to update. auto& localset = this->localMEs_[moduleID]; // this is where they need to point to. + // This could be a per-run or per-lumi set (depending on lumi == 0) auto& targetset = this->globalMEs_[edm::LuminosityBlockID(run, lumi)]; // this is where we can get MEs to reuse. auto& prototypes = this->globalMEs_[edm::LuminosityBlockID()]; @@ -386,7 +394,7 @@ namespace dqm::implementation { auto target = targetset.find(me); // lookup by path, thanks to MEComparison if (target != targetset.end()) { // we already have a ME, just use it! - debugTrackME("enterLumi (existing)", nullptr, *target); + debugTrackME("initLumi (existing)", nullptr, *target); } else { // look for a prototype to reuse. auto proto = prototypes.find(me); @@ -410,7 +418,7 @@ namespace dqm::implementation { auto result = targetset.insert(oldme); assert(result.second); // was new insertion target = result.first; // iterator to new ME - debugTrackME("enterLumi (reused)", nullptr, *target); + debugTrackME("initLumi (reused)", nullptr, *target); } else { // no prototype available. That means we have concurrent Lumis/Runs, // and need to make a clone now. @@ -431,9 +439,48 @@ namespace dqm::implementation { auto result = targetset.insert(newme); assert(result.second); // was new insertion target = result.first; // iterator to new ME - debugTrackME("enterLumi (allocated)", nullptr, *target); + debugTrackME("initLumi (allocated)", nullptr, *target); } } + } + } + + void DQMStore::enterLumi(edm::RunNumber_t run, edm::LuminosityBlockNumber_t lumi, uint64_t moduleID) { + // point the local MEs for this module to these global MEs. + + // This needs to happen before we can use the global MEs for this run/lumi here. + // We could do it lazyly here, or eagerly globally in global begin lumi. + //initLumi(run, lumi, moduleID); + + auto lock = std::scoped_lock(this->booking_mutex_); + + // these are the MEs we need to update. + auto& localset = this->localMEs_[moduleID]; + // this is where they need to point to. + auto& targetset = this->globalMEs_[edm::LuminosityBlockID(run, lumi)]; + + // only for a sanity check + auto checkScope = [run, lumi](MonitorElementData::Scope scope) { + if (scope == MonitorElementData::Scope::JOB) { + return (run == 0 && lumi == 0); + } else if (scope == MonitorElementData::Scope::RUN) { + return (run != 0 && lumi == 0); + } else if (scope == MonitorElementData::Scope::LUMI) { + return (lumi != 0); + } + assert(!"Impossible Scope."); + return false; + }; + + for (MonitorElement* me : localset) { + auto target = targetset.find(me); // lookup by path, thanks to MEComparison + if (target == targetset.end()) { + auto anyme = this->findME(me); + debugTrackME("enterLumi (nothingtodo)", me, nullptr); + assert(anyme && checkScope(anyme->getScope()) == false); + continue; + } + assert(target != targetset.end()); // initLumi should have taken care of this. // now we have the proper global ME in the right place, point the local there. // This is only safe if the name is exactly the same -- else it might corrupt // the tree structure of the set! @@ -675,15 +722,19 @@ namespace dqm::implementation { // Set lumi and run for legacy booking. // This is no more than a guess with concurrent runs/lumis, but should be // correct for purely sequential legacy stuff. - // These transitions should only affect non-DQM*EDAnalyzer based code. // Also reset Scope, such that legacy modules can expect it to be JOB. + // initLumi and leaveLumi are needed for all module types: these handle + // creating and deleting global MEs as needed, which has to happen even if + // a module does not see lumi transitions. ar.watchPreGlobalBeginRun([this](edm::GlobalContext const& gc) { this->setRunLumi(gc.luminosityBlockID()); + this->initLumi(gc.luminosityBlockID().run(), /* lumi */ 0); this->enterLumi(gc.luminosityBlockID().run(), /* lumi */ 0, /* moduleID */ 0); this->setScope(MonitorElementData::Scope::JOB); }); ar.watchPreGlobalBeginLumi([this](edm::GlobalContext const& gc) { this->setRunLumi(gc.luminosityBlockID()); + this->initLumi(gc.luminosityBlockID().run(), gc.luminosityBlockID().luminosityBlock()); this->enterLumi(gc.luminosityBlockID().run(), gc.luminosityBlockID().luminosityBlock(), /* moduleID */ 0); }); ar.watchPostGlobalEndRun([this](edm::GlobalContext const& gc) { From 1ffdc508cd85a1c4663506308cb978180ea15057 Mon Sep 17 00:00:00 2001 From: Marcel Andre Schneider Date: Tue, 5 May 2020 14:34:23 +0200 Subject: [PATCH 2/4] Also update README. --- DQMServices/Core/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DQMServices/Core/README.md b/DQMServices/Core/README.md index fba63c8830d92..821e661b94568 100644 --- a/DQMServices/Core/README.md +++ b/DQMServices/Core/README.md @@ -137,7 +137,7 @@ When a ME is booked, internally _global_ and _local_ MEs are created. This shou - In the DQM API, we face the conflict that `MonitorElement` objects are held in the modules (so their life cycle has to match that of the module) but also represent histograms whose life cycle depends the data processed (run and lumi transitions). This caused conflicts since the introduction of multi-threading. - The `DQMStore` resolves this conflict by representing each monitor element using (at least) two objects: A _local_ `MonitorElement`, that follows the module life cycle but does not own data, and a _global_ `MonitorElement` that owns histogram data but does not belong to any module. There may be multiple _local_ MEs for one _global_ ME if multiple modules fill the same histogram (`edm::stream` or even independent modules). There may be multiple _global_ MEs for the same histogram if there are concurrent lumisections. - The live cycle of _local_ MEs is driven by callbacks from each of the module base classes (`enterLumi`, `leaveLumi`). For legacy `edm::EDAnalyzer`s, global begin/end run/lumi hooks are used, which only work as long as there are no concurrent lumisections. The _local_ MEs are kept in a set of containers indexed by the `moduleID`, with special value `0` for _all_ legacy modules and special values for `DQMGlobalEDAnalyzer`s, where the local MEs need to match the life cycle of the `runCache` (module id + run number), and `DQMEDAnalyzer`s, where the `streamID` is combined with the `moduleID` to get a unique identifier for each stream instance. -- The live cycle of _global_ MEs is driven by the enter/leave lumi calls (indirectly) and the `cleanupLumi` hook called via the edm service interface. They are kept in a set of containers indexed by run and lumi number. For `RUN` MEs, the lumi number is 0; for `JOB` MEs, run and lumi are zero. The special pair `(0,0)` is also used for _prototypes_: Global MEs that are not currently associated to any run or lumi, but can (and _have to_, for the legacy guarantees) be recycled once a run or lumi starts. +- The live cycle of _global_ MEs is driven by the `initLumi/cleanupLumi` hooks called via the edm service interface. They are kept in a set of containers indexed by run and lumi number. For `RUN` MEs, the lumi number is 0; for `JOB` MEs, run and lumi are zero. The special pair `(0,0)` is also used for _prototypes_: Global MEs that are not currently associated to any run or lumi, but can (and _have to_, for the legacy guarantees) be recycled once a run or lumi starts. - If there are no concurrent lumisections, both _local_ and _global_ MEs live for the entire job and are always connected in the same way, which means all legacy interactions continue to work. `assertLegacySafe` (enabled by default) checks for this condition and crashes the job if it is violated. From 6172725e062ac5793092c585f489ba7db9b90745 Mon Sep 17 00:00:00 2001 From: Marcel Andre Schneider Date: Tue, 5 May 2020 14:36:22 +0200 Subject: [PATCH 3/4] Code-format. --- DQMServices/Core/interface/DQMStore.h | 4 ++-- DQMServices/Core/src/DQMStore.cc | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/DQMServices/Core/interface/DQMStore.h b/DQMServices/Core/interface/DQMStore.h index e27262db68950..f2e74be85990d 100644 --- a/DQMServices/Core/interface/DQMStore.h +++ b/DQMServices/Core/interface/DQMStore.h @@ -642,9 +642,9 @@ namespace dqm { // For input modules: trigger recycling without local ME/enterLumi/moduleID. MonitorElement* findOrRecycle(MonitorElementData::Key const&); - // this creates local all needed global MEs for the given run/lumi (and + // this creates local all needed global MEs for the given run/lumi (and // module), potentially cloning them if there are concurrent runs/lumis. - // Symmetrical to cleanupLumi, this is called from a framwork hook, to + // Symmetrical to cleanupLumi, this is called from a framwork hook, to // make sure it also runs when the module does not call anything. void initLumi(edm::RunNumber_t run, edm::LuminosityBlockNumber_t lumi); void initLumi(edm::RunNumber_t run, edm::LuminosityBlockNumber_t lumi, uint64_t moduleID); diff --git a/DQMServices/Core/src/DQMStore.cc b/DQMServices/Core/src/DQMStore.cc index bbc632a1e2187..f61a7b8e67e36 100644 --- a/DQMServices/Core/src/DQMStore.cc +++ b/DQMServices/Core/src/DQMStore.cc @@ -480,7 +480,7 @@ namespace dqm::implementation { assert(anyme && checkScope(anyme->getScope()) == false); continue; } - assert(target != targetset.end()); // initLumi should have taken care of this. + assert(target != targetset.end()); // initLumi should have taken care of this. // now we have the proper global ME in the right place, point the local there. // This is only safe if the name is exactly the same -- else it might corrupt // the tree structure of the set! From b4c63f1c24dcb2ebefb33193c3c248e4ff893bd2 Mon Sep 17 00:00:00 2001 From: Marcel Andre Schneider Date: Tue, 5 May 2020 15:23:23 +0200 Subject: [PATCH 4/4] Add a unit test with empty lumis. Turns out EmptySource does not really support that... --- DQMServices/Demo/test/run_analyzers_cfg.py | 4 +++- DQMServices/Demo/test/runtests.sh | 11 +++++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/DQMServices/Demo/test/run_analyzers_cfg.py b/DQMServices/Demo/test/run_analyzers_cfg.py index 4912f8c190e7f..aeb56505ef65a 100644 --- a/DQMServices/Demo/test/run_analyzers_cfg.py +++ b/DQMServices/Demo/test/run_analyzers_cfg.py @@ -37,6 +37,7 @@ parser.register('firstRun', 1, one, int, "See EmptySource.") parser.register('numberEventsInRun', 100, one, int, "See EmptySource.") parser.register('numberEventsInLuminosityBlock', 20, one, int, "See EmptySource.") +parser.register('processingMode', 'RunsLumisAndEvents', one, string, "See EmptySource.") parser.register('nEvents', 100, one, int, "Total number of events.") parser.register('nThreads', 1, one, int, "Number of threads and streams.") parser.register('nConcurrent', 1, one, int, "Number of concurrent runs/lumis.") @@ -50,7 +51,8 @@ numberEventsInLuminosityBlock = cms.untracked.uint32(args.numberEventsInLuminosityBlock), firstLuminosityBlock = cms.untracked.uint32(args.firstLuminosityBlock), firstEvent = cms.untracked.uint32(args.firstEvent), - firstRun = cms.untracked.uint32(args.firstRun)) + firstRun = cms.untracked.uint32(args.firstRun), + processingMode = cms.untracked.string(args.processingMode)) process.maxEvents = cms.untracked.PSet( input = cms.untracked.int32(args.nEvents) ) diff --git a/DQMServices/Demo/test/runtests.sh b/DQMServices/Demo/test/runtests.sh index c27429a3d4165..f9b80cb001bc7 100755 --- a/DQMServices/Demo/test/runtests.sh +++ b/DQMServices/Demo/test/runtests.sh @@ -155,4 +155,15 @@ cmsRun $LOCAL_TEST_DIR/run_analyzers_cfg.py outfile=empty.root nEvents=0 cmsRun $LOCAL_TEST_DIR/run_analyzers_cfg.py outfile=empty.root howmany=0 cmsRun $LOCAL_TEST_DIR/run_analyzers_cfg.py outfile=empty.root howmany=0 legacyoutput=True cmsRun $LOCAL_TEST_DIR/run_analyzers_cfg.py outfile=empty.root howmany=0 protobufoutput=True +# also try empty lumisections. EmptySource does not really support 'no events' mode (never terminates), so, a bit of a hack here. +cmsRun $LOCAL_TEST_DIR/run_analyzers_cfg.py outfile=noevents.root processingMode='RunsAndLumis' & +PID=$! +sleep 5 +kill -INT $PID +wait +[ 66 = $(dqmiolistmes.py noevents.root -r 1 | wc -l) ] +[ 66 = $(dqmiolistmes.py noevents.root -r 1 -l 1 | wc -l) ] +[ 66 = $(dqmiolistmes.py noevents.root -r 2 | wc -l) ] +[ 66 = $(dqmiolistmes.py noevents.root -r 2 -l 2 | wc -l) ] +