Skip to content

Commit

Permalink
Merge pull request #35990 from abhih1/ML4DQM_EB_integration_master
Browse files Browse the repository at this point in the history
Integration of the ML based online Ecal DQM using autoencoder network
  • Loading branch information
cmsbuild authored Nov 11, 2021
2 parents 1411a3a + 61c60ff commit 1d81c36
Show file tree
Hide file tree
Showing 9 changed files with 847 additions and 8 deletions.
2 changes: 2 additions & 0 deletions DQM/EcalMonitorClient/BuildFile.xml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
<use name="FWCore/MessageLogger"/>
<use name="FWCore/ParameterSet"/>
<use name="rootcore"/>
<use name="onnxruntime"/>
<use name="PhysicsTools/ONNXRuntime"/>
<export>
<lib name="1"/>
</export>
41 changes: 41 additions & 0 deletions DQM/EcalMonitorClient/interface/MLClient.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
#ifndef MLClient_H
#define MLClient_H

#include "DQWorkerClient.h"
#include <vector>
#include <valarray>
#include <deque>

namespace ecaldqm {

class MLClient : public DQWorkerClient {
public:
MLClient();
~MLClient() override {}

void producePlots(ProcessType) override;

private:
void setParams(edm::ParameterSet const&) override;

//Each Ecal Barrel occupancy map is plotted at the tower level
//34 towers in the eta and 72 towers in the phi directions
static const int nEtaTowers = 34;
static const int nPhiTowers = 72;
//After padding with two rows above and below to prevent the edge effect, 36 towers in eta direction
static const int nEtaTowersPad = 36;
float MLThreshold_;
float PUcorr_slope_;
float PUcorr_intercept_;
size_t nLS = 3; //No.of lumisections to add the occupancy over
size_t nLSloss = 6; //No.of lumisections to multiply the loss over

std::deque<int> NEventQ; //To keep the no.of events in each occupancy plot
std::deque<std::valarray<float>> ebOccMap1dQ; //To keep the input occupancy plots to be summed
std::vector<double> avgOcc_; //To keep the average occupancy to do response correction
std::deque<std::valarray<std::valarray<float>>> lossMap2dQ; //To keep the ML losses to be multiplied
};

} // namespace ecaldqm

#endif
5 changes: 4 additions & 1 deletion DQM/EcalMonitorClient/python/EcalMonitorClient_cfi.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from DQM.EcalMonitorClient.TimingClient_cfi import ecalTimingClient
from DQM.EcalMonitorClient.TrigPrimClient_cfi import ecalTrigPrimClient
from DQM.EcalMonitorClient.SummaryClient_cfi import ecalSummaryClient
from DQM.EcalMonitorClient.MLClient_cfi import ecalMLClient

ecalMonitorClient = DQMEDHarvester("EcalDQMonitorClient",
moduleName = cms.untracked.string("Ecal Monitor Client"),
Expand All @@ -22,6 +23,7 @@
"RawDataClient",
"TrigPrimClient",
"TimingClient",
"MLClient",
"SummaryClient"
),
# task parameters (included from indivitual cfis)
Expand All @@ -33,7 +35,8 @@
SelectiveReadoutClient = ecalSelectiveReadoutClient,
TimingClient = ecalTimingClient,
TrigPrimClient = ecalTrigPrimClient,
SummaryClient = ecalSummaryClient
SummaryClient = ecalSummaryClient,
MLClient = ecalMLClient
),
commonParameters = ecalCommonParams,
verbosity = cms.untracked.int32(0)
Expand Down
411 changes: 411 additions & 0 deletions DQM/EcalMonitorClient/python/MLClient_cfi.py

Large diffs are not rendered by default.

281 changes: 281 additions & 0 deletions DQM/EcalMonitorClient/src/MLClient.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,281 @@
#include "DQM/EcalMonitorClient/interface/MLClient.h"

#include "DataFormats/EcalDetId/interface/EcalTrigTowerDetId.h"

#include "CondFormats/EcalObjects/interface/EcalDQMStatusHelper.h"

#include "DQM/EcalCommon/interface/EcalDQMCommonUtils.h"

#include "FWCore/ParameterSet/interface/ParameterSet.h"

#include "PhysicsTools/ONNXRuntime/interface/ONNXRuntime.h"

#include "DQM/EcalCommon/interface/MESetNonObject.h"

using namespace cms::Ort;

namespace ecaldqm {
MLClient::MLClient() : DQWorkerClient() { qualitySummaries_.insert("MLQualitySummary"); }

void MLClient::setParams(edm::ParameterSet const& _params) {
MLThreshold_ = _params.getUntrackedParameter<double>("MLThreshold");
PUcorr_slope_ = _params.getUntrackedParameter<double>("PUcorr_slope");
PUcorr_intercept_ = _params.getUntrackedParameter<double>("PUcorr_intercept");
avgOcc_ = _params.getUntrackedParameter<std::vector<double>>("avgOcc");
if (!onlineMode_) {
MEs_.erase(std::string("MLQualitySummary"));
MEs_.erase(std::string("EventsperMLImage"));
sources_.erase(std::string("PU"));
sources_.erase(std::string("NumEvents"));
sources_.erase(std::string("DigiAllByLumi"));
sources_.erase(std::string("AELoss"));
}
}

void MLClient::producePlots(ProcessType) {
if (!onlineMode_)
return;
using namespace std;
MESet& meMLQualitySummary(MEs_.at("MLQualitySummary"));
MESet& meEventsperMLImage(MEs_.at("EventsperMLImage"));

MESetNonObject const& sPU(static_cast<MESetNonObject&>(sources_.at("PU")));
MESetNonObject const& sNumEvents(static_cast<MESetNonObject&>(sources_.at("NumEvents")));

//Get the no.of events and the PU per LS calculated in OccupancyTask
int nEv = sNumEvents.getFloatValue();
double pu = sPU.getFloatValue();
//Do not compute ML quality if PU is non existent.
if (pu < 0.) {
return;
}
uint32_t mask(1 << EcalDQMStatusHelper::PEDESTAL_ONLINE_HIGH_GAIN_RMS_ERROR |
1 << EcalDQMStatusHelper::PHYSICS_BAD_CHANNEL_WARNING |
1 << EcalDQMStatusHelper::PHYSICS_BAD_CHANNEL_ERROR);

//////////////// ML Data Preprocessing //////////////////////////////////
//Inorder to feed the data into the ML model we apply some preprocessing.
//We use the Digi Occupancy per Lumisection as the input source.
//The model was trained on each occupancy plot having 500 events.
//In apprehension of the low luminosity in the beginning of Run3, where in online DQM
//the no.of events per LS could be lower than 500, we sum the occupancies over a fixed no.of lumisections as a running sum,
//and require that the total no.of events on this summed occupancy to be atleast 200.
//(This no.of LS and the no.of events are parameters which would require tuning later)
//This summed occupancy is now the input image, which is then corrected for PileUp(PU) dependence and
//change in no.of events, which are derived from training.
//The input image is also padded by replicating the top and bottom rows so as to prevent the "edge effect"
//wherein the ML model's learning degrades near the edge of the data set it sees.
//This padding is then removed during inference on the model output.

//Get the histogram of the input digi occupancy per lumisection.
TH2F* hEbDigiMap((sources_.at("DigiAllByLumi")).getME(1)->getTH2F());

size_t nTowers = nEtaTowers * nPhiTowers; //Each occupancy map is of size 34x72 towers
std::vector<float> ebOccMap1dCumulPad; //Vector to feed into the ML network
std::valarray<float> ebOccMap1d(nTowers); //Array to store occupancy map of size 34x72
//Store the values from the input histogram into the array
//to do preprocessing
for (int i = 0; i < hEbDigiMap->GetNbinsY(); i++) { //NbinsY = 34, NbinsX = 72
for (int j = 0; j < hEbDigiMap->GetNbinsX(); j++) {
int bin = hEbDigiMap->GetBin(j + 1, i + 1);
int k = (i * nPhiTowers) + j;
ebOccMap1d[k] = hEbDigiMap->GetBinContent(bin);
}
}
ebOccMap1dQ.push_back(ebOccMap1d); //Queue which stores input occupancy maps for nLS lumis
NEventQ.push_back(nEv); //Queue which stores the no.of events per LS for nLS lumis

if (NEventQ.size() < nLS) {
return; //Should have nLS lumis to add the occupancy over.
}
if (NEventQ.size() > nLS) {
NEventQ.pop_front(); //Keep only nLS consecutive LS. Pop the first one if size greater than nLS
}
if (ebOccMap1dQ.size() > nLS) {
ebOccMap1dQ.pop_front(); //Same conditon for the input occupancy maps.
}

int TNum = 0;
for (size_t i = 0; i < nLS; i++) {
TNum += NEventQ[i]; //Total no.of events over nLS lumis
}
if (TNum < 200) {
return; //The total no.of events should be atleast 200 over nLS for meaningful statistics
}
//Fill the ME to monitor the trend of the total no.of events in each input image to the ML model
meEventsperMLImage.fill(getEcalDQMSetupObjects(), EcalBarrel, double(timestamp_.iLumi), double(TNum));

//Array to hold the sum of inputs, which make atleast 200 events.
std::valarray<float> ebOccMap1dCumul(0., nTowers);

for (size_t i = 0; i < ebOccMap1dQ.size(); i++) {
ebOccMap1dCumul += ebOccMap1dQ[i]; //Sum the input arrays of N LS.
}
//Applying PU correction derived from training
ebOccMap1dCumul = ebOccMap1dCumul / (PUcorr_slope_ * pu + PUcorr_intercept_);

//Scaling up to match input dimensions. 36*72 used instead of 34*72 to accommodate the additional padding
//of 2 rows to prevent the "edge effect" which is done below
ebOccMap1dCumul = ebOccMap1dCumul * (nEtaTowersPad * nPhiTowers);

//Correction for no.of events in each input image as originally model trained with 500 events per image
ebOccMap1dCumul = ebOccMap1dCumul * (500. / TNum);

//The pre-processed input is now fed into the input tensor vector which will go into the ML model
ebOccMap1dCumulPad.assign(std::begin(ebOccMap1dCumul), std::end(ebOccMap1dCumul));

//Replicate and pad with the first and last row to prevent the edge effect
for (int k = 0; k < nPhiTowers; k++) {
float val = ebOccMap1dCumulPad[nPhiTowers - 1];
ebOccMap1dCumulPad.insert(ebOccMap1dCumulPad.begin(),
val); //padding in the beginning with the first row elements
}

int size = ebOccMap1dCumulPad.size();
for (int k = (size - nPhiTowers); k < size; k++) {
float val = ebOccMap1dCumulPad[k];
ebOccMap1dCumulPad.push_back(val); //padding at the end with the last row elements
}

///// Model Inference //////
//An Autoencoder (AE) network with resnet architecture is used here which is trained on
//certified good data (EB digi occupancy) from Run 2018 data.
//On giving an input occupancy map, the encoder part of the AE compresses and reduces the input data, learning its features,
//and the decoder reconstructs the data from the encoded form into a representation as close to the original input as possible.
//We then compute the Mean squared error (MSE) between the input and output image, also called the Reconstruction Loss,
//calculated at a tower by tower basis.
//Thus, given an anomalous tower the loss should be significantly higher than the loss with respect to good towers, which the model
//has already seen --> anomaly detection.
//When calculating the loss we also apply a response correction by dividing each input and output image with the average occupancy from
//all 2018 data (also to be tuned),to accommodate the difference in response of crystals in different regions of the Ecal Barrel
//Further each loss map from each input image is then multiplied by the last N loss maps,
///so that real anomalies which persist with time are enhanced and fluctuations are suppressed.
//A quality threshold is then applied on this time multiplied loss map, to mark them as GOOD or BAD,
//after which it is stored as a quality summary ME.

///ONNX model running///
std::string instanceName{"AE-DQM-inference"};
std::string modelFilepath = edm::FileInPath("DQM/EcalMonitorClient/data/onnxModels/resnet.onnx").fullPath();

Ort::SessionOptions sessionOptions;
sessionOptions.SetIntraOpNumThreads(1);
Ort::Env env(OrtLoggingLevel::ORT_LOGGING_LEVEL_WARNING, instanceName.c_str());
Ort::Session session(env, modelFilepath.c_str(), sessionOptions);

Ort::AllocatorWithDefaultOptions allocator;

const char* inputName = session.GetInputName(0, allocator);

Ort::TypeInfo inputTypeInfo = session.GetInputTypeInfo(0);
auto inputTensorInfo = inputTypeInfo.GetTensorTypeAndShapeInfo();

std::vector<int64_t> inputDims = inputTensorInfo.GetShape();

const char* outputName = session.GetOutputName(0, allocator);

Ort::TypeInfo outputTypeInfo = session.GetOutputTypeInfo(0);
auto outputTensorInfo = outputTypeInfo.GetTensorTypeAndShapeInfo();

std::vector<int64_t> outputDims = outputTensorInfo.GetShape();

size_t TensorSize = nEtaTowersPad * nPhiTowers;
std::vector<float> ebRecoOccMap1dPad(TensorSize); //To store the output reconstructed occupancy

std::vector<const char*> inputNames{inputName};
std::vector<const char*> outputNames{outputName};
std::vector<Ort::Value> inputTensors;
std::vector<Ort::Value> outputTensors;

Ort::MemoryInfo memoryInfo =
Ort::MemoryInfo::CreateCpu(OrtAllocatorType::OrtArenaAllocator, OrtMemType::OrtMemTypeDefault);
inputTensors.push_back(Ort::Value::CreateTensor<float>(
memoryInfo, ebOccMap1dCumulPad.data(), TensorSize, inputDims.data(), inputDims.size()));

outputTensors.push_back(Ort::Value::CreateTensor<float>(
memoryInfo, ebRecoOccMap1dPad.data(), TensorSize, outputDims.data(), outputDims.size()));

session.Run(Ort::RunOptions{nullptr},
inputNames.data(),
inputTensors.data(),
1,
outputNames.data(),
outputTensors.data(),
1);

///Inference on the output from the model///
//2D Loss map to store tower by tower loss between the output (reconstructed) and input occupancies,
//Have same dimensions as the occupancy plot
std::valarray<std::valarray<float>> lossMap2d(std::valarray<float>(nPhiTowers), nEtaTowers);

//1D val arrays to store row wise information corresponding to the reconstructed, input and average occupancies, and loss.
//and to do element wise (tower wise) operations on them to calculate the MSE loss between the reco and input occupancy.
std::valarray<float> recoOcc1d(0., nPhiTowers);
std::valarray<float> inputOcc1d(0., nPhiTowers);
std::valarray<float> avgOcc1d(0., nPhiTowers);
std::valarray<float> loss_;

//Loss calculation
//Ignore the top and bottom replicated padded rows when doing inference
//by making index i run over (1,35) instead of (0,36)
for (int i = 1; i < 35; i++) {
for (int j = 0; j < nPhiTowers; j++) {
int k = (i * nPhiTowers) + j;
recoOcc1d[j] = ebRecoOccMap1dPad[k];
inputOcc1d[j] = ebOccMap1dCumulPad[k];
avgOcc1d[j] = avgOcc_[k];
}
//Calculate the MSE loss = (output-input)^2, with avg response correction
loss_ = std::pow((recoOcc1d / avgOcc1d - inputOcc1d / avgOcc1d), 2);
lossMap2d[i - 1] = (loss_);
}

lossMap2dQ.push_back(lossMap2d); //Store each loss map from the output in the queue
if (lossMap2dQ.size() > nLSloss) {
lossMap2dQ.pop_front(); //Keep exactly nLSloss loss maps to multiply
}
if (lossMap2dQ.size() < nLSloss) { //Exit if there are not nLSloss loss maps
return;
}
//To hold the final multiplied loss
std::valarray<std::valarray<float>> lossMap2dMult(std::valarray<float>(1., nPhiTowers), nEtaTowers);

//Multiply together the last nLSloss loss maps
//So that real anomalies which persist with time are enhanced and fluctuations are suppressed.
for (size_t i = 0; i < lossMap2dQ.size(); i++) {
lossMap2dMult *= lossMap2dQ[i];
}

//Fill the AELoss ME with the values of this time multiplied loss map
MESet const& sAELoss(sources_.at("AELoss"));
TH2F* hLossMap2dMult(sAELoss.getME(1)->getTH2F());
for (int i = 0; i < hLossMap2dMult->GetNbinsY(); i++) {
for (int j = 0; j < hLossMap2dMult->GetNbinsX(); j++) {
int bin_ = hLossMap2dMult->GetBin(j + 1, i + 1);
double content = lossMap2dMult[i][j];
hLossMap2dMult->SetBinContent(bin_, content);
}
}
///////////////////// ML Quality Summary /////////////////////
//Apply the quality threshold on the time multiplied loss map stored in the ME AELoss
//If anomalous, the tower entry will have a large loss value. If good, the value will be close to zero.

MESet::const_iterator dAEnd(sAELoss.end(GetElectronicsMap()));
for (MESet::const_iterator dItr(sAELoss.beginChannel(GetElectronicsMap())); dItr != dAEnd;
dItr.toNextChannel(GetElectronicsMap())) {
DetId id(dItr->getId());

bool doMaskML(meMLQualitySummary.maskMatches(id, mask, statusManager_, GetTrigTowerMap()));

float entries(dItr->getBinContent());
int quality(doMaskML ? kMGood : kGood);
//If a trigger tower entry is greater than the ML threshold, set it to Bad quality, otherwise Good.
if (entries > MLThreshold_) {
quality = doMaskML ? kMBad : kBad;
}
//Fill the quality summary with the quality of the given tower id.
meMLQualitySummary.setBinContent(getEcalDQMSetupObjects(), id, double(quality));
} // ML Quality Summary
} // producePlots()

DEFINE_ECALDQM_WORKER(MLClient);
} // namespace ecaldqm
8 changes: 8 additions & 0 deletions DQM/EcalMonitorTasks/interface/OccupancyTask.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include "CalibCalorimetry/EcalLaserCorrection/interface/EcalLaserDbService.h"
#include "DataFormats/Provenance/interface/Timestamp.h"
#include "FWCore/Framework/interface/ConsumesCollector.h"
#include "DataFormats/Scalers/interface/LumiScalers.h"

namespace ecaldqm {
class OccupancyTask : public DQWorkerTask {
Expand All @@ -30,6 +31,7 @@ namespace ecaldqm {
void runOnTPDigis(EcalTrigPrimDigiCollection const&);
void runOnRecHits(EcalRecHitCollection const&, Collections);
void setTokens(edm::ConsumesCollector&) override;
void endLuminosityBlock(edm::LuminosityBlock const&, edm::EventSetup const&) override;

private:
void setParams(edm::ParameterSet const&) override;
Expand All @@ -38,6 +40,12 @@ namespace ecaldqm {
float recHitThreshold_;
float tpThreshold_;
edm::TimeValue_t m_iTime;
edm::InputTag lumiTag;
edm::EDGetTokenT<LumiScalersCollection> lumiScalersToken_;
double scal_pu;
bool FindPUinLS = false;
int nEv;
bool lumiCheck_;
};

inline bool OccupancyTask::analyze(void const* _p, Collections _collection) {
Expand Down
Loading

0 comments on commit 1d81c36

Please sign in to comment.