From 9eef8d2bf1f04f81d7243f8960d7976edde65e5c Mon Sep 17 00:00:00 2001 From: Nicola Amapane Date: Thu, 4 May 2023 13:48:56 +0200 Subject: [PATCH] Import from cms-nanoAOD/nanoAOD-tools (backport of #42323) --- DataFormats/Math/src/classes.h | 2 + DataFormats/Math/src/classes_def.xml | 1 + PhysicsTools/NanoAOD/scripts/haddnano.py | 5 +- PhysicsTools/NanoAODTools/.gitignore | 10 + PhysicsTools/NanoAODTools/README.md | 123 ++++++++ PhysicsTools/NanoAODTools/crab/PSet.py | 17 ++ PhysicsTools/NanoAODTools/crab/crab_cfg.py | 33 +++ PhysicsTools/NanoAODTools/crab/crab_script.py | 18 ++ PhysicsTools/NanoAODTools/crab/crab_script.sh | 27 ++ .../postprocessing/examples/exampleModule.py | 54 ++++ .../examples/mhtjuProducerCpp.py | 78 +++++ .../framework/branchselection.py | 62 ++++ .../framework/collectionMerger.py | 136 +++++++++ .../postprocessing/framework/datamodel.py | 150 ++++++++++ .../postprocessing/framework/eventloop.py | 101 +++++++ .../postprocessing/framework/jobreport.py | 74 +++++ .../python/postprocessing/framework/output.py | 190 ++++++++++++ .../postprocessing/framework/postprocessor.py | 274 ++++++++++++++++++ .../postprocessing/framework/preskimming.py | 85 ++++++ .../framework/treeReaderArrayTools.py | 151 ++++++++++ .../python/postprocessing/utils/crabhelper.py | 66 +++++ .../NanoAODTools/scripts/nano_postproc.py | 94 ++++++ .../standalone/checkoutStandalone.sh | 61 ++++ .../NanoAODTools/standalone/env_standalone.sh | 37 +++ PhysicsTools/NanoAODTools/test/BuildFile.xml | 6 + .../NanoAODTools/test/exampleAnalysis.py | 56 ++++ .../NanoAODTools/test/exampleGenDump.py | 133 +++++++++ .../NanoAODTools/test/example_postproc.py | 24 ++ .../test/examples/MhtjuProducerCppWorker.cc | 14 + .../test/examples/MhtjuProducerCppWorker.h | 29 ++ PhysicsTools/NanoAODTools/test/runTest.sh | 8 + 31 files changed, 2118 insertions(+), 1 deletion(-) mode change 100644 => 100755 PhysicsTools/NanoAOD/scripts/haddnano.py create mode 100644 PhysicsTools/NanoAODTools/.gitignore create mode 100644 PhysicsTools/NanoAODTools/README.md create mode 100644 PhysicsTools/NanoAODTools/crab/PSet.py create mode 100644 PhysicsTools/NanoAODTools/crab/crab_cfg.py create mode 100755 PhysicsTools/NanoAODTools/crab/crab_script.py create mode 100755 PhysicsTools/NanoAODTools/crab/crab_script.sh create mode 100644 PhysicsTools/NanoAODTools/python/postprocessing/examples/exampleModule.py create mode 100644 PhysicsTools/NanoAODTools/python/postprocessing/examples/mhtjuProducerCpp.py create mode 100644 PhysicsTools/NanoAODTools/python/postprocessing/framework/branchselection.py create mode 100644 PhysicsTools/NanoAODTools/python/postprocessing/framework/collectionMerger.py create mode 100644 PhysicsTools/NanoAODTools/python/postprocessing/framework/datamodel.py create mode 100644 PhysicsTools/NanoAODTools/python/postprocessing/framework/eventloop.py create mode 100644 PhysicsTools/NanoAODTools/python/postprocessing/framework/jobreport.py create mode 100644 PhysicsTools/NanoAODTools/python/postprocessing/framework/output.py create mode 100755 PhysicsTools/NanoAODTools/python/postprocessing/framework/postprocessor.py create mode 100644 PhysicsTools/NanoAODTools/python/postprocessing/framework/preskimming.py create mode 100644 PhysicsTools/NanoAODTools/python/postprocessing/framework/treeReaderArrayTools.py create mode 100644 PhysicsTools/NanoAODTools/python/postprocessing/utils/crabhelper.py create mode 100755 PhysicsTools/NanoAODTools/scripts/nano_postproc.py create mode 100644 PhysicsTools/NanoAODTools/standalone/checkoutStandalone.sh create mode 100644 PhysicsTools/NanoAODTools/standalone/env_standalone.sh create mode 100644 PhysicsTools/NanoAODTools/test/BuildFile.xml create mode 100755 PhysicsTools/NanoAODTools/test/exampleAnalysis.py create mode 100755 PhysicsTools/NanoAODTools/test/exampleGenDump.py create mode 100755 PhysicsTools/NanoAODTools/test/example_postproc.py create mode 100644 PhysicsTools/NanoAODTools/test/examples/MhtjuProducerCppWorker.cc create mode 100644 PhysicsTools/NanoAODTools/test/examples/MhtjuProducerCppWorker.h create mode 100755 PhysicsTools/NanoAODTools/test/runTest.sh diff --git a/DataFormats/Math/src/classes.h b/DataFormats/Math/src/classes.h index a14e612ed7bfa..35980108b56d6 100644 --- a/DataFormats/Math/src/classes.h +++ b/DataFormats/Math/src/classes.h @@ -15,6 +15,7 @@ #include "DataFormats/Math/interface/Vector.h" #include "DataFormats/Math/interface/Error.h" #include "DataFormats/Math/interface/Matrix.h" +#include "DataFormats/Math/interface/libminifloat.h" #include "DataFormats/Common/interface/Wrapper.h" #include "DataFormats/Common/interface/RefVector.h" #include "DataFormats/Common/interface/ValueMap.h" @@ -227,4 +228,5 @@ namespace DataFormats_Math { edm::ValueMap vmp4; edm::Wrapper > wvmp4; }; + MiniFloatConverter::ReduceMantissaToNbitsRounding red(12); } // namespace DataFormats_Math diff --git a/DataFormats/Math/src/classes_def.xml b/DataFormats/Math/src/classes_def.xml index 12aeda101983e..6b3d3bc903796 100755 --- a/DataFormats/Math/src/classes_def.xml +++ b/DataFormats/Math/src/classes_def.xml @@ -12,6 +12,7 @@ + diff --git a/PhysicsTools/NanoAOD/scripts/haddnano.py b/PhysicsTools/NanoAOD/scripts/haddnano.py old mode 100644 new mode 100755 index 335d41eca81c9..1b7ce9831d4e6 --- a/PhysicsTools/NanoAOD/scripts/haddnano.py +++ b/PhysicsTools/NanoAOD/scripts/haddnano.py @@ -1,4 +1,4 @@ -#!/bin/env python +#!/bin/env python3 import ROOT import numpy import sys @@ -101,5 +101,8 @@ def zeroFill(tree, brName, brObj, allowNonBool=False): if st.GetString() != obj.GetString(): print("Strings are not matching") obj.Write() + elif obj.IsA().InheritsFrom(ROOT.THnSparse.Class()): + obj.Merge(inputs) + obj.Write() else: print("Cannot handle " + str(obj.IsA().GetName())) diff --git a/PhysicsTools/NanoAODTools/.gitignore b/PhysicsTools/NanoAODTools/.gitignore new file mode 100644 index 0000000000000..b90ed537ad786 --- /dev/null +++ b/PhysicsTools/NanoAODTools/.gitignore @@ -0,0 +1,10 @@ +__init__.py +*.pyc +.*.swp +.#* +#*# +*~ +build +*.d +*.so +*.pcm diff --git a/PhysicsTools/NanoAODTools/README.md b/PhysicsTools/NanoAODTools/README.md new file mode 100644 index 0000000000000..78dba59a65b5f --- /dev/null +++ b/PhysicsTools/NanoAODTools/README.md @@ -0,0 +1,123 @@ +# NanoAODTools +A simple set of python tools to post-process NanoAODs to: +* skim events +* add variables +* produce plots +* perform simple analyses (but beware that performance may be unsatisfactory beacuse of the inherently sequential design model). + +It can be used directly from a CMSSW environment, or checked out as a standalone package. + +Originally imported to CMSSW from [cms-nanoAOD/nanoAOD-tools](https://github.com/cms-nanoAOD/nanoAOD-tools) (post-processor functionality only). + +## Usage in CMSSW +No specific setup is needed. + +It is recommended to add external modules (eg correction modules) in separate packages. + +## Standalone usage (without CMSSW): checkout instructions + +You need to setup python 3 and a recent ROOT version first. + + wget https://raw.githubusercontent.com/cms-sw/cmssw/master/PhysicsTools/NanoAODTools/standalone/checkoutStandalone.sh + bash checkoutStandalone.sh -d MyProject + cd MyProject + source PhysicsTools/NanoAODTools/standalone/env_standalone.sh + +Repeat only the last command at the beginning of every session. + +It is recommended to add analysis code, correction modules etc. in a separate package and repository rather than in a CMSSW fork. + +Please note that some limitations apply: +* Bindings to C++ code are up to the user. +* Adding packed variables is not supported, as this requires binding to the corresponding code. +Using from a CMSSW environment is thus recommended. + +## General instructions to run the post-processing step +The post-processor can be run in two different ways: +* In a normal python script. +* From the command line, using the script under `scripts/nano_postproc.py` (more details [below](#command-line-invocation)). + +## How to write and run modules + +It is possible to define modules that will be run on each entry passing the event selection, and can be used to calculate new variables that will be included in the output tree (both in friend and full mode) or to apply event filter decisions. + +A first, very simple example is available in `test/exampleAnalysis.py`. It can be executed directly, and implements a module to fill a plot. + +An example of an module coded to be imported in scripts or called with the command-line interface is available in `python/postprocessing/examples/exampleModule.py`. This module adds one new variable, which can be stored in skimmed NanoAOD and also used in the subsequent Modules in the same job. The example `test/example_postproc.py` shows how to import and use it in a script while skimming events. + +Let us now examine the structure of a module class. +* All modules must inherit from `PhysicsTools.NanoAODTools.postprocessing.framework.eventloop.Module`. +* the `__init__` constructor function should be used to set the module options. +* the `beginFile` function should create the branches that you want to add to the output file, calling the `branch(branchname, typecode, lenVar)` method of `wrappedOutputTree`. `typecode` should be the ROOT TBranch type ("F" for float, "I" for int etc.). `lenVar` should be the name of the variable holding the length of array branches (for instance, `branch("Electron_myNewVar","F","nElectron")`). If the `lenVar` branch does not exist already - it can happen if you create a new collection - it will be automatically created. +* the `analyze` function is called on each event. It should return `True` if the event is to be retained, `False` if it should be dropped. + +The event interface, defined in `PhysicsTools.NanoAODTools.postprocessing.framework.datamodule`, allows to dynamically construct views of objects organized in collections, based on the branch names, for instance: + + electrons = Collection(event, "Electron") + if len(electrons)>1: print electrons[0].someVar+electrons[1].someVar + electrons_highpt = filter(lambda x: x.pt>50, electrons) + +and this will access the elements of the `Electron_someVar`, `Electron_pt` branch arrays. Event variables can be accessed simply by `event.someVar`, for instance `event.rho`. + +The output branches should be filled calling the `fillBranch(branchname, value)` method of `wrappedOutputTree`. `value` should be the desired value for single-value branches, an iterable with the correct length for array branches. It is not necessary to fill the `lenVar` branch explicitly, as this is done automatically using the length of the passed iterable. + + + +### Command-line invocation +The basic syntax of the command line invocation is the following: + + nano_postproc.py /path/to/output_directory /path/to/input_tree.root + +(in standalone mode, should be invoked as `./scripts/nano_postproc.py`). + +Here is a summary of its features: +* the `-s`,`--postfix` option is used to specify the suffix that will be appended to the input file name to obtain the output file name. It defaults to *_Friend* in friend mode, *_Skim* in full mode. +* the `-c`,`--cut` option is used to pass a string expression (using the same syntax as in TTree::Draw) that will be used to select events. It cannot be used in friend mode. +* the `-J`,`--json` option is used to pass the name of a JSON file that will be used to select events. It cannot be used in friend mode. +* if run with the `--full` option (default), the output will be a full nanoAOD file. If run with the `--friend` option, instead, the output will be a friend tree that can be attached to the input tree. In the latter case, it is not possible to apply any kind of event selection, as the number of entries in the parent and friend tree must be the same. +* the `-b`,`--branch-selection` option is used to pass the name of a file containing directives to keep or drop branches from the output tree. The file should contain one directive among `keep`/`drop` (wildcards allowed as in TTree::SetBranchStatus) or `keepmatch`/`dropmatch` (python regexp matching the branch name) per line. More details are provided in the section [Keep/drop branches](#keepdrop-branches) below. + * `--bi` and `--bo` allows to specify the keep/drop file separately for input and output trees. +* the `--justcount` option will cause the script to printout the number of selected events, without actually writing the output file. + +Please run with `--help` for a complete list of options. + +Let's take the already mentioned [exampleModule.py](python/postprocessing/examples/exampleModule.py). It contains a simple constructor: +``` + exampleModuleConstr = lambda : exampleProducer(jetSelection= lambda j : j.pt > 30) +``` +whih can be imported using the following syntax: + +``` +nano_postproc.py outDir /eos/cms/store/user/andrey/f.root -I PhysicsTools.NanoAODTools.postprocessing.examples.exampleModule exampleModuleConstr +``` + +### Keep/drop branches +See the effect of keep/drop instructions by creating a `keep_and_drop.txt` file: + +``` +drop * +keep Muon* +keep Electron* +keep Jet* +``` +and specify it with thne --bi option: +``` +python scripts/nano_postproc.py outDir /eos/cms/store/user/andrey/f.root -I PhysicsTools.NanoAODTools.postprocessing.examples.exampleModule exampleModuleConstr -s _exaModu_keepdrop --bi keep_and_drop_input.txt +``` +comparing to the previous command (without `--bi`), the output branch created by _exampleModuleConstr_ is the same, but with --bi all other branche are dropped when creating output tree. It also runs faster. +Option --bo can be used to further fliter output branches. + +The keep and drop directive also accept python lists __when called from a python script__, e.g: +``` +outputbranchsel=["drop *", "keep EventMass"] +``` + +### Calling C++ helpers +Now, let's have a look at another example, `python/postprocessing/examples/mhtjuProducerCpp.py` ([link](python/postprocessing/examples/mhtjuProducerCpp.py)). Similarly, it should be imported using the following syntax: + +``` +nano_postproc.py outDir /eos/cms/store/user/andrey/f.root -I PhysicsTools.NanoAODTools.postprocessing.examples.mhtjuProducerCpp mhtju +``` +This module has the same structure of its producer as `exampleProducer`, but in addition it utilizes a C++ code to calculate the mht variable, `test/examples/mhtjuProducerCppWorker.cc`. This code is loaded in the `__init__` method of the producer. + + diff --git a/PhysicsTools/NanoAODTools/crab/PSet.py b/PhysicsTools/NanoAODTools/crab/PSet.py new file mode 100644 index 0000000000000..ed999043914e0 --- /dev/null +++ b/PhysicsTools/NanoAODTools/crab/PSet.py @@ -0,0 +1,17 @@ +# this fake PSET is needed for local test and for crab to figure the output +# filename you do not need to edit it unless you want to do a local test using +# a different input file than the one marked below +import FWCore.ParameterSet.Config as cms +process = cms.Process('NANO') +process.source = cms.Source( + "PoolSource", + fileNames=cms.untracked.vstring(), + # lumisToProcess=cms.untracked.VLuminosityBlockRange("254231:1-254231:24") +) +process.source.fileNames = [ + '../../NanoAOD/test/lzma.root' # you can change only this line +] +process.maxEvents = cms.untracked.PSet(input=cms.untracked.int32(10)) +process.output = cms.OutputModule("PoolOutputModule", + fileName=cms.untracked.string('tree.root')) +process.out = cms.EndPath(process.output) diff --git a/PhysicsTools/NanoAODTools/crab/crab_cfg.py b/PhysicsTools/NanoAODTools/crab/crab_cfg.py new file mode 100644 index 0000000000000..eaa755c86eb80 --- /dev/null +++ b/PhysicsTools/NanoAODTools/crab/crab_cfg.py @@ -0,0 +1,33 @@ +from WMCore.Configuration import Configuration +from CRABClient.UserUtilities import config, getUsernameFromSiteDB + +config = Configuration() + +config.section_("General") +config.General.requestName = 'NanoPost1' +config.General.transferLogs = True +config.section_("JobType") +config.JobType.pluginName = 'Analysis' +config.JobType.psetName = 'PSet.py' +config.JobType.scriptExe = 'crab_script.sh' +config.JobType.inputFiles = ['crab_script.py'] +config.JobType.sendPythonFolder = True +config.section_("Data") +config.Data.inputDataset = '/DYJetsToLL_1J_TuneCP5_13TeV-amcatnloFXFX-pythia8/RunIIFall17NanoAOD-PU2017_12Apr2018_94X_mc2017_realistic_v14-v1/NANOAODSIM' +#config.Data.inputDBS = 'phys03' +config.Data.inputDBS = 'global' +config.Data.splitting = 'FileBased' +#config.Data.splitting = 'EventAwareLumiBased' +config.Data.unitsPerJob = 2 +config.Data.totalUnits = 10 + +config.Data.outLFNDirBase = '/store/user/%s/NanoPost' % ( + getUsernameFromSiteDB()) +config.Data.publication = False +config.Data.outputDatasetTag = 'NanoTestPost' +config.section_("Site") +config.Site.storageSite = "T2_DE_DESY" + +#config.Site.storageSite = "T2_CH_CERN" +# config.section_("User") +#config.User.voGroup = 'dcms' diff --git a/PhysicsTools/NanoAODTools/crab/crab_script.py b/PhysicsTools/NanoAODTools/crab/crab_script.py new file mode 100755 index 0000000000000..98c47fc31cb5d --- /dev/null +++ b/PhysicsTools/NanoAODTools/crab/crab_script.py @@ -0,0 +1,18 @@ +#!/usr/bin/env python3 +import os +from PhysicsTools.NanoAODTools.postprocessing.framework.postprocessor import * + +# this takes care of converting the input files from CRAB +from PhysicsTools.NanoAODTools.postprocessing.utils.crabhelper import inputFiles, runsAndLumis + +from PhysicsTools.NanoAODTools.postprocessing.examples.exampleModule import * +p = PostProcessor(".", + inputFiles(), + "Jet_pt>200", + modules=[exampleModuleConstr()], + provenance=True, + fwkJobReport=True, + jsonInput=runsAndLumis()) +p.run() + +print("DONE") diff --git a/PhysicsTools/NanoAODTools/crab/crab_script.sh b/PhysicsTools/NanoAODTools/crab/crab_script.sh new file mode 100755 index 0000000000000..a9e8c5caf59de --- /dev/null +++ b/PhysicsTools/NanoAODTools/crab/crab_script.sh @@ -0,0 +1,27 @@ +#this is not meant to be run locally +# +echo Check if TTY +if [ "`tty`" != "not a tty" ]; then + echo "YOU SHOULD NOT RUN THIS IN INTERACTIVE, IT DELETES YOUR LOCAL FILES" +else + +echo "ENV..................................." +env +echo "VOMS" +voms-proxy-info -all +echo "CMSSW BASE, python path, pwd" +echo $CMSSW_BASE +echo $PYTHON_PATH +echo $PWD +rm -rf $CMSSW_BASE/lib/ +rm -rf $CMSSW_BASE/src/ +rm -rf $CMSSW_BASE/module/ +rm -rf $CMSSW_BASE/python/ +mv lib $CMSSW_BASE/lib +mv src $CMSSW_BASE/src +mv module $CMSSW_BASE/module +mv python $CMSSW_BASE/python + +echo Found Proxy in: $X509_USER_PROXY +python crab_script.py $1 +fi diff --git a/PhysicsTools/NanoAODTools/python/postprocessing/examples/exampleModule.py b/PhysicsTools/NanoAODTools/python/postprocessing/examples/exampleModule.py new file mode 100644 index 0000000000000..cb5bb16f54a74 --- /dev/null +++ b/PhysicsTools/NanoAODTools/python/postprocessing/examples/exampleModule.py @@ -0,0 +1,54 @@ +# This is an example of a NanoAODTools Module to add one variable to nanoAODs. +# Note that: +# -the new variable will be available for use in the subsequent modules +# -it is possible to update the value for existing variables +# +# Example of using from command line: +# nano_postproc.py outDir /eos/cms/store/user/andrey/f.root -I PhysicsTools.NanoAODTools.postprocessing.examples.exampleModule exampleModuleConstr +# +# Example of running in a python script: see test/example_postproc.py +# + +from PhysicsTools.NanoAODTools.postprocessing.framework.datamodel import Collection +from PhysicsTools.NanoAODTools.postprocessing.framework.eventloop import Module +import ROOT +ROOT.PyConfig.IgnoreCommandLineOptions = True + + +class exampleProducer(Module): + def __init__(self, jetSelection): + self.jetSel = jetSelection + pass + + def beginJob(self): + pass + + def endJob(self): + pass + + def beginFile(self, inputFile, outputFile, inputTree, wrappedOutputTree): + self.out = wrappedOutputTree + self.out.branch("EventMass", "F") + + def endFile(self, inputFile, outputFile, inputTree, wrappedOutputTree): + pass + + def analyze(self, event): + """process event, return True (go to next module) or False (fail, go to next event)""" + electrons = Collection(event, "Electron") + muons = Collection(event, "Muon") + jets = Collection(event, "Jet") + eventSum = ROOT.TLorentzVector() + for lep in muons: + eventSum += lep.p4() + for lep in electrons: + eventSum += lep.p4() + for j in filter(self.jetSel, jets): + eventSum += j.p4() + self.out.fillBranch("EventMass", eventSum.M()) + return True + + +# define modules using the syntax 'name = lambda : constructor' to avoid having them loaded when not needed + +exampleModuleConstr = lambda: exampleProducer(jetSelection=lambda j: j.pt > 30) diff --git a/PhysicsTools/NanoAODTools/python/postprocessing/examples/mhtjuProducerCpp.py b/PhysicsTools/NanoAODTools/python/postprocessing/examples/mhtjuProducerCpp.py new file mode 100644 index 0000000000000..52e47e57b7735 --- /dev/null +++ b/PhysicsTools/NanoAODTools/python/postprocessing/examples/mhtjuProducerCpp.py @@ -0,0 +1,78 @@ +# Example of calling a C++ helper from a Module. +# +# Run with: +# nano_postproc.py outDir /eos/cms/store/user/andrey/f.root -I PhysicsTools.NanoAODTools.postprocessing.examples.mhtjuProducerCpp mhtju + +from PhysicsTools.NanoAODTools.postprocessing.framework.eventloop import Module +import ROOT +import os +ROOT.PyConfig.IgnoreCommandLineOptions = True + + +# MHT producer, unclean jets only (no lepton overlap cleaning, no jet selection) +class mhtjuProducerCpp(Module): + def __init__(self): + base = os.getenv("NANOAODTOOLS_BASE") + if base: + # Running in standalone mode: compile the C++ helper + if "/MhtjuProducerCppWorker_cc.so" not in ROOT.gSystem.GetLibraries(): + print("Load C++ MhtjuProducerCppWorker worker module") + ROOT.gROOT.ProcessLine( + ".L %s/test/examples/MhtjuProducerCppWorker.cc+O" % base) + else: + # Load the helper from the CMSSW compiled. This is not required if + # dictionaries for the helper are generated with classes_def.xml and + # classes.h + base = "%s/src/PhysicsTools/NanoAODTools" % os.getenv("CMSSW_BASE") + ROOT.gSystem.Load("libPhysicsToolsNanoAODToolsTest.so") + ROOT.gROOT.ProcessLine(".L %s/test/examples/MhtjuProducerCppWorker.h" % base) + self.worker = ROOT.MhtjuProducerCppWorker() + pass + + def beginJob(self): + pass + + def endJob(self): + pass + + def beginFile(self, inputFile, outputFile, inputTree, wrappedOutputTree): + self.initReaders(inputTree) # initReaders must be called in beginFile + self.out = wrappedOutputTree + self.out.branch("MHTju_pt", "F") + self.out.branch("MHTju_phi", "F") + + def endFile(self, inputFile, outputFile, inputTree, wrappedOutputTree): + pass + + # this function gets the pointers to Value and ArrayReaders and sets + # them in the C++ worker class + def initReaders(self, tree): + self.nJet = tree.valueReader("nJet") + self.Jet_pt = tree.arrayReader("Jet_pt") + self.Jet_phi = tree.arrayReader("Jet_phi") + self.worker.setJets(self.nJet, self.Jet_pt, self.Jet_phi) + # self._ttreereaderversion must be set AFTER all calls to + # tree.valueReader or tree.arrayReader + self._ttreereaderversion = tree._ttreereaderversion + + def analyze(self, event): + """process event, return True (go to next module) or False (fail, + go to next event)""" + + # do this check at every event, as other modules might have read + # further branches + if event._tree._ttreereaderversion > self._ttreereaderversion: + self.initReaders(event._tree) + # do NOT access other branches in python between the check/call to + # initReaders and the call to C++ worker code + output = self.worker.getHT() + + self.out.fillBranch("MHTju_pt", output[0]) + self.out.fillBranch("MHTju_phi", -output[1]) # note the minus + return True + + +# define modules using the syntax 'name = lambda : constructor' to avoid +# having them loaded when not needed + +mhtju = lambda: mhtjuProducerCpp() diff --git a/PhysicsTools/NanoAODTools/python/postprocessing/framework/branchselection.py b/PhysicsTools/NanoAODTools/python/postprocessing/framework/branchselection.py new file mode 100644 index 0000000000000..d27d4d314fb81 --- /dev/null +++ b/PhysicsTools/NanoAODTools/python/postprocessing/framework/branchselection.py @@ -0,0 +1,62 @@ +import re +try: + Pattern = re._pattern_type +except AttributeError: + # Python 3.7 + Pattern = re.Pattern + + +class BranchSelection(): + def __init__(self, branchsel): + comment = re.compile(r"#.*") + ops = [] + + if isinstance(branchsel, list): + # branchsel is a list of commands + lines = branchsel + elif isinstance(branchsel, str): + # branchsel is a filename + lines=[] + for line in open(branchsel, 'r'): + line = line.strip() + if len(line) == 0 or line[0] == '#': + continue + line = re.sub(comment, "", line) + while line[-1] == "\\": + line = line[:-1] + " " + file.next().strip() + line = re.sub(comment, "", line) + lines.append(line) + + for line in lines: + try: + (op, sel) = line.split() + if op == "keep": + ops.append((sel, 1)) + elif op == "drop": + ops.append((sel, 0)) + elif op == "keepmatch": + ops.append((re.compile("(:?%s)$" % sel), 1)) + elif op == "dropmatch": + ops.append((re.compile("(:?%s)$" % sel), 0)) + else: + print("Error in branchsel: line '%s': "% (line) + + "it's not (keep|keepmatch|drop|dropmatch) " + + "" + ) + except ValueError as e: + print("Error in branchsel: line '%s': " % (line) + + "it's not (keep|keepmatch|drop|dropmatch) " + + "" + ) + self._ops = ops + + def selectBranches(self, tree): + tree.SetBranchStatus("*", 1) + branchNames = [b.GetName() for b in tree.GetListOfBranches()] + for bre, stat in self._ops: + if type(bre) == Pattern: + for n in branchNames: + if re.match(bre, n): + tree.SetBranchStatus(n, stat) + else: + tree.SetBranchStatus(bre, stat) diff --git a/PhysicsTools/NanoAODTools/python/postprocessing/framework/collectionMerger.py b/PhysicsTools/NanoAODTools/python/postprocessing/framework/collectionMerger.py new file mode 100644 index 0000000000000..c5c824b421d85 --- /dev/null +++ b/PhysicsTools/NanoAODTools/python/postprocessing/framework/collectionMerger.py @@ -0,0 +1,136 @@ +from PhysicsTools.NanoAODTools.postprocessing.framework.eventloop import Module +from PhysicsTools.NanoAODTools.postprocessing.framework.datamodel import Collection +import ROOT +import numpy as np +import itertools +ROOT.PyConfig.IgnoreCommandLineOptions = True + +_rootLeafType2rootBranchType = { + 'UChar_t': 'b', + 'Char_t': 'B', + 'UInt_t': 'i', + 'Int_t': 'I', + 'Float_t': 'F', + 'Double_t': 'D', + 'ULong64_t': 'l', + 'Long64_t': 'L', + 'Bool_t': 'O' +} + + +class collectionMerger(Module): + def __init__(self, + input, + output, + sortkey=lambda x: x.pt, + reverse=True, + selector=None, + maxObjects=None): + self.input = input + self.output = output + self.nInputs = len(self.input) + self.sortkey = lambda obj_j_i1: sortkey(obj_j_i1[0]) + self.reverse = reverse + # pass dict([(collection_name,lambda obj : selection(obj)]) + self.selector = [(selector[coll] if coll in selector else + (lambda x: True)) + for coll in self.input] if selector else None + # save only the first maxObjects objects passing the selection in the merged collection + self.maxObjects = maxObjects + self.branchType = {} + pass + + def beginJob(self): + pass + + def endJob(self): + pass + + def beginFile(self, inputFile, outputFile, inputTree, wrappedOutputTree): + + # Find list of activated branches in input tree + _brlist_in = inputTree.GetListOfBranches() + branches_in = set( + [_brlist_in.At(i) for i in range(_brlist_in.GetEntries())]) + branches_in = [ + x for x in branches_in if inputTree.GetBranchStatus(x.GetName()) + ] + + # Find list of activated branches in output tree + _brlist_out = wrappedOutputTree._tree.GetListOfBranches() + branches_out = set( + [_brlist_out.At(i) for i in range(_brlist_out.GetEntries())]) + branches_out = [ + x for x in branches_out + if wrappedOutputTree._tree.GetBranchStatus(x.GetName()) + ] + + # Use both + branches = branches_in + branches_out + + # Only keep branches with right collection name + self.brlist_sep = [ + self.filterBranchNames(branches, x) for x in self.input + ] + self.brlist_all = set(itertools.chain(*(self.brlist_sep))) + + self.is_there = np.zeros(shape=(len(self.brlist_all), self.nInputs), + dtype=bool) + for bridx, br in enumerate(self.brlist_all): + for j in range(self.nInputs): + if br in self.brlist_sep[j]: + self.is_there[bridx][j] = True + + # Create output branches + self.out = wrappedOutputTree + for br in self.brlist_all: + self.out.branch("%s_%s" % (self.output, br), + _rootLeafType2rootBranchType[self.branchType[br]], + lenVar="n%s" % self.output) + + def endFile(self, inputFile, outputFile, inputTree, wrappedOutputTree): + pass + + def filterBranchNames(self, branches, collection): + out = [] + for br in branches: + name = br.GetName() + if not name.startswith(collection + '_'): + continue + out.append(name.replace(collection + '_', '')) + self.branchType[out[-1]] = br.FindLeaf(br.GetName()).GetTypeName() + return out + + def analyze(self, event): + """process event, return True (go to next module) or False (fail, go to next event)""" + coll = [Collection(event, x) for x in self.input] + objects = [(coll[j][i], j, i) for j in range(self.nInputs) + for i in range(len(coll[j]))] + if self.selector: + objects = [ + obj_j_i for obj_j_i in objects + if self.selector[obj_j_i[1]](obj_j_i[0]) + ] + objects.sort(key=self.sortkey, reverse=self.reverse) + if self.maxObjects: + objects = objects[:self.maxObjects] + for bridx, br in enumerate(self.brlist_all): + out = [] + for obj, j, i in objects: + out.append(getattr(obj, br) if self.is_there[bridx][j] else 0) + self.out.fillBranch("%s_%s" % (self.output, br), out) + return True + + +# define modules using the syntax 'name = lambda : constructor' to avoid having them loaded when not needed + +lepMerger = lambda: collectionMerger(input=["Electron", "Muon"], + output="Lepton") +lepMerger_exampleSelection = lambda: collectionMerger( + input=["Electron", "Muon"], + output= + "Lepton", # this will keep only the two leading leptons among electrons with pt > 20 and muons with pt > 40 + maxObjects=2, + selector=dict([("Electron", lambda x: x.pt > 20), + ("Muon", lambda x: x.pt > 40)]), +) diff --git a/PhysicsTools/NanoAODTools/python/postprocessing/framework/datamodel.py b/PhysicsTools/NanoAODTools/python/postprocessing/framework/datamodel.py new file mode 100644 index 0000000000000..5afff90009e3c --- /dev/null +++ b/PhysicsTools/NanoAODTools/python/postprocessing/framework/datamodel.py @@ -0,0 +1,150 @@ +from PhysicsTools.NanoAODTools.postprocessing.framework.treeReaderArrayTools import InputTree +import ROOT +import math +ROOT.PyConfig.IgnoreCommandLineOptions = True + +statusflags = { # GenPart_statusFlags, stored bitwise (powers of 2): + # https://github.com/cms-sw/cmssw/edit/master/PhysicsTools/NanoAOD/python/genparticles_cff.py + # https://cms-nanoaod-integration.web.cern.ch/integration/master-106X/mc106Xul18_doc.html#GenPart + 'isPrompt': (1 << 0), 'fromHardProcess': (1 << 8), + 'isDecayedLeptonHadron': (1 << 1), 'isHardProcessTauDecayProduct': (1 << 9), + 'isTauDecayProduct': (1 << 2), 'isDirectHardProcessTauDecayProduct': (1 << 10), + 'isPromptTauDecayProduct': (1 << 3), 'fromHardProcessBeforeFSR': (1 << 11), + 'isDirectTauDecayProduct': (1 << 4), 'isFirstCopy': (1 << 12), + 'isDirectPromptTauDecayProduct': (1 << 5), 'isLastCopy': (1 << 13), + 'isDirectHadronDecayProduct': (1 << 6), 'isLastCopyBeforeFSR': (1 << 14), + 'isHardProcess': (1 << 7), +} + + +class Event: + """Class that allows seeing an entry of a PyROOT TTree as an Event""" + + def __init__(self, tree, entry): + self._tree = tree + self._entry = entry + self._tree.gotoEntry(entry) + + def __getattr__(self, name): + if name in self.__dict__: + return self.__dict__[name] + return self._tree.readBranch(name) + + def __getitem__(self, attr): + return self.__getattr__(attr) + + def eval(self, expr): + """Evaluate an expression, as TTree::Draw would do. + + This is added for convenience, but it may perform poorly and the implementation is not bulletproof, + so it's better to rely on reading values, collections or objects directly + """ + if not hasattr(self._tree, '_exprs'): + self._tree._exprs = {} + # remove useless warning about EvalInstance() + import warnings + warnings.filterwarnings(action='ignore', category=RuntimeWarning, + message='creating converter for unknown type "const char\*\*"$') + warnings.filterwarnings(action='ignore', category=RuntimeWarning, + message='creating converter for unknown type "const char\*\[\]"$') + if expr not in self._tree._exprs: + formula = ROOT.TTreeFormula(expr, expr, self._tree) + if formula.IsInteger(): + formula.go = formula.EvalInstance64 + else: + formula.go = formula.EvalInstance + self._tree._exprs[expr] = formula + # force sync, to be safe + self._tree.GetEntry(self._entry) + self._tree.entry = self._entry + # self._tree._exprs[expr].SetQuickLoad(False) + else: + self._tree.gotoEntry(self._entry) + formula = self._tree._exprs[expr] + if "[" in expr: # unclear why this is needed, but otherwise for some arrays x[i] == 0 for all i > 0 + formula.GetNdata() + return formula.go() + + +class Object: + """Class that allows seeing a set branches plus possibly an index as an Object""" + + def __init__(self, event, prefix, index=None): + self._event = event + self._prefix = prefix + "_" + self._index = index + + def __getattr__(self, name): + if name in self.__dict__: + return self.__dict__[name] + if name[:2] == "__" and name[-2:] == "__": + raise AttributeError + val = getattr(self._event, self._prefix + name) + if self._index != None: + val = val[self._index] + # convert char to integer number + val = ord(val) if type(val) == str else val + self.__dict__[name] = val # cache + return val + + def __getitem__(self, attr): + return self.__getattr__(attr) + + def p4(self, corr_pt=None): + """Create TLorentzVector for this particle.""" + ret = ROOT.TLorentzVector() + if corr_pt == None: + ret.SetPtEtaPhiM(self.pt, self.eta, self.phi, self.mass) + else: + ret.SetPtEtaPhiM(corr_pt, self.eta, self.phi, self.mass) + return ret + + def DeltaR(self, other): + if isinstance(other, ROOT.TLorentzVector): + deta = abs(other.Eta() - self.eta) + dphi = abs(other.Phi() - self.phi) + else: + deta = abs(other.eta - self.eta) + dphi = abs(other.phi - self.phi) + while dphi > math.pi: + dphi = abs(dphi - 2 * math.pi) + return math.sqrt(dphi**2 + deta**2) + + def statusflag(self, flag): + """Find if bit for statusflag is set (for GenPart only).""" + return (self.statusFlags & statusflags[flag])==statusflags[flag] + + def subObj(self, prefix): + return Object(self._event, self._prefix + prefix) + + def __repr__(self): + return ("<%s[%s]>" % (self._prefix[:-1], self._index)) if self._index != None else ("<%s>" % self._prefix[:-1]) + + def __str__(self): + return self.__repr__() + + +class Collection: + def __init__(self, event, prefix, lenVar=None): + self._event = event + self._prefix = prefix + if lenVar != None: + self._len = getattr(event, lenVar) + else: + self._len = getattr(event, "n" + prefix) + self._cache = {} + + def __getitem__(self, index): + if type(index) == int and index in self._cache: + return self._cache[index] + if index >= self._len: + raise IndexError("Invalid index %r (len is %r) at %s" % (index, self._len, self._prefix)) + elif index < 0: + raise IndexError("Invalid index %r (negative) at %s" % (index, self._prefix)) + ret = Object(self._event, self._prefix, index=index) + if type(index) == int: + self._cache[index] = ret + return ret + + def __len__(self): + return self._len diff --git a/PhysicsTools/NanoAODTools/python/postprocessing/framework/eventloop.py b/PhysicsTools/NanoAODTools/python/postprocessing/framework/eventloop.py new file mode 100644 index 0000000000000..98689bf5ceab5 --- /dev/null +++ b/PhysicsTools/NanoAODTools/python/postprocessing/framework/eventloop.py @@ -0,0 +1,101 @@ +from PhysicsTools.NanoAODTools.postprocessing.framework.datamodel import Event +from PhysicsTools.NanoAODTools.postprocessing.framework.treeReaderArrayTools import clearExtraBranches +import sys +import time +import ROOT + + +class Module(object): + def __init__(self): + self.writeHistFile = False + + def beginJob(self, histFile=None, histDirName=None): + if histFile != None and histDirName != None: + self.writeHistFile = True + prevdir = ROOT.gDirectory + self.histFile = histFile + self.histFile.cd() + self.dir = self.histFile.mkdir(histDirName) + prevdir.cd() + self.objs = [] + + def endJob(self): + if hasattr(self, 'objs') and self.objs != None: + prevdir = ROOT.gDirectory + self.dir.cd() + for obj in self.objs: + obj.Write() + prevdir.cd() + if hasattr(self, 'histFile') and self.histFile != None: + self.histFile.Close() + + def beginFile(self, inputFile, outputFile, inputTree, wrappedOutputTree): + pass + + def endFile(self, inputFile, outputFile, inputTree, wrappedOutputTree): + pass + + def analyze(self, event): + """process event, return True (go to next module) or False (fail, go to next event)""" + pass + + def addObject(self, obj): + setattr(self, obj.GetName(), obj) + self.objs.append(getattr(self, obj.GetName())) + + def addObjectList(self, names, obj): + objlist = [] + for iname, name in enumerate(names): + setattr(self, obj.GetName() + '_' + name, + obj.Clone(obj.GetName() + '_' + name)) + objlist.append(getattr(self, obj.GetName() + '_' + name)) + self.objs.append(getattr(self, obj.GetName() + '_' + name)) + setattr(self, obj.GetName(), objlist) + + +def eventLoop( + modules, inputFile, outputFile, inputTree, wrappedOutputTree, + maxEvents=-1, eventRange=None, progress=(10000, sys.stdout), + filterOutput=True +): + for m in modules: + m.beginFile(inputFile, outputFile, inputTree, wrappedOutputTree) + + t0 = time.time() + tlast = t0 + doneEvents = 0 + acceptedEvents = 0 + entries = inputTree.entries + if eventRange: + entries = len(eventRange) + if maxEvents > 0: + entries = min(entries, maxEvents) + + for ie, i in enumerate(range(entries) if eventRange == None else eventRange): + if maxEvents > 0 and ie >= maxEvents: + break + e = Event(inputTree, i) + clearExtraBranches(inputTree) + doneEvents += 1 + ret = True + for m in modules: + ret = m.analyze(e) + if not ret: + break + if ret: + acceptedEvents += 1 + if (ret or not filterOutput) and wrappedOutputTree != None: + wrappedOutputTree.fill() + if progress: + if ie > 0 and ie % progress[0] == 0: + t1 = time.time() + progress[1].write("Processed %8d/%8d entries, %5.2f%% (elapsed time %7.1fs, curr speed %8.3f kHz, avg speed %8.3f kHz), accepted %8d/%8d events (%5.2f%%)\n" % ( + ie, entries, ie / float(0.01 * entries), + t1 - t0, (progress[0] / 1000.) / (max(t1 - tlast, 1e-9)), + ie / 1000. / (max(t1 - t0, 1e-9)), + acceptedEvents, doneEvents, + acceptedEvents / (0.01 * doneEvents))) + tlast = t1 + for m in modules: + m.endFile(inputFile, outputFile, inputTree, wrappedOutputTree) + return (doneEvents, acceptedEvents, time.time() - t0) diff --git a/PhysicsTools/NanoAODTools/python/postprocessing/framework/jobreport.py b/PhysicsTools/NanoAODTools/python/postprocessing/framework/jobreport.py new file mode 100644 index 0000000000000..f23fdc316dbec --- /dev/null +++ b/PhysicsTools/NanoAODTools/python/postprocessing/framework/jobreport.py @@ -0,0 +1,74 @@ +import xml.etree.cElementTree as ET +import re +#import lxml.etree.ElementTree as ET + + +class JobReport: + def __init__(self): + self.fjr = ET.Element("FrameworkJobReport") + self.readbranches = ET.SubElement(self.fjr, "ReadBranches") + self.performancereport = ET.SubElement(self.fjr, "PerformanceReport") + self.performancesummary = ET.SubElement( + self.performancereport, "PerformanceSummary", Metric="StorageStatistics") + ET.SubElement(self.performancesummary, "Metric", + Name="Parameter-untracked-bool-enabled", Value="true") + ET.SubElement(self.performancesummary, "Metric", + Name="Parameter-untracked-bool-stats", Value="true") + ET.SubElement(self.performancesummary, "Metric", + Name="Parameter-untracked-string-cacheHint", Value="application-only") + ET.SubElement(self.performancesummary, "Metric", + Name="Parameter-untracked-string-readHint", Value="auto-detect") + ET.SubElement(self.performancesummary, "Metric", + Name="ROOT-tfile-read-totalMegabytes", Value="0") + ET.SubElement(self.performancesummary, "Metric", + Name="ROOT-tfile-write-totalMegabytes", Value="0") + # + # + # + # + # + # + +# likely not needed +# +# + + def addInputFile(self, filename, eventsRead=1, runsAndLumis={"1": [1]}): + infile = ET.SubElement(self.fjr, "InputFile") + ET.SubElement(infile, "LFN").text = re.sub( + r".*?(/store/.*\.root)(\?.*)?", r"\1", filename) + ET.SubElement(infile, "PFN").text = "" + ET.SubElement(infile, "Catalog").text = "" + ET.SubElement(infile, "InputType").text = "primaryFiles" + ET.SubElement(infile, "ModuleLabel").text = "source" + ET.SubElement(infile, "InputSourceClass").text = "PoolSource" + ET.SubElement(infile, "GUID").text = "" + ET.SubElement(infile, "EventsRead").text = "%s" % eventsRead + runs = ET.SubElement(infile, "Runs") + for r, ls in runsAndLumis.items(): + run = ET.SubElement(runs, "Run", ID="%s" % r) + for l in ls: + ET.SubElement(run, "LumiSection", ID="%s" % l) + + def addOutputFile(self, filename, events=1, runsAndLumis={"1": [1]}): + infile = ET.SubElement(self.fjr, "File") + ET.SubElement(infile, "LFN").text = "" + ET.SubElement(infile, "PFN").text = filename + ET.SubElement(infile, "Catalog").text = "" + ET.SubElement(infile, "ModuleLabel").text = "NANO" + ET.SubElement(infile, "OutputModuleClass").text = "PoolOutputModule" + ET.SubElement(infile, "GUID").text = "" + ET.SubElement(infile, "DataType").text = "" + ET.SubElement( + infile, "BranchHash").text = "dc90308e392b2fa1e0eff46acbfa24bc" + ET.SubElement(infile, "TotalEvents").text = "%s" % events + runs = ET.SubElement(infile, "Runs") + for r, ls in runsAndLumis.items(): + run = ET.SubElement(runs, "Run", ID="%s" % r) + for l in ls: + ET.SubElement(run, "LumiSection", ID="%s" % l) + + def save(self, filename="FrameworkJobReport.xml"): + tree = ET.ElementTree(self.fjr) + tree.write(filename) # , pretty_print=True) + pass diff --git a/PhysicsTools/NanoAODTools/python/postprocessing/framework/output.py b/PhysicsTools/NanoAODTools/python/postprocessing/framework/output.py new file mode 100644 index 0000000000000..d81a724016dd0 --- /dev/null +++ b/PhysicsTools/NanoAODTools/python/postprocessing/framework/output.py @@ -0,0 +1,190 @@ +from PhysicsTools.NanoAODTools.postprocessing.framework.treeReaderArrayTools import setExtraBranch +from array import array +import ROOT +ROOT.PyConfig.IgnoreCommandLineOptions = True + + +_rootBranchType2PythonArray = { + 'b': 'B', + 'B': 'b', + 'i': 'I', + 'I': 'i', + 'F': 'f', + 'D': 'd', + 'l': 'L', + 'L': 'l', + 'O': 'B' +} + + +class OutputBranch: + def __init__( + self, tree, name, rootBranchType, n=1, + lenVar=None, title=None, limitedPrecision=False + ): + n = int(n) + self.buff = array( + _rootBranchType2PythonArray[rootBranchType], n * [0. if rootBranchType in 'FD' else 0]) + self.lenVar = lenVar + self.n = n + self.precision = ROOT.MiniFloatConverter.ReduceMantissaToNbitsRounding( + limitedPrecision) if limitedPrecision and rootBranchType == 'F' else lambda x: x + # check if a branch was already there + existingBranch = tree.GetBranch(name) + if (existingBranch): + self.branch = existingBranch + self.branch.SetAddress(self.buff) + else: + if lenVar != None: + self.branch = tree.Branch( + name, self.buff, "%s[%s]/%s" % (name, lenVar, rootBranchType)) + elif n == 1: + self.branch = tree.Branch( + name, self.buff, name + "/" + rootBranchType) + else: + self.branch = tree.Branch( + name, self.buff, "%s[%d]/%s" % (name, n, rootBranchType)) + if title: + self.branch.SetTitle(title) + + def fill(self, val): + if self.lenVar: + if len(self.buff) < len(val): # realloc + self.buff = array(self.buff.typecode, max( + len(val), 2 * len(self.buff)) * [0. if self.buff.typecode in 'fd' else 0]) + self.branch.SetAddress(self.buff) + for i, v in enumerate(val): + self.buff[i] = self.precision(v) + elif self.n == 1: + self.buff[0] = self.precision(val) + else: + if len(val) != self.n: + raise RuntimeError("Mismatch in filling branch %s of fixed length %d with %d values (%s)" % ( + self.branch.GetName(), self.n, len(val), val)) + for i, v in enumerate(val): + self.buff[i] = v + + +class OutputTree: + def __init__(self, tfile, ttree, intree): + self._file = tfile + self._tree = ttree + self._intree = intree + self._branches = {} + + def branch( + self, name, rootBranchType, n=1, lenVar=None, + title=None, limitedPrecision=False + ): + # and (not self._tree.GetBranch(lenVar)): + if (lenVar != None) and (lenVar not in self._branches): + self._branches[lenVar] = OutputBranch(self._tree, lenVar, "i") + self._branches[name] = OutputBranch( + self._tree, name, rootBranchType, n=n, + lenVar=lenVar, title=title, limitedPrecision=limitedPrecision + ) + return self._branches[name] + + def fillBranch(self, name, val): + br = self._branches[name] + if br.lenVar and (br.lenVar in self._branches): + self._branches[br.lenVar].buff[0] = len(val) + setExtraBranch(self._intree, br.lenVar, len(val)) + br.fill(val) + setExtraBranch(self._intree, name, val) + + def tree(self): + return self._tree + + def fill(self): + self._tree.Fill() + + def write(self): + self._file.cd() + self._tree.Write() + + +class FullOutput(OutputTree): + def __init__( + self, + inputFile, + inputTree, + outputFile, + branchSelection=None, + outputbranchSelection=None, + fullClone=False, + maxEntries=None, + firstEntry=0, + provenance=False, + jsonFilter=None + ): + outputFile.cd() + + self.outputbranchSelection = outputbranchSelection + self.maxEntries = maxEntries + self.firstEntry = firstEntry + if fullClone: + outputTree = inputTree.CopyTree( + '1', "", maxEntries if maxEntries else ROOT.TVirtualTreePlayer.kMaxEntries, firstEntry) + else: + outputTree = inputTree.CloneTree(0) + + # enable back all branches in inputTree, then disable for computation + # the branches as requested in branchSelection + inputTree.SetBranchStatus("*", 1) + if branchSelection: + branchSelection.selectBranches(inputTree) + + OutputTree.__init__(self, outputFile, outputTree, inputTree) + self._inputTree = inputTree + self._otherTrees = {} + self._otherObjects = {} + for k in inputFile.GetListOfKeys(): + kn = k.GetName() + if kn == "Events": + continue # this we are doing + elif kn in ("MetaData", "ParameterSets"): + if provenance: + # treat content of other trees as if associated to event 0 + self._otherTrees[kn] = inputFile.Get( + kn).CopyTree('1' if firstEntry == 0 else '0') + elif kn in ("LuminosityBlocks", "Runs"): + if not jsonFilter: + self._otherTrees[kn] = inputFile.Get( + kn).CopyTree('1' if firstEntry == 0 else '0') + elif firstEntry == 0: + _isRun = (kn == "Runs") + _it = inputFile.Get(kn) + _ot = _it.CloneTree(0) + for ev in _it: + if (jsonFilter.filterRunOnly(ev.run) if _isRun else jsonFilter.filterRunLumi(ev.run, ev.luminosityBlock)): + _ot.Fill() + self._otherTrees[kn] = _ot + elif k.GetClassName() == "TTree": + print("Not copying unknown tree %s" % kn) + else: + self._otherObjects[kn] = inputFile.Get(kn) + + def fill(self): + self._inputTree.readAllBranches() + self._tree.Fill() + + def write(self): + if self.outputbranchSelection: + self.outputbranchSelection.selectBranches(self._tree) + self._tree = self.tree().CopyTree('1', "", + self.maxEntries if self.maxEntries else ROOT.TVirtualTreePlayer.kMaxEntries, 0) + + OutputTree.write(self) + for t in self._otherTrees.values(): + t.Write() + for on, ov in self._otherObjects.items(): + self._file.WriteTObject(ov, on) + + +class FriendOutput(OutputTree): + def __init__(self, inputFile, inputTree, outputFile, treeName="Friends"): + outputFile.cd() + outputTree = ROOT.TTree( + treeName, "Friend tree for " + inputTree.GetName()) + OutputTree.__init__(self, outputFile, outputTree, inputTree) diff --git a/PhysicsTools/NanoAODTools/python/postprocessing/framework/postprocessor.py b/PhysicsTools/NanoAODTools/python/postprocessing/framework/postprocessor.py new file mode 100755 index 0000000000000..80a0d7940c5b5 --- /dev/null +++ b/PhysicsTools/NanoAODTools/python/postprocessing/framework/postprocessor.py @@ -0,0 +1,274 @@ +#!/usr/bin/env python3 +from PhysicsTools.NanoAODTools.postprocessing.framework.jobreport import JobReport +from PhysicsTools.NanoAODTools.postprocessing.framework.preskimming import preSkim +from PhysicsTools.NanoAODTools.postprocessing.framework.output import FriendOutput, FullOutput +from PhysicsTools.NanoAODTools.postprocessing.framework.eventloop import eventLoop +from PhysicsTools.NanoAODTools.postprocessing.framework.datamodel import InputTree +from PhysicsTools.NanoAODTools.postprocessing.framework.branchselection import BranchSelection +import os +import time +import hashlib +import subprocess +import ROOT +ROOT.PyConfig.IgnoreCommandLineOptions = True + + +class PostProcessor: + def __init__( + self, outputDir, inputFiles, cut=None, branchsel=None, modules=[], + compression="LZMA:9", friend=False, postfix=None, jsonInput=None, + noOut=False, justcount=False, provenance=False, haddFileName=None, + fwkJobReport=False, histFileName=None, histDirName=None, + outputbranchsel=None, maxEntries=None, firstEntry=0, prefetch=False, + longTermCache=False + ): + self.outputDir = outputDir + self.inputFiles = inputFiles + self.cut = cut + self.modules = modules + self.compression = compression + self.postfix = postfix + self.json = jsonInput + self.noOut = noOut + self.friend = friend + self.justcount = justcount + self.provenance = provenance + self.jobReport = JobReport() if fwkJobReport else None + self.haddFileName = haddFileName + self.histFile = None + self.histDirName = None + if self.jobReport and not self.haddFileName: + print("Because you requested a FJR we assume you want the final " \ + "hadd. No name specified for the output file, will use tree.root") + self.haddFileName = "tree.root" + self.branchsel = BranchSelection(branchsel) if branchsel else None + if outputbranchsel is not None: + self.outputbranchsel = BranchSelection(outputbranchsel) + elif outputbranchsel is None and branchsel is not None: + # Use the same branches in the output as in input + self.outputbranchsel = BranchSelection(branchsel) + else: + self.outputbranchsel = None + + self.histFileName = histFileName + self.histDirName = histDirName + # 2^63 - 1, largest int64 + self.maxEntries = maxEntries if maxEntries else 9223372036854775807 + self.firstEntry = firstEntry + self.prefetch = prefetch # prefetch files to TMPDIR using xrdcp + # keep cached files across runs (it's then up to you to clean up the temp) + self.longTermCache = longTermCache + + def prefetchFile(self, fname, verbose=True): + tmpdir = os.environ['TMPDIR'] if 'TMPDIR' in os.environ else "/tmp" + if not fname.startswith("root://"): + return fname, False + rndchars = "".join([hex(i)[2:] for i in bytearray(os.urandom(8))]) \ + if not self.longTermCache else "long_cache-id%d-%s" \ + % (os.getuid(), hashlib.sha1(fname.encode('utf-8')).hexdigest()) + localfile = "%s/%s-%s.root" \ + % (tmpdir, os.path.basename(fname).replace(".root", ""), rndchars) + if self.longTermCache and os.path.exists(localfile): + if verbose: + print("Filename %s is already available in local path %s " \ + % (fname, localfile)) + return localfile, False + try: + if verbose: + print("Filename %s is remote, will do a copy to local path %s"\ + % (fname, localfile)) + start = time.time() + subprocess.check_output(["xrdcp", "-f", "-N", fname, localfile]) + if verbose: + print("Time used for transferring the file locally: %.2f s"\ + % (time.time() - start)) + return localfile, (not self.longTermCache) + except: + if verbose: + print("Error: could not save file locally, will run from remote") + if os.path.exists(localfile): + if verbose: + print("Deleting partially transferred file %s" % localfile) + try: + os.unlink(localfile) + except: + pass + return fname, False + + def run(self): + outpostfix = self.postfix if self.postfix is not None else ( + "_Friend" if self.friend else "_Skim") + if not self.noOut: + + if self.compression != "none": + ROOT.gInterpreter.ProcessLine("#include ") + (algo, level) = self.compression.split(":") + compressionLevel = int(level) + if algo == "LZMA": + compressionAlgo = ROOT.ROOT.kLZMA + elif algo == "ZLIB": + compressionAlgo = ROOT.ROOT.kZLIB + elif algo == "LZ4": + compressionAlgo = ROOT.ROOT.kLZ4 + else: + raise RuntimeError("Unsupported compression %s" % algo) + else: + compressionLevel = 0 + print("Will write selected trees to " + self.outputDir) + if not self.justcount: + if not os.path.exists(self.outputDir): + os.system("mkdir -p " + self.outputDir) + else: + compressionLevel = 0 + + if self.noOut: + if len(self.modules) == 0: + raise RuntimeError( + "Running with --noout and no modules does nothing!") + + # Open histogram file, if desired + if (self.histFileName is not None and self.histDirName is None) or (self.histFileName is None and self.histDirName is not None): + raise RuntimeError( + "Must specify both histogram file and histogram directory!") + elif self.histFileName is not None and self.histDirName is not None: + self.histFile = ROOT.TFile.Open(self.histFileName, "RECREATE") + else: + self.histFile = None + + for m in self.modules: + if hasattr(m, 'writeHistFile') and m.writeHistFile: + m.beginJob(histFile=self.histFile, + histDirName=self.histDirName) + else: + m.beginJob() + + fullClone = (len(self.modules) == 0) + outFileNames = [] + t0 = time.time() + totEntriesRead = 0 + for fname in self.inputFiles: + ffnames = [] + if "," in fname: + fnames = fname.split(',') + fname, ffnames = fnames[0], fnames[1:] + + fname = fname.strip() + + # Convert LFN to PFN if needed; this requires edmFileUtil to be present in $PATH + if fname.startswith('/store/') : + fname = subprocess.check_output(['edmFileUtil', '-d', '-f '+fname]).decode("utf-8").strip() + + # open input file + if self.prefetch: + ftoread, toBeDeleted = self.prefetchFile(fname) + inFile = ROOT.TFile.Open(ftoread) + else: + inFile = ROOT.TFile.Open(fname) + + # get input tree + inTree = inFile.Get("Events") + if inTree is None: + inTree = inFile.Get("Friends") + nEntries = min(inTree.GetEntries() - + self.firstEntry, self.maxEntries) + totEntriesRead += nEntries + # pre-skimming + elist, jsonFilter = preSkim( + inTree, self.json, self.cut, maxEntries=self.maxEntries, firstEntry=self.firstEntry) + if self.justcount: + print('Would select %d / %d entries from %s (%.2f%%)' % (elist.GetN() if elist else nEntries, nEntries, fname, (elist.GetN() if elist else nEntries) / (0.01 * nEntries) if nEntries else 0)) + if self.prefetch: + if toBeDeleted: + os.unlink(ftoread) + continue + else: + print('Pre-select %d entries out of %s (%.2f%%)' % (elist.GetN() if elist else nEntries, nEntries, (elist.GetN() if elist else nEntries) / (0.01 * nEntries) if nEntries else 0)) + inAddFiles = [] + inAddTrees = [] + for ffname in ffnames: + inAddFiles.append(ROOT.TFile.Open(ffname)) + inAddTree = inAddFiles[-1].Get("Events") + if inAddTree is None: + inAddTree = inAddFiles[-1].Get("Friends") + inAddTrees.append(inAddTree) + inTree.AddFriend(inAddTree) + + if fullClone: + # no need of a reader (no event loop), but set up the elist if available + if elist: + inTree.SetEntryList(elist) + else: + # initialize reader + if elist: + inTree = InputTree(inTree, elist) + else: + inTree = InputTree(inTree) + + # prepare output file + if not self.noOut: + outFileName = os.path.join(self.outputDir, os.path.basename( + fname).replace(".root", outpostfix + ".root")) + outFile = ROOT.TFile.Open( + outFileName, "RECREATE", "", compressionLevel) + outFileNames.append(outFileName) + if compressionLevel: + outFile.SetCompressionAlgorithm(compressionAlgo) + # prepare output tree + if self.friend: + outTree = FriendOutput(inFile, inTree, outFile) + else: + firstEntry = 0 if fullClone and elist else self.firstEntry + outTree = FullOutput( + inFile, + inTree, + outFile, + branchSelection=self.branchsel, + outputbranchSelection=self.outputbranchsel, + fullClone=fullClone, + maxEntries=self.maxEntries, + firstEntry=firstEntry, + jsonFilter=jsonFilter, + provenance=self.provenance) + else: + outFile = None + outTree = None + if self.branchsel: + self.branchsel.selectBranches(inTree) + + # process events, if needed + if not fullClone and not (elist and elist.GetN() == 0): + eventRange = range(self.firstEntry, self.firstEntry + + nEntries) if nEntries > 0 and not elist else None + (nall, npass, timeLoop) = eventLoop( + self.modules, inFile, outFile, inTree, outTree, + eventRange=eventRange, maxEvents=self.maxEntries + ) + print('Processed %d preselected entries from %s (%s entries). Finally selected %d entries' % (nall, fname, nEntries, npass)) + elif outTree is not None: + nall = nEntries + print('Selected %d / %d entries from %s (%.2f%%)' % (outTree.tree().GetEntries(), nall, fname, outTree.tree().GetEntries() / (0.01 * nall) if nall else 0)) + + # now write the output + if not self.noOut: + outTree.write() + outFile.Close() + print("Done %s" % outFileName) + if self.jobReport: + self.jobReport.addInputFile(fname, nall) + if self.prefetch: + if toBeDeleted: + os.unlink(ftoread) + + for m in self.modules: + m.endJob() + + print("Total time %.1f sec. to process %i events. Rate = %.1f Hz." % ((time.time() - t0), totEntriesRead, totEntriesRead / (time.time() - t0))) + + if self.haddFileName: + haddnano = "./haddnano.py" if os.path.isfile( + "./haddnano.py") else "haddnano.py" + os.system("%s %s %s" % + (haddnano, self.haddFileName, " ".join(outFileNames))) + if self.jobReport: + self.jobReport.addOutputFile(self.haddFileName) + self.jobReport.save() diff --git a/PhysicsTools/NanoAODTools/python/postprocessing/framework/preskimming.py b/PhysicsTools/NanoAODTools/python/postprocessing/framework/preskimming.py new file mode 100644 index 0000000000000..0ea1e3cd0d503 --- /dev/null +++ b/PhysicsTools/NanoAODTools/python/postprocessing/framework/preskimming.py @@ -0,0 +1,85 @@ +import json +import re +import ROOT +ROOT.PyConfig.IgnoreCommandLineOptions = True + + +class JSONFilter: + def __init__(self, fname="", runsAndLumis={}): + self.keep = {} + if fname != "": + self.runsAndLumis = json.load(open(fname, 'r')) + else: + self.runsAndLumis = runsAndLumis + for _run, lumis in self.runsAndLumis.items(): + run = int(_run) + if run not in self.keep: + self.keep[run] = [] + self.keep[run] += lumis + for run in list(self.keep.keys()): + if len(self.keep[run]) == 0: + del self.keep[run] + + def filterRunLumi(self, run, lumi): + try: + for (l1, l2) in self.keep[run]: + if l1 <= lumi and lumi <= l2: + return True + return False + except KeyError: + return False + + def filterRunOnly(self, run): + return (run in self.keep) + + def runCut(self): + return "%d <= run && run <= %s" % (min(self.keep.keys()), max(self.keep.keys())) + + def filterEList(self, tree, elist): + # FIXME this can be optimized for sure + tree.SetBranchStatus("*", 0) + tree.SetBranchStatus('run', 1) + tree.SetBranchStatus('luminosityBlock', 1) + filteredList = ROOT.TEntryList('filteredList', 'filteredList') + if elist: + for i in range(elist.GetN()): + entry = elist.GetEntry(0) if i == 0 else elist.Next() + tree.GetEntry(entry) + if self.filterRunLumi(tree.run, tree.luminosityBlock): + filteredList.Enter(entry) + else: + for entry in range(tree.GetEntries()): + tree.GetEntry(entry) + if self.filterRunLumi(tree.run, tree.luminosityBlock): + filteredList.Enter(entry) + tree.SetBranchStatus("*", 1) + return filteredList + + +def preSkim(tree, jsonInput=None, cutstring=None, maxEntries=None, firstEntry=0): + if jsonInput == None and cutstring == None: + return None, None + cut = None + jsonFilter = None + if jsonInput != None: + if type(jsonInput) is dict: + jsonFilter = JSONFilter(runsAndLumis=jsonInput) + else: + jsonFilter = JSONFilter(jsonInput) + cut = jsonFilter.runCut() + if cutstring != None: + cut = "(%s) && (%s)" % (cutstring, cut) if cut else cutstring + if maxEntries is None: + maxEntries = ROOT.TVirtualTreePlayer.kMaxEntries + while "AltBranch$" in cut: + m = re.search(r"AltBranch\$\(\s*(\w+)\s*,\s*(\w+)\s*\)", cut) + if not m: + raise RuntimeError( + "Error, found AltBranch$ in cut string, but it doesn't comply with the syntax this code can support. The cut is %r" % cut) + cut = cut.replace(m.group(0), m.group( + 1) if tree.GetBranch(m.group(1)) else m.group(2)) + tree.Draw('>>elist', cut, "entrylist", maxEntries, firstEntry) + elist = ROOT.gDirectory.Get('elist') + if jsonInput: + elist = jsonFilter.filterEList(tree, elist) + return elist, jsonFilter diff --git a/PhysicsTools/NanoAODTools/python/postprocessing/framework/treeReaderArrayTools.py b/PhysicsTools/NanoAODTools/python/postprocessing/framework/treeReaderArrayTools.py new file mode 100644 index 0000000000000..139edfa8c6a4f --- /dev/null +++ b/PhysicsTools/NanoAODTools/python/postprocessing/framework/treeReaderArrayTools.py @@ -0,0 +1,151 @@ +import types +import ROOT +ROOT.PyConfig.IgnoreCommandLineOptions = True + + +def InputTree(tree, entrylist=ROOT.MakeNullPointer(ROOT.TEntryList)): + """add to the PyROOT wrapper of a TTree a TTreeReader and methods readBranch, arrayReader, valueReader""" + if hasattr(tree, '_ttreereader'): + return tree # don't initialize twice + tree.entry = -1 + tree._entrylist = entrylist + tree._ttreereader = ROOT.TTreeReader(tree, tree._entrylist) + tree._ttreereader._isClean = True + tree._ttrvs = {} + tree._ttras = {} + tree._leafTypes = {} + tree._ttreereaderversion = 1 + tree.arrayReader = types.MethodType(getArrayReader, tree) + tree.valueReader = types.MethodType(getValueReader, tree) + tree.readBranch = types.MethodType(readBranch, tree) + tree.gotoEntry = types.MethodType(_gotoEntry, tree) + tree.readAllBranches = types.MethodType(_readAllBranches, tree) + tree.entries = tree._ttreereader.GetEntries(False) + tree._extrabranches = {} + return tree + + +def getArrayReader(tree, branchName): + """Make a reader for branch branchName containing a variable-length value array.""" + if branchName not in tree._ttras: + if not tree.GetBranch(branchName): + raise RuntimeError("Can't find branch '%s'" % branchName) + if not tree.GetBranchStatus(branchName): + raise RuntimeError("Branch %s has status=0" % branchName) + leaf = tree.GetBranch(branchName).GetLeaf(branchName) + if not bool(leaf.GetLeafCount()): + raise RuntimeError("Branch %s is not a variable-length value array" % branchName) + typ = leaf.GetTypeName() + tree._ttras[branchName] = _makeArrayReader(tree, typ, branchName) + return tree._ttras[branchName] + + +def getValueReader(tree, branchName): + """Make a reader for branch branchName containing a single value.""" + if branchName not in tree._ttrvs: + if not tree.GetBranch(branchName): + raise RuntimeError("Can't find branch '%s'" % branchName) + if not tree.GetBranchStatus(branchName): + raise RuntimeError("Branch %s has status=0" % branchName) + leaf = tree.GetBranch(branchName).GetLeaf(branchName) + if bool(leaf.GetLeafCount()) or leaf.GetLen() != 1: + raise RuntimeError("Branch %s is not a value" % branchName) + typ = leaf.GetTypeName() + tree._ttrvs[branchName] = _makeValueReader(tree, typ, branchName) + return tree._ttrvs[branchName] + + +def clearExtraBranches(tree): + tree._extrabranches = {} + + +def setExtraBranch(tree, name, val): + tree._extrabranches[name] = val + + +def readBranch(tree, branchName): + """Return the branch value if the branch is a value, and a TreeReaderArray if the branch is an array""" + if tree._ttreereader._isClean: + raise RuntimeError("readBranch must not be called before calling gotoEntry") + if branchName in tree._extrabranches: + return tree._extrabranches[branchName] + elif branchName in tree._ttras: + return tree._ttras[branchName] + elif branchName in tree._ttrvs: + ret = tree._ttrvs[branchName].Get()[0] + return ord(ret) if type(ret) == str else ret + else: + branch = tree.GetBranch(branchName) + if not branch: + raise RuntimeError("Unknown branch %s" % branchName) + if not tree.GetBranchStatus(branchName): + raise RuntimeError("Branch %s has status=0" % branchName) + leaf = branch.GetLeaf(branchName) + typ = leaf.GetTypeName() + if leaf.GetLen() == 1 and not bool(leaf.GetLeafCount()): + _vr = _makeValueReader(tree, typ, branchName) + # force calling SetEntry as a new ValueReader was created + tree.gotoEntry(tree.entry, forceCall=True) + ret = _vr.Get()[0] + return ord(ret) if type(ret) == str else ret + else: + _ar = _makeArrayReader(tree, typ, branchName) + # force calling SetEntry as a new ArrayReader was created + tree.gotoEntry(tree.entry, forceCall=True) + return _ar + + +####### PRIVATE IMPLEMENTATION PART ####### + +def _makeArrayReader(tree, typ, nam): + if not tree._ttreereader._isClean: + _remakeAllReaders(tree) + ttra = ROOT.TTreeReaderArray(typ)(tree._ttreereader, nam) + tree._leafTypes[nam] = typ + tree._ttras[nam] = ttra + return tree._ttras[nam] + + +def _makeValueReader(tree, typ, nam): + if not tree._ttreereader._isClean: + _remakeAllReaders(tree) + ttrv = ROOT.TTreeReaderValue(typ)(tree._ttreereader, nam) + tree._leafTypes[nam] = typ + tree._ttrvs[nam] = ttrv + return tree._ttrvs[nam] + + +def _remakeAllReaders(tree): + _ttreereader = ROOT.TTreeReader(tree, getattr(tree, '_entrylist', ROOT.MakeNullPointer(ROOT.TEntryList))) + _ttreereader._isClean = True + _ttrvs = {} + for k in tree._ttrvs.keys(): + _ttrvs[k] = ROOT.TTreeReaderValue(tree._leafTypes[k])(_ttreereader, k) + _ttras = {} + for k in tree._ttras.keys(): + _ttras[k] = ROOT.TTreeReaderArray(tree._leafTypes[k])(_ttreereader, k) + tree._ttrvs = _ttrvs + tree._ttras = _ttras + tree._ttreereader = _ttreereader + tree._ttreereaderversion += 1 + + +def _readAllBranches(tree): + tree.GetEntry(_currentTreeEntry(tree)) + + +def _currentTreeEntry(tree): + if tree._entrylist: + return tree._entrylist.GetEntry(tree.entry) + else: + return tree.entry + + +def _gotoEntry(tree, entry, forceCall=False): + tree._ttreereader._isClean = False + if tree.entry != entry or forceCall: + if (tree.entry == entry - 1 and entry != 0): + tree._ttreereader.Next() + else: + tree._ttreereader.SetEntry(entry) + tree.entry = entry diff --git a/PhysicsTools/NanoAODTools/python/postprocessing/utils/crabhelper.py b/PhysicsTools/NanoAODTools/python/postprocessing/utils/crabhelper.py new file mode 100644 index 0000000000000..553e8fb92c76d --- /dev/null +++ b/PhysicsTools/NanoAODTools/python/postprocessing/utils/crabhelper.py @@ -0,0 +1,66 @@ +import os +from PhysicsTools.NanoAODTools.postprocessing.framework.postprocessor import * +import sys +import re +import PSet + + +def inputFiles(): + print("ARGV: " + str(sys.argv)) + JobNumber = sys.argv[1] + crabFiles = PSet.process.source.fileNames + print(crabFiles) + firstInput = crabFiles[0] + tested = False + forceaaa = False + print("--------- using edmFileUtil to convert PFN to LFN --------------") + for i in range(0, len(crabFiles)): + if os.getenv("GLIDECLIENT_Group", "") != "overflow" and os.getenv("GLIDECLIENT_Group", "") != "overflow_conservative" and not forceaaa: + print("Data is local") + pfn = os.popen("edmFileUtil -d %s" % (crabFiles[i])).read() + pfn = re.sub("\n", "", pfn) + print(str(crabFiles[i]) + "->" + str(pfn)) + if not tested: + print("Testing file open") + import ROOT + testfile = ROOT.TFile.Open(pfn) + if testfile and testfile.IsOpen(): + print("Test OK") + crabFiles[i] = pfn + testfile.Close() + # tested=True + else: + print("Test open failed, forcing AAA") + crabFiles[i] = "root://cms-xrd-global.cern.ch/" + \ + crabFiles[i] + forceaaa = True + else: + crabFiles[i] = pfn + + else: + print("Data is not local, using AAA/xrootd") + crabFiles[i] = "root://cms-xrd-global.cern.ch/" + crabFiles[i] + return crabFiles + + +def runsAndLumis(): + if hasattr(PSet.process.source, "lumisToProcess"): + lumis = PSet.process.source.lumisToProcess + runsAndLumis = {} + for l in lumis: + if "-" in l: + start, stop = l.split("-") + rstart, lstart = start.split(":") + rstop, lstop = stop.split(":") + else: + rstart, lstart = l.split(":") + rstop, lstop = l.split(":") + if rstart != rstop: + raise Exception( + "Cannot convert '%s' to runs and lumis json format" % l) + if rstart not in runsAndLumis: + runsAndLumis[rstart] = [] + runsAndLumis[rstart].append([int(lstart), int(lstop)]) + print("Runs and Lumis: " + str(runsAndLumis)) + return runsAndLumis + return None diff --git a/PhysicsTools/NanoAODTools/scripts/nano_postproc.py b/PhysicsTools/NanoAODTools/scripts/nano_postproc.py new file mode 100755 index 0000000000000..e788c10ab1930 --- /dev/null +++ b/PhysicsTools/NanoAODTools/scripts/nano_postproc.py @@ -0,0 +1,94 @@ +#!/usr/bin/env python3 +from PhysicsTools.NanoAODTools.postprocessing.framework.postprocessor import PostProcessor +from importlib import import_module +import os +import sys +import ROOT +ROOT.PyConfig.IgnoreCommandLineOptions = True + +if __name__ == "__main__": + from optparse import OptionParser + parser = OptionParser(usage="%prog [options] outputDir inputFiles") + parser.add_option("-s", "--postfix", dest="postfix", type="string", default=None, + help="Postfix which will be appended to the file name (default: _Friend for friends, _Skim for skims)") + parser.add_option("-J", "--json", dest="json", type="string", + default=None, help="Select events using this JSON file") + parser.add_option("-c", "--cut", dest="cut", type="string", + default=None, help="Cut string") + parser.add_option("-b", "--branch-selection", dest="branchsel", + type="string", default=None, help="Branch selection") + parser.add_option("--bi", "--branch-selection-input", dest="branchsel_in", + type="string", default=None, help="Branch selection input") + parser.add_option("--bo", "--branch-selection-output", dest="branchsel_out", + type="string", default=None, help="Branch selection output") + parser.add_option("--friend", dest="friend", action="store_true", default=False, + help="Produce friend trees in output (current default is to produce full trees)") + parser.add_option("--full", dest="friend", action="store_false", default=False, + help="Produce full trees in output (this is the current default)") + parser.add_option("--noout", dest="noOut", action="store_true", + default=False, help="Do not produce output, just run modules") + parser.add_option("-P", "--prefetch", dest="prefetch", action="store_true", default=False, + help="Prefetch input files locally instead of accessing them via xrootd") + parser.add_option("--long-term-cache", dest="longTermCache", action="store_true", default=False, + help="Keep prefetched files across runs instead of deleting them at the end") + parser.add_option("-N", "--max-entries", dest="maxEntries", type="long", default=None, + help="Maximum number of entries to process from any single given input tree") + parser.add_option("--first-entry", dest="firstEntry", type="long", default=0, + help="First entry to process in the three (to be used together with --max-entries)") + parser.add_option("--justcount", dest="justcount", default=False, + action="store_true", help="Just report the number of selected events") + parser.add_option("-I", "--import", dest="imports", type="string", default=[], action="append", + nargs=2, help="Import modules (python package, comma-separated list of ") + parser.add_option("-z", "--compression", dest="compression", type="string", + default=("LZMA:9"), help="Compression: none, or (algo):(level) ") + + (options, args) = parser.parse_args() + + if options.friend: + if options.cut or options.json: + raise RuntimeError( + "Can't apply JSON or cut selection when producing friends") + + if len(args) < 2: + parser.print_help() + sys.exit(1) + outdir = args[0] + args = args[1:] + + modules = [] + for mod, names in options.imports: + import_module(mod) + obj = sys.modules[mod] + selnames = names.split(",") + mods = dir(obj) + for name in selnames: + if name in mods: + print("Loading %s from %s " % (name, mod)) + if type(getattr(obj, name)) == list: + for mod in getattr(obj, name): + modules.append(mod()) + else: + modules.append(getattr(obj, name)()) + if options.noOut: + if len(modules) == 0: + raise RuntimeError( + "Running with --noout and no modules does nothing!") + if options.branchsel != None: + options.branchsel_in = options.branchsel + options.branchsel_out = options.branchsel + p = PostProcessor(outdir, args, + cut=options.cut, + branchsel=options.branchsel_in, + modules=modules, + compression=options.compression, + friend=options.friend, + postfix=options.postfix, + jsonInput=options.json, + noOut=options.noOut, + justcount=options.justcount, + prefetch=options.prefetch, + longTermCache=options.longTermCache, + maxEntries=options.maxEntries, + firstEntry=options.firstEntry, + outputbranchsel=options.branchsel_out) + p.run() diff --git a/PhysicsTools/NanoAODTools/standalone/checkoutStandalone.sh b/PhysicsTools/NanoAODTools/standalone/checkoutStandalone.sh new file mode 100644 index 0000000000000..126150d71bcbf --- /dev/null +++ b/PhysicsTools/NanoAODTools/standalone/checkoutStandalone.sh @@ -0,0 +1,61 @@ +#! /bin/bash -e +# +# Check out PhysicsTools/NanoAODToos as a standalone package. +# + +DEST="MyProject" +REPO=https://github.com/cms-sw/cmssw.git +BRANCH=master + +while getopts ":d:r:b:" opt; do + case $opt in + d) DEST=$OPTARG ;; + r) REPO=$OPTARG ;; + b) BRANCH=$OPTARG ;; + :) echo "Error: -${OPTARG} requires an argument." + exit 1;; + *) echo "Options:" + echo "-d destination folder (default: MyProject)" + echo "-r repository (default: https://github.com/cms-sw/cmssw.git)" + echo "-b branch (default: master)" + exit 1 + esac +done + +echo "Checking out NanoAODTools in standalone mode in folder $DEST" + +# check if a shared reference repository is available, otherwise set up a personal one +if [ "$CMSSW_GIT_REFERENCE" = "" ]; then + if [ -e /cvmfs/cms-ib.cern.ch/git/cms-sw/cmssw.git ] ; then + CMSSW_GIT_REFERENCE=/cvmfs/cms-ib.cern.ch/git/cms-sw/cmssw.git + elif [ -e /cvmfs/cms.cern.ch/cmssw.git.daily ] ; then + CMSSW_GIT_REFERENCE=/cvmfs/cms.cern.ch/cmssw.git.daily + else + CMSSW_GIT_REFERENCE=None + fi +fi + + +if [ "$CMSSW_GIT_REFERENCE" != "None" ]; then + git clone --branch $BRANCH --no-checkout --reference $CMSSW_GIT_REFERENCE $REPO $DEST +else + # No reference repository available: make a local shallow clone + git clone --branch $BRANCH --depth 1 --no-checkout $REPO $DEST +fi + +# Setup sparse checkout (manually, to skip top-level files) +cd $DEST +git config core.sparsecheckout true +{ + echo "/.gitignore" + echo "/.clang-tidy" + echo "/.clang-format" + echo "!/*/" + echo "/PhysicsTools/" + echo "!/PhysicsTools/*/" + echo "/PhysicsTools/NanoAODTools/" +} > .git/info/sparse-checkout +git read-tree -mu HEAD + +# Initialize python module paths +source PhysicsTools/NanoAODTools/standalone/env_standalone.sh build diff --git a/PhysicsTools/NanoAODTools/standalone/env_standalone.sh b/PhysicsTools/NanoAODTools/standalone/env_standalone.sh new file mode 100644 index 0000000000000..87ad845f3e915 --- /dev/null +++ b/PhysicsTools/NanoAODTools/standalone/env_standalone.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +# Please setup python 3 and ROOT into your environment first + +CWD=$PWD +if [ ${BASH_SOURCE[0]:0:1} == "/" ]; then + FULLPATH=${BASH_SOURCE[0]} +else + FULLPATH=$PWD/${BASH_SOURCE[0]} +fi +cd ${FULLPATH/%env_standalone.sh/}/.. + +if [ -f standalone/env_standalone.sh ]; then + if [ ! -d build ]; then + if [ x${1} = 'xbuild' ]; then + mkdir -p build/lib/python/PhysicsTools + ln -s ../../../../python build/lib/python/PhysicsTools/NanoAODTools + echo "Build directory created, please source again standalone/env_standalone.sh without the build argument." + else + echo "Build directory is not yet present, please source again standalone/env_standalone.sh with the build argument." + fi + else + if [ x${1} = 'xbuild' ]; then + echo "Build directory is already present, please source again standalone/env_standalone.sh without the build argument." + else + find build/lib/python python -type d -execdir touch '{}/__init__.py' \; + export NANOAODTOOLS_BASE=${PWD} + export PYTHONPATH=${NANOAODTOOLS_BASE}/build/lib/python:${PYTHONPATH} + echo "NanoAODTools: Standalone environment set." + fi + fi + cd $CWD +else + echo "Error in moving to the NanoAODTools directory to setup the standalone environment" + cd $CWD +fi + diff --git a/PhysicsTools/NanoAODTools/test/BuildFile.xml b/PhysicsTools/NanoAODTools/test/BuildFile.xml new file mode 100644 index 0000000000000..97cf3ea7c4185 --- /dev/null +++ b/PhysicsTools/NanoAODTools/test/BuildFile.xml @@ -0,0 +1,6 @@ + + + + + + diff --git a/PhysicsTools/NanoAODTools/test/exampleAnalysis.py b/PhysicsTools/NanoAODTools/test/exampleAnalysis.py new file mode 100755 index 0000000000000..e9159437cce84 --- /dev/null +++ b/PhysicsTools/NanoAODTools/test/exampleAnalysis.py @@ -0,0 +1,56 @@ +#!/usr/bin/env python3 +# +# Example of filling an histogram with nanoAODTools in a plain python script. +# +from PhysicsTools.NanoAODTools.postprocessing.framework.eventloop import Module +from PhysicsTools.NanoAODTools.postprocessing.framework.datamodel import Collection +from PhysicsTools.NanoAODTools.postprocessing.framework.postprocessor import PostProcessor +from importlib import import_module +import os +import sys +import ROOT +ROOT.PyConfig.IgnoreCommandLineOptions = True + + +class ExampleAnalysis(Module): + def __init__(self): + self.writeHistFile = True + + def beginJob(self, histFile=None, histDirName=None): + Module.beginJob(self, histFile, histDirName) + + self.h_vpt = ROOT.TH1F('sumpt', 'sumpt', 100, 0, 1000) + self.addObject(self.h_vpt) + + def analyze(self, event): + electrons = Collection(event, "Electron") + muons = Collection(event, "Muon") + jets = Collection(event, "Jet") + eventSum = ROOT.TLorentzVector() + + # select events with at least 2 muons + if len(muons) >= 2: + for lep in muons: # loop on muons + eventSum += lep.p4() + for lep in electrons: # loop on electrons + eventSum += lep.p4() + for j in jets: # loop on jets + eventSum += j.p4() + self.h_vpt.Fill(eventSum.Pt()) # fill histogram + + return True + + +preselection = "Jet_pt[0] > 250" +files = ["root://eoscms.cern.ch//eos/cms/store/user/cmsbuild/store/group/cat/datasets/NANOAODSIM/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v2/DYJetsToLL_M-50_TuneCP5_13TeV-amcatnloFXFX-pythia8/7B930101-EB91-4F4E-9B90-0861460DBD94.root"] + +p = PostProcessor(".", + files, + cut=preselection, + branchsel=None, + modules=[ExampleAnalysis()], + noOut=True, + histFileName="histOut.root", + histDirName="plots", + ) +p.run() diff --git a/PhysicsTools/NanoAODTools/test/exampleGenDump.py b/PhysicsTools/NanoAODTools/test/exampleGenDump.py new file mode 100755 index 0000000000000..1d582c841576b --- /dev/null +++ b/PhysicsTools/NanoAODTools/test/exampleGenDump.py @@ -0,0 +1,133 @@ +#! /usr/bin/env python3 +# Sources: +# https://cms-nanoaod-integration.web.cern.ch/integration/master-106X/mc106Xul18_doc.html#GenPart +# https://github.com/cms-sw/cmssw/blob/master/PhysicsTools/NanoAOD/python/genparticles_cff.py +# https://github.com/cms-sw/cmssw/blob/master/PhysicsTools/NanoAOD/plugins/LHETablesProducer.cc +from __future__ import print_function # for python3 compatibility +from PhysicsTools.NanoAODTools.postprocessing.framework.postprocessor import PostProcessor +from PhysicsTools.NanoAODTools.postprocessing.framework.eventloop import Module +from PhysicsTools.NanoAODTools.postprocessing.framework.datamodel import Collection +import PhysicsTools.NanoAODTools.postprocessing.framework.datamodel as datamodel +datamodel.statusflags['isHardPrompt'] = datamodel.statusflags['isPrompt'] + datamodel.statusflags['fromHardProcess'] + +def hasbit(value,bit): + """Check if i'th bit is set to 1, i.e. binary of 2^i, + from the right to the left, starting from position i=0. + Example: hasbit(GenPart_statusFlags,0) -> isPrompt""" + return (value & (1 << bit))>0 + + +def getprodchain(part,genparts=None,event=None,decay=-1): + """Print production chain recursively.""" + chain = "%3s"%(part.pdgId) + imoth = part.genPartIdxMother + while imoth>=0: + if genparts is not None: + moth = genparts[imoth] + chain = "%3s -> "%(moth.pdgId)+chain + imoth = moth.genPartIdxMother + elif event is not None: + chain = "%3s -> "%(event.GenPart_pdgId[imoth])+chain + imoth = event.GenPart_genPartIdxMother[imoth] + if genparts is not None and decay>0: + chain = chain[:-3] # remove last particle + chain += getdecaychain(part,genparts,indent=len(chain),depth=decay-1) + return chain + + +def getdecaychain(part,genparts,indent=0,depth=999): + """Print decay chain recursively.""" + chain = "%3s"%(part.pdgId) + imoth = part._index + ndaus = 0 + indent_ = len(chain)+indent + for idau in range(imoth+1,genparts._len): + dau = genparts[idau] + if dau.genPartIdxMother==imoth: # found daughter + if ndaus>=1: + chain += '\n'+' '*indent_ + if depth>=2: + chain += " -> "+getdecaychain(dau,genparts,indent=indent_+4,depth=depth-1) + else: # stop recursion + chain += " -> %3s"%(dau.pdgId) + ndaus += 1 + return chain + + +# DUMPER MODULE +class LHEDumper(Module): + + def __init__(self): + self.nleptonic = 0 + self.ntauonic = 0 + self.nevents = 0 + + def analyze(self,event): + """Dump gen information for each gen particle in given event.""" + print("\n%s Event %s %s"%('-'*10,event.event,'-'*70)) + self.nevents += 1 + leptonic = False + tauonic = False + bosons = [ ] + taus = [ ] + particles = Collection(event,'GenPart') + #particles = Collection(event,'LHEPart') + print(" \033[4m%7s %7s %7s %7s %7s %7s %7s %7s %8s %9s %10s \033[0m"%( + "index","pdgId","moth","mothId","dR","pt","eta","status","prompt","taudecay","last copy")) + for i, particle in enumerate(particles): + mothidx = particle.genPartIdxMother + eta = max(-999,min(999,particle.eta)) + prompt = particle.statusflag('isPrompt') #hasbit(particle.statusFlags,0) + taudecay = particle.statusflag('isTauDecayProduct') #hasbit(particle.statusFlags,2) + lastcopy = particle.statusflag('isLastCopy') #hasbit(particle.statusFlags,13) + #ishardprompt = particle.statusflag('isHardPrompt') + if 0<=mothidx0: + print(" %-10s %4d / %-4d (%4.1f%%)"%('Tauonic: ',self.ntauonic, self.nevents,100.0*self.ntauonic/self.nevents)) + print(" %-10s %4d / %-4d (%4.1f%%)"%('Leptonic:',self.nleptonic,self.nevents,100.0*self.nleptonic/self.nevents)) + print("%s Done %s\n"%('-'*10,'-'*80)) + + +# PROCESS NANOAOD +url = "root://cms-xrd-global.cern.ch/" +from argparse import ArgumentParser +parser = ArgumentParser() +parser.add_argument('-i', '--infiles', nargs='+') +parser.add_argument('-o', '--outdir', default='.') +parser.add_argument('-n', '--maxevts', type=int, default=20) +args = parser.parse_args() +infiles = args.infiles or [ + url+'/store/mc/RunIISummer20UL18NanoAODv9/DYJetsToLL_M-50_TuneCP5_13TeV-madgraphMLM-pythia8/NANOAODSIM/106X_upgrade2018_realistic_v16_L1v1-v1/280000/525CD279-3344-6043-98B9-2EA8A96623E4.root', + #url+'/store/mc/RunIISummer20UL18NanoAODv9/TTTo2L2Nu_TuneCP5_13TeV-powheg-pythia8/NANOAODSIM/106X_upgrade2018_realistic_v16_L1v1-v1/130000/44187D37-0301-3942-A6F7-C723E9F4813D.root', +] +processor = PostProcessor(args.outdir,infiles,noOut=True,modules=[LHEDumper()],maxEntries=args.maxevts) +processor.run() diff --git a/PhysicsTools/NanoAODTools/test/example_postproc.py b/PhysicsTools/NanoAODTools/test/example_postproc.py new file mode 100755 index 0000000000000..fa0d2f7581f37 --- /dev/null +++ b/PhysicsTools/NanoAODTools/test/example_postproc.py @@ -0,0 +1,24 @@ +#!/usr/bin/env python3 +# +# Example of running the postprocessor to skim events with a cut, and +# adding a new variable using a Module. +# +from PhysicsTools.NanoAODTools.postprocessing.examples.exampleModule import * + +from PhysicsTools.NanoAODTools.postprocessing.framework.postprocessor import PostProcessor +from importlib import import_module +import os +import sys +import ROOT +ROOT.PyConfig.IgnoreCommandLineOptions = True + +fnames = ["root://eoscms.cern.ch//eos/cms/store/user/cmsbuild/store/group/cat/datasets/NANOAODSIM/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v2/DYJetsToLL_M-50_TuneCP5_13TeV-amcatnloFXFX-pythia8/7B930101-EB91-4F4E-9B90-0861460DBD94.root"] + +p = PostProcessor(outputDir=".", + inputFiles=fnames, + cut="Jet_pt>150", + modules=[exampleModuleConstr()], + provenance=True, + maxEntries=50000, #just read the first maxEntries events + ) +p.run() diff --git a/PhysicsTools/NanoAODTools/test/examples/MhtjuProducerCppWorker.cc b/PhysicsTools/NanoAODTools/test/examples/MhtjuProducerCppWorker.cc new file mode 100644 index 0000000000000..8e051e3fa0213 --- /dev/null +++ b/PhysicsTools/NanoAODTools/test/examples/MhtjuProducerCppWorker.cc @@ -0,0 +1,14 @@ +#include "MhtjuProducerCppWorker.h" + +std::pair MhtjuProducerCppWorker::getHT() { + TLorentzVector ht(0, 0, 0, 0); + + unsigned n = (*nJet).Get()[0]; + for (unsigned i = 0; i < n; i++) { + TLorentzVector j_add(0, 0, 0, 0); + j_add.SetPtEtaPhiM((*Jet_pt)[i], 0, (*Jet_phi)[i], 0); + ht += j_add; + } + + return std::pair(ht.Pt(), ht.Phi()); +}; diff --git a/PhysicsTools/NanoAODTools/test/examples/MhtjuProducerCppWorker.h b/PhysicsTools/NanoAODTools/test/examples/MhtjuProducerCppWorker.h new file mode 100644 index 0000000000000..6b4d5cfef4707 --- /dev/null +++ b/PhysicsTools/NanoAODTools/test/examples/MhtjuProducerCppWorker.h @@ -0,0 +1,29 @@ +#ifndef PhysicsTools_NanoAODTools_mhtjuProducerCppWorker_h +#define PhysicsTools_NanoAODTools_mhtjuProducerCppWorker_h +// Example of a C++ helper function that can be called from a python Module. +// see python/postprocessing/examples/mhtjuProducerCpp.py for more details. + +#include +#include +#include +#include + +class MhtjuProducerCppWorker { +public: + MhtjuProducerCppWorker() {} + + void setJets(TTreeReaderValue *nJet_, TTreeReaderArray *Jet_pt_, TTreeReaderArray *Jet_phi_) { + nJet = nJet_; + Jet_pt = Jet_pt_; + Jet_phi = Jet_phi_; + } + + std::pair getHT(); + +private: + TTreeReaderValue *nJet = nullptr; + TTreeReaderArray *Jet_pt = nullptr; + TTreeReaderArray *Jet_phi = nullptr; +}; + +#endif diff --git a/PhysicsTools/NanoAODTools/test/runTest.sh b/PhysicsTools/NanoAODTools/test/runTest.sh new file mode 100755 index 0000000000000..d3b391e27e9e0 --- /dev/null +++ b/PhysicsTools/NanoAODTools/test/runTest.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +function die { echo $1: status $2 ; exit $2; } + +echo "===== Test NanoAODTools functionality ====" + +(${SCRAM_TEST_PATH}/exampleAnalysis.py) || die "Falure in exampleAalysis" $? +(${SCRAM_TEST_PATH}/example_postproc.py) || die "Falure in example_postproc" $?