-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathTMVAClassification.C
311 lines (249 loc) · 15.1 KB
/
TMVAClassification.C
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
// @(#)root/tmva $Id: TMVAClassification.C 44112 2012-05-04 10:00:41Z evt $
/**********************************************************************************
* Project : TMVA - a ROOT-integrated toolkit for multivariate data analysis *
* Package : TMVA *
* Root Macro: TMVAClassification *
* *
* This macro provides examples for the training and testing of the *
* TMVA classifiers. *
* *
* As input data is used a toy-MC sample consisting of four Gaussian-distributed *
* and linearly correlated input variables. *
* *
* The methods to be used can be switched on and off by means of booleans, or *
* via the prompt command, for example: *
* *
* root -l ./TMVAClassification.C\(\"Fisher,Likelihood\"\) *
* *
* (note that the backslashes are mandatory) *
* If no method given, a default set of classifiers is used. *
* *
* The output file "TMVA.root" can be analysed with the use of dedicated *
* macros (simply say: root -l <macro.C>), which can be conveniently *
* invoked through a GUI that will appear at the end of the run of this macro. *
* Launch the GUI via the command: *
* *
* root -l ./TMVAGui.C *
* *
**********************************************************************************/
#include <cstdlib>
#include <iostream>
#include <map>
#include <string>
#include "TChain.h"
#include "TFile.h"
#include "TTree.h"
#include "TString.h"
#include "TObjString.h"
#include "TSystem.h"
#include "TROOT.h"
#if not defined(__CINT__) || defined(__MAKECINT__)
// needs to be included when makecint runs (ACLIC)
#include "TMVA/Factory.h"
#include "TMVA/Tools.h"
#endif
void TMVAClassification( TString myMethodList = "" )
{
   // Trains and evaluates TMVA classifiers (here: BDT variants) to separate
   // light-quark jets (signal file) from gluon jets (background file) in
   // DY + inclusive-jet events, using jet shape/track variables as inputs.
   //
   // The explicit loading of the shared libTMVA is done in TMVAlogon.C, defined in .rootrc
   // if you use your private .rootrc, or run from a different directory, please copy the
   // corresponding lines from .rootrc
   //
   // Methods to be processed can be given as an argument; use format:
   //
   //    mylinux~> root -l TMVAClassification.C\(\"myMethod1,myMethod2,myMethod3\"\)
   //
   // If you like to use a method via the plugin mechanism, we recommend using
   //
   //    mylinux~> root -l TMVAClassification.C\(\"P_myMethod\"\)
   //
   // (the real application is when you write your own method)
   //---------------------------------------------------------------

   // This loads the TMVA library
   TMVA::Tools::Instance();

   // to get access to the GUI and all tmva macros
   TString tmva_dir(TString(gRootDir) + "/tmva");
   if(gSystem->Getenv("TMVASYS"))
      tmva_dir = TString(gSystem->Getenv("TMVASYS"));
   gROOT->SetMacroPath(tmva_dir + "/test/:" + gROOT->GetMacroPath() );
   gROOT->ProcessLine(".L TMVAGui.C");

   // Default MVA methods to be trained + tested (1 = on, 0 = off)
   std::map<std::string,int> Use;
   // --- Boosted Decision Trees
   Use["BDT"]  = 0; // uses Adaptive Boost
   Use["BDTG"] = 1; // uses Gradient Boost
   Use["BDTB"] = 0; // uses Bagging
   Use["BDTD"] = 0; // decorrelation + Adaptive Boost
   Use["BDTF"] = 0; // allow usage of fisher discriminant for node splitting
   // ---------------------------------------------------------------

   std::cout << std::endl;
   std::cout << "==> Start TMVAClassification" << std::endl;

   // Select methods from the command-line list (overrides the defaults above)
   if (myMethodList != "") {
      for (std::map<std::string,int>::iterator it = Use.begin(); it != Use.end(); it++) it->second = 0;
      std::vector<TString> mlist = TMVA::gTools().SplitString( myMethodList, ',' );
      for (UInt_t i=0; i<mlist.size(); i++) {
         std::string regMethod(mlist[i]);
         if (Use.find(regMethod) == Use.end()) {
            std::cout << "Method \"" << regMethod << "\" not known in TMVA under this name. Choose among the following:" << std::endl;
            for (std::map<std::string,int>::iterator it = Use.begin(); it != Use.end(); it++) std::cout << it->first << " ";
            std::cout << std::endl;
            return;
         }
         Use[regMethod] = 1;
      }
   }

   // --------------------------------------------------------------------------------------------------
   // --- Here the preparation phase begins

   // Create a ROOT output file where TMVA will store ntuples, histograms, etc.
   TString outfileName( "TMVA.root" );
   TFile* outputFile = TFile::Open( outfileName, "RECREATE" );

   // Create the factory object. Later you can choose the methods
   // whose performance you'd like to investigate. The factory is
   // the only TMVA object you have to interact with.
   //
   // The first argument is the base of the name of all the
   // weightfiles in the directory weight/
   //
   // The second argument is the output file for the training results.
   // All TMVA output can be suppressed by removing the "!" (not) in
   // front of the "Silent" argument in the option string
   TMVA::Factory *factory = new TMVA::Factory( "TMVAClassification", outputFile,
                                               "!V:!Silent:Color:DrawProgressBar:Transformations=I;D;P;G,D:AnalysisType=Classification" );

   // If you wish to modify default settings
   // (please check "src/Config.h" to see all available global options)
   //    (TMVA::gConfig().GetVariablePlotting()).fTimesRMS = 8.0;
   //    (TMVA::gConfig().GetIONames()).fWeightFileDir = "myWeightDirectory";

   // Define the input variables that shall be used for the MVA training.
   // Note that you may also use variable expressions, such as: "3*var1/var2*abs(var3)"
   // [all types of expressions that can also be parsed by TTree::Draw( "expression" )]
   factory->AddVariable( "Pt_J",  'F' );
   factory->AddVariable( "PTD_J", 'F' );
   // factory->AddVariable( "rPt_tr", 'F' );
   factory->AddVariable( "N_tr",  'F' );
   factory->AddVariable( "D1_tr", 'F' );
   factory->AddVariable( "D2_tr", 'F' );

   // You can add so-called "Spectator variables", which are not used in the MVA training,
   // but will appear in the final "TestTree" produced by TMVA. This TestTree will contain the
   // input variables, the response values of all trained MVAs, and the spectator variables
   // factory->AddSpectator( "spec1 := var1*2", "Spectator 1", "units", 'F' );
   // factory->AddSpectator( "spec2 := var1*3", "Spectator 2", "units", 'F' );

   // Read training and test data
   // (it is also possible to use ASCII format as input -> see TMVA Users Guide)
   TString fname1 = "./DY_incl_MVA_InclJet_lquark03.root";
   TFile *input1 = TFile::Open( fname1 );
   std::cout << "--- TMVAClassification : Using Signal input file: " << input1->GetName() << std::endl;

   // --- Register the training and test trees
   TTree *signal = (TTree*)input1->Get("DY_MVA");

   TString fname2 = "./DY_incl_MVA_InclJet_gluon03.root";
   TFile *input2 = TFile::Open( fname2 );
   std::cout << "--- TMVAClassification : Using Bg input file: " << input2->GetName() << std::endl;
   TTree *background = (TTree*)input2->Get("DY_MVA");

   // global event weights per tree (see below for setting event-wise weights)
   Double_t signalWeight     = 1.0;
   Double_t backgroundWeight = 1.0;

   // You can add an arbitrary number of signal or background trees
   factory->AddSignalTree    ( signal,     signalWeight     );
   factory->AddBackgroundTree( background, backgroundWeight );

   // To give different trees for training and testing, do as follows:
   //    factory->AddSignalTree( signalTrainingTree, signalTrainWeight, "Training" );
   //    factory->AddSignalTree( signalTestTree,     signalTestWeight,  "Test" );
   //
   // It is also possible to add events "by hand" via
   // factory->AddSignalTrainingEvent(...) / AddBackgroundTrainingEvent(...)
   // (and the corresponding *TestEvent methods); in that case do not give
   // expressions in the variable definition but compute them before adding
   // the event. See the TMVA Users Guide for a full example.
   //
   // --- end of tree registration

   // Set individual event weights (the expression must be computable from
   // branches of the original TTree); both samples carry their per-event
   // weight in the "Wei" branch.
   // NOTE: this line previously read
   //    for signal : factory->SetSignalWeightExpression ("Wei");
   // which is not valid C++ — the stray "for signal :"/"for background:"
   // prose has been removed so the macro compiles.
   factory->SetSignalWeightExpression    ("Wei");
   factory->SetBackgroundWeightExpression("Wei");

   // Apply additional cuts on the signal and background samples (can be different)
   TCut mycuts = ""; // for example: TCut mycuts = "abs(var1)<0.5 && abs(var2-0.5)<1";
   TCut mycutb = ""; // for example: TCut mycutb = "abs(var1)<0.5";

   // Tell the factory how to use the training and testing events.
   //
   // If no numbers of events are given, half of the events in the tree are used
   // for training, and the other half for testing:
   //    factory->PrepareTrainingAndTestTree( mycut, "SplitMode=random:!V" );
   // To also specify the number of testing events, use:
   //    factory->PrepareTrainingAndTestTree( mycut,
   //                                         "NSigTrain=3000:NBkgTrain=3000:NSigTest=3000:NBkgTest=3000:SplitMode=Random:!V" );
   factory->PrepareTrainingAndTestTree( mycuts, mycutb,
                                        "nTrain_Signal=0:nTrain_Background=0:SplitMode=Random:NormMode=NumEvents:!V" );

   // ---- Book MVA methods
   //
   // Please lookup the various method configuration options in the corresponding cxx files, eg:
   // src/MethoCuts.cxx, etc, or here: http://tmva.sourceforge.net/optionRef.html
   // it is possible to preset ranges in the option string in which the cut optimisation should be done:
   // "...:CutRangeMin[2]=-1:CutRangeMax[2]=1"...", where [2] is the third input variable

   // Boosted Decision Trees
   if (Use["BDTG"]) // Gradient Boost
      factory->BookMethod( TMVA::Types::kBDT, "BDTG",
                           "!H:!V:NTrees=1000:BoostType=Grad:Shrinkage=0.10:UseBaggedGrad:GradBaggingFraction=0.5:nCuts=20:NNodesMax=5" );
   if (Use["BDT"])  // Adaptive Boost
      factory->BookMethod( TMVA::Types::kBDT, "BDT",
                           "!H:!V:NTrees=850:nEventsMin=150:MaxDepth=3:BoostType=AdaBoost:AdaBoostBeta=0.5:SeparationType=GiniIndex:nCuts=20:PruneMethod=NoPruning" );
   if (Use["BDTB"]) // Bagging
      factory->BookMethod( TMVA::Types::kBDT, "BDTB",
                           "!H:!V:NTrees=400:BoostType=Bagging:SeparationType=GiniIndex:nCuts=20:PruneMethod=NoPruning" );
   if (Use["BDTD"]) // Decorrelation + Adaptive Boost
      factory->BookMethod( TMVA::Types::kBDT, "BDTD",
                           "!H:!V:NTrees=400:nEventsMin=400:MaxDepth=3:BoostType=AdaBoost:SeparationType=GiniIndex:nCuts=20:PruneMethod=NoPruning:VarTransform=Decorrelate" );
   if (Use["BDTF"]) // Allow Using Fisher discriminant in node splitting for (strong) linearly correlated variables
      factory->BookMethod( TMVA::Types::kBDT, "BDTMitFisher",
                           "!H:!V:NTrees=50:nEventsMin=150:UseFisherCuts:MaxDepth=3:BoostType=AdaBoost:AdaBoostBeta=0.5:SeparationType=GiniIndex:nCuts=20:PruneMethod=NoPruning" );

   // For an example of the category classifier usage, see: TMVAClassificationCategory

   // --------------------------------------------------------------------------------------------------
   // ---- Now you can optimize the setting (configuration) of the MVAs using the set of training events
   // factory->OptimizeAllMethods("SigEffAt001","Scan");
   // factory->OptimizeAllMethods("ROCIntegral","GA");
   // --------------------------------------------------------------------------------------------------

   // ---- Now you can tell the factory to train, test, and evaluate the MVAs

   // Train MVAs using the set of training events
   factory->TrainAllMethods();

   // ---- Evaluate all MVAs using the set of test events
   factory->TestAllMethods();

   // ----- Evaluate and compare performance of all configured MVAs
   factory->EvaluateAllMethods();

   // --------------------------------------------------------------

   // Save the output
   outputFile->Close();

   std::cout << "==> Wrote root file: " << outputFile->GetName() << std::endl;
   std::cout << "==> TMVAClassification is done!" << std::endl;

   delete factory;

   // Launch the GUI for the root macros (loaded above via ".L TMVAGui.C")
   if (!gROOT->IsBatch()) TMVAGui( outfileName );
}