From 15cd83c9513acb2ff8dca94b40b2c675b8dfb75c Mon Sep 17 00:00:00 2001 From: Alexander Held <45009355+alexander-held@users.noreply.github.com> Date: Tue, 25 Jul 2023 14:50:03 -0400 Subject: [PATCH] fix: b-tagging threshold comparison (#182) * fix b-tagging threshold * improve event number formatting --- .../ttbar_analysis_pipeline.ipynb | 76 ++++++++++++------- .../ttbar_analysis_pipeline.py | 54 ++++++------- 2 files changed, 75 insertions(+), 55 deletions(-) diff --git a/analyses/cms-open-data-ttbar/ttbar_analysis_pipeline.ipynb b/analyses/cms-open-data-ttbar/ttbar_analysis_pipeline.ipynb index 9812bee3..3f8927dd 100644 --- a/analyses/cms-open-data-ttbar/ttbar_analysis_pipeline.ipynb +++ b/analyses/cms-open-data-ttbar/ttbar_analysis_pipeline.ipynb @@ -1,6 +1,7 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", "id": "0dc37683", "metadata": {}, @@ -24,6 +25,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "404927fe", "metadata": {}, @@ -35,6 +37,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "bd72d323", "metadata": {}, @@ -75,6 +78,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "cd573bb1", "metadata": {}, @@ -89,15 +93,15 @@ "\n", "| setting | number of files | total size | number of events |\n", "| --- | --- | --- | --- |\n", - "| `1` | 9 | 22.9 GB | 10455719 |\n", - "| `2` | 18 | 42.8 GB | 19497435 |\n", - "| `5` | 43 | 105 GB | 47996231 |\n", - "| `10` | 79 | 200 GB | 90546458 |\n", - "| `20` | 140 | 359 GB | 163123242 |\n", - "| `50` | 255 | 631 GB | 297247463 |\n", - "| `100` | 395 | 960 GB | 470397795 |\n", - "| `200` | 595 | 1.40 TB | 705273291 |\n", - "| `-1` | 787 | 1.78 TB | 940160174 |\n", + "| `1` | 9 | 22.9 GB | 10,455,719 |\n", + "| `2` | 18 | 42.8 GB | 19,497,435 |\n", + "| `5` | 43 | 105 GB | 47,996,231 |\n", + "| `10` | 79 | 200 GB | 90,546,458 |\n", + "| `20` | 140 | 359 GB | 163,123,242 |\n", + "| `50` | 255 | 631 GB | 297,247,463 |\n", + "| `100` | 395 | 960 GB | 470,397,795 |\n", + "| `200` | 595 | 1.40 TB | 705,273,291 |\n", + "| `-1` | 787 | 1.78 TB | 940,160,174 |\n", "\n", "The input files are all in the 1–3 GB range." ] @@ -131,6 +135,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "141d6520", "metadata": {}, @@ -288,7 +293,7 @@ " # Basic selection criteria\n", " selections.add(\"exactly_1l\", (ak.num(elecs) + ak.num(muons)) == 1)\n", " selections.add(\"atleast_4j\", ak.num(jets) >= 4)\n", - " selections.add(\"exactly_1b\", ak.sum(jets.btagCSVV2 >= B_TAG_THRESHOLD, axis=1) == 1)\n", + " selections.add(\"exactly_1b\", ak.sum(jets.btagCSVV2 > B_TAG_THRESHOLD, axis=1) == 1)\n", " selections.add(\"atleast_2b\", ak.sum(jets.btagCSVV2 > B_TAG_THRESHOLD, axis=1) >= 2)\n", " # Complex selection criteria\n", " selections.add(\"4j1b\", selections.all(\"exactly_1l\", \"atleast_4j\", \"exactly_1b\"))\n", @@ -402,6 +407,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "90dd4c9e", "metadata": {}, @@ -446,6 +452,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "b910d3d5", "metadata": {}, @@ -465,40 +472,40 @@ "outputs": [], "source": [ "def get_query(source: ObjectStream) -> ObjectStream:\n", - " \"\"\"Query for event / column selection: >=4j >=1b, ==1 lep with pT>30 GeV + additional cuts, \n", + " \"\"\"Query for event / column selection: >=4j >=1b, ==1 lep with pT>30 GeV + additional cuts,\n", " return relevant columns\n", " *NOTE* jet pT cut is set lower to account for systematic variations to jet pT\n", " \"\"\"\n", - " cuts = source.Where(lambda e: {\"pt\": e.Electron_pt, \n", - " \"eta\": e.Electron_eta, \n", - " \"cutBased\": e.Electron_cutBased, \n", + " cuts = source.Where(lambda e: {\"pt\": e.Electron_pt,\n", + " \"eta\": e.Electron_eta,\n", + " \"cutBased\": e.Electron_cutBased,\n", " \"sip3d\": e.Electron_sip3d,}.Zip()\\\n", " .Where(lambda electron: (electron.pt > 30\n", - " and abs(electron.eta) < 2.1 \n", + " and abs(electron.eta) < 2.1\n", " and electron.cutBased == 4\n", - " and electron.sip3d < 4)).Count() \n", - " + {\"pt\": e.Muon_pt, \n", + " and electron.sip3d < 4)).Count()\n", + " + {\"pt\": e.Muon_pt,\n", " \"eta\": e.Muon_eta,\n", " \"tightId\": e.Muon_tightId,\n", " \"sip3d\": e.Muon_sip3d,\n", " \"pfRelIso04_all\": e.Muon_pfRelIso04_all}.Zip()\\\n", - " .Where(lambda muon: (muon.pt > 30 \n", - " and abs(muon.eta) < 2.1 \n", - " and muon.tightId \n", + " .Where(lambda muon: (muon.pt > 30\n", + " and abs(muon.eta) < 2.1\n", + " and muon.tightId\n", " and muon.pfRelIso04_all < 0.15)).Count()== 1)\\\n", - " .Where(lambda f: {\"pt\": f.Jet_pt, \n", + " .Where(lambda f: {\"pt\": f.Jet_pt,\n", " \"eta\": f.Jet_eta,\n", " \"jetId\": f.Jet_jetId}.Zip()\\\n", - " .Where(lambda jet: (jet.pt > 25 \n", - " and abs(jet.eta) < 2.4 \n", + " .Where(lambda jet: (jet.pt > 25\n", + " and abs(jet.eta) < 2.4\n", " and jet.jetId == 6)).Count() >= 4)\\\n", - " .Where(lambda g: {\"pt\": g.Jet_pt, \n", + " .Where(lambda g: {\"pt\": g.Jet_pt,\n", " \"eta\": g.Jet_eta,\n", " \"btagCSVV2\": g.Jet_btagCSVV2,\n", " \"jetId\": g.Jet_jetId}.Zip()\\\n", - " .Where(lambda jet: (jet.btagCSVV2 >= 0.5 \n", + " .Where(lambda jet: (jet.btagCSVV2 > 0.5\n", " and jet.pt > 25\n", - " and abs(jet.eta) < 2.4) \n", + " and abs(jet.eta) < 2.4)\n", " and jet.jetId == 6).Count() >= 1)\n", " selection = cuts.Select(lambda h: {\"Electron_pt\": h.Electron_pt,\n", " \"Electron_eta\": h.Electron_eta,\n", @@ -524,7 +531,7 @@ " })\n", " if USE_INFERENCE:\n", " return selection\n", - " \n", + "\n", " # some branches are only needed if USE_INFERENCE is turned on\n", " return selection.Select(lambda h: {\"Electron_pt\": h.Electron_pt,\n", " \"Electron_eta\": h.Electron_eta,\n", @@ -545,6 +552,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "d8f08fc1", "metadata": {}, @@ -587,6 +595,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "c28a9e49", "metadata": {}, @@ -694,6 +703,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "d7bb4428", "metadata": {}, @@ -759,6 +769,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "bed3df8b", "metadata": {}, @@ -875,6 +886,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "7c334dd3", "metadata": {}, @@ -907,6 +919,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "e904cd3c", "metadata": {}, @@ -934,6 +947,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "f36dc601", "metadata": {}, @@ -981,6 +995,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "c74e4361", "metadata": {}, @@ -1018,6 +1033,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "bd480eec", "metadata": {}, @@ -1048,6 +1064,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "3a293479", "metadata": {}, @@ -1103,6 +1120,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "14dc4b23", "metadata": {}, @@ -1150,6 +1168,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "7f60c316", "metadata": {}, @@ -1293,6 +1312,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "a2ce2d14", "metadata": {}, @@ -1329,7 +1349,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.16" + "version": "3.9.16" } }, "nbformat": 4, diff --git a/analyses/cms-open-data-ttbar/ttbar_analysis_pipeline.py b/analyses/cms-open-data-ttbar/ttbar_analysis_pipeline.py index 16738b46..8bd90ade 100644 --- a/analyses/cms-open-data-ttbar/ttbar_analysis_pipeline.py +++ b/analyses/cms-open-data-ttbar/ttbar_analysis_pipeline.py @@ -74,15 +74,15 @@ # # | setting | number of files | total size | number of events | # | --- | --- | --- | --- | -# | `1` | 9 | 22.9 GB | 10455719 | -# | `2` | 18 | 42.8 GB | 19497435 | -# | `5` | 43 | 105 GB | 47996231 | -# | `10` | 79 | 200 GB | 90546458 | -# | `20` | 140 | 359 GB | 163123242 | -# | `50` | 255 | 631 GB | 297247463 | -# | `100` | 395 | 960 GB | 470397795 | -# | `200` | 595 | 1.40 TB | 705273291 | -# | `-1` | 787 | 1.78 TB | 940160174 | +# | `1` | 9 | 22.9 GB | 10,455,719 | +# | `2` | 18 | 42.8 GB | 19,497,435 | +# | `5` | 43 | 105 GB | 47,996,231 | +# | `10` | 79 | 200 GB | 90,546,458 | +# | `20` | 140 | 359 GB | 163,123,242 | +# | `50` | 255 | 631 GB | 297,247,463 | +# | `100` | 395 | 960 GB | 470,397,795 | +# | `200` | 595 | 1.40 TB | 705,273,291 | +# | `-1` | 787 | 1.78 TB | 940,160,174 | # # The input files are all in the 1–3 GB range. @@ -251,7 +251,7 @@ def process(self, events): # Basic selection criteria selections.add("exactly_1l", (ak.num(elecs) + ak.num(muons)) == 1) selections.add("atleast_4j", ak.num(jets) >= 4) - selections.add("exactly_1b", ak.sum(jets.btagCSVV2 >= B_TAG_THRESHOLD, axis=1) == 1) + selections.add("exactly_1b", ak.sum(jets.btagCSVV2 > B_TAG_THRESHOLD, axis=1) == 1) selections.add("atleast_2b", ak.sum(jets.btagCSVV2 > B_TAG_THRESHOLD, axis=1) >= 2) # Complex selection criteria selections.add("4j1b", selections.all("exactly_1l", "atleast_4j", "exactly_1b")) @@ -387,40 +387,40 @@ def postprocess(self, accumulator): # %% tags=[] def get_query(source: ObjectStream) -> ObjectStream: - """Query for event / column selection: >=4j >=1b, ==1 lep with pT>30 GeV + additional cuts, + """Query for event / column selection: >=4j >=1b, ==1 lep with pT>30 GeV + additional cuts, return relevant columns *NOTE* jet pT cut is set lower to account for systematic variations to jet pT """ - cuts = source.Where(lambda e: {"pt": e.Electron_pt, - "eta": e.Electron_eta, - "cutBased": e.Electron_cutBased, + cuts = source.Where(lambda e: {"pt": e.Electron_pt, + "eta": e.Electron_eta, + "cutBased": e.Electron_cutBased, "sip3d": e.Electron_sip3d,}.Zip()\ .Where(lambda electron: (electron.pt > 30 - and abs(electron.eta) < 2.1 + and abs(electron.eta) < 2.1 and electron.cutBased == 4 - and electron.sip3d < 4)).Count() - + {"pt": e.Muon_pt, + and electron.sip3d < 4)).Count() + + {"pt": e.Muon_pt, "eta": e.Muon_eta, "tightId": e.Muon_tightId, "sip3d": e.Muon_sip3d, "pfRelIso04_all": e.Muon_pfRelIso04_all}.Zip()\ - .Where(lambda muon: (muon.pt > 30 - and abs(muon.eta) < 2.1 - and muon.tightId + .Where(lambda muon: (muon.pt > 30 + and abs(muon.eta) < 2.1 + and muon.tightId and muon.pfRelIso04_all < 0.15)).Count()== 1)\ - .Where(lambda f: {"pt": f.Jet_pt, + .Where(lambda f: {"pt": f.Jet_pt, "eta": f.Jet_eta, "jetId": f.Jet_jetId}.Zip()\ - .Where(lambda jet: (jet.pt > 25 - and abs(jet.eta) < 2.4 + .Where(lambda jet: (jet.pt > 25 + and abs(jet.eta) < 2.4 and jet.jetId == 6)).Count() >= 4)\ - .Where(lambda g: {"pt": g.Jet_pt, + .Where(lambda g: {"pt": g.Jet_pt, "eta": g.Jet_eta, "btagCSVV2": g.Jet_btagCSVV2, "jetId": g.Jet_jetId}.Zip()\ - .Where(lambda jet: (jet.btagCSVV2 >= 0.5 + .Where(lambda jet: (jet.btagCSVV2 > 0.5 and jet.pt > 25 - and abs(jet.eta) < 2.4) + and abs(jet.eta) < 2.4) and jet.jetId == 6).Count() >= 1) selection = cuts.Select(lambda h: {"Electron_pt": h.Electron_pt, "Electron_eta": h.Electron_eta, @@ -446,7 +446,7 @@ def get_query(source: ObjectStream) -> ObjectStream: }) if USE_INFERENCE: return selection - + # some branches are only needed if USE_INFERENCE is turned on return selection.Select(lambda h: {"Electron_pt": h.Electron_pt, "Electron_eta": h.Electron_eta,