"""
Data containers backed by Python managed memory (Numpy arrays and Pandas
dataframes). This module is intended to substitute large parts of the C++
wrappers.
"""

import dataclasses

import numpy as np
import pandas as pd


@dataclasses.dataclass(eq=False)
class FKTableData:
    """
    Data contained in an FKTable

    Parameters
    ----------
    hadronic : bool
        Whether a hadronic (two PDFs) or a DIS (one PDF) convolution is needed.
    Q0 : float
        The scale at which the PDFs should be evaluated (in GeV).
    ndata : int
        The number of data points in the grid.
    xgrid : array, shape (nx)
        The points in x at which the PDFs should be evaluated.
    sigma : pd.DataFrame
        For hadronic data, the columns are the indexes in the ``NfxNf`` list of
        possible flavour combinations of two PDFs. The MultiIndex contains
        three keys: the data index, an index into ``xgrid`` for the first PDF
        and an index into ``xgrid`` for the second PDF, indicating the points
        in ``x`` at which the PDFs should be evaluated.

        For DIS data, the columns are indexes in the ``Nf`` list of flavours.
        The MultiIndex contains two keys: the data index and an index into
        ``xgrid`` indicating the points in ``x`` at which the PDF should be
        evaluated.
    metadata : dict
        Other information contained in the FKTable.
    """

    hadronic: bool
    Q0: float
    ndata: int
    xgrid: np.ndarray
    sigma: pd.DataFrame
    metadata: dict = dataclasses.field(default_factory=dict, repr=False)

    # TODO: When we move to something other than the current fktable format,
    # we should apply the cuts directly before loading the file.
    def with_cuts(self, cuts):
        """Return a copy of the FKTable with the cuts applied. The data index
        of the sigma operator (the outermost level) contains the data points
        that have been kept. The ndata property is updated to reflect the new
        number of data points. If cuts is None, return the object unmodified.

        Parameters
        ----------
        cuts : array_like or validphys.core.Cuts or None
            The cuts to be applied.

        Returns
        -------
        res : FKTableData
            A copy of the FKTable with the cuts applied.

        Notes
        -----
        The original number of points can be accessed with
        ``table.metadata['GridInfo'].ndata``.

        Examples
        --------
        >>> from validphys.fkparser import load_fktable
        >>> from validphys.loader import Loader
        >>> l = Loader()
        >>> ds = l.check_dataset('ATLASTTBARTOT', theoryid=53, cfac=('QCD',))
        >>> table = load_fktable(ds.fkspecs[0])
        >>> newtable = table.with_cuts([0, 1])
        >>> assert set(newtable.sigma.index.get_level_values(0)) == {0, 1}
        >>> assert newtable.ndata == 2
        >>> assert newtable.metadata['GridInfo'].ndata == 3
        """
        if hasattr(cuts, 'load'):
            cuts = cuts.load()
        if cuts is None:
            return self
        newndata = len(cuts)
        newsigma = self.sigma.loc[cuts]
        return dataclasses.replace(self, ndata=newndata, sigma=newsigma)
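
# A minimal sketch of the DIS ``sigma`` layout and of ``with_cuts`` (toy
# numbers, not a real FKTable; uses the module-level ``np``/``pd`` imports):
#
# >>> idx = pd.MultiIndex.from_product([[0, 1, 2], [0, 1]], names=['data', 'x'])
# >>> sigma = pd.DataFrame(np.ones((6, 2)), index=idx, columns=[0, 1])
# >>> fk = FKTableData(hadronic=False, Q0=1.65, ndata=3,
# ...                  xgrid=np.array([1e-3, 1e-1]), sigma=sigma)
# >>> fk.with_cuts([0, 1]).ndata
# 2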


@dataclasses.dataclass(eq=False)
class CFactorData:
    """
    Data contained in a CFactor

    Parameters
    ----------
    description : str
        Information on how the data was obtained.
    central_value : array, shape(ndata)
        The value of the cfactor for each data point.
    uncertainty : array, shape(ndata)
        The absolute uncertainty on the cfactor if available.
    """

    description: str
    central_value: np.ndarray
    uncertainty: np.ndarray
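
# A hedged sketch of applying a cfactor to theory predictions elementwise
# (toy numbers; the multiplicative convention is an assumption, not an API
# defined in this module):
#
# >>> cfac = CFactorData(description='toy K-factor',
# ...                    central_value=np.array([1.1, 1.2]),
# ...                    uncertainty=np.array([0.01, 0.02]))
# >>> np.array([10.0, 20.0]) * cfac.central_value
# array([11., 24.])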


@dataclasses.dataclass(eq=False)
class CommonData:
    """
    Data contained in Commondata files, relevant cuts applied.

    Parameters
    ----------
    setname : str
        Name of the dataset
    ndata : int
        Number of data points
    commondataproc : str
        Process type, one of 21 options
    nkin : int
        Number of kinematics specified
    kinematics : list of str with length nkin
        Kinematic variables kin1, kin2, kin3 ...
    nsys : int
        Number of systematics
    sysid : list of str with length nsys
        ID for systematic
    commondata_table : pd.DataFrame
        Pandas dataframe containing the commondata
    systype_table : pd.DataFrame
        Pandas dataframe containing the systype index
        for each systematic alongside the uncertainty
        type (ADD/MULT/RAND) and name
        (CORR/UNCORR/THEORYCORR/SKIP)
    """

    setname: str
    ndata: int
    commondataproc: str
    nkin: int
    nsys: int
    commondata_table: pd.DataFrame = dataclasses.field(repr=False)
    systype_table: pd.DataFrame = dataclasses.field(repr=False)
    systematics_table: pd.DataFrame = dataclasses.field(init=False, repr=False)

    def __post_init__(self):
        self.systematics_table = self.commondata_table.drop(
            columns=["process", "kin1", "kin2", "kin3", "data", "stat"]
        )

    def with_cuts(self, cuts):
        """A method to return a CommonData object where
        an integer mask has been applied, keeping only data
        points which pass the cuts.

        Note that if the first data point passes the cuts, the first entry
        of ``cuts`` should be ``0``.

        Parameters
        ----------
        cuts : list or validphys.core.Cuts or None
        """
        # Ensure that the cuts we're applying apply to this dataset;
        # we only check, however, if the cuts are of type :py:class:`validphys.core.Cuts`
        if hasattr(cuts, 'name') and self.setname != cuts.name:
            raise ValueError(
                f"The cuts provided are for {cuts.name} which does not apply "
                f"to this commondata file: {self.setname}"
            )

        if hasattr(cuts, 'load'):
            cuts = cuts.load()
        if cuts is None:
            return self

        # We must shift the cuts up by 1 since a cut of 0 implies the first data
        # point, while commondata indexing starts at 1.
        cuts = list(map(lambda x: x + 1, cuts))

        newndata = len(cuts)
        new_commondata_table = self.commondata_table.loc[cuts]
        return dataclasses.replace(
            self, ndata=newndata, commondata_table=new_commondata_table
        )
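
    # A hedged sketch of the zero- to one-based shift performed above
    # (``cd`` stands for any CommonData instance; hypothetical values):
    #
    # >>> cut_cd = cd.with_cuts([0, 2])         # zero-based mask: keep points 1 and 3
    # >>> list(cut_cd.commondata_table.index)   # commondata rows are one-based
    # [1, 3]
    # >>> cut_cd.ndata
    # 2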

    @property
    def central_values(self):
        return self.commondata_table["data"]

    @property
    def stat_errors(self):
        return self.commondata_table["stat"]

    @property
    def multiplicative_errors(self):
        """Returns the systematics which are multiplicative (systype is MULT)
        in a percentage format, with SKIP uncertainties removed.
        """
        mult_systype = self.systype_table[self.systype_table["type"] == "MULT"]
        # NOTE: Index with list here so that return is always a DataFrame, even
        # if N_sys = 1 (else a Series could be returned)
        mult_table = self.systematics_table.loc[:, ["MULT"]]
        # Minus 1 because iloc starts from 0, while the systype counting starts
        # from 1.
        mult_table = mult_table.iloc[:, mult_systype.index - 1]
        mult_table.columns = mult_systype["name"].to_numpy()
        return mult_table.loc[:, mult_table.columns != "SKIP"]

    @property
    def additive_errors(self):
        """Returns the systematics which are additive (systype is ADD) as
        absolute uncertainties (same units as data), with SKIP uncertainties
        removed.
        """
        add_systype = self.systype_table[self.systype_table["type"] == "ADD"]
        # NOTE: Index with list here so that return is always a DataFrame, even
        # if N_sys = 1 (else a Series could be returned)
        add_table = self.systematics_table.loc[:, ["ADD"]]
        # Minus 1 because iloc starts from 0, while the systype counting starts
        # from 1.
        add_table = add_table.iloc[:, add_systype.index - 1]
        add_table.columns = add_systype["name"].to_numpy()
        return add_table.loc[:, add_table.columns != "SKIP"]

    def systematic_errors(self, central_values=None):
        """Returns all systematic errors as absolute uncertainties, with a
        single column for each uncertainty. Converts
        :py:attr:`multiplicative_errors` to units of data and then appends
        them onto :py:attr:`additive_errors`. By default the experimental
        central values are used to perform the conversion, but the user can
        supply a 1-D array of central values, with length
        :py:attr:`self.ndata`, to use instead when calculating the absolute
        contribution of the multiplicative systematics.

        Parameters
        ----------
        central_values : None, np.ndarray
            1-D array containing alternative central values to combine with
            the multiplicative uncertainties. This array must have length
            equal to :py:attr:`self.ndata`. By default ``central_values`` is
            None, and the central values of the commondata are used.

        Returns
        -------
        systematic_errors : pd.DataFrame
            Dataframe containing systematic errors.
        """
        if central_values is None:
            central_values = self.central_values.to_numpy()
        converted_mult_errors = (
            self.multiplicative_errors * central_values[:, np.newaxis] / 100
        )
        return pd.concat((self.additive_errors, converted_mult_errors), axis=1)
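
# A hedged end-to-end sketch with a toy two-point, two-systematic commondata
# (hypothetical numbers; the column layout mirrors the one assumed by
# ``__post_init__``, with one ADD and one MULT column per systematic):
#
# >>> raw = pd.DataFrame(
# ...     [['DIS_NCE', 0.1, 10.0, 0.0, 1.0, 0.05, 0.02, 2.0, 0.03, 3.0],
# ...      ['DIS_NCE', 0.2, 20.0, 0.0, 2.0, 0.05, 0.04, 2.0, 0.06, 3.0]],
# ...     columns=['process', 'kin1', 'kin2', 'kin3', 'data', 'stat',
# ...              'ADD', 'MULT', 'ADD', 'MULT'],
# ...     index=[1, 2])
# >>> systypes = pd.DataFrame({'type': ['ADD', 'MULT'],
# ...                          'name': ['CORR', 'UNCORR']}, index=[1, 2])
# >>> cd = CommonData(setname='TOY', ndata=2, commondataproc='DIS_NCE',
# ...                 nkin=3, nsys=2, commondata_table=raw,
# ...                 systype_table=systypes)
# >>> cd.additive_errors['CORR'].tolist()        # already in units of the data
# [0.02, 0.04]
# >>> cd.systematic_errors()['UNCORR'].tolist()  # 3% of the central values
# [0.03, 0.06]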