-
Notifications
You must be signed in to change notification settings - Fork 14
/
Objective.py
303 lines (230 loc) · 7.08 KB
/
Objective.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
###
# Edge prediction algorithm - objective functions
# Developer: Dan Bean, King's College London
# Contact: [email protected]
###
import numpy as np
class Objective:
    """Calculate several standard metrics that can be used as an objective function.

    For a given set of scores and known source nodes from the graph, this class calculates several
    standard metrics: F0.5, F1, F2, J, accuracy, precision, recall, false discovery rate, false positive
    rate and the contingency table. These metrics and the hits from the scores according to the threshold
    are returned, with the hits also broken down into new hits (not known in the graph) and known hits
    (edges already present in the graph).

    The intended use of this class is via the Objective.evaluate method, which will calculate and return
    the above properties for a given threshold. This is used to evaluate possible thresholds for a set of
    scores and find the optimum threshold to maximise the objective function in the EdgePrediction library.

    Parameters
    ----------
    score : numpy float array
        The score for every source node in the graph. Must be in the same order as the 'known' parameter.
    known : numpy bool array
        For every source node, value is True if there is an edge of the type being predicted to the target
        node, False otherwise. Must be in the same order as the 'score' parameter.

    Attributes
    ----------
    score : numpy float array
        The scores exactly as passed in.
    known : numpy bool array
        The known-edge flags exactly as passed in.
    n_known : float
        The number of known source nodes in the graph, calculated from the known parameter
    n_total : float
        The total number of source nodes in the graph, calculated from the known parameter

    Notes
    -----
    Every ratio-based metric (precision, recall, accuracy, FDR, FPR and the two terms of Youden's J)
    is defined as 0.0 when its denominator is zero, e.g. precision when the threshold produces no hits.
    This keeps the metrics finite and comparable when scanning thresholds, instead of producing NaN
    (numpy inputs) or raising ZeroDivisionError (plain Python numbers).
    """

    def __init__(self, score, known):
        self.score = score
        self.known = known
        # Stored as floats so later arithmetic on the counts is always floating point.
        self.n_known = float(np.sum(known))
        self.n_total = float(len(known))

    @staticmethod
    def _ratio(numerator, denominator):
        """Return numerator / denominator as a float, or 0.0 if the denominator is zero."""
        if denominator == 0:
            return 0.0
        return float(numerator) / denominator

    def evaluate(self, threshold):
        """Calculate metrics and hits for a given threshold

        Parameters
        ----------
        threshold : float
            The threshold applied to the source node scores to determine which nodes are hits. Any nodes
            with score >= threshold are considered hits.

        Returns
        -------
        result : dict
            Contains all the calculated metrics, contingency table and hits, under the keys:
            'F1', 'F05', 'F2' (F-beta for beta = 1, 0.5, 2), 'ACC', 'J', 'PREC', 'REC', 'FDR', 'FPR',
            'hits_total' (number of hits), 'hits_new' (hits that are not known), 'hits_known' (hits that
            are known, i.e. the true positives), 'is_hit' (boolean mask of score >= threshold) and
            'contingency' (dict with keys 'tp', 'fp', 'tn', 'fn').
        """
        is_hit = self.score >= threshold
        n_hits = float(np.sum(is_hit))
        tp, fp, tn, fn = self.contingency(threshold)
        # Every hit that is not already a known edge is a new prediction.
        n_new_predictions = n_hits - tp

        prec = self.precision(tp, fp, tn, fn)
        rec = self.recall(tp, fp, tn, fn)

        result = {
            'F1': self.f_beta(prec, rec),
            'F05': self.f_beta(prec, rec, 0.5),
            'F2': self.f_beta(prec, rec, 2),
            'ACC': self.accuracy(tp, fp, tn, fn),
            'J': self.youden_j(tp, fp, tn, fn),
            'PREC': prec,
            'REC': rec,
            'FDR': self.falseDiscoveryRate(tp, fp, tn, fn),
            'FPR': self.falsePositiveRate(tp, fp, tn, fn),
            'hits_total': n_hits,
            'hits_new': n_new_predictions,
            'hits_known': tp,
            'is_hit': is_hit,
        }
        result['contingency'] = {'tp': tp, 'fp': fp, 'tn': tn, 'fn': fn}
        return result

    def contingency(self, threshold):
        """Generate a contingency table

        Parameters
        ----------
        threshold : float
            The threshold applied to the source node scores to determine which nodes are hits. Any nodes
            with score >= threshold are considered hits.

        Returns
        -------
        tp, fp, tn, fn : int ; float
            tuple of true positives, false positives, true negatives, false negatives
        """
        is_hit = self.score >= threshold
        n_hits = float(np.sum(is_hit))
        # Known nodes among the hits are true positives; the remaining hits are false positives.
        tp = np.sum(self.known[is_hit])
        fp = n_hits - tp
        # Known nodes scoring strictly below the threshold were missed.
        fn = np.sum(self.known[self.score < threshold])
        # Whatever is left of the total is counted as a true negative.
        tn = self.n_total - tp - fp - fn
        return tp, fp, tn, fn

    def f_beta(self, prec, rec, beta=1):
        """F-beta statistic for any beta.

        'The effectiveness of retrieval with respect to a user who places beta times as much importance
        to recall as precision' - Van Rijsbergen, C. J. (1979). Information Retrieval (2nd ed.).
        Used to calculate F0.5, F1, F2.

        Parameters
        ----------
        prec : float
            precision of the model
        rec : float
            recall of the model
        beta : int ; float
            beta parameter of F statistic, relative importance of recall over precision

        Returns
        -------
        f : float
            The F-beta statistic. 0.0 if either precision or recall is zero.
        """
        if prec == 0 or rec == 0:
            # The general formula would divide by zero when both are zero, and is 0 anyway otherwise.
            return 0.0
        beta_sq = beta ** 2
        return (1 + beta_sq) * ((prec * rec) / ((beta_sq * prec) + rec))

    def accuracy(self, tp, fp, tn, fn):
        """Calculate model accuracy

        The proportion of all predictions (for the positive or negative class) from the model that are
        correct.

        Parameters
        ----------
        tp : int ; float
            True positives of model
        fp : int ; float
            False positives of model
        tn : int ; float
            True negatives of model
        fn : int ; float
            False negatives of model

        Returns
        -------
        acc : float
            Accuracy of model. 0.0 if all four counts are zero.
        """
        return self._ratio(float(tp) + tn, tp + fp + tn + fn)

    def youden_j(self, tp, fp, tn, fn):
        """Youden's J statistic

        J = sensitivity + specificity - 1 = TP/(TP+FN) + TN/(TN+FP) - 1

        Parameters
        ----------
        tp : int ; float
            True positives of model
        fp : int ; float
            False positives of model
        tn : int ; float
            True negatives of model
        fn : int ; float
            False negatives of model

        Returns
        -------
        j : float
            Youden's J statistic. A sensitivity or specificity term whose denominator is zero
            contributes 0.0.
        """
        sensitivity = self._ratio(tp, tp + fn)
        specificity = self._ratio(tn, tn + fp)
        return sensitivity + specificity - 1

    def precision(self, tp, fp, tn, fn):
        """Calculate precision of model

        The proportion of all positives from the model that are true positives.

        Parameters
        ----------
        tp : int ; float
            True positives of model
        fp : int ; float
            False positives of model
        tn : int ; float
            True negatives of model
        fn : int ; float
            False negatives of model

        Returns
        -------
        prec : float
            Precision of the model. 0.0 if the model made no positive predictions.
        """
        return self._ratio(tp, tp + fp)

    def recall(self, tp, fp, tn, fn):
        """Calculate recall of model

        The proportion of all positives in the population that are predicted positive by the model.

        Parameters
        ----------
        tp : int ; float
            True positives of model
        fp : int ; float
            False positives of model
        tn : int ; float
            True negatives of model
        fn : int ; float
            False negatives of model

        Returns
        -------
        rec : float
            Recall of model. 0.0 if there are no positives in the population.
        """
        return self._ratio(tp, tp + fn)

    def falseDiscoveryRate(self, tp, fp, tn, fn):
        """Calculate False Discovery Rate (FDR)

        The proportion of all positive predictions that are false positives.

        Parameters
        ----------
        tp : int ; float
            True positives of model
        fp : int ; float
            False positives of model
        tn : int ; float
            True negatives of model
        fn : int ; float
            False negatives of model

        Returns
        -------
        rate : float
            The false discovery rate. 0.0 if the model made no positive predictions.
        """
        return self._ratio(fp, fp + tp)

    def falsePositiveRate(self, tp, fp, tn, fn):
        """Calculate the False Positive Rate (FPR)

        Proportion of actual negatives that are predicted positive.

        Parameters
        ----------
        tp : int ; float
            True positives of model
        fp : int ; float
            False positives of model
        tn : int ; float
            True negatives of model
        fn : int ; float
            False negatives of model

        Returns
        -------
        rate : float
            The false positive rate. 0.0 if there are no actual negatives.
        """
        return self._ratio(fp, tn + fp)