# eval_metrics.py
import sys

import numpy as np

def obtain_asv_error_rates(tar_asv, non_asv, spoof_asv, asv_threshold):

    # False alarm and miss rates for ASV
    Pfa_asv = np.sum(non_asv >= asv_threshold) / non_asv.size
    Pmiss_asv = np.sum(tar_asv < asv_threshold) / tar_asv.size

    # Rate of rejecting spoofs in ASV
    if spoof_asv.size == 0:
        Pmiss_spoof_asv = None
    else:
        Pmiss_spoof_asv = np.sum(spoof_asv < asv_threshold) / spoof_asv.size

    return Pfa_asv, Pmiss_asv, Pmiss_spoof_asv
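
# An illustrative sketch (assumed helper, synthetic Gaussian scores and an
# assumed threshold of 0.0 -- none of this is part of the original evaluation
# protocol) of how the ASV error rates might be obtained at a fixed ASV
# operating point before being passed on to compute_tDCF below.
def _demo_asv_error_rates():
    rng = np.random.default_rng(0)
    tar_asv = rng.normal(3.0, 1.0, 1000)    # target-speaker trials
    non_asv = rng.normal(-3.0, 1.0, 1000)   # zero-effort impostor trials
    spoof_asv = rng.normal(1.0, 1.5, 1000)  # spoofed trials
    Pfa_asv, Pmiss_asv, Pmiss_spoof_asv = obtain_asv_error_rates(tar_asv, non_asv, spoof_asv, asv_threshold=0.0)
    print('Pfa_asv={:.4f}  Pmiss_asv={:.4f}  Pmiss_spoof_asv={:.4f}'.format(Pfa_asv, Pmiss_asv, Pmiss_spoof_asv))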

def compute_det_curve(target_scores, nontarget_scores):

    n_scores = target_scores.size + nontarget_scores.size
    all_scores = np.concatenate((target_scores, nontarget_scores))
    labels = np.concatenate((np.ones(target_scores.size), np.zeros(nontarget_scores.size)))

    # Sort labels based on scores
    indices = np.argsort(all_scores, kind='mergesort')
    labels = labels[indices]

    # Compute false rejection and false acceptance rates
    tar_trial_sums = np.cumsum(labels)
    nontarget_trial_sums = nontarget_scores.size - (np.arange(1, n_scores + 1) - tar_trial_sums)

    frr = np.concatenate((np.atleast_1d(0), tar_trial_sums / target_scores.size))  # false rejection rates
    far = np.concatenate((np.atleast_1d(1), nontarget_trial_sums / nontarget_scores.size))  # false acceptance rates
    thresholds = np.concatenate((np.atleast_1d(all_scores[indices[0]] - 0.001), all_scores[indices]))  # Thresholds are the sorted scores

    return frr, far, thresholds
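
# A small illustrative sketch (not part of the original script) showing the
# shape of compute_det_curve's output on made-up scores: the false rejection
# rate rises and the false acceptance rate falls as the threshold sweeps
# through the sorted scores. The helper name `_demo_det_curve` and the toy
# score vectors are assumptions.
def _demo_det_curve():
    target_scores = np.array([0.9, 0.8, 0.7, 0.4])
    nontarget_scores = np.array([0.6, 0.3, 0.2, 0.1])
    frr, far, thresholds = compute_det_curve(target_scores, nontarget_scores)
    # frr, far and thresholds each have target_scores.size + nontarget_scores.size + 1 entries
    for t, miss, fa in zip(thresholds, frr, far):
        print('threshold={:+.3f}  FRR={:.2f}  FAR={:.2f}'.format(t, miss, fa))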

def compute_eer(target_scores, nontarget_scores):
    """ Returns equal error rate (EER) and the corresponding threshold. """
    frr, far, thresholds = compute_det_curve(target_scores, nontarget_scores)
    abs_diffs = np.abs(frr - far)
    min_index = np.argmin(abs_diffs)
    eer = np.mean((frr[min_index], far[min_index]))
    return eer, thresholds[min_index]
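
# A minimal usage sketch for compute_eer on synthetic, well-separated score
# distributions; with the means below the EER should come out small (on the
# order of a few percent). The helper name `_demo_eer` and the synthetic
# scores are assumptions, not part of the original evaluation protocol.
def _demo_eer():
    rng = np.random.default_rng(0)
    bona_fide_scores = rng.normal(2.0, 1.0, 5000)   # positive class (bona fide)
    spoof_scores = rng.normal(-2.0, 1.0, 5000)      # negative class (spoof)
    eer, threshold = compute_eer(bona_fide_scores, spoof_scores)
    print('EER = {:.2%} at CM threshold {:.3f}'.format(eer, threshold))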

def compute_tDCF(bonafide_score_cm, spoof_score_cm, Pfa_asv, Pmiss_asv, Pmiss_spoof_asv, cost_model, print_cost):
    """
    Compute Tandem Detection Cost Function (t-DCF) [1] for a fixed ASV system.

    In brief, t-DCF returns a detection cost of a cascaded system of this form,

      Speech waveform -> [CM] -> [ASV] -> decision

    where CM stands for countermeasure and ASV for automatic speaker
    verification. The CM is therefore used as a 'gate' to decide whether or
    not the input speech sample should be passed onwards to the ASV system.
    Generally, both the CM and the ASV can make detection errors. Not all of
    those errors are necessarily equally costly, and not all types of users
    are necessarily equally likely. The tandem t-DCF gives a principled way
    to compare different spoofing countermeasures under a detection cost
    function framework that takes that information into account.

    INPUTS:

      bonafide_score_cm   A vector of POSITIVE CLASS (bona fide or human)
                          detection scores obtained by executing a spoofing
                          countermeasure (CM) on some positive evaluation trials.
                          Each trial represents a bona fide case.

      spoof_score_cm      A vector of NEGATIVE CLASS (spoofing attack)
                          detection scores obtained by executing a spoofing
                          CM on some negative evaluation trials.

      Pfa_asv             False alarm (false acceptance) rate of the ASV
                          system that is evaluated in tandem with the CM.
                          Assumed to be in fractions, not percentages.

      Pmiss_asv           Miss (false rejection) rate of the ASV system that
                          is evaluated in tandem with the spoofing CM.
                          Assumed to be in fractions, not percentages.

      Pmiss_spoof_asv     Miss rate of spoof samples of the ASV system that
                          is evaluated in tandem with the spoofing CM. That
                          is, the fraction of spoof samples that were
                          rejected by the ASV system.

      cost_model          A struct that contains the parameters of t-DCF,
                          with the following fields.

                          Ptar        Prior probability of target speaker.
                          Pnon        Prior probability of nontarget speaker (zero-effort impostor).
                          Pspoof      Prior probability of spoofing attack.
                          Cmiss_asv   Cost of ASV falsely rejecting target.
                          Cfa_asv     Cost of ASV falsely accepting nontarget.
                          Cmiss_cm    Cost of CM falsely rejecting target.
                          Cfa_cm      Cost of CM falsely accepting spoof.

      print_cost          Print a summary of the cost parameters and the
                          implied t-DCF cost function?

    OUTPUTS:

      tDCF_norm           Normalized t-DCF curve across the different CM
                          system operating points; see [2] for more details.
                          Normalized t-DCF > 1 indicates a useless
                          countermeasure (as the tandem system would do
                          better without it). min(tDCF_norm) will be the
                          minimum t-DCF used in ASVspoof 2019 [2].

      CM_thresholds       Vector of the same size as tDCF_norm corresponding
                          to the CM threshold (operating point).

    NOTE:
    o     In relative terms, higher detection score values are assumed to
          indicate stronger support for the bona fide hypothesis.
    o     You should provide real-valued soft scores, NOT hard decisions. The
          recommendation is that the scores are log-likelihood ratios (LLRs)
          from a bonafide-vs-spoof hypothesis based on some statistical model.
          This, however, is NOT required. The scores can have arbitrary range
          and scaling.
    o     Pfa_asv, Pmiss_asv, Pmiss_spoof_asv are in fractions, not percentages.

    References:

      [1] T. Kinnunen, K.-A. Lee, H. Delgado, N. Evans, M. Todisco,
          M. Sahidullah, J. Yamagishi, D.A. Reynolds: "t-DCF: a Detection
          Cost Function for the Tandem Assessment of Spoofing Countermeasures
          and Automatic Speaker Verification", Proc. Odyssey 2018: the
          Speaker and Language Recognition Workshop, pp. 312--319, Les Sables d'Olonne,
          France, June 2018 (https://www.isca-speech.org/archive/Odyssey_2018/pdfs/68.pdf)

      [2] ASVspoof 2019 challenge evaluation plan
          TODO: <add link>
    """
    # Sanity check of cost parameters
    if cost_model['Cfa_asv'] < 0 or cost_model['Cmiss_asv'] < 0 or \
            cost_model['Cfa_cm'] < 0 or cost_model['Cmiss_cm'] < 0:
        print('WARNING: Usually the cost values should be positive!')

    if cost_model['Ptar'] < 0 or cost_model['Pnon'] < 0 or cost_model['Pspoof'] < 0 or \
            np.abs(cost_model['Ptar'] + cost_model['Pnon'] + cost_model['Pspoof'] - 1) > 1e-10:
        sys.exit('ERROR: Your prior probabilities should be positive and sum up to one.')

    # Unless we evaluate the worst-case model, we need to have some spoof tests against ASV
    if Pmiss_spoof_asv is None:
        sys.exit('ERROR: you should provide the miss rate of spoof tests against your ASV system.')

    # Sanity check of scores
    combined_scores = np.concatenate((bonafide_score_cm, spoof_score_cm))
    if np.isnan(combined_scores).any() or np.isinf(combined_scores).any():
        sys.exit('ERROR: Your scores contain nan or inf.')

    # Sanity check that inputs are scores and not decisions
    n_uniq = np.unique(combined_scores).size
    if n_uniq < 3:
        sys.exit('ERROR: You should provide soft CM scores - not binary decisions')

    # Obtain miss and false alarm rates of CM
    Pmiss_cm, Pfa_cm, CM_thresholds = compute_det_curve(bonafide_score_cm, spoof_score_cm)

    # Constants - see ASVspoof 2019 evaluation plan
    C1 = cost_model['Ptar'] * (cost_model['Cmiss_cm'] - cost_model['Cmiss_asv'] * Pmiss_asv) - \
        cost_model['Pnon'] * cost_model['Cfa_asv'] * Pfa_asv
    C2 = cost_model['Cfa_cm'] * cost_model['Pspoof'] * (1 - Pmiss_spoof_asv)

    # Sanity check of the weights
    if C1 < 0 or C2 < 0:
        sys.exit('You should never see this error, but t-DCF cannot be evaluated with negative weights - please check whether your ASV error rates are correctly computed.')

    # Obtain t-DCF curve for all thresholds
    tDCF = C1 * Pmiss_cm + C2 * Pfa_cm

    # Normalized t-DCF
    tDCF_norm = tDCF / np.minimum(C1, C2)

    # Everything should be fine if reaching here.
    if print_cost:

        print('t-DCF evaluation from [Nbona={}, Nspoof={}] trials\n'.format(bonafide_score_cm.size, spoof_score_cm.size))
        # print('t-DCF MODEL')
        # print('   Ptar         = {:8.5f} (Prior probability of target user)'.format(cost_model['Ptar']))
        # print('   Pnon         = {:8.5f} (Prior probability of nontarget user)'.format(cost_model['Pnon']))
        # print('   Pspoof       = {:8.5f} (Prior probability of spoofing attack)'.format(cost_model['Pspoof']))
        # print('   Cfa_asv      = {:8.5f} (Cost of ASV falsely accepting a nontarget)'.format(cost_model['Cfa_asv']))
        # print('   Cmiss_asv    = {:8.5f} (Cost of ASV falsely rejecting target speaker)'.format(cost_model['Cmiss_asv']))
        # print('   Cfa_cm       = {:8.5f} (Cost of CM falsely passing a spoof to ASV system)'.format(cost_model['Cfa_cm']))
        # print('   Cmiss_cm     = {:8.5f} (Cost of CM falsely blocking target utterance which never reaches ASV)'.format(cost_model['Cmiss_cm']))
        # print('\n   Implied normalized t-DCF function (depends on t-DCF parameters and ASV errors), s=CM threshold)')

        if C2 == np.minimum(C1, C2):
            print('   tDCF_norm(s) = {:8.5f} x Pmiss_cm(s) + Pfa_cm(s)\n'.format(C1 / C2))
        else:
            print('   tDCF_norm(s) = Pmiss_cm(s) + {:8.5f} x Pfa_cm(s)\n'.format(C2 / C1))

    return tDCF_norm, CM_thresholds
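
# A hedged end-to-end sketch tying the pieces together: synthetic ASV and CM
# scores are generated, the ASV error rates are fixed at the ASV EER
# threshold, and the normalized t-DCF curve is computed. The score
# distributions and the cost-model values below are illustrative assumptions
# (only loosely inspired by the ASVspoof 2019 setup), not prescribed by this
# file; replace them with real system scores and the official cost parameters
# for an actual evaluation.
if __name__ == '__main__':
    rng = np.random.default_rng(0)

    # Synthetic ASV scores (target, nontarget and spoofed trials)
    tar_asv = rng.normal(3.0, 1.0, 2000)
    non_asv = rng.normal(-3.0, 1.0, 2000)
    spoof_asv = rng.normal(1.0, 1.5, 2000)

    # Synthetic CM scores (bona fide vs. spoofed trials)
    bona_cm = rng.normal(2.0, 1.0, 2000)
    spoof_cm = rng.normal(-2.0, 1.0, 2000)

    # Fix the ASV operating point, here at its EER threshold
    eer_asv, asv_threshold = compute_eer(tar_asv, non_asv)
    Pfa_asv, Pmiss_asv, Pmiss_spoof_asv = obtain_asv_error_rates(tar_asv, non_asv, spoof_asv, asv_threshold)

    # Illustrative cost model; the fields must match those documented in compute_tDCF
    cost_model = {
        'Pspoof': 0.05, 'Ptar': 0.9405, 'Pnon': 0.0095,
        'Cmiss_asv': 1, 'Cfa_asv': 10,
        'Cmiss_cm': 1, 'Cfa_cm': 10,
    }

    tDCF_norm, CM_thresholds = compute_tDCF(bona_cm, spoof_cm, Pfa_asv, Pmiss_asv, Pmiss_spoof_asv, cost_model, print_cost=True)
    eer_cm, _ = compute_eer(bona_cm, spoof_cm)

    print('ASV EER   = {:.4f}'.format(eer_asv))
    print('CM EER    = {:.4f}'.format(eer_cm))
    print('min t-DCF = {:.4f}'.format(np.min(tDCF_norm)))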