# evaluation.py
import os
import pickle

import numpy as np
import pandas as pd
from sklearn.metrics import precision_score, accuracy_score, recall_score, f1_score


class MetricEvaluator:
    def __init__(self, left_df=None, right_df=None):
        if left_df is None and right_df is None:
            print("Empty Evaluator")
            return
        # tool usage
        self.name_to_tool = {"no tool in hand": "T0", "needle_driver": "T1", "forceps": "T2", "scissors": "T3"}
        self.tool_to_name = {"T0": "no tool in hand", "T1": "needle_driver", "T2": "forceps", "T3": "scissors"}
        self.yolo_to_label = {'empty': 0, 'needle_driver': 1, 'forceps': 2, 'scissors': 3}
        # left_df = pd.read_csv(left_ground_truth_path, sep=" ", header=None)
        # right_df = pd.read_csv(right_ground_truth_path, sep=" ", header=None)
        self.left_tool_labels, left_relevant_classes = self.convert_file_to_list(left_df)
        self.right_tool_labels, right_relevant_classes = self.convert_file_to_list(right_df)
        self.left_predictions = list()
        self.right_predictions = list()
        self.metric_history = {'f1': list(), 'accuracy': list(), 'recall': list(), 'precision': list(),
                               'f1_macro': list()}
        # used to construct the metric dictionary and for calculating the macro-F1 score, which averages over the
        # total number of relevant classes (relevant := has a corresponding ground truth)
        self.num_relevant_classes = len(left_relevant_classes.union(right_relevant_classes))
    def convert_tool_to_label(self, tool):
        """
        Converts a TX tool string to its discrete numeric label
        :param tool: tool identifier ("T0".."T3")
        :return: integer label in [0, 3]
        """
        tool_to_label = {"T0": 0, "T1": 1, "T2": 2, "T3": 3}
        return tool_to_label[tool]

    def convert_file_to_list(self, df):
        """
        Expands a concise tool-usage dataframe into a per-frame array of ground-truth labels for the arm it describes
        :param df: dataframe with columns [start frame, end frame, tool]
        :return: per-frame ground-truth array and the set of classes that appear in it
        """
        ground_truth = np.zeros(int(df.iloc[-1, 1]))  # last row's end time, i.e. the maximum time
        for index, row in df.iterrows():
            ground_truth[row[0]:row[1]] = self.convert_tool_to_label(row[2])
        return ground_truth, set(ground_truth)
    def update_left_prediction(self, prediction):
        """
        Converts a YOLO class name to its numeric label and appends it to the left-arm prediction history
        :param prediction: YOLO class name as used in the tool-usage dataset (e.g. "needle_driver")
        :return: none, updates self
        """
        lower_case_pred = prediction.lower()
        self.left_predictions.append(self.yolo_to_label[lower_case_pred])

    def update_right_prediction(self, prediction):
        """
        Converts a YOLO class name to its numeric label and appends it to the right-arm prediction history
        :param prediction: YOLO class name as used in the tool-usage dataset (e.g. "needle_driver")
        :return: none, updates self
        """
        lower_case_pred = prediction.lower()
        self.right_predictions.append(self.yolo_to_label[lower_case_pred])

    def convert_yolo_output_to_tool(self, yolo_output):
        """
        Updates the prediction history of the matching arm given a YOLO output of the form "<Hand>_<tool>"
        :param yolo_output: e.g. "Left_forceps" or "Right_empty"
        :return:
        """
        hand, tool = yolo_output.split('_', maxsplit=1)
        if hand == "Left":
            self.update_left_prediction(tool)
        else:
            self.update_right_prediction(tool)
    def calculate_recall(self):
        """
        Calculates and returns per-tool recall over the predictions of both arms
        :return: array of recall scores, one per tool label
        """
        # trim the ground truth to the frames predicted so far
        right_ground_truth = self.right_tool_labels[:len(self.right_predictions)]
        left_ground_truth = self.left_tool_labels[:len(self.left_predictions)]
        ground_truth = np.hstack([right_ground_truth, left_ground_truth])
        predictions = self.right_predictions + self.left_predictions
        return recall_score(ground_truth, predictions, labels=list(range(4)), average=None)

    def calculate_accuracy(self):
        """
        Calculates and returns the accuracy over the predictions of both arms
        :return: accuracy score
        """
        # trim the ground truth to the frames predicted so far
        right_ground_truth = self.right_tool_labels[:len(self.right_predictions)]
        left_ground_truth = self.left_tool_labels[:len(self.left_predictions)]
        ground_truth = np.hstack([right_ground_truth, left_ground_truth])
        predictions = self.right_predictions + self.left_predictions
        return accuracy_score(ground_truth, predictions)

    def calculate_f1(self):
        """
        Calculates and returns per-tool F1 over the predictions of both arms
        :return: array of F1 scores, one per tool label
        """
        # trim the ground truth to the frames predicted so far
        right_ground_truth = self.right_tool_labels[:len(self.right_predictions)]
        left_ground_truth = self.left_tool_labels[:len(self.left_predictions)]
        ground_truth = np.hstack([right_ground_truth, left_ground_truth])
        predictions = self.right_predictions + self.left_predictions
        return f1_score(ground_truth, predictions, labels=list(range(4)), average=None)

    def calculate_precision(self):
        """
        Calculates and returns per-tool precision over the predictions of both arms
        :return: array of precision scores, one per tool label
        """
        # trim the ground truth to the frames predicted so far
        right_ground_truth = self.right_tool_labels[:len(self.right_predictions)]
        left_ground_truth = self.left_tool_labels[:len(self.left_predictions)]
        ground_truth = np.hstack([right_ground_truth, left_ground_truth])
        predictions = self.right_predictions + self.left_predictions
        return precision_score(ground_truth, predictions, labels=list(range(4)), average=None)

    def calculate_macro_f1_score(self):
        """
        Calculates and returns macro-F1 (average of the per-tool F1 scores) over the predictions of both arms,
        averaging only over the classes that appear in the ground truth
        :return: macro-F1 score
        """
        f1_scores = self.calculate_f1()
        return np.sum(f1_scores) / self.num_relevant_classes
    def calculate_all_metrics(self):
        """
        Appends all of the metrics to the metric history stored within the evaluator
        :return: False on success, True if any metric could not be computed
        """
        try:
            self.metric_history['recall'].append(self.calculate_recall())
            self.metric_history['accuracy'].append(self.calculate_accuracy())
            self.metric_history['precision'].append(self.calculate_precision())
            self.metric_history['f1'].append(self.calculate_f1())
            self.metric_history['f1_macro'].append(self.calculate_macro_f1_score())
            return False
        except Exception:
            return True

    def history_to_pickle(self, destination):
        """
        Stores the metric history as a pickle at the given destination file (pass without the .pkl extension)
        :param destination: output path without extension
        :return:
        """
        # save metric history
        with open(destination + '.pkl', 'wb') as handle:
            pickle.dump(self.metric_history, handle, protocol=pickle.HIGHEST_PROTOCOL)
    @staticmethod
    def print_metric_statistics(source=None, destination=None):
        """
        If passed a string, treat it as a path to a pickled history and load it before reporting the last entries.
        If passed a dictionary, report its last entries directly.
        :param source: path to a history pickle or a history dictionary
        :param destination: optional CSV output path (pass without the .csv extension); prints to stdout if omitted
        :return:
        """
        if isinstance(source, str):
            with open(source, 'rb') as f:
                history_dictionary = pickle.load(f)
        elif isinstance(source, dict):
            history_dictionary = source
        else:
            print("print_metric_statistics expects a path to a pickle file or a history dictionary")
            return
        # load history
        history_df = pd.DataFrame.from_dict(history_dictionary)
        if isinstance(destination, str):
            history_df.iloc[-1, :].to_csv(destination + '.csv')
        else:
            print(history_df.iloc[-1, :].head())
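

# A minimal usage sketch of the online evaluation loop, not part of the original module. It assumes the
# ground-truth files use the space-separated (start frame, end frame, tool) format suggested by the commented
# read_csv calls in __init__, that an experiments/ directory exists, and that `detect_tools_per_frame` is a
# hypothetical callable yielding, per frame, an iterable of strings such as "Left_forceps" or "Right_empty";
# swap in the real detector output and paths.
def example_online_evaluation(left_ground_truth_path, right_ground_truth_path, detect_tools_per_frame):
    left_df = pd.read_csv(left_ground_truth_path, sep=" ", header=None)
    right_df = pd.read_csv(right_ground_truth_path, sep=" ", header=None)
    evaluator = MetricEvaluator(left_df, right_df)
    for frame_outputs in detect_tools_per_frame():
        for yolo_output in frame_outputs:
            # route each "<Hand>_<tool>" string to the matching arm's prediction history
            evaluator.convert_yolo_output_to_tool(yolo_output)
        # append the metrics computed over all frames seen so far (could also be done every N frames)
        evaluator.calculate_all_metrics()
    evaluator.history_to_pickle('experiments/example_run')
    MetricEvaluator.print_metric_statistics('experiments/example_run.pkl')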


def main():
    # generate a CSV report for every metric-history pickle produced during the experiments
    for f in os.listdir('experiments'):
        if not f.endswith('.pkl'):
            continue
        print(f)
        MetricEvaluator.print_metric_statistics('experiments/' + f, 'experiments/report_' + f[:-4])
    # MetricEvaluator.print_metric_statistics('experiments/P026_tissue1window_size25_smoothingsuper-linear.pkl')
    # use window size 25 and log smoothing


if __name__ == "__main__":
    main()