# Chapter3.py
from sklearn.datasets import fetch_openml  # fetch_mldata was removed from scikit-learn; fetch_openml is its replacement
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.base import clone
from sklearn.base import BaseEstimator
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsOneClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
# mldata.org is gone; fetch the dataset from OpenML instead (as_frame=False returns plain NumPy arrays)
mnist = fetch_openml("mnist_784", version=1, as_frame=False)
X, y = mnist["data"], mnist["target"]
y = y.astype(np.uint8)  # OpenML returns the labels as strings, so cast them to integers
# NB the OpenML copy is not sorted by label the way the old mldata copy was, so use index 0 (a 5) for the demo digit
some_digit = X[0]
# some_digit_image = some_digit.reshape(28, 28)
#
# plt.imshow(some_digit_image, cmap = matplotlib.cm.binary, interpolation="nearest")
# plt.axis("off")
# plt.show()
#
# label = y[0]
# print(label)
# Separate the training data and the test data
X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]
# Shuffle the training data (since some training algorithms are sensitive to the ordering of the training data)
shuffle_index = np.random.permutation(60000)
X_train, y_train = X_train[shuffle_index], y_train[shuffle_index]
# Test out a binary classifier
y_train_5 = (y_train == 5)
y_test_5 = (y_test == 5)
# An explicit element-wise loop would achieve the same thing as the vectorized comparisons above:
# y_train_5 = np.array([label == 5 for label in y_train])
# y_test_5 = np.array([label == 5 for label in y_test])
sgd_classifier = SGDClassifier(random_state=42)
sgd_classifier.fit(X_train, y_train_5)
test_prediction = sgd_classifier.predict([some_digit])
print("SGD binary classifier test prediction: ", test_prediction)
if test_prediction[0]:  # predict returns an array, so check its first element rather than comparing the array to True
    print("Test image correctly classified as 5")
else:
    print("Test image incorrectly classified!")
# Evaluate the performance of the binary classifier using cross-validation
skfolds = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)  # recent scikit-learn versions require shuffle=True when random_state is set
# StratifiedKFold performs stratified sampling to produce folds that contain class proportions representative of the full training set
for train_i, test_i in skfolds.split(X_train, y_train_5):
    clone_classifier = clone(sgd_classifier)
    X_train_folds = X_train[train_i]
    y_train_folds = y_train_5[train_i]
    X_test_fold = X_train[test_i]
    y_test_fold = y_train_5[test_i]
    clone_classifier.fit(X_train_folds, y_train_folds)
    y_pred = clone_classifier.predict(X_test_fold)
    n_correct = sum(y_pred == y_test_fold)
    print(n_correct, "out of", len(y_pred))
CV_score = cross_val_score(sgd_classifier, X_train, y_train_5, cv=3, scoring="accuracy")
print("Cross-validation score: ", CV_score)
# A dumb classifier that classifies every image as "not a 5"
class Not5Classifier(BaseEstimator):
    def fit(self, X, y=None):
        return self  # returning self is the scikit-learn convention for fit
    def predict(self, X):
        return np.zeros((len(X), 1), dtype=bool)
not_5_classifier = Not5Classifier()
not_5_CV_score = cross_val_score(not_5_classifier, X_train, y_train_5, cv=3, scoring="accuracy")
print("Not-5 Cross-validation score: ", not_5_CV_score)
# It scores around 90% accuracy simply because only about 10% of the images are 5s - a reminder that accuracy is a misleading metric on skewed datasets
# NB a better way to evaluate the performance of a classifier is to use a CONFUSION MATRIX
y_train_pred = cross_val_predict(sgd_classifier, X_train, y_train_5, cv=3)
conf_mat = confusion_matrix(y_train_5, y_train_pred)
print(conf_mat)
# The confusion matrix gives rise to Precision and Recall scores
# precision = true_pos/(true_pos+false_pos)
# recall = true_pos/(true_pos+false_neg)
precision = precision_score(y_train_5, y_train_pred)
recall = recall_score(y_train_5, y_train_pred)
print("Precision: ", precision)
print("Recall: ", recall)
F_1 = 2 * (precision * recall)/(precision + recall)  # the F1 score is the harmonic mean of precision and recall
print("F1 score:", F_1, f1_score(y_train_5, y_train_pred))
# Observe the classifier's decision function...
y_scores = sgd_classifier.decision_function([some_digit])
print(y_scores)
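# Illustration (an added sketch): the classifier predicts "5" whenever its decision score clears
# a threshold. SGDClassifier uses a threshold of 0; raising it (the 200000 here is an arbitrary
# illustrative value) can flip a positive prediction to negative.
threshold = 0
print("Prediction at threshold 0: ", y_scores > threshold)
threshold = 200000
print("Prediction at threshold 200000: ", y_scores > threshold)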
# But which threshold do we use? First we get the scores of all instances in the training set using cross_val_predict
y_scores = cross_val_predict(sgd_classifier, X_train, y_train_5, cv=3, method="decision_function")
# Then with these scores we compute the precision and recall for all possible thresholds using the precision_recall_curve function
precisions, recalls, thresholds = precision_recall_curve(y_train_5, y_scores)
# Finally we plot precision and recall as a function of threshold using matplotlib
def plot_precision_recall_threshold(precisions, recalls, thresholds):
    plt.plot(thresholds, precisions[:-1], "b--", label="Precision")
    plt.plot(thresholds, recalls[:-1], "g-", label="Recall")
    plt.xlabel("Threshold")
    plt.legend(loc="upper left")
    plt.ylim([0, 1])
# plot_precision_recall_threshold(precisions, recalls, thresholds)
# plt.show()
# We can use this plot to choose the threshold most suitable for our needs, based on what precision and recall values we'd need
# We can also plot precision against recall...
def plot_precision_recall(precisions, recalls):
    plt.plot(recalls, precisions, "g-")
    plt.xlabel("Recall")
    plt.ylabel("Precision")
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.show()
# Say we want to aim for 90% precision. Reading off the threshold plot, a threshold of roughly 70000 should get us there
y_train_pred_90 = (y_scores > 70000)
pres_90 = precision_score(y_train_5, y_train_pred_90)
recall_90 = recall_score(y_train_5, y_train_pred_90)
print("Precision:", pres_90)
print("Recall: ", recall_90)
# The receiver operating characteristic (ROC) curve is another tool used with binary classifiers. But instead of precision v recall,
# it plots the true positive rate against the false positive rate
fpr, tpr, thresholds = roc_curve(y_train_5, y_scores)
def plot_roc_curve(fpr, tpr, label=None):
    """This function produces the ROC curve - a plot of the true positive rate against the false positive rate -
    alongside the ROC of a purely random classifier (the dashed line). Note that a perfect classifier will give a value of
    exactly 1 when the ROC curve is integrated."""
    plt.plot(fpr, tpr, linewidth=2, label=label)
    plt.plot([0, 1], [0, 1], "k--")
    plt.axis([0, 1, 0, 1])
    plt.xlabel("False positive rate")
    plt.ylabel("True positive rate")
    plt.title("ROC curve")
# plot_roc_curve(fpr, tpr)
# plt.show()
# NB a perfect classifier will give a value of 1 when the ROC curve is integrated; a purely random classifier will give a value of 0.5
# Let's evaluate using this metric - the area under the curve (AUC)
auc_val = roc_auc_score(y_train_5, y_scores)
print("Area under curve for SGD classifier: ", auc_val)
# Alright, let's try training a Random Forest classifier and compare its ROC curve and ROC AUC to those of the SGD classifier
rf_classifier = RandomForestClassifier(random_state=42)
y_probas_rf = cross_val_predict(rf_classifier, X_train, y_train_5, cv=3, method="predict_proba")
# But wait, to plot an ROC curve we need scores, not probabilities. We can use the positive class probability as the score!
y_scores_rf = y_probas_rf[:,1] # score = prob of positive class
fpr_rf, tpr_rf, thresholds_rf = roc_curve(y_train_5, y_scores_rf)
# now we plot!
# plt.plot(fpr, tpr, "b:", label="SGD")
# plot_roc_curve(fpr_rf, tpr_rf, "Random Forest")
# plt.legend(loc="lower right")
# plt.show()
rf_auc_score = roc_auc_score(y_train_5, y_scores_rf)
# precision_score and recall_score need class predictions rather than probabilities, so threshold the positive-class probability:
# rf_precision = precision_score(y_train_5, y_scores_rf > 0.5)
# rf_recall = recall_score(y_train_5, y_scores_rf > 0.5)
print("Area under curve for Random Forest classifier: ", rf_auc_score)
# print("Random Forest precision: ", rf_precision)
# print("Random Forest recall: ", rf_recall)
# Now let's try a multiclass classifier!
# Scale the features for later...
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.astype(np.float64))
# First train on the full X and y training sets (Scikit-Learn detects the multiclass target and automatically trains one binary classifier per class, i.e. One-vs-All):
sgd_classifier.fit(X_train, y_train)
multiclassSGD_test = sgd_classifier.predict([some_digit])
print("SGD multiclass test prediction: ", multiclassSGD_test)
some_digit_scores = sgd_classifier.decision_function([some_digit])
print("Decision function: ", some_digit_scores)
classes = sgd_classifier.classes_
print("Classes: ", classes)
SGD_multi_CV_score = cross_val_score(sgd_classifier,X_train, y_train, cv=3, scoring="accuracy")
SGD_multi_scaled_CV_score = cross_val_score(sgd_classifier, X_train_scaled, y_train, cv=3, scoring="accuracy")
print("Multiclass SGD CV score: ", SGD_multi_CV_score)
print("Multiclass SGD with scaling CV score: ", SGD_multi_scaled_CV_score)
# Let's force Scikit-Learn to use One-vs-One classification
ovo_classifier = OneVsOneClassifier(SGDClassifier(random_state=42))
ovo_classifier.fit(X_train, y_train)
ovo_test_prediction = ovo_classifier.predict([some_digit])
print("OvO test prediction: ", ovo_test_prediction)
n_ovo_estimators = len(ovo_classifier.estimators_)
print("Number of OvO estimators: ", n_ovo_estimators)
# Let's also try out a Random Forest classifier for multiclass predictions
rf_multi_classifier = RandomForestClassifier(random_state=42)
rf_multi_classifier.fit(X_train, y_train)
rf_multi_classifier_test = rf_multi_classifier.predict([some_digit])
print("Random Forest multi-class prediction: ", rf_multi_classifier_test)
rf_probs = rf_multi_classifier.predict_proba([some_digit])
print("Random Forest classifier probablities: ", rf_probs)
# Let's say we have decided on a promising model and are now looking for ways to tweak and improve it. OK, time for Error Analysis!
# First we make some predictions:
y_train_pred = cross_val_predict(sgd_classifier, X_train_scaled, y_train, cv=3)
# generate the confusion matrix
conf_mat = confusion_matrix(y_train, y_train_pred)
print("The confusion matrix: ")
print(conf_mat)
# # we can make a plot of the confusion matrix too
# plt.matshow(conf_mat)
# plt.show()
# # Divide each value in the confusion matrix by the total number of images in the corresponding class to compare error rates
# row_sums = conf_mat.sum(axis=1, keepdims=True)
# norm_conf_mat = conf_mat/row_sums
# # Then we fill the diagonal of the matrix with zeros to show only the errors
# np.fill_diagonal(norm_conf_mat, 0)
# plt.matshow(norm_conf_mat, cmap=plt.cm.gray)
# plt.show()
# Now let's look at multilabel classification. To illustrate this, we'll make a classifier that identifies numbers that are both odd and greater than or equal to 7
# This part of the code creates an array containing two target labels for each digit image: the first whether it's large, the second whether it's odd
y_train_large = (y_train >= 7)
y_train_odd = (y_train % 2 == 1)
y_multilabel = np.c_[y_train_large, y_train_odd]
# This part creates a K-Neighbors classifier, which supports multilabel classification (NB not all classifiers do!)
KNN_classifier = KNeighborsClassifier()
KNN_classifier.fit(X_train, y_multilabel)
knn_test = KNN_classifier.predict([some_digit])
print(knn_test)
# To evaluate this multilabel classifier, we compute an F1 score for each label and then average the scores (NB other binary-classifier metrics can be averaged this way too)
# NB the predictions must be scored against the multilabel targets, not the original digit labels:
# y_train_knn_predict = cross_val_predict(KNN_classifier, X_train, y_multilabel, cv=3)
# knn_f1 = f1_score(y_multilabel, y_train_knn_predict, average="macro")
# print("Average F1 score for the K-Neighbors classifier: ", knn_f1)
# To wrap this up, let's look at multioutput classification: a generalization of multilabel classification where each label can have two or more possible values.
# We'll test this out by building a system that removes noise from images.
# Start by making noisy images....
train_noise = np.random.randint(0, 100, (len(X_train), 784))
X_train_noisy = X_train + train_noise
test_noise = np.random.randint(0, 100, (len(X_test), 784))
X_test_noisy = X_test + test_noise
y_train_mod = X_train  # the targets are the original clean images
y_test_mod = X_test
# Let's look at one of the noisy test images
some_noisy_digit = X_test_noisy[3600]
some_noisy_digit_image = some_noisy_digit.reshape(28, 28)
plt.imshow(some_noisy_digit_image, cmap = matplotlib.cm.binary, interpolation="nearest")
plt.axis("off")
plt.show()
# And now we'll train this multioutput classifier with K-Neighbors
KNN_classifier.fit(X_train_noisy, y_train_mod)
some_clean_digit = KNN_classifier.predict([some_noisy_digit])
some_clean_digit_image = some_clean_digit.reshape(28, 28)
plt.imshow(some_clean_digit_image, cmap = matplotlib.cm.binary, interpolation="nearest")
plt.axis("off")
plt.show()
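# Rough quantitative check (an added sketch; the MSE metric is an illustrative choice):
# compare the noisy and the denoised digit against the clean target image
clean_target = y_test_mod[3600]
print("MSE noisy vs clean: ", np.mean((some_noisy_digit - clean_target) ** 2))
print("MSE denoised vs clean: ", np.mean((some_clean_digit[0] - clean_target) ** 2))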