# -*- coding: utf-8 -*-
"""
Created on Wed Mar 14 11:43:16 2018
@author: David
parts of the code adapted from:
http://www.blopig.com/blog/2017/07/using-random-forests-in-python-with-scikit-learn/
https://www.kaggle.com/dbsnail/diabetes-prediction-over-0-86-accuracy
https://towardsdatascience.com/building-a-logistic-regression-in-python-step-by-step-becd4d56c9c8
class-imbalance handling adapted partly from:
https://elitedatascience.com/imbalanced-classes
"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# from scipy.stats import spearmanr, pearsonr
plt.style.use('ggplot')
'''
import warnings
warnings.filterwarnings('ignore')
'''
### read CSV file containing the Pima Indians diabetes data
di_df = pd.read_csv('C:\\Users\\David\\Documents\\Data Science Related\\Datasets\\pima-indians-diabetes\\diabetes.csv')
### checking class balance (levels of Outcome variable)
di_df.Outcome.value_counts()
''' MAYBE COUNT ZERO VALUES HERE (ARE THESE LEGIT DATA VALUES?)
### Check how many columns contain missing data
print(di_df.isnull().any().sum(), ' / ', len(di_df.columns))
### Check how many entries in total are missing
print(di_df.isnull().any(axis=1).sum(), ' / ', len(di_df))
'''
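### quick check prompted by the note above: count zero entries in the columns
### where 0 is not a physiologically plausible measurement
zero_cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
print((di_df[zero_cols] == 0).sum())  # zeros per column; treated as missing below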
### function to replace particular value in a feature column with mean of subset
### of column grouped by target variable level (i.e. mean imputation by group)
def impute_mean_byGroup(df, field, val, target):
    df_copy = df.copy()                                   # work on a copy so the caller's df is not mutated
    df_copy[field] = df_copy[field].replace(val, np.nan)  # treat values of field matching val as missing
    # fill NaN in each target-level group with that group's mean (transform preserves row order)
    df_copy[field] = df_copy.groupby(target)[field].transform(lambda x: x.fillna(x.mean()))
    return df_copy
### impute mean for 0 values, grouped by Outcome, for selected columns
di_df_imputed = di_df.copy()
for col in ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']:
    di_df_imputed = impute_mean_byGroup(di_df_imputed, col, 0, 'Outcome')
### assess balance of classes (counts of 0 and 1)
di_df_imputed['Outcome'].value_counts()
### copy di_df_imputed to a new df called 'df' to use for class balancing
df = di_df_imputed.copy()
### using upsampling (resampling) of minority class (1) to address class imbalance
from sklearn.utils import resample
### Separate majority and minority classes into new dfs
df_maj = df[df.Outcome==0]
df_min = df[df.Outcome==1]
### Upsample minority class
df_min_upsamp = resample(df_min,
                         replace=True,      # sample with replacement
                         n_samples=500,     # to match majority class count
                         random_state=123)  # reproducible results
### Combine majority class with upsampled minority class
df_combined_upsamp = pd.concat([df_maj, df_min_upsamp])
### Display new class counts
df_combined_upsamp.Outcome.value_counts()
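### caveat: upsampling before the train/test split lets duplicated minority rows
### land in both train and test sets, which can inflate test accuracy.
### a minimal alternative sketch (split first, then upsample the training portion only):
'''
from sklearn.model_selection import train_test_split
X_all = df.iloc[:, :-1]
y_all = df.iloc[:, -1]
X_tr, X_te, y_tr, y_te = train_test_split(X_all, y_all, test_size=0.3,
                                          random_state=100, stratify=y_all)
train_df = pd.concat([X_tr, y_tr], axis=1)
tr_maj = train_df[train_df.Outcome == 0]
tr_min = train_df[train_df.Outcome == 1]
tr_min_up = resample(tr_min, replace=True, n_samples=len(tr_maj), random_state=123)
train_bal = pd.concat([tr_maj, tr_min_up])  # balanced training set; X_te/y_te untouched
'''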
### import sklearn functions for K-Nearest Neighbors Classifier and train/test split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
### split data into features and target
X = df_combined_upsamp.iloc[:,:-1]
y = df_combined_upsamp.iloc[:, -1]
### split into train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=100)
### check dimensions of train/test data
print(X_train.shape)
print(X_test.shape)
print(y_train.size)
print(y_test.size)
### instantiate and fit knn model
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
### predict on test set and assess accuracy
y_pred = knn.predict(X_test)
print('Accuracy of knn on test set: {:.2f}'.format(knn.score(X_test, y_test)))
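### accuracy alone can hide per-class errors; a per-class breakdown
### (precision / recall / F1) is cheap to print alongside it
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))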
### import cross-validation utilities
from sklearn.model_selection import KFold, cross_val_score
### Perform 10-fold cross-validation and compute mean accuracy over folds
kfold = KFold(n_splits=10)
modelCV = KNeighborsClassifier(n_neighbors=5)
results = cross_val_score(modelCV, X_train, y_train, cv=kfold, scoring='accuracy')
print("10-fold cross-validation mean accuracy: %.3f" % (results.mean()))
### confusion matrix metrics and visualization
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)  # avoid shadowing the imported function
print(cm)
plt.figure(figsize=(9, 9))
sns.heatmap(cm, annot=True, fmt='d', linewidths=.5, square=True, cmap='Blues_r')
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
acc_title = 'Mean 10-fold CV Accuracy: {0}'.format(round(results.mean(), 2))
plt.title(acc_title, size=15)
plt.show()
### Compute ROC Curve and Plot
### use predicted probabilities (not hard labels) so fpr/tpr sweep all thresholds
from sklearn.metrics import roc_curve, auc
y_prob = knn.predict_proba(X_test)[:, 1]  # probability of the positive class
fpr, tpr, threshold = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)
plt.title('ROC')
plt.plot(fpr, tpr, 'b', label='AUC = %0.2f'% roc_auc)
plt.legend(loc='lower right')
plt.plot([0,1],[0,1],'r--')
plt.xlim([-0.025, 1.025]) # just giving a little padding to plot range
plt.ylim([-0.025, 1.025])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
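### KNN is distance-based, so features on large scales (e.g. Insulin) can dominate
### the distance metric; a minimal sketch of standardizing within a pipeline
### (an optional alternative to the unscaled fit above):
'''
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
knn_scaled = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=5))
knn_scaled.fit(X_train, y_train)
print('Accuracy of scaled knn on test set: {:.2f}'.format(knn_scaled.score(X_test, y_test)))
'''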
### show validation curves for different numbers of neighbors used to fit KNN
from sklearn.model_selection import validation_curve  # sklearn.learning_curve was removed; this is the current location
neighbors = np.arange(1, 20)
train_score, val_score = validation_curve(KNeighborsClassifier(), X, y,
                                          param_name='n_neighbors',
                                          param_range=neighbors, cv=10)
plt.plot(neighbors, np.median(train_score, 1), color='blue', label='training score')
plt.plot(neighbors, np.median(val_score, 1), color='red', label='validation score')
plt.legend(loc='best')
plt.ylim(0, 1)
plt.xlabel('neighbors')
plt.ylabel('score')
plt.show()
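### the validation curve above picks n_neighbors by eye; a grid search sketch
### to choose it automatically over the same range:
'''
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(KNeighborsClassifier(),
                    param_grid={'n_neighbors': list(range(1, 20))},
                    cv=10, scoring='accuracy')
grid.fit(X_train, y_train)
print(grid.best_params_, grid.best_score_)
'''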