"""This module computes the baseline results by applying various classifiers.
The classifiers used here are nearest neighbors, linear SVM, RBF SVM, decision
tree, random forest, logistic regression, naive bayes, and LDA.
"""
__author__ = "Sergey Plis"
__copyright__ = "Copyright 2015, Mind Research Network"
__credits__ = ["Sergey Plis", "Devon Hjelm", "Alvaro Ulloa"]
__license__ = "3-clause BSD"
__email__ = "[email protected]"
__maintainer__ = "Sergey Plis"

# We may want to load memmaps eventually, so here's a flag to control this.
USEJOBLIB = False

import argparse
import functools
from glob import glob
import logging

if USEJOBLIB:
    from joblib.pool import MemmapingPool as Pool
    from joblib.pool import ArrayMemmapReducer as Array
else:
    from multiprocessing import Pool
    from multiprocessing import Array

import matplotlib as mpl
mpl.use("Agg")
import matplotlib.pyplot as pl
import multiprocessing
import numpy as np
import os
from os import path
import pandas as pd
import pickle
import random as rndc
from scipy.io import savemat
from scipy.spatial.distance import pdist
import seaborn as sb
from sklearn.metrics import auc
from sklearn.metrics import roc_curve as rc
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import KFold
from sklearn.cross_validation import StratifiedKFold
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.lda import LDA
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.qda import QDA
import sys
logging.basicConfig(format="[%(module)s:%(levelname)s]:%(message)s")
logger = logging.getLogger(__name__)
# Set PROCESSORS to no more than the number of cores on the machine that will
# run this script, but high enough to speed up the computation.
PROCESSORS = 20
# random.Random.seed() returns None, so draw a value from the system RNG
# instead; the original called `rndc.SystemRandom().seed()`, which always
# yielded None.
seed = rndc.SystemRandom().random()

NAMES = ["Nearest Neighbors", "Linear SVM", "RBF SVM", "Decision Tree",
         "Random Forest", "Logistic Regression", "Naive Bayes", "LDA"]

def make_classifiers(data_shape, ksplit):
    """Function that makes classifiers each with a number of folds.

    Returns two dictionaries for the classifiers and their parameters.
    `data_shape` sizes the LDA parameter grid; `ksplit` is accepted for API
    symmetry but is not currently used in construction.

    Parameters
    ----------
    data_shape : tuple of int
        Shape of the data. Must be a pair of integers.
    ksplit : int
        Number of folds.

    Returns
    -------
    classifiers: dict
        The dictionary of classifiers to be used.
    params: dict
        A dictionary of lists of dictionaries of the corresponding
        parameters for each classifier.
    """
    if len(data_shape) != 2:
        raise ValueError("Only 2-d data allowed (samples by dimension).")

    classifiers = {
        "Nearest Neighbors": KNeighborsClassifier(3),
        "Linear SVM": SVC(kernel="linear", C=1, probability=True),
        "RBF SVM": SVC(gamma=2, C=1, probability=True),
        "Decision Tree": DecisionTreeClassifier(max_depth=None,
                                                max_features="auto"),
        "Random Forest": RandomForestClassifier(max_depth=None,
                                                n_estimators=10,
                                                max_features="auto",
                                                n_jobs=PROCESSORS),
        "Logistic Regression": LogisticRegression(),
        "Naive Bayes": GaussianNB(),
        "LDA": LDA(),
    }
    params = {
        "Nearest Neighbors": [{"n_neighbors": [1, 5, 10, 20]}],
        "Linear SVM": [{"kernel": ["linear"], "C": [1]}],
        "RBF SVM": [{"kernel": ["rbf"],
                     "gamma": np.arange(0.1, 1, 0.1).tolist() + list(range(1, 10)),
                     "C": np.logspace(-2, 2, 5).tolist()}],
        "Decision Tree": [],
        "Random Forest": [{"n_estimators": list(range(5, 20))}],
        "Logistic Regression": [{"C": np.logspace(0.1, 3, 7).tolist()}],
        "Naive Bayes": [],
        "LDA": [{"n_components": [int(0.1 * data_shape[0]),
                                  int(0.2 * data_shape[0]),
                                  int(0.3 * data_shape[0]),
                                  int(0.5 * data_shape[0]),
                                  int(0.7 * data_shape[0])]}],
    }
    logger.info("Using classifiers %r with params %r" % (classifiers, params))
    return classifiers, params

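# A minimal usage sketch (the shapes here are illustrative, not from the
# original): build the classifier and parameter tables for a 100-sample,
# 50-feature data set with 10 folds.
#
#   classifiers, params = make_classifiers((100, 50), 10)
#   classifiers["Linear SVM"]    # SVC(kernel="linear", C=1, probability=True)
#   params["Nearest Neighbors"]  # [{"n_neighbors": [1, 5, 10, 20]}]
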
def get_score(data, labels, fold_pairs, name, model, param):
    """
    Function to get the score for a classifier.

    Parameters
    ----------
    data: array_like
        Data from which to derive the score.
    labels: array_like or list
        Corresponding labels for each sample.
    fold_pairs: list of pairs of array_like
        A list of train/test indices for each fold.
        dhjelm: Why can't we just use the KFold object?
    name: str
        Name of the classifier.
    model: sklearn estimator
        Classifier instance to be fit.
    param: list of dict
        Parameter grid for the classifier.

    Returns
    -------
    classifier: sklearn estimator
        The classifier fit on the last fold.
    fScore: list of float
        Test-fold AUC for each fold.
    """
    assert isinstance(name, str)
    logger.info("Classifying %s" % name)
    ksplit = len(fold_pairs)
    if name not in NAMES:
        raise ValueError("Classifier %s not supported. "
                         "Did you enter it properly?" % name)

    # The parameters for RBF SVM are redefined inside get_classifier, since
    # they depend on the training data of each fold.
    # The multiprocessing path below is untested, so run the folds serially.
    serial_folds = True
    if serial_folds:
        logger.info("Attempting to use grid search...")
        fScore = []
        for i, fold_pair in enumerate(fold_pairs):
            print("Classifying %s: fold %d of %d..."
                  % (name, i + 1, len(fold_pairs)))
            classifier = get_classifier(
                name, model, param, data[fold_pair[0], :])
            area = classify(data, labels, fold_pair, classifier)
            fScore.append(area)
    else:
        # Untested: multiprocessing cannot pickle a lambda, so classify_func
        # would need to become a module-level function before this can run.
        logger.warn("Multiprocessing splits not tested yet.")
        classifier = None
        pool = Pool(processes=min(ksplit, PROCESSORS))
        classify_func = lambda f: classify(
            data,
            labels,
            fold_pairs[f],
            classifier=get_classifier(
                name,
                model,
                param,
                data=data[fold_pairs[f][0], :]))
        fScore = pool.map(classify_func, range(ksplit))
        pool.close()
        pool.join()
    return classifier, fScore

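# Sketch of scoring a single classifier over precomputed folds (`data`,
# `labels`, and `fold_pairs` as produced in `main` below; "Naive Bayes"
# takes an empty parameter grid, so no grid search is run):
#
#   clf, fScores = get_score(data, labels, fold_pairs,
#                            "Naive Bayes", GaussianNB(), [])
#   np.mean(fScores)  # mean AUC across folds
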
def get_classifier(name, model, param, data=None):
    """
    Returns the classifier for the model.

    Parameters
    ----------
    name: str
        Classifier name.
    model: sklearn estimator
        Base classifier instance.
    param: list of dict
        Parameter grid for the grid search.
    data: array_like, optional
        Training data, required to set the RBF SVM gamma grid.

    Returns
    -------
    model: sklearn estimator
        The classifier, wrapped in a GridSearchCV when a parameter grid
        is available.
    """
    assert isinstance(name, str)
    if name == "RBF SVM":
        logger.info("RBF SVM requires some preprocessing. "
                    "This may take a while.")
        assert data is not None
        # Euclidean distances between samples
        dist = pdist(data, "euclidean").ravel()
        # Estimates for sigma (10th, 50th and 90th percentile)
        sigest = np.asarray(np.percentile(dist, [10, 50, 90]))
        # Estimates for gamma: the RBF kernel is exp(-gamma * ||x - y||^2),
        # so gamma = 1 / (2 * sigma^2).
        gamma = 1. / (2 * sigest ** 2)
        # Set SVM parameters with these values
        param = [{"kernel": ["rbf"],
                  "gamma": gamma.tolist(),
                  "C": np.logspace(-2, 2, 5).tolist()}]
    if name not in ["Decision Tree", "Naive Bayes"]:
        # 5-fold inner cross validation is a common default for grid search.
        logger.info("Using grid search for %s" % name)
        model = GridSearchCV(model, param, cv=5, scoring="f1",
                             n_jobs=PROCESSORS)
    else:
        logger.info("Not using grid search for %s" % name)
    return model

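# Worked example of the RBF width heuristic above (numbers are illustrative):
# if the 10th/50th/90th percentiles of the pairwise distances come out as
# sigest = [2.0, 4.0, 8.0], then gamma = 1. / (2 * sigest ** 2) yields
# [0.125, 0.03125, 0.0078125], and the grid search considers each of these
# together with C in np.logspace(-2, 2, 5) = [0.01, 0.1, 1.0, 10.0, 100.0].
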
def classify(data, labels, fold_pair, classifier=None):
    """
    Classifies given a fold and a model.

    Parameters
    ----------
    data: array_like
        2d matrix of observations vs variables.
    labels: list or array_like
        1d vector of labels for each data observation.
    fold_pair: tuple of array_like
        Pair of (train_idx, test_idx) indices for splitting the data
        into train and test sets.
    classifier: sklearn classifier object
        Initialized classifier with "fit" and "predict_proba" methods.

    Returns
    -------
    float
        Area under the ROC curve on the test fold.
    """
    assert classifier is not None, "A classifier instance is required."
    train_idx, test_idx = fold_pair
    # Data scaling based on training set
    scaler = StandardScaler()
    scaler.fit(data[train_idx])
    data_train = scaler.transform(data[train_idx])
    data_test = scaler.transform(data[test_idx])
    classifier.fit(data_train, labels[train_idx])
    fpr, tpr, thresholds = rc(labels[test_idx],
                              classifier.predict_proba(data_test)[:, 1])
    return auc(fpr, tpr)

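# A minimal sketch of calling `classify` directly, using the old
# StratifiedKFold API imported above (`X` and `y` are hypothetical):
#
#   X = np.random.randn(100, 5)
#   y = np.random.randint(0, 2, 100)
#   folds = [(tr, ts) for (tr, ts) in StratifiedKFold(y, n_folds=3)]
#   fold_auc = classify(X, y, folds[0], classifier=GaussianNB())
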
def load_data(source_dir, data_pattern):
    """
    Loads the data from multiple sources if provided.

    Parameters
    ----------
    source_dir: str
        Source directory of data files.
    data_pattern: str
        Unix glob pattern for data files.

    Returns
    -------
    data: array_like
    """
    logger.info("Loading data from %s with pattern %s"
                % (source_dir, data_pattern))
    data_files = glob(path.join(source_dir, data_pattern))
    if len(data_files) == 0:
        raise ValueError("No data files found with pattern %s in %s"
                         % (data_pattern, source_dir))
    data = None
    for data_file in data_files:
        d = np.load(data_file)
        if data is not None:
            assert d.shape[0] == data.shape[0]
            data = np.concatenate((data, d), axis=1)
        else:
            data = d
    logger.info("Data loading complete. Shape is %r" % (data.shape,))
    return data

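# Sketch of the expected on-disk layout (the path is hypothetical; the file
# name is the default from the argument parser below). Every matching .npy
# file must have the same number of rows; their columns are concatenated:
#
#   np.save("/tmp/poly/data.npy", np.random.randn(100, 50))
#   data = load_data("/tmp/poly", "data.npy")  # shape (100, 50)
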
def load_labels(source_dir, label_pattern):
    """
    Function to load the labels file.

    Parameters
    ----------
    source_dir: str
        Source directory of labels.
    label_pattern: str
        Unix glob pattern for label files.

    Returns
    -------
    labels: array_like
        A numpy vector of the labels.
    """
    logger.info("Loading labels from %s with pattern %s"
                % (source_dir, label_pattern))
    label_files = glob(path.join(source_dir, label_pattern))
    if len(label_files) == 0:
        raise ValueError("No label files found with pattern %s"
                         % label_pattern)
    if len(label_files) > 1:
        raise ValueError("Only one label file supported ATM.")
    labels = np.load(label_files[0]).flatten()
    logger.info("Label loading complete. Shape is %r" % (labels.shape,))
    return labels

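# Companion sketch for the labels file (hypothetical path; exactly one file
# may match the pattern):
#
#   np.save("/tmp/poly/labels.npy", np.random.randint(0, 2, 100))
#   labels = load_labels("/tmp/poly", "labels.npy")  # shape (100,)
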
def main(source_dir, ksplit, out_dir, data_pattern, label_pattern, test_mode):
    """
    Main function for polyssifier.

    Parameters
    ----------
    source_dir: str
        Directory in which the data and label files live.
    ksplit: int
        Number of folds for cross validation.
    out_dir: str
        Directory to which results are saved.
    data_pattern: str
        Unix glob pattern for data files.
    label_pattern: str
        Unix glob pattern for label files.
    test_mode: bool
        Whether to restrict the run to fast classifiers and 3 folds.
    """
    # Load input and labels.
    data = load_data(source_dir, data_pattern)
    labels = load_labels(source_dir, label_pattern)

    # Get classifiers and params.
    classifiers, params = make_classifiers(data.shape, ksplit)

    global NAMES
    if test_mode:
        NAMES = ["Nearest Neighbors", "Linear SVM", "Decision Tree",
                 "Logistic Regression", "Naive Bayes", "LDA"]
        classifiers = dict((k, classifiers[k]) for k in NAMES)
        params = dict((k, params[k]) for k in NAMES)
        ksplit = 3

    # Make the folds.
    logger.info("Making %d folds" % ksplit)
    kf = StratifiedKFold(labels, n_folds=ksplit)

    # Extract the training and testing indices from the k-fold object,
    # which stores fold pairs of indices.
    fold_pairs = [(tr, ts) for (tr, ts) in kf]
    assert len(fold_pairs) == ksplit

    # Per-classifier (mean, std) of AUC, plus the raw per-fold scores.
    score = {}
    dscore = []
    for name in NAMES:
        mdl = classifiers[name]
        param = params[name]
        # Get the scores.
        clf, fScores = get_score(data, labels,
                                 fold_pairs, name,
                                 mdl, param)
        if out_dir is not None:
            save_path = path.join(out_dir,
                                  name + "%.2f.pkl" % (np.mean(fScores)))
            logger.info("Saving classifier to %s" % save_path)
            with open(save_path, "wb") as f:
                pickle.dump(clf, f)
        dscore.append(fScores)
        score[name] = (np.mean(fScores), np.std(fScores))

    if out_dir is not None:
        # Save results from all folds for estimating the AUC std.
        with open(path.join(out_dir, "auc_score.pkl"), "wb") as f:
            pickle.dump(score, f)
    dscore = np.asarray(dscore)

    pl.figure(figsize=[10, 6])
    ax = pl.gca()
    ds = pd.DataFrame(dscore.T, columns=NAMES)
    ds_long = pd.melt(ds)
    sb.barplot(x="variable", y="value", data=ds_long, palette="Paired")
    ax.set_xticks(np.arange(len(NAMES)))
    ax.set_xticklabels(NAMES, rotation=30)
    ax.set_ylabel("classification AUC")
    pl.subplots_adjust(bottom=0.18)
    if out_dir is not None:
        pl.savefig(path.join(out_dir, "classifiers.png"))
    else:
        pl.show(True)

def make_argument_parser():
    """
    Creates an ArgumentParser to read the options for this script from
    sys.argv.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("data_directory",
                        help="Directory where the data files live.")
    parser.add_argument("out", help="Output directory of files.")
    parser.add_argument("-t", "--test", action="store_true",
                        help=("Test mode, avoids slow classifiers and uses"
                              " 3 folds"))
    parser.add_argument("--folds", default=10, type=int,
                        help="Number of folds for n-fold cross validation")
    parser.add_argument("--data_pattern", default="data.npy",
                        help="Pattern for data files")
    parser.add_argument("--label_pattern", default="labels.npy",
                        help="Pattern for label files")
    return parser

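# Example invocation (paths are hypothetical; the flags match the parser
# above). Add -t/--test to restrict the run to the fast classifiers and
# 3 folds:
#
#   python polyssifier.py /data/study /tmp/results --folds 10 \
#       --data_pattern "data.npy" --label_pattern "labels.npy"
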
if __name__ == "__main__":
    CPUS = multiprocessing.cpu_count()
    if CPUS < PROCESSORS:
        raise ValueError("Number of PROCESSORS exceeds available CPUs; "
                         "please edit this in the script and come again!")

    parser = make_argument_parser()
    args = parser.parse_args()
    logger.setLevel(logging.DEBUG)

    main(args.data_directory, out_dir=args.out, ksplit=int(args.folds),
         data_pattern=args.data_pattern, label_pattern=args.label_pattern,
         test_mode=args.test)