#!/usr/bin/python3
import sys
import random
import argparse
import itertools
import scipy.io
import numpy as np
from tqdm import tqdm
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import ParameterGrid
from joblib import Parallel, delayed
"""
Constants
"""
MAX_ITERS = -1  # -1 lets sklearn's SVC run without an iteration cap
TOL = 1e-3      # SVC optimizer stopping tolerance
gamma_range = {'start': -15, 'stop': 3, 'num': 7, 'base': 2.0}
C_range = {'start': -3, 'stop': 15, 'num': 7, 'base': 2.0}
kernels = ['rbf', 'sigmoid']
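# The two range dicts above are expanded in get_optimal_run via np.logspace:
# seven values each, stepping the exponent by 3.
#
#   gamma: 2**-15, 2**-12, 2**-9, 2**-6, 2**-3, 2**0,  2**3
#   C:     2**-3,  2**0,   2**3,  2**6,  2**9,  2**12, 2**15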

def write_readme(path):
    """Record the command line and hyperparameter grid used for this run."""
    with open(f'{path}/README', 'w') as f:
        f.write('Command line arguments:\n')
        f.write(f'{" ".join(sys.argv)}\n\n')
        f.write('Hyperparameter set:\n')
        f.write(f'Gamma: {gamma_range}\n')
        f.write(f'C: {C_range}\n')
        f.write(f'Kernels: {kernels}\n')

def get_subjects(path):
    """Return sorted two-letter subject IDs found in the .mat file's run names."""
    mat = scipy.io.loadmat(path)['ipOnly_roi_scanData']
    # Run names live in mat[1]; the first two characters identify the subject
    return sorted(set(s[0][:2] for s in mat[1]))

def scramble_labels(y_data):
    """Flip a quarter of each class's labels in place, so about half of all
    labels change while the class balance is preserved."""
    classes = sorted(set(y_data))
    y_data_copy = y_data.copy()
    to_change = random.sample(
        [i for i, x in enumerate(y_data) if x == classes[0]], k=len(y_data)//4)
    to_change.extend(random.sample(
        [i for i, x in enumerate(y_data) if x == classes[1]], k=len(y_data)//4))
    for index in to_change:
        if y_data[index] == classes[0]:
            y_data[index] = classes[1]
        else:
            y_data[index] = classes[0]
    # Make sure labels were scrambled properly: about half should now differ
    num_diff = sum(i != j for i, j in zip(y_data, y_data_copy))
    if abs(len(y_data)//2 - num_diff) > 1:
        raise ValueError(
            f'label scrambling failed: {num_diff} of {len(y_data)} labels changed')
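# A minimal sanity check for scramble_labels (hypothetical labels):
#
#     >>> labels = [0, 0, 0, 0, 1, 1, 1, 1]
#     >>> scramble_labels(labels)        # flips two 0s and two 1s in place
#     >>> sum(l == 1 for l in labels)    # class balance is preserved
#     4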

def extract_subject_data(mat, subject, roi, use_pre):
    """Pull one subject's blocks for the given ROI and return voxel-wise
    samples ('x') with binary condition labels ('y')."""
    subject_runs = []
    for i in range(len(mat[1])):
        if mat[1][i][0][:2] == subject:
            subject_runs.append(i)
    assert len(subject_runs) == 2
    x_data = [[] for _ in range(2)]
    for scan in mat[0][subject_runs[use_pre]][0][roi][0]:
        for c, cond in enumerate(scan[0]):
            for block in cond[0]:
                block_data = []
                # Trim each block to a fixed window of 8 TRs
                if len(block[0]) == 8:
                    trs = block[0]
                elif len(block[0]) == 9:
                    trs = block[0][0:8]
                else:
                    trs = block[0][1:9]
                for tr in trs:
                    # Extract all voxel data from individual TRs
                    block_data.append(tr[0][0][0].tolist())
                # Pre-training runs index their conditions directly;
                # post-training runs fold their four conditions into two classes
                if use_pre:
                    x_data[c].append(block_data)
                else:
                    x_data[c // 2].append(block_data)
                # y_data.append('untrained' if 'untrained' in scan[1][cond][0] else 'trained')
    x_data_f, y_data_f = [], []
    # Flatten and transpose so each sample is one voxel's time course
    for c in range(2):
        trs = []
        for a in x_data[c]:
            trs.extend(a)
        trs = np.array(trs).T
        x_data_f.extend(trs)
        y_data_f.extend([c for _ in trs])
    return {'x': x_data_f, 'y': y_data_f}
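# Note the orientation: after the transpose in extract_subject_data, each
# sample handed to the SVM is one voxel's concatenated 8-TR time courses
# across blocks, so the classifier labels voxels by condition rather than
# classifying individual time points.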

def generate_dataset(mat, subjects, roi, use_pre):
    """Build per-subject feature/label dicts, min-max scaled across all
    subjects, plus a label-scrambled copy for permutation testing."""
    x_data = []
    x_data_indices = []
    y_data_by_subject = dict()
    for subject in subjects:
        subject_data = extract_subject_data(mat, subject, roi, use_pre)
        x_data_indices.append(len(x_data))
        y_data_by_subject[subject] = subject_data['y']
        x_data.extend(subject_data['x'])
    # MinMaxScaler scales each feature to [0, 1] across all subjects' data
    scaler = MinMaxScaler(feature_range=(0, 1))
    x_standardized = scaler.fit_transform(x_data)
    # Sort block data back into its respective subject
    x_data_by_subject = dict()
    for i, subject in enumerate(subjects):
        start_index = x_data_indices[i]
        end_index = (x_data_indices[i + 1]
                     if i + 1 < len(x_data_indices) else len(x_data))
        x_data_by_subject[subject] = x_standardized[start_index:end_index]
    # Order both dicts by each subject's sample count so they stay aligned
    x_data_by_subject = {k: v for k, v in sorted(
        x_data_by_subject.items(), key=lambda item: len(item[1]))}
    y_data_by_subject = {k: v for k, v in sorted(
        y_data_by_subject.items(), key=lambda item: len(item[1]))}
    y_data_by_subject_shuffled = dict()
    for k, v in y_data_by_subject.items():
        scrambled = v.copy()
        scramble_labels(scrambled)
        y_data_by_subject_shuffled[k] = scrambled
    return x_data_by_subject, y_data_by_subject, y_data_by_subject_shuffled

def split_dataset(x_data, y_data, y_data_shuffled, inner_subjects, outer_subject, scramble):
    """Split subjects into training data, inner (validation) test subjects,
    and the held-out outer test subject; optionally scramble training labels."""
x_train = []
y_train = []
x_test_inner = []
y_test_inner = []
x_test_outer = []
y_test_outer = []
for subject in x_data.keys():
if subject == outer_subject:
x_test_outer.extend(x_data[subject])
y_test_outer.extend(y_data[subject])
elif subject in inner_subjects:
x_test_inner.extend(x_data[subject])
y_test_inner.extend(y_data[subject])
else:
x_train.extend(x_data[subject])
if scramble:
y_train.extend(y_data_shuffled[subject])
else:
y_train.extend(y_data[subject])
return x_train, y_train, x_test_inner, y_test_inner, x_test_outer, y_test_outer
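# Sketch of the nested split for hypothetical subjects ['AA', 'BB', 'CC']:
# with outer_subject='AA' and inner_subjects=['BB'], the model trains on
# 'CC', is validated on 'BB', and is finally tested on the held-out 'AA'.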

def get_optimal_run(x_train, y_train, x_test, y_test, kernels, gamma_range, C_range, n_cores):
    """Grid-search kernel/gamma/C in parallel; return the best parameters and
    their validation accuracy."""
    gamma_vals = np.logspace(
        gamma_range['start'], gamma_range['stop'], gamma_range['num'], base=gamma_range['base'])
    C_vals = np.logspace(
        C_range['start'], C_range['stop'], C_range['num'], base=C_range['base'])
    param_grid = ParameterGrid(
        {'kernel': kernels, 'gamma': gamma_vals, 'C': C_vals})

    def evaluate_acc(params):
        svclassifier = SVC(kernel=params['kernel'], gamma=params['gamma'],
                           C=params['C'], max_iter=MAX_ITERS, tol=TOL)
        svclassifier.fit(x_train, y_train)
        return svclassifier.score(x_test, y_test), params

    search = Parallel(n_jobs=n_cores)(delayed(evaluate_acc)(params)
                                      for params in param_grid)
    # Sort by parameter values so ties are broken deterministically
    search.sort(key=lambda x: tuple(x[1].values()))
    best_acc = 0
    best_params = None
    for acc, params in search:
        if acc > best_acc:
            best_acc = acc
            best_params = params
    return best_params, best_acc
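# With the module-level ranges this evaluates 2 kernels x 7 gamma values x
# 7 C values = 98 SVC fits per inner fold, distributed over n_cores workers.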

def train(data_params, grid_params, num_inner=1, scramble=False, n_cores=1, output_path='.'):
    """
    Runs nested leave-one-subject-out cross-validation.

    Parameters
    ----------
    data_params: dict
        'path' to the input .mat file, 'roi' index (V1 is 0, MT is 1), and
        'cond' (0 is post-training, 1 is pre-training)
    grid_params: dict
        'gamma', 'C', and 'kernels' values for the grid search
    num_inner: int
        number of subjects held out per inner fold, default is 1
    scramble: bool
        train on scrambled labels for permutation testing, default is False
    n_cores: int
        number of CPU cores for parallelization, default is 1
    output_path: str
        path to which accuracy reports are saved, default is current directory
    """
    subjects = get_subjects(data_params['path'])
    if data_params['cond'] == 0:
        # Subject 'CG' is excluded from the post-training condition
        subjects.remove('CG')
    mat = scipy.io.loadmat(data_params['path'])['ipOnly_roi_scanData']
    x_data, y_data, y_data_shuffled = generate_dataset(
        mat, subjects, data_params['roi'], data_params['cond'])
    inner_acc_report = []
    outer_acc_report = []
    for outer_subject in tqdm(subjects, leave=(not scramble)):
        opt_inner_params = None
        opt_inner_acc = -1
        inner_subjects = [s for s in subjects if s != outer_subject]
        # Iterate through inner folds
        for inner_test_subjects in itertools.combinations(inner_subjects, num_inner):
            inner_test_subjects = list(inner_test_subjects)
            x_train, y_train, x_test_inner, y_test_inner, x_test_outer, y_test_outer = split_dataset(
                x_data, y_data, y_data_shuffled, inner_test_subjects, outer_subject, scramble)
            assert len(set(y_train)) == 2
            assert len(set(y_test_inner)) == 2
            assert len(set(y_test_outer)) == 2
            # Get optimal params for this training set from the grid search
            opt_params, inner_acc = get_optimal_run(
                x_train, y_train, x_test_inner, y_test_inner,
                grid_params['kernels'], grid_params['gamma'], grid_params['C'], n_cores)
            # Track optimal params among all inner folds
            if opt_params is not None:
                if inner_acc > opt_inner_acc:
                    opt_inner_acc = inner_acc
                    opt_inner_params = opt_params
                inner_acc_report.append(inner_acc)
        # Retrain on all inner subjects and test the outer subject using the
        # optimal params across all inner folds
        x_train, y_train, _, _, x_test_outer, y_test_outer = split_dataset(
            x_data, y_data, y_data_shuffled, [], outer_subject, scramble)
        svclassifier = SVC(kernel=opt_inner_params['kernel'], gamma=opt_inner_params['gamma'],
                           C=opt_inner_params['C'], max_iter=MAX_ITERS, tol=TOL)
        svclassifier.fit(x_train, y_train)
        outer_acc = svclassifier.score(x_test_outer, y_test_outer)
        outer_acc_report.append(outer_acc)
        # Checkpoint accuracies after every outer subject
        np.save(f'{output_path}/outer_accs.npy', outer_acc_report)
        np.save(f'{output_path}/inner_accs.npy', inner_acc_report)
    return inner_acc_report, outer_acc_report

def permutation(data_params, grid_params, inner_dist, outer_dist, runs=30, n_cores=1, output_path='.'):
    """
    Performs a specified number of runs where data labels are scrambled.

    Parameters
    ----------
    data_params: dict
        contains specifications for data processing (see train method for documentation)
    grid_params: dict
        contains values for grid search (see train method for documentation)
    inner_dist: list
        holds accuracy values for individual inner subject tests
    outer_dist: list
        holds accuracy values for individual outer subject tests
    runs: int
        number of runs to perform, default is 30
    n_cores: int
        number of CPU cores for parallelization, default is 1
    output_path: str
        path to which files should be saved, default is current directory
    """
    for _ in tqdm(range(runs)):
        inner_result, outer_result = train(
            data_params, grid_params, scramble=True, n_cores=n_cores,
            output_path=output_path)
        inner_dist.append(inner_result)
        outer_dist.append(outer_result)
        # Checkpoint the permutation distributions after every run
        np.save(f'{output_path}/outer_perms.npy', outer_dist)
        np.save(f'{output_path}/inner_perms.npy', inner_dist)
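# One common way to use the saved distributions (an illustrative sketch, not
# part of this script): compare each observed accuracy against the permuted
# ones, e.g. p = (1 + np.sum(perms >= observed)) / (1 + len(perms)).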
"""
Parse arguments and run appropriate analysis
"""
parser = argparse.ArgumentParser()
parser.add_argument("indir", metavar="INDIR", help="directory to input data")
parser.add_argument("outdir", metavar="OUTDIR",
help="directory to output data")
parser.add_argument("-p", "--permute", action="store_true",
help="permute training data")
parser.add_argument("-n", "--run-count", action="store",
default=1, type=int, help="run RUN_COUNT permutations")
parser.add_argument("-r", "--roi", action="store",
type=int, help="ROI: V1 is 0, MT is 1")
parser.add_argument("-c", "--cond", action="store", type=int,
help="conditions: 0 is post-training, 1 is pre-training")
parser.add_argument("--cores", action="store", default=1, type=int,
help="use CORES number of CPU cores for parallelization")
args, extra = parser.parse_known_args()
if args.roi is None:
    parser.error('Need to specify ROI.')
if args.cond is None:
    parser.error('Need to specify cond.')
roi = args.roi
cond = args.cond
run_count = args.run_count
num_cores = args.cores
# Strip any trailing slash; the save paths below add their own separator
output_path = args.outdir.rstrip('/')
path = args.indir
data_params = {'path': path, 'roi': roi, 'cond': cond}
grid_params = {'gamma': gamma_range, 'C': C_range, 'kernels': kernels}
write_readme(output_path)
if args.permute:
try:
inner_dist = np.load(f'{output_path}/inner_perms.npy').tolist()
outer_dist = np.load(f'{output_path}/outer_perms.npy').tolist()
except FileNotFoundError:
print('Creating new permutation distribution...')
inner_dist = []
outer_dist = []
permutation(data_params, grid_params, inner_dist, outer_dist,
runs=run_count, output_path=output_path, n_cores=num_cores)
np.save(f'{output_path}/outer_perms.npy', outer_dist)
np.save(f'{output_path}/inner_perms.npy', inner_dist)
else:
inner_accs = []
outer_accs = []
    inner_result, outer_result = train(
        data_params, grid_params, n_cores=num_cores, output_path=output_path)
inner_accs.append(inner_result)
outer_accs.append(outer_result)
np.save(f'{output_path}/outer_accs.npy', outer_accs)
np.save(f'{output_path}/inner_accs.npy', inner_accs)