# meteore_reg.py
from sklearn.linear_model import RidgeCV
#from sklearn.linear_model import Ridge, LinearRegression, SGDRegressor,\
# ElasticNet, Lars, Lasso, ARDRegression, BayesianRidge, HuberRegressor,\
# PoissonRegressor, PassiveAggressiveRegressor
#from sklearn.svm import LinearSVR
#from sklearn.ensemble import RandomForestRegressor
#from sklearn.ensemble import StackingRegressor
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import sys
from math import floor
from argparse import ArgumentParser as AP
from collections import Counter
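# Overview: this script fits a regression (RidgeCV by default) that combines
# per-read, per-site methylation scores from several tools into a single
# consensus score, reports per-tool and model accuracy on the training data,
# and optionally applies the fitted model to a test set. The file layouts
# sketched in the comments below are inferred from the parsing code in this
# file and are illustrative only.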
def parse_training_loci(fn):
"""
Get all reads and positions to check for overlap before reading values.
Expects to get a Chr column after ReadID and before Pos, but will cope if it's missing
"""
with open(fn, 'rt') as f:
group_read_chrom_pos = set()
no_chrom = False
for i, row in enumerate(f):
if i == 0:
if 'chr' not in row.lower():
no_chrom = True
continue
cols = row.strip().split('\t')
group = int(cols[0].strip().replace('m',''))
read = cols[1].strip()
if no_chrom:
chrom = '.'
pos = int(cols[2].strip())
else:
chrom = cols[2].strip()
pos = int(cols[3].strip())
entry = (group, read, chrom, pos)
if entry in group_read_chrom_pos:
                print(fn, 'duplicate group, read, chrom, position:', entry)
group_read_chrom_pos.add(entry)
return group_read_chrom_pos
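# Illustrative sketch of the tab-separated training input assumed by
# parse_training_loci/parse_training_values (header names are placeholders;
# only the column order matters, and the Chr column may be absent):
#   Group   ReadID      Chr     Pos     Strand  Label   Prediction  Score
#   m1      read_0001   chr1    10468   +       yes     yes         2.37
#   m2      read_0002   chr1    10470   -       no      no          -1.85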
def parse_training_values(fn, shared_ids, trunc_score_min, trunc_score_max, debug=False):
""" parse file contents into dictionary of [(group,read,chrom,pos)] = (strand, label, prediction, score) """
truncated = 0
contents = {}
no_chrom = False
with open(fn, 'rt') as f:
for i, row in enumerate(f):
if i == 0:
if 'chr' not in row.lower():
no_chrom = True
continue
cols = row.strip().split('\t')
group = int(cols[0].strip().replace('m',''))
read = cols[1].strip()
if no_chrom:
chrom = '.'
pos = int(cols[2].strip())
else:
chrom = cols[2].strip()
pos = int(cols[3].strip())
if (group, read, chrom, pos) in shared_ids:
starting_index = 3
if not no_chrom:
starting_index += 1
strand = 1 if cols[starting_index].strip() == '+' else 0
label = 1 if cols[starting_index+1].strip() == 'yes' else 0 # truth
predicted = 1 if cols[starting_index+2].strip() == 'yes' else 0
score = float(cols[starting_index+3].strip())
if trunc_score_max:
if score > trunc_score_max:
score = trunc_score_max
truncated += 1
if trunc_score_min:
if score < trunc_score_min:
score = trunc_score_min
truncated += 1
contents[(group, read, chrom, pos)] = (strand, label, predicted, score)
if debug:
print(truncated, 'scores truncated', file=sys.stderr)
return contents
def parse_test_loci(fn):
""" Parse test data (doesn't contain known outcomes or groups) """
with open(fn, 'rt') as f:
read_chrom_pos = set()
no_chrom = False
for i, row in enumerate(f):
if i == 0:
if 'chr' not in row.lower():
no_chrom = True
continue
cols = row.strip().split('\t')
read = cols[0].strip()
if no_chrom:
chrom = '.'
pos = int(cols[1].strip())
else:
chrom = cols[1].strip()
pos = int(cols[2].strip())
entry = (read, chrom, pos)
if entry in read_chrom_pos:
                print(fn, 'duplicate read, chrom, position:', entry)
read_chrom_pos.add(entry)
return read_chrom_pos
def parse_test_values(fn, shared_ids, trunc_score_min, trunc_score_max, debug=False):
"""
Parse test file for shared_ids and write values to value_array
If first, place into cols 0, 2, else cols 1, 3
If trunc_score_min/max then set more extreme scores to this. Important for balancing
contributions to regression.
modify in place
"""
truncated = 0
contents = {}
no_chrom = False
with open(fn, 'rt') as f:
for i, row in enumerate(f):
if i == 0:
if 'chr' not in row.lower():
no_chrom = True
continue
cols = row.strip().split('\t')
read = cols[0].strip()
if no_chrom:
chrom = '.'
pos = int(cols[1].strip())
else:
chrom = cols[1].strip()
pos = int(cols[2].strip())
if (read, chrom, pos) in shared_ids:
if no_chrom:
strand = 1 if cols[2].strip() == '+' else 0
score = float(cols[3].strip())
else:
strand = 1 if cols[3].strip() == '+' else 0
score = float(cols[4].strip())
if trunc_score_max:
if score > trunc_score_max:
score = trunc_score_max
truncated += 1
if trunc_score_min:
if score < trunc_score_min:
score = trunc_score_min
truncated += 1
contents[(read, chrom, pos)] = (strand, score)
if debug:
print(truncated, 'scores were truncated', file=sys.stderr)
return contents
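# Illustrative sketch of the tab-separated test input assumed by
# parse_test_loci/parse_test_values (placeholder header names; the Chr column
# may be absent, and there are no Group/Label/Prediction columns):
#   ReadID      Chr     Pos     Strand  Score
#   read_0001   chr1    10468   +       1.84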
def get_filter_vals_fixed_proportion(value_array, critical_value, filter=-1):
"""
Removes a fixed proportion of scores around the critical value as these are most uncertain in
terms of calling methylation status.
Find the rank of the critical value in array, then find the value corresponding to a
fixed number of ranks either side the mean. Return these min and max filter values.
"""
if filter == -1:
return critical_value, critical_value
else:
scores_sorted = np.sort(value_array[:,-1])
half_num_ignore = floor(len(scores_sorted)*(filter/2))
mean_index = np.argmax(scores_sorted>critical_value)-1
filter_min = scores_sorted[mean_index-half_num_ignore]
filter_max = scores_sorted[mean_index+half_num_ignore]
#print('mean score:',critical_value,'filter min:', filter_min, 'filter max:',
# filter_max, file=sys.stderr)
return filter_min, filter_max
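# Worked example (illustrative numbers): with 1000 model scores and
# filter=0.10, half_num_ignore = floor(1000 * 0.05) = 50, so the returned
# bounds are the sorted scores 50 ranks below and 50 ranks above the rank of
# the first score exceeding critical_value. count_success then skips the
# roughly 10% of calls closest to the decision boundary.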
def count_success(critical_value, value_array, method_name, training_desc, test_desc,
tool_names, id_list, filter=-1, include_pred=False):
"""
If prediction is higher than the critical_value, consider as methylated
filter is the proportion of scores close to the critical value to ignore +/- the critical_value
"""
groups = len(tool_names)
score_index = 2
success = Counter()
total_ignored = 0
filter_min, filter_max = get_filter_vals_fixed_proportion(value_array, critical_value, filter)
fn = 'meteore_training_'+'_'.join(tool_names)+'_'+method_name+'_'+\
str(training_desc)+'_'+str(filter)+'_'+str(test_desc)+'.tsv'
if include_pred:
fn = fn.replace('.tsv','_inc_pred.tsv')
with open(fn, 'wt') as out:
print('\t'.join(map(str,['Group','Read','Chrom','Pos','Strand','Label','Prediction','Score'])), file=out)
for i, s in enumerate(value_array[:,-1]):
if s > filter_min and s < filter_max:
total_ignored += 1
continue
if int(value_array[i][0]) == 1: # truth about methylation status
for k in range(groups):
# check the tool's predicted methylation status
if value_array[i][score_index+groups+k] == 1:
success[tool_names[k]] += 1
if s > critical_value:
success['model'] += 1
else: # unmethylated
for k in range(groups):
if value_array[i][score_index+groups+k] == 0:
success[tool_names[k]] += 1
if s < critical_value:
success['model'] += 1
if s > critical_value:
# methylated
c = 1
else:
# unmethylated
c = 0
group, read, chrom, pos = id_list[i]
# 0 truth/label
# 1 strand
# 2 score - t1
# 3 score - t2
# 4 prediction - t1
# 5 prediction - t2
# 6 np.arange(0,len(shared_ids),dtype=np.single)
# 7 predicted_values
strand = value_array[i,1]
label = value_array[i,0]
c_str = 'no' if c == 0 else 'yes'
strand_str = '-' if strand == 0 else '+'
label_str = 'no' if label == 0 else 'yes'
print('\t'.join(map(str,[group,read,chrom,pos,strand_str,label_str,c_str,s])), file=out)
return success, total_ignored
def load_training_data(training_fns, trunc_min_scores,
trunc_max_scores, debug=False):
""" First parse group, read and position to find shared data points
Then read in training scores, truncating as appropriate """
# Parse file twice. First time get all the loci, second time all the value data
training_ids = [parse_training_loci(t_ids) for t_ids in training_fns]
shared_ids = set.intersection(*training_ids)
id_list = sorted(shared_ids)
if debug:
print('Number of shared ids in training sets:', len(shared_ids), file=sys.stderr)
# value_array contents
# 0 truth
# 1 strand
# 2:len(train)+2, tool score x train
# 2+len(train):2+len(train)*2, tool prediction x train
# 2+2*len(train) or -2, numeric order
# 3+2*len(train) or -1, model predicted score
groups = len(training_fns)
value_array = np.zeros((len(shared_ids),4+(2*groups)), dtype=np.single)
value_array[:,-2] = np.arange(0,len(shared_ids), dtype=np.single)
for index, (training_fn, t_min, t_max) in \
enumerate(zip(training_fns, trunc_min_scores, trunc_max_scores)):
# Read in values
contents = parse_training_values(training_fn, shared_ids, t_min, t_max, debug=debug)
for i, id in enumerate(id_list):
strand, label, predicted, score = contents[id]
if index == 0:
value_array[i,0] = label
value_array[i,1] = strand
value_array[i,2+index] = score
value_array[i,2+groups+index] = predicted
return value_array, id_list
def load_test_data(test_fns, trunc_min_scores,
trunc_max_scores, debug=False):
""" First parse group, read and position to find shared data points
Then read in test scores, truncating as appropriate """
# Parse file twice. First time get all the loci, second time all the value data
test_ids = [parse_test_loci(t_ids) for t_ids in test_fns]
shared_ids = set.intersection(*test_ids)
id_list = sorted(shared_ids)
if debug:
print('Number of shared ids in test sets:', len(shared_ids), file=sys.stderr)
# value_array contents
# 0 strand
# 1:len(train)+1, tool score x train
# 1+len(train) or -2, numeric order
# 2+len(train) or -1, model predicted score
groups = len(test_fns)
value_array = np.zeros((len(shared_ids),3+(groups)), dtype=np.single)
value_array[:,-2] = np.arange(0,len(shared_ids), dtype=np.single)
for index, (test_fn, t_min, t_max) in \
enumerate(zip(test_fns, trunc_min_scores, trunc_max_scores)):
# Read in values
contents = parse_test_values(test_fn, shared_ids,
t_min, t_max, debug=debug)
for i, id in enumerate(id_list):
strand, score = contents[id]
if index == 0:
value_array[i,0] = strand
value_array[i,1+index] = score
return value_array, id_list
def scale_data(value_array, groups, score_index, trunc_min, trunc_max):
"""
Adds min and max scaling values into temporary score array, rescales,
then writes back scores to value_array.
Parameters:
value_array : data matrix
groups : (int) number of data sets
score_index: index of first column belonging to scores. 2 for training, 1 for test
trunc_min/max: list of min/max score values
    Returns:
        value_array (the same array, modified in place)
"""
start = score_index
stop = score_index + groups
x, y = value_array[:,start:stop].shape
scores = np.zeros((x+2,y), dtype=np.single) # need 2 extra rows, for min and max
scores[:x,:] = value_array[:,start:stop]
for i in range(groups):
scores[x,i] = trunc_min[i] if trunc_min[i] is not None else -5
scores[x+1,i] = trunc_max[i] if trunc_max[i] is not None else 5
scores = MinMaxScaler().fit_transform(scores)
value_array[:,start:stop] = scores[:x,:]
return value_array
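# Worked example (illustrative numbers): if a tool's truncation bounds are
# trunc_min=-5 and trunc_max=5 (also the defaults used above when no bound is
# given), the two sentinel rows pin MinMaxScaler's column minimum and maximum,
# so a raw score of 0 always maps to 0.5 regardless of which scores happen to
# be present in that column. This keeps scaled scores comparable between runs.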
def load_env():
""" define interface and load any environment variables required """
parser = AP()
parser.add_argument('--train', nargs='+', required=True, help='Path to training data')
parser.add_argument('--train_desc', required=True , help='Describe the training data set')
parser.add_argument('--test', nargs='*', help='Path to test data. Must be omitted or '+\
        'same number of tool files as --train (and in the same order)')
parser.add_argument('--test_desc', help='Describe the test data set')
parser.add_argument('--tool_names', nargs='+', required=True, help='Names of each tool in the same order '+\
'as given for --train (and --test if provided)')
parser.add_argument('--trunc_max', nargs='*', type=float,
help='Truncate maximum values. Must match order of training data')
parser.add_argument('--trunc_min', nargs='*', type=float,
help='Truncate minimum values. Must match order of training data')
parser.add_argument('--debug', action='store_true', help='Turn on debugging output (STDERR)')
parser.add_argument('--inc_pred', action='store_true', help='Include tool prediction categories '+\
'in regression. NOTE: must be present in all input files')
    parser.add_argument('--filter', type=float, default=-1, help='Proportion (0-1) of total reads to remove '+\
        'around the critical score, default -1 (disabled)')
parser.add_argument('--testIsTraining', action='store_true',
help='Use functions for handling training data for the test parameters')
parser.add_argument('--no_scaling', action='store_true', help='Disable score scaling')
args = parser.parse_args()
# check that the number of command line arguments make sense
if len(args.train) != len(args.tool_names):
print('Tool names must match training files', file=sys.stderr)
sys.exit(-1)
if args.test and len(args.test) != len(args.train):
print('Test files must match training files', file=sys.stderr)
sys.exit(-1)
if args.test and not args.test_desc:
print('Description of test data set required', file=sys.stderr)
sys.exit(-1)
if args.trunc_max and len(args.trunc_max) != len(args.train):
print('Truncation maximum values must match against each training input', file=sys.stderr)
sys.exit(-1)
if args.trunc_min and len(args.trunc_min) != len(args.train):
print('Truncation minimum values must match against each training input', file=sys.stderr)
sys.exit(-1)
return args
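# Example invocation (a sketch only: file names, tool names, truncation bounds
# and the --filter value are placeholders, not values from this repository):
#   python meteore_reg.py \
#       --train toolA_train.tsv toolB_train.tsv --train_desc train_set \
#       --tool_names toolA toolB \
#       --test toolA_test.tsv toolB_test.tsv --test_desc sample1 \
#       --trunc_min -15 -10 --trunc_max 15 10 --filter 0.1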
def main():
args = load_env()
groups = len(args.train)
    # Use lists (not generator expressions) because these are iterated more
    # than once and indexed later in scale_data()
    if args.trunc_min is None:
        trunc_min_scores = [None] * groups
    else:
        trunc_min_scores = args.trunc_min
    if args.trunc_max is None:
        trunc_max_scores = [None] * groups
    else:
        trunc_max_scores = args.trunc_max
value_array, id_list = load_training_data(args.train,trunc_min_scores,
trunc_max_scores,debug=args.debug)
score_index = 2
start = score_index
stop = score_index + groups
if not args.no_scaling:
value_array = scale_data(value_array, groups, score_index, trunc_min_scores, trunc_max_scores)
# prepare a regression model to use later
#extra_regressors = [Ridge, LinearRegression, SGDRegressor, ElasticNet, Lars, Lasso, ARDRegression,
# BayesianRidge, HuberRegressor, PoissonRegressor, PassiveAggressiveRegressor]
regressors = [RidgeCV]
#extra_names = ['Ridge', 'LinearRegression', 'SGDRegressor', 'ElasticNet', 'Lars', 'Lasso',
# 'ARDRegression', 'BayesianRidge', 'HuberRegressor', 'PoissonRegressor',
# 'PassiveAggressiveRegressor']
reg_names = ['RidgeCV']
for r, n in zip(regressors, reg_names):
# sort array by scores prior to fitting regression
for k in range(groups):
if k==0:
value_array = value_array[value_array[:,score_index+k].argsort()]
else:
value_array = value_array[value_array[:,score_index+k].argsort(kind='mergesort')]
model = r()
if args.inc_pred:
model.fit(value_array[:,score_index:], value_array[:,0]) # fit against truth [:,0]
value_array[:,-1] = model.predict(value_array[:,start:])
else:
model.fit(value_array[:,score_index:score_index+groups], value_array[:,0])
value_array[:,-1] = model.predict(value_array[:,start:stop])
# sort by numeric id to bring back in line with ids
value_array = value_array[value_array[:,-2].argsort()]
critical_value = 0.5
if args.no_scaling:
critical_value = 0 # np.mean(value_array[:,7])
# We pass args.train_desc twice because we test on our training data first
success, total_ignored = count_success(critical_value, value_array, n,
args.train_desc, args.train_desc, args.tool_names, id_list,
filter=args.filter, include_pred=args.inc_pred)
#print ('Number of data points:', truth.shape[0], 'Number filtered out:', total_ignored)
divisor = value_array.shape[0] - total_ignored
fields = ['Filter', 'Ignored'] +\
['\t'.join([t+'_pass', t+'_prop']) for t in args.tool_names] +\
['Model_succ', 'Model_prop']
print('\t'.join(fields), file=sys.stderr)
fields = [args.filter, total_ignored] +\
['\t'.join(map(str, [success[t], success[t]/divisor])) for t in args.tool_names] +\
['\t'.join(map(str, [success['model'], success['model']/divisor]))]
print('\t'.join(map(str, fields)), file=sys.stderr)
### Load real data to test with our model
if args.test:
if args.testIsTraining:
value_array, id_list = load_training_data(args.test,trunc_min_scores,
trunc_max_scores,debug=args.debug)
score_index = 2
else:
value_array, id_list = load_test_data(args.test,trunc_min_scores,
trunc_max_scores,debug=args.debug)
score_index = 1
if not args.no_scaling:
value_array = scale_data(value_array, groups, score_index, trunc_min_scores, trunc_max_scores)
# sort array by scores prior to fitting regression
for k in range(groups):
if k==0:
value_array = value_array[value_array[:,score_index+k].argsort()]
else:
value_array = value_array[value_array[:,score_index+k].argsort(kind='mergesort')]
# Predict based on training model
if args.inc_pred:
value_array[:,-1] = model.predict(value_array[:,score_index:])
else:
value_array[:,-1] = model.predict(value_array[:,score_index:score_index+groups])
# sort by numeric id to bring back in line with ids
value_array = value_array[value_array[:,-2].argsort()]
# set tipping point between unmethylated/methylated call
critical_value = 0.5
if args.no_scaling:
critical_value = 0 # np.mean(value_array[:,7])
if args.testIsTraining:
success, total_ignored = count_success(critical_value, value_array, 'RidgeCV',
args.train_desc, args.test_desc, args.tool_names, id_list, filter=args.filter, include_pred=args.inc_pred)
#print ('Number of data points:', truth.shape[0], 'Number filtered out:', total_ignored)
divisor = value_array.shape[0] - total_ignored
fields = ['Filter', 'Ignored'] +\
['\t'.join([t+'_pass', t+'_prop']) for t in args.tool_names] +\
['Model_succ', 'Model_prop']
print('\t'.join(fields), file=sys.stderr)
fields = [args.filter, total_ignored] +\
['\t'.join(map(str,[success[t], success[t]/divisor])) for t in args.tool_names] +\
['\t'.join(map(str, [success['model'], success['model']/divisor]))]
print('\t'.join(map(str, fields)), file=sys.stderr)
else:
filter_min, filter_max = get_filter_vals_fixed_proportion(value_array,
critical_value, args.filter)
total_ignored = 0
fn = 'meteore_test_'+'_'.join(args.tool_names)+'_'+'RidgeCV'+'_'+\
str(args.train_desc)+'_'+str(args.filter)+'_'+str(args.test_desc)+'.tsv'
if args.inc_pred:
fn = fn.replace('.tsv','_inc_pred.tsv')
with open(fn, 'wt') as out:
print('\t'.join(map(str,['Read','Chrom','Pos','Strand','Score','Prediction'])), file=out)
for i, s in enumerate(value_array[:,-1]):
if s > filter_min and s < filter_max:
total_ignored += 1
continue
if s > critical_value:
c = 1 # methylated
else:
c = 0 # unmethylated
read, chrom, pos = id_list[i]
# 0 strand
# 1: scores - t1
# -1 predicted_values
strand = value_array[i,0]
c_str = 'no' if c == 0 else 'yes'
strand_str = '-' if strand == 0 else '+'
print('\t'.join(map(str,[read,chrom,pos,strand_str,s,c_str])), file=out)
if __name__ == '__main__':
main()