-
Notifications
You must be signed in to change notification settings - Fork 2
/
summarize_methylation_rates.py
executable file
·186 lines (151 loc) · 5.54 KB
/
summarize_methylation_rates.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
#!/usr/bin/env python
'''
Created on Oct 6, 2014
@author: Bong-Hyun Kim
'''
import os
import sys
from sys import stderr
from pysam import Samfile
from seq.genome import Genome2
from call.meplot import GenomeMethylationCounter
from os.path import basename
usage = '''
%s [-c <chromosome name>|-g <chromosome fasta file>|-o <output_fn>|-s <snp_correction>|-r <trim>|-t|-h] <sam file>
This program will count methylation in the given SAM file.
The SAM file can be output of any generic aligner but specifically tested
on GSNAP, BSMAP and Bismark.
Options:
-g <chromosome/genome FASTA file>
FASTA file containing the chromosome reference sequences.
-s <snp_correction>
Select type of SNP correction. Default value is No correction.
-s ctsnp for using reverse strand G/(A+G) ratio like in BSMAP
-s allnt using reverse strand A ratio
-l <trim> (default : 0)
# of nucleotides to trim from 5' end of reads.
-r <trim> (default : 0)
# of nucleotide to trim while counting the methylations.
This option is removing 3' biased region of reads.
-b <quality_base> (default: 33)
#important for -q and -c option to interprete the quality scores.
-q <quality_trim_cutoff> (default : off)
This option determines the trim position after appearance of this score.
-x <quality_score_exclustion_cutoff> (default : 0)
Lower than this threshold is not included in methylation counting.
-m <major allele cutoff> (default: 0.5)
-d <major allele detection depth cutoff> (default: 10)
-p
Flag for counting methylation in the reads for M-bias plot
-t
Flag for terse output
-v
Flag for debugging outputs via STDERR.
-u
Flag for including duplicate reads for methylation rates
-h
Print help message
''' % os.path.basename(sys.argv[0])
if __name__ == '__main__':
from getopt import getopt
options, args = getopt(sys.argv[1:], "m:d:r:g:s:o:q:x:b:l:upSvth")
if not args:
print usage
sys.exit()
#default setting
#Methylation Caller
mbiasplot_flag = False
mcount_flag = True
mcount_summary_flag = False
if basename(sys.argv[0]) == 'summarize_methylation_rates.py':
#same as -S option
mbiasplot_flag = False
mcount_flag = False
mcount_summary_flag = True
elif basename(sys.argv[0]) == 'MEPLOT':
#same as -p option
mbiasplot_flag = True
mcount_flag = False
mcount_summary_flag = False
samfn = args[0]
snp_type = 0
comprehensive_output = True
output_fn = ''
output_fp = sys.stdout
genome_fn = '/scratch/kimb/chr21.fa'
major_allele_cutoff = 0.5
major_allele_detection_min_depth = 10
rtrim = 0
ltrim = 0
quality_trim = None
quality_base = 33
quality_score_cutoff = None
verbose = 0
duplicate_including = False
for opt, val in options:
if opt == '-s':
if val == 'ctsnp':
snp_type = 1
elif val == 'allnt':
snp_type = 2
elif opt == '-m':
major_allele_cutoff = float(val)
elif opt == '-d':
major_allele_detection_min_depth = int(val)
elif opt == '-r':
rtrim = int(val)
elif opt == '-l':
ltrim = int(val)
elif opt == '-g':
genome_fn = val
elif opt == '-t':
comprehensive_output = False
elif opt == '-q':
quality_trim = int(val)
elif opt == '-b':
quality_base = int(val)
elif opt == '-x':
quality_score_cutoff = int(val)
elif opt == '-p':
mbiasplot_flag = True
mcount_flag = False
elif opt == '-S':
mcount_summary_flag = True
mbiasplot_flag = False
mcount_flag = False
elif opt == '-o':
output_fn = val
if os.path.exists(output_fn):
raise Exception("%s already exists" % output_fn)
elif opt == '-h':
print usage
sys.exit()
elif opt == '-v':
verbose += 1
elif opt == '-u':
duplicate_including = True
if verbose:
print >> stderr, "Reading Genome file", genome_fn
genome = Genome2(genome_fn, verbose=verbose)
if verbose:
print >> stderr, "Opening Sam/Bam file", samfn
samfile = Samfile(samfn)
methylcounter = GenomeMethylationCounter(genome, samfile,
snp_major_allele_cutoff=major_allele_cutoff,
snp_detection_min_depth=major_allele_detection_min_depth,
rtrim=rtrim,
quality_trim=quality_trim,
quality_base=quality_base,
quality_score_cutoff=quality_score_cutoff,
mcount_flag=mcount_flag or mcount_summary_flag,
mbiasplot_flag=mbiasplot_flag,
ltrim=ltrim, verbose=verbose,
duplicate_including=duplicate_including)
if output_fn:
output_fp = open(output_fn, 'w')
if mcount_flag:
methylcounter.build_methylation_bedgraph(sfp=output_fp, snp=snp_type, comprehensive_output=comprehensive_output)
if mbiasplot_flag:
methylcounter.build_mbiasplot_output(sfp=output_fp)
if mcount_summary_flag:
methylcounter.build_methylation_summary(sfp=output_fp)