-
Notifications
You must be signed in to change notification settings - Fork 0
/
maptools.py
268 lines (232 loc) · 10.1 KB
/
maptools.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
import os
import re
import sys
import subprocess
import gzip
import errno
from Bio import SeqIO
def create_directories(dirpath):
'''make directory path'''
try:
os.makedirs(dirpath)
except OSError as e:
if e.errno != errno.EEXIST:
raise
def return_filehandle(open_me):
'''get me a filehandle, common compression or text'''
magic_dict = {
b'\x1f\x8b\x08': 'gz'
# '\x42\x5a\x68': 'bz2',
# '\x50\x4b\x03\x04': 'zip'
}
max_bytes = max(len(t) for t in magic_dict)
with open(open_me, 'rb') as f:
s = f.read(max_bytes)
for m in magic_dict:
if s.startswith(m):
t = magic_dict[m]
if t == 'gz':
return gzip.open(open_me, 'rt')
# elif t == 'bz2':
# return bz2.open(open_me)
# elif t == 'zip':
# return zipfile.open(open_me)
return open(open_me)
def get_seqio_record(seq_handle):
'''Parses a filehandle seq_handle and yields the formatted records
Generator for SeqIO record objects
'''
with seq_handle as sopen:
for record in SeqIO.parse(sopen, 'fasta'): # iterate with SeqIO
yield record # yield each record as it is iterated
def get_genetic_markers(markers):
'''Accepts a tab file of genetic markers, their LGs and their genetic
distance.
Returns a dictionary with marker id as key and a dict as value.
'''
genetic_markers = {}
fh = return_filehandle(markers) # get filehandle for markers file
with fh as mopen:
for line in mopen:
line = line.rstrip()
if not line or line.startswith('#') or line.startswith('VARIANT'):
continue # skip comments and headers
fields = line.split('\t')
if len(fields) < 3: # skip empty or unassigned
continue
genetic_markers[fields[0]] = {'lg': fields[1],
'distance': fields[2]}
return genetic_markers
def markers_blastn(markers, reference, blast_path):
'''Runs a BLASTn against reference using maxtargets 10 culling_limit 10
using the blastn at blast_path
outputs results in outfmt6 to <markers>.cull10_targets10_fmt6.tab
'''
cmd = ('{}/blastn -db {} -query {}'.format(blast_path, reference, markers)+
' -out {}.cull10_targets10_fmt6.tab'.format(markers) +
' -outfmt 6 -culling_limit 10 -max_target_seqs 10 -num_threads 2')
exit_val = subprocess.check_call(cmd, shell=True)
if exit_val:
print('Blastn process exited with non-zero value: {}'.format(exit_val))
sys.exit(1)
def format_for_allmaps(markers_gm, markers_phys):
'''Accepts a tab file of genetic marker labels, their LG and their
genetic distance.
Loads this into genetic_markers dict.
Also accepts a physical position file parsed from filter_marker_blastn
these will be unioned and an AllMaps CSV file will be produced.
'''
markers_gm = os.path.abspath(markers_gm)
markers_phys = os.path.abspath(markers_phys)
genetic_markers = get_genetic_markers(markers_gm)
fh = return_filehandle(markers_phys)
with fh as popen:
for line in popen:
line = line.rstrip()
if not line or line.startswith('#') or line.startswith('VARIANT'):
continue # skip comments and headers
fields = line.split('\t')
if len(fields) < 3: # skip empty or unassigned
continue
if not genetic_markers.get(fields[0]): # skip if no distance etc
continue
marker_id = fields[0]
ref = fields[1]
position = fields[2]
lg = genetic_markers[marker_id]['lg']
genetic_distance = genetic_markers[marker_id]['distance']
output = '{},{},{},{}'.format(ref, position, lg, genetic_distance)
print(output) # print output line formatted above for allmaps
def get_sequence_lengths(fasta):
'''Gets sequence lengths and returns dictionary
with ID key and Length value
'''
sequence_lengths = {} # dictionary to return
fh = return_filehandle(fasta) # get a filehandle
for record in get_seqio_record(fh): # generator for SeqIO
length = len(record.seq)
seq_id = record.id
sequence_lengths[seq_id] = length
return sequence_lengths
def populate_marker_alignments(alignments, ref):
'''Populates a dictionary with marker alignment targets,
their bitscores, physical positions and reference sequence lengths
'''
ref_lengths = get_sequence_lengths(ref) # dict to hold ref lengths
fh = return_filehandle(alignments)
marker_alignments = {}
with fh as bopen: # iterate through alignments file
for line in bopen:
line = line.rstrip()
if line.startswith('#') or not line: # skip if comment or None
continue
fields = line.split('\t')
query = fields[0]
ref = fields[1]
ref_length = int(ref_lengths[ref])
ref_start = int(fields[8])
bitscore = float(fields[-1])
last = 0
if not marker_alignments.get(query):
marker_alignments[query] = {'alignments': {}, 'num_hits': 0}
if not marker_alignments[query]['alignments'].get(ref):
marker_alignments[query]['alignments'][ref] = []
target = {'length': ref_length, 'start': ref_start,
'bitscore': bitscore, 'ref': ref} # target object
marker_alignments[query]['alignments'][ref].append(target)
last = bitscore
marker_alignments[query]['num_hits'] += 1
return marker_alignments
def filter_markers_blastn(alignments, ref, outfile, min_markers, max_targets):
'''Filters BLASTn format 6 output based on provided min_markers and
max_targets. min_markers filters scaffold with >= min_markers.
max_targets filters out markers with >= max_target alignments.
'''
alignments = os.path.abspath(alignments)
ref = os.path.abspath(ref)
outfile = os.path.abspath(outfile)
outfile_handle = open(outfile, 'w')
marker_alignments = populate_marker_alignments(alignments, ref) # markers
passing_markers = {}
passing_refs = {}
for query in marker_alignments:
if marker_alignments[query]['num_hits'] > max_targets: # >targets
continue
passing_markers[query] = {'length': 0, 'start': 0,
'bitscore': 0, 'ref': ''}
for ref in marker_alignments[query]['alignments']:
target = {}
skip_switch = 0
bitscore = 0
if len(marker_alignments[query]['alignments'][ref]) > 1:
count = 0
hold = {}
for align in marker_alignments[query]['alignments'][ref]:
if align['bitscore'] > bitscore:
hold = align
bitscore = align['bitscore']
elif align['bitscore'] == bitscore:
skip_switch = 1 # cannot use multi target same ref
break # break loop and continue reject this ref
if not skip_switch:
target = hold # assign marker this target
if skip_switch:
continue # don't use markers ambiguous on same scaffold
else:
target = marker_alignments[query]['alignments'][ref][0]
assert target, 'Target must be defined'
bitscore = target['bitscore']
if bitscore > passing_markers[query]['bitscore']:
passing_markers[query] = target # set new target
elif bitscore == passing_markers[query]['bitscore']:
if target['length'] > passing_markers[query]['length']:
passing_markers[query] = target # set to longest ref
if not passing_refs.get(passing_markers[query]['ref']):
passing_refs[passing_markers[query]['ref']] = 0
passing_refs[passing_markers[query]['ref']] += 1
for query in passing_markers: # now iterate through passing and get refs
ref = passing_markers[query]['ref']
if not ref: # if marker passed but ref was too ambiguous
continue
if passing_refs[ref] >= min_markers: # check min markers
position = passing_markers[query]['start']
output_me = '{}\t{}\t{}'.format(query, ref, position)
outfile_handle.write(output_me + '\n')
print(output_me)
outfile_handle.close()
def markers_tab_to_fasta(markers):
'''Formats markers.tab into markers.fasta.
Assumes Column 1 is the Label and Column 2 is the Sequence
'''
markers = os.path.abspath(markers)
with open(markers) as fopen:
for line in fopen:
line = line.rstrip()
if line.startswith('#') or line.startswith('VARIANT') or not line:
continue # continue if comment common header or blank
fields = line.split('\t') # get fields from line
if len(fields) != 2:
print('ERROR: line {}, does nto have 2 fields!'.format(line))
sys.exit(1)
label = fields[0]
sequence = fields[1]
parts = sequence.split('/') # get variant parts
if len(parts) < 2: # Add log message here skip if not 2 or more
continue
new_sequence = []
count = 0
part1 = re.compile('[A-z]\[([A-z]+)')
part2 = re.compile('[A-z]+\]')
for p in parts:
if part2.search(p):
p = part2.sub('', p)
if part1.search(p):
allele = part1.search(p).groups()[0]
length = len(allele) + 1
p = '{}{}'.format(p[:-length], allele)
new_sequence.append(p)
record = '>{}\n{}'.format(label, ''.join(new_sequence))
print(record)
if __name__ == '__main__':
print('Please import!')
sys.exit(0)