-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathdbg.py
205 lines (166 loc) · 5.98 KB
/
dbg.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
# code taken from https://raw.githubusercontent.com/pmelsted/dbg/master/dbg.py
# import multiprocessing as mp
import argparse
import collections
from Bio import Seq, SeqIO, SeqRecord
import gfapy
# Create reverse complement
def twin(km):
    """Return the reverse complement of k-mer km (delegates to Bio.Seq)."""
    complemented = Seq.reverse_complement(km)
    return complemented
# Chunk reads into k sized chunks
def kmers(seq, k):
    """Yield every length-k substring of seq, left to right."""
    last_start = len(seq) - k
    start = 0
    while start <= last_start:
        yield seq[start:start + k]
        start += 1
# Move sequence analysis one nucleotide forward
def fw(km):
    """Yield the four k-mers one base forward of km (drop first base, append A/C/G/T)."""
    suffix = km[1:]
    for base in 'ACGT':
        yield suffix + base
# Move sequence analysis one nucleotide backward
def bw(km):
    """Yield the four k-mers one base backward of km (prepend A/C/G/T, drop last base)."""
    prefix = km[:-1]
    for base in 'ACGT':
        yield base + prefix
# count kmers and build dictionary with counts
# use limit as cut off of very down regulated seq
def build(fn, k=31, limit=1):
    """Count k-mers (including reverse complements) across a set of fastq files.

    fn: iterable of fastq filenames; k: k-mer length; limit: k-mers whose
    total count is at or below this threshold are discarded (presumably
    sequencing noise / very low abundance).
    Returns a defaultdict(int) mapping k-mer string -> count.
    """
    counts = collections.defaultdict(int)
    for filename in fn:
        # Extract every read from the fastq file.
        for record in SeqIO.parse(filename, 'fastq'):
            # Split on ambiguous bases so no counted k-mer spans an 'N'.
            for fragment in str(record.seq).split('N'):
                for km in kmers(fragment, k):
                    counts[km] += 1
                # Count the reverse-complement strand as well.
                for km in kmers(twin(fragment), k):
                    counts[km] += 1
    # Drop low-abundance k-mers (count <= limit); collect first so the
    # dict is not mutated while iterating.
    rare = [km for km in counts if counts[km] <= limit]
    for km in rare:
        del counts[km]
    return counts
# merges the organisms data
def merge_dicts(d1, d2):
    """Merge two k-mer count dicts into one mapping kmer -> (count_in_A, count_in_B).

    A k-mer missing from one input gets count 0 on that side.

    Fix: the original indexed d1[i]/d2[i] directly, which raises KeyError on
    plain dicts and, on the defaultdicts produced by build(), silently
    inserted zero-count entries into both inputs while iterating them.
    Using .get(key, 0) over the key union yields the same merged result
    without mutating either input.
    """
    merge = {}
    for key in set(d1) | set(d2):  # union of k-mers seen in either organism
        merge[key] = (d1.get(key, 0), d2.get(key, 0))
    return merge
# Taken from source code
def contig_to_string(c):
    """Collapse an ordered, overlapping k-mer list into its contig string.

    The contig is the first k-mer followed by the final base of every
    subsequent k-mer (each consecutive pair overlaps by k-1 bases).
    """
    tail_bases = [kmer[-1] for kmer in c[1:]]
    return c[0] + ''.join(tail_bases)
# Taken from source code
def get_contig(d, km):
    """Return (contig_string, kmer_list) for the maximal contig through km.

    d: k-mer dictionary; km: a k-mer contained in d.
    """
    # Extend forward from km, and forward from its reverse complement
    # (equivalent to extending backward from km).
    c_fw = get_contig_forward(d, km)
    c_bw = get_contig_forward(d, twin(km))
    if km in fw(c_fw[-1]):
        # The forward walk wraps around to km again (circular contig),
        # so the forward extension alone covers everything.
        c = c_fw
    else:
        # Reverse-complement the backward walk, reverse its order, drop
        # its first element (km itself), and prepend to the forward walk.
        c = [twin(x) for x in c_bw[-1:0:-1]] + c_fw
    return contig_to_string(c), c
# Taken from source code
def get_contig_forward(d, km):
    """Walk forward from km through unambiguous edges in d; return the k-mers visited in order."""
    c_fw = [km]
    while True:
        # Stop unless the current tip has exactly one successor in d.
        if sum(x in d for x in fw(c_fw[-1])) != 1:
            break
        cand = [x for x in fw(c_fw[-1]) if x in d][0]
        if cand == km or cand == twin(km):
            break  # break out of cycles or mobius contigs
        if cand == twin(c_fw[-1]):
            break  # break out of hairpins
        # Stop unless the candidate also has exactly one predecessor;
        # otherwise the extension would be ambiguous in reverse.
        if sum(x in d for x in bw(cand)) != 1:
            break
        c_fw.append(cand)
    return c_fw
# Taken from source code
def all_contigs(d, k):
    """Extract all maximal contigs from k-mer dict d and build their adjacency graph.

    Returns (G, r): r is the list of contig strings; G maps contig index ->
    ([outgoing links], [incoming links]) where each link is a tuple of
    (neighbor contig index, orientation '+'/'-').
    """
    done = set()
    r = []
    # Pull out each contig once, marking every k-mer and its reverse
    # complement as consumed so the opposite strand is not re-extracted.
    for x in d:
        if x not in done:
            s, c = get_contig(d, x)
            for y in c:
                done.add(y)
                done.add(twin(y))
            r.append(s)
    G = {}
    heads = {}
    tails = {}
    # Index each contig by its first k-mer (head, '+') and by the reverse
    # complement of its last k-mer (tail, '-') for overlap lookups below.
    for i, x in enumerate(r):
        G[i] = ([], [])
        heads[x[:k]] = (i, '+')
        tails[twin(x[-k:])] = (i, '-')
    for i in G:
        x = r[i]
        # Outgoing edges: contigs whose head/tail is one step forward of
        # this contig's last k-mer.
        for y in fw(x[-k:]):
            if y in heads:
                G[i][0].append(heads[y])
            if y in tails:
                G[i][0].append(tails[y])
        # Incoming edges: one step forward of the reverse complement of
        # the first k-mer, i.e. one step backward of the contig start.
        for z in fw(twin(x[:k])):
            if z in heads:
                G[i][1].append(heads[z])
            if z in tails:
                G[i][1].append(tails[z])
    return G, r
# Outputs the links of each node.
def get_links(cs, d, k, s, fd):
    """Write GFA link (L) records joining each consecutive k-mer pair of contig cs.

    cs: contig string; d: k-mer -> (countA, countB) dict; k: k-mer size;
    s: contig id used in segment names; fd: open output file.
    """
    link_count = len(cs) - k  # one link per adjacent k-mer pair
    for pos in range(link_count):
        left_key = cs[pos:pos + k]
        right_key = cs[pos + 1:pos + 1 + k]
        left_id = '%s:%s:(A:%s,B:%s)' % (s, pos, d[left_key][0], d[left_key][1])
        right_id = '%s:%s:(A:%s,B:%s)' % (s, pos + 1, d[right_key][0], d[right_key][1])
        fd.write("L\t%s\t+\t%s\t+\t%sM\n" % (left_id, right_id, k - 1))
# Outputs the Segment chunks of each node.
def get_kmers(cs, d, k, s, fd):
    """Write GFA segment (S) records for every k-mer of contig cs.

    cs: contig string; d: merged k-mer -> (countA, countB) dict;
    k: k-mer size; s: integer contig id; fd: open output file.
    Side effect: records the segment id of the contig's LAST k-mer in the
    module-level `lastkmerid` dict (consumed later by write_GFA2).

    Fix: the original declared `global g, listofkmers, listoflinks` as well,
    but never used them (`g` is not defined anywhere in the module); only
    `lastkmerid` is actually written.
    """
    global lastkmerid
    for x in range(0, len(cs) + 1 - k):  # every k-length window of the contig
        key = cs[x:x + k]
        fd.write("S\t%s:%s:(A:%s,B:%s)\t%s\n" % (s, x, d[key][0], d[key][1], key))
        # Overwritten each iteration, so it ends up holding the last k-mer's id.
        lastkmerid[s] = '%s:%s:(A:%s,B:%s)' % (s, x, d[key][0], d[key][1])
def write_GFA2(G, cs, k, d, fd):
    """Write the contig graph as a GFA file: header, segments, then links.

    G: adjacency from all_contigs; cs: list of contig strings; k: k-mer size;
    d: merged k-mer count dict; fd: open output file.

    Fix: the original declared `global args, g, listofkmers, listoflinks,
    lastkmerid` but only `lastkmerid` is used (populated by get_kmers and
    deleted at the end); the unused names are dropped.
    """
    global lastkmerid
    # NOTE(review): "VN:Z:1." looks like a truncated GFA version tag (the
    # spec uses e.g. "VN:Z:1.0") — kept byte-for-byte to preserve output.
    fd.write("H\tVN:Z:1.\n")
    for i, x in enumerate(cs):  # segment records for every contig
        get_kmers(x, d, k, i, fd)
    for i, x in enumerate(cs):  # intra-contig links between adjacent k-mers
        get_links(x, d, k, i, fd)
    del cs
    # Inter-contig links from the graph adjacency.
    # NOTE(review): both endpoints use lastkmerid — presumably the link
    # should target the neighbor's first k-mer for '+' orientation; kept
    # as-is to preserve output, but worth confirming against the GFA spec.
    for i in G:
        for j, o in G[i][0]:
            fd.write("L\t%s\t+\t%s\t%s\t%dM\n" % (lastkmerid[i], lastkmerid[j], o, k - 1))
        for j, o in G[i][1]:
            fd.write("L\t%s\t-\t%s\t%s\t%dM\n" % (lastkmerid[i], lastkmerid[j], o, k - 1))
    del lastkmerid
def main():
    """Build per-organism k-mer counts, merge them, and emit the contig graph as a GFA file."""
    global args
    # Fall back to a default output name when -output was not supplied.
    out_name = args.output if args.output else "output.gfa"
    counts_a = build(args.A, args.k, 0)
    counts_b = build(args.B, args.k, 0)
    merged = merge_dicts(counts_a, counts_b)
    # Free the per-organism tables; only the merged dict is needed now.
    del counts_a
    del counts_b
    with open(out_name, "w") as out_fd:
        graph, contigs = all_contigs(merged, args.k)
        write_GFA2(graph, contigs, args.k, merged, out_fd)
if __name__ == "__main__":
    # Fix: the CLI parsing and global state used to run at module import
    # time, so importing this file for its functions required (and consumed)
    # command-line arguments. Building them under the __main__ guard keeps
    # script behavior identical while making the module import-safe.
    parser = argparse.ArgumentParser(
        description="Creates a GFA file with one or two organisms given a kmer size")
    parser.add_argument("-k", type=int, required=True, help="the kmer size")
    parser.add_argument("-A", nargs='+', required=True, help="Organism_A_files")
    parser.add_argument("-B", nargs='+', required=True, help="Organism_B_files")
    parser.add_argument("-output", required=False, help="Output GFA file name")
    args = parser.parse_args()
    # Module-level state consumed by get_kmers / write_GFA2.
    listofkmers = []
    listoflinks = []
    lastkmerid = {}
    # To add more organisms: add another parser.add_argument("-C", ...),
    # call build() for it, and chain additional merge_dicts calls.
    main()