-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathconcatenate_reference_fasta.py
executable file
·61 lines (49 loc) · 1.4 KB
/
concatenate_reference_fasta.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
#!/usr/bin/env python2.7
"""
Concatenate the records in a FASTA file with 100 Ns in
between.
concatenate_reference_fasta.py <infile> <contig order>
UPDATE 26 Jan 2018: Added more flexibility so that it can work
with genomes that have lots of contigs. Also, converts sequence to
uppercase.
"""
#==============================================================================#
import argparse
from sys import stdout
import re
from Bio import SeqIO
#==============================================================================#
parser = argparse.ArgumentParser(usage=__doc__)
parser.add_argument('--gap', default=100, type=int)
parser.add_argument('--file', default=False, action='store_true')
parser.add_argument('infile')
parser.add_argument('order', nargs='+')
args = parser.parse_args()
gap = args.gap
infile = args.infile
idx = SeqIO.index(infile, 'fasta')
sq_order = []
if args.file:
with open(args.order[0], 'rb') as ih:
for line in ih:
sq_order.append(line.strip())
else:
sq_order = args.order
sq_lens = {}
padding = {}
for id, rec in idx.iteritems():
sq_lens[id] = len(rec)
p = 0
for sq in sq_order:
padding[sq] = p
p += sq_lens[sq] + gap
stdout.write('>genome\n')
first = True
for id in sq_order:
rec = idx[id]
if first:
first = False
else:
stdout.write('N' * gap)
stdout.write(str(rec.seq).upper())
stdout.write('\n')