-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathseq_highlight.py
48 lines (35 loc) · 2.33 KB
/
seq_highlight.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
#!/user/bin/env python3
import argparse
parser = argparse.ArgumentParser(prog = 'script.py', formatter_class = argparse.RawDescriptionHelpFormatter, description = '''\
Takes a file of one-line sequences as input, sequences_parameters.txt from rbd, the original fasta file, returns a file with ids and last cluster belonging (assigned by rbd) from a reference fasta file as output for each match of each sequences of interest. The order follows the one of the file sequence_of_interest.txt
''')
parser.add_argument('--seq', '-s', dest = 'seq', action = 'store', type = str, required = True, help = 'Input file with one sequence per line')
parser.add_argument('--fasta', '-f', dest = 'fasta', action = 'store', type = str, required = True, help = 'Input reference fasta file')
parser.add_argument('--output', '-o', dest = 'output', action = 'store', type = str, required = True, help = 'Name of the output file')
parser.add_argument('--rbd', '-r', dest = 'rbd', action = 'store', type = str, required = True, help = 'Input file which is the sequence_parameters.txt file from rbd output')
args = parser.parse_args()
with open(args.seq, 'r') as f:
lines = f.readlines() #reads the sequences from input file.
seq_bank = [] #list with all the seq in one line
with open(args.fasta, 'r') as f:
temp = f.readlines()
for i in range(len(temp)) :
if temp[i].startswith('>') : #creates a list with ids followed by the corresponding sequences:
seq_bank.append(temp[i]) #adds the id of the seq
seq_bank.append(temp[i+1].rstrip()+temp[i+2].rstrip()+temp[i+3]) #adds the sequences in one string
seq = []
for i in range(len(lines)):
for j in range(len(seq_bank)):
if lines[i] == seq_bank[j] :
seq.append(seq_bank[j-1]) #seq contains the sequence references of the sequences
print('Number of ids corresponding to the sequences given : {}'.format(len(seq)))
with open(args.rbd, 'r') as f :
cluster_ref = f.readlines() #reads the sequences cluster assignement from rbd
out = []
for i in range(len(seq)):
for j in range(len(cluster_ref)):
if seq[i].split(' ')[0] == cluster_ref[j].split('\t')[0]:
out.append(seq[i].rstrip().split(' ')[0]+'\t'+cluster_ref[j].split('\t')[1])
#out[] contains the ids of the sequences that corresponds to the ones passed in the input file, and their last cluster belonging
with open(args.output, 'w') as f :
f.writelines(out)