forked from facebookresearch/XLM
-
Notifications
You must be signed in to change notification settings - Fork 0
/
blast.py
69 lines (53 loc) · 2.27 KB
/
blast.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import re
from Bio import Entrez
import time
from Bio.Blast import NCBIWWW
from Bio.Blast import NCBIXML
import ray
Entrez.email = "[email protected]" # Replace with your email address
# @ray.remote(num_cpus=2)
def fetch_blast_data(protein_seq, i):
# Protein sequence to search with BLAST
sequence = '>sequence1\n{0}'.format(protein_seq)
# Perform the BLAST search with BLOSUM62 matrix and E-value threshold of 1e-50
print(i)
result_handle = NCBIWWW.qblast(program="blastp", database="nr_clustered", sequence=sequence, expect=0.7, matrix_name="BLOSUM62", alignments=0, descriptions=100)
# Parse the BLAST results
blast_records = NCBIXML.parse(result_handle)
sequences = []
# Iterate through the results and fetch the protein sequences
for blast_record in blast_records:
for alignment in blast_record.alignments:
for hsp in alignment.hsps:
if hsp.expect < 1e-50:
# Extract the accession number from the alignment title
accession = alignment.accession
# Fetch the protein sequence using Entrez
handle = Entrez.efetch(db="protein", id=accession, rettype="fasta", retmode="text")
protein_sequence = handle.read().strip()
handle.close()
# Print the protein sequence
sequences.append(protein_sequence)
# Close the result handle after parsing the results
result_handle.close()
return (i, sequences)
# ray.init(num_cpus=4)
futures = []
antigens = []
with open('antigens.txt', 'r') as f:
for i, line in enumerate(f):
antigens.append(line)
# fetch_blast_data(line.replace('\n',''), i)
antigens = list(set(antigens))
for i, ag in enumerate(antigens):
# futures.append(fetch_blast_data.remote(ag.replace('\n',''), i))
fetch_blast_data(ag.replace('\n',''), i)
while len(futures) > 0:
done_ids, futures = ray.wait(futures, num_returns=1)
for done_id in done_ids:
i, done_task = ray.get(done_id)
with open(str(i)+'.fasta', 'w') as f:
for seq in done_task:
f.write(seq+'\n')
print(f'Remaining {len(futures)}. Finished {str(done_task)}')
time.sleep(1.0)