-
Notifications
You must be signed in to change notification settings - Fork 1
/
VirStrain_build.py
115 lines (107 loc) · 4.19 KB
/
VirStrain_build.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import re
import os,sys
import argparse
import math
__author__="Liao Herui"
usage="VirStrain - An RNA virus strain-level identification tool for short reads."
def main():
parser=argparse.ArgumentParser(prog='VirStrain_build.py',description=usage)
parser.add_argument('-v', '--version', action='version', version='%(prog)s v1.17')
parser.add_argument('-i','--input_msa',dest='input_msa',type=str,required=True,help="Input MSA file generated by mafft --- Required")
parser.add_argument('-d','--database_dir',dest='db_dir',type=str,help='Database Output dir (default: current workdir/VirStrain_DB)')
parser.add_argument('-c','--dash_cutoff',dest='dash_cutoff',type=str,help='The cutoff of dash in each column of MSA (default: 0)')
parser.add_argument('-s','--sites_ratio_cutoff',dest='sites_cutoff',type=str,help='The cutoff of sites number for manual-covering function (eg. 1 means all useful sites will be used and 0.8 means 80%% useful sites will be used.)')
parser.add_argument('-r','--sites_range_cutoff',dest='sites_rcutoff',type=str,help='The cutoff of sites range for covering algorithm (eg. 3-500 means the covering algorithm will only consider the SNV sites from 3-500 of MSA.)')
args=parser.parse_args()
in_msa=args.input_msa
db_dir=args.db_dir
dashc=args.dash_cutoff
sitec=args.sites_cutoff
site_r=args.sites_rcutoff
if not db_dir:
db_dir='VirStrain_DB'
if not dashc:
dashc=0
if not sitec:
sitec=0
else:
sitec=float(sitec)
if not site_r:
site_r=[0,0]
else:
siter_t=re.split('-',site_r)
site_r=[int(siter_t[0]),int(siter_t[1])]
if not os.path.exists(db_dir):
os.makedirs(db_dir)
# Start to build the database
snum,cnum=scan_msa(in_msa)
# Run all program
# Step1 - Choose sites
if site_r[1]==0:
os.system('perl bin/aln2cluster-overlap-kmer-withd.pl '+in_msa+' '+str(snum)+' '+str(dashc)+' 0 '+str(cnum)+' 0 > VirStrain_build.column')
else:
os.system('perl bin/aln2cluster-overlap-kmer-withd.pl '+in_msa+' '+str(snum)+' '+str(dashc)+' '+str(site_r[0])+' '+str(site_r[1])+' 0 > VirStrain_build.column')
if sitec>0:
manual_covering(sitec,snum,in_msa,site_r)
# Step2 - Extract kmer and generate snp matrix
os.system('python bin/S1_extract_kmer.py -i '+in_msa+' -c VirStrain_build.column')
# Step3 - Divide strains into clusters
os.system('python bin/S2_remove_redundant_For_Me.py -i Strain_pos_snp_matrix_consider_all.txt')
# Step4 - Hierarchical Clustering
os.system('python bin/S2.5_Split_cls_hierarchical_with_perl.py -i '+in_msa+' -r Remove_redundant_matrix_MM_Call.clstr')
os.system('mv After_split.png Before_split.png ID2Name.txt '+db_dir)
os.system('mv Pos-snp-kmer-all.fa Pos-snp-kmer-all.txt Rebuild_cls.clstr Remove_redundant_matrix_MM_Call.clstr Strain_pos_snp_matrix_not_redundant_MM_Call.txt Strain_pos_snp_matrix_consider_all.txt SubCls_kmer.txt Strain_cls_info.txt VirStrain_build.column '+db_dir)
def manual_covering(sitec,snum,in_msa,site_r):
os.system('perl bin/aln2entropy.pl '+in_msa+' '+str(snum)+' > manual.column')
#print('perl bin/aln2entropy.pl '+in_msa+' '+str(snum)+'> manual.column')
#exit()
f=open('manual.column','r')
o=open('mc.column','w+')
a=[]
while True:
line=f.readline().strip()
if not line:break
if not re.search('column',line):continue
ele=line.split()
cid=int(ele[1])
for e in ele:
if re.search('-',e):
dash_num=re.sub('-:','',e)
break
dash_num=int(dash_num)
if dash_num==0:
if not site_r[1]==0:
if cid>=site_r[0] and cid<=site_r[1]:
a.append(line)
else:
a.append(line)
#o.write(line+'\n')
out_c=math.ceil(sitec*float(len(a)))
#out_c=int(out_c)
outa=a[:out_c]
print('Newly added sites number: ',len(outa),'/',len(a))
for line in outa:
o.write(line+'\n')
o.close()
os.system('rm manual.column')
os.system('cat VirStrain_build.column mc.column > VirStrain_build_tem.column')
os.system('rm VirStrain_build.column mc.column')
os.system('mv VirStrain_build_tem.column VirStrain_build.column')
def scan_msa(in_msa):
fs1=open(in_msa,'r')
dseq={}
while True:
line=fs1.readline().strip()
if not line:break
if re.search('>',line):
name=line
dseq[name]=''
else:
dseq[name]+=line
cnum=0
for d in dseq:
cnum=len(dseq[d])
break
return len(dseq),cnum
if __name__=='__main__':
sys.exit(main())