-
Notifications
You must be signed in to change notification settings - Fork 3
/
getmlstgenes.py
168 lines (155 loc) · 8.2 KB
/
getmlstgenes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
#!/usr/bin/env python
# Copyright (C) 2015,2016 Mohammad Alanjary
# University of Tuebingen
# Interfaculty Institute of Microbiology and Infection Medicine
# Lab of Nadine Ziemert, Div. of Microbiology/Biotechnology
# Funding by the German Centre for Infection Research (DZIF)
#
# This file is part of ARTS
# ARTS is free software. you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version
#
# License: You should have received a copy of the GNU General Public License v3 with ARTS
# A copy of the GPLv3 can also be found at: <http://www.gnu.org/licenses/>.
import os, argparse, setlog, json, sqlite3 as sql, numpy as np
def get16S(db,cov=800):
conn=sql.connect(db)
cur=conn.cursor()
#Combine tables into view and write full hits, fragment hits, genematrix
cur.execute("CREATE TEMP VIEW rnagenes AS SELECT A.*,B.* FROM RNAhits A INNER JOIN Seqs B ON A.seqid=B.seqid ORDER BY orgname, LENGTH(naseq) DESC")
#Get longest 16S rRNA hit from each org
cur.execute("SELECT orgname,seqid,naseq FROM rnagenes WHERE hmmhit='16S_rRNA' GROUP BY orgname")
seqdict = {}
for x in cur:
if x[0] not in seqdict and len(x[2]) >= cov:
seqdict[x[0]] = {}
log.debug("RNA gene: %s - %s"%(x[0],len(x[2])))
#Add sequences (seqid,naseq,aaseq)
seqdict[x[0]]["RNA_16S_rRNA"] = [[x[1],x[2],False]]
conn.close()
return seqdict
def getcoregenes(db,pct=0.4,pct2=0.4,seqdict=None):
conn = sql.connect(db)
cur = conn.cursor()
hkcore=[str(x[0]) for x in cur.execute("SELECT DISTINCT hmmhit FROM HMMhits")]
orgs=[str(x[0]) for x in cur.execute("SELECT DISTINCT orgname FROM HMMhits")]
orgs = list(orgs)
oarry = np.zeros((len(hkcore), len(orgs)))
if not seqdict:
seqdict = {}
if any("RNA_16S_rRNA" in x.keys() for k,x in seqdict.items()):
log.info("Adding 16S to matrix results")
#add 16S genes to matrix if seqdict exists
newrow = np.zeros(len(orgs))
for org in seqdict.keys():
j = orgs.index(org)
newrow[j] = 1
hkcore.append("RNA_16S_rRNA")
oarry = np.vstack((oarry,newrow))
# Combine tables into view and write full hits, fragment hits, genematrix
cur.execute("CREATE TEMP VIEW allgenes AS SELECT A.*,B.* FROM HMMhits A INNER JOIN Seqs B ON A.seqid=B.seqid WHERE flags>0")
for i, hkgn in enumerate(hkcore):
if "RNA_16S_rRNA" in hkgn:
#Skip 16S already extracted
continue
results = cur.execute(
"SELECT orgname,seqid,genecov,hmmcov,MAX(iscore),genelen,source,loc_start,loc_end,loc_strand,description,score,naseq,aaseq FROM allgenes WHERE hmmhit=? GROUP BY seqid",
(hkgn,)).fetchall()
for x in results:
# add to genematrix
j = orgs.index(x[0])
# separate fragments and full length
if x[2] >= pct or x[3] >= pct2:
oarry[i, j] += 1
if x[0] not in seqdict:
seqdict[x[0]] = {}
if hkgn not in seqdict[x[0]]:
seqdict[x[0]][hkgn] = []
seqdict[x[0]][hkgn].append([x[1],x[-2],x[-1]])
return orgs,hkcore,oarry,seqdict
def findsingles(db,minnum=7,minorg=0.8,maxgenes=30,dnds="",outdir="",keepgenes="",lf=""):
global log
if lf:
log = setlog.init(logfile=lf,level="info")
else:
log = setlog.init(toconsole=True,level="info")
log.info("Getting 16S genes...")
seqdict = get16S(db)
log.info("Done. Getting Core genes...")
orgs,hks,gmat,seqdict = getcoregenes(db,seqdict=seqdict)
if len(keepgenes):
keepgenes = keepgenes.split(",")
else:
keepgenes = []
njvals = {}
if dnds and os.path.isfile(dnds):
with open(dnds,"r") as fil:
njvals = json.load(fil)
numorgs = len(orgs)
filtinds = [i for i,x in enumerate(gmat) if float(list(x).count(1))/numorgs >= 0.95]
filtinds.extend([i for i,x in enumerate(hks) if x in keepgenes])
filtinds = list(set(filtinds))
hkinds = []
#Start with organisms that have all keepgenes
orginds = [i for i in range(numorgs) if all(gmat[hks.index(x),i]==1 for x in keepgenes)]
hksfound = []
#Remove organisms with missing/duplicate genes until minimum single copy genes are found
while float(len(orginds))/numorgs >= minorg:
hkinds = [i for i in filtinds if float(list(gmat[i][orginds]).count(1)) == len(orginds)]
hksfound = [hks[i] for i in hkinds]
if len(hkinds) >= minnum and all(x in hksfound for x in keepgenes):
log.info("Found minimum genes")
break
tophks = sorted([[x,i,list(x).count(1)] for i,x in enumerate(gmat) if i not in hkinds],key=lambda row: row[2],reverse=True)
maxcount = tophks[0][2]
tempmat = np.vstack([x[0] for x in tophks if x[2]==maxcount])
orgcounts = sorted([[list(tempmat[:,j]).count(1),j] for j in orginds], key=lambda row: row[0])
orginds = [x[1] for x in orgcounts[numorgs-maxcount:]]
remorg = [x for i,x in enumerate(orgs) if i not in orginds]
singhks = [x for i,x in enumerate(hks) if i in hkinds]
#Prioritize by lowest dNdS values
singhks = sorted(singhks, key=lambda k: njvals.get(k,9))
#Move keep genes at the top of list
for x in keepgenes:
if x in singhks:
singhks.insert(0, singhks.pop(singhks.index(x)))
if len(hkinds) >= minnum and all(x in hksfound for x in keepgenes):
log.info("# of single copy genes found: %s\t# removed organisms: %s"%(len(singhks),len(remorg)))
log.info("Removed orgs:%s"%remorg)
for i,x in enumerate(singhks):
if i<maxgenes:
log.info("Top Singles: %s\tdNdS=%s"%(x,njvals.get(x,"NA")))
else:
log.info("Other Singles: %s\tdNdS=%s"%(x,njvals.get(x,"NA")))
#WRITE GENES TO FASTA
if outdir:
if not os.path.exists(outdir):
os.makedirs(outdir)
for hkgn in singhks[:maxgenes]:
allnucrecs = [">%s\n%s"%(org,x[hkgn][0][1]) for org,x in seqdict.items() if hkgn in x.keys() and x[hkgn][0][1] and org not in remorg]
allaarecs = [">%s\n%s"%(org,x[hkgn][0][2]) for org,x in seqdict.items() if hkgn in x.keys() and x[hkgn][0][2] and org not in remorg]
with open(os.path.join(outdir,hkgn+".fna"),"w") as fnafil, open(os.path.join(outdir,hkgn+".faa"),"w") as faafil:
if len(allnucrecs):
fnafil.write("\n".join(allnucrecs)+"\n")
if len(allnucrecs):
faafil.write("\n".join(allaarecs)+"\n")
return remorg,singhks[:maxgenes]
else:
log.error("Failed: Only %s single copy genes found after removing orgs: %s\nKeepgenes found: %s"%(len(singhks),remorg,set(singhks)&set(keepgenes)))
log.info("Try different set or allow more deletions")
return False,False
# Commandline Execution
if __name__ == '__main__':
parser = argparse.ArgumentParser(description="""Find and extract MLST genes""")
parser.add_argument("input", help="Input sequence database")
parser.add_argument("-dnds", help="Json file with dN/dS values for further filtering",default="")
parser.add_argument("-log", help="Log file to save status",default="")
parser.add_argument("-od", "--outdir", help="Directory to rewrite singles with removed organisms (default: currentdir/singles)",default="singles")
parser.add_argument("-kg", "--keepgenes", help="list of genes that must be kept. (comma seperated, ex: RNA_16S_rRNA,TIGR02013)",default="")
parser.add_argument("-mx","--maxgenes", help="maximum number of single copy housekeeping genes, uses lowest dNdS valued genes (default:30)",type=int,default=30)
parser.add_argument("-mn","--minnum", help="minimum number of single copy housekeeping genes (default:7)",type=int,default=7)
parser.add_argument("-mo","--minorg", help="minimum percent of organisms to remain (default:0.8)",type=float,default=0.8)
args = parser.parse_args()
findsingles(args.input,args.minnum,args.minorg,args.maxgenes,args.dnds,args.outdir,args.keepgenes,args.log)