-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathunique_genes.py
46 lines (30 loc) · 1014 Bytes
/
unique_genes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
import pysam
'''
Quick check: count the unique genes remaining after each step
(Trinity, TransDecoder, InterProScan).
'''
# Root directory of the RNA data; every input path below hangs off it.
datadir = '/pym/Data/Nanopore/projects/prolificans/rna'

# Inputs read by the three counting sections of this script.
intertsvfile = f'{datadir}/interproscan/Trinity.fasta.transdecoder_clean.pep.tsv'
tdecodefile = f'{datadir}/transdecoder/Trinity.fasta.transdecoder.pep'
trinfile = f'{datadir}/trinity/Trinity.fasta'
##interproscan
# Collect one gene ID per annotation row of the InterProScan TSV, then
# de-duplicate. The gene ID is the text before the first '_i' in the
# sequence identifier (presumably the Trinity isoform suffix).
# NOTE(review): assumes the identifier is the first tab-separated column,
# as in the standard InterProScan TSV layout -- confirm for this file.
with open(intertsvfile, 'r') as f:
    # Stream the file instead of loading it whole; skip blank lines so they
    # are not counted as a gene named ''. Only the identifier column is
    # split, so an '_i' inside a later description field cannot be mistaken
    # for the isoform suffix.
    intergenes = [
        row.rstrip('\r\n').split('\t', 1)[0].split('_i')[0]
        for row in f
        if row.strip()
    ]
interunique = list(set(intergenes))
##tdecode
# Collect the gene ID (text before the first '_i') of every sequence name in
# the TransDecoder peptide FASTA, then de-duplicate.
# The context manager closes the FASTA handle once the names have been read;
# the original left it open for the life of the process.
with pysam.FastaFile(tdecodefile) as tdecodefa:
    tdecodegenes = [name.split('_i')[0] for name in tdecodefa.references]
tdecodeunique = list(set(tdecodegenes))
##trinity
# Collect the gene ID (text before the first '_i') of every sequence name in
# the Trinity assembly FASTA, then de-duplicate.
# The context manager closes the FASTA handle once the names have been read;
# the original left it open for the life of the process.
with pysam.FastaFile(trinfile) as trinfa:
    tringenes = [name.split('_i')[0] for name in trinfa.references]
trinunique = list(set(tringenes))
# Report the number of unique gene IDs found at each step, one line per step.
for label, genes in (
    ('Trinity:', trinunique),
    ('transdecoder:', tdecodeunique),
    ('interproscan:', interunique),
):
    print(label + str(len(genes)))