#!/usr/bin/env python
import os
import sys
import re
from collections import defaultdict
import argparse
import shutil
import ArgParser
import CreateDataset
import FilterDataset
import CreateBenchmark
import PaperTermFrequency
import subprocess
import ConfigParser
import Config
import Stats
import FormatChecker
from os.path import basename
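# ArgParser, CreateDataset, FilterDataset, CreateBenchmark, PaperTermFrequency,
# Config, Stats and FormatChecker are companion modules of this tool (not
# third-party packages) and must sit alongside this script on the import path.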
# Create an argparse parser that converts the raw command-line arguments into Python objects according to the option types declared below
print "*************************************************"
print "Welcome to the Benchmark Creation tool !!!!!"
print "*************************************************"
parser = argparse.ArgumentParser(prog='bm.py', description='Creates a set of benchmark proteins')
parser.add_argument('-G', '--organism', nargs='*', default=['all'], help='A set of organisms (example: Saccharomyces cerevisiae or 7227) separated by spaces. Default is all.')
parser.add_argument('-N', '--ontology', nargs='*', default=['all'], help='A set of ontologies (F, P, C) separated by spaces. Default is all.')
parser.add_argument('-V', '--evidence', nargs='*', default=['all'], help='A set of GO experimental evidence codes (example: IPI, IDA, EXP) separated by spaces. Default is all.')
parser.add_argument('-C', '--cafa', default='F', help='Takes either T or F. If T, input1 must be a CAFA targets file; if F, input1 must be a uniprot-goa file. Default is F.')
parser.add_argument('-I1', '--input1', nargs='*', help='Mandatory. Path to the first input file.')
parser.add_argument('-I2', '--input2', nargs='*', help='Mandatory. Path to the second input file.')
parser.add_argument('-O', '--output', nargs='*', default=[], help='Optional. Output filename.')
parser.add_argument('-S', '--source', action='store', nargs='*', default=['all'], help='A set of annotation sources (example: UniProt, InterPro) separated by spaces. Default is all.')
parser.add_argument('-P', '--pubmed', default='F', help='Turns on the PubMed filter. If on, GO terms without any PubMed references are excluded from the benchmark set. Off by default.')
parser.add_argument('-F', '--confidence', default='F', help='Turns on the annotation confidence filter. If on, GO term assignments to proteins documented in few papers (4 or fewer by default) are excluded from the benchmark set. Off by default.')
parser.add_argument('-T', '--threshold', type=int, default=4, help='Minimum number of supporting papers required for a confident annotation. Defaults to 4.')
parser.add_argument('-B', '--blacklist', nargs='*', default=[], help='A list of PubMed ids; all GO terms and proteins annotated in them are eliminated from the benchmark set. Default is an empty list.')
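# Example invocation (file paths are placeholders; -I1/-I2 must point at two
# uniprot-goa releases, or a CAFA targets file as input1 when -C T is given):
#   bm.py -I1 <goa_file_t1> -I2 <goa_file_t2> -G 7227 -N F -V IDA IPI -T 6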
# Search for config file in the current directory
fname_ind = 0
for root, dirs, files in os.walk('.'):
    for fname in files:
        if fname == '.cafarc':
            fname_ind = 1
    if fname_ind == 0:
        print 'Config file not found'
        print 'Creating new configuration file...'
        print '***************************************************************'
        Config.create()
    break   # only the top-level directory is searched
Config_handle = ConfigParser.ConfigParser()
Config_handle.read('.cafarc')
ConfigParam = {'workdir': Config_handle.get('WORKDIR', 'DEFAULT_PATH'),
               'ftp_host': Config_handle.get('FTP', 'HOSTNAME'),
               'ftp_curr_path': Config_handle.get('FTP', 'CURRENT_FILE_PATH'),
               'ftp_old_path': Config_handle.get('FTP', 'OLD_FILE_PATH'),
               'exp_eec': Config_handle.get('DEFAULTS', 'EXP_EVIDENCE_CODES'),
               'iea_eec': Config_handle.get('DEFAULTS', 'IEA_EVIDENCE_CODES'),
               'ont_def': Config_handle.get('DEFAULTS', 'ONTOLOGIES'),
               'tax_file': Config_handle.get('DEFAULTS', 'TAXONOMY_FILENAME'),
               'uniprot_path': Config_handle.get('SEQUENCE', 'BASE_URL'),
               'ftp_date': Config_handle.get('REGEX', 'FTP_DATE'),
               'ftp_file_start': Config_handle.get('REGEX', 'FTP_FILE_START')
               }
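# For reference, a .cafarc in the INI layout below satisfies the lookups
# above (section and option names come from the code; values are
# illustrative only):
#   [WORKDIR]
#   DEFAULT_PATH = ./workdir
#   [FTP]
#   HOSTNAME = ftp.ebi.ac.uk
#   CURRENT_FILE_PATH = ...
#   OLD_FILE_PATH = ...
#   [DEFAULTS]
#   EXP_EVIDENCE_CODES = ...
#   IEA_EVIDENCE_CODES = ...
#   ONTOLOGIES = ...
#   TAXONOMY_FILENAME = ...
#   [SEQUENCE]
#   BASE_URL = ...
#   [REGEX]
#   FTP_DATE = ...
#   FTP_FILE_START = ...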
work_dir = ConfigParam['workdir'].rstrip('/')
uniprot_path = ConfigParam['uniprot_path'].rstrip('/')
if not os.path.exists(work_dir):
    os.makedirs(work_dir)
parsed_dict = ArgParser.parse(parser, ConfigParam)
t1 = parsed_dict['t1']
t2 = parsed_dict['t2']
outfile_basename = basename(parsed_dict['outfile'])
# With all input parameters validated, the input files' formats are checked below (see FormatChecker.check)
if parsed_dict['user_conf'] == 'T':
    paper_threshold = parsed_dict['user_thresh']
    ann_conf_filter = True
else:
    ann_conf_filter = False
    paper_threshold = 0
if parsed_dict['user_mode'] == 'T':
    t1_input_file = work_dir + '/' + CreateDataset.parse_cafa(t1, ConfigParam)
else:
    t1_input_file = work_dir + '/' + CreateDataset.parse(t1, ConfigParam)
t2_input_file = work_dir + '/' + CreateDataset.parse(t2, ConfigParam)
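# t1 and t2 are the earlier and later annotation snapshots; judging by the
# .iea1/.exponly filtering below, the benchmark is built from proteins whose
# annotations changed status (e.g. electronic-only to experimental) between them.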
# Check that both input files are in the expected format
FormatChecker.check(t1_input_file, t2_input_file, parsed_dict['user_mode'])
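# For reference, uniprot-goa files are tab-separated GAF records along the
# lines of (illustrative, abbreviated):
#   UniProtKB<TAB>P12345<TAB>GENE1<TAB>...<TAB>GO:0005515<TAB>PMID:1234567<TAB>IDA<TAB>...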
paper_annotation_freq = t2_input_file + '_with_annotations_per_paper.txt'
if not os.path.exists(paper_annotation_freq):
    paper_conf_filter = True
    paper_ann_freq_handle = open(paper_annotation_freq, 'w')
else:
    paper_conf_filter = False
if ann_conf_filter or paper_conf_filter:
    [ann_conf, paper_conf] = PaperTermFrequency.count(t2_input_file, parsed_dict['user_EVI'], ann_conf_filter, paper_conf_filter)
    # paper_ann_freq_handle only exists when paper_conf_filter is True
    if paper_conf_filter and len(paper_conf) > 0:
        for i in paper_conf:
            print >> paper_ann_freq_handle, i + '\t' + str(len(paper_conf[i]))
        paper_ann_freq_handle.close()
        paper_conf.clear()
else:
    ann_conf = defaultdict(lambda: defaultdict(set))
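# Each line written to *_with_annotations_per_paper.txt above pairs a paper id
# with the number of annotations it supports: <paper id>\t<annotation count>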
# Filter the t2 file down to experimental annotations that pass the ontology, evidence, organism, source, PubMed, blacklist and confidence filters
FilterDataset.t2_filter(t2_input_file, parsed_dict['user_ONT'], parsed_dict['user_EVI'], parsed_dict['user_ORG'], parsed_dict['user_source'], parsed_dict['user_pubmed'], parsed_dict['black_set'], ann_conf, paper_threshold, ConfigParam['tax_file'])
t2_exp = t2_input_file + '.exponly'
if os.stat(t2_exp).st_size == 0:
    print "Your benchmark set will be empty with the parameters provided."
    sys.exit(1)
if parsed_dict['user_mode'] == 'T':
    t1_iea = t1_input_file + '.iea1'
    CreateBenchmark.parse_cafa(t2_exp, t1_iea)
else:
    FilterDataset.t1_filter_pass1(t1_input_file, t2_exp, ConfigParam['iea_eec'], ConfigParam['exp_eec'])
    t1_iea_handle = open(t1_input_file + '.iea1', 'r')
    t1_exp_handle = open(t1_input_file + '.exp1', 'r')
    CreateBenchmark.parse(t1_iea_handle, t1_exp_handle, t2_exp, ConfigParam['iea_eec'], ConfigParam['exp_eec'])
    t1_iea_handle.close()
    t1_exp_handle.close()
filename = t2_exp + '_bench.txt'
if outfile_basename != '':
    ob = parsed_dict['outfile']
else:
    ob = t2_input_file + '.benchmark'
index = 1
while os.path.exists(ob + '.' + str(index)):
    index += 1
output_filename = ob + '.' + str(index)
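# The numeric suffix versions repeated runs: if <name>.1 already exists the
# output goes to <name>.2, and so on, so earlier benchmarks are never overwritten.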
if os.stat(filename).st_size == 0:
    print "Your benchmark set is empty."
else:
    # Sort the benchmark lines and drop duplicates into the final output file
    subprocess.call('sort ' + filename + ' | uniq > ' + output_filename, shell=True)
    Stats.plot_stats(output_filename, uniprot_path)
print 'Cleaning working directory....'
os.remove(filename)
os.remove(t1_input_file + '.iea1')
if os.path.exists(t1_input_file + '.exp1'):
    os.remove(t1_input_file + '.exp1')
os.remove(t2_input_file + '.exponly')
for root, dirs, files in os.walk(work_dir):
    for fname in files:
        if os.path.getsize(root + '/' + fname) == 0:
            os.remove(root + '/' + fname)
print 'Thank you for using the Benchmark Creator Software'