#!/usr/bin/env python
import os
import sys
import re
from collections import defaultdict
import argparse
import shutil
import ArgParser
import CreateDataset
import FilterDataset
import CreateBenchmark
import PaperTermFrequency
import subprocess
import ConfigParser
import Config
import Stats
import FormatChecker
from os.path import basename
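# ArgParser, CreateDataset, FilterDataset, CreateBenchmark, PaperTermFrequency,
# Config, Stats and FormatChecker are companion modules of this tool (not
# third-party packages) and must sit alongside this script on the import path.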
# Create an argparse parser that converts the raw command-line arguments into Python objects according to the option types declared below
print "*************************************************"
print "Welcome to the Benchmark Creation tool !!!!!"
print "*************************************************"
parser = argparse.ArgumentParser(prog='bm.py', description='Creates a set of benchmark proteins')
parser.add_argument('-G', '--organism', nargs='*', default=['all'], help='A set of organisms (example: Saccharomyces cerevisiae or 7227) separated by spaces. Default is all.')
parser.add_argument('-N', '--ontology', nargs='*', default=['all'], help='A set of ontologies (F, P, C) separated by spaces. Default is all.')
parser.add_argument('-V', '--evidence', nargs='*', default=['all'], help='A set of GO experimental evidence codes (example: IPI, IDA, EXP) separated by spaces. Default is all.')
parser.add_argument('-C', '--cafa', default='F', help='Takes either T or F. If T, input1 must be a CAFA targets file; if F, input1 must be a uniprot-goa file. Default is F.')
parser.add_argument('-I1', '--input1', nargs='*', help='Mandatory. Path to the first input file.')
parser.add_argument('-I2', '--input2', nargs='*', help='Mandatory. Path to the second input file.')
parser.add_argument('-O', '--output', nargs='*', default=[], help='Optional. Output filename.')
parser.add_argument('-S', '--source', action='store', nargs='*', default=['all'], help='A set of annotation sources (example: UniProt, InterPro) separated by spaces. Default is all.')
parser.add_argument('-P', '--pubmed', default='F', help='Turns on the PubMed filter. If on, GO terms without any PubMed references are excluded from the benchmark set. Off by default.')
parser.add_argument('-F', '--confidence', default='F', help='Turns on the annotation confidence filter. If on, GO term assignments to proteins documented in few papers (4 or fewer by default) are excluded from the benchmark set. Off by default.')
parser.add_argument('-T', '--threshold', type=int, default=4, help='Minimum number of supporting papers required for a confident annotation. Defaults to 4.')
parser.add_argument('-B', '--blacklist', nargs='*', default=[], help='A list of PubMed ids; all GO terms and proteins annotated in them are eliminated from the benchmark set. Default is an empty list.')
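# Example invocation (file paths are placeholders; -I1/-I2 must point at two
# uniprot-goa releases, or a CAFA targets file as input1 when -C T is given):
#   bm.py -I1 <goa_file_t1> -I2 <goa_file_t2> -G 7227 -N F -V IDA IPI -T 6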
# Search for config file in the current directory
fname_ind = 0
for root, dirs, files in os.walk('.'):
    for fname in files:
        if fname == '.cafarc':
            fname_ind = 1
    if fname_ind == 0:
        print 'Config file not found'
        print 'Creating new configuration file...'
        print '***************************************************************'
        Config.create()
    break   # only the top-level directory is searched
Config_handle = ConfigParser.ConfigParser()
Config_handle.read('.cafarc')
ConfigParam = {'workdir': Config_handle.get('WORKDIR', 'DEFAULT_PATH'),
               'ftp_host': Config_handle.get('FTP', 'HOSTNAME'),
               'ftp_curr_path': Config_handle.get('FTP', 'CURRENT_FILE_PATH'),
               'ftp_old_path': Config_handle.get('FTP', 'OLD_FILE_PATH'),
               'exp_eec': Config_handle.get('DEFAULTS', 'EXP_EVIDENCE_CODES'),
               'iea_eec': Config_handle.get('DEFAULTS', 'IEA_EVIDENCE_CODES'),
               'ont_def': Config_handle.get('DEFAULTS', 'ONTOLOGIES'),
               'tax_file': Config_handle.get('DEFAULTS', 'TAXONOMY_FILENAME'),
               'uniprot_path': Config_handle.get('SEQUENCE', 'BASE_URL'),
               'ftp_date': Config_handle.get('REGEX', 'FTP_DATE'),
               'ftp_file_start': Config_handle.get('REGEX', 'FTP_FILE_START')
               }
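# For reference, a .cafarc in the INI layout below satisfies the lookups
# above (section and option names come from the code; values are
# illustrative only):
#   [WORKDIR]
#   DEFAULT_PATH = ./workdir
#   [FTP]
#   HOSTNAME = ftp.ebi.ac.uk
#   CURRENT_FILE_PATH = ...
#   OLD_FILE_PATH = ...
#   [DEFAULTS]
#   EXP_EVIDENCE_CODES = ...
#   IEA_EVIDENCE_CODES = ...
#   ONTOLOGIES = ...
#   TAXONOMY_FILENAME = ...
#   [SEQUENCE]
#   BASE_URL = ...
#   [REGEX]
#   FTP_DATE = ...
#   FTP_FILE_START = ...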
work_dir = ConfigParam['workdir'].rstrip('/')
uniprot_path = ConfigParam['uniprot_path'].rstrip('/')
if not os.path.exists(work_dir):
    os.makedirs(work_dir)
parsed_dict = ArgParser.parse(parser, ConfigParam)
t1 = parsed_dict['t1']
t2 = parsed_dict['t2']
outfile_basename = basename(parsed_dict['outfile'])
# With all input parameters validated, the input files' formats are checked below (see FormatChecker.check)
if parsed_dict['user_conf'] == 'T':
    paper_threshold = parsed_dict['user_thresh']
    ann_conf_filter = True
else:
    ann_conf_filter = False
    paper_threshold = 0
if parsed_dict['user_mode'] == 'T':
    t1_input_file = work_dir + '/' + CreateDataset.parse_cafa(t1, ConfigParam)
else:
    t1_input_file = work_dir + '/' + CreateDataset.parse(t1, ConfigParam)
t2_input_file = work_dir + '/' + CreateDataset.parse(t2, ConfigParam)
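# t1 and t2 are the earlier and later annotation snapshots; judging by the
# .iea1/.exponly filtering below, the benchmark is built from proteins whose
# annotations changed status (e.g. electronic-only to experimental) between them.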
# Check that both input files are in the expected format
FormatChecker.check(t1_input_file, t2_input_file, parsed_dict['user_mode'])
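# For reference, uniprot-goa files are tab-separated GAF records along the
# lines of (illustrative, abbreviated):
#   UniProtKB<TAB>P12345<TAB>GENE1<TAB>...<TAB>GO:0005515<TAB>PMID:1234567<TAB>IDA<TAB>...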
paper_annotation_freq = t2_input_file + '_with_annotations_per_paper.txt'
if not os.path.exists(paper_annotation_freq):
    paper_conf_filter = True
    paper_ann_freq_handle = open(paper_annotation_freq, 'w')
else:
    paper_conf_filter = False
if ann_conf_filter or paper_conf_filter:
    [ann_conf, paper_conf] = PaperTermFrequency.count(t2_input_file, parsed_dict['user_EVI'], ann_conf_filter, paper_conf_filter)
    # paper_ann_freq_handle only exists when paper_conf_filter is True
    if paper_conf_filter and len(paper_conf) > 0:
        for i in paper_conf:
            print >> paper_ann_freq_handle, i + '\t' + str(len(paper_conf[i]))
        paper_ann_freq_handle.close()
        paper_conf.clear()
else:
    ann_conf = defaultdict(lambda: defaultdict(set))
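# Each line written to *_with_annotations_per_paper.txt above pairs a paper id
# with the number of annotations it supports: <paper id>\t<annotation count>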
# Filter the t2 file down to experimental annotations that pass the ontology, evidence, organism, source, PubMed, blacklist and confidence filters
FilterDataset.t2_filter(t2_input_file, parsed_dict['user_ONT'], parsed_dict['user_EVI'], parsed_dict['user_ORG'], parsed_dict['user_source'], parsed_dict['user_pubmed'], parsed_dict['black_set'], ann_conf, paper_threshold, ConfigParam['tax_file'])
t2_exp = t2_input_file + '.exponly'
if os.stat(t2_exp).st_size == 0:
    print "Your benchmark set will be empty with the parameters provided."
    sys.exit(1)
if parsed_dict['user_mode'] == 'T':
    t1_iea = t1_input_file + '.iea1'
    CreateBenchmark.parse_cafa(t2_exp, t1_iea)
else:
    FilterDataset.t1_filter_pass1(t1_input_file, t2_exp, ConfigParam['iea_eec'], ConfigParam['exp_eec'])
    t1_iea_handle = open(t1_input_file + '.iea1', 'r')
    t1_exp_handle = open(t1_input_file + '.exp1', 'r')
    CreateBenchmark.parse(t1_iea_handle, t1_exp_handle, t2_exp, ConfigParam['iea_eec'], ConfigParam['exp_eec'])
    t1_iea_handle.close()
    t1_exp_handle.close()
filename = t2_exp + '_bench.txt'
if outfile_basename != '':
    ob = parsed_dict['outfile']
else:
    ob = t2_input_file + '.benchmark'
index = 1
while os.path.exists(ob + '.' + str(index)):
    index += 1
output_filename = ob + '.' + str(index)
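# The numeric suffix versions repeated runs: if <name>.1 already exists the
# output goes to <name>.2, and so on, so earlier benchmarks are never overwritten.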
if os.stat(filename).st_size == 0:
    print "Your benchmark set is empty."
else:
    # Sort the benchmark lines and drop duplicates into the final output file
    subprocess.call('sort ' + filename + ' | uniq > ' + output_filename, shell=True)
    Stats.plot_stats(output_filename, uniprot_path)
print 'Cleaning working directory....'
os.remove(filename)
os.remove(t1_input_file + '.iea1')
if os.path.exists(t1_input_file + '.exp1'):
    os.remove(t1_input_file + '.exp1')
os.remove(t2_input_file + '.exponly')
for root, dirs, files in os.walk(work_dir):
    for fname in files:
        if os.path.getsize(root + '/' + fname) == 0:
            os.remove(root + '/' + fname)
print 'Thank you for using the Benchmark Creator Software'