#!/usr/bin/env python
#Dan Blankenberg
#Script that calls update_blastdb.pl to download preformatted databases
"""Download a preformatted BLAST database and describe it for a data table.

This is a simple example of a Data Manager for BlastDB: it runs NCBI's
``update_blastdb.pl`` script inside the output dataset's extra-files
directory and then overwrites the JSON parameter file it was given with a
data table entry of the form::

    {"data_tables": {"<tool_data_table_name>": [
        {"value": ..., "name": ..., "path": ..., "nucleotide_alias_name": ...}]}}

Usage::

    fetch_blast_db.py --filename PARAMS.json --tool_data_table_name blastdb
"""

import hashlib
import optparse
import os
import subprocess
import sys

try:
    from galaxy.util.json import from_json_string, to_json_string
except ImportError:
    # The Galaxy helpers are not importable (e.g. the script is run or
    # imported outside Galaxy); the standard library provides equivalents.
    from json import dumps as to_json_string
    from json import loads as from_json_string

try:
    string_types = basestring  # Python 2
except NameError:
    string_types = str  # Python 3

DEFAULT_ALGORITHM = hashlib.sha512
CHUNK_SIZE = 2**20 #1mb

ALIAS_DATE_PREFIX = '# Alias file created '


def _path_bytes( path ):
    """Return *path* as bytes so it can be fed to a hash object."""
    if isinstance( path, bytes ):
        # Python 2 native strings are already bytes; hash them unchanged.
        return path
    fsencode = getattr( os, 'fsencode', None )
    if fsencode is not None:
        # Python 3: recover the on-disk byte form of the file name.
        return fsencode( path )
    return path.encode( 'utf-8' )


def get_dir_hash( directory, algorithm=None, followlinks=True, chunk_size=None ):
    """Return a hex digest summarising the contents of *directory*.

    The digest covers, for every directory visited by ``os.walk``: the
    relative path of each sub-directory, then the relative path and the
    full contents of each file, all in sorted name order.

    directory   -- root of the tree to hash
    algorithm   -- a hashlib algorithm name (string) or a zero-argument
                   callable returning a hash object; defaults to
                   DEFAULT_ALGORITHM
    followlinks -- passed through to ``os.walk``
    chunk_size  -- number of bytes read from a file per iteration;
                   defaults to CHUNK_SIZE

    Raises whatever ``open``/``read`` raise for unreadable files.
    """
    chunk_size = chunk_size or CHUNK_SIZE
    algorithm = algorithm or DEFAULT_ALGORITHM
    if isinstance( algorithm, string_types ):
        digest = hashlib.new( algorithm )
    else:
        digest = algorithm()
    #we hash a directory by taking names of directories, files and their contents
    for dirpath, dirnames, filenames in os.walk( directory, followlinks=followlinks ):
        # Sorting in place makes the result independent of listing order;
        # for dirnames it also fixes the order in which os.walk descends.
        dirnames.sort()
        filenames.sort()
        for name in dirnames:
            digest.update( _path_bytes( os.path.relpath( os.path.join( dirpath, name ), directory ) ) )
        for name in filenames:
            filename = os.path.join( dirpath, name )
            digest.update( _path_bytes( os.path.relpath( filename, directory ) ) )
            # 'with' guarantees the handle is closed even if a read fails.
            with open( filename, 'rb' ) as fh:
                while True:
                    data = fh.read( chunk_size )
                    if not data:
                        break
                    digest.update( data )

    return digest.hexdigest()


def _describe_from_alias_file( alias_path ):
    """Best-effort description taken from a BLAST alias file.

    Scans *alias_path* for a ``# Alias file created <date>`` comment and a
    ``TITLE <text>`` line (scanning stops at the first TITLE line).
    Returns ``"<text> (<date>)"`` when both were found, ``<text>`` when
    only the title was found, and ``None`` otherwise. Any error while
    reading or parsing is reported on stderr and not raised.
    """
    title = None
    created = None
    try:
        with open( alias_path ) as alias_fh:
            for line in alias_fh:
                if line.startswith( ALIAS_DATE_PREFIX ):
                    created = line[ len( ALIAS_DATE_PREFIX ): ].strip()
                if line.startswith( 'TITLE' ):
                    title = line.split( None, 1 )[1].strip()
                    break
    except Exception as e:
        # Deliberately broad: a missing or malformed alias file must not
        # abort the run, the caller falls back to another description.
        sys.stderr.write( "Error Parsing Alias file for TITLE and date: %s\n" % ( e ) )
    if title and created:
        return "%s (%s)" % ( title, created )
    return title


def main():
    """Fetch the requested BLAST database and write its data table entry.

    Reads the JSON file named by ``--filename``, creates the directory
    given by ``output_data[0].extra_files_path``, runs
    ``update_blastdb.pl --decompress <blastdb_name>`` in it, and then
    overwrites the same JSON file with the data table entry registered
    under ``--tool_data_table_name``. Exits with status 1 when the
    download script does not report success.
    """
    #Parse Command Line
    parser = optparse.OptionParser()
    parser.add_option( '-f', '--filename', dest='filename', action='store', type='string', default=None, help='filename' )
    parser.add_option( '-t', '--tool_data_table_name', dest='tool_data_table_name', action='store', type='string', default=None, help='tool_data_table_name' )
    (options, _positional) = parser.parse_args()

    with open( options.filename ) as params_fh:
        params = from_json_string( params_fh.read() )
    target_directory = params[ 'output_data' ][0]['extra_files_path']
    os.mkdir( target_directory )

    advanced = params['param_dict']['advanced']
    blastdb_name = params['param_dict']['blastdb_name'] #value
    data_description = advanced.get( 'data_description', None )
    data_id = advanced.get( 'data_id', None )

    cmd_options = [ '--decompress' ]

    command = [ 'update_blastdb.pl' ] + cmd_options + [ blastdb_name ]
    proc = subprocess.Popen( args=command, shell=False, cwd=target_directory )
    return_code = proc.wait()
    # NOTE(review): exit status 1 is treated as the only success value.
    # Which status update_blastdb.pl uses for "files downloaded" appears to
    # depend on its version -- confirm against the installed BLAST+ release.
    if return_code != 1:
        sys.stderr.write( "Error obtaining blastdb (%s)\n" % return_code )
        sys.exit( 1 )

    if not data_id:
        # No id supplied: derive one from the name and the downloaded content.
        data_id = "%s_%s" % ( blastdb_name, get_dir_hash( target_directory ) )

    if not data_description:
        alias_path = os.path.join( target_directory, "%s.nal" % ( blastdb_name ) )
        data_description = _describe_from_alias_file( alias_path )

    if not data_description:
        data_description = data_id

    data_table_entry = { 'value':data_id, 'name':data_description, 'path': os.path.join( blastdb_name, data_id ), 'nucleotide_alias_name': blastdb_name }
    data_manager_dict = { 'data_tables': { options.tool_data_table_name: [ data_table_entry ] } }

    #save info to json file
    # Text mode: the serialised JSON is a str, which a binary-mode handle
    # rejects on Python 3.
    with open( options.filename, 'w' ) as fh:
        fh.write( to_json_string( data_manager_dict ) )

if __name__ == "__main__":
    main()
b/tool-data/tool_data_table_conf.xml.sample new file mode 100644 index 00000000..f69062ff --- /dev/null +++ b/tool-data/tool_data_table_conf.xml.sample @@ -0,0 +1,6 @@ + + + value, name, path + +
+