Example Data Manager: update_blastdb.pl

Downloads and populates blastdb data table. This is just a simple example to demonstrate the use of Data Managers for processing BlastDB. Available on the Galaxy Test Tool Shed: http://testtoolshed.g2.bx.psu.edu/view/blankenberg/data_manager_example_blastdb_ncbi_update_blastdb See also: Blankenberg et al (2014) Wrangling Galaxy's reference data http://dx.doi.org/10.1093/bioinformatics/btu119 Associated wiki pages: https://wiki.galaxyproject.org/Admin/Tools/DataManagers
peterjc · Apr 8, 2014 · 21d7cff · 21d7cff · peterjc · Apr 11, 2014
1 parent d24a5e6
commit 21d7cff
Show file tree

Hide file tree

Showing 7 changed files with 175 additions and 0 deletions.
diff --git a/data_managers/ncbi_blastdb/README b/data_managers/ncbi_blastdb/README
@@ -0,0 +1,3 @@
+Downloads and populates blastdb data table. This is just a simple example to demonstrate the use of Data Managers for processing BlastDB.
+
+Uses NCBI's update_blastdb.pl script.
diff --git a/data_managers/ncbi_blastdb/blastdb.xml b/data_managers/ncbi_blastdb/blastdb.xml
@@ -0,0 +1,48 @@
+<!-- Data manager tool definition: runs fetch_blast_db.py, passing ${out_file} as the file name and "blastdb" as the data table name. -->
+<tool id="data_manager_blast_db" name="Blast DB" version="0.0.1" tool_type="manage_data">
+    <description>Downloader</description>
+    <command interpreter="python">fetch_blast_db.py --filename "${out_file}" --tool_data_table_name "blastdb"</command>
+    <!-- NOTE(review): presumably the blast+ package is what puts update_blastdb.pl on the PATH for fetch_blast_db.py; confirm. -->
+    <requirements>
+        <requirement type="package" version="2.2.28">blast+</requirement>
+    </requirements>
+    <!-- Any exit code of 1 or above is reported as a fatal tool error. -->
+    <stdio>
+        <exit_code range="1:" level="fatal" description="Tool exception" />
+    </stdio>
+    <inputs>
+        <param name="blastdb_name" type="text" label="Blast DB Name" help="try &quot;nt&quot; as an example" optional="False"/>
+        <!-- The advanced branch exposes optional overrides for the display name and ID; fetch_blast_db.py generates both when they are empty or absent. -->
+        <conditional name="advanced">
+            <param name="advanced_selector" type="select" label="Advanced Options">
+                <option value="basic" selected="True">Basic</option>
+                <option value="advanced">Advanced</option>
+            </param>
+            <when value="basic">
+            </when>
+            <when value="advanced">
+                <param type="text" name="data_description" value="" label="Display name" help="Optional"/>
+                <param type="text" name="data_id" value="" label="ID for sequence" help="Optional"/>
+            </when>
+        </conditional>
+    </inputs>
+    <!-- fetch_blast_db.py reads its parameters from this file and then overwrites it with the data table entry as JSON. -->
+    <outputs>
+        <data name="out_file" format="data_manager_json"/>
+    </outputs>
+    <!-- Fetches the "est" database with basic options and compares the output against est_out.json. -->
+    <tests>
+        <test>
+            <param name="blastdb_name" value="est"/>
+            <param name="advanced_selector" value="basic"/>
+            <output name="out_file" file="est_out.json"/>
+        </test>
+    </tests>
+    <help>
+**What it does**
+
+Downloads Blast DBs and updates blastdb tool data tables.
+
+------
+
+
+.. class:: infomark
+
+**Notice:** This is a functional, but basic, tool for fetching preformatted blastdbs.
+
+    </help>
+</tool>
diff --git a/data_managers/ncbi_blastdb/data_manager_conf.xml b/data_managers/ncbi_blastdb/data_manager_conf.xml
@@ -0,0 +1,18 @@
+<?xml version="1.0"?>
+<!-- Registers the Blast DB data manager tool and describes how its output fills the "blastdb" data table. -->
+<data_managers>
+    <!-- NOTE(review): tool_file names data_manager/blastdb.xml, while this change adds the tool under data_managers/ncbi_blastdb/; confirm the path resolves. -->
+    <data_manager tool_file="data_manager/blastdb.xml" id="ncbi_blast_plus_update_blastdb">
+        <data_table name="blastdb">
+            <output>
+                <column name="value" />
+                <column name="name" />
+                <!-- The path column refers to the out_file output. Presumably the downloaded directory is moved to blastdb/${path} under the data manager data path, and the stored value is then rewritten to the alias name inside that directory and made absolute; confirm against the Galaxy data manager documentation. -->
+                <column name="path" output_ref="out_file" >
+                    <move type="directory">
+                        <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">blastdb/${path}</target>
+                    </move>
+                    <value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/blastdb/${path}/${nucleotide_alias_name}</value_translation>
+                    <value_translation type="function">abspath</value_translation>
+                </column>
+            </output>
+        </data_table>
+    </data_manager>
+</data_managers>
diff --git a/data_managers/ncbi_blastdb/fetch_blast_db.py b/data_managers/ncbi_blastdb/fetch_blast_db.py
@@ -0,0 +1,93 @@
+#!/usr/bin/env python
+#Dan Blankenberg
+#Script that calls update_blastdb.pl to download preformatted databases
+
+import optparse
+import os
+import sys
+import subprocess
+import hashlib
+
+from galaxy.util.json import from_json_string, to_json_string
+#Hash constructor used by get_dir_hash when the caller does not supply an algorithm
+DEFAULT_ALGORITHM = hashlib.sha512
+#Number of bytes read per call while hashing file contents
+CHUNK_SIZE = 2**20 #1 MiB
+
+def get_dir_hash( directory, algorithm=None, followlinks=True, chunk_size=None ):
+    chunk_size = chunk_size or CHUNK_SIZE
+    algorithm = algorithm or DEFAULT_ALGORITHM
+    if isinstance( algorithm, basestring ):
+        hash = hashlib.new( algorithm )
+    else:
+        hash = algorithm()
+    #we hash a directory by taking names of directories, files and their contents
+    for dirpath, dirnames, filenames in os.walk( directory, followlinks=followlinks ):
+        dirnames.sort()
+        filenames.sort()
+        for name in dirnames:
+            hash.update( os.path.relpath( os.path.join( dirpath, name ), directory ) )
+        for name in filenames:
+            filename = os.path.join( dirpath, name )
+            hash.update( os.path.relpath( filename, directory ) )
+            fh = open( filename, 'rb' )
+            while True:
+                data = fh.read( chunk_size )
+                if not data:
+                    break
+                hash.update( data )
+            fh.close()
+
+    return hash.hexdigest()
+
+def main():
+    #Parse Command Line
+    parser = optparse.OptionParser()
+    parser.add_option( '-f', '--filename', dest='filename', action='store', type='string', default=None, help='filename' )
+    parser.add_option( '-t', '--tool_data_table_name', dest='tool_data_table_name', action='store', type='string', default=None, help='tool_data_table_name' )
+    (options, args) = parser.parse_args()
+
+    params = from_json_string( open( options.filename ).read() )
+    target_directory = params[ 'output_data' ][0]['extra_files_path']
+    os.mkdir( target_directory )
+
+    blastdb_name = params['param_dict']['blastdb_name'] #value
+    data_description = params['param_dict']['advanced'].get( 'data_description', None )
+    data_id = params['param_dict']['advanced'].get( 'data_id', None )
+
+    cmd_options = [ '--decompress' ]
+
+    args = [ 'update_blastdb.pl' ] + cmd_options + [ blastdb_name ]
+    proc = subprocess.Popen( args=args, shell=False, cwd=target_directory )
+    return_code = proc.wait()
+    if return_code != 1:
+        print >> sys.stderr, "Error obtaining blastdb (%s)" % return_code
+        sys.exit( 1 )
+
+    if not data_id:
+        data_id = "%s_%s" % ( blastdb_name, get_dir_hash( target_directory ) )
+
+    if not data_description:
+        alias_date = None
+        try:
+            for line in open( os.path.join( target_directory, "%s.nal" % ( blastdb_name ) ) ):
+                if line.startswith( '# Alias file created ' ):
+                    alias_date = line.split( '# Alias file created ', 1 )[1].strip()
+                if line.startswith( 'TITLE' ):
+                    data_description = line.split( None, 1 )[1].strip()
+                    break
+        except Exception, e:
+            print >> sys.stderr, "Error Parsing Alias file for TITLE and date: %s" % ( e )
+        if alias_date and data_description:
+            data_description = "%s (%s)" % ( data_description, alias_date )
+
+    if not data_description:
+        data_description = data_id
+
+    data_table_entry = { 'value':data_id, 'name':data_description, 'path': os.path.join( blastdb_name, data_id ), 'nucleotide_alias_name': blastdb_name }
+    data_manager_dict = { 'data_tables': { options.tool_data_table_name: [ data_table_entry ]  } }
+
+    #save info to json file
+    with open( options.filename, 'wb' ) as fh:
+        fh.write( to_json_string( data_manager_dict ) )
+
+if __name__ == "__main__":
+    main()
diff --git a/data_managers/ncbi_blastdb/tool_dependencies.xml b/data_managers/ncbi_blastdb/tool_dependencies.xml
@@ -0,0 +1,6 @@
+<!-- Declares the blast+ 2.2.28 package dependency, pointing at a pinned revision of the package_blast_plus_2_2_28 repository owned by iuc. -->
+<tool_dependency>
+    <package name="blast+" version="2.2.28">
+        <repository toolshed="http://testtoolshed.g2.bx.psu.edu" name="package_blast_plus_2_2_28" owner="iuc" changeset_revision="5a449da71d08" />
+    </package>
+</tool_dependency>
+
diff --git a/test-data/est_out.json b/test-data/est_out.json
@@ -0,0 +1 @@
+{"data_tables": {"blastdb": [{"path": "est/est_a3aebb9941bff066cfbd40ebab14c3992f7aadabb64999f3e3b53d783c06f08033ba9066e5efd9380c6bbf9dcec808a281b7a6e9138087cc207c93f2e3ae3f67", "nucleotide_alias_name": "est", "name": "Database of GenBank+EMBL+DDBJ sequences from EST Divisions (12/05/2013 07:12:35)", "value": "est_a3aebb9941bff066cfbd40ebab14c3992f7aadabb64999f3e3b53d783c06f08033ba9066e5efd9380c6bbf9dcec808a281b7a6e9138087cc207c93f2e3ae3f67"}]}}
diff --git a/tool-data/tool_data_table_conf.xml.sample b/tool-data/tool_data_table_conf.xml.sample
@@ -0,0 +1,6 @@
+<!-- Sample declaration of the "blastdb" data table: three columns (value, name, path) stored in tool-data/blastdb.loc, with "#" as the comment character. -->
+<tables>
+    <table name="blastdb" comment_char="#">
+        <columns>value, name, path</columns>
+        <file path="tool-data/blastdb.loc" />
+    </table>
+</tables>
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		Downloads and populates blastdb data table. This is just a simple example to demonstrate the use of Data Managers for processing BlastDB.

		Uses NCBI's update_blastdb.pl script.
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		{"data_tables": {"blastdb": [{"path": "est/est_a3aebb9941bff066cfbd40ebab14c3992f7aadabb64999f3e3b53d783c06f08033ba9066e5efd9380c6bbf9dcec808a281b7a6e9138087cc207c93f2e3ae3f67", "nucleotide_alias_name": "est", "name": "Database of GenBank+EMBL+DDBJ sequences from EST Divisions (12/05/2013 07:12:35)", "value": "est_a3aebb9941bff066cfbd40ebab14c3992f7aadabb64999f3e3b53d783c06f08033ba9066e5efd9380c6bbf9dcec808a281b7a6e9138087cc207c93f2e3ae3f67"}]}}