kofamscan: add data table (#5956)

* kofamscan: add data table. With reference data consisting of multiple files (parameters -p and -k) summing up to 1.3G, kofamscan qualifies for a data table. Currently the idea is to have two data tables: 1. the first contains the main reference data - path to the profiles dir - path to the ko list file; 2. hal files (hal files are contained in the profiles dir and basically just list hmm files, i.e. specify a subset of the hmm files. In the current reference data there are hal files for pro- and eukaryotes, but this may change?). Also, one could use this to list the single HMMs, such that users can run against a single HMM. Also: - use tabular output by default - change format of the output from txt to tabular if needed - fixes the version command * try different syntax for input ref
galaxyproject · Apr 22, 2024 · db30370 · db30370
1 parent edb9fd7
commit db30370
Show file tree

Hide file tree

Showing 11 changed files with 146 additions and 26 deletions.
diff --git a/tools/kofamscan/kofamscan.xml b/tools/kofamscan/kofamscan.xml
@@ -2,7 +2,7 @@
     <description>gene function annotation based on KEGG orthology and HMM</description>
     <macros>
         <token name="@TOOL_VERSION@">1.3.0</token>
-        <token name="@VERSION_SUFFIX@">1</token>
+        <token name="@VERSION_SUFFIX@">2</token>
         <xml name="reportannotation" token_selected="">
             <param name="reportannotation" type="boolean" truevalue="--report-unannotated" falsevalue="--no-report-unannotated" checked="@SELECTED@" label="Include sequence name to outputs even if no KOs are assigned?"/>
         </xml>
@@ -14,27 +14,35 @@
         <requirement type="package" version="@TOOL_VERSION@">kofamscan</requirement>
         <requirement type="package" version="3.0">zip</requirement>
     </requirements>
-    <version_command><![CDATA[sansa -v | grep "Sansa " | cut -d "v" -f 3]]></version_command>
+    <version_command><![CDATA[exec_annotation --version | cut -d " " -f 2]]></version_command>
     <command detect_errors="exit_code"><![CDATA[
-## preprocessing
-mkdir ./profile &&
-mkdir ./temp_extract &&
-#if $p_cond.p_sel == 'compressed'
-    tar -xf '${p_cond.p}' -C temp_extract &&
-    find ./temp_extract/ -name '*.hmm' -exec mv {} ./profile \; &&
-    find ./temp_extract/ -name '*.hal' -exec mv {} ./profile \; &&
-#elif $p_cond.p_sel == 'hmm'
-    ## input files require prefix 'K' and file extension '.hmm'
-    #for $i, $current in enumerate($p_cond.p)
-        ln -s '$current' 'profile/K${i}.hmm' &&
-    #end for
+#if $p_cond.p_sel != "cached"
+    ## preprocessing
+    mkdir ./profile &&
+    mkdir ./temp_extract &&
+    #if $p_cond.p_sel == 'compressed'
+        tar -xf '${p_cond.p}' -C temp_extract &&
+        find ./temp_extract/ -name '*.hmm' -exec mv {} ./profile \; &&
+        find ./temp_extract/ -name '*.hal' -exec mv {} ./profile \; &&
+    #elif $p_cond.p_sel == 'hmm'
+        ## input files require prefix 'K' and file extension '.hmm'
+        #for $i, $current in enumerate($p_cond.p)
+            ln -s '$current' 'profile/K${i}.hmm' &&
+        #end for
+    #end if
 #end if
 
 ## run
 exec_annotation
--p 'profile'
+
+#if $p_cond.p_sel != "cached"
+    -p 'profile'
+    -k '$p_cond.k'
+#else
+    -p '$p_cond.kofam.fields.profile_dir/#if $p_cond.kofam_subset then $p_cond.kofam_subset else ""#'
+    -k '$p_cond.kofam.fields.ko_list'
+#end if
 -o 'result.txt'
--k '$k'
 --cpu \${GALAXY_SLOTS:-4}
 -E $E
 -T $ap.T
@@ -54,17 +62,34 @@ $ap.f_cond.reportannotation
         <param name="query" type="data" format="fasta" label="Select query sequence file" help="Nucleotide sequences are not accepted."/>
         <conditional name="p_cond">
             <param name="p_sel" type="select" label="Select profile database format">
-                <option value="compressed" selected="true">Compressed set of HMM and HAL file(s)</option>
+                <option value="cached" selected="true">Cached</option>
+                <option value="compressed">Compressed set of HMM and HAL file(s)</option>
                 <option value="hmm">HMM file(s)</option>
             </param>
+            <when value="cached">
+                <param name="kofam" type="select">
+                    <options from_data_table="kofam"/>
+                </param>
+                <param name="kofam_subset" type="select" optional="true">
+                    <options from_data_table="kofam_subset">
+                        <filter type="param_value" column="2" ref="kofam"/>
+                        <filter type="sort_by" column="1"/>
+                        <filter type="unique_value" column="1"/>
+                        <column name="name" index="1"/>
+                        <column name="value" index="0"/>
+                    </options>
+                    <validator type="no_options" message="No option available for this input"/>
+                </param>
+            </when>
             <when value="compressed">
                 <param argument="-p" type="data" format="tar" label="Select a compressed file with HMM and HAL file(s)" help="Compressed archives are available from KofamKOALA web service (https://www.genome.jp/tools/kofamkoala/)."/>
+                <param argument="-k" type="data" format="tabular" label="Select KO list file"/>
             </when>
             <when value="hmm">
                 <param argument="-p" type="data" format="hmm3" multiple="true" label="Select profile HMM file(s)"/>
+                <param argument="-k" type="data" format="tabular" label="Select KO list file"/>
             </when>
         </conditional>
-        <param argument="-k" type="data" format="tabular" label="Select KO list file"/>
         <param argument="-E" type="float" min="0.0" max="1.0" value="0.01" label="Set E-value threshold"/>
         <section name="ap" title="Advanced parameters" expanded="true">
             <param argument="-T" type="integer" value="1" label="Set threshold scale" help="The score thresholds will be multiplied by this value."/>
@@ -97,15 +122,18 @@ $ap.f_cond.reportannotation
         </section>
     </inputs>
     <outputs>
+        <data name="out_result" format="txt" from_work_dir="result.txt" label="${tool.name} on ${on_string}: Results">
+            <filter>'result' in ap['out']</filter>
+            <change_format>
+                <when input='ap.f_cond.f_sel' value="detail-tsv" format="tabular" />
+            </change_format>
+        </data>
         <data name="out_alignments" format="zip" from_work_dir="tmp/alignments.zip" label="${tool.name} on ${on_string}: HMMER alignments">
             <filter>'alignments' in ap['out']</filter>
         </data>
         <data name="out_output" format="txt" from_work_dir="tmp/output/output.txt" label="${tool.name} on ${on_string}: HMMER output">
             <filter>'output' in ap['out']</filter>
         </data>
-        <data name="out_result" format="txt" from_work_dir="result.txt" label="${tool.name} on ${on_string}: Results">
-            <filter>'result' in ap['out']</filter>
-        </data>
         <data name="out_tabular" format="txt" from_work_dir="tmp/tabular/tabular.txt" label="${tool.name} on ${on_string}: HMMER tabular">
             <filter>'tabular' in ap['out']</filter>
         </data>
@@ -131,7 +159,7 @@ $ap.f_cond.reportannotation
             <param name="query" value="query.fasta"/>
             <conditional name="p_cond">
                 <param name="p_sel" value="hmm"/>
-                <param name="p" value="K00001.hmm,K00002.hmm,K00003.hmm"/>
+                <param name="p" value="profiles/K00001.hmm,profiles/K00002.hmm,profiles/K00003.hmm"/>
             </conditional>
             <param name="k" value="ko"/>
             <param name="E" value="0.02"/>
@@ -154,7 +182,7 @@ $ap.f_cond.reportannotation
                     <has_line line="Internal pipeline statistics summary:"/>
                 </assert_contents>
             </output>
-            <output name="out_result">
+            <output name="out_result" ftype="tabular">
                 <assert_contents>
                     <has_n_lines n="9"/>
                     <has_text_matching expression=".+sp\|P19858\|LDHA_BOVIN"/>
@@ -172,15 +200,15 @@ $ap.f_cond.reportannotation
             <param name="query" value="query.fasta"/>
             <conditional name="p_cond">
                 <param name="p_sel" value="hmm"/>
-                <param name="p" value="K00001.hmm,K00002.hmm,K00003.hmm"/>
+                <param name="p" value="profiles/K00001.hmm,profiles/K00002.hmm,profiles/K00003.hmm"/>
             </conditional>
             <param name="k" value="ko"/>
             <section name="ap">
                 <conditional name="f_cond">
                     <param name="f_sel" value="mapper"/>
                 </conditional>
             </section>
-            <output name="out_result">
+            <output name="out_result" ftype="txt">
                 <assert_contents>
                     <has_n_lines n="7"/>
                     <has_line line="sp|P19858|LDHA_BOVIN"/>
@@ -192,7 +220,7 @@ $ap.f_cond.reportannotation
             <param name="query" value="query.fasta"/>
             <conditional name="p_cond">
                 <param name="p_sel" value="hmm"/>
-                <param name="p" value="K00001.hmm,K00002.hmm,K00003.hmm"/>
+                <param name="p" value="profiles/K00001.hmm,profiles/K00002.hmm,profiles/K00003.hmm"/>
             </conditional>
             <param name="k" value="ko"/>
             <section name="ap">
@@ -222,6 +250,47 @@ $ap.f_cond.reportannotation
                 </assert_contents>
             </output>
         </test>
+        <!-- #6 test with cached data -->
+        <test expect_num_outputs="1">
+            <param name="query" value="query.fasta"/>
+            <conditional name="p_cond">
+                <param name="p_sel" value="cached"/>
+                <param name="kofam" value="test_value"/>
+            </conditional>
+            <param name="k" value="ko"/>
+            <section name="ap">
+                <conditional name="f_cond">
+                    <param name="f_sel" value="mapper-one-line"/>
+                </conditional>
+            </section>
+            <output name="out_result" ftype="txt">
+                <assert_contents>
+                    <has_n_lines n="7"/>
+                    <has_line line="sp|P19858|LDHA_BOVIN"/>
+                </assert_contents>
+            </output>
+        </test>
+        <!-- #7 test with cached data + subset -->
+        <test expect_num_outputs="1">
+            <param name="query" value="query.fasta"/>
+            <conditional name="p_cond">
+                <param name="p_sel" value="cached"/>
+                <param name="kofam" value="test_value"/>
+                <param name="kofam_subset" value="SUBSET"/>
+            </conditional>
+            <param name="k" value="ko"/>
+            <section name="ap">
+                <conditional name="f_cond">
+                    <param name="f_sel" value="mapper-one-line"/>
+                </conditional>
+            </section>
+            <output name="out_result" ftype="txt">
+                <assert_contents>
+                    <has_n_lines n="7"/>
+                    <has_line line="sp|P19858|LDHA_BOVIN"/>
+                </assert_contents>
+            </output>
+        </test>
     </tests>
     <help><![CDATA[
 .. class:: infomark

diff --git a/tools/kofamscan/test-data/kofam_subsets_test.loc b/tools/kofamscan/test-data/kofam_subsets_test.loc
@@ -0,0 +1,2 @@
+#value<tab>name<tab>kofam_value
+subset.hal	SUBSET	test_value
diff --git a/tools/kofamscan/test-data/kofam_test.loc b/tools/kofamscan/test-data/kofam_test.loc
@@ -0,0 +1,2 @@
+#value<tab>name<tab>profile_dir<tab>ko_list
+test_value	test_name	${__HERE__}/profiles	${__HERE__}/ko
diff --git a/tools/kofamscan/test-data/K00001.hmm → ...s/kofamscan/test-data/profiles/K00001.hmm b/tools/kofamscan/test-data/K00001.hmm → ...s/kofamscan/test-data/profiles/K00001.hmm
diff --git a/tools/kofamscan/test-data/K00002.hmm → ...s/kofamscan/test-data/profiles/K00002.hmm b/tools/kofamscan/test-data/K00002.hmm → ...s/kofamscan/test-data/profiles/K00002.hmm
diff --git a/tools/kofamscan/test-data/K00003.hmm → ...s/kofamscan/test-data/profiles/K00003.hmm b/tools/kofamscan/test-data/K00003.hmm → ...s/kofamscan/test-data/profiles/K00003.hmm
diff --git a/tools/kofamscan/test-data/profiles/subset.hal b/tools/kofamscan/test-data/profiles/subset.hal
@@ -0,0 +1,2 @@
+K00002.hmm
+K00003.hmm
diff --git a/tools/kofamscan/tool-data/kofam.loc.sample b/tools/kofamscan/tool-data/kofam.loc.sample
@@ -0,0 +1 @@
+# value<tab>name<tab>profile_dir<tab>ko_list
diff --git a/tools/kofamscan/tool-data/kofam_subset.loc.sample b/tools/kofamscan/tool-data/kofam_subset.loc.sample
@@ -0,0 +1,2 @@
+#value<tab>name<tab>kofam_value
+subset.hal	SUBSET	test_value
diff --git a/tools/kofamscan/tool_data_table_conf.xml.sample b/tools/kofamscan/tool_data_table_conf.xml.sample
@@ -0,0 +1,31 @@
+<?xml version="1.0"?>
+<tables>
+    <!-- versioned data can be downloaded from
+        ftp://ftp.genome.jp/pub/db/kofam/ or https://www.genome.jp/ftp/db/kofam/
+        see also https://www.genome.jp/tools/kofamkoala/.
+        
+        the downloads contain ko_list.gz and profiles.tar.gz for each version
+        the ko_list entry should point to the file extracted from ko_list.gz
+        the profile_dir entry should point to the directory extracted from profiles.tar.gz
+        
+         -->
+    <table name="kofam" comment_char="#">
+        <columns>value, name, profile_dir, ko_list</columns>
+        <file path="tool-data/kofam.loc" />
+    </table>
+    <!-- 
+        the profile dir contains many K....hmm files and usually two .hal files
+        the .hal files are simple text files listing a set of K files (one per line)
+        which allow to specify subsets of K files easily
+
+        this data table allows specifying these subset files for each entry in
+        the kofam data table
+        - value should be a .hal file in the profiles dir (could also be a K file)
+        - name is the name shown to the user in the select
+        - kofam_value should be equal to the value of the corresponding entry in kofam
+        -->
+    <table name="kofam_subset" comment_char="#">
+        <columns>value, name, kofam_value</columns>
+        <file path="tool-data/kofam_subset.loc" />
+    </table>
+</tables>
diff --git a/tools/kofamscan/tool_data_table_conf.xml.test b/tools/kofamscan/tool_data_table_conf.xml.test
@@ -0,0 +1,11 @@
+<?xml version="1.0"?>
+<tables>
+    <table name="kofam" comment_char="#">
+        <columns>value, name, profile_dir, ko_list</columns>
+        <file path="${__HERE__}/test-data/kofam_test.loc" />
+    </table>
+    <table name="kofam_subset" comment_char="#">
+        <columns>value, name, kofam_value</columns>
+        <file path="${__HERE__}/test-data/kofam_subsets_test.loc" />
+    </table>
+</tables>
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		#value<tab>name<tab>kofam_value
		subset.hal SUBSET test_value
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		#value<tab>name<tab>profile_dir<tab>ko_list
		test_value test_name ${__HERE__}/profiles ${__HERE__}/ko
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		# value<tab>name<tab>profile_dir<tab>ko_list