Skip to content

Commit

Permalink
kofamscan: add data table (#5956)
Browse files Browse the repository at this point in the history
* kofamscan: add data table

with reference data consisting of multiple files (parameters -p and -k)
summing up to 1.3G kofamscan qualifies for a data table.

currently the idea is to have two data tables

1. the first contains the main reference data
   - path to the profiles dir
   - path to the ko list file
2. hal files (hal files are contained in the profiles dir and basically
   just list hmm files, i.e. specify a subset of the hmm files. in the
   current reference data there are hal files for pro and eukaryotes, but
   this may change?) also one could use this to list the single HMMs,
   such that users can run against a single HMM

also

- use tabular output by default
- change format of the output from txt to tabular if needed
- fixes the version command

* try different syntax for input ref
  • Loading branch information
bernt-matthias authored Apr 22, 2024
1 parent edb9fd7 commit db30370
Show file tree
Hide file tree
Showing 11 changed files with 146 additions and 26 deletions.
121 changes: 95 additions & 26 deletions tools/kofamscan/kofamscan.xml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
<description>gene function annotation based on KEGG orthology and HMM</description>
<macros>
<token name="@TOOL_VERSION@">1.3.0</token>
<token name="@VERSION_SUFFIX@">1</token>
<token name="@VERSION_SUFFIX@">2</token>
<xml name="reportannotation" token_selected="">
<param name="reportannotation" type="boolean" truevalue="--report-unannotated" falsevalue="--no-report-unannotated" checked="@SELECTED@" label="Include sequence name to outputs even if no KOs are assigned?"/>
</xml>
Expand All @@ -14,27 +14,35 @@
<requirement type="package" version="@TOOL_VERSION@">kofamscan</requirement>
<requirement type="package" version="3.0">zip</requirement>
</requirements>
<version_command><![CDATA[sansa -v | grep "Sansa " | cut -d "v" -f 3]]></version_command>
<version_command><![CDATA[exec_annotation --version | cut -d " " -f 2]]></version_command>
<command detect_errors="exit_code"><![CDATA[
## preprocessing
mkdir ./profile &&
mkdir ./temp_extract &&
#if $p_cond.p_sel == 'compressed'
tar -xf '${p_cond.p}' -C temp_extract &&
find ./temp_extract/ -name '*.hmm' -exec mv {} ./profile \; &&
find ./temp_extract/ -name '*.hal' -exec mv {} ./profile \; &&
#elif $p_cond.p_sel == 'hmm'
## input files require prefix 'K' and file extension '.hmm'
#for $i, $current in enumerate($p_cond.p)
ln -s '$current' 'profile/K${i}.hmm' &&
#end for
#if $p_cond.p_sel != "cached"
## preprocessing
mkdir ./profile &&
mkdir ./temp_extract &&
#if $p_cond.p_sel == 'compressed'
tar -xf '${p_cond.p}' -C temp_extract &&
find ./temp_extract/ -name '*.hmm' -exec mv {} ./profile \; &&
find ./temp_extract/ -name '*.hal' -exec mv {} ./profile \; &&
#elif $p_cond.p_sel == 'hmm'
## input files require prefix 'K' and file extension '.hmm'
#for $i, $current in enumerate($p_cond.p)
ln -s '$current' 'profile/K${i}.hmm' &&
#end for
#end if
#end if
## run
exec_annotation
-p 'profile'
#if $p_cond.p_sel != "cached"
-p 'profile'
-k '$p_cond.k'
#else
-p '$p_cond.kofam.fields.profile_dir/#if $p_cond.kofam_subset then $p_cond.kofam_subset else ""#'
-k '$p_cond.kofam.fields.ko_list'
#end if
-o 'result.txt'
-k '$k'
--cpu \${GALAXY_SLOTS:-4}
-E $E
-T $ap.T
Expand All @@ -54,17 +62,34 @@ $ap.f_cond.reportannotation
<param name="query" type="data" format="fasta" label="Select query sequence file" help="Nucleotide sequences are not accepted."/>
<conditional name="p_cond">
<param name="p_sel" type="select" label="Select profile database format">
<option value="compressed" selected="true">Compressed set of HMM and HAL file(s)</option>
<option value="cached" selected="true">Cached</option>
<option value="compressed">Compressed set of HMM and HAL file(s)</option>
<option value="hmm">HMM file(s)</option>
</param>
<when value="cached">
<param name="kofam" type="select">
<options from_data_table="kofam"/>
</param>
<param name="kofam_subset" type="select" optional="true">
<options from_data_table="kofam_subset">
<filter type="param_value" column="2" ref="kofam"/>
<filter type="sort_by" column="1"/>
<filter type="unique_value" column="1"/>
<column name="name" index="1"/>
<column name="value" index="0"/>
</options>
<validator type="no_options" message="No option available for this input"/>
</param>
</when>
<when value="compressed">
<param argument="-p" type="data" format="tar" label="Select a compressed file with HMM and HAL file(s)" help="Compressed archives are available from KofamKOALA web service (https://www.genome.jp/tools/kofamkoala/)."/>
<param argument="-k" type="data" format="tabular" label="Select KO list file"/>
</when>
<when value="hmm">
<param argument="-p" type="data" format="hmm3" multiple="true" label="Select profile HMM file(s)"/>
<param argument="-k" type="data" format="tabular" label="Select KO list file"/>
</when>
</conditional>
<param argument="-k" type="data" format="tabular" label="Select KO list file"/>
<param argument="-E" type="float" min="0.0" max="1.0" value="0.01" label="Set E-value threshold"/>
<section name="ap" title="Advanced parameters" expanded="true">
<param argument="-T" type="integer" value="1" label="Set threshold scale" help="The score thresholds will be multiplied by this value."/>
Expand Down Expand Up @@ -97,15 +122,18 @@ $ap.f_cond.reportannotation
</section>
</inputs>
<outputs>
<data name="out_result" format="txt" from_work_dir="result.txt" label="${tool.name} on ${on_string}: Results">
<filter>'result' in ap['out']</filter>
<change_format>
<when input='ap.f_cond.f_sel' value="detail-tsv" format="tabular" />
</change_format>
</data>
<data name="out_alignments" format="zip" from_work_dir="tmp/alignments.zip" label="${tool.name} on ${on_string}: HMMER alignments">
<filter>'alignments' in ap['out']</filter>
</data>
<data name="out_output" format="txt" from_work_dir="tmp/output/output.txt" label="${tool.name} on ${on_string}: HMMER output">
<filter>'output' in ap['out']</filter>
</data>
<data name="out_result" format="txt" from_work_dir="result.txt" label="${tool.name} on ${on_string}: Results">
<filter>'result' in ap['out']</filter>
</data>
<data name="out_tabular" format="txt" from_work_dir="tmp/tabular/tabular.txt" label="${tool.name} on ${on_string}: HMMER tabular">
<filter>'tabular' in ap['out']</filter>
</data>
Expand All @@ -131,7 +159,7 @@ $ap.f_cond.reportannotation
<param name="query" value="query.fasta"/>
<conditional name="p_cond">
<param name="p_sel" value="hmm"/>
<param name="p" value="K00001.hmm,K00002.hmm,K00003.hmm"/>
<param name="p" value="profiles/K00001.hmm,profiles/K00002.hmm,profiles/K00003.hmm"/>
</conditional>
<param name="k" value="ko"/>
<param name="E" value="0.02"/>
Expand All @@ -154,7 +182,7 @@ $ap.f_cond.reportannotation
<has_line line="Internal pipeline statistics summary:"/>
</assert_contents>
</output>
<output name="out_result">
<output name="out_result" ftype="tabular">
<assert_contents>
<has_n_lines n="9"/>
<has_text_matching expression=".+sp\|P19858\|LDHA_BOVIN"/>
Expand All @@ -172,15 +200,15 @@ $ap.f_cond.reportannotation
<param name="query" value="query.fasta"/>
<conditional name="p_cond">
<param name="p_sel" value="hmm"/>
<param name="p" value="K00001.hmm,K00002.hmm,K00003.hmm"/>
<param name="p" value="profiles/K00001.hmm,profiles/K00002.hmm,profiles/K00003.hmm"/>
</conditional>
<param name="k" value="ko"/>
<section name="ap">
<conditional name="f_cond">
<param name="f_sel" value="mapper"/>
</conditional>
</section>
<output name="out_result">
<output name="out_result" ftype="txt">
<assert_contents>
<has_n_lines n="7"/>
<has_line line="sp|P19858|LDHA_BOVIN"/>
Expand All @@ -192,7 +220,7 @@ $ap.f_cond.reportannotation
<param name="query" value="query.fasta"/>
<conditional name="p_cond">
<param name="p_sel" value="hmm"/>
<param name="p" value="K00001.hmm,K00002.hmm,K00003.hmm"/>
<param name="p" value="profiles/K00001.hmm,profiles/K00002.hmm,profiles/K00003.hmm"/>
</conditional>
<param name="k" value="ko"/>
<section name="ap">
Expand Down Expand Up @@ -222,6 +250,47 @@ $ap.f_cond.reportannotation
</assert_contents>
</output>
</test>
<!-- #6 test with cached data
     In cached mode the profile directory and the KO list are both read from
     the kofam data table, so no "k" dataset parameter is passed here. -->
<test expect_num_outputs="1">
    <param name="query" value="query.fasta"/>
    <conditional name="p_cond">
        <param name="p_sel" value="cached"/>
        <param name="kofam" value="test_value"/>
    </conditional>
    <section name="ap">
        <conditional name="f_cond">
            <param name="f_sel" value="mapper-one-line"/>
        </conditional>
    </section>
    <output name="out_result" ftype="txt">
        <assert_contents>
            <has_n_lines n="7"/>
            <has_line line="sp|P19858|LDHA_BOVIN"/>
        </assert_contents>
    </output>
</test>
<!-- #7 test with cached data + subset
     Selects the SUBSET entry of the kofam_subset data table for the chosen
     kofam entry. The KO list comes from the kofam data table, so no "k"
     dataset parameter is passed here. -->
<test expect_num_outputs="1">
    <param name="query" value="query.fasta"/>
    <conditional name="p_cond">
        <param name="p_sel" value="cached"/>
        <param name="kofam" value="test_value"/>
        <param name="kofam_subset" value="SUBSET"/>
    </conditional>
    <section name="ap">
        <conditional name="f_cond">
            <param name="f_sel" value="mapper-one-line"/>
        </conditional>
    </section>
    <output name="out_result" ftype="txt">
        <assert_contents>
            <has_n_lines n="7"/>
            <has_line line="sp|P19858|LDHA_BOVIN"/>
        </assert_contents>
    </output>
</test>
</tests>
<help><![CDATA[
.. class:: infomark
Expand Down
2 changes: 2 additions & 0 deletions tools/kofamscan/test-data/kofam_subsets_test.loc
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
#value<tab>name<tab>kofam_value
subset.hal SUBSET test_value
2 changes: 2 additions & 0 deletions tools/kofamscan/test-data/kofam_test.loc
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
#value<tab>name<tab>profile_dir<tab>ko_list
test_value test_name ${__HERE__}/profiles ${__HERE__}/ko
File renamed without changes.
File renamed without changes.
File renamed without changes.
2 changes: 2 additions & 0 deletions tools/kofamscan/test-data/profiles/subset.hal
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
K00002.hmm
K00003.hmm
1 change: 1 addition & 0 deletions tools/kofamscan/tool-data/kofam.loc.sample
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
#value<tab>name<tab>profile_dir<tab>ko_list
2 changes: 2 additions & 0 deletions tools/kofamscan/tool-data/kofam_subset.loc.sample
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
#value<tab>name<tab>kofam_value
subset.hal SUBSET test_value
31 changes: 31 additions & 0 deletions tools/kofamscan/tool_data_table_conf.xml.sample
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
<?xml version="1.0"?>
<tables>
<!-- Main kofamscan reference data: one row per reference data version.
Versioned data can be downloaded from
ftp://ftp.genome.jp/pub/db/kofam/ or https://www.genome.jp/ftp/db/kofam/
(see also https://www.genome.jp/tools/kofamkoala/).
The downloads contain ko_list.gz and profiles.tar.gz for each version.
- ko_list should point to the file extracted from ko_list.gz
- profile_dir should point to the directory extracted from profiles.tar.gz
-->
<table name="kofam" comment_char="#">
<columns>value, name, profile_dir, ko_list</columns>
<file path="tool-data/kofam.loc" />
</table>
<!--
Subsets of the profiles belonging to an entry of the kofam data table.
The profile dir contains many K....hmm files and usually two .hal files.
The .hal files are simple text files listing a set of K files (one per line),
which makes it easy to specify subsets of K files.
This data table lists these subset files for each entry in the kofam data table:
- value should be a .hal file in the profiles dir (could also be a K file)
- name is the name shown to the user in the select
- kofam_value should be equal to the value of the corresponding entry in kofam
-->
<table name="kofam_subset" comment_char="#">
<columns>value, name, kofam_value</columns>
<file path="tool-data/kofam_subset.loc" />
</table>
</tables>
11 changes: 11 additions & 0 deletions tools/kofamscan/tool_data_table_conf.xml.test
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
<?xml version="1.0"?>
<!-- Data table configuration used for the tool tests: both tables are backed
by .loc files located in the test-data directory next to this file. -->
<tables>
<!-- main reference data (profile directory and KO list) -->
<table name="kofam" comment_char="#">
<columns>value, name, profile_dir, ko_list</columns>
<file path="${__HERE__}/test-data/kofam_test.loc" />
</table>
<!-- profile subsets; kofam_value refers to the value column of the kofam table -->
<table name="kofam_subset" comment_char="#">
<columns>value, name, kofam_value</columns>
<file path="${__HERE__}/test-data/kofam_subsets_test.loc" />
</table>
</tables>

0 comments on commit db30370

Please sign in to comment.