Skip to content

Commit

Permalink
Make guppyplex input handling more flexible (#4353)
Browse files Browse the repository at this point in the history
* Make guppyplex input handling more flexible

This allows the user to specify which sequence input files guppyplex
should collapse into a single output, and which ones it should keep
separate. Also makes the tool work with fastq and fastq.gz data alike
and exposes the --quality threshold parameter.

The changeset also contains a fix for the artic minion pipeline, which
should make it handle input datasets with spaces in their names.

* Fix test, add more tool help

* Fix consensus FASTA header line

* Move input file prepping to configfile

Since the wrapper has to symlink each input file separately it could
generate a very long command line when there are lots of files to
concatenate.

* Add additional test

* Remove space from test file list
  • Loading branch information
wm75 authored Jan 31, 2022
1 parent 37384c4 commit e6a1f82
Show file tree
Hide file tree
Showing 3 changed files with 82 additions and 11 deletions.
85 changes: 75 additions & 10 deletions tools/artic/artic_guppyplex.xml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
<tool id="artic_guppyplex" name="ARTIC guppyplex" version="@PACKAGE_VERSION@+galaxy1" profile="20.09">
<tool id="artic_guppyplex" name="ARTIC guppyplex" version="@PACKAGE_VERSION@+galaxy2" profile="20.09">
<description>Filter Nanopore reads by read length and (optionally) quality</description>
<macros>
<import>macros.xml</import>
Expand All @@ -9,27 +9,89 @@
<command detect_errors="exit_code">
<![CDATA[
mkdir inputs &&
#for $i, $elem in enumerate($input)
ln -fs '$elem' inputs/fastq${i}.fastq &&
#end for
## Note about compression handling in the following:
## guppyplex uses mimetypes.guess_type to guess compression, so
## it's important to get the suffix of the inputs right.
## Even if it detects compressed input, it will write uncompressed
## output so we need to handle output compression separately.
## symlink input files to appropriate names in the inputs/ directory
bash prepare_inputs.sh &&
#if str($input.structure) == 'one_to_one':
#set $compressed = $input.reads.is_of_type("fastq.gz", "fastqsanger.gz")
#else:
#set $compressed = next(iter($input.reads)).is_of_type("fastq.gz", "fastqsanger.gz")
#end if
artic guppyplex --min-length $min_length --max-length $max_length
#if $min_quality == 0:
--skip-quality-check
#else:
--quality $min_quality
#end if
--directory inputs/
--prefix artic_guppyplex --output '$output1'
--output guppyplex_out.fastq
#if $compressed:
&& gzip guppyplex_out.fastq
#end if
]]>
</command>
<configfiles>
<configfile filename="prepare_inputs.sh"><![CDATA[
#if str($input.structure) == 'one_to_one':
ln -s '$input.reads' inputs/1.${input.reads.ext}
#else:
#for $i, $elem in enumerate($input.reads):
ln -s '$elem' inputs/${i}.${elem.ext} &&
#end for
:
#end if
]]>
</configfile>
</configfiles>
<inputs>
<param name="input" multiple="true" type="data" format="fastq" label="Nanopore reads (FASTQ format" />
<conditional name="input">
<param name="structure" type="select"
label="Structure of your input data"
help="">
<option value="one_to_one">One input dataset per sample</option>
<option value="one_to_many">Multiple input datasets per sample</option>
</param>
<when value="one_to_one">
<param name="reads" type="data" format="@FASTQ_FORMATS@"
label="Sequencing dataset(s) - one per sample" />
</when>
<when value="one_to_many">
<param name="reads" multiple="true" type="data" format="@FASTQ_FORMATS@"
label="Partial sequencing datasets for your sample"
help="Multiple datasets selected here will get combined into a single output for a single assumed sample. Select a nested list to have its inner lists interpreted as data from one sample each and to obtain one output per inner list." />
</when>
</conditional>
<param name="max_length" type="integer" label="Remove reads longer than" value="700" help="remove reads greater than this number of base pairs" />
<param name="min_length" type="integer" label="Remove reads shorter than" value="400" help="remove reads less than this number of base pairs" />
<param name="skip_quality_check" argument="--skip-quality-check" type="boolean" truevalue="--skip-quality-check" falsevalue="" checked="False" label="Do not filter on quality score (speeds up processing)" />
<param name="min_quality" type="integer" min="0" value="7"
label="Eliminate reads with a mean base quality score of less than"
help="Set to 0 to skip the quality check." />
</inputs>
<outputs>
<data name="output1" format="fastq" from_work_dir="run_name_.fastq" />
<data name="output" format_source="reads" from_work_dir="guppyplex_out.fastq*" />
</outputs>
<tests>
<test>
<param name="input" value="test.fastq" />
<output name="output1" file="gupplyplex_output.fastq"/>
<conditional name="input">
<param name="structure" value="one_to_one" />
<param name="reads" value="test.fastq" />
</conditional>
<output name="output" file="gupplyplex_output.fastq"/>
</test>
<test>
<conditional name="input">
<param name="structure" value="one_to_many" />
<param name="reads" value="test.fastq,test.fastq" />
</conditional>
<!-- guppyplex drops duplicate reads so we don't need a new
test file for checking this branch -->
<output name="output" file="gupplyplex_output.fastq"/>
</test>
</tests>
<help><![CDATA[
Expand All @@ -44,6 +106,9 @@
the minimum length and the maximum length of an amplicon plus 200 as the
maximum length.
At the same time, the tool can be used to gather partial fastq
datasets into a single dataset per sample.
.. _ARTIC: https://artic.readthedocs.io/en/latest/
]]></help>
<expand macro="citations" />
Expand Down
7 changes: 6 additions & 1 deletion tools/artic/artic_minion.xml
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,13 @@
--medaka-model '$medaka_model'
$bwa
'name/V1'
'${read_file.element_identifier}'
## enclose the sample name in extra single quotes because
## the minion pipeline script doesn't care about passing
## its arguments safely.
"'"'${read_file.element_identifier}'"'"
&& bgzip -f '${read_file.element_identifier}.fail.vcf'
## remove enclosing single-quotes from header of output consensus fasta
&& sed -i "1s/'${read_file.element_identifier}'/${read_file.element_identifier}/" ${read_file.element_identifier}.consensus.fasta
]]></command>
<inputs>
<param argument="--read-file" type="data" format="fastq,fastqsanger,fastq.gz,fastqsanger.gz" label="Input Read File"/>
Expand Down
1 change: 1 addition & 0 deletions tools/artic/macros.xml
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
<macros>
<token name="@PACKAGE_VERSION@">1.2.1</token>
<token name="@FASTQ_FORMATS@">fastq,fastq.gz,fastqsanger,fastqsanger.gz</token>
<xml name="citations">
<citations>
<citation type="bibtex">
Expand Down

0 comments on commit e6a1f82

Please sign in to comment.