Skip to content

Commit

Permalink
Make guppyplex input handling more flexible (#4353)
Browse files Browse the repository at this point in the history
* Make guppyplex input handling more flexible

This allows the user to specify which sequence input files guppyplex
should collapse into a single output, and which ones it should keep
separate. Also makes the tool work with fastq and fastq.gz data alike
and exposes the --quality threshold parameter.

The changeset also contains a fix for the artic minion pipeline, which
should make it handle input datasets with spaces in their names.

* Fix test, add more tool help

* Fix consensus FASTA header line

* Move input file prepping to configfile

Since the wrapper has to symlink each input file separately it could
generate a very long command line when there are lots of files to
concatenate.

* Add additional test

* Remove space from test file list
  • Loading branch information
wm75 authored Jan 31, 2022
1 parent 37384c4 commit e6a1f82
Show file tree
Hide file tree
Showing 3 changed files with 82 additions and 11 deletions.
85 changes: 75 additions & 10 deletions tools/artic/artic_guppyplex.xml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
<tool id="artic_guppyplex" name="ARTIC guppyplex" version="@PACKAGE_VERSION@+galaxy1" profile="20.09">
<tool id="artic_guppyplex" name="ARTIC guppyplex" version="@PACKAGE_VERSION@+galaxy2" profile="20.09">
<description>Filter Nanopore reads by read length and (optionally) quality</description>
<macros>
<import>macros.xml</import>
Expand All @@ -9,27 +9,89 @@
<command detect_errors="exit_code">
<![CDATA[
mkdir inputs &&
#for $i, $elem in enumerate($input)
ln -fs '$elem' inputs/fastq${i}.fastq &&
#end for
## Note about compression handling in the following:
## guppyplex uses mimetypes.guess_type to guess compression, so
## it's important to get the suffix of the inputs right.
## Even if it detects compressed input, it will write uncompressed
## output so we need to handle output compression separately.
## symlink input files to appropriate names in the inputs/ directory
bash prepare_inputs.sh &&
#if str($input.structure) == 'one_to_one':
#set $compressed = $input.reads.is_of_type("fastq.gz", "fastqsanger.gz")
#else:
#set $compressed = next(iter($input.reads)).is_of_type("fastq.gz", "fastqsanger.gz")
#end if
artic guppyplex --min-length $min_length --max-length $max_length
#if $min_quality == 0:
--skip-quality-check
#else:
--quality $min_quality
#end if
--directory inputs/
--prefix artic_guppyplex --output '$output1'
--output guppyplex_out.fastq
#if $compressed:
&& gzip guppyplex_out.fastq
#end if
]]>
</command>
<configfiles>
<configfile filename="prepare_inputs.sh"><![CDATA[
#if str($input.structure) == 'one_to_one':
ln -s '$input.reads' inputs/1.${input.reads.ext}
#else:
#for $i, $elem in enumerate($input.reads):
ln -s '$elem' inputs/${i}.${elem.ext} &&
#end for
:
#end if
]]>
</configfile>
</configfiles>
<inputs>
<param name="input" multiple="true" type="data" format="fastq" label="Nanopore reads (FASTQ format" />
<conditional name="input">
<param name="structure" type="select"
label="Structure of your input data"
help="">
<option value="one_to_one">One input dataset per sample</option>
<option value="one_to_many">Multiple input datasets per sample</option>
</param>
<when value="one_to_one">
<param name="reads" type="data" format="@FASTQ_FORMATS@"
label="Sequencing dataset(s) - one per sample" />
</when>
<when value="one_to_many">
<param name="reads" multiple="true" type="data" format="@FASTQ_FORMATS@"
label="Partial sequencing datasets for your sample"
help="Multiple datasets selected here will get combined into a single output for a single assumed sample. Select a nested list to have its inner lists interpreted as data from one sample each and to obtain one output per inner list." />
</when>
</conditional>
<param name="max_length" type="integer" label="Remove reads longer than" value="700" help="remove reads greater than this number of base pairs" />
<param name="min_length" type="integer" label="Remove reads shorter than" value="400" help="remove reads less than this number of base pairs" />
<param name="skip_quality_check" argument="--skip-quality-check" type="boolean" truevalue="--skip-quality-check" falsevalue="" checked="False" label="Do not filter on quality score (speeds up processing)" />
<param name="min_quality" type="integer" min="0" value="7"
label="Eliminate reads with a mean base quality score of less than"
help="Set to 0 to skip the quality check." />
</inputs>
<outputs>
<data name="output1" format="fastq" from_work_dir="run_name_.fastq" />
<data name="output" format_source="reads" from_work_dir="guppyplex_out.fastq*" />
</outputs>
<tests>
<test>
<param name="input" value="test.fastq" />
<output name="output1" file="gupplyplex_output.fastq"/>
<conditional name="input">
<param name="structure" value="one_to_one" />
<param name="reads" value="test.fastq" />
</conditional>
<output name="output" file="gupplyplex_output.fastq"/>
</test>
<test>
<conditional name="input">
<param name="structure" value="one_to_many" />
<param name="reads" value="test.fastq,test.fastq" />
</conditional>
<!-- guppyplex drops duplicate reads so we don't need a new
test file for checking this branch -->
<output name="output" file="gupplyplex_output.fastq"/>
</test>
</tests>
<help><![CDATA[
Expand All @@ -44,6 +106,9 @@
the minimum length and the maximum length of an amplicon plus 200 as the
maximum length.
At the same time, the tool can be used to gather partial fastq
datasets into a single dataset per sample.
.. _ARTIC: https://artic.readthedocs.io/en/latest/
]]></help>
<expand macro="citations" />
Expand Down
7 changes: 6 additions & 1 deletion tools/artic/artic_minion.xml
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,13 @@
--medaka-model '$medaka_model'
$bwa
'name/V1'
'${read_file.element_identifier}'
## enclose the sample name in extra single quotes because
## the minion pipeline script doesn't care about passing
## its arguments safely.
"'"'${read_file.element_identifier}'"'"
&& bgzip -f '${read_file.element_identifier}.fail.vcf'
## remove enclosing single-quotes from header of output consensus fasta
&& sed -i "1s/'${read_file.element_identifier}'/${read_file.element_identifier}/" ${read_file.element_identifier}.consensus.fasta
]]></command>
<inputs>
<param argument="--read-file" type="data" format="fastq,fastqsanger,fastq.gz,fastqsanger.gz" label="Input Read File"/>
Expand Down
1 change: 1 addition & 0 deletions tools/artic/macros.xml
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
<macros>
<token name="@PACKAGE_VERSION@">1.2.1</token>
<token name="@FASTQ_FORMATS@">fastq,fastq.gz,fastqsanger,fastqsanger.gz</token>
<xml name="citations">
<citations>
<citation type="bibtex">
Expand Down

0 comments on commit e6a1f82

Please sign in to comment.