fmalmeida · fmalmeida · Feb 28, 2022 · Feb 23, 2022 · Feb 24, 2022 · Feb 24, 2022
diff --git a/.gitattributes b/.gitattributes
@@ -1,3 +1,5 @@
 * linguist-vendored
 *.nf linguist-vendored=false
 *.config linguist-vendored=false
+*.py linguist-vendored=false
+*.R linguist-vendored=false
diff --git a/.gitignore b/.gitignore
@@ -1,2 +1,3 @@
 .nextflow*
-docs/_build
+docs/_build
+testing
diff --git a/.zenodo.json b/.zenodo.json
@@ -2,7 +2,7 @@
     "description": "<p>MpGAP is built using Nextflow, a workflow tool to run tasks across multiple compute infrastructures in a very portable manner. It uses Docker/Singularity containers making installation trivial and results highly reproducible. It is an easy to use pipeline that adopts well known software for _de novo_ genome assembly of Illumina, Pacbio and Oxford Nanopore sequencing data through illumina only, long reads only or hybrid modes.</p>",
     "license": "other-open", 
     "title": "fmalmeida/MpGAP: A generic multi-platform genome assembly pipeline", 
-    "version": "v3.1", 
+    "version": "v3.1.2", 
     "upload_type": "software",
     "creators": [
         {

diff --git a/README.md b/README.md
@@ -1,4 +1,4 @@
-<img src="images/lOGO_3.png" width="300px">
+<img src="assets/lOGO_3.png" width="300px">
 
 [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.3997375.svg)](https://doi.org/10.5281/zenodo.3445485)
 [![Releases](https://img.shields.io/github/v/release/fmalmeida/mpgap)](https://github.com/fmalmeida/mpgap/releases)
@@ -78,6 +78,9 @@ Therefore, feedbacks are very well welcomed. If you believe that your use case i
         ```bash
         # for docker
         docker pull fmalmeida/mpgap:v3.1
+
+        # run
+        nextflow run fmalmeida/mpgap -profile docker [options]
         ```
 
     * for singularity
@@ -89,6 +92,9 @@ Therefore, feedbacks are very well welcomed. If you believe that your use case i
         export NXF_SINGULARITY_LIBRARYDIR=MY_SINGULARITY_IMAGES    # your singularity storage dir
         export NXF_SINGULARITY_CACHEDIR=MY_SINGULARITY_CACHE       # your singularity cache dir
         singularity pull --dir $NXF_SINGULARITY_LIBRARYDIR fmalmeida-mpgap-v3.1.img docker://fmalmeida/mpgap:v3.1
+
+        # run
+        nextflow run fmalmeida/mpgap -profile singularity [options]
         ```
 
     * for conda
@@ -98,6 +104,10 @@ Therefore, feedbacks are very well welcomed. If you believe that your use case i
         # it is better to create envs with mamba for faster solving
         wget https://github.com/fmalmeida/mpgap/raw/master/environment.yml
         conda env create -f environment.yml   # advice: use mamba
+
+        # must be executed from the base environment
+        # This tells nextflow to load the available mpgap environment when required
+        nextflow run fmalmeida/mpgap -profile conda [options]
         ```
 
         :dart: Please make sure to also download its busco databases. [See the explanation](#note-on-conda)
@@ -121,11 +131,13 @@ Nextflow profiles are a set of "sensible defaults" for the resource requirements
 
 The pipeline has "standard profiles" set to run the workflows with either conda, docker or singularity using the [local executor](https://www.nextflow.io/docs/latest/executor.html), which is nextflow's default and basically runs the pipeline processes in the computer where Nextflow is launched. If you need to run the pipeline using another executor such as sge, lsf, slurm, etc. you can take a look at [nextflow's manual page](https://www.nextflow.io/docs/latest/executor.html) to properly configure one in a new custom profile set in your personal copy of [MpGAP config file](https://github.com/fmalmeida/mpgap/blob/master/nextflow.config) and take advantage of the fact that nextflow allows multiple profiles to be used at once, e.g. `-profile conda,sge`.
 
-By default, if no profile is chosen, the pipeline will "load the docker profile". Available pre-set profiles for this pipeline are: docker, conda, singularity, you can choose between them as follows:
+By default, if no profile is chosen, the pipeline will try to load tools from the local machine's `$PATH`. The available pre-set profiles for this pipeline are `docker`, `conda` and `singularity`; you can choose between them as follows:
 
 * conda
 
     ```bash
+    # must be executed from the base environment
+    # This tells nextflow to load the available mpgap environment when required
     nextflow run fmalmeida/mpgap -profile conda [options]
     ```
 
@@ -199,7 +211,7 @@ It produces a long reads only assembly and polishes (correct errors) it with sho
 # run the pipeline setting the desired hybrid strategy globally (for all samples)
 nextflow run fmalmeida/mpgap \
   --output output \
-  --threads 5 \
+  --max_cpus 5 \
   --input "samplesheet.yml" \
   --hybrid_strategy "both"
 ```
@@ -245,11 +257,11 @@ nf-core launch fmalmeida/mpgap
 It will result in the following:
 
 <p align="center">
-<img src="./images/nf-core-asking.png" width="500px"/>
+<img src="./assets/nf-core-asking.png" width="500px"/>
 </p>
 
 <p align="center">
-<img src="./images/nf-core-gui.png" width="400px"/>
+<img src="./assets/nf-core-gui.png" width="400px"/>
 </p>
 
 ## Known issues
@@ -263,6 +275,14 @@ It will result in the following:
 
 ## Citation
 
-To cite this pipeline users can use our Zenodo tag or directly via the github url. Users are encouraged to cite the programs used in this pipeline whenever they are used.
+To cite this tool please refer to our [Zenodo tag](https://doi.org/10.5281/zenodo.3445485).
+
+This pipeline uses code and infrastructure developed and maintained by the [nf-core](https://nf-co.re) community, reused here under the [GPLv3](https://github.com/fmalmeida/mpgap/blob/master/LICENSE).
+
+> The nf-core framework for community-curated bioinformatics pipelines.
+>
+> Philip Ewels, Alexander Peltzer, Sven Fillinger, Harshil Patel, Johannes Alneberg, Andreas Wilm, Maxime Ulysse Garcia, Paolo Di Tommaso & Sven Nahnsen.
+>
+> Nat Biotechnol. 2020 Feb 13. doi: 10.1038/s41587-020-0439-x.
 
-Please, do not forget to cite the software that were used whenever you use its outputs. See [the list of tools](markdown/list_of_tools.md).
+In addition, users are encouraged to cite the programs used in this pipeline whenever they are used. Links to resources of tools and data used in this pipeline are in [the list of tools](markdown/list_of_tools.md).
diff --git a/assets/hybrid_test.yml b/assets/hybrid_test.yml
@@ -0,0 +1,17 @@
+samplesheet:
+
+  - id: ont_hybrid
+    nanopore:  https://github.com/fmalmeida/test_datasets/raw/main/ecoli_ont_15X.fastq.gz
+    genome_size: 0.5m
+    illumina: 
+      - https://github.com/fmalmeida/test_datasets/raw/main/ecoli_illumina_15X_1.fastq.gz
+      - https://github.com/fmalmeida/test_datasets/raw/main/ecoli_illumina_15X_2.fastq.gz
+    hybrid_strategy: both
+
+  - id: pacbio_hybrid
+    pacbio:  https://github.com/fmalmeida/test_datasets/raw/main/ecoli_pacbio_15X.fastq.gz
+    genome_size: 0.5m
+    illumina: 
+      - https://github.com/fmalmeida/test_datasets/raw/main/ecoli_illumina_15X_1.fastq.gz
+      - https://github.com/fmalmeida/test_datasets/raw/main/ecoli_illumina_15X_2.fastq.gz
+    hybrid_strategy: both
diff --git a/assets/illumina_test.yml b/assets/illumina_test.yml
@@ -0,0 +1,5 @@
+samplesheet:
+  - id: illumina_only
+    illumina: 
+      - https://github.com/fmalmeida/test_datasets/raw/main/ecoli_illumina_15X_1.fastq.gz
+      - https://github.com/fmalmeida/test_datasets/raw/main/ecoli_illumina_15X_2.fastq.gz
diff --git a/images/lOGO_3.png → assets/lOGO_3.png b/images/lOGO_3.png → assets/lOGO_3.png
diff --git a/images/lOGO_3_transparente.png → assets/lOGO_3_transparente.png b/images/lOGO_3_transparente.png → assets/lOGO_3_transparente.png
diff --git a/assets/lreads_test.yml b/assets/lreads_test.yml
@@ -0,0 +1,7 @@
+samplesheet:
+  - id: ont_only
+    nanopore:  https://github.com/fmalmeida/test_datasets/raw/main/ecoli_ont_15X.fastq.gz
+    genome_size: 0.5m
+  - id: pacbio_only
+    pacbio:  https://github.com/fmalmeida/test_datasets/raw/main/ecoli_pacbio_15X.fastq.gz
+    genome_size: 0.5m
diff --git a/images/nf-core-asking.png → assets/nf-core-asking.png b/images/nf-core-asking.png → assets/nf-core-asking.png
diff --git a/images/nf-core-gui.png → assets/nf-core-gui.png b/images/nf-core-gui.png → assets/nf-core-gui.png
diff --git a/conf/base.config b/conf/base.config
@@ -0,0 +1,72 @@
+process {
+
+    // Defaults for every process: request the full configured maxima (params.max_cpus / max_memory / max_time)
+    cpus   = { params.max_cpus   }
+    memory = { params.max_memory }
+    time   = { params.max_time   }
+
+    errorStrategy = { task.exitStatus in [143,137,104,134,139] ? 'retry' : 'finish' }  // retry only on the listed exit codes
+    maxRetries    = 1
+    maxErrors     = '-1'
+
+    // Per-label overrides: requests scale with task.attempt and are capped by check_max()
+    withLabel:process_ultralow {
+        cpus   = { check_max( 1 * task.attempt, 'cpus' )      }
+        memory = { check_max( 2.GB * task.attempt, 'memory' ) }
+        time   = { check_max( 1.h * task.attempt, 'time' )    }
+    }
+    withLabel:process_low {
+        cpus   = { check_max( 2 * task.attempt, 'cpus' )      }
+        memory = { check_max( 4.GB * task.attempt, 'memory' ) }
+        time   = { check_max( 1.h * task.attempt, 'time' )    }
+    }
+    withLabel:error_ignore {
+        errorStrategy = 'ignore'
+    }
+    withLabel:error_retry {
+        errorStrategy = 'retry'
+        maxRetries    = 2
+    }
+
+    // Assemblies: the first attempt requests a moderate, capped share so that several assemblies can run in parallel.
+    // Any later attempt (i.e. a retry) requests the full params.max_* resources instead.
+    withLabel:process_assembly {
+      cpus   = {  if (task.attempt == 1) { check_max( 6 * task.attempt, 'cpus' ) } else { params.max_cpus } }
+      memory = {  if (task.attempt == 1) { check_max( 14.GB * task.attempt, 'memory' ) } else { params.max_memory } }
+      time   = {  if (task.attempt == 1) { check_max( 16.h * task.attempt, 'time' ) } else { params.max_time } }
+    }
+
+}
+
+// Cap a requested resource at the configured maximum. `type` selects the limit:
+// 'memory' -> params.max_memory, 'time' -> params.max_time, 'cpus' -> params.max_cpus. Any other `type` matches no branch, so nothing is explicitly returned.
+def check_max(obj, type) {
+  if(type == 'memory'){
+    try {
+      if(obj.compareTo(params.max_memory as nextflow.util.MemoryUnit) == 1)  // request is above the limit
+        return params.max_memory as nextflow.util.MemoryUnit
+      else
+        return obj
+    } catch (all) {  // limit could not be converted/compared: warn and keep the request as is
+      println "   ### ERROR ###   Max memory '${params.max_memory}' is not valid! Using default value: $obj"
+      return obj
+    }
+  } else if(type == 'time'){
+    try {
+      if(obj.compareTo(params.max_time as nextflow.util.Duration) == 1)  // request is above the limit
+        return params.max_time as nextflow.util.Duration
+      else
+        return obj
+    } catch (all) {  // limit could not be converted/compared: warn and keep the request as is
+      println "   ### ERROR ###   Max time '${params.max_time}' is not valid! Using default value: $obj"
+      return obj
+    }
+  } else if(type == 'cpus'){
+    try {
+      return Math.min( obj, params.max_cpus as int )  // smaller of the request and the limit
+    } catch (all) {  // limit could not be converted: warn and keep the request as is
+      println "   ### ERROR ###   Max cpus '${params.max_cpus}' is not valid! Using default value: $obj"
+      return obj
+    }
+  }
+}
diff --git a/conf/conda.config b/conf/conda.config
@@ -0,0 +1,5 @@
+// conda profile: both container engines are disabled and processes use a conda environment
+params.selected_profile = "conda"
+singularity.enabled     = false
+docker.enabled          = false
+process.conda           = "$CONDA_PREFIX/envs/mpgap-3.1"  // environment path is built from $CONDA_PREFIX
diff --git a/conf/defaults.config b/conf/defaults.config
@@ -0,0 +1,130 @@
+/*
+ * Configuration File to run fmalmeida/mpgap pipeline.
+ */
+
+params {
+
+              /* 
+               * Input parameter
+               */
+
+
+// Path to YAML samplesheet file.
+// Please read the documentation https://mpgap.readthedocs.io/en/latest/samplesheet.html to know how to create a samplesheet file.
+  input = null
+
+              /*
+               * Output parameters
+               */
+
+
+// Output folder name
+  output = "output"
+
+
+              /*
+               * Resources parameters
+               */
+
+// Memory allocation for pilon polish.
+// Values in Gb. Default 50G. 50G has been proved to be enough in most cases.
+// This setting is crucial: without enough memory Pilon will crash and your assembly will not be corrected.
+  pilon_memory_limit = 50
+
+              /*
+               * General parameters
+               *
+               * These parameters will set the default for all samples.
+               * However, they can also be set inside the YAML, if this happens 
+               * the pipeline will use the value inside the YAML to overwrite 
+               * the parameter for that specific sample.
+               * 
+               * Please read the documentation https://mpgap.readthedocs.io/en/latest/samplesheet.html to know more about the samplesheet file.
+               */
+
+
+// This parameter only needs to be set if the software chosen is Canu, wtdbg2 or Haslr. It is optional for Flye.
+// It is an estimate of the size of the genome. Common suffixes are allowed, for example, 3.7m or 2.8g
+  genome_size = null
+
+// Select the appropriate value to pass to wtdbg2 to assemble input.
+// Options are: "ont" for Nanopore reads, "rs" for PacBio RSII, "sq" for PacBio Sequel, "ccs" for PacBio CCS reads.
+// By default, if not given, the pipeline will use the value "ont" if nanopore reads are used and "sq" if pacbio reads are used
+  wtdbg2_technology = null
+
+// Select the appropriate shasta config to use for assembly
+// Since shasta v0.8 (Oct/2021) this parameter is now mandatory.
+  shasta_config = "Nanopore-Oct2021"
+
+// Tells the pipeline to interpret the long reads as "corrected" long reads.
+// This will activate (if available) the options for corrected reads in the
+// assemblers: -corrected (in canu), --pacbio-corr|--nano-corr (in flye), etc.
+// Be cautious when using this parameter. If your reads are not corrected, and
+// you use this parameter, you will probably not generate any contigs.
+  corrected_long_reads = false
+
+// This parameter below (hybrid_strategy) is to select the hybrid strategies adopted by the pipeline.
+// Read the documentation https://mpgap.readthedocs.io/en/latest/manual.html to know more about the hybrid strategies.
+//
+// Whenever using this parameter, it is also possible to polish the longreads-only assemblies with Nanopolish,
+// Medaka or VariantCaller (Arrow) before the polishing with shortreads (using Pilon). For that it is necessary to set
+// the right parameters: pacbio_bam and nanopolish_fast5 (files given only inside YAML) or medaka_model.
+  hybrid_strategy = 1
+
+// Default medaka model used for polishing nanopore long reads assemblies.
+// Please read their manual https://github.com/nanoporetech/medaka to know more about the available models.
+  medaka_model = "r941_min_high_g360"
+
+// This parameter sets to nanopolish the max number of haplotypes to be considered.
+// Sometimes the pipeline may crash because too much variation was found, exceeding this limit
+  nanopolish_max_haplotypes = 1000
+
+
+            /*
+             * Advanced parameters
+             * 
+             * Controlling the execution of assemblers
+             * It must be set as true to skip the software and false to use it.
+             * Also adding the possibility to pass additional parameters to them
+             * Additional parameters must be in quotes and separated by spaces.
+             */
+
+
+  quast_additional_parameters = null            // Give additional parameters to Quast while assessing assembly metrics.
+                                                // Must be given as shown in Quast manual. E.g. " --large --eukaryote ".
+
+  skip_spades    = false                      // Hybrid and shortreads only assemblies
+  spades_additional_parameters = null         // Must be given as shown in Spades manual. E.g. " --meta --plasmids "
+
+  skip_shovill   = false                      // Paired shortreads only assemblies
+  shovill_additional_parameters = null        // Must be given as shown in Shovill manual. E.g. " --depth 15 "
+                                              // The pipeline already executes shovill with spades, skesa and megahit, so please, do not use it with shovill's ``--assembler`` parameter.
+
+  skip_unicycler = false                      // Hybrid and shortreads only assemblies
+  unicycler_additional_parameters = null      // Must be given as shown in Unicycler manual. E.g. " --mode conservative --no_correct "
+
+  skip_haslr     = false                      // Hybrid assemblies
+  haslr_additional_parameters = null          // Must be given as shown in Haslr manual. E.g. " --cov-lr 30 "
+
+  skip_canu      = false                      // Longreads only assemblies
+  canu_additional_parameters = null           // Must be given as shown in Canu manual. E.g. " correctedErrorRate=0.075 corOutCoverage=200 "
+
+  skip_flye      = false                      // Longreads only assemblies
+  flye_additional_parameters = null           // Must be given as shown in Flye manual. E.g. " --meta --iterations 4 "
+
+  skip_raven     = false                      // Longreads only assemblies
+  raven_additional_parameters = null          // Must be given as shown in Raven manual. E.g. " --polishing-rounds 4 "
+
+  skip_wtdbg2    = false                      // Longreads only assemblies
+  wtdbg2_additional_parameters = null         // Must be given as shown in wtdbg2 manual. E.g. " --tidy-reads 5000 "
+
+  skip_shasta    = false                      // Nanopore longreads only assemblies
+  shasta_additional_parameters = null         // Must be given as shown in shasta manual. E.g. " --Reads.minReadLength 5000 "
+
+// Max resource options
+// Defaults only, expecting to be overwritten
+  max_memory                 = '14.GB'
+  max_cpus                   = 6
+  max_time                   = '40.h'
+
+}
diff --git a/conf/docker.config b/conf/docker.config
@@ -0,0 +1,7 @@
+// docker profile: run every process inside the pipeline's docker image
+params.selected_profile = "docker"
+singularity.enabled     = false
+docker.enabled          = true
+docker.runOptions       = '-u \$(id -u):\$(id -g)'
+docker.fixOwnership     = true  // must be set inside the docker scope; a top-level `fixOwnership` is ignored
+process.container       = "fmalmeida/mpgap:v3.1"
diff --git a/conf/singularity.config b/conf/singularity.config
@@ -0,0 +1,6 @@
+// singularity profile: run every process with singularity, using the pipeline's docker image
+params.selected_profile = "singularity"
+docker.enabled          = false
+singularity.enabled     = true
+singularity.autoMounts  = true
+process.container       = "docker://fmalmeida/mpgap:v3.1"
diff --git a/conf/standard.config b/conf/standard.config
@@ -0,0 +1,6 @@
+// standard local profile -- default
+// does not use any pre-configuration from profiles
+// both docker and singularity are disabled here, so no container engine is used
+params.selected_profile = "none"
+singularity.enabled     = false
+docker.enabled          = false