Merge pull request #8 from CarsonJM/genomad

Changed `run_genomad` to `run_virus_identification` and updated the output docs
d4straub · Jul 20, 2023 · d57ae9a · d57ae9a
2 parents 4231a0c + e4b7583
commit d57ae9a
Show file tree

Hide file tree

Showing 5 changed files with 27 additions and 26 deletions.
diff --git a/conf/test_virus_identification.config b/conf/test_virus_identification.config
@@ -21,7 +21,7 @@ params {
 
     // Input data
     input                       = 'https://raw.githubusercontent.com/nf-core/test-datasets/mag/samplesheets/samplesheet.csv'
-    run_genomad                 = true
+    run_virus_identification    = true
     genomad_splits              = 7
 
     // For computational efficiency

diff --git a/docs/output.md b/docs/output.md
@@ -279,23 +279,23 @@ Protein-coding genes are predicted for each assembly.
 <details markdown="1">
 <summary>Output files</summary>
 
-- `VirusIdentification/geNomad/[sample/group]/`
-  - `[sample/group]_annotate`
-    - `[sample/group]_taxonomy.tsv`: Taxonomic assignment data
-  - `[sample/group]_aggregated_classification`
-    - `[sample/group]_aggregated_classification.tsv`: Sequence classification in tabular format
-  - `[sample/group]_find_proviruses`
-    - `[sample/group]_provirus.tsv`: Characteristics of proviruses identified by geNomad
-  - `[sample/group]_summary`
-    - `[sample/group]_virus_summary.tsv`: Virus classification summary file in tabular format
-    - `[sample/group]_plasmid_summary.tsv`: Plasmid classification summary file in tabular format
-    - `[sample/group]_viruses_genes.tsv`: Virus gene annotation data in tabular format
-    - `[sample/group]_plasmids_genes.tsv`: Plasmid gene annotation data in tabular format
-    - `[sample/group]_viruses.fna`: Virus nucleotide sequences in FASTA format
-    - `[sample/group]_plasmids.fna`: Plasmid nucleotide sequences in FASTA format
-    - `[sample/group]_viruses_proteins.faa`: Virus protein sequences in FASTA format
-    - `[sample/group]_plasmids_proteins.faa`: Plasmid protein sequences in FASTA format
-  - `[sample/group].log`: Plain text log file detailing the steps executed by geNomad (annotate, find-proviruses, marker-classification, nn-classification, aggregated-classification and summary)
+- `VirusIdentification/geNomad/[assembler]-[sample/group]*/`
+  - `[assembler]-[sample/group]*_annotate`
+    - `[assembler]-[sample/group]*_taxonomy.tsv`: Taxonomic assignment data
+  - `[assembler]-[sample/group]*_aggregated_classification`
+    - `[assembler]-[sample/group]*_aggregated_classification.tsv`: Sequence classification in tabular format
+  - `[assembler]-[sample/group]*_find_proviruses`
+    - `[assembler]-[sample/group]*_provirus.tsv`: Characteristics of proviruses identified by geNomad
+  - `[assembler]-[sample/group]*_summary`
+    - `[assembler]-[sample/group]*_virus_summary.tsv`: Virus classification summary file in tabular format
+    - `[assembler]-[sample/group]*_plasmid_summary.tsv`: Plasmid classification summary file in tabular format
+    - `[assembler]-[sample/group]*_viruses_genes.tsv`: Virus gene annotation data in tabular format
+    - `[assembler]-[sample/group]*_plasmids_genes.tsv`: Plasmid gene annotation data in tabular format
+    - `[assembler]-[sample/group]*_viruses.fna`: Virus nucleotide sequences in FASTA format
+    - `[assembler]-[sample/group]*_plasmids.fna`: Plasmid nucleotide sequences in FASTA format
+    - `[assembler]-[sample/group]*_viruses_proteins.faa`: Virus protein sequences in FASTA format
+    - `[assembler]-[sample/group]*_plasmids_proteins.faa`: Plasmid protein sequences in FASTA format
+  - `[assembler]-[sample/group]*.log`: Plain text log file detailing the steps executed by geNomad (annotate, find-proviruses, marker-classification, nn-classification, aggregated-classification and summary)
 
 </details>
 

diff --git a/nextflow.config b/nextflow.config
@@ -60,7 +60,7 @@ params {
     skip_prodigal                        = false
 
     // virus identification options
-    run_genomad                          = false
+    run_virus_identification             = false
     genomad_db                           = null
     genomad_min_score                    = 0.7
     genomad_splits                       = 1

diff --git a/nextflow_schema.json b/nextflow_schema.json
@@ -527,23 +527,23 @@
                 },
                 "gtdbtk_min_completeness": {
                     "type": "number",
-                    "default": 50,
+                    "default": 50.0,
                     "description": "Min. bin completeness (in %) required to apply GTDB-tk classification.",
                     "help_text": "Completeness assessed with BUSCO analysis (100% - %Missing). Must be greater than 0 (min. 0.01) to avoid GTDB-tk errors. If too low, GTDB-tk classification results can be impaired due to not enough marker genes!",
                     "minimum": 0.01,
                     "maximum": 100
                 },
                 "gtdbtk_max_contamination": {
                     "type": "number",
-                    "default": 10,
+                    "default": 10.0,
                     "description": "Max. bin contamination (in %) allowed to apply GTDB-tk classification.",
                     "help_text": "Contamination approximated based on BUSCO analysis (%Complete and duplicated). If too high, GTDB-tk classification results can be impaired due to contamination!",
                     "minimum": 0,
                     "maximum": 100
                 },
                 "gtdbtk_min_perc_aa": {
                     "type": "number",
-                    "default": 10,
+                    "default": 10.0,
                     "description": "Min. fraction of AA (in %) in the MSA for bins to be kept.",
                     "minimum": 0,
                     "maximum": 100
@@ -557,7 +557,7 @@
                 },
                 "gtdbtk_pplacer_cpus": {
                     "type": "number",
-                    "default": 1,
+                    "default": 1.0,
                     "description": "Number of CPUs used for the tool pplacer run by GTDB-Tk.",
                     "help_text": "A low number of CPUs helps to reduce the memory required/reported by GTDB-Tk. See also the [GTDB-Tk documentation](https://ecogenomics.github.io/GTDBTk/faq.html#gtdb-tk-reaches-the-memory-limit-pplacer-crashes)."
                 },
@@ -629,9 +629,10 @@
             "type": "object",
             "default": "",
             "properties": {
-                "run_genomad": {
+                "run_virus_identification": {
                     "type": "boolean",
-                    "description": "Identify viral sequences in assemblies using geNomad"
+                    "default": false,
+                    "description": "Run virus identification."
                 },
                 "genomad_min_score": {
                     "type": "number",

diff --git a/workflows/mag.nf b/workflows/mag.nf
@@ -638,7 +638,7 @@ workflow MAG {
     ================================================================================
     */
 
-    if (params.run_genomad){
+    if (params.run_virus_identification){
         VIRUS_IDENTIFICATION(ch_assemblies, ch_genomad_db)
         ch_versions = ch_versions.mix(VIRUS_IDENTIFICATION.out.versions.first())
     }