diff --git a/modules/nf-core/fgbio/collectduplexseqmetrics/environment.yml b/modules/nf-core/fgbio/collectduplexseqmetrics/environment.yml new file mode 100644 index 00000000000..a21a0a390e6 --- /dev/null +++ b/modules/nf-core/fgbio/collectduplexseqmetrics/environment.yml @@ -0,0 +1,10 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +name: "fgbio_collectduplexseqmetrics" +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - "bioconda::fgbio=2.0.2" + - "conda-forge::r-ggplot2=3.4.4" diff --git a/modules/nf-core/fgbio/collectduplexseqmetrics/main.nf b/modules/nf-core/fgbio/collectduplexseqmetrics/main.nf new file mode 100644 index 00000000000..d1d6bfc6a14 --- /dev/null +++ b/modules/nf-core/fgbio/collectduplexseqmetrics/main.nf @@ -0,0 +1,80 @@ +process FGBIO_COLLECTDUPLEXSEQMETRICS { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/mulled-v2-51891ad0b60843e4aade9cde2eb5d40c5ae92b80:72c944cdea5caff7f03b96034968ce2a4f1737bc-0': + 'biocontainers/mulled-v2-51891ad0b60843e4aade9cde2eb5d40c5ae92b80:72c944cdea5caff7f03b96034968ce2a4f1737bc-0' }" + + input: + tuple val(meta), path(grouped_bam) + path interval_list + + output: + tuple val(meta), path("**.family_sizes.txt") , emit: family_sizes + tuple val(meta), path("**.duplex_family_sizes.txt") , emit: duplex_family_sizes + tuple val(meta), path("**.duplex_yield_metrics.txt"), emit: duplex_yield_metrics + tuple val(meta), path("**.umi_counts.txt") , emit: umi_counts + tuple val(meta), path("**.duplex_qc.pdf") , emit: duplex_qc + tuple val(meta), path("**.duplex_umi_counts.txt") , emit: duplex_umi_counts, optional: true + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def intervals = interval_list ? "--intervals ${interval_list}" : "" + def mem_gb = 8 + + if (!task.memory) { + log.info '[fgbio CollectDuplexSeqMetrics] Available memory not known - defaulting to 8GB. Specify process memory requirements to change this.' + } else if (mem_gb > task.memory.giga) { + if (task.memory.giga < 2) { + mem_gb = 1 + } else { + mem_gb = task.memory.giga - 1 + } + } + + """ + fgbio \\ + -Xmx${mem_gb}g \\ + --tmp-dir=. 
\\ + --async-io=true \\ + --compression=1 \\ + CollectDuplexSeqMetrics \\ + --input $grouped_bam \\ + --output ${prefix} \\ + $intervals \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + fgbio: \$( echo \$(fgbio --version 2>&1 | tr -d '[:cntrl:]' ) | sed -e 's/^.*Version: //;s/\\[.*\$//') + ggplot2: \$(Rscript -e "library(ggplot2); cat(as.character(packageVersion('ggplot2')))") + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def touch_duplex_umi = args.contains("--duplex-umi-counts") || args.contains("-u") ? "touch ${prefix}.duplex_umi_counts.txt" : "" + + """ + touch ${prefix}.family_sizes.txt + touch ${prefix}.duplex_family_sizes.txt + touch ${prefix}.duplex_yield_metrics.txt + touch ${prefix}.umi_counts.txt + touch ${prefix}.duplex_qc.pdf + $touch_duplex_umi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + fgbio: \$( echo \$(fgbio --version 2>&1 | tr -d '[:cntrl:]' ) | sed -e 's/^.*Version: //;s/\\[.*\$//') + ggplot2: \$(Rscript -e "library(ggplot2); cat(as.character(packageVersion('ggplot2')))") + END_VERSIONS + """ +} diff --git a/modules/nf-core/fgbio/collectduplexseqmetrics/meta.yml b/modules/nf-core/fgbio/collectduplexseqmetrics/meta.yml new file mode 100644 index 00000000000..f7afa4b4101 --- /dev/null +++ b/modules/nf-core/fgbio/collectduplexseqmetrics/meta.yml @@ -0,0 +1,78 @@ +--- +name: "fgbio_collectduplexseqmetrics" +description: Collects a suite of metrics to QC duplex sequencing data. 
+keywords: + - UMIs + - QC + - bam + - duplex +tools: + - "fgbio": + description: "A set of tools for working with genomic and high throughput sequencing data, including UMIs" + homepage: "http://fulcrumgenomics.github.io/fgbio/" + documentation: "http://fulcrumgenomics.github.io/fgbio/" + tool_dev_url: "https://github.com/fulcrumgenomics/fgbio" + licence: ["MIT"] + - "r-ggplot2": + description: "ggplot2 is a system for declaratively creating graphics, based on The Grammar of Graphics." + homepage: "https://ggplot2.tidyverse.org/" + documentation: "https://ggplot2.tidyverse.org/" + tool_dev_url: "https://github.com/tidyverse/ggplot2" + licence: ["MIT"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1' ]` + + - grouped_bam: + type: file + description: It has to be either 1) The exact BAM output by the GroupReadsByUmi tool (in the sort-order it was produced in) or 2) A BAM file that has MI tags present on all reads (usually set by GroupReadsByUmi) and has been sorted with SortBam into TemplateCoordinate order. + pattern: "*.bam" + + - interval_list: + type: file + description: Calculation of metrics may be restricted to a set of regions using the --intervals parameter. The file format is described here https://samtools.github.io/htsjdk/javadoc/htsjdk/index.html?htsjdk/samtools/util/Interval.html + pattern: "*.{tsv,txt,interval_list}" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
`[ id:'sample1' ]` + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - family_sizes: + type: file + description: Metrics on the frequency of different types of families of different sizes + pattern: "*.txt" + - duplex_family_sizes: + type: file + description: Metrics on the frequency of duplex tag families by the number of observations from each strand + pattern: "*.txt" + - duplex_yield_metrics: + type: file + description: Summary QC metrics produced using 5%, 10%, 15%...100% of the data + pattern: "*.txt" + - umi_counts: + type: file + description: Metrics on the frequency of observations of UMIs within reads and tag families + pattern: "*.txt" + - duplex_qc: + type: file + description: A series of plots generated from the preceding metrics files for visualization + pattern: "*.pdf" + - duplex_umi_counts: + type: file + description: Metrics on the frequency of observations of duplex UMIs within reads and tag families. + pattern: "*.txt" + +authors: + - "@georgiakes" +maintainers: + - "@georgiakes" diff --git a/modules/nf-core/fgbio/collectduplexseqmetrics/tests/main.nf.test b/modules/nf-core/fgbio/collectduplexseqmetrics/tests/main.nf.test new file mode 100644 index 00000000000..0021229b1ba --- /dev/null +++ b/modules/nf-core/fgbio/collectduplexseqmetrics/tests/main.nf.test @@ -0,0 +1,79 @@ +nextflow_process { + + name "Test Process FGBIO_COLLECTDUPLEXSEQMETRICS" + script "../main.nf" + process "FGBIO_COLLECTDUPLEXSEQMETRICS" + + tag "modules" + tag "modules_nfcore" + tag "fgbio" + tag "fgbio/collectduplexseqmetrics" + + + test("homo_sapiens - bam") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + '/genomics/homo_sapiens/illumina/bam/umi/test.paired_end.duplex_umi_grouped.bam', checkIfExists: true) + ] + input[1]=[] + + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert 
snapshot(process.out.family_sizes, + process.out.duplex_family_sizes, + process.out.duplex_yield_metrics, + process.out.umi_counts, + process.out.duplex_umi_counts, + process.out.versions, + file(process.out.duplex_qc[0][1]).name) + .match() } + + ) + } + + } + + test("homo_sapiens - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + '/genomics/homo_sapiens/illumina/bam/umi/test.paired_end.duplex_umi_grouped.bam', checkIfExists: true) + ] + input[1] = [] + + + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.family_sizes, + process.out.duplex_family_sizes, + process.out.duplex_yield_metrics, + process.out.umi_counts, + process.out.duplex_umi_counts, + process.out.versions, + file(process.out.duplex_qc[0][1]).name) + .match() } + ) + } + + } + +} \ No newline at end of file diff --git a/modules/nf-core/fgbio/collectduplexseqmetrics/tests/main.nf.test.snap b/modules/nf-core/fgbio/collectduplexseqmetrics/tests/main.nf.test.snap new file mode 100644 index 00000000000..7dfa35f5801 --- /dev/null +++ b/modules/nf-core/fgbio/collectduplexseqmetrics/tests/main.nf.test.snap @@ -0,0 +1,106 @@ +{ + "homo_sapiens - stub": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + "test.family_sizes.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + [ + [ + { + "id": "test", + "single_end": false + }, + "test.duplex_family_sizes.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + [ + [ + { + "id": "test", + "single_end": false + }, + "test.duplex_yield_metrics.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + [ + [ + { + "id": "test", + "single_end": false + }, + "test.umi_counts.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + [ + + ], + [ + "versions.yml:md5,637a7384cd910f0e0541a631c52b95e1" + ], + "test.duplex_qc.pdf" + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.3" + }, + 
"timestamp": "2024-07-17T19:26:23.325859809" + }, + "homo_sapiens - bam": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + "test.family_sizes.txt:md5,a49de49bd587440c316fec830f502620" + ] + ], + [ + [ + { + "id": "test", + "single_end": false + }, + "test.duplex_family_sizes.txt:md5,129e41170b9f5f2f8edce62a686c8548" + ] + ], + [ + [ + { + "id": "test", + "single_end": false + }, + "test.duplex_yield_metrics.txt:md5,237e4e4ee713fdf672b0ee796827fb9d" + ] + ], + [ + [ + { + "id": "test", + "single_end": false + }, + "test.umi_counts.txt:md5,9fe38b2a49ca80492b3a1c6a55679155" + ] + ], + [ + + ], + [ + "versions.yml:md5,637a7384cd910f0e0541a631c52b95e1" + ], + "test.duplex_qc.pdf" + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.3" + }, + "timestamp": "2024-07-17T19:26:03.1373243" + } +} \ No newline at end of file diff --git a/modules/nf-core/fgbio/collectduplexseqmetrics/tests/tags.yml b/modules/nf-core/fgbio/collectduplexseqmetrics/tests/tags.yml new file mode 100644 index 00000000000..603caca5db5 --- /dev/null +++ b/modules/nf-core/fgbio/collectduplexseqmetrics/tests/tags.yml @@ -0,0 +1,2 @@ +fgbio/collectduplexseqmetrics: + - "modules/nf-core/fgbio/collectduplexseqmetrics/**"