diff --git a/CHANGELOG.md b/CHANGELOG.md index 01d9401..a4b5e28 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,7 @@ Initial release of nf-core/seqinspector, created with the [nf-core](https://nf-c - [#20](https://github.com/nf-core/seqinspector/pull/20) Use tags to generate group reports - [#13](https://github.com/nf-core/seqinspector/pull/13) Generate reports per run, per project and per lane. - [#49](https://github.com/nf-core/seqinspector/pull/49) Merge with template 3.0.2. +- [#56](https://github.com/nf-core/seqinspector/pull/56) Added SeqFu stats module. - [#50](https://github.com/nf-core/seqinspector/pull/50) Add an optional subsampling step. - [#51](https://github.com/nf-core/seqinspector/pull/51) Add nf-test to CI. - [#63](https://github.com/nf-core/seqinspector/pull/63) Contribution guidelines added about displaying results for new tools diff --git a/CITATIONS.md b/CITATIONS.md index 1668c10..208cfa1 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -14,7 +14,11 @@ > Andrews, S. (2010). FastQC: A Quality Control Tool for High Throughput Sequence Data [Online]. -- [Fastqscreen](https://www.bioinformatics.babraham.ac.uk/projects/fastq_screen/) +- [SeqFu](https://telatin.github.io/seqfu2/) + +> Telatin A, Fariselli P, Birolo G. SeqFu: A Suite of Utilities for the Robust and Reproducible Manipulation of Sequence Files. Bioengineering 2021, 8, 59. https://doi.org/10.3390/bioengineering8050059 + +- [FastQ Screen](https://www.bioinformatics.babraham.ac.uk/projects/fastq_screen/) > Wingett SW and Andrews S. FastQ Screen: A tool for multi-genome mapping and quality control [version 2; referees: 4 approved]. 
F1000Research 2018, 7:1338 (https://doi.org/10.12688/f1000research.15931.2) diff --git a/conf/modules.config b/conf/modules.config index d3c597b..4a653ed 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -26,6 +26,15 @@ process { ext.args = '--quiet' } + withName: 'SEQFU_STATS' { + ext.args = '' + publishDir = [ + path: { "${params.outdir}/seqfu_stats" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + withName: 'MULTIQC_GLOBAL' { ext.args = { params.multiqc_title ? "--title \"$params.multiqc_title\"" : '' } publishDir = [ diff --git a/docs/output.md b/docs/output.md index e379113..a2706a8 100644 --- a/docs/output.md +++ b/docs/output.md @@ -12,6 +12,7 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d - [Seqtk](#seqtk) - Subsample a specific number of reads per sample - [FastQC](#fastqc) - Raw read QC +- [SeqFu Stats](#seqfu-stats) - Statistics for FASTA or FASTQ files - [Fastqscreen](#fastqscreen) - mapping against a set of references for basic contamination QC - [MultiQC](#multiqc) - Aggregate report describing results and QC from the whole pipeline - [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution @@ -46,7 +47,7 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d
Output files -- `fastqc/` +- `fastqscreen/` - `*_screen.html`: Interactive graphical fastqscreen report which summaries the mapping of your sequences against each of your libraries. - `*_screen.pdf`: Static graphical fastqscreen report which summaries the mapping of your sequences against each of your libraries. - `*_screen.txt` : text based fastqscreen report which summaries the mapping of your sequences against each of your libraries. @@ -57,6 +58,19 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d It requires the supply of referenced (databases) in a config file. In order to parallelize the mapping of the different samples, in seqinspector, this a fastqscreen config file is generated for every sample/reference combination. +### SeqFu Stats + +
+Output files + +- `seqfu_stats/` + - `*.tsv`: Tab-separated file containing quality metrics. + - `*_mqc.txt`: File containing the same quality metrics as the TSV file, ready to be read by MultiQC. + +
+ +[SeqFu](https://telatin.github.io/seqfu2/) is a general-purpose program to manipulate and parse information from FASTA/FASTQ files, supporting gzipped input files. It includes functions to interleave and de-interleave FASTQ files, to rename sequences and to count and print statistics on sequence lengths. In this pipeline, the `seqfu stats` module is used to produce general quality metrics statistics. + ### MultiQC nf-core/seqinspector will generate the following MultiQC reports: diff --git a/modules.json b/modules.json index 97b881d..566d84e 100644 --- a/modules.json +++ b/modules.json @@ -21,6 +21,11 @@ "git_sha": "cf17ca47590cc578dfb47db1c2a44ef86f89976d", "installed_by": ["modules"] }, + "seqfu/stats": { + "branch": "master", + "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", + "installed_by": ["modules"] + }, "seqtk/sample": { "branch": "master", "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", diff --git a/modules/nf-core/seqfu/stats/environment.yml b/modules/nf-core/seqfu/stats/environment.yml new file mode 100644 index 0000000..8fa07df --- /dev/null +++ b/modules/nf-core/seqfu/stats/environment.yml @@ -0,0 +1,7 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - "bioconda::seqfu=1.20.3" diff --git a/modules/nf-core/seqfu/stats/main.nf b/modules/nf-core/seqfu/stats/main.nf new file mode 100644 index 0000000..0f8bb3e --- /dev/null +++ b/modules/nf-core/seqfu/stats/main.nf @@ -0,0 +1,51 @@ +process SEQFU_STATS { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/seqfu:1.20.3--h1eb128b_0': + 'biocontainers/seqfu:1.20.3--h1eb128b_0' }" + + + input: + // stats can get one or more fasta or fastq files + tuple val(meta), path(files) + + output: + tuple val(meta), path("*.tsv") , emit: stats + tuple val(meta), path("*_mqc.txt"), emit: multiqc + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + seqfu \\ + stats \\ + $args \\ + --multiqc ${prefix}_mqc.txt \\ + $files > ${prefix}.tsv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + seqfu: \$(seqfu version) + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}_mqc.txt + seqfu stats ${prefix}_mqc.txt > ${prefix}.tsv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + seqfu: \$(seqfu version) + END_VERSIONS + """ +} diff --git a/modules/nf-core/seqfu/stats/meta.yml b/modules/nf-core/seqfu/stats/meta.yml new file mode 100644 index 0000000..f534a3f --- /dev/null +++ b/modules/nf-core/seqfu/stats/meta.yml @@ -0,0 +1,60 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json +name: "seqfu_stats" +description: Statistics for FASTA or FASTQ files +keywords: + - seqfu + - stats + - n50 +tools: + - "seqfu": + description: "Cross-platform compiled suite of tools to manipulate and inspect + FASTA and FASTQ files" + homepage: "https://telatin.github.io/seqfu2/" + documentation: "https://telatin.github.io/seqfu2/" + tool_dev_url: "https://github.com/telatin/seqfu2" + doi: "10.3390/bioengineering8050059" + licence: ["GPL v3"] + identifier: biotools:seqfu + +input: + # Only when we have meta + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
`[ id:'sample1', single_end:false ]` + - files: + type: file + description: One or more FASTA or FASTQ files + pattern: "*.{fasta,fastq,fasta.gz,fastq.gz,fq,fq.gz}" +output: + #Only when we have meta + - stats: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1', single_end:false ]` + - "*.tsv": + type: file + description: Tab-separated output file with basic sequence statistics. + pattern: "*.{tsv}" + - multiqc: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1', single_end:false ]` + - "*_mqc.txt": + type: file + description: MultiQC ready table + pattern: "*_mqc.txt" + - versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@telatin" +maintainers: + - "@telatin" diff --git a/modules/nf-core/seqfu/stats/tests/main.nf.test b/modules/nf-core/seqfu/stats/tests/main.nf.test new file mode 100644 index 0000000..b889b2b --- /dev/null +++ b/modules/nf-core/seqfu/stats/tests/main.nf.test @@ -0,0 +1,75 @@ +nextflow_process { + + name "Test Process SEQFU_STATS" + script "../main.nf" + process "SEQFU_STATS" + + tag "modules" + tag "modules_nfcore" + tag "seqfu" + tag "seqfu/stats" + + + test("seqfu stats - faa") { + // test with 1 FAA file (with multiple sequences of different length) + when { + process { + """ + input[0] = [ + [ id:'test' ], + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/proteome.fasta', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert process.out.stats }, + { assert process.out.multiqc }, + { assert process.out.stats.size() == 1 }, + { assert snapshot(process.out.versions).match("versions-single") }, + { assert snapshot(process.out.stats).match("stats-single") }, + { assert path(process.out.stats.get(0).get(1)).md5 == "26141ef87ad8a6f59a6f283cc0a06fda" } + ) + } + + } + + test("seqfu stats - 
multiple files") { + // test feeding a mix of files including compressed + when { + process { + """ + input[0] = [ + [ id:'test' ], + [ + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/proteome.fasta', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) + ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert process.out.stats }, + { assert process.out.multiqc }, + { assert process.out.stats.size() == 1 }, + { assert path(process.out.versions[0]).readLines()[1].contains('.') }, + { assert snapshot(process.out.stats).match("stats-multi") }, + { assert snapshot(process.out.stats).md5().match("multi-lines") }, + { assert path(process.out.stats[0][1]).readLines()[0] == 'File\t#Seq\tTotal bp\tAvg\tN50\tN75\tN90\tauN\tMin\tMax' }, + { assert path(process.out.multiqc[0][1]).readLines().join('\n').contains('genome.fasta') }, + { assert path(process.out.multiqc[0][1]).readLines().join('\n').contains('proteome.fasta') } + ) + } + + } + +} diff --git a/modules/nf-core/seqfu/stats/tests/main.nf.test.snap b/modules/nf-core/seqfu/stats/tests/main.nf.test.snap new file mode 100644 index 0000000..8049c82 --- /dev/null +++ b/modules/nf-core/seqfu/stats/tests/main.nf.test.snap @@ -0,0 +1,56 @@ +{ + "versions-single": { + "content": [ + [ + "versions.yml:md5,fd11b4665f68f22e7ad4c646ad3c56cd" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-03-19T20:21:50.896221522" + }, + "multi-lines": { + "content": "d5ab14ce74939f856edc64c6a8d250d3", + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-03-19T20:22:03.775940585" + }, + "stats-single": { + "content": [ + [ + [ + { 
+ "id": "test" + }, + "test.tsv:md5,26141ef87ad8a6f59a6f283cc0a06fda" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-03-19T20:21:50.937732021" + }, + "stats-multi": { + "content": [ + [ + [ + { + "id": "test" + }, + "test.tsv:md5,98102573ecf7e3b9e53db1c6e0f02b06" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-03-19T20:22:03.681146614" + } +} \ No newline at end of file diff --git a/modules/nf-core/seqfu/stats/tests/tags.yml b/modules/nf-core/seqfu/stats/tests/tags.yml new file mode 100644 index 0000000..2685ed4 --- /dev/null +++ b/modules/nf-core/seqfu/stats/tests/tags.yml @@ -0,0 +1,2 @@ +seqfu/stats: + - "modules/nf-core/seqfu/stats/**" diff --git a/workflows/seqinspector.nf b/workflows/seqinspector.nf index fe8e99d..307b163 100644 --- a/workflows/seqinspector.nf +++ b/workflows/seqinspector.nf @@ -8,6 +8,7 @@ include { samplesheetToList } from 'plugin/nf-schema' include { SEQTK_SAMPLE } from '../modules/nf-core/seqtk/sample/main' include { FASTQC } from '../modules/nf-core/fastqc/main' +include { SEQFU_STATS } from '../modules/nf-core/seqfu/stats' include { FASTQSCREEN_FASTQSCREEN } from '../modules/nf-core/fastqscreen/fastqscreen/main' include { MULTIQC as MULTIQC_GLOBAL } from '../modules/nf-core/multiqc/main' @@ -63,6 +64,16 @@ workflow SEQINSPECTOR { ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.zip) ch_versions = ch_versions.mix(FASTQC.out.versions.first()) + + // + // Module: Run SeqFu stats + // + SEQFU_STATS ( + ch_samplesheet + ) + ch_multiqc_files = ch_multiqc_files.mix(SEQFU_STATS.out.multiqc) + ch_versions = ch_versions.mix(SEQFU_STATS.out.versions.first()) + // // MODULE: Run FastQ Screen //