diff --git a/CHANGELOG.md b/CHANGELOG.md index c790c3824c..304e914701 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,8 +8,14 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. ## [Unreleased] ### `Added` - +- [#628](https://github.com/SciLifeLab/Sarek/pull/628), [#722](https://github.com/SciLifeLab/Sarek/pull/722) - `ASCAT` now use `.gc` file - [#712](https://github.com/SciLifeLab/Sarek/pull/712), [#718](https://github.com/SciLifeLab/Sarek/pull/718) - Added possibilities to run Sarek with `conda` +- [#719](https://github.com/SciLifeLab/Sarek/pull/719) - Annotation documentation +- [#719](https://github.com/SciLifeLab/Sarek/pull/719) - Helper script to download `snpeff` and `VEP` cache files +- [#719](https://github.com/SciLifeLab/Sarek/pull/719) - New `--annotation_cache`, `--snpEff_cache`, `--vep_cache` parameters +- [#719](https://github.com/SciLifeLab/Sarek/pull/719) - Possibility to use cache when annotating with `snpEff` and `VEP` +- [#722](https://github.com/SciLifeLab/Sarek/pull/722) - Add path to ASCAT `.gc` file in `igenomes.config` +- [#728](https://github.com/SciLifeLab/Sarek/pull/728) - Update `Sarek-data` submodule with multiple patients TSV file ### `Changed` @@ -23,27 +29,26 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. 
- [#717](https://github.com/SciLifeLab/Sarek/pull/717) - Update documentation - [#719](https://github.com/SciLifeLab/Sarek/pull/719) - `snpeff` and `vep` containers are now built with conda - [#719](https://github.com/SciLifeLab/Sarek/pull/719) - `vepCacheVersion` is now defined in `conf/genomes.config` or `conf/igenomes.config` +- [#722](https://github.com/SciLifeLab/Sarek/pull/722) - Add path to ASCAT `.gc` file in `igenomes.config` - [#722](https://github.com/SciLifeLab/Sarek/pull/722) - Update `Sarek-data` submodule - [#723](https://github.com/SciLifeLab/Sarek/pull/723), [#725](https://github.com/SciLifeLab/Sarek/pull/725) - Update docs - [#724](https://github.com/SciLifeLab/Sarek/pull/724) - Improved AwsBatch configuration - -### `Added` -- [#628](https://github.com/SciLifeLab/Sarek/pull/628), [#722](https://github.com/SciLifeLab/Sarek/pull/722) - `ASCAT` now use `.gc` file -- [#719](https://github.com/SciLifeLab/Sarek/pull/719) - Possibility to use cache wen annotating with `snpEff` and `VEP` -- [#719](https://github.com/SciLifeLab/Sarek/pull/719) - New `--annotation_cache`, `--snpEff_cache`, `--vep_cache` parameters -- [#719](https://github.com/SciLifeLab/Sarek/pull/719) - Helper script to download `snpeff` and `VEP` cache files -- [#719](https://github.com/SciLifeLab/Sarek/pull/719) - Annotation documentation -- [#722](https://github.com/SciLifeLab/Sarek/pull/722) - Add path to ASCAT `.gc` file in `igenomes.config` +- [#728](https://github.com/SciLifeLab/Sarek/pull/728) - VCFs and Annotated VCFs are now ordered by Patient, then tools +- [#728](https://github.com/SciLifeLab/Sarek/pull/728) - Strelka Best Practices output is now prefixed with `StrelkaBP_` +- [#728](https://github.com/SciLifeLab/Sarek/pull/728) - Improved usage of `targetBED` params ### `Removed` - 
[#715](https://github.com/SciLifeLab/Sarek/pull/715) - Remove `defReferencesFiles` function from `buildReferences.nf` - [#719](https://github.com/SciLifeLab/Sarek/pull/719) - `snpEff` base container is no longer used - [#721](https://github.com/SciLifeLab/Sarek/pull/721) - Remove COSMIC docs +- [#728](https://github.com/SciLifeLab/Sarek/pull/728) - Remove `defineDirectoryMap()` ### `Fixed` - [#720](https://github.com/SciLifeLab/Sarek/pull/720) - bamQC is now run on the recalibrated bams, and not after MarkDuplicates - [#726](https://github.com/SciLifeLab/Sarek/pull/726) - Fix Ascat ref file input (one file can't be a set) - [#727](https://github.com/SciLifeLab/Sarek/pull/727) - bamQC outputs are no longer overwritten (name of dir is now the file instead of sample) +- [#728](https://github.com/SciLifeLab/Sarek/pull/728) - Fix multi sample TSV file [#691](https://github.com/SciLifeLab/Sarek/issues/691) +- [#728](https://github.com/SciLifeLab/Sarek/pull/728) - Fix issue with annotation that was consuming `cache` channels ## [2.2.2] - 2018-12-19 diff --git a/Sarek-data b/Sarek-data index 456c73f3e5..9087faa53d 160000 --- a/Sarek-data +++ b/Sarek-data @@ -1 +1 @@ -Subproject commit 456c73f3e5888b5122c0f49190b04f10789704fe +Subproject commit 9087faa53d25fca90c1a84a48cfaf7cbed496317 diff --git a/annotate.nf b/annotate.nf index 73eef9eb4c..02768b4553 100644 --- a/annotate.nf +++ b/annotate.nf @@ -43,15 +43,13 @@ if (!checkUppmaxProject()) exit 1, "No UPPMAX project ID found! 
Use --project ['haplotypecaller', vcf]}, - Channel.fromPath("${directoryMap.manta}/*SV.vcf.gz") - .flatten().map{vcf -> ['manta', vcf]}, - Channel.fromPath("${directoryMap.mutect2}/*.vcf.gz") - .flatten().map{vcf -> ['mutect2', vcf]}, - Channel.fromPath("${directoryMap.strelka}/*{somatic,variants}*.vcf.gz") // Strelka only - .flatten().map{vcf -> ['strelka', vcf]}, - Channel.fromPath("${directoryMap.strelkabp}/*{somatic,variants}*.vcf.gz") // Strelka with Manta indel candidates - .flatten().map{vcf -> ['strelkabp', vcf]} + Channel.fromPath("${params.outDir}/VariantCalling/*/HaplotypeCaller/*.vcf.gz") + .flatten().map{vcf -> ['haplotypecaller', vcf.minus(vcf.fileName)[-2].toString(), vcf]}, + Channel.fromPath("${params.outDir}/VariantCalling/*/Manta/*SV.vcf.gz") + .flatten().map{vcf -> ['manta', vcf.minus(vcf.fileName)[-2].toString(), vcf]}, + Channel.fromPath("${params.outDir}/VariantCalling/*/MuTect2/*.vcf.gz") + .flatten().map{vcf -> ['mutect2', vcf.minus(vcf.fileName)[-2].toString(), vcf]}, + Channel.fromPath("${params.outDir}/VariantCalling/*/Strelka/*{somatic,variant}*.vcf.gz") + .flatten().map{vcf -> ['strelka', vcf.minus(vcf.fileName)[-2].toString(), vcf]} ).choice(vcfToAnnotate, vcfNotToAnnotate) { annotateTools == [] || (annotateTools != [] && it[0] in annotateTools) ? 
0 : 1 } } else if (annotateTools == []) { -// alternatively, annotate user-submitted VCFs - list = "" - annotateVCF.each{ list += ",${it}" } - list = list.substring(1) - if (StringUtils.countMatches("${list}", ",") == 0) vcfToAnnotate = Channel.fromPath("${list}") - .map{vcf -> ['userspecified', vcf]} - else vcfToAnnotate = Channel.fromPath("{$list}") - .map{vcf -> ['userspecified', vcf]} +// Annotate user-submitted VCFs +// If user-submitted, Sarek assumes that the idPatient can be inferred automatically + vcfToAnnotate = Channel.fromPath(annotateVCF) + .map{vcf -> ['userspecified', vcf.minus(vcf.fileName)[-2].toString(), vcf]} } else exit 1, "specify only tools or files to annotate, not both" vcfNotToAnnotate.close() @@ -101,17 +98,17 @@ vcfNotToAnnotate.close() (vcfForBCFtools, vcfForVCFtools, vcfForSnpeff, vcfForVep) = vcfToAnnotate.into(4) vcfForVep = vcfForVep.map { - variantCaller, vcf -> - ["vep", variantCaller, vcf, null] + variantCaller, idPatient, vcf -> + ["VEP", variantCaller, idPatient, vcf, null] } process RunBcftoolsStats { - tag {vcf} + tag {"${idPatient} - ${vcf}"} - publishDir directoryMap.bcftoolsStats, mode: params.publishDirMode + publishDir "${params.outDir}/Reports/BCFToolsStats", mode: params.publishDirMode input: - set variantCaller, file(vcf) from vcfForBCFtools + set variantCaller, idPatient, file(vcf) from vcfForBCFtools output: file ("*.bcf.tools.stats.out") into bcfReport @@ -127,12 +124,12 @@ if (params.verbose) bcfReport = bcfReport.view { } process RunVcftools { - tag {vcf} + tag {"${idPatient} - ${variantCaller} - ${vcf}"} - publishDir directoryMap.vcftools, mode: params.publishDirMode + publishDir "${params.outDir}/Reports/VCFTools", mode: params.publishDirMode input: - set variantCaller, file(vcf) from vcfForVCFtools + set variantCaller, idPatient, file(vcf) from vcfForVCFtools output: file ("${vcf.simpleName}.*") into vcfReport @@ -147,25 +144,22 @@ if (params.verbose) vcfReport = vcfReport.view { "Files : [${it.fileName}]" } 
-snpEff_cache = params.snpEff_cache ? params.snpEff_cache : "null" - process RunSnpeff { - tag {"${variantCaller} - ${vcf}"} + tag {"${idPatient} - ${variantCaller} - ${vcf}"} publishDir params.outDir, mode: params.publishDirMode, saveAs: { - if (it == "${vcf.simpleName}_snpEff.csv") "${directoryMap.snpeffReports.minus(params.outDir+'/')}/${it}" - else if (it == "${vcf.simpleName}_snpEff.ann.vcf") null - else "${directoryMap.snpeff.minus(params.outDir+'/')}/${it}" + if (it == "${vcf.simpleName}_snpEff.ann.vcf") null + else "Annotation/${idPatient}/snpEff/${it}" } input: - set variantCaller, file(vcf) from vcfForSnpeff - file dataDir from Channel.fromPath(snpEff_cache, type: 'dir') + set variantCaller, idPatient, file(vcf) from vcfForSnpeff + file dataDir from Channel.value(params.snpEff_cache ? file(params.snpEff_cache) : "null") val snpeffDb from Channel.value(params.genomes[params.genome].snpeffDb) output: set file("${vcf.simpleName}_snpEff.genes.txt"), file("${vcf.simpleName}_snpEff.csv"), file("${vcf.simpleName}_snpEff.summary.html") into snpeffOutput - set val("snpeff"), variantCaller, file("${vcf.simpleName}_snpEff.ann.vcf") into snpeffVCF + set val("snpEff"), variantCaller, idPatient, file("${vcf.simpleName}_snpEff.ann.vcf") into snpeffVCF when: 'snpeff' in tools || 'merge' in tools @@ -191,7 +185,7 @@ if (params.verbose) snpeffOutput = snpeffOutput.view { "File : ${it.fileName}" } -if('merge' in tools) { +if ('merge' in tools) { // When running in the 'merge' mode // snpEff output is used as VEP input // Used a feedback loop from vcfCompressed @@ -204,29 +198,27 @@ if('merge' in tools) { ) } -vep_cache = params.vep_cache ? 
params.vep_cache : "null" - process RunVEP { - tag {"${variantCaller} - ${vcf}"} + tag {"${idPatient} - ${variantCaller} - ${vcf}"} publishDir params.outDir, mode: params.publishDirMode, saveAs: { - if (it == "${vcf.simpleName}_VEP.summary.html") "${directoryMap.vep.minus(params.outDir+'/')}/${it}" + if (it == "${vcf.simpleName}_VEP.summary.html") "Annotation/${idPatient}/VEP/${it}" else null } input: - set annotator, variantCaller, file(vcf), file(idx) from vcfForVep - file dataDir from Channel.fromPath(vep_cache, type: 'dir') + set annotator, variantCaller, idPatient, file(vcf), file(idx) from vcfForVep + file dataDir from Channel.value(params.vep_cache ? file(params.vep_cache) : "null") val cache_version from Channel.value(params.genomes[params.genome].vepCacheVersion) output: - set finalannotator, variantCaller, file("${vcf.simpleName}_VEP.ann.vcf") into vepVCF + set finalAnnotator, variantCaller, idPatient, file("${vcf.simpleName}_VEP.ann.vcf") into vepVCF file("${vcf.simpleName}_VEP.summary.html") into vepReport when: 'vep' in tools || 'merge' in tools script: - finalannotator = annotator == "snpeff" ? 'merge' : 'vep' + finalAnnotator = annotator == "snpEff" ? 'merge' : 'VEP' genome = params.genome == 'smallGRCh37' ? 'GRCh37' : params.genome cache = (params.vep_cache && params.annotation_cache) ? 
"--dir_cache \${PWD}/${dataDir}" : "--dir_cache /.vep" """ @@ -258,18 +250,18 @@ if (params.verbose) vepReport = vepReport.view { vcfToCompress = snpeffVCF.mix(vepVCF) process CompressVCF { - tag {"${annotator} - ${vcf}"} + tag {"${idPatient} - ${annotator} - ${vcf}"} - publishDir "${directoryMap."$finalannotator"}", mode: params.publishDirMode + publishDir "${params.outDir}/Annotation/${idPatient}/${finalAnnotator}", mode: params.publishDirMode input: - set annotator, variantCaller, file(vcf) from vcfToCompress + set annotator, variantCaller, idPatient, file(vcf) from vcfToCompress output: - set annotator, variantCaller, file("*.vcf.gz"), file("*.vcf.gz.tbi") into (vcfCompressed, vcfCompressedoutput) + set annotator, variantCaller, idPatient, file("*.vcf.gz"), file("*.vcf.gz.tbi") into (vcfCompressed, vcfCompressedoutput) script: - finalannotator = annotator == "merge" ? "vep" : annotator + finalAnnotator = annotator == "merge" ? "VEP" : annotator """ bgzip < ${vcf} > ${vcf}.gz tabix ${vcf}.gz @@ -277,9 +269,9 @@ process CompressVCF { } if (params.verbose) vcfCompressedoutput = vcfCompressedoutput.view { - "${it[0]} VCF:\n" + - "File : ${it[2].fileName}\n" + - "Index : ${it[3].fileName}" + "${it[2]} - ${it[0]} VCF:\n" + + "File : ${it[3].fileName}\n" + + "Index : ${it[4].fileName}" } /* diff --git a/buildContainers.nf b/buildContainers.nf index 9f2e8bd1f8..7255dffa06 100644 --- a/buildContainers.nf +++ b/buildContainers.nf @@ -41,7 +41,7 @@ if (!checkUppmaxProject()) exit 1, "No UPPMAX project ID found! Use --project ` to view the used reference, read groups etc. +The BAM file headers contain the details about the actual command-line arguments for mapping, merging, use `samtools view -H ` to view the used reference, read groups etc. ### Recalibrated: -This directory is usually empty, it is the location for the final recalibrated files in the preprocessing pipeline: recalibrated BAMs are usually 2-3 times larger than the duplicatemarked files. 
To re-generate recalibrated BAMs you have to apply the recalibration table delivered to the `NonRecalibrated` directory either by calling Sarek, or doing this [recalibration step][BQSR-link] yourself. +This directory is usually empty, it is the location for the final recalibrated files in the preprocessing pipeline: recalibrated BAMs are usually 2-3 times larger than the duplicatemarked files. +To re-generate recalibrated BAMs you have to apply the recalibration table delivered to the `NonRecalibrated` directory either by calling Sarek, or doing this [recalibration step][BQSR-link] yourself. --- ## Reports: @@ -65,7 +66,7 @@ This directory is usually empty, it is the location for the final recalibrated f The `Reports` directory is the place for collecting outputs for different quality control (QC) software; going through these files can help us to decide whether the sequencing and the workflow was successful, or further steps are needed to get meaningful results. The main entry point it the [MultiQC][multiqc-link] directory: the HTML index file aggregates and visualizes all the software use for QC. -### MultiQC +### MultiQC To assess the quality of the sequencing and workflow the best start is to view at the `Reports/MultiQC/multiqc_report.html` file of the `MultiQC` directory, where the statistics and graphics of all the software below should be presented. The actual graphs and the tables are configurable, and generally much easier to view than the raw output of the individual software. 
The subsequent QC compartments are: @@ -73,25 +74,30 @@ The subsequent QC compartments are: * bamQC: [Qualimap][qualimap-link] examines sequencing alignment data in SAM/BAM files according to the features of the mapped reads and provides an overall view of the data provides quality control statistics about aligned BAM files * BCFToolsStats: [bcftools][bcftools] measuring non-reference allele frequency, depth distribution, stats by quality and per-sample counts, singleton stats, etc. of VCF files. * [FastQC][fastqc]: provides statistics about the raw FASTQ files only. -* MarkDuplicates: a [Picard][picard-md] tool to tag PCR/optical duplicates from aligned BAM data -* SamToolsStats: [samtools][samtools] collection of statistics from BAM files +* MarkDuplicates: a [Picard][picard-md] tool to tag PCR/optical duplicates from aligned BAM data. +* SamToolsStats: [samtools][samtools] collection of statistics from BAM files. --- -## VariantCallings: +## VariantCalling: -All the raw results regarding variant-calling are collected in this directory. Not all the software below are producing VCF files, also both somatic and germline +All the raw results regarding variant-calling are collected in this directory. +Not all the software below are producing VCF files, also both somatic and germline variants are collected in this directory. -* [Ascat][ascat]: is a method to derive copy number profiles of tumour cells, accounting for normal cell admixture and tumour aneuploidy. This direcory contains the graphical output of the software, CNV, ploidy and sample purity estimations. -* [FreeBayes][freebayes]: is for Bayesian haplotype-based genetic polymorphism discovery and genotyping. The single VCF file generated by FreeBayes -is huge, it is recommended to flatten and filter this VCF, i.e. using the provided [SpeedSeq][speedseq] filter +* [Ascat][ascat]: is a method to derive copy number profiles of tumour cells, accounting for normal cell admixture and tumour aneuploidy. 
+This directory contains the graphical output of the software, CNV, ploidy and sample purity estimations. +* [FreeBayes][freebayes]: is for Bayesian haplotype-based genetic polymorphism discovery and genotyping. +The single VCF file generated by FreeBayes is huge, it is recommended to flatten and filter this VCF, i.e. using the provided [SpeedSeq][speedseq] filter. * [HaplotypeCaller][haplotypecaller] is the in-house germline caller of the Broad Institute, the non-recalibrated variant files are there to check the -germline variations and compare the two samples (tumour and normal) for possible mixup -* HaplotypeCallerGVCF: germline calls in [gVCF format][genomicvcf] even for the tumour sample: this format makes possible the joint analysis of a cohort -* [Manta][manta]: is a structural variant caller supported by Illumina. There are several output files, corresponding to germline (diploid) calls, candidate calls and somatic files. +germline variations and compare the two samples (tumour and normal) for possible mixup. +* HaplotypeCallerGVCF: germline calls in [gVCF format][genomicvcf] even for the tumour sample: this format makes possible the joint analysis of a cohort. +* [Manta][manta]: is a structural variant caller supported by Illumina. +There are several output files, corresponding to germline (diploid) calls, candidate calls and somatic files. Manta provides a candidate list for small indels also that can be fed to Strelka. -* [MuTect2][mutect2] is the current somatic caller of GATK for both SNPs and indels. Recommended to keep only lines with the "PASS" filter. -* [Strelka2][strelka2] is somatic SNP and indel caller supported by Illumina. Strelka gives filtered and unfiltered calls for SNPs and indels separately, together with germline calls. +* [MuTect2][mutect2] is the current somatic caller of GATK for both SNPs and indels. +Recommended to keep only lines with the "PASS" filter. 
+* [Strelka2][strelka2] is somatic SNP and indel caller supported by Illumina. +Strelka gives filtered and unfiltered calls for SNPs and indels separately, together with germline calls. [ascat]:https://www.crick.ac.uk/research/a-z-researchers/researchers-v-y/peter-van-loo/software/ [bcftools]: http://www.htslib.org/doc/bcftools.html diff --git a/germlineVC.nf b/germlineVC.nf index 7c32373d6e..21416a874f 100644 --- a/germlineVC.nf +++ b/germlineVC.nf @@ -46,12 +46,11 @@ if (!checkUppmaxProject()) exit 1, "No UPPMAX project ID found! Use --project call_targets.bed.gz - tabix call_targets.bed.gz - configureStrelkaGermlineWorkflow.py \ - --bam ${bam} \ - --referenceFasta ${genomeFile} \ - --exome \ - --callRegions call_targets.bed.gz \ - --runDir Strelka - fi - - # always run this part - python Strelka/runWorkflow.py -m local -j ${task.cpus} - mv Strelka/results/variants/genome.*.vcf.gz Strelka_${idSample}_genome.vcf.gz - mv Strelka/results/variants/genome.*.vcf.gz.tbi Strelka_${idSample}_genome.vcf.gz.tbi - mv Strelka/results/variants/variants.vcf.gz Strelka_${idSample}_variants.vcf.gz - mv Strelka/results/variants/variants.vcf.gz.tbi Strelka_${idSample}_variants.vcf.gz.tbi - """ + beforeScript = params.targetBED ? "bgzip --threads ${task.cpus} -c ${targetBED} > call_targets.bed.gz ; tabix call_targets.bed.gz" : "" + options = params.targetBED ? 
"--exome --callRegions call_targets.bed.gz" : "" + """ + ${beforeScript} + configureStrelkaGermlineWorkflow.py \ + --bam ${bam} \ + --referenceFasta ${genomeFile} \ + ${options} \ + --runDir Strelka + + python Strelka/runWorkflow.py -m local -j ${task.cpus} + mv Strelka/results/variants/genome.*.vcf.gz Strelka_${idSample}_genome.vcf.gz + mv Strelka/results/variants/genome.*.vcf.gz.tbi Strelka_${idSample}_genome.vcf.gz.tbi + mv Strelka/results/variants/variants.vcf.gz Strelka_${idSample}_variants.vcf.gz + mv Strelka/results/variants/variants.vcf.gz.tbi Strelka_${idSample}_variants.vcf.gz.tbi + """ } if (params.verbose) singleStrelkaOutput = singleStrelkaOutput.view { @@ -409,7 +394,7 @@ if (params.verbose) singleStrelkaOutput = singleStrelkaOutput.view { process RunSingleManta { tag {idSample + " - Single Diploid"} - publishDir directoryMap.manta, mode: params.publishDirMode + publishDir "${params.outDir}/VariantCalling/${idPatient}/Manta", mode: params.publishDirMode input: set idPatient, status, idSample, file(bam), file(bai) from bamsForSingleManta @@ -473,7 +458,7 @@ vcfForQC = Channel.empty().mix( process RunBcftoolsStats { tag {vcf} - publishDir directoryMap.bcftoolsStats, mode: params.publishDirMode + publishDir "${params.outDir}/Reports/BCFToolsStats", mode: params.publishDirMode input: set variantCaller, file(vcf) from vcfForBCFtools @@ -496,7 +481,7 @@ bcfReport.close() process RunVcftools { tag {vcf} - publishDir directoryMap.vcftools, mode: params.publishDirMode + publishDir "${params.outDir}/Reports/VCFTools", mode: params.publishDirMode input: set variantCaller, file(vcf) from vcfForVCFtools diff --git a/lib/SarekUtils.groovy b/lib/SarekUtils.groovy index 9002f77d8d..93035e2a54 100644 --- a/lib/SarekUtils.groovy +++ b/lib/SarekUtils.groovy @@ -127,33 +127,6 @@ class SarekUtils { return true } - // Define map of directories - static def defineDirectoryMap(outDir) { - return [ - 'duplicateMarked' : "${outDir}/Preprocessing/DuplicateMarked", - 
'recalibrated' : "${outDir}/Preprocessing/Recalibrated", - 'ascat' : "${outDir}/VariantCalling/Ascat", - 'freebayes' : "${outDir}/VariantCalling/FreeBayes", - 'gvcf-hc' : "${outDir}/VariantCalling/HaplotypeCallerGVCF", - 'haplotypecaller' : "${outDir}/VariantCalling/HaplotypeCaller", - 'manta' : "${outDir}/VariantCalling/Manta", - 'mutect2' : "${outDir}/VariantCalling/MuTect2", - 'strelka' : "${outDir}/VariantCalling/Strelka", - 'strelkabp' : "${outDir}/VariantCalling/StrelkaBP", - 'snpeff' : "${outDir}/Annotation/SnpEff", - 'vep' : "${outDir}/Annotation/VEP", - 'bamQC' : "${outDir}/Reports/bamQC", - 'bcftoolsStats' : "${outDir}/Reports/BCFToolsStats", - 'fastQC' : "${outDir}/Reports/FastQC", - 'markDuplicatesQC' : "${outDir}/Reports/MarkDuplicates", - 'multiQC' : "${outDir}/Reports/MultiQC", - 'samtoolsStats' : "${outDir}/Reports/SamToolsStats", - 'snpeffReports' : "${outDir}/Reports/SnpEff", - 'vcftools' : "${outDir}/Reports/VCFTools", - 'version' : "${outDir}/Reports/ToolsVersion" - ] - } - // Channeling the TSV file containing BAM. // Format is: "subject gender status sample bam bai" static def extractBams(tsvFile, mode) { diff --git a/main.nf b/main.nf index 509bf705b0..88c096eebd 100644 --- a/main.nf +++ b/main.nf @@ -43,10 +43,9 @@ kate: syntax groovy; space-indent on; indent-width 2; // Check for awsbatch profile configuration // make sure queue is defined if (workflow.profile == 'awsbatch') { - if(!params.awsqueue) exit 1, "Provide the job queue for aws batch!" + if (!params.awsqueue) exit 1, "Provide the job queue for aws batch!" } - if (params.help) exit 0, helpMessage() if (!SarekUtils.isAllowedParams(params)) exit 1, "params unknown, see --help for more information" if (!checkUppmaxProject()) exit 1, "No UPPMAX project ID found! Use --project " @@ -54,7 +53,6 @@ if (!checkUppmaxProject()) exit 1, "No UPPMAX project ID found! 
Use --project gender = patientGenders[idPatient] - "${idPatient}\t${gender}\t${status}\t${idSample}\t${directoryMap.duplicateMarked}/${bam}\t${directoryMap.duplicateMarked}/${bai}\n" + "${idPatient}\t${gender}\t${status}\t${idSample}\t${params.outDir}/Preprocessing/DuplicateMarked/${bam}\t${params.outDir}/Preprocessing/DuplicateMarked/${bai}\n" }.collectFile( - name: 'duplicateMarked.tsv', sort: true, storeDir: directoryMap.duplicateMarked + name: 'duplicateMarked.tsv', sort: true, storeDir: "${params.outDir}/Preprocessing/DuplicateMarked" ) duplicateMarkedBams = duplicateMarkedBams.map { @@ -345,7 +342,7 @@ if (params.verbose) duplicateMarkedBams = duplicateMarkedBams.view { process CreateRecalibrationTable { tag {idPatient + "-" + idSample} - publishDir directoryMap.duplicateMarked, mode: params.publishDirMode, overwrite: false + publishDir "${params.outDir}/Preprocessing/DuplicateMarked", mode: params.publishDirMode, overwrite: false input: set idPatient, status, idSample, file(bam), file(bai) from mdBam // realignedBam @@ -385,9 +382,9 @@ process CreateRecalibrationTable { // Create a TSV file to restart from this step recalibrationTableTSV.map { idPatient, status, idSample, bam, bai, recalTable -> gender = patientGenders[idPatient] - "${idPatient}\t${gender}\t${status}\t${idSample}\t${directoryMap.duplicateMarked}/${bam}\t${directoryMap.duplicateMarked}/${bai}\t${directoryMap.duplicateMarked}/${recalTable}\n" + "${idPatient}\t${gender}\t${status}\t${idSample}\t${params.outDir}/Preprocessing/DuplicateMarked/${bam}\t${params.outDir}/Preprocessing/DuplicateMarked/${bai}\t${params.outDir}/Preprocessing/DuplicateMarked/${recalTable}\n" }.collectFile( - name: 'duplicateMarked.tsv', sort: true, storeDir: directoryMap.duplicateMarked + name: 'duplicateMarked.tsv', sort: true, storeDir: "${params.outDir}/Preprocessing/DuplicateMarked" ) recalibrationTable = mdBamToJoin.join(recalibrationTable, by:[0,1,2]) @@ -403,7 +400,7 @@ if (params.verbose) recalibrationTable = 
recalibrationTable.view { process RecalibrateBam { tag {idPatient + "-" + idSample} - publishDir directoryMap.recalibrated, mode: params.publishDirMode + publishDir "${params.outDir}/Preprocessing/Recalibrated", mode: params.publishDirMode input: set idPatient, status, idSample, file(bam), file(bai), file(recalibrationReport) from recalibrationTable @@ -435,9 +432,9 @@ process RecalibrateBam { // Creating a TSV file to restart from this step recalibratedBamTSV.map { idPatient, status, idSample, bam, bai -> gender = patientGenders[idPatient] - "${idPatient}\t${gender}\t${status}\t${idSample}\t${directoryMap.recalibrated}/${bam}\t${directoryMap.recalibrated}/${bai}\n" + "${idPatient}\t${gender}\t${status}\t${idSample}\t${params.outDir}/Preprocessing/Recalibrated/${bam}\t${params.outDir}/Preprocessing/Recalibrated/${bai}\n" }.collectFile( - name: 'recalibrated.tsv', sort: true, storeDir: directoryMap.recalibrated + name: 'recalibrated.tsv', sort: true, storeDir: "${params.outDir}/Preprocessing/Recalibrated" ) if (params.verbose) recalibratedBam = recalibratedBam.view { @@ -453,7 +450,7 @@ if (params.verbose) recalibratedBam = recalibratedBam.view { process RunSamtoolsStats { tag {idPatient + "-" + idSample} - publishDir directoryMap.samtoolsStats, mode: params.publishDirMode + publishDir "${params.outDir}/Reports/SamToolsStats", mode: params.publishDirMode input: set idPatient, status, idSample, file(bam), file(bai) from bamForSamToolsStats @@ -474,7 +471,7 @@ if (params.verbose) samtoolsStatsReport = samtoolsStatsReport.view { process RunBamQCrecalibrated { tag {idPatient + "-" + idSample} - publishDir directoryMap.bamQC, mode: params.publishDirMode + publishDir "${params.outDir}/Reports/bamQC", mode: params.publishDirMode input: set idPatient, status, idSample, file(bam), file(bai) from bamForBamQC diff --git a/runMultiQC.nf b/runMultiQC.nf index d0f24b5e1d..2aec568485 100644 --- a/runMultiQC.nf +++ b/runMultiQC.nf @@ -40,10 +40,9 @@ if (!checkUppmaxProject()) exit 
1, "No UPPMAX project ID found! Use --project - [idPatientNormal, idSampleNormal, bamNormal, baiNormal, idSampleTumor, bamTumor, baiTumor] -} +bamsAll = bamsNormal.join(bamsTumor) // Manta and Strelka (bamsForManta, bamsForStrelka, bamsForStrelkaBP, bamsAll) = bamsAll.into(4) @@ -256,18 +218,18 @@ process RunMutect2 { script: """ - gatk --java-options "-Xmx${task.memory.toGiga()}g" \ - Mutect2 \ - -R ${genomeFile}\ - -I ${bamTumor} -tumor ${idSampleTumor} \ - -I ${bamNormal} -normal ${idSampleNormal} \ - -L ${intervalBed} \ - -O ${intervalBed.baseName}_${idSampleTumor}_vs_${idSampleNormal}.vcf + gatk --java-options "-Xmx${task.memory.toGiga()}g" \ + Mutect2 \ + -R ${genomeFile}\ + -I ${bamTumor} -tumor ${idSampleTumor} \ + -I ${bamNormal} -normal ${idSampleNormal} \ + -L ${intervalBed} \ + -O ${intervalBed.baseName}_${idSampleTumor}_vs_${idSampleNormal}.vcf """ } -// --germline_resource af-only-gnomad.vcf.gz \ -// --normal_panel pon.vcf.gz \ -// --dbsnp ${dbsnp} \ +// --germline_resource af-only-gnomad.vcf.gz \ +// --normal_panel pon.vcf.gz \ +// --dbsnp ${dbsnp} \ mutect2Output = mutect2Output.groupTuple(by:[0,1,2,3]) @@ -317,29 +279,25 @@ if (params.verbose) vcfsToMerge = vcfsToMerge.view { process ConcatVCF { tag {variantCaller + "_" + idSampleTumor + "_vs_" + idSampleNormal} - publishDir "${directoryMap."$variantCaller"}", mode: params.publishDirMode + publishDir "${params.outDir}/VariantCalling/${idPatient}/${"$variantCaller"}", mode: params.publishDirMode input: set variantCaller, idPatient, idSampleNormal, idSampleTumor, file(vcFiles) from vcfsToMerge file(genomeIndex) from Channel.value(referenceMap.genomeIndex) + file(targetBED) from Channel.value(params.targetBED ? 
file(params.targetBED) : "null") output: - // we have this funny *_* pattern to avoid copying the raw calls to publishdir + // we have this funny *_* pattern to avoid copying the raw calls to publishdir set variantCaller, idPatient, idSampleNormal, idSampleTumor, file("*_*.vcf.gz"), file("*_*.vcf.gz.tbi") into vcfConcatenated - // TODO DRY with ConcatVCF + // TODO DRY with ConcatVCF when: ( 'mutect2' in tools || 'freebayes' in tools ) && !params.onlyQC script: outputFile = "${variantCaller}_${idSampleTumor}_vs_${idSampleNormal}.vcf" - - if(params.targetBED) // targeted - concatOptions = "-i ${genomeIndex} -c ${task.cpus} -o ${outputFile} -t ${params.targetBED}" - else // WGS - concatOptions = "-i ${genomeIndex} -c ${task.cpus} -o ${outputFile} " - - """ - concatenateVCFs.sh ${concatOptions} + options = params.targetBED ? "-t ${targetBED}" : "" + """ + concatenateVCFs.sh -i ${genomeIndex} -c ${task.cpus} -o ${outputFile} ${options} """ } @@ -352,10 +310,11 @@ if (params.verbose) vcfConcatenated = vcfConcatenated.view { process RunStrelka { tag {idSampleTumor + "_vs_" + idSampleNormal} - publishDir directoryMap.strelka, mode: params.publishDirMode + publishDir "${params.outDir}/VariantCalling/${idPatient}/Strelka", mode: params.publishDirMode input: set idPatient, idSampleNormal, file(bamNormal), file(baiNormal), idSampleTumor, file(bamTumor), file(baiTumor) from bamsForStrelka + file(targetBED) from Channel.value(params.targetBED ? file(params.targetBED) : "null") set file(genomeFile), file(genomeIndex), file(genomeDict) from Channel.value([ referenceMap.genomeFile, referenceMap.genomeIndex, @@ -368,35 +327,23 @@ process RunStrelka { when: 'strelka' in tools && !params.onlyQC script: - """ - if [ ! 
-s "${params.targetBED}" ]; then - # do WGS - configureStrelkaSomaticWorkflow.py \ - --tumor ${bamTumor} \ - --normal ${bamNormal} \ - --referenceFasta ${genomeFile} \ - --runDir Strelka - else - # WES or targeted - bgzip --threads ${task.cpus} -c ${params.targetBED} > call_targets.bed.gz - tabix call_targets.bed.gz - configureStrelkaSomaticWorkflow.py \ - --tumor ${bamTumor} \ - --normal ${bamNormal} \ - --referenceFasta ${genomeFile} \ - --exome \ - --callRegions call_targets.bed.gz \ - --runDir Strelka - fi - - python Strelka/runWorkflow.py -m local -j ${task.cpus} - # always run this part - - mv Strelka/results/variants/somatic.indels.vcf.gz Strelka_${idSampleTumor}_vs_${idSampleNormal}_somatic_indels.vcf.gz - mv Strelka/results/variants/somatic.indels.vcf.gz.tbi Strelka_${idSampleTumor}_vs_${idSampleNormal}_somatic_indels.vcf.gz.tbi - mv Strelka/results/variants/somatic.snvs.vcf.gz Strelka_${idSampleTumor}_vs_${idSampleNormal}_somatic_snvs.vcf.gz - mv Strelka/results/variants/somatic.snvs.vcf.gz.tbi Strelka_${idSampleTumor}_vs_${idSampleNormal}_somatic_snvs.vcf.gz.tbi - """ + beforeScript = params.targetBED ? "bgzip --threads ${task.cpus} -c ${targetBED} > call_targets.bed.gz ; tabix call_targets.bed.gz" : "" + options = params.targetBED ? 
"--exome --callRegions call_targets.bed.gz" : "" + """ + ${beforeScript} + configureStrelkaSomaticWorkflow.py \ + --tumor ${bamTumor} \ + --normal ${bamNormal} \ + --referenceFasta ${genomeFile} \ + ${options} \ + --runDir Strelka + + python Strelka/runWorkflow.py -m local -j ${task.cpus} + mv Strelka/results/variants/somatic.indels.vcf.gz Strelka_${idSampleTumor}_vs_${idSampleNormal}_somatic_indels.vcf.gz + mv Strelka/results/variants/somatic.indels.vcf.gz.tbi Strelka_${idSampleTumor}_vs_${idSampleNormal}_somatic_indels.vcf.gz.tbi + mv Strelka/results/variants/somatic.snvs.vcf.gz Strelka_${idSampleTumor}_vs_${idSampleNormal}_somatic_snvs.vcf.gz + mv Strelka/results/variants/somatic.snvs.vcf.gz.tbi Strelka_${idSampleTumor}_vs_${idSampleNormal}_somatic_snvs.vcf.gz.tbi + """ } if (params.verbose) strelkaOutput = strelkaOutput.view { @@ -409,7 +356,7 @@ if (params.verbose) strelkaOutput = strelkaOutput.view { process RunManta { tag {idSampleTumor + "_vs_" + idSampleNormal} - publishDir directoryMap.manta, mode: params.publishDirMode + publishDir "${params.outDir}/VariantCalling/${idPatient}/Manta", mode: params.publishDirMode input: set idPatient, idSampleNormal, file(bamNormal), file(baiNormal), idSampleTumor, file(bamTumor), file(baiTumor) from bamsForManta @@ -463,7 +410,7 @@ if (params.verbose) mantaOutput = mantaOutput.view { process RunSingleManta { tag {idSample + " - Tumor-Only"} - publishDir directoryMap.manta, mode: params.publishDirMode + publishDir "${params.outDir}/VariantCalling/${idPatient}/Manta", mode: params.publishDirMode input: set idPatient, status, idSample, file(bam), file(bai) from bamsForSingleManta @@ -522,7 +469,7 @@ bamsForStrelkaBP = bamsForStrelkaBP.map { process RunStrelkaBP { tag {idSampleTumor + "_vs_" + idSampleNormal} - publishDir directoryMap.strelkabp, mode: params.publishDirMode + publishDir "${params.outDir}/VariantCalling/${idPatient}/Strelka", mode: params.publishDirMode input: set idPatient, idSampleNormal, file(bamNormal), 
file(baiNormal), idSampleTumor, file(bamTumor), file(baiTumor), file(mantaCSI), file(mantaCSIi) from bamsForStrelkaBP @@ -549,13 +496,13 @@ process RunStrelkaBP { python Strelka/runWorkflow.py -m local -j ${task.cpus} mv Strelka/results/variants/somatic.indels.vcf.gz \ - Strelka_${idSampleTumor}_vs_${idSampleNormal}_somatic_indels.vcf.gz + StrelkaBP_${idSampleTumor}_vs_${idSampleNormal}_somatic_indels.vcf.gz mv Strelka/results/variants/somatic.indels.vcf.gz.tbi \ - Strelka_${idSampleTumor}_vs_${idSampleNormal}_somatic_indels.vcf.gz.tbi + StrelkaBP_${idSampleTumor}_vs_${idSampleNormal}_somatic_indels.vcf.gz.tbi mv Strelka/results/variants/somatic.snvs.vcf.gz \ - Strelka_${idSampleTumor}_vs_${idSampleNormal}_somatic_snvs.vcf.gz + StrelkaBP_${idSampleTumor}_vs_${idSampleNormal}_somatic_snvs.vcf.gz mv Strelka/results/variants/somatic.snvs.vcf.gz.tbi \ - Strelka_${idSampleTumor}_vs_${idSampleNormal}_somatic_snvs.vcf.gz.tbi + StrelkaBP_${idSampleTumor}_vs_${idSampleNormal}_somatic_snvs.vcf.gz.tbi """ } @@ -614,7 +561,7 @@ alleleCountOutput = alleleCountOutput.map { process RunConvertAlleleCounts { tag {idSampleTumor + "_vs_" + idSampleNormal} - publishDir directoryMap.ascat, mode: params.publishDirMode + publishDir "${params.outDir}/VariantCalling/${idPatient}/ASCAT", mode: params.publishDirMode input: set idPatient, idSampleNormal, idSampleTumor, file(alleleCountNormal), file(alleleCountTumor) from alleleCountOutput @@ -636,7 +583,7 @@ process RunConvertAlleleCounts { process RunAscat { tag {idSampleTumor + "_vs_" + idSampleNormal} - publishDir directoryMap.ascat, mode: params.publishDirMode + publishDir "${params.outDir}/VariantCalling/${idPatient}/ASCAT", mode: params.publishDirMode input: set idPatient, idSampleNormal, idSampleTumor, file(bafNormal), file(logrNormal), file(bafTumor), file(logrTumor) from convertAlleleCountsOutput @@ -695,7 +642,7 @@ vcfForQC = Channel.empty().mix( process RunBcftoolsStats { tag {vcf} - publishDir directoryMap.bcftoolsStats, mode: 
params.publishDirMode + publishDir "${params.outDir}/Reports/BCFToolsStats", mode: params.publishDirMode input: set variantCaller, file(vcf) from vcfForBCFtools @@ -718,7 +665,7 @@ bcfReport.close() process RunVcftools { tag {vcf} - publishDir directoryMap.vcftools, mode: params.publishDirMode + publishDir "${params.outDir}/Reports/VCFTools", mode: params.publishDirMode input: set variantCaller, file(vcf) from vcfForVCFtools @@ -739,7 +686,7 @@ if (params.verbose) vcfReport = vcfReport.view { vcfReport.close() process GetVersionAlleleCount { - publishDir directoryMap.version, mode: params.publishDirMode + publishDir "${params.outDir}/Reports/ToolsVersion", mode: params.publishDirMode output: file("v_*.txt") when: 'ascat' in tools && !params.onlyQC @@ -750,7 +697,7 @@ process GetVersionAlleleCount { } process GetVersionASCAT { - publishDir directoryMap.version, mode: params.publishDirMode + publishDir "${params.outDir}/Reports/ToolsVersion", mode: params.publishDirMode output: file("v_*.txt") when: 'ascat' in tools && !params.onlyQC @@ -816,13 +763,6 @@ def defineToolList() { ] } -def generateIntervalsForVC(bams, intervals) { - def (bamsNew, bamsForVC) = bams.into(2) - def (intervalsNew, vcIntervals) = intervals.into(2) - def bamsForVCNew = bamsForVC.combine(vcIntervals) - return [bamsForVCNew, bamsNew, intervalsNew] -} - def grabRevision() { // Return the same string executed from github or not return workflow.revision ?: workflow.commitId ?: workflow.scriptId.substring(0,10)