From 6ddf877f35679d2f40b5d3386c1cba2189eb48cf Mon Sep 17 00:00:00 2001
From: MaxUlysse
Date: Thu, 14 Feb 2019 10:00:00 +0100
Subject: [PATCH 01/22] update submodule

---
 Sarek-data | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Sarek-data b/Sarek-data
index 03b5a18b2b..9087faa53d 160000
--- a/Sarek-data
+++ b/Sarek-data
@@ -1 +1 @@
-Subproject commit 03b5a18b2bdba3dac6307e27a5b5c7e5fec3bd54
+Subproject commit 9087faa53d25fca90c1a84a48cfaf7cbed496317

From 62e5aa20dcc197207713d885678afb808601fc14 Mon Sep 17 00:00:00 2001
From: MaxUlysse
Date: Thu, 14 Feb 2019 11:03:44 +0100
Subject: [PATCH 02/22] add current changes [skip ci]

---
 CHANGELOG.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6ca27045ac..1f4109bc65 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -16,6 +16,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 - [#694](https://github.com/SciLifeLab/Sarek/pull/694) - Add monochrome and grey logos for light or dark background
 - [#698](https://github.com/SciLifeLab/Sarek/pull/698) - Add btb profile for munin server
 - [#702](https://github.com/SciLifeLab/Sarek/pull/702) - Add font-ttf-dejavu-sans-mono `2.37` and fontconfig `2.12.6` to container
+- [#XXX](https://github.com/SciLifeLab/Sarek/pull/XXX) - Add `MULTIPLE` as a test

 ### `Changed`

@@ -30,6 +31,7 @@
 - [#702](https://github.com/SciLifeLab/Sarek/pull/702) - Update FastQC to `0.11.8`
 - [#705](https://github.com/SciLifeLab/Sarek/pull/705) - Change `--TMP_DIR` by `--tmp-dir` for GATK `4.0.9.0` BaseRecalibrator
 - [#706](https://github.com/SciLifeLab/Sarek/pull/706) - Update TravisCI testing
+- [#XXX](https://github.com/SciLifeLab/Sarek/pull/XXX) - Update `Sarek-data` submodule with multiple patients TSV file

 ### `Fixed`

From 1e6ad90a5ffebd796b38857f0c57ef20f8e71c45 Mon Sep 17 00:00:00 2001
From: MaxUlysse
Date: Thu, 14 Feb 2019 15:03:50 +0100
Subject: [PATCH 03/22] add test for MULTIPLE

---
 scripts/test.sh | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/scripts/test.sh b/scripts/test.sh
index ae591fd2c9..9288df2c6b 100755
--- a/scripts/test.sh
+++ b/scripts/test.sh
@@ -125,6 +125,12 @@ then
   clean_repo
 fi

+if [[ MULTIPLE =~ $TEST ]]
+then
+  run_wrapper --somatic --sample Sarek-data/testdata/tsv/tiny-multiple.tsv --variantCalling --tools FreeBayes,HaplotypeCaller,Manta,Mutect2 --noReports
+  run_wrapper --somatic --sample Sarek-data/testdata/tsv/tiny-multiple.tsv --variantCalling --tools Manta,Strelka --noReports --strelkaBP
+fi
+
 if [[ BUILDCONTAINERS =~ $TEST ]] && [[ $PROFILE == docker ]]
 then
   ./scripts/do_all.sh --genome $GENOME

From f3e650cd86bcf4e6765115d4965283f261204172 Mon Sep 17 00:00:00 2001
From: MaxUlysse
Date: Thu, 14 Feb 2019 16:41:22 +0100
Subject: [PATCH 04/22] fix multi TSV sample

---
 somaticVC.nf | 44 +++-----------------------------------------
 1 file changed, 3 insertions(+), 41 deletions(-)

diff --git a/somaticVC.nf b/somaticVC.nf
index 138fe911d3..458052ca29 100644
--- a/somaticVC.nf
+++ b/somaticVC.nf
@@ -69,10 +69,6 @@ if (params.test && params.genome in ['GRCh37', 'GRCh38']) {
   referenceMap.intervals = file("$workflow.projectDir/repeats/tiny_${params.genome}.list")
 }

-// TODO
-// FreeBayes does not need recalibrated BAMs, but we need to test whether
-// the channels are set up correctly when we disable it
-
 tsvPath = ''
 if (params.sample) tsvPath = params.sample
 else
tsvPath = "${directoryMap.recalibrated}/recalibrated.tsv" @@ -101,33 +97,17 @@ if (params.verbose) bamFiles = bamFiles.view { Files : [${it[3].fileName}, ${it[4].fileName}]" } -// assume input is recalibrated, ignore explicitBqsrNeeded -(recalibratedBam, recalTables) = bamFiles.into(2) - -recalTables = recalTables.map{ it + [null] } // null recalibration table means: do not use --BQSR - -recalTables = recalTables.map { [it[0]] + it[2..-1] } // remove status - -if (params.verbose) recalibratedBam = recalibratedBam.view { +if (params.verbose) bamFiles = bamFiles.view { "Recalibrated BAM for variant Calling:\n\ ID : ${it[0]}\tStatus: ${it[1]}\tSample: ${it[2]}\n\ Files : [${it[3].fileName}, ${it[4].fileName}]" } -// Here we have a recalibrated bam set, but we need to separate the bam files based on patient status. -// The sample tsv config file which is formatted like: "subject status sample lane fastq1 fastq2" -// cf fastqFiles channel, I decided just to add _status to the sample name to have less changes to do. -// And so I'm sorting the channel if the sample match _0, then it's a normal sample, otherwise tumor. -// Then combine normal and tumor to get each possibilities -// ie. normal vs tumor1, normal vs tumor2, normal vs tumor3 -// then copy this channel into channels for each variant calling -// I guess it will still work even if we have multiple normal samples - // separate recalibrateBams by status bamsNormal = Channel.create() bamsTumor = Channel.create() -recalibratedBam +bamFiles .choice(bamsTumor, bamsNormal) {it[1] == 0 ? 1 : 0} bamsNormal = bamsNormal.ifEmpty{exit 1, "No normal sample defined, check TSV file: ${tsvFile}"} @@ -214,18 +194,7 @@ if (params.verbose) bedIntervals = bedIntervals.view { " Interv: ${it.baseName}" } -(bamsNormalTemp, bamsNormal, bedIntervals) = generateIntervalsForVC(bamsNormal, bedIntervals) -(bamsTumorTemp, bamsTumor, bedIntervals) = generateIntervalsForVC(bamsTumor, bedIntervals) - -bamsAll = bamsNormal.combine(bamsTumor) - -// Since idPatientNormal and idPatientTumor are the same -// It's removed from bamsAll Channel (same for genderNormal) -// /!\ It is assumed that every sample are from the same patient -bamsAll = bamsAll.map { - idPatientNormal, idSampleNormal, bamNormal, baiNormal, idPatientTumor, idSampleTumor, bamTumor, baiTumor -> - [idPatientNormal, idSampleNormal, bamNormal, baiNormal, idSampleTumor, bamTumor, baiTumor] -} +bamsAll = bamsNormal.join(bamsTumor) // Manta and Strelka (bamsForManta, bamsForStrelka, bamsForStrelkaBP, bamsAll) = bamsAll.into(4) @@ -816,13 +785,6 @@ def defineToolList() { ] } -def generateIntervalsForVC(bams, intervals) { - def (bamsNew, bamsForVC) = bams.into(2) - def (intervalsNew, vcIntervals) = intervals.into(2) - def bamsForVCNew = bamsForVC.combine(vcIntervals) - return [bamsForVCNew, bamsNew, intervalsNew] -} - def grabRevision() { // Return the same string executed from github or not return workflow.revision ?: workflow.commitId ?: workflow.scriptId.substring(0,10) From f5c265abf8c8a92e0b58ac00e92b8c1818a18408 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Thu, 14 Feb 2019 16:48:14 +0100 Subject: [PATCH 05/22] update CHANGELOG --- CHANGELOG.md | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ec915c138f..0015e7f7ba 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,22 +9,23 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. 
### `Added` - [#712](https://github.com/SciLifeLab/Sarek/pull/712), [#718](https://github.com/SciLifeLab/Sarek/pull/718) - Added possibilities to run Sarek with `conda` +- [#719](https://github.com/SciLifeLab/Sarek/pull/719) - Annotation documentation +- [#719](https://github.com/SciLifeLab/Sarek/pull/719) - Helper script to download `snpeff` and `VEP` cache files +- [#719](https://github.com/SciLifeLab/Sarek/pull/719) - New `--annotation_cache`, `--snpEff_cache`, `--vep_cache` parameters +- [#719](https://github.com/SciLifeLab/Sarek/pull/719) - Possibility to use cache wen annotating with `snpEff` and `VEP` +- [#728](https://github.com/SciLifeLab/Sarek/pull/728) - Update `Sarek-data` submodule with multiple patients TSV file ### `Changed` - [#710](https://github.com/SciLifeLab/Sarek/pull/710) - Improve release checklist and script - [#711](https://github.com/SciLifeLab/Sarek/pull/711) - Improve configuration priorities -- [#719](https://github.com/SciLifeLab/Sarek/pull/719) - `vepCacheVersion` is now defined in `conf/genomes.config` or `conf/igenomes.config` -- [#719](https://github.com/SciLifeLab/Sarek/pull/719) - `snpeff` and `vep` containers are now built with conda - [#716](https://github.com/SciLifeLab/Sarek/pull/716) - Update paths to containers and iGenomes +- [#719](https://github.com/SciLifeLab/Sarek/pull/719) - `snpeff` and `vep` containers are now built with conda +- [#719](https://github.com/SciLifeLab/Sarek/pull/719) - `vepCacheVersion` is now defined in `conf/genomes.config` or `conf/igenomes.config` +- [#722](https://github.com/SciLifeLab/Sarek/pull/722) - Add path to ASCAT `.gc` file in `igenomes.config` +- [#722](https://github.com/SciLifeLab/Sarek/pull/722) - Update `Sarek-data` submodule - [#724](https://github.com/SciLifeLab/Sarek/pull/724) - Improved AwsBatch configuration -### `Added` -- [#719](https://github.com/SciLifeLab/Sarek/pull/719) - Possibility to use cache wen annotating with `snpEff` and `VEP` -- [#719](https://github.com/SciLifeLab/Sarek/pull/719) - New `--annotation_cache`, `--snpEff_cache`, `--vep_cache` parameters -- [#719](https://github.com/SciLifeLab/Sarek/pull/719) - Helper script to download `snpeff` and `VEP` cache files -- [#719](https://github.com/SciLifeLab/Sarek/pull/719) - Annotation documentation - ### `Removed` - [#715](https://github.com/SciLifeLab/Sarek/pull/715) - Remove `defReferencesFiles` function from `buildReferences.nf` - [#719](https://github.com/SciLifeLab/Sarek/pull/719) - `snpEff` base container is no longer used @@ -32,6 +33,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. ### `Fixed` - [#720](https://github.com/SciLifeLab/Sarek/pull/720) - bamQC is now run on the recalibrated bams, and not after MarkDuplicates +- [#728](https://github.com/SciLifeLab/Sarek/pull/728) - Fix multi sample TSV file [#691](https://github.com/SciLifeLab/Sarek/issues/691) ## [2.2.2] - 2018-12-19 @@ -58,7 +60,6 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. 
- [#702](https://github.com/SciLifeLab/Sarek/pull/702) - Update FastQC to `0.11.8` - [#705](https://github.com/SciLifeLab/Sarek/pull/705) - Change `--TMP_DIR` by `--tmp-dir` for GATK `4.0.9.0` BaseRecalibrator - [#706](https://github.com/SciLifeLab/Sarek/pull/706) - Update TravisCI testing -- [#XXX](https://github.com/SciLifeLab/Sarek/pull/XXX) - Update `Sarek-data` submodule with multiple patients TSV file ### `Fixed` From 3624ff50bbf58c66d4598d20c04b4251b187ec59 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Fri, 15 Feb 2019 15:17:15 +0100 Subject: [PATCH 06/22] change output configuration for VCFs --- annotate.nf | 26 +++++++++++--------------- conf/travis.config | 8 ++++++++ germlineVC.nf | 6 +++--- lib/SarekUtils.groovy | 21 ++++++++++----------- somaticVC.nf | 22 ++++++++-------------- 5 files changed, 40 insertions(+), 43 deletions(-) diff --git a/annotate.nf b/annotate.nf index 73eef9eb4c..ddff03f4a7 100644 --- a/annotate.nf +++ b/annotate.nf @@ -70,15 +70,15 @@ vcfNotToAnnotate = Channel.create() if (annotateVCF == []) { // we annote all available vcfs by default that we can find in the VariantCalling directory Channel.empty().mix( - Channel.fromPath("${directoryMap.haplotypecaller}/*.vcf.gz") + Channel.fromPath("${params.outDir}/VariantCalling/*/${directoryMap.haplotypecaller}/*.vcf.gz") .flatten().map{vcf -> ['haplotypecaller', vcf]}, - Channel.fromPath("${directoryMap.manta}/*SV.vcf.gz") + Channel.fromPath("${params.outDir}/VariantCalling/*/${directoryMap.manta}/*SV.vcf.gz") .flatten().map{vcf -> ['manta', vcf]}, - Channel.fromPath("${directoryMap.mutect2}/*.vcf.gz") + Channel.fromPath("${params.outDir}/VariantCalling/*/${directoryMap.mutect2}/*.vcf.gz") .flatten().map{vcf -> ['mutect2', vcf]}, - Channel.fromPath("${directoryMap.strelka}/*{somatic,variants}*.vcf.gz") // Strelka only + Channel.fromPath("${params.outDir}/VariantCalling/*/${directoryMap.strelka}/*{somatic,variants}*.vcf.gz") // Strelka only .flatten().map{vcf -> ['strelka', vcf]}, - Channel.fromPath("${directoryMap.strelkabp}/*{somatic,variants}*.vcf.gz") // Strelka with Manta indel candidates + Channel.fromPath("${params.outDir}/VariantCalling/*/${directoryMap.strelkabp}/*{somatic,variants}*.vcf.gz") // Strelka with Manta indel candidates .flatten().map{vcf -> ['strelkabp', vcf]} ).choice(vcfToAnnotate, vcfNotToAnnotate) { annotateTools == [] || (annotateTools != [] && it[0] in annotateTools) ? 0 : 1 @@ -147,20 +147,18 @@ if (params.verbose) vcfReport = vcfReport.view { "Files : [${it.fileName}]" } -snpEff_cache = params.snpEff_cache ? params.snpEff_cache : "null" - process RunSnpeff { tag {"${variantCaller} - ${vcf}"} publishDir params.outDir, mode: params.publishDirMode, saveAs: { - if (it == "${vcf.simpleName}_snpEff.csv") "${directoryMap.snpeffReports.minus(params.outDir+'/')}/${it}" + if (it == "${vcf.simpleName}_snpEff.csv") "${params.outDir}/Reports/${directoryMap.snpeff}/${it}" else if (it == "${vcf.simpleName}_snpEff.ann.vcf") null - else "${directoryMap.snpeff.minus(params.outDir+'/')}/${it}" + else "${params.outDir}/Annotation/${directoryMap.snpeff}/${it}" } input: set variantCaller, file(vcf) from vcfForSnpeff - file dataDir from Channel.fromPath(snpEff_cache, type: 'dir') + file dataDir from Channel.value(params.snpEff_cache ? params.snpEff_cache : "null") val snpeffDb from Channel.value(params.genomes[params.genome].snpeffDb) output: @@ -204,19 +202,17 @@ if('merge' in tools) { ) } -vep_cache = params.vep_cache ? 
params.vep_cache : "null" - process RunVEP { tag {"${variantCaller} - ${vcf}"} publishDir params.outDir, mode: params.publishDirMode, saveAs: { - if (it == "${vcf.simpleName}_VEP.summary.html") "${directoryMap.vep.minus(params.outDir+'/')}/${it}" + if (it == "${vcf.simpleName}_VEP.summary.html") "${params.outDir}/Annotation/${directoryMap.vep}/${it}" else null } input: set annotator, variantCaller, file(vcf), file(idx) from vcfForVep - file dataDir from Channel.fromPath(vep_cache, type: 'dir') + file dataDir from Channel.value(params.vep_cache ? params.vep_cache : "null") val cache_version from Channel.value(params.genomes[params.genome].vepCacheVersion) output: @@ -260,7 +256,7 @@ vcfToCompress = snpeffVCF.mix(vepVCF) process CompressVCF { tag {"${annotator} - ${vcf}"} - publishDir "${directoryMap."$finalannotator"}", mode: params.publishDirMode + publishDir "${params.outDir}/Annotation/${directoryMap."$finalannotator"}", mode: params.publishDirMode input: set annotator, variantCaller, file(vcf) from vcfToCompress diff --git a/conf/travis.config b/conf/travis.config index 3bf97ca817..8865582d65 100644 --- a/conf/travis.config +++ b/conf/travis.config @@ -19,3 +19,11 @@ process { cpus = params.max_cpus memory = params.max_memory } + +withName:RunVEP { + maxForks = 1 +} + +withName:RunSnpeff { + maxForks = 1 +} diff --git a/germlineVC.nf b/germlineVC.nf index 7c32373d6e..ac40dbb398 100644 --- a/germlineVC.nf +++ b/germlineVC.nf @@ -318,7 +318,7 @@ if (params.verbose) vcfsToMerge = vcfsToMerge.view { process ConcatVCF { tag {variantCaller + "-" + idSampleNormal} - publishDir "${directoryMap."$variantCaller"}", mode: params.publishDirMode + publishDir "${params.outDir}/VariantCalling/${idPatient}/${directoryMap."$variantCaller"}", mode: params.publishDirMode input: set variantCaller, idPatient, idSampleNormal, idSampleTumor, file(vcFiles) from vcfsToMerge @@ -356,7 +356,7 @@ if (params.verbose) vcfConcatenated = vcfConcatenated.view { process RunSingleStrelka { tag {idSample} - publishDir directoryMap.strelka, mode: params.publishDirMode + publishDir "${params.outDir}/VariantCalling/${idPatient}/${directoryMap.strelka}", mode: params.publishDirMode input: set idPatient, status, idSample, file(bam), file(bai) from bamsForSingleStrelka @@ -409,7 +409,7 @@ if (params.verbose) singleStrelkaOutput = singleStrelkaOutput.view { process RunSingleManta { tag {idSample + " - Single Diploid"} - publishDir directoryMap.manta, mode: params.publishDirMode + publishDir "${params.outDir}/VariantCalling/${idPatient}/${directoryMap.manta}", mode: params.publishDirMode input: set idPatient, status, idSample, file(bam), file(bai) from bamsForSingleManta diff --git a/lib/SarekUtils.groovy b/lib/SarekUtils.groovy index 9b69a3aa17..87fc5587a4 100644 --- a/lib/SarekUtils.groovy +++ b/lib/SarekUtils.groovy @@ -137,23 +137,22 @@ class SarekUtils { return [ 'duplicateMarked' : "${outDir}/Preprocessing/DuplicateMarked", 'recalibrated' : "${outDir}/Preprocessing/Recalibrated", - 'ascat' : "${outDir}/VariantCalling/Ascat", - 'freebayes' : "${outDir}/VariantCalling/FreeBayes", - 'gvcf-hc' : "${outDir}/VariantCalling/HaplotypeCallerGVCF", - 'haplotypecaller' : "${outDir}/VariantCalling/HaplotypeCaller", - 'manta' : "${outDir}/VariantCalling/Manta", - 'mutect2' : "${outDir}/VariantCalling/MuTect2", - 'strelka' : "${outDir}/VariantCalling/Strelka", - 'strelkabp' : "${outDir}/VariantCalling/StrelkaBP", - 'snpeff' : "${outDir}/Annotation/SnpEff", - 'vep' : "${outDir}/Annotation/VEP", + 'ascat' : "Ascat", + 'freebayes' : 
"FreeBayes", + 'gvcf-hc' : "HaplotypeCallerGVCF", + 'haplotypecaller' : "HaplotypeCaller", + 'manta' : "Manta", + 'mutect2' : "MuTect2", + 'strelka' : "Strelka", + 'strelkabp' : "StrelkaBP", + 'snpeff' : "SnpEff", + 'vep' : "VEP", 'bamQC' : "${outDir}/Reports/bamQC", 'bcftoolsStats' : "${outDir}/Reports/BCFToolsStats", 'fastQC' : "${outDir}/Reports/FastQC", 'markDuplicatesQC' : "${outDir}/Reports/MarkDuplicates", 'multiQC' : "${outDir}/Reports/MultiQC", 'samtoolsStats' : "${outDir}/Reports/SamToolsStats", - 'snpeffReports' : "${outDir}/Reports/SnpEff", 'vcftools' : "${outDir}/Reports/VCFTools", 'version' : "${outDir}/Reports/ToolsVersion" ] diff --git a/somaticVC.nf b/somaticVC.nf index 1b9e13dbbc..b0d84ba2e3 100644 --- a/somaticVC.nf +++ b/somaticVC.nf @@ -92,13 +92,7 @@ if (tsvPath) { startMessage() if (params.verbose) bamFiles = bamFiles.view { - "BAMs to process:\n\ - ID : ${it[0]}\tStatus: ${it[1]}\tSample: ${it[2]}\n\ - Files : [${it[3].fileName}, ${it[4].fileName}]" -} - -if (params.verbose) bamFiles = bamFiles.view { - "Recalibrated BAM for variant Calling:\n\ + "BAMs for variant Calling:\n\ ID : ${it[0]}\tStatus: ${it[1]}\tSample: ${it[2]}\n\ Files : [${it[3].fileName}, ${it[4].fileName}]" } @@ -286,7 +280,7 @@ if (params.verbose) vcfsToMerge = vcfsToMerge.view { process ConcatVCF { tag {variantCaller + "_" + idSampleTumor + "_vs_" + idSampleNormal} - publishDir "${directoryMap."$variantCaller"}", mode: params.publishDirMode + publishDir "${params.outDir}/VariantCalling/${idPatient}/${directoryMap."$variantCaller"}", mode: params.publishDirMode input: set variantCaller, idPatient, idSampleNormal, idSampleTumor, file(vcFiles) from vcfsToMerge @@ -321,7 +315,7 @@ if (params.verbose) vcfConcatenated = vcfConcatenated.view { process RunStrelka { tag {idSampleTumor + "_vs_" + idSampleNormal} - publishDir directoryMap.strelka, mode: params.publishDirMode + publishDir "${params.outDir}/VariantCalling/${idPatient}/${directoryMap.strelka}", mode: params.publishDirMode input: set idPatient, idSampleNormal, file(bamNormal), file(baiNormal), idSampleTumor, file(bamTumor), file(baiTumor) from bamsForStrelka @@ -378,7 +372,7 @@ if (params.verbose) strelkaOutput = strelkaOutput.view { process RunManta { tag {idSampleTumor + "_vs_" + idSampleNormal} - publishDir directoryMap.manta, mode: params.publishDirMode + publishDir "${params.outDir}/VariantCalling/${idPatient}/${directoryMap.manta}", mode: params.publishDirMode input: set idPatient, idSampleNormal, file(bamNormal), file(baiNormal), idSampleTumor, file(bamTumor), file(baiTumor) from bamsForManta @@ -432,7 +426,7 @@ if (params.verbose) mantaOutput = mantaOutput.view { process RunSingleManta { tag {idSample + " - Tumor-Only"} - publishDir directoryMap.manta, mode: params.publishDirMode + publishDir "${params.outDir}/VariantCalling/${idPatient}/${directoryMap.manta}", mode: params.publishDirMode input: set idPatient, status, idSample, file(bam), file(bai) from bamsForSingleManta @@ -491,7 +485,7 @@ bamsForStrelkaBP = bamsForStrelkaBP.map { process RunStrelkaBP { tag {idSampleTumor + "_vs_" + idSampleNormal} - publishDir directoryMap.strelkabp, mode: params.publishDirMode + publishDir "${params.outDir}/VariantCalling/${idPatient}/${directoryMap.strelkabp}", mode: params.publishDirMode input: set idPatient, idSampleNormal, file(bamNormal), file(baiNormal), idSampleTumor, file(bamTumor), file(baiTumor), file(mantaCSI), file(mantaCSIi) from bamsForStrelkaBP @@ -583,7 +577,7 @@ alleleCountOutput = alleleCountOutput.map { process 
RunConvertAlleleCounts { tag {idSampleTumor + "_vs_" + idSampleNormal} - publishDir directoryMap.ascat, mode: params.publishDirMode + publishDir "${params.outDir}/VariantCalling/${idPatient}/${directoryMap.ascat}", mode: params.publishDirMode input: set idPatient, idSampleNormal, idSampleTumor, file(alleleCountNormal), file(alleleCountTumor) from alleleCountOutput @@ -605,7 +599,7 @@ process RunConvertAlleleCounts { process RunAscat { tag {idSampleTumor + "_vs_" + idSampleNormal} - publishDir directoryMap.ascat, mode: params.publishDirMode + publishDir "${params.outDir}/VariantCalling/${idPatient}/${directoryMap.ascat}", mode: params.publishDirMode input: set idPatient, idSampleNormal, idSampleTumor, file(bafNormal), file(logrNormal), file(bafTumor), file(logrTumor) from convertAlleleCountsOutput From 884a1099edccc18a17beeb4b619f014a44d02b86 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Mon, 18 Feb 2019 16:12:23 +0100 Subject: [PATCH 07/22] update documentation [skip ci] --- docs/OUTPUT.md | 38 ++++++++++++++++++++++---------------- 1 file changed, 22 insertions(+), 16 deletions(-) diff --git a/docs/OUTPUT.md b/docs/OUTPUT.md index eb0040ca1f..a2039b721c 100644 --- a/docs/OUTPUT.md +++ b/docs/OUTPUT.md @@ -29,7 +29,7 @@ Some of the Manta VCF files are not always succeed in going through the VEP filt The HTML summary files show general statistics and quality-related measures. In the header of the annotated VCF files one can find the VEP/Ensembl version used for annotation, also the version numbers for additional databases like Clinvar or dbSNP used in the "VEP" line. -The format of the [consequence annotations][VEP-predictions] is also in the VCF header describing the INFO field. +The format of the [consequence annotations][VEP-predictions] is also in the VCF header describing the INFO field. In the moment it contains: * Consequence: impact of the variation, if there is any * Codons: the codon change, i.e. cGt/cAt @@ -53,11 +53,12 @@ The preprocessing is following the [GATK Best Practices][GATK-BP] to obtain alig This is the place for the BAM file delivered to users: besides the duplicatemarked files the recalibration tables are also stored (`*.recal.table`), these can be used to create base recalibrated files. The `.tsv` file is autogenerated also, these can be used by Sarek for further processing and/or variant calling. -The BAM file headers contain the details about the actual command-line arguments for mapping, merging, use `samtools view -H ` to view the used reference, read groups etc. +The BAM file headers contain the details about the actual command-line arguments for mapping, merging, use `samtools view -H ` to view the used reference, read groups etc. ### Recalibrated: -This directory is usually empty, it is the location for the final recalibrated files in the preprocessing pipeline: recalibrated BAMs are usually 2-3 times larger than the duplicatemarked files. To re-generate recalibrated BAMs you have to apply the recalibration table delivered to the `NonRecalibrated` directory either by calling Sarek, or doing this [recalibration step][BQSR-link] yourself. +This directory is usually empty, it is the location for the final recalibrated files in the preprocessing pipeline: recalibrated BAMs are usually 2-3 times larger than the duplicatemarked files. +To re-generate recalibrated BAMs you have to apply the recalibration table delivered to the `NonRecalibrated` directory either by calling Sarek, or doing this [recalibration step][BQSR-link] yourself. 
--- ## Reports: @@ -65,7 +66,7 @@ This directory is usually empty, it is the location for the final recalibrated f The `Reports` directory is the place for collecting outputs for different quality control (QC) software; going through these files can help us to decide whether the sequencing and the workflow was successful, or further steps are needed to get meaningful results. The main entry point it the [MultiQC][multiqc-link] directory: the HTML index file aggregates and visualizes all the software use for QC. -### MultiQC +### MultiQC To assess the quality of the sequencing and workflow the best start is to view at the `Reports/MultiQC/multiqc_report.html` file of the `MultiQC` directory, where the statistics and graphics of all the software below should be presented. The actual graphs and the tables are configurable, and generally much easier to view than the raw output of the individual software. The subsequent QC compartments are: @@ -73,25 +74,30 @@ The subsequent QC compartments are: * bamQC: [Qualimap][qualimap-link] examines sequencing alignment data in SAM/BAM files according to the features of the mapped reads and provides an overall view of the data provides quality control statistics about aligned BAM files * BCFToolsStats: [bcftools][bcftools] measuring non-reference allele frequency, depth distribution, stats by quality and per-sample counts, singleton stats, etc. of VCF files. * [FastQC][fastqc]: provides statistics about the raw FASTQ files only. -* MarkDuplicates: a [Picard][picard-md] tool to tag PCR/optical duplicates from aligned BAM data -* SamToolsStats: [samtools][samtools] collection of statistics from BAM files +* MarkDuplicates: a [Picard][picard-md] tool to tag PCR/optical duplicates from aligned BAM data. +* SamToolsStats: [samtools][samtools] collection of statistics from BAM files. --- -## VariantCallings: +## VariantCalling: -All the raw results regarding variant-calling are collected in this directory. Not all the software below are producing VCF files, also both somatic and germline +All the raw results regarding variant-calling are collected in this directory. +Not all the software below are producing VCF files, also both somatic and germline variants are collected in this directory. -* [Ascat][ascat]: is a method to derive copy number profiles of tumour cells, accounting for normal cell admixture and tumour aneuploidy. This direcory contains the graphical output of the software, CNV, ploidy and sample purity estimations. -* [FreeBayes][freebayes]: is for Bayesian haplotype-based genetic polymorphism discovery and genotyping. The single VCF file generated by FreeBayes -is huge, it is recommended to flatten and filter this VCF, i.e. using the provided [SpeedSeq][speedseq] filter +* [Ascat][ascat]: is a method to derive copy number profiles of tumour cells, accounting for normal cell admixture and tumour aneuploidy. +This directory contains the graphical output of the software, CNV, ploidy and sample purity estimations. +* [FreeBayes][freebayes]: is for Bayesian haplotype-based genetic polymorphism discovery and genotyping. +The single VCF file generated by FreeBayes is huge, it is recommended to flatten and filter this VCF, i.e. using the provided [SpeedSeq][speedseq] filter. 
* [HaplotypeCaller][haplotypecaller] is the in-house germline caller of the Broad Institute, the non-recalibrated variant files are there to check the -germline variations and compare the two samples (tumour and normal) for possible mixup -* HaplotypeCallerGVCF: germline calls in [gVCF format][genomicvcf] even for the tumour sample: this format makes possible the joint analysis of a cohort -* [Manta][manta]: is a structural variant caller supported by Illumina. There are several output files, corresponding to germline (diploid) calls, candidate calls and somatic files. +germline variations and compare the two samples (tumour and normal) for possible mixup. +* HaplotypeCallerGVCF: germline calls in [gVCF format][genomicvcf] even for the tumour sample: this format makes possible the joint analysis of a cohort. +* [Manta][manta]: is a structural variant caller supported by Illumina. +There are several output files, corresponding to germline (diploid) calls, candidate calls and somatic files. Manta provides a candidate list for small indels also that can be fed to Strelka. -* [MuTect2][mutect2] is the current somatic caller of GATK for both SNPs and indels. Recommended to keep only lines with the "PASS" filter. -* [Strelka2][strelka2] is somatic SNP and indel caller supported by Illumina. Strelka gives filtered and unfiltered calls for SNPs and indels separately, together with germline calls. +* [MuTect2][mutect2] is the current somatic caller of GATK for both SNPs and indels. +Recommended to keep only lines with the "PASS" filter. +* [Strelka2][strelka2] is somatic SNP and indel caller supported by Illumina. +Strelka gives filtered and unfiltered calls for SNPs and indels separately, together with germline calls. [ascat]:https://www.crick.ac.uk/research/a-z-researchers/researchers-v-y/peter-van-loo/software/ [bcftools]: http://www.htslib.org/doc/bcftools.html From 4cba4fee46e7ccf9b6cf03e05e158f06237bb370 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Mon, 18 Feb 2019 16:12:55 +0100 Subject: [PATCH 08/22] test annotation on only one vcf file --- scripts/test.sh | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/scripts/test.sh b/scripts/test.sh index 9288df2c6b..25affda709 100755 --- a/scripts/test.sh +++ b/scripts/test.sh @@ -94,7 +94,7 @@ if [[ ALL,GERMLINE =~ $TEST ]] then # Added Strelka to germline test (no Strelka best practices test for this small data) and not asking for reports run_wrapper --germline --sampleDir Sarek-data/testdata/tiny/normal --variantCalling --tools HaplotypeCaller,Strelka --noReports - run_wrapper --germline --sampleDir Sarek-data/testdata/tiny/normal --variantCalling --tools HaplotypeCaller,Strelka --bed `pwd`/Sarek-data/testdata/target.bed --noReports + run_wrapper --germline --sampleDir Sarek-data/testdata/tiny/normal --variantCalling --tools HaplotypeCaller,Strelka --bed Sarek-data/testdata/target.bed --noReports run_wrapper --germline --step recalibrate --noReports clean_repo fi @@ -120,8 +120,7 @@ then then ANNOTATOR=merge,snpEFF,VEP fi - run_wrapper --annotate --tools ${ANNOTATOR} --annotateVCF Sarek-data/testdata/vcf/Strelka_1234N_variants.vcf.gz --noReports - run_wrapper --annotate --tools ${ANNOTATOR} --annotateVCF Sarek-data/testdata/vcf/Strelka_1234N_variants.vcf.gz,Sarek-data/testdata/vcf/Strelka_9876T_variants.vcf.gz + run_wrapper --annotate --tools ${ANNOTATOR} --annotateVCF Sarek-data/testdata/vcf/Strelka_1234N_variants.vcf.gz clean_repo fi From d7f15ecc8ca7e2a100923f01c929803854cd18c1 Mon Sep 17 00:00:00 2001 From: MaxUlysse 
Date: Mon, 18 Feb 2019 16:13:37 +0100 Subject: [PATCH 09/22] annotated VCFs are now ordered by idPatient (if it exists) --- annotate.nf | 117 ++++++++++++++++++++++++++++++++-------------------- 1 file changed, 72 insertions(+), 45 deletions(-) diff --git a/annotate.nf b/annotate.nf index ddff03f4a7..a45dc11b5b 100644 --- a/annotate.nf +++ b/annotate.nf @@ -46,10 +46,9 @@ if (workflow.profile == 'awsbatch') { if(!params.awsqueue) exit 1, "Provide the job queue for aws batch!" } - -tools = params.tools ? params.tools.split(',').collect{it.trim().toLowerCase()} : [] annotateTools = params.annotateTools ? params.annotateTools.split(',').collect{it.trim().toLowerCase()} : [] annotateVCF = params.annotateVCF ? params.annotateVCF.split(',').collect{it.trim()} : [] +tools = params.tools ? params.tools.split(',').collect{it.trim().toLowerCase()} : [] directoryMap = SarekUtils.defineDirectoryMap(params.outDir) toolList = defineToolList() @@ -68,30 +67,30 @@ vcfToAnnotate = Channel.create() vcfNotToAnnotate = Channel.create() if (annotateVCF == []) { -// we annote all available vcfs by default that we can find in the VariantCalling directory +// Sarek, by default, annotates all available vcfs that it can find in the VariantCalling directory +// Excluding vcfs from FreeBayes, and g.vcf from HaplotypeCaller +// Basically it's: VariantCalling/*/{HaplotypeCaller,Manta,MuTect2,Strelka,StrelkaBP}/*.vcf.gz +// Without *SmallIndels.vcf.gz from Manta, and *.genome.vcf.gz from Strelka +// This small snipet `vcf.minus(vcf.fileName)[-2]` catches idPatient +// This field is used to output final annotated VCFs in the correct directory Channel.empty().mix( - Channel.fromPath("${params.outDir}/VariantCalling/*/${directoryMap.haplotypecaller}/*.vcf.gz") - .flatten().map{vcf -> ['haplotypecaller', vcf]}, - Channel.fromPath("${params.outDir}/VariantCalling/*/${directoryMap.manta}/*SV.vcf.gz") - .flatten().map{vcf -> ['manta', vcf]}, - Channel.fromPath("${params.outDir}/VariantCalling/*/${directoryMap.mutect2}/*.vcf.gz") - .flatten().map{vcf -> ['mutect2', vcf]}, - Channel.fromPath("${params.outDir}/VariantCalling/*/${directoryMap.strelka}/*{somatic,variants}*.vcf.gz") // Strelka only - .flatten().map{vcf -> ['strelka', vcf]}, - Channel.fromPath("${params.outDir}/VariantCalling/*/${directoryMap.strelkabp}/*{somatic,variants}*.vcf.gz") // Strelka with Manta indel candidates - .flatten().map{vcf -> ['strelkabp', vcf]} + Channel.fromPath("${params.outDir}/VariantCalling/*/HaplotypeCaller/*.vcf.gz") + .flatten().map{vcf -> ['haplotypecaller', vcf.minus(vcf.fileName)[-2], vcf]}, + Channel.fromPath("${params.outDir}/VariantCalling/*/Manta/*SV.vcf.gz") + .flatten().map{vcf -> ['manta', vcf.minus(vcf.fileName)[-2], vcf]}, + Channel.fromPath("${params.outDir}/VariantCalling/*/MuTect2/*.vcf.gz") + .flatten().map{vcf -> ['mutect2', vcf.minus(vcf.fileName)[-2], vcf]}, + Channel.fromPath("${params.outDir}/VariantCalling/*/Strelka/*{somatic,variant}*.vcf.gz") // Strelka only + .flatten().map{vcf -> ['strelka', vcf.minus(vcf.fileName)[-2], vcf]}, + Channel.fromPath("${params.outDir}/VariantCalling/*/StrelkaBP/*{somatic,variant}*.vcf.gz") // Strelka with Manta indel candidates + .flatten().map{vcf -> ['strelkabp', vcf.minus(vcf.fileName)[-2], vcf]} ).choice(vcfToAnnotate, vcfNotToAnnotate) { annotateTools == [] || (annotateTools != [] && it[0] in annotateTools) ? 
0 : 1 } } else if (annotateTools == []) { // alternatively, annotate user-submitted VCFs - list = "" - annotateVCF.each{ list += ",${it}" } - list = list.substring(1) - if (StringUtils.countMatches("${list}", ",") == 0) vcfToAnnotate = Channel.fromPath("${list}") - .map{vcf -> ['userspecified', vcf]} - else vcfToAnnotate = Channel.fromPath("{$list}") - .map{vcf -> ['userspecified', vcf]} + vcfToAnnotate = Channel.fromPath(annotateVCF) + .map{vcf -> ['userspecified', '', vcf]} } else exit 1, "specify only tools or files to annotate, not both" vcfNotToAnnotate.close() @@ -101,17 +100,20 @@ vcfNotToAnnotate.close() (vcfForBCFtools, vcfForVCFtools, vcfForSnpeff, vcfForVep) = vcfToAnnotate.into(4) vcfForVep = vcfForVep.map { - variantCaller, vcf -> - ["vep", variantCaller, vcf, null] + variantCaller, idPatient, vcf -> + ["VEP", variantCaller, idPatient, vcf, null] } process RunBcftoolsStats { - tag {vcf} + tag { idPatient != "" + ? "${idPatient} - ${vcf}" + : "${vcf}" + } publishDir directoryMap.bcftoolsStats, mode: params.publishDirMode input: - set variantCaller, file(vcf) from vcfForBCFtools + set variantCaller, idPatient, file(vcf) from vcfForBCFtools output: file ("*.bcf.tools.stats.out") into bcfReport @@ -127,12 +129,15 @@ if (params.verbose) bcfReport = bcfReport.view { } process RunVcftools { - tag {vcf} + tag { idPatient != "" + ? "${idPatient} - ${variantCaller} - ${vcf}" + : "${variantCaller} - ${vcf}" + } publishDir directoryMap.vcftools, mode: params.publishDirMode input: - set variantCaller, file(vcf) from vcfForVCFtools + set variantCaller, idPatient, file(vcf) from vcfForVCFtools output: file ("${vcf.simpleName}.*") into vcfReport @@ -148,22 +153,25 @@ if (params.verbose) vcfReport = vcfReport.view { } process RunSnpeff { - tag {"${variantCaller} - ${vcf}"} + tag { idPatient != "" + ? "${idPatient} - ${variantCaller} - ${vcf}" + : "${variantCaller} - ${vcf}" + } publishDir params.outDir, mode: params.publishDirMode, saveAs: { - if (it == "${vcf.simpleName}_snpEff.csv") "${params.outDir}/Reports/${directoryMap.snpeff}/${it}" - else if (it == "${vcf.simpleName}_snpEff.ann.vcf") null - else "${params.outDir}/Annotation/${directoryMap.snpeff}/${it}" + if (it == "${vcf.simpleName}_snpEff.ann.vcf") null + else if (idPatient != "") "Annotation/${idPatient}/snpEff/${it}" + else "Annotation/snpEff/${it}" } input: - set variantCaller, file(vcf) from vcfForSnpeff + set variantCaller, idPatient, file(vcf) from vcfForSnpeff file dataDir from Channel.value(params.snpEff_cache ? params.snpEff_cache : "null") val snpeffDb from Channel.value(params.genomes[params.genome].snpeffDb) output: set file("${vcf.simpleName}_snpEff.genes.txt"), file("${vcf.simpleName}_snpEff.csv"), file("${vcf.simpleName}_snpEff.summary.html") into snpeffOutput - set val("snpeff"), variantCaller, file("${vcf.simpleName}_snpEff.ann.vcf") into snpeffVCF + set val("snpEff"), variantCaller, idPatient, file("${vcf.simpleName}_snpEff.ann.vcf") into snpeffVCF when: 'snpeff' in tools || 'merge' in tools @@ -203,26 +211,32 @@ if('merge' in tools) { } process RunVEP { - tag {"${variantCaller} - ${vcf}"} + tag { idPatient != "" + ? 
"${idPatient} - ${variantCaller} - ${vcf}" + : "${variantCaller} - ${vcf}" + } publishDir params.outDir, mode: params.publishDirMode, saveAs: { - if (it == "${vcf.simpleName}_VEP.summary.html") "${params.outDir}/Annotation/${directoryMap.vep}/${it}" + if (it == "${vcf.simpleName}_VEP.summary.html") { + if (idPatient != "") "Annotation/${idPatient}/VEP/${it}" + else "Annotation/VEP/${it}" + } else null } input: - set annotator, variantCaller, file(vcf), file(idx) from vcfForVep + set annotator, variantCaller, idPatient, file(vcf), file(idx) from vcfForVep file dataDir from Channel.value(params.vep_cache ? params.vep_cache : "null") val cache_version from Channel.value(params.genomes[params.genome].vepCacheVersion) output: - set finalannotator, variantCaller, file("${vcf.simpleName}_VEP.ann.vcf") into vepVCF + set finalAnnotator, variantCaller, idPatient, file("${vcf.simpleName}_VEP.ann.vcf") into vepVCF file("${vcf.simpleName}_VEP.summary.html") into vepReport when: 'vep' in tools || 'merge' in tools script: - finalannotator = annotator == "snpeff" ? 'merge' : 'vep' + finalAnnotator = annotator == "snpEff" ? 'merge' : 'VEP' genome = params.genome == 'smallGRCh37' ? 'GRCh37' : params.genome cache = (params.vep_cache && params.annotation_cache) ? "--dir_cache \${PWD}/${dataDir}" : "--dir_cache /.vep" """ @@ -254,18 +268,25 @@ if (params.verbose) vepReport = vepReport.view { vcfToCompress = snpeffVCF.mix(vepVCF) process CompressVCF { - tag {"${annotator} - ${vcf}"} + tag { idPatient != "" + ? "${idPatient} - ${annotator} - ${vcf}" + : "${annotator} - ${vcf}" + } - publishDir "${params.outDir}/Annotation/${directoryMap."$finalannotator"}", mode: params.publishDirMode + publishDir params.outDir, mode: params.publishDirMode, saveAs: { + idPatient != "" + ? "Annotation/${idPatient}/${finalAnnotator}/${it}" + : "Annotation/${finalAnnotator}/${it}" + } input: - set annotator, variantCaller, file(vcf) from vcfToCompress + set annotator, variantCaller, idPatient, file(vcf) from vcfToCompress output: - set annotator, variantCaller, file("*.vcf.gz"), file("*.vcf.gz.tbi") into (vcfCompressed, vcfCompressedoutput) + set annotator, variantCaller, idPatient, file("*.vcf.gz"), file("*.vcf.gz.tbi") into (vcfCompressed, vcfCompressedoutput) script: - finalannotator = annotator == "merge" ? "vep" : annotator + finalAnnotator = annotator == "merge" ? 
"VEP" : annotator """ bgzip < ${vcf} > ${vcf}.gz tabix ${vcf}.gz @@ -273,9 +294,15 @@ process CompressVCF { } if (params.verbose) vcfCompressedoutput = vcfCompressedoutput.view { - "${it[0]} VCF:\n" + - "File : ${it[2].fileName}\n" + - "Index : ${it[3].fileName}" + if (it[2] != "") { + "${it[2]} - ${it[0]} VCF:\n" + + "File : ${it[3].fileName}\n" + + "Index : ${it[4].fileName}" + } else { + "${it[0]} VCF:\n" + + "File : ${it[3].fileName}\n" + + "Index : ${it[4].fileName}" + } } /* From f9c1403025b0f2ffeb6f2f632a322489d086241c Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Mon, 18 Feb 2019 17:27:06 +0100 Subject: [PATCH 10/22] fix annoying stuff --- annotate.nf | 52 ++++++++++++------------------------------- germlineVC.nf | 18 +++++++-------- lib/SarekUtils.groovy | 10 --------- somaticVC.nf | 30 +++++++++++++------------ 4 files changed, 39 insertions(+), 71 deletions(-) diff --git a/annotate.nf b/annotate.nf index a45dc11b5b..1fb8d21ec2 100644 --- a/annotate.nf +++ b/annotate.nf @@ -69,9 +69,9 @@ vcfNotToAnnotate = Channel.create() if (annotateVCF == []) { // Sarek, by default, annotates all available vcfs that it can find in the VariantCalling directory // Excluding vcfs from FreeBayes, and g.vcf from HaplotypeCaller -// Basically it's: VariantCalling/*/{HaplotypeCaller,Manta,MuTect2,Strelka,StrelkaBP}/*.vcf.gz +// Basically it's: VariantCalling/*/{HaplotypeCaller,Manta,MuTect2,Strelka}/*.vcf.gz // Without *SmallIndels.vcf.gz from Manta, and *.genome.vcf.gz from Strelka -// This small snipet `vcf.minus(vcf.fileName)[-2]` catches idPatient +// The small snipet `vcf.minus(vcf.fileName)[-2]` catches idPatient // This field is used to output final annotated VCFs in the correct directory Channel.empty().mix( Channel.fromPath("${params.outDir}/VariantCalling/*/HaplotypeCaller/*.vcf.gz") @@ -80,17 +80,16 @@ if (annotateVCF == []) { .flatten().map{vcf -> ['manta', vcf.minus(vcf.fileName)[-2], vcf]}, Channel.fromPath("${params.outDir}/VariantCalling/*/MuTect2/*.vcf.gz") .flatten().map{vcf -> ['mutect2', vcf.minus(vcf.fileName)[-2], vcf]}, - Channel.fromPath("${params.outDir}/VariantCalling/*/Strelka/*{somatic,variant}*.vcf.gz") // Strelka only + Channel.fromPath("${params.outDir}/VariantCalling/*/Strelka/*{somatic,variant}*.vcf.gz") .flatten().map{vcf -> ['strelka', vcf.minus(vcf.fileName)[-2], vcf]}, - Channel.fromPath("${params.outDir}/VariantCalling/*/StrelkaBP/*{somatic,variant}*.vcf.gz") // Strelka with Manta indel candidates - .flatten().map{vcf -> ['strelkabp', vcf.minus(vcf.fileName)[-2], vcf]} ).choice(vcfToAnnotate, vcfNotToAnnotate) { annotateTools == [] || (annotateTools != [] && it[0] in annotateTools) ? 0 : 1 } } else if (annotateTools == []) { -// alternatively, annotate user-submitted VCFs +// Annotate user-submitted VCFs +// If user-submitted, Sarek assume that the idPatient should be assumed automatically vcfToAnnotate = Channel.fromPath(annotateVCF) - .map{vcf -> ['userspecified', '', vcf]} + .map{vcf -> ['userspecified', vcf.minus(vcf.fileName)[-2], vcf]} } else exit 1, "specify only tools or files to annotate, not both" vcfNotToAnnotate.close() @@ -105,10 +104,7 @@ vcfForVep = vcfForVep.map { } process RunBcftoolsStats { - tag { idPatient != "" - ? "${idPatient} - ${vcf}" - : "${vcf}" - } + tag {"${idPatient} - ${vcf}"} publishDir directoryMap.bcftoolsStats, mode: params.publishDirMode @@ -129,10 +125,7 @@ if (params.verbose) bcfReport = bcfReport.view { } process RunVcftools { - tag { idPatient != "" - ? 
"${idPatient} - ${variantCaller} - ${vcf}" - : "${variantCaller} - ${vcf}" - } + tag {"${idPatient} - ${variantCaller} - ${vcf}"} publishDir directoryMap.vcftools, mode: params.publishDirMode @@ -153,15 +146,11 @@ if (params.verbose) vcfReport = vcfReport.view { } process RunSnpeff { - tag { idPatient != "" - ? "${idPatient} - ${variantCaller} - ${vcf}" - : "${variantCaller} - ${vcf}" - } + tag {"${idPatient} - ${variantCaller} - ${vcf}"} publishDir params.outDir, mode: params.publishDirMode, saveAs: { if (it == "${vcf.simpleName}_snpEff.ann.vcf") null - else if (idPatient != "") "Annotation/${idPatient}/snpEff/${it}" - else "Annotation/snpEff/${it}" + else "Annotation/${idPatient}/snpEff/${it}" } input: @@ -211,16 +200,10 @@ if('merge' in tools) { } process RunVEP { - tag { idPatient != "" - ? "${idPatient} - ${variantCaller} - ${vcf}" - : "${variantCaller} - ${vcf}" - } + tag {"${idPatient} - ${variantCaller} - ${vcf}"} publishDir params.outDir, mode: params.publishDirMode, saveAs: { - if (it == "${vcf.simpleName}_VEP.summary.html") { - if (idPatient != "") "Annotation/${idPatient}/VEP/${it}" - else "Annotation/VEP/${it}" - } + if (it == "${vcf.simpleName}_VEP.summary.html") "Annotation/${idPatient}/VEP/${it}" else null } @@ -268,16 +251,9 @@ if (params.verbose) vepReport = vepReport.view { vcfToCompress = snpeffVCF.mix(vepVCF) process CompressVCF { - tag { idPatient != "" - ? "${idPatient} - ${annotator} - ${vcf}" - : "${annotator} - ${vcf}" - } + tag {"${idPatient} - ${annotator} - ${vcf}"} - publishDir params.outDir, mode: params.publishDirMode, saveAs: { - idPatient != "" - ? "Annotation/${idPatient}/${finalAnnotator}/${it}" - : "Annotation/${finalAnnotator}/${it}" - } + publishDir "${params.outDir}/Annotation/${idPatient}/${finalAnnotator}/${it}", mode: params.publishDirMode input: set annotator, variantCaller, idPatient, file(vcf) from vcfToCompress diff --git a/germlineVC.nf b/germlineVC.nf index ac40dbb398..3e1d7e6748 100644 --- a/germlineVC.nf +++ b/germlineVC.nf @@ -318,11 +318,12 @@ if (params.verbose) vcfsToMerge = vcfsToMerge.view { process ConcatVCF { tag {variantCaller + "-" + idSampleNormal} - publishDir "${params.outDir}/VariantCalling/${idPatient}/${directoryMap."$variantCaller"}", mode: params.publishDirMode + publishDir "${params.outDir}/VariantCalling/${idPatient}/${"$variantCaller"}", mode: params.publishDirMode input: set variantCaller, idPatient, idSampleNormal, idSampleTumor, file(vcFiles) from vcfsToMerge file(genomeIndex) from Channel.value(referenceMap.genomeIndex) + file(targetBED) from Channel.value(params.targetBED ? 
params.targetBED : "null") output: // we have this funny *_* pattern to avoid copying the raw calls to publishdir @@ -335,12 +336,10 @@ process ConcatVCF { if (variantCaller == 'haplotypecaller') outputFile = "${variantCaller}_${idSampleNormal}.vcf" else if (variantCaller == 'gvcf-hc') outputFile = "haplotypecaller_${idSampleNormal}.g.vcf" else outputFile = "${variantCaller}_${idSampleTumor}_vs_${idSampleNormal}.vcf" - if(params.targetBED) // targeted - concatOptions = "-i ${genomeIndex} -c ${task.cpus} -o ${outputFile} -t ${params.targetBED}" + concatOptions = "-i ${genomeIndex} -c ${task.cpus} -o ${outputFile} -t ${targetBED}" else // WGS concatOptions = "-i ${genomeIndex} -c ${task.cpus} -o ${outputFile} " - """ concatenateVCFs.sh ${concatOptions} """ @@ -356,10 +355,11 @@ if (params.verbose) vcfConcatenated = vcfConcatenated.view { process RunSingleStrelka { tag {idSample} - publishDir "${params.outDir}/VariantCalling/${idPatient}/${directoryMap.strelka}", mode: params.publishDirMode + publishDir "${params.outDir}/VariantCalling/${idPatient}/Strelka", mode: params.publishDirMode input: set idPatient, status, idSample, file(bam), file(bai) from bamsForSingleStrelka + file(targetBED) from Channel.value(params.targetBED ? params.targetBED : "null") set file(genomeFile), file(genomeIndex) from Channel.value([ referenceMap.genomeFile, referenceMap.genomeIndex @@ -372,15 +372,15 @@ process RunSingleStrelka { script: """ - if [ ! -s "${params.targetBED}" ]; then - # do WGS + if [ ! -s "${targetBED}" ]; then + # WGS configureStrelkaGermlineWorkflow.py \ --bam ${bam} \ --referenceFasta ${genomeFile} \ --runDir Strelka else # WES or targeted - bgzip --threads ${task.cpus} -c ${params.targetBED} > call_targets.bed.gz + bgzip --threads ${task.cpus} -c ${targetBED} > call_targets.bed.gz tabix call_targets.bed.gz configureStrelkaGermlineWorkflow.py \ --bam ${bam} \ @@ -409,7 +409,7 @@ if (params.verbose) singleStrelkaOutput = singleStrelkaOutput.view { process RunSingleManta { tag {idSample + " - Single Diploid"} - publishDir "${params.outDir}/VariantCalling/${idPatient}/${directoryMap.manta}", mode: params.publishDirMode + publishDir "${params.outDir}/VariantCalling/${idPatient}/Manta", mode: params.publishDirMode input: set idPatient, status, idSample, file(bam), file(bai) from bamsForSingleManta diff --git a/lib/SarekUtils.groovy b/lib/SarekUtils.groovy index 87fc5587a4..dbfa0b46db 100644 --- a/lib/SarekUtils.groovy +++ b/lib/SarekUtils.groovy @@ -137,16 +137,6 @@ class SarekUtils { return [ 'duplicateMarked' : "${outDir}/Preprocessing/DuplicateMarked", 'recalibrated' : "${outDir}/Preprocessing/Recalibrated", - 'ascat' : "Ascat", - 'freebayes' : "FreeBayes", - 'gvcf-hc' : "HaplotypeCallerGVCF", - 'haplotypecaller' : "HaplotypeCaller", - 'manta' : "Manta", - 'mutect2' : "MuTect2", - 'strelka' : "Strelka", - 'strelkabp' : "StrelkaBP", - 'snpeff' : "SnpEff", - 'vep' : "VEP", 'bamQC' : "${outDir}/Reports/bamQC", 'bcftoolsStats' : "${outDir}/Reports/BCFToolsStats", 'fastQC' : "${outDir}/Reports/FastQC", diff --git a/somaticVC.nf b/somaticVC.nf index b0d84ba2e3..06a02812ba 100644 --- a/somaticVC.nf +++ b/somaticVC.nf @@ -280,11 +280,12 @@ if (params.verbose) vcfsToMerge = vcfsToMerge.view { process ConcatVCF { tag {variantCaller + "_" + idSampleTumor + "_vs_" + idSampleNormal} - publishDir "${params.outDir}/VariantCalling/${idPatient}/${directoryMap."$variantCaller"}", mode: params.publishDirMode + publishDir "${params.outDir}/VariantCalling/${idPatient}/${"$variantCaller"}", mode: 
params.publishDirMode input: set variantCaller, idPatient, idSampleNormal, idSampleTumor, file(vcFiles) from vcfsToMerge file(genomeIndex) from Channel.value(referenceMap.genomeIndex) + file(targetBED) from Channel.value(params.targetBED ? params.targetBED : "null") output: // we have this funny *_* pattern to avoid copying the raw calls to publishdir @@ -297,7 +298,7 @@ process ConcatVCF { outputFile = "${variantCaller}_${idSampleTumor}_vs_${idSampleNormal}.vcf" if(params.targetBED) // targeted - concatOptions = "-i ${genomeIndex} -c ${task.cpus} -o ${outputFile} -t ${params.targetBED}" + concatOptions = "-i ${genomeIndex} -c ${task.cpus} -o ${outputFile} -t ${targetBED}" else // WGS concatOptions = "-i ${genomeIndex} -c ${task.cpus} -o ${outputFile} " @@ -315,10 +316,11 @@ if (params.verbose) vcfConcatenated = vcfConcatenated.view { process RunStrelka { tag {idSampleTumor + "_vs_" + idSampleNormal} - publishDir "${params.outDir}/VariantCalling/${idPatient}/${directoryMap.strelka}", mode: params.publishDirMode + publishDir "${params.outDir}/VariantCalling/${idPatient}/Strelka", mode: params.publishDirMode input: set idPatient, idSampleNormal, file(bamNormal), file(baiNormal), idSampleTumor, file(bamTumor), file(baiTumor) from bamsForStrelka + file(targetBED) from Channel.value(params.targetBED ? params.targetBED : "null") set file(genomeFile), file(genomeIndex), file(genomeDict) from Channel.value([ referenceMap.genomeFile, referenceMap.genomeIndex, @@ -332,7 +334,7 @@ process RunStrelka { script: """ - if [ ! -s "${params.targetBED}" ]; then + if [ ! -s "${targetBED}" ]; then # do WGS configureStrelkaSomaticWorkflow.py \ --tumor ${bamTumor} \ @@ -341,7 +343,7 @@ process RunStrelka { --runDir Strelka else # WES or targeted - bgzip --threads ${task.cpus} -c ${params.targetBED} > call_targets.bed.gz + bgzip --threads ${task.cpus} -c ${targetBED} > call_targets.bed.gz tabix call_targets.bed.gz configureStrelkaSomaticWorkflow.py \ --tumor ${bamTumor} \ @@ -372,7 +374,7 @@ if (params.verbose) strelkaOutput = strelkaOutput.view { process RunManta { tag {idSampleTumor + "_vs_" + idSampleNormal} - publishDir "${params.outDir}/VariantCalling/${idPatient}/${directoryMap.manta}", mode: params.publishDirMode + publishDir "${params.outDir}/VariantCalling/${idPatient}/Manta", mode: params.publishDirMode input: set idPatient, idSampleNormal, file(bamNormal), file(baiNormal), idSampleTumor, file(bamTumor), file(baiTumor) from bamsForManta @@ -426,7 +428,7 @@ if (params.verbose) mantaOutput = mantaOutput.view { process RunSingleManta { tag {idSample + " - Tumor-Only"} - publishDir "${params.outDir}/VariantCalling/${idPatient}/${directoryMap.manta}", mode: params.publishDirMode + publishDir "${params.outDir}/VariantCalling/${idPatient}/Manta", mode: params.publishDirMode input: set idPatient, status, idSample, file(bam), file(bai) from bamsForSingleManta @@ -485,7 +487,7 @@ bamsForStrelkaBP = bamsForStrelkaBP.map { process RunStrelkaBP { tag {idSampleTumor + "_vs_" + idSampleNormal} - publishDir "${params.outDir}/VariantCalling/${idPatient}/${directoryMap.strelkabp}", mode: params.publishDirMode + publishDir "${params.outDir}/VariantCalling/${idPatient}/Strelka", mode: params.publishDirMode input: set idPatient, idSampleNormal, file(bamNormal), file(baiNormal), idSampleTumor, file(bamTumor), file(baiTumor), file(mantaCSI), file(mantaCSIi) from bamsForStrelkaBP @@ -512,13 +514,13 @@ process RunStrelkaBP { python Strelka/runWorkflow.py -m local -j ${task.cpus} mv 
Strelka/results/variants/somatic.indels.vcf.gz \ - Strelka_${idSampleTumor}_vs_${idSampleNormal}_somatic_indels.vcf.gz + StrelkaBP_${idSampleTumor}_vs_${idSampleNormal}_somatic_indels.vcf.gz mv Strelka/results/variants/somatic.indels.vcf.gz.tbi \ - Strelka_${idSampleTumor}_vs_${idSampleNormal}_somatic_indels.vcf.gz.tbi + StrelkaBP_${idSampleTumor}_vs_${idSampleNormal}_somatic_indels.vcf.gz.tbi mv Strelka/results/variants/somatic.snvs.vcf.gz \ - Strelka_${idSampleTumor}_vs_${idSampleNormal}_somatic_snvs.vcf.gz + StrelkaBP_${idSampleTumor}_vs_${idSampleNormal}_somatic_snvs.vcf.gz mv Strelka/results/variants/somatic.snvs.vcf.gz.tbi \ - Strelka_${idSampleTumor}_vs_${idSampleNormal}_somatic_snvs.vcf.gz.tbi + StrelkaBP_${idSampleTumor}_vs_${idSampleNormal}_somatic_snvs.vcf.gz.tbi """ } @@ -577,7 +579,7 @@ alleleCountOutput = alleleCountOutput.map { process RunConvertAlleleCounts { tag {idSampleTumor + "_vs_" + idSampleNormal} - publishDir "${params.outDir}/VariantCalling/${idPatient}/${directoryMap.ascat}", mode: params.publishDirMode + publishDir "${params.outDir}/VariantCalling/${idPatient}/ASCAT", mode: params.publishDirMode input: set idPatient, idSampleNormal, idSampleTumor, file(alleleCountNormal), file(alleleCountTumor) from alleleCountOutput @@ -599,7 +601,7 @@ process RunConvertAlleleCounts { process RunAscat { tag {idSampleTumor + "_vs_" + idSampleNormal} - publishDir "${params.outDir}/VariantCalling/${idPatient}/${directoryMap.ascat}", mode: params.publishDirMode + publishDir "${params.outDir}/VariantCalling/${idPatient}/ASCAT", mode: params.publishDirMode input: set idPatient, idSampleNormal, idSampleTumor, file(bafNormal), file(logrNormal), file(bafTumor), file(logrTumor) from convertAlleleCountsOutput From 8e6d7b20cf29dd32282797596b897f139c110a55 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Mon, 18 Feb 2019 17:30:21 +0100 Subject: [PATCH 11/22] fix output --- annotate.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/annotate.nf b/annotate.nf index 1fb8d21ec2..721f63c6dd 100644 --- a/annotate.nf +++ b/annotate.nf @@ -253,7 +253,7 @@ vcfToCompress = snpeffVCF.mix(vepVCF) process CompressVCF { tag {"${idPatient} - ${annotator} - ${vcf}"} - publishDir "${params.outDir}/Annotation/${idPatient}/${finalAnnotator}/${it}", mode: params.publishDirMode + publishDir "${params.outDir}/Annotation/${idPatient}/${finalAnnotator}", mode: params.publishDirMode input: set annotator, variantCaller, idPatient, file(vcf) from vcfToCompress From 5410f0da7c4eef63b53ce10b24e328850338ff18 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Tue, 19 Feb 2019 09:50:00 +0100 Subject: [PATCH 12/22] fix --targetBED --- germlineVC.nf | 40 ++++++++++++++++------------------------ somaticVC.nf | 32 +++++++++++--------------------- 2 files changed, 27 insertions(+), 45 deletions(-) diff --git a/germlineVC.nf b/germlineVC.nf index 3e1d7e6748..b5fdc98c7c 100644 --- a/germlineVC.nf +++ b/germlineVC.nf @@ -371,31 +371,23 @@ process RunSingleStrelka { when: 'strelka' in tools && !params.onlyQC script: + if (params.targetBED) { + beforeScript = "bgzip --threads ${task.cpus} -c ${targetBED} > call_targets.bed.gz ; tabix call_targets.bed.gz" + options = "--exome --callRegions call_targets.bed.gz" + } """ - if [ ! 
-s "${targetBED}" ]; then - # WGS - configureStrelkaGermlineWorkflow.py \ - --bam ${bam} \ - --referenceFasta ${genomeFile} \ - --runDir Strelka - else - # WES or targeted - bgzip --threads ${task.cpus} -c ${targetBED} > call_targets.bed.gz - tabix call_targets.bed.gz - configureStrelkaGermlineWorkflow.py \ - --bam ${bam} \ - --referenceFasta ${genomeFile} \ - --exome \ - --callRegions call_targets.bed.gz \ - --runDir Strelka - fi - - # always run this part - python Strelka/runWorkflow.py -m local -j ${task.cpus} - mv Strelka/results/variants/genome.*.vcf.gz Strelka_${idSample}_genome.vcf.gz - mv Strelka/results/variants/genome.*.vcf.gz.tbi Strelka_${idSample}_genome.vcf.gz.tbi - mv Strelka/results/variants/variants.vcf.gz Strelka_${idSample}_variants.vcf.gz - mv Strelka/results/variants/variants.vcf.gz.tbi Strelka_${idSample}_variants.vcf.gz.tbi + ${beforeScript} + configureStrelkaGermlineWorkflow.py \ + --bam ${bam} \ + --referenceFasta ${genomeFile} \ + ${options} \ + --runDir Strelka + + python Strelka/runWorkflow.py -m local -j ${task.cpus} + mv Strelka/results/variants/genome.*.vcf.gz Strelka_${idSample}_genome.vcf.gz + mv Strelka/results/variants/genome.*.vcf.gz.tbi Strelka_${idSample}_genome.vcf.gz.tbi + mv Strelka/results/variants/variants.vcf.gz Strelka_${idSample}_variants.vcf.gz + mv Strelka/results/variants/variants.vcf.gz.tbi Strelka_${idSample}_variants.vcf.gz.tbi """ } diff --git a/somaticVC.nf b/somaticVC.nf index 06a02812ba..afac5b755f 100644 --- a/somaticVC.nf +++ b/somaticVC.nf @@ -333,30 +333,20 @@ process RunStrelka { when: 'strelka' in tools && !params.onlyQC script: + if (params.targetBED) { + beforeScript = "bgzip --threads ${task.cpus} -c ${targetBED} > call_targets.bed.gz ; tabix call_targets.bed.gz" + options = "--exome --callRegions call_targets.bed.gz" + } """ - if [ ! 
-s "${targetBED}" ]; then - # do WGS - configureStrelkaSomaticWorkflow.py \ - --tumor ${bamTumor} \ - --normal ${bamNormal} \ - --referenceFasta ${genomeFile} \ - --runDir Strelka - else - # WES or targeted - bgzip --threads ${task.cpus} -c ${targetBED} > call_targets.bed.gz - tabix call_targets.bed.gz - configureStrelkaSomaticWorkflow.py \ - --tumor ${bamTumor} \ - --normal ${bamNormal} \ - --referenceFasta ${genomeFile} \ - --exome \ - --callRegions call_targets.bed.gz \ - --runDir Strelka - fi + ${beforeScript} + configureStrelkaSomaticWorkflow.py \ + --tumor ${bamTumor} \ + --normal ${bamNormal} \ + --referenceFasta ${genomeFile} \ + ${options} \ + --runDir Strelka python Strelka/runWorkflow.py -m local -j ${task.cpus} - # always run this part - mv Strelka/results/variants/somatic.indels.vcf.gz Strelka_${idSampleTumor}_vs_${idSampleNormal}_somatic_indels.vcf.gz mv Strelka/results/variants/somatic.indels.vcf.gz.tbi Strelka_${idSampleTumor}_vs_${idSampleNormal}_somatic_indels.vcf.gz.tbi mv Strelka/results/variants/somatic.snvs.vcf.gz Strelka_${idSampleTumor}_vs_${idSampleNormal}_somatic_snvs.vcf.gz From 1cdc30379d4a706cc6c79613dd4310d17af6957d Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Tue, 19 Feb 2019 10:20:21 +0100 Subject: [PATCH 13/22] spacing and alignements --- annotate.nf | 12 +++-------- germlineVC.nf | 40 +++++++++++++++++------------------ somaticVC.nf | 58 +++++++++++++++++++++++++-------------------------- 3 files changed, 52 insertions(+), 58 deletions(-) diff --git a/annotate.nf b/annotate.nf index 721f63c6dd..2dffd05b06 100644 --- a/annotate.nf +++ b/annotate.nf @@ -270,15 +270,9 @@ process CompressVCF { } if (params.verbose) vcfCompressedoutput = vcfCompressedoutput.view { - if (it[2] != "") { - "${it[2]} - ${it[0]} VCF:\n" + - "File : ${it[3].fileName}\n" + - "Index : ${it[4].fileName}" - } else { - "${it[0]} VCF:\n" + - "File : ${it[3].fileName}\n" + - "Index : ${it[4].fileName}" - } + "${it[2]} - ${it[0]} VCF:\n" + + "File : ${it[3].fileName}\n" + + "Index : ${it[4].fileName}" } /* diff --git a/germlineVC.nf b/germlineVC.nf index b5fdc98c7c..6ea64e9f93 100644 --- a/germlineVC.nf +++ b/germlineVC.nf @@ -326,7 +326,7 @@ process ConcatVCF { file(targetBED) from Channel.value(params.targetBED ? 
params.targetBED : "null") output: - // we have this funny *_* pattern to avoid copying the raw calls to publishdir + // we have this funny *_* pattern to avoid copying the raw calls to publishdir set variantCaller, idPatient, idSampleNormal, idSampleTumor, file("*_*.vcf.gz"), file("*_*.vcf.gz.tbi") into vcfConcatenated @@ -336,13 +336,13 @@ process ConcatVCF { if (variantCaller == 'haplotypecaller') outputFile = "${variantCaller}_${idSampleNormal}.vcf" else if (variantCaller == 'gvcf-hc') outputFile = "haplotypecaller_${idSampleNormal}.g.vcf" else outputFile = "${variantCaller}_${idSampleTumor}_vs_${idSampleNormal}.vcf" - if(params.targetBED) // targeted - concatOptions = "-i ${genomeIndex} -c ${task.cpus} -o ${outputFile} -t ${targetBED}" - else // WGS - concatOptions = "-i ${genomeIndex} -c ${task.cpus} -o ${outputFile} " - """ - concatenateVCFs.sh ${concatOptions} - """ + if(params.targetBED) // targeted + concatOptions = "-i ${genomeIndex} -c ${task.cpus} -o ${outputFile} -t ${targetBED}" + else // WGS + concatOptions = "-i ${genomeIndex} -c ${task.cpus} -o ${outputFile} " + """ + concatenateVCFs.sh ${concatOptions} + """ } if (params.verbose) vcfConcatenated = vcfConcatenated.view { @@ -375,20 +375,20 @@ process RunSingleStrelka { beforeScript = "bgzip --threads ${task.cpus} -c ${targetBED} > call_targets.bed.gz ; tabix call_targets.bed.gz" options = "--exome --callRegions call_targets.bed.gz" } - """ - ${beforeScript} + """ + ${beforeScript} configureStrelkaGermlineWorkflow.py \ - --bam ${bam} \ - --referenceFasta ${genomeFile} \ + --bam ${bam} \ + --referenceFasta ${genomeFile} \ ${options} \ - --runDir Strelka - - python Strelka/runWorkflow.py -m local -j ${task.cpus} - mv Strelka/results/variants/genome.*.vcf.gz Strelka_${idSample}_genome.vcf.gz - mv Strelka/results/variants/genome.*.vcf.gz.tbi Strelka_${idSample}_genome.vcf.gz.tbi - mv Strelka/results/variants/variants.vcf.gz Strelka_${idSample}_variants.vcf.gz - mv Strelka/results/variants/variants.vcf.gz.tbi Strelka_${idSample}_variants.vcf.gz.tbi - """ + --runDir Strelka + + python Strelka/runWorkflow.py -m local -j ${task.cpus} + mv Strelka/results/variants/genome.*.vcf.gz Strelka_${idSample}_genome.vcf.gz + mv Strelka/results/variants/genome.*.vcf.gz.tbi Strelka_${idSample}_genome.vcf.gz.tbi + mv Strelka/results/variants/variants.vcf.gz Strelka_${idSample}_variants.vcf.gz + mv Strelka/results/variants/variants.vcf.gz.tbi Strelka_${idSample}_variants.vcf.gz.tbi + """ } if (params.verbose) singleStrelkaOutput = singleStrelkaOutput.view { diff --git a/somaticVC.nf b/somaticVC.nf index afac5b755f..be1aab2871 100644 --- a/somaticVC.nf +++ b/somaticVC.nf @@ -219,18 +219,18 @@ process RunMutect2 { script: """ - gatk --java-options "-Xmx${task.memory.toGiga()}g" \ - Mutect2 \ - -R ${genomeFile}\ - -I ${bamTumor} -tumor ${idSampleTumor} \ - -I ${bamNormal} -normal ${idSampleNormal} \ - -L ${intervalBed} \ - -O ${intervalBed.baseName}_${idSampleTumor}_vs_${idSampleNormal}.vcf + gatk --java-options "-Xmx${task.memory.toGiga()}g" \ + Mutect2 \ + -R ${genomeFile}\ + -I ${bamTumor} -tumor ${idSampleTumor} \ + -I ${bamNormal} -normal ${idSampleNormal} \ + -L ${intervalBed} \ + -O ${intervalBed.baseName}_${idSampleTumor}_vs_${idSampleNormal}.vcf """ } -// --germline_resource af-only-gnomad.vcf.gz \ -// --normal_panel pon.vcf.gz \ -// --dbsnp ${dbsnp} \ +// --germline_resource af-only-gnomad.vcf.gz \ +// --normal_panel pon.vcf.gz \ +// --dbsnp ${dbsnp} \ mutect2Output = mutect2Output.groupTuple(by:[0,1,2,3]) @@ -288,22 +288,22 @@ process 
ConcatVCF { file(targetBED) from Channel.value(params.targetBED ? params.targetBED : "null") output: - // we have this funny *_* pattern to avoid copying the raw calls to publishdir + // we have this funny *_* pattern to avoid copying the raw calls to publishdir set variantCaller, idPatient, idSampleNormal, idSampleTumor, file("*_*.vcf.gz"), file("*_*.vcf.gz.tbi") into vcfConcatenated - // TODO DRY with ConcatVCF + // TODO DRY with ConcatVCF when: ( 'mutect2' in tools || 'freebayes' in tools ) && !params.onlyQC script: outputFile = "${variantCaller}_${idSampleTumor}_vs_${idSampleNormal}.vcf" - if(params.targetBED) // targeted - concatOptions = "-i ${genomeIndex} -c ${task.cpus} -o ${outputFile} -t ${targetBED}" - else // WGS - concatOptions = "-i ${genomeIndex} -c ${task.cpus} -o ${outputFile} " + if(params.targetBED) // targeted + concatOptions = "-i ${genomeIndex} -c ${task.cpus} -o ${outputFile} -t ${targetBED}" + else // WGS + concatOptions = "-i ${genomeIndex} -c ${task.cpus} -o ${outputFile} " - """ - concatenateVCFs.sh ${concatOptions} + """ + concatenateVCFs.sh ${concatOptions} """ } @@ -337,21 +337,21 @@ process RunStrelka { beforeScript = "bgzip --threads ${task.cpus} -c ${targetBED} > call_targets.bed.gz ; tabix call_targets.bed.gz" options = "--exome --callRegions call_targets.bed.gz" } - """ - ${beforeScript} + """ + ${beforeScript} configureStrelkaSomaticWorkflow.py \ --tumor ${bamTumor} \ --normal ${bamNormal} \ - --referenceFasta ${genomeFile} \ + --referenceFasta ${genomeFile} \ ${options} \ - --runDir Strelka - - python Strelka/runWorkflow.py -m local -j ${task.cpus} - mv Strelka/results/variants/somatic.indels.vcf.gz Strelka_${idSampleTumor}_vs_${idSampleNormal}_somatic_indels.vcf.gz - mv Strelka/results/variants/somatic.indels.vcf.gz.tbi Strelka_${idSampleTumor}_vs_${idSampleNormal}_somatic_indels.vcf.gz.tbi - mv Strelka/results/variants/somatic.snvs.vcf.gz Strelka_${idSampleTumor}_vs_${idSampleNormal}_somatic_snvs.vcf.gz - mv Strelka/results/variants/somatic.snvs.vcf.gz.tbi Strelka_${idSampleTumor}_vs_${idSampleNormal}_somatic_snvs.vcf.gz.tbi - """ + --runDir Strelka + + python Strelka/runWorkflow.py -m local -j ${task.cpus} + mv Strelka/results/variants/somatic.indels.vcf.gz Strelka_${idSampleTumor}_vs_${idSampleNormal}_somatic_indels.vcf.gz + mv Strelka/results/variants/somatic.indels.vcf.gz.tbi Strelka_${idSampleTumor}_vs_${idSampleNormal}_somatic_indels.vcf.gz.tbi + mv Strelka/results/variants/somatic.snvs.vcf.gz Strelka_${idSampleTumor}_vs_${idSampleNormal}_somatic_snvs.vcf.gz + mv Strelka/results/variants/somatic.snvs.vcf.gz.tbi Strelka_${idSampleTumor}_vs_${idSampleNormal}_somatic_snvs.vcf.gz.tbi + """ } if (params.verbose) strelkaOutput = strelkaOutput.view { From 1abb513561074717beca53dee3a7ac1f272938db Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Tue, 19 Feb 2019 10:23:29 +0100 Subject: [PATCH 14/22] finally fix targetBED --- germlineVC.nf | 3 +++ somaticVC.nf | 3 +++ 2 files changed, 6 insertions(+) diff --git a/germlineVC.nf b/germlineVC.nf index 6ea64e9f93..eaa7494a7d 100644 --- a/germlineVC.nf +++ b/germlineVC.nf @@ -374,6 +374,9 @@ process RunSingleStrelka { if (params.targetBED) { beforeScript = "bgzip --threads ${task.cpus} -c ${targetBED} > call_targets.bed.gz ; tabix call_targets.bed.gz" options = "--exome --callRegions call_targets.bed.gz" + } else { + beforeScript = "" + options = "" } """ ${beforeScript} diff --git a/somaticVC.nf b/somaticVC.nf index be1aab2871..b51de85ab3 100644 --- a/somaticVC.nf +++ b/somaticVC.nf @@ -336,6 +336,9 @@ process 
RunStrelka { if (params.targetBED) { beforeScript = "bgzip --threads ${task.cpus} -c ${targetBED} > call_targets.bed.gz ; tabix call_targets.bed.gz" options = "--exome --callRegions call_targets.bed.gz" + } else { + beforeScript = "" + options = "" } """ ${beforeScript} From eb3dfb1649e4f374b84a6c88a905435c97b0b7fe Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Tue, 19 Feb 2019 11:07:55 +0100 Subject: [PATCH 15/22] this time it's for real, targetBED is fixed --- germlineVC.nf | 9 ++++----- somaticVC.nf | 8 ++++---- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/germlineVC.nf b/germlineVC.nf index eaa7494a7d..e91a974bb3 100644 --- a/germlineVC.nf +++ b/germlineVC.nf @@ -46,7 +46,7 @@ if (!checkUppmaxProject()) exit 1, "No UPPMAX project ID found! Use --project Date: Tue, 19 Feb 2019 11:08:20 +0100 Subject: [PATCH 16/22] spacing / alignement / code polishing --- annotate.nf | 8 ++++---- buildContainers.nf | 2 +- buildReferences.nf | 2 +- main.nf | 4 +--- runMultiQC.nf | 2 +- 5 files changed, 8 insertions(+), 10 deletions(-) diff --git a/annotate.nf b/annotate.nf index 2dffd05b06..2a18c8fb1c 100644 --- a/annotate.nf +++ b/annotate.nf @@ -43,7 +43,7 @@ if (!checkUppmaxProject()) exit 1, "No UPPMAX project ID found! Use --project " @@ -214,7 +213,6 @@ if (params.verbose) bamQCmappedReport = bamQCmappedReport.view { Dir : [${it.fileName}]" } - // Sort bam whether they are standalone or should be merged // Borrowed code from https://github.com/guigolab/chip-nf diff --git a/runMultiQC.nf b/runMultiQC.nf index d0f24b5e1d..5bf719e977 100644 --- a/runMultiQC.nf +++ b/runMultiQC.nf @@ -40,7 +40,7 @@ if (!checkUppmaxProject()) exit 1, "No UPPMAX project ID found! Use --project Date: Tue, 19 Feb 2019 11:45:59 +0100 Subject: [PATCH 17/22] remove defineDirectoryMap() --- annotate.nf | 5 ++--- germlineVC.nf | 23 +++++++---------------- lib/SarekUtils.groovy | 16 ---------------- main.nf | 31 +++++++++++++++---------------- runMultiQC.nf | 21 ++++++++++----------- somaticVC.nf | 29 +++++++++-------------------- 6 files changed, 43 insertions(+), 82 deletions(-) diff --git a/annotate.nf b/annotate.nf index 2a18c8fb1c..6dafd5ec3e 100644 --- a/annotate.nf +++ b/annotate.nf @@ -50,7 +50,6 @@ annotateTools = params.annotateTools ? params.annotateTools.split(',').collect{i annotateVCF = params.annotateVCF ? params.annotateVCF.split(',').collect{it.trim()} : [] tools = params.tools ? 
params.tools.split(',').collect{it.trim().toLowerCase()} : [] -directoryMap = SarekUtils.defineDirectoryMap(params.outDir) toolList = defineToolList() if (!SarekUtils.checkParameterList(tools,toolList)) exit 1, 'Unknown tool(s), see --help for more information' @@ -106,7 +105,7 @@ vcfForVep = vcfForVep.map { process RunBcftoolsStats { tag {"${idPatient} - ${vcf}"} - publishDir directoryMap.bcftoolsStats, mode: params.publishDirMode + publishDir "${params.outDir}/Reports/BCFToolsStats", mode: params.publishDirMode input: set variantCaller, idPatient, file(vcf) from vcfForBCFtools @@ -127,7 +126,7 @@ if (params.verbose) bcfReport = bcfReport.view { process RunVcftools { tag {"${idPatient} - ${variantCaller} - ${vcf}"} - publishDir directoryMap.vcftools, mode: params.publishDirMode + publishDir "${params.outDir}/Reports/VCFTools", mode: params.publishDirMode input: set variantCaller, idPatient, file(vcf) from vcfForVCFtools diff --git a/germlineVC.nf b/germlineVC.nf index e91a974bb3..ce192c1e7f 100644 --- a/germlineVC.nf +++ b/germlineVC.nf @@ -51,7 +51,6 @@ if (workflow.profile == 'awsbatch') { tools = params.tools ? params.tools.split(',').collect{it.trim().toLowerCase()} : [] -directoryMap = SarekUtils.defineDirectoryMap(params.outDir) referenceMap = defineReferenceMap() toolList = defineToolList() @@ -68,7 +67,7 @@ if (params.test && params.genome in ['GRCh37', 'GRCh38']) { tsvPath = '' if (params.sample) tsvPath = params.sample -else tsvPath = "${directoryMap.recalibrated}/recalibrated.tsv" +else tsvPath = "${params.outDir}/Preprocessing/Recalibrated/recalibrated.tsv" // Set up the bamFiles channel @@ -335,12 +334,9 @@ process ConcatVCF { if (variantCaller == 'haplotypecaller') outputFile = "${variantCaller}_${idSampleNormal}.vcf" else if (variantCaller == 'gvcf-hc') outputFile = "haplotypecaller_${idSampleNormal}.g.vcf" else outputFile = "${variantCaller}_${idSampleTumor}_vs_${idSampleNormal}.vcf" - if (params.targetBED) // targeted - concatOptions = "-i ${genomeIndex} -c ${task.cpus} -o ${outputFile} -t ${targetBED}" - else // WGS - concatOptions = "-i ${genomeIndex} -c ${task.cpus} -o ${outputFile} " + options = params.targetBED ? "-t ${targetBED}" : "" """ - concatenateVCFs.sh ${concatOptions} + concatenateVCFs.sh -i ${genomeIndex} -c ${task.cpus} -o ${outputFile} ${concatOptions} """ } @@ -370,13 +366,8 @@ process RunSingleStrelka { when: 'strelka' in tools && !params.onlyQC script: - if (params.targetBED) { - beforeScript = "bgzip --threads ${task.cpus} -c ${targetBED} > call_targets.bed.gz ; tabix call_targets.bed.gz" - options = "--exome --callRegions call_targets.bed.gz" - } else { - beforeScript = "" - options = "" - } + beforeScript = params.targetBED ? "bgzip --threads ${task.cpus} -c ${targetBED} > call_targets.bed.gz ; tabix call_targets.bed.gz" : "" + options = params.targetBED ? 
"--exome --callRegions call_targets.bed.gz" : "" """ ${beforeScript} configureStrelkaGermlineWorkflow.py \ @@ -467,7 +458,7 @@ vcfForQC = Channel.empty().mix( process RunBcftoolsStats { tag {vcf} - publishDir directoryMap.bcftoolsStats, mode: params.publishDirMode + publishDir "${params.outDir}/Reports/BCFToolsStats", mode: params.publishDirMode input: set variantCaller, file(vcf) from vcfForBCFtools @@ -490,7 +481,7 @@ bcfReport.close() process RunVcftools { tag {vcf} - publishDir directoryMap.vcftools, mode: params.publishDirMode + publishDir "${params.outDir}/Reports/VCFTools", mode: params.publishDirMode input: set variantCaller, file(vcf) from vcfForVCFtools diff --git a/lib/SarekUtils.groovy b/lib/SarekUtils.groovy index dbfa0b46db..0d38fe7dfa 100644 --- a/lib/SarekUtils.groovy +++ b/lib/SarekUtils.groovy @@ -132,22 +132,6 @@ class SarekUtils { return true } - // Define map of directories - static def defineDirectoryMap(outDir) { - return [ - 'duplicateMarked' : "${outDir}/Preprocessing/DuplicateMarked", - 'recalibrated' : "${outDir}/Preprocessing/Recalibrated", - 'bamQC' : "${outDir}/Reports/bamQC", - 'bcftoolsStats' : "${outDir}/Reports/BCFToolsStats", - 'fastQC' : "${outDir}/Reports/FastQC", - 'markDuplicatesQC' : "${outDir}/Reports/MarkDuplicates", - 'multiQC' : "${outDir}/Reports/MultiQC", - 'samtoolsStats' : "${outDir}/Reports/SamToolsStats", - 'vcftools' : "${outDir}/Reports/VCFTools", - 'version' : "${outDir}/Reports/ToolsVersion" - ] - } - // Channeling the TSV file containing BAM. // Format is: "subject gender status sample bam bai" static def extractBams(tsvFile, mode) { diff --git a/main.nf b/main.nf index 6e5a53f3f4..8dbc6a6785 100644 --- a/main.nf +++ b/main.nf @@ -53,7 +53,6 @@ if (!checkUppmaxProject()) exit 1, "No UPPMAX project ID found! 
Use --project gender = patientGenders[idPatient] - "${idPatient}\t${gender}\t${status}\t${idSample}\t${directoryMap.duplicateMarked}/${bam}\t${directoryMap.duplicateMarked}/${bai}\n" + "${idPatient}\t${gender}\t${status}\t${idSample}\t${params.outDir}/Preprocessing/DuplicateMarked/${bam}\t${params.outDir}/Preprocessing/DuplicateMarked/${bai}\n" }.collectFile( - name: 'duplicateMarked.tsv', sort: true, storeDir: directoryMap.duplicateMarked + name: 'duplicateMarked.tsv', sort: true, storeDir: "${params.outDir}/Preprocessing/DuplicateMarked" ) duplicateMarkedBams = duplicateMarkedBams.map { @@ -322,7 +321,7 @@ if (params.verbose) duplicateMarkedBams = duplicateMarkedBams.view { process CreateRecalibrationTable { tag {idPatient + "-" + idSample} - publishDir directoryMap.duplicateMarked, mode: params.publishDirMode, overwrite: false + publishDir "${params.outDir}/Preprocessing/DuplicateMarked", mode: params.publishDirMode, overwrite: false input: set idPatient, status, idSample, file(bam), file(bai) from mdBam // realignedBam @@ -362,9 +361,9 @@ process CreateRecalibrationTable { // Create a TSV file to restart from this step recalibrationTableTSV.map { idPatient, status, idSample, bam, bai, recalTable -> gender = patientGenders[idPatient] - "${idPatient}\t${gender}\t${status}\t${idSample}\t${directoryMap.duplicateMarked}/${bam}\t${directoryMap.duplicateMarked}/${bai}\t${directoryMap.duplicateMarked}/${recalTable}\n" + "${idPatient}\t${gender}\t${status}\t${idSample}\t${params.outDir}/Preprocessing/DuplicateMarked/${bam}\t${params.outDir}/Preprocessing/DuplicateMarked/${bai}\t${params.outDir}/Preprocessing/DuplicateMarked/${recalTable}\n" }.collectFile( - name: 'duplicateMarked.tsv', sort: true, storeDir: directoryMap.duplicateMarked + name: 'duplicateMarked.tsv', sort: true, storeDir: "${params.outDir}/Preprocessing/DuplicateMarked" ) recalibrationTable = mdBamToJoin.join(recalibrationTable, by:[0,1,2]) @@ -380,7 +379,7 @@ if (params.verbose) recalibrationTable = recalibrationTable.view { process RecalibrateBam { tag {idPatient + "-" + idSample} - publishDir directoryMap.recalibrated, mode: params.publishDirMode + publishDir "${params.outDir}/Preprocessing/Recalibrated", mode: params.publishDirMode input: set idPatient, status, idSample, file(bam), file(bai), file(recalibrationReport) from recalibrationTable @@ -412,9 +411,9 @@ process RecalibrateBam { // Creating a TSV file to restart from this step recalibratedBamTSV.map { idPatient, status, idSample, bam, bai -> gender = patientGenders[idPatient] - "${idPatient}\t${gender}\t${status}\t${idSample}\t${directoryMap.recalibrated}/${bam}\t${directoryMap.recalibrated}/${bai}\n" + "${idPatient}\t${gender}\t${status}\t${idSample}\t${params.outDir}/Preprocessing/Recalibrated/${bam}\t${params.outDir}/Preprocessing/Recalibrated/${bai}\n" }.collectFile( - name: 'recalibrated.tsv', sort: true, storeDir: directoryMap.recalibrated + name: 'recalibrated.tsv', sort: true, storeDir: "${params.outDir}/Preprocessing/Recalibrated" ) if (params.verbose) recalibratedBam = recalibratedBam.view { @@ -430,7 +429,7 @@ if (params.verbose) recalibratedBam = recalibratedBam.view { process RunSamtoolsStats { tag {idPatient + "-" + idSample} - publishDir directoryMap.samtoolsStats, mode: params.publishDirMode + publishDir "${params.outDir}/Reports/SamToolsStats", mode: params.publishDirMode input: set idPatient, status, idSample, file(bam), file(bai) from bamForSamToolsStats @@ -451,7 +450,7 @@ if (params.verbose) samtoolsStatsReport = samtoolsStatsReport.view { process 
RunBamQCrecalibrated { tag {idPatient + "-" + idSample} - publishDir directoryMap.bamQC, mode: params.publishDirMode + publishDir "${params.outDir}/Reports/bamQC", mode: params.publishDirMode input: set idPatient, status, idSample, file(bam), file(bai) from bamForBamQC diff --git a/runMultiQC.nf b/runMultiQC.nf index 5bf719e977..fcd1972e76 100644 --- a/runMultiQC.nf +++ b/runMultiQC.nf @@ -43,7 +43,6 @@ if (workflow.profile == 'awsbatch') { if (!params.awsqueue) exit 1, "Provide the job queue for aws batch!" } -directoryMap = SarekUtils.defineDirectoryMap(params.outDir) /* ================================================================================ = P R O C E S S E S = @@ -53,10 +52,10 @@ directoryMap = SarekUtils.defineDirectoryMap(params.outDir) startMessage() process GetVersionAll { - publishDir directoryMap.multiQC, mode: params.publishDirMode + publishDir "${params.outDir}/Reports/MultiQC", mode: params.publishDirMode input: - file(versions) from Channel.fromPath("${directoryMap.version}/*").collect().ifEmpty(file ("empty")) + file(versions) from Channel.fromPath("${params.outDir}/Reports/ToolsVersion/*").collect().ifEmpty(file ("empty")) output: file ("tool_versions_mqc.yaml") into versionsForMultiQC @@ -92,17 +91,17 @@ if (params.verbose && !params.noReports) versionsForMultiQC = versionsForMultiQC reportsForMultiQC = Channel.empty() .mix( - Channel.fromPath("${directoryMap.bamQC}/*", type: 'dir'), - Channel.fromPath("${directoryMap.bcftoolsStats}/*"), - Channel.fromPath("${directoryMap.fastQC}/*/*"), - Channel.fromPath("${directoryMap.markDuplicatesQC}/*"), - Channel.fromPath("${directoryMap.samtoolsStats}/*"), - Channel.fromPath("${directoryMap.snpeffReports}/*"), - Channel.fromPath("${directoryMap.vcftools}/*"), + Channel.fromPath("${params.outDir}/Reports/bamQC/*", type: 'dir'), + Channel.fromPath("${params.outDir}/Reports/BCFToolsStats/*"), + Channel.fromPath("${params.outDir}/Reports/FastQC/*/*"), + Channel.fromPath("${params.outDir}/Reports/MarkDuplicates/*"), + Channel.fromPath("${params.outDir}/Reports/SamToolsStats/*"), + Channel.fromPath("${params.outDir}/Annotation/*/snpEff/*.csv"), + Channel.fromPath("${params.outDir}/Reports/VCFTools/*"), ).collect() process RunMultiQC { - publishDir directoryMap.multiQC, mode: params.publishDirMode + publishDir "${params.outDir}/Reports/MultiQC", mode: params.publishDirMode input: file (multiqcConfig) from createMultiQCconfig() diff --git a/somaticVC.nf b/somaticVC.nf index 2dc78b0853..c959af9bf9 100644 --- a/somaticVC.nf +++ b/somaticVC.nf @@ -58,7 +58,6 @@ if (workflow.profile == 'awsbatch') { tools = params.tools ? params.tools.split(',').collect{it.trim().toLowerCase()} : [] -directoryMap = SarekUtils.defineDirectoryMap(params.outDir) referenceMap = defineReferenceMap() toolList = defineToolList() @@ -71,7 +70,7 @@ if (params.test && params.genome in ['GRCh37', 'GRCh38']) { tsvPath = '' if (params.sample) tsvPath = params.sample -else tsvPath = "${directoryMap.recalibrated}/recalibrated.tsv" +else tsvPath = "${params.outDir}/Preprocessing/Recalibrated/recalibrated.tsv" // Set up the bamFiles channel @@ -296,14 +295,9 @@ process ConcatVCF { script: outputFile = "${variantCaller}_${idSampleTumor}_vs_${idSampleNormal}.vcf" - - if (params.targetBED) // targeted - concatOptions = "-i ${genomeIndex} -c ${task.cpus} -o ${outputFile} -t ${targetBED}" - else // WGS - concatOptions = "-i ${genomeIndex} -c ${task.cpus} -o ${outputFile} " - + options = params.targetBED ? 
"-t ${targetBED}" : "" """ - concatenateVCFs.sh ${concatOptions} + concatenateVCFs.sh -i ${genomeIndex} -c ${task.cpus} -o ${outputFile} ${options} """ } @@ -333,13 +327,8 @@ process RunStrelka { when: 'strelka' in tools && !params.onlyQC script: - if (params.targetBED) { - beforeScript = "bgzip --threads ${task.cpus} -c ${targetBED} > call_targets.bed.gz ; tabix call_targets.bed.gz" - options = "--exome --callRegions call_targets.bed.gz" - } else { - beforeScript = "" - options = "" - } + beforeScript = params.targetBED ? "bgzip --threads ${task.cpus} -c ${targetBED} > call_targets.bed.gz ; tabix call_targets.bed.gz" : "" + options = params.targetBED ? "--exome --callRegions call_targets.bed.gz" : "" """ ${beforeScript} configureStrelkaSomaticWorkflow.py \ @@ -653,7 +642,7 @@ vcfForQC = Channel.empty().mix( process RunBcftoolsStats { tag {vcf} - publishDir directoryMap.bcftoolsStats, mode: params.publishDirMode + publishDir "${params.outDir}/Reports/BCFToolsStats", mode: params.publishDirMode input: set variantCaller, file(vcf) from vcfForBCFtools @@ -676,7 +665,7 @@ bcfReport.close() process RunVcftools { tag {vcf} - publishDir directoryMap.vcftools, mode: params.publishDirMode + publishDir "${params.outDir}/Reports/VCFTools", mode: params.publishDirMode input: set variantCaller, file(vcf) from vcfForVCFtools @@ -697,7 +686,7 @@ if (params.verbose) vcfReport = vcfReport.view { vcfReport.close() process GetVersionAlleleCount { - publishDir directoryMap.version, mode: params.publishDirMode + publishDir "${params.outDir}/Reports/ToolsVersion", mode: params.publishDirMode output: file("v_*.txt") when: 'ascat' in tools && !params.onlyQC @@ -708,7 +697,7 @@ process GetVersionAlleleCount { } process GetVersionASCAT { - publishDir directoryMap.version, mode: params.publishDirMode + publishDir "${params.outDir}/Reports/ToolsVersion", mode: params.publishDirMode output: file("v_*.txt") when: 'ascat' in tools && !params.onlyQC From 57e86f8e308e17a62d653a651d1249208a686391 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Tue, 19 Feb 2019 11:46:50 +0100 Subject: [PATCH 18/22] update CHANGELOG --- CHANGELOG.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4cc8ebbd6f..0a8e38bf4e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -28,17 +28,22 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. 
- [#722](https://github.com/SciLifeLab/Sarek/pull/722) - Update `Sarek-data` submodule - [#723](https://github.com/SciLifeLab/Sarek/pull/723), [#725](https://github.com/SciLifeLab/Sarek/pull/725) - Update docs - [#724](https://github.com/SciLifeLab/Sarek/pull/724) - Improved AwsBatch configuration +- [#728](https://github.com/SciLifeLab/Sarek/pull/728) - VCFs and Annotated VCFs are now ordered by Patient, then tools +- [#728](https://github.com/SciLifeLab/Sarek/pull/728) - Strelka Best Practices output is now prefixed with `StrelkaBP_` +- [#728](https://github.com/SciLifeLab/Sarek/pull/728) - Improved usage of `targetBED` params ### `Removed` - [#715](https://github.com/SciLifeLab/Sarek/pull/715) - Remove `defReferencesFiles` function from `buildReferences.nf` - [#719](https://github.com/SciLifeLab/Sarek/pull/719) - `snpEff` base container is no longer used - [#721](https://github.com/SciLifeLab/Sarek/pull/721) - Remove COSMIC docs +- [#728](https://github.com/SciLifeLab/Sarek/pull/728) - Remove `defineDirectoryMap()` ### `Fixed` - [#720](https://github.com/SciLifeLab/Sarek/pull/720) - bamQC is now run on the recalibrated bams, and not after MarkDuplicates - [#726](https://github.com/SciLifeLab/Sarek/pull/726) - Fix Ascat ref file input (one file can't be a set) - [#727](https://github.com/SciLifeLab/Sarek/pull/727) - bamQC outputs are no longer overwritten (name of dir is now the file instead of sample) - [#728](https://github.com/SciLifeLab/Sarek/pull/728) - Fix multi sample TSV file [#691](https://github.com/SciLifeLab/Sarek/issues/691) +- [#728](https://github.com/SciLifeLab/Sarek/pull/728) - Fix issue with annotation that was consuming `cache` channels ## [2.2.2] - 2018-12-19 From 736ee5666dbfdb2a20f27183581717e4e6f327a8 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Tue, 19 Feb 2019 13:08:59 +0100 Subject: [PATCH 19/22] typo --- germlineVC.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/germlineVC.nf b/germlineVC.nf index ce192c1e7f..21416a874f 100644 --- a/germlineVC.nf +++ b/germlineVC.nf @@ -336,7 +336,7 @@ process ConcatVCF { else outputFile = "${variantCaller}_${idSampleTumor}_vs_${idSampleNormal}.vcf" options = params.targetBED ? 
"-t ${targetBED}" : "" """ - concatenateVCFs.sh -i ${genomeIndex} -c ${task.cpus} -o ${outputFile} ${concatOptions} + concatenateVCFs.sh -i ${genomeIndex} -c ${task.cpus} -o ${outputFile} ${options} """ } From 73b1c6f69231ba875813c1bff4dbbc980376d68d Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Tue, 19 Feb 2019 17:20:44 +0100 Subject: [PATCH 20/22] fix ifEmpty for AWSBatch --- runMultiQC.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runMultiQC.nf b/runMultiQC.nf index fcd1972e76..2aec568485 100644 --- a/runMultiQC.nf +++ b/runMultiQC.nf @@ -55,7 +55,7 @@ process GetVersionAll { publishDir "${params.outDir}/Reports/MultiQC", mode: params.publishDirMode input: - file(versions) from Channel.fromPath("${params.outDir}/Reports/ToolsVersion/*").collect().ifEmpty(file ("empty")) + file(versions) from Channel.fromPath("${params.outDir}/Reports/ToolsVersion/*").collect().ifEmpty(null) output: file ("tool_versions_mqc.yaml") into versionsForMultiQC From a98814429dadc60fdfc95cfcaed72aca77062f49 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Tue, 19 Feb 2019 17:28:04 +0100 Subject: [PATCH 21/22] fix issue with AWS Batch cc @KochTobi --- annotate.nf | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/annotate.nf b/annotate.nf index 6dafd5ec3e..0a60414209 100644 --- a/annotate.nf +++ b/annotate.nf @@ -74,13 +74,13 @@ if (annotateVCF == []) { // This field is used to output final annotated VCFs in the correct directory Channel.empty().mix( Channel.fromPath("${params.outDir}/VariantCalling/*/HaplotypeCaller/*.vcf.gz") - .flatten().map{vcf -> ['haplotypecaller', vcf.minus(vcf.fileName)[-2], vcf]}, + .flatten().map{vcf -> ['haplotypecaller', vcf.minus(vcf.fileName)[-2].toString(), vcf]}, Channel.fromPath("${params.outDir}/VariantCalling/*/Manta/*SV.vcf.gz") - .flatten().map{vcf -> ['manta', vcf.minus(vcf.fileName)[-2], vcf]}, + .flatten().map{vcf -> ['manta', vcf.minus(vcf.fileName)[-2].toString(), vcf]}, Channel.fromPath("${params.outDir}/VariantCalling/*/MuTect2/*.vcf.gz") - .flatten().map{vcf -> ['mutect2', vcf.minus(vcf.fileName)[-2], vcf]}, + .flatten().map{vcf -> ['mutect2', vcf.minus(vcf.fileName)[-2].toString(), vcf]}, Channel.fromPath("${params.outDir}/VariantCalling/*/Strelka/*{somatic,variant}*.vcf.gz") - .flatten().map{vcf -> ['strelka', vcf.minus(vcf.fileName)[-2], vcf]}, + .flatten().map{vcf -> ['strelka', vcf.minus(vcf.fileName)[-2].toString(), vcf]}, ).choice(vcfToAnnotate, vcfNotToAnnotate) { annotateTools == [] || (annotateTools != [] && it[0] in annotateTools) ? 
0 : 1 } @@ -88,7 +88,7 @@ if (annotateVCF == []) { // Annotate user-submitted VCFs // If user-submitted, Sarek assume that the idPatient should be assumed automatically vcfToAnnotate = Channel.fromPath(annotateVCF) - .map{vcf -> ['userspecified', vcf.minus(vcf.fileName)[-2], vcf]} + .map{vcf -> ['userspecified', vcf.minus(vcf.fileName)[-2].toString(), vcf]} } else exit 1, "specify only tools or files to annotate, not both" vcfNotToAnnotate.close() From aca1b687cdf0554b6af6959cb7cd0cf59bf26ffa Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Tue, 19 Feb 2019 17:31:36 +0100 Subject: [PATCH 22/22] typo --- annotate.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/annotate.nf b/annotate.nf index 0a60414209..02768b4553 100644 --- a/annotate.nf +++ b/annotate.nf @@ -70,7 +70,7 @@ if (annotateVCF == []) { // Excluding vcfs from FreeBayes, and g.vcf from HaplotypeCaller // Basically it's: VariantCalling/*/{HaplotypeCaller,Manta,MuTect2,Strelka}/*.vcf.gz // Without *SmallIndels.vcf.gz from Manta, and *.genome.vcf.gz from Strelka -// The small snipet `vcf.minus(vcf.fileName)[-2]` catches idPatient +// The small snippet `vcf.minus(vcf.fileName)[-2]` catches idPatient // This field is used to output final annotated VCFs in the correct directory Channel.empty().mix( Channel.fromPath("${params.outDir}/VariantCalling/*/HaplotypeCaller/*.vcf.gz")