diff --git a/CHANGELOG.md b/CHANGELOG.md index 1c4963826a..eb1dd9ba37 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,6 +20,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - [#671](https://github.com/SciLifeLab/Sarek/pull/671) - publishDir modes are now params - [#677](https://github.com/SciLifeLab/Sarek/pull/677) - Update docs - [#679](https://github.com/SciLifeLab/Sarek/pull/679) - Update old awsbatch configuration +- [#693](https://github.com/SciLifeLab/Sarek/pull/693) - Qualimap bamQC is now ran after mapping and after recalibration for better QC ### `Fixed` diff --git a/conf/aws-batch.config b/conf/aws-batch.config index ec8a9e9139..a8d38fca57 100644 --- a/conf/aws-batch.config +++ b/conf/aws-batch.config @@ -8,7 +8,7 @@ */ params { - genome_base = params.genome == 'GRCh37' ? "s3://sarek-references/Homo_sapiens/GATK/GRCh37" : params.genome == 'iGRCh38' ? "s3://sarek-references/Homo_sapiens/GATK/GRCh38" : "s3://sarek-references/small" + genome_base = params.genome == 'iGRCh37' ? "s3://ngi-igenomes/igenomes/Homo_sapiens/GATK/GRCh37" : params.genome == 'iGRCh38' ? "s3://ngi-igenomes/igenomes/Homo_sapiens/GATK/GRCh38" : "s3://sarek-references/small" publishDirMode = 'copy' } diff --git a/conf/containers.config b/conf/containers.config index ad2becd9e5..2c0c34e2a4 100644 --- a/conf/containers.config +++ b/conf/containers.config @@ -65,7 +65,10 @@ process { withName:RunAscat { container = "${params.repository}/r-base:${params.tag}" } - withName:RunBamQC { + withName:RunBamQCmapped { + container = "${params.repository}/sarek:${params.tag}" + } + withName:RunBamQCrecalibrated { container = "${params.repository}/sarek:${params.tag}" } withName:RunBcftoolsStats { diff --git a/conf/genomes.config b/conf/genomes.config index b736bc9e4f..95bd91ff83 100644 --- a/conf/genomes.config +++ b/conf/genomes.config @@ -42,6 +42,19 @@ params { //AF_files = "${params.genome_base}/{00-All.dbsnp_151.hg38.CAF.TOPMED.alternate.allele.freq,hapmap_3.3_grch38_pop_stratified_af.HMAF,SweGen_hg38_stratified.SWAF}.vcf" //AF_indexes = "${params.genome_base}/{00-All.dbsnp_151.hg38.CAF.TOPMED.alternate.allele.freq,hapmap_3.3_grch38_pop_stratified_af.HMAF,SweGen_hg38_stratified.SWAF}.vcf.idx" } + 'iGRCh37' { + acLoci = "${params.genome_base}/Annotation/ASCAT/1000G_phase3_20130502_SNP_maf0.3.loci" + dbsnp = "${params.genome_base}/Annotation/GATKBundle/dbsnp_138.b37.vcf" + dbsnpIndex = "${params.genome_base}/Annotation/GATKBundle/dbsnp_138.b37.vcf.idx" + genomeFile = "${params.genome_base}/Sequence/WholeGenomeFasta/human_g1k_v37_decoy.fasta" + genomeDict = "${params.genome_base}/Sequence/WholeGenomeFasta/human_g1k_v37_decoy.dict" + genomeIndex = "${params.genome_base}/Sequence/WholeGenomeFasta/human_g1k_v37_decoy.fasta.fai" + bwaIndex = "${params.genome_base}/Sequence/BWAIndex/human_g1k_v37_decoy.fasta.{amb,ann,bwt,pac,sa}" + intervals = "${params.genome_base}/Annotation/intervals/wgs_calling_regions_CAW.list" + knownIndels = "${params.genome_base}/Annotation/GATKBundle/{1000G_phase1,Mills_and_1000G_gold_standard}.indels.b37.vcf" + knownIndelsIndex = "${params.genome_base}/Annotation/GATKBundle/{1000G_phase1,Mills_and_1000G_gold_standard}.indels.b37.vcf.idx" + snpeffDb = "GRCh37.75" + } 'iGRCh38' { acLoci = "${params.genome_base}/Annotation/ASCAT/1000G_phase3_GRCh38_maf0.3.loci" dbsnp = "${params.genome_base}/Annotation/GATKBundle/dbsnp_146.hg38.vcf.gz" @@ -51,8 +64,8 @@ params { genomeIndex = "${params.genome_base}/Sequence/WholeGenomeFasta/Homo_sapiens_assembly38.fasta.fai" bwaIndex = "${params.genome_base}/Sequence/BWAIndex/Homo_sapiens_assembly38.fasta.64.{alt,amb,ann,bwt,pac,sa}" intervals = "${params.genome_base}/Annotation/intervals/wgs_calling_regions.hg38.bed" - knownIndels = "${params.genome_base}/Annotation/GATKBundle/{Mills_and_1000G_gold_standard.indels.hg38,Homo_sapiens_assembly38.known_indels}.vcf.gz" - knownIndelsIndex = "${params.genome_base}/Annotation/GATKBundle/{Mills_and_1000G_gold_standard.indels.hg38,Homo_sapiens_assembly38.known_indels}.vcf.gz.tbi" + knownIndels = "${params.genome_base}/Annotation/GATKBundle/{Mills_and_1000G_gold_standard.indels.hg38,beta/Homo_sapiens_assembly38.known_indels}.vcf.gz" + knownIndelsIndex = "${params.genome_base}/Annotation/GATKBundle/{Mills_and_1000G_gold_standard.indels.hg38,beta/Homo_sapiens_assembly38.known_indels}.vcf.gz.tbi" snpeffDb = "GRCh38.86" } 'smallGRCh37' { diff --git a/conf/singularity-path.config b/conf/singularity-path.config index d41cf45059..c2a357b589 100644 --- a/conf/singularity-path.config +++ b/conf/singularity-path.config @@ -70,7 +70,10 @@ process { withName:RunAscat { container = "${params.containerPath}/r-base-${params.tag}.simg" } - withName:RunBamQC { + withName:RunBamQCmapped { + container = "${params.containerPath}/sarek-${params.tag}.simg" + } + withName:RunBamQCrecalibrated { container = "${params.containerPath}/sarek-${params.tag}.simg" } withName:RunBcftoolsStats { diff --git a/conf/uppmax-localhost.config b/conf/uppmax-localhost.config index 10c6725ba4..a1a08827f5 100644 --- a/conf/uppmax-localhost.config +++ b/conf/uppmax-localhost.config @@ -84,7 +84,11 @@ process { withName:RunAscat { memory = {params.singleCPUMem * 2 * task.attempt} } - withName:RunBamQC { + withName:RunBamQCmapped { + cpus = 16 + memory = {params.totalMemory} + } + withName:RunBamQCrecalibrated { cpus = 16 memory = {params.totalMemory} } diff --git a/conf/uppmax-slurm.config b/conf/uppmax-slurm.config index 8686d7af91..9be0393fa5 100644 --- a/conf/uppmax-slurm.config +++ b/conf/uppmax-slurm.config @@ -71,7 +71,11 @@ process { memory = {params.singleCPUMem * 2 * task.attempt} queue = 'core' } - withName:RunBamQC { + withName:RunBamQCmapped { + cpus = 16 + } + withName:RunBamQCrecalibrated { + cpus = 16 } withName:RunBcftoolsStats { cpus = 1 diff --git a/germlineVC.nf b/germlineVC.nf index 7b53d1006c..9c355ecf55 100644 --- a/germlineVC.nf +++ b/germlineVC.nf @@ -26,8 +26,6 @@ kate: syntax groovy; space-indent on; indent-width 2; https://github.com/SciLifeLab/Sarek/README.md -------------------------------------------------------------------------------- Processes overview - - RunSamtoolsStats - Run Samtools stats on recalibrated BAM files - - RunBamQC - Run qualimap BamQC on recalibrated BAM files - CreateIntervalBeds - Create and sort intervals into bed files - RunHaplotypecaller - Run HaplotypeCaller for Germline Variant Calling (Parallelized processes) - RunGenotypeGVCFs - Run HaplotypeCaller for Germline Variant Calling (Parallelized processes) @@ -89,7 +87,7 @@ if (params.verbose) bamFiles = bamFiles.view { } // assume input is recalibrated, ignore explicitBqsrNeeded -(bamForBamQC, bamForSamToolsStats, recalibratedBam, recalTables) = bamFiles.into(4) +(recalibratedBam, recalTables) = bamFiles.into(2) recalTables = recalTables.map{ it + [null] } // null recalibration table means: do not use --BQSR @@ -101,48 +99,6 @@ if (params.verbose) recalibratedBam = recalibratedBam.view { Files : [${it[3].fileName}, ${it[4].fileName}]" } -process RunSamtoolsStats { - tag {idPatient + "-" + idSample} - - publishDir directoryMap.samtoolsStats, mode: params.publishDirMode - - input: - set idPatient, status, idSample, file(bam), file(bai) from bamForSamToolsStats - - output: - file ("${bam}.samtools.stats.out") into samtoolsStatsReport - - when: !params.noReports - - script: QC.samtoolsStats(bam) -} - -if (params.verbose) samtoolsStatsReport = samtoolsStatsReport.view { - "SAMTools stats report:\n\ - File : [${it.fileName}]" -} - -process RunBamQC { - tag {idPatient + "-" + idSample} - - publishDir directoryMap.bamQC, mode: params.publishDirMode - - input: - set idPatient, status, idSample, file(bam), file(bai) from bamForBamQC - - output: - file(idSample) into bamQCreport - - when: !params.noReports && !params.noBAMQC - - script: QC.bamQC(bam,idSample,task.memory) -} - -if (params.verbose) bamQCreport = bamQCreport.view { - "BamQC report:\n\ - Dir : [${it.fileName}]" -} - // Here we have a recalibrated bam set, but we need to separate the bam files based on patient status. // The sample tsv config file which is formatted like: "subject status sample lane fastq1 fastq2" // cf fastqFiles channel, I decided just to add _status to the sample name to have less changes to do. diff --git a/lib/QC.groovy b/lib/QC.groovy index 2e1c20f820..409a5a27d4 100644 --- a/lib/QC.groovy +++ b/lib/QC.groovy @@ -1,15 +1,4 @@ class QC { -// Run bamQC on vcf file - static def bamQC(bam, idSample, mem) { - """ - qualimap --java-mem-size=${mem.toGiga()}G \ - bamqc \ - -bam ${bam} \ - -outdir ${idSample} \ - -outformat HTML - """ - } - // Run bcftools on vcf file static def bcftools(vcf) { """ diff --git a/main.nf b/main.nf index 6c6f647fda..433d681dad 100644 --- a/main.nf +++ b/main.nf @@ -33,7 +33,8 @@ kate: syntax groovy; space-indent on; indent-width 2; - CreateRecalibrationTable - Create Recalibration Table with BaseRecalibrator - RecalibrateBam - Recalibrate Bam with PrintReads - RunSamtoolsStats - Run Samtools stats on recalibrated BAM files - - RunBamQC - Run qualimap BamQC on recalibrated BAM files + - RunBamQCmapped - Run qualimap BamQC on mapped BAM files + - RunBamQCrecalibrated - Run qualimap BamQC on recalibrated BAM files ================================================================================ = C O N F I G U R A T I O N = ================================================================================ @@ -73,7 +74,6 @@ if (!params.sample && !params.sampleDir) { } // Set up the fastqFiles and bamFiles channels. One of them remains empty -// Except for step annotate, in which both stay empty fastqFiles = Channel.empty() bamFiles = Channel.empty() if (tsvPath) { @@ -152,7 +152,7 @@ process MapReads { set file(genomeFile), file(bwaIndex) from Channel.value([referenceMap.genomeFile, referenceMap.bwaIndex]) output: - set idPatient, status, idSample, idRun, file("${idRun}.bam") into mappedBam + set idPatient, status, idSample, idRun, file("${idRun}.bam") into (mappedBam, mappedBamForQC) when: step == 'mapping' && !params.onlyQC @@ -174,6 +174,40 @@ if (params.verbose) mappedBam = mappedBam.view { File : [${it[4].fileName}]" } +process RunBamQCmapped { + tag {idPatient + "-" + idSample} + + publishDir directoryMap.bamQC, mode: params.publishDirMode + + input: + set idPatient, status, idSample, idRun, file(bam) from mappedBamForQC + + output: + file(idSample) into bamQCmappedReport + + when: !params.noReports && !params.noBAMQC + + script: + """ + qualimap --java-mem-size=${task.memory.toGiga()}G \ + bamqc \ + -bam ${bam} \ + --paint-chromosome-limits \ + --genome-gc-distr HUMAN \ + -nt ${task.cpus} \ + -skip-duplicated \ + --skip-dup-mode 0 \ + -outdir ${idSample} \ + -outformat HTML + """ +} + +if (params.verbose) bamQCmappedReport = bamQCmappedReport.view { + "BamQC report:\n\ + Dir : [${it.fileName}]" +} + + // Sort bam whether they are standalone or should be merged // Borrowed code from https://github.com/guigolab/chip-nf @@ -416,7 +450,7 @@ if (params.verbose) samtoolsStatsReport = samtoolsStatsReport.view { File : [${it.fileName}]" } -process RunBamQC { +process RunBamQCrecalibrated { tag {idPatient + "-" + idSample} publishDir directoryMap.bamQC, mode: params.publishDirMode @@ -425,14 +459,26 @@ process RunBamQC { set idPatient, status, idSample, file(bam), file(bai) from bamForBamQC output: - file(idSample) into bamQCreport + file(idSample) into bamQCrecalibratedReport when: !params.noReports && !params.noBAMQC - script: QC.bamQC(bam,idSample,task.memory) + script: + """ + qualimap --java-mem-size=${task.memory.toGiga()}G \ + bamqc \ + -bam ${bam} \ + --paint-chromosome-limits \ + --genome-gc-distr HUMAN \ + -nt ${task.cpus} \ + -skip-duplicated \ + --skip-dup-mode 0 \ + -outdir ${idSample} \ + -outformat HTML + """ } -if (params.verbose) bamQCreport = bamQCreport.view { +if (params.verbose) bamQCrecalibratedReport = bamQCrecalibratedReport.view { "BamQC report:\n\ Dir : [${it.fileName}]" } diff --git a/somaticVC.nf b/somaticVC.nf index 9a99aebebd..e51f671d6b 100644 --- a/somaticVC.nf +++ b/somaticVC.nf @@ -26,8 +26,6 @@ kate: syntax groovy; space-indent on; indent-width 2; https://github.com/SciLifeLab/Sarek/README.md -------------------------------------------------------------------------------- Processes overview - - RunSamtoolsStats - Run Samtools stats on recalibrated BAM files - - RunBamQC - Run qualimap BamQC on recalibrated BAM files - CreateIntervalBeds - Create and sort intervals into bed files - RunMutect2 - Run MuTect2 for Variant Calling (Parallelized processes) - RunFreeBayes - Run FreeBayes for Variant Calling (Parallelized processes) @@ -98,7 +96,7 @@ if (params.verbose) bamFiles = bamFiles.view { } // assume input is recalibrated, ignore explicitBqsrNeeded -(bamForBamQC, bamForSamToolsStats, recalibratedBam, recalTables) = bamFiles.into(4) +(recalibratedBam, recalTables) = bamFiles.into(2) recalTables = recalTables.map{ it + [null] } // null recalibration table means: do not use --BQSR @@ -110,48 +108,6 @@ if (params.verbose) recalibratedBam = recalibratedBam.view { Files : [${it[3].fileName}, ${it[4].fileName}]" } -process RunSamtoolsStats { - tag {idPatient + "-" + idSample} - - publishDir directoryMap.samtoolsStats, mode: params.publishDirMode - - input: - set idPatient, status, idSample, file(bam), file(bai) from bamForSamToolsStats - - output: - file ("${bam}.samtools.stats.out") into samtoolsStatsReport - - when: !params.noReports - - script: QC.samtoolsStats(bam) -} - -if (params.verbose) samtoolsStatsReport = samtoolsStatsReport.view { - "SAMTools stats report:\n\ - File : [${it.fileName}]" -} - -process RunBamQC { - tag {idPatient + "-" + idSample} - - publishDir directoryMap.bamQC, mode: params.publishDirMode - - input: - set idPatient, status, idSample, file(bam), file(bai) from bamForBamQC - - output: - file(idSample) into bamQCreport - - when: !params.noReports && !params.noBAMQC - - script: QC.bamQC(bam,idSample,task.memory) -} - -if (params.verbose) bamQCreport = bamQCreport.view { - "BamQC report:\n\ - Dir : [${it.fileName}]" -} - // Here we have a recalibrated bam set, but we need to separate the bam files based on patient status. // The sample tsv config file which is formatted like: "subject status sample lane fastq1 fastq2" // cf fastqFiles channel, I decided just to add _status to the sample name to have less changes to do.