Merge pull request #266 from FriederikeHanssen/dsl2_variantcalling

Variantcalling
nf-core · Sep 21, 2020 · 355ca6f · 355ca6f
2 parents a7679b9 + 3c28aa6
commit 355ca6f
Show file tree

Hide file tree

Showing 7 changed files with 246 additions and 177 deletions.
diff --git a/conf/test_germline_variantcalling.config b/conf/test_germline_variantcalling.config
@@ -0,0 +1,15 @@
+/*
+ * -------------------------------------------------
+ *  Nextflow config file for running tests
+ * -------------------------------------------------
+ * Defines bundled input files and everything required
+ * to run a fast and simple test. Use as follows:
+ *   nextflow run nf-core/sarek -profile test_tool
+ */
+
+includeConfig 'test.config'
+
+params {
+  // Input data
+  tools = 'haplotypecaller,strelka'
+}
diff --git a/main.nf b/main.nf
@@ -288,6 +288,9 @@ include { SAMTOOLS_INDEX         as SAMTOOLS_INDEX_MAPPED } from './modules/nf-c
 include { SAMTOOLS_INDEX         as SAMTOOLS_INDEX_RECAL }  from './modules/nf-core/software/samtools_index'
 include { SAMTOOLS_STATS         as SAMTOOLS_STATS }        from './modules/nf-core/software/samtools_stats'
 include { QUALIMAP_BAMQC         as BAMQC }                 from './modules/nf-core/software/qualimap_bamqc'
+include { GATK_HAPLOTYPECALLER   as HAPLOTYPECALLER }       from './modules/nf-core/software/gatk_haplotypecaller'
+include { GATK_GENOTYPEVCF       as GENOTYPEVCF }           from './modules/nf-core/software/gatk_genotypegvcf'
+include { STRELKA                as STRELKA }               from './modules/nf-core/software/strelka'
 include { MULTIQC }                                         from './modules/nf-core/software/multiqc'
 
 /*
@@ -589,7 +592,15 @@ workflow {
         bam_recalibrated = MERGE_BAM_RECAL.out.bam
         tsv_recalibrated = MERGE_BAM_RECAL.out.tsv
     }
+    //TODO: set bam_recalibrated with all these steps
+    // // When using sentieon for mapping, Channel bam_recalibrated is bam_sentieon_recal
+    // if (params.sentieon && step == 'mapping') bam_recalibrated = bam_sentieon_recal
 
+    // // When no knownIndels for mapping, Channel bam_recalibrated is bam_duplicates_marked
+    // if (!params.known_indels && step == 'mapping') bam_recalibrated = bam_duplicates_marked
+
+    // // When starting with variant calling, Channel bam_recalibrated is input_sample
+    // if (step == 'variantcalling') bam_recalibrated = input_sample
     // Creating TSV files to restart from this step
     tsv_recalibrated.collectFile(storeDir: "${params.outdir}/Preprocessing/TSV") { meta ->
         patient = meta.patient
@@ -620,7 +631,41 @@ workflow {
                                 GERMLINE VARIANT CALLING
     ================================================================================
     */
+    //TODO double check whether the indexing has to be repeated here. there is a bai file somewhere up at ApplyBQSR
+    bam_recalibrated_indexed_variant_calling = SAMTOOLS_INDEX_RECAL(bam_recalibrated, params.modules['samtools_index_mapped'],)
+    if ('haplotypecaller' in tools){
+        bam_haplotypecaller = bam_recalibrated_indexed_variant_calling.combine(intervals)
+
+        // STEP GATK HAPLOTYPECALLER.1
+
+        HAPLOTYPECALLER(bam_haplotypecaller, dbsnp,
+                                             dbsnp_tbi,
+                                             dict,
+                                             fasta,
+                                             fai)
+
+
+        // STEP GATK HAPLOTYPECALLER.2
+        GENOTYPEVCF(HAPLOTYPECALLER.out.gvcfGenotypeGVCFs, dbsnp,
+                                             dbsnp_tbi,
+                                             dict,
+                                             fasta,
+                                             fai)
+
+        GENOTYPEVCF.out.map{name, meta, vcf -> 
+            patient = meta.patient
+            sample  = meta.sample
+            gender  = meta.gender
+            status  = meta.status
+            [name, patient, sample, gender, status, vcf] 
+        }.groupTuple(by: [0,1,2,])
+         .set{ vcfGenotypeGVCFs }
+    }
 
+    if ('strelka' in tools) {
+        STRELKA(bam_recalibrated_indexed_variant_calling, fasta, fai, target_bed, params.modules['strelka'])
+    }
+
     /*
     ================================================================================
                                 SOMATIC VARIANT CALLING
@@ -696,16 +741,8 @@ workflow.onComplete {
 // ================================================================================
 // */
 
-// // When using sentieon for mapping, Channel bam_recalibrated is bam_sentieon_recal
-// if (params.sentieon && step == 'mapping') bam_recalibrated = bam_sentieon_recal
 
-// // When no knownIndels for mapping, Channel bam_recalibrated is bam_duplicates_marked
-// if (!params.known_indels && step == 'mapping') bam_recalibrated = bam_duplicates_marked
 
-// // When starting with variant calling, Channel bam_recalibrated is input_sample
-// if (step == 'variantcalling') bam_recalibrated = input_sample
-
-// bam_recalibrated = bam_recalibrated.dump(tag:'BAM for Variant Calling')
 
 // // Here we have a recalibrated bam set
 // // The TSV file is formatted like: "idPatient status idSample bamFile baiFile"
@@ -714,186 +751,17 @@ workflow.onComplete {
 
 // (bamMantaSingle, bamStrelkaSingle, bamTIDDIT, bamFreebayesSingleNoIntervals, bamHaplotypeCallerNoIntervals, bamRecalAll) = bam_recalibrated.into(6)
 
-// (bam_sentieon_DNAseq, bam_sentieon_DNAscope, bam_sentieon_all) = bam_sentieon_deduped_table.into(3)
 
 // // To speed Variant Callers up we are chopping the reference into smaller pieces
 // // Do variant calling by this intervals, and re-merge the VCFs
-
-// bamHaplotypeCaller = bamHaplotypeCallerNoIntervals.spread(intHaplotypeCaller)
 // bamFreebayesSingle = bamFreebayesSingleNoIntervals.spread(intFreebayesSingle)
 
-// // STEP GATK HAPLOTYPECALLER.1
 
-// process HaplotypeCaller {
-//     label 'memory_singleCPU_task_sq'
-//     label 'cpus_2'
 
-//     tag "${idSample}-${intervalBed.baseName}"
 
-//     input:
-//         set idPatient, idSample, file(bam), file(bai), file(intervalBed) from bamHaplotypeCaller
-//         file(dbsnp) from dbsnp
-//         file(dbsnpIndex) from dbsnp_tbi
-//         file(dict) from dict
-//         file(fasta) from fasta
-//         file(fastaFai) from fai
-
-//     output:
-//         set val("HaplotypeCallerGVCF"), idPatient, idSample, file("${intervalBed.baseName}_${idSample}.g.vcf") into gvcfHaplotypeCaller
-//         set idPatient, idSample, file(intervalBed), file("${intervalBed.baseName}_${idSample}.g.vcf") into gvcfGenotypeGVCFs
-
-//     when: 'haplotypecaller' in tools
-
-//     script:
-//     intervalsOptions = params.no_intervals ? "" : "-L ${intervalBed}"
-//     dbsnpOptions = params.dbsnp ? "--D ${dbsnp}" : ""
-//     """
-//     gatk --java-options "-Xmx${task.memory.toGiga()}g -Xms6000m -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10" \
-//         HaplotypeCaller \
-//         -R ${fasta} \
-//         -I ${bam} \
-//         ${intervalsOptions} \
-//         ${dbsnpOptions} \
-//         -O ${intervalBed.baseName}_${idSample}.g.vcf \
-//         -ERC GVCF
-//     """
-// }
 
-// gvcfHaplotypeCaller = gvcfHaplotypeCaller.groupTuple(by:[0, 1, 2])
 
-// if (params.no_gvcf) gvcfHaplotypeCaller.close()
-// else gvcfHaplotypeCaller = gvcfHaplotypeCaller.dump(tag:'GVCF HaplotypeCaller')
-
-// // STEP GATK HAPLOTYPECALLER.2
-
-// process GenotypeGVCFs {
-//     tag "${idSample}-${intervalBed.baseName}"
-
-//     input:
-//         set idPatient, idSample, file(intervalBed), file(gvcf) from gvcfGenotypeGVCFs
-//         file(dbsnp) from dbsnp
-//         file(dbsnpIndex) from dbsnp_tbi
-//         file(dict) from dict
-//         file(fasta) from fasta
-//         file(fastaFai) from fai
-
-//     output:
-//     set val("HaplotypeCaller"), idPatient, idSample, file("${intervalBed.baseName}_${idSample}.vcf") into vcfGenotypeGVCFs
-
-//     when: 'haplotypecaller' in tools
-
-//     script:
-//     // Using -L is important for speed and we have to index the interval files also
-//     intervalsOptions = params.no_intervals ? "" : "-L ${intervalBed}"
-//     dbsnpOptions = params.dbsnp ? "--D ${dbsnp}" : ""
-//     """
-//     gatk --java-options -Xmx${task.memory.toGiga()}g \
-//         IndexFeatureFile \
-//         -I ${gvcf}
-
-//     gatk --java-options -Xmx${task.memory.toGiga()}g \
-//         GenotypeGVCFs \
-//         -R ${fasta} \
-//         ${intervalsOptions} \
-//         ${dbsnpOptions} \
-//         -V ${gvcf} \
-//         -O ${intervalBed.baseName}_${idSample}.vcf
-//     """
-// }
-
-// vcfGenotypeGVCFs = vcfGenotypeGVCFs.groupTuple(by:[0, 1, 2])
-
-// // STEP SENTIEON DNAseq
-
-// process Sentieon_DNAseq {
-//     label 'cpus_max'
-//     label 'memory_max'
-//     label 'sentieon'
-
-//     tag "${idSample}"
-
-//     input:
-//         set idPatient, idSample, file(bam), file(bai), file(recal) from bam_sentieon_DNAseq
-//         file(dbsnp) from dbsnp
-//         file(dbsnpIndex) from dbsnp_tbi
-//         file(fasta) from fasta
-//         file(fastaFai) from fai
-
-//     output:
-//     set val("SentieonDNAseq"), idPatient, idSample, file("DNAseq_${idSample}.vcf") into vcf_sentieon_DNAseq
-
-//     when: 'dnaseq' in tools && params.sentieon
-
-//     script:
-//     """
-//     sentieon driver \
-//         -t ${task.cpus} \
-//         -r ${fasta} \
-//         -i ${bam} \
-//         -q ${recal} \
-//         --algo Haplotyper \
-//         -d ${dbsnp} \
-//         DNAseq_${idSample}.vcf
-//     """
-// }
-
-// vcf_sentieon_DNAseq = vcf_sentieon_DNAseq.dump(tag:'sentieon DNAseq')
-
-// // STEP SENTIEON DNAscope
-
-// process Sentieon_DNAscope {
-//     label 'cpus_max'
-//     label 'memory_max'
-//     label 'sentieon'
-
-//     tag "${idSample}"
-
-//     input:
-//         set idPatient, idSample, file(bam), file(bai), file(recal) from bam_sentieon_DNAscope
-//         file(dbsnp) from dbsnp
-//         file(dbsnpIndex) from dbsnp_tbi
-//         file(fasta) from fasta
-//         file(fastaFai) from fai
-
-//     output:
-//     set val("SentieonDNAscope"), idPatient, idSample, file("DNAscope_${idSample}.vcf") into vcf_sentieon_DNAscope
-//     set val("SentieonDNAscope"), idPatient, idSample, file("DNAscope_SV_${idSample}.vcf") into vcf_sentieon_DNAscope_SV
-
-//     when: 'dnascope' in tools && params.sentieon
-
-//     script:
-//     """
-//     sentieon driver \
-//         -t ${task.cpus} \
-//         -r ${fasta} \
-//         -i ${bam} \
-//         -q ${recal} \
-//         --algo DNAscope \
-//         -d ${dbsnp} \
-//         DNAscope_${idSample}.vcf
-
-//     sentieon driver \
-//         -t ${task.cpus} \
-//         -r ${fasta}\
-//         -i ${bam} \
-//         -q ${recal} \
-//         --algo DNAscope \
-//         --var_type bnd \
-//         -d ${dbsnp} \
-//         DNAscope_${idSample}.temp.vcf
-
-//     sentieon driver \
-//         -t ${task.cpus} \
-//         -r ${fasta}\
-//         -q ${recal} \
-//         --algo SVSolver \
-//         -v DNAscope_${idSample}.temp.vcf \
-//         DNAscope_SV_${idSample}.vcf
-//     """
-// }
 
-// vcf_sentieon_DNAscope = vcf_sentieon_DNAscope.dump(tag:'sentieon DNAscope')
-// vcf_sentieon_DNAscope_SV = vcf_sentieon_DNAscope_SV.dump(tag:'sentieon DNAscope SV')
 
 // // STEP STRELKA.1 - SINGLE MODE
 

diff --git a/modules/nf-core/software/functions.nf b/modules/nf-core/software/functions.nf
@@ -0,0 +1,59 @@
+/*
+ * -----------------------------------------------------
+ *  Utility functions used in nf-core DSL2 module files
+ * -----------------------------------------------------
+ */
+
+/*
+ * Extract name of software tool from process name using $task.process
+ */
+def getSoftwareName(task_process) {
+    return task_process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()
+}
+
+/*
+ * Function to initialise default values and to generate a Groovy Map of available options for nf-core modules
+ */
+def initOptions(Map args) {
+    def Map options = [:]
+    options.args          = args.args ?: ''
+    options.args2         = args.args2 ?: ''
+    options.publish_by_id = args.publish_by_id ?: false
+    options.publish_dir   = args.publish_dir ?: ''
+    options.publish_files = args.publish_files ?: null
+    options.suffix        = args.suffix ?: ''
+    return options
+}
+
+/*
+ * Tidy up and join elements of a list to return a path string
+ */
+def getPathFromList(path_list) {
+    def paths = path_list.findAll { item -> !item?.trim().isEmpty() }  // Remove empty entries
+    paths = paths.collect { it.trim().replaceAll("^[/]+|[/]+\$", "") } // Trim whitespace and trailing slashes
+    return paths.join('/')
+}
+
+/*
+ * Function to save/publish module results
+ */
+def saveFiles(Map args) {
+    if (!args.filename.endsWith('.version.txt')) {
+        def ioptions = initOptions(args.options)
+        def path_list = [ ioptions.publish_dir ?: args.publish_dir ]
+        if (ioptions.publish_by_id) {
+            path_list.add(args.publish_id)
+        }
+        if (ioptions.publish_files instanceof Map) {
+            for (ext in ioptions.publish_files) {
+                if (args.filename.endsWith(ext.key)) {
+                    def ext_list = path_list.collect()
+                    ext_list.add(ext.value)
+                    return "${getPathFromList(ext_list)}/$args.filename"
+                }
+            }
+        } else {
+            return "${getPathFromList(path_list)}/$args.filename"
+        }
+    }
+}    
diff --git a/modules/nf-core/software/gatk_genotypegvcf.nf b/modules/nf-core/software/gatk_genotypegvcf.nf
@@ -0,0 +1,32 @@
+process GATK_GENOTYPEVCF {
+    tag "${meta.id}-${interval.baseName}"
+
+    input:
+        tuple val(meta), path(interval), path(gvcf)
+        path dbsnp
+        path dbsnpIndex
+        path dict
+        path fasta
+        path fai
+
+    output:
+     tuple val("HaplotypeCaller"), val(meta), path("${interval.baseName}_${meta.id}.vcf")
+
+    script:
+    // Using -L is important for speed and we have to index the interval files also
+    intervalsOptions = params.no_intervals ? "" : "-L ${interval}"
+    dbsnpOptions = params.dbsnp ? "--D ${dbsnp}" : ""
+    """
+    gatk --java-options -Xmx${task.memory.toGiga()}g \
+        IndexFeatureFile \
+        -I ${gvcf}
+
+    gatk --java-options -Xmx${task.memory.toGiga()}g \
+        GenotypeGVCFs \
+        -R ${fasta} \
+        ${intervalsOptions} \
+        ${dbsnpOptions} \
+        -V ${gvcf} \
+        -O ${interval.baseName}_${meta.id}.vcf
+    """
+}