diff --git a/conf/test_germline_variantcalling.config b/conf/test_germline_variantcalling.config new file mode 100644 index 0000000000..7cec01feca --- /dev/null +++ b/conf/test_germline_variantcalling.config @@ -0,0 +1,15 @@ +/* + * ------------------------------------------------- + * Nextflow config file for running tests + * ------------------------------------------------- + * Defines bundled input files and everything required + * to run a fast and simple test. Use as follows: + * nextflow run nf-core/sarek -profile test_tool + */ + +includeConfig 'test.config' + +params { + // Input data + tools = 'haplotypecaller,strelka' +} \ No newline at end of file diff --git a/main.nf b/main.nf index 1a08a9d127..e2c0ff5eca 100644 --- a/main.nf +++ b/main.nf @@ -288,6 +288,9 @@ include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_MAPPED } from './modules/nf-c include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_RECAL } from './modules/nf-core/software/samtools_index' include { SAMTOOLS_STATS as SAMTOOLS_STATS } from './modules/nf-core/software/samtools_stats' include { QUALIMAP_BAMQC as BAMQC } from './modules/nf-core/software/qualimap_bamqc' +include { GATK_HAPLOTYPECALLER as HAPLOTYPECALLER } from './modules/nf-core/software/gatk_haplotypecaller' +include { GATK_GENOTYPEVCF as GENOTYPEVCF } from './modules/nf-core/software/gatk_genotypegvcf' +include { STRELKA as STRELKA } from './modules/nf-core/software/strelka' include { MULTIQC } from './modules/nf-core/software/multiqc' /* @@ -589,7 +592,15 @@ workflow { bam_recalibrated = MERGE_BAM_RECAL.out.bam tsv_recalibrated = MERGE_BAM_RECAL.out.tsv } + //TODO: set bam_recalibrated with all these steps + // // When using sentieon for mapping, Channel bam_recalibrated is bam_sentieon_recal + // if (params.sentieon && step == 'mapping') bam_recalibrated = bam_sentieon_recal + // // When no knownIndels for mapping, Channel bam_recalibrated is bam_duplicates_marked + // if (!params.known_indels && step == 'mapping') bam_recalibrated = bam_duplicates_marked + + // // When starting with variant calling, Channel bam_recalibrated is input_sample + // if (step == 'variantcalling') bam_recalibrated = input_sample // Creating TSV files to restart from this step tsv_recalibrated.collectFile(storeDir: "${params.outdir}/Preprocessing/TSV") { meta -> patient = meta.patient @@ -620,7 +631,41 @@ workflow { GERMLINE VARIANT CALLING ================================================================================ */ + //TODO double check whether the indexing has to be repeated here. there is a bai file somewhere up at ApplyBQSR + bam_recalibrated_indexed_variant_calling = SAMTOOLS_INDEX_RECAL(bam_recalibrated, params.modules['samtools_index_mapped'],) + if ('haplotypecaller' in tools){ + bam_haplotypecaller = bam_recalibrated_indexed_variant_calling.combine(intervals) + + // STEP GATK HAPLOTYPECALLER.1 + + HAPLOTYPECALLER(bam_haplotypecaller, dbsnp, + dbsnp_tbi, + dict, + fasta, + fai) + + + // STEP GATK HAPLOTYPECALLER.2 + GENOTYPEVCF(HAPLOTYPECALLER.out.gvcfGenotypeGVCFs, dbsnp, + dbsnp_tbi, + dict, + fasta, + fai) + + GENOTYPEVCF.out.map{name, meta, vcf -> + patient = meta.patient + sample = meta.sample + gender = meta.gender + status = meta.status + [name, patient, sample, gender, status, vcf] + }.groupTuple(by: [0,1,2,]) + .set{ vcfGenotypeGVCFs } + } + if ('strelka' in tools) { + STRELKA(bam_recalibrated_indexed_variant_calling, fasta, fai, target_bed, params.modules['strelka']) + } + /* ================================================================================ SOMATIC VARIANT CALLING @@ -696,16 +741,8 @@ workflow.onComplete { // ================================================================================ // */ -// // When using sentieon for mapping, Channel bam_recalibrated is bam_sentieon_recal -// if (params.sentieon && step == 'mapping') bam_recalibrated = bam_sentieon_recal -// // When no knownIndels for mapping, Channel bam_recalibrated is bam_duplicates_marked -// if (!params.known_indels && step == 'mapping') bam_recalibrated = bam_duplicates_marked -// // When starting with variant calling, Channel bam_recalibrated is input_sample -// if (step == 'variantcalling') bam_recalibrated = input_sample - -// bam_recalibrated = bam_recalibrated.dump(tag:'BAM for Variant Calling') // // Here we have a recalibrated bam set // // The TSV file is formatted like: "idPatient status idSample bamFile baiFile" @@ -714,186 +751,17 @@ workflow.onComplete { // (bamMantaSingle, bamStrelkaSingle, bamTIDDIT, bamFreebayesSingleNoIntervals, bamHaplotypeCallerNoIntervals, bamRecalAll) = bam_recalibrated.into(6) -// (bam_sentieon_DNAseq, bam_sentieon_DNAscope, bam_sentieon_all) = bam_sentieon_deduped_table.into(3) // // To speed Variant Callers up we are chopping the reference into smaller pieces // // Do variant calling by this intervals, and re-merge the VCFs - -// bamHaplotypeCaller = bamHaplotypeCallerNoIntervals.spread(intHaplotypeCaller) // bamFreebayesSingle = bamFreebayesSingleNoIntervals.spread(intFreebayesSingle) -// // STEP GATK HAPLOTYPECALLER.1 -// process HaplotypeCaller { -// label 'memory_singleCPU_task_sq' -// label 'cpus_2' -// tag "${idSample}-${intervalBed.baseName}" -// input: -// set idPatient, idSample, file(bam), file(bai), file(intervalBed) from bamHaplotypeCaller -// file(dbsnp) from dbsnp -// file(dbsnpIndex) from dbsnp_tbi -// file(dict) from dict -// file(fasta) from fasta -// file(fastaFai) from fai - -// output: -// set val("HaplotypeCallerGVCF"), idPatient, idSample, file("${intervalBed.baseName}_${idSample}.g.vcf") into gvcfHaplotypeCaller -// set idPatient, idSample, file(intervalBed), file("${intervalBed.baseName}_${idSample}.g.vcf") into gvcfGenotypeGVCFs - -// when: 'haplotypecaller' in tools - -// script: -// intervalsOptions = params.no_intervals ? "" : "-L ${intervalBed}" -// dbsnpOptions = params.dbsnp ? "--D ${dbsnp}" : "" -// """ -// gatk --java-options "-Xmx${task.memory.toGiga()}g -Xms6000m -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10" \ -// HaplotypeCaller \ -// -R ${fasta} \ -// -I ${bam} \ -// ${intervalsOptions} \ -// ${dbsnpOptions} \ -// -O ${intervalBed.baseName}_${idSample}.g.vcf \ -// -ERC GVCF -// """ -// } -// gvcfHaplotypeCaller = gvcfHaplotypeCaller.groupTuple(by:[0, 1, 2]) -// if (params.no_gvcf) gvcfHaplotypeCaller.close() -// else gvcfHaplotypeCaller = gvcfHaplotypeCaller.dump(tag:'GVCF HaplotypeCaller') - -// // STEP GATK HAPLOTYPECALLER.2 - -// process GenotypeGVCFs { -// tag "${idSample}-${intervalBed.baseName}" - -// input: -// set idPatient, idSample, file(intervalBed), file(gvcf) from gvcfGenotypeGVCFs -// file(dbsnp) from dbsnp -// file(dbsnpIndex) from dbsnp_tbi -// file(dict) from dict -// file(fasta) from fasta -// file(fastaFai) from fai - -// output: -// set val("HaplotypeCaller"), idPatient, idSample, file("${intervalBed.baseName}_${idSample}.vcf") into vcfGenotypeGVCFs - -// when: 'haplotypecaller' in tools - -// script: -// // Using -L is important for speed and we have to index the interval files also -// intervalsOptions = params.no_intervals ? "" : "-L ${intervalBed}" -// dbsnpOptions = params.dbsnp ? "--D ${dbsnp}" : "" -// """ -// gatk --java-options -Xmx${task.memory.toGiga()}g \ -// IndexFeatureFile \ -// -I ${gvcf} - -// gatk --java-options -Xmx${task.memory.toGiga()}g \ -// GenotypeGVCFs \ -// -R ${fasta} \ -// ${intervalsOptions} \ -// ${dbsnpOptions} \ -// -V ${gvcf} \ -// -O ${intervalBed.baseName}_${idSample}.vcf -// """ -// } - -// vcfGenotypeGVCFs = vcfGenotypeGVCFs.groupTuple(by:[0, 1, 2]) - -// // STEP SENTIEON DNAseq - -// process Sentieon_DNAseq { -// label 'cpus_max' -// label 'memory_max' -// label 'sentieon' - -// tag "${idSample}" - -// input: -// set idPatient, idSample, file(bam), file(bai), file(recal) from bam_sentieon_DNAseq -// file(dbsnp) from dbsnp -// file(dbsnpIndex) from dbsnp_tbi -// file(fasta) from fasta -// file(fastaFai) from fai - -// output: -// set val("SentieonDNAseq"), idPatient, idSample, file("DNAseq_${idSample}.vcf") into vcf_sentieon_DNAseq - -// when: 'dnaseq' in tools && params.sentieon - -// script: -// """ -// sentieon driver \ -// -t ${task.cpus} \ -// -r ${fasta} \ -// -i ${bam} \ -// -q ${recal} \ -// --algo Haplotyper \ -// -d ${dbsnp} \ -// DNAseq_${idSample}.vcf -// """ -// } - -// vcf_sentieon_DNAseq = vcf_sentieon_DNAseq.dump(tag:'sentieon DNAseq') - -// // STEP SENTIEON DNAscope - -// process Sentieon_DNAscope { -// label 'cpus_max' -// label 'memory_max' -// label 'sentieon' - -// tag "${idSample}" - -// input: -// set idPatient, idSample, file(bam), file(bai), file(recal) from bam_sentieon_DNAscope -// file(dbsnp) from dbsnp -// file(dbsnpIndex) from dbsnp_tbi -// file(fasta) from fasta -// file(fastaFai) from fai - -// output: -// set val("SentieonDNAscope"), idPatient, idSample, file("DNAscope_${idSample}.vcf") into vcf_sentieon_DNAscope -// set val("SentieonDNAscope"), idPatient, idSample, file("DNAscope_SV_${idSample}.vcf") into vcf_sentieon_DNAscope_SV - -// when: 'dnascope' in tools && params.sentieon - -// script: -// """ -// sentieon driver \ -// -t ${task.cpus} \ -// -r ${fasta} \ -// -i ${bam} \ -// -q ${recal} \ -// --algo DNAscope \ -// -d ${dbsnp} \ -// DNAscope_${idSample}.vcf - -// sentieon driver \ -// -t ${task.cpus} \ -// -r ${fasta}\ -// -i ${bam} \ -// -q ${recal} \ -// --algo DNAscope \ -// --var_type bnd \ -// -d ${dbsnp} \ -// DNAscope_${idSample}.temp.vcf - -// sentieon driver \ -// -t ${task.cpus} \ -// -r ${fasta}\ -// -q ${recal} \ -// --algo SVSolver \ -// -v DNAscope_${idSample}.temp.vcf \ -// DNAscope_SV_${idSample}.vcf -// """ -// } -// vcf_sentieon_DNAscope = vcf_sentieon_DNAscope.dump(tag:'sentieon DNAscope') -// vcf_sentieon_DNAscope_SV = vcf_sentieon_DNAscope_SV.dump(tag:'sentieon DNAscope SV') // // STEP STRELKA.1 - SINGLE MODE diff --git a/modules/nf-core/software/functions.nf b/modules/nf-core/software/functions.nf new file mode 100644 index 0000000000..c284945c4b --- /dev/null +++ b/modules/nf-core/software/functions.nf @@ -0,0 +1,59 @@ +/* + * ----------------------------------------------------- + * Utility functions used in nf-core DSL2 module files + * ----------------------------------------------------- + */ + +/* + * Extract name of software tool from process name using $task.process + */ +def getSoftwareName(task_process) { + return task_process.tokenize(':')[-1].tokenize('_')[0].toLowerCase() +} + +/* + * Function to initialise default values and to generate a Groovy Map of available options for nf-core modules + */ +def initOptions(Map args) { + def Map options = [:] + options.args = args.args ?: '' + options.args2 = args.args2 ?: '' + options.publish_by_id = args.publish_by_id ?: false + options.publish_dir = args.publish_dir ?: '' + options.publish_files = args.publish_files ?: null + options.suffix = args.suffix ?: '' + return options +} + +/* + * Tidy up and join elements of a list to return a path string + */ +def getPathFromList(path_list) { + def paths = path_list.findAll { item -> !item?.trim().isEmpty() } // Remove empty entries + paths = paths.collect { it.trim().replaceAll("^[/]+|[/]+\$", "") } // Trim whitespace and trailing slashes + return paths.join('/') +} + +/* + * Function to save/publish module results + */ +def saveFiles(Map args) { + if (!args.filename.endsWith('.version.txt')) { + def ioptions = initOptions(args.options) + def path_list = [ ioptions.publish_dir ?: args.publish_dir ] + if (ioptions.publish_by_id) { + path_list.add(args.publish_id) + } + if (ioptions.publish_files instanceof Map) { + for (ext in ioptions.publish_files) { + if (args.filename.endsWith(ext.key)) { + def ext_list = path_list.collect() + ext_list.add(ext.value) + return "${getPathFromList(ext_list)}/$args.filename" + } + } + } else { + return "${getPathFromList(path_list)}/$args.filename" + } + } +} \ No newline at end of file diff --git a/modules/nf-core/software/gatk_genotypegvcf.nf b/modules/nf-core/software/gatk_genotypegvcf.nf new file mode 100644 index 0000000000..38a139966c --- /dev/null +++ b/modules/nf-core/software/gatk_genotypegvcf.nf @@ -0,0 +1,32 @@ +process GATK_GENOTYPEVCF { + tag "${meta.id}-${interval.baseName}" + + input: + tuple val(meta), path(interval), path(gvcf) + path dbsnp + path dbsnpIndex + path dict + path fasta + path fai + + output: + tuple val("HaplotypeCaller"), val(meta), path("${interval.baseName}_${meta.id}.vcf") + + script: + // Using -L is important for speed and we have to index the interval files also + intervalsOptions = params.no_intervals ? "" : "-L ${interval}" + dbsnpOptions = params.dbsnp ? "--D ${dbsnp}" : "" + """ + gatk --java-options -Xmx${task.memory.toGiga()}g \ + IndexFeatureFile \ + -I ${gvcf} + + gatk --java-options -Xmx${task.memory.toGiga()}g \ + GenotypeGVCFs \ + -R ${fasta} \ + ${intervalsOptions} \ + ${dbsnpOptions} \ + -V ${gvcf} \ + -O ${interval.baseName}_${meta.id}.vcf + """ +} \ No newline at end of file diff --git a/modules/nf-core/software/gatk_haplotypecaller.nf b/modules/nf-core/software/gatk_haplotypecaller.nf new file mode 100644 index 0000000000..6971224abb --- /dev/null +++ b/modules/nf-core/software/gatk_haplotypecaller.nf @@ -0,0 +1,33 @@ +process GATK_HAPLOTYPECALLER { + label 'MEMORY_SINGLECPU_TASK_SQ' + label 'CPUS_2' + + tag "${meta.id}-${interval.baseName}" + + input: + tuple val(meta), path(bam), path(bai), file(interval) + path dbsnp + path dbsnpIndex + path dict + path fasta + path fai + + output: + tuple val("HaplotypeCallerGVCF"), val(meta), path("${interval.baseName}_${meta.id}.g.vcf"), emit: gvcfHaplotypeCaller + tuple val(meta), path(interval), path("${interval.baseName}_${meta.id}.g.vcf"), emit: gvcfGenotypeGVCFs + + + script: + intervalsOptions = params.no_intervals ? "" : "-L ${interval}" + dbsnpOptions = params.dbsnp ? "--D ${dbsnp}" : "" + """ + gatk --java-options "-Xmx${task.memory.toGiga()}g -Xms6000m -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10" \ + HaplotypeCaller \ + -R ${fasta} \ + -I ${bam} \ + ${intervalsOptions} \ + ${dbsnpOptions} \ + -O ${interval.baseName}_${meta.id}.g.vcf \ + -ERC GVCF + """ +} diff --git a/modules/nf-core/software/strelka.nf b/modules/nf-core/software/strelka.nf new file mode 100644 index 0000000000..e8db199108 --- /dev/null +++ b/modules/nf-core/software/strelka.nf @@ -0,0 +1,60 @@ +// Import generic module functions +include { initOptions; saveFiles; getSoftwareName } from './functions' + +process STRELKA { + tag "$meta.id" + + label 'cpus_max' + label 'memory_max' + + publishDir "${params.outdir}", + mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:options, publish_dir:getSoftwareName(task.process), publish_id:meta.id) } + + + + container "quay.io/biocontainers/strelka:2.9.10--0" + + conda (params.conda ? "bioconda::strelka=2.9.10" : null) + + input: + tuple val(meta), path(bam), path (bai) + path(fasta) + path(fai) + path(target_bed) + val options + + output: + tuple val("Strelka"), val(meta), path("*.vcf.gz"), path("*.vcf.gz.tbi"), emit: vcfStrelkaSingle + path "*.version.txt", emit: version + + script: + def software = getSoftwareName(task.process) + def ioptions = initOptions(options) + def prefix = ioptions.suffix ? "Strelka_${meta.id}" : "Strelka_${meta.id}" + // TODO nf-core: It MUST be possible to pass additional parameters to the tool as a command-line string via the "$ioptions.args" variable + // TODO nf-core: If the tool supports multi-threading then you MUST provide the appropriate parameter + // using the Nextflow "task" variable e.g. "--threads $task.cpus" + beforeScript = params.target_bed ? "bgzip --threads ${task.cpus} -c ${target_bed} > call_targets.bed.gz ; tabix call_targets.bed.gz" : "" + options_strelka = params.target_bed ? ioptions.args : "" + """ + ${beforeScript} + configureStrelkaGermlineWorkflow.py \ + --bam ${bam} \ + --referenceFasta ${fasta} \ + ${options_strelka} \ + --runDir Strelka + + python Strelka/runWorkflow.py -m local -j ${task.cpus} + + mv Strelka/results/variants/genome.*.vcf.gz ${prefix}_genome.vcf.gz + + mv Strelka/results/variants/genome.*.vcf.gz.tbi ${prefix}_genome.vcf.gz.tbi + + mv Strelka/results/variants/variants.vcf.gz ${prefix}_variants.vcf.gz + + mv Strelka/results/variants/variants.vcf.gz.tbi ${prefix}_variants.vcf.gz.tbi + + echo configureStrelkaGermlineWorkflow.py --version &> ${software}.version.txt #2>&1 + """ +} \ No newline at end of file diff --git a/nextflow.config b/nextflow.config index 6193471aa4..ce60594cf7 100644 --- a/nextflow.config +++ b/nextflow.config @@ -165,6 +165,8 @@ profiles { test_targeted { includeConfig 'conf/test_targeted.config' } test_tool { includeConfig 'conf/test_tool.config' } test_trimming { includeConfig 'conf/test_trimming.config' } + test_haplotypecaller { includeConfig 'conf/test_germline_variantcalling.config' } + } // Load genomes.config or igenomes.config