Skip to content

Commit

Permalink
Merge pull request #266 from FriederikeHanssen/dsl2_variantcalling
Browse files Browse the repository at this point in the history
Variantcalling
  • Loading branch information
FriederikeHanssen authored Sep 21, 2020
2 parents a7679b9 + 3c28aa6 commit 355ca6f
Show file tree
Hide file tree
Showing 7 changed files with 246 additions and 177 deletions.
15 changes: 15 additions & 0 deletions conf/test_germline_variantcalling.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
/*
* -------------------------------------------------
* Nextflow config file for running tests
* -------------------------------------------------
* Defines bundled input files and everything required
* to run a fast and simple test. Use as follows:
* nextflow run nf-core/sarek -profile test_tool
*/

includeConfig 'test.config'

params {
// Input data
tools = 'haplotypecaller,strelka'
}
222 changes: 45 additions & 177 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -288,6 +288,9 @@ include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_MAPPED } from './modules/nf-c
include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_RECAL } from './modules/nf-core/software/samtools_index'
include { SAMTOOLS_STATS as SAMTOOLS_STATS } from './modules/nf-core/software/samtools_stats'
include { QUALIMAP_BAMQC as BAMQC } from './modules/nf-core/software/qualimap_bamqc'
include { GATK_HAPLOTYPECALLER as HAPLOTYPECALLER } from './modules/nf-core/software/gatk_haplotypecaller'
include { GATK_GENOTYPEVCF as GENOTYPEVCF } from './modules/nf-core/software/gatk_genotypegvcf'
include { STRELKA as STRELKA } from './modules/nf-core/software/strelka'
include { MULTIQC } from './modules/nf-core/software/multiqc'

/*
Expand Down Expand Up @@ -589,7 +592,15 @@ workflow {
bam_recalibrated = MERGE_BAM_RECAL.out.bam
tsv_recalibrated = MERGE_BAM_RECAL.out.tsv
}
//TODO: set bam_recalibrated with all these steps
// // When using sentieon for mapping, Channel bam_recalibrated is bam_sentieon_recal
// if (params.sentieon && step == 'mapping') bam_recalibrated = bam_sentieon_recal

// // When no knownIndels for mapping, Channel bam_recalibrated is bam_duplicates_marked
// if (!params.known_indels && step == 'mapping') bam_recalibrated = bam_duplicates_marked

// // When starting with variant calling, Channel bam_recalibrated is input_sample
// if (step == 'variantcalling') bam_recalibrated = input_sample
// Creating TSV files to restart from this step
tsv_recalibrated.collectFile(storeDir: "${params.outdir}/Preprocessing/TSV") { meta ->
patient = meta.patient
Expand Down Expand Up @@ -620,7 +631,41 @@ workflow {
GERMLINE VARIANT CALLING
================================================================================
*/
//TODO double check whether the indexing has to be repeated here. there is a bai file somewhere up at ApplyBQSR
bam_recalibrated_indexed_variant_calling = SAMTOOLS_INDEX_RECAL(bam_recalibrated, params.modules['samtools_index_mapped'],)
if ('haplotypecaller' in tools){
bam_haplotypecaller = bam_recalibrated_indexed_variant_calling.combine(intervals)

// STEP GATK HAPLOTYPECALLER.1

HAPLOTYPECALLER(bam_haplotypecaller, dbsnp,
dbsnp_tbi,
dict,
fasta,
fai)


// STEP GATK HAPLOTYPECALLER.2
GENOTYPEVCF(HAPLOTYPECALLER.out.gvcfGenotypeGVCFs, dbsnp,
dbsnp_tbi,
dict,
fasta,
fai)

GENOTYPEVCF.out.map{name, meta, vcf ->
patient = meta.patient
sample = meta.sample
gender = meta.gender
status = meta.status
[name, patient, sample, gender, status, vcf]
}.groupTuple(by: [0,1,2,])
.set{ vcfGenotypeGVCFs }
}

if ('strelka' in tools) {
STRELKA(bam_recalibrated_indexed_variant_calling, fasta, fai, target_bed, params.modules['strelka'])
}

/*
================================================================================
SOMATIC VARIANT CALLING
Expand Down Expand Up @@ -696,16 +741,8 @@ workflow.onComplete {
// ================================================================================
// */

// // When using sentieon for mapping, Channel bam_recalibrated is bam_sentieon_recal
// if (params.sentieon && step == 'mapping') bam_recalibrated = bam_sentieon_recal

// // When no knownIndels for mapping, Channel bam_recalibrated is bam_duplicates_marked
// if (!params.known_indels && step == 'mapping') bam_recalibrated = bam_duplicates_marked

// // When starting with variant calling, Channel bam_recalibrated is input_sample
// if (step == 'variantcalling') bam_recalibrated = input_sample

// bam_recalibrated = bam_recalibrated.dump(tag:'BAM for Variant Calling')

// // Here we have a recalibrated bam set
// // The TSV file is formatted like: "idPatient status idSample bamFile baiFile"
Expand All @@ -714,186 +751,17 @@ workflow.onComplete {

// (bamMantaSingle, bamStrelkaSingle, bamTIDDIT, bamFreebayesSingleNoIntervals, bamHaplotypeCallerNoIntervals, bamRecalAll) = bam_recalibrated.into(6)

// (bam_sentieon_DNAseq, bam_sentieon_DNAscope, bam_sentieon_all) = bam_sentieon_deduped_table.into(3)

// // To speed Variant Callers up we are chopping the reference into smaller pieces
// // Do variant calling by this intervals, and re-merge the VCFs

// bamHaplotypeCaller = bamHaplotypeCallerNoIntervals.spread(intHaplotypeCaller)
// bamFreebayesSingle = bamFreebayesSingleNoIntervals.spread(intFreebayesSingle)

// // STEP GATK HAPLOTYPECALLER.1

// process HaplotypeCaller {
// label 'memory_singleCPU_task_sq'
// label 'cpus_2'

// tag "${idSample}-${intervalBed.baseName}"

// input:
// set idPatient, idSample, file(bam), file(bai), file(intervalBed) from bamHaplotypeCaller
// file(dbsnp) from dbsnp
// file(dbsnpIndex) from dbsnp_tbi
// file(dict) from dict
// file(fasta) from fasta
// file(fastaFai) from fai

// output:
// set val("HaplotypeCallerGVCF"), idPatient, idSample, file("${intervalBed.baseName}_${idSample}.g.vcf") into gvcfHaplotypeCaller
// set idPatient, idSample, file(intervalBed), file("${intervalBed.baseName}_${idSample}.g.vcf") into gvcfGenotypeGVCFs

// when: 'haplotypecaller' in tools

// script:
// intervalsOptions = params.no_intervals ? "" : "-L ${intervalBed}"
// dbsnpOptions = params.dbsnp ? "--D ${dbsnp}" : ""
// """
// gatk --java-options "-Xmx${task.memory.toGiga()}g -Xms6000m -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10" \
// HaplotypeCaller \
// -R ${fasta} \
// -I ${bam} \
// ${intervalsOptions} \
// ${dbsnpOptions} \
// -O ${intervalBed.baseName}_${idSample}.g.vcf \
// -ERC GVCF
// """
// }

// gvcfHaplotypeCaller = gvcfHaplotypeCaller.groupTuple(by:[0, 1, 2])

// if (params.no_gvcf) gvcfHaplotypeCaller.close()
// else gvcfHaplotypeCaller = gvcfHaplotypeCaller.dump(tag:'GVCF HaplotypeCaller')

// // STEP GATK HAPLOTYPECALLER.2

// process GenotypeGVCFs {
// tag "${idSample}-${intervalBed.baseName}"

// input:
// set idPatient, idSample, file(intervalBed), file(gvcf) from gvcfGenotypeGVCFs
// file(dbsnp) from dbsnp
// file(dbsnpIndex) from dbsnp_tbi
// file(dict) from dict
// file(fasta) from fasta
// file(fastaFai) from fai

// output:
// set val("HaplotypeCaller"), idPatient, idSample, file("${intervalBed.baseName}_${idSample}.vcf") into vcfGenotypeGVCFs

// when: 'haplotypecaller' in tools

// script:
// // Using -L is important for speed and we have to index the interval files also
// intervalsOptions = params.no_intervals ? "" : "-L ${intervalBed}"
// dbsnpOptions = params.dbsnp ? "--D ${dbsnp}" : ""
// """
// gatk --java-options -Xmx${task.memory.toGiga()}g \
// IndexFeatureFile \
// -I ${gvcf}

// gatk --java-options -Xmx${task.memory.toGiga()}g \
// GenotypeGVCFs \
// -R ${fasta} \
// ${intervalsOptions} \
// ${dbsnpOptions} \
// -V ${gvcf} \
// -O ${intervalBed.baseName}_${idSample}.vcf
// """
// }

// vcfGenotypeGVCFs = vcfGenotypeGVCFs.groupTuple(by:[0, 1, 2])

// // STEP SENTIEON DNAseq

// process Sentieon_DNAseq {
// label 'cpus_max'
// label 'memory_max'
// label 'sentieon'

// tag "${idSample}"

// input:
// set idPatient, idSample, file(bam), file(bai), file(recal) from bam_sentieon_DNAseq
// file(dbsnp) from dbsnp
// file(dbsnpIndex) from dbsnp_tbi
// file(fasta) from fasta
// file(fastaFai) from fai

// output:
// set val("SentieonDNAseq"), idPatient, idSample, file("DNAseq_${idSample}.vcf") into vcf_sentieon_DNAseq

// when: 'dnaseq' in tools && params.sentieon

// script:
// """
// sentieon driver \
// -t ${task.cpus} \
// -r ${fasta} \
// -i ${bam} \
// -q ${recal} \
// --algo Haplotyper \
// -d ${dbsnp} \
// DNAseq_${idSample}.vcf
// """
// }

// vcf_sentieon_DNAseq = vcf_sentieon_DNAseq.dump(tag:'sentieon DNAseq')

// // STEP SENTIEON DNAscope

// process Sentieon_DNAscope {
// label 'cpus_max'
// label 'memory_max'
// label 'sentieon'

// tag "${idSample}"

// input:
// set idPatient, idSample, file(bam), file(bai), file(recal) from bam_sentieon_DNAscope
// file(dbsnp) from dbsnp
// file(dbsnpIndex) from dbsnp_tbi
// file(fasta) from fasta
// file(fastaFai) from fai

// output:
// set val("SentieonDNAscope"), idPatient, idSample, file("DNAscope_${idSample}.vcf") into vcf_sentieon_DNAscope
// set val("SentieonDNAscope"), idPatient, idSample, file("DNAscope_SV_${idSample}.vcf") into vcf_sentieon_DNAscope_SV

// when: 'dnascope' in tools && params.sentieon

// script:
// """
// sentieon driver \
// -t ${task.cpus} \
// -r ${fasta} \
// -i ${bam} \
// -q ${recal} \
// --algo DNAscope \
// -d ${dbsnp} \
// DNAscope_${idSample}.vcf

// sentieon driver \
// -t ${task.cpus} \
// -r ${fasta}\
// -i ${bam} \
// -q ${recal} \
// --algo DNAscope \
// --var_type bnd \
// -d ${dbsnp} \
// DNAscope_${idSample}.temp.vcf

// sentieon driver \
// -t ${task.cpus} \
// -r ${fasta}\
// -q ${recal} \
// --algo SVSolver \
// -v DNAscope_${idSample}.temp.vcf \
// DNAscope_SV_${idSample}.vcf
// """
// }

// vcf_sentieon_DNAscope = vcf_sentieon_DNAscope.dump(tag:'sentieon DNAscope')
// vcf_sentieon_DNAscope_SV = vcf_sentieon_DNAscope_SV.dump(tag:'sentieon DNAscope SV')

// // STEP STRELKA.1 - SINGLE MODE

Expand Down
59 changes: 59 additions & 0 deletions modules/nf-core/software/functions.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
/*
* -----------------------------------------------------
* Utility functions used in nf-core DSL2 module files
* -----------------------------------------------------
*/

/*
* Extract name of software tool from process name using $task.process
*/
def getSoftwareName(task_process) {
return task_process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()
}

/*
* Function to initialise default values and to generate a Groovy Map of available options for nf-core modules
*/
def initOptions(Map args) {
def Map options = [:]
options.args = args.args ?: ''
options.args2 = args.args2 ?: ''
options.publish_by_id = args.publish_by_id ?: false
options.publish_dir = args.publish_dir ?: ''
options.publish_files = args.publish_files ?: null
options.suffix = args.suffix ?: ''
return options
}

/*
* Tidy up and join elements of a list to return a path string
*/
def getPathFromList(path_list) {
def paths = path_list.findAll { item -> !item?.trim().isEmpty() } // Remove empty entries
paths = paths.collect { it.trim().replaceAll("^[/]+|[/]+\$", "") } // Trim whitespace and trailing slashes
return paths.join('/')
}

/*
* Function to save/publish module results
*/
def saveFiles(Map args) {
if (!args.filename.endsWith('.version.txt')) {
def ioptions = initOptions(args.options)
def path_list = [ ioptions.publish_dir ?: args.publish_dir ]
if (ioptions.publish_by_id) {
path_list.add(args.publish_id)
}
if (ioptions.publish_files instanceof Map) {
for (ext in ioptions.publish_files) {
if (args.filename.endsWith(ext.key)) {
def ext_list = path_list.collect()
ext_list.add(ext.value)
return "${getPathFromList(ext_list)}/$args.filename"
}
}
} else {
return "${getPathFromList(path_list)}/$args.filename"
}
}
}
32 changes: 32 additions & 0 deletions modules/nf-core/software/gatk_genotypegvcf.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
process GATK_GENOTYPEVCF {
tag "${meta.id}-${interval.baseName}"

input:
tuple val(meta), path(interval), path(gvcf)
path dbsnp
path dbsnpIndex
path dict
path fasta
path fai

output:
tuple val("HaplotypeCaller"), val(meta), path("${interval.baseName}_${meta.id}.vcf")

script:
// Using -L is important for speed and we have to index the interval files also
intervalsOptions = params.no_intervals ? "" : "-L ${interval}"
dbsnpOptions = params.dbsnp ? "--D ${dbsnp}" : ""
"""
gatk --java-options -Xmx${task.memory.toGiga()}g \
IndexFeatureFile \
-I ${gvcf}
gatk --java-options -Xmx${task.memory.toGiga()}g \
GenotypeGVCFs \
-R ${fasta} \
${intervalsOptions} \
${dbsnpOptions} \
-V ${gvcf} \
-O ${interval.baseName}_${meta.id}.vcf
"""
}
Loading

0 comments on commit 355ca6f

Please sign in to comment.