Merge pull request #256 from MaxUlysse/dsl2_meta

use a meta map
nf-core · Jul 24, 2020 · 538a296 · 538a296
2 parents 5bceacf + 9ea67af
commit 538a296
Show file tree

Hide file tree

Showing 9 changed files with 194 additions and 155 deletions.
diff --git a/main.nf b/main.nf
@@ -44,7 +44,6 @@ include {
     extract_bam;
     extract_fastq;
     extract_fastq_from_dir;
-    extract_infos;
     has_extension
 } from './modules/local/functions'
 
@@ -66,12 +65,12 @@ multiqc_custom_config = params.multiqc_config ? Channel.fromPath(params.multiqc_
 output_docs = file("$baseDir/docs/output.md", checkIfExists: true)
 output_docs_images = file("$baseDir/docs/images/", checkIfExists: true)
 
-// // Check if genome exists in the config file
-// if (params.genomes && !params.genomes.containsKey(params.genome) && !params.igenomes_ignore) {
-//     exit 1, "The provided genome '${params.genome}' is not available in the iGenomes file. Currently the available genomes are ${params.genomes.keySet().join(", ")}"
-// } else if (params.genomes && !params.genomes.containsKey(params.genome) && params.igenomes_ignore) {
-//     exit 1, "The provided genome '${params.genome}' is not available in the genomes file. Currently the available genomes are ${params.genomes.keySet().join(", ")}"
-// }
+// Check if genome exists in the config file
+if (params.genomes && !params.genomes.containsKey(params.genome) && !params.igenomes_ignore) {
+    exit 1, "The provided genome '${params.genome}' is not available in the iGenomes file. Currently the available genomes are ${params.genomes.keySet().join(", ")}"
+} else if (params.genomes && !params.genomes.containsKey(params.genome) && params.igenomes_ignore) {
+    exit 1, "The provided genome '${params.genome}' is not available in the genomes file. Currently the available genomes are ${params.genomes.keySet().join(", ")}"
+}
 
 step_list = define_step_list()
 step = params.step ? params.step.toLowerCase().replaceAll('-', '').replaceAll('_', '') : ''
@@ -167,8 +166,6 @@ if (tsv_path) {
     log.info "Trying automatic annotation on files in the VariantCalling/ directory"
 } else exit 1, 'No sample were defined, see --help'
 
-(gender_map, status_map, input_sample) = extract_infos(input_sample)
-
 // input_sample.dump(tag: 'input sample')
 
 /*
@@ -199,7 +196,7 @@ params.snpeff_db               = params.genome ? params.genomes[params.genome].s
 params.species                 = params.genome ? params.genomes[params.genome].species                 ?: false : false
 params.vep_cache_version       = params.genome ? params.genomes[params.genome].vep_cache_version       ?: false : false
 
-// Initialize channels based on params
+// Initialize file channels based on params
 chr_dir           = params.chr_dir           ?: Channel.empty()
 chr_length        = params.chr_length        ?: Channel.empty()
 dbsnp             = params.dbsnp             ?: Channel.empty()
@@ -210,6 +207,8 @@ loci              = params.ac_loci           ?: Channel.empty()
 loci_gc           = params.ac_loci_gc        ?: Channel.empty()
 mappability       = params.mappability       ?: Channel.empty()
 pon               = params.pon               ?: Channel.empty()
+
+// Initialize value channels based on params
 snpeff_cache      = params.snpeff_cache      ?: Channel.empty()
 snpeff_db         = params.snpeff_db         ?: Channel.empty()
 snpeff_species    = params.species           ?: Channel.empty()
@@ -253,7 +252,6 @@ if (params.sentieon) log.warn "[nf-core/sarek] Sentieon will be used, only works
 ================================================================================
 */
 
-
 include { BWAMEM2_MEM }           from './modules/local/bwamem2_mem.nf'
 include { GET_SOFTWARE_VERSIONS } from './modules/local/get_software_versions'
 include { OUTPUT_DOCUMENTATION }  from './modules/local/output_documentation'
@@ -347,63 +345,100 @@ workflow {
     pon_tbi = params.pon ? params.pon_index ?: BUILD_INDICES.out.pon_tbi : Channel.empty()
 
     // PREPROCESSING
-    if(!('fastqc' in skip_qc))
-        result_fastqc = FASTQC(input_sample)
-    else
-        result_fastqc = Channel.empty()
+
+    fastqc_html    = Channel.empty()
+    fastqc_version = Channel.empty()
+    fastqc_zip     = Channel.empty()
+
+    if (!('fastqc' in skip_qc)) {
+        FASTQC(input_sample)
+        fastqc_html    = FASTQC.out.html
+        fastqc_version = FASTQC.out.version
+        fastqc_zip     = FASTQC.out.zip
+    }
+
+    def bwamem2_mem_options = [:]
+
+    bwamem2_mem_options.args_bwamem2 = "-K 100000000 -M"
+    trim_galore_report = Channel.empty()
 
     if (params.trim_fastq) {
         TRIM_GALORE(input_sample)
-        result_trim_galore = TRIM_GALORE.out.report
-        BWAMEM2_MEM(TRIM_GALORE.out.trimmed_reads, bwa, fasta, fai)
-    }
-    else {
-        result_trim_galore = Channel.empty()
-        BWAMEM2_MEM(input_sample, bwa, fasta, fai)
+        BWAMEM2_MEM(TRIM_GALORE.out.trimmed_reads, bwa, fasta, fai, bwamem2_mem_options)
+        trim_galore_report = TRIM_GALORE.out.report
     }
-
-    BWAMEM2_MEM.out.groupTuple(by:[0, 1])
-        .branch {
-            single:   it[2].size() == 1
-            multiple: it[2].size() > 1
+    else BWAMEM2_MEM(input_sample, bwa, fasta, fai, bwamem2_mem_options)
+
+    results = BWAMEM2_MEM.out.map{ meta, bam, bai ->
+        patient = meta.patient
+        sample  = meta.sample
+        gender  = meta.gender
+        status  = meta.status
+        [patient, sample, gender, status, bam, bai]
+    }.groupTuple(by: [0,1])
+        .branch{
+            single:   it[4].size() == 1
+            multiple: it[4].size() > 1
         }.set { bam }
 
     bam_single = bam.single.map {
-        idPatient, idSample, idRun, bam, bai ->
-        [idPatient, idSample, bam[0], bai[0]]
+        patient, sample, gender, status, bam, bai ->
+
+        def meta = [:]
+        meta.patient = patient
+        meta.sample = sample
+        meta.gender = gender[0]
+        meta.status = status[0]
+        meta.id = sample
+
+        [meta, bam[0], bai[0]]
+    }
+
+    bam_multiple = bam.multiple.map {
+        patient, sample, gender, status, bam, bai ->
+
+        def meta = [:]
+        meta.patient = patient
+        meta.sample = sample
+        meta.gender = gender[0]
+        meta.status = status[0]
+        meta.id = sample
+
+        [meta, bam, bai]
     }
 
-    //multipleBam = multipleBam.mix(multipleBamSentieon)
+    // multipleBam = multipleBam.mix(multipleBamSentieon)
 
-    bam_mapped = bam_single.mix(MERGE_BAM_MAPPED(bam.multiple))
+    bam_mapped = bam_single.mix(MERGE_BAM_MAPPED(bam_multiple))
 
     bam_mapped.view()
 
-    if(!(params.skip_markduplicates)){
-        MARK_DUPLICATES(bam_mapped)
-        mark_duplicates_report = MARK_DUPLICATES.out.duplicates_marked_report
-        bam_duplicates_marked =  MARK_DUPLICATES.out.bam_duplicates_marked 
-    }
-    else {
-        mark_duplicates_report = Channel.empty()
-        bam_duplicates_marked = Channel.empty()
+    mark_duplicates_report = Channel.empty()
+    bam_duplicates_marked  = Channel.empty()
+
+    if (!(params.skip_markduplicates)) {
+        // MARK_DUPLICATES(bam_mapped)
+        // mark_duplicates_report = MARK_DUPLICATES.out.report
+        // bam_duplicates_marked  =  MARK_DUPLICATES.out.bam 
     }
 
-    bamBaseRecalibrator = bam_duplicates_marked.combine(BUILD_INDICES.out.intervals_bed)
+    // bamBaseRecalibrator = bam_duplicates_marked.combine(BUILD_INDICES.out.intervals_bed)
 
-    //BASE_RECALIBRATION(bamBaseRecalibrator,dbsnp, dbsnp_index,fasta,)
+    // //BASE_RECALIBRATION(bamBaseRecalibrator,dbsnp, dbsnp_index,fasta,)
 
     OUTPUT_DOCUMENTATION(
         output_docs,
         output_docs_images)
 
     GET_SOFTWARE_VERSIONS()
+
     MULTIQC(
-        result_fastqc.collect().ifEmpty([]),
+        fastqc_html.collect().ifEmpty([]),
+        fastqc_zip.collect().ifEmpty([]),
         multiqc_config,
         multiqc_custom_config.ifEmpty([]),
         GET_SOFTWARE_VERSIONS.out.yml,
-        result_trim_galore.collect().ifEmpty([]),
+        trim_galore_report.collect().ifEmpty([]),
         mark_duplicates_report.collect().ifEmpty([]),
         workflow_summary)
 }

diff --git a/modules/local/bwamem2_mem.nf b/modules/local/bwamem2_mem.nf
@@ -1,29 +1,38 @@
-params.bwa_options = "-K 100000000 -M"
-params.sequencer = "ILLUMINA"
-
 process BWAMEM2_MEM {
-    label 'CPUS_MAX'
-
-    tag "${sample}_${run}"
+    tag "${meta.id}"
+    label 'process_high'
 
-    publishDir "${params.outdir}/bwamem2_mem", mode: 'copy'
+    publishDir "${params.outdir}/bwamem2/${meta.sample}",
+        mode: params.publish_dir_mode,
+        saveAs: { filename ->
+                    if (filename.endsWith('.version.txt')) null
+                    else filename }
 
     input:
-        tuple val(patient), val(sample), val(run), path(read1), path(read2)
+        tuple val(meta), path(reads)
         path bwa
         path fasta
         path fai
+        val options
 
     output:
-        tuple val(patient), val(sample), val(run), path("*.bam"), path("*.bai")
+        tuple val(meta), path("*.bam"), path("*.bai")
 
     script:
     CN = params.sequencing_center ? "CN:${params.sequencing_center}\\t" : ""
-    readGroup = "@RG\\tID:${run}\\t${CN}PU:${run}\\tSM:${sample}\\tLB:${sample}\\tPL:${params.sequencer}"
+    readGroup = "@RG\\tID:${meta.run}\\t${CN}PU:${meta.run}\\tSM:${meta.sample}\\tLB:${meta.sample}\\tPL:ILLUMINA"
+    extra = meta.status == 1 ? "-B 3" : ""
     """
-    bwa-mem2 mem ${params.bwa_options} -R \"${readGroup}\" -t ${task.cpus} \
-    ${fasta} ${read1} ${read2} | \
-    samtools sort --threads ${task.cpus} -m 2G - > ${sample}_${run}.bam
-    samtools index ${sample}_${run}.bam
+    bwa-mem2 mem \
+        ${options.args_bwamem2} \
+        -R \"${readGroup}\" \
+        ${extra} \
+        -t ${task.cpus} \
+        ${fasta} ${reads} | \
+    samtools sort --threads ${task.cpus} -m 2G - > ${meta.id}.bam
+
+    samtools index ${meta.id}.bam
+
+    echo \$(bwa-mem2 version 2>&1) > bwa-mem2.version.txt
     """
 }
diff --git a/modules/local/functions.nf b/modules/local/functions.nf
@@ -137,48 +137,34 @@ def extract_fastq_from_dir(pattern) {
     fastq
 }
 
-// Extract gender and status from Channel
-def extract_infos(channel) {
-    def genderMap = [:]
-    def statusMap = [:]
-    channel = channel.map{ it ->
-        def idPatient = it[0]
-        def gender = it[1]
-        def status = it[2]
-        def idSample = it[3]
-        genderMap[idPatient] = gender
-        statusMap[idPatient, idSample] = status
-        [idPatient] + it[3..-1]
-    }
-    [genderMap, statusMap, channel]
-}
-
 // Channeling the TSV file containing FASTQ or BAM
 // Format is: "subject gender status sample lane fastq1 fastq2"
 // or: "subject gender status sample lane bam"
 def extract_fastq(tsvFile) {
     Channel.from(tsvFile)
         .splitCsv(sep: '\t')
         .map { row ->
-            def idPatient  = row[0]
-            def gender     = row[1]
-            def status     = return_status(row[2].toInteger())
-            def idSample   = row[3]
-            def idRun      = row[4]
-            def file1      = return_file(row[5])
-            def file2      = "null"
-            if (has_extension(file1, "fastq.gz") || has_extension(file1, "fq.gz") || has_extension(file1, "fastq") || has_extension(file1, "fq")) {
+            def meta = [:]
+            meta.patient = row[0]
+            meta.gender  = row[1]
+            meta.status  = return_status(row[2].toInteger())
+            meta.sample  = row[3]
+            meta.run     = row[4]
+            meta.id      = "${meta.sample}-${meta.run}"
+            read1   = return_file(row[5])
+            read2   = "null"
+            if (has_extension(read1, "fastq.gz") || has_extension(read1, "fq.gz") || has_extension(read1, "fastq") || has_extension(read1, "fq")) {
                 check_number_of_item(row, 7)
-                file2 = return_file(row[6])
-            if (!has_extension(file2, "fastq.gz") && !has_extension(file2, "fq.gz")  && !has_extension(file2, "fastq") && !has_extension(file2, "fq")) exit 1, "File: ${file2} has the wrong extension. See --help for more information"
-            if (has_extension(file1, "fastq") || has_extension(file1, "fq") || has_extension(file2, "fastq") || has_extension(file2, "fq")) {
+                read2 = return_file(row[6])
+            if (!has_extension(read2, "fastq.gz") && !has_extension(read2, "fq.gz")  && !has_extension(read2, "fastq") && !has_extension(read2, "fq")) exit 1, "File: ${file2} has the wrong extension. See --help for more information"
+            if (has_extension(read1, "fastq") || has_extension(read1, "fq") || has_extension(read2, "fastq") || has_extension(read2, "fq")) {
                 exit 1, "We do recommend to use gziped fastq file to help you reduce your data footprint."
             }
         }
-        else if (has_extension(file1, "bam")) check_number_of_item(row, 6)
-        else "No recognisable extention for input file: ${file1}"
+        else if (has_extension(read1, "bam")) check_number_of_item(row, 6)
+        else "No recognisable extention for input file: ${read1}"
 
-        [idPatient, gender, status, idSample, idRun, file1, file2]
+        return [meta, [read1, read2]]
     }
 }