Skip to content

Commit

Permalink
Merge pull request #256 from MaxUlysse/dsl2_meta
Browse files Browse the repository at this point in the history
use a meta map
  • Loading branch information
FriederikeHanssen authored Jul 24, 2020
2 parents 5bceacf + 9ea67af commit 538a296
Show file tree
Hide file tree
Showing 9 changed files with 194 additions and 155 deletions.
119 changes: 77 additions & 42 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,6 @@ include {
extract_bam;
extract_fastq;
extract_fastq_from_dir;
extract_infos;
has_extension
} from './modules/local/functions'

Expand All @@ -66,12 +65,12 @@ multiqc_custom_config = params.multiqc_config ? Channel.fromPath(params.multiqc_
output_docs = file("$baseDir/docs/output.md", checkIfExists: true)
output_docs_images = file("$baseDir/docs/images/", checkIfExists: true)

// // Check if genome exists in the config file
// if (params.genomes && !params.genomes.containsKey(params.genome) && !params.igenomes_ignore) {
// exit 1, "The provided genome '${params.genome}' is not available in the iGenomes file. Currently the available genomes are ${params.genomes.keySet().join(", ")}"
// } else if (params.genomes && !params.genomes.containsKey(params.genome) && params.igenomes_ignore) {
// exit 1, "The provided genome '${params.genome}' is not available in the genomes file. Currently the available genomes are ${params.genomes.keySet().join(", ")}"
// }
// Check if genome exists in the config file
if (params.genomes && !params.genomes.containsKey(params.genome) && !params.igenomes_ignore) {
exit 1, "The provided genome '${params.genome}' is not available in the iGenomes file. Currently the available genomes are ${params.genomes.keySet().join(", ")}"
} else if (params.genomes && !params.genomes.containsKey(params.genome) && params.igenomes_ignore) {
exit 1, "The provided genome '${params.genome}' is not available in the genomes file. Currently the available genomes are ${params.genomes.keySet().join(", ")}"
}

step_list = define_step_list()
step = params.step ? params.step.toLowerCase().replaceAll('-', '').replaceAll('_', '') : ''
Expand Down Expand Up @@ -167,8 +166,6 @@ if (tsv_path) {
log.info "Trying automatic annotation on files in the VariantCalling/ directory"
} else exit 1, 'No sample were defined, see --help'

(gender_map, status_map, input_sample) = extract_infos(input_sample)

// input_sample.dump(tag: 'input sample')

/*
Expand Down Expand Up @@ -199,7 +196,7 @@ params.snpeff_db = params.genome ? params.genomes[params.genome].s
params.species = params.genome ? params.genomes[params.genome].species ?: false : false
params.vep_cache_version = params.genome ? params.genomes[params.genome].vep_cache_version ?: false : false

// Initialize channels based on params
// Initialize file channels based on params
chr_dir = params.chr_dir ?: Channel.empty()
chr_length = params.chr_length ?: Channel.empty()
dbsnp = params.dbsnp ?: Channel.empty()
Expand All @@ -210,6 +207,8 @@ loci = params.ac_loci ?: Channel.empty()
loci_gc = params.ac_loci_gc ?: Channel.empty()
mappability = params.mappability ?: Channel.empty()
pon = params.pon ?: Channel.empty()

// Initialize value channels based on params
snpeff_cache = params.snpeff_cache ?: Channel.empty()
snpeff_db = params.snpeff_db ?: Channel.empty()
snpeff_species = params.species ?: Channel.empty()
Expand Down Expand Up @@ -253,7 +252,6 @@ if (params.sentieon) log.warn "[nf-core/sarek] Sentieon will be used, only works
================================================================================
*/


include { BWAMEM2_MEM } from './modules/local/bwamem2_mem.nf'
include { GET_SOFTWARE_VERSIONS } from './modules/local/get_software_versions'
include { OUTPUT_DOCUMENTATION } from './modules/local/output_documentation'
Expand Down Expand Up @@ -347,63 +345,100 @@ workflow {
pon_tbi = params.pon ? params.pon_index ?: BUILD_INDICES.out.pon_tbi : Channel.empty()

// PREPROCESSING
if(!('fastqc' in skip_qc))
result_fastqc = FASTQC(input_sample)
else
result_fastqc = Channel.empty()

fastqc_html = Channel.empty()
fastqc_version = Channel.empty()
fastqc_zip = Channel.empty()

if (!('fastqc' in skip_qc)) {
FASTQC(input_sample)
fastqc_html = FASTQC.out.html
fastqc_version = FASTQC.out.version
fastqc_zip = FASTQC.out.zip
}

def bwamem2_mem_options = [:]

bwamem2_mem_options.args_bwamem2 = "-K 100000000 -M"
trim_galore_report = Channel.empty()

if (params.trim_fastq) {
TRIM_GALORE(input_sample)
result_trim_galore = TRIM_GALORE.out.report
BWAMEM2_MEM(TRIM_GALORE.out.trimmed_reads, bwa, fasta, fai)
}
else {
result_trim_galore = Channel.empty()
BWAMEM2_MEM(input_sample, bwa, fasta, fai)
BWAMEM2_MEM(TRIM_GALORE.out.trimmed_reads, bwa, fasta, fai, bwamem2_mem_options)
trim_galore_report = TRIM_GALORE.out.report
}

BWAMEM2_MEM.out.groupTuple(by:[0, 1])
.branch {
single: it[2].size() == 1
multiple: it[2].size() > 1
else BWAMEM2_MEM(input_sample, bwa, fasta, fai, bwamem2_mem_options)

results = BWAMEM2_MEM.out.map{ meta, bam, bai ->
patient = meta.patient
sample = meta.sample
gender = meta.gender
status = meta.status
[patient, sample, gender, status, bam, bai]
}.groupTuple(by: [0,1])
.branch{
single: it[4].size() == 1
multiple: it[4].size() > 1
}.set { bam }

bam_single = bam.single.map {
idPatient, idSample, idRun, bam, bai ->
[idPatient, idSample, bam[0], bai[0]]
patient, sample, gender, status, bam, bai ->

def meta = [:]
meta.patient = patient
meta.sample = sample
meta.gender = gender[0]
meta.status = status[0]
meta.id = sample

[meta, bam[0], bai[0]]
}

bam_multiple = bam.multiple.map {
patient, sample, gender, status, bam, bai ->

def meta = [:]
meta.patient = patient
meta.sample = sample
meta.gender = gender[0]
meta.status = status[0]
meta.id = sample

[meta, bam, bai]
}

//multipleBam = multipleBam.mix(multipleBamSentieon)
// multipleBam = multipleBam.mix(multipleBamSentieon)

bam_mapped = bam_single.mix(MERGE_BAM_MAPPED(bam.multiple))
bam_mapped = bam_single.mix(MERGE_BAM_MAPPED(bam_multiple))

bam_mapped.view()

if(!(params.skip_markduplicates)){
MARK_DUPLICATES(bam_mapped)
mark_duplicates_report = MARK_DUPLICATES.out.duplicates_marked_report
bam_duplicates_marked = MARK_DUPLICATES.out.bam_duplicates_marked
}
else {
mark_duplicates_report = Channel.empty()
bam_duplicates_marked = Channel.empty()
mark_duplicates_report = Channel.empty()
bam_duplicates_marked = Channel.empty()

if (!(params.skip_markduplicates)) {
// MARK_DUPLICATES(bam_mapped)
// mark_duplicates_report = MARK_DUPLICATES.out.report
// bam_duplicates_marked = MARK_DUPLICATES.out.bam
}

bamBaseRecalibrator = bam_duplicates_marked.combine(BUILD_INDICES.out.intervals_bed)
// bamBaseRecalibrator = bam_duplicates_marked.combine(BUILD_INDICES.out.intervals_bed)

//BASE_RECALIBRATION(bamBaseRecalibrator,dbsnp, dbsnp_index,fasta,)
// //BASE_RECALIBRATION(bamBaseRecalibrator,dbsnp, dbsnp_index,fasta,)

OUTPUT_DOCUMENTATION(
output_docs,
output_docs_images)

GET_SOFTWARE_VERSIONS()

MULTIQC(
result_fastqc.collect().ifEmpty([]),
fastqc_html.collect().ifEmpty([]),
fastqc_zip.collect().ifEmpty([]),
multiqc_config,
multiqc_custom_config.ifEmpty([]),
GET_SOFTWARE_VERSIONS.out.yml,
result_trim_galore.collect().ifEmpty([]),
trim_galore_report.collect().ifEmpty([]),
mark_duplicates_report.collect().ifEmpty([]),
workflow_summary)
}
Expand Down
37 changes: 23 additions & 14 deletions modules/local/bwamem2_mem.nf
Original file line number Diff line number Diff line change
@@ -1,29 +1,38 @@
params.bwa_options = "-K 100000000 -M"
params.sequencer = "ILLUMINA"

process BWAMEM2_MEM {
label 'CPUS_MAX'

tag "${sample}_${run}"
tag "${meta.id}"
label 'process_high'

publishDir "${params.outdir}/bwamem2_mem", mode: 'copy'
publishDir "${params.outdir}/bwamem2/${meta.sample}",
mode: params.publish_dir_mode,
saveAs: { filename ->
if (filename.endsWith('.version.txt')) null
else filename }

input:
tuple val(patient), val(sample), val(run), path(read1), path(read2)
tuple val(meta), path(reads)
path bwa
path fasta
path fai
val options

output:
tuple val(patient), val(sample), val(run), path("*.bam"), path("*.bai")
tuple val(meta), path("*.bam"), path("*.bai")

script:
CN = params.sequencing_center ? "CN:${params.sequencing_center}\\t" : ""
readGroup = "@RG\\tID:${run}\\t${CN}PU:${run}\\tSM:${sample}\\tLB:${sample}\\tPL:${params.sequencer}"
readGroup = "@RG\\tID:${meta.run}\\t${CN}PU:${meta.run}\\tSM:${meta.sample}\\tLB:${meta.sample}\\tPL:ILLUMINA"
extra = meta.status == 1 ? "-B 3" : ""
"""
bwa-mem2 mem ${params.bwa_options} -R \"${readGroup}\" -t ${task.cpus} \
${fasta} ${read1} ${read2} | \
samtools sort --threads ${task.cpus} -m 2G - > ${sample}_${run}.bam
samtools index ${sample}_${run}.bam
bwa-mem2 mem \
${options.args_bwamem2} \
-R \"${readGroup}\" \
${extra} \
-t ${task.cpus} \
${fasta} ${reads} | \
samtools sort --threads ${task.cpus} -m 2G - > ${meta.id}.bam
samtools index ${meta.id}.bam
echo \$(bwa-mem2 version 2>&1) > bwa-mem2.version.txt
"""
}
46 changes: 16 additions & 30 deletions modules/local/functions.nf
Original file line number Diff line number Diff line change
Expand Up @@ -137,48 +137,34 @@ def extract_fastq_from_dir(pattern) {
fastq
}

// Extract gender and status from Channel
def extract_infos(channel) {
def genderMap = [:]
def statusMap = [:]
channel = channel.map{ it ->
def idPatient = it[0]
def gender = it[1]
def status = it[2]
def idSample = it[3]
genderMap[idPatient] = gender
statusMap[idPatient, idSample] = status
[idPatient] + it[3..-1]
}
[genderMap, statusMap, channel]
}

// Channeling the TSV file containing FASTQ or BAM
// Format is: "subject gender status sample lane fastq1 fastq2"
// or: "subject gender status sample lane bam"
def extract_fastq(tsvFile) {
Channel.from(tsvFile)
.splitCsv(sep: '\t')
.map { row ->
def idPatient = row[0]
def gender = row[1]
def status = return_status(row[2].toInteger())
def idSample = row[3]
def idRun = row[4]
def file1 = return_file(row[5])
def file2 = "null"
if (has_extension(file1, "fastq.gz") || has_extension(file1, "fq.gz") || has_extension(file1, "fastq") || has_extension(file1, "fq")) {
def meta = [:]
meta.patient = row[0]
meta.gender = row[1]
meta.status = return_status(row[2].toInteger())
meta.sample = row[3]
meta.run = row[4]
meta.id = "${meta.sample}-${meta.run}"
read1 = return_file(row[5])
read2 = "null"
if (has_extension(read1, "fastq.gz") || has_extension(read1, "fq.gz") || has_extension(read1, "fastq") || has_extension(read1, "fq")) {
check_number_of_item(row, 7)
file2 = return_file(row[6])
if (!has_extension(file2, "fastq.gz") && !has_extension(file2, "fq.gz") && !has_extension(file2, "fastq") && !has_extension(file2, "fq")) exit 1, "File: ${file2} has the wrong extension. See --help for more information"
if (has_extension(file1, "fastq") || has_extension(file1, "fq") || has_extension(file2, "fastq") || has_extension(file2, "fq")) {
read2 = return_file(row[6])
if (!has_extension(read2, "fastq.gz") && !has_extension(read2, "fq.gz") && !has_extension(read2, "fastq") && !has_extension(read2, "fq")) exit 1, "File: ${file2} has the wrong extension. See --help for more information"
if (has_extension(read1, "fastq") || has_extension(read1, "fq") || has_extension(read2, "fastq") || has_extension(read2, "fq")) {
exit 1, "We do recommend to use gziped fastq file to help you reduce your data footprint."
}
}
else if (has_extension(file1, "bam")) check_number_of_item(row, 6)
else "No recognisable extention for input file: ${file1}"
else if (has_extension(read1, "bam")) check_number_of_item(row, 6)
else "No recognisable extention for input file: ${read1}"

[idPatient, gender, status, idSample, idRun, file1, file2]
return [meta, [read1, read2]]
}
}

Expand Down
Loading

0 comments on commit 538a296

Please sign in to comment.