-
Notifications
You must be signed in to change notification settings - Fork 0
/
kraken2.nf
120 lines (97 loc) · 4.64 KB
/
kraken2.nf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
#!/usr/bin/env nextflow
/*
Run Kraken2 (installed in a Conda environment) and Bracken over gzip-compressed paired-end read sets for species identification.
[Dependencies]
- Conda
- Python and the pandas package
- Bracken v2.6+
- top3_bracken_taxa.py in github.com/wanyuac/Kraken_toolkit
[Use guide]
An example command line in a screen session:
nextflow -Djava.io.tmpdir=$PWD run kraken2.nf --db "$HOME/database/bacteria_20220910" --outdir "${PWD}/report" --queueSize 5 --fastq "./reads/*_{1,2}.fastq.gz" --conda_env "kraken2.1" --bracken_dir "$HOME/bin/Bracken" --read_len 150 --script_dir "$HOME/bin/Taxonomy_toolkit" -c $HOME/bin/Taxonomy_toolkit/kraken2.config
Note to use quote signs and '[1,2]' (not '{1,2}') for paths, particularly, the paths of input read files, in this command line. Otherwise, Nextflow only reads the first item in the file list provided
by --fastq ./reads/*_{1,2}.fastq.gz, causing an error to run the pipeline. Using simply './reads/*.fastq.gz' causes a failure in graping read files.
[Declarations]
Copyright (C) 2020-2023 Yu Wan <[email protected]>
Licensed under the GNU General Public License v3.0
Publication: 24 Mar 2020; last modification: 4 Jan 2023
*/
/*------------------------------------------------------------------------------
Output settings
------------------------------------------------------------------------------*/
nextflow.enable.dsl = 2
include { mkdir } from "./module"
def subdirs = ["kraken", "bracken"] // A list object of subdirectory names
out_dir = mkdir(params.outdir) // Create the parental output directory
subdirs.each { mkdir("${out_dir}/${it}") } // Iterate through the list
/*------------------------------------------------------------------------------
Processes
------------------------------------------------------------------------------*/
process kraken2 {
/*
The "publishDir" statement must be accompanied by the declaration of output in the process in order
to copy/move output files into the right output directory. Nothing happens if the output is not
defined even though this is a terminal process.
*/
publishDir path: "${out_dir}/kraken", mode: "copy", overwrite: true, pattern: "${genome}_kraken.txt"
label "PBS"
executor "pbs"
input:
tuple val(genome), file(fastqs)
output:
tuple val(genome), path("${genome}_kraken.txt")
script:
"""
module load anaconda3/personal
source activate ${params.conda_env}
kraken2 --threads ${params.cpus} --db ${params.db} --paired --gzip-compressed --output - --report ${genome}_kraken.txt ${fastqs}
"""
}
process bracken {
publishDir path: "${out_dir}/bracken", mode: "copy", overwrite: true, pattern: "${genome}_bracken.tsv"
publishDir path: "${out_dir}/bracken", mode: "copy", overwrite: true, pattern: "${genome}_bracken.txt"
label "PBS"
executor "pbs"
input:
tuple val(genome), path("${genome}_kraken.txt")
output:
val genome, emit: genome_name
path "${genome}_bracken.tsv" // Output paths must be declared or the corresponding files will not be copied to the published directory.
path "${genome}_bracken.txt"
script:
"""
module load anaconda3/personal
source activate ${params.conda_env}
${params.bracken_dir}/bracken -d ${params.db} -i ${genome}_kraken.txt -o ${genome}_bracken.tsv -w ${genome}_bracken.txt -l ${params.bracken_level} -r ${params.read_len} -t ${params.cpus}
"""
}
process compile_reports { // This process requires Python
publishDir path: "$out_dir", mode: "copy", overwrite: true, pattern: "top3_taxa.tsv"
label "PBS_light"
executor "pbs" //executor "local" # Could not get around the issue '/rds/general/user/ywan1/home/anaconda3/etc/profile.d/conda.sh: line 55: PS1: unbound variable' (github.com/conda/conda/issues/8186)
input:
val genomes // Triggers this process when all genome names are gathered (see 'workflow')
output:
path "top3_taxa.tsv"
script:
genome_list = file("genome_list.txt") // A file object
if (genome_list.exists()) {
genome_list.delete()
}
for (g : genomes) {
genome_list.append("${g}\n")
}
"""
module load anaconda3/personal
source activate ${params.conda_env}
python ${params.script_dir}/top3_bracken_taxa.py -l ${genome_list} -d ${out_dir}/bracken -s '_bracken.tsv' -o "top3_taxa.tsv"
"""
}
/*------------------------------------------------------------------------------
Workflow
------------------------------------------------------------------------------*/
workflow {
kraken2(Channel.fromFilePairs(params.fastq))
bracken(kraken2.out)
compile_reports(bracken.out.genome_name.collect())
}