Skip to content

Commit

Permalink
Merge pull request #67 from d4straub/compress-assembly
Browse files Browse the repository at this point in the history
Compress assembly files
  • Loading branch information
d4straub authored Jul 14, 2020
2 parents dfbf582 + 7101914 commit e597127
Show file tree
Hide file tree
Showing 3 changed files with 31 additions and 11 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
- Add host read removal with Bowtie 2 and a corresponding custom section to MultiQC
- Add separate MultiQC section for FastQC after preprocessing
- Add social preview image
- Compress assembly files
- Add MetaBAT2 RNG seed parameter `--metabat_rng_seed` and set the default to 1, which ensures reproducible binning results
- Add parameters `--megahit_fix_cpu_1`, `--spades_fix_cpus` and `--spadeshybrid_fix_cpus` to ensure reproducible results from assembly tools

Expand Down
10 changes: 7 additions & 3 deletions docs/output.md
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@ Trimmed (short) reads are assembled with both megahit and SPAdes. Hybrid assembl

**Output directory: `results/MEGAHIT`**

- `${sample}.contigs.fasta`: metagenome assembly in fasta format
- `${sample}.contigs.fa.gz`: compressed metagenome assembly in fasta format
- `${sample}_QC/`: directory containing QUAST files

### SPAdes
Expand All @@ -132,7 +132,9 @@ Trimmed (short) reads are assembled with both megahit and SPAdes. Hybrid assembl

**Output directory: `results/SPAdes`**

- `${sample}_contigs.fasta`: metagenome assembly in fasta format
- `${sample}_scaffolds.fasta.gz`: compressed assembled scaffolds in fasta format
- `${sample}_graph.gfa.gz`: compressed assembly graph in gfa format
- `${sample}_contigs.fasta.gz`: compressed assembled contigs in fasta format
- `${sample}_QC/`: directory containing QUAST files

### SPAdesHybrid
Expand All @@ -141,7 +143,9 @@ SPAdesHybrid is a part of the SPAdes software and is used when the user provides

**Output directory: `results/SPAdesHybrid`**

- `${sample}_contigs.fasta`: metagenome assembly in fasta format
- `${sample}_scaffolds.fasta.gz`: compressed assembled scaffolds in fasta format
- `${sample}_graph.gfa.gz`: compressed assembly graph in gfa format
- `${sample}_contigs.fasta.gz`: compressed assembled contigs in fasta format
- `${sample}_QC/`: directory containing QUAST files

### Quast
Expand Down
31 changes: 23 additions & 8 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -888,14 +888,17 @@ process krona {
process megahit {
tag "$name"
publishDir "${params.outdir}/", mode: 'copy',
saveAs: {filename -> filename.indexOf(".fastq.gz") == -1 ? "Assembly/$filename" : null}
saveAs: {filename ->
if (filename.indexOf(".log") > 0 || filename.indexOf(".contigs.fa.gz") > 0 ) "Assembly/$filename"
else null}

input:
set val(name), file(reads) from trimmed_reads_megahit

output:
set val("MEGAHIT"), val("$name"), file("MEGAHIT/${name}.contigs.fa") into (assembly_megahit_to_quast, assembly_megahit_to_metabat)
file("MEGAHIT/*.log")
file("MEGAHIT/${name}.contigs.fa.gz")

when:
!params.skip_megahit
Expand All @@ -905,6 +908,7 @@ process megahit {
if ( !params.megahit_fix_cpu_1 || task.cpus == 1 )
"""
megahit -t "${task.cpus}" $input -o MEGAHIT --out-prefix "${name}"
gzip -c "MEGAHIT/${name}.contigs.fa" > "MEGAHIT/${name}.contigs.fa.gz"
"""
else
error "ERROR: '--megahit_fix_cpu_1' was specified, but not succesfully applied. Likely this is caused by changed process properties in a custom config file."
Expand All @@ -922,16 +926,19 @@ process megahit {
process spadeshybrid {
tag "$id"
publishDir "${params.outdir}/", mode: 'copy', pattern: "${id}*",
saveAs: {filename -> filename.indexOf(".fastq.gz") == -1 ? "Assembly/SPAdesHybrid/$filename" : null}
saveAs: {filename ->
if (filename.indexOf(".log") > 0 || filename.indexOf("_scaffolds.fasta.gz") > 0 || filename.indexOf("_graph.gfa.gz") > 0 || filename.indexOf("_contigs.fasta.gz") > 0 ) "Assembly/SPAdesHybrid/$filename"
else null}

input:
set id, file(lr), file(sr) from files_pre_spadeshybrid

output:
set id, val("SPAdesHybrid"), file("${id}_graph.gfa") into assembly_graph_spadeshybrid
set val("SPAdesHybrid"), val("$id"), file("${id}_scaffolds.fasta") into (assembly_spadeshybrid_to_quast, assembly_spadeshybrid_to_metabat)
file("${id}_contigs.fasta")
file("${id}_log.txt")
file("${id}_contigs.fasta.gz")
file("${id}_scaffolds.fasta.gz")
file("${id}_graph.gfa.gz")

when:
params.manifest && !params.single_end && !params.skip_spadeshybrid
Expand All @@ -951,6 +958,9 @@ process spadeshybrid {
mv spades/scaffolds.fasta ${id}_scaffolds.fasta
mv spades/contigs.fasta ${id}_contigs.fasta
mv spades/spades.log ${id}_log.txt
gzip "${id}_contigs.fasta"
gzip "${id}_graph.gfa"
gzip -c "${id}_scaffolds.fasta" > "${id}_scaffolds.fasta.gz"
"""
else
error "ERROR: '--spadeshyrid_fix_cpus' was specified, but not succesfully applied. Likely this is caused by changed process properties in a custom config file."
Expand All @@ -960,16 +970,18 @@ process spadeshybrid {
process spades {
tag "$id"
publishDir "${params.outdir}/", mode: 'copy', pattern: "${id}*",
saveAs: {filename -> filename.indexOf(".fastq.gz") == -1 ? "Assembly/SPAdes/$filename" : null}

saveAs: {filename ->
if (filename.indexOf(".log") > 0 || filename.indexOf("_scaffolds.fasta.gz") > 0 || filename.indexOf("_graph.gfa.gz") > 0 || filename.indexOf("_contigs.fasta.gz") > 0 ) "Assembly/SPAdes/$filename"
else null}
input:
set id, file(sr) from trimmed_reads_spades

output:
set id, val("SPAdes"), file("${id}_graph.gfa") into assembly_graph_spades
set val("SPAdes"), val("$id"), file("${id}_scaffolds.fasta") into (assembly_spades_to_quast, assembly_spades_to_metabat)
file("${id}_contigs.fasta")
file("${id}_log.txt")
file("${id}_contigs.fasta.gz")
file("${id}_scaffolds.fasta.gz")
file("${id}_graph.gfa.gz")

when:
!params.single_end && !params.skip_spades
Expand All @@ -988,6 +1000,9 @@ process spades {
mv spades/scaffolds.fasta ${id}_scaffolds.fasta
mv spades/contigs.fasta ${id}_contigs.fasta
mv spades/spades.log ${id}_log.txt
gzip "${id}_contigs.fasta"
gzip "${id}_graph.gfa"
gzip -c "${id}_scaffolds.fasta" > "${id}_scaffolds.fasta.gz"
"""
else
error "ERROR: '--spades_fix_cpus' was specified, but not succesfully applied. Likely this is caused by changed process properties in a custom config file."
Expand Down

0 comments on commit e597127

Please sign in to comment.