From b2c3d5fccf93c0a9770ab302764cad8ada45ad50 Mon Sep 17 00:00:00 2001 From: Andre Watson Date: Thu, 8 Aug 2024 13:32:39 -0600 Subject: [PATCH] Attempts to rescue incomplete assemblies, similar to EDGE --- runAssembly/nextflow.config | 3 + runAssembly/runAssembly.nf | 131 ++++++++++++++++++++++-------------- 2 files changed, 84 insertions(+), 50 deletions(-) diff --git a/runAssembly/nextflow.config b/runAssembly/nextflow.config index be66020..04c17d9 100644 --- a/runAssembly/nextflow.config +++ b/runAssembly/nextflow.config @@ -42,4 +42,7 @@ params { } workflow.onComplete = { "rm -rf nf_assets".execute().text +} +workflow.onError = { + "rm -rf nf_assets".execute().text } \ No newline at end of file diff --git a/runAssembly/runAssembly.nf b/runAssembly/runAssembly.nf index 75d9fef..e9e005d 100644 --- a/runAssembly/runAssembly.nf +++ b/runAssembly/runAssembly.nf @@ -18,7 +18,7 @@ process idbaUD { "scaffold.fa" } else{ - null //do not publish contig-max.fa yet, but use it for downstream process + null //do not publish contig-max.fa or intermediate contigs yet, but use them for downstream processes } } ) @@ -32,7 +32,8 @@ process idbaUD { output: path "log" path "scaffold*.{fa,fasta}", optional:true - path "contig-max.fa", emit: contigs + path "contig-max.fa", emit: contigs, optional:true + path "{contig-*,*contigs.fa,K*/final_contigs.fasta}", emit: intContigs script: def avg_len = avg_len as Integer @@ -168,6 +169,7 @@ process idbaAvgLen { //assemble using spades process spades { + publishDir ( path: "$params.outDir/AssemblyBasedAnalysis", mode: 'copy', @@ -186,7 +188,10 @@ process spades { "assembly_graph_with_scaffolds.gfa" } else if(filename ==~ /(contigs|transcripts)\.fasta/) { - null //do not publish, but use downstream + null //do not publish contigs, but use downstream + } + else if(filename ==~ /((contig-.*)|(.*contigs\.fa)|(K.*\/final_contigs\.fasta))/) { + null //do not publish intermediate contigs, but use downstream } else{ filename @@ -204,9 +209,10 @@ process spades { path "scaffold*.{fa,fasta}", optional:true path "spades.log" path "contigs.paths" - path "{contigs,transcripts}.fasta", emit:contigs + path "{contigs,transcripts}.fasta", emit:contigs, optional:true path "assembly_graph.fastg", optional:true path "assembly_graph_with_scaffolds.gfa", optional:true + path "{contig-*,*contigs.fa,K*/final_contigs.fasta}", emit: intContigs script: @@ -246,6 +252,7 @@ process spades { //assemble using megahit process megahit { + publishDir( path: "$params.outDir/AssemblyBasedAnalysis", mode: 'copy', @@ -258,7 +265,10 @@ process megahit { "scaffold.fa" } else if(filename ==~ /megahit\/final\.contigs\.fa/) { - null //don't publish, but pass to downstream "rename" process + null //don't publish, but pass to downstream "renameFilterFasta" process + } + else if(filename ==~ /megahit\/((contig-.*)|(.*contigs\.fa)|(K.*\/final_contigs\.fasta))/) { + null //do not publish intermediate contigs, but use downstream } else{ filename @@ -273,8 +283,9 @@ process megahit { output: path "megahit/log" path "megahit/scaffold*.{fa,fasta}", optional:true //I don't believe this is a normal output of megahit, but just in case - path "megahit/final.contigs.fa", emit: contigs + path "megahit/final.contigs.fa", emit: contigs, optional:true path "${params.projName}_contigs.fastg" + path "megahit/{contig-*,*contigs.fa,K*/final_contigs.fasta}", emit: intContigs script: def paired = paired.name != "NO_FILE" ? "-1 ${paired[0]} -2 ${paired[1]} " : "" @@ -295,23 +306,6 @@ process megahit { } -//filter long reads for unicycler -process unicyclerPrep { - input: - path longreads - - - output: - path "long_reads.fasta" - - script: - - """ - seqtk seq -A -L\ - $params.unicycler.minLongReads\ - $longreads > long_reads.fasta - """ -} //assembly using unicycler process unicycler { @@ -330,7 +324,7 @@ process unicycler { "${params.projName}_contigs.fastg" } else if(filename ==~ /assembly\.fasta/) { - null //don't publish, but emit for use in downstream process "rename" + null //don't publish, but emit for use in downstream process "renameFilterFasta" } else{ filename @@ -348,6 +342,7 @@ process unicycler { path "scaffold*.{fa,fasta}", optional:true //I don't believe this is a normal output of unicycler, but just in case path "assembly.fasta", emit: contigs path "assembly.gfa", optional:true + //path "{contig-*,*contigs.fa,K*/final_contigs.fasta}", emit: intContigs | not produced by unicycler script: def paired = paired.name != "NO_FILE" ? "-1 ${paired[0]} -2 ${paired[1]} " : "" @@ -366,8 +361,28 @@ process unicycler { } +//filter long reads for unicycler +process unicyclerPrep { + input: + path longreads + + + output: + path "long_reads.fasta" + + script: + + """ + seqtk seq -A -L\ + $params.unicycler.minLongReads\ + $longreads > long_reads.fasta + """ +} + + //assembly using lrasm process lrasm { + publishDir ( path: "$params.outDir/AssemblyBasedAnalysis", mode: 'copy', @@ -392,7 +407,10 @@ process lrasm { "assembly_info.txt" } else if(filename ==~ /contigs\.fa/) { - null //do not publish, but emit for use in downstream process "rename" + null //do not publish, but emit for use in downstream process "renameFilterFasta" + } + else if(filename ==~ /((contig-.*)|(.*contigs\.fa)|(K.*\/final_contigs\.fasta))/) { + null //do not publish intermediate contigs, but use downstream } else{ filename @@ -406,11 +424,12 @@ process lrasm { output: path "contigs.log" path "scaffold*.{fa,fasta}", optional:true //I don't believe this is a normal output of lrasm, but just in case - path "contigs.fa", emit:contigs + path "contigs.fa", emit:contigs, optional:true path "Assembly/unitig.gfa", optional:true path "Assembly/assembly_graph.gfa", optional:true path "Assembly/assembly_graph.gv", optional:true path "Assembly/assembly_info.txt", optional:true + path "{contig-*,*contigs.fa,K*/final_contigs.fasta}", emit: intContigs script: def consensus = params.lrasm.numConsensus != null ? "-n $params.lrasm.numConsensus ": "" @@ -439,7 +458,7 @@ process lrasm { //2>/dev/null } -process rename { +process renameFilterFasta { publishDir( path: "$params.outDir/AssemblyBasedAnalysis", mode: 'copy' @@ -467,25 +486,30 @@ process rename { } -// process ifNoOutputContigs { -// input: -// val x -// path contigDirectory - -// when: -// x == 'EMPTY' +process bestIncompleteAssembly { + input: + val x + path intContigs -// output: -// path "*" + when: + x == 'EMPTY' -// shell: -// ''' -// #!/usr/bin/env perl -// my @intermediate_contigs = sort { -M $a <=> -M $b} glob("$AssemblerOutDir/contig-* $AssemblerOutDir/intermediate_contigs/*contigs.fa $AssemblerOutDir/K*/final_contigs.fasta" ); + output: + path "bestIntContig/*" -// ''' + shell: + ''' + #!/usr/bin/env perl + use Cwd; + my $dir = getcwd; + use Cwd 'abs_path'; + my @intermediate_contigs = sort { -M $a <=> -M $b} glob("!{intContigs}"); + my $best_int = $dir . "/" . $intermediate_contigs[0]; + mkdir bestIntContig; + system("cp $best_int ./bestIntContig"); + ''' -// } +} workflow { "mkdir nf_assets".execute().text @@ -508,16 +532,22 @@ workflow { su.filter{ it.size()>0 }.ifEmpty({file("nf_assets/NO_FILE2")}), l.filter{ it.size()>0 }.ifEmpty({file("nf_assets/NO_FILE3")}), avg_len_ch) - rename(idbaUD.out.contigs) + + bestIncompleteAssembly(idbaUD.out.contigs.ifEmpty('EMPTY'), idbaUD.out.intContigs) + renameFilterFasta(idbaUD.out.contigs.concat(bestIncompleteAssembly.out).first()) } else if (params.assembler.equalsIgnoreCase("SPAdes")) { spades(paired_ch, unpaired_ch, spades_pb, spades_np) - rename(spades.out.contigs) + + bestIncompleteAssembly(spades.out.contigs.ifEmpty('EMPTY'), spades.out.intContigs) + renameFilterFasta(spades.out.contigs.concat(bestIncompleteAssembly.out).first()) } else if (params.assembler.equalsIgnoreCase("MEGAHIT")) { megahit(paired_ch, unpaired_ch) - rename(megahit.out.contigs) + + bestIncompleteAssembly(megahit.out.contigs.ifEmpty('EMPTY'), megahit.out.intContigs) + renameFilterFasta(megahit.out.contigs.concat(bestIncompleteAssembly.out).first()) } else if (params.assembler.equalsIgnoreCase("UniCycler")) { if (params.unicycler.longreads != "nf_assets/NO_FILE3") { @@ -525,21 +555,22 @@ workflow { unicycler(paired_ch, unpaired_ch, unicyclerPrep(unicycler_lr).filter{it.size()>0}.ifEmpty({file("nf_assets/NO_FILE3")})) - rename(unicycler.out.contigs) + //unicycler produces no intermediate contigs, we let it error out above rather than try to rescue a failed assembly + renameFilterFasta(unicycler.out.contigs) } else { unicycler(paired_ch, unpaired_ch, unicycler_lr) - rename(unicycler.out.contigs) + renameFilterFasta(unicycler.out.contigs) } } else if (params.assembler.equalsIgnoreCase("LRASM")) { lrasm(unpaired_ch) - rename(lrasm.out.contigs) + + bestIncompleteAssembly(lrasm.out.contigs.ifEmpty('EMPTY'), lrasm.out.intContigs) + renameFilterFasta(lrasm.out.contigs.concat(bestIncompleteAssembly.out).first()) } else { error "Invalid assembler: $params.assembler" } - //TODO: add safety in case of assembly failure/incomplete assembly - } \ No newline at end of file