From be5791568e54dc54e94b501e695bcf52eab582f1 Mon Sep 17 00:00:00 2001 From: Andre Watson Date: Mon, 1 Jul 2024 16:54:34 -0600 Subject: [PATCH 1/9] Early version. Not yet functional --- runAssembly/extractLongReads.pl | 85 ++++++ runAssembly/nextflow.config | 23 ++ runAssembly/runAssembly.nf | 254 ++++++++++++++++++ .../test_files/parameters/idba_assembly.json | 6 + 4 files changed, 368 insertions(+) create mode 100644 runAssembly/extractLongReads.pl create mode 100644 runAssembly/nextflow.config create mode 100644 runAssembly/runAssembly.nf create mode 100644 runAssembly/test_files/parameters/idba_assembly.json diff --git a/runAssembly/extractLongReads.pl b/runAssembly/extractLongReads.pl new file mode 100644 index 0000000..a36d57d --- /dev/null +++ b/runAssembly/extractLongReads.pl @@ -0,0 +1,85 @@ +#!/usr/bin/env perl + +use strict; +use warnings; +use File::Basename; +use Getopt::Long; + +my $paired_fasta=''; +my $single_fasta=''; +my $outputDir=''; +my $len_cutoff=350; + +GetOptions( + 'p:s' => \$paired_fasta, + 'u:s' => \$unpaired_fasta, + 'd=s' => \$outputDir, + 'len:s' => \$len_cutoff +) + +my $short_paired_fasta="short_paired.fa"; +my $short_single="short_single.fa"; +my $long_fasta="long.fa" + +open (my $o_paired, ">$short_paired_fasta") or die "Cannot write $short_paired_fasta\n"; +open (my $o_single, ">$short_single_fasta") or die "Cannot write $short_single_fasta\n"; +open (my $o_long, ">$long_fasta") or die "Cannot write $long_fasta\n"; +$/ = ">"; +if (-s $paired_fasta) { + open (my $fh, $paired_fasta) or die "$! $paired_fasta"; + while (<$fh>) + { + $_ =~ s/\>//g; + my ($id, @seq) = split /\n/, $_; + next if (!$id); + my ($id2, @seq2) = split /\n/, <$fh>; + my $seq = join "", @seq; + my $seq2 = join "", @seq2; + my $len = length($seq); + my $len2 = length($seq2); + if ($len > $len_cutoff and $len2 > $len_cutoff) + { + print $o_long ">$id\n$seq\n>$id2\n$seq2\n"; + } + elsif ($len > $len_cutoff) + { + print $o_long ">$id\n$seq\n"; + print $o_single ">$id2\n$seq2\n"; + } + elsif ($len2 > $len_cutoff) + { + print $o_long ">$id2\n$seq2\n"; + print $o_single ">$id\n$seq\n"; + } + else + { + print $o_paired ">$id\n$seq\n>$id2\n$seq2\n"; + } + } + close $fh; +} +if (-s $single_fasta) +{ + open (my $fh, $single_fasta) or die "$! $single_fasta"; + while (<$fh>) + { + $_ =~ s/\>//g; + my ($id, @seq) = split /\n/, $_; + next if (!$id); + my $seq = join "", @seq; + my $len = length($seq); + if ($len > $len_cutoff) + { + print $o_long ">$id\n$seq\n"; + } + else + { + print $o_single ">$id\n$seq\n"; + } + } + close $fh; +} +$/="\n"; +close $o_paired; +close $o_long; +close $o_single; \ No newline at end of file diff --git a/runAssembly/nextflow.config b/runAssembly/nextflow.config new file mode 100644 index 0000000..13bffad --- /dev/null +++ b/runAssembly/nextflow.config @@ -0,0 +1,23 @@ +params { + spades { + pacbio = "NO_FILE" + nanopore = "NO_FILE" + algorithm = null + } + megahit { + preset = null + } + unicycler { + longreads = "NO_FILE" + minLongReads = 2000 + bridgingMode = "normal" + } + lrasm { + lrasm = '' + lrasm.minLength = 400 + lrasm.preset = null + lrasm.algorithm = null + lrasm.ec = null + lrasm.numConsensus = null + } +} \ No newline at end of file diff --git a/runAssembly/runAssembly.nf b/runAssembly/runAssembly.nf new file mode 100644 index 0000000..ca77bf4 --- /dev/null +++ b/runAssembly/runAssembly.nf @@ -0,0 +1,254 @@ +#!/usr/bin/env nextflow +//to run: nextflow [OPT: -log /path/to/log file] run runAssembly.nf -params-file [JSON parameter file] + +//TODO: rethink inputs and outputs given nextflow's work directory aspects, esp. output directories +params.assembler = "IDBA_UD" + +params.outDir = '.' +params.threads = null //default? +params.projName = null +params.pairedFiles = "NO_FILE" +params.unpairedFile = "NO_FILE" +params.pacbioFasta = "NO_FILE" + + +// process idbaUD { +// //TODO +// } + +process idbaExtractLong { + //TODO + input: + path paired + path unpaired + + output: + path "*" + + script: + """ + extractLongReads.pl + -p paired + -u unpaired + -d . + """ +} + +process idbaPrep { + + input: + path "paired" + path "unpaired" + + output: + path "pairedForAssembly.fasta", emit:idba_prep_paired + path "unpairedForAssembly.fasta", optional:true + + script: + def pair_process = params.pairedFiles != "NO_FILE" ? "fq2fa --filter --merge paired1 paired2 pairedForAssembly.fasta;" : "" + def unpair_process = params.unpairedFile != "NO_FILE" ? "fq2fa --filter unpaired unpairedForAssembly.fasta;" : "" + + """ + $pair_process + $unpair_process + """ + +} + +process spades { + input: + path paired + path unpaired + path pacbio + path nanopore + + output: + path "$params.assembler/*" + + script: + def paired = params.pairedFiles != "NO_FILE" ? "--pe1-1 paired1 --pe2-2 paired2 " : "" + def unpaired = params.unpairedFile != "NO_FILE" ? "--s1 unpaired " : "" + def pacbio_file = params.spades.pacbio != "NO_FILE" ? "--pacbio pacbio " : "" + def nanopore_file = params.spades.nanopore != "NO_FILE" ? "--nanopore nanopore " : "" + def meta_flag = (params.pairedFiles != "NO_FILE" && params.spades.algorithm == "metagenome") ? "--meta " : "" + def sc_flag = params.spades == "singlecell" ? "--sc " : "" + def rna_flag = params.spades == "rna" ? "--rna " : "" + def plasmid_flag = params.spades == "plasmid" ? "--plasmid " : "" + def bio_flag = params.spades == "bio" ? "--bio " : "" + def corona_flag = params.spades == "corona" ? "--corona " : "" + def metaviral_flag = params.spades == "metaviral" ? "--metaviral " : "" + def metaplasmid_flag = params.spades == "metaplasmid" ? "--metaplasmid " : "" + def rnaviral_flag = params.spades == "rnaviral" ? "--rnaviral " : "" + def memlimit = params.memlimit != null ? "-m ${params.memlimit/1024*1024}" : "" + + """ + spades.py -o $params.outDir -t $params.threads + $paired + $meta_flag + $sc_flag + $rna_flag + $plasmid_flag + $bio_flag + $corona_flag + $metaviral_flag + $metaplasmid_flag + $rnaviral_flag + $unpaired + $pacbio_file + $nanopore_file + $memlimit + """ +} + + +process megahit { + input: + path paired + path unpaired + + output: + path "$params.assembler/*" + + script: + def paired = params.pairedFiles != "NO_FILE" ? "-1 paired1 -2 paired2 " : "" + def unpaired = params.unpairedFile != "NO_FILE" ? "-r unpaired " : "" + def megahit_preset = params.megahit.preset != null ? "--presets $params.megahit.preset " : "" + + """ + megahit -o $params.outDir -t $params.threads + $megahit_preset + $paired + $unpaired + 2>&1 + """ + +} + +process unicyclerPrep { + input: + path longreads + + + output: + path "long_reads.fasta" + + script: + + """ + seqtk seq -A -L + $params.unicycler.minLongReads + longreads > long_reads.fasta + """ +} + +process unicycler { + input: + path paired + path unpaired + path longreads //must check for NO_FILE. If present, expects filtered long reads. + + output: + path "$params.outDir/$params.assembler/*" + + script: + def paired = params.pairedFiles != "NO_FILE" ? "-1 paired1 -2 paired2 " : "" + def unpaired = params.unpairedFile != "NO_FILE" ? "-r unpaired " : "" + def filt_lr = params.unicycler.longreads != "NO_FILE" ? "-l longreads " : "" + def bridge = params.unicycler.bridgingMode != "normal" ? "--mode $params.unicycler.bridgingMode" : "--mode normal" + + //test to see if unicycler can be run from the environment + //and if we need to export some java options + """ + unicycler -t $params.threads -o $params.outDir + $paired + $filt_lr + $bridge 2>&1 1>/dev/null + """ + +} + +process lrasm { + + input: + path unpaired + + output: + + script: + def consensus = params.lrasm.numConsensus != null ? "-n $params.lrasm.numConsensus ": "" + def preset = params.lrasm.preset != null ? "-x $params.lrasm.preset " : "" + def errorCorrection = params.lrasm.ec != null ? "-e " : "" + def algorithm = params.lrasm.algorithm != null ? "-a $params.lrasm.algorithm " : "" + if (params.lrasm.algorithm == "miniasm") { + def minLenOpt = "--ao \'-s $params.lrasm.minLength\' " + } + else if (params.lrasm.algorithm == "wtdbg2") { + def minLenOpt = "--wo \'-L $params.lrasm.minLength\' " + } + else { + def minLenOpt = "" + } + def flyeOpt = params.lrasm.algorithm == "metaflye" ? "--fo '--meta' ": "" + + """ + lrasm -o $params.OutDir -t $params.threads + $preset + $consensus + $errorCorrection + $algorithm + $minLenOpt + $flyeOpt + $unpaired + 2>/dev/null + """ + +} + +workflow { + "touch NO_FILE".execute().text + + paired_ch = channel.fromPath(params.pairedFiles, relative:true, checkIfExists:true).collect() + unpaired_ch = channel.fromPath(params.unpairedFile, relative:true, checkIfExists:true) + spades_pb = file(params.spades.pacbio, checkIfExists:true) + spades_np = file(params.spades.nanopore, checkIfExists:true) + unicycler_lr = file(params.unicycler.longreads, checkIfExists:true) + + if (params.assembler == "IDBA_UD") { + //idbaUD(idbaExtractLong(idbaPrep(paired_ch, unpaired_ch))) + idbaExtractLong(idbaPrep(paired_ch, unpaired_ch))//even though the files to pipe exist, passed channel seems empty + + + + + } + else if (params.assembler == "SPAdes") { + spades(paired_ch, unpaired_ch, spades_pb, spades_np) + } + else if (params.assembler == "MEGAHIT") { + megahit(paired_ch, unpaired_ch, unicycler_lr) + } + else if (params.assembler == "UniCycler") { + if (params.unicycler.longreads != "NO_FILE") { + println("Filter long reads with $params.unicycler.minLongReads (bp) cutoff") + unicycler(paired_ch, unpaired_ch, unicyclerPrep(unicycler_lr)) + } + else { + unicycler(paired_ch, unpaired_ch, unicycler_lr) + } + } + else if (params.assembler == "LRASM") { + //lrasm(unpaired_ch) //issues installing LRASM + } + else { + error "Invalid assembler: $params.assembler" + } + + //TODO: add in rest of runAssembly + + //scaffold cleanup + //contigs + //assembly graph + //rename by project name + //cleanup + +} \ No newline at end of file diff --git a/runAssembly/test_files/parameters/idba_assembly.json b/runAssembly/test_files/parameters/idba_assembly.json new file mode 100644 index 0000000..4df2da9 --- /dev/null +++ b/runAssembly/test_files/parameters/idba_assembly.json @@ -0,0 +1,6 @@ +{ + "assembler":"IDBA_UD", + "threads":4, + "projName":"test_idba", + "pairedFiles":["test_files/Ecoli_10x.1.fastq","test_files/Ecoli_10x.2.fastq"] +} \ No newline at end of file From b4760740fc50813c5f389774bd7825483f378651 Mon Sep 17 00:00:00 2001 From: Andre Watson Date: Tue, 2 Jul 2024 17:03:46 -0600 Subject: [PATCH 2/9] IDBA runs from NF script --- runAssembly/{ => bin}/extractLongReads.pl | 10 +- runAssembly/nextflow.config | 18 ++- runAssembly/runAssembly.nf | 142 +++++++++++++++------- 3 files changed, 113 insertions(+), 57 deletions(-) rename runAssembly/{ => bin}/extractLongReads.pl (93%) mode change 100644 => 100755 diff --git a/runAssembly/extractLongReads.pl b/runAssembly/bin/extractLongReads.pl old mode 100644 new mode 100755 similarity index 93% rename from runAssembly/extractLongReads.pl rename to runAssembly/bin/extractLongReads.pl index a36d57d..d03674a --- a/runAssembly/extractLongReads.pl +++ b/runAssembly/bin/extractLongReads.pl @@ -12,14 +12,15 @@ GetOptions( 'p:s' => \$paired_fasta, - 'u:s' => \$unpaired_fasta, + 'u:s' => \$single_fasta, 'd=s' => \$outputDir, 'len:s' => \$len_cutoff -) +); + my $short_paired_fasta="short_paired.fa"; -my $short_single="short_single.fa"; -my $long_fasta="long.fa" +my $short_single_fasta="short_single.fa"; +my $long_fasta="long.fa"; open (my $o_paired, ">$short_paired_fasta") or die "Cannot write $short_paired_fasta\n"; open (my $o_single, ">$short_single_fasta") or die "Cannot write $short_single_fasta\n"; @@ -29,6 +30,7 @@ open (my $fh, $paired_fasta) or die "$! $paired_fasta"; while (<$fh>) { + print "reading file!"; $_ =~ s/\>//g; my ($id, @seq) = split /\n/, $_; next if (!$id); diff --git a/runAssembly/nextflow.config b/runAssembly/nextflow.config index 13bffad..c38b7cf 100644 --- a/runAssembly/nextflow.config +++ b/runAssembly/nextflow.config @@ -1,4 +1,11 @@ params { + minContigSize = 200 + memLimit = null + idba{ + maxK = 121 + minK = 31 + step = 20 + } spades { pacbio = "NO_FILE" nanopore = "NO_FILE" @@ -13,11 +20,10 @@ params { bridgingMode = "normal" } lrasm { - lrasm = '' - lrasm.minLength = 400 - lrasm.preset = null - lrasm.algorithm = null - lrasm.ec = null - lrasm.numConsensus = null + minLength = 400 + preset = null + algorithm = null + ec = null + numConsensus = null } } \ No newline at end of file diff --git a/runAssembly/runAssembly.nf b/runAssembly/runAssembly.nf index ca77bf4..85a7a5d 100644 --- a/runAssembly/runAssembly.nf +++ b/runAssembly/runAssembly.nf @@ -12,30 +12,73 @@ params.unpairedFile = "NO_FILE" params.pacbioFasta = "NO_FILE" -// process idbaUD { -// //TODO -// } +process idbaUD { + //TODO: implement avglen safety + publishDir "test_idba" -process idbaExtractLong { - //TODO input: - path paired - path unpaired + path short_paired + path short_single + path long_reads output: path "*" script: + println(short_paired.name) + println(short_single.name) + println(long_reads.name) + def runFlag = "" + if(short_paired.name != "NO_FILE" && short_single.name != "NO_FILE2") { + runFlag = "-r $short_single --read_level_2 $short_paired " + } + else if(short_paired.name != "NO_FILE") { + runFlag = "-r $short_paired " + } + else if(short_single.name != "NO_FILE2") { + runFlag = "-r $short_single " + } + def longReadsFile = long_reads.name != "NO_FILE3" ? "-l $long_reads" : "" + def maxK = params.idba.maxK != null ? "--maxk $params.idba.maxK " : "" + def minK = params.idba.minK != null ? "--mink $params.idba.minK " : "" + def step = params.idba.step != null ? "--step $params.idba.step " : "" + def minLen = params.minContigSize != null ? "--min_contig $params.minContigSize " : "" + + def memLimit = params.memLimit != null ? "ulimit -v $params.memLimit 2>/dev/null;" : "" + """ + ${memLimit}idba_ud --pre_correction -o . --num_threads $params.threads\ + $runFlag\ + $longReadsFile\ + $maxK\ + $minK\ + $step\ + $minLen + """ + + } + +process idbaExtractLong { + input: + path "paired" + path "unpaired" + + output: + path "short_paired.fa" + path "short_single.fa" + path "long.fa" + + script: + def pair_file = paired.name != "NO_FILE" ? "-p paired " : "" + def unpaired_file = unpaired.name != "NO_FILE" ? "-u unpaired " : "" """ - extractLongReads.pl - -p paired - -u unpaired - -d . + extractLongReads.pl\ + $pair_file\ + $unpaired_file\ + -d . """ } process idbaPrep { - input: path "paired" path "unpaired" @@ -82,20 +125,20 @@ process spades { def memlimit = params.memlimit != null ? "-m ${params.memlimit/1024*1024}" : "" """ - spades.py -o $params.outDir -t $params.threads - $paired - $meta_flag - $sc_flag - $rna_flag - $plasmid_flag - $bio_flag - $corona_flag - $metaviral_flag - $metaplasmid_flag - $rnaviral_flag - $unpaired - $pacbio_file - $nanopore_file + spades.py -o $params.outDir -t $params.threads\ + $paired\ + $meta_flag\ + $sc_flag\ + $rna_flag\ + $plasmid_flag\ + $bio_flag\ + $corona_flag\ + $metaviral_flag\ + $metaplasmid_flag\ + $rnaviral_flag\ + $unpaired\ + $pacbio_file\ + $nanopore_file\ $memlimit """ } @@ -115,10 +158,10 @@ process megahit { def megahit_preset = params.megahit.preset != null ? "--presets $params.megahit.preset " : "" """ - megahit -o $params.outDir -t $params.threads - $megahit_preset - $paired - $unpaired + megahit -o $params.outDir -t $params.threads\ + $megahit_preset\ + $paired\ + $unpaired\ 2>&1 """ @@ -135,8 +178,8 @@ process unicyclerPrep { script: """ - seqtk seq -A -L - $params.unicycler.minLongReads + seqtk seq -A -L\ + $params.unicycler.minLongReads\ longreads > long_reads.fasta """ } @@ -159,9 +202,9 @@ process unicycler { //test to see if unicycler can be run from the environment //and if we need to export some java options """ - unicycler -t $params.threads -o $params.outDir - $paired - $filt_lr + unicycler -t $params.threads -o $params.outDir\ + $paired\ + $filt_lr\ $bridge 2>&1 1>/dev/null """ @@ -191,14 +234,14 @@ process lrasm { def flyeOpt = params.lrasm.algorithm == "metaflye" ? "--fo '--meta' ": "" """ - lrasm -o $params.OutDir -t $params.threads - $preset - $consensus - $errorCorrection - $algorithm - $minLenOpt - $flyeOpt - $unpaired + lrasm -o $params.OutDir -t $params.threads\ + $preset\ + $consensus\ + $errorCorrection\ + $algorithm\ + $minLenOpt\ + $flyeOpt\ + $unpaired\ 2>/dev/null """ @@ -206,6 +249,8 @@ process lrasm { workflow { "touch NO_FILE".execute().text + "touch NO_FILE2".execute().text + "touch NO_FILE3".execute().text paired_ch = channel.fromPath(params.pairedFiles, relative:true, checkIfExists:true).collect() unpaired_ch = channel.fromPath(params.unpairedFile, relative:true, checkIfExists:true) @@ -215,10 +260,13 @@ workflow { if (params.assembler == "IDBA_UD") { //idbaUD(idbaExtractLong(idbaPrep(paired_ch, unpaired_ch))) - idbaExtractLong(idbaPrep(paired_ch, unpaired_ch))//even though the files to pipe exist, passed channel seems empty - - - + //idbaExtractLong(idbaPrep(paired_ch, unpaired_ch))//even though the files to pipe exist, passed channel seems empty + (c1,c2) = idbaPrep(paired_ch, unpaired_ch) + (sp,su,l) = idbaExtractLong(c1,c2.ifEmpty({file("NO_FILE", checkIfExists:true)})) + l.filter{it.size()>0}.ifEmpty("EMPTY!!!!!").view() + idbaUD(sp.filter{ it.size()>0 }.ifEmpty({file("NO_FILE", checkIfExists:true)}), + su.filter{ it.size()>0 }.ifEmpty({file("NO_FILE2", checkIfExists:true)}), + l.filter{ it.size()>0 }.ifEmpty({file("NO_FILE3", checkIfExists:true)})) } else if (params.assembler == "SPAdes") { From f91da3090ef3abb0dab61e8e9ec84164ef90e8a1 Mon Sep 17 00:00:00 2001 From: Andre Watson Date: Wed, 3 Jul 2024 13:32:42 -0600 Subject: [PATCH 3/9] 4 of 5 assemblers running --- runAssembly/nextflow.config | 7 +- runAssembly/runAssembly.nf | 72 ++++++++++--------- .../parameters/megahit_assembly.json | 6 ++ .../parameters/spades_assembly.json | 6 ++ .../parameters/unicycler_assembly.json | 6 ++ 5 files changed, 60 insertions(+), 37 deletions(-) create mode 100644 runAssembly/test_files/parameters/megahit_assembly.json create mode 100644 runAssembly/test_files/parameters/spades_assembly.json create mode 100644 runAssembly/test_files/parameters/unicycler_assembly.json diff --git a/runAssembly/nextflow.config b/runAssembly/nextflow.config index c38b7cf..4846cc5 100644 --- a/runAssembly/nextflow.config +++ b/runAssembly/nextflow.config @@ -1,4 +1,5 @@ params { + threads= 4 minContigSize = 200 memLimit = null idba{ @@ -7,15 +8,15 @@ params { step = 20 } spades { - pacbio = "NO_FILE" - nanopore = "NO_FILE" + pacbio = "NO_FILE3" + nanopore = "NO_FILE4" algorithm = null } megahit { preset = null } unicycler { - longreads = "NO_FILE" + longreads = "NO_FILE3" minLongReads = 2000 bridgingMode = "normal" } diff --git a/runAssembly/runAssembly.nf b/runAssembly/runAssembly.nf index 85a7a5d..1521d3d 100644 --- a/runAssembly/runAssembly.nf +++ b/runAssembly/runAssembly.nf @@ -2,14 +2,17 @@ //to run: nextflow [OPT: -log /path/to/log file] run runAssembly.nf -params-file [JSON parameter file] //TODO: rethink inputs and outputs given nextflow's work directory aspects, esp. output directories +//TODO: detection system for memory limit +//TODO: output and transparency needs to be compared to EDGE website +//TODO: specificity on output from assemblers, separate from cleanup. Will require seeing what's needed for downstream processes. + params.assembler = "IDBA_UD" params.outDir = '.' -params.threads = null //default? +params.threads = 8 //default? params.projName = null params.pairedFiles = "NO_FILE" -params.unpairedFile = "NO_FILE" -params.pacbioFasta = "NO_FILE" +params.unpairedFile = "NO_FILE2" process idbaUD { @@ -25,9 +28,6 @@ process idbaUD { path "*" script: - println(short_paired.name) - println(short_single.name) - println(long_reads.name) def runFlag = "" if(short_paired.name != "NO_FILE" && short_single.name != "NO_FILE2") { runFlag = "-r $short_single --read_level_2 $short_paired " @@ -99,6 +99,8 @@ process idbaPrep { } process spades { + publishDir "test_spades" + input: path paired path unpaired @@ -106,14 +108,14 @@ process spades { path nanopore output: - path "$params.assembler/*" + path "*" script: - def paired = params.pairedFiles != "NO_FILE" ? "--pe1-1 paired1 --pe2-2 paired2 " : "" - def unpaired = params.unpairedFile != "NO_FILE" ? "--s1 unpaired " : "" - def pacbio_file = params.spades.pacbio != "NO_FILE" ? "--pacbio pacbio " : "" - def nanopore_file = params.spades.nanopore != "NO_FILE" ? "--nanopore nanopore " : "" - def meta_flag = (params.pairedFiles != "NO_FILE" && params.spades.algorithm == "metagenome") ? "--meta " : "" + def paired = paired.name != "NO_FILE" ? "--pe1-1 ${paired[0]} --pe1-2 ${paired[1]} " : "" + def unpaired = unpaired.name != "NO_FILE2" ? "--s1 $unpaired " : "" + def pacbio_file = pacbio.name != "NO_FILE3" ? "--pacbio $pacbio " : "" + def nanopore_file = nanopore.name != "NO_FILE4" ? "--nanopore $nanopore " : "" + def meta_flag = (paired != "" && params.spades.algorithm == "metagenome") ? "--meta " : "" def sc_flag = params.spades == "singlecell" ? "--sc " : "" def rna_flag = params.spades == "rna" ? "--rna " : "" def plasmid_flag = params.spades == "plasmid" ? "--plasmid " : "" @@ -122,7 +124,7 @@ process spades { def metaviral_flag = params.spades == "metaviral" ? "--metaviral " : "" def metaplasmid_flag = params.spades == "metaplasmid" ? "--metaplasmid " : "" def rnaviral_flag = params.spades == "rnaviral" ? "--rnaviral " : "" - def memlimit = params.memlimit != null ? "-m ${params.memlimit/1024*1024}" : "" + //def memlimit = params.memlimit != null ? "-m ${params.memlimit/1024*1024}" : "" """ spades.py -o $params.outDir -t $params.threads\ @@ -138,27 +140,29 @@ process spades { $rnaviral_flag\ $unpaired\ $pacbio_file\ - $nanopore_file\ - $memlimit + $nanopore_file """ + //$memlimit } process megahit { + publishDir "test_megahit" + input: path paired path unpaired output: - path "$params.assembler/*" + path "megahit/*" script: - def paired = params.pairedFiles != "NO_FILE" ? "-1 paired1 -2 paired2 " : "" - def unpaired = params.unpairedFile != "NO_FILE" ? "-r unpaired " : "" + def paired = paired.name != "NO_FILE" ? "-1 ${paired[0]} -2 ${paired[1]} " : "" + def unpaired = unpaired.name != "NO_FILE" ? "-r $unpaired " : "" def megahit_preset = params.megahit.preset != null ? "--presets $params.megahit.preset " : "" """ - megahit -o $params.outDir -t $params.threads\ + megahit -o $params.outDir/megahit -t $params.threads\ $megahit_preset\ $paired\ $unpaired\ @@ -180,23 +184,25 @@ process unicyclerPrep { """ seqtk seq -A -L\ $params.unicycler.minLongReads\ - longreads > long_reads.fasta + $longreads > long_reads.fasta """ } process unicycler { + publishDir "unicycler_test" + input: path paired path unpaired path longreads //must check for NO_FILE. If present, expects filtered long reads. output: - path "$params.outDir/$params.assembler/*" + path "*" script: - def paired = params.pairedFiles != "NO_FILE" ? "-1 paired1 -2 paired2 " : "" - def unpaired = params.unpairedFile != "NO_FILE" ? "-r unpaired " : "" - def filt_lr = params.unicycler.longreads != "NO_FILE" ? "-l longreads " : "" + def paired = paired.name != "NO_FILE" ? "-1 ${paired[0]} -2 ${paired[1]} " : "" + def unpaired = unpaired.name != "NO_FILE2" ? "-r $unpaired " : "" + def filt_lr = longreads.name != "NO_FILE3" ? "-l $longreads " : "" def bridge = params.unicycler.bridgingMode != "normal" ? "--mode $params.unicycler.bridgingMode" : "--mode normal" //test to see if unicycler can be run from the environment @@ -251,6 +257,7 @@ workflow { "touch NO_FILE".execute().text "touch NO_FILE2".execute().text "touch NO_FILE3".execute().text + "touch NO_FILE4".execute().text paired_ch = channel.fromPath(params.pairedFiles, relative:true, checkIfExists:true).collect() unpaired_ch = channel.fromPath(params.unpairedFile, relative:true, checkIfExists:true) @@ -258,25 +265,22 @@ workflow { spades_np = file(params.spades.nanopore, checkIfExists:true) unicycler_lr = file(params.unicycler.longreads, checkIfExists:true) - if (params.assembler == "IDBA_UD") { - //idbaUD(idbaExtractLong(idbaPrep(paired_ch, unpaired_ch))) - //idbaExtractLong(idbaPrep(paired_ch, unpaired_ch))//even though the files to pipe exist, passed channel seems empty + if (params.assembler.equalsIgnoreCase("IDBA_UD")) { (c1,c2) = idbaPrep(paired_ch, unpaired_ch) (sp,su,l) = idbaExtractLong(c1,c2.ifEmpty({file("NO_FILE", checkIfExists:true)})) - l.filter{it.size()>0}.ifEmpty("EMPTY!!!!!").view() idbaUD(sp.filter{ it.size()>0 }.ifEmpty({file("NO_FILE", checkIfExists:true)}), su.filter{ it.size()>0 }.ifEmpty({file("NO_FILE2", checkIfExists:true)}), l.filter{ it.size()>0 }.ifEmpty({file("NO_FILE3", checkIfExists:true)})) } - else if (params.assembler == "SPAdes") { + else if (params.assembler.equalsIgnoreCase("SPAdes")) { spades(paired_ch, unpaired_ch, spades_pb, spades_np) } - else if (params.assembler == "MEGAHIT") { - megahit(paired_ch, unpaired_ch, unicycler_lr) + else if (params.assembler.equalsIgnoreCase("MEGAHIT")) { + megahit(paired_ch, unpaired_ch) } - else if (params.assembler == "UniCycler") { - if (params.unicycler.longreads != "NO_FILE") { + else if (params.assembler.equalsIgnoreCase("UniCycler")) { + if (params.unicycler.longreads != "NO_FILE3") { println("Filter long reads with $params.unicycler.minLongReads (bp) cutoff") unicycler(paired_ch, unpaired_ch, unicyclerPrep(unicycler_lr)) } @@ -284,7 +288,7 @@ workflow { unicycler(paired_ch, unpaired_ch, unicycler_lr) } } - else if (params.assembler == "LRASM") { + else if (params.assembler.equalsIgnoreCase("LRASM")) { //lrasm(unpaired_ch) //issues installing LRASM } else { diff --git a/runAssembly/test_files/parameters/megahit_assembly.json b/runAssembly/test_files/parameters/megahit_assembly.json new file mode 100644 index 0000000..8975d84 --- /dev/null +++ b/runAssembly/test_files/parameters/megahit_assembly.json @@ -0,0 +1,6 @@ +{ + "assembler":"MEGAHIT", + "threads":4, + "projName":"test_megahit", + "pairedFiles":["test_files/Ecoli_10x.1.fastq","test_files/Ecoli_10x.2.fastq"] +} \ No newline at end of file diff --git a/runAssembly/test_files/parameters/spades_assembly.json b/runAssembly/test_files/parameters/spades_assembly.json new file mode 100644 index 0000000..f2a4e82 --- /dev/null +++ b/runAssembly/test_files/parameters/spades_assembly.json @@ -0,0 +1,6 @@ +{ + "assembler":"SPAdes", + "threads":4, + "projName":"test_spades", + "pairedFiles":["test_files/Ecoli_10x.1.fastq","test_files/Ecoli_10x.2.fastq"] +} \ No newline at end of file diff --git a/runAssembly/test_files/parameters/unicycler_assembly.json b/runAssembly/test_files/parameters/unicycler_assembly.json new file mode 100644 index 0000000..8b2ff05 --- /dev/null +++ b/runAssembly/test_files/parameters/unicycler_assembly.json @@ -0,0 +1,6 @@ +{ + "assembler":"uNicycleR", + "threads":4, + "projName":"test_unicycler", + "pairedFiles":["test_files/Ecoli_10x.1.fastq","test_files/Ecoli_10x.2.fastq"] +} \ No newline at end of file From ef415fe7abfa36ff168f169d5aef6535d80a5794 Mon Sep 17 00:00:00 2001 From: Andre Watson Date: Fri, 5 Jul 2024 11:24:16 -0600 Subject: [PATCH 4/9] all assemblers running --- runAssembly/nextflow.config | 11 ++- runAssembly/runAssembly.nf | 99 +++++++++---------- .../test_files/parameters/idba_assembly.json | 3 +- .../test_files/parameters/lrasm_assembly.json | 10 ++ .../parameters/megahit_assembly.json | 3 +- .../parameters/spades_assembly.json | 3 +- .../parameters/unicycler_assembly.json | 3 +- 7 files changed, 73 insertions(+), 59 deletions(-) create mode 100644 runAssembly/test_files/parameters/lrasm_assembly.json diff --git a/runAssembly/nextflow.config b/runAssembly/nextflow.config index 4846cc5..f631d55 100644 --- a/runAssembly/nextflow.config +++ b/runAssembly/nextflow.config @@ -1,4 +1,6 @@ params { + pairedFiles = "nf_assets/NO_FILE" + unpairedFile = "nf_assets/NO_FILE2" threads= 4 minContigSize = 200 memLimit = null @@ -8,15 +10,15 @@ params { step = 20 } spades { - pacbio = "NO_FILE3" - nanopore = "NO_FILE4" + pacbio = "nf_assets/NO_FILE3" + nanopore = "nf_assets/NO_FILE4" algorithm = null } megahit { preset = null } unicycler { - longreads = "NO_FILE3" + longreads = "nf_assets/NO_FILE3" minLongReads = 2000 bridgingMode = "normal" } @@ -27,4 +29,7 @@ params { ec = null numConsensus = null } +} +workflow.onComplete = { + "rm -rf nf_assets".execute().text } \ No newline at end of file diff --git a/runAssembly/runAssembly.nf b/runAssembly/runAssembly.nf index 1521d3d..dad97d7 100644 --- a/runAssembly/runAssembly.nf +++ b/runAssembly/runAssembly.nf @@ -1,23 +1,19 @@ #!/usr/bin/env nextflow //to run: nextflow [OPT: -log /path/to/log file] run runAssembly.nf -params-file [JSON parameter file] -//TODO: rethink inputs and outputs given nextflow's work directory aspects, esp. output directories //TODO: detection system for memory limit //TODO: output and transparency needs to be compared to EDGE website -//TODO: specificity on output from assemblers, separate from cleanup. Will require seeing what's needed for downstream processes. params.assembler = "IDBA_UD" params.outDir = '.' params.threads = 8 //default? params.projName = null -params.pairedFiles = "NO_FILE" -params.unpairedFile = "NO_FILE2" process idbaUD { //TODO: implement avglen safety - publishDir "test_idba" + publishDir "$params.outDir/idba", mode: 'copy' input: path short_paired @@ -55,12 +51,11 @@ process idbaUD { $minLen """ - } - +} process idbaExtractLong { input: - path "paired" - path "unpaired" + path paired + path unpaired output: path "short_paired.fa" @@ -68,8 +63,8 @@ process idbaExtractLong { path "long.fa" script: - def pair_file = paired.name != "NO_FILE" ? "-p paired " : "" - def unpaired_file = unpaired.name != "NO_FILE" ? "-u unpaired " : "" + def pair_file = paired.name != "NO_FILE" ? "-p $paired " : "" + def unpaired_file = unpaired.name != "NO_FILE2" ? "-u $unpaired " : "" """ extractLongReads.pl\ $pair_file\ @@ -77,19 +72,18 @@ process idbaExtractLong { -d . """ } - process idbaPrep { input: - path "paired" - path "unpaired" + path paired + path unpaired output: path "pairedForAssembly.fasta", emit:idba_prep_paired path "unpairedForAssembly.fasta", optional:true script: - def pair_process = params.pairedFiles != "NO_FILE" ? "fq2fa --filter --merge paired1 paired2 pairedForAssembly.fasta;" : "" - def unpair_process = params.unpairedFile != "NO_FILE" ? "fq2fa --filter unpaired unpairedForAssembly.fasta;" : "" + def pair_process = paired.name != "NO_FILE" ? "fq2fa --filter --merge ${paired[0]} ${paired[1]} pairedForAssembly.fasta;" : "" + def unpair_process = unpaired.name != "NO_FILE2" ? "fq2fa --filter $unpaired unpairedForAssembly.fasta;" : "" """ $pair_process @@ -99,7 +93,7 @@ process idbaPrep { } process spades { - publishDir "test_spades" + publishDir "$params.outDir/spades", mode: 'copy' input: path paired @@ -116,18 +110,18 @@ process spades { def pacbio_file = pacbio.name != "NO_FILE3" ? "--pacbio $pacbio " : "" def nanopore_file = nanopore.name != "NO_FILE4" ? "--nanopore $nanopore " : "" def meta_flag = (paired != "" && params.spades.algorithm == "metagenome") ? "--meta " : "" - def sc_flag = params.spades == "singlecell" ? "--sc " : "" - def rna_flag = params.spades == "rna" ? "--rna " : "" - def plasmid_flag = params.spades == "plasmid" ? "--plasmid " : "" - def bio_flag = params.spades == "bio" ? "--bio " : "" - def corona_flag = params.spades == "corona" ? "--corona " : "" - def metaviral_flag = params.spades == "metaviral" ? "--metaviral " : "" - def metaplasmid_flag = params.spades == "metaplasmid" ? "--metaplasmid " : "" - def rnaviral_flag = params.spades == "rnaviral" ? "--rnaviral " : "" + def sc_flag = params.spades.algorithm == "singlecell" ? "--sc " : "" + def rna_flag = params.spades.algorithm == "rna" ? "--rna " : "" + def plasmid_flag = params.spades.algorithm == "plasmid" ? "--plasmid " : "" + def bio_flag = params.spades.algorithm == "bio" ? "--bio " : "" + def corona_flag = params.spades.algorithm == "corona" ? "--corona " : "" + def metaviral_flag = params.spades.algorithm == "metaviral" ? "--metaviral " : "" + def metaplasmid_flag = params.spades.algorithm == "metaplasmid" ? "--metaplasmid " : "" + def rnaviral_flag = params.spades.algorithm == "rnaviral" ? "--rnaviral " : "" //def memlimit = params.memlimit != null ? "-m ${params.memlimit/1024*1024}" : "" """ - spades.py -o $params.outDir -t $params.threads\ + spades.py -o . -t $params.threads\ $paired\ $meta_flag\ $sc_flag\ @@ -145,24 +139,23 @@ process spades { //$memlimit } - process megahit { - publishDir "test_megahit" + publishDir "$params.outDir", mode: 'copy' input: path paired path unpaired output: - path "megahit/*" + path "*" script: def paired = paired.name != "NO_FILE" ? "-1 ${paired[0]} -2 ${paired[1]} " : "" - def unpaired = unpaired.name != "NO_FILE" ? "-r $unpaired " : "" + def unpaired = unpaired.name != "NO_FILE2" ? "-r $unpaired " : "" def megahit_preset = params.megahit.preset != null ? "--presets $params.megahit.preset " : "" """ - megahit -o $params.outDir/megahit -t $params.threads\ + megahit -o ./megahit -t $params.threads\ $megahit_preset\ $paired\ $unpaired\ @@ -187,14 +180,13 @@ process unicyclerPrep { $longreads > long_reads.fasta """ } - process unicycler { - publishDir "unicycler_test" + publishDir "$params.outDir/unicycler", mode: 'copy' input: path paired path unpaired - path longreads //must check for NO_FILE. If present, expects filtered long reads. + path longreads //If present, expects filtered long reads. output: path "*" @@ -208,7 +200,7 @@ process unicycler { //test to see if unicycler can be run from the environment //and if we need to export some java options """ - unicycler -t $params.threads -o $params.outDir\ + unicycler -t $params.threads -o .\ $paired\ $filt_lr\ $bridge 2>&1 1>/dev/null @@ -217,30 +209,30 @@ process unicycler { } process lrasm { + publishDir "$params.outDir/lrasm", mode: 'copy' input: path unpaired output: + path "*" script: def consensus = params.lrasm.numConsensus != null ? "-n $params.lrasm.numConsensus ": "" def preset = params.lrasm.preset != null ? "-x $params.lrasm.preset " : "" def errorCorrection = params.lrasm.ec != null ? "-e " : "" def algorithm = params.lrasm.algorithm != null ? "-a $params.lrasm.algorithm " : "" + def minLenOpt = "" if (params.lrasm.algorithm == "miniasm") { - def minLenOpt = "--ao \'-s $params.lrasm.minLength\' " + minLenOpt = "--ao \'-s $params.lrasm.minLength\' " } else if (params.lrasm.algorithm == "wtdbg2") { - def minLenOpt = "--wo \'-L $params.lrasm.minLength\' " - } - else { - def minLenOpt = "" + minLenOpt = "--wo \'-L $params.lrasm.minLength\' " } def flyeOpt = params.lrasm.algorithm == "metaflye" ? "--fo '--meta' ": "" """ - lrasm -o $params.OutDir -t $params.threads\ + lrasm -o . -t $params.threads\ $preset\ $consensus\ $errorCorrection\ @@ -254,10 +246,11 @@ process lrasm { } workflow { - "touch NO_FILE".execute().text - "touch NO_FILE2".execute().text - "touch NO_FILE3".execute().text - "touch NO_FILE4".execute().text + "mkdir nf_assets".execute().text + "touch nf_assets/NO_FILE".execute().text + "touch nf_assets/NO_FILE2".execute().text + "touch nf_assets/NO_FILE3".execute().text + "touch nf_assets/NO_FILE4".execute().text paired_ch = channel.fromPath(params.pairedFiles, relative:true, checkIfExists:true).collect() unpaired_ch = channel.fromPath(params.unpairedFile, relative:true, checkIfExists:true) @@ -267,10 +260,10 @@ workflow { if (params.assembler.equalsIgnoreCase("IDBA_UD")) { (c1,c2) = idbaPrep(paired_ch, unpaired_ch) - (sp,su,l) = idbaExtractLong(c1,c2.ifEmpty({file("NO_FILE", checkIfExists:true)})) - idbaUD(sp.filter{ it.size()>0 }.ifEmpty({file("NO_FILE", checkIfExists:true)}), - su.filter{ it.size()>0 }.ifEmpty({file("NO_FILE2", checkIfExists:true)}), - l.filter{ it.size()>0 }.ifEmpty({file("NO_FILE3", checkIfExists:true)})) + (sp,su,l) = idbaExtractLong(c1,c2.ifEmpty({file("nf_assets/NO_FILE")})) + idbaUD(sp.filter{ it.size()>0 }.ifEmpty({file("nf_assets/NO_FILE")}), + su.filter{ it.size()>0 }.ifEmpty({file("nf_assets/NO_FILE2")}), + l.filter{ it.size()>0 }.ifEmpty({file("nf_assets/NO_FILE3")})) } else if (params.assembler.equalsIgnoreCase("SPAdes")) { @@ -280,16 +273,18 @@ workflow { megahit(paired_ch, unpaired_ch) } else if (params.assembler.equalsIgnoreCase("UniCycler")) { - if (params.unicycler.longreads != "NO_FILE3") { + if (params.unicycler.longreads != "nf_assets/NO_FILE3") { println("Filter long reads with $params.unicycler.minLongReads (bp) cutoff") - unicycler(paired_ch, unpaired_ch, unicyclerPrep(unicycler_lr)) + unicycler(paired_ch, + unpaired_ch, + unicyclerPrep(unicycler_lr).filter{it.size()>0}.ifEmpty({file("nf_assets/NO_FILE3")})) } else { unicycler(paired_ch, unpaired_ch, unicycler_lr) } } else if (params.assembler.equalsIgnoreCase("LRASM")) { - //lrasm(unpaired_ch) //issues installing LRASM + lrasm(unpaired_ch) } else { error "Invalid assembler: $params.assembler" diff --git a/runAssembly/test_files/parameters/idba_assembly.json b/runAssembly/test_files/parameters/idba_assembly.json index 4df2da9..159b0bc 100644 --- a/runAssembly/test_files/parameters/idba_assembly.json +++ b/runAssembly/test_files/parameters/idba_assembly.json @@ -2,5 +2,6 @@ "assembler":"IDBA_UD", "threads":4, "projName":"test_idba", - "pairedFiles":["test_files/Ecoli_10x.1.fastq","test_files/Ecoli_10x.2.fastq"] + "pairedFiles":["test_files/Ecoli_10x.1.fastq","test_files/Ecoli_10x.2.fastq"], + "outDir": "test_results" } \ No newline at end of file diff --git a/runAssembly/test_files/parameters/lrasm_assembly.json b/runAssembly/test_files/parameters/lrasm_assembly.json new file mode 100644 index 0000000..a32a2f4 --- /dev/null +++ b/runAssembly/test_files/parameters/lrasm_assembly.json @@ -0,0 +1,10 @@ +{ + "assembler":"LRASM", + "threads":4, + "projName":"test_lrasm", + "unpairedFile":["test_files/reads.fastq.gz"], + "lrasm" : { + "ec":true + }, + "outDir": "test_results" +} \ No newline at end of file diff --git a/runAssembly/test_files/parameters/megahit_assembly.json b/runAssembly/test_files/parameters/megahit_assembly.json index 8975d84..97415d5 100644 --- a/runAssembly/test_files/parameters/megahit_assembly.json +++ b/runAssembly/test_files/parameters/megahit_assembly.json @@ -2,5 +2,6 @@ "assembler":"MEGAHIT", "threads":4, "projName":"test_megahit", - "pairedFiles":["test_files/Ecoli_10x.1.fastq","test_files/Ecoli_10x.2.fastq"] + "pairedFiles":["test_files/Ecoli_10x.1.fastq","test_files/Ecoli_10x.2.fastq"], + "outDir": "test_results" } \ No newline at end of file diff --git a/runAssembly/test_files/parameters/spades_assembly.json b/runAssembly/test_files/parameters/spades_assembly.json index f2a4e82..b9aa978 100644 --- a/runAssembly/test_files/parameters/spades_assembly.json +++ b/runAssembly/test_files/parameters/spades_assembly.json @@ -2,5 +2,6 @@ "assembler":"SPAdes", "threads":4, "projName":"test_spades", - "pairedFiles":["test_files/Ecoli_10x.1.fastq","test_files/Ecoli_10x.2.fastq"] + "pairedFiles":["test_files/Ecoli_10x.1.fastq","test_files/Ecoli_10x.2.fastq"], + "outDir": "test_results" } \ No newline at end of file diff --git a/runAssembly/test_files/parameters/unicycler_assembly.json b/runAssembly/test_files/parameters/unicycler_assembly.json index 8b2ff05..d74f750 100644 --- a/runAssembly/test_files/parameters/unicycler_assembly.json +++ b/runAssembly/test_files/parameters/unicycler_assembly.json @@ -2,5 +2,6 @@ "assembler":"uNicycleR", "threads":4, "projName":"test_unicycler", - "pairedFiles":["test_files/Ecoli_10x.1.fastq","test_files/Ecoli_10x.2.fastq"] + "pairedFiles":["test_files/Ecoli_10x.1.fastq","test_files/Ecoli_10x.2.fastq"], + "outDir": "test_results" } \ No newline at end of file From b939da80bbdacd98378109328cdb06b348c1e03b Mon Sep 17 00:00:00 2001 From: Andre Watson Date: Mon, 8 Jul 2024 17:39:30 -0600 Subject: [PATCH 5/9] Assemblers running. Graphs produced. --- runAssembly/bin/extractLongReads.pl | 1 - runAssembly/bin/getAvgLen.pl | 198 +++++++++++++++++++++++ runAssembly/nextflow.config | 2 +- runAssembly/runAssembly.nf | 239 +++++++++++++++++++++++++--- 4 files changed, 413 insertions(+), 27 deletions(-) create mode 100755 runAssembly/bin/getAvgLen.pl diff --git a/runAssembly/bin/extractLongReads.pl b/runAssembly/bin/extractLongReads.pl index d03674a..870c346 100755 --- a/runAssembly/bin/extractLongReads.pl +++ b/runAssembly/bin/extractLongReads.pl @@ -30,7 +30,6 @@ open (my $fh, $paired_fasta) or die "$! $paired_fasta"; while (<$fh>) { - print "reading file!"; $_ =~ s/\>//g; my ($id, @seq) = split /\n/, $_; next if (!$id); diff --git a/runAssembly/bin/getAvgLen.pl b/runAssembly/bin/getAvgLen.pl new file mode 100755 index 0000000..1333951 --- /dev/null +++ b/runAssembly/bin/getAvgLen.pl @@ -0,0 +1,198 @@ +#!/usr/bin/env perl + +use strict; +use warnings; +use File::Basename; +use Getopt::Long; + +my $outputDir=''; + +my @unpairedList= (); +my @pairedList= (); + +GetOptions( + 'u:s{1,}' => \@unpairedList, + 'p:s{1,}' => \@pairedList, + 'd=s' => \$outputDir, +); + + +my $all_R1_fastq = "$outputDir/all.1.fastq"; +my $all_R2_fastq = "$outputDir/all.2.fastq"; +my $all_SE_fastq = "$outputDir/all.se.fastq"; +my $count_file_list= "$outputDir/fastqCount.txt"; +my $avg_read_len= 0 ; +my $PE_count = 0 ; +my $PE_total_len = 0; +my $SE_count= 0 ; +my $SE_total_len = 0 ; + + + +open (my $fh, ">$count_file_list") or die "Cannot write $count_file_list\n"; +while(my ($R1,$R2) = splice (@pairedList,0,2)) +{ + ($PE_count, $PE_total_len) = &countPE_exe($R1,$R2,$all_R1_fastq,$all_R2_fastq,$all_SE_fastq,$fh); +} +foreach my $file (@unpairedList) +{ + ($SE_count,$SE_total_len)=&countFastq_exe($file,$all_SE_fastq); + printf $fh ("%s\t%d\t%d\t%.2f\n",basename($file),$SE_count,$SE_total_len,$SE_total_len/$SE_count); + printf ("%s\t%d\t%d\t%.2f\n",basename($file),$SE_count,$SE_total_len,$SE_total_len/$SE_count); +} +close $fh; +$avg_read_len = ($PE_count + $SE_count) > 0 ? ($PE_total_len + $SE_total_len) / ($PE_count + $SE_count) : 0 ; + +sub countPE_exe{ + my $r1=shift; + my $r2=shift; + my $out_r1=shift; + my $out_r2=shift; + my $out_se=shift; + my $count_fh=shift; + my %seq_hash; + my $pair_char; + my $unpaired_count=0; + my $read1_count=0; + my $read2_count=0; + my $se2_count=0; + my $se1_count=0; + my $paired_count=0; + my $read1_total_len=0; + my $read2_total_len=0; + my $existed_id1=0; + my $existed_id2=0; + my ($fh1,$pid) = open_file($r1); + open (my $ofh1, ">>$out_r1") or die "Cannot write $out_r1\n"; + open (my $ofh2, ">>$out_r2") or die "Cannot write $out_r2\n"; + open (my $ofhse, ">>$out_se") or die "Cannot write $out_se\n"; + while(<$fh1>){ + chomp; + next unless $_ =~ /\S/; + next if ($_ =~ /length=0/); + my $id_line=$_; + my ($id) = $id_line =~ /^\@(\S+).?\/?1?\s*/; + my $seq = <$fh1>; + chomp $seq; + if ($seq_hash{$id}){ + $existed_id1++; + } + my $len = length $seq; + $read1_total_len += $len; + my $qual_id = <$fh1>; + my $qual = <$fh1>; + $seq = $seq."\n".$qual_id.$qual; + $seq_hash{$id}++; + $read1_count++; + } + close $fh1; + my %seq_hash2; + my ($fh2,$pid2) = open_file($r2); + while(<$fh2>){ + chomp; + next unless $_ =~ /\S/; + next if ($_ =~ /length=0/); + my $id_line=$_; + my ($id2) = $id_line =~ /^\@(\S+)\.?\/?2?\s*/; + $read2_count++; + my $seq2 = <$fh2>; + chomp $seq2; + if ($seq_hash2{$id2}){ + $existed_id2++; + } + my $len = length $seq2; + $read2_total_len += $len; + my $qual_id = <$fh2>; + my $qual = <$fh2>; + $seq2 = $seq2."\n".$qual_id.$qual; + $seq_hash2{$id2}++; + if ($seq_hash{$id2}){ + $seq_hash{$id2}++; + $paired_count++; + print $ofh2 $id_line,"\n",$seq2; + }else{ + print $ofhse $id_line,"\n",$seq2; + $se2_count++; + } + } + close $fh2; + ($fh1,$pid) = open_file($r1); + while(<$fh1>){ + chomp; + next unless $_ =~ /\S/; + next if ($_ =~ /length=0/); + my $id_line=$_; + my ($id) = $id_line =~ /^\@(\S+)\.?\/?1?\s*/; + my $seq = <$fh1>; + chomp $seq; + my $qual_id = <$fh1>; + my $qual = <$fh1>; + $seq = $seq."\n".$qual_id.$qual; + if ($seq_hash{$id} == 2){ + print $ofh1 $id_line,"\n",$seq; + } + if ($seq_hash{$id} == 1){ + print $ofhse $id_line,"\n",$seq; + $se1_count++; + } + } + close $fh1; + close $ofh1; + close $ofh2; + close $ofhse; + printf ("%s\t%d\t%d\t%.2f\n",basename($r1),$read1_count,$read1_total_len,$read1_total_len/$read1_count); + printf ("%s\t%d\t%d\t%.2f\n",basename($r2),$read2_count,$read2_total_len,$read2_total_len/$read2_count); + printf $count_fh ("%s\t%d\t%d\t%.2f\n",basename($r1),$read1_count,$read1_total_len,$read1_total_len/$read1_count); + printf $count_fh ("%s\t%d\t%d\t%.2f\n",basename($r2),$read2_count,$read2_total_len,$read2_total_len/$read2_count); + printf ("%d duplicate id from %s\n", $existed_id1, basename($r1)) if ($existed_id1 > 0); + printf ("%d duplicate id from %s\n", $existed_id2, basename($r2)) if ($existed_id2 > 0); + printf ("There are %d reads from %s don't have corresponding paired read.\n", $se1_count, basename($r1)) if ($se1_count >0); + printf ("There are %d reads from %s don't have corresponding paired read.\n", $se2_count, basename($r2)) if ($se2_count >0); + + unlink $out_se if (-z $out_se); + return ($read1_count + $read2_count, $read1_total_len + $read2_total_len); +} + +sub countFastq_exe +{ + my $file=shift; + my $output=shift; + my $seq_count=0; + my $total_length; + my ($fh,$pid)= open_file($file); + open (my $ofh, ">>$output") or die "Cannot write $output\n"; + while (<$fh>) + { + next unless $_ =~ /\S/; + next if ($_ =~ /length=0/); + my $id=$_; + $id = '@'."seq_$seq_count\n" if ($id =~ /No name/); + my $seq=<$fh>; + chomp $seq; + my $q_id=<$fh>; + my $q_seq=<$fh>; + my $len = length $seq; + $seq_count++; + $total_length +=$len; + print $ofh "$id$seq\n$q_id$q_seq"; + } + close $fh; + return ($seq_count,$total_length); +} + +sub touchFile{ + my $file=shift; + open (my $fh,">",$file) or die "$!"; + close $fh; +} + +sub open_file +{ + my ($file) = @_; + print "$file\n"; + my $fh; + my $pid; + if ( $file=~/\.gz\.?\d?$/i ) { $pid=open($fh, "gunzip -c $file |") or die ("gunzip -c $file: $!"); } + else { $pid=open($fh,'<',$file) or die("$file: $!"); } + return ($fh,$pid); +} \ No newline at end of file diff --git a/runAssembly/nextflow.config b/runAssembly/nextflow.config index f631d55..fc0fa9e 100644 --- a/runAssembly/nextflow.config +++ b/runAssembly/nextflow.config @@ -5,7 +5,7 @@ params { minContigSize = 200 memLimit = null idba{ - maxK = 121 + maxK = null minK = 31 step = 20 } diff --git a/runAssembly/runAssembly.nf b/runAssembly/runAssembly.nf index dad97d7..1e41219 100644 --- a/runAssembly/runAssembly.nf +++ b/runAssembly/runAssembly.nf @@ -7,23 +7,41 @@ params.assembler = "IDBA_UD" params.outDir = '.' -params.threads = 8 //default? -params.projName = null +params.threads = 8 //default +params.projName = "project" process idbaUD { - //TODO: implement avglen safety - publishDir "$params.outDir/idba", mode: 'copy' + publishDir ( + path:"$params.outDir/idba", + mode: 'copy', + saveAs: { + filename -> + if(filename ==~ /log/) { + "assembly.log" + } + else if(filename ==~ /scaffold(s|-level-2|).fa(sta)?/) { + "scaffold.fa" + } + else{ + filename + } + } + ) input: path short_paired path short_single path long_reads + val avg_len output: - path "*" + path "log" + path "scaffold*.{fa,fasta}", optional:true + path "contig-max.fa" script: + def avg_len = avg_len as Integer def runFlag = "" if(short_paired.name != "NO_FILE" && short_single.name != "NO_FILE2") { runFlag = "-r $short_single --read_level_2 $short_paired " @@ -35,7 +53,14 @@ process idbaUD { runFlag = "-r $short_single " } def longReadsFile = long_reads.name != "NO_FILE3" ? "-l $long_reads" : "" - def maxK = params.idba.maxK != null ? "--maxk $params.idba.maxK " : "" + def maxK = 121 + def maxK_option = "--maxk $maxK " + if (params.idba.maxK == null || params.idba.maxK > avg_len) { + if(avg_len > 0 && avg_len <= 151) { + maxK = avg_len - 1 + maxK_option = "--maxk ${avg_len - 1}" + } + } def minK = params.idba.minK != null ? "--mink $params.idba.minK " : "" def step = params.idba.step != null ? "--step $params.idba.step " : "" def minLen = params.minContigSize != null ? "--min_contig $params.minContigSize " : "" @@ -45,10 +70,12 @@ process idbaUD { ${memLimit}idba_ud --pre_correction -o . --num_threads $params.threads\ $runFlag\ $longReadsFile\ - $maxK\ + $maxK_option\ $minK\ $step\ $minLen + + mv contig-${maxK}.fa contig-max.fa """ } @@ -72,7 +99,7 @@ process idbaExtractLong { -d . """ } -process idbaPrep { +process idbaPrepReads { input: path paired path unpaired @@ -92,8 +119,73 @@ process idbaPrep { } +process idbaReadFastq { + input: + path paired + path unpaired + + output: + path "fastqCount.txt" + + script: + def paired_list = paired.name != "NO_FILE" ? "-p ${paired}" : "" + def unpaired_list = unpaired.name != "NO_FILE2" ? "-u ${unpaired}" : "" + + """ + getAvgLen.pl\ + $paired_list\ + $unpaired_list\ + -d . + """ +} + +process idbaAvgLen { + input: + + path countFastq + + output: + stdout + + shell: + ''' + #!/usr/bin/env perl + my $fastq_count_file = "./!{countFastq}"; + my $total_count = 0; + my $total_len = 0; + open (my $fh, "<", $fastq_count_file) or die "Cannot open $fastq_count_file\n"; + while(<$fh>){ + chomp; + my ($name,$count,$len,$avg) = split /\t/,$_; + $total_count += $count; + $total_len += $len; + } + close $fh; + my $avg_len = ($total_count > 0)? $total_len/$total_count : 0; + print "$avg_len"; + ''' +} + process spades { - publishDir "$params.outDir/spades", mode: 'copy' + publishDir ( + path: "$params.outDir/spades", + mode: 'copy', + saveAs: { + filename -> + if(filename ==~ /spades.log/) { + "assembly.log" + } + else if(filename ==~ /scaffold(s|-level-2|).fa(sta)?/) { + "scaffold.fa" + } + else if(filename.equals("assembly_graph.fastg")) { + "${params.projName}_contigs.fastg" + } + else{ + filename + } + } + ) input: path paired @@ -102,7 +194,12 @@ process spades { path nanopore output: - path "*" + path "scaffold*.{fa,fasta}", optional:true + path "spades.log" + path "{contigs,transcripts}.fasta" + path "assembly_graph.fastg", optional:true + path "assembly_graph_with_scaffolds.gfa", optional:true + script: def paired = paired.name != "NO_FILE" ? "--pe1-1 ${paired[0]} --pe1-2 ${paired[1]} " : "" @@ -140,14 +237,47 @@ process spades { } process megahit { - publishDir "$params.outDir", mode: 'copy' + publishDir( + path: "$params.outDir/megahit", + mode: 'copy', + pattern: "${params.projName}_contigs.fastg" + ) + publishDir( + path: "$params.outDir", + mode: 'copy', + pattern: "{megahit/log,megahit/final.contigs.fa}", + saveAs: { + filename -> + if(filename.equals("megahit/log")) { + "megahit/assembly.log" + } + else if(filename ==~ /scaffold(s|-level-2|).fa(sta)?/) { + "scaffold.fa" + } + else{ + filename + } + } + ) + publishDir( + path: "$params.outDir", + mode: 'copy', + pattern: "megahit/scaffold*.{fa,fasta}", + saveAs: { + filename -> + "scaffold.fa" + } + ) input: path paired path unpaired output: - path "*" + path "megahit/log" + path "megahit/scaffold*.{fa,fasta}", optional:true //I don't believe this is a normal output of megahit, but just in case + path "megahit/final.contigs.fa" + path "${params.projName}_contigs.fastg" script: def paired = paired.name != "NO_FILE" ? "-1 ${paired[0]} -2 ${paired[1]} " : "" @@ -160,6 +290,10 @@ process megahit { $paired\ $unpaired\ 2>&1 + + LARGESTKMER=\$(head -n 1 megahit/final.contigs.fa | perl -ne '/^>k(\\d+)\\_/; print \$1;') + + megahit_toolkit contig2fastg \$LARGESTKMER megahit/intermediate_contigs/k\${LARGESTKMER}.contigs.fa > ${params.projName}_contigs.fastg """ } @@ -181,7 +315,25 @@ process unicyclerPrep { """ } process unicycler { - publishDir "$params.outDir/unicycler", mode: 'copy' + publishDir ( + path: "$params.outDir/unicycler", + mode: 'copy', + saveAs: { + filename -> + if(filename ==~ /unicycler.log/) { + "assembly.log" + } + else if(filename ==~ /scaffold(s|-level-2|).fa(sta)?/) { + "scaffold.fa" + } + else if(filename.equals("assembly.gfa")) { + "${params.projName}_contigs.fastg" + } + else{ + filename + } + } + ) input: path paired @@ -189,7 +341,10 @@ process unicycler { path longreads //If present, expects filtered long reads. output: - path "*" + path "unicycler.log" + path "scaffold*.{fa,fasta}", optional:true //I don't believe this is a normal output of unicycler, but just in case + path "assembly.fasta" + path "assembly.gfa", optional:true script: def paired = paired.name != "NO_FILE" ? "-1 ${paired[0]} -2 ${paired[1]} " : "" @@ -197,9 +352,9 @@ process unicycler { def filt_lr = longreads.name != "NO_FILE3" ? "-l $longreads " : "" def bridge = params.unicycler.bridgingMode != "normal" ? "--mode $params.unicycler.bridgingMode" : "--mode normal" - //test to see if unicycler can be run from the environment - //and if we need to export some java options """ + export _JAVA_OPTIONS='-Xmx20G'; export TERM='xterm'; + unicycler -t $params.threads -o .\ $paired\ $filt_lr\ @@ -209,13 +364,46 @@ process unicycler { } process lrasm { - publishDir "$params.outDir/lrasm", mode: 'copy' + publishDir ( + path: "$params.outDir/lrasm", + mode: 'copy', + saveAs: { + filename -> + if(filename ==~ /log/) { + "assembly.log" + } + else if(filename ==~ /scaffold(s|-level-2|).fa(sta)?/) { + "scaffold.fa" + } + else if(filename.equals("Assembly/unitig.gfa")) { + "${params.projName}_contigs.fastg" + } + else if(filename.equals("Assembly/assembly_graph.gfa")) { + "${params.projName}_contigs.fastg" + } + else if(filename.equals("Assembly/assembly_graph.gv")) { + "${params.projName}_contigs.gv" + } + else if(filename.equals("Assembly/assembly_info.txt")) { + "assembly_info.txt" + } + else{ + filename + } + } + ) input: path unpaired output: - path "*" + path "contigs.log" + path "scaffold*.{fa,fasta}", optional:true //I don't believe this is a normal output of lrasm, but just in case + path "contigs.fa" + path "Assembly/unitig.gfa", optional:true + path "Assembly/assembly_graph.gfa", optional:true + path "Assembly/assembly_graph.gv", optional:true + path "Assembly/assembly_info.txt", optional:true script: def consensus = params.lrasm.numConsensus != null ? "-n $params.lrasm.numConsensus ": "" @@ -240,9 +428,8 @@ process lrasm { $minLenOpt\ $flyeOpt\ $unpaired\ - 2>/dev/null """ - + //2>/dev/null } workflow { @@ -259,11 +446,13 @@ workflow { unicycler_lr = file(params.unicycler.longreads, checkIfExists:true) if (params.assembler.equalsIgnoreCase("IDBA_UD")) { - (c1,c2) = idbaPrep(paired_ch, unpaired_ch) + avg_len_ch = idbaAvgLen(idbaReadFastq(paired_ch, unpaired_ch)) + (c1,c2) = idbaPrepReads(paired_ch, unpaired_ch) (sp,su,l) = idbaExtractLong(c1,c2.ifEmpty({file("nf_assets/NO_FILE")})) idbaUD(sp.filter{ it.size()>0 }.ifEmpty({file("nf_assets/NO_FILE")}), su.filter{ it.size()>0 }.ifEmpty({file("nf_assets/NO_FILE2")}), - l.filter{ it.size()>0 }.ifEmpty({file("nf_assets/NO_FILE3")})) + l.filter{ it.size()>0 }.ifEmpty({file("nf_assets/NO_FILE3")}), + avg_len_ch) } else if (params.assembler.equalsIgnoreCase("SPAdes")) { @@ -290,12 +479,12 @@ workflow { error "Invalid assembler: $params.assembler" } - //TODO: add in rest of runAssembly + //TODO: add safety in case of assembly failure/incomplete assembly - //scaffold cleanup - //contigs //assembly graph + //rename by project name + //cleanup } \ No newline at end of file From 3333575de495053e2e16dd9bdb9df3e01d3c072e Mon Sep 17 00:00:00 2001 From: Andre Watson Date: Tue, 9 Jul 2024 10:30:50 -0600 Subject: [PATCH 6/9] workflow now creates contigs for assembly --- runAssembly/bin/renameFilterFasta.pl | 94 ++++++++++++++++++++++++++++ runAssembly/nextflow.config | 7 ++- runAssembly/runAssembly.nf | 53 +++++++++++----- 3 files changed, 138 insertions(+), 16 deletions(-) create mode 100755 runAssembly/bin/renameFilterFasta.pl diff --git a/runAssembly/bin/renameFilterFasta.pl b/runAssembly/bin/renameFilterFasta.pl new file mode 100755 index 0000000..78052e4 --- /dev/null +++ b/runAssembly/bin/renameFilterFasta.pl @@ -0,0 +1,94 @@ +#!/usr/bin/env perl + +use strict; +use warnings; +use File::Basename; +use Getopt::Long; + + +my $fasta; +my $outputDir; +my $size_filter; +my $max_seq_number; +my $id_mapping; +my $contig_size_for_annotation; +my $project_name; + +GetOptions( + 'u=s{1,}' => \$fasta, + 'd=s' => \$outputDir, + 'filt=i' => \$size_filter, + 'maxseq=i' => \$max_seq_number, + 'id:s' => \$id_mapping, + 'ann=i' => \$contig_size_for_annotation, + 'n=s' => \$project_name +); + +my $output= "$outputDir/${project_name}_contigs.fa"; +my $id_map= "$outputDir/id_map.txt"; +my $contig_for_annotation = "$outputDir/${project_name}_contigs_${contig_size_for_annotation}up.fa"; +$max_seq_number ||= 100000; +my $serial_id= "0" x length($max_seq_number); +my $id_info; +my ($fh,$pid)=open_file($fasta); +open (my $ofh, "> $output") or die "Cannot write $output\n"; +open (my $ofh2, "> $contig_for_annotation" ) or die "Cannot write $contig_for_annotation\n"; +open (my $idmap_ofh, "> $id_map") or die "Cannot write $id_map\n"; +$/ = ">"; +while (my $line=<$fh>) +{ + $line =~ s/\>//g; + my ($id, @seq) = split /\n/, $line; + next if (!$id); + ($id_info) = $id =~ /(length_\d+ read_count_\d+)/; + my $seq = join "", @seq; + $seq =~ s/-//g; + $seq =~ s/ //g; + $seq =~ s/\n//g; + $seq =~ s/\r//g; + $seq = uc($seq); + my $len = length($seq); + my $GC_num = $seq=~ tr/GCgc/GCgc/; + my $GC_content = sprintf("%.2f",$GC_num/$len); + $id_info = "length_$len "if (!$id_info); + next if ($len < $size_filter); + $seq =~ s/(.{70})/$1\n/g; + chomp $seq; + my $fasta_header; + $id =~ s/\W/_/g; + #not doing any of the annotation-specific tasks for this modularized version of runAssembly + #if($configuration->{DoAnnotation}){ + # genbank format limit the LOCUS name length + #if ($id_mapping){ + #$fasta_header = ( length($id) > 20 ) ? "contig_$serial_id $id" : "$id contig_$serial_id"; + #print $idmap_ofh "contig_$serial_id\t$id\n"; + #}else{ + #$fasta_header = ( length($project_name) > 20 || $project_name =~/^Assembly/i ) ? "contig_$serial_id $id_info GC_content_$GC_content": "${project_name}_$serial_id $id_info GC_content_$GC_content"; + #} + #}else{ + $fasta_header = ($id_mapping)? "$id" : "${project_name}_$serial_id $id_info GC_content_$GC_content"; + #} + if ($len >= $contig_size_for_annotation) + { + print $ofh2 ">$fasta_header\n" . $seq."\n"; + } + print $ofh ">$fasta_header\n" . $seq."\n"; + $serial_id++; +} +$/="\n"; +close $fh; +close $ofh; +close $ofh2; +#close $idmap_ofh if ($id_mapping && $configuration->{DoAnnotation}); +if ( -z $id_map) { unlink $id_map ; } + +sub open_file +{ + my ($file) = @_; + print "$file\n"; + my $fh; + my $pid; + if ( $file=~/\.gz\.?\d?$/i ) { $pid=open($fh, "gunzip -c $file |") or die ("gunzip -c $file: $!"); } + else { $pid=open($fh,'<',$file) or die("$file: $!"); } + return ($fh,$pid); +} \ No newline at end of file diff --git a/runAssembly/nextflow.config b/runAssembly/nextflow.config index fc0fa9e..c496dcd 100644 --- a/runAssembly/nextflow.config +++ b/runAssembly/nextflow.config @@ -1,7 +1,12 @@ params { + + assembler = "IDBA_UD" + outDir = '.' + threads = 8 + projName = "project" + contigSizeForAnnotation = 700 pairedFiles = "nf_assets/NO_FILE" unpairedFile = "nf_assets/NO_FILE2" - threads= 4 minContigSize = 200 memLimit = null idba{ diff --git a/runAssembly/runAssembly.nf b/runAssembly/runAssembly.nf index 1e41219..1b990b3 100644 --- a/runAssembly/runAssembly.nf +++ b/runAssembly/runAssembly.nf @@ -4,11 +4,7 @@ //TODO: detection system for memory limit //TODO: output and transparency needs to be compared to EDGE website -params.assembler = "IDBA_UD" - -params.outDir = '.' -params.threads = 8 //default -params.projName = "project" +//default parameters are in nextflow.config process idbaUD { @@ -38,7 +34,7 @@ process idbaUD { output: path "log" path "scaffold*.{fa,fasta}", optional:true - path "contig-max.fa" + path "contig-max.fa", emit: contigs script: def avg_len = avg_len as Integer @@ -196,7 +192,7 @@ process spades { output: path "scaffold*.{fa,fasta}", optional:true path "spades.log" - path "{contigs,transcripts}.fasta" + path "{contigs,transcripts}.fasta", emit:contigs path "assembly_graph.fastg", optional:true path "assembly_graph_with_scaffolds.gfa", optional:true @@ -276,7 +272,7 @@ process megahit { output: path "megahit/log" path "megahit/scaffold*.{fa,fasta}", optional:true //I don't believe this is a normal output of megahit, but just in case - path "megahit/final.contigs.fa" + path "megahit/final.contigs.fa", emit: contigs path "${params.projName}_contigs.fastg" script: @@ -343,7 +339,7 @@ process unicycler { output: path "unicycler.log" path "scaffold*.{fa,fasta}", optional:true //I don't believe this is a normal output of unicycler, but just in case - path "assembly.fasta" + path "assembly.fasta", emit: contigs path "assembly.gfa", optional:true script: @@ -399,7 +395,7 @@ process lrasm { output: path "contigs.log" path "scaffold*.{fa,fasta}", optional:true //I don't believe this is a normal output of lrasm, but just in case - path "contigs.fa" + path "contigs.fa", emit:contigs path "Assembly/unitig.gfa", optional:true path "Assembly/assembly_graph.gfa", optional:true path "Assembly/assembly_graph.gv", optional:true @@ -432,6 +428,32 @@ process lrasm { //2>/dev/null } +process rename { + publishDir( + path: "$params.outDir/final_files", + mode: 'copy' + ) + input: + path contigs + + output: + path "*" + + script: + """ + CONTIG_NUMBER=\$(grep -c '>' ${contigs}) + + renameFilterFasta.pl \ + -u $contigs\ + -d .\ + -filt $params.minContigSize\ + -maxseq \$CONTIG_NUMBER\ + -ann $params.contigSizeForAnnotation\ + -n $params.projName + """ + +} + workflow { "mkdir nf_assets".execute().text "touch nf_assets/NO_FILE".execute().text @@ -453,13 +475,16 @@ workflow { su.filter{ it.size()>0 }.ifEmpty({file("nf_assets/NO_FILE2")}), l.filter{ it.size()>0 }.ifEmpty({file("nf_assets/NO_FILE3")}), avg_len_ch) + rename(idbaUD.out.contigs) } else if (params.assembler.equalsIgnoreCase("SPAdes")) { spades(paired_ch, unpaired_ch, spades_pb, spades_np) + rename(spades.out.contigs) } else if (params.assembler.equalsIgnoreCase("MEGAHIT")) { megahit(paired_ch, unpaired_ch) + rename(megahit.out.contigs) } else if (params.assembler.equalsIgnoreCase("UniCycler")) { if (params.unicycler.longreads != "nf_assets/NO_FILE3") { @@ -467,24 +492,22 @@ workflow { unicycler(paired_ch, unpaired_ch, unicyclerPrep(unicycler_lr).filter{it.size()>0}.ifEmpty({file("nf_assets/NO_FILE3")})) + rename(unicycler.out.contigs) } else { unicycler(paired_ch, unpaired_ch, unicycler_lr) + rename(unicycler.out.contigs) } } else if (params.assembler.equalsIgnoreCase("LRASM")) { lrasm(unpaired_ch) + rename(lrasm.out.contigs) } else { error "Invalid assembler: $params.assembler" } //TODO: add safety in case of assembly failure/incomplete assembly - - //assembly graph - - //rename by project name - //cleanup } \ No newline at end of file From 1a51cb3d0f1e79e2fb91b99a1e2356a17907eedb Mon Sep 17 00:00:00 2001 From: Andre Watson Date: Fri, 12 Jul 2024 09:51:32 -0600 Subject: [PATCH 7/9] Docker image for environment --- runAssembly/Dockerfile | 32 ++++++++++++++++++++++++++++++++ runAssembly/nextflow.config | 4 +++- runAssembly/runAssembly.nf | 4 ---- 3 files changed, 35 insertions(+), 5 deletions(-) create mode 100644 runAssembly/Dockerfile diff --git a/runAssembly/Dockerfile b/runAssembly/Dockerfile new file mode 100644 index 0000000..f32d4a6 --- /dev/null +++ b/runAssembly/Dockerfile @@ -0,0 +1,32 @@ +FROM continuumio/miniconda3:23.5.2-0 + +ENV container=docker + +# add conda channels +RUN conda config --add channels conda-forge \ + && conda config --add channels bioconda + +# install dependencies +RUN conda install conda-libmamba-solver +RUN conda config --set solver libmamba +RUN conda install -c conda-forge python=3.9 +RUN conda install -c bioconda samtools=1.17 +RUN conda install -c bioconda racon=1.5 +RUN conda install -c bioconda seqtk=1.3 +RUN conda install -c bioconda spades=3.15.5 +RUN conda install -c bioconda minimap2=2.26 +RUN conda install -c bioconda megahit=1.2.9 +RUN conda install -c bioconda idba=1.1.3 +RUN conda install -c bioconda unicycler=0.5.0 +RUN wget https://github.com/ruanjue/wtdbg2/releases/download/v2.5/wtdbg-2.5_x64_linux.tgz +RUN tar -xvzf wtdbg-2.5_x64_linux.tgz +RUN cp wtdbg-2.5_x64_linux/* /opt/conda/bin +RUN conda install -c bioconda flye=2.9.2 +RUN git clone --depth 1 https://gitlab.com/chienchi/long_read_assembly.git +ENV PATH="/long_read_assembly:$PATH" + +ADD bin/extractLongReads.pl /opt/conda/bin +ADD bin/getAvgLen.pl /opt/conda/bin +ADD bin/renameFilterFasta.pl /opt/conda/bin + +CMD ["/bin/bash"] \ No newline at end of file diff --git a/runAssembly/nextflow.config b/runAssembly/nextflow.config index c496dcd..19b1c16 100644 --- a/runAssembly/nextflow.config +++ b/runAssembly/nextflow.config @@ -37,4 +37,6 @@ params { } workflow.onComplete = { "rm -rf nf_assets".execute().text -} \ No newline at end of file +} +process.container = 'apwat/run_assembly:1.2.5' +docker.enabled = true \ No newline at end of file diff --git a/runAssembly/runAssembly.nf b/runAssembly/runAssembly.nf index 1b990b3..b2e12e7 100644 --- a/runAssembly/runAssembly.nf +++ b/runAssembly/runAssembly.nf @@ -4,9 +4,6 @@ //TODO: detection system for memory limit //TODO: output and transparency needs to be compared to EDGE website -//default parameters are in nextflow.config - - process idbaUD { publishDir ( path:"$params.outDir/idba", @@ -508,6 +505,5 @@ workflow { } //TODO: add safety in case of assembly failure/incomplete assembly - //cleanup } \ No newline at end of file From 3645606d8c392b33cba24bbd425cbb6cada95065 Mon Sep 17 00:00:00 2001 From: Andre Watson Date: Wed, 7 Aug 2024 15:20:38 -0600 Subject: [PATCH 8/9] Better matches EDGE output --- runAssembly/Dockerfile | 60 +++++++++----- runAssembly/bin/renameFilterFasta.pl | 25 +++--- runAssembly/nextflow.config | 11 ++- runAssembly/runAssembly.nf | 116 ++++++++++++++++++--------- 4 files changed, 137 insertions(+), 75 deletions(-) diff --git a/runAssembly/Dockerfile b/runAssembly/Dockerfile index f32d4a6..0a9832c 100644 --- a/runAssembly/Dockerfile +++ b/runAssembly/Dockerfile @@ -1,4 +1,4 @@ -FROM continuumio/miniconda3:23.5.2-0 +FROM continuumio/miniconda3:23.5.2-0 AS build ENV container=docker @@ -6,27 +6,49 @@ ENV container=docker RUN conda config --add channels conda-forge \ && conda config --add channels bioconda +RUN conda init bash \ + && . ~/.bashrc \ + && conda create --name assembly \ + && conda activate assembly + # install dependencies RUN conda install conda-libmamba-solver RUN conda config --set solver libmamba -RUN conda install -c conda-forge python=3.9 -RUN conda install -c bioconda samtools=1.17 -RUN conda install -c bioconda racon=1.5 -RUN conda install -c bioconda seqtk=1.3 -RUN conda install -c bioconda spades=3.15.5 -RUN conda install -c bioconda minimap2=2.26 -RUN conda install -c bioconda megahit=1.2.9 -RUN conda install -c bioconda idba=1.1.3 -RUN conda install -c bioconda unicycler=0.5.0 -RUN wget https://github.com/ruanjue/wtdbg2/releases/download/v2.5/wtdbg-2.5_x64_linux.tgz -RUN tar -xvzf wtdbg-2.5_x64_linux.tgz -RUN cp wtdbg-2.5_x64_linux/* /opt/conda/bin -RUN conda install -c bioconda flye=2.9.2 +RUN conda install -n assembly -c conda-forge python=3.9 +RUN conda install -n assembly -c bioconda samtools=1.17 +RUN conda install -n assembly -c bioconda racon=1.5 +RUN conda install -n assembly -c bioconda seqtk=1.3 +RUN conda install -n assembly -c bioconda spades=3.15.5 +RUN conda install -n assembly -c bioconda minimap2=2.26 +RUN conda install -n assembly -c bioconda megahit=1.2.9 +RUN conda install -n assembly -c bioconda idba=1.1.3 +RUN conda install -n assembly -c bioconda unicycler=0.5.0 +RUN wget https://github.com/ruanjue/wtdbg2/releases/download/v2.5/wtdbg-2.5_x64_linux.tgz \ + && tar -xvzf wtdbg-2.5_x64_linux.tgz \ + && cp wtdbg-2.5_x64_linux/* /opt/conda/envs/assembly/bin +RUN conda install -n assembly -c bioconda flye=2.9.2 +RUN conda install -n assembly git +RUN conda install -c conda-forge conda-pack + +ADD bin/extractLongReads.pl /opt/conda/envs/assembly/bin +ADD bin/getAvgLen.pl /opt/conda/envs/assembly/bin +ADD bin/renameFilterFasta.pl /opt/conda/envs/assembly/bin + +RUN conda-pack -n assembly -o /tmp/env.tar && \ + mkdir /venv && cd /venv && tar xf /tmp/env.tar && \ + rm /tmp/env.tar + +RUN /venv/bin/conda-unpack + +FROM debian:latest AS runtime + +COPY --from=build /venv /venv +ENV PERL5LIB=/venv/lib/perl5/core_perl + + +ENV PATH="/venv/bin:$PATH" RUN git clone --depth 1 https://gitlab.com/chienchi/long_read_assembly.git ENV PATH="/long_read_assembly:$PATH" -ADD bin/extractLongReads.pl /opt/conda/bin -ADD bin/getAvgLen.pl /opt/conda/bin -ADD bin/renameFilterFasta.pl /opt/conda/bin - -CMD ["/bin/bash"] \ No newline at end of file +SHELL ["/bin/bash", "-c"] +CMD /bin/bash \ No newline at end of file diff --git a/runAssembly/bin/renameFilterFasta.pl b/runAssembly/bin/renameFilterFasta.pl index 78052e4..605d86a 100755 --- a/runAssembly/bin/renameFilterFasta.pl +++ b/runAssembly/bin/renameFilterFasta.pl @@ -13,6 +13,7 @@ my $id_mapping; my $contig_size_for_annotation; my $project_name; +my $do_annotation; GetOptions( 'u=s{1,}' => \$fasta, @@ -20,8 +21,9 @@ 'filt=i' => \$size_filter, 'maxseq=i' => \$max_seq_number, 'id:s' => \$id_mapping, - 'ann=i' => \$contig_size_for_annotation, - 'n=s' => \$project_name + 'ann_size=i' => \$contig_size_for_annotation, + 'n=s' => \$project_name, + 'ann:i' => \$do_annotation #default to false (0) ); my $output= "$outputDir/${project_name}_contigs.fa"; @@ -56,18 +58,17 @@ chomp $seq; my $fasta_header; $id =~ s/\W/_/g; - #not doing any of the annotation-specific tasks for this modularized version of runAssembly - #if($configuration->{DoAnnotation}){ + if($do_annotation){ # genbank format limit the LOCUS name length - #if ($id_mapping){ - #$fasta_header = ( length($id) > 20 ) ? "contig_$serial_id $id" : "$id contig_$serial_id"; - #print $idmap_ofh "contig_$serial_id\t$id\n"; - #}else{ - #$fasta_header = ( length($project_name) > 20 || $project_name =~/^Assembly/i ) ? "contig_$serial_id $id_info GC_content_$GC_content": "${project_name}_$serial_id $id_info GC_content_$GC_content"; - #} - #}else{ + if ($id_mapping){ + $fasta_header = ( length($id) > 20 ) ? "contig_$serial_id $id" : "$id contig_$serial_id"; + print $idmap_ofh "contig_$serial_id\t$id\n"; + }else{ + $fasta_header = ( length($project_name) > 20 || $project_name =~/^Assembly/i ) ? "contig_$serial_id $id_info GC_content_$GC_content": "${project_name}_$serial_id $id_info GC_content_$GC_content"; + } + }else{ $fasta_header = ($id_mapping)? "$id" : "${project_name}_$serial_id $id_info GC_content_$GC_content"; - #} + } if ($len >= $contig_size_for_annotation) { print $ofh2 ">$fasta_header\n" . $seq."\n"; diff --git a/runAssembly/nextflow.config b/runAssembly/nextflow.config index 19b1c16..be66020 100644 --- a/runAssembly/nextflow.config +++ b/runAssembly/nextflow.config @@ -1,9 +1,14 @@ +process.container = 'apwat/run_assembly:1.4.5' +singularity { + enabled = true + runOptions = "--compat" +} params { - assembler = "IDBA_UD" outDir = '.' threads = 8 projName = "project" + annotation = false contigSizeForAnnotation = 700 pairedFiles = "nf_assets/NO_FILE" unpairedFile = "nf_assets/NO_FILE2" @@ -37,6 +42,4 @@ params { } workflow.onComplete = { "rm -rf nf_assets".execute().text -} -process.container = 'apwat/run_assembly:1.2.5' -docker.enabled = true \ No newline at end of file +} \ No newline at end of file diff --git a/runAssembly/runAssembly.nf b/runAssembly/runAssembly.nf index b2e12e7..75d9fef 100644 --- a/runAssembly/runAssembly.nf +++ b/runAssembly/runAssembly.nf @@ -1,23 +1,24 @@ #!/usr/bin/env nextflow -//to run: nextflow [OPT: -log /path/to/log file] run runAssembly.nf -params-file [JSON parameter file] +//to run: nextflow run runAssembly.nf -params-file [JSON parameter file] -//TODO: detection system for memory limit -//TODO: output and transparency needs to be compared to EDGE website +//this workflow is unable to set memory limits (used in idba and spades assemblies) by itself, +//but a limit (in KB) can be provided as a parameter. +//main process for assembly with IDBA process idbaUD { publishDir ( - path:"$params.outDir/idba", + path:"$params.outDir/AssemblyBasedAnalysis", mode: 'copy', saveAs: { filename -> if(filename ==~ /log/) { "assembly.log" } - else if(filename ==~ /scaffold(s|-level-2|).fa(sta)?/) { + else if(filename ==~ /scaffold(s|-level-2|)\.fa(sta)?/) { "scaffold.fa" } else{ - filename + null //do not publish contig-max.fa yet, but use it for downstream process } } ) @@ -72,6 +73,8 @@ process idbaUD { """ } + +//prep for idba process idbaExtractLong { input: path paired @@ -92,6 +95,8 @@ process idbaExtractLong { -d . """ } + +//prep for idba process idbaPrepReads { input: path paired @@ -112,6 +117,7 @@ process idbaPrepReads { } +//prep for idba process idbaReadFastq { input: path paired @@ -132,6 +138,7 @@ process idbaReadFastq { """ } +//prep for idba process idbaAvgLen { input: @@ -159,21 +166,28 @@ process idbaAvgLen { ''' } +//assemble using spades process spades { publishDir ( - path: "$params.outDir/spades", + path: "$params.outDir/AssemblyBasedAnalysis", mode: 'copy', saveAs: { filename -> - if(filename ==~ /spades.log/) { + if(filename ==~ /spades\.log/) { "assembly.log" } - else if(filename ==~ /scaffold(s|-level-2|).fa(sta)?/) { + else if(filename ==~ /scaffold(s|-level-2|)\.fa(sta)?/) { "scaffold.fa" } - else if(filename.equals("assembly_graph.fastg")) { + else if(filename ==~ /assembly_graph\.fastg/) { "${params.projName}_contigs.fastg" } + else if(filename ==~ /assembly_graph_with_scaffolds\.gfa/) { + "assembly_graph_with_scaffolds.gfa" + } + else if(filename ==~ /(contigs|transcripts)\.fasta/) { + null //do not publish, but use downstream + } else{ filename } @@ -189,6 +203,7 @@ process spades { output: path "scaffold*.{fa,fasta}", optional:true path "spades.log" + path "contigs.paths" path "{contigs,transcripts}.fasta", emit:contigs path "assembly_graph.fastg", optional:true path "assembly_graph_with_scaffolds.gfa", optional:true @@ -208,7 +223,7 @@ process spades { def metaviral_flag = params.spades.algorithm == "metaviral" ? "--metaviral " : "" def metaplasmid_flag = params.spades.algorithm == "metaplasmid" ? "--metaplasmid " : "" def rnaviral_flag = params.spades.algorithm == "rnaviral" ? "--rnaviral " : "" - //def memlimit = params.memlimit != null ? "-m ${params.memlimit/1024*1024}" : "" + def memLimit = params.memLimit != null ? "-m ${params.memLimit/1024*1024}" : "" """ spades.py -o . -t $params.threads\ @@ -224,43 +239,32 @@ process spades { $rnaviral_flag\ $unpaired\ $pacbio_file\ - $nanopore_file + $nanopore_file\ + $memLimit """ - //$memlimit } +//assemble using megahit process megahit { publishDir( - path: "$params.outDir/megahit", + path: "$params.outDir/AssemblyBasedAnalysis", mode: 'copy', - pattern: "${params.projName}_contigs.fastg" - ) - publishDir( - path: "$params.outDir", - mode: 'copy', - pattern: "{megahit/log,megahit/final.contigs.fa}", saveAs: { filename -> if(filename.equals("megahit/log")) { - "megahit/assembly.log" + "assembly.log" } - else if(filename ==~ /scaffold(s|-level-2|).fa(sta)?/) { + else if(filename ==~ /megahit\/scaffold(.*)\.fa(sta)?/) { "scaffold.fa" } + else if(filename ==~ /megahit\/final\.contigs\.fa/) { + null //don't publish, but pass to downstream "rename" process + } else{ filename } } ) - publishDir( - path: "$params.outDir", - mode: 'copy', - pattern: "megahit/scaffold*.{fa,fasta}", - saveAs: { - filename -> - "scaffold.fa" - } - ) input: path paired @@ -291,6 +295,7 @@ process megahit { } +//filter long reads for unicycler process unicyclerPrep { input: path longreads @@ -307,21 +312,26 @@ process unicyclerPrep { $longreads > long_reads.fasta """ } + +//assembly using unicycler process unicycler { publishDir ( - path: "$params.outDir/unicycler", + path: "$params.outDir/AssemblyBasedAnalysis", mode: 'copy', saveAs: { filename -> - if(filename ==~ /unicycler.log/) { + if(filename ==~ /unicycler\.log/) { "assembly.log" } - else if(filename ==~ /scaffold(s|-level-2|).fa(sta)?/) { + else if(filename ==~ /scaffold(s|-level-2|)\.fa(sta)?/) { "scaffold.fa" } else if(filename.equals("assembly.gfa")) { "${params.projName}_contigs.fastg" } + else if(filename ==~ /assembly\.fasta/) { + null //don't publish, but emit for use in downstream process "rename" + } else{ filename } @@ -356,16 +366,17 @@ process unicycler { } +//assembly using lrasm process lrasm { publishDir ( - path: "$params.outDir/lrasm", + path: "$params.outDir/AssemblyBasedAnalysis", mode: 'copy', saveAs: { filename -> - if(filename ==~ /log/) { + if(filename ==~ /contigs\.log/) { "assembly.log" } - else if(filename ==~ /scaffold(s|-level-2|).fa(sta)?/) { + else if(filename ==~ /scaffold(s|-level-2|)\.fa(sta)?/) { "scaffold.fa" } else if(filename.equals("Assembly/unitig.gfa")) { @@ -380,6 +391,9 @@ process lrasm { else if(filename.equals("Assembly/assembly_info.txt")) { "assembly_info.txt" } + else if(filename ==~ /contigs\.fa/) { + null //do not publish, but emit for use in downstream process "rename" + } else{ filename } @@ -427,7 +441,7 @@ process lrasm { process rename { publishDir( - path: "$params.outDir/final_files", + path: "$params.outDir/AssemblyBasedAnalysis", mode: 'copy' ) input: @@ -437,6 +451,7 @@ process rename { path "*" script: + def annotation = params.annotation ? "-ann 1" : "" """ CONTIG_NUMBER=\$(grep -c '>' ${contigs}) @@ -445,12 +460,33 @@ process rename { -d .\ -filt $params.minContigSize\ -maxseq \$CONTIG_NUMBER\ - -ann $params.contigSizeForAnnotation\ - -n $params.projName + -ann_size $params.contigSizeForAnnotation\ + -n $params.projName\ + $annotation """ } +// process ifNoOutputContigs { +// input: +// val x +// path contigDirectory + +// when: +// x == 'EMPTY' + +// output: +// path "*" + +// shell: +// ''' +// #!/usr/bin/env perl +// my @intermediate_contigs = sort { -M $a <=> -M $b} glob("$AssemblerOutDir/contig-* $AssemblerOutDir/intermediate_contigs/*contigs.fa $AssemblerOutDir/K*/final_contigs.fasta" ); + +// ''' + +// } + workflow { "mkdir nf_assets".execute().text "touch nf_assets/NO_FILE".execute().text From b2c3d5fccf93c0a9770ab302764cad8ada45ad50 Mon Sep 17 00:00:00 2001 From: Andre Watson Date: Thu, 8 Aug 2024 13:32:39 -0600 Subject: [PATCH 9/9] Attempts to rescue incomplete assemblies, similar to EDGE --- runAssembly/nextflow.config | 3 + runAssembly/runAssembly.nf | 131 ++++++++++++++++++++++-------------- 2 files changed, 84 insertions(+), 50 deletions(-) diff --git a/runAssembly/nextflow.config b/runAssembly/nextflow.config index be66020..04c17d9 100644 --- a/runAssembly/nextflow.config +++ b/runAssembly/nextflow.config @@ -42,4 +42,7 @@ params { } workflow.onComplete = { "rm -rf nf_assets".execute().text +} +workflow.onError = { + "rm -rf nf_assets".execute().text } \ No newline at end of file diff --git a/runAssembly/runAssembly.nf b/runAssembly/runAssembly.nf index 75d9fef..e9e005d 100644 --- a/runAssembly/runAssembly.nf +++ b/runAssembly/runAssembly.nf @@ -18,7 +18,7 @@ process idbaUD { "scaffold.fa" } else{ - null //do not publish contig-max.fa yet, but use it for downstream process + null //do not publish contig-max.fa or intermediate contigs yet, but use them for downstream processes } } ) @@ -32,7 +32,8 @@ process idbaUD { output: path "log" path "scaffold*.{fa,fasta}", optional:true - path "contig-max.fa", emit: contigs + path "contig-max.fa", emit: contigs, optional:true + path "{contig-*,*contigs.fa,K*/final_contigs.fasta}", emit: intContigs script: def avg_len = avg_len as Integer @@ -168,6 +169,7 @@ process idbaAvgLen { //assemble using spades process spades { + publishDir ( path: "$params.outDir/AssemblyBasedAnalysis", mode: 'copy', @@ -186,7 +188,10 @@ process spades { "assembly_graph_with_scaffolds.gfa" } else if(filename ==~ /(contigs|transcripts)\.fasta/) { - null //do not publish, but use downstream + null //do not publish contigs, but use downstream + } + else if(filename ==~ /((contig-.*)|(.*contigs\.fa)|(K.*\/final_contigs\.fasta))/) { + null //do not publish intermediate contigs, but use downstream } else{ filename @@ -204,9 +209,10 @@ process spades { path "scaffold*.{fa,fasta}", optional:true path "spades.log" path "contigs.paths" - path "{contigs,transcripts}.fasta", emit:contigs + path "{contigs,transcripts}.fasta", emit:contigs, optional:true path "assembly_graph.fastg", optional:true path "assembly_graph_with_scaffolds.gfa", optional:true + path "{contig-*,*contigs.fa,K*/final_contigs.fasta}", emit: intContigs script: @@ -246,6 +252,7 @@ process spades { //assemble using megahit process megahit { + publishDir( path: "$params.outDir/AssemblyBasedAnalysis", mode: 'copy', @@ -258,7 +265,10 @@ process megahit { "scaffold.fa" } else if(filename ==~ /megahit\/final\.contigs\.fa/) { - null //don't publish, but pass to downstream "rename" process + null //don't publish, but pass to downstream "renameFilterFasta" process + } + else if(filename ==~ /megahit\/((contig-.*)|(.*contigs\.fa)|(K.*\/final_contigs\.fasta))/) { + null //do not publish intermediate contigs, but use downstream } else{ filename @@ -273,8 +283,9 @@ process megahit { output: path "megahit/log" path "megahit/scaffold*.{fa,fasta}", optional:true //I don't believe this is a normal output of megahit, but just in case - path "megahit/final.contigs.fa", emit: contigs + path "megahit/final.contigs.fa", emit: contigs, optional:true path "${params.projName}_contigs.fastg" + path "megahit/{contig-*,*contigs.fa,K*/final_contigs.fasta}", emit: intContigs script: def paired = paired.name != "NO_FILE" ? "-1 ${paired[0]} -2 ${paired[1]} " : "" @@ -295,23 +306,6 @@ process megahit { } -//filter long reads for unicycler -process unicyclerPrep { - input: - path longreads - - - output: - path "long_reads.fasta" - - script: - - """ - seqtk seq -A -L\ - $params.unicycler.minLongReads\ - $longreads > long_reads.fasta - """ -} //assembly using unicycler process unicycler { @@ -330,7 +324,7 @@ process unicycler { "${params.projName}_contigs.fastg" } else if(filename ==~ /assembly\.fasta/) { - null //don't publish, but emit for use in downstream process "rename" + null //don't publish, but emit for use in downstream process "renameFilterFasta" } else{ filename @@ -348,6 +342,7 @@ process unicycler { path "scaffold*.{fa,fasta}", optional:true //I don't believe this is a normal output of unicycler, but just in case path "assembly.fasta", emit: contigs path "assembly.gfa", optional:true + //path "{contig-*,*contigs.fa,K*/final_contigs.fasta}", emit: intContigs | not produced by unicycler script: def paired = paired.name != "NO_FILE" ? "-1 ${paired[0]} -2 ${paired[1]} " : "" @@ -366,8 +361,28 @@ process unicycler { } +//filter long reads for unicycler +process unicyclerPrep { + input: + path longreads + + + output: + path "long_reads.fasta" + + script: + + """ + seqtk seq -A -L\ + $params.unicycler.minLongReads\ + $longreads > long_reads.fasta + """ +} + + //assembly using lrasm process lrasm { + publishDir ( path: "$params.outDir/AssemblyBasedAnalysis", mode: 'copy', @@ -392,7 +407,10 @@ process lrasm { "assembly_info.txt" } else if(filename ==~ /contigs\.fa/) { - null //do not publish, but emit for use in downstream process "rename" + null //do not publish, but emit for use in downstream process "renameFilterFasta" + } + else if(filename ==~ /((contig-.*)|(.*contigs\.fa)|(K.*\/final_contigs\.fasta))/) { + null //do not publish intermediate contigs, but use downstream } else{ filename @@ -406,11 +424,12 @@ process lrasm { output: path "contigs.log" path "scaffold*.{fa,fasta}", optional:true //I don't believe this is a normal output of lrasm, but just in case - path "contigs.fa", emit:contigs + path "contigs.fa", emit:contigs, optional:true path "Assembly/unitig.gfa", optional:true path "Assembly/assembly_graph.gfa", optional:true path "Assembly/assembly_graph.gv", optional:true path "Assembly/assembly_info.txt", optional:true + path "{contig-*,*contigs.fa,K*/final_contigs.fasta}", emit: intContigs script: def consensus = params.lrasm.numConsensus != null ? "-n $params.lrasm.numConsensus ": "" @@ -439,7 +458,7 @@ process lrasm { //2>/dev/null } -process rename { +process renameFilterFasta { publishDir( path: "$params.outDir/AssemblyBasedAnalysis", mode: 'copy' @@ -467,25 +486,30 @@ process rename { } -// process ifNoOutputContigs { -// input: -// val x -// path contigDirectory - -// when: -// x == 'EMPTY' +process bestIncompleteAssembly { + input: + val x + path intContigs -// output: -// path "*" + when: + x == 'EMPTY' -// shell: -// ''' -// #!/usr/bin/env perl -// my @intermediate_contigs = sort { -M $a <=> -M $b} glob("$AssemblerOutDir/contig-* $AssemblerOutDir/intermediate_contigs/*contigs.fa $AssemblerOutDir/K*/final_contigs.fasta" ); + output: + path "bestIntContig/*" -// ''' + shell: + ''' + #!/usr/bin/env perl + use Cwd; + my $dir = getcwd; + use Cwd 'abs_path'; + my @intermediate_contigs = sort { -M $a <=> -M $b} glob("!{intContigs}"); + my $best_int = $dir . "/" . $intermediate_contigs[0]; + mkdir bestIntContig; + system("cp $best_int ./bestIntContig"); + ''' -// } +} workflow { "mkdir nf_assets".execute().text @@ -508,16 +532,22 @@ workflow { su.filter{ it.size()>0 }.ifEmpty({file("nf_assets/NO_FILE2")}), l.filter{ it.size()>0 }.ifEmpty({file("nf_assets/NO_FILE3")}), avg_len_ch) - rename(idbaUD.out.contigs) + + bestIncompleteAssembly(idbaUD.out.contigs.ifEmpty('EMPTY'), idbaUD.out.intContigs) + renameFilterFasta(idbaUD.out.contigs.concat(bestIncompleteAssembly.out).first()) } else if (params.assembler.equalsIgnoreCase("SPAdes")) { spades(paired_ch, unpaired_ch, spades_pb, spades_np) - rename(spades.out.contigs) + + bestIncompleteAssembly(spades.out.contigs.ifEmpty('EMPTY'), spades.out.intContigs) + renameFilterFasta(spades.out.contigs.concat(bestIncompleteAssembly.out).first()) } else if (params.assembler.equalsIgnoreCase("MEGAHIT")) { megahit(paired_ch, unpaired_ch) - rename(megahit.out.contigs) + + bestIncompleteAssembly(megahit.out.contigs.ifEmpty('EMPTY'), megahit.out.intContigs) + renameFilterFasta(megahit.out.contigs.concat(bestIncompleteAssembly.out).first()) } else if (params.assembler.equalsIgnoreCase("UniCycler")) { if (params.unicycler.longreads != "nf_assets/NO_FILE3") { @@ -525,21 +555,22 @@ workflow { unicycler(paired_ch, unpaired_ch, unicyclerPrep(unicycler_lr).filter{it.size()>0}.ifEmpty({file("nf_assets/NO_FILE3")})) - rename(unicycler.out.contigs) + //unicycler produces no intermediate contigs, we let it error out above rather than try to rescue a failed assembly + renameFilterFasta(unicycler.out.contigs) } else { unicycler(paired_ch, unpaired_ch, unicycler_lr) - rename(unicycler.out.contigs) + renameFilterFasta(unicycler.out.contigs) } } else if (params.assembler.equalsIgnoreCase("LRASM")) { lrasm(unpaired_ch) - rename(lrasm.out.contigs) + + bestIncompleteAssembly(lrasm.out.contigs.ifEmpty('EMPTY'), lrasm.out.intContigs) + renameFilterFasta(lrasm.out.contigs.concat(bestIncompleteAssembly.out).first()) } else { error "Invalid assembler: $params.assembler" } - //TODO: add safety in case of assembly failure/incomplete assembly - } \ No newline at end of file