diff --git a/runAssembly/Dockerfile b/runAssembly/Dockerfile new file mode 100644 index 0000000..0a9832c --- /dev/null +++ b/runAssembly/Dockerfile @@ -0,0 +1,54 @@ +FROM continuumio/miniconda3:23.5.2-0 AS build + +ENV container=docker + +# add conda channels +RUN conda config --add channels conda-forge \ + && conda config --add channels bioconda + +RUN conda init bash \ + && . ~/.bashrc \ + && conda create --name assembly \ + && conda activate assembly + +# install dependencies +RUN conda install conda-libmamba-solver +RUN conda config --set solver libmamba +RUN conda install -n assembly -c conda-forge python=3.9 +RUN conda install -n assembly -c bioconda samtools=1.17 +RUN conda install -n assembly -c bioconda racon=1.5 +RUN conda install -n assembly -c bioconda seqtk=1.3 +RUN conda install -n assembly -c bioconda spades=3.15.5 +RUN conda install -n assembly -c bioconda minimap2=2.26 +RUN conda install -n assembly -c bioconda megahit=1.2.9 +RUN conda install -n assembly -c bioconda idba=1.1.3 +RUN conda install -n assembly -c bioconda unicycler=0.5.0 +RUN wget https://github.com/ruanjue/wtdbg2/releases/download/v2.5/wtdbg-2.5_x64_linux.tgz \ + && tar -xvzf wtdbg-2.5_x64_linux.tgz \ + && cp wtdbg-2.5_x64_linux/* /opt/conda/envs/assembly/bin +RUN conda install -n assembly -c bioconda flye=2.9.2 +RUN conda install -n assembly git +RUN conda install -c conda-forge conda-pack + +ADD bin/extractLongReads.pl /opt/conda/envs/assembly/bin +ADD bin/getAvgLen.pl /opt/conda/envs/assembly/bin +ADD bin/renameFilterFasta.pl /opt/conda/envs/assembly/bin + +RUN conda-pack -n assembly -o /tmp/env.tar && \ + mkdir /venv && cd /venv && tar xf /tmp/env.tar && \ + rm /tmp/env.tar + +RUN /venv/bin/conda-unpack + +FROM debian:latest AS runtime + +COPY --from=build /venv /venv +ENV PERL5LIB=/venv/lib/perl5/core_perl + + +ENV PATH="/venv/bin:$PATH" +RUN git clone --depth 1 https://gitlab.com/chienchi/long_read_assembly.git +ENV PATH="/long_read_assembly:$PATH" + +SHELL ["/bin/bash", "-c"] +CMD /bin/bash \ No newline at end of file diff --git a/runAssembly/bin/extractLongReads.pl b/runAssembly/bin/extractLongReads.pl new file mode 100755 index 0000000..870c346 --- /dev/null +++ b/runAssembly/bin/extractLongReads.pl @@ -0,0 +1,86 @@ +#!/usr/bin/env perl + +use strict; +use warnings; +use File::Basename; +use Getopt::Long; + +my $paired_fasta=''; +my $single_fasta=''; +my $outputDir=''; +my $len_cutoff=350; + +GetOptions( + 'p:s' => \$paired_fasta, + 'u:s' => \$single_fasta, + 'd=s' => \$outputDir, + 'len:s' => \$len_cutoff +); + + +my $short_paired_fasta="short_paired.fa"; +my $short_single_fasta="short_single.fa"; +my $long_fasta="long.fa"; + +open (my $o_paired, ">$short_paired_fasta") or die "Cannot write $short_paired_fasta\n"; +open (my $o_single, ">$short_single_fasta") or die "Cannot write $short_single_fasta\n"; +open (my $o_long, ">$long_fasta") or die "Cannot write $long_fasta\n"; +$/ = ">"; +if (-s $paired_fasta) { + open (my $fh, $paired_fasta) or die "$! $paired_fasta"; + while (<$fh>) + { + $_ =~ s/\>//g; + my ($id, @seq) = split /\n/, $_; + next if (!$id); + my ($id2, @seq2) = split /\n/, <$fh>; + my $seq = join "", @seq; + my $seq2 = join "", @seq2; + my $len = length($seq); + my $len2 = length($seq2); + if ($len > $len_cutoff and $len2 > $len_cutoff) + { + print $o_long ">$id\n$seq\n>$id2\n$seq2\n"; + } + elsif ($len > $len_cutoff) + { + print $o_long ">$id\n$seq\n"; + print $o_single ">$id2\n$seq2\n"; + } + elsif ($len2 > $len_cutoff) + { + print $o_long ">$id2\n$seq2\n"; + print $o_single ">$id\n$seq\n"; + } + else + { + print $o_paired ">$id\n$seq\n>$id2\n$seq2\n"; + } + } + close $fh; +} +if (-s $single_fasta) +{ + open (my $fh, $single_fasta) or die "$! $single_fasta"; + while (<$fh>) + { + $_ =~ s/\>//g; + my ($id, @seq) = split /\n/, $_; + next if (!$id); + my $seq = join "", @seq; + my $len = length($seq); + if ($len > $len_cutoff) + { + print $o_long ">$id\n$seq\n"; + } + else + { + print $o_single ">$id\n$seq\n"; + } + } + close $fh; +} +$/="\n"; +close $o_paired; +close $o_long; +close $o_single; \ No newline at end of file diff --git a/runAssembly/bin/getAvgLen.pl b/runAssembly/bin/getAvgLen.pl new file mode 100755 index 0000000..1333951 --- /dev/null +++ b/runAssembly/bin/getAvgLen.pl @@ -0,0 +1,198 @@ +#!/usr/bin/env perl + +use strict; +use warnings; +use File::Basename; +use Getopt::Long; + +my $outputDir=''; + +my @unpairedList= (); +my @pairedList= (); + +GetOptions( + 'u:s{1,}' => \@unpairedList, + 'p:s{1,}' => \@pairedList, + 'd=s' => \$outputDir, +); + + +my $all_R1_fastq = "$outputDir/all.1.fastq"; +my $all_R2_fastq = "$outputDir/all.2.fastq"; +my $all_SE_fastq = "$outputDir/all.se.fastq"; +my $count_file_list= "$outputDir/fastqCount.txt"; +my $avg_read_len= 0 ; +my $PE_count = 0 ; +my $PE_total_len = 0; +my $SE_count= 0 ; +my $SE_total_len = 0 ; + + + +open (my $fh, ">$count_file_list") or die "Cannot write $count_file_list\n"; +while(my ($R1,$R2) = splice (@pairedList,0,2)) +{ + ($PE_count, $PE_total_len) = &countPE_exe($R1,$R2,$all_R1_fastq,$all_R2_fastq,$all_SE_fastq,$fh); +} +foreach my $file (@unpairedList) +{ + ($SE_count,$SE_total_len)=&countFastq_exe($file,$all_SE_fastq); + printf $fh ("%s\t%d\t%d\t%.2f\n",basename($file),$SE_count,$SE_total_len,$SE_total_len/$SE_count); + printf ("%s\t%d\t%d\t%.2f\n",basename($file),$SE_count,$SE_total_len,$SE_total_len/$SE_count); +} +close $fh; +$avg_read_len = ($PE_count + $SE_count) > 0 ? ($PE_total_len + $SE_total_len) / ($PE_count + $SE_count) : 0 ; + +sub countPE_exe{ + my $r1=shift; + my $r2=shift; + my $out_r1=shift; + my $out_r2=shift; + my $out_se=shift; + my $count_fh=shift; + my %seq_hash; + my $pair_char; + my $unpaired_count=0; + my $read1_count=0; + my $read2_count=0; + my $se2_count=0; + my $se1_count=0; + my $paired_count=0; + my $read1_total_len=0; + my $read2_total_len=0; + my $existed_id1=0; + my $existed_id2=0; + my ($fh1,$pid) = open_file($r1); + open (my $ofh1, ">>$out_r1") or die "Cannot write $out_r1\n"; + open (my $ofh2, ">>$out_r2") or die "Cannot write $out_r2\n"; + open (my $ofhse, ">>$out_se") or die "Cannot write $out_se\n"; + while(<$fh1>){ + chomp; + next unless $_ =~ /\S/; + next if ($_ =~ /length=0/); + my $id_line=$_; + my ($id) = $id_line =~ /^\@(\S+).?\/?1?\s*/; + my $seq = <$fh1>; + chomp $seq; + if ($seq_hash{$id}){ + $existed_id1++; + } + my $len = length $seq; + $read1_total_len += $len; + my $qual_id = <$fh1>; + my $qual = <$fh1>; + $seq = $seq."\n".$qual_id.$qual; + $seq_hash{$id}++; + $read1_count++; + } + close $fh1; + my %seq_hash2; + my ($fh2,$pid2) = open_file($r2); + while(<$fh2>){ + chomp; + next unless $_ =~ /\S/; + next if ($_ =~ /length=0/); + my $id_line=$_; + my ($id2) = $id_line =~ /^\@(\S+)\.?\/?2?\s*/; + $read2_count++; + my $seq2 = <$fh2>; + chomp $seq2; + if ($seq_hash2{$id2}){ + $existed_id2++; + } + my $len = length $seq2; + $read2_total_len += $len; + my $qual_id = <$fh2>; + my $qual = <$fh2>; + $seq2 = $seq2."\n".$qual_id.$qual; + $seq_hash2{$id2}++; + if ($seq_hash{$id2}){ + $seq_hash{$id2}++; + $paired_count++; + print $ofh2 $id_line,"\n",$seq2; + }else{ + print $ofhse $id_line,"\n",$seq2; + $se2_count++; + } + } + close $fh2; + ($fh1,$pid) = open_file($r1); + while(<$fh1>){ + chomp; + next unless $_ =~ /\S/; + next if ($_ =~ /length=0/); + my $id_line=$_; + my ($id) = $id_line =~ /^\@(\S+)\.?\/?1?\s*/; + my $seq = <$fh1>; + chomp $seq; + my $qual_id = <$fh1>; + my $qual = <$fh1>; + $seq = $seq."\n".$qual_id.$qual; + if ($seq_hash{$id} == 2){ + print $ofh1 $id_line,"\n",$seq; + } + if ($seq_hash{$id} == 1){ + print $ofhse $id_line,"\n",$seq; + $se1_count++; + } + } + close $fh1; + close $ofh1; + close $ofh2; + close $ofhse; + printf ("%s\t%d\t%d\t%.2f\n",basename($r1),$read1_count,$read1_total_len,$read1_total_len/$read1_count); + printf ("%s\t%d\t%d\t%.2f\n",basename($r2),$read2_count,$read2_total_len,$read2_total_len/$read2_count); + printf $count_fh ("%s\t%d\t%d\t%.2f\n",basename($r1),$read1_count,$read1_total_len,$read1_total_len/$read1_count); + printf $count_fh ("%s\t%d\t%d\t%.2f\n",basename($r2),$read2_count,$read2_total_len,$read2_total_len/$read2_count); + printf ("%d duplicate id from %s\n", $existed_id1, basename($r1)) if ($existed_id1 > 0); + printf ("%d duplicate id from %s\n", $existed_id2, basename($r2)) if ($existed_id2 > 0); + printf ("There are %d reads from %s don't have corresponding paired read.\n", $se1_count, basename($r1)) if ($se1_count >0); + printf ("There are %d reads from %s don't have corresponding paired read.\n", $se2_count, basename($r2)) if ($se2_count >0); + + unlink $out_se if (-z $out_se); + return ($read1_count + $read2_count, $read1_total_len + $read2_total_len); +} + +sub countFastq_exe +{ + my $file=shift; + my $output=shift; + my $seq_count=0; + my $total_length; + my ($fh,$pid)= open_file($file); + open (my $ofh, ">>$output") or die "Cannot write $output\n"; + while (<$fh>) + { + next unless $_ =~ /\S/; + next if ($_ =~ /length=0/); + my $id=$_; + $id = '@'."seq_$seq_count\n" if ($id =~ /No name/); + my $seq=<$fh>; + chomp $seq; + my $q_id=<$fh>; + my $q_seq=<$fh>; + my $len = length $seq; + $seq_count++; + $total_length +=$len; + print $ofh "$id$seq\n$q_id$q_seq"; + } + close $fh; + return ($seq_count,$total_length); +} + +sub touchFile{ + my $file=shift; + open (my $fh,">",$file) or die "$!"; + close $fh; +} + +sub open_file +{ + my ($file) = @_; + print "$file\n"; + my $fh; + my $pid; + if ( $file=~/\.gz\.?\d?$/i ) { $pid=open($fh, "gunzip -c $file |") or die ("gunzip -c $file: $!"); } + else { $pid=open($fh,'<',$file) or die("$file: $!"); } + return ($fh,$pid); +} \ No newline at end of file diff --git a/runAssembly/bin/renameFilterFasta.pl b/runAssembly/bin/renameFilterFasta.pl new file mode 100755 index 0000000..605d86a --- /dev/null +++ b/runAssembly/bin/renameFilterFasta.pl @@ -0,0 +1,95 @@ +#!/usr/bin/env perl + +use strict; +use warnings; +use File::Basename; +use Getopt::Long; + + +my $fasta; +my $outputDir; +my $size_filter; +my $max_seq_number; +my $id_mapping; +my $contig_size_for_annotation; +my $project_name; +my $do_annotation; + +GetOptions( + 'u=s{1,}' => \$fasta, + 'd=s' => \$outputDir, + 'filt=i' => \$size_filter, + 'maxseq=i' => \$max_seq_number, + 'id:s' => \$id_mapping, + 'ann_size=i' => \$contig_size_for_annotation, + 'n=s' => \$project_name, + 'ann:i' => \$do_annotation #default to false (0) +); + +my $output= "$outputDir/${project_name}_contigs.fa"; +my $id_map= "$outputDir/id_map.txt"; +my $contig_for_annotation = "$outputDir/${project_name}_contigs_${contig_size_for_annotation}up.fa"; +$max_seq_number ||= 100000; +my $serial_id= "0" x length($max_seq_number); +my $id_info; +my ($fh,$pid)=open_file($fasta); +open (my $ofh, "> $output") or die "Cannot write $output\n"; +open (my $ofh2, "> $contig_for_annotation" ) or die "Cannot write $contig_for_annotation\n"; +open (my $idmap_ofh, "> $id_map") or die "Cannot write $id_map\n"; +$/ = ">"; +while (my $line=<$fh>) +{ + $line =~ s/\>//g; + my ($id, @seq) = split /\n/, $line; + next if (!$id); + ($id_info) = $id =~ /(length_\d+ read_count_\d+)/; + my $seq = join "", @seq; + $seq =~ s/-//g; + $seq =~ s/ //g; + $seq =~ s/\n//g; + $seq =~ s/\r//g; + $seq = uc($seq); + my $len = length($seq); + my $GC_num = $seq=~ tr/GCgc/GCgc/; + my $GC_content = sprintf("%.2f",$GC_num/$len); + $id_info = "length_$len "if (!$id_info); + next if ($len < $size_filter); + $seq =~ s/(.{70})/$1\n/g; + chomp $seq; + my $fasta_header; + $id =~ s/\W/_/g; + if($do_annotation){ + # genbank format limit the LOCUS name length + if ($id_mapping){ + $fasta_header = ( length($id) > 20 ) ? "contig_$serial_id $id" : "$id contig_$serial_id"; + print $idmap_ofh "contig_$serial_id\t$id\n"; + }else{ + $fasta_header = ( length($project_name) > 20 || $project_name =~/^Assembly/i ) ? "contig_$serial_id $id_info GC_content_$GC_content": "${project_name}_$serial_id $id_info GC_content_$GC_content"; + } + }else{ + $fasta_header = ($id_mapping)? "$id" : "${project_name}_$serial_id $id_info GC_content_$GC_content"; + } + if ($len >= $contig_size_for_annotation) + { + print $ofh2 ">$fasta_header\n" . $seq."\n"; + } + print $ofh ">$fasta_header\n" . $seq."\n"; + $serial_id++; +} +$/="\n"; +close $fh; +close $ofh; +close $ofh2; +#close $idmap_ofh if ($id_mapping && $configuration->{DoAnnotation}); +if ( -z $id_map) { unlink $id_map ; } + +sub open_file +{ + my ($file) = @_; + print "$file\n"; + my $fh; + my $pid; + if ( $file=~/\.gz\.?\d?$/i ) { $pid=open($fh, "gunzip -c $file |") or die ("gunzip -c $file: $!"); } + else { $pid=open($fh,'<',$file) or die("$file: $!"); } + return ($fh,$pid); +} \ No newline at end of file diff --git a/runAssembly/nextflow.config b/runAssembly/nextflow.config new file mode 100644 index 0000000..04c17d9 --- /dev/null +++ b/runAssembly/nextflow.config @@ -0,0 +1,48 @@ +process.container = 'apwat/run_assembly:1.4.5' +singularity { + enabled = true + runOptions = "--compat" +} +params { + assembler = "IDBA_UD" + outDir = '.' + threads = 8 + projName = "project" + annotation = false + contigSizeForAnnotation = 700 + pairedFiles = "nf_assets/NO_FILE" + unpairedFile = "nf_assets/NO_FILE2" + minContigSize = 200 + memLimit = null + idba{ + maxK = null + minK = 31 + step = 20 + } + spades { + pacbio = "nf_assets/NO_FILE3" + nanopore = "nf_assets/NO_FILE4" + algorithm = null + } + megahit { + preset = null + } + unicycler { + longreads = "nf_assets/NO_FILE3" + minLongReads = 2000 + bridgingMode = "normal" + } + lrasm { + minLength = 400 + preset = null + algorithm = null + ec = null + numConsensus = null + } +} +workflow.onComplete = { + "rm -rf nf_assets".execute().text +} +workflow.onError = { + "rm -rf nf_assets".execute().text +} \ No newline at end of file diff --git a/runAssembly/runAssembly.nf b/runAssembly/runAssembly.nf new file mode 100644 index 0000000..e9e005d --- /dev/null +++ b/runAssembly/runAssembly.nf @@ -0,0 +1,576 @@ +#!/usr/bin/env nextflow +//to run: nextflow run runAssembly.nf -params-file [JSON parameter file] + +//this workflow is unable to set memory limits (used in idba and spades assemblies) by itself, +//but a limit (in KB) can be provided as a parameter. + +//main process for assembly with IDBA +process idbaUD { + publishDir ( + path:"$params.outDir/AssemblyBasedAnalysis", + mode: 'copy', + saveAs: { + filename -> + if(filename ==~ /log/) { + "assembly.log" + } + else if(filename ==~ /scaffold(s|-level-2|)\.fa(sta)?/) { + "scaffold.fa" + } + else{ + null //do not publish contig-max.fa or intermediate contigs yet, but use them for downstream processes + } + } + ) + + input: + path short_paired + path short_single + path long_reads + val avg_len + + output: + path "log" + path "scaffold*.{fa,fasta}", optional:true + path "contig-max.fa", emit: contigs, optional:true + path "{contig-*,*contigs.fa,K*/final_contigs.fasta}", emit: intContigs + + script: + def avg_len = avg_len as Integer + def runFlag = "" + if(short_paired.name != "NO_FILE" && short_single.name != "NO_FILE2") { + runFlag = "-r $short_single --read_level_2 $short_paired " + } + else if(short_paired.name != "NO_FILE") { + runFlag = "-r $short_paired " + } + else if(short_single.name != "NO_FILE2") { + runFlag = "-r $short_single " + } + def longReadsFile = long_reads.name != "NO_FILE3" ? "-l $long_reads" : "" + def maxK = 121 + def maxK_option = "--maxk $maxK " + if (params.idba.maxK == null || params.idba.maxK > avg_len) { + if(avg_len > 0 && avg_len <= 151) { + maxK = avg_len - 1 + maxK_option = "--maxk ${avg_len - 1}" + } + } + def minK = params.idba.minK != null ? "--mink $params.idba.minK " : "" + def step = params.idba.step != null ? "--step $params.idba.step " : "" + def minLen = params.minContigSize != null ? "--min_contig $params.minContigSize " : "" + + def memLimit = params.memLimit != null ? "ulimit -v $params.memLimit 2>/dev/null;" : "" + """ + ${memLimit}idba_ud --pre_correction -o . --num_threads $params.threads\ + $runFlag\ + $longReadsFile\ + $maxK_option\ + $minK\ + $step\ + $minLen + + mv contig-${maxK}.fa contig-max.fa + """ + +} + +//prep for idba +process idbaExtractLong { + input: + path paired + path unpaired + + output: + path "short_paired.fa" + path "short_single.fa" + path "long.fa" + + script: + def pair_file = paired.name != "NO_FILE" ? "-p $paired " : "" + def unpaired_file = unpaired.name != "NO_FILE2" ? "-u $unpaired " : "" + """ + extractLongReads.pl\ + $pair_file\ + $unpaired_file\ + -d . + """ +} + +//prep for idba +process idbaPrepReads { + input: + path paired + path unpaired + + output: + path "pairedForAssembly.fasta", emit:idba_prep_paired + path "unpairedForAssembly.fasta", optional:true + + script: + def pair_process = paired.name != "NO_FILE" ? "fq2fa --filter --merge ${paired[0]} ${paired[1]} pairedForAssembly.fasta;" : "" + def unpair_process = unpaired.name != "NO_FILE2" ? "fq2fa --filter $unpaired unpairedForAssembly.fasta;" : "" + + """ + $pair_process + $unpair_process + """ + +} + +//prep for idba +process idbaReadFastq { + input: + path paired + path unpaired + + output: + path "fastqCount.txt" + + script: + def paired_list = paired.name != "NO_FILE" ? "-p ${paired}" : "" + def unpaired_list = unpaired.name != "NO_FILE2" ? "-u ${unpaired}" : "" + + """ + getAvgLen.pl\ + $paired_list\ + $unpaired_list\ + -d . + """ +} + +//prep for idba +process idbaAvgLen { + input: + + path countFastq + + output: + stdout + + shell: + ''' + #!/usr/bin/env perl + my $fastq_count_file = "./!{countFastq}"; + my $total_count = 0; + my $total_len = 0; + open (my $fh, "<", $fastq_count_file) or die "Cannot open $fastq_count_file\n"; + while(<$fh>){ + chomp; + my ($name,$count,$len,$avg) = split /\t/,$_; + $total_count += $count; + $total_len += $len; + } + close $fh; + my $avg_len = ($total_count > 0)? $total_len/$total_count : 0; + print "$avg_len"; + ''' +} + +//assemble using spades +process spades { + + publishDir ( + path: "$params.outDir/AssemblyBasedAnalysis", + mode: 'copy', + saveAs: { + filename -> + if(filename ==~ /spades\.log/) { + "assembly.log" + } + else if(filename ==~ /scaffold(s|-level-2|)\.fa(sta)?/) { + "scaffold.fa" + } + else if(filename ==~ /assembly_graph\.fastg/) { + "${params.projName}_contigs.fastg" + } + else if(filename ==~ /assembly_graph_with_scaffolds\.gfa/) { + "assembly_graph_with_scaffolds.gfa" + } + else if(filename ==~ /(contigs|transcripts)\.fasta/) { + null //do not publish contigs, but use downstream + } + else if(filename ==~ /((contig-.*)|(.*contigs\.fa)|(K.*\/final_contigs\.fasta))/) { + null //do not publish intermediate contigs, but use downstream + } + else{ + filename + } + } + ) + + input: + path paired + path unpaired + path pacbio + path nanopore + + output: + path "scaffold*.{fa,fasta}", optional:true + path "spades.log" + path "contigs.paths" + path "{contigs,transcripts}.fasta", emit:contigs, optional:true + path "assembly_graph.fastg", optional:true + path "assembly_graph_with_scaffolds.gfa", optional:true + path "{contig-*,*contigs.fa,K*/final_contigs.fasta}", emit: intContigs + + + script: + def paired = paired.name != "NO_FILE" ? "--pe1-1 ${paired[0]} --pe1-2 ${paired[1]} " : "" + def unpaired = unpaired.name != "NO_FILE2" ? "--s1 $unpaired " : "" + def pacbio_file = pacbio.name != "NO_FILE3" ? "--pacbio $pacbio " : "" + def nanopore_file = nanopore.name != "NO_FILE4" ? "--nanopore $nanopore " : "" + def meta_flag = (paired != "" && params.spades.algorithm == "metagenome") ? "--meta " : "" + def sc_flag = params.spades.algorithm == "singlecell" ? "--sc " : "" + def rna_flag = params.spades.algorithm == "rna" ? "--rna " : "" + def plasmid_flag = params.spades.algorithm == "plasmid" ? "--plasmid " : "" + def bio_flag = params.spades.algorithm == "bio" ? "--bio " : "" + def corona_flag = params.spades.algorithm == "corona" ? "--corona " : "" + def metaviral_flag = params.spades.algorithm == "metaviral" ? "--metaviral " : "" + def metaplasmid_flag = params.spades.algorithm == "metaplasmid" ? "--metaplasmid " : "" + def rnaviral_flag = params.spades.algorithm == "rnaviral" ? "--rnaviral " : "" + def memLimit = params.memLimit != null ? "-m ${params.memLimit/1024*1024}" : "" + + """ + spades.py -o . -t $params.threads\ + $paired\ + $meta_flag\ + $sc_flag\ + $rna_flag\ + $plasmid_flag\ + $bio_flag\ + $corona_flag\ + $metaviral_flag\ + $metaplasmid_flag\ + $rnaviral_flag\ + $unpaired\ + $pacbio_file\ + $nanopore_file\ + $memLimit + """ +} + +//assemble using megahit +process megahit { + + publishDir( + path: "$params.outDir/AssemblyBasedAnalysis", + mode: 'copy', + saveAs: { + filename -> + if(filename.equals("megahit/log")) { + "assembly.log" + } + else if(filename ==~ /megahit\/scaffold(.*)\.fa(sta)?/) { + "scaffold.fa" + } + else if(filename ==~ /megahit\/final\.contigs\.fa/) { + null //don't publish, but pass to downstream "renameFilterFasta" process + } + else if(filename ==~ /megahit\/((contig-.*)|(.*contigs\.fa)|(K.*\/final_contigs\.fasta))/) { + null //do not publish intermediate contigs, but use downstream + } + else{ + filename + } + } + ) + + input: + path paired + path unpaired + + output: + path "megahit/log" + path "megahit/scaffold*.{fa,fasta}", optional:true //I don't believe this is a normal output of megahit, but just in case + path "megahit/final.contigs.fa", emit: contigs, optional:true + path "${params.projName}_contigs.fastg" + path "megahit/{contig-*,*contigs.fa,K*/final_contigs.fasta}", emit: intContigs + + script: + def paired = paired.name != "NO_FILE" ? "-1 ${paired[0]} -2 ${paired[1]} " : "" + def unpaired = unpaired.name != "NO_FILE2" ? "-r $unpaired " : "" + def megahit_preset = params.megahit.preset != null ? "--presets $params.megahit.preset " : "" + + """ + megahit -o ./megahit -t $params.threads\ + $megahit_preset\ + $paired\ + $unpaired\ + 2>&1 + + LARGESTKMER=\$(head -n 1 megahit/final.contigs.fa | perl -ne '/^>k(\\d+)\\_/; print \$1;') + + megahit_toolkit contig2fastg \$LARGESTKMER megahit/intermediate_contigs/k\${LARGESTKMER}.contigs.fa > ${params.projName}_contigs.fastg + """ + +} + + +//assembly using unicycler +process unicycler { + publishDir ( + path: "$params.outDir/AssemblyBasedAnalysis", + mode: 'copy', + saveAs: { + filename -> + if(filename ==~ /unicycler\.log/) { + "assembly.log" + } + else if(filename ==~ /scaffold(s|-level-2|)\.fa(sta)?/) { + "scaffold.fa" + } + else if(filename.equals("assembly.gfa")) { + "${params.projName}_contigs.fastg" + } + else if(filename ==~ /assembly\.fasta/) { + null //don't publish, but emit for use in downstream process "renameFilterFasta" + } + else{ + filename + } + } + ) + + input: + path paired + path unpaired + path longreads //If present, expects filtered long reads. + + output: + path "unicycler.log" + path "scaffold*.{fa,fasta}", optional:true //I don't believe this is a normal output of unicycler, but just in case + path "assembly.fasta", emit: contigs + path "assembly.gfa", optional:true + //path "{contig-*,*contigs.fa,K*/final_contigs.fasta}", emit: intContigs | not produced by unicycler + + script: + def paired = paired.name != "NO_FILE" ? "-1 ${paired[0]} -2 ${paired[1]} " : "" + def unpaired = unpaired.name != "NO_FILE2" ? "-r $unpaired " : "" + def filt_lr = longreads.name != "NO_FILE3" ? "-l $longreads " : "" + def bridge = params.unicycler.bridgingMode != "normal" ? "--mode $params.unicycler.bridgingMode" : "--mode normal" + + """ + export _JAVA_OPTIONS='-Xmx20G'; export TERM='xterm'; + + unicycler -t $params.threads -o .\ + $paired\ + $filt_lr\ + $bridge 2>&1 1>/dev/null + """ + +} + +//filter long reads for unicycler +process unicyclerPrep { + input: + path longreads + + + output: + path "long_reads.fasta" + + script: + + """ + seqtk seq -A -L\ + $params.unicycler.minLongReads\ + $longreads > long_reads.fasta + """ +} + + +//assembly using lrasm +process lrasm { + + publishDir ( + path: "$params.outDir/AssemblyBasedAnalysis", + mode: 'copy', + saveAs: { + filename -> + if(filename ==~ /contigs\.log/) { + "assembly.log" + } + else if(filename ==~ /scaffold(s|-level-2|)\.fa(sta)?/) { + "scaffold.fa" + } + else if(filename.equals("Assembly/unitig.gfa")) { + "${params.projName}_contigs.fastg" + } + else if(filename.equals("Assembly/assembly_graph.gfa")) { + "${params.projName}_contigs.fastg" + } + else if(filename.equals("Assembly/assembly_graph.gv")) { + "${params.projName}_contigs.gv" + } + else if(filename.equals("Assembly/assembly_info.txt")) { + "assembly_info.txt" + } + else if(filename ==~ /contigs\.fa/) { + null //do not publish, but emit for use in downstream process "renameFilterFasta" + } + else if(filename ==~ /((contig-.*)|(.*contigs\.fa)|(K.*\/final_contigs\.fasta))/) { + null //do not publish intermediate contigs, but use downstream + } + else{ + filename + } + } + ) + + input: + path unpaired + + output: + path "contigs.log" + path "scaffold*.{fa,fasta}", optional:true //I don't believe this is a normal output of lrasm, but just in case + path "contigs.fa", emit:contigs, optional:true + path "Assembly/unitig.gfa", optional:true + path "Assembly/assembly_graph.gfa", optional:true + path "Assembly/assembly_graph.gv", optional:true + path "Assembly/assembly_info.txt", optional:true + path "{contig-*,*contigs.fa,K*/final_contigs.fasta}", emit: intContigs + + script: + def consensus = params.lrasm.numConsensus != null ? "-n $params.lrasm.numConsensus ": "" + def preset = params.lrasm.preset != null ? "-x $params.lrasm.preset " : "" + def errorCorrection = params.lrasm.ec != null ? "-e " : "" + def algorithm = params.lrasm.algorithm != null ? "-a $params.lrasm.algorithm " : "" + def minLenOpt = "" + if (params.lrasm.algorithm == "miniasm") { + minLenOpt = "--ao \'-s $params.lrasm.minLength\' " + } + else if (params.lrasm.algorithm == "wtdbg2") { + minLenOpt = "--wo \'-L $params.lrasm.minLength\' " + } + def flyeOpt = params.lrasm.algorithm == "metaflye" ? "--fo '--meta' ": "" + + """ + lrasm -o . -t $params.threads\ + $preset\ + $consensus\ + $errorCorrection\ + $algorithm\ + $minLenOpt\ + $flyeOpt\ + $unpaired\ + """ + //2>/dev/null +} + +process renameFilterFasta { + publishDir( + path: "$params.outDir/AssemblyBasedAnalysis", + mode: 'copy' + ) + input: + path contigs + + output: + path "*" + + script: + def annotation = params.annotation ? "-ann 1" : "" + """ + CONTIG_NUMBER=\$(grep -c '>' ${contigs}) + + renameFilterFasta.pl \ + -u $contigs\ + -d .\ + -filt $params.minContigSize\ + -maxseq \$CONTIG_NUMBER\ + -ann_size $params.contigSizeForAnnotation\ + -n $params.projName\ + $annotation + """ + +} + +process bestIncompleteAssembly { + input: + val x + path intContigs + + when: + x == 'EMPTY' + + output: + path "bestIntContig/*" + + shell: + ''' + #!/usr/bin/env perl + use Cwd; + my $dir = getcwd; + use Cwd 'abs_path'; + my @intermediate_contigs = sort { -M $a <=> -M $b} glob("!{intContigs}"); + my $best_int = $dir . "/" . $intermediate_contigs[0]; + mkdir bestIntContig; + system("cp $best_int ./bestIntContig"); + ''' + +} + +workflow { + "mkdir nf_assets".execute().text + "touch nf_assets/NO_FILE".execute().text + "touch nf_assets/NO_FILE2".execute().text + "touch nf_assets/NO_FILE3".execute().text + "touch nf_assets/NO_FILE4".execute().text + + paired_ch = channel.fromPath(params.pairedFiles, relative:true, checkIfExists:true).collect() + unpaired_ch = channel.fromPath(params.unpairedFile, relative:true, checkIfExists:true) + spades_pb = file(params.spades.pacbio, checkIfExists:true) + spades_np = file(params.spades.nanopore, checkIfExists:true) + unicycler_lr = file(params.unicycler.longreads, checkIfExists:true) + + if (params.assembler.equalsIgnoreCase("IDBA_UD")) { + avg_len_ch = idbaAvgLen(idbaReadFastq(paired_ch, unpaired_ch)) + (c1,c2) = idbaPrepReads(paired_ch, unpaired_ch) + (sp,su,l) = idbaExtractLong(c1,c2.ifEmpty({file("nf_assets/NO_FILE")})) + idbaUD(sp.filter{ it.size()>0 }.ifEmpty({file("nf_assets/NO_FILE")}), + su.filter{ it.size()>0 }.ifEmpty({file("nf_assets/NO_FILE2")}), + l.filter{ it.size()>0 }.ifEmpty({file("nf_assets/NO_FILE3")}), + avg_len_ch) + + bestIncompleteAssembly(idbaUD.out.contigs.ifEmpty('EMPTY'), idbaUD.out.intContigs) + renameFilterFasta(idbaUD.out.contigs.concat(bestIncompleteAssembly.out).first()) + + } + else if (params.assembler.equalsIgnoreCase("SPAdes")) { + spades(paired_ch, unpaired_ch, spades_pb, spades_np) + + bestIncompleteAssembly(spades.out.contigs.ifEmpty('EMPTY'), spades.out.intContigs) + renameFilterFasta(spades.out.contigs.concat(bestIncompleteAssembly.out).first()) + } + else if (params.assembler.equalsIgnoreCase("MEGAHIT")) { + megahit(paired_ch, unpaired_ch) + + bestIncompleteAssembly(megahit.out.contigs.ifEmpty('EMPTY'), megahit.out.intContigs) + renameFilterFasta(megahit.out.contigs.concat(bestIncompleteAssembly.out).first()) + } + else if (params.assembler.equalsIgnoreCase("UniCycler")) { + if (params.unicycler.longreads != "nf_assets/NO_FILE3") { + println("Filter long reads with $params.unicycler.minLongReads (bp) cutoff") + unicycler(paired_ch, + unpaired_ch, + unicyclerPrep(unicycler_lr).filter{it.size()>0}.ifEmpty({file("nf_assets/NO_FILE3")})) + //unicycler produces no intermediate contigs, we let it error out above rather than try to rescue a failed assembly + renameFilterFasta(unicycler.out.contigs) + } + else { + unicycler(paired_ch, unpaired_ch, unicycler_lr) + renameFilterFasta(unicycler.out.contigs) + } + } + else if (params.assembler.equalsIgnoreCase("LRASM")) { + lrasm(unpaired_ch) + + bestIncompleteAssembly(lrasm.out.contigs.ifEmpty('EMPTY'), lrasm.out.intContigs) + renameFilterFasta(lrasm.out.contigs.concat(bestIncompleteAssembly.out).first()) + } + else { + error "Invalid assembler: $params.assembler" + } + +} \ No newline at end of file diff --git a/runAssembly/test_files/parameters/idba_assembly.json b/runAssembly/test_files/parameters/idba_assembly.json new file mode 100644 index 0000000..159b0bc --- /dev/null +++ b/runAssembly/test_files/parameters/idba_assembly.json @@ -0,0 +1,7 @@ +{ + "assembler":"IDBA_UD", + "threads":4, + "projName":"test_idba", + "pairedFiles":["test_files/Ecoli_10x.1.fastq","test_files/Ecoli_10x.2.fastq"], + "outDir": "test_results" +} \ No newline at end of file diff --git a/runAssembly/test_files/parameters/lrasm_assembly.json b/runAssembly/test_files/parameters/lrasm_assembly.json new file mode 100644 index 0000000..a32a2f4 --- /dev/null +++ b/runAssembly/test_files/parameters/lrasm_assembly.json @@ -0,0 +1,10 @@ +{ + "assembler":"LRASM", + "threads":4, + "projName":"test_lrasm", + "unpairedFile":["test_files/reads.fastq.gz"], + "lrasm" : { + "ec":true + }, + "outDir": "test_results" +} \ No newline at end of file diff --git a/runAssembly/test_files/parameters/megahit_assembly.json b/runAssembly/test_files/parameters/megahit_assembly.json new file mode 100644 index 0000000..97415d5 --- /dev/null +++ b/runAssembly/test_files/parameters/megahit_assembly.json @@ -0,0 +1,7 @@ +{ + "assembler":"MEGAHIT", + "threads":4, + "projName":"test_megahit", + "pairedFiles":["test_files/Ecoli_10x.1.fastq","test_files/Ecoli_10x.2.fastq"], + "outDir": "test_results" +} \ No newline at end of file diff --git a/runAssembly/test_files/parameters/spades_assembly.json b/runAssembly/test_files/parameters/spades_assembly.json new file mode 100644 index 0000000..b9aa978 --- /dev/null +++ b/runAssembly/test_files/parameters/spades_assembly.json @@ -0,0 +1,7 @@ +{ + "assembler":"SPAdes", + "threads":4, + "projName":"test_spades", + "pairedFiles":["test_files/Ecoli_10x.1.fastq","test_files/Ecoli_10x.2.fastq"], + "outDir": "test_results" +} \ No newline at end of file diff --git a/runAssembly/test_files/parameters/unicycler_assembly.json b/runAssembly/test_files/parameters/unicycler_assembly.json new file mode 100644 index 0000000..d74f750 --- /dev/null +++ b/runAssembly/test_files/parameters/unicycler_assembly.json @@ -0,0 +1,7 @@ +{ + "assembler":"uNicycleR", + "threads":4, + "projName":"test_unicycler", + "pairedFiles":["test_files/Ecoli_10x.1.fastq","test_files/Ecoli_10x.2.fastq"], + "outDir": "test_results" +} \ No newline at end of file