-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #7 from LANL-Bioinformatics/nf_runAssembly
Nf run assembly
- Loading branch information
Showing
11 changed files
with
1,095 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
FROM continuumio/miniconda3:23.5.2-0 AS build | ||
|
||
ENV container=docker | ||
|
||
# add conda channels | ||
RUN conda config --add channels conda-forge \ | ||
&& conda config --add channels bioconda | ||
|
||
RUN conda init bash \ | ||
&& . ~/.bashrc \ | ||
&& conda create --name assembly \ | ||
&& conda activate assembly | ||
|
||
# install dependencies | ||
RUN conda install conda-libmamba-solver | ||
RUN conda config --set solver libmamba | ||
RUN conda install -n assembly -c conda-forge python=3.9 | ||
RUN conda install -n assembly -c bioconda samtools=1.17 | ||
RUN conda install -n assembly -c bioconda racon=1.5 | ||
RUN conda install -n assembly -c bioconda seqtk=1.3 | ||
RUN conda install -n assembly -c bioconda spades=3.15.5 | ||
RUN conda install -n assembly -c bioconda minimap2=2.26 | ||
RUN conda install -n assembly -c bioconda megahit=1.2.9 | ||
RUN conda install -n assembly -c bioconda idba=1.1.3 | ||
RUN conda install -n assembly -c bioconda unicycler=0.5.0 | ||
RUN wget https://github.com/ruanjue/wtdbg2/releases/download/v2.5/wtdbg-2.5_x64_linux.tgz \ | ||
&& tar -xvzf wtdbg-2.5_x64_linux.tgz \ | ||
&& cp wtdbg-2.5_x64_linux/* /opt/conda/envs/assembly/bin | ||
RUN conda install -n assembly -c bioconda flye=2.9.2 | ||
RUN conda install -n assembly git | ||
RUN conda install -c conda-forge conda-pack | ||
|
||
ADD bin/extractLongReads.pl /opt/conda/envs/assembly/bin | ||
ADD bin/getAvgLen.pl /opt/conda/envs/assembly/bin | ||
ADD bin/renameFilterFasta.pl /opt/conda/envs/assembly/bin | ||
|
||
RUN conda-pack -n assembly -o /tmp/env.tar && \ | ||
mkdir /venv && cd /venv && tar xf /tmp/env.tar && \ | ||
rm /tmp/env.tar | ||
|
||
RUN /venv/bin/conda-unpack | ||
|
||
FROM debian:latest AS runtime | ||
|
||
COPY --from=build /venv /venv | ||
ENV PERL5LIB=/venv/lib/perl5/core_perl | ||
|
||
|
||
ENV PATH="/venv/bin:$PATH" | ||
RUN git clone --depth 1 https://gitlab.com/chienchi/long_read_assembly.git | ||
ENV PATH="/long_read_assembly:$PATH" | ||
|
||
SHELL ["/bin/bash", "-c"] | ||
CMD /bin/bash |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,86 @@ | ||
#!/usr/bin/env perl | ||
|
||
use strict; | ||
use warnings; | ||
use File::Basename; | ||
use Getopt::Long; | ||
|
||
my $paired_fasta=''; | ||
my $single_fasta=''; | ||
my $outputDir=''; | ||
my $len_cutoff=350; | ||
|
||
GetOptions( | ||
'p:s' => \$paired_fasta, | ||
'u:s' => \$single_fasta, | ||
'd=s' => \$outputDir, | ||
'len:s' => \$len_cutoff | ||
); | ||
|
||
|
||
my $short_paired_fasta="short_paired.fa"; | ||
my $short_single_fasta="short_single.fa"; | ||
my $long_fasta="long.fa"; | ||
|
||
open (my $o_paired, ">$short_paired_fasta") or die "Cannot write $short_paired_fasta\n"; | ||
open (my $o_single, ">$short_single_fasta") or die "Cannot write $short_single_fasta\n"; | ||
open (my $o_long, ">$long_fasta") or die "Cannot write $long_fasta\n"; | ||
$/ = ">"; | ||
if (-s $paired_fasta) { | ||
open (my $fh, $paired_fasta) or die "$! $paired_fasta"; | ||
while (<$fh>) | ||
{ | ||
$_ =~ s/\>//g; | ||
my ($id, @seq) = split /\n/, $_; | ||
next if (!$id); | ||
my ($id2, @seq2) = split /\n/, <$fh>; | ||
my $seq = join "", @seq; | ||
my $seq2 = join "", @seq2; | ||
my $len = length($seq); | ||
my $len2 = length($seq2); | ||
if ($len > $len_cutoff and $len2 > $len_cutoff) | ||
{ | ||
print $o_long ">$id\n$seq\n>$id2\n$seq2\n"; | ||
} | ||
elsif ($len > $len_cutoff) | ||
{ | ||
print $o_long ">$id\n$seq\n"; | ||
print $o_single ">$id2\n$seq2\n"; | ||
} | ||
elsif ($len2 > $len_cutoff) | ||
{ | ||
print $o_long ">$id2\n$seq2\n"; | ||
print $o_single ">$id\n$seq\n"; | ||
} | ||
else | ||
{ | ||
print $o_paired ">$id\n$seq\n>$id2\n$seq2\n"; | ||
} | ||
} | ||
close $fh; | ||
} | ||
if (-s $single_fasta) | ||
{ | ||
open (my $fh, $single_fasta) or die "$! $single_fasta"; | ||
while (<$fh>) | ||
{ | ||
$_ =~ s/\>//g; | ||
my ($id, @seq) = split /\n/, $_; | ||
next if (!$id); | ||
my $seq = join "", @seq; | ||
my $len = length($seq); | ||
if ($len > $len_cutoff) | ||
{ | ||
print $o_long ">$id\n$seq\n"; | ||
} | ||
else | ||
{ | ||
print $o_single ">$id\n$seq\n"; | ||
} | ||
} | ||
close $fh; | ||
} | ||
$/="\n"; | ||
close $o_paired; | ||
close $o_long; | ||
close $o_single; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,198 @@ | ||
#!/usr/bin/env perl | ||
|
||
use strict; | ||
use warnings; | ||
use File::Basename; | ||
use Getopt::Long; | ||
|
||
my $outputDir=''; | ||
|
||
my @unpairedList= (); | ||
my @pairedList= (); | ||
|
||
GetOptions( | ||
'u:s{1,}' => \@unpairedList, | ||
'p:s{1,}' => \@pairedList, | ||
'd=s' => \$outputDir, | ||
); | ||
|
||
|
||
my $all_R1_fastq = "$outputDir/all.1.fastq"; | ||
my $all_R2_fastq = "$outputDir/all.2.fastq"; | ||
my $all_SE_fastq = "$outputDir/all.se.fastq"; | ||
my $count_file_list= "$outputDir/fastqCount.txt"; | ||
my $avg_read_len= 0 ; | ||
my $PE_count = 0 ; | ||
my $PE_total_len = 0; | ||
my $SE_count= 0 ; | ||
my $SE_total_len = 0 ; | ||
|
||
|
||
|
||
open (my $fh, ">$count_file_list") or die "Cannot write $count_file_list\n"; | ||
while(my ($R1,$R2) = splice (@pairedList,0,2)) | ||
{ | ||
($PE_count, $PE_total_len) = &countPE_exe($R1,$R2,$all_R1_fastq,$all_R2_fastq,$all_SE_fastq,$fh); | ||
} | ||
foreach my $file (@unpairedList) | ||
{ | ||
($SE_count,$SE_total_len)=&countFastq_exe($file,$all_SE_fastq); | ||
printf $fh ("%s\t%d\t%d\t%.2f\n",basename($file),$SE_count,$SE_total_len,$SE_total_len/$SE_count); | ||
printf ("%s\t%d\t%d\t%.2f\n",basename($file),$SE_count,$SE_total_len,$SE_total_len/$SE_count); | ||
} | ||
close $fh; | ||
$avg_read_len = ($PE_count + $SE_count) > 0 ? ($PE_total_len + $SE_total_len) / ($PE_count + $SE_count) : 0 ; | ||
|
||
sub countPE_exe{ | ||
my $r1=shift; | ||
my $r2=shift; | ||
my $out_r1=shift; | ||
my $out_r2=shift; | ||
my $out_se=shift; | ||
my $count_fh=shift; | ||
my %seq_hash; | ||
my $pair_char; | ||
my $unpaired_count=0; | ||
my $read1_count=0; | ||
my $read2_count=0; | ||
my $se2_count=0; | ||
my $se1_count=0; | ||
my $paired_count=0; | ||
my $read1_total_len=0; | ||
my $read2_total_len=0; | ||
my $existed_id1=0; | ||
my $existed_id2=0; | ||
my ($fh1,$pid) = open_file($r1); | ||
open (my $ofh1, ">>$out_r1") or die "Cannot write $out_r1\n"; | ||
open (my $ofh2, ">>$out_r2") or die "Cannot write $out_r2\n"; | ||
open (my $ofhse, ">>$out_se") or die "Cannot write $out_se\n"; | ||
while(<$fh1>){ | ||
chomp; | ||
next unless $_ =~ /\S/; | ||
next if ($_ =~ /length=0/); | ||
my $id_line=$_; | ||
my ($id) = $id_line =~ /^\@(\S+).?\/?1?\s*/; | ||
my $seq = <$fh1>; | ||
chomp $seq; | ||
if ($seq_hash{$id}){ | ||
$existed_id1++; | ||
} | ||
my $len = length $seq; | ||
$read1_total_len += $len; | ||
my $qual_id = <$fh1>; | ||
my $qual = <$fh1>; | ||
$seq = $seq."\n".$qual_id.$qual; | ||
$seq_hash{$id}++; | ||
$read1_count++; | ||
} | ||
close $fh1; | ||
my %seq_hash2; | ||
my ($fh2,$pid2) = open_file($r2); | ||
while(<$fh2>){ | ||
chomp; | ||
next unless $_ =~ /\S/; | ||
next if ($_ =~ /length=0/); | ||
my $id_line=$_; | ||
my ($id2) = $id_line =~ /^\@(\S+)\.?\/?2?\s*/; | ||
$read2_count++; | ||
my $seq2 = <$fh2>; | ||
chomp $seq2; | ||
if ($seq_hash2{$id2}){ | ||
$existed_id2++; | ||
} | ||
my $len = length $seq2; | ||
$read2_total_len += $len; | ||
my $qual_id = <$fh2>; | ||
my $qual = <$fh2>; | ||
$seq2 = $seq2."\n".$qual_id.$qual; | ||
$seq_hash2{$id2}++; | ||
if ($seq_hash{$id2}){ | ||
$seq_hash{$id2}++; | ||
$paired_count++; | ||
print $ofh2 $id_line,"\n",$seq2; | ||
}else{ | ||
print $ofhse $id_line,"\n",$seq2; | ||
$se2_count++; | ||
} | ||
} | ||
close $fh2; | ||
($fh1,$pid) = open_file($r1); | ||
while(<$fh1>){ | ||
chomp; | ||
next unless $_ =~ /\S/; | ||
next if ($_ =~ /length=0/); | ||
my $id_line=$_; | ||
my ($id) = $id_line =~ /^\@(\S+)\.?\/?1?\s*/; | ||
my $seq = <$fh1>; | ||
chomp $seq; | ||
my $qual_id = <$fh1>; | ||
my $qual = <$fh1>; | ||
$seq = $seq."\n".$qual_id.$qual; | ||
if ($seq_hash{$id} == 2){ | ||
print $ofh1 $id_line,"\n",$seq; | ||
} | ||
if ($seq_hash{$id} == 1){ | ||
print $ofhse $id_line,"\n",$seq; | ||
$se1_count++; | ||
} | ||
} | ||
close $fh1; | ||
close $ofh1; | ||
close $ofh2; | ||
close $ofhse; | ||
printf ("%s\t%d\t%d\t%.2f\n",basename($r1),$read1_count,$read1_total_len,$read1_total_len/$read1_count); | ||
printf ("%s\t%d\t%d\t%.2f\n",basename($r2),$read2_count,$read2_total_len,$read2_total_len/$read2_count); | ||
printf $count_fh ("%s\t%d\t%d\t%.2f\n",basename($r1),$read1_count,$read1_total_len,$read1_total_len/$read1_count); | ||
printf $count_fh ("%s\t%d\t%d\t%.2f\n",basename($r2),$read2_count,$read2_total_len,$read2_total_len/$read2_count); | ||
printf ("%d duplicate id from %s\n", $existed_id1, basename($r1)) if ($existed_id1 > 0); | ||
printf ("%d duplicate id from %s\n", $existed_id2, basename($r2)) if ($existed_id2 > 0); | ||
printf ("There are %d reads from %s don't have corresponding paired read.\n", $se1_count, basename($r1)) if ($se1_count >0); | ||
printf ("There are %d reads from %s don't have corresponding paired read.\n", $se2_count, basename($r2)) if ($se2_count >0); | ||
|
||
unlink $out_se if (-z $out_se); | ||
return ($read1_count + $read2_count, $read1_total_len + $read2_total_len); | ||
} | ||
|
||
sub countFastq_exe | ||
{ | ||
my $file=shift; | ||
my $output=shift; | ||
my $seq_count=0; | ||
my $total_length; | ||
my ($fh,$pid)= open_file($file); | ||
open (my $ofh, ">>$output") or die "Cannot write $output\n"; | ||
while (<$fh>) | ||
{ | ||
next unless $_ =~ /\S/; | ||
next if ($_ =~ /length=0/); | ||
my $id=$_; | ||
$id = '@'."seq_$seq_count\n" if ($id =~ /No name/); | ||
my $seq=<$fh>; | ||
chomp $seq; | ||
my $q_id=<$fh>; | ||
my $q_seq=<$fh>; | ||
my $len = length $seq; | ||
$seq_count++; | ||
$total_length +=$len; | ||
print $ofh "$id$seq\n$q_id$q_seq"; | ||
} | ||
close $fh; | ||
return ($seq_count,$total_length); | ||
} | ||
|
||
sub touchFile{ | ||
my $file=shift; | ||
open (my $fh,">",$file) or die "$!"; | ||
close $fh; | ||
} | ||
|
||
sub open_file | ||
{ | ||
my ($file) = @_; | ||
print "$file\n"; | ||
my $fh; | ||
my $pid; | ||
if ( $file=~/\.gz\.?\d?$/i ) { $pid=open($fh, "gunzip -c $file |") or die ("gunzip -c $file: $!"); } | ||
else { $pid=open($fh,'<',$file) or die("$file: $!"); } | ||
return ($fh,$pid); | ||
} |
Oops, something went wrong.