Skip to content

Commit

Permalink
Merge pull request #7 from LANL-Bioinformatics/nf_runAssembly
Browse files Browse the repository at this point in the history
Nf run assembly
  • Loading branch information
mflynn-lanl authored Sep 16, 2024
2 parents 20f99cc + b2c3d5f commit 8fa630c
Show file tree
Hide file tree
Showing 11 changed files with 1,095 additions and 0 deletions.
54 changes: 54 additions & 0 deletions runAssembly/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
FROM continuumio/miniconda3:23.5.2-0 AS build

ENV container=docker

# add conda channels
RUN conda config --add channels conda-forge \
&& conda config --add channels bioconda

RUN conda init bash \
&& . ~/.bashrc \
&& conda create --name assembly \
&& conda activate assembly

# install dependencies
RUN conda install conda-libmamba-solver
RUN conda config --set solver libmamba
RUN conda install -n assembly -c conda-forge python=3.9
RUN conda install -n assembly -c bioconda samtools=1.17
RUN conda install -n assembly -c bioconda racon=1.5
RUN conda install -n assembly -c bioconda seqtk=1.3
RUN conda install -n assembly -c bioconda spades=3.15.5
RUN conda install -n assembly -c bioconda minimap2=2.26
RUN conda install -n assembly -c bioconda megahit=1.2.9
RUN conda install -n assembly -c bioconda idba=1.1.3
RUN conda install -n assembly -c bioconda unicycler=0.5.0
RUN wget https://github.com/ruanjue/wtdbg2/releases/download/v2.5/wtdbg-2.5_x64_linux.tgz \
&& tar -xvzf wtdbg-2.5_x64_linux.tgz \
&& cp wtdbg-2.5_x64_linux/* /opt/conda/envs/assembly/bin
RUN conda install -n assembly -c bioconda flye=2.9.2
RUN conda install -n assembly git
RUN conda install -c conda-forge conda-pack

ADD bin/extractLongReads.pl /opt/conda/envs/assembly/bin
ADD bin/getAvgLen.pl /opt/conda/envs/assembly/bin
ADD bin/renameFilterFasta.pl /opt/conda/envs/assembly/bin

RUN conda-pack -n assembly -o /tmp/env.tar && \
mkdir /venv && cd /venv && tar xf /tmp/env.tar && \
rm /tmp/env.tar

RUN /venv/bin/conda-unpack

FROM debian:latest AS runtime

COPY --from=build /venv /venv
ENV PERL5LIB=/venv/lib/perl5/core_perl


ENV PATH="/venv/bin:$PATH"
RUN git clone --depth 1 https://gitlab.com/chienchi/long_read_assembly.git
ENV PATH="/long_read_assembly:$PATH"

SHELL ["/bin/bash", "-c"]
CMD /bin/bash
86 changes: 86 additions & 0 deletions runAssembly/bin/extractLongReads.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
#!/usr/bin/env perl

use strict;
use warnings;
use File::Basename;
use Getopt::Long;

my $paired_fasta='';
my $single_fasta='';
my $outputDir='';
my $len_cutoff=350;

GetOptions(
'p:s' => \$paired_fasta,
'u:s' => \$single_fasta,
'd=s' => \$outputDir,
'len:s' => \$len_cutoff
);


my $short_paired_fasta="short_paired.fa";
my $short_single_fasta="short_single.fa";
my $long_fasta="long.fa";

open (my $o_paired, ">$short_paired_fasta") or die "Cannot write $short_paired_fasta\n";
open (my $o_single, ">$short_single_fasta") or die "Cannot write $short_single_fasta\n";
open (my $o_long, ">$long_fasta") or die "Cannot write $long_fasta\n";
$/ = ">";
if (-s $paired_fasta) {
open (my $fh, $paired_fasta) or die "$! $paired_fasta";
while (<$fh>)
{
$_ =~ s/\>//g;
my ($id, @seq) = split /\n/, $_;
next if (!$id);
my ($id2, @seq2) = split /\n/, <$fh>;
my $seq = join "", @seq;
my $seq2 = join "", @seq2;
my $len = length($seq);
my $len2 = length($seq2);
if ($len > $len_cutoff and $len2 > $len_cutoff)
{
print $o_long ">$id\n$seq\n>$id2\n$seq2\n";
}
elsif ($len > $len_cutoff)
{
print $o_long ">$id\n$seq\n";
print $o_single ">$id2\n$seq2\n";
}
elsif ($len2 > $len_cutoff)
{
print $o_long ">$id2\n$seq2\n";
print $o_single ">$id\n$seq\n";
}
else
{
print $o_paired ">$id\n$seq\n>$id2\n$seq2\n";
}
}
close $fh;
}
if (-s $single_fasta)
{
open (my $fh, $single_fasta) or die "$! $single_fasta";
while (<$fh>)
{
$_ =~ s/\>//g;
my ($id, @seq) = split /\n/, $_;
next if (!$id);
my $seq = join "", @seq;
my $len = length($seq);
if ($len > $len_cutoff)
{
print $o_long ">$id\n$seq\n";
}
else
{
print $o_single ">$id\n$seq\n";
}
}
close $fh;
}
$/="\n";
close $o_paired;
close $o_long;
close $o_single;
198 changes: 198 additions & 0 deletions runAssembly/bin/getAvgLen.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,198 @@
#!/usr/bin/env perl

use strict;
use warnings;
use File::Basename;
use Getopt::Long;

my $outputDir='';

my @unpairedList= ();
my @pairedList= ();

GetOptions(
'u:s{1,}' => \@unpairedList,
'p:s{1,}' => \@pairedList,
'd=s' => \$outputDir,
);


my $all_R1_fastq = "$outputDir/all.1.fastq";
my $all_R2_fastq = "$outputDir/all.2.fastq";
my $all_SE_fastq = "$outputDir/all.se.fastq";
my $count_file_list= "$outputDir/fastqCount.txt";
my $avg_read_len= 0 ;
my $PE_count = 0 ;
my $PE_total_len = 0;
my $SE_count= 0 ;
my $SE_total_len = 0 ;



open (my $fh, ">$count_file_list") or die "Cannot write $count_file_list\n";
while(my ($R1,$R2) = splice (@pairedList,0,2))
{
($PE_count, $PE_total_len) = &countPE_exe($R1,$R2,$all_R1_fastq,$all_R2_fastq,$all_SE_fastq,$fh);
}
foreach my $file (@unpairedList)
{
($SE_count,$SE_total_len)=&countFastq_exe($file,$all_SE_fastq);
printf $fh ("%s\t%d\t%d\t%.2f\n",basename($file),$SE_count,$SE_total_len,$SE_total_len/$SE_count);
printf ("%s\t%d\t%d\t%.2f\n",basename($file),$SE_count,$SE_total_len,$SE_total_len/$SE_count);
}
close $fh;
$avg_read_len = ($PE_count + $SE_count) > 0 ? ($PE_total_len + $SE_total_len) / ($PE_count + $SE_count) : 0 ;

sub countPE_exe{
my $r1=shift;
my $r2=shift;
my $out_r1=shift;
my $out_r2=shift;
my $out_se=shift;
my $count_fh=shift;
my %seq_hash;
my $pair_char;
my $unpaired_count=0;
my $read1_count=0;
my $read2_count=0;
my $se2_count=0;
my $se1_count=0;
my $paired_count=0;
my $read1_total_len=0;
my $read2_total_len=0;
my $existed_id1=0;
my $existed_id2=0;
my ($fh1,$pid) = open_file($r1);
open (my $ofh1, ">>$out_r1") or die "Cannot write $out_r1\n";
open (my $ofh2, ">>$out_r2") or die "Cannot write $out_r2\n";
open (my $ofhse, ">>$out_se") or die "Cannot write $out_se\n";
while(<$fh1>){
chomp;
next unless $_ =~ /\S/;
next if ($_ =~ /length=0/);
my $id_line=$_;
my ($id) = $id_line =~ /^\@(\S+).?\/?1?\s*/;
my $seq = <$fh1>;
chomp $seq;
if ($seq_hash{$id}){
$existed_id1++;
}
my $len = length $seq;
$read1_total_len += $len;
my $qual_id = <$fh1>;
my $qual = <$fh1>;
$seq = $seq."\n".$qual_id.$qual;
$seq_hash{$id}++;
$read1_count++;
}
close $fh1;
my %seq_hash2;
my ($fh2,$pid2) = open_file($r2);
while(<$fh2>){
chomp;
next unless $_ =~ /\S/;
next if ($_ =~ /length=0/);
my $id_line=$_;
my ($id2) = $id_line =~ /^\@(\S+)\.?\/?2?\s*/;
$read2_count++;
my $seq2 = <$fh2>;
chomp $seq2;
if ($seq_hash2{$id2}){
$existed_id2++;
}
my $len = length $seq2;
$read2_total_len += $len;
my $qual_id = <$fh2>;
my $qual = <$fh2>;
$seq2 = $seq2."\n".$qual_id.$qual;
$seq_hash2{$id2}++;
if ($seq_hash{$id2}){
$seq_hash{$id2}++;
$paired_count++;
print $ofh2 $id_line,"\n",$seq2;
}else{
print $ofhse $id_line,"\n",$seq2;
$se2_count++;
}
}
close $fh2;
($fh1,$pid) = open_file($r1);
while(<$fh1>){
chomp;
next unless $_ =~ /\S/;
next if ($_ =~ /length=0/);
my $id_line=$_;
my ($id) = $id_line =~ /^\@(\S+)\.?\/?1?\s*/;
my $seq = <$fh1>;
chomp $seq;
my $qual_id = <$fh1>;
my $qual = <$fh1>;
$seq = $seq."\n".$qual_id.$qual;
if ($seq_hash{$id} == 2){
print $ofh1 $id_line,"\n",$seq;
}
if ($seq_hash{$id} == 1){
print $ofhse $id_line,"\n",$seq;
$se1_count++;
}
}
close $fh1;
close $ofh1;
close $ofh2;
close $ofhse;
printf ("%s\t%d\t%d\t%.2f\n",basename($r1),$read1_count,$read1_total_len,$read1_total_len/$read1_count);
printf ("%s\t%d\t%d\t%.2f\n",basename($r2),$read2_count,$read2_total_len,$read2_total_len/$read2_count);
printf $count_fh ("%s\t%d\t%d\t%.2f\n",basename($r1),$read1_count,$read1_total_len,$read1_total_len/$read1_count);
printf $count_fh ("%s\t%d\t%d\t%.2f\n",basename($r2),$read2_count,$read2_total_len,$read2_total_len/$read2_count);
printf ("%d duplicate id from %s\n", $existed_id1, basename($r1)) if ($existed_id1 > 0);
printf ("%d duplicate id from %s\n", $existed_id2, basename($r2)) if ($existed_id2 > 0);
printf ("There are %d reads from %s don't have corresponding paired read.\n", $se1_count, basename($r1)) if ($se1_count >0);
printf ("There are %d reads from %s don't have corresponding paired read.\n", $se2_count, basename($r2)) if ($se2_count >0);

unlink $out_se if (-z $out_se);
return ($read1_count + $read2_count, $read1_total_len + $read2_total_len);
}

sub countFastq_exe
{
my $file=shift;
my $output=shift;
my $seq_count=0;
my $total_length;
my ($fh,$pid)= open_file($file);
open (my $ofh, ">>$output") or die "Cannot write $output\n";
while (<$fh>)
{
next unless $_ =~ /\S/;
next if ($_ =~ /length=0/);
my $id=$_;
$id = '@'."seq_$seq_count\n" if ($id =~ /No name/);
my $seq=<$fh>;
chomp $seq;
my $q_id=<$fh>;
my $q_seq=<$fh>;
my $len = length $seq;
$seq_count++;
$total_length +=$len;
print $ofh "$id$seq\n$q_id$q_seq";
}
close $fh;
return ($seq_count,$total_length);
}

sub touchFile{
my $file=shift;
open (my $fh,">",$file) or die "$!";
close $fh;
}

sub open_file
{
my ($file) = @_;
print "$file\n";
my $fh;
my $pid;
if ( $file=~/\.gz\.?\d?$/i ) { $pid=open($fh, "gunzip -c $file |") or die ("gunzip -c $file: $!"); }
else { $pid=open($fh,'<',$file) or die("$file: $!"); }
return ($fh,$pid);
}
Loading

0 comments on commit 8fa630c

Please sign in to comment.