Merge pull request #7 from LANL-Bioinformatics/nf_runAssembly

Nf run assembly
LANL-Bioinformatics · Sep 16, 2024 · 8fa630c · 8fa630c
2 parents 20f99cc + b2c3d5f
commit 8fa630c
Show file tree

Hide file tree

Showing 11 changed files with 1,095 additions and 0 deletions.
diff --git a/runAssembly/Dockerfile b/runAssembly/Dockerfile
@@ -0,0 +1,54 @@
+FROM continuumio/miniconda3:23.5.2-0 AS build
+
+ENV container=docker
+
+# add conda channels
+RUN conda config --add channels conda-forge \
+    && conda config --add channels bioconda
+
+RUN conda init bash \
+    && . ~/.bashrc \
+    && conda create --name assembly \
+    && conda activate assembly 
+
+# install dependencies
+RUN conda install conda-libmamba-solver
+RUN conda config --set solver libmamba
+RUN conda install -n assembly -c conda-forge python=3.9
+RUN conda install -n assembly -c bioconda samtools=1.17
+RUN conda install -n assembly -c bioconda racon=1.5
+RUN conda install -n assembly -c bioconda seqtk=1.3
+RUN conda install -n assembly -c bioconda spades=3.15.5
+RUN conda install -n assembly -c bioconda minimap2=2.26
+RUN conda install -n assembly -c bioconda megahit=1.2.9
+RUN conda install -n assembly -c bioconda idba=1.1.3
+RUN conda install -n assembly -c bioconda unicycler=0.5.0
+RUN wget https://github.com/ruanjue/wtdbg2/releases/download/v2.5/wtdbg-2.5_x64_linux.tgz \
+    && tar -xvzf wtdbg-2.5_x64_linux.tgz \
+    && cp wtdbg-2.5_x64_linux/* /opt/conda/envs/assembly/bin
+RUN conda install -n assembly -c bioconda flye=2.9.2
+RUN conda install -n assembly git
+RUN conda install -c conda-forge conda-pack
+
+ADD bin/extractLongReads.pl /opt/conda/envs/assembly/bin
+ADD bin/getAvgLen.pl /opt/conda/envs/assembly/bin
+ADD bin/renameFilterFasta.pl /opt/conda/envs/assembly/bin
+
+RUN conda-pack -n assembly -o /tmp/env.tar && \
+    mkdir /venv && cd /venv && tar xf /tmp/env.tar && \
+    rm /tmp/env.tar
+
+RUN /venv/bin/conda-unpack
+
+FROM debian:latest AS runtime
+
+COPY --from=build /venv /venv
+ENV PERL5LIB=/venv/lib/perl5/core_perl
+
+
+ENV PATH="/venv/bin:$PATH"
+RUN git clone --depth 1 https://gitlab.com/chienchi/long_read_assembly.git
+ENV PATH="/long_read_assembly:$PATH"
+
+SHELL ["/bin/bash", "-c"]
+CMD /bin/bash
diff --git a/runAssembly/bin/extractLongReads.pl b/runAssembly/bin/extractLongReads.pl
@@ -0,0 +1,86 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+use File::Basename;
+use Getopt::Long;
+
+my $paired_fasta='';
+my $single_fasta='';
+my $outputDir='';
+my $len_cutoff=350;
+
+GetOptions(
+    'p:s' => \$paired_fasta,
+    'u:s' => \$single_fasta,
+    'd=s' => \$outputDir,
+    'len:s' => \$len_cutoff
+);
+
+
+my $short_paired_fasta="short_paired.fa";
+my $short_single_fasta="short_single.fa";
+my $long_fasta="long.fa";
+
+open (my $o_paired, ">$short_paired_fasta") or die "Cannot write $short_paired_fasta\n";
+open (my $o_single, ">$short_single_fasta") or die "Cannot write $short_single_fasta\n";
+open (my $o_long, ">$long_fasta") or die "Cannot write $long_fasta\n";
+$/ = ">";
+if (-s $paired_fasta) {
+    open (my $fh, $paired_fasta) or die "$! $paired_fasta";
+    while (<$fh>)
+    { 
+        $_ =~ s/\>//g;
+        my ($id, @seq) = split /\n/, $_;
+        next if (!$id);
+        my ($id2, @seq2) = split /\n/, <$fh>;
+        my $seq = join "", @seq;
+        my $seq2 = join "", @seq2;
+        my $len = length($seq);
+        my $len2 = length($seq2);
+        if ($len > $len_cutoff and $len2 > $len_cutoff)
+        {
+                print $o_long ">$id\n$seq\n>$id2\n$seq2\n";
+        }
+        elsif ($len > $len_cutoff)
+        {
+                print $o_long ">$id\n$seq\n";
+                print $o_single ">$id2\n$seq2\n";
+        }
+        elsif ($len2 > $len_cutoff)
+        {
+                print $o_long ">$id2\n$seq2\n";
+                print $o_single ">$id\n$seq\n";
+        }
+        else
+        {
+                print $o_paired ">$id\n$seq\n>$id2\n$seq2\n";   
+        }
+    }
+    close $fh;
+}
+if (-s $single_fasta)
+{
+    open (my $fh, $single_fasta) or die "$! $single_fasta";
+    while (<$fh>)
+    { 
+        $_ =~ s/\>//g;
+        my ($id, @seq) = split /\n/, $_;
+        next if (!$id);
+        my $seq = join "", @seq;
+        my $len = length($seq);
+        if ($len > $len_cutoff)
+        {
+                print $o_long ">$id\n$seq\n";
+        }
+        else
+        {
+                print $o_single ">$id\n$seq\n";
+        }
+    }
+    close $fh;
+}
+$/="\n";
+close $o_paired;
+close $o_long;
+close $o_single;
diff --git a/runAssembly/bin/getAvgLen.pl b/runAssembly/bin/getAvgLen.pl
@@ -0,0 +1,198 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+use File::Basename;
+use Getopt::Long;
+
+my $outputDir='';
+
+my @unpairedList= ();
+my @pairedList=  ();
+
+GetOptions(
+    'u:s{1,}' => \@unpairedList,
+    'p:s{1,}' => \@pairedList,
+    'd=s' => \$outputDir,
+);
+
+
+my $all_R1_fastq = "$outputDir/all.1.fastq";
+my $all_R2_fastq = "$outputDir/all.2.fastq";
+my $all_SE_fastq = "$outputDir/all.se.fastq";
+my $count_file_list= "$outputDir/fastqCount.txt";
+my $avg_read_len= 0 ;
+my $PE_count = 0 ;
+my $PE_total_len = 0;
+my $SE_count= 0 ;
+my $SE_total_len = 0 ;
+
+
+
+open (my $fh, ">$count_file_list") or die "Cannot write $count_file_list\n";
+while(my ($R1,$R2) = splice (@pairedList,0,2))
+{
+    ($PE_count, $PE_total_len) = &countPE_exe($R1,$R2,$all_R1_fastq,$all_R2_fastq,$all_SE_fastq,$fh);
+}
+foreach my $file (@unpairedList)
+{
+    ($SE_count,$SE_total_len)=&countFastq_exe($file,$all_SE_fastq);
+    printf $fh ("%s\t%d\t%d\t%.2f\n",basename($file),$SE_count,$SE_total_len,$SE_total_len/$SE_count);
+    printf ("%s\t%d\t%d\t%.2f\n",basename($file),$SE_count,$SE_total_len,$SE_total_len/$SE_count);
+}
+close $fh;
+$avg_read_len = ($PE_count + $SE_count) > 0 ? ($PE_total_len + $SE_total_len) / ($PE_count + $SE_count) : 0 ;
+
+sub countPE_exe{
+    my $r1=shift;
+    my $r2=shift;
+    my $out_r1=shift;
+    my $out_r2=shift;
+    my $out_se=shift;
+    my $count_fh=shift;
+    my %seq_hash;
+    my $pair_char;
+    my $unpaired_count=0;
+    my $read1_count=0;
+    my $read2_count=0;
+    my $se2_count=0;
+    my $se1_count=0;
+    my $paired_count=0;
+    my $read1_total_len=0;
+    my $read2_total_len=0;
+    my $existed_id1=0;
+    my $existed_id2=0;
+    my ($fh1,$pid) = open_file($r1);
+    open (my $ofh1, ">>$out_r1") or die "Cannot write $out_r1\n";
+    open (my $ofh2, ">>$out_r2") or die "Cannot write $out_r2\n";
+    open (my $ofhse, ">>$out_se") or die "Cannot write $out_se\n";
+    while(<$fh1>){
+        chomp;
+        next unless $_ =~ /\S/;
+        next if ($_ =~ /length=0/);
+        my $id_line=$_;
+        my ($id) = $id_line =~ /^\@(\S+).?\/?1?\s*/;
+        my $seq = <$fh1>;
+        chomp $seq;
+        if ($seq_hash{$id}){
+            $existed_id1++;
+        }
+        my $len = length $seq;
+        $read1_total_len += $len;
+        my $qual_id = <$fh1>;
+        my $qual = <$fh1>;
+        $seq = $seq."\n".$qual_id.$qual;
+        $seq_hash{$id}++;
+        $read1_count++;
+    }
+    close $fh1;
+    my %seq_hash2;
+    my ($fh2,$pid2) = open_file($r2);
+    while(<$fh2>){
+        chomp;
+        next unless $_ =~ /\S/;
+        next if ($_ =~ /length=0/);
+        my $id_line=$_;
+        my ($id2) = $id_line =~ /^\@(\S+)\.?\/?2?\s*/;
+        $read2_count++;
+        my $seq2 = <$fh2>;
+        chomp $seq2;
+        if ($seq_hash2{$id2}){
+            $existed_id2++;
+        }
+        my $len = length $seq2;
+        $read2_total_len += $len;
+        my $qual_id = <$fh2>;
+        my $qual = <$fh2>;
+        $seq2 = $seq2."\n".$qual_id.$qual;
+        $seq_hash2{$id2}++;
+        if ($seq_hash{$id2}){
+            $seq_hash{$id2}++;
+            $paired_count++;
+            print $ofh2 $id_line,"\n",$seq2;
+        }else{
+            print $ofhse $id_line,"\n",$seq2;
+            $se2_count++;
+        }
+    }
+    close $fh2;
+    ($fh1,$pid) = open_file($r1);
+    while(<$fh1>){
+        chomp;
+        next unless $_ =~ /\S/;
+        next if ($_ =~ /length=0/);
+        my $id_line=$_;
+        my ($id) = $id_line =~ /^\@(\S+)\.?\/?1?\s*/;
+        my $seq = <$fh1>;
+        chomp $seq;
+        my $qual_id = <$fh1>;
+        my $qual = <$fh1>;
+        $seq = $seq."\n".$qual_id.$qual;
+        if ($seq_hash{$id} == 2){
+                print $ofh1 $id_line,"\n",$seq;
+        }
+        if ($seq_hash{$id} == 1){
+                print $ofhse $id_line,"\n",$seq;
+                $se1_count++;
+        }
+    }
+    close $fh1;
+    close $ofh1;
+    close $ofh2;
+    close $ofhse;
+    printf ("%s\t%d\t%d\t%.2f\n",basename($r1),$read1_count,$read1_total_len,$read1_total_len/$read1_count);
+    printf ("%s\t%d\t%d\t%.2f\n",basename($r2),$read2_count,$read2_total_len,$read2_total_len/$read2_count);
+    printf $count_fh ("%s\t%d\t%d\t%.2f\n",basename($r1),$read1_count,$read1_total_len,$read1_total_len/$read1_count);
+    printf $count_fh ("%s\t%d\t%d\t%.2f\n",basename($r2),$read2_count,$read2_total_len,$read2_total_len/$read2_count);
+    printf ("%d duplicate id from %s\n", $existed_id1, basename($r1)) if ($existed_id1 > 0);
+    printf ("%d duplicate id from %s\n", $existed_id2, basename($r2)) if ($existed_id2 > 0);
+    printf ("There are %d reads from %s don't have corresponding paired read.\n", $se1_count, basename($r1)) if ($se1_count >0); 
+    printf ("There are %d reads from %s don't have corresponding paired read.\n", $se2_count, basename($r2)) if ($se2_count >0);
+
+    unlink $out_se if (-z $out_se);
+    return ($read1_count + $read2_count, $read1_total_len + $read2_total_len);
+}
+
+sub countFastq_exe 
+{
+    my $file=shift;
+    my $output=shift;
+    my $seq_count=0;
+    my $total_length;
+    my ($fh,$pid)= open_file($file);
+    open (my $ofh, ">>$output") or die "Cannot write $output\n";
+    while (<$fh>)
+    { 
+        next unless $_ =~ /\S/;
+	next if ($_ =~ /length=0/);
+        my $id=$_;
+        $id = '@'."seq_$seq_count\n" if ($id =~ /No name/);
+        my $seq=<$fh>;
+        chomp $seq;
+        my $q_id=<$fh>;
+        my $q_seq=<$fh>;
+        my $len = length $seq;
+        $seq_count++;
+        $total_length +=$len;
+        print $ofh "$id$seq\n$q_id$q_seq";
+    }
+    close $fh;
+    return ($seq_count,$total_length);
+}
+
+sub touchFile{
+    my $file=shift;
+    open (my $fh,">",$file) or die "$!";
+    close $fh;
+}
+
+sub open_file
+{
+    my ($file) = @_;
+    print "$file\n"; 
+    my $fh;
+    my $pid;
+    if ( $file=~/\.gz\.?\d?$/i ) { $pid=open($fh, "gunzip -c $file |") or die ("gunzip -c $file: $!"); }
+    else { $pid=open($fh,'<',$file) or die("$file: $!"); }
+    return ($fh,$pid);
+}