Merge pull request #8 from LANL-Bioinformatics/nf_readsTaxonomy

Nf reads taxonomy
LANL-Bioinformatics · Sep 16, 2024 · b111107 · b111107
2 parents 8fa630c + 7eb69e7
commit b111107
Show file tree

Hide file tree

Showing 69 changed files with 1,335,076 additions and 0 deletions.
diff --git a/readsTaxonomyAssignment/Dockerfile b/readsTaxonomyAssignment/Dockerfile
@@ -0,0 +1,87 @@
+# syntax=docker/dockerfile:1
+FROM continuumio/miniconda3:24.5.0-0 AS build
+
+ENV container=docker
+
+# add conda channels
+RUN conda config --add channels conda-forge \
+    && conda config --add channels bioconda
+
+RUN conda init bash \
+    && . ~/.bashrc \
+    && conda create --name readsTaxonomyAssignment \
+    && conda activate readsTaxonomyAssignment 
+
+RUN conda install -n readsTaxonomyAssignment -c bioconda metaphlan=4.1.1
+RUN conda install -n readsTaxonomyAssignment python=3.10
+#the required version of diamond is 2.0.5, but this runs into conda installation problems
+#we will install it here to handle any dependencies, then later replace diamond with the appropriate release
+RUN conda install -n readsTaxonomyAssignment -c bioconda diamond 
+RUN conda install -n readsTaxonomyAssignment -c bioconda perl-json
+RUN conda install -n readsTaxonomyAssignment -c bioconda perl-html-template
+RUN conda install -n readsTaxonomyAssignment -c bioconda perl-xml-simple
+RUN conda install -n readsTaxonomyAssignment -c bioconda perl-excel-writer-xlsx
+RUN conda install -n readsTaxonomyAssignment -c bioconda kraken2
+RUN conda install -n readsTaxonomyAssignment -c bioconda krona
+RUN conda install -n readsTaxonomyAssignment -c bioconda perl-yaml
+RUN conda install -n readsTaxonomyAssignment -c bioconda centrifuge
+RUN conda install -n readsTaxonomyAssignment -c bioconda gottcha2
+RUN conda install -n readsTaxonomyAssignment -c bioconda minimap2=2.17
+RUN conda install -n readsTaxonomyAssignment -c bioconda pybedtools
+RUN conda install -n readsTaxonomyAssignment -c conda-forge parallel
+#conda does not have the most recent version of gottcha (1.0b instead of 1.0c), 
+#but we encounter errors when compiling splitrim.d in gottcha's latest source code release. 
+#we will install gottcha here and later replace the non-splitrim scripts with the latest source code.
+RUN conda install -n readsTaxonomyAssignment -c bioconda gottcha
+RUN conda install -n readsTaxonomyAssignment -c bioconda bowtie2=2.5.1
+RUN conda install -n readsTaxonomyAssignment -c bioconda perl-html-template
+RUN conda install -n readsTaxonomyAssignment -c bioconda perl-xml-simple
+RUN conda install -n readsTaxonomyAssignment -c bioconda perl-excel-writer-xlsx
+RUN conda install -n readsTaxonomyAssignment -c bioconda perl-bioperl
+RUN conda install -n readsTaxonomyAssignment -c conda-forge perl-app-cpanminus
+RUN conda install -n readsTaxonomyAssignment -c bioconda perl-bioperl-core
+RUN conda install -n readsTaxonomyAssignment -c conda-forge cairosvg=2.7.1
+RUN conda install -c conda-forge conda-pack
+
+
+#download latest PanGIA 
+#ISSUE: differs from version in EDGE's third-party software, in ways that break scripts
+#we can get EDGE's version from its third-party tarball, but it's a wastefully large download.
+RUN wget https://ref-db.edgebioinformatics.org/EDGE/dev/edge_dev_thirdParty_softwares.tgz \
+    && tar -xvzf edge_dev_thirdParty_softwares.tgz \
+    && tar -xvzf thirdParty/pangia-1.0.0.tar.gz -C . \
+    && mv pangia /opt/conda/envs/readsTaxonomyAssignment/opt
+
+#correct diamond version
+RUN wget https://github.com/bbuchfink/diamond/releases/download/v2.0.5/diamond-linux64.tar.gz \
+    && tar -xvzf diamond-linux64.tar.gz
+
+#correct gottcha version
+RUN wget https://github.com/LANL-Bioinformatics/GOTTCHA/archive/refs/tags/1.0c.tar.gz \
+    && tar -xvzf 1.0c.tar.gz \
+    && chmod 755 GOTTCHA-1.0c/src/*.pl
+
+
+#add scripts from this project to bin
+ADD bin/* /opt/conda/envs/readsTaxonomyAssignment/bin
+
+#pack environment for runtime image
+RUN conda-pack -n readsTaxonomyAssignment -o /tmp/env.tar && \
+    mkdir /venv && cd /venv && tar xf /tmp/env.tar && \
+    rm /tmp/env.tar
+
+RUN /venv/bin/conda-unpack
+
+FROM debian:latest AS runtime
+
+COPY --from=build /venv /venv
+COPY --from=build /diamond /venv/bin
+COPY --from=build /GOTTCHA-1.0c/src/*.pl /venv/bin
+
+#add environment, pangia base and vis-scripts to PATH
+ENV PATH=/venv/bin:/venv/opt/pangia:/venv/opt/pangia/pangia-vis/scripts:/venv/opt/krona:$PATH
+ENV PERL5LIB=/venv/lib/perl5/core_perl/
+
+
+SHELL ["/bin/bash", "-c"]
+CMD /bin/bash
diff --git a/readsTaxonomyAssignment/bin/getAvgLen.pl b/readsTaxonomyAssignment/bin/getAvgLen.pl
@@ -0,0 +1,198 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+use File::Basename;
+use Getopt::Long;
+
+my $outputDir='';
+
+my @unpairedList= ();
+my @pairedList=  ();
+
+GetOptions(
+    'u:s{1,}' => \@unpairedList,
+    'p:s{1,}' => \@pairedList,
+    'd=s' => \$outputDir,
+);
+
+
+my $all_R1_fastq = "$outputDir/all.1.fastq";
+my $all_R2_fastq = "$outputDir/all.2.fastq";
+my $all_SE_fastq = "$outputDir/all.se.fastq";
+my $count_file_list= "$outputDir/fastqCount.txt";
+my $avg_read_len= 0 ;
+my $PE_count = 0 ;
+my $PE_total_len = 0;
+my $SE_count= 0 ;
+my $SE_total_len = 0 ;
+
+
+
+open (my $fh, ">$count_file_list") or die "Cannot write $count_file_list\n";
+while(my ($R1,$R2) = splice (@pairedList,0,2))
+{
+    ($PE_count, $PE_total_len) = &countPE_exe($R1,$R2,$all_R1_fastq,$all_R2_fastq,$all_SE_fastq,$fh);
+}
+foreach my $file (@unpairedList)
+{
+    ($SE_count,$SE_total_len)=&countFastq_exe($file,$all_SE_fastq);
+    printf $fh ("%s\t%d\t%d\t%.2f\n",basename($file),$SE_count,$SE_total_len,$SE_total_len/$SE_count);
+    printf ("%s\t%d\t%d\t%.2f\n",basename($file),$SE_count,$SE_total_len,$SE_total_len/$SE_count);
+}
+close $fh;
+$avg_read_len = ($PE_count + $SE_count) > 0 ? ($PE_total_len + $SE_total_len) / ($PE_count + $SE_count) : 0 ;
+
+sub countPE_exe{
+    my $r1=shift;
+    my $r2=shift;
+    my $out_r1=shift;
+    my $out_r2=shift;
+    my $out_se=shift;
+    my $count_fh=shift;
+    my %seq_hash;
+    my $pair_char;
+    my $unpaired_count=0;
+    my $read1_count=0;
+    my $read2_count=0;
+    my $se2_count=0;
+    my $se1_count=0;
+    my $paired_count=0;
+    my $read1_total_len=0;
+    my $read2_total_len=0;
+    my $existed_id1=0;
+    my $existed_id2=0;
+    my ($fh1,$pid) = open_file($r1);
+    open (my $ofh1, ">>$out_r1") or die "Cannot write $out_r1\n";
+    open (my $ofh2, ">>$out_r2") or die "Cannot write $out_r2\n";
+    open (my $ofhse, ">>$out_se") or die "Cannot write $out_se\n";
+    while(<$fh1>){
+        chomp;
+        next unless $_ =~ /\S/;
+        next if ($_ =~ /length=0/);
+        my $id_line=$_;
+        my ($id) = $id_line =~ /^\@(\S+).?\/?1?\s*/;
+        my $seq = <$fh1>;
+        chomp $seq;
+        if ($seq_hash{$id}){
+            $existed_id1++;
+        }
+        my $len = length $seq;
+        $read1_total_len += $len;
+        my $qual_id = <$fh1>;
+        my $qual = <$fh1>;
+        $seq = $seq."\n".$qual_id.$qual;
+        $seq_hash{$id}++;
+        $read1_count++;
+    }
+    close $fh1;
+    my %seq_hash2;
+    my ($fh2,$pid2) = open_file($r2);
+    while(<$fh2>){
+        chomp;
+        next unless $_ =~ /\S/;
+        next if ($_ =~ /length=0/);
+        my $id_line=$_;
+        my ($id2) = $id_line =~ /^\@(\S+)\.?\/?2?\s*/;
+        $read2_count++;
+        my $seq2 = <$fh2>;
+        chomp $seq2;
+        if ($seq_hash2{$id2}){
+            $existed_id2++;
+        }
+        my $len = length $seq2;
+        $read2_total_len += $len;
+        my $qual_id = <$fh2>;
+        my $qual = <$fh2>;
+        $seq2 = $seq2."\n".$qual_id.$qual;
+        $seq_hash2{$id2}++;
+        if ($seq_hash{$id2}){
+            $seq_hash{$id2}++;
+            $paired_count++;
+            print $ofh2 $id_line,"\n",$seq2;
+        }else{
+            print $ofhse $id_line,"\n",$seq2;
+            $se2_count++;
+        }
+    }
+    close $fh2;
+    ($fh1,$pid) = open_file($r1);
+    while(<$fh1>){
+        chomp;
+        next unless $_ =~ /\S/;
+        next if ($_ =~ /length=0/);
+        my $id_line=$_;
+        my ($id) = $id_line =~ /^\@(\S+)\.?\/?1?\s*/;
+        my $seq = <$fh1>;
+        chomp $seq;
+        my $qual_id = <$fh1>;
+        my $qual = <$fh1>;
+        $seq = $seq."\n".$qual_id.$qual;
+        if ($seq_hash{$id} == 2){
+                print $ofh1 $id_line,"\n",$seq;
+        }
+        if ($seq_hash{$id} == 1){
+                print $ofhse $id_line,"\n",$seq;
+                $se1_count++;
+        }
+    }
+    close $fh1;
+    close $ofh1;
+    close $ofh2;
+    close $ofhse;
+    printf ("%s\t%d\t%d\t%.2f\n",basename($r1),$read1_count,$read1_total_len,$read1_total_len/$read1_count);
+    printf ("%s\t%d\t%d\t%.2f\n",basename($r2),$read2_count,$read2_total_len,$read2_total_len/$read2_count);
+    printf $count_fh ("%s\t%d\t%d\t%.2f\n",basename($r1),$read1_count,$read1_total_len,$read1_total_len/$read1_count);
+    printf $count_fh ("%s\t%d\t%d\t%.2f\n",basename($r2),$read2_count,$read2_total_len,$read2_total_len/$read2_count);
+    printf ("%d duplicate id from %s\n", $existed_id1, basename($r1)) if ($existed_id1 > 0);
+    printf ("%d duplicate id from %s\n", $existed_id2, basename($r2)) if ($existed_id2 > 0);
+    printf ("There are %d reads from %s don't have corresponding paired read.\n", $se1_count, basename($r1)) if ($se1_count >0); 
+    printf ("There are %d reads from %s don't have corresponding paired read.\n", $se2_count, basename($r2)) if ($se2_count >0);
+
+    unlink $out_se if (-z $out_se);
+    return ($read1_count + $read2_count, $read1_total_len + $read2_total_len);
+}
+
+sub countFastq_exe 
+{
+    my $file=shift;
+    my $output=shift;
+    my $seq_count=0;
+    my $total_length;
+    my ($fh,$pid)= open_file($file);
+    open (my $ofh, ">>$output") or die "Cannot write $output\n";
+    while (<$fh>)
+    { 
+        next unless $_ =~ /\S/;
+	next if ($_ =~ /length=0/);
+        my $id=$_;
+        $id = '@'."seq_$seq_count\n" if ($id =~ /No name/);
+        my $seq=<$fh>;
+        chomp $seq;
+        my $q_id=<$fh>;
+        my $q_seq=<$fh>;
+        my $len = length $seq;
+        $seq_count++;
+        $total_length +=$len;
+        print $ofh "$id$seq\n$q_id$q_seq";
+    }
+    close $fh;
+    return ($seq_count,$total_length);
+}
+
+sub touchFile{
+    my $file=shift;
+    open (my $fh,">",$file) or die "$!";
+    close $fh;
+}
+
+sub open_file
+{
+    my ($file) = @_;
+    print "$file\n"; 
+    my $fh;
+    my $pid;
+    if ( $file=~/\.gz\.?\d?$/i ) { $pid=open($fh, "gunzip -c $file |") or die ("gunzip -c $file: $!"); }
+    else { $pid=open($fh,'<',$file) or die("$file: $!"); }
+    return ($fh,$pid);
+}