Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Nf reads taxonomy #8

Merged
merged 22 commits into from
Sep 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
87 changes: 87 additions & 0 deletions readsTaxonomyAssignment/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
# syntax=docker/dockerfile:1
FROM continuumio/miniconda3:24.5.0-0 AS build

ENV container=docker

# add conda channels
RUN conda config --add channels conda-forge \
&& conda config --add channels bioconda

RUN conda init bash \
&& . ~/.bashrc \
&& conda create --name readsTaxonomyAssignment \
&& conda activate readsTaxonomyAssignment

RUN conda install -n readsTaxonomyAssignment -c bioconda metaphlan=4.1.1
RUN conda install -n readsTaxonomyAssignment python=3.10
#the required version of diamond is 2.0.5, but this runs into conda installation problems
#we will install it here to handle any dependencies, then later replace diamond with the appropriate release
RUN conda install -n readsTaxonomyAssignment -c bioconda diamond
RUN conda install -n readsTaxonomyAssignment -c bioconda perl-json
RUN conda install -n readsTaxonomyAssignment -c bioconda perl-html-template
RUN conda install -n readsTaxonomyAssignment -c bioconda perl-xml-simple
RUN conda install -n readsTaxonomyAssignment -c bioconda perl-excel-writer-xlsx
RUN conda install -n readsTaxonomyAssignment -c bioconda kraken2
RUN conda install -n readsTaxonomyAssignment -c bioconda krona
RUN conda install -n readsTaxonomyAssignment -c bioconda perl-yaml
RUN conda install -n readsTaxonomyAssignment -c bioconda centrifuge
RUN conda install -n readsTaxonomyAssignment -c bioconda gottcha2
RUN conda install -n readsTaxonomyAssignment -c bioconda minimap2=2.17
RUN conda install -n readsTaxonomyAssignment -c bioconda pybedtools
RUN conda install -n readsTaxonomyAssignment -c conda-forge parallel
#conda does not have the most recent version of gottcha (1.0b instead of 1.0c),
#but we encounter errors when compiling splitrim.d in gottcha's latest source code release.
#we will install gottcha here and later replace the non-splitrim scripts with the latest source code.
RUN conda install -n readsTaxonomyAssignment -c bioconda gottcha
RUN conda install -n readsTaxonomyAssignment -c bioconda bowtie2=2.5.1
RUN conda install -n readsTaxonomyAssignment -c bioconda perl-html-template
RUN conda install -n readsTaxonomyAssignment -c bioconda perl-xml-simple
RUN conda install -n readsTaxonomyAssignment -c bioconda perl-excel-writer-xlsx
RUN conda install -n readsTaxonomyAssignment -c bioconda perl-bioperl
RUN conda install -n readsTaxonomyAssignment -c conda-forge perl-app-cpanminus
RUN conda install -n readsTaxonomyAssignment -c bioconda perl-bioperl-core
RUN conda install -n readsTaxonomyAssignment -c conda-forge cairosvg=2.7.1
RUN conda install -c conda-forge conda-pack


#download latest PanGIA
#ISSUE: differs from version in EDGE's third-party software, in ways that break scripts
#we can get EDGE's version from its third-party tarball, but it's a wastefully large download.
RUN wget https://ref-db.edgebioinformatics.org/EDGE/dev/edge_dev_thirdParty_softwares.tgz \
&& tar -xvzf edge_dev_thirdParty_softwares.tgz \
&& tar -xvzf thirdParty/pangia-1.0.0.tar.gz -C . \
&& mv pangia /opt/conda/envs/readsTaxonomyAssignment/opt

#correct diamond version
RUN wget https://github.com/bbuchfink/diamond/releases/download/v2.0.5/diamond-linux64.tar.gz \
&& tar -xvzf diamond-linux64.tar.gz

#correct gottcha version
RUN wget https://github.com/LANL-Bioinformatics/GOTTCHA/archive/refs/tags/1.0c.tar.gz \
&& tar -xvzf 1.0c.tar.gz \
&& chmod 755 GOTTCHA-1.0c/src/*.pl


#add scripts from this project to bin
ADD bin/* /opt/conda/envs/readsTaxonomyAssignment/bin

#pack environment for runtime image
RUN conda-pack -n readsTaxonomyAssignment -o /tmp/env.tar && \
mkdir /venv && cd /venv && tar xf /tmp/env.tar && \
rm /tmp/env.tar

RUN /venv/bin/conda-unpack

FROM debian:latest AS runtime

COPY --from=build /venv /venv
COPY --from=build /diamond /venv/bin
COPY --from=build /GOTTCHA-1.0c/src/*.pl /venv/bin

#add environment, pangia base and vis-scripts to PATH
ENV PATH=/venv/bin:/venv/opt/pangia:/venv/opt/pangia/pangia-vis/scripts:/venv/opt/krona:$PATH
ENV PERL5LIB=/venv/lib/perl5/core_perl/


SHELL ["/bin/bash", "-c"]
CMD /bin/bash
198 changes: 198 additions & 0 deletions readsTaxonomyAssignment/bin/getAvgLen.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,198 @@
#!/usr/bin/env perl

use strict;
use warnings;
use File::Basename;
use Getopt::Long;

my $outputDir='';

my @unpairedList= ();
my @pairedList= ();

GetOptions(
'u:s{1,}' => \@unpairedList,
'p:s{1,}' => \@pairedList,
'd=s' => \$outputDir,
);


my $all_R1_fastq = "$outputDir/all.1.fastq";
my $all_R2_fastq = "$outputDir/all.2.fastq";
my $all_SE_fastq = "$outputDir/all.se.fastq";
my $count_file_list= "$outputDir/fastqCount.txt";
my $avg_read_len= 0 ;
my $PE_count = 0 ;
my $PE_total_len = 0;
my $SE_count= 0 ;
my $SE_total_len = 0 ;



open (my $fh, ">$count_file_list") or die "Cannot write $count_file_list\n";
while(my ($R1,$R2) = splice (@pairedList,0,2))
{
($PE_count, $PE_total_len) = &countPE_exe($R1,$R2,$all_R1_fastq,$all_R2_fastq,$all_SE_fastq,$fh);
}
foreach my $file (@unpairedList)
{
($SE_count,$SE_total_len)=&countFastq_exe($file,$all_SE_fastq);
printf $fh ("%s\t%d\t%d\t%.2f\n",basename($file),$SE_count,$SE_total_len,$SE_total_len/$SE_count);
printf ("%s\t%d\t%d\t%.2f\n",basename($file),$SE_count,$SE_total_len,$SE_total_len/$SE_count);
}
close $fh;
$avg_read_len = ($PE_count + $SE_count) > 0 ? ($PE_total_len + $SE_total_len) / ($PE_count + $SE_count) : 0 ;

sub countPE_exe{
my $r1=shift;
my $r2=shift;
my $out_r1=shift;
my $out_r2=shift;
my $out_se=shift;
my $count_fh=shift;
my %seq_hash;
my $pair_char;
my $unpaired_count=0;
my $read1_count=0;
my $read2_count=0;
my $se2_count=0;
my $se1_count=0;
my $paired_count=0;
my $read1_total_len=0;
my $read2_total_len=0;
my $existed_id1=0;
my $existed_id2=0;
my ($fh1,$pid) = open_file($r1);
open (my $ofh1, ">>$out_r1") or die "Cannot write $out_r1\n";
open (my $ofh2, ">>$out_r2") or die "Cannot write $out_r2\n";
open (my $ofhse, ">>$out_se") or die "Cannot write $out_se\n";
while(<$fh1>){
chomp;
next unless $_ =~ /\S/;
next if ($_ =~ /length=0/);
my $id_line=$_;
my ($id) = $id_line =~ /^\@(\S+).?\/?1?\s*/;
my $seq = <$fh1>;
chomp $seq;
if ($seq_hash{$id}){
$existed_id1++;
}
my $len = length $seq;
$read1_total_len += $len;
my $qual_id = <$fh1>;
my $qual = <$fh1>;
$seq = $seq."\n".$qual_id.$qual;
$seq_hash{$id}++;
$read1_count++;
}
close $fh1;
my %seq_hash2;
my ($fh2,$pid2) = open_file($r2);
while(<$fh2>){
chomp;
next unless $_ =~ /\S/;
next if ($_ =~ /length=0/);
my $id_line=$_;
my ($id2) = $id_line =~ /^\@(\S+)\.?\/?2?\s*/;
$read2_count++;
my $seq2 = <$fh2>;
chomp $seq2;
if ($seq_hash2{$id2}){
$existed_id2++;
}
my $len = length $seq2;
$read2_total_len += $len;
my $qual_id = <$fh2>;
my $qual = <$fh2>;
$seq2 = $seq2."\n".$qual_id.$qual;
$seq_hash2{$id2}++;
if ($seq_hash{$id2}){
$seq_hash{$id2}++;
$paired_count++;
print $ofh2 $id_line,"\n",$seq2;
}else{
print $ofhse $id_line,"\n",$seq2;
$se2_count++;
}
}
close $fh2;
($fh1,$pid) = open_file($r1);
while(<$fh1>){
chomp;
next unless $_ =~ /\S/;
next if ($_ =~ /length=0/);
my $id_line=$_;
my ($id) = $id_line =~ /^\@(\S+)\.?\/?1?\s*/;
my $seq = <$fh1>;
chomp $seq;
my $qual_id = <$fh1>;
my $qual = <$fh1>;
$seq = $seq."\n".$qual_id.$qual;
if ($seq_hash{$id} == 2){
print $ofh1 $id_line,"\n",$seq;
}
if ($seq_hash{$id} == 1){
print $ofhse $id_line,"\n",$seq;
$se1_count++;
}
}
close $fh1;
close $ofh1;
close $ofh2;
close $ofhse;
printf ("%s\t%d\t%d\t%.2f\n",basename($r1),$read1_count,$read1_total_len,$read1_total_len/$read1_count);
printf ("%s\t%d\t%d\t%.2f\n",basename($r2),$read2_count,$read2_total_len,$read2_total_len/$read2_count);
printf $count_fh ("%s\t%d\t%d\t%.2f\n",basename($r1),$read1_count,$read1_total_len,$read1_total_len/$read1_count);
printf $count_fh ("%s\t%d\t%d\t%.2f\n",basename($r2),$read2_count,$read2_total_len,$read2_total_len/$read2_count);
printf ("%d duplicate id from %s\n", $existed_id1, basename($r1)) if ($existed_id1 > 0);
printf ("%d duplicate id from %s\n", $existed_id2, basename($r2)) if ($existed_id2 > 0);
printf ("There are %d reads from %s don't have corresponding paired read.\n", $se1_count, basename($r1)) if ($se1_count >0);
printf ("There are %d reads from %s don't have corresponding paired read.\n", $se2_count, basename($r2)) if ($se2_count >0);

unlink $out_se if (-z $out_se);
return ($read1_count + $read2_count, $read1_total_len + $read2_total_len);
}

sub countFastq_exe
{
my $file=shift;
my $output=shift;
my $seq_count=0;
my $total_length;
my ($fh,$pid)= open_file($file);
open (my $ofh, ">>$output") or die "Cannot write $output\n";
while (<$fh>)
{
next unless $_ =~ /\S/;
next if ($_ =~ /length=0/);
my $id=$_;
$id = '@'."seq_$seq_count\n" if ($id =~ /No name/);
my $seq=<$fh>;
chomp $seq;
my $q_id=<$fh>;
my $q_seq=<$fh>;
my $len = length $seq;
$seq_count++;
$total_length +=$len;
print $ofh "$id$seq\n$q_id$q_seq";
}
close $fh;
return ($seq_count,$total_length);
}

sub touchFile{
my $file=shift;
open (my $fh,">",$file) or die "$!";
close $fh;
}

sub open_file
{
my ($file) = @_;
print "$file\n";
my $fh;
my $pid;
if ( $file=~/\.gz\.?\d?$/i ) { $pid=open($fh, "gunzip -c $file |") or die ("gunzip -c $file: $!"); }
else { $pid=open($fh,'<',$file) or die("$file: $!"); }
return ($fh,$pid);
}
Loading