-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #8 from LANL-Bioinformatics/nf_readsTaxonomy
Nf reads taxonomy
- Loading branch information
Showing
69 changed files
with
1,335,076 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,87 @@ | ||
# syntax=docker/dockerfile:1 | ||
FROM continuumio/miniconda3:24.5.0-0 AS build | ||
|
||
ENV container=docker | ||
|
||
# add conda channels | ||
RUN conda config --add channels conda-forge \ | ||
&& conda config --add channels bioconda | ||
|
||
RUN conda init bash \ | ||
&& . ~/.bashrc \ | ||
&& conda create --name readsTaxonomyAssignment \ | ||
&& conda activate readsTaxonomyAssignment | ||
|
||
RUN conda install -n readsTaxonomyAssignment -c bioconda metaphlan=4.1.1 | ||
RUN conda install -n readsTaxonomyAssignment python=3.10 | ||
#the required version of diamond is 2.0.5, but this runs into conda installation problems | ||
#we will install it here to handle any dependencies, then later replace diamond with the appropriate release | ||
RUN conda install -n readsTaxonomyAssignment -c bioconda diamond | ||
RUN conda install -n readsTaxonomyAssignment -c bioconda perl-json | ||
RUN conda install -n readsTaxonomyAssignment -c bioconda perl-html-template | ||
RUN conda install -n readsTaxonomyAssignment -c bioconda perl-xml-simple | ||
RUN conda install -n readsTaxonomyAssignment -c bioconda perl-excel-writer-xlsx | ||
RUN conda install -n readsTaxonomyAssignment -c bioconda kraken2 | ||
RUN conda install -n readsTaxonomyAssignment -c bioconda krona | ||
RUN conda install -n readsTaxonomyAssignment -c bioconda perl-yaml | ||
RUN conda install -n readsTaxonomyAssignment -c bioconda centrifuge | ||
RUN conda install -n readsTaxonomyAssignment -c bioconda gottcha2 | ||
RUN conda install -n readsTaxonomyAssignment -c bioconda minimap2=2.17 | ||
RUN conda install -n readsTaxonomyAssignment -c bioconda pybedtools | ||
RUN conda install -n readsTaxonomyAssignment -c conda-forge parallel | ||
#conda does not have the most recent version of gottcha (1.0b instead of 1.0c), | ||
#but we encounter errors when compiling splitrim.d in gottcha's latest source code release. | ||
#we will install gottcha here and later replace the non-splitrim scripts with the latest source code. | ||
RUN conda install -n readsTaxonomyAssignment -c bioconda gottcha | ||
RUN conda install -n readsTaxonomyAssignment -c bioconda bowtie2=2.5.1 | ||
RUN conda install -n readsTaxonomyAssignment -c bioconda perl-html-template | ||
RUN conda install -n readsTaxonomyAssignment -c bioconda perl-xml-simple | ||
RUN conda install -n readsTaxonomyAssignment -c bioconda perl-excel-writer-xlsx | ||
RUN conda install -n readsTaxonomyAssignment -c bioconda perl-bioperl | ||
RUN conda install -n readsTaxonomyAssignment -c conda-forge perl-app-cpanminus | ||
RUN conda install -n readsTaxonomyAssignment -c bioconda perl-bioperl-core | ||
RUN conda install -n readsTaxonomyAssignment -c conda-forge cairosvg=2.7.1 | ||
RUN conda install -c conda-forge conda-pack | ||
|
||
|
||
#download latest PanGIA | ||
#ISSUE: differs from version in EDGE's third-party software, in ways that break scripts | ||
#we can get EDGE's version from its third-party tarball, but it's a wastefully large download. | ||
RUN wget https://ref-db.edgebioinformatics.org/EDGE/dev/edge_dev_thirdParty_softwares.tgz \ | ||
&& tar -xvzf edge_dev_thirdParty_softwares.tgz \ | ||
&& tar -xvzf thirdParty/pangia-1.0.0.tar.gz -C . \ | ||
&& mv pangia /opt/conda/envs/readsTaxonomyAssignment/opt | ||
|
||
#correct diamond version | ||
RUN wget https://github.com/bbuchfink/diamond/releases/download/v2.0.5/diamond-linux64.tar.gz \ | ||
&& tar -xvzf diamond-linux64.tar.gz | ||
|
||
#correct gottcha version | ||
RUN wget https://github.com/LANL-Bioinformatics/GOTTCHA/archive/refs/tags/1.0c.tar.gz \ | ||
&& tar -xvzf 1.0c.tar.gz \ | ||
&& chmod 755 GOTTCHA-1.0c/src/*.pl | ||
|
||
|
||
#add scripts from this project to bin | ||
ADD bin/* /opt/conda/envs/readsTaxonomyAssignment/bin | ||
|
||
#pack environment for runtime image | ||
RUN conda-pack -n readsTaxonomyAssignment -o /tmp/env.tar && \ | ||
mkdir /venv && cd /venv && tar xf /tmp/env.tar && \ | ||
rm /tmp/env.tar | ||
|
||
RUN /venv/bin/conda-unpack | ||
|
||
FROM debian:latest AS runtime | ||
|
||
COPY --from=build /venv /venv | ||
COPY --from=build /diamond /venv/bin | ||
COPY --from=build /GOTTCHA-1.0c/src/*.pl /venv/bin | ||
|
||
#add environment, pangia base and vis-scripts to PATH | ||
ENV PATH=/venv/bin:/venv/opt/pangia:/venv/opt/pangia/pangia-vis/scripts:/venv/opt/krona:$PATH | ||
ENV PERL5LIB=/venv/lib/perl5/core_perl/ | ||
|
||
|
||
SHELL ["/bin/bash", "-c"] | ||
CMD /bin/bash |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,198 @@ | ||
#!/usr/bin/env perl | ||
|
||
use strict; | ||
use warnings; | ||
use File::Basename; | ||
use Getopt::Long; | ||
|
||
my $outputDir=''; | ||
|
||
my @unpairedList= (); | ||
my @pairedList= (); | ||
|
||
GetOptions( | ||
'u:s{1,}' => \@unpairedList, | ||
'p:s{1,}' => \@pairedList, | ||
'd=s' => \$outputDir, | ||
); | ||
|
||
|
||
my $all_R1_fastq = "$outputDir/all.1.fastq"; | ||
my $all_R2_fastq = "$outputDir/all.2.fastq"; | ||
my $all_SE_fastq = "$outputDir/all.se.fastq"; | ||
my $count_file_list= "$outputDir/fastqCount.txt"; | ||
my $avg_read_len= 0 ; | ||
my $PE_count = 0 ; | ||
my $PE_total_len = 0; | ||
my $SE_count= 0 ; | ||
my $SE_total_len = 0 ; | ||
|
||
|
||
|
||
open (my $fh, ">$count_file_list") or die "Cannot write $count_file_list\n"; | ||
while(my ($R1,$R2) = splice (@pairedList,0,2)) | ||
{ | ||
($PE_count, $PE_total_len) = &countPE_exe($R1,$R2,$all_R1_fastq,$all_R2_fastq,$all_SE_fastq,$fh); | ||
} | ||
foreach my $file (@unpairedList) | ||
{ | ||
($SE_count,$SE_total_len)=&countFastq_exe($file,$all_SE_fastq); | ||
printf $fh ("%s\t%d\t%d\t%.2f\n",basename($file),$SE_count,$SE_total_len,$SE_total_len/$SE_count); | ||
printf ("%s\t%d\t%d\t%.2f\n",basename($file),$SE_count,$SE_total_len,$SE_total_len/$SE_count); | ||
} | ||
close $fh; | ||
$avg_read_len = ($PE_count + $SE_count) > 0 ? ($PE_total_len + $SE_total_len) / ($PE_count + $SE_count) : 0 ; | ||
|
||
sub countPE_exe{ | ||
my $r1=shift; | ||
my $r2=shift; | ||
my $out_r1=shift; | ||
my $out_r2=shift; | ||
my $out_se=shift; | ||
my $count_fh=shift; | ||
my %seq_hash; | ||
my $pair_char; | ||
my $unpaired_count=0; | ||
my $read1_count=0; | ||
my $read2_count=0; | ||
my $se2_count=0; | ||
my $se1_count=0; | ||
my $paired_count=0; | ||
my $read1_total_len=0; | ||
my $read2_total_len=0; | ||
my $existed_id1=0; | ||
my $existed_id2=0; | ||
my ($fh1,$pid) = open_file($r1); | ||
open (my $ofh1, ">>$out_r1") or die "Cannot write $out_r1\n"; | ||
open (my $ofh2, ">>$out_r2") or die "Cannot write $out_r2\n"; | ||
open (my $ofhse, ">>$out_se") or die "Cannot write $out_se\n"; | ||
while(<$fh1>){ | ||
chomp; | ||
next unless $_ =~ /\S/; | ||
next if ($_ =~ /length=0/); | ||
my $id_line=$_; | ||
my ($id) = $id_line =~ /^\@(\S+).?\/?1?\s*/; | ||
my $seq = <$fh1>; | ||
chomp $seq; | ||
if ($seq_hash{$id}){ | ||
$existed_id1++; | ||
} | ||
my $len = length $seq; | ||
$read1_total_len += $len; | ||
my $qual_id = <$fh1>; | ||
my $qual = <$fh1>; | ||
$seq = $seq."\n".$qual_id.$qual; | ||
$seq_hash{$id}++; | ||
$read1_count++; | ||
} | ||
close $fh1; | ||
my %seq_hash2; | ||
my ($fh2,$pid2) = open_file($r2); | ||
while(<$fh2>){ | ||
chomp; | ||
next unless $_ =~ /\S/; | ||
next if ($_ =~ /length=0/); | ||
my $id_line=$_; | ||
my ($id2) = $id_line =~ /^\@(\S+)\.?\/?2?\s*/; | ||
$read2_count++; | ||
my $seq2 = <$fh2>; | ||
chomp $seq2; | ||
if ($seq_hash2{$id2}){ | ||
$existed_id2++; | ||
} | ||
my $len = length $seq2; | ||
$read2_total_len += $len; | ||
my $qual_id = <$fh2>; | ||
my $qual = <$fh2>; | ||
$seq2 = $seq2."\n".$qual_id.$qual; | ||
$seq_hash2{$id2}++; | ||
if ($seq_hash{$id2}){ | ||
$seq_hash{$id2}++; | ||
$paired_count++; | ||
print $ofh2 $id_line,"\n",$seq2; | ||
}else{ | ||
print $ofhse $id_line,"\n",$seq2; | ||
$se2_count++; | ||
} | ||
} | ||
close $fh2; | ||
($fh1,$pid) = open_file($r1); | ||
while(<$fh1>){ | ||
chomp; | ||
next unless $_ =~ /\S/; | ||
next if ($_ =~ /length=0/); | ||
my $id_line=$_; | ||
my ($id) = $id_line =~ /^\@(\S+)\.?\/?1?\s*/; | ||
my $seq = <$fh1>; | ||
chomp $seq; | ||
my $qual_id = <$fh1>; | ||
my $qual = <$fh1>; | ||
$seq = $seq."\n".$qual_id.$qual; | ||
if ($seq_hash{$id} == 2){ | ||
print $ofh1 $id_line,"\n",$seq; | ||
} | ||
if ($seq_hash{$id} == 1){ | ||
print $ofhse $id_line,"\n",$seq; | ||
$se1_count++; | ||
} | ||
} | ||
close $fh1; | ||
close $ofh1; | ||
close $ofh2; | ||
close $ofhse; | ||
printf ("%s\t%d\t%d\t%.2f\n",basename($r1),$read1_count,$read1_total_len,$read1_total_len/$read1_count); | ||
printf ("%s\t%d\t%d\t%.2f\n",basename($r2),$read2_count,$read2_total_len,$read2_total_len/$read2_count); | ||
printf $count_fh ("%s\t%d\t%d\t%.2f\n",basename($r1),$read1_count,$read1_total_len,$read1_total_len/$read1_count); | ||
printf $count_fh ("%s\t%d\t%d\t%.2f\n",basename($r2),$read2_count,$read2_total_len,$read2_total_len/$read2_count); | ||
printf ("%d duplicate id from %s\n", $existed_id1, basename($r1)) if ($existed_id1 > 0); | ||
printf ("%d duplicate id from %s\n", $existed_id2, basename($r2)) if ($existed_id2 > 0); | ||
printf ("There are %d reads from %s don't have corresponding paired read.\n", $se1_count, basename($r1)) if ($se1_count >0); | ||
printf ("There are %d reads from %s don't have corresponding paired read.\n", $se2_count, basename($r2)) if ($se2_count >0); | ||
|
||
unlink $out_se if (-z $out_se); | ||
return ($read1_count + $read2_count, $read1_total_len + $read2_total_len); | ||
} | ||
|
||
sub countFastq_exe | ||
{ | ||
my $file=shift; | ||
my $output=shift; | ||
my $seq_count=0; | ||
my $total_length; | ||
my ($fh,$pid)= open_file($file); | ||
open (my $ofh, ">>$output") or die "Cannot write $output\n"; | ||
while (<$fh>) | ||
{ | ||
next unless $_ =~ /\S/; | ||
next if ($_ =~ /length=0/); | ||
my $id=$_; | ||
$id = '@'."seq_$seq_count\n" if ($id =~ /No name/); | ||
my $seq=<$fh>; | ||
chomp $seq; | ||
my $q_id=<$fh>; | ||
my $q_seq=<$fh>; | ||
my $len = length $seq; | ||
$seq_count++; | ||
$total_length +=$len; | ||
print $ofh "$id$seq\n$q_id$q_seq"; | ||
} | ||
close $fh; | ||
return ($seq_count,$total_length); | ||
} | ||
|
||
sub touchFile{ | ||
my $file=shift; | ||
open (my $fh,">",$file) or die "$!"; | ||
close $fh; | ||
} | ||
|
||
sub open_file | ||
{ | ||
my ($file) = @_; | ||
print "$file\n"; | ||
my $fh; | ||
my $pid; | ||
if ( $file=~/\.gz\.?\d?$/i ) { $pid=open($fh, "gunzip -c $file |") or die ("gunzip -c $file: $!"); } | ||
else { $pid=open($fh,'<',$file) or die("$file: $!"); } | ||
return ($fh,$pid); | ||
} |
Oops, something went wrong.