Nf phage finder (#9)

Merging two workflows with related functionality.
LANL-Bioinformatics · Sep 23, 2024 · f9ad25d · f9ad25d
1 parent ede9fec
commit f9ad25d
Show file tree

Hide file tree

Showing 92 changed files with 1,336,510 additions and 31 deletions.
diff --git a/phageFinder/Dockerfile b/phageFinder/Dockerfile
@@ -0,0 +1,46 @@
+# syntax=docker/dockerfile:1
+
+# ---- build stage: assemble the phageFinder conda environment and fetch Phage_Finder ----
+FROM continuumio/miniconda3:23.5.2-0 AS build
+
+ENV container=docker
+
+# add conda channels
+RUN conda config --add channels conda-forge \
+    && conda config --add channels bioconda
+
+# Create the (initially empty) environment. Every later install targets it
+# explicitly with `-n phageFinder`, so no `conda init` / `conda activate` is
+# needed: an activation would not persist past its own RUN step anyway.
+RUN conda create -y --name phageFinder
+
+# install dependencies for phageFinder
+# (-y so the build never depends on how a confirmation prompt is answered)
+RUN conda install -y -n phageFinder -c bioconda aragorn
+RUN conda install -y -n phageFinder -c bioconda blast-legacy
+RUN conda install -y -n phageFinder -c bioconda hmmer
+RUN conda install -y -n phageFinder -c bioconda trnascan-se
+# conda-pack is installed outside the phageFinder environment; it is only
+# used below to export that environment.
+RUN conda install -y -c conda-forge conda-pack
+
+# Add this project's helper scripts to the environment before it is packed.
+# The destination ends with "/" because the wildcard can match several files,
+# in which case the destination must be a directory.
+ADD bin/*.pl /opt/conda/envs/phageFinder/bin/
+
+# pack the environment and unpack it under /venv for the runtime image
+RUN conda-pack -n phageFinder -o /tmp/env.tar && \
+    mkdir /venv && cd /venv && tar xf /tmp/env.tar && \
+    rm /tmp/env.tar
+
+RUN /venv/bin/conda-unpack
+
+# We need the version of phage_finder from EDGE's third-party software tarball
+# to handle some bugs.
+RUN wget https://ref-db.edgebioinformatics.org/EDGE/dev/edge_dev_thirdParty_softwares.tgz \
+    && tar -xvzf edge_dev_thirdParty_softwares.tgz \
+    && tar -xvzf thirdParty/phage_finder_v2.1.tar.gz -C .
+
+# make the extracted Phage_Finder files readable/executable by every user
+RUN chmod -R a+rx phage_finder_v2.1/*
+
+# ---- runtime stage: only the packed environment and Phage_Finder are copied over ----
+FROM debian:latest AS runtime
+
+COPY --from=build /venv /venv
+COPY --from=build /phage_finder_v2.1 /venv/opt/phage_finder_v2.1
+ENV PATH=/venv/opt/phage_finder_v2.1/bin:/venv/bin:$PATH
+
+SHELL ["/bin/bash", "-c"]
+# exec form: start bash directly instead of wrapping it in another shell
+CMD ["/bin/bash"]
diff --git a/phageFinder/bin/phageFinder_prepare.pl b/phageFinder/bin/phageFinder_prepare.pl
@@ -0,0 +1,101 @@
+#!/usr/bin/env perl
+# Purpose: prepare files for phage finder. 
+# This script takes a GFF file from Prokka as input, and produces a
+# phage_finder_info.txt (protein table)
+# Written by Chien-Chi Lo
+# 16 Oct 2014
+
+use strict;
+use warnings;
+use Getopt::Long;
+use File::Basename;
+
+my $outDir;
+my $version=0.1;
+GetOptions(
+            "o=s"              => \$outDir,
+            "version"          => sub{print "Version $version\n";exit;},
+            "help|?"           => sub{Usage()} )
+    or Usage(1);
+
+# Exactly two positional arguments (GFF file, FASTA file) plus an output
+# directory are required.  The argument handed to Usage() marks the call as
+# an error path.
+Usage(1) if @ARGV != 2;
+unless ( defined $outDir and length $outDir ) { print "Output directory (-o) not given\n"; Usage(1); }
+my ($gff_file, $fasta_file) = @ARGV;
+unless ( -e $gff_file )   { print "GFF File not exist\n"; Usage(1); }
+unless ( -e $fasta_file ) { print "Genome/Contig fasta file not exist\n"; Usage(1); }
+open(my $fh, '<', $gff_file) or die "Cannot open GFF file: $!\n";
+
+my %len;      # original contig id => sequence length
+my %id_map;   # original contig id => generated "SequenceNNNNNNN" id
+my $id_map_file="$outDir/id_map.txt";
+# Perl's string increment turns "Sequence0000001" into "Sequence0000002", ...
+my $seq_id="Sequence0000001";
+
+## rename fasta file to mapped id
+# Writes Assembly.con (same sequences, headers replaced by the generated ids)
+# and id_map.txt (generated id <TAB> original id), and records each contig's
+# length for the protein table below.
+my $new_fasta="$outDir/Assembly.con";
+open(my $ofh, '>', $new_fasta) or die "Cannot write $new_fasta: $!\n";
+open(my $ffh, '<', $fasta_file) or die "Cannot open Fasta file: $!\n";
+open(my $id_fh, '>', $id_map_file) or die "Cannot write $id_map_file: $!\n";
+my $contig_id;
+while (my $line = <$ffh>)
+{
+    # strip LF or CRLF so a stray "\r" is never counted as a base
+    $line =~ s/\r?\n\z//;
+    if ($line =~ /^>(\S+)/)
+    {
+        $contig_id = $1;
+        $id_map{$contig_id} = $seq_id;
+        $len{$contig_id} = 0;
+        print $id_fh "$seq_id\t$contig_id\n";
+        print $ofh ">$seq_id\n";
+        $seq_id++;
+    }
+    else
+    {
+        # Only the length is needed, so the sequence itself is not kept in
+        # memory.  Lines before the first header belong to no contig.
+        $len{$contig_id} += length($line) if defined $contig_id;
+        print $ofh $line,"\n";
+    }
+}
+
+close $ffh;
+close $id_fh or die "Cannot write $id_map_file: $!\n";
+close $ofh or die "Cannot write $new_fasta: $!\n";
+
+## prepare phage_finder_info file
+# One line per CDS feature:
+#   mapped contig id, contig length, locus tag, start, end, product
+my $info_file = "$outDir/phage_finder_info.txt";
+open (my $ph_fh, '>', $info_file) or die "Cannot write $outDir/phage_finder_info.txt: $!\n";
+while (my $line = <$fh>)  # each GFF line
+{
+    $line =~ s/\r?\n\z//;
+
+    # Anything after the ##FASTA directive is sequence data, not features.
+    last if $line =~ /^##FASTA/;
+
+    if ($line =~ /^#+sequence-region\b/)
+    {
+        # "##sequence-region <id> <start> <end>": the declared region length
+        # replaces the length measured from the fasta file.
+        my (undef, $region_id, $start, $end) = split /\s+/, $line;
+        if (defined $end and $start =~ /^\d+$/ and $end =~ /^\d+$/)
+        {
+            $len{$region_id} = $end - $start + 1;
+        }
+        next;
+    }
+    next if $line =~ /^#/;   # other directives and comments
+
+    my ($contig, $source, $type, $start, $end, $score, $strand, $phase, $attributes) = split /\t/, $line;
+    next unless defined $type and $type eq "CDS";
+
+    my $mapped_id = $id_map{$contig};
+    unless (defined $mapped_id)
+    {
+        warn "Skipping CDS on '$contig': sequence not found in fasta file\n";
+        next;
+    }
+
+    # Column 9 is a ';'-separated list of key=value pairs.  Split each pair
+    # on the first '=' only, so a value containing '=' (or a token without
+    # one) cannot shift the remaining keys and values against each other.
+    my %annotations;
+    $attributes = "" unless defined $attributes;
+    for my $pair (split /;/, $attributes)
+    {
+        my ($key, $value) = split /=/, $pair, 2;
+        next unless defined $key and length $key and defined $value;
+        $annotations{$key} = $value;
+    }
+
+    my $product = $annotations{"product"} || $annotations{"Note"} ||  $annotations{"function"} || "Unknown" ;
+    my $locus_tag = $annotations{"locus_tag"} || $annotations{"Name"} || "";
+    # undo the URL-escaping of ',' and ';' in the product text
+    $product =~ s/\%2C/,/g;
+    $product =~ s/\%3B/;/g;
+    print $ph_fh join("\t",$mapped_id,$len{$contig},$locus_tag,$start,$end,$product),"\n";
+}
+close $ph_fh or die "Cannot write $outDir/phage_finder_info.txt: $!\n";
+close $fh;
+
+
+# Print the usage text and terminate the script.
+#
+# Takes an optional exit status (default 0).  With a non-zero status the text
+# goes to STDERR so error paths can signal failure; with no argument the
+# behaviour is unchanged: text on STDOUT, exit status 0.
+sub Usage
+{
+    my ($exit_code) = @_;
+    $exit_code = 0 unless defined $exit_code;
+    my $out = $exit_code ? \*STDERR : \*STDOUT;
+    print $out <<"END";
+    Usage: perl $0 -o outDir GFF_file Fasta_file
+    Version $version
+    -o      Output directory.
+END
+    exit $exit_code;
+}
diff --git a/phageFinder/bin/phageFinder_summary.pl b/phageFinder/bin/phageFinder_summary.pl
@@ -0,0 +1,30 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+use Getopt::Long;
+
+# Rewrites the first column of a Phage_Finder result table (-t) from the
+# generated sequence ids back to the original ids recorded in the id map (-i),
+# and writes the result to phageFinder_summary.txt in the current directory.
+my $id_file;
+my $table;
+
+my $usage = "Usage: perl $0 -t PFPR_tab.txt -i id_map.txt\n";
+GetOptions(
+    't=s' => \$table,
+    'i=s' => \$id_file
+) or die $usage;
+# Both options are required; fail early instead of opening an undefined path.
+die $usage unless defined $table and defined $id_file;
+
+# generated id => original id (each id map line is "<generated> <original>")
+my %id_map;
+open(my $fh, '<', $id_file) or die "Cannot read id_map.txt ($id_file): $!\n";
+while (my $line = <$fh>)
+{
+    chomp $line;
+    my ($new_id, $original_id) = split ' ', $line;
+    next unless defined $original_id;   # blank or incomplete line
+    $id_map{$new_id} = $original_id;
+}
+close $fh;
+
+open(my $ofh, '>', "phageFinder_summary.txt") or die "Cannot write phageFinder_summary.txt: $!\n";
+open(my $result_fh, '<', $table) or die "Cannot read PFPR_tab.txt ($table): $!\n";
+while (my $line = <$result_fh>)
+{
+    # Fields are split on whitespace and re-joined with tabs.
+    my @fields = split /\s+/, $line;
+    # "exists" rather than a truth test, so an original id such as "0" is
+    # still restored; the @fields guard covers blank lines.
+    if (@fields and exists $id_map{$fields[0]})
+    {
+        $fields[0] = $id_map{$fields[0]};
+    }
+    print $ofh join("\t",@fields),"\n";
+}
+close $result_fh;
+close $ofh or die "Cannot write phageFinder_summary.txt: $!\n";
diff --git a/phageFinder/nextflow.config b/phageFinder/nextflow.config
@@ -0,0 +1,15 @@
+// Default pipeline parameters; each can be overridden on the command line
+// or through a -params-file.
+params.outDir  = "."
+params.gffFile = null
+params.faaFile = null
+params.fnaFile = null
+params.numCPU  = 8
+
+// Run containers through Singularity, passing "--compat" to the runtime.
+singularity.enabled    = true
+singularity.runOptions = "--compat"
+
+// Container image used by every process.
+process.container = "apwat/phage_finder:noWrite"
diff --git a/phageFinder/phageFinder.nf b/phageFinder/phageFinder.nf
@@ -0,0 +1,73 @@
+#!/usr/bin/env nextflow
+
+
+// Runs phageFinder_prepare.pl on the GFF and FASTA inputs, writing its
+// output into the task directory.
+process phageFinderPrep {
+
+    input:
+    path annotationGff
+    path contigFasta
+
+    output:
+    // id_map.txt gets its own channel because it is needed again after
+    // Phage_Finder has run
+    path "id_map.txt", emit: idMap
+    // every output file is handed to the next process
+    path "*", emit: allPFoutput
+
+    script:
+    """
+    phageFinder_prepare.pl -o . ${annotationGff} ${contigFasta}
+    """
+}
+
+// Runs phage_finder_v2.1.sh (which must be on PATH) on the prepared files.
+// The protein FASTA is staged as Assembly.pep, and the script is invoked
+// with the prefix "Assembly". Only log.txt is published.
+process phageFinder {
+    publishDir "${params.outDir}/AssemblyBasedAnalysis/Prophage", mode: 'copy', pattern: "log.txt"
+
+    input:
+    path preparedFiles
+    path proteins, stageAs: "Assembly.pep"
+
+    output:
+    path "PFPR_tab.txt", emit: phageTable
+    path "log.txt"
+
+    script:
+    """
+    phage_finder_v2.1.sh Assembly ${params.numCPU} > log.txt 2>&1
+    """
+}
+
+// Runs phageFinder_summary.pl on the Phage_Finder table and the id map, and
+// publishes the resulting phageFinder_summary.txt.
+process summary {
+    publishDir "${params.outDir}/AssemblyBasedAnalysis/Prophage", mode: 'copy'
+
+    input:
+    path idMapFile
+    path phageTable
+
+    output:
+    path "phageFinder_summary.txt"
+
+    script:
+    """
+    phageFinder_summary.pl -t ${phageTable} -i ${idMapFile}
+    """
+}
+
+
+// Entry point: prepare the inputs, run Phage_Finder, then summarise its table.
+workflow {
+    // Each input file must exist; an empty protein file is dropped from its
+    // channel by the filter.
+    gffInput     = channel.fromPath(params.gffFile, checkIfExists: true)
+    fastaInput   = channel.fromPath(params.fnaFile, checkIfExists: true)
+    proteinInput = channel.fromPath(params.faaFile, checkIfExists: true).filter { faa -> faa.size() > 0 }
+
+    phageFinderPrep(gffInput, fastaInput)
+    phageFinder(phageFinderPrep.out.allPFoutput, proteinInput)
+    summary(phageFinderPrep.out.idMap, phageFinder.out.phageTable)
+}
diff --git a/phageFinder/test_files/parameters/basic.json b/phageFinder/test_files/parameters/basic.json
@@ -0,0 +1,7 @@
+{
+    "gffFile" : "test_files/phageProj.gff",
+    "faaFile" : "test_files/phageProj.faa",
+    "fnaFile" : "test_files/phageProj.fna",
+    "numCPU" : 4,
+    "outDir": "test_phageFinder"
+}
diff --git a/readsTaxonomyAssignment/Dockerfile b/readsTaxonomyAssignment/Dockerfile
@@ -0,0 +1,87 @@
+# syntax=docker/dockerfile:1
+
+# ---- build stage: assemble the readsTaxonomyAssignment conda environment ----
+FROM continuumio/miniconda3:24.5.0-0 AS build
+
+ENV container=docker
+
+# add conda channels
+RUN conda config --add channels conda-forge \
+    && conda config --add channels bioconda
+
+# Create the (initially empty) environment. Every later install targets it
+# explicitly with `-n readsTaxonomyAssignment`, so no `conda init` /
+# `conda activate` is needed: an activation would not persist past its own
+# RUN step anyway.
+RUN conda create -y --name readsTaxonomyAssignment
+
+# (-y on every conda command so the build never depends on how a
+# confirmation prompt is answered)
+RUN conda install -y -n readsTaxonomyAssignment -c bioconda metaphlan=4.1.1
+RUN conda install -y -n readsTaxonomyAssignment python=3.10
+#the required version of diamond is 2.0.5, but this runs into conda installation problems
+#we will install it here to handle any dependencies, then later replace diamond with the appropriate release
+RUN conda install -y -n readsTaxonomyAssignment -c bioconda diamond
+RUN conda install -y -n readsTaxonomyAssignment -c bioconda perl-json
+RUN conda install -y -n readsTaxonomyAssignment -c bioconda perl-html-template
+RUN conda install -y -n readsTaxonomyAssignment -c bioconda perl-xml-simple
+RUN conda install -y -n readsTaxonomyAssignment -c bioconda perl-excel-writer-xlsx
+RUN conda install -y -n readsTaxonomyAssignment -c bioconda kraken2
+RUN conda install -y -n readsTaxonomyAssignment -c bioconda krona
+RUN conda install -y -n readsTaxonomyAssignment -c bioconda perl-yaml
+RUN conda install -y -n readsTaxonomyAssignment -c bioconda centrifuge
+RUN conda install -y -n readsTaxonomyAssignment -c bioconda gottcha2
+RUN conda install -y -n readsTaxonomyAssignment -c bioconda minimap2=2.17
+RUN conda install -y -n readsTaxonomyAssignment -c bioconda pybedtools
+RUN conda install -y -n readsTaxonomyAssignment -c conda-forge parallel
+#conda does not have the most recent version of gottcha (1.0b instead of 1.0c),
+#but we encounter errors when compiling splitrim.d in gottcha's latest source code release.
+#we will install gottcha here and later replace the non-splitrim scripts with the latest source code.
+RUN conda install -y -n readsTaxonomyAssignment -c bioconda gottcha
+RUN conda install -y -n readsTaxonomyAssignment -c bioconda bowtie2=2.5.1
+# perl-html-template, perl-xml-simple and perl-excel-writer-xlsx are already
+# installed above, so they are not installed a second time here.
+RUN conda install -y -n readsTaxonomyAssignment -c bioconda perl-bioperl
+RUN conda install -y -n readsTaxonomyAssignment -c conda-forge perl-app-cpanminus
+RUN conda install -y -n readsTaxonomyAssignment -c bioconda perl-bioperl-core
+RUN conda install -y -n readsTaxonomyAssignment -c conda-forge cairosvg=2.7.1
+# conda-pack is installed outside the readsTaxonomyAssignment environment; it
+# is only used below to export that environment.
+RUN conda install -y -c conda-forge conda-pack
+
+
+#PanGIA: the latest upstream release differs from the version in EDGE's
+#third-party software in ways that break scripts, so we take EDGE's version
+#from its third-party tarball, even though it's a wastefully large download.
+RUN wget https://ref-db.edgebioinformatics.org/EDGE/dev/edge_dev_thirdParty_softwares.tgz \
+    && tar -xvzf edge_dev_thirdParty_softwares.tgz \
+    && tar -xvzf thirdParty/pangia-1.0.0.tar.gz -C . \
+    && mv pangia /opt/conda/envs/readsTaxonomyAssignment/opt
+
+#correct diamond version
+RUN wget https://github.com/bbuchfink/diamond/releases/download/v2.0.5/diamond-linux64.tar.gz \
+    && tar -xvzf diamond-linux64.tar.gz
+
+#correct gottcha version
+RUN wget https://github.com/LANL-Bioinformatics/GOTTCHA/archive/refs/tags/1.0c.tar.gz \
+    && tar -xvzf 1.0c.tar.gz \
+    && chmod 755 GOTTCHA-1.0c/src/*.pl
+
+
+#add scripts from this project to bin
+#(the destination ends with "/" because the wildcard can match several files)
+ADD bin/* /opt/conda/envs/readsTaxonomyAssignment/bin/
+
+#pack environment for runtime image
+RUN conda-pack -n readsTaxonomyAssignment -o /tmp/env.tar && \
+    mkdir /venv && cd /venv && tar xf /tmp/env.tar && \
+    rm /tmp/env.tar
+
+RUN /venv/bin/conda-unpack
+
+# ---- runtime stage: packed environment plus the replacement diamond and GOTTCHA scripts ----
+FROM debian:latest AS runtime
+
+COPY --from=build /venv /venv
+# Trailing "/" makes these explicit copies into the bin directory, so the
+# files downloaded above replace the conda-installed ones of the same name.
+COPY --from=build /diamond /venv/bin/
+COPY --from=build /GOTTCHA-1.0c/src/*.pl /venv/bin/
+
+#add environment, pangia base and vis-scripts to PATH
+ENV PATH=/venv/bin:/venv/opt/pangia:/venv/opt/pangia/pangia-vis/scripts:/venv/opt/krona:$PATH
+ENV PERL5LIB=/venv/lib/perl5/core_perl/
+
+
+SHELL ["/bin/bash", "-c"]
+# exec form: start bash directly instead of wrapping it in another shell
+CMD ["/bin/bash"]