Skip to content

Commit

Permalink
Nf phage finder (#9)
Browse files Browse the repository at this point in the history
Merging two workflows with related functionality.
  • Loading branch information
aw-watson authored Sep 23, 2024
1 parent ede9fec commit f9ad25d
Show file tree
Hide file tree
Showing 92 changed files with 1,336,510 additions and 31 deletions.
46 changes: 46 additions & 0 deletions phageFinder/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# syntax=docker/dockerfile:1

# Two-stage build for the phageFinder workflow image.
#   build   - creates the "phageFinder" conda environment, adds this project's
#             Perl helper scripts to it, packs it with conda-pack, and unpacks
#             phage_finder_v2.1 from EDGE's third-party software tarball.
#   runtime - Debian image that receives only the packed environment and the
#             phage_finder_v2.1 tree.
FROM continuumio/miniconda3:23.5.2-0 AS build

ENV container=docker

# add conda channels
RUN conda config --add channels conda-forge \
    && conda config --add channels bioconda

# Create the environment. The activation only lasts for this RUN step, which
# is why every later install names the environment explicitly with -n.
RUN conda init bash \
    && . ~/.bashrc \
    && conda create -y --name phageFinder \
    && conda activate phageFinder

# install dependencies for phageFinder
# (-y answers the confirmation prompt explicitly; the build has no terminal)
RUN conda install -y -n phageFinder -c bioconda aragorn
RUN conda install -y -n phageFinder -c bioconda blast-legacy
RUN conda install -y -n phageFinder -c bioconda hmmer
RUN conda install -y -n phageFinder -c bioconda trnascan-se
# conda-pack is installed without -n, i.e. outside the phageFinder environment
RUN conda install -y -c conda-forge conda-pack

# Helper scripts from this project. COPY is enough for plain local files, and
# the trailing slash marks the destination as a directory, which is required
# when the wildcard matches more than one file.
COPY bin/*.pl /opt/conda/envs/phageFinder/bin/

RUN conda-pack -n phageFinder -o /tmp/env.tar && \
    mkdir /venv && cd /venv && tar xf /tmp/env.tar && \
    rm /tmp/env.tar

RUN /venv/bin/conda-unpack

# we need the version of phage_finder from EDGE's third-party database to handle some bugs
RUN wget https://ref-db.edgebioinformatics.org/EDGE/dev/edge_dev_thirdParty_softwares.tgz \
    && tar -xvzf edge_dev_thirdParty_softwares.tgz \
    && tar -xvzf thirdParty/phage_finder_v2.1.tar.gz -C .

RUN chmod -R a+rx phage_finder_v2.1/*

FROM debian:latest AS runtime

COPY --from=build /venv /venv
COPY --from=build /phage_finder_v2.1 /venv/opt/phage_finder_v2.1
ENV PATH=/venv/opt/phage_finder_v2.1/bin:/venv/bin:$PATH

SHELL ["/bin/bash", "-c"]
# exec form: bash is started directly rather than through "bash -c /bin/bash"
CMD ["/bin/bash"]
101 changes: 101 additions & 0 deletions phageFinder/bin/phageFinder_prepare.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
#!/usr/bin/env perl
# Purpose: prepare files for phage finder.
# This script takes a GFF file from Prokka and the matching genome/contig
# FASTA file as input, and writes three files into the output directory:
#   Assembly.con           the FASTA with every header renamed to SequenceNNNNNNN
#   id_map.txt             one "new id <TAB> original id" line per sequence
#   phage_finder_info.txt  protein table, one tab-separated line per CDS:
#                          new id, sequence length, locus tag, start, end, product
# Written by Chien-Chi Lo
# 16 Oct 2014

use strict;
use warnings;
use Getopt::Long;
use File::Basename;

my $outDir;
my $version=0.1;
GetOptions(
    "o=s"     => \$outDir,
    "version" => sub { print "Version $version\n"; exit; },
    "help|?"  => sub { Usage() },
);

# Without -o every output path would begin with an undefined value and end up
# directly under "/", so fall back to the current directory.
$outDir = "." unless defined $outDir and length $outDir;

# Argument errors print the usage text and exit non-zero so that a calling
# pipeline can detect the failure; --help still exits with status 0.
if (@ARGV != 2) { Usage(1); }
unless ( -e $ARGV[0] ) { print "GFF File not exist\n"; Usage(1); }
unless ( -e $ARGV[1] ) { print "Genome/Contig fasta file not exist\n"; Usage(1); }
my ($gff_file, $fasta_file) = @ARGV;

# Three-argument open: the file names are never interpreted as open modes.
open(my $fh, '<', $gff_file) or die "Cannot open GFF file: $!\n";

my %len;       # original sequence id => sequence length
my %id_map;    # original sequence id => new SequenceNNNNNNN id
my $id_map_file="$outDir/id_map.txt";
my $seq_id="Sequence0000001";

## rename fasta file to mapped id
my $new_fasta="$outDir/Assembly.con";
open(my $ofh, '>', $new_fasta) or die "Cannot write $new_fasta: $!\n";
open(my $ffh, '<', $fasta_file) or die "Cannot open Fasta file: $!\n";
open(my $id_fh, '>', $id_map_file) or die "Cannot write $id_map_file: $!\n";
my ($id,$seq);
while (my $line = <$ffh>)
{
    chomp $line;
    if ($line =~ /^>(\S+)/)
    {
        # A new header closes the previous record: store its length.
        if ($seq) {
            $len{$id} = length($seq);
        }
        $id = $1;
        $id_map{$id} = $seq_id;
        print $id_fh "$seq_id\t$id\n";
        print $ofh ">$seq_id\n";
        $seq_id++;    # string increment: Sequence0000001 -> Sequence0000002
        $seq = "";
    }
    else
    {
        $seq .= $line;
        print $ofh $line,"\n";
    }
}
$len{$id} = length($seq) if ($seq);

close $ffh;
# Buffered write errors only surface at close, so check the output handles.
close $id_fh or die "Cannot write $id_map_file: $!\n";
close $ofh or die "Cannot write $new_fasta: $!\n";

## prepare phage_finder_info file
my $info_file="$outDir/phage_finder_info.txt";
open(my $ph_fh, '>', $info_file) or die "Cannot write $info_file: $!\n";
while (my $line = <$fh>)    # each LOCUS
{
    chomp $line;

    # Everything after the ##FASTA directive is sequence data, not features.
    last if $line =~ /^##FASTA/;

    if ($line =~ /#sequence-region/)
    {
        # "##sequence-region <id> <start> <end>" overrides the FASTA length.
        my (undef, $region_id, $start, $end) = split /\s+/, $line;
        next unless defined $end;
        $len{$region_id} = $end - $start + 1;
        next;
    }

    my ($seqid,$source,$type,$start,$end,$score,$strand,$phase,$attributes) = split /\t/, $line;
    next unless defined $type and $type eq "CDS";

    # A CDS on a sequence that is absent from the FASTA has no mapped id;
    # writing it would produce a row with an empty first column.
    unless (defined $id_map{$seqid}) {
        warn "Skipping CDS on '$seqid': sequence id not found in $fasta_file\n";
        next;
    }

    my $region_len = $len{$seqid};

    # Split each "key=value" pair on the first "=" only, and ignore fields
    # without one, so a single odd field cannot shift every later pair.
    my %annotations;
    for my $pair (split /;/, defined $attributes ? $attributes : '') {
        my ($key, $value) = split /=/, $pair, 2;
        next unless defined $key and length $key and defined $value;
        $annotations{$key} = $value;
    }

    my $product = $annotations{"product"} || $annotations{"Note"} || $annotations{"function"} || "Unknown" ;
    my $locus_tag = $annotations{"locus_tag"} || $annotations{"Name"} || "";
    # Only the escaped comma and semicolon are decoded.
    $product =~ s/\%2C/,/g;
    $product =~ s/\%3B/;/g;
    print $ph_fh join("\t",$id_map{$seqid},$region_len,$locus_tag,$start,$end,$product),"\n";
}
close $ph_fh or die "Cannot write $info_file: $!\n";
close $fh;


# Print the usage text and exit.
# The optional argument is the exit status; it defaults to 0.
sub Usage
{
    my ($exit_code) = @_;
    $exit_code = 0 unless defined $exit_code;
    print <<"END";
Usage: perl $0 -o outDir GFF_file Fasta_file
Version $version
-o Output directory (default: current directory).
END
    exit $exit_code;
}
30 changes: 30 additions & 0 deletions phageFinder/bin/phageFinder_summary.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#!/usr/bin/env perl
# Purpose: rewrite the first column of a phage finder result table back to
# the original sequence ids.
#   -i  id map file with one "new id <whitespace> original id" pair per line
#   -t  result table whose first column holds the new ids
# The result is written to phageFinder_summary.txt in the current directory,
# with the fields of every line joined by tabs.

use strict;
use warnings;
use Getopt::Long;

my $id_file;
my $table;

GetOptions(
    't=s' => \$table,
    'i=s' => \$id_file
);

# Both options are required; stop here with a clear message instead of
# failing later on an undefined file name.
unless (defined $id_file and defined $table) {
    die "Usage: perl $0 -t result_table -i id_map_file\n";
}

my %id_map;    # new id => original id
open(my $fh, '<', $id_file) or die "Cannot read $id_file: $!\n";
while (my $line = <$fh>) {
    chomp $line;
    my ($new_id, $original_id) = split ' ', $line;
    next unless defined $original_id;    # blank or incomplete line
    $id_map{$new_id} = $original_id;
}
close $fh;

open(my $ofh, '>', "phageFinder_summary.txt") or die "Cannot write phageFinder_summary.txt: $!\n";
open(my $result_fh, '<', $table) or die "Cannot read $table: $!\n";
while (my $line = <$result_fh>)
{
    # Splitting on whitespace also drops the trailing newline.
    my @fields = split /\s+/, $line;
    # Ids without an entry in the map are passed through unchanged.
    if (defined $fields[0] and defined $id_map{$fields[0]}) {
        $fields[0] = $id_map{$fields[0]};
    }
    print $ofh join("\t",@fields),"\n";
}
close $result_fh;
# Buffered write errors only surface at close.
close $ofh or die "Cannot write phageFinder_summary.txt: $!\n";
15 changes: 15 additions & 0 deletions phageFinder/nextflow.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
// Default pipeline parameters. The three input files default to null and
// are read by the workflow via channel.fromPath.
params {
// base directory under which results are published
outDir = "."
// annotation GFF
gffFile = null
// protein FASTA
faaFile = null
// genome/contig FASTA
fnaFile = null
// passed as the second argument to phage_finder_v2.1.sh
numCPU = 8
}

// Run every process inside a Singularity container.
singularity {
enabled = true
// NOTE(review): extra option handed to the container runtime; confirm what
// --compat implies for the Singularity/Apptainer version in use.
runOptions="--compat"

}

// Container image used by all processes.
process.container = "apwat/phage_finder:noWrite"
73 changes: 73 additions & 0 deletions phageFinder/phageFinder.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
#!/usr/bin/env nextflow


// Runs phageFinder_prepare.pl on the annotation GFF and the contig FASTA,
// with the task's working directory as output directory ("-o .").
process phageFinderPrep {

input:
// gff is passed to the script as its first positional argument, fna as its second
path gff
path fna

output:
// id_map.txt is emitted on its own channel and is also matched by "*" below
path "id_map.txt", emit:idMap //separate output declaration for post-PF processing
path "*", emit:allPFoutput //all output files will go into the next process


script:
"""
phageFinder_prepare.pl -o . $gff $fna
"""
}

// Runs phage_finder_v2.1.sh on the prepared files plus the protein FASTA and
// captures everything it prints in log.txt.
process phageFinder {
// Only log.txt is published (pattern); PFPR_tab.txt is passed on through the
// phageTable channel instead.
publishDir(
path: "$params.outDir/AssemblyBasedAnalysis/Prophage",
mode: 'copy',
pattern: "log.txt"
)

input:
// files emitted by phageFinderPrep (allPFoutput)
path prepOut
// the protein FASTA is staged under the fixed name Assembly.pep
path faa, stageAs: "Assembly.pep"

output:
path "PFPR_tab.txt", emit: phageTable
path "log.txt"

//must be on PATH
// NOTE(review): "Assembly" is presumably the file-name prefix shared by
// Assembly.con and Assembly.pep - confirm against phage_finder_v2.1.sh.
script:
"""
phage_finder_v2.1.sh Assembly $params.numCPU 1>log.txt 2>&1
"""

}

// Runs phageFinder_summary.pl on the phage finder result table and the id
// map, and publishes the resulting phageFinder_summary.txt.
process summary {
publishDir(
path: "$params.outDir/AssemblyBasedAnalysis/Prophage",
mode: 'copy'
)

input:
// id map from phageFinderPrep, passed to the script with -i
path idMap
// result table from phageFinder, passed to the script with -t
path pfprTab

output:
path "phageFinder_summary.txt"

script:
"""
phageFinder_summary.pl -t $pfprTab -i $idMap
"""
}


// Entry point: prepare the inputs, run phage finder, then map the sequence
// ids in its result table back to the original names.
workflow {
    // Input channels; a missing file is reported immediately (checkIfExists).
    annotationGff = channel.fromPath(params.gffFile, checkIfExists: true)
    contigFasta = channel.fromPath(params.fnaFile, checkIfExists: true)

    // An empty protein FASTA is filtered out, so the channel stays empty and
    // the processes that depend on it receive no input.
    proteinFasta = channel
        .fromPath(params.faaFile, checkIfExists: true)
        .filter { faa -> faa.size() > 0 }

    phageFinderPrep(annotationGff, contigFasta)
    phageFinder(phageFinderPrep.out.allPFoutput, proteinFasta)
    summary(phageFinderPrep.out.idMap, phageFinder.out.phageTable)
}
7 changes: 7 additions & 0 deletions phageFinder/test_files/parameters/basic.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
{
"gffFile" : "test_files/phageProj.gff",
"faaFile" : "test_files/phageProj.faa",
"fnaFile" : "test_files/phageProj.fna",
"numCPU" : 4,
"outDir": "test_phageFinder"
}
87 changes: 87 additions & 0 deletions readsTaxonomyAssignment/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
# syntax=docker/dockerfile:1

# Two-stage build for the readsTaxonomyAssignment workflow image.
#   build   - creates the "readsTaxonomyAssignment" conda environment, adds
#             PanGIA and this project's scripts to it, packs it with
#             conda-pack, and downloads the diamond and GOTTCHA releases that
#             replace the conda-installed ones.
#   runtime - Debian image that receives the packed environment plus the
#             replacement diamond binary and GOTTCHA Perl scripts.
FROM continuumio/miniconda3:24.5.0-0 AS build

ENV container=docker

# add conda channels
RUN conda config --add channels conda-forge \
    && conda config --add channels bioconda

# Create the environment. The activation only lasts for this RUN step, which
# is why every later install names the environment explicitly with -n.
RUN conda init bash \
    && . ~/.bashrc \
    && conda create -y --name readsTaxonomyAssignment \
    && conda activate readsTaxonomyAssignment

# (-y answers the confirmation prompt explicitly; the build has no terminal)
RUN conda install -y -n readsTaxonomyAssignment -c bioconda metaphlan=4.1.1
RUN conda install -y -n readsTaxonomyAssignment python=3.10
#the required version of diamond is 2.0.5, but this runs into conda installation problems
#we will install it here to handle any dependencies, then later replace diamond with the appropriate release
RUN conda install -y -n readsTaxonomyAssignment -c bioconda diamond
RUN conda install -y -n readsTaxonomyAssignment -c bioconda perl-json
RUN conda install -y -n readsTaxonomyAssignment -c bioconda perl-html-template
RUN conda install -y -n readsTaxonomyAssignment -c bioconda perl-xml-simple
RUN conda install -y -n readsTaxonomyAssignment -c bioconda perl-excel-writer-xlsx
RUN conda install -y -n readsTaxonomyAssignment -c bioconda kraken2
RUN conda install -y -n readsTaxonomyAssignment -c bioconda krona
RUN conda install -y -n readsTaxonomyAssignment -c bioconda perl-yaml
RUN conda install -y -n readsTaxonomyAssignment -c bioconda centrifuge
RUN conda install -y -n readsTaxonomyAssignment -c bioconda gottcha2
RUN conda install -y -n readsTaxonomyAssignment -c bioconda minimap2=2.17
RUN conda install -y -n readsTaxonomyAssignment -c bioconda pybedtools
RUN conda install -y -n readsTaxonomyAssignment -c conda-forge parallel
#conda does not have the most recent version of gottcha (1.0b instead of 1.0c),
#but we encounter errors when compiling splitrim.d in gottcha's latest source code release.
#we will install gottcha here and later replace the non-splitrim scripts with the latest source code.
RUN conda install -y -n readsTaxonomyAssignment -c bioconda gottcha
RUN conda install -y -n readsTaxonomyAssignment -c bioconda bowtie2=2.5.1
# perl-html-template, perl-xml-simple and perl-excel-writer-xlsx are already
# installed above and are not repeated here.
RUN conda install -y -n readsTaxonomyAssignment -c bioconda perl-bioperl
RUN conda install -y -n readsTaxonomyAssignment -c conda-forge perl-app-cpanminus
RUN conda install -y -n readsTaxonomyAssignment -c bioconda perl-bioperl-core
RUN conda install -y -n readsTaxonomyAssignment -c conda-forge cairosvg=2.7.1
# conda-pack is installed without -n, i.e. outside the workflow environment
RUN conda install -y -c conda-forge conda-pack


#PanGIA: the latest release differs from the version in EDGE's third-party
#software in ways that break scripts, so EDGE's version is taken from its
#third-party tarball, even though that is a wastefully large download.
RUN wget https://ref-db.edgebioinformatics.org/EDGE/dev/edge_dev_thirdParty_softwares.tgz \
    && tar -xvzf edge_dev_thirdParty_softwares.tgz \
    && tar -xvzf thirdParty/pangia-1.0.0.tar.gz -C . \
    && mv pangia /opt/conda/envs/readsTaxonomyAssignment/opt

#correct diamond version
RUN wget https://github.com/bbuchfink/diamond/releases/download/v2.0.5/diamond-linux64.tar.gz \
    && tar -xvzf diamond-linux64.tar.gz

#correct gottcha version
RUN wget https://github.com/LANL-Bioinformatics/GOTTCHA/archive/refs/tags/1.0c.tar.gz \
    && tar -xvzf 1.0c.tar.gz \
    && chmod 755 GOTTCHA-1.0c/src/*.pl


#add scripts from this project to bin
# COPY is enough for plain local files, and the trailing slash marks the
# destination as a directory, which is required when the wildcard matches
# more than one file.
COPY bin/* /opt/conda/envs/readsTaxonomyAssignment/bin/

#pack environment for runtime image
RUN conda-pack -n readsTaxonomyAssignment -o /tmp/env.tar && \
    mkdir /venv && cd /venv && tar xf /tmp/env.tar && \
    rm /tmp/env.tar

RUN /venv/bin/conda-unpack

FROM debian:latest AS runtime

COPY --from=build /venv /venv
# These two copies run after the environment copy, so they overwrite the
# conda-installed diamond and GOTTCHA scripts of the same name. The trailing
# slash makes the destination unambiguously a directory.
COPY --from=build /diamond /venv/bin/
COPY --from=build /GOTTCHA-1.0c/src/*.pl /venv/bin/

#add environment, pangia base and vis-scripts to PATH
ENV PATH=/venv/bin:/venv/opt/pangia:/venv/opt/pangia/pangia-vis/scripts:/venv/opt/krona:$PATH
ENV PERL5LIB=/venv/lib/perl5/core_perl/


SHELL ["/bin/bash", "-c"]
# exec form: bash is started directly rather than through "bash -c /bin/bash"
CMD ["/bin/bash"]
Loading

0 comments on commit f9ad25d

Please sign in to comment.