Skip to content

Commit

Permalink
Nf annotation (#10)
Browse files Browse the repository at this point in the history
* initial commit

* Prokka and RATT running

* generating plots and KEGG pathway views

* Full workflow, with explicit outputs and TODOs resolved.

* Containerized

* comments

* Nf phage finder (#9)

Merging two workflows with related functionality.

* Image version bump and documentation for phageFinder
  • Loading branch information
aw-watson authored Oct 30, 2024
1 parent 4bfec7c commit bb4c8a7
Show file tree
Hide file tree
Showing 25 changed files with 3,772 additions and 1 deletion.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
**/__pycache__/*
.nextflow*
*/work/*
*/logs/*
*/logs/*
**/ec_info/*
46 changes: 46 additions & 0 deletions phageFinder/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# syntax=docker/dockerfile:1
# Build stage: assemble a conda environment holding phage_finder's
# dependencies plus this project's helper scripts, then pack it with
# conda-pack so it can be copied into a plain Debian runtime image.
FROM continuumio/miniconda3:23.5.2-0 AS build

ENV container=docker

# add conda channels
RUN conda config --add channels conda-forge \
    && conda config --add channels bioconda

# Create the (initially empty) phageFinder environment. The activation only
# lasts for this RUN step, which is why every install below targets the
# environment explicitly with -n. -y makes the non-interactive intent explicit.
RUN conda init bash \
    && . ~/.bashrc \
    && conda create -y --name phageFinder \
    && conda activate phageFinder

# install dependencies for phageFinder
RUN conda install -y -n phageFinder -c bioconda aragorn
RUN conda install -y -n phageFinder -c bioconda blast-legacy
RUN conda install -y -n phageFinder -c bioconda hmmer
RUN conda install -y -n phageFinder -c bioconda trnascan-se
# conda-pack is installed without -n, i.e. into the default environment; it is
# only used in this build stage.
RUN conda install -y -c conda-forge conda-pack

# Add this project's Perl helper scripts to the environment's bin directory.
# The wildcard can match several files, so the destination must be a directory
# path ending in "/". COPY is used because these are plain local files.
COPY bin/*.pl /opt/conda/envs/phageFinder/bin/

# Pack the environment and unpack it under /venv, the path used at runtime.
RUN conda-pack -n phageFinder -o /tmp/env.tar && \
    mkdir /venv && cd /venv && tar xf /tmp/env.tar && \
    rm /tmp/env.tar

# Fix up the path prefixes inside the relocated environment.
RUN /venv/bin/conda-unpack

#we need the version of phage_finder from EDGE's third-party database to handle some bugs

RUN wget https://ref-db.edgebioinformatics.org/EDGE/dev/edge_dev_thirdParty_softwares.tgz \
    && tar -xvzf edge_dev_thirdParty_softwares.tgz \
    && tar -xvzf thirdParty/phage_finder_v2.1.tar.gz -C .


RUN chmod -R a+rx phage_finder_v2.1/*

# Runtime stage: only the unpacked environment and phage_finder are copied in.
# NOTE(review): debian:latest is a moving tag; consider pinning a release for
# reproducible images.
FROM debian:latest AS runtime

COPY --from=build /venv /venv
COPY --from=build /phage_finder_v2.1 /venv/opt/phage_finder_v2.1
ENV PATH=/venv/opt/phage_finder_v2.1/bin:/venv/bin:$PATH

SHELL ["/bin/bash", "-c"]
# Exec (JSON) form so bash is started directly instead of via a wrapping shell.
CMD ["/bin/bash"]
101 changes: 101 additions & 0 deletions phageFinder/bin/phageFinder_prepare.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
#!/usr/bin/env perl
# Purpose: prepare files for phage finder.
# This script takes a GFF file from Prokka as input, and produces a
# phage_finder_info.txt (protein table)
# Written by Chien-Chi Lo
# 16 Oct 2014

use strict;
use warnings;
use Getopt::Long;
use File::Basename;

# Command-line handling.
#   -o DIR      output directory (defaults to the current directory)
#   --version   print the script version and exit
#   --help|-?   print usage and exit
# Positional arguments: the Prokka GFF file, then the genome/contig FASTA file.
my $outDir;
my $version=0.1;
GetOptions(
    "o=s"     => \$outDir,
    "version" => sub { print "Version $version\n"; exit; },
    "help|?"  => sub { Usage() },
) or Usage();    # unknown or malformed options: show usage instead of carrying on

# Without -o every output path built later would start with "/" (filesystem
# root); fall back to the current directory instead.
$outDir = "." unless (defined $outDir and length $outDir);

if (@ARGV != 2) { Usage(); }
unless ( -e $ARGV[0] ) { print "GFF File not exist\n"; Usage(); }
unless ( -e $ARGV[1] ) { print "Genome/Contig fasta file not exist\n"; Usage(); }

# Three-argument open: the file name can never be interpreted as an open mode.
# $fh is read later when the protein table is produced.
open(my $fh, '<', $ARGV[0]) or die "Cannot open GFF file $ARGV[0]: $!\n";

my %len;            # sequence length, keyed by the ORIGINAL sequence id
my $cds_count=0;    # incremented for every CDS row written to the protein table
my %id_map;         # original sequence id -> generated "SequenceNNNNNNN" id
my $id_map_file="$outDir/id_map.txt";
my $seq_id="Sequence0000001";

## rename fasta file to mapped id
# Writes two files into $outDir:
#   Assembly.con - the input FASTA with every header replaced by a generated id
#   id_map.txt   - one "generated_id<TAB>original_id" line per sequence
my $new_fasta="$outDir/Assembly.con";
open(my $ofh, '>', $new_fasta) or die "Cannot write $new_fasta: $!\n";
open(my $ffh, '<', $ARGV[1]) or die "Cannot open Fasta file $ARGV[1]: $!\n";
open(my $id_fh, '>', $id_map_file) or die "Cannot write $id_map_file: $!\n";
my ($id,$seq);
while (my $line = <$ffh>)
{
    chomp $line;
    $line =~ s/\r$//;    # tolerate CRLF input so lengths are not inflated by "\r"
    if ($line =~ /^>(\S+)/)
    {
        my $header_id = $1;
        # A new header closes the previous record: store its length first.
        $len{$id} = length($seq) if (defined $id and $seq);
        $id = $header_id;
        $id_map{$id} = $seq_id;
        print {$id_fh} "$seq_id\t$id\n";
        print {$ofh} ">$seq_id\n";
        $seq_id++;       # string increment: Sequence0000001 -> Sequence0000002
        $seq = "";
    }
    else
    {
        $seq .= $line;
        print {$ofh} $line, "\n";
    }
}
# Store the length of the last record; skipped when the file held no header.
$len{$id} = length($seq) if (defined $id and $seq);

close $ffh;
# Buffered write errors only surface at close, so check the output handles.
close $id_fh or die "Cannot finish writing $id_map_file: $!\n";
close $ofh or die "Cannot finish writing $new_fasta: $!\n";

## prepare phage_finder_info file
# Reads the GFF handle $fh and writes one tab-separated row per CDS feature:
#   mapped sequence id, sequence length, locus tag, start, end, product
my $info_file = "$outDir/phage_finder_info.txt";
open (my $ph_fh, '>', $info_file) or die "Cannot write $info_file: $!\n";
my %unmapped_seen;    # sequence ids already reported as missing from the FASTA
while (my $line = <$fh>) # each LOCUS
{
    chomp $line;
    # GFF3 puts the embedded sequences after "##FASTA"; no features follow it.
    last if ($line =~ /^##FASTA/);

    if ($line =~ /#sequence-region/)
    {
        my ($tmp, $region_id, $start, $end) = split /\s+/, $line;
        # Ignore a truncated directive instead of computing with undef.
        next unless (defined $region_id and defined $start and defined $end);
        $len{$region_id} = $end - $start + 1;
        next;
    }

    my ($id,$source,$type,$start,$end,$score,$strand,$phase,$Attributes) = split /\t/, $line;
    next unless (defined $type and $type eq "CDS");

    # Parse "key=value;key=value". The split limit of 2 keeps any "=" inside a
    # value; a pair without "=" is skipped so it cannot shift the later
    # key/value pairing.
    my %annotations;
    my $attribute_text = defined $Attributes ? $Attributes : "";
    for my $pair (split /;/, $attribute_text)
    {
        my ($key, $value) = split /=/, $pair, 2;
        next unless (defined $key and defined $value);
        $annotations{$key} = $value;
    }

    my $product = $annotations{"product"} || $annotations{"Note"} || $annotations{"function"} || "Unknown" ;
    my $locus_tag = $annotations{"locus_tag"} || $annotations{"Name"} || "";
    # Decode the two percent-escapes handled here: comma and semicolon.
    $product =~ s/\%2C/,/g;
    $product =~ s/\%3B/;/g;

    # A GFF sequence id absent from the FASTA has no mapped id; the column is
    # left empty as before, but the problem is reported once per id.
    my $mapped_id = defined $id_map{$id} ? $id_map{$id} : "";
    my $region_len = defined $len{$id} ? $len{$id} : "";
    if ($mapped_id eq "" and not $unmapped_seen{$id}++)
    {
        warn "Sequence id '$id' from the GFF file was not found in the fasta file\n";
    }

    print {$ph_fh} join("\t",$mapped_id,$region_len,$locus_tag,$start,$end,$product),"\n";
    $cds_count++;
}
close $ph_fh or die "Cannot finish writing $info_file: $!\n";
close $fh;


# Print the usage text (script name, version, options) to STDOUT and end the
# program with the default exit status.
sub Usage {
    my @help_lines = (
        "Usage: perl $0 -o outDir GFF_file Fasta_file",
        "Version $version",
        "-o Output directory.",
    );
    print join("\n", @help_lines), "\n";
    exit;
}
30 changes: 30 additions & 0 deletions phageFinder/bin/phageFinder_summary.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#!/usr/bin/env perl

use strict;
use warnings;
use Getopt::Long;

# Rewrites the first column of a phage_finder result table, replacing the
# generated sequence ids with the original ids from the id map, and writes
# the result to phageFinder_summary.txt in the current directory.
#   -t FILE   result table to translate
#   -i FILE   id map with "generated_id original_id" per line
my $id_file;
my $table;
my $usage = "Usage: $0 -t PFPR_tab.txt -i id_map.txt\n";

GetOptions(
    't=s' => \$table,
    'i=s' => \$id_file,
) or die $usage;
# Both files are required; fail with a clear message instead of passing undef
# to open().
die $usage unless (defined $table and defined $id_file);

my %id_map;    # generated id -> original id
# Echo the inputs, one per line.
print "$id_file\n";
print "$table\n";

open(my $fh, '<', $id_file) or die "Cannot read $id_file: $!\n";
while (my $map_line = <$fh>)
{
    chomp $map_line;
    my ($new_id, $original_id) = split ' ', $map_line;
    # Skip blank or incomplete lines rather than storing an undefined key/value.
    next unless (defined $new_id and defined $original_id);
    $id_map{$new_id} = $original_id;
}
close $fh;

open(my $ofh, '>', "phageFinder_summary.txt") or die "Cannot write phageFinder_summary.txt: $!\n";
open(my $result_fh, '<', $table) or die "Cannot read $table: $!\n";
while (my $row = <$result_fh>)
{
    my @fields = split /\s+/, $row;
    # Only translate ids present in the map; header and blank lines pass through.
    if (@fields and $id_map{$fields[0]})
    {
        $fields[0] = $id_map{$fields[0]};
    }
    print {$ofh} join("\t", @fields), "\n";
}
close $result_fh;
close $ofh or die "Cannot finish writing phageFinder_summary.txt: $!\n";
15 changes: 15 additions & 0 deletions phageFinder/nextflow.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
// Default workflow parameters; the three input files have no default and
// must be supplied by the caller.
params {
// base directory used by the workflow's publishDir paths
outDir = "."
// input files read by the workflow block of phageFinder.nf
gffFile = null
faaFile = null
fnaFile = null
// passed as the second argument to phage_finder_v2.1.sh
numCPU = 8
}

// Run containers through Singularity.
// NOTE(review): "--compat" is presumably the Singularity/Apptainer
// Docker-compatibility switch — confirm against the installed version.
singularity {
enabled = true
runOptions="--compat"

}

// Container image applied to every process.
process.container = "apwat/phage_finder:1.0"
77 changes: 77 additions & 0 deletions phageFinder/phageFinder.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
#!/usr/bin/env nextflow


//prepares input for phage finder (creates appropriate files)
// Runs phageFinder_prepare.pl on the GFF and nucleotide FASTA, writing into the
// task directory ("-o ."). That script writes id_map.txt, Assembly.con and
// phage_finder_info.txt.
process phageFinderPrep {

input:
path gff
path fna

output:
path "id_map.txt", emit:idMap //separate output declaration for post-PF processing
path "*", emit:allPFoutput //all output files will go into the next process


script:
"""
phageFinder_prepare.pl -o . $gff $fna
"""
}

//calls phage_finder
// Inputs are the files produced by phageFinderPrep plus the protein FASTA,
// which is staged under the fixed name Assembly.pep. Only log.txt is
// published (see the publishDir pattern); PFPR_tab.txt is emitted for the
// summary process.
process phageFinder {
publishDir(
path: "$params.outDir/AssemblyBasedAnalysis/Prophage",
mode: 'copy',
pattern: "log.txt"
)

input:
path prepOut
path faa, stageAs: "Assembly.pep"

output:
path "PFPR_tab.txt", emit: phageTable
path "log.txt"

//must be on PATH
// The command is given the prefix "Assembly" and the CPU count; both stdout
// and stderr are redirected into log.txt.
script:
"""
phage_finder_v2.1.sh Assembly $params.numCPU 1>log.txt 2>&1
"""

}


//creates text summary of results
// Runs phageFinder_summary.pl with the id map (-i) and the phage_finder result
// table (-t); the resulting phageFinder_summary.txt is copied to the Prophage
// directory under params.outDir.
process summary {
publishDir(
path: "$params.outDir/AssemblyBasedAnalysis/Prophage",
mode: 'copy'
)

input:
path idMap
path pfprTab

output:
path "phageFinder_summary.txt"

script:
"""
phageFinder_summary.pl -t $pfprTab -i $idMap
"""
}


// Entry point: prepare inputs, run phage_finder, then summarise the results.
workflow {
// Each input path must exist (checkIfExists). An empty protein FASTA is
// filtered out of faa_ch, so phageFinder receives no protein input in that
// case — presumably to skip the run; confirm intent.
gff_ch = channel.fromPath(params.gffFile, checkIfExists:true)
faa_ch = channel.fromPath(params.faaFile, checkIfExists:true).filter{ it.size()>0 }
fna_ch = channel.fromPath(params.fnaFile, checkIfExists:true)

phageFinderPrep(gff_ch, fna_ch)
phageFinder(phageFinderPrep.out.allPFoutput, faa_ch)
summary(phageFinderPrep.out.idMap,phageFinder.out.phageTable)

}
7 changes: 7 additions & 0 deletions phageFinder/test_files/parameters/basic.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
{
"gffFile" : "test_files/PROJECT.gff",
"faaFile" : "test_files/PROJECT.faa",
"fnaFile" : "test_files/PROJECT.fna",
"numCPU" : 4,
"outDir": "test_phageFinder"
}
47 changes: 47 additions & 0 deletions runAnnotation/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# syntax=docker/dockerfile:1
# Build stage: assemble the "annotation" conda environment (Prokka, MUMmer,
# BLAST, R, Perl modules), add EDGE's RATT and this project's scripts, then
# pack the environment with conda-pack for the runtime image.
FROM continuumio/miniconda3:24.5.0-0 AS build

ENV container=docker

# add conda channels
RUN conda config --add channels conda-forge \
    && conda config --add channels bioconda

# Create the (initially empty) annotation environment. The activation only
# lasts for this RUN step, which is why every install below targets the
# environment explicitly with -n. -y makes the non-interactive intent explicit.
RUN conda init bash \
    && . ~/.bashrc \
    && conda create -y --name annotation \
    && conda activate annotation

RUN conda install -y -n annotation -c bioconda perl-lwp-protocol-https
RUN conda install -y -n annotation -c conda-forge r-base
RUN conda install -y -n annotation -c bioconda prokka
RUN conda install -y -n annotation -c bioconda mummer
RUN conda install -y -n annotation -c bioconda blast=2.16
RUN conda install -y -n annotation -c bioconda perl-yaml

# conda-pack is installed without -n, i.e. into the default environment; it is
# only used in this build stage.
RUN conda install -y -c conda-forge conda-pack

#Custom implementation of RATT from EDGE
# mkdir -p guarantees the opt directory exists, so RATT always ends up at
# .../opt/RATT (the location PATH expects below) rather than being renamed
# to "opt" when the directory is missing.
RUN wget https://ref-db.edgebioinformatics.org/EDGE/dev/edge_dev_thirdParty_softwares.tgz \
    && tar -xvzf edge_dev_thirdParty_softwares.tgz \
    && tar -xvzf thirdParty/RATT.tar.gz -C . \
    && mkdir -p /opt/conda/envs/annotation/opt \
    && mv RATT /opt/conda/envs/annotation/opt/

#add scripts from this project to bin
# The wildcard can match several files, so the destination must be a directory
# path ending in "/".
ADD bin/* /opt/conda/envs/annotation/bin/

#pack environment for runtime image
RUN conda-pack -n annotation -o /tmp/env.tar && \
    mkdir /venv && cd /venv && tar xf /tmp/env.tar && \
    rm /tmp/env.tar

# Fix up the path prefixes inside the relocated environment.
RUN /venv/bin/conda-unpack

# Runtime stage: only the unpacked environment is copied in.
# NOTE(review): debian:latest is a moving tag; consider pinning a release for
# reproducible images.
FROM debian:latest AS runtime

COPY --from=build /venv /venv

ENV PATH=/venv/bin:/venv/opt/RATT:$PATH

SHELL ["/bin/bash", "-c"]
# Exec (JSON) form so bash is started directly instead of via a wrapping shell.
CMD ["/bin/bash"]
27 changes: 27 additions & 0 deletions runAnnotation/bin/check_server_up.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
#!/usr/bin/env perl

use FindBin qw($Bin);
use Getopt::Long;
use strict;
use warnings;
use LWP::UserAgent;

# Checks that a server answers an HTTP GET request.
#   --url URL   address to test (required)
# Exits normally when the response is a success; otherwise dies with the URL
# and the response status line.
my $url;

GetOptions(
    "url=s" => \$url,
) or die "Usage: $0 --url URL\n";
# A missing URL would otherwise be passed to get() as undef.
die "Usage: $0 --url URL\n" unless (defined $url and length $url);

my $ua = LWP::UserAgent->new;
$ua->timeout(10);    # seconds
$ua->env_proxy;      # pick up proxy settings from the environment
my $response = $ua->get($url);

unless ($response->is_success) {
    # Include the status line so the reason for the failure appears in the log.
    die("$url is not up! (" . $response->status_line . ")\n");
}
17 changes: 17 additions & 0 deletions runAnnotation/bin/embl2genbank.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#!/usr/bin/env perl
use strict;
use warnings;
use Bio::SeqIO;

# Convert the EMBL file named by the first argument to GenBank format. Records
# are APPENDED (">>") to the file named by the second argument.
if (@ARGV != 2) { die "USAGE: embl2genbank.pl embl_Input Genbank_Output \n"; }

# Direct method calls (Class->new) rather than indirect object syntax.
my $seqio = Bio::SeqIO->new('-format' => 'embl', '-file' => "$ARGV[0]");
my $seqout = Bio::SeqIO->new('-format' => 'genbank', '-file' => ">>$ARGV[1]");
while (my $seq = $seqio->next_seq) {
    my $locus = $seq->display_id;
    $locus =~ s/\.final$//;        # drop a trailing ".final"
    $locus =~ s/^(\S+?)\.//;       # drop the leading text up to and including the first "."
    # Use the cleaned name as both the accession number and the display id.
    $seq->accession_number($locus);
    $seq->display_id($locus);
    $seqout->write_seq($seq);
}
Loading

0 comments on commit bb4c8a7

Please sign in to comment.