Skip to content

Commit

Permalink
Merge pull request #12 from LANL-Bioinformatics/nf_contigsTaxonomy
Browse files Browse the repository at this point in the history
Nf contigs taxonomy
  • Loading branch information
mflynn-lanl authored Sep 25, 2024
2 parents 07f32c4 + 450d9e1 commit 4bfec7c
Show file tree
Hide file tree
Showing 9 changed files with 58,179 additions and 0 deletions.
42 changes: 42 additions & 0 deletions contigsTaxonomyAssignment/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# syntax=docker/dockerfile:1
# Multi-stage build: assemble the conda environment in a full miniconda image,
# then conda-pack it into a small Debian runtime image.
FROM continuumio/miniconda3:24.5.0-0 AS build

ENV container=docker

# add conda channels
RUN conda config --add channels conda-forge \
    && conda config --add channels bioconda

RUN conda init bash \
    && . ~/.bashrc \
    && conda create --name contigsTaxonomyAssignment \
    && conda activate contigsTaxonomyAssignment

# Tool versions pinned for reproducibility.
# NOTE(review): python 3.5 / pandas 0.23.4 are very old; confirm miccr
# actually requires them before upgrading.
RUN conda install -n contigsTaxonomyAssignment python=3.5
RUN conda install -n contigsTaxonomyAssignment r-base=4.3.1
RUN conda install -n contigsTaxonomyAssignment -c bioconda minimap2=2.24
RUN conda install -n contigsTaxonomyAssignment -c bioconda pandas=0.23.4
RUN conda install -n contigsTaxonomyAssignment -c bioconda perl-json
RUN conda install -c conda-forge conda-pack
#use latest miccr (v0.0.3) with Chienchi's fix for tokenizing alignment file
RUN git clone --depth 1 https://github.com/chienchi/miccr.git
#add scripts from this project to bin (COPY, not ADD: ADD's extra tar/URL
#semantics are not wanted for plain local files)
COPY bin/* /opt/conda/envs/contigsTaxonomyAssignment/bin/

#pack environment for runtime image
RUN conda-pack -n contigsTaxonomyAssignment -o /tmp/env.tar && \
    mkdir /venv && cd /venv && tar xf /tmp/env.tar && \
    rm /tmp/env.tar

RUN conda-unpack

# Pinned runtime base: 'latest' made builds non-reproducible and could drift
# away from the glibc the packed environment was built against.
FROM debian:12-slim AS runtime

COPY --from=build /venv /venv
COPY --from=build /miccr /miccr

#add environment and miccr to PATH
ENV PATH=/venv/bin:/miccr:/miccr/utils:$PATH

SHELL ["/bin/bash", "-c"]
# exec form avoids an extra wrapping shell process
CMD ["/bin/bash"]
58 changes: 58 additions & 0 deletions contigsTaxonomyAssignment/bin/get_unclassified_fasta.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
#!/usr/bin/env perl
# Reads a contig FASTA file and a tab-delimited classification table (one
# header line; contig ID in column 1, contig length in column 10) and:
#   * optionally writes every contig NOT present in the table to -output
#   * appends a "Total/Classified/Unclassified" summary line to -log
use strict;
use warnings;
use Getopt::Long;
use File::Basename;

my $input;               # -in: contig FASTA
my $classified_result;   # -classified: classification table (e.g. *.lca_ctg.tsv)
my $output;              # -output: optional FASTA of unclassified contigs
my $log;                 # -log: file the summary line is appended to

GetOptions(
    'in=s'         => \$input,
    'classified=s' => \$classified_result,
    'output=s'     => \$output,
    'log=s'        => \$log
);

# Record which contig IDs were classified and total their lengths.
my %result;
my $total_classified_count = 0;
my $total_classified_len   = 0;
open( my $fh, "<", $classified_result ) or die "Cannot read $classified_result\n";
my $header = <$fh>;    # discard the single header line
while (<$fh>) {
    chomp;
    my @f = split /\t/, $_;
    $result{ $f[0] } = 1;
    $total_classified_len += $f[9];    # column 10 holds the contig length
    $total_classified_count++;
}
close $fh;

# Walk the FASTA: count every contig and bp; echo records whose ID was not
# classified to the output handle (only when -output was given).
my $ofh;
if ($output) {
    open( $ofh, ">", $output ) or die "Cannot Write $output\n";
}
open( my $fafh, "<", $input ) or die "Cannot read $input\n";
my $seq_id;
my $total_count = 0;
my $total_len   = 0;
while (<$fafh>) {
    chomp;
    if ( $_ =~ /^>(\S+)/ ) {
        $total_count++;
        $seq_id = $1;
        print $ofh "$_\n" if ( !$result{$seq_id} && $output );
    }
    else {
        print $ofh "$_\n" if ( !$result{$seq_id} && $output );
        $total_len += length($_);
    }
}
close $ofh if ($output);
close $fafh;

# Append the summary; unclassified figures are derived by subtraction.
open( my $log_fh, ">>", $log ) or die "Cannot write $log\n";    # was "Cannote write" (typo)
print $log_fh "Total Contigs: $total_count ($total_len bp); Classified Contigs: $total_classified_count ($total_classified_len bp); ";
print $log_fh "Unclassified Contigs: " . ( $total_count - $total_classified_count ) . " (" . ( $total_len - $total_classified_len ) . " bp);\n";
close $log_fh;
129 changes: 129 additions & 0 deletions contigsTaxonomyAssignment/bin/tab2Json_for_dataTable.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
#!/usr/bin/env perl
#chienchi @ lanl.gov
#20160511
#
# Converts a tab- (or comma-) delimited table into the JSON structure expected
# by jQuery DataTables: {"columns":[{title,data},...], "data":[{col=>val},...]}.
# In 'contig' / 'ref_gap' modes the first column is decorated with JBrowse and
# BLAST HTML links.

use strict;
use warnings;
use Getopt::Long;
use File::Basename;
use JSON;

my $limit           = 3000;   # maximum number of data rows emitted
my $contigsizelimit = 700;    # 'contig' mode: rows shorter than this are skipped
my $info;                     # accumulated {columns=>[], data=>[]} structure

my $projdir;
my $mode = "";                # default to '' so string comparisons never warn on undef
my $out;
my $delimit = "tab";

GetOptions(
    "mode=s"        => \$mode,
    "project_dir=s" => \$projdir,
    "limit=i"       => \$limit,
    "delimit=s"     => \$delimit,
    "size=i"        => \$contigsizelimit,
    "out=s"         => \$out,
    "help|?"        => sub { Usage() }
);

# Print usage text and exit.
sub Usage {
    print <<"END";
Usage: perl $0 [options] tab_delimited_table
Options:
    -project_dir the project directory
    -mode 'contig' or 'reference' or 'ref_gap' or 'feature_count'
        the first column of the tab_delimited_table
        is contig ID or Reference ID or other ID
    -delimit tab (default) or comma
    -limit <NUM> process # for row for the input table. (default: 3000)
    -out output filename
END
    exit;
}

my $table = $ARGV[0];

my $sep = "\t";
$sep = "," if ( $delimit =~ /comma/i );

if ( !$table ) { Usage(); }

# If -out was given, make it the default print destination.
my $out_fh;
if ($out) {
    unlink $out;
    open( $out_fh, ">", $out ) or die "Cannot write $out\n";    # was unchecked
    select $out_fh;
}

my $projname;
if ($projdir) {
    $projdir =~ s/\/$//;                 # drop trailing slash before basename()
    $projname = basename($projdir);
}

# Three-arg open; the original two-arg form allowed mode injection via the
# user-supplied filename (e.g. a leading '>').
open( my $fh, "<", $table ) or die "Cannot read $table\n";
my $header = <$fh>;
chomp $header;
my @headers = split /$sep/, $header;
splice @headers, 1, 0, "NCBI BLAST" if ( $mode eq 'contig' );
$headers[0] = "CONTIG_ID" if ( $mode eq 'contig' );
shift @headers if ( $mode eq "feature_count" );

# Build the DataTables column descriptors and remember the positions of the
# length / gap_start / gap_end columns for link construction below.
my ( $length_index, $start_index, $end_index );
foreach my $i ( 0 .. $#headers ) {
    my $hash;
    $headers[$i] =~ s/^\"|\"$//g;        # strip surrounding quotes
    if ( $mode eq "feature_count" ) {
        # headers look like a path: keep the second-to-last dot/slash token
        my @fc_header = split( /[\.\/]/, $headers[$i] );
        if ( scalar(@fc_header) >= 3 ) {
            $headers[$i] = "$fc_header[-2]";
        }
    }
    $hash->{title} = $headers[$i];
    $hash->{data}  = $headers[$i];
    push @{ $info->{columns} }, $hash;
    $length_index = $i if ( $headers[$i] =~ /length/i );
    $start_index  = $i if ( $headers[$i] =~ /gap_start/i );
    $end_index    = $i if ( $headers[$i] =~ /gap_end/i );
}

my $count = 0;
while (<$fh>) {
    chomp;
    my $data;
    last if ( $limit > 0 && $count >= $limit );

    my @array = split( /$sep/, $_ );
    if ( $mode eq 'contig' ) {
        splice @array, 1, 0, "<a href='#' class='edge-contigBlast' >BLAST</a>";
        my $end = ($length_index) ? $array[$length_index] : $contigsizelimit;
        $array[0] = "<a href='JBrowse/?data=data%2F$projname%2FJBrowse%2Fctg_tracks&tracks=DNA%2CCDS%2CRRNA%2CTRNA&loc=$array[0]%3A1..$end' target='_blank'>$array[0]</a>" if ($projname);
        next if ( $length_index && $array[$length_index] < $contigsizelimit );
    }
    elsif ( $mode eq 'ref_gap' ) {
        my $start = ($start_index) ? $array[$start_index] : 1;
        my $end   = ($end_index)   ? $array[$end_index]   : $contigsizelimit;
        $array[0] = "<a href='JBrowse/?data=data%2F$projname%2FJBrowse%2Fref_tracks&tracks=DNA%2CCDS%2CRRNA%2CTRNA&loc=$array[0]%3A$start..$end' target='_blank'>$array[0]</a>" if ($projname);
    }
    shift @array if ( $mode eq "feature_count" );
    foreach my $i ( 0 .. $#headers ) {
        $array[$i] = "" if ( !$array[$i] );
        $array[$i] =~ s/^\"|\"$//g;
        # Round plain decimal values to 4 places.  The original condition
        # ($v !~ /\D/ && $v =~ /\d+\.\d+/) was unsatisfiable -- the decimal
        # point itself matches \D -- so nothing was ever formatted.
        $array[$i] = sprintf( "%.4f", $array[$i] ) if ( $array[$i] =~ /^\d+\.\d+$/ );
        $data->{ $headers[$i] } = $array[$i];
    }
    push @{ $info->{data} }, $data;
    $count++;
}

returnStatus();

# Print the accumulated structure as JSON ("{}" when nothing was read) and exit.
sub returnStatus {
    my $json = "{}";
    $json = to_json($info) if $info;
    print $json;
    exit;
}

79 changes: 79 additions & 0 deletions contigsTaxonomyAssignment/contigsTaxonomyAssignment.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
//André Watson
//Aug 2024
//apwat @ lanl.gov

//base process. Takes a FASTA file containing contigs and performs taxonomic analysis with MICCR (https://github.com/chienchi/miccr).
//Also writes the contigs MICCR left unclassified to a separate FASTA via get_unclassified_fasta.pl.
process contigTaxonomy {
    // copy all outputs into the shared Taxonomy results folder
    publishDir(
        path: "$params.outDir/AssemblyBasedAnalysis/Taxonomy",
        mode: 'copy'
    )
    input:
    path contigs

    output:
    path "${params.projName}.log"
    path "log.txt"                                                // stdout/stderr of miccr.py, plus the summary appended by get_unclassified_fasta.pl
    path "${params.projName}.lca_ctg.tsv", emit: taxLcaResult     // consumed downstream by plotAndTable
    path "${params.projName}.ctg.tsv", emit: taxResult            // consumed downstream by addLineage
    path "${params.projName}.unclassified.fasta"                  // contigs absent from the lca_ctg table
    path "${params.projName}.paf"                                 // pairwise alignment (PAF) file

    script:
    """
    miccr.py -x asm10 -d $params.dbPath -t $params.cpus -p $params.projName -i $contigs 1>log.txt 2>&1
    get_unclassified_fasta.pl -in $contigs -classified ${params.projName}.lca_ctg.tsv -output ${params.projName}.unclassified.fasta -log log.txt
    """
}

//adds multi-level taxonomic classification to results file. Takes in a .ctg.tsv file produced by MICCR.
process addLineage {
    publishDir(
        path: "$params.outDir/AssemblyBasedAnalysis/Taxonomy",
        mode: 'copy'
    )

    input:
    path taxResult

    output:
    path "*.lineage", emit: lineage    // consumed downstream by plotAndTable

    script:
    // add_lineage.py is given the directory portion of params.dbPath
    def dbFolder = params.dbPath.take(params.dbPath.lastIndexOf("/")) //get folder path containing DB
    """
    add_lineage.py $dbFolder $taxResult > ${taxResult.name}.lineage
    """
}

//creates taxonomy classification graphs. Takes lineage file, .lca_ctg.tsv file produced by MICCR,
//and a coverage table (see https://github.com/chienchi/miccr/blob/master/utils/README.md), or from workflow runReadsToContig
process plotAndTable {
    publishDir(
        path: "$params.outDir/AssemblyBasedAnalysis/Taxonomy",
        mode: 'copy'
    )
    input:
    path lineage
    path covTable
    path lcaResult

    output:
    path "${params.projName}.ctg_class.LCA.json"    // DataTables-ready JSON built from the LCA table
    path "summary_by_*.txt"
    path "*.pdf"                                    // plots produced by classification_plot.R

    script:
    // tab2Json_for_dataTable.pl caps the emitted rows at params.rowLimit
    """
    classification_plot.R $lineage $params.projName $covTable
    tab2Json_for_dataTable.pl -project_dir $params.outDir -mode contig -limit $params.rowLimit $lcaResult > ${params.projName}.ctg_class.LCA.json
    """
}

// Entry workflow: classify contigs, attach lineage to the classification,
// then render plots and the DataTables JSON.
workflow {
    // checkIfExists makes channel creation fail fast when the inputs are missing
    contigs = channel.fromPath(params.contigFile, checkIfExists:true)
    coverageTable = channel.fromPath(params.coverageTable, checkIfExists:true)
    contigTaxonomy(contigs)
    addLineage(contigTaxonomy.out.taxResult)
    plotAndTable(addLineage.out.lineage, coverageTable, contigTaxonomy.out.taxLcaResult)
}
15 changes: 15 additions & 0 deletions contigsTaxonomyAssignment/nextflow.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@

// Nextflow configuration for the contigsTaxonomyAssignment workflow.
// Every process runs inside the Singularity container below.
singularity {
    enabled=true
    // NOTE(review): '/path/to/home' and '/path/to/miccrDB' look like
    // placeholders to be replaced with site-specific paths -- confirm.
    // The bind mounts the MICCR database inside the container.
    runOptions="--compat --home /path/to/home --bind /path/to/miccrDB:/venv/database/miccrDB"
}
process.container = 'apwat/contigs_taxonomy:1.10'
// Defaults; null parameters must be supplied on the command line or in a profile.
params {
    cpus = 8             // thread count passed to miccr.py (-t)
    outDir = "."         // base directory for published results
    contigFile = null    // input contig FASTA (required)
    coverageTable = null // contig coverage table consumed by plotAndTable (required)
    dbPath = null        // path to the MICCR database file (required)
    projName = null      // prefix for output file names (required)
    rowLimit = 3000      // row cap for tab2Json_for_dataTable.pl (-limit)
}
Loading

0 comments on commit 4bfec7c

Please sign in to comment.