-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #12 from LANL-Bioinformatics/nf_contigsTaxonomy
Nf contigs taxonomy
- Loading branch information
Showing
9 changed files
with
58,179 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
# syntax=docker/dockerfile:1 | ||
FROM continuumio/miniconda3:24.5.0-0 AS build | ||
|
||
ENV container=docker | ||
|
||
# add conda channels | ||
RUN conda config --add channels conda-forge \ | ||
&& conda config --add channels bioconda | ||
|
||
RUN conda init bash \ | ||
&& . ~/.bashrc \ | ||
&& conda create --name contigsTaxonomyAssignment \ | ||
&& conda activate contigsTaxonomyAssignment | ||
|
||
RUN conda install -n contigsTaxonomyAssignment python=3.5 | ||
RUN conda install -n contigsTaxonomyAssignment r-base=4.3.1 | ||
RUN conda install -n contigsTaxonomyAssignment -c bioconda minimap2=2.24 | ||
RUN conda install -n contigsTaxonomyAssignment -c bioconda pandas=0.23.4 | ||
RUN conda install -n contigsTaxonomyAssignment -c bioconda perl-json | ||
RUN conda install -c conda-forge conda-pack | ||
#use latest miccr (v0.0.3) with Chienchi's fix for tokenizing alignment file | ||
RUN git clone --depth 1 https://github.com/chienchi/miccr.git | ||
#add scripts from this project to bin | ||
ADD bin/* /opt/conda/envs/contigsTaxonomyAssignment/bin | ||
|
||
#pack environment for runtime image | ||
RUN conda-pack -n contigsTaxonomyAssignment -o /tmp/env.tar && \ | ||
mkdir /venv && cd /venv && tar xf /tmp/env.tar && \ | ||
rm /tmp/env.tar | ||
|
||
RUN /venv/bin/conda-unpack | ||
|
||
FROM debian:latest AS runtime | ||
|
||
COPY --from=build /venv /venv | ||
COPY --from=build /miccr /miccr | ||
|
||
#add environment and miccr to PATH | ||
ENV PATH=/venv/bin:/miccr:/miccr/utils:$PATH | ||
|
||
SHELL ["/bin/bash", "-c"] | ||
CMD /bin/bash |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
#!/usr/bin/env perl | ||
use strict; | ||
use Getopt::Long; | ||
use File::Basename; | ||
|
||
my $input; | ||
my $classified_result; | ||
my $output; | ||
my $log; | ||
|
||
GetOptions( | ||
'in=s' => \$input, | ||
'classified=s' => \$classified_result, | ||
'output=s' => \$output, | ||
'log=s' => \$log | ||
); | ||
|
||
my %result; | ||
|
||
|
||
#Total Contigs: (\d+) \((\d+) bp\); Classified Contigs: (\d+) \((\d+) bp\); Unclassified Contigs: (\d+) \((\d+) bp\);/){ | ||
my $total_classified_count = 0; | ||
my $total_classified_len = 0; | ||
open (my $fh, "<", $classified_result) or die "Cannot read $classified_result\n"; | ||
my $header = <$fh>; | ||
while(<$fh>){ | ||
chomp; | ||
my @f = split /\t/,$_; | ||
$result{$f[0]}=1; | ||
$total_classified_len += $f[9]; | ||
$total_classified_count++; | ||
} | ||
close $fh; | ||
my $ofh; | ||
if ($output){ | ||
open ($ofh, ">" , $output) or die "Cannot Write $output\n"; | ||
} | ||
open (my $fafh, "<" , $input ) or die "Cannot read $input\n"; | ||
my $seq_id; | ||
my $total_count=0; | ||
my $total_len=0; | ||
while(<$fafh>){ | ||
chomp; | ||
if ($_=~ /^>(\S+)/){ | ||
$total_count++; | ||
$seq_id=$1; | ||
print $ofh "$_\n" if (!$result{$seq_id} && $output); | ||
}else{ | ||
print $ofh "$_\n" if (!$result{$seq_id} && $output); | ||
$total_len += length($_); | ||
} | ||
} | ||
close $ofh if ($output); | ||
close $fafh; | ||
open (my $log_fh , ">>" , $log) or die "Cannote write $log\n"; | ||
print $log_fh "Total Contigs: $total_count ($total_len bp); Classified Contigs: $total_classified_count ($total_classified_len bp); "; | ||
print $log_fh "Unclassified Contigs: ". ($total_count - $total_classified_count) . " (". ($total_len - $total_classified_len) . " bp);\n"; | ||
close $log_fh; |
129 changes: 129 additions & 0 deletions
129
contigsTaxonomyAssignment/bin/tab2Json_for_dataTable.pl
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,129 @@ | ||
#!/usr/bin/env perl | ||
#chienchi @ lanl.gov | ||
#20160511 | ||
|
||
use strict; | ||
use Getopt::Long; | ||
use File::Basename; | ||
#use FindBin qw($RealBin); | ||
#use lib "$RealBin/../lib"; | ||
use JSON; | ||
|
||
my $limit=3000; | ||
my $contigsizelimit=700; | ||
my $info; | ||
|
||
my $projdir; | ||
my $mode; | ||
my $out; | ||
my $delimit="tab"; | ||
|
||
GetOptions( | ||
"mode=s" => \$mode, | ||
"project_dir=s" => \$projdir, | ||
"limit=i" => \$limit, | ||
"delimit=s" => \$delimit, | ||
"size=i" => \$contigsizelimit, | ||
"out=s" => \$out, | ||
"help|?" => sub{Usage()} ); | ||
|
||
sub Usage{ | ||
print <<"END"; | ||
Usage: perl $0 [options] tab_delimited_table | ||
Options: | ||
-project_dir the project directory | ||
-mode 'contig' or 'reference' or 'ref_gap' or 'feature_count' | ||
the first column of the tab_delimited_table | ||
is contig ID or Reference ID or other ID | ||
-delimit tab (default) or comma | ||
-limit <NUM> process # for row for the input table. (default: 3000) | ||
-out output filename | ||
END | ||
|
||
exit; | ||
} | ||
|
||
|
||
my $table=$ARGV[0]; | ||
|
||
my $sep = "\t"; | ||
$sep = "," if ($delimit =~ /comma/i); | ||
|
||
if (!$table){&Usage();} | ||
|
||
if ($out){ | ||
unlink $out; | ||
open (my $fh,">",$out); | ||
select $fh; | ||
} | ||
|
||
my $projname; | ||
if ($projdir){ | ||
$projdir =~ s/\/$//; | ||
$projname = basename($projdir); | ||
} | ||
|
||
open(my $fh, $table) or die "Cannot read $table\n"; | ||
my $header = <$fh>; | ||
chomp $header; | ||
my @headers = split /$sep/,$header; | ||
splice @headers, 1, 0, "NCBI BLAST" if ($mode eq 'contig'); | ||
$headers[0]= "CONTIG_ID" if ($mode eq 'contig'); | ||
shift @headers if ($mode eq "feature_count"); | ||
my ($length_index,$start_index,$end_index); | ||
foreach my $i (0..$#headers){ | ||
my $hash; | ||
$headers[$i] =~ s/^\"|\"$//g; | ||
if ($mode eq "feature_count"){ | ||
my @fc_header = split(/[\.\/]/, $headers[$i]); | ||
if (scalar(@fc_header)>=3){ | ||
#$headers[$i]="$fc_header[-3]"."_"."$fc_header[-2]"; | ||
$headers[$i]="$fc_header[-2]"; | ||
} | ||
} | ||
$hash->{title}= $headers[$i]; | ||
$hash->{data}= $headers[$i]; | ||
push @{$info->{columns}},$hash; | ||
$length_index = $i if ($headers[$i] =~ /length/i); | ||
$start_index = $i if ($headers[$i] =~ /gap_start/i); | ||
$end_index = $i if ($headers[$i] =~ /gap_end/i); | ||
} | ||
my $count=0; | ||
while(<$fh>){ | ||
chomp; | ||
my $data; | ||
last if ($limit > 0 && $count >= $limit); | ||
|
||
my @array=split(/$sep/,$_); | ||
if ($mode eq 'contig'){ | ||
splice @array, 1, 0, "<a href='#' class='edge-contigBlast' >BLAST</a>"; | ||
my $end = ($length_index)? $array[$length_index] : $contigsizelimit; | ||
$array[0]="<a href='JBrowse/?data=data%2F$projname%2FJBrowse%2Fctg_tracks&tracks=DNA%2CCDS%2CRRNA%2CTRNA&loc=$array[0]%3A1..$end' target='_blank'>$array[0]</a>" if ($projname); | ||
next if ($length_index && $array[$length_index] < $contigsizelimit); | ||
}elsif ($mode eq 'ref_gap'){ | ||
my $start = ($start_index)? $array[$start_index] : 1; | ||
my $end = ($end_index)? $array[$end_index]: $contigsizelimit; | ||
$array[0]="<a href='JBrowse/?data=data%2F$projname%2FJBrowse%2Fref_tracks&tracks=DNA%2CCDS%2CRRNA%2CTRNA&loc=$array[0]%3A$start..$end' target='_blank'>$array[0]</a>" if ($projname); | ||
} | ||
shift @array if ($mode eq "feature_count"); | ||
foreach my $i (0..$#headers){ | ||
$array[$i] = "" if (!$array[$i]); | ||
$array[$i] =~ s/^\"|\"$//g; | ||
$array[$i] = ($array[$i] !~ /\D/ && $array[$i] =~ /\d+\.\d+/)? sprintf("%.4f",$array[$i]):$array[$i]; | ||
$data->{$headers[$i]}=$array[$i]; | ||
} | ||
push @{$info->{data}},$data; | ||
$count++; | ||
} | ||
|
||
&returnStatus; | ||
|
||
sub returnStatus { | ||
my $json = "{}"; | ||
$json = to_json($info) if $info; | ||
#$json = to_json( $info, { ascii => 1, pretty => 1 } ) if $info && $ARGV[0]; | ||
print $json; | ||
exit; | ||
} | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,79 @@ | ||
//André Watson | ||
//Aug 2024 | ||
//apwat @ lanl.gov | ||
|
||
//base process. Takes a FASTA file containing contigs and performs taxonomic analysis with MICCR (https://github.com/chienchi/miccr). | ||
process contigTaxonomy { | ||
publishDir( | ||
path: "$params.outDir/AssemblyBasedAnalysis/Taxonomy", | ||
mode: 'copy' | ||
) | ||
input: | ||
path contigs | ||
|
||
output: | ||
path "${params.projName}.log" | ||
path "log.txt" | ||
path "${params.projName}.lca_ctg.tsv", emit: taxLcaResult | ||
path "${params.projName}.ctg.tsv", emit: taxResult | ||
path "${params.projName}.unclassified.fasta" | ||
path "${params.projName}.paf" | ||
|
||
script: | ||
""" | ||
miccr.py -x asm10 -d $params.dbPath -t $params.cpus -p $params.projName -i $contigs 1>log.txt 2>&1 | ||
get_unclassified_fasta.pl -in $contigs -classified ${params.projName}.lca_ctg.tsv -output ${params.projName}.unclassified.fasta -log log.txt | ||
""" | ||
} | ||
|
||
//adds multi-level taxonomic classification to results file. Takes in a .ctg.tsv file produced by MICCR. | ||
process addLineage { | ||
publishDir( | ||
path: "$params.outDir/AssemblyBasedAnalysis/Taxonomy", | ||
mode: 'copy' | ||
) | ||
|
||
input: | ||
path taxResult | ||
|
||
output: | ||
path "*.lineage", emit: lineage | ||
|
||
script: | ||
def dbFolder = params.dbPath.take(params.dbPath.lastIndexOf("/")) //get folder path containing DB | ||
""" | ||
add_lineage.py $dbFolder $taxResult > ${taxResult.name}.lineage | ||
""" | ||
} | ||
|
||
//creates taxonomy classification graphs. Takes lineage file, .lca_ctg.tsv file produced by MICCR, | ||
//and a coverage table (see https://github.com/chienchi/miccr/blob/master/utils/README.md), or from workflow runReadsToContig | ||
process plotAndTable { | ||
publishDir( | ||
path: "$params.outDir/AssemblyBasedAnalysis/Taxonomy", | ||
mode: 'copy' | ||
) | ||
input: | ||
path lineage | ||
path covTable | ||
path lcaResult | ||
|
||
output: | ||
path "${params.projName}.ctg_class.LCA.json" | ||
path "summary_by_*.txt" | ||
path "*.pdf" | ||
|
||
script: | ||
""" | ||
classification_plot.R $lineage $params.projName $covTable | ||
tab2Json_for_dataTable.pl -project_dir $params.outDir -mode contig -limit $params.rowLimit $lcaResult > ${params.projName}.ctg_class.LCA.json | ||
""" | ||
} | ||
|
||
workflow { | ||
contigs = channel.fromPath(params.contigFile, checkIfExists:true) | ||
coverageTable = channel.fromPath(params.coverageTable, checkIfExists:true) | ||
contigTaxonomy(contigs) | ||
addLineage(contigTaxonomy.out.taxResult) | ||
plotAndTable(addLineage.out.lineage, coverageTable, contigTaxonomy.out.taxLcaResult) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
|
||
singularity { | ||
enabled=true | ||
runOptions="--compat --home /path/to/home --bind /path/to/miccrDB:/venv/database/miccrDB" | ||
} | ||
process.container = 'apwat/contigs_taxonomy:1.10' | ||
params { | ||
cpus = 8 | ||
outDir = "." | ||
contigFile = null | ||
coverageTable = null | ||
dbPath = null | ||
projName = null | ||
rowLimit = 3000 | ||
} |
Oops, something went wrong.