Merge pull request #12 from LANL-Bioinformatics/nf_contigsTaxonomy

Nf contigs taxonomy
LANL-Bioinformatics · Sep 25, 2024 · 4bfec7c · 4bfec7c
2 parents 07f32c4 + 450d9e1
commit 4bfec7c
Show file tree

Hide file tree

Showing 9 changed files with 58,179 additions and 0 deletions.
diff --git a/contigsTaxonomyAssignment/Dockerfile b/contigsTaxonomyAssignment/Dockerfile
@@ -0,0 +1,42 @@
+# syntax=docker/dockerfile:1
+FROM continuumio/miniconda3:24.5.0-0 AS build
+
+ENV container=docker
+
+# add conda channels
+RUN conda config --add channels conda-forge \
+    && conda config --add channels bioconda
+
+RUN conda init bash \
+    && . ~/.bashrc \
+    && conda create --name contigsTaxonomyAssignment \
+    && conda activate contigsTaxonomyAssignment 
+
+RUN conda install -n contigsTaxonomyAssignment python=3.5
+RUN conda install -n contigsTaxonomyAssignment r-base=4.3.1
+RUN conda install -n contigsTaxonomyAssignment -c bioconda minimap2=2.24
+RUN conda install -n contigsTaxonomyAssignment -c bioconda pandas=0.23.4
+RUN conda install -n contigsTaxonomyAssignment -c bioconda perl-json
+RUN conda install -c conda-forge conda-pack
+#use latest miccr (v0.0.3) with Chienchi's fix for tokenizing alignment file
+RUN git clone --depth 1 https://github.com/chienchi/miccr.git
+#add scripts from this project to bin
+ADD bin/* /opt/conda/envs/contigsTaxonomyAssignment/bin
+
+#pack environment for runtime image
+RUN conda-pack -n contigsTaxonomyAssignment -o /tmp/env.tar && \
+    mkdir /venv && cd /venv && tar xf /tmp/env.tar && \
+    rm /tmp/env.tar
+
+RUN /venv/bin/conda-unpack
+
+FROM debian:latest AS runtime
+
+COPY --from=build /venv /venv
+COPY --from=build /miccr /miccr
+
+#add environment and miccr to PATH
+ENV PATH=/venv/bin:/miccr:/miccr/utils:$PATH
+
+SHELL ["/bin/bash", "-c"]
+CMD /bin/bash
diff --git a/contigsTaxonomyAssignment/bin/get_unclassified_fasta.pl b/contigsTaxonomyAssignment/bin/get_unclassified_fasta.pl
@@ -0,0 +1,58 @@
+#!/usr/bin/env perl
+use strict;
+use Getopt::Long;
+use File::Basename;
+
+my $input;
+my $classified_result;
+my $output;
+my $log;
+
+GetOptions(
+    'in=s' => \$input,
+    'classified=s' => \$classified_result,
+    'output=s' => \$output,
+    'log=s' => \$log
+);
+
+my %result;
+
+
+#Total Contigs: (\d+) \((\d+) bp\); Classified Contigs: (\d+) \((\d+) bp\); Unclassified Contigs: (\d+) \((\d+) bp\);/){
+my $total_classified_count = 0;
+my $total_classified_len = 0;
+open (my $fh, "<", $classified_result) or die "Cannot read $classified_result\n";
+my $header = <$fh>;
+while(<$fh>){
+    chomp;
+    my @f = split /\t/,$_;
+    $result{$f[0]}=1;
+    $total_classified_len += $f[9];
+    $total_classified_count++;
+}
+close $fh;
+my $ofh;
+if ($output){
+    open ($ofh, ">" , $output) or die "Cannot Write $output\n";
+}
+open (my $fafh, "<" , $input ) or die "Cannot read $input\n";
+my $seq_id;
+my $total_count=0;
+my $total_len=0;
+while(<$fafh>){
+    chomp;
+    if ($_=~ /^>(\S+)/){
+        $total_count++;
+        $seq_id=$1;
+        print $ofh "$_\n" if (!$result{$seq_id} && $output);
+    }else{
+        print $ofh "$_\n" if (!$result{$seq_id} && $output);
+        $total_len += length($_);
+    }
+}
+close $ofh if ($output);
+close $fafh;
+open (my $log_fh , ">>" , $log) or die "Cannote write $log\n";
+print $log_fh "Total Contigs: $total_count ($total_len bp); Classified Contigs: $total_classified_count ($total_classified_len bp); ";
+print $log_fh "Unclassified Contigs: ". ($total_count - $total_classified_count) .  " (". ($total_len - $total_classified_len) . " bp);\n";
+close $log_fh;
diff --git a/contigsTaxonomyAssignment/bin/tab2Json_for_dataTable.pl b/contigsTaxonomyAssignment/bin/tab2Json_for_dataTable.pl
@@ -0,0 +1,129 @@
+#!/usr/bin/env perl
+#chienchi @ lanl.gov
+#20160511
+
+use strict;
+use Getopt::Long;
+use File::Basename;
+#use FindBin qw($RealBin);
+#use lib "$RealBin/../lib";
+use JSON;
+
+my $limit=3000;
+my $contigsizelimit=700;
+my $info;
+
+my $projdir;
+my $mode;
+my $out;
+my $delimit="tab";
+
+GetOptions(
+			"mode=s"        => \$mode,
+			"project_dir=s" => \$projdir,
+			"limit=i"	=> \$limit,
+			"delimit=s"     => \$delimit, 
+			"size=i"	=> \$contigsizelimit,
+			"out=s"         => \$out,
+			"help|?"        =>  sub{Usage()} );
+
+sub Usage{
+	print <<"END";
+	Usage: perl $0 [options] tab_delimited_table 
+
+    Options:
+          -project_dir    the project directory
+          -mode           'contig' or 'reference' or 'ref_gap' or 'feature_count'
+                          the first column of the tab_delimited_table 
+                          is contig ID or Reference ID or other ID
+          -delimit        tab (default) or comma 
+          -limit <NUM>    process # for row for the input table. (default: 3000)
+          -out            output filename
+END
+
+exit;
+}
+
+
+my $table=$ARGV[0];
+
+my $sep = "\t"; 
+$sep = "," if ($delimit =~ /comma/i);
+
+if (!$table){&Usage();}
+
+if ($out){
+	unlink $out;
+	open (my $fh,">",$out);
+	select $fh;
+}
+
+my $projname;
+if ($projdir){
+	$projdir =~ s/\/$//;
+	$projname = basename($projdir);
+}
+
+open(my $fh, $table) or die "Cannot read $table\n";
+my $header = <$fh>;
+chomp $header;
+my @headers = split /$sep/,$header;
+splice @headers, 1, 0,  "NCBI BLAST" if ($mode eq 'contig');
+$headers[0]= "CONTIG_ID" if ($mode eq 'contig');
+shift @headers if ($mode eq "feature_count");
+my ($length_index,$start_index,$end_index);
+foreach my $i (0..$#headers){
+	my $hash;
+	$headers[$i] =~ s/^\"|\"$//g;
+	if ($mode eq "feature_count"){
+		my @fc_header = split(/[\.\/]/, $headers[$i]);
+		if (scalar(@fc_header)>=3){
+			#$headers[$i]="$fc_header[-3]"."_"."$fc_header[-2]";
+			$headers[$i]="$fc_header[-2]";
+		}
+	}
+	$hash->{title}= $headers[$i];
+	$hash->{data}= $headers[$i];
+	push @{$info->{columns}},$hash;
+	$length_index = $i if ($headers[$i] =~ /length/i);
+	$start_index = $i if ($headers[$i] =~ /gap_start/i);
+	$end_index = $i if ($headers[$i] =~ /gap_end/i);
+}	
+my $count=0;
+while(<$fh>){
+	chomp;
+	my $data;
+	last if ($limit > 0 && $count >= $limit);
+
+	my @array=split(/$sep/,$_);
+	if ($mode eq 'contig'){
+		splice @array, 1, 0,  "<a href='#' class='edge-contigBlast' >BLAST</a>";
+		my $end = ($length_index)? $array[$length_index] : $contigsizelimit;
+		$array[0]="<a href='JBrowse/?data=data%2F$projname%2FJBrowse%2Fctg_tracks&tracks=DNA%2CCDS%2CRRNA%2CTRNA&loc=$array[0]%3A1..$end' target='_blank'>$array[0]</a>" if ($projname);
+		next if ($length_index && $array[$length_index] < $contigsizelimit);
+	}elsif ($mode eq 'ref_gap'){
+		my $start = ($start_index)? $array[$start_index] : 1;
+		my $end = ($end_index)? $array[$end_index]: $contigsizelimit;
+		$array[0]="<a href='JBrowse/?data=data%2F$projname%2FJBrowse%2Fref_tracks&tracks=DNA%2CCDS%2CRRNA%2CTRNA&loc=$array[0]%3A$start..$end' target='_blank'>$array[0]</a>" if ($projname);
+	}
+	shift @array if ($mode eq "feature_count");
+	foreach my $i (0..$#headers){
+		$array[$i] = "" if (!$array[$i]);
+		$array[$i] =~ s/^\"|\"$//g;
+		$array[$i] = ($array[$i] !~ /\D/ && $array[$i] =~ /\d+\.\d+/)? sprintf("%.4f",$array[$i]):$array[$i];
+		$data->{$headers[$i]}=$array[$i];
+	}
+	push @{$info->{data}},$data;
+	$count++;
+}
+
+&returnStatus;
+
+sub returnStatus {
+	my $json = "{}";
+	$json = to_json($info) if $info;
+	#$json = to_json( $info, { ascii => 1, pretty => 1 } ) if $info && $ARGV[0];
+	print $json;
+	exit;
+}
+
diff --git a/contigsTaxonomyAssignment/contigsTaxonomyAssignment.nf b/contigsTaxonomyAssignment/contigsTaxonomyAssignment.nf
@@ -0,0 +1,79 @@
+//André Watson
+//Aug 2024
+//apwat @ lanl.gov
+
+//base process. Takes a FASTA file containing contigs and performs taxonomic analysis with MICCR (https://github.com/chienchi/miccr).
+process contigTaxonomy {
+    publishDir(
+        path: "$params.outDir/AssemblyBasedAnalysis/Taxonomy",
+        mode: 'copy'
+    )
+    input:
+    path contigs
+
+    output:
+    path "${params.projName}.log"
+    path "log.txt"
+    path "${params.projName}.lca_ctg.tsv", emit: taxLcaResult
+    path "${params.projName}.ctg.tsv", emit: taxResult
+    path "${params.projName}.unclassified.fasta"
+    path "${params.projName}.paf"
+
+    script:
+    """
+    miccr.py -x asm10 -d $params.dbPath -t $params.cpus -p $params.projName -i $contigs 1>log.txt 2>&1 
+    get_unclassified_fasta.pl -in $contigs -classified ${params.projName}.lca_ctg.tsv -output ${params.projName}.unclassified.fasta -log log.txt
+    """
+}
+
+//adds multi-level taxonomic classification to results file. Takes in a .ctg.tsv file produced by MICCR.
+process addLineage {
+    publishDir(
+        path: "$params.outDir/AssemblyBasedAnalysis/Taxonomy",
+        mode: 'copy'
+    )
+
+    input:
+    path taxResult
+
+    output:
+    path "*.lineage", emit: lineage
+
+    script:
+    def dbFolder = params.dbPath.take(params.dbPath.lastIndexOf("/")) //get folder path containing DB
+    """
+    add_lineage.py $dbFolder $taxResult > ${taxResult.name}.lineage
+    """
+}
+
+//creates taxonomy classification graphs. Takes lineage file, .lca_ctg.tsv file produced by MICCR, 
+//and a coverage table (see https://github.com/chienchi/miccr/blob/master/utils/README.md), or from workflow runReadsToContig 
+process plotAndTable {
+    publishDir(
+        path: "$params.outDir/AssemblyBasedAnalysis/Taxonomy",
+	mode: 'copy'
+    )
+    input:
+    path lineage
+    path covTable
+    path lcaResult
+
+    output:
+    path "${params.projName}.ctg_class.LCA.json"
+    path "summary_by_*.txt"
+    path "*.pdf"
+
+    script:
+    """
+    classification_plot.R $lineage $params.projName $covTable
+    tab2Json_for_dataTable.pl -project_dir $params.outDir -mode contig -limit $params.rowLimit $lcaResult > ${params.projName}.ctg_class.LCA.json
+    """
+}
+
+workflow {
+    contigs = channel.fromPath(params.contigFile, checkIfExists:true)
+    coverageTable = channel.fromPath(params.coverageTable, checkIfExists:true)
+    contigTaxonomy(contigs)
+    addLineage(contigTaxonomy.out.taxResult)
+    plotAndTable(addLineage.out.lineage, coverageTable, contigTaxonomy.out.taxLcaResult)
+}
diff --git a/contigsTaxonomyAssignment/nextflow.config b/contigsTaxonomyAssignment/nextflow.config
@@ -0,0 +1,15 @@
+
+singularity {
+    enabled=true
+    runOptions="--compat --home /path/to/home --bind /path/to/miccrDB:/venv/database/miccrDB"
+}
+process.container = 'apwat/contigs_taxonomy:1.10'
+params {
+    cpus = 8
+    outDir = "."
+    contigFile = null
+    coverageTable = null
+    dbPath = null
+    projName = null
+    rowLimit = 3000
+}