Merge pull request #34 from andrewjpage/master

run post analysis as a job
sanger-pathogens · Oct 2, 2013 · 1fc9eda · 1fc9eda
2 parents beea8e1 + 0924fa4
commit 1fc9eda
Show file tree

Hide file tree

Showing 8 changed files with 345 additions and 69 deletions.
diff --git a/bin/pan_genome_post_analysis b/bin/pan_genome_post_analysis
@@ -0,0 +1,19 @@
+#!/usr/bin/env perl
+
+package Bio::PanGenome::Main::PanGenomePostAnalysis;
+
+# ABSTRACT: Perform the post analysis on the pan genome
+# PODNAME: pan_genome_post_analysis
+
+=head1 SYNOPSIS
+
+Perform the post analysis on the pan genome
+
+=cut
+
+use strict;
+use warnings;
+
+# Each unshift puts its directory at the front of @INC, so the directory
+# listed last below is the first one searched when loading modules.
+BEGIN { unshift( @INC, '../lib' ) }
+BEGIN { unshift( @INC, './lib' ) }
+BEGIN { unshift( @INC, '/software/pathogen/internal/prod/lib/' ) }
+use Bio::PanGenome::CommandLine::PanGenomePostAnalysis;
+
+# Hand the raw command line to the command-line class, which parses it and runs the analysis.
+Bio::PanGenome::CommandLine::PanGenomePostAnalysis->new(args => \@ARGV, script_name => $0)->run;
diff --git a/lib/Bio/PanGenome.pm b/lib/Bio/PanGenome.pm
@@ -20,6 +20,7 @@ use Bio::PanGenome::AnnotateGroups;
 use Bio::PanGenome::Output::OneGenePerGroupFasta;
 use Bio::PanGenome::GroupStatistics;
 use Bio::PanGenome::Output::GroupsMultifastasNucleotide;
+use Bio::PanGenome::External::PostAnalysis;
 
 has 'fasta_files'                 => ( is => 'rw', isa => 'ArrayRef', required => 1 );
 has 'input_files'                 => ( is => 'rw', isa => 'ArrayRef', required => 1 );
@@ -32,21 +33,19 @@ has 'blastp_exec'                 => ( is => 'rw', isa => 'Str',      default  =
 has 'mcxdeblast_exec'             => ( is => 'ro', isa => 'Str',      default  => 'mcxdeblast' );
 has 'mcl_exec'                    => ( is => 'ro', isa => 'Str',      default  => 'mcl' );
 
-has 'output_multifasta_files'     => ( is => 'ro', isa => 'Bool',     default  => 0 );
+has 'output_multifasta_files' => ( is => 'ro', isa => 'Bool', default => 0 );
 
 sub run {
     my ($self) = @_;
 
-    my $output_combined_filename         = '_combined_files';
-    my $output_cd_hit_filename           = '_clustered';
-    my $output_blast_results_filename    = '_blast_results';
-    my $output_mcl_filename              = '_uninflated_mcl_groups';
-    my $output_inflate_clusters_filename = '_inflated_mcl_groups';
-    my $output_group_labels_filename     = '_labeled_mcl_groups';
+    my $output_combined_filename      = '_combined_files';
+    my $output_cd_hit_filename        = '_clustered';
+    my $output_blast_results_filename = '_blast_results';
+    my $output_mcl_filename           = '_uninflated_mcl_groups';
 
     my $combine_fasta_files = Bio::PanGenome::CombinedProteome->new(
-        proteome_files        => $self->fasta_files,
-        output_filename       => $output_combined_filename,
+        proteome_files  => $self->fasta_files,
+        output_filename => $output_combined_filename,
     );
     $combine_fasta_files->create_combined_proteome_file;
 
@@ -75,64 +74,21 @@ sub run {
     );
     $mcl->run();
 
-    my $inflate_clusters = Bio::PanGenome::InflateClusters->new(
-        clusters_filename => $cdhit_obj->clusters_filename,
-        mcl_filename      => $output_mcl_filename,
-        output_file       => $output_inflate_clusters_filename
-    );
-    $inflate_clusters->inflate();
-
-    my $group_labels = Bio::PanGenome::GroupLabels->new(
-        groups_filename => $output_inflate_clusters_filename,
-        output_filename => $output_group_labels_filename
-    );
-    $group_labels->add_labels();
-
-    my $analyse_groups_obj = Bio::PanGenome::AnalyseGroups->new(
-        fasta_files     => $self->fasta_files,
-        groups_filename => $output_group_labels_filename
-    );
-    $analyse_groups_obj->create_plots();
-
-    my $annotate_groups = Bio::PanGenome::AnnotateGroups->new(
-        gff_files       => $self->input_files,
-        output_filename => $self->output_filename,
-        groups_filename => $output_group_labels_filename,
-    );
-    $annotate_groups->reannotate;
-
-    my $group_statistics = Bio::PanGenome::GroupStatistics->new(
-        output_filename     => $self->output_statistics_filename,
-        annotate_groups_obj => $annotate_groups,
-        analyse_groups_obj  => $analyse_groups_obj
-    );
-    $group_statistics->create_spreadsheet;
+    unlink($output_blast_results_filename);
+    unlink($output_combined_filename);
 
-    my $one_gene_per_fasta = Bio::PanGenome::Output::OneGenePerGroupFasta->new(
-        analyse_groups  => $analyse_groups_obj,
-        output_filename => $self->output_pan_geneome_filename
+    my $post_analysis = Bio::PanGenome::External::PostAnalysis->new(
+        job_runner                  => $self->job_runner,
+        fasta_files                 => $self->fasta_files,
+        input_files                 => $self->input_files,
+        output_filename             => $self->output_filename,
+        output_pan_geneome_filename => $self->output_pan_geneome_filename,
+        output_statistics_filename  => $self->output_statistics_filename,
+        clusters_filename           => $cdhit_obj->clusters_filename,
+        dont_wait                   => 1,
     );
-    $one_gene_per_fasta->create_file();
-
-    if($self->output_multifasta_files)
-    {
-      my $group_multifastas_nucleotides = Bio::PanGenome::Output::GroupsMultifastasNucleotide->new(
-          gff_files       => $self->input_files,
-          analyse_groups  => $analyse_groups_obj,
-          annotate_groups => $annotate_groups,
-          group_names     => $analyse_groups_obj->_groups
-        );
-      $group_multifastas_nucleotides->create_files();
-    }
+    $post_analysis->run();
 
-    unlink($output_blast_results_filename);
-    unlink($output_combined_filename);
-    unlink($output_cd_hit_filename);
-    unlink($output_mcl_filename);
-    unlink($output_inflate_clusters_filename);
-    unlink($output_group_labels_filename);
-    unlink( $output_cd_hit_filename . '.clstr' );
-    unlink( $output_cd_hit_filename . '.bak.clstr' );
 }
 
 no Moose;

diff --git a/lib/Bio/PanGenome/CommandLine/PanGenomePostAnalysis.pm b/lib/Bio/PanGenome/CommandLine/PanGenomePostAnalysis.pm
@@ -0,0 +1,107 @@
+package Bio::PanGenome::CommandLine::PanGenomePostAnalysis;
+
+# ABSTRACT: Perform the post analysis on the pan genome
+
+=head1 SYNOPSIS
+
+Perform the post analysis on the pan genome
+
+=cut
+
+use Moose;
+use Getopt::Long qw(GetOptionsFromArray);
+use Bio::PanGenome::PostAnalysis;
+
+
+# Raw command-line arguments; BUILD parses them with GetOptionsFromArray.
+has 'args'                        => ( is => 'ro', isa => 'ArrayRef', required => 1 );
+# Name of the invoking script (the bin script passes $0).
+has 'script_name'                 => ( is => 'ro', isa => 'Str',      required => 1 );
+# When true, run() dies with the usage text instead of doing any work.
+has 'help'                        => ( is => 'rw', isa => 'Bool',     default  => 0 );
+# When defined, run() prints this message and then dies with the usage text.
+has '_error_message'              => ( is => 'rw', isa => 'Str' );
+
+# Settings that BUILD overrides from the command line when the matching option
+# is supplied. All except job_runner are forwarded by run() to
+# Bio::PanGenome::PostAnalysis.
+has 'fasta_files'                 => ( is => 'rw', isa => 'ArrayRef' );
+has 'input_files'                 => ( is => 'rw', isa => 'ArrayRef');
+has 'output_filename'             => ( is => 'rw', isa => 'Str',  default  => 'clustered_proteins' );
+has 'output_pan_geneome_filename' => ( is => 'rw', isa => 'Str',  default  => 'pan_genome.fa' );
+has 'output_statistics_filename'  => ( is => 'rw', isa => 'Str',  default  => 'group_statisics.csv' );
+has 'output_multifasta_files'     => ( is => 'rw', isa => 'Bool', default  => 0 );
+has 'clusters_filename'           => ( is => 'rw', isa => 'Str' );
+has 'job_runner'                  => ( is => 'rw', isa => 'Str',  default  => 'LSF' );
+
+# Moose construction hook: parses the options held in `args` and copies every
+# option that was supplied onto the matching attribute. Attributes whose
+# option was not given keep their declared defaults.
+#
+# GetOptionsFromArray removes the options it recognises from the `args` array.
+# If parsing fails, `_error_message` is set so that run() reports the problem
+# together with the usage text.
+sub BUILD {
+    my ($self) = @_;
+
+    my ( $output_filename, $output_pan_geneome_filename, $job_runner, $output_statistics_filename, $output_multifasta_files, $clusters_filename, $fasta_files, $input_files, $help );
+
+    my $options_parsed_ok = GetOptionsFromArray(
+        $self->args,
+        'o|output=s'              => \$output_filename,
+        'j|job_runner=s'          => \$job_runner,
+        'output_multifasta_files' => \$output_multifasta_files,
+        'p=s'                     => \$output_pan_geneome_filename,
+        's=s'                     => \$output_statistics_filename,
+        'c=s'                     => \$clusters_filename,
+        'f=s@'                    => \$fasta_files,
+        'i=s@'                    => \$input_files,
+        'h|help'                  => \$help,
+    );
+
+    # GetOptionsFromArray returns false on unknown or malformed options.
+    $self->_error_message('Could not parse the command line options') if ( !$options_parsed_ok );
+
+    # Record -h|--help so that run() shows the usage text.
+    $self->help($help)                                               if (defined($help));
+    $self->job_runner($job_runner)                                   if (defined($job_runner));
+    $self->fasta_files($fasta_files)                                 if (defined($fasta_files));
+    $self->input_files($input_files)                                 if (defined($input_files));
+    $self->output_filename($output_filename)                         if (defined($output_filename));
+    $self->output_pan_geneome_filename($output_pan_geneome_filename) if (defined($output_pan_geneome_filename));
+    $self->output_statistics_filename($output_statistics_filename)   if (defined($output_statistics_filename));
+    $self->output_multifasta_files($output_multifasta_files)         if (defined($output_multifasta_files));
+    $self->clusters_filename($clusters_filename)                     if (defined($clusters_filename));
+
+}
+
+# Entry point for the command-line tool.
+#
+# Dies with the usage text when help was requested. When an error message has
+# been recorded, prints it and then dies with the usage text. Otherwise builds
+# a Bio::PanGenome::PostAnalysis from the current settings and runs it.
+sub run {
+    my ($self) = @_;
+
+    die $self->usage_text if ( $self->help );
+
+    if ( defined( $self->_error_message ) ) {
+        print $self->_error_message . "\n";
+        die $self->usage_text;
+    }
+
+    my %post_analysis_settings = (
+        fasta_files                 => $self->fasta_files,
+        input_files                 => $self->input_files,
+        output_filename             => $self->output_filename,
+        output_pan_geneome_filename => $self->output_pan_geneome_filename,
+        output_statistics_filename  => $self->output_statistics_filename,
+        output_multifasta_files     => $self->output_multifasta_files,
+        clusters_filename           => $self->clusters_filename,
+    );
+
+    my $post_analysis = Bio::PanGenome::PostAnalysis->new(%post_analysis_settings);
+    $post_analysis->run();
+}
+
+# Returns the help text shown for -h|--help and after a command-line error.
+#
+# The here-doc is single-quoted so that the backslashes used as shell line
+# continuations in the example are emitted literally.
+sub usage_text {
+    my ($self) = @_;
+
+    return <<'USAGE';
+    Usage: pan_genome_post_analysis [options]
+    Perform the post analysis on the pan genome. This script is usually only called by another script.
+
+    # Normal usage
+    pan_genome_post_analysis         \
+      -o output_groups_filename      \
+      -p output_pan_genome_filename  \
+      -s output_stats_filename       \
+      -c clusters_filename           \
+      -f proteins1.faa               \
+      -f proteins2.faa               \
+      -f proteins3.faa               \
+      -i annotation1.gff             \
+      -i annotation2.gff
+
+    # Other options
+    -j, --job_runner STR             job runner to use
+    --output_multifasta_files        also output multifasta files
+
+    # This help message
+    pan_genome_post_analysis -h
+
+USAGE
+}
+
+__PACKAGE__->meta->make_immutable;
+no Moose;
+1;
diff --git a/lib/Bio/PanGenome/External/PostAnalysis.pm b/lib/Bio/PanGenome/External/PostAnalysis.pm
@@ -0,0 +1,89 @@
+package Bio::PanGenome::External::PostAnalysis;
+
+# ABSTRACT: Perform the post analysis
+
+=head1 SYNOPSIS
+
+Perform the post analysis 
+
+   use Bio::PanGenome::External::PostAnalysis;
+   
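+   my $seg= Bio::PanGenome::External::PostAnalysis->new(
+     fasta_files                 => ['proteins1.faa', 'proteins2.faa'],
+     input_files                 => ['annotation1.gff', 'annotation2.gff'],
+     output_filename             => 'clustered_proteins',
+     output_pan_geneome_filename => 'pan_genome.fa',
+     output_statistics_filename  => 'group_statisics.csv',
+     clusters_filename           => '_clustered.clstr',
+   );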
+   
+   $seg->run();
+
+=cut
+
+use Moose;
+with 'Bio::PanGenome::JobRunner::Role';
+
+# Values that _command_to_run places on the command line of the external
+# executable named by `exec`. input_files also drives the memory estimate.
+has 'input_files'                 => ( is => 'ro', isa => 'ArrayRef', required => 1 );
+has 'exec'                        => ( is => 'ro', isa => 'Str', default  => 'pan_genome_post_analysis' );
+has 'fasta_files'                 => ( is => 'ro', isa => 'ArrayRef', required => 1 );
+has 'output_filename'             => ( is => 'ro', isa => 'Str', required => 1 );
+has 'output_pan_geneome_filename' => ( is => 'ro', isa => 'Str', required => 1 );
+has 'output_statistics_filename'  => ( is => 'ro', isa => 'Str', required => 1 );
+has 'clusters_filename'           => ( is => 'ro', isa => 'Str', required => 1 );
+
+# Override the _memory_required_in_mb attribute of Bio::PanGenome::JobRunner::Role:
+# the value is built lazily by _build__memory_required_in_mb from the two
+# settings below (per-sample allowance in MB and a minimum in MB).
+has '_memory_required_in_mb' => ( is => 'ro', isa => 'Int', lazy => 1, builder => '_build__memory_required_in_mb' );
+has '_minimum_memory_mb'    => ( is => 'ro', isa => 'Int', default => 1000 );
+has '_memory_per_sample_mb' => ( is => 'ro', isa => 'Int', default => 100 );
+
+# Builder for _memory_required_in_mb.
+#
+# Returns the number of input files multiplied by _memory_per_sample_mb,
+# raised to _minimum_memory_mb when the product is smaller than that minimum.
+sub _build__memory_required_in_mb {
+    my ($self) = @_;
+
+    my $sample_count = scalar @{ $self->input_files };
+    my $estimate_mb  = $sample_count * $self->_memory_per_sample_mb;
+    my $floor_mb     = $self->_minimum_memory_mb;
+
+    return $estimate_mb < $floor_mb ? $floor_mb : $estimate_mb;
+}
+
+# Builds the command line, as a single space-separated string, that runs the
+# external post analysis executable.
+#
+# Every FASTA file is passed with its own -f flag and every input file with
+# its own -i flag. An empty list contributes nothing, so no flag is ever left
+# without a value. --output_multifasta_files is always included.
+#
+# NOTE(review): values are inserted verbatim, so a filename containing spaces
+# or shell metacharacters is not protected -- confirm how the job runner
+# executes the string before relying on such names.
+sub _command_to_run {
+    my ($self) = @_;
+
+    my @fasta_file_args = map { ( '-f', $_ ) } @{ $self->fasta_files };
+    my @input_file_args = map { ( '-i', $_ ) } @{ $self->input_files };
+
+    return join(
+        " ",
+        (
+            $self->exec,
+            '-o', $self->output_filename,
+            '-p', $self->output_pan_geneome_filename,
+            '-s', $self->output_statistics_filename,
+            '-c', $self->clusters_filename,
+            '--output_multifasta_files',
+            @fasta_file_args,
+            @input_file_args
+        )
+    );
+}
+
+# Hands the post analysis command to the configured job runner.
+#
+# Constructs an instance of the class named by _job_runner_class with the
+# single command from _command_to_run, the memory requirement, the queue and
+# the dont_wait flag, then calls its run method. Always returns 1.
+sub run {
+    my ($self) = @_;
+
+    my @commands_to_run = ( $self->_command_to_run );
+
+    my $runner_class = $self->_job_runner_class;
+    my $runner       = $runner_class->new(
+        commands_to_run => \@commands_to_run,
+        memory_in_mb    => $self->_memory_required_in_mb,
+        queue           => $self->_queue,
+        dont_wait       => $self->dont_wait,
+    );
+    $runner->run();
+
+    return 1;
+}
+
+no Moose;
+__PACKAGE__->meta->make_immutable;
+1;
diff --git a/lib/Bio/PanGenome/JobRunner/LSF.pm b/lib/Bio/PanGenome/JobRunner/LSF.pm
@@ -51,7 +51,11 @@ sub run {
     for my $command_to_run ( @{ $self->commands_to_run } ) {
         $self->_submit_job($command_to_run);
     }
-    $self->_job_manager->wait_all_children(history => 0);
+
+    if(!(defined($self->dont_wait) && $self->dont_wait == 1 ))
+    {
+      $self->_job_manager->wait_all_children(history => 0);
+    }
     1;
 }
 

diff --git a/lib/Bio/PanGenome/JobRunner/Role.pm b/lib/Bio/PanGenome/JobRunner/Role.pm
@@ -15,6 +15,7 @@ has 'job_runner'              => ( is => 'rw', isa => 'Str',  default  => 'Local
 has '_job_runner_class'       => ( is => 'ro', isa => 'Str',  lazy => 1, builder => '_build__job_runner_class' );
 has '_memory_required_in_mb'  => ( is => 'rw', isa => 'Int',  default => '200' );
 has '_queue'                  => ( is => 'rw', isa => 'Str',  default => 'normal' );
+has 'dont_wait'               => ( is => 'rw', isa => 'Bool', default => 0 );
 
 sub _build__job_runner_class {
     my ($self) = @_;