Skip to content

Commit

Permalink
Merge pull request #34 from andrewjpage/master
Browse files Browse the repository at this point in the history
run post analysis as a job
  • Loading branch information
andrewjpage committed Oct 2, 2013
2 parents beea8e1 + 0924fa4 commit 1fc9eda
Show file tree
Hide file tree
Showing 8 changed files with 345 additions and 69 deletions.
19 changes: 19 additions & 0 deletions bin/pan_genome_post_analysis
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#!/usr/bin/env perl

package Bio::PanGenome::Main::PanGenomePostAnalysis;

# ABSTRACT: Perform the post analysis on the pan genome
# PODNAME: pan_genome_post_analysis

=head1 SYNOPSIS

Perform the post analysis on the pan genome

=cut

use strict;
use warnings;

# Each BEGIN block prepends one directory to @INC, so the directory added
# last is the first one searched when the module below is loaded.
BEGIN { unshift( @INC, '../lib' ) }
BEGIN { unshift( @INC, './lib' ) }
BEGIN { unshift( @INC, '/software/pathogen/internal/prod/lib/' ) }
use Bio::PanGenome::CommandLine::PanGenomePostAnalysis;

# Pass the raw command-line arguments and the script name to the command-line
# class and run it.
Bio::PanGenome::CommandLine::PanGenomePostAnalysis->new(args => \@ARGV, script_name => $0)->run;
84 changes: 20 additions & 64 deletions lib/Bio/PanGenome.pm
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ use Bio::PanGenome::AnnotateGroups;
use Bio::PanGenome::Output::OneGenePerGroupFasta;
use Bio::PanGenome::GroupStatistics;
use Bio::PanGenome::Output::GroupsMultifastasNucleotide;
use Bio::PanGenome::External::PostAnalysis;

has 'fasta_files' => ( is => 'rw', isa => 'ArrayRef', required => 1 );
has 'input_files' => ( is => 'rw', isa => 'ArrayRef', required => 1 );
Expand All @@ -32,21 +33,19 @@ has 'blastp_exec' => ( is => 'rw', isa => 'Str', default =
has 'mcxdeblast_exec' => ( is => 'ro', isa => 'Str', default => 'mcxdeblast' );
has 'mcl_exec' => ( is => 'ro', isa => 'Str', default => 'mcl' );

has 'output_multifasta_files' => ( is => 'ro', isa => 'Bool', default => 0 );
has 'output_multifasta_files' => ( is => 'ro', isa => 'Bool', default => 0 );

sub run {
my ($self) = @_;

my $output_combined_filename = '_combined_files';
my $output_cd_hit_filename = '_clustered';
my $output_blast_results_filename = '_blast_results';
my $output_mcl_filename = '_uninflated_mcl_groups';
my $output_inflate_clusters_filename = '_inflated_mcl_groups';
my $output_group_labels_filename = '_labeled_mcl_groups';
my $output_combined_filename = '_combined_files';
my $output_cd_hit_filename = '_clustered';
my $output_blast_results_filename = '_blast_results';
my $output_mcl_filename = '_uninflated_mcl_groups';

my $combine_fasta_files = Bio::PanGenome::CombinedProteome->new(
proteome_files => $self->fasta_files,
output_filename => $output_combined_filename,
proteome_files => $self->fasta_files,
output_filename => $output_combined_filename,
);
$combine_fasta_files->create_combined_proteome_file;

Expand Down Expand Up @@ -75,64 +74,21 @@ sub run {
);
$mcl->run();

my $inflate_clusters = Bio::PanGenome::InflateClusters->new(
clusters_filename => $cdhit_obj->clusters_filename,
mcl_filename => $output_mcl_filename,
output_file => $output_inflate_clusters_filename
);
$inflate_clusters->inflate();

my $group_labels = Bio::PanGenome::GroupLabels->new(
groups_filename => $output_inflate_clusters_filename,
output_filename => $output_group_labels_filename
);
$group_labels->add_labels();

my $analyse_groups_obj = Bio::PanGenome::AnalyseGroups->new(
fasta_files => $self->fasta_files,
groups_filename => $output_group_labels_filename
);
$analyse_groups_obj->create_plots();

my $annotate_groups = Bio::PanGenome::AnnotateGroups->new(
gff_files => $self->input_files,
output_filename => $self->output_filename,
groups_filename => $output_group_labels_filename,
);
$annotate_groups->reannotate;

my $group_statistics = Bio::PanGenome::GroupStatistics->new(
output_filename => $self->output_statistics_filename,
annotate_groups_obj => $annotate_groups,
analyse_groups_obj => $analyse_groups_obj
);
$group_statistics->create_spreadsheet;
unlink($output_blast_results_filename);
unlink($output_combined_filename);

my $one_gene_per_fasta = Bio::PanGenome::Output::OneGenePerGroupFasta->new(
analyse_groups => $analyse_groups_obj,
output_filename => $self->output_pan_geneome_filename
my $post_analysis = Bio::PanGenome::External::PostAnalysis->new(
job_runner => $self->job_runner,
fasta_files => $self->fasta_files,
input_files => $self->input_files,
output_filename => $self->output_filename,
output_pan_geneome_filename => $self->output_pan_geneome_filename,
output_statistics_filename => $self->output_statistics_filename,
clusters_filename => $cdhit_obj->clusters_filename,
dont_wait => 1,
);
$one_gene_per_fasta->create_file();

if($self->output_multifasta_files)
{
my $group_multifastas_nucleotides = Bio::PanGenome::Output::GroupsMultifastasNucleotide->new(
gff_files => $self->input_files,
analyse_groups => $analyse_groups_obj,
annotate_groups => $annotate_groups,
group_names => $analyse_groups_obj->_groups
);
$group_multifastas_nucleotides->create_files();
}
$post_analysis->run();

unlink($output_blast_results_filename);
unlink($output_combined_filename);
unlink($output_cd_hit_filename);
unlink($output_mcl_filename);
unlink($output_inflate_clusters_filename);
unlink($output_group_labels_filename);
unlink( $output_cd_hit_filename . '.clstr' );
unlink( $output_cd_hit_filename . '.bak.clstr' );
}

no Moose;
Expand Down
107 changes: 107 additions & 0 deletions lib/Bio/PanGenome/CommandLine/PanGenomePostAnalysis.pm
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
package Bio::PanGenome::CommandLine::PanGenomePostAnalysis;

# ABSTRACT: Perform the post analysis on the pan genome

=head1 SYNOPSIS
Perform the post analysis on the pan genome
=cut

use Moose;
use Getopt::Long qw(GetOptionsFromArray);
use Bio::PanGenome::PostAnalysis;


# Raw command-line arguments; BUILD parses them with GetOptionsFromArray.
has 'args' => ( is => 'ro', isa => 'ArrayRef', required => 1 );
# Name of the calling script, supplied by the caller.
has 'script_name' => ( is => 'ro', isa => 'Str', required => 1 );
# When true, run() dies with the usage text instead of doing any work.
has 'help' => ( is => 'rw', isa => 'Bool', default => 0 );
# When defined, run() prints this message and then dies with the usage text.
has '_error_message' => ( is => 'rw', isa => 'Str' );

# The attributes below are filled in by BUILD from the command-line options
# (-f, -i, -o, -p, -s, --output_multifasta_files, -c) and forwarded by run()
# to Bio::PanGenome::PostAnalysis->new.
has 'fasta_files' => ( is => 'rw', isa => 'ArrayRef' );
has 'input_files' => ( is => 'rw', isa => 'ArrayRef');
has 'output_filename' => ( is => 'rw', isa => 'Str', default => 'clustered_proteins' );
has 'output_pan_geneome_filename' => ( is => 'rw', isa => 'Str', default => 'pan_genome.fa' );
has 'output_statistics_filename' => ( is => 'rw', isa => 'Str', default => 'group_statisics.csv' );
has 'output_multifasta_files' => ( is => 'rw', isa => 'Bool', default => 0 );
has 'clusters_filename' => ( is => 'rw', isa => 'Str' );
# Set from -j|--job_runner in BUILD; run() in this class does not forward it.
has 'job_runner' => ( is => 'rw', isa => 'Str', default => 'LSF' );

# Moose construction hook: parse the raw argument list held in `args` and copy
# every option that was actually supplied onto the matching attribute.
# Options that were not given leave the attribute at its declared default.
#
# Recognised options:
#   -o | --output               output_filename
#   -j | --job_runner           job_runner
#   --output_multifasta_files   output_multifasta_files (flag)
#   -p                          output_pan_geneome_filename
#   -s                          output_statistics_filename
#   -c                          clusters_filename
#   -f (repeatable)             fasta_files
#   -i (repeatable)             input_files
#   -h | --help                 help
sub BUILD {
    my ($self) = @_;

    my (
        $output_filename,            $output_pan_geneome_filename,
        $job_runner,                 $output_statistics_filename,
        $output_multifasta_files,    $clusters_filename,
        $fasta_files,                $input_files,
        $help
    );

    GetOptionsFromArray(
        $self->args,
        'o|output=s'              => \$output_filename,
        'j|job_runner=s'          => \$job_runner,
        'output_multifasta_files' => \$output_multifasta_files,
        'p=s'                     => \$output_pan_geneome_filename,
        's=s'                     => \$output_statistics_filename,
        'c=s'                     => \$clusters_filename,
        'f=s@'                    => \$fasta_files,
        'i=s@'                    => \$input_files,
        'h|help'                  => \$help,
    );

    # The parsed help flag was previously dropped, so -h/--help never reached
    # run(); store it so the usage text can actually be requested.
    $self->help($help)                                               if ( defined($help) );
    $self->job_runner($job_runner)                                   if ( defined($job_runner) );
    $self->fasta_files($fasta_files)                                 if ( defined($fasta_files) );
    $self->input_files($input_files)                                 if ( defined($input_files) );
    $self->output_filename($output_filename)                         if ( defined($output_filename) );
    $self->output_pan_geneome_filename($output_pan_geneome_filename) if ( defined($output_pan_geneome_filename) );
    $self->output_statistics_filename($output_statistics_filename)   if ( defined($output_statistics_filename) );
    $self->output_multifasta_files($output_multifasta_files)         if ( defined($output_multifasta_files) );
    $self->clusters_filename($clusters_filename)                     if ( defined($clusters_filename) );

}

# Entry point for the command-line wrapper.
# Dies with the usage text when help was requested; when an error message has
# been recorded it is printed first and the usage text is then thrown as well.
# Otherwise the collected settings are forwarded to
# Bio::PanGenome::PostAnalysis and the result of its run() is returned.
sub run {
    my ($self) = @_;

    die $self->usage_text if $self->help;

    my $problem = $self->_error_message;
    if ( defined $problem ) {
        print "$problem\n";
        die $self->usage_text;
    }

    # Attributes handed through unchanged, in constructor-argument order.
    my @forwarded = qw(
        fasta_files
        input_files
        output_filename
        output_pan_geneome_filename
        output_statistics_filename
        output_multifasta_files
        clusters_filename
    );

    my $post_analysis = Bio::PanGenome::PostAnalysis->new( map { ( $_ => $self->$_() ) } @forwarded );
    return $post_analysis->run();
}

# Return the help text shown for -h/--help and after a reported error.
# The example command uses shell line continuations (trailing backslashes) so
# that it can be pasted into a shell as written.
sub usage_text {
    my ($self) = @_;

    # Single-quoted heredoc: nothing is interpolated, so the trailing
    # backslashes reach the output literally.
    return <<'USAGE';
Usage: pan_genome_post_analysis [options]
Perform the post analysis on the pan genome. This script is usually only called by another script.
# Normal usage
pan_genome_post_analysis \
-o output_groups_filename \
-p output_pan_genome_filename \
-s output_stats_filename \
-c output_clusters_filename \
-f proteins1.faa \
-f proteins2.faa \
-f proteins3.faa \
-i annotation1.gff \
-i annotation2.gff
# This help message
pan_genome_post_analysis -h
USAGE
}

__PACKAGE__->meta->make_immutable;
no Moose;
1;
89 changes: 89 additions & 0 deletions lib/Bio/PanGenome/External/PostAnalysis.pm
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
package Bio::PanGenome::External::PostAnalysis;

# ABSTRACT: Perform the post analysis

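=head1 SYNOPSIS

Perform the post analysis

   use Bio::PanGenome::External::PostAnalysis;

   my $obj = Bio::PanGenome::External::PostAnalysis->new(
     fasta_files                 => [ 'proteins1.faa', 'proteins2.faa' ],
     input_files                 => [ 'annotation1.gff', 'annotation2.gff' ],
     output_filename             => 'clustered_proteins',
     output_pan_geneome_filename => 'pan_genome.fa',
     output_statistics_filename  => 'group_statistics.csv',
     clusters_filename           => 'clusters.clstr',
   );
   $obj->run();

=cut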

use Moose;
with 'Bio::PanGenome::JobRunner::Role';

# Files passed to the executable with -i; their count also drives the memory
# estimate in _build__memory_required_in_mb.
has 'input_files' => ( is => 'ro', isa => 'ArrayRef', required => 1 );
# Program name placed first in the generated command line.
has 'exec' => ( is => 'ro', isa => 'Str', default => 'pan_genome_post_analysis' );
# Files passed to the executable with -f.
has 'fasta_files' => ( is => 'ro', isa => 'ArrayRef', required => 1 );
# Values passed to the executable with -o, -p, -s and -c respectively.
has 'output_filename' => ( is => 'ro', isa => 'Str', required => 1 );
has 'output_pan_geneome_filename' => ( is => 'ro', isa => 'Str', required => 1 );
has 'output_statistics_filename' => ( is => 'ro', isa => 'Str', required => 1 );
has 'clusters_filename' => ( is => 'ro', isa => 'Str', required => 1 );

# Overload Role
# Redeclares the role's _memory_required_in_mb so it is computed lazily from
# the number of input files instead of using a fixed default.
has '_memory_required_in_mb' => ( is => 'ro', isa => 'Int', lazy => 1, builder => '_build__memory_required_in_mb' );
# Lower bound applied to the computed memory requirement.
has '_minimum_memory_mb' => ( is => 'ro', isa => 'Int', default => 1000 );
# Memory budgeted per entry in input_files.
has '_memory_per_sample_mb' => ( is => 'ro', isa => 'Int', default => 100 );

# Lazy builder for _memory_required_in_mb.
# Budgets _memory_per_sample_mb for every entry in input_files and never
# returns less than _minimum_memory_mb.
sub _build__memory_required_in_mb {
    my ($self) = @_;

    my $sample_count = scalar @{ $self->input_files };
    my $estimate_mb  = $sample_count * $self->_memory_per_sample_mb;
    my $floor_mb     = $self->_minimum_memory_mb;

    return $estimate_mb < $floor_mb ? $floor_mb : $estimate_mb;
}

# Build the command line that runs the post analysis executable.
# Returns a single string: the executable, the -o/-p/-s/-c options, the
# --output_multifasta_files flag, then one -f per fasta file and one -i per
# input file.
# An empty file list contributes nothing. Previously it produced a dangling
# " -f " or " -i " whose missing value would be taken from the next token on
# the command line.
sub _command_to_run {
    my ($self) = @_;

    my @command_parts = (
        $self->exec,
        '-o', $self->output_filename,
        '-p', $self->output_pan_geneome_filename,
        '-s', $self->output_statistics_filename,
        '-c', $self->clusters_filename,
        '--output_multifasta_files',
    );

    for my $flag_and_files ( [ '-f', $self->fasta_files ], [ '-i', $self->input_files ] ) {
        my ( $flag, $files ) = @{$flag_and_files};
        next if !@{$files};

        # Each group keeps its leading space so the generated string is
        # unchanged for non-empty lists.
        push @command_parts, join( '', map { " $flag $_" } @{$files} );
    }

    return join( ' ', @command_parts );
}

# Hand the post analysis command to the configured job runner.
# Builds a one-element command list, constructs the job runner class with the
# memory requirement, queue and dont_wait setting, runs it, and returns 1.
sub run {
    my ($self) = @_;

    my @commands     = ( $self->_command_to_run );
    my $runner_class = $self->_job_runner_class;

    my @runner_args = (
        commands_to_run => \@commands,
        memory_in_mb    => $self->_memory_required_in_mb,
        queue           => $self->_queue,
        dont_wait       => $self->dont_wait,
    );

    $runner_class->new(@runner_args)->run();

    return 1;
}

no Moose;
__PACKAGE__->meta->make_immutable;
1;
6 changes: 5 additions & 1 deletion lib/Bio/PanGenome/JobRunner/LSF.pm
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,11 @@ sub run {
for my $command_to_run ( @{ $self->commands_to_run } ) {
$self->_submit_job($command_to_run);
}
$self->_job_manager->wait_all_children(history => 0);

if(!(defined($self->dont_wait) && $self->dont_wait == 1 ))
{
$self->_job_manager->wait_all_children(history => 0);
}
1;
}

Expand Down
1 change: 1 addition & 0 deletions lib/Bio/PanGenome/JobRunner/Role.pm
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ has 'job_runner' => ( is => 'rw', isa => 'Str', default => 'Local
has '_job_runner_class' => ( is => 'ro', isa => 'Str', lazy => 1, builder => '_build__job_runner_class' );
has '_memory_required_in_mb' => ( is => 'rw', isa => 'Int', default => '200' );
has '_queue' => ( is => 'rw', isa => 'Str', default => 'normal' );
has 'dont_wait' => ( is => 'rw', isa => 'Bool', default => 0 );

sub _build__job_runner_class {
my ($self) = @_;
Expand Down
Loading

0 comments on commit 1fc9eda

Please sign in to comment.