From 0924fa412316c86bf4449c32df6914779393a67a Mon Sep 17 00:00:00 2001 From: andrewjpage Date: Wed, 2 Oct 2013 11:44:06 +0100 Subject: [PATCH] run post analysis as a job --- bin/pan_genome_post_analysis | 19 ++++ lib/Bio/PanGenome.pm | 84 ++++---------- .../CommandLine/PanGenomePostAnalysis.pm | 107 ++++++++++++++++++ lib/Bio/PanGenome/External/PostAnalysis.pm | 89 +++++++++++++++ lib/Bio/PanGenome/JobRunner/LSF.pm | 6 +- lib/Bio/PanGenome/JobRunner/Role.pm | 1 + lib/Bio/PanGenome/PostAnalysis.pm | 101 +++++++++++++++++ t/Bio/PanGenome/CommandLine/CreatePanGenome.t | 7 +- 8 files changed, 345 insertions(+), 69 deletions(-) create mode 100755 bin/pan_genome_post_analysis create mode 100644 lib/Bio/PanGenome/CommandLine/PanGenomePostAnalysis.pm create mode 100644 lib/Bio/PanGenome/External/PostAnalysis.pm create mode 100644 lib/Bio/PanGenome/PostAnalysis.pm diff --git a/bin/pan_genome_post_analysis b/bin/pan_genome_post_analysis new file mode 100755 index 0000000..e92509c --- /dev/null +++ b/bin/pan_genome_post_analysis @@ -0,0 +1,19 @@ +#!/usr/bin/env perl + +package Bio::PanGenome::Main::PanGenomePostAnalysis; + +# ABSTRACT: Perform the post analysis on the pan genome +# PODNAME: pan_genome_post_analysis + +=head1 SYNOPSIS + +Perform the post analysis on the pan genome + +=cut + +BEGIN { unshift( @INC, '../lib' ) } +BEGIN { unshift( @INC, './lib' ) } +BEGIN { unshift( @INC, '/software/pathogen/internal/prod/lib/' ) } +use Bio::PanGenome::CommandLine::PanGenomePostAnalysis; + +Bio::PanGenome::CommandLine::PanGenomePostAnalysis->new(args => \@ARGV, script_name => $0)->run; diff --git a/lib/Bio/PanGenome.pm b/lib/Bio/PanGenome.pm index f594a47..0874487 100644 --- a/lib/Bio/PanGenome.pm +++ b/lib/Bio/PanGenome.pm @@ -20,6 +20,7 @@ use Bio::PanGenome::AnnotateGroups; use Bio::PanGenome::Output::OneGenePerGroupFasta; use Bio::PanGenome::GroupStatistics; use Bio::PanGenome::Output::GroupsMultifastasNucleotide; +use Bio::PanGenome::External::PostAnalysis; has 'fasta_files' 
=> ( is => 'rw', isa => 'ArrayRef', required => 1 ); has 'input_files' => ( is => 'rw', isa => 'ArrayRef', required => 1 ); @@ -32,21 +33,19 @@ has 'blastp_exec' => ( is => 'rw', isa => 'Str', default = has 'mcxdeblast_exec' => ( is => 'ro', isa => 'Str', default => 'mcxdeblast' ); has 'mcl_exec' => ( is => 'ro', isa => 'Str', default => 'mcl' ); -has 'output_multifasta_files' => ( is => 'ro', isa => 'Bool', default => 0 ); +has 'output_multifasta_files' => ( is => 'ro', isa => 'Bool', default => 0 ); sub run { my ($self) = @_; - my $output_combined_filename = '_combined_files'; - my $output_cd_hit_filename = '_clustered'; - my $output_blast_results_filename = '_blast_results'; - my $output_mcl_filename = '_uninflated_mcl_groups'; - my $output_inflate_clusters_filename = '_inflated_mcl_groups'; - my $output_group_labels_filename = '_labeled_mcl_groups'; + my $output_combined_filename = '_combined_files'; + my $output_cd_hit_filename = '_clustered'; + my $output_blast_results_filename = '_blast_results'; + my $output_mcl_filename = '_uninflated_mcl_groups'; my $combine_fasta_files = Bio::PanGenome::CombinedProteome->new( - proteome_files => $self->fasta_files, - output_filename => $output_combined_filename, + proteome_files => $self->fasta_files, + output_filename => $output_combined_filename, ); $combine_fasta_files->create_combined_proteome_file; @@ -75,64 +74,21 @@ sub run { ); $mcl->run(); - my $inflate_clusters = Bio::PanGenome::InflateClusters->new( - clusters_filename => $cdhit_obj->clusters_filename, - mcl_filename => $output_mcl_filename, - output_file => $output_inflate_clusters_filename - ); - $inflate_clusters->inflate(); - - my $group_labels = Bio::PanGenome::GroupLabels->new( - groups_filename => $output_inflate_clusters_filename, - output_filename => $output_group_labels_filename - ); - $group_labels->add_labels(); - - my $analyse_groups_obj = Bio::PanGenome::AnalyseGroups->new( - fasta_files => $self->fasta_files, - groups_filename => 
$output_group_labels_filename - ); - $analyse_groups_obj->create_plots(); - - my $annotate_groups = Bio::PanGenome::AnnotateGroups->new( - gff_files => $self->input_files, - output_filename => $self->output_filename, - groups_filename => $output_group_labels_filename, - ); - $annotate_groups->reannotate; - - my $group_statistics = Bio::PanGenome::GroupStatistics->new( - output_filename => $self->output_statistics_filename, - annotate_groups_obj => $annotate_groups, - analyse_groups_obj => $analyse_groups_obj - ); - $group_statistics->create_spreadsheet; + unlink($output_blast_results_filename); + unlink($output_combined_filename); - my $one_gene_per_fasta = Bio::PanGenome::Output::OneGenePerGroupFasta->new( - analyse_groups => $analyse_groups_obj, - output_filename => $self->output_pan_geneome_filename + my $post_analysis = Bio::PanGenome::External::PostAnalysis->new( + job_runner => $self->job_runner, + fasta_files => $self->fasta_files, + input_files => $self->input_files, + output_filename => $self->output_filename, + output_pan_geneome_filename => $self->output_pan_geneome_filename, + output_statistics_filename => $self->output_statistics_filename, + clusters_filename => $cdhit_obj->clusters_filename, + dont_wait => 1, ); - $one_gene_per_fasta->create_file(); - - if($self->output_multifasta_files) - { - my $group_multifastas_nucleotides = Bio::PanGenome::Output::GroupsMultifastasNucleotide->new( - gff_files => $self->input_files, - analyse_groups => $analyse_groups_obj, - annotate_groups => $annotate_groups, - group_names => $analyse_groups_obj->_groups - ); - $group_multifastas_nucleotides->create_files(); - } + $post_analysis->run(); - unlink($output_blast_results_filename); - unlink($output_combined_filename); - unlink($output_cd_hit_filename); - unlink($output_mcl_filename); - unlink($output_inflate_clusters_filename); - unlink($output_group_labels_filename); - unlink( $output_cd_hit_filename . '.clstr' ); - unlink( $output_cd_hit_filename . 
'.bak.clstr' ); } no Moose; diff --git a/lib/Bio/PanGenome/CommandLine/PanGenomePostAnalysis.pm b/lib/Bio/PanGenome/CommandLine/PanGenomePostAnalysis.pm new file mode 100644 index 0000000..3dd9448 --- /dev/null +++ b/lib/Bio/PanGenome/CommandLine/PanGenomePostAnalysis.pm @@ -0,0 +1,107 @@ +package Bio::PanGenome::CommandLine::PanGenomePostAnalysis; + +# ABSTRACT: Perform the post analysis on the pan genome + +=head1 SYNOPSIS + +Perform the post analysis on the pan genome + +=cut + +use Moose; +use Getopt::Long qw(GetOptionsFromArray); +use Bio::PanGenome::PostAnalysis; + + +has 'args' => ( is => 'ro', isa => 'ArrayRef', required => 1 ); +has 'script_name' => ( is => 'ro', isa => 'Str', required => 1 ); +has 'help' => ( is => 'rw', isa => 'Bool', default => 0 ); +has '_error_message' => ( is => 'rw', isa => 'Str' ); + +has 'fasta_files' => ( is => 'rw', isa => 'ArrayRef' ); +has 'input_files' => ( is => 'rw', isa => 'ArrayRef'); +has 'output_filename' => ( is => 'rw', isa => 'Str', default => 'clustered_proteins' ); +has 'output_pan_geneome_filename' => ( is => 'rw', isa => 'Str', default => 'pan_genome.fa' ); +has 'output_statistics_filename' => ( is => 'rw', isa => 'Str', default => 'group_statisics.csv' ); +has 'output_multifasta_files' => ( is => 'rw', isa => 'Bool', default => 0 ); +has 'clusters_filename' => ( is => 'rw', isa => 'Str' ); +has 'job_runner' => ( is => 'rw', isa => 'Str', default => 'LSF' ); + +sub BUILD { + my ($self) = @_; + + my ( $output_filename, $output_pan_geneome_filename, $job_runner, $output_statistics_filename, $output_multifasta_files, $clusters_filename, $fasta_files, $input_files, $help ); + + GetOptionsFromArray( + $self->args, + 'o|output=s' => \$output_filename, + 'j|job_runner=s' => \$job_runner, + 'output_multifasta_files' => \$output_multifasta_files, + 'p=s' => \$output_pan_geneome_filename, + 's=s' => \$output_statistics_filename, + 'c=s' => \$clusters_filename, + 'f=s@' => \$fasta_files, + 'i=s@' => \$input_files, + 
'h|help' => \$help, + ); + + $self->job_runner($job_runner) if ( defined($job_runner) ); + $self->fasta_files($fasta_files) if (defined($fasta_files)); + $self->input_files($input_files) if (defined($input_files)); + $self->output_filename($output_filename) if (defined($output_filename)); + $self->output_pan_geneome_filename($output_pan_geneome_filename) if (defined($output_pan_geneome_filename)); + $self->output_statistics_filename($output_statistics_filename) if (defined($output_statistics_filename)); + $self->output_multifasta_files($output_multifasta_files) if (defined($output_multifasta_files)); + $self->clusters_filename($clusters_filename) if (defined($clusters_filename)); + +} + +sub run { + my ($self) = @_; + + ( !$self->help ) or die $self->usage_text; + if ( defined( $self->_error_message ) ) { + print $self->_error_message . "\n"; + die $self->usage_text; + } + + my $obj = Bio::PanGenome::PostAnalysis->new( + fasta_files => $self->fasta_files , + input_files => $self->input_files , + output_filename => $self->output_filename , + output_pan_geneome_filename => $self->output_pan_geneome_filename, + output_statistics_filename => $self->output_statistics_filename , + output_multifasta_files => $self->output_multifasta_files , + clusters_filename => $self->clusters_filename , + ); + $obj->run(); +} + +sub usage_text { + my ($self) = @_; + + return <meta->make_immutable; +no Moose; +1; diff --git a/lib/Bio/PanGenome/External/PostAnalysis.pm b/lib/Bio/PanGenome/External/PostAnalysis.pm new file mode 100644 index 0000000..3a78294 --- /dev/null +++ b/lib/Bio/PanGenome/External/PostAnalysis.pm @@ -0,0 +1,89 @@ +package Bio::PanGenome::External::PostAnalysis; + +# ABSTRACT: Perform the post analysis + +=head1 SYNOPSIS + +Perform the post analysis + + use Bio::PanGenome::External::PostAnalysis; + + my $seg= Bio::PanGenome::External::PostAnalysis->new( + fasta_file => 'contigs.fa', + ); + + $seg->run(); + +=cut + +use Moose; +with 'Bio::PanGenome::JobRunner::Role'; 
+ +has 'input_files' => ( is => 'ro', isa => 'ArrayRef', required => 1 ); +has 'exec' => ( is => 'ro', isa => 'Str', default => 'pan_genome_post_analysis' ); +has 'fasta_files' => ( is => 'ro', isa => 'ArrayRef', required => 1 ); +has 'output_filename' => ( is => 'ro', isa => 'Str', required => 1 ); +has 'output_pan_geneome_filename' => ( is => 'ro', isa => 'Str', required => 1 ); +has 'output_statistics_filename' => ( is => 'ro', isa => 'Str', required => 1 ); +has 'clusters_filename' => ( is => 'ro', isa => 'Str', required => 1 ); + +# Overload Role +has '_memory_required_in_mb' => ( is => 'ro', isa => 'Int', lazy => 1, builder => '_build__memory_required_in_mb' ); +has '_minimum_memory_mb' => ( is => 'ro', isa => 'Int', default => 1000 ); +has '_memory_per_sample_mb' => ( is => 'ro', isa => 'Int', default => 100 ); + +sub _build__memory_required_in_mb { + my ($self) = @_; + my $num_samples = @{ $self->input_files }; + + my $memory_required = $num_samples * $self->_memory_per_sample_mb; + if ( $memory_required < $self->_minimum_memory_mb ) { + $memory_required = $self->_minimum_memory_mb; + } + + return $memory_required; +} + +sub _command_to_run { + my ($self) = @_; + + my $fasta_files_param = join(' -f ',@{$self->fasta_files}); + $fasta_files_param = ' -f '.$fasta_files_param; + + my $input_files_param = join(' -i ',@{$self->input_files}); + $input_files_param = ' -i '.$input_files_param; + + return join( + " ", + ( + $self->exec, + '-o', $self->output_filename, + '-p', $self->output_pan_geneome_filename, + '-s', $self->output_statistics_filename, + '-c', $self->clusters_filename, + '--output_multifasta_files', + $fasta_files_param, + $input_files_param + ) + ); +} + +sub run { + my ($self) = @_; + my @commands_to_run; + push( @commands_to_run, $self->_command_to_run ); + + my $job_runner_obj = $self->_job_runner_class->new( + commands_to_run => \@commands_to_run, + memory_in_mb => $self->_memory_required_in_mb, + queue => $self->_queue, + dont_wait => 
$self->dont_wait, + ); + $job_runner_obj->run(); + + 1; +} + +no Moose; +__PACKAGE__->meta->make_immutable; +1; diff --git a/lib/Bio/PanGenome/JobRunner/LSF.pm b/lib/Bio/PanGenome/JobRunner/LSF.pm index 22056cc..3109a30 100644 --- a/lib/Bio/PanGenome/JobRunner/LSF.pm +++ b/lib/Bio/PanGenome/JobRunner/LSF.pm @@ -51,7 +51,11 @@ sub run { for my $command_to_run ( @{ $self->commands_to_run } ) { $self->_submit_job($command_to_run); } - $self->_job_manager->wait_all_children(history => 0); + + if(!(defined($self->dont_wait) && $self->dont_wait == 1 )) + { + $self->_job_manager->wait_all_children(history => 0); + } 1; } diff --git a/lib/Bio/PanGenome/JobRunner/Role.pm b/lib/Bio/PanGenome/JobRunner/Role.pm index 3272c55..ca6e1fd 100644 --- a/lib/Bio/PanGenome/JobRunner/Role.pm +++ b/lib/Bio/PanGenome/JobRunner/Role.pm @@ -15,6 +15,7 @@ has 'job_runner' => ( is => 'rw', isa => 'Str', default => 'Local has '_job_runner_class' => ( is => 'ro', isa => 'Str', lazy => 1, builder => '_build__job_runner_class' ); has '_memory_required_in_mb' => ( is => 'rw', isa => 'Int', default => '200' ); has '_queue' => ( is => 'rw', isa => 'Str', default => 'normal' ); +has 'dont_wait' => ( is => 'rw', isa => 'Bool', default => 0 ); sub _build__job_runner_class { my ($self) = @_; diff --git a/lib/Bio/PanGenome/PostAnalysis.pm b/lib/Bio/PanGenome/PostAnalysis.pm new file mode 100644 index 0000000..adefaab --- /dev/null +++ b/lib/Bio/PanGenome/PostAnalysis.pm @@ -0,0 +1,101 @@ +package Bio::PanGenome::PostAnalysis; + +# ABSTRACT: Post analysis of pan genomes + +=head1 SYNOPSIS + +Perform the post analysis on a pan genome (inflate clusters, label and annotate groups, write statistics and FASTA output) + +=cut + +use Moose; +use Bio::PanGenome::InflateClusters; +use Bio::PanGenome::AnalyseGroups; +use Bio::PanGenome::GroupLabels; +use Bio::PanGenome::AnnotateGroups; +use Bio::PanGenome::Output::OneGenePerGroupFasta; +use Bio::PanGenome::GroupStatistics; +use Bio::PanGenome::Output::GroupsMultifastasNucleotide; + +has 'fasta_files' => ( is => 'rw', isa => 'ArrayRef', required => 1 ); +has 
'input_files' => ( is => 'rw', isa => 'ArrayRef', required => 1 ); +has 'output_filename' => ( is => 'rw', isa => 'Str', default => 'clustered_proteins' ); +has 'output_pan_geneome_filename' => ( is => 'rw', isa => 'Str', default => 'pan_genome.fa' ); +has 'output_statistics_filename' => ( is => 'rw', isa => 'Str', default => 'group_statisics.csv' ); +has 'output_multifasta_files' => ( is => 'ro', isa => 'Bool', default => 0 ); + +has 'clusters_filename' => ( is => 'rw', isa => 'Str', required => 1 ); + +sub run { + my ($self) = @_; + + my $output_mcl_filename = '_uninflated_mcl_groups'; + my $output_inflate_clusters_filename = '_inflated_mcl_groups'; + my $output_group_labels_filename = '_labeled_mcl_groups'; + + my $inflate_clusters = Bio::PanGenome::InflateClusters->new( + clusters_filename => $self->clusters_filename, + mcl_filename => $output_mcl_filename, + output_file => $output_inflate_clusters_filename + ); + $inflate_clusters->inflate(); + + my $group_labels = Bio::PanGenome::GroupLabels->new( + groups_filename => $output_inflate_clusters_filename, + output_filename => $output_group_labels_filename + ); + $group_labels->add_labels(); + + my $analyse_groups_obj = Bio::PanGenome::AnalyseGroups->new( + fasta_files => $self->fasta_files, + groups_filename => $output_group_labels_filename + ); + $analyse_groups_obj->create_plots(); + + + my $one_gene_per_fasta = Bio::PanGenome::Output::OneGenePerGroupFasta->new( + analyse_groups => $analyse_groups_obj, + output_filename => $self->output_pan_geneome_filename + ); + $one_gene_per_fasta->create_file(); + + + my $annotate_groups = Bio::PanGenome::AnnotateGroups->new( + gff_files => $self->input_files, + output_filename => $self->output_filename, + groups_filename => $output_group_labels_filename, + ); + $annotate_groups->reannotate; + + + my $group_statistics = Bio::PanGenome::GroupStatistics->new( + output_filename => $self->output_statistics_filename, + annotate_groups_obj => $annotate_groups, + 
analyse_groups_obj => $analyse_groups_obj + ); + $group_statistics->create_spreadsheet; + + if($self->output_multifasta_files) + { + my $group_multifastas_nucleotides = Bio::PanGenome::Output::GroupsMultifastasNucleotide->new( + gff_files => $self->input_files, + analyse_groups => $analyse_groups_obj, + annotate_groups => $annotate_groups, + group_names => $analyse_groups_obj->_groups + ); + $group_multifastas_nucleotides->create_files(); + } + + unlink($output_mcl_filename); + unlink($output_inflate_clusters_filename); + unlink($output_group_labels_filename); + unlink( $self->clusters_filename); + unlink( $self->clusters_filename . '.clstr' ); + unlink( $self->clusters_filename . '.bak.clstr' ); + +} + +no Moose; +__PACKAGE__->meta->make_immutable; + +1; diff --git a/t/Bio/PanGenome/CommandLine/CreatePanGenome.t b/t/Bio/PanGenome/CommandLine/CreatePanGenome.t index 8145f7a..ac9a35e 100644 --- a/t/Bio/PanGenome/CommandLine/CreatePanGenome.t +++ b/t/Bio/PanGenome/CommandLine/CreatePanGenome.t @@ -16,12 +16,11 @@ BEGIN { my $script_name = 'Bio::PanGenome::CommandLine::CreatePanGenome'; my $cwd = getcwd(); +local $ENV{PATH} = "$ENV{PATH}:./bin"; my %scripts_and_expected_files = ( - ' -j Local t/data/example_1.faa t/data/example_2.faa t/data/example_3.faa ' => - [ 'clustered_proteins', 't/data/expected_clustered_proteins' ], - ' -j Local t/data/example_1.faa t/data/example_2.faa t/data/example_3.faa ' => - [ 'pan_genome.fa', 't/data/expected_create_pan_genome.fa' ], + ' -j Local t/data/query_1.gff t/data/query_2.gff t/data/query_3.gff ' => + [ 'pan_genome_sequences/3-group_9.fa', 't/data/3-group_9.fa' ], ' -j Local --output_multifasta_files t/data/query_1.gff t/data/query_2.gff t/data/query_3.gff ' => [ 'pan_genome_sequences/3-group_9.fa', 't/data/3-group_9.fa' ],