-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathslurm_scheduler.pl
executable file
·187 lines (149 loc) · 7.85 KB
/
slurm_scheduler.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
#!/usr/bin/env perl
## Julien Roux, Jan 11, 2016
# This script launches the slurm jobs
# The script rna_seq_mapping_and_analysis.pl is launched for each library
# It is inspired from https://svn.vital-it.ch/svn/Selectome/trunk/scripts/pipeline/bsub_scheduler.pl
# Perl core modules
use strict;
use warnings;
use diagnostics;
use File::Path qw(make_path);
use File::Slurp;
use FindBin qw( $RealBin ); # directory where the script is lying
use Getopt::Long;
# Define arguments & their default value.
# Every option starts as the empty string, so "not provided" can be tested with eq ''.
my ($sample_info_file, $exclude_sample_file, $output_log_folder, $index_folder, $fastq_folder, $partition, $account, $kallisto_out_folder, $enc_passwd_file, $cluster_kallisto_cmd, $cluster_R_cmd) = ('') x 11;
# Getopt::Long specification: option name => reference to the variable receiving its value
my %opts = ('sample_info_file=s'     => \$sample_info_file,
            'exclude_sample_file=s'  => \$exclude_sample_file,
            'output_log_folder=s'    => \$output_log_folder,
            'account=s'              => \$account,
            'partition=s'            => \$partition,
            'index_folder=s'         => \$index_folder, # same as GTF folder
            'fastq_folder=s'         => \$fastq_folder,
            'kallisto_out_folder=s'  => \$kallisto_out_folder,
            'enc_passwd_file=s'      => \$enc_passwd_file,
            'cluster_kallisto_cmd=s' => \$cluster_kallisto_cmd,
            'cluster_R_cmd=s'        => \$cluster_R_cmd,
           );
# Check arguments: print the usage and exit with status 1 when parsing fails
# or when one of the options tested below is still empty.
# -exclude_sample_file, -account and -partition are not part of this test.
my $test_options = Getopt::Long::GetOptions(%opts);
if ( !$test_options || $sample_info_file eq '' || $output_log_folder eq '' || $index_folder eq '' || $fastq_folder eq '' || $kallisto_out_folder eq '' || $enc_passwd_file eq '' || $cluster_kallisto_cmd eq '' || $cluster_R_cmd eq ''){
    print "\n\tInvalid or missing argument:
\te.g. $0 -sample_info_file=\$(RNASEQ_SAMPINFO_FILEPATH) -exclude_sample_file=\$(RNASEQ_SAMPEXCLUDED_FILEPATH) -output_log_folder=\$(RNASEQ_CLUSTER_LOG) -index_folder=\$(RNASEQ_CLUSTER_GTF) -fastq_folder=\$(RNASEQ_SENSITIVE_FASTQ) -kallisto_out_folder=\$(RNASEQ_CLUSTER_ALL_RES) -enc_passwd_file=\$(ENCRYPT_PASSWD_FILE) -cluster_kallisto_cmd=\$(CLUSTER_KALLISTO_CMD) -cluster_R_cmd=\$(CLUSTER_R_CMD) -account=<slurm_account> -partition=<slurm_partition>
\t-sample_info_file=s      rna_seq_sample_info.txt
\t-exclude_sample_file=s   rna_seq_sample_excluded.txt
\t-output_log_folder=s     folder for .out and .err files (produced by queuing system), and .Rout files produced by R
\t-index_folder=s          Folder with Kallisto indexes (same as GTF folder)
\t-fastq_folder=s          Folder with Fastq files on big bgee
\t-kallisto_out_folder=s   Folder with Kallisto output and results
\t-enc_passwd_file=s       File with password necessary to decrypt the GTEx data
\t-cluster_kallisto_cmd=s  Command to load kallisto module on cluster
\t-cluster_R_cmd=s         Command to load R module on cluster
\t-account=s               SLURM account used for the submitted jobs and for the squeue queries
\t-partition=s             SLURM partition the jobs are submitted to
\n";
    exit 1;
}
# Tests: the sample info file must exist and must not be empty.
# $! (OS error of the failed file test) is reported; $? would be the status of the
# last child process, which is unrelated to a file check.
die "Invalid or missing [$sample_info_file]: $!\n"            if ( !-e $sample_info_file );
die "Invalid or missing [$sample_info_file]: file is empty\n" if ( -z $sample_info_file );
# Create output folder if not present
make_path($output_log_folder, { verbose => 0, mode => 0775 });
# SLURM job settings ###########################################
# Worker script run once per library; it is looked up in the directory holding this scheduler
my $main_script = "$RealBin/rna_seq_mapping_and_analysis.pl";
## TODO: launch slurm_scheduler.pl from /data/ul/dee/bgee/GIT/pipeline/RNA_Seq/
## Beware that the git pull command should be executed before
## kallisto_out_folder should be on /scratch/temporary. If too slow, consider using /scratch/local/ + cp of the result files to /scratch/temporary/ or /home/bbgee, or /data/ (read-only, but should be fine via scp)
# Resources requested for each job:
#   - CPUs: kallisto is not multithreaded unless bootstraps are used
#   - memory in GB (the historical note says 10GB should be enough; 50GB are requested)
#   - time limit, as passed to the --time directive
my ($nbr_processors, $memory_usage, $time_limit) = (1, 50, '12:00:00');
# Number of simultaneous jobs allowed before the scheduler waits for a free slot
my $job_limit = 120;
# Samples to exclude, if any: ids flagged TRUE in the exclusion file, used as hash keys
my %manually_excluded;
# -exclude_sample_file is not enforced by the argument check, so it may be empty:
# only read the file when a name was actually given (read_file on '' would abort the run).
if ( $exclude_sample_file ne '' ){
    EXCLUSION:
    for my $line ( read_file($exclude_sample_file, chomp=>1) ){
        #libraryId excluded comment annotatorId lastModificationDate
        next EXCLUSION  if ( $line =~ /^#/ );   # header or comment
        next EXCLUSION  if ( $line !~ /\S/ );   # blank line
        my ($sampleId, $to_exclude) = split(/\t/, $line);
        # A line without a second column leaves $to_exclude undefined: treat it as "not excluded"
        next EXCLUSION  if ( !defined $to_exclude || $to_exclude ne 'TRUE' );
        $manually_excluded{$sampleId} = 1;
    }
}
# Read the library infos and submit one SLURM job per library that still has to be processed.
# A library is skipped when it is manually excluded, when its DONE.txt marker is present
# and non-empty, or when a job named after it is already pending or running.
my $count = 0;   # number of jobs accepted by sbatch
JOB:
for my $line ( read_file("$sample_info_file", chomp=>1) ){
    next JOB  if ( $line =~ /^#/ );   # header
    next JOB  if ( $line !~ /\S/ );   # blank line

    my @fields     = split ("\t", $line);
    my $library_id = $fields[0];
    my $taxa_id    = $fields[2];
    # Both values are used to build paths and the job name: refuse lines that lack them
    if ( !defined $library_id || $library_id eq '' || !defined $taxa_id || $taxa_id eq '' ){
        warn "Skipping malformed line in [$sample_info_file]: [$line]\n";
        next JOB;
    }

    # Excluded library
    next JOB  if ( exists $manually_excluded{$library_id} );

    my $library_log_folder = "$output_log_folder/$library_id";

    # Test to not re-run already finished jobs
    if ( -s "$library_log_folder/DONE.txt" ){
        print "\n$library_id not launched because already analyzed\n";
        next JOB;
    }

    # Check running jobs to not resubmit them while running.
    # Jobs are named after the library id (--job-name in the sbatch header), so squeue is asked
    # for that exact name and for the PENDING/RUNNING states only. Grepping the default tabular
    # output is unreliable because squeue truncates the NAME column there.
    # The command is run in list form, so the library id never goes through a shell.
    my @squeue_cmd = ('squeue', '--noheader', '--states=PENDING,RUNNING', "--name=$library_id", '--format=%i');
    push @squeue_cmd, "--user=$ENV{USER}"   if ( defined $ENV{USER} && $ENV{USER} ne '' );
    push @squeue_cmd, "--account=$account"  if ( $account ne '' );
    my $active_job_ids = '';
    if ( open(my $SQUEUE, '-|', @squeue_cmd) ){
        $active_job_ids = join('', <$SQUEUE>);
        close $SQUEUE;
    }
    else {
        # Same outcome as before when squeue cannot be queried: the library is launched
        warn "Cannot run squeue to check [$library_id]: $!\n";
    }
    if ( $active_job_ids =~ /\S/ ){
        print "\n$library_id not launched because it is currently being analyzed (see squeue/sacct)\n";
        next JOB;
    }

    # Let's launch this library!
    print "\nLaunching $library_id ...\n";

    # Create output folder for library
    make_path $library_log_folder, {verbose=>0, mode=>0775};

    # library-specific arguments
    my $output_file = $library_log_folder.'/'.$library_id.'.out';
    my $error_file  = $library_log_folder.'/'.$library_id.'.err';
    # First, remove previous .out and .err files
    unlink $output_file, $error_file;
    my $sbatch_file = $library_log_folder.'/'.$library_id.'.sbatch';

    #NOTE check memory usage with grep 'Maximum resident set size' *.time
    my $script_plus_args = "/software/bin/time -vo $output_log_folder/$library_id/$library_id.time perl $main_script -library_id=$library_id -sample_info_file=$sample_info_file -exclude_sample_file=$exclude_sample_file -index_folder=$index_folder -fastq_folder='$fastq_folder/$taxa_id' -kallisto_out_folder=$kallisto_out_folder -output_log_folder=$output_log_folder -enc_passwd_file=$enc_passwd_file > $output_log_folder/$library_id/$library_id.txt 2>&1";

    # Wait for free places in job queue
    my $running_jobs = check_running_jobs();
    WAIT_FREE_JOB_IN_QUEUE:
    while ( $running_jobs >= $job_limit ){
        print "No more possible slot for the job, waiting and resubmitting\n";
        sleep 30;
        $running_jobs = check_running_jobs();
    }

    # Script can be launched! Construct SLURM sbatch command:
    # module loading commands first, then the analysis command itself
    my $sbatch_command = '';
    $sbatch_command .= "$cluster_kallisto_cmd\n";
    $sbatch_command .= "$cluster_R_cmd\n\n";
    $sbatch_command .= $script_plus_args;
    print "Command submitted to cluster:\n$sbatch_command\n";

    # Create the SBATCH script
    open (my $OUT, '>', $sbatch_file)  or die "Cannot write [$sbatch_file]: $!\n";
    print {$OUT} sbatch_template($partition, $account, $nbr_processors, $memory_usage, $output_file, $error_file, $library_id);
    print {$OUT} "$sbatch_command\n";
    # Buffered write errors only show up at close time
    close $OUT  or die "Cannot close [$sbatch_file]: $!\n";

    # Then, run the job. Only count it when sbatch accepted it, so the final report is accurate.
    if ( system('sbatch', $sbatch_file) == 0 ){
        $count++;
    }
    else {
        print "Failed to submit job [$library_id]\n";
    }
}

print "\n######################################################\nAll done. $count jobs submitted.\n######################################################\n";
exit 0;
# Count the jobs currently listed by squeue for the current user and the
# file-scoped $account.
# Every listed job is counted, whatever its state; lines containing 'JOBID'
# (the squeue header) are filtered out before counting.
# Returns the count printed by wc -l (trailing newline removed), or 0 when the
# pipeline produced nothing usable.
sub check_running_jobs {
    my $job_lines = `squeue --user=\$USER --account=$account | grep -v 'JOBID' | wc -l`;
    $job_lines = 0  unless $job_lines;
    chomp $job_lines;
    return $job_lines;
}
# Build the header of an sbatch script: the bash shebang followed by the #SBATCH directives.
# Arguments, in order:
#   $partition       SLURM partition; the directive is left out when empty
#   $account         SLURM account; the directive is left out when empty
#   $nbr_processors  value for --cpus-per-task
#   $memory_usage    memory in GB (the 'G' suffix is appended here)
#   $output_file     file receiving the job standard output
#   $error_file      file receiving the job standard error
#   $library_id      used as job name
#   $max_run_time    optional; value for --time, defaults to the file-scoped $time_limit
# Returns the header as a single string ending with a newline.
sub sbatch_template {
    my ($partition, $account, $nbr_processors, $memory_usage, $output_file, $error_file, $library_id, $max_run_time) = @_;

    # Existing callers pass seven arguments: keep using the script-wide limit for them
    $max_run_time = $time_limit  if ( !defined $max_run_time || $max_run_time eq '' );

    my @directives = ('#!/bin/bash');
    # -partition and -account are optional on the command line. Writing "--partition=" or
    # "--account=" with no value is not a usable directive, so they are omitted and the
    # cluster defaults apply.
    push @directives, "#SBATCH --partition=$partition"  if ( defined $partition && $partition ne '' );
    push @directives, "#SBATCH --account=$account"      if ( defined $account   && $account   ne '' );
    push @directives,
        '#SBATCH --nodes=1',
        '#SBATCH --ntasks=1',
        "#SBATCH --cpus-per-task=$nbr_processors",
        "#SBATCH --mem=${memory_usage}G",
        "#SBATCH --time=$max_run_time",
        "#SBATCH --output=$output_file",
        "#SBATCH --error=$error_file",
        '#SBATCH --export=NONE',
        "#SBATCH --job-name=$library_id";

    return join("\n", @directives)."\n";
}