-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathinsert_rna_seq_expression.pl
executable file
·218 lines (185 loc) · 11.3 KB
/
insert_rna_seq_expression.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
#!/usr/bin/env perl
# Perl core modules
use strict;
use warnings;
use diagnostics;
use Parallel::ForkManager;
# Frederic Bastian, created November 2012, last update Dec. 2016
# USAGE: perl insert_rna_seq_expression.pl -bgee=connection_string <OPTIONAL: -debug>
# After the insertion of bulk RNA-Seq data, this script inserts data
# into the expression table and update the rnaSeqLibraryAnnotatedSampleGeneResult table.
# -debug: if provided, run in verbose mode (print the update/insert SQL queries, not executing them)
#TODO: for Bgee 16 we have to homogeneise how expression IDs are created among RNASeq datatypes and
# refactor the code to run that step only once for all RNASeq (bulk, full-length, droplet)
#############################################################
use Getopt::Long;
use FindBin;
use lib "$FindBin::Bin/../.."; # Get lib path for Utils.pm
use Utils;
# Define arguments & their default value
my $bgee_connector = '';
my $debug = 0;
my $number_threads = 0;
my %opts = ('bgee=s' => \$bgee_connector, # Bgee connector string
'number_threads=s' => \$number_threads, # number of threads defining the number of parallel updates in the database
'debug' => \$debug,
);
# Check arguments
my $test_options = Getopt::Long::GetOptions(%opts);
if ( !$test_options || $bgee_connector eq '' ){
print "\n\tInvalid or missing argument:
\te.g. $0 -bgee=\$(BGEECMD)
\t-bgee Bgee connector string
\t-number_threads Number of threads used to insert expression
\t-debug printing the update/insert SQL queries, not executing them
\n";
exit 1;
}
# Bgee db connection
my $bgee = Utils::connect_bgee_db($bgee_connector);
$| = 1;
##########################################
# GET CONDITIONS STUDIED WITH BULK RNA-SEQ #
##########################################
print "Retrieving conditions...\n";
# retrieve conditions only for bulk RNASeq for which at least one library does not have any expressionId
my $queryConditions = $bgee->prepare('SELECT DISTINCT t2.exprMappedConditionId'.
' FROM rnaSeqLibraryAnnotatedSample AS t1'.
' INNER JOIN cond AS t2 ON t1.conditionId = t2.conditionId'.
' INNER JOIN rnaSeqLibrary AS t3 ON t3.rnaSeqLibraryId = t1.rnaSeqLibraryId'.
' WHERE t3.rnaSeqTechnologyIsSingleCell = 0'.
' AND NOT EXISTS (SELECT 1 FROM rnaSeqLibraryAnnotatedSample AS t4'.
' INNER JOIN rnaSeqLibraryAnnotatedSampleGeneResult AS t5'.
' ON t5.rnaSeqLibraryAnnotatedSampleId = t4.rnaSeqLibraryAnnotatedSampleId'.
' WHERE t1.rnaSeqLibraryId = t4.rnaSeqLibraryId'.
' AND t5.expressionId IS NOT NULL)');
$queryConditions->execute() or die $queryConditions->errstr;
my @exprMappedConditions = ();
while ( my @data = $queryConditions->fetchrow_array ){
push(@exprMappedConditions, $data[0]);
}
$queryConditions->finish;
print 'Done, ', scalar(@exprMappedConditions), " conditions retrieved.\n";
# Absent calls are not considered for all biotypes depending on the protocol used to generate a library.
# biotypes not considered for absent calls depending on the protocols are described in the table rnaSeqProtocolToBiotypeExcludedAbsentCalls.
# This query update reason of exclusion for such calls if they are not already exluded for prefiltering (gene never present)
my $updExclResult = $bgee->prepare( 'UPDATE rnaSeqLibraryAnnotatedSampleGeneResult AS t1'.
' INNER JOIN rnaSeqLibraryAnnotatedSample AS t2 ON t1.rnaSeqLibraryAnnotatedSampleId = t2.rnaSeqLibraryAnnotatedSampleId'.
' INNER JOIN rnaSeqLibrary AS t5 ON t2.rnaSeqLibraryId = t5.rnaSeqLibraryId'.
' INNER JOIN gene as t3 on t1.bgeeGeneId = t3.bgeeGeneId'.
' INNER JOIN rnaSeqPopulationCaptureToBiotypeExcludedAbsentCalls AS t4'.
' ON t5.rnaSeqPopulationCaptureId = t4.rnaSeqPopulationCaptureId'.
' AND t3.geneBioTypeId = t4.geneBioTypeId'.
' SET t1.reasonForExclusion = "'.$Utils::EXCLUDED_FOR_ABSENT_CALLS.'"'.
' WHERE t1.pValue > 0.05'.
' AND t1.expressionId IS NULL'.
# condition to remove once the script is used for all RNASeq datatypes
' AND t5.rnaSeqTechnologyIsSingleCell = 0 '.
#TODO: have to keep this condition as we do not generate calls for Bgee 15.2 and
# some genes were excluded for prefiltering for Bgee 15.0. No genes prefiltered anymore
# so this condition can be removed for the next major release.
' AND t1.reasonForExclusion !="'.$Utils::EXCLUDED_FOR_PRE_FILTERED.'"'.
#for single cell all absent calls have already been inserted with a reason for exclusion
# equal to "absent calls not reliatble". This filtering avoid updating again those data
' AND t1.reasonForExclusion !="'.$Utils::EXCLUDED_FOR_ABSENT_CALLS.'"');
if ( $debug ){
print 'UPDATE rnaSeqLibraryAnnotatedSampleGeneResult AS t1'.
' INNER JOIN rnaSeqLibraryAnnotatedSample AS t2 ON t1.rnaSeqLibraryAnnotatedSampleId = t2.rnaSeqLibraryAnnotatedSampleId'.
' INNER JOIN rnaSeqLibrary AS t5 ON t2.rnaSeqLibraryId = t5.rnaSeqLibraryId'.
' INNER JOIN gene as t3 on t1.bgeeGeneId = t3.bgeeGeneId'.
' INNER JOIN rnaSeqPopulationCaptureToBiotypeExcludedAbsentCalls AS t4'.
' ON t5.rnaSeqPopulationCaptureId = t4.rnaSeqPopulationCaptureId'.
' AND t3.geneBioTypeId = t4.geneBioTypeId'.
' SET t1.reasonForExclusion = "'.$Utils::EXCLUDED_FOR_ABSENT_CALLS.'"'.
' WHERE t1.pValue > 0.05'.
' AND t1.expressionId IS NULL'.
' AND t5.rnaSeqTechnologyIsSingleCell = 0 '.
' AND t1.reasonForExclusion !="'.$Utils::EXCLUDED_FOR_PRE_FILTERED.'"'.
' AND t1.reasonForExclusion !="'.$Utils::EXCLUDED_FOR_ABSENT_CALLS.'"';
} else {
$updExclResult->execute() or die $updExclResult->errstr;
}
$updExclResult->finish;
$bgee->disconnect;
print "Done excluding absent calls based on biotype\n";
##########################################
# PREPARE QUERIES #
##########################################
# Insert/update expression
my $insUpExprQuery = 'INSERT INTO expression (bgeeGeneId, conditionId) VALUES (?, ?) '.
'ON DUPLICATE KEY UPDATE expressionId=LAST_INSERT_ID(expressionId)';
# Query to update rnaSeqResult with the expressionId and reasonForExclusion for present calls
my $updResultQuery = 'UPDATE rnaSeqLibraryAnnotatedSampleGeneResult AS t1 '.
' INNER JOIN rnaSeqLibraryAnnotatedSample AS t2'.
' ON t1.rnaSeqLibraryAnnotatedSampleId = t2.rnaSeqLibraryAnnotatedSampleId'.
' INNER JOIN cond AS t3 ON t2.conditionId = t3.conditionId'.
' SET expressionId = ?'.
' WHERE bgeeGeneId = ? and t3.exprMappedConditionId = ?'.
' AND t1.reasonForExclusion = "'.$Utils::CALL_NOT_EXCLUDED.'"'.
' AND t1.expressionId IS NULL';
# query to get all the RNA-Seq results for a condition
my $queryResultsQuery = 'SELECT distinct t1.bgeeGeneId FROM rnaSeqLibraryAnnotatedSampleGeneResult AS t1'.
' INNER JOIN rnaSeqLibraryAnnotatedSample AS t2 ON t1.rnaSeqLibraryAnnotatedSampleId = t2.rnaSeqLibraryAnnotatedSampleId'.
' INNER JOIN cond AS t3 ON t2.conditionId = t3.conditionId'.
' WHERE t3.exprMappedConditionId = ? AND t1.reasonForExclusion = "'.$Utils::CALL_NOT_EXCLUDED.'"';
##########################################
# ITERATING CONDITIONS TO INSERT DATA #
##########################################
my $pm = new Parallel::ForkManager($number_threads);
print "Processing conditions...\n";
for my $exprMappedConditionId ( @exprMappedConditions ){
#start parallelization
my $pid = $pm->start and next;
# start thread specific connection to the database
my $bgee_thread = Utils::connect_bgee_db($bgee_connector);
# prepare queries
my $queryResults = $bgee_thread->prepare($queryResultsQuery);
my $updResult = $bgee_thread->prepare($updResultQuery);
my $insUpExpr = $bgee_thread->prepare($insUpExprQuery);
print "\tconditionId: $exprMappedConditionId\n";
# retrieve genes results for this condition
print "\t\tRetrieving genes results...\n";
$queryResults->execute($exprMappedConditionId) or die $queryResults->errstr;
my %results = ();
while ( my @data = $queryResults->fetchrow_array ){
# $data[0] = bgeeGeneId
$results{$data[0]} = 1;
}
print "\t\tDone, ", scalar(keys %results), ' genes retrieved. ',
"Generating expression summary...\n";
# now iterating the genes to insert expression data
# (one row for a gene-condition)
for my $geneId ( keys %results ){
my $expressionId = undef;
# insert or update the expression table if not only undefined calls
if ( $debug ){
print "INSERT INTO expression (bgeeGeneId, conditionId) VALUES ($geneId, $exprMappedConditionId)...\n";
} else {
$insUpExpr->execute($geneId, $exprMappedConditionId) or die $insUpExpr->errstr;
$expressionId = $bgee_thread->{'mysql_insertid'};
}
# Now update the related rnaSeqResults
if ( $debug ){
my $printExprId = 'notRetrievedForLog';
print "UPDATE rnaSeqLibraryAnnotatedSampleGeneResult AS t1 ",
"INNER JOIN rnaSeqLibraryAnnotatedSample AS t2 ",
"ON t1.rnaSeqLibraryAnnotatedSampleId = t2.rnaSeqLibraryAnnotatedSampleId ",
"INNER JOIN cond AS t3 ON t2.conditionId = t3.conditionId ",
"SET expressionId = $printExprId ",
"WHERE bgeeGeneId = $geneId AND t3.exprMappedConditionId = $exprMappedConditionId ",
"AND t1.reasonForExclusion = \"".$Utils::CALL_NOT_EXCLUDED."\"".
"AND t1.expressionId IS NULL\n";
} else {
$updResult->execute($expressionId, $geneId, $exprMappedConditionId)
or die $updResult->errstr;
}
}
$insUpExpr->finish;
$updResult->finish;
$queryResults->finish;
$bgee_thread->disconnect;
$pm->finish;
}
$pm->wait_all_children;
print "Done inserting expression IDs\n" if ( $debug );
exit 0;