-
Notifications
You must be signed in to change notification settings - Fork 3
/
vmat_utils.R
313 lines (298 loc) · 9.58 KB
/
vmat_utils.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
#' A function to compute Vplot matrix
#'
#' This function computes the underlying matrix shown as a heatmap
#' in Vplots. For each pair of coordinates (x: distance from fragment
#' midpoint to center of GRanges of interest; y: fragment size), the
#' function computes how many fragments there are.
#'
#' @param bam_granges GRanges, paired-end fragments
#' @param granges GRanges, regions to map the fragments onto
#' @param cores Integer, nb of threads to parallelize fragments subsetting
#' @param xlims The x limits of the computed Vmat
#' @param ylims The y limits of the computed Vmat
#' @return A table object
#'
#' @import S4Vectors
#' @import GenomicRanges
#' @import IRanges
#' @export
#'
#' @examples
#' data(bam_test)
#' data(ce11_all_REs)
#' Vmat <- computeVmat(bam_test, ce11_all_REs)
#' dim(Vmat)
#' Vmat[seq(1,5), seq(1,10)]
computeVmat <- function(
bam_granges,
granges,
cores = 1,
xlims = c(-250, 250),
ylims = c(50, 300)
)
{
# Fragments
frags <- bam_granges
# Subset frags whose center overlap with extended targets
targets_extended <- GenomicRanges::resize(
granges, fix = 'center', width = (xlims[2] - xlims[1])
)
breaks <- seq(1, length(frags), length.out = cores + 1)
frags_l <- parallel::mclapply(
mc.cores = cores,
seq_len(cores),
function(K) {
g <- IRanges::subsetByOverlaps(
frags[breaks[K]:breaks[K+1]],
targets_extended,
ignore.strand = TRUE
)
return(g)
}
)
frag_ov <- unlist(GRangesList(frags_l))
# Subset frags whose size is within ylims
frags_subset <- frag_ov[(IRanges::width(frag_ov) >= ylims[1]) &
(IRanges::width(frag_ov) <= ylims[2])]
# Fast resize many GRanges
frags_subset_centers <- GenomicRanges::GRanges(
GenomicRanges::seqnames(frags_subset),
IRanges::IRanges(
GenomicRanges::start(frags_subset) +
round(width(frags_subset)/2) - 1,
width = 1
)
)
# Split promoters into forward and reverse
targets <- deconvolveBidirectionalPromoters(granges)
targets_plus <- targets[GenomicRanges::strand(targets) == '+']
targets_minus <- targets[GenomicRanges::strand(targets) == '-']
# Forward promoters
if (length(targets_plus)) {
# Resize targets
targets_centers <- GenomicRanges::resize(
targets_plus, fix = 'center', width = 1
)
targets_extended <- GenomicRanges::resize(
targets_centers, fix = 'center', width = (xlims[2] - xlims[1])
)
# Get oriented distance between frag center and target center
filt <- seqnames(
frags_subset_centers
) %in% as.character(unique(seqnames(targets_centers)))
n <- nearest(
frags_subset_centers[filt],
targets_centers
)
nearest_targets <- targets_centers[n]
frags_subset$distance_to_nearest <- NA
frags_subset[filt]$distance_to_nearest <- GenomicRanges::start(
frags_subset_centers[filt]
) -
GenomicRanges::start(nearest_targets)
frags_subset$strand_nearest_target <- NA
frags_subset[filt]$strand_nearest_target <- GenomicRanges::strand(
nearest_targets
)
frags_subset$distance_to_nearest <- ifelse(
frags_subset$strand_nearest_target == '-',
-frags_subset$distance_to_nearest,
frags_subset$distance_to_nearest
)
# dists / widths
frags_subset_sub <- frags_subset[!is.na(
frags_subset$distance_to_nearest
)]
dists_plus <- frags_subset_sub$distance_to_nearest
widths_plus <- IRanges::width(frags_subset_sub)
}
else {
dists_plus <- NULL
widths_plus <- NULL
}
# Reverse promoters
if (length(targets_minus)) {
# Resize targets
targets_centers <- GenomicRanges::resize(
targets_minus, fix = 'center', width = 1
)
targets_extended <- GenomicRanges::resize(
targets_centers, fix = 'center', width = (xlims[2] - xlims[1])
)
# Get oriented distance between frag center and target center
filt <- seqnames(
frags_subset_centers
) %in% as.character(unique(seqnames(targets_centers)))
nearest_targets <- targets_centers[
nearest(frags_subset_centers[filt], targets_centers)
]
frags_subset$distance_to_nearest <- NA
frags_subset[filt]$distance_to_nearest <- GenomicRanges::start(
frags_subset_centers[filt]
) -
GenomicRanges::start(nearest_targets)
frags_subset$strand_nearest_target <- NA
frags_subset[filt]$strand_nearest_target <- GenomicRanges::strand(
nearest_targets
)
frags_subset$distance_to_nearest <- ifelse(
frags_subset$strand_nearest_target == '-',
-frags_subset$distance_to_nearest,
frags_subset$distance_to_nearest
)
# dists / widths
frags_subset_sub <- frags_subset[!is.na(
frags_subset$distance_to_nearest
)]
dists_minus <- frags_subset_sub$distance_to_nearest
widths_minus <- IRanges::width(frags_subset_sub)
}
else {
dists_minus <- NULL
widths_minus <- NULL
}
# Return table of dists / widths
tab <- table(
factor(
c(dists_plus, dists_minus),
levels = xlims[1]:xlims[2]
),
factor(
c(widths_plus, widths_minus),
levels = c(ylims[1]:ylims[2])
)
)
# Replace central Vplot line (dist = 0) by average
# between two flanking lines
tab["0", ] = round((tab["-1", ] + tab["1", ])/2)
return(tab)
}
#' A function to shuffle a Vmat
#'
#' This function works on a Vmat (the output of computeVmat()). It shuffles
#' the matrix to randomize the fragment densities.
#'
#' @param Vmat A Vmat, usually output of computeVmat
#' @return A shuffled Vmat object
#'
#' @import S4Vectors
#' @import GenomicRanges
#' @import IRanges
#' @export
#'
#' @examples
#' data(bam_test)
#' data(ce11_all_REs)
#' Vmat <- computeVmat(bam_test, ce11_all_REs)
#' Vmat <- shuffleVmat(Vmat)
shuffleVmat <- function(Vmat)
{
shuffled.Vmat <- c(Vmat)
shuffled.Vmat <- shuffled.Vmat[sample(
x = seq_along(shuffled.Vmat),
size = length(shuffled.Vmat),
replace = FALSE
)]
shuffled.Vmat <- matrix(
shuffled.Vmat, nrow = nrow(Vmat), ncol = ncol(Vmat)
)
colnames(shuffled.Vmat) <- colnames(Vmat)
row.names(shuffled.Vmat) <- row.names(Vmat)
return(shuffled.Vmat)
}
#' A function to normalized a Vmat
#'
#' This function normalizes a Vmat. Several different approaches have
#' been implemented to normalize the Vmats.
#'
#' @param Vmat A Vmat, usually output of computeVmat
#' @param bam_granges GRanges, the paired-end fragments
#' @param granges GRanges, the regions to map the fragments onto
#' @param normFun character. A Vmat should be scaled either by:
#' \itemize{
#' \item 'libdepth+nloci', e.g. the library depth and the number of
#' loci used to compute the Vmat;
#' \item zscore, if relative patterns of fragment density
#' are more important than density per se;
#' \item Alternatively, the Vmat can be scaled to % ('pct'), to
#' a chosen quantile ('quantile') or to the max Vmat value ('max').
#' }
#' @param s A float indicating which quantile to use if 'quantile'
#' normalization is chosen
#' @param roll integer, to use as the window to smooth the Vmat rows
#' by rolling mean.
#' @param verbose Boolean
#' @return A normalized Vmat object
#'
#' @import S4Vectors
#' @import GenomicRanges
#' @import IRanges
#' @importFrom zoo rollmean
#' @export
#'
#' @examples
#' data(bam_test)
#' data(ce11_all_REs)
#' Vmat <- computeVmat(bam_test, ce11_all_REs)
#' Vmat <- normalizeVmat(
#' Vmat,
#' bam_test,
#' ce11_all_REs,
#' normFun = c('libdepth+nloci')
#' )
normalizeVmat <- function(
Vmat,
bam_granges,
granges,
normFun = c('zscore'),
s = 0.99,
roll = 1,
verbose = TRUE
)
{
## Normalize the matrix
if (normFun == 'libdepth+nloci') {
if (verbose) message('Computing raw library depth')
Vmat <- Vmat / length(bam_granges) * 1000000
if (verbose) message('Dividing Vmat by its number of loci')
Vmat <- Vmat / length(granges)
}
else if (normFun == 'pct') {
Vmat <- Vmat/sum(Vmat)*100
}
else if (normFun == 'quantile') {
# Cap by quantile
q <- quantile(Vmat, probs = s)
Vmat[Vmat >= q] <- q
# Then scale so that max value is one
Vmat <- Vmat/q*100
}
else if (normFun == 'max') {
Vmat <- Vmat/max(Vmat)
}
else if (normFun == 'zscore') {
Vmat2 <- matrix(scale(c(Vmat)), byrow = FALSE, nrow = nrow(Vmat))
colnames(Vmat2) <- colnames(Vmat)
row.names(Vmat2) <- row.names(Vmat)
Vmat <- Vmat2
}
else if (normFun %in% c('', 'none', 'skip')) {
if (verbose) message('No normalization applied')
Vmat <- Vmat
}
else {
if (verbose) message('CAUTION: Normalization function
not found. Skipped normalization')
}
## Smooth the heatmap
if (roll > 1) {
if (verbose) message('Smoothing the matrix')
Vmat_rolled <- zoo::rollmean(Vmat, roll)
row.names(Vmat_rolled) <- zoo::rollmean(
as.numeric(rownames(Vmat)),
roll
)
Vmat <- Vmat_rolled
}
return(Vmat)
}