-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy patharithmetics.R
416 lines (390 loc) · 14.6 KB
/
arithmetics.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
#' HiContacts arithmetics functionalities
#'
#' @name arithmetics
#' @aliases detrend
#' @aliases autocorrelate
#' @aliases divide
#' @aliases merge,HiCExperiment,HiCExperiment-method
#' @aliases despeckle
#' @aliases aggregate,HiCExperiment-method
#' @aliases boost
#' @aliases subsample
#' @aliases coarsen
#' @aliases normalize,HiCExperiment-method
#'
#' @description
#' Different arithmetic operations can be performed on Hi-C contact matrices:
#' - `normalize` a contact matrix using iterative correction;
#' - `detrend` a contact matrix, i.e. remove the distance-dependent
#' contact trend;
#' - `autocorrelate` a contact matrix: this is is typically done to highlight
#' large-scale compartments;
#' - `divide` one contact matrix by another;
#' - `merge` multiple contact matrices;
#' - `despeckle` (i.e. smooth out) a contact matrix out;
#' - `aggregate` (average) a contact matrices over a set of genomic loci of
#' interest;
#' - `boost` Hi-C signal by enhancing long-range interactions while preserving short-
#' range interactions (this is adapted from Boost-HiC);
#' - `subsample` interactions using a proportion or a fixed number of final
#' interactions.
#' - `coarsen` a contact matrix to a larger (coarser) resolution
#'
#' @param x,y,object a `HiCExperiment` object
#' @param use.scores Which scores to use to perform operations
#' @param ... `HiCExperiment` objects. For `aggregate`, `targets` (a set of
#' GRanges or GInteractions).
#' @param detrend Detrend matrix before performing autocorrelation
#' @param ignore_ndiags ignore N diagonals when calculating correlations
#' @param by a `HiCExperiment` object
#' @param focal.size Size of the smoothing rectangle
#' @param alpha Power law scaling factor. As indicated in Boost-HiC documentation,
#' the alpha parameter influences the weighting of contacts: if alpha < 1
#' long-range interactions are prioritized; if alpha >> 1 short-range
#' interactions have more weight when computing the distance matrix.
#' @param full.replace Whether to replace the entire set of contacts,
#' rather than only filling the missing interactions (count=0) (Default: FALSE)
#' @param prop Float between 0 and 1, or integer corresponding to the # of
#' @param niters Number of iterations for ICE matrix balancing
#' @param min.nnz Filter bins with less than `min.nnz` non-zero elements when
#' performing ICE matrix balancing
#' @param mad.max Filter out bins whose log coverage is less than `mad.max`
#' median absolute deviations below the median bin log coverage.
#' @param targets Set of chromosome coordinates for which
#' interaction counts are extracted from the Hi-C contact file, provided
#' as a GRanges object (for diagnoal-centered loci) or as a GInteractions
#' object (for off-diagonal coordinates).
#' @param pseudocount Add a pseudocount when dividing matrices (Default: 0)
#' @param bin.size Bin size to coarsen a HiCExperiment at
#' @param flankingBins Number of bins on each flank of the bins containing
#' input targets.
#' @param maxDistance Maximum distance to use when compiling distance decay
#' @param FUN merging function
#' @param BPPARAM BiocParallel parameters
#'
#' @return a `HiCExperiment` object with extra scores
#'
#' @import InteractionSet
#' @import stringr
#' @import tidyr
#' @importFrom BiocGenerics normalize
#' @importFrom scales rescale
#' @importFrom scales oob_censor
#' @importFrom stats quantile
#' @importFrom tibble as_tibble
#' @importFrom tibble tibble
#' @importFrom dplyr group_by
#' @importFrom dplyr summarize
#' @importFrom dplyr pull
#' @importFrom dplyr mutate
#' @importFrom dplyr left_join
#' @importFrom dplyr full_join
#' @importFrom dplyr rename
#' @importFrom dplyr n
#' @importFrom GenomicRanges seqnames
#' @importFrom GenomicRanges findOverlaps
#' @importFrom GenomicRanges start
#' @importFrom GenomicRanges GRanges
#' @importFrom S4Vectors metadata
#' @importFrom S4Vectors SimpleList
#' @importFrom S4Vectors mcols
#' @importFrom SummarizedExperiment assay
#' @importFrom BiocParallel bpparam
#'
#' @examples
#' #### -----
#' #### Normalize a contact matrix
#' #### -----
#'
#' library(HiContacts)
#' contacts_yeast <- contacts_yeast()
#' normalize(contacts_yeast)
#'
#' #### -----
#' #### Detrending a contact matrix
#' #### -----
#'
#' detrend(contacts_yeast)
#'
#' #### -----
#' #### Auto-correlate a contact matrix
#' #### -----
#'
#' autocorrelate(contacts_yeast)
#'
#' #### -----
#' #### Divide 2 contact matrices
#' #### -----
#'
#' contacts_yeast <- refocus(contacts_yeast, 'II')
#' contacts_yeast_eco1 <- contacts_yeast_eco1() |> refocus('II')
#' divide(contacts_yeast_eco1, by = contacts_yeast)
#'
#' #### -----
#' #### Merge 2 contact matrices
#' #### -----
#'
#' merge(contacts_yeast_eco1, contacts_yeast)
#'
#' #### -----
#' #### Despeckle (smoothen) a contact map
#' #### -----
#'
#' despeckle(contacts_yeast)
#'
#' #### -----
#' #### Aggregate a contact matrix over centromeres, at different scales
#' #### -----
#'
#' contacts <- contacts_yeast() |> zoom(resolution = 1000)
#' centros <- topologicalFeatures(contacts, 'centromeres')
#' aggregate(contacts, targets = centros, flankingBins = 51)
#'
#' #### -----
#' #### Enhance long-range interaction signal
#' #### -----
#'
#' contacts <- contacts_yeast() |> zoom(resolution = 1000) |> refocus('II')
#' boost(contacts, alpha = 1)
#'
#' #### -----
#' #### Subsample & "coarsen" contact matrix
#' #### -----
#'
#' subcontacts <- subsample(contacts, prop = 100000)
#' coarsened_subcontacts <- coarsen(subcontacts, bin.size = 4000)
NULL
#' @rdname arithmetics
#' @export
detrend <- function(x, use.scores = 'balanced') {
gis <- interactions(x)
gis$score <- scores(x, use.scores)
gis$diag <- gis$bin_id2 - gis$bin_id1
if (is.null(metadata(x)[['detrending_model']])) {
expected <- tibble::as_tibble(gis) |>
dplyr::group_by(diag) |>
dplyr::mutate(score = scales::oob_censor(
score, range = stats::quantile(score, c(0.05, 0.95), na.rm = TRUE)
)) |>
dplyr::summarize(average_interaction_per_diag = mean(score, na.rm = TRUE)) |>
dplyr::mutate(average_interaction_per_diag = average_interaction_per_diag / 2)
}
else {
expected <- metadata(x)[['detrending_model']]
expected$average_interaction_per_diag <- expected$score
expected$distance <- NULL
expected$score <- NULL
}
gis$expected <- tibble::as_tibble(gis) |>
dplyr::left_join(expected, by = 'diag') |>
dplyr::pull(average_interaction_per_diag)
gis$score_over_expected <- log2( {gis$score/sum(gis$score, na.rm = TRUE)} / {gis$expected/sum(gis$expected, na.rm = TRUE)} )
# gis$score_over_expected <- log2( {scale(gis$score)} / {scale(gis$expected)} )[,1]
scores(x, "expected") <- gis$expected
scores(x, "detrended") <- gis$score_over_expected
return(x)
}
#' @rdname arithmetics
#' @export
autocorrelate <- function(x, use.scores = 'balanced', detrend = TRUE, ignore_ndiags = 3) {
if (!requireNamespace("WGCNA", quietly = TRUE)) {
message("Install WGCNA package to perform autocorrelation.")
message("install.packages('WGCNA')")
}
if (detrend) {
if (!{'detrend' %in% names(scores(x))}) {
x <- detrend(x, use.scores = use.scores)
}
use.scores <- 'detrended'
}
gis <- interactions(x)
gis$score <- scores(x, use.scores)
reg <- regions(gis)
mat <- cm2matrix(gi2cm(gis))
for (K in seq(-ignore_ndiags, ignore_ndiags, by = 1)) {
sdiag(mat, K) <- NA
}
co <- WGCNA::cor(mat, use = 'pairwise.complete.obs', method = "pearson")
colnames(co) <- names(reg)
rownames(co) <- names(reg)
mat2 <- tibble::as_tibble(co, rownames = 'region', .name_repair = "minimal") |>
tidyr::pivot_longer(-region, names_to = "y", values_to = "corr") |>
dplyr::rename("x" = "region")
mat2$bin_id1 <- dplyr::left_join(
mat2,
data.frame(region = names(reg), id = reg$bin_id),
by = c(x = 'region')
) |> pull(id)
mat2$bin_id2 <- dplyr::left_join(
mat2,
data.frame(region = names(reg), id = reg$bin_id),
by = c(y = 'region')
) |> pull(id)
scores(x, 'autocorrelated') <- left_join(
as_tibble(gis), mat2, by = c("bin_id1", "bin_id2")
) |> pull(corr)
return(x)
}
#' @rdname arithmetics
#' @export
divide <- function(x, by, use.scores = 'balanced', pseudocount = 0) {
x_gis <- interactions(x)
x_gis$score <- S4Vectors::mcols(x_gis)[[use.scores]]
by_gis <- interactions(by)
by_gis$score <- S4Vectors::mcols(by_gis)[[use.scores]]
## -- If regions are different, manually merge them
re <- unique(
c(InteractionSet::regions(x_gis), InteractionSet::regions(by_gis))
)
InteractionSet::replaceRegions(x_gis) <- re
InteractionSet::replaceRegions(by_gis) <- re
## -- Check that all objects are comparable (bins, resolution, seqinfo)
.is_same_seqinfo(x, by)
.is_same_resolution(x, by)
.is_same_bins(x, by)
## -- Join tables, divide, and convert back to GInteractions
gis <- dplyr::full_join(
as.data.frame(x_gis) |> dplyr::select(!contains(c('.1', 'chr', 'width', 'strand', 'center'))),
as.data.frame(by_gis) |> dplyr::select(!contains(c('.1', 'chr', 'width', 'strand', 'center'))),
by = c(
'seqnames1', 'start1', 'end1', 'bin_id1',
'seqnames2', 'start2', 'end2', 'bin_id2'
),
suffix = c(".x", ".by")
) |>
dplyr::mutate(
score.x = score.x + pseudocount,
score.by = score.by + pseudocount,
fc = score.x / score.by,
l2fc = log2(fc)
) |>
HiCExperiment::df2gi()
S4Vectors::mcols(gis)[[paste0(use.scores, '.fc')]] <- gis$fc
S4Vectors::mcols(gis)[[paste0(use.scores, '.l2fc')]] <- gis$l2fc
S4Vectors::mcols(gis)[, c('score.x', 'score.by', 'fc', 'l2fc')] <- NULL
InteractionSet::replaceRegions(gis) <- re
## -- Export results as HiCExperiment
m <- S4Vectors::mcols(gis) |>
as.data.frame() |>
dplyr::select(!starts_with('weight'))
S4Vectors::mcols(gis) <- m
m <- dplyr::select(m, -c('bin_id1', 'bin_id2'))
scores <- as.list(m) |> S4Vectors::SimpleList()
## -- Create HiCExperiment
res <- methods::new("HiCExperiment",
focus = HiCExperiment::focus(x),
metadata = list(
hce_list = list(x = x, by = by),
operation = 'divide'
),
resolutions = resolution(x),
resolution = resolution(x),
interactions = gis,
scores = scores,
topologicalFeatures = S4Vectors::SimpleList(),
pairsFile = NULL,
fileName = ""
)
return(res)
}
#' @rdname arithmetics
#' @export
setMethod("merge", signature(x = "HiCExperiment", y = "HiCExperiment"), definition = function(
x, y, ..., use.scores = 'balanced', FUN = mean)
{
contacts_list <- list(x, y, ...)
## -- Check that at least 2 `HiCExperiment` objects are passed to `merge()`
.are_HiCExperiment(x, y, ...)
if (length(contacts_list) < 2) {
stop("Please provide at least 2 `HiCExperiment` objects.")
}
## -- Check that all objects are comparable (bins, regions, resolution, seqinfo)
.is_same_seqinfo(x, y, ...)
.is_same_resolution(x, y, ...)
.is_same_bins(x, y, ...)
# Unify all the interactions
score_names <- names(scores(contacts_list[[1]]))
ints <- do.call(
c, lapply(contacts_list, FUN = interactions)
) |> sort()
re <- do.call(
c, lapply(contacts_list, FUN = regions)
) |> sort() |> unique()
ints_df <- as.data.frame(ints)
merged_ints <- dplyr::select(ints_df, !any_of(score_names)) |>
dplyr::distinct() |>
HiCExperiment:::asGInteractions()
replaceRegions(merged_ints) <- re
# Group by bin_id1/bin_id2 and merge scores
if (identical(FUN, mean)) {
.FUN <- function(x) mean(x, na.rm = TRUE)
}
else if (identical(FUN, sum)){
.FUN <- function(x) sum(x, na.rm = TRUE)
}
scores <- dplyr::select(ints_df, dplyr::any_of(c('bin_id1', 'bin_id2', score_names))) |>
dplyr::group_by(bin_id1, bin_id2) |>
dplyr::summarise(dplyr::across(dplyr::all_of(score_names), .FUN), .groups = "drop") |>
dplyr::select(-bin_id1, -bin_id2) |>
as("SimpleList")
## -- Create a new HiCExperiment object
files <- unlist(lapply(contacts_list, fileName))
res <- methods::new("HiCExperiment",
focus = focus(contacts_list[[1]]),
metadata = list(
hce_list = contacts_list,
operation = .FUN
),
resolutions = resolutions(contacts_list[[1]]),
resolution = resolution(contacts_list[[1]]),
interactions = merged_ints,
scores = scores,
topologicalFeatures = S4Vectors::SimpleList(),
pairsFile = NULL,
fileName = fileName(contacts_list[[1]])
)
return(res)
})
#' @rdname arithmetics
#' @export
despeckle <- function(x, use.scores = 'balanced', focal.size = 1) {
if (!requireNamespace("terra", quietly = TRUE)) {
message("Install terra package to despeckle contact matrix.")
message("install.packages('terra')")
}
gis <- HiCExperiment::interactions(x)
gis$score <- HiCExperiment::scores(x, use.scores)
cm <- HiCExperiment::gi2cm(gis)
an <- InteractionSet::anchors(cm)
mat <- HiCExperiment::cm2matrix(cm)
r <- terra::rast(mat)
gauss <- matrix(data = 1/{(focal.size*2+1)*(focal.size*2+1)-1}, ncol = focal.size*2+1, nrow = focal.size*2+1)
gauss[focal.size+1, focal.size+1] <- 0
mat_despeckled <- terra::focal(
x = r,
w = gauss,
fun = sum,
na.rm = TRUE
)#/{(focal.size*2+1)*(focal.size*2+1)-1}
mat_despeckled <- terra::as.array(mat_despeckled)[, , 1]
cm_despeckled <- InteractionSet::ContactMatrix(
mat_despeckled,
anchor1 = an[[1]],
anchor2 = an[[2]],
regions = InteractionSet::regions(cm)
)
is_despeckled <- InteractionSet::deflate(cm_despeckled)
gis_despeckled <- InteractionSet::interactions(is_despeckled)
gis_despeckled$bin_id1 <- HiCExperiment::anchors(gis_despeckled)[[1]]$bin_id
gis_despeckled$bin_id2 <- HiCExperiment::anchors(gis_despeckled)[[2]]$bin_id
HiCExperiment::interactions(x) <- gis_despeckled
m <- dplyr::left_join(
as.data.frame(S4Vectors::mcols(gis_despeckled)),
as.data.frame(S4Vectors::mcols(gis)),
by = c('bin_id1', 'bin_id2')
) |> dplyr::select(-bin_id1, -bin_id2, -score)
m[[paste0(use.scores, '.despeckled')]] <- SummarizedExperiment::assay(is_despeckled, 1)[, 1]
l <- as.list(m) |> S4Vectors::SimpleList()
x@scores <- l
return(x)
}