-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathRobusTAD.R
executable file
·238 lines (181 loc) · 9.64 KB
/
RobusTAD.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
#!/usr/bin/env Rscript
##################################################################
## ##
## RobusTAD.R ##
## Rola Dali, Guillaume Bourque, Mathieu Blanchette ##
## Feb 2018 ##
## v1.0 ##
## ##
##################################################################
## The following script produces TAD Boundary scores for each bin on a chromosome.
## Input: interaction frequency matrix for a chromosome
## Output: 2 files: I- file with TAD Boundary scores: Right Boundary scores, Left Boundary scores and Final combined score (max(R, L)).
## II- file with TAD boundary calls identified by looking for local maxima above the set threshold
####~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ settings:
options(scipen=999)
library("optparse")
####~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ functions:
leftRightBoundaryScores <- function(data, minW, maxW, minRatio) {
## function that calculates left and right boundary scores for each bin
s = dim(data)[1]
datanorm <- matrix(0,dim(data)[1],dim(data)[2])
if(status == "raw"){
## normalize if data is raw, skip otherwise
print("... ... Normalizing Data")
sumrow <- matrix(0,dim(data)[1],1)
for (i in 1:s) {
sumrow[i,1] = sum(data[i,])
}
ssum = sum(sumrow)
for (i in 1:s) {
for (j in 1:s) {
datanorm[i,j]=data[i,j]/(sumrow[i,1]*sumrow[j,1])*ssum;
}
}
} else {
print("... ... skipping normalization step!")
datanorm <- data
}
print("... ... Calculating Boundary Scores")
leftBoundaryScore=matrix(1,s);
rightBoundaryScore=matrix(1,s);
for (i in (maxW+2):(s-maxW-2)) {
rightScore=0;
leftScore =0;
leftBoundaryScore[i+1,1]=-99999;
rightBoundaryScore[i+1,1]=-99999;
diag=matrix(NA,maxW,maxW);
for (d in 1:maxW) {
maxiPossible = (d^3 + 3 * d^2 + 2* d)/6;
for (j in 1:d) {
diag[d,j] = datanorm[i-j+1,i-j+1+d];
}
for (del in 1:d) {
diff = sum(datanorm[i-d,i-d+del]/(diag[del,]+0.001)>minRatio, na.rm=T) - sum(datanorm[i-d,i-d+del]/(diag[del,]+0.001)<1.0/minRatio, na.rm=T)
rightScore = rightScore + diff;
}
if (d>=minW) {
z = rightScore/sqrt(maxiPossible);
# note: the score calculated pertains to the presence of a boundary on the right end of bin i. However it is more convenient to report scores relating to the left end of bins, so we associate this score to bin i+1 instead of i.
if (z>rightBoundaryScore[i+1,1]) {
rightBoundaryScore[i+1,1]=z;
}
}
for (del in 1:d) {
diff = sum(datanorm[i+1+d-del,i+1+d]/(diag[del,]+0.001)>minRatio, na.rm=T) - sum(datanorm[i+1+d-del,i+1+d]/(diag[del,]+0.001)<1.0/minRatio, na.rm=T)
leftScore = leftScore + diff;
}
if (d>=minW) {
z = leftScore/sqrt(maxiPossible);
# note: the score calculated pertains to the presence of a boundary on the right end of bin i. However it is more convenient to report scores relating to the left end of bins, so we associate this score to bin i+1 instead of i.
if (z>leftBoundaryScore[i+1,1]) leftBoundaryScore[i+1,1]=z;
}
}
}
return(cbind(leftBoundaryScore,rightBoundaryScore));
}
localpeaks <- function(score){
## locates peaks across the TAD score profile
index <- c()
for (i in 3:(length(score)-2)){
if(score[i] > score[i-2] && score[i] > score[i-1] && score[i] > score[i+1] && score[i] > score[i+2]){
index <- c(index, i)
}
}
return(index)
}
TADBoundcalls <- function(boundaries, threshold){
## calls TAD Boundaries using a threshold T by locating the maxima that are above the threshold
BoundIndexR <- localpeaks(boundaries[,3])
BoundIndexL <- localpeaks(boundaries[,2])
BoundIndex <- unique(sort(c(BoundIndexR, BoundIndexL)))
Boundmaxima <- boundaries[BoundIndex,]
TADcalls <- Boundmaxima[Boundmaxima[,4] >= threshold,]
return(TADcalls)
}
####~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ main:
## parse command line arguments:
option_list = list(
make_option(c("-i", "--input"), type="character", default=NULL,
help="Interaction Frequency Matrix. Must be a square matrix: number of columns = number of rows"),
make_option(c("-H", "--header"), action = "store_true", default=TRUE,
help="include -H if input contains a header/column names"),
make_option(c("-n", "--norm"), action = "store", default ="raw",
help="indicates if IF matrix is raw or normalized [default = %default]; [options: {raw, norm}]"),
make_option(c("-o", "--outDir"), type="character", default=".",
help="output directory name"),
make_option(c("-b", "--binsize"), type="integer", default=50,
help="binsize or resolution used in Hi-C analysis in kb [default = %default]"),
make_option(c("-r", "--minRatio"), type="double", default=1.5,
help="minimum ratio of Within to Across IF values to contribute to boundary score calculation [default = %default]"),
make_option(c("-w", "--minWin"), type="integer", default=250,
help="minimum window around the bin used to calculate the TAD score in kb [default = %default]"),
make_option(c("-W", "--maxWin"), type="integer", default=500,
help="maximum window around the bin used to calculate the TAD score in kb [default = %default]"),
make_option(c("-T", "--threshold"), type="double", default=0.2,
help="data percentile of TAD scores used to calculate threshold in order to call significant TAD boundaries. [default = %default]; [options: 0-1];\n\t\tthe lower the threshold, the more stringent the TAD calls")
);
description = "\n\n==============================================================================================================================================================\n\nRobusTAD calculates TAD Boundary scores for each bin on a chromosome.\n\tInput: interaction frequency matrix for a chromosome.\n\tOutput: 2 files: \n\t\t\tI- file with TAD boundary scores (BoundaryScores_*): contains Right Boundary scores, Left Boundary scores and Final combined score (max(R, L)).\n\t\t\tII- file with TAD boundary calls (TADBoundaryCalls_*) identified by looking for local maxima above the set threshod.\n\nUsage: RobusTAD.R -i InputMatrix [options]\n\n==============================================================================================================================================================\n\n"
epilogue = "\n\n==============================================================================================================================================================\n\nRobusTAD is available under a GPL liscence and comes with no warranties @ https://github.com/rdali/RobusTAD\n\n==============================================================================================================================================================\n\n"
opt_parser = OptionParser(option_list=option_list, description = description, epilogue=epilogue);
opt = parse_args(opt_parser);
if (is.null(opt$input)){
print_help(opt_parser)
stop("Error! missing arguments... Interaction frequency matrix is required!\n", call.=FALSE)
}
mymatrix <- opt$input
minRatio <- as.numeric(opt$minRatio)
binSize <- as.numeric(opt$binsize)
minWin <- as.numeric(opt$minWin)
maxWin <- as.numeric(opt$maxWin)
perc <- as.numeric(opt$threshold)
outDir <- opt$outDir
status <- opt$norm
header <- opt$header
if(!file.exists(mymatrix)){
stop ("Error! provided matrix file does not exist")
}
##create outDir if it doesnt exist:
if (!file.exists(outDir)) {
dir.create(outDir)
}
if(status != "raw" & status != "norm" ){
stop ("Error! -n must be set to 'raw' or 'norm'")
}
if(perc < 0 | perc > 1){
stop ("Error! --threshold should be set to a value between 0 and 1!")
}
minW <- as.integer(minWin/binSize)
maxW <- as.integer(maxWin/binSize)
## extract base name of matrix to use in output files:
prefix <- basename(mymatrix)
prefix <- substr(prefix, 1, nchar(prefix)-4)
## read in matrix
if(header){
data <- as.matrix(read.table(mymatrix, header = TRUE, sep="\t", row.names = 1, as.is=TRUE))
} else {
data <- as.matrix(read.table(mymatrix, header = FALSE, sep="\t", as.is=TRUE))
}
## check that input file is a square matrix:
if(nrow(data) != ncol(data)){
stop ("Error! provided matrix is not a square matrix!")
}
boundaries = leftRightBoundaryScores(data, minW, maxW, minRatio)
rownames(boundaries) <- rownames(data)
colnames(boundaries) <- c("LeftBoundaryScore", "RightBoundaryScore")
## scale data to produce z-scores:
boundaries <- scale(boundaries, center = T, scale = T)
## calculate combined TAD score:
TADscore <- apply(boundaries, 1, max)
boundaries <- data.frame(boundaries, TADscore)
coordinates <- rownames(boundaries)
boundaries <- data.frame(boundaries, coordinates)
boundaries <- boundaries[,c(4,1,2,3)]
## call TAD boundaries:
print("... ... Calling Significant TAD Boundaries")
threshold <- quantile(boundaries$TADscore, na.rm = T, probs = (1-perc))[[1]]
TADcalls <- TADBoundcalls(boundaries, threshold)
## Save files:
write.table(boundaries, file.path(outDir, paste0("BoundaryScores_", prefix, "_binSize",binSize ,"_minW", minWin, "_maxW", maxWin,"_minRatio", minRatio,".txt")), quote = F, row.names=F)
write.table(TADcalls, file.path(outDir, paste0("TADBoundaryCalls_", prefix, "_binSize",binSize ,"_minW", minWin, "_maxW", maxWin,"_minRatio", minRatio,"_threshold",perc, ".txt")), quote = F, row.names=F)
print("... ... RobusTAD run is complete!")