-
Notifications
You must be signed in to change notification settings - Fork 0
/
pvclust-internal.R
341 lines (278 loc) · 12 KB
/
pvclust-internal.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
hc2axes <- function(x)
{
A <- x$merge # (n,n-1) matrix
n <- nrow(A) + 1
x.axis <- c()
y.axis <- x$height
x.tmp <- rep(0,2)
zz <- match(1:length(x$order),x$order)
for(i in 1:(n-1)) {
ai <- A[i,1]
if(ai < 0)
x.tmp[1] <- zz[-ai]
else
x.tmp[1] <- x.axis[ai]
ai <- A[i,2]
if(ai < 0)
x.tmp[2] <- zz[-ai]
else
x.tmp[2] <- x.axis[ai]
x.axis[i] <- mean(x.tmp)
}
return(data.frame(x.axis=x.axis,y.axis=y.axis))
}
hc2split <- function(x)
{
A <- x$merge # (n-1,n) matrix
n <- nrow(A) + 1
B <- list()
for(i in 1:(n-1)){
ai <- A[i,1]
if(ai < 0)
B[[i]] <- -ai
else
B[[i]] <- B[[ai]]
ai <- A[i,2]
if(ai < 0)
B[[i]] <- sort(c(B[[i]],-ai))
else
B[[i]] <- sort(c(B[[i]],B[[ai]]))
}
CC <- matrix(rep(0,n*(n-1)),nrow=(n-1),ncol=n)
for(i in 1:(n-1)){
bi <- B[[i]]
m <- length(bi)
for(j in 1:m)
CC[i,bi[j]] <- 1
}
split <- list(pattern=apply(CC,1,paste,collapse=""), member=B)
return(split)
}
pvclust.node <- function(x, r,...) #this does the bootstraping for a single node
{
mboot.node <- lapply(r, boot.hclust, nboot=x, ...) #do the bootstraping once for each r value
return(mboot.node)
}
boot.hclust <- function(r, data, object.hclust, method.dist, use.cor,
method.hclust, nboot, store, weight=F, storeCop, copDistance, normalize, cladeChunkIn, chunkSize, storeChunks, rowSample=FALSE, relFreq)
{
n <- nrow(data) #get the number of rows (each row contains a single word)
size <- round(n*r, digits=0) #calculate the number of rows to resample
if(size == 0)
stop("invalid scale parameter(r)")
r <- size/n #recalculate r to match exactly size/n instead of approxiametly
pattern <- hc2split(object.hclust)$pattern #get the pattern (the contents of each clade) from the original hclust object
edges.cnt <- table(factor(pattern)) - table(factor(pattern)) #create a table that shows the number of times each clade is formed in the bootstraped clusters
st <- list() #list that will store the hclust objects created
stc <- list() #list storing cophenetic correlations
stCh <- list() #list that will store the tables containing the new chunks
# bootstrap start
rp <- as.character(round(r,digits=2)); if(r == 1) rp <- paste(rp,".0",sep="")
cat(paste("Bootstrap (r = ", rp, ")... ", sep=""))
w0 <- rep(1,n) # equal weight
na.flag <- 0
sect1Time <- 0
sect2Time <- 0
for(i in 1:nboot){
if(weight && r>10) { ## <- this part should be improved
print("Code for weight resampling is not implemented currently. Look at code for nonresampled weight in boot.hclust for example of how to implement")
w1 <- as.vector(rmultinom(1,size,w0)) # resampled weight
if(normalize)
{
print("Code for normalizing resampled weight is not in yet. Look at code for nonresampled weight in boot.hclust for example of how to implement")
}
suppressWarnings(distance <- distw.pvclust(data,w1,method=method.dist,use.cor=use.cor))
}
else if(rowSample) #row resampling is done in here. Disabled by default until I decide if it should be a parameter
{
smpl <- sample(1:n, size, replace=TRUE) #creates a index vector with each element being the index of the row chosen
bootStrapData = data[smpl,] #get the new bootStrap values using the index vector
if(normalize)
{
#normalize the new data
colSums <- apply(bootStrapData, 2, sum) #each example/observation/object is one column, so find the sums of the columns
denoms <- matrix(rep(colSums, dim(bootStrapData)[1]), byrow=T, ncol=dim(bootStrapData)[2]) #compute matrix to divide current matrix by to normalize matrix. Each entry in a column is the sum of the column
bootStrapData <- bootStrapData/denoms
}
suppressWarnings(distance <- dist.pvclust(bootStrapData,method=method.dist,use.cor=use.cor)) #ceate the new distance matrix
}
else {
bootStrapData <- matrix(data = 0, nrow = nrow(data), ncol = ncol(data), dimnames=list(rownames(data),colnames(data))) #set up empty table to hold word counts for bootstrapped data
#bootStrapData2 <- matrix(data = 0, nrow = nrow(data), ncol = ncol(data), dimnames=list(rownames(data),colnames(data))) #set up empty table to hold word counts for bootstrapped data
for(j in 1:ncol(data)){ #for each chunk
size <- chunkSize[[j]] * r #calculate the number of words in the new bootstrapped chunk based on original size and r value
#size will equal sum(data[,i]) when r = 1
chunklistIndex <- cladeChunkIn[j] #get clade the chunk is in to know what sublist of the wordlist to use
counts <- rmultinom(1, size, relFreq[,chunklistIndex])
bootStrapData[,j] <- counts #store the word counts
}
if(storeChunks) #if storing the new chunks
{
stCh[[i]] <- bootStrapData
}
if(normalize)
{
#normalize the new data
colSums <- apply(bootStrapData, 2, sum) #each example/observation/object is one column, so find the sums of the columns
denoms <- matrix(rep(colSums, dim(bootStrapData)[1]), byrow=T, ncol=dim(bootStrapData)[2]) #compute matrix to divide current matrix by to normalize matrix. Each entry in a column is the sum of the column
bootStrapData <- bootStrapData/denoms
}
suppressWarnings(distance <- dist.pvclust(bootStrapData,method=method.dist,use.cor=use.cor)) #create the new distance matrix
}
if(all(is.finite(distance))) { # check if distance is valid
x.hclust <- hclust(distance,method=method.hclust) #create the hclust object
pattern.i <- hc2split(x.hclust)$pattern # get the contents of the clades
edges.cnt <- edges.cnt + table(factor(pattern.i, levels=pattern)) #add the clades identical to the clades in the original object to the count of the number of times those clades appeared
} else {
x.hclust <- NULL
na.flag <- 1
}
if(store) #if storing the hclust objects
st[[i]] <- x.hclust
if(storeCop) #if storing cophenetic corelations, find and store the correlation now
{
#x.hclust
bootCop <- cophenetic(x.hclust) #get the cophenetic distance matrix
stc[[i]] <- cor(copDistance, bootCop) #find the correlation between the cophenetic distance matrix and the distance matrix from the original data.
}
}
cat("Done.\n")
# bootstrap done
if(na.flag == 1)
warning(paste("inappropriate distance matrices are omitted in computation: r = ", r), call.=FALSE)
boot <- list(edges.cnt=edges.cnt, method.dist=method.dist, use.cor=use.cor,
method.hclust=method.hclust, nboot=nboot, size=size, r=r, storehClust=st, storeCop=stc, storeChunks=stCh) #store all the data in a list
class(boot) <- "boot.hclust"
return(boot) #return that list to the function call
}
#this function takes all the data computed and converts it into the final results
pvclust.merge <- function(data, object.hclust, mboot, distance, seed){
pattern <- hc2split(object.hclust)$pattern #get the contents of the clades in the original hclust objects
#get the following data out of the list
r <- unlist(lapply(mboot,"[[","r"))
nboot <- unlist(lapply(mboot,"[[","nboot"))
storehClust <- lapply(mboot,"[[", "storehClust")
storeCop <-lapply(mboot,"[[", "storeCop")
storeChunks <- lapply(mboot,"[[", "storeChunks")
rl <- length(mboot) #get the number of r values
ne <- length(pattern) #get the number of clades
edges.bp <- edges.cnt <- data.frame(matrix(rep(0,ne*rl),nrow=ne,ncol=rl)) #Creates two big tables, with each row being a clade, and each column being a r value. A individual entry denotes either the number of times that clade occured or the bp value for that clade for a particular r value.
row.names(edges.bp) <- pattern
names(edges.cnt) <- paste("r", 1:rl, sep="")
for(j in 1:rl) { #for each r value
edges.cnt[,j] <- as.vector(mboot[[j]]$edges.cnt) #holds the number of times each clade is formed for this particular r value
edges.bp[,j] <- edges.cnt[,j] / nboot[j] #holds the number of times each clade is formed divided by the total number of bootstraps for this particular r value
}
ms.fitted <- lapply(as.list(1:ne), #does the function one per clade that computes bp and au values
function(x, edges.bp, r, nboot){
msfit(as.vector(t(edges.bp[x,])), r, nboot)},
edges.bp, r, nboot)
class(ms.fitted) <- "mslist"
p <- lapply(ms.fitted,"[[","p") #p is a list of objects one per clade that hold both the au and bp values of that clade
se <- lapply(ms.fitted,"[[","se") #se holds a different set of bp and au values????
coef <- lapply(ms.fitted,"[[","coef")
au <- unlist(lapply(p,"[[","au"))
bp <- unlist(lapply(p,"[[","bp"))
se.au <- unlist(lapply(se,"[[","au"))
se.bp <- unlist(lapply(se,"[[","bp"))
v <- unlist(lapply(coef,"[[","v"))
cc <- unlist(lapply(coef,"[[","c"))
pchi <- unlist(lapply(ms.fitted,"[[","pchi"))
#store all the data in a single table
edges.pv <- data.frame(au=au, bp=bp, se.au=se.au, se.bp=se.bp,
v=v, c=cc, pchi=pchi)
row.names(edges.pv) <- row.names(edges.cnt) <- 1:ne #names by default seem to just be numbers 1-n with n being number of clades
#combine all the data into a list
result <- list(hclust=object.hclust, edges=edges.pv, count=edges.cnt,
msfit=ms.fitted, nboot=nboot, r=r, storehClust=storehClust, storeCop=storeCop, distance=distance, seed=seed, storeChunks=storeChunks)
class(result) <- "pvclust"
return(result) #return that list
}
dist.pvclust <- function(x, method="euclidean", use.cor="pairwise.complete.obs")
{
if(!is.na(pmatch(method,"correlation"))){
res <- as.dist(1 - cor(x, method="pearson", use=use.cor))
attr(res,"method") <- "correlation"
return(res)
}
else if(!is.na(pmatch(method,"abscor"))){
res <- as.dist(1 - abs(cor(x,method="pearson",use=use.cor)))
attr(res,"method") <- "abscor"
return(res)
}
else if(!is.na(pmatch(method,"uncentered"))){
if(sum(is.na(x)) > 0){
x <- na.omit(x)
warning("Rows including NAs were omitted")
}
x <- as.matrix(x)
P <- crossprod(x)
qq <- matrix(diag(P),ncol=ncol(P))
Q <- sqrt(crossprod(qq))
res <- as.dist(1 - P/Q)
attr(res,"method") <- "uncentered"
return(res)
}
else
dist(t(x),method)
}
corw <- function(x,w,
use=c("all.obs","complete.obs","pairwise.complete.obs")
) {
if(is.data.frame(x)) x <- as.matrix(x)
x <- x[w>0,,drop=F]
w <- w[w>0]
n <- nrow(x) # sample size
m <- ncol(x) # number of variables
if(missing(w)) w <- rep(1,n)
r <- matrix(0,m,m,dimnames=list(colnames(x),colnames(x)))
diag(r) <- 1
use <- match.arg(use)
pairu <- F
if(use=="all.obs") {
u <- rep(T,n)
} else if(use=="complete.obs") {
u <- apply(x,1,function(y) !any(is.na(y)))
} else if(use=="pairwise.complete.obs") {
pairu <- T
ux <- is.finite(x)
} else stop("unknown use")
for(i in 1+seq(length=m-1)) {
for(j in seq(length=i-1)) {
if(pairu) u <- ux[,i] & ux[,j]
wu <- w[u]; xi <- x[u,i]; xj <- x[u,j]
ws <- sum(wu)
if(ws > 1e-8) {
xi <- xi - sum(wu*xi)/ws
xj <- xj - sum(wu*xj)/ws
vxi <- sum(wu*xi*xi)/ws
vxj <- sum(wu*xj*xj)/ws
if(min(vxi,vxj) > 1e-8) {
vxij <- sum(wu*xi*xj)/ws
rij <- vxij/sqrt(vxi*vxj)
} else {
rij <- 0
}
} else {
rij <- 0
}
r[i,j] <- r[j,i] <- rij
}
}
r
}
### calculate distance by weight
distw.pvclust <- function(x,w,method="correlation", use.cor="pairwise.complete.obs")
{
if(!is.na(pmatch(method,"correlation"))){
res <- as.dist(1 - corw(x,w, use=use.cor))
attr(res,"method") <- "correlation"
return(res)
}
else if(!is.na(pmatch(method,"abscor"))){
res <- as.dist(1 - abs(corw(x,w, use=use.cor)))
attr(res,"method") <- "abscor"
return(res)
}
stop("wrong method")
}