-
Notifications
You must be signed in to change notification settings - Fork 0
/
clustr1.r
executable file
·124 lines (96 loc) · 4.77 KB
/
clustr1.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
# Idea for text labeling input
# textlabs is a list of character strings for the text part of the the label
# chunksize is a list of numeric for the number of chunks in each text
# textlabs and chunksize must have same length and are assumed to correspond elementwise
# so that first textlabs has first chunksize number of chunks
myCluster <- function(input.file , textlabs = NULL , chunksize = NULL ,
metric = "euclidean" , method = "average" ,
output.type = "pdf", outputfile = "Dendogram" ,
main = "Dendogram",header=T, comment.char="",
row.names=1, p=2, type='tsv',
labelFile=NULL,scrubtags=" ",divitags=" "){
## List of possible distance metrics
## METHODS <- c("euclidean", "maximum", "manhattan", "canberra",
## "binary", "minkowski")
## List of possible cluster-distance methods
## METHODS <- c("ward", "single", "complete", "average", "mcquitty",
## "median", "centroid")
library(stats)
#change this for the text you'd like to input
if (type == 'csv') {
input.data <- read.table(as.character(input.file), header=header, comment.char=comment.char, row.names=row.names, sep=",")
}
else if (type == 'txt') {
input.data <- read.table(as.character(input.file), header=header, comment.char=comment.char, row.names=row.names, sep="")
}
else {
input.data <- read.table(as.character(input.file), header=header, comment.char=comment.char, row.names=row.names, sep="\t")
}
tTable <- input.data #Transpose is necessary if data file originally has words for rows and chunks for columns
# hclust assumes objects/chunks are the rows
# if someone passes us data, we should check which direction
rowSums <- apply(tTable, 1, sum) # getting the total for each chunk/row, we know we are adding acrossing the rows because of the second argument
denoms <- matrix(rep(rowSums, dim(tTable)[2]), byrow=F, ncol=dim(tTable)[2]) # repeat the row sums by n times; n= the number of columns.
# the matrix is filled by column.
relFreq <- tTable/denoms # the original data divides denoms piece-wise. This gives the proportion of each word in a chunk
if( !is.null(textlabs) && !is.null(chunksize)) { # if use textlabs and chunksize, the data must be order with one text followed by the next
if(length(textlabs) != length(chunksize)) stop("number of texts and corresponding chunk numbers must match")
else {# check that sum(chunksize) == dim(relFreq)[1] , total number of chunks equals number of rows in relFreq
L <- length(chunksize)
temp <- NULL
for(i in 1:L) {
for(k in 1:chunksize[i]){
temp <- c(temp,paste(textlabs[i],as.character(k),sep=""))
}
}
row.names(relFreq) <- temp
}
}
# else 0
# change the names of the labels
#row.names(relFreq) <- c("a","b","c","d","e","f","g","h","i","j"...)
#if (file.info(labelFile)$size!=0)
#{
# tempLABELS <- read.csv(as.character(labelFile), sep=",", as.is=TRUE, header=FALSE);
# if (length(tempLABELS) == length(row.names(relFreq)))
# {
# row.names(relFreq) <- tempLABELS;
# }
#}
dist.tTable <- dist(relFreq , method = metric, p=p)
hCluster <- hclust(dist.tTable, method = method)
if(!is.character(main)) stop("main must be a character string")
if(output.type=="pdf"){
# dev.control()
#outfilename <- paste(outputfile,".pdf",sep="")
outfilename<-paste(outputfile,sep="")
pdf(outfilename , onefile = TRUE, width=7.25, height=10)
max <- max( nchar( hCluster$labels ) ) # be sure there's room for the labels
par( mar=c( 6.1, 2.1, 4.1, ( max / 2.0 ) ) ) # margins
#plot(hCluster, hang = -1, main = main)
# create bottom lines of tags from scrubber an divitext
t.subtitle <- paste("TreeView Options: Distance Metric: ",metric,", Linkage Method: ",method)
s.subtitle <- gsub("_"," ",scrubtags,fixed=T)
s.subtitle <- paste(s.subtitle,sep="")
d.subtitle <- gsub("_"," ",divitags,fixed=T)
d.subtitle <- paste(d.subtitle,sep="")
subtitle <- paste(s.subtitle,"\n",d.subtitle,"\n",t.subtitle)
plot( as.dendrogram(hCluster), main=main, horiz=TRUE, cex=2, axes=FALSE, xlab="", cex.sub=.5);
# to put the title on top:
# change ylab=main to main=main
# change font.lab=2 to font.main=2
# and adjust margins
junk <- dev.off() # junk catches stdout of dev. (otherwise everything breaks)
}
else if (output.type=="phyloxml"){
outfilename <- paste(outputfile,sep="");
hClustToXML(hCluster, outfilename, TRUE, metric, method)
}
else{}
# return the row labels as a string
str<-"<r>";
for (i in row.names(relFreq)) {
str<-paste(str,i,sep=",");
}
return (str);
}