-
Notifications
You must be signed in to change notification settings - Fork 0
/
kmeans.py
executable file
·202 lines (161 loc) · 7.08 KB
/
kmeans.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
# figure out how to add axes labels to J curve
# stuck on line 80. i know w/o classes how to get methods to run through
# the same data over and over, as it gets constantly updated. but i can't
# seem to get it to make a variable w/in a method be equal to calling
# another method in that class.
import numpy as np
import pandas as pd
def test():
from test_for_kmeans import grab_values
values = grab_values.values
a = KMeans(max_iters=7, tries=15)
a.fit(values=values, clusters=3)
print '-->Fitting KMeans for one value of K works!'
a.plotj(maxK=4) # must do after fit() in order to load the values
print '--->Fitting KMeans for multiple values of K works!'
a.find_centroid_for_each()
print '--->Finding the group for each dot for the lowest-cost centroid position works!'
class KMeans(object):
def __init__(self, max_iters, tries):
"""How many times to cycle the new centroids mean and closest points in each K-means iteration?
How many K-means iterations to try to find the one w/the lowest J, assuming random initial centroids?"""
if max_iters < 3:
print 'K must be 3 or more.'
self.max_iters = max_iters
self.tries = tries
# Part 2: What are the initial centroids?
def evencentroids(self): # tested for 1D, 2D, 3D
x = self.x
K = self.K
dim = x.shape[1]
therange = (x.max(axis=0) - x.min(axis=0)) / float(K + 1)
c = np.arange(1, K + 1)
step1 = c.reshape(K, 1) * therange
step2 = step1 + x.min(axis=0)
step3 = step2.reshape(K, 1, dim)
self.centroids = step3
def randomcentroids(self): # tested for 1D, 2D, 3D
step1 = np.random.permutation(self.x)
step2 = step1[:self.K]
self.centroids = step2.reshape(self.K, 1, self.dim)
# Part 3: Functions to iterate through the centroids and closest points.
def closestcentroids(self):
centroids = self.centroids
X = self.x
dim = self.dim
if dim == 1:
# works only for 1D data, not n*D like Octave script
dist = (centroids - X) ** 2
self.index = dist.argmin(axis=0)
# X must have dimensions m,n. Centroids must have dimensions m,1,n.
# It's a 3D vector to keep it in one 'row.'
elif dim > 1:
# axis=2 for 3d through trial-and-error. same w/T!
dist = np.linalg.norm(X - centroids, axis=2).T
self.index = dist.argmin(axis=1)
else:
print 'Can\'t understand how many dimensions this data has!'
def newcentroids(self):
centroids = self.centroids
idx = self.index
X = self.x
dim = X.shape[1]
for i in range(centroids.shape[0]):
centroids[i][0] = X[idx == i].mean(axis=0)
self.centroids = centroids
# sometimes X[idx==i] is empty when centroids are too close to each other.
# i just skip this calculation in run_k_means, using pass.
# Part 4: Running the K-means clustering.
def run_k_means(self):
"""return centroid position, how many for each, and cost"""
centroids = self.centroids
for i in range(self.max_iters):
self.closestcentroids()
self.newcentroids()
J = 0
X = self.x
m = len(X)
idx = self.index
K = self.K
dim = X.shape[1]
for num in range(K):
# find the index of all entries where idx==n
indexentries = np.nonzero(idx == num)[0]
# the values in X that have the index in indesxentries
values = X[indexentries]
# using one of the K centroids to do the calculation. K<=2 doesn't
# work here for some reason.
centroid = centroids[num, 0]
J += np.sum((values - centroid) ** 2)
return [centroids.reshape((1, K, dim)), [X[idx == k].size for k in range(K)], J / m]
# Part 5: Computing the cost.
def compute_cost(self, index, X):
J = 0
m = len(X)
X = self.x
for num in range(K):
# find the index of all entries where idx==n
indexentries = np.nonzero(index == num)[0]
# the values in X that have the index in indesxentries
values = X[indexentries]
# using one of the K centroids to do the calculation
centroid = centroids[0, num]
J += np.sum((values - centroid) ** 2)
return J / m
# Part 6: Computing the centroids multiple times to find the one w/lowest cost.
def k_means_multiple(self, K):
"""How many centroids K to cluster the data with? The algorithm will cycle through self.max_iters times for each try, self.tries."""
self.K = K
table = []
for numberoftimes in range(self.tries):
self.randomcentroids()
try:
atry = self.run_k_means()
table.append(atry)
except ValueError:
pass
c = ['centroid position', 'how many for each', 'J']
self.table = pd.DataFrame(table, columns=c).sort_index(by=['J']).head()
self.table = self.table[['J', 'how many for each', 'centroid position']]
# Part 7: Trying multiple # of centroids, or K, and plotting the cost J for each.
def plotj(self, maxK):
"""Plots the cost for each number of centroids K to find the optimal number of centroids.
How many different Ks to calculate, in order to find the right number of centroids?"""
list = []
print 'Calculating for',
for k in range(2, maxK + 1):
if k==maxK:
print 'K={}'.format(k)
else:
print 'K={},'.format(k),
self.k_means_multiple(k)
table = self.table
table = table.reset_index()
table = table.drop('index', 1)
iwant = table.ix[0]
J = iwant['J']
list.append([k, J])
toplot = pd.DataFrame(list)
toplot = toplot.set_index(0)
ax = toplot.plot(
legend=False, ylim=0, xlim=2, xticks=range(2, maxK + 1))
ax.set_xlabel('Number of centroids')
ax.set_ylabel('Cost of centroids and their points')
def fit(self, values, clusters):
self.x = values
try:
self.dim = self.x.shape[1]
except:
self.dim = 1
self.K = clusters
self.randomcentroids()
print 'Running K-means {} times, moving the {} centroids a max of {} times on each try...'.format(self.tries, self.K, self.max_iters)
self.k_means_multiple(self.K)
print self.table
def find_centroid_for_each(self):
"""Which entries belong to which groups?"""
step1 = self.table['centroid position'].reset_index().ix[0]
step2 = np.array(step1)[1]
self.centroids = step2.reshape(self.K, 1, self.dim)
self.closestcentroids()
return self.index