/**
 * @file k_means_clustering.c
 * @brief K-Means Clustering algorithm implementation
 * @details
 * This file implements the K-Means clustering algorithm
 * and prints its test output in EPS format.
 *
 * Note:
 * Although the clustering code works for any 2D data points,
 * and can be extended to vectors of any dimension with the
 * required changes, the output routine printEPS is only
 * suitable for polar data points, i.e. points inside a
 * circle, which is what both tests use.
 * @author [Lakhan Nad](https://github.com/Lakhan-Nad)
 */
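/*
 * A typical way to build and run this file (assuming GCC or Clang on a
 * POSIX system; -lm links the math library needed for sin, cos and M_PI):
 * ```sh
 * gcc k_means_clustering.c -o k_means_clustering -lm
 * ./k_means_clustering > image.eps
 * ```
 */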
#define _USE_MATH_DEFINES /* required for MS Visual C */
#include <float.h> /* DBL_MAX */
#include <math.h> /* M_PI, sin, cos */
#include <stdio.h> /* printf */
#include <stdlib.h> /* rand */
#include <string.h> /* memset */
#include <time.h> /* time */
/*!
* @addtogroup machine_learning Machine Learning Algorithms
* @{
* @addtogroup k_means K-Means Clustering Algorithm
* @{
*/
/*! @struct observation
 * a structure to store a point in the 2D plane;
 * the name observation is used to denote
 * a random data point in the plane
 */
typedef struct observation
{
double x; /**< abscissa of 2D data point */
double y; /**< ordinate of 2D data point */
int group; /**< index of the cluster to which this observation is assigned */
} observation;
/*! @struct cluster
 * this structure stores the coordinates
 * of the centroid of all the points
 * in a cluster; it also stores
 * the count of observations
 * belonging to that cluster
 */
typedef struct cluster
{
double x; /**< abscissa of centroid of this cluster */
double y; /**< ordinate of centroid of this cluster */
size_t count; /**< count of observations present in this cluster */
} cluster;
/*!
 * Returns the index of the centroid nearest to
 * a given observation
 *
 * @param o observation
 * @param clusters array of clusters holding the centroid coordinates
 * @param k size of the clusters array
 *
 * @returns the index of the nearest centroid for the given observation
 */
int calculateNearst(observation* o, cluster clusters[], int k)
{
double minD = DBL_MAX;
double dist = 0;
int index = -1;
int i = 0;
for (; i < k; i++)
{
/* squared Euclidean distance; sqrt is omitted because it
does not change the ordering */
dist = (clusters[i].x - o->x) * (clusters[i].x - o->x) +
(clusters[i].y - o->y) * (clusters[i].y - o->y);
if (dist < minD)
{
minD = dist;
index = i;
}
}
return index;
}
/*!
 * Calculates the centroid of the given observations and
 * assigns it to the cluster variable
 *
 * @param observations an array of observations whose centroid is calculated
 * @param size size of the observations array
 * @param centroid a reference to the cluster object that stores the
 * centroid information
 */
void calculateCentroid(observation observations[], size_t size,
cluster* centroid)
{
size_t i = 0;
centroid->x = 0;
centroid->y = 0;
centroid->count = size;
for (; i < size; i++)
{
centroid->x += observations[i].x;
centroid->y += observations[i].y;
observations[i].group = 0;
}
centroid->x /= centroid->count;
centroid->y /= centroid->count;
}
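/*
 * A quick worked example of the centroid computation above (values
 * chosen only for illustration): for the observations (0,0), (2,0)
 * and (1,3), the centroid is ((0 + 2 + 1) / 3, (0 + 0 + 3) / 3) = (1, 1).
 */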
/*!
 * --K-Means Algorithm--
 * 1. Assign each observation to one of the k groups
 * at random, creating an initial clustering
 * 2. Compute the centroid of the observations in each
 * cluster to form the new centroids
 * 3. For each observation, find the nearest of the
 * newly calculated centroids
 * 4. Assign each observation to its nearest centroid
 * to create a new clustering
 * 5. Repeat steps 2-4 until the clustering no longer
 * changes (or changes only negligibly) between
 * iterations
 *
 * @param observations an array of observations to cluster
 * @param size size of the observations array
 * @param k number of clusters to create
 *
 * @returns pointer to a dynamically allocated array of clusters
 * (the caller is responsible for freeing it)
 */
cluster* kMeans(observation observations[], size_t size, int k)
{
cluster* clusters = NULL;
if (k <= 1)
{
/*
If we have to cluster the observations into
only one group, then the centroid of all
observations is that single cluster
*/
clusters = (cluster*)malloc(sizeof(cluster));
memset(clusters, 0, sizeof(cluster));
calculateCentroid(observations, size, clusters);
}
else if (k < size)
{
clusters = malloc(sizeof(cluster) * k);
memset(clusters, 0, k * sizeof(cluster));
/* STEP 1 */
for (size_t j = 0; j < size; j++)
{
observations[j].group = rand() % k;
}
size_t changed = 0;
size_t minAcceptedError =
size /
10000; // stop once at most 0.01% of points change cluster
int t = 0;
do
{
/* Initialize clusters */
for (int i = 0; i < k; i++)
{
clusters[i].x = 0;
clusters[i].y = 0;
clusters[i].count = 0;
}
/* STEP 2*/
for (size_t j = 0; j < size; j++)
{
t = observations[j].group;
clusters[t].x += observations[j].x;
clusters[t].y += observations[j].y;
clusters[t].count++;
}
for (int i = 0; i < k; i++)
{
clusters[i].x /= clusters[i].count;
clusters[i].y /= clusters[i].count;
}
/* STEP 3 and 4 */
changed = 0; // this variable stores change in clustering
for (size_t j = 0; j < size; j++)
{
t = calculateNearst(observations + j, clusters, k);
if (t != observations[j].group)
{
changed++;
observations[j].group = t;
}
}
} while (changed > minAcceptedError); // keep refining until the
// clustering is (almost) stable
}
else
{
/* If the number of clusters requested is at least the
number of observations, each observation becomes its own
cluster (any surplus clusters stay zero-initialized)
*/
clusters = (cluster*)malloc(sizeof(cluster) * k);
memset(clusters, 0, k * sizeof(cluster));
for (size_t j = 0; j < size; j++)
{
clusters[j].x = observations[j].x;
clusters[j].y = observations[j].y;
clusters[j].count = 1;
observations[j].group = (int)j;
}
}
return clusters;
}
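/*
 * A minimal usage sketch for kMeans (illustrative only; the sample
 * points and k = 2 are arbitrary and not part of the tests below):
 * ```c
 * observation pts[] = {{0.0, 0.0, 0}, {0.5, 0.5, 0},
 *                      {10.0, 10.0, 0}, {10.5, 9.5, 0}};
 * cluster* cl = kMeans(pts, 4, 2); // expect centroids near
 *                                  // (0.25, 0.25) and (10.25, 9.75)
 * free(cl);                        // the caller owns the returned array
 * ```
 */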
/**
* @}
* @}
*/
/*!
 * A function to print the observations and clusters as EPS.
 * The code is adapted from
 * http://rosettacode.org/wiki/K-means%2B%2B_clustering,
 * which also inspired the K-Means code itself.
 *
 * @note To write the output to a file, redirect stdout:
 * ```sh
 * ./k_means_clustering > image.eps
 * ```
 *
 * @param pts observations array
 * @param len size of the observations array
 * @param cent array of cluster centroids
 * @param k size of the cent array
 */
void printEPS(observation pts[], size_t len, cluster cent[], int k)
{
int W = 400, H = 400;
/* DBL_MIN is the smallest positive double, so -DBL_MAX is the
correct initial maximum */
double min_x = DBL_MAX, max_x = -DBL_MAX, min_y = DBL_MAX, max_y = -DBL_MAX;
double scale = 0, cx = 0, cy = 0;
double* colors = (double*)malloc(sizeof(double) * (k * 3));
int i;
size_t j;
double kd = k * 1.0;
for (i = 0; i < k; i++)
{
*(colors + 3 * i) = (3 * (i + 1) % k) / kd;
*(colors + 3 * i + 1) = (7 * i % k) / kd;
*(colors + 3 * i + 2) = (9 * i % k) / kd;
}
for (j = 0; j < len; j++)
{
if (max_x < pts[j].x)
{
max_x = pts[j].x;
}
if (min_x > pts[j].x)
{
min_x = pts[j].x;
}
if (max_y < pts[j].y)
{
max_y = pts[j].y;
}
if (min_y > pts[j].y)
{
min_y = pts[j].y;
}
}
scale = W / (max_x - min_x);
if (scale > (H / (max_y - min_y)))
{
scale = H / (max_y - min_y);
}
cx = (max_x + min_x) / 2;
cy = (max_y + min_y) / 2;
printf("%%!PS-Adobe-3.0 EPSF-3.0\n%%%%BoundingBox: -5 -5 %d %d\n", W + 10,
H + 10);
printf(
"/l {rlineto} def /m {rmoveto} def\n"
"/c { .25 sub exch .25 sub exch .5 0 360 arc fill } def\n"
"/s { moveto -2 0 m 2 2 l 2 -2 l -2 -2 l closepath "
" gsave 1 setgray fill grestore gsave 3 setlinewidth"
" 1 setgray stroke grestore 0 setgray stroke }def\n");
for (int i = 0; i < k; i++)
{
printf("%g %g %g setrgbcolor\n", *(colors + 3 * i),
*(colors + 3 * i + 1), *(colors + 3 * i + 2));
for (j = 0; j < len; j++)
{
if (pts[j].group != i)
{
continue;
}
printf("%.3f %.3f c\n", (pts[j].x - cx) * scale + W / 2,
(pts[j].y - cy) * scale + H / 2);
}
printf("\n0 setgray %g %g s\n", (cent[i].x - cx) * scale + W / 2,
(cent[i].y - cy) * scale + H / 2);
}
printf("\n%%%%EOF");
// free the acquired memory
free(colors);
}
/*!
 * A function to test the kMeans function:
 * generates 100000 points inside a circle of
 * radius 20.0 centered at (0,0)
 * and clusters them into 5 clusters
 *
 * <img alt="Output for 100000 points divided in 5 clusters" src=
 * "https://raw.githubusercontent.com/TheAlgorithms/C/docs/images/machine_learning/k_means_clustering/kMeansTest1.png"
 * width="400px" height="400px">
 * @returns None
 */
static void test()
{
size_t size = 100000L;
observation* observations =
(observation*)malloc(sizeof(observation) * size);
double maxRadius = 20.00;
double radius = 0;
double ang = 0;
size_t i = 0;
for (; i < size; i++)
{
radius = maxRadius * ((double)rand() / RAND_MAX);
ang = 2 * M_PI * ((double)rand() / RAND_MAX);
observations[i].x = radius * cos(ang);
observations[i].y = radius * sin(ang);
}
int k = 5; // number of clusters
cluster* clusters = kMeans(observations, size, k);
printEPS(observations, size, clusters, k);
// free the acquired memory
free(observations);
free(clusters);
}
/*!
 * A function to test the kMeans function:
 * generates 1000000 points inside a circle of
 * radius 20.0 centered at (0,0)
 * and clusters them into 11 clusters
 *
 * <img alt="Output for 1000000 points divided in 11 clusters" src=
 * "https://raw.githubusercontent.com/TheAlgorithms/C/docs/images/machine_learning/k_means_clustering/kMeansTest2.png"
 * width="400px" height="400px">
 * @returns None
 */
void test2()
{
size_t size = 1000000L;
observation* observations =
(observation*)malloc(sizeof(observation) * size);
double maxRadius = 20.00;
double radius = 0;
double ang = 0;
size_t i = 0;
for (; i < size; i++)
{
radius = maxRadius * ((double)rand() / RAND_MAX);
ang = 2 * M_PI * ((double)rand() / RAND_MAX);
observations[i].x = radius * cos(ang);
observations[i].y = radius * sin(ang);
}
int k = 11; // number of clusters
cluster* clusters = kMeans(observations, size, k);
printEPS(observations, size, clusters, k);
// free the acquired memory
free(observations);
free(clusters);
}
/*!
 * Main function: seeds the random number
 * generator and runs the first test
 */
int main()
{
srand(time(NULL));
test();
/* test2(); */
return 0;
}