#! /usr/bin/env python
# -*- coding: utf-8 -*-
#############################
# This file implements DC-k-means; it is adapted from the publicly available implementation
# at https://github.com/tginart/deletion-efficient-kmeans.
#############################
import numpy as np
from utils import induced_loss
from tqdm import tqdm
from sklearn.cluster import KMeans
from model import MyKmeans
import time
class DCnode(MyKmeans):
"""
    A k-means subproblem (tree node) in the divide-and-conquer
    tree of the DC-k-means algorithm.
"""
def __init__(self, k, iters):
MyKmeans.__init__(self, k, max_iters=iters)
self.children = []
self.parent = None
self.node_data = set()
self.data_prop = set()
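        # node_data: indices (into this layer's data matrix) handled by this node;
        # data_prop: slots in the parent layer's data matrix that hold this node's centroids.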
def _run_node(self, X):
self._set_node_data(X)
self._lloyd_iterations()
def _set_node_data(self, X):
self.data = X[list(self.node_data)]
self._set_data(self.data)
class DCKmeans():
def __init__(self, ks, widths, iters=10, n_init=10):
"""
        ks - list of k parameters, one per layer of the DC-tree
        widths - list of width parameters (number of buckets) per layer
        iters - number of Lloyd iterations to run per subproblem
        n_init - number of initializations for the root sklearn KMeans
"""
self.ks = ks
self.widths = widths
self.dc_tree = self._init_tree(ks, widths, iters, n_init)
self.data_partition_table = dict()
self.data = dict()
self.dels = set()
self.valid_ids = []
self.d = 0
self.n = 0
self.h = len(self.dc_tree)
for i in range(self.h):
self.data[i] = None
def run(self, dataset, assignments=True):
"""
        dataset - dict with key "full_data" (n-by-d numpy matrix, one data
                  point per row) and one "client_<i>" entry per leaf
        assignments (optional) - bool flag; also compute assignments and loss
        NOTE: without the assignments flag, this only returns the centroids
        OUTPUT:
        centroids - k-by-d matrix of centroids
        IF assignments FLAG IS SET, ALSO RETURNS:
        assignments - vector of length n with datapoint-to-center assignments
        loss - the induced loss of the final partition
"""
X = dataset["full_data"]
self._init_data(X)
        # use the fixed (deterministic) partition; a random partition via
        # self._partition_data(X) is also available
        self._set_partition_data(dataset)
self._run()
if assignments:
# self.centroids is the root node (server) centroids
# we should use this to compute the induced loss
# note that we only consider the two-layer structure here
if self.h != 2:
raise NotImplementedError
# self.assignments is the root node (server) assignment
# the root node now is a sklearn.KMeans class
# self.assignments = self.dc_tree[0][0].assignments
self.assignments = self.dc_tree[0][0].labels_
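            # Each leaf contributes self.ks[1] centroid rows to the root's data
            # matrix in leaf order, so leaf-local assignment a within leaf j
            # corresponds to root data row j * self.ks[1] + a.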
local_induced_loss = 0.0
leaf_start_idx = 0
global_assignment = []
for leaf_id in range(len(self.dc_tree[-1])):
leaf = self.dc_tree[-1][leaf_id] # DCnode
leaf_global_assignment = self.assignments[leaf.assignments +
leaf_start_idx]
leaf_start_idx += self.ks[1]
local_induced_loss += induced_loss(leaf.data, self.centroids,
leaf_global_assignment)
global_assignment.append(leaf_global_assignment)
global_assignment = np.concatenate(global_assignment)
return self.centroids, global_assignment, local_induced_loss
return self.centroids
def delete(self, del_idx, assignments=True):
"""
        del_idx is an index into the updated (post-deletion) mapping, not the
        original one; idx = self.valid_ids[del_idx] is the index of the removed
        point with respect to the original full data.
"""
start = time.time()
idx = self.valid_ids[del_idx]
self.valid_ids.pop(del_idx)
self.dels.add(idx)
node = self.dc_tree[-1][self.data_partition_table[idx]]
node.node_data.remove(idx)
l = self.h - 1
self.n -= 1
while True:
            if node.parent is None:
# root node, sklearn.KMeans object
# get the data for root node
# root_data = np.vstack([self.data[0][root_data_idx] for root_data_idx in range(self.widths[1]*self.ks[1])])
node.fit(
self.data[0]
) # fit new data to sklearn.KMeans will reset parameters automatically
self.centroids = node.cluster_centers_
break
# otherwise node is normal DCnode
node._init_placeholders() # need to reset manually for MyKmeans
node._run_node(self.data[l])
            # sort so that centroid c_id of this node always lands in the same
            # slot of the parent layer (set iteration order is not guaranteed)
            data_prop = sorted(node.data_prop)
            for c_id in range(len(node.centroids)):
                prop_idx = data_prop[c_id]  # slot in the parent layer's data matrix
                self.data[l - 1][prop_idx] = node.centroids[c_id]
node = node.parent
l -= 1
removal_time = time.time() - start
# compute the induced loss after update
if assignments:
# self.centroids is the root node (server) centroids
# we should use this to compute the induced loss
# note that we only consider the two-layer structure here
if self.h != 2:
raise NotImplementedError
# self.assignments is the root node (server) assignment
self.assignments = self.dc_tree[0][0].labels_
local_induced_loss = 0.0
leaf_start_idx = 0
global_assignment = []
for leaf_id in range(len(self.dc_tree[-1])):
leaf = self.dc_tree[-1][leaf_id] # DCnode
leaf_global_assignment = self.assignments[leaf.assignments +
leaf_start_idx]
leaf_start_idx += self.ks[1]
local_induced_loss += induced_loss(leaf.data, self.centroids,
leaf_global_assignment)
global_assignment.append(leaf_global_assignment)
global_assignment = np.concatenate(global_assignment)
return self.centroids, global_assignment, local_induced_loss, removal_time
return self.centroids, removal_time
def _init_data(self, X):
self.n = len(X)
self.valid_ids = list(range(self.n))
self.d = len(X[0])
data_layer_size = self.n
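        # The leaf layer (h - 1) stores the raw points; each higher layer i - 1
        # stores the ks[i] * widths[i] centroids produced by layer i below it.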
for i in range(self.h - 1, -1, -1):
self.data[i] = np.zeros((data_layer_size, self.d))
data_layer_size = self.ks[i] * self.widths[i]
def _partition_data(self, X):
"""
Random data partition for leaves --> layer self.h-1
"""
self.d = len(X[0])
num_leaves = len(self.dc_tree[-1])
for i in range(len(X)):
leaf_id = np.random.choice(num_leaves)
leaf = self.dc_tree[-1][leaf_id]
self.data_partition_table[i] = leaf_id
leaf.node_data.add(i)
self.data[self.h - 1][i] = X[i]
def _set_partition_data(self, dataset):
"""
This function will set the data for each leaf based on dataset (dict) deterministically instead of random partition
"""
num_leaves = len(self.dc_tree[-1])
data_full = dataset["full_data"]
print("=" * 5, "preparing data for dc-kmeans", "=" * 5)
for leaf_id in tqdm(range(num_leaves)):
client_data = dataset["client_" + str(leaf_id)]
leaf = self.dc_tree[-1][leaf_id]
            for i_client_data in range(client_data.shape[0]):
                i_in_data_full = np.where(
                    (data_full == client_data[i_client_data]).all(axis=1))[0][0]
                # assert i_in_data_full.size == 1, "there are identical data points in dataset"
                # i_in_data_full = i_in_data_full[0]
                self.data_partition_table[i_in_data_full] = leaf_id
                leaf.node_data.add(i_in_data_full)
                self.data[self.h - 1][i_in_data_full] = \
                    data_full[i_in_data_full].copy()
                # Overwrite the matched row with a sentinel value so that a
                # duplicate data point can never match the same row twice.
                data_full[i_in_data_full] = np.ones((self.d, )) * 2.0
def _run(self):
print("=" * 5, "initial training of dc-kmeans", "=" * 5)
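        # Solve the tree bottom-up: every non-root node clusters its bucket and
        # writes its centroids into the layer above; the root (sklearn KMeans)
        # finally clusters all of the leaf centroids.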
for l in range(self.h - 1, -1, -1):
c = 0
for j in tqdm(range(self.widths[l])):
subproblem = self.dc_tree[l][j]
                if subproblem.parent is None:
# root_data = np.vstack([self.data[0][root_data_idx] for root_data_idx in range(self.widths[1]*self.ks[1])])
subproblem.fit(self.data[0])
self.centroids = subproblem.cluster_centers_
else:
subproblem._run_node(self.data[l])
for c_id in range(len(subproblem.centroids)):
subproblem.data_prop.add(c)
subproblem.parent.node_data.add(c)
self.data[l - 1][c] = subproblem.centroids[c_id]
c += 1
def _init_tree(self, ks, widths, iters, n_init):
# tree = [[DCnode(ks[0], iters)]] # root node
root_node = KMeans(n_clusters=ks[0], n_init=n_init, max_iter=iters)
root_node.parent = None
root_node.children = []
root_node.node_data = set()
tree = [[root_node]] # use sklearn.KMeans for the root node
for i in range(1, len(widths)):
k = ks[i]
assert widths[i] % widths[i -
1] == 0, "Inconsistent widths in tree"
            merge_factor = widths[i] // widths[i - 1]
level = []
for j in range(widths[i - 1]):
parent = tree[i - 1][j]
for _ in range(merge_factor):
child = DCnode(k, iters=iters)
child.parent = parent
parent.children.append(child)
level.append(child)
tree.append(level)
return tree
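

if __name__ == "__main__":
    # Minimal usage sketch, assuming utils.induced_loss and model.MyKmeans from
    # this repo are importable; the synthetic data and the "client_<i>" split
    # below are hypothetical stand-ins for the real experiment pipeline.
    rng = np.random.RandomState(0)
    X = rng.randn(400, 5)
    dataset = {"full_data": X.copy()}
    for i, chunk in enumerate(np.array_split(X, 4)):
        dataset["client_" + str(i)] = chunk
    # Two-layer tree: one root (k=3) over 4 leaf subproblems (k=3 each).
    dc = DCKmeans(ks=[3, 3], widths=[1, 4])
    centroids, assignment, loss = dc.run(dataset)
    print("initial induced loss:", loss)
    # Delete the first still-valid point and refit only the affected path.
    centroids, assignment, loss, removal_time = dc.delete(0)
    print("induced loss after one deletion:", loss)
    print("removal time (s):", removal_time)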