Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Features/599 kmeans and friends #634

Merged
merged 28 commits into from
Jul 22, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
713d9cb
Implementation of Manhattan distance function (L1 distance)
Jul 7, 2020
4b3250b
Implement base class for k-clustering algorithms
Jul 8, 2020
2fd8514
Implementation of Kmedians and bugfix in Manhattan distance
Jul 8, 2020
acf68fa
Implementation of kmedoids as kmedians version that snaps median to n…
Jul 8, 2020
af85223
Bugfix for init
Jul 8, 2020
411753d
Refactoring of Implementation Detail Classes
Jul 8, 2020
4a5b89b
Failsafe for median k-clusterers in case no point is assigned to the …
Jul 8, 2020
1d5c51b
Bugsquashin for kmedians, extended test for kmeans
Jul 9, 2020
6b7e9bf
Merge remote-tracking branch 'upstream/master' into features/599-kmea…
Jul 10, 2020
46a2003
Enhance testing for kmeans with spherical cluster dataset (Adresses #…
Jul 13, 2020
de43b16
Enhance testing for kmedians with spherical cluster dataset
Jul 13, 2020
4e01da4
Fixing Kmedoids
Jul 13, 2020
0417141
Fixed Tests for Kmedoids
Jul 13, 2020
2b06c00
Merge remote-tracking branch 'upstream/master' into features/599-kmea…
Jul 14, 2020
83703f6
Tweaking tests, found bugs
Jul 14, 2020
82b25ca
Writing Documentation
Jul 14, 2020
ad1d3d2
Intermediate State of fixing spherical cluster tests
Jul 15, 2020
e2bb708
Introducing example for k-clustering algorithms
Jul 16, 2020
e9e6a96
Merge remote-tracking branch 'upstream/master' into features/599-kmea…
Jul 16, 2020
b7d61b2
Moved test on spherical clusters to example; removed unsplit test for…
Jul 16, 2020
51fd4b6
Changed Changelog
Jul 16, 2020
ae4f374
Unit tests for manhattan distance
Jul 16, 2020
e305864
Fix typo in tests
Jul 16, 2020
fc9add6
Merge remote-tracking branch 'upstream/master' into features/599-kmea…
Jul 22, 2020
901f3d1
Changes from PR review
Jul 22, 2020
2f9c10e
Further changes from PR review
Jul 22, 2020
c77261c
Final Change in print
Jul 22, 2020
ef92850
Spellcheck
Jul 22, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
- [#620](https://github.com/helmholtz-analytics/heat/pull/620) New feature: KNN
- [#624](https://github.com/helmholtz-analytics/heat/pull/624) Bugfix: distributed median() indexing and casting
- [#629](https://github.com/helmholtz-analytics/heat/pull/629) New features: `asin`, `acos`, `atan`, `atan2`
- [#634](https://github.com/helmholtz-analytics/heat/pull/634) New features: `kmedians`, `kmedoids`, `manhattan`
- [#633](https://github.com/helmholtz-analytics/heat/pull/633) Documentation: updated contributing.md
- [#635](https://github.com/helmholtz-analytics/heat/pull/635) `DNDarray.__getitem__` balances and resplits the given key to None if the key is a DNDarray

Expand Down
106 changes: 106 additions & 0 deletions examples/cluster/demo_kClustering.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
import torch
import heat as ht


def create_spherical_dataset(
num_samples_cluster, radius=1.0, offset=4.0, dtype=ht.float32, random_state=1
):
"""
Creates k=4 sperical clusters in 3D space along the space-diagonal

Parameters
----------
num_samples_cluster: int
Number of samples per cluster. Each process will create n // MPI_WORLD.size elements for each cluster
radius: float
Radius of the sphere
offset: float
Shift of the clusters along the axes. The 4 clusters will be positioned centered around c1=(offset, offset,offset),
c2=(2*offset,2*offset,2*offset), c3=(-offset, -offset, -offset) and c4=(2*offset, -2*offset, -2*offset)
dtype: ht.datatype
random_state: int
seed of the torch random number generator
"""
# create k sperical clusters with each n elements per cluster. Each process creates k * n/p elements
ht.random.seed(random_state)
# radius between 0 and 1
r = ht.random.rand(num_samples_cluster, split=0) * radius
# theta between 0 and pi
theta = ht.random.rand(num_samples_cluster, split=0) * ht.constants.PI
# phi between 0 and 2pi
phi = ht.random.rand(num_samples_cluster, split=0) * 2 * ht.constants.PI
# Cartesian coordinates
x = r * ht.sin(theta) * ht.cos(phi)
x.astype(dtype, copy=False)
y = r * ht.sin(theta) * ht.sin(phi)
y.astype(dtype, copy=False)
z = r * ht.cos(theta)
z.astype(dtype, copy=False)

cluster1 = ht.stack((x + offset, y + offset, z + offset), axis=1)
cluster2 = ht.stack((x + 2 * offset, y + 2 * offset, z + 2 * offset), axis=1)
cluster3 = ht.stack((x - offset, y - offset, z - offset), axis=1)
cluster4 = ht.stack((x - 2 * offset, y - 2 * offset, z - 2 * offset), axis=1)

data = ht.concatenate((cluster1, cluster2, cluster3, cluster4), axis=0)
return data


def main():
seed = 1
n = 20 * ht.MPI_WORLD.size

data = create_spherical_dataset(
num_samples_cluster=n, radius=1.0, offset=4.0, dtype=ht.float32, random_state=seed
)
reference = ht.array([[-8, -8, -8], [-4, -4, -4], [4, 4, 4], [8, 8, 8]], dtype=ht.float32)

clusterer = {
"kmeans": ht.cluster.KMeans(n_clusters=4, init="kmeans++"),
"kmedians": ht.cluster.KMedians(n_clusters=4, init="kmedians++"),
"kmedoids": ht.cluster.KMedoids(n_clusters=4, init="kmedoids++"),
}

print(f"4 Spherical clusters with radius 1.0, each {n} samples (dtype = ht.float32)")
for name, c in clusterer.items():
c.fit(data)
print(
f"### Fitting with {name} ### \n"
f"Original sphere centers = {reference}\n"
f"Fitted cluster centers = {c.cluster_centers_}"
)

# More Samples
n = 100 * ht.MPI_WORLD.size
data = create_spherical_dataset(
num_samples_cluster=n, radius=1.0, offset=4.0, dtype=ht.float32, random_state=seed
)
print("4 Spherical with radius 1.0, each {} samples (dtype = ht.float32) ".format(n))
for name, c in clusterer.items():
c.fit(data)
print(
f"### Fitting with {name} ### \n"
f"Original sphere centers = {reference}\n"
f"Fitted cluster centers = {c.cluster_centers_}"
)

# On integers (different radius, offset and datatype)
n = 20 * ht.MPI_WORLD.size
data = create_spherical_dataset(
num_samples_cluster=n, radius=10.0, offset=40.0, dtype=ht.int32, random_state=seed
)
reference = ht.array(
[[-80, -80, -80], [-40, -40, -40], [40, 40, 40], [80, 80, 80]], dtype=ht.float32
)
print("4 Spherical clusters with radius 10, each {} samples (dtype = ht.int32) ".format(n))
for name, c in clusterer.items():
c.fit(data)
print(
f"### Fitting with {name} ### \n"
f"Original sphere centers = {reference}\n"
f"Fitted cluster centers = {c.cluster_centers_}"
)


if __name__ == "__main__":
main()
3 changes: 3 additions & 0 deletions heat/cluster/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,5 @@
from ._kcluster import *
from .kmeans import *
from .kmedians import *
from .kmedoids import *
from .spectral import *
249 changes: 249 additions & 0 deletions heat/cluster/_kcluster.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,249 @@
import heat as ht


class _KCluster(ht.ClusteringMixin, ht.BaseEstimator):
"""
Base class for k-statistics clustering algorithms (kmeans, kmedians, kmedoids).
The clusters are represented by centroids ci (we use the term from kmeans for simplicity)

Parameters
----------
metric : function
One of the distance metrics in ht.spatial.distance. Needs to be passed as lambda function to take only two arrays as input
n_clusters : int
The number of clusters to form as well as the number of centroids to generate.
init : str or DNDarray
Method for initialization, defaults to ‘random’:
- ‘probability_based’ : selects initial cluster centers for the clustering in a smart way to speed up convergence (k-means++) \n
Markus-Goetz marked this conversation as resolved.
Show resolved Hide resolved
- ‘random’: choose k observations (rows) at random from data for the initial centroids. \n
- ``DNDarray``: gives the initial centers, should be of Shape = (n_clusters, n_features)
max_iter : int
Maximum number of iterations for a single run.
tol : float, default: 1e-4
Relative tolerance with regards to inertia to declare convergence.
random_state : int
Determines random number generation for centroid initialization.
"""

def __init__(self, metric, n_clusters, init, max_iter, tol, random_state):
self.n_clusters = n_clusters
self.init = init
self.max_iter = max_iter
self.tol = tol
self.random_state = random_state

# in-place properties
self._metric = metric
self._cluster_centers = None
self._labels = None
self._inertia = None
self._n_iter = None

@property
def cluster_centers_(self):
"""
Returns the coordinates of cluster centers.
"""
return self._cluster_centers

@property
def labels_(self):
"""
Returns the labels of each point.
"""
return self._labels

@property
def inertia_(self):
"""
Returns sum of squared distances of samples to their closest cluster center.
"""
return self._inertia

@property
def n_iter_(self):
"""
Returns the number of iterations run.
"""
return self._n_iter

def _initialize_cluster_centers(self, X):
"""
Initializes the K-Means centroids.

Parameters
----------
X : DNDarray,
The data to initialize the clusters for. Shape = (n_point, n_features)
"""
# always initialize the random state
if self.random_state is not None:
ht.random.seed(self.random_state)

# initialize the centroids by randomly picking some of the points
if self.init == "random":
# Samples will be equally distributed drawn from all involved processes
_, displ, _ = X.comm.counts_displs_shape(shape=X.shape, axis=0)
# At least float32 precision, if X has higher precision datatype, use that
datatype = ht.promote_types(X.dtype, ht.float32)
centroids = ht.empty(
(self.n_clusters, X.shape[1]),
split=None,
dtype=datatype,
device=X.device,
comm=X.comm,
)
if X.split is None or X.split == 0:
for i in range(self.n_clusters):
samplerange = (
X.gshape[0] // self.n_clusters * i,
X.gshape[0] // self.n_clusters * (i + 1),
)
sample = ht.random.randint(samplerange[0], samplerange[1]).item()
proc = 0
for p in range(X.comm.size):
if displ[p] > sample:
break
proc = p
xi = ht.zeros(X.shape[1], dtype=X.dtype)
if X.comm.rank == proc:
idx = sample - displ[proc]
xi = ht.array(X.lloc[idx, :], device=X.device, comm=X.comm)
xi.comm.Bcast(xi, root=proc)
centroids[i, :] = xi

else:
raise NotImplementedError("Not implemented for other splitting-axes")

self._cluster_centers = centroids

# directly passed centroids
elif isinstance(self.init, ht.DNDarray):
if len(self.init.shape) != 2:
raise ValueError(
"passed centroids need to be two-dimensional, but are {}".format(len(self.init))
)
if self.init.shape[0] != self.n_clusters or self.init.shape[1] != X.shape[1]:
raise ValueError("passed centroids do not match cluster count or data shape")
self._cluster_centers = self.init.resplit(None)

# smart centroid guessing, random sampling with probability weight proportional to distance to existing centroids
elif self.init == "probability_based":
datatype = ht.promote_types(X.dtype, ht.float32)
if X.split is None or X.split == 0:
centroids = ht.zeros(
(self.n_clusters, X.shape[1]),
split=None,
dtype=datatype,
device=X.device,
comm=X.comm,
)
sample = ht.random.randint(0, X.shape[0] - 1).item()
_, displ, _ = X.comm.counts_displs_shape(shape=X.shape, axis=0)
proc = 0
for p in range(X.comm.size):
if displ[p] > sample:
break
proc = p
x0 = ht.zeros(X.shape[1], dtype=X.dtype, device=X.device, comm=X.comm)
if X.comm.rank == proc:
idx = sample - displ[proc]
x0 = ht.array(X.lloc[idx, :], device=X.device, comm=X.comm)
x0.comm.Bcast(x0, root=proc)
centroids[0, :] = x0
for i in range(1, self.n_clusters):
distances = self._metric(X, centroids)
D2 = distances.min(axis=1)
D2.resplit_(axis=None)
prob = D2 / D2.sum()
x = ht.random.rand().item()
sample = 0
sum = 0
for j in range(len(prob)):
if sum > x:
break
sum += prob[j].item()
sample = j
proc = 0
for p in range(X.comm.size):
if displ[p] > sample:
break
proc = p
xi = ht.zeros(X.shape[1], dtype=X.dtype)
if X.comm.rank == proc:
idx = sample - displ[proc]
xi = ht.array(X.lloc[idx, :], device=X.device, comm=X.comm)
xi.comm.Bcast(xi, root=proc)
centroids[i, :] = xi

else:
raise NotImplementedError("Not implemented for other splitting-axes")

self._cluster_centers = centroids

else:
raise ValueError(
'init needs to be one of "random", ht.DNDarray or "kmeans++", but was {}'.format(
self.init
)
)

def _assign_to_cluster(self, X):
"""
Assigns the passed data points to the centroids based on the respective metric

Parameters
----------
X : DNDarray
Data points, Shape = (n_samples, n_features)
"""
# calculate the distance matrix and determine the closest centroid
distances = self._metric(X, self._cluster_centers)
matching_centroids = distances.argmin(axis=1, keepdim=True)

return matching_centroids

def _update_centroids(self, X, matching_centroids):
"""
The Update strategy is algorithm specific (e.g. calculate mean of assigned points for kmeans, median for kmedians, etc.)

Parameters
----------
X : DNDarray
Input Data
matching_centroids: DNDarray
Index array of assigned centroids

"""
raise NotImplementedError()

def fit(self, X):
"""
Computes the centroid of the clustering algorithm to fit the data X. The full pipeline is algorithm specific.

Parameters
----------
X : DNDarray
Training instances to cluster. Shape = (n_samples, n_features)

"""
raise NotImplementedError()

def predict(self, X):
"""
Predict the closest cluster each sample in X belongs to.

In the vector quantization literature, cluster_centers_ is called the code book and each value returned by
predict is the index of the closest code in the code book.

Parameters
----------
X : DNDarray
New data to predict. Shape = (n_samples, n_features)
"""
# input sanitation
if not isinstance(X, ht.DNDarray):
raise ValueError("input needs to be a ht.DNDarray, but was {}".format(type(X)))

# determine the centroids
return self._assign_to_cluster(X)
Loading