diff --git a/tests/communities/test_spectral.py b/tests/communities/test_spectral.py
new file mode 100644
index 00000000..483ab71e
--- /dev/null
+++ b/tests/communities/test_spectral.py
@@ -0,0 +1,117 @@
+import numpy as np
+
+import pytest
+
+import xgi
+from xgi.exception import XGIError
+
+
+class TestKMeans:
+    def test_k_is_1(self):
+        X = np.random.random((3, 3))
+        clusters = xgi.communities.spectral._kmeans(X, 1)
+
+        assert len(clusters) == 3
+        assert all(v == 1 for v in clusters.values())
+        assert all(isinstance(v, int) for v in clusters.values())
+
+    def test_perfectly_separable_low_dimensions(self):
+        X = np.zeros((10, 10))
+        X[:5, :] = np.random.random((5, 10))
+        X[5:10, :] = 37 + np.random.random((5, 10))
+
+        clusters = xgi.communities.spectral._kmeans(X, 2, seed=2)
+        assert len(clusters) == 10
+
+        c1 = list(filter(lambda node: clusters[node] == 0, clusters.keys()))
+        c2 = list(filter(lambda node: clusters[node] == 1, clusters.keys()))
+        assert len(c1) == 5
+        assert len(c2) == 5
+        assert (set(c1) == {0, 1, 2, 3, 4} and set(c2) == {5, 6, 7, 8, 9}) or (
+            set(c2) == {0, 1, 2, 3, 4} and set(c1) == {5, 6, 7, 8, 9}
+        )
+
+    def test_perfectly_separable_high_dimensions(self):
+        X = np.zeros((10, 100))
+        X[:5, :] = np.random.random((5, 100))
+        X[5:10, :] = 37 + np.random.random((5, 100))
+
+        clusters = xgi.communities.spectral._kmeans(X, 2, seed=2)
+        assert len(clusters) == 10
+
+        c1 = list(filter(lambda node: clusters[node] == 0, clusters.keys()))
+        c2 = list(filter(lambda node: clusters[node] == 1, clusters.keys()))
+        assert len(c1) == 5
+        assert len(c2) == 5
+        assert (set(c1) == {0, 1, 2, 3, 4} and set(c2) == {5, 6, 7, 8, 9}) or (
+            set(c2) == {0, 1, 2, 3, 4} and set(c1) == {5, 6, 7, 8, 9}
+        )
+
+
+class TestSpectralClustering:
+    def test_errors_num_clusters(self):
+        H = xgi.complete_hypergraph(5, order=2)
+
+        with pytest.raises(XGIError):
+            xgi.spectral_clustering(H, 6)
+
+    def test_perfectly_separable_low_dimensions(self):
+        H = xgi.Hypergraph(
+            [
+                [1, 2],
+                [2, 3],
+                [3, 4],
+                [4, 5],
+                [1, 3],
+                [2, 4],
+                [1, 5],
+                [6, 7],
+                [7, 8],
+                [8, 9],
+                [9, 10],
+                [6, 8],
+                [7, 9],
+                [6, 10],
+            ]
+        )
+
+        clusters = xgi.communities.spectral.spectral_clustering(H, 2)
+        assert len(clusters) == 10
+
+        c1 = list(filter(lambda node: clusters[node] == 0, clusters.keys()))
+        c2 = list(filter(lambda node: clusters[node] == 1, clusters.keys()))
+        assert len(c1) == 5
+        assert len(c2) == 5
+        assert (set(c1) == {1, 2, 3, 4, 5} and set(c2) == {6, 7, 8, 9, 10}) or (
+            set(c2) == {1, 2, 3, 4, 5} and set(c1) == {6, 7, 8, 9, 10}
+        )
+
+    def test_strongly_separable_low_dimensions(self):
+        H = xgi.Hypergraph(
+            [
+                [1, 2, 3],
+                [4, 5],
+                [1, 3],
+                [2, 4],
+                [1, 5],
+                [4, 9],
+                [6, 7, 8],
+                [7, 8],
+                [8, 9],
+                [9, 10],
+                [6, 8],
+                [7, 9],
+                [6, 10],
+            ]
+        )
+
+        clusters = xgi.communities.spectral.spectral_clustering(H, 2)
+        assert len(clusters) == 10
+
+        # Some nodes obviously in same cluster
+        assert clusters[1] == clusters[2]
+        assert clusters[2] == clusters[3]
+
+        # Some nodes obviously not
+        assert clusters[1] != clusters[8]
+        assert clusters[2] != clusters[7]
diff --git a/xgi/__init__.py b/xgi/__init__.py
index 9e6b48ab..aa8327f9 100644
--- a/xgi/__init__.py
+++ b/xgi/__init__.py
@@ -2,6 +2,7 @@
     utils,
     core,
     algorithms,
+    communities,
     convert,
     drawing,
     dynamics,
@@ -13,6 +14,7 @@
 from .utils import *
 from .core import *
 from .algorithms import *
+from .communities import *
 from .convert import *
 from .drawing import *
 from .dynamics import *
diff --git a/xgi/communities/__init__.py b/xgi/communities/__init__.py
new file mode 100644
index 00000000..84c5a562
--- /dev/null
+++ b/xgi/communities/__init__.py
@@ -0,0 +1,2 @@
+from . import spectral
+from .spectral import *
diff --git a/xgi/communities/spectral.py b/xgi/communities/spectral.py
new file mode 100644
index 00000000..bdb5f26e
--- /dev/null
+++ b/xgi/communities/spectral.py
@@ -0,0 +1,131 @@
+import numpy as np
+from scipy.sparse.linalg import eigsh
+
+from ..core import Hypergraph
+from ..linalg.laplacian_matrix import normalized_hypergraph_laplacian
+
+from ..exception import XGIError
+
+__all__ = [
+    "spectral_clustering",
+]
+
+MAX_ITERATIONS = 10_000
+
+
+def spectral_clustering(H, k=None):
+    """Cluster into k-many groups using spectral techniques.
+
+    Compute a spectral clustering according to the heuristic suggested in [1].
+
+    Parameters
+    ----------
+    H : Hypergraph
+        Hypergraph
+    k : int, optional
+        Number of clusters to find. Must currently be given explicitly; choosing
+        it automatically (e.g. from the spectral gap) is not implemented yet.
+
+    Returns
+    -------
+    dict
+        A dictionary mapping node ids to their clusters. Clusters begin at 0
+        for `k` > 1; for `k` == 1 every node gets the label 1.
+
+    Raises
+    ------
+    NotImplementedError
+        If `k` is None.
+    XGIError
+        If more groups are specified than nodes in the hypergraph.
+
+
+    References
+    ----------
+    .. [1] Zhou, D., Huang, J., & Schölkopf, B. (2006).
+        Learning with Hypergraphs: Clustering, Classification, and Embedding
+        Advances in Neural Information Processing Systems.
+
+    """
+    if k is None:
+        raise NotImplementedError(
+            "Choosing a number of clusters organically is currently unsupported. Please specify an integer value for paramater 'k'!"
+        )
+    if k > H.num_nodes:
+        raise XGIError(
+            "The number of desired clusters cannot exceed the number of nodes!"
+        )
+
+    # Compute the normalized Laplacian and its k algebraically smallest eigenpairs
+    L, rowdict = normalized_hypergraph_laplacian(H, index=True)
+    _, eigs = eigsh(L, k=k, which="SA")
+
+    # Form metric space representation (eigenvectors are the columns of eigs)
+    X = np.array(eigs)
+
+    # Apply k-means clustering
+    _clusters = _kmeans(X, k)
+
+    # Remap matrix row indices to node ids
+    return {rowdict[idx]: cluster for idx, cluster in _clusters.items()}
+
+
+def _kmeans(X, k, seed=37):
+    """Cluster the rows of `X` into `k` groups with Lloyd's k-means algorithm.
+
+    Parameters
+    ----------
+    X : numpy.ndarray
+        Two-dimensional array with one point per row.
+    k : int
+        Number of clusters.
+    seed : int, optional
+        Seed of the random number generator used to place the initial
+        centroids, by default 37.
+
+    Returns
+    -------
+    dict
+        A dictionary mapping row indices of `X` to integer cluster labels.
+        Labels lie in ``range(k)``, except for ``k == 1`` where every row is
+        labelled 1.
+    """
+    rng = np.random.default_rng(seed=seed)
+    num_points = X.shape[0]
+
+    # Handle edge cases
+    if k == 1:
+        return {node_idx: 1 for node_idx in range(num_points)}
+
+    # Instantiate random centers inside the bounding box of the data
+    bounds_inf = X.min(axis=0)
+    bounds_sup = X.max(axis=0)
+    width = bounds_sup - bounds_inf
+    centroids = bounds_inf + width * rng.random((k, X.shape[1]))
+
+    # No point is assigned yet, so the first pass always counts as a change
+    clusters = np.full(num_points, -1)
+
+    # Iterate main kmeans computation
+    for _ in range(MAX_ITERATIONS):
+        # Find nearest centroid to each point
+        distances = np.stack(
+            [np.linalg.norm(X - centroid, axis=1) for centroid in centroids], axis=1
+        )
+        next_clusters = distances.argmin(axis=1)
+
+        # Stop once the assignment is stable
+        if np.array_equal(next_clusters, clusters):
+            break
+        clusters = next_clusters
+
+        # Move each centroid to the mean of the points assigned to it
+        for cluster in range(k):
+            members = X[clusters == cluster]
+            if len(members) > 0:
+                centroids[cluster] = members.mean(axis=0)
+            else:
+                # Re-seed an empty cluster on a random data point
+                centroids[cluster] = X[rng.integers(0, num_points)]
+
+    return {node: int(cluster) for node, cluster in enumerate(clusters)}