diff --git a/docs/.pages b/docs/.pages index 9f7b7e5360..c72bb6ee96 100644 --- a/docs/.pages +++ b/docs/.pages @@ -6,3 +6,4 @@ nav: - faq - releases - benchmarks + - license diff --git a/docs/license/.pages b/docs/license/.pages new file mode 100644 index 0000000000..319cebe79a --- /dev/null +++ b/docs/license/.pages @@ -0,0 +1,2 @@ +title: License 📝 + diff --git a/docs/license/license.md b/docs/license/license.md new file mode 100644 index 0000000000..87980bb6fe --- /dev/null +++ b/docs/license/license.md @@ -0,0 +1,3 @@ +# License + +River is free and open-source software licensed under the [3-clause BSD license](https://github.com/online-ml/river/blob/main/LICENSE). \ No newline at end of file diff --git a/river/cluster/dbstream.py b/river/cluster/dbstream.py index e098f0de2e..0b4408271f 100644 --- a/river/cluster/dbstream.py +++ b/river/cluster/dbstream.py @@ -227,10 +227,10 @@ def _update(self, x): self.s_t[i][j] = self._time_stamp except KeyError: try: - self.s[i][j] = 0 + self.s[i][j] = 1 self.s_t[i][j] = self._time_stamp except KeyError: - self.s[i] = {j: 0} + self.s[i] = {j: 1} self.s_t[i] = {j: self._time_stamp} # prevent collapsing clusters @@ -266,6 +266,15 @@ def _cleanup(self): if micro_cluster_i.weight * value < weight_weak: micro_clusters.pop(i) + self.s.pop(i, None) + self.s_t.pop(i, None) + # Since self.s and self.s_t always have the same keys and are arranged in ascending orders + for j in self.s: + if j < i: + self.s[j].pop(i, None) + self.s_t[j].pop(i, None) + else: + break # Update microclusters self._micro_clusters = micro_clusters diff --git a/river/cluster/denstream.py b/river/cluster/denstream.py index 4d060795de..5100b30e2b 100644 --- a/river/cluster/denstream.py +++ b/river/cluster/denstream.py @@ -120,16 +120,16 @@ class DenStream(base.Clusterer): ... denstream.learn_one(x) >>> denstream.predict_one({0: -1, 1: -2}) - 0 + 1 >>> denstream.predict_one({0: 5, 1: 4}) - 1 + 2 >>> denstream.predict_one({0: 1, 1: 1}) 0 >>> denstream.n_clusters - 2 + 3 """ @@ -183,7 +183,7 @@ def centers(self): @staticmethod def _distance(point_a, point_b): - return math.sqrt(utils.math.minkowski_distance(point_a, point_b, 2)) + return utils.math.minkowski_distance(point_a, point_b, 2) def _get_closest_cluster_key(self, point, clusters): min_distance = math.inf diff --git a/river/cluster/test_dbstream.py b/river/cluster/test_dbstream.py index 255c71e7a3..6b5e3776f4 100644 --- a/river/cluster/test_dbstream.py +++ b/river/cluster/test_dbstream.py @@ -1,11 +1,13 @@ from __future__ import annotations import pytest +from sklearn.datasets import make_blobs +from river import metrics, stream, utils from river.cluster import DBSTREAM -def build_dbstream(fading_factor=0.001, intersection_factor=0.05): +def build_dbstream(fading_factor=0.01, intersection_factor=0.05): return DBSTREAM( fading_factor=fading_factor, clustering_threshold=1, @@ -31,27 +33,44 @@ def test_cluster_formation_and_cleanup(): X = [ {1: 1}, + {1: 2}, {1: 3}, {1: 3}, {1: 3}, {1: 5}, {1: 7}, {1: 9}, + {1: 10}, {1: 11}, {1: 11}, + {1: 12}, {1: 13}, {1: 11}, {1: 15}, + {1: 15}, + {1: 16}, + {1: 17}, + {1: 17}, {1: 17}, ] for x in X: dbstream.learn_one(x) - assert len(dbstream._micro_clusters) == 3 - assert_micro_cluster_properties(dbstream.micro_clusters[1], center={1: 3}, last_update=3) - assert_micro_cluster_properties(dbstream.micro_clusters[5], center={1: 11}, last_update=10) - assert_micro_cluster_properties(dbstream.micro_clusters[7], center={1: 17}, last_update=12) + assert len(dbstream._micro_clusters) == 4 + assert_micro_cluster_properties(dbstream.micro_clusters[2], center={1: 3}, last_update=4) + assert_micro_cluster_properties(dbstream.micro_clusters[7], center={1: 11}, last_update=13) + assert_micro_cluster_properties(dbstream.micro_clusters[8], center={1: 15}, last_update=15) + assert_micro_cluster_properties(dbstream.micro_clusters[10], center={1: 17}, last_update=19) + + assert dbstream.predict_one({1: 2.0}) == 0 + assert dbstream.predict_one({1: 13.0}) == 1 + assert dbstream.predict_one({1: 13 + 1e-10}) == 2 + assert dbstream.predict_one({1: 16 - 1e-10}) == 2 + assert dbstream.predict_one({1: 18}) == 3 + + assert len(dbstream._clusters) == 4 + assert dbstream.s == dbstream.s_t == {} def test_with_two_micro_clusters(): @@ -59,24 +78,21 @@ def test_with_two_micro_clusters(): add_cluster(dbstream, initial_point={1: 1, 2: 1}, move_towards={1: 1.7, 2: 1.7}, times=25) add_cluster(dbstream, initial_point={1: 3, 2: 3}, move_towards={1: 2.3, 2: 2.3}, times=25) - # Points in the middle of first and second micro-clusters - for _ in range(5): - dbstream.learn_one({1: 2, 2: 2}) - assert len(dbstream._micro_clusters) == 2 + assert len(dbstream.micro_clusters) == 2 assert_micro_cluster_properties( - dbstream.micro_clusters[0], center={1: 1.597322, 2: 1.597322}, last_update=56 + dbstream.micro_clusters[0], center={1: 2.137623, 2: 2.137623}, last_update=51 ) assert_micro_cluster_properties( - dbstream.micro_clusters[1], center={1: 2.402677, 2: 2.402677}, last_update=56 + dbstream.micro_clusters[1], center={1: 2.914910, 2: 2.914910}, last_update=51 ) - assert dbstream.s == {0: {1: 3.995844478090532}} - assert dbstream.s_t == {0: {1: 56}} + assert dbstream.s == {0: {1: 23.033438964246173}} + assert dbstream.s_t == {0: {1: 51}} dbstream._recluster() assert len(dbstream.clusters) == 1 - assert_micro_cluster_properties(dbstream.clusters[0], center={1: 2.003033, 2: 2.003033}) + assert_micro_cluster_properties(dbstream.clusters[0], center={1: 2.415239, 2: 2.415239}) def test_density_graph_with_three_micro_clusters(): @@ -88,34 +104,38 @@ def test_density_graph_with_three_micro_clusters(): for _ in range(5): dbstream.learn_one({1: 2, 2: 2}) + assert dbstream.s == {0: {1: 23.033438964246173}} + assert dbstream.s_t == {0: {1: 51}} + add_cluster(dbstream, initial_point={1: 4, 2: 4}, move_towards={1: 3.3, 2: 3.3}, times=25) # Points in the middle of second and third micro-clusters for _ in range(4): dbstream.learn_one({1: 3, 2: 3}) assert len(dbstream._micro_clusters) == 3 - assert_micro_cluster_properties( - dbstream.micro_clusters[0], center={1: 1.597322, 2: 1.597322}, last_update=56 + dbstream.micro_clusters[0], center={1: 2.0, 2: 2.0}, last_update=56 ) assert_micro_cluster_properties( - dbstream.micro_clusters[1], center={1: 2.461654, 2: 2.461654}, last_update=86 + dbstream.micro_clusters[1], center={1: 3.0, 2: 3.0}, last_update=86 ) assert_micro_cluster_properties( - dbstream.micro_clusters[2], center={1: 3.430485, 2: 3.430485}, last_update=86 + dbstream.micro_clusters[2], center={1: 3.982141, 2: 3.982141}, last_update=82 ) - assert dbstream.s[0] == pytest.approx({1: 3.995844}) - assert dbstream.s[1] == pytest.approx({2: 2.997921}) - assert dbstream.s_t == {0: {1: 56}, 1: {2: 86}} + assert dbstream.s[0] == pytest.approx({1: 23.033439}) + assert dbstream.s[1] == pytest.approx({2: 23.033439}) + assert dbstream.s_t == {0: {1: 51}, 1: {2: 82}} dbstream._recluster() assert len(dbstream.clusters) == 1 - assert_micro_cluster_properties(dbstream.clusters[0], center={1: 2.489894, 2: 2.489894}) + print(dbstream.clusters[0].center) + assert_micro_cluster_properties(dbstream.clusters[0], center={1: 2.800788, 2: 2.800788}) def test_density_graph_with_removed_microcluster(): - dbstream = build_dbstream(fading_factor=0.1, intersection_factor=0.3) + dbstream = build_dbstream(fading_factor=0.1, + intersection_factor=0.3) add_cluster(dbstream, initial_point={1: 1, 2: 1}, move_towards={1: 1.7, 2: 1.7}, times=25) add_cluster(dbstream, initial_point={1: 3, 2: 3}, move_towards={1: 2.3, 2: 2.3}, times=25) @@ -123,23 +143,70 @@ def test_density_graph_with_removed_microcluster(): for _ in range(5): dbstream.learn_one({1: 2, 2: 2}) - add_cluster(dbstream, initial_point={1: 4, 2: 4}, move_towards={1: 3.3, 2: 3.3}, times=25) + add_cluster(dbstream, initial_point={1: 3.5, 2: 3.5}, move_towards={1: 2.9, 2: 2.9}, times=25) + # Points in the middle of second and third micro-clusters for _ in range(4): - dbstream.learn_one({1: 3, 2: 3}) + dbstream.learn_one({1: 2.6, 2: 2.6}) assert len(dbstream._micro_clusters) == 2 assert_micro_cluster_properties( - dbstream.micro_clusters[1], center={1: 2.461654, 2: 2.461654}, last_update=86 + dbstream.micro_clusters[0], center={1: 2.023498, 2: 2.023498}, last_update=86 ) assert_micro_cluster_properties( - dbstream.micro_clusters[2], center={1: 3.430485, 2: 3.430485}, last_update=86 + dbstream.micro_clusters[1], center={1: 2.766543, 2: 2.766543}, last_update=86 ) - assert dbstream.s[0] == pytest.approx({1: 3.615835}) - assert dbstream.s[1] == pytest.approx({2: 2.803583}) - assert dbstream.s_t == {0: {1: 56}, 1: {2: 86}} + assert dbstream.s == {0: {1: 4.702391097045977}} + assert dbstream.s_t == {0: {1: 86}} dbstream._recluster() assert len(dbstream.clusters) == 1 - assert_micro_cluster_properties(dbstream.clusters[0], center={1: 3.152231, 2: 3.152231}) + assert_micro_cluster_properties( + dbstream.clusters[0], center={1: 2.560647, 2: 2.560647} + ) + + +def test_dbstream_synthetic_sklearn(): + centers = [(-10, -10), (-5, -5), (0, 0), (5, 5), (10, 10)] + cluster_std = [0.6] * 5 + + # Create a dataset with 15000 data points with 5 centers and cluster SD of 0.6 each + X, y = make_blobs(n_samples=15_000, + cluster_std=cluster_std, + centers=centers, + n_features=2, + random_state=42) + + dbstream = DBSTREAM( + clustering_threshold=2, + fading_factor=0.05, + intersection_factor=0.1, + cleanup_interval=1.0, + minimum_weight=1.0, + ) + + # Use VBeta as the metric to investigate the performance of DBSTREAM + v_beta = metrics.VBeta(beta=1.0) + + for x, y_true in stream.iter_array(X, y): + dbstream.learn_one(x) + y_pred = dbstream.predict_one(x) + v_beta.update(y_true, y_pred) + + assert len(dbstream._micro_clusters) == 12 + assert round(v_beta.get(), 4) == 0.9816 + + assert dbstream.s.keys() == dbstream.s_t.keys() + + dbstream._recluster() + + # Check that the resulted cluster centers are close to the expected centers + dbstream_expected_centers = {0: {0: 10, 1: 10}, + 1: {0: -5, 1: -5}, + 2: {0: 0, 1: 0}, + 3: {0: 5, 1: 5}, + 4: {0: -10, 1: -10}} + + for i in dbstream.centers.keys(): + assert utils.math.minkowski_distance(dbstream.centers[i], dbstream_expected_centers[i], 2) < 0.2 diff --git a/river/metrics/silhouette.py b/river/metrics/silhouette.py index d3c6d0a27d..3c0bb3d05c 100644 --- a/river/metrics/silhouette.py +++ b/river/metrics/silhouette.py @@ -47,7 +47,7 @@ class Silhouette(metrics.base.ClusteringMetric): ... metric.update(x, y_pred, k_means.centers) >>> metric - Silhouette: 0.568058 + Silhouette: 0.32145 References ---------- @@ -65,18 +65,18 @@ def __init__(self): @staticmethod def _find_distance_second_closest_center(centers, x): - distances = {i: math.sqrt(utils.math.minkowski_distance(centers[i], x, 2)) for i in centers} + distances = {i: utils.math.minkowski_distance(centers[i], x, 2) for i in centers} return sorted(distances.values())[-2] def update(self, x, y_pred, centers, w=1.0): - distance_closest_centroid = math.sqrt(utils.math.minkowski_distance(centers[y_pred], x, 2)) + distance_closest_centroid = utils.math.minkowski_distance(centers[y_pred], x, 2) self._sum_distance_closest_centroid += distance_closest_centroid distance_second_closest_centroid = self._find_distance_second_closest_center(centers, x) self._sum_distance_second_closest_centroid += distance_second_closest_centroid def revert(self, x, y_pred, centers, w=1.0): - distance_closest_centroid = math.sqrt(utils.math.minkowski_distance(centers[y_pred], x, 2)) + distance_closest_centroid = utils.math.minkowski_distance(centers[y_pred], x, 2) self._sum_distance_closest_centroid -= distance_closest_centroid distance_second_closest_centroid = self._find_distance_second_closest_center(centers, x)