From 15580cb572ab7a48b8e0681425b0003f1eab35bb Mon Sep 17 00:00:00 2001 From: Lee Pin-Tzu Date: Sat, 11 Nov 2023 14:22:33 -0800 Subject: [PATCH 1/3] 11/11: update test_core_Series, tsutils 1. add outlier test t6 (LOF), t7 (n_frac, contamination) 2. modify detect_outliers_kmeans --- pyleoclim/tests/test_core_Series.py | 33 +++++++++++++++++++++++++++++ pyleoclim/utils/tsutils.py | 20 +++++++++++++---- 2 files changed, 49 insertions(+), 4 deletions(-) diff --git a/pyleoclim/tests/test_core_Series.py b/pyleoclim/tests/test_core_Series.py index 83c3bce2..a250d2cf 100644 --- a/pyleoclim/tests/test_core_Series.py +++ b/pyleoclim/tests/test_core_Series.py @@ -787,6 +787,39 @@ def test_outliers_t5(self): # Remove outliers ts_out = ts2.outliers(method = 'DBSCAN', settings={'nbr_clusters':2}) + @pytest.mark.parametrize('LOF_param', [True,False]) + def test_outliers_t6(self,LOF_param): + #Generate data + ts = gen_ts() + #Add outliers + outliers_start = np.mean(ts.value)+5*np.std(ts.value) + outliers_end = np.mean(ts.value)+7*np.std(ts.value) + outlier_values = np.arange(outliers_start,outliers_end,0.1) + index = np.random.randint(0,len(ts.value),6) + v_out = ts.value + for i,ind in enumerate(index): + v_out[ind] = outlier_values[i] + # Get a series object + ts2 = pyleo.Series(time = ts.time, value = v_out) + # Remove outliers + ts_out = ts2.outliers(method = 'kmeans', settings={'LOF':LOF_param}) + + def test_outliers_t7(self): + #Generate data + ts = gen_ts() + #Add outliers + outliers_start = np.mean(ts.value)+5*np.std(ts.value) + outliers_end = np.mean(ts.value)+7*np.std(ts.value) + outlier_values = np.arange(outliers_start,outliers_end,0.1) + index = np.random.randint(0,len(ts.value),6) + v_out = ts.value + for i,ind in enumerate(index): + v_out[ind] = outlier_values[i] + # Get a series object + ts2 = pyleo.Series(time = ts.time, value = v_out) + # Remove outliers + ts_out = ts2.outliers(method = 'kmeans', settings={'LOF'=True, 'n_frac':0.8, 'contamination':0.1}) + class 
TestUISeriesGkernel: ''' Unit tests for the TestUISeriesGkernel function diff --git a/pyleoclim/utils/tsutils.py b/pyleoclim/utils/tsutils.py index 5d68000a..610a2808 100644 --- a/pyleoclim/utils/tsutils.py +++ b/pyleoclim/utils/tsutils.py @@ -1177,7 +1177,7 @@ def detect_outliers_DBSCAN(ys, nbr_clusters = None, eps=None, min_samples=None, return indices, res -def detect_outliers_kmeans(ys, nbr_clusters = None, max_cluster = 10, threshold=3, kmeans_kwargs=None): +def detect_outliers_kmeans(ys, nbr_clusters = None, max_cluster = 10, threshold=3, LOF=False, n_frac=0.9, contamination='auto', kmeans_kwargs=None): """ Outlier detection using the unsupervised alogrithm kmeans. The algorithm runs through various number of clusters and optimizes based on the silhouette score. @@ -1194,7 +1194,14 @@ def detect_outliers_kmeans(ys, nbr_clusters = None, max_cluster = 10, threshold= max_cluster : int, optional The maximum number of clusters to consider in the optimization based on the Silhouette Score. The default is 10. threshold : int, optional - The algorithm uses the suclidean distance for each point in the cluster to identify the outliers. This parameter sets the threshold on the euclidean distance to define an outlier. The default is 3. + The algorithm uses the euclidean distance for each point in the cluster to identify the outliers. This parameter sets the threshold on the euclidean distance to define an outlier. The default is 3. + LOF : bool, optional + By default, detect_outliers_kmeans uses euclidean distance for outlier detection. Set LOF to True to use LocalOutlierFactor for outlier detection. + n_frac : float, optional + The percentage of the time series length (the length, representing number of points) to be used to set the n_neighbors parameter for the LOF function in scikit-learn. + We recommend using at least 50% (n_frac=0.5) of the timeseries. 
You cannot use 100% (n_frac!=1) + contamination : ('auto', float), optional + Same as LOF parameter from scikit-learn. We recommend using the default mode of auto. See: https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.LocalOutlierFactor.html for details. kmeans_kwargs : dict, optional Other parameters for the kmeans function. See: https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html for details. The default is None. @@ -1231,8 +1238,13 @@ def detect_outliers_kmeans(ys, nbr_clusters = None, max_cluster = 10, threshold= kmeans.fit(ys.reshape(-1, 1), **kmeans_kwargs) silhouette_avg.append(silhouette_score(ys.reshape(-1, 1), kmeans.labels_)) center=kmeans.cluster_centers_[kmeans.labels_,0] - distance=np.sqrt((ys-center)**2) - idx_out.append(np.argwhere(distance>threshold).reshape(1,-1)[0]) + if LOF: + model = LocalOutlierFactor(n_neighbors=int(ys.size*n_frac), contamination=contamination) + pred = model.fit_predict(ys.reshape(-1,1)) + idx_out.append(np.where(pred==-1)) + else: + distance=np.sqrt((ys-center)**2) + idx_out.append(np.argwhere(distance>threshold).reshape(1,-1)[0]) clusters.append(kmeans.labels_) res = pd.DataFrame({'number of clusters':range_n_clusters, 'silhouette score':silhouette_avg,'outlier indices':idx_out,'clusters':clusters}) From c2c72910ae2373e327d89465864f5cdee1f7e690 Mon Sep 17 00:00:00 2001 From: Lee Pin-Tzu Date: Sat, 11 Nov 2023 15:32:18 -0800 Subject: [PATCH 2/3] 11/11: fix error in test_core_Series, tsutils --- pyleoclim/tests/test_core_Series.py | 2 +- pyleoclim/utils/tsutils.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/pyleoclim/tests/test_core_Series.py b/pyleoclim/tests/test_core_Series.py index a250d2cf..d32d1512 100644 --- a/pyleoclim/tests/test_core_Series.py +++ b/pyleoclim/tests/test_core_Series.py @@ -818,7 +818,7 @@ def test_outliers_t7(self): # Get a series object ts2 = pyleo.Series(time = ts.time, value = v_out) # Remove outliers - ts_out = 
ts2.outliers(method = 'kmeans', settings={'LOF'=True, 'n_frac':0.8, 'contamination':0.1}) + ts_out = ts2.outliers(method = 'kmeans', settings={'LOF':True, 'n_frac':0.8, 'contamination':0.1}) class TestUISeriesGkernel: diff --git a/pyleoclim/utils/tsutils.py b/pyleoclim/utils/tsutils.py index 610a2808..7da3661b 100644 --- a/pyleoclim/utils/tsutils.py +++ b/pyleoclim/utils/tsutils.py @@ -32,6 +32,7 @@ from sklearn.cluster import DBSCAN from sklearn.cluster import KMeans from sklearn.metrics import silhouette_score +from sklearn.neighbors import LocalOutlierFactor #import matplotlib.pyplot as plt import statsmodels.tsa.stattools as sms From 6884bc053cea0d6d93d4417d2d06d07f48cae2e6 Mon Sep 17 00:00:00 2001 From: Lee Pin-Tzu <54591654+Aragath@users.noreply.github.com> Date: Tue, 14 Nov 2023 06:05:14 +0800 Subject: [PATCH 3/3] Update tsutils.py --- pyleoclim/utils/tsutils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pyleoclim/utils/tsutils.py b/pyleoclim/utils/tsutils.py index 7da3661b..1bcfc1b0 100644 --- a/pyleoclim/utils/tsutils.py +++ b/pyleoclim/utils/tsutils.py @@ -1186,6 +1186,8 @@ def detect_outliers_kmeans(ys, nbr_clusters = None, max_cluster = 10, threshold= The Silhouette Coefficient is calculated using the mean intra-cluster distance (a) and the mean nearest-cluster distance (b) for each sample. The best value is 1 and the worst value is -1. Values near 0 indicate overlapping clusters. Negative values generally indicate that a sample has been assigned to the wrong cluster, as a different cluster is more similar. For additional details, see: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.silhouette_score.html + Outliers are identified based on their distance from the clusters. 
This can be done in two ways: (1) by using a threshold that corresponds to the Euclidean distance from the centroid and (2) by using the Local Outlier Factor (LOF) (https://scikit-learn.org/stable/auto_examples/neighbors/plot_lof_outlier_detection.html) + Parameters ---------- ys : numpy.array