Skip to content

Commit

Permalink
Merge pull request #474 from Aragath/master
Browse files Browse the repository at this point in the history
LOF implementation for outlier detection using Kmeans
  • Loading branch information
khider authored Nov 13, 2023
2 parents d1462f4 + 6884bc0 commit 512c92d
Show file tree
Hide file tree
Showing 2 changed files with 52 additions and 4 deletions.
33 changes: 33 additions & 0 deletions pyleoclim/tests/test_core_Series.py
Original file line number Diff line number Diff line change
Expand Up @@ -787,6 +787,39 @@ def test_outliers_t5(self):
# Remove outliers
ts_out = ts2.outliers(method = 'DBSCAN', settings={'nbr_clusters':2})

@pytest.mark.parametrize('LOF_param', [True,False])
def test_outliers_t6(self,LOF_param):
#Generate data
ts = gen_ts()
#Add outliers
outliers_start = np.mean(ts.value)+5*np.std(ts.value)
outliers_end = np.mean(ts.value)+7*np.std(ts.value)
outlier_values = np.arange(outliers_start,outliers_end,0.1)
index = np.random.randint(0,len(ts.value),6)
v_out = ts.value
for i,ind in enumerate(index):
v_out[ind] = outlier_values[i]
# Get a series object
ts2 = pyleo.Series(time = ts.time, value = v_out)
# Remove outliers
ts_out = ts2.outliers(method = 'kmeans', settings={'LOF':LOF_param})

def test_outliers_t7(self):
#Generate data
ts = gen_ts()
#Add outliers
outliers_start = np.mean(ts.value)+5*np.std(ts.value)
outliers_end = np.mean(ts.value)+7*np.std(ts.value)
outlier_values = np.arange(outliers_start,outliers_end,0.1)
index = np.random.randint(0,len(ts.value),6)
v_out = ts.value
for i,ind in enumerate(index):
v_out[ind] = outlier_values[i]
# Get a series object
ts2 = pyleo.Series(time = ts.time, value = v_out)
# Remove outliers
ts_out = ts2.outliers(method = 'kmeans', settings={'LOF':True, 'n_frac':0.8, 'contamination':0.1})


class TestUISeriesGkernel:
''' Unit tests for the TestUISeriesGkernel function
Expand Down
23 changes: 19 additions & 4 deletions pyleoclim/utils/tsutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.neighbors import LocalOutlierFactor
#import matplotlib.pyplot as plt

import statsmodels.tsa.stattools as sms
Expand Down Expand Up @@ -1177,14 +1178,16 @@ def detect_outliers_DBSCAN(ys, nbr_clusters = None, eps=None, min_samples=None,

return indices, res

def detect_outliers_kmeans(ys, nbr_clusters = None, max_cluster = 10, threshold=3, kmeans_kwargs=None):
def detect_outliers_kmeans(ys, nbr_clusters = None, max_cluster = 10, threshold=3, LOF=False, n_frac=0.9, contamination='auto', kmeans_kwargs=None):
"""
Outlier detection using the unsupervised alogrithm kmeans. The algorithm runs through various number of clusters and optimizes based on the silhouette score.
KMeans implementation: https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html
The Silhouette Coefficient is calculated using the mean intra-cluster distance (a) and the mean nearest-cluster distance (b) for each sample. The best value is 1 and the worst value is -1. Values near 0 indicate overlapping clusters. Negative values generally indicate that a sample has been assigned to the wrong cluster, as a different cluster is more similar. For additional details, see: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.silhouette_score.html
Outliers are identified based on their distance from the clusters. This can be done in two ways: (1) by using a threshold that corresponds to the Euclidean distance from the centroid and (2) using the Local Outlier Function (https://scikit-learn.org/stable/auto_examples/neighbors/plot_lof_outlier_detection.html)
Parameters
----------
ys : numpy.array
Expand All @@ -1194,7 +1197,14 @@ def detect_outliers_kmeans(ys, nbr_clusters = None, max_cluster = 10, threshold=
max_cluster : int, optional
The maximum number of clusters to consider in the optimization based on the Silhouette Score. The default is 10.
threshold : int, optional
The algorithm uses the suclidean distance for each point in the cluster to identify the outliers. This parameter sets the threshold on the euclidean distance to define an outlier. The default is 3.
The algorithm uses the euclidean distance for each point in the cluster to identify the outliers. This parameter sets the threshold on the euclidean distance to define an outlier. The default is 3.
LOF : bool, optional
By default, detect_outliers_kmeans uses euclidean distance for outlier detection. Set LOF to True to use LocalOutlierFactor for outlier detection.
n_frac : float, optional
The percentage of the time series length (the length, representing number of points) to be used to set the n_neighbors parameter for the LOF function in scikit-learn.
We recommend using at least 50% (n_frac=0.5) of the timeseries. You cannot use 100% (n_frac!=1)
contamination : ('auto', float), optional
Same as LOF parameter from scikit-learn. We recommend using the default mode of auto. See: https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.LocalOutlierFactor.html for details.
kmeans_kwargs : dict, optional
Other parameters for the kmeans function. See: https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html for details. The default is None.
Expand Down Expand Up @@ -1231,8 +1241,13 @@ def detect_outliers_kmeans(ys, nbr_clusters = None, max_cluster = 10, threshold=
kmeans.fit(ys.reshape(-1, 1), **kmeans_kwargs)
silhouette_avg.append(silhouette_score(ys.reshape(-1, 1), kmeans.labels_))
center=kmeans.cluster_centers_[kmeans.labels_,0]
distance=np.sqrt((ys-center)**2)
idx_out.append(np.argwhere(distance>threshold).reshape(1,-1)[0])
if LOF:
model = LocalOutlierFactor(n_neighbors=int(ys.size*n_frac), contamination=contamination)
pred = model.fit_predict(ys.reshape(-1,1))
idx_out.append(np.where(pred==-1))
else:
distance=np.sqrt((ys-center)**2)
idx_out.append(np.argwhere(distance>threshold).reshape(1,-1)[0])
clusters.append(kmeans.labels_)

res = pd.DataFrame({'number of clusters':range_n_clusters, 'silhouette score':silhouette_avg,'outlier indices':idx_out,'clusters':clusters})
Expand Down

0 comments on commit 512c92d

Please sign in to comment.