-
Notifications
You must be signed in to change notification settings - Fork 1
/
spectral_cluster.py
145 lines (125 loc) · 3.34 KB
/
spectral_cluster.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
import numpy as np
from sklearn.cluster import KMeans
import math
import matplotlib.pyplot as plt
def load_data(filename):
    """
    Load comma-separated samples of the form ``x1,x2,label`` from a text file.

    Blank lines are skipped. Only the first character of the (stripped)
    label field is parsed, so labels are expected to be single digits.

    :param filename: path of the text file to read
    :return: tuple ``(data, l_x, l_y)`` of numpy arrays, where ``data`` holds
             rows ``[x1, x2, label]``, ``l_x`` holds rows ``[1, x1, x2]``
             and ``l_y`` holds rows ``[label]``
    """
    data, l_x, l_y = [], [], []
    # The context manager closes the file even when a line fails to parse.
    with open(filename, 'r') as infile:
        for line in infile:
            if not line.strip():
                # Tolerate empty lines (e.g. a trailing newline at end of file).
                continue
            words = line.split(',')  # fields are comma-separated
            x1 = float(words[0])
            x2 = float(words[1])
            # Strip first so surrounding whitespace does not become the "label".
            y1 = int(words[2].strip()[0:1])
            l_x.append([1, x1, x2])
            l_y.append([y1])
            data.append([x1, x2, y1])
    l_x = np.array(l_x)
    l_y = np.array(l_y)
    data = np.array(data)
    return data, l_x, l_y
def distance(x1, x2):
    """
    Return the Euclidean distance between two sample points.

    Both inputs are converted to float arrays first, so plain Python
    sequences are accepted and unsigned-integer arrays cannot wrap around
    during the subtraction.

    :param x1: first sample point (array-like of numbers)
    :param x2: second sample point (array-like of numbers, same shape as x1)
    :return: the Euclidean distance as a numpy float
    """
    diff = np.asarray(x1, dtype=float) - np.asarray(x2, dtype=float)
    return np.sqrt(np.sum(diff * diff))
def get_dist_matrix(data):
    """
    Build the symmetric matrix of pairwise Euclidean distances.

    The computation is vectorised with broadcasting instead of a Python
    double loop; it allocates an intermediate array of shape
    ``(n, n, n_features)``.

    :param data: sample collection, array-like with one sample per row
    :return: ``n x n`` float array whose entry ``[i][j]`` is the distance
             between sample ``i`` and sample ``j`` (zeros on the diagonal)
    """
    points = np.asarray(data, dtype=float)
    n = len(points)  # number of samples
    if n == 0:
        return np.zeros((0, 0))
    if points.ndim != 2:
        # Treat scalar samples as 1-feature rows and flatten higher ranks.
        points = points.reshape(n, -1)
    # diff[i, j] is the coordinate-wise difference between samples i and j.
    diff = points[:, np.newaxis, :] - points[np.newaxis, :, :]
    return np.sqrt(np.sum(diff * diff, axis=-1))
def getW(data, k):
    """
    Build the symmetric k-nearest-neighbour adjacency matrix W.

    Each sample is linked to its ``k`` nearest other samples; the directed
    graph is then symmetrised by averaging with its transpose, so an edge
    present in only one direction gets weight 0.5 and a mutual edge gets 1.

    :param data: sample collection, one sample per row
    :param k: number of nearest neighbours per sample (capped at ``n - 1``)
    :return: ``n x n`` adjacency matrix W
    :raises ValueError: if ``k`` is negative
    """
    if k < 0:
        raise ValueError("k must be non-negative")
    n = len(data)
    W = np.zeros((n, n))
    if n == 0:
        return W
    dist_matrix = get_dist_matrix(data)
    # Mask the diagonal so a sample is never chosen as its own neighbour,
    # even when duplicate points put other zero distances in the same row.
    np.fill_diagonal(dist_matrix, np.inf)
    n_neighbors = min(k, n - 1)
    for idx, row in enumerate(dist_matrix):
        # A stable sort makes tie-breaking deterministic (lowest index wins).
        nearest = np.argsort(row, kind='stable')[:n_neighbors]
        W[idx, nearest] = 1
    return (W + W.T) / 2
def getD(W):
    """
    Build the degree matrix of an adjacency matrix.

    The degree of node ``j`` is the sum of column ``j`` of W (equal to the
    row sum when W is symmetric).

    :param W: adjacency matrix (square, array-like)
    :return: diagonal matrix D with the node degrees on its diagonal
    """
    # np.sum along axis 0 replaces the builtin sum(), which loops in Python
    # and returns the int 0 (not an array) for an empty matrix.
    degrees = np.sum(np.asarray(W), axis=0)
    return np.diag(degrees)
def getL(D, W):
    """
    Compute the unnormalised graph Laplacian.

    :param D: degree matrix
    :param W: adjacency matrix
    :return: the Laplacian ``L = D - W``
    """
    laplacian = D - W
    return laplacian
def getEigen(L, cluster_num):
    """
    Return the eigenvectors belonging to the smallest eigenvalues of L.

    For a real symmetric matrix (the case for a Laplacian built from a
    symmetric adjacency matrix) ``np.linalg.eigh`` is used, which returns
    real eigenpairs. The general solver ``np.linalg.eig`` can return complex
    values with tiny imaginary parts for such input, so it is kept only as a
    fallback for non-symmetric matrices.

    :param L: Laplacian matrix (square)
    :param cluster_num: number of clusters, i.e. number of eigenvectors kept
    :return: array of shape ``(n, cluster_num)`` whose columns are the
             eigenvectors of the ``cluster_num`` smallest eigenvalues
    """
    L = np.asarray(L)
    is_real_symmetric = (
        L.ndim == 2
        and L.shape[0] == L.shape[1]
        and np.isrealobj(L)
        and np.allclose(L, L.T)
    )
    if is_real_symmetric:
        eigval, eigvec = np.linalg.eigh(L)
    else:
        eigval, eigvec = np.linalg.eig(L)
    # Indices of the cluster_num smallest eigenvalues.
    ix = np.argsort(eigval)[0:cluster_num]
    return eigvec[:, ix]
def plotRes(data, clusterResult, clusterNum):
    """
    Visualise the clustering result as a scatter plot.

    Each cluster is drawn in its own colour (colours repeat when there are
    more clusters than palette entries), using the first two columns of
    ``data`` as the x and y coordinates.

    :param data: sample set, indexed as ``data[row, column]``
    :param clusterResult: cluster label of each sample
    :param clusterNum: number of clusters
    :return: None
    """
    palette = ['black', 'blue', 'green', 'yellow', 'red', 'purple', 'orange']
    sample_count = len(data)
    for cluster_id in range(clusterNum):
        # Rows assigned to the current cluster.
        members = [row for row in range(sample_count)
                   if clusterResult[row] == cluster_id]
        xs = [data[row, 0] for row in members]
        ys = [data[row, 1] for row in members]
        plt.scatter(xs, ys, c=palette[cluster_id % len(palette)], marker='+')
    plt.show()
def cluster(data, cluster_num, k):
    """
    Run spectral clustering on the samples.

    Builds the KNN adjacency matrix, the degree matrix and the Laplacian,
    takes the eigenvectors returned by ``getEigen`` as the embedding, and
    clusters the embedded rows with KMeans.

    :param data: sample set, one sample per row
    :param cluster_num: number of clusters
    :param k: KNN parameter used when building the adjacency matrix
    :return: cluster labels as produced by the fitted KMeans model
    """
    samples = np.array(data)
    adjacency = getW(samples, k)
    degree = getD(adjacency)
    laplacian = getL(degree, adjacency)
    embedding = getEigen(laplacian, cluster_num)
    # Cluster the rows of the spectral embedding.
    model = KMeans(n_clusters=cluster_num).fit(embedding)
    return model.labels_
if __name__ == '__main__':
    # Script entry point: load the samples, cluster them and plot the result.
    cluster_num = 2
    knn_k = 5
    filename = 'ex2data2.txt'
    data, x, y = load_data(filename=filename)
    # The last column is the label column; drop it so that only the feature
    # columns are clustered. (Slicing rows with data[0:-1] would instead drop
    # the last sample and keep the label as a feature.)
    data = data[:, 0:-1]
    label = cluster(data, cluster_num, knn_k)
    plotRes(data, label, cluster_num)