# solution_01_kmean.py
## imports needed by this solution; df_mam (the mammal dataframe used in the
## exercise) is assumed to have been loaded in an earlier cell
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn import cluster
from sklearn.metrics import silhouette_score

## 1. handle the NAs. A mean imputation should work here
## handling NAs: replace each missing value with its column mean
df_mam_I = df_mam.fillna(df_mam.mean())
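## optional sanity-check sketch (not part of the original solution): count the missing
## values per column and confirm that none remain after the imputation
print(df_mam.isna().sum())
print('remaining NAs after imputation:', df_mam_I.isna().sum().sum())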
## 2. perform a PCA. Plot the PCA-projected data as well as the weight of each column on the axes. What can you say?
## standardize the imputed data so every column contributes comparably, then fit the PCA
sc = StandardScaler()
df_mam_IS = sc.fit_transform(df_mam_I)
pca_mam = PCA()
x_pca = pca_mam.fit_transform(df_mam_IS)
plt.subplots(figsize=(15,15))
## plotting: an invisible scatter (s=0) is used only to set the axis limits,
## the species names are then written as text at their PCA coordinates
sns.scatterplot(x=x_pca[:,0], y=x_pca[:,1], s=0)
## adding the species names
for i, sp in enumerate(df_mam.index):
    plt.text(x_pca[i,0], x_pca[i,1], sp, color='blue')
var_explained = pca_mam.explained_variance_ratio_
plt.xlabel('First Principal Component ({0:.2f}%)'.format(var_explained[0]*100))
plt.ylabel('Second Principal Component ({0:.2f}%)'.format(var_explained[1]*100))
feature_vectors = pca_mam.components_.T
arrow_size = 10
# projections of the original features: one arrow per column, pointing along its
# loadings on the first two principal components (a biplot-style overlay)
for i, v in enumerate(feature_vectors):  # enumerate over the rows of feature_vectors
    plt.arrow(0, 0, arrow_size * v[0], arrow_size * v[1],
              head_width=0.00008, head_length=0.00008, width=0.00005, color='k')
    text_pos = -0.005 if v[0] < 0 else 0.0001
    plt.text(v[0]*arrow_size + text_pos,
             v[1]*arrow_size + text_pos,
             df_mam.columns[i], fontsize=10)
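## optional sketch (uses only variables defined above): a scree plot of the explained
## variance ratio helps judge how many components carry most of the signal
plt.figure(figsize=(8,4))
plt.bar(np.arange(1, len(var_explained)+1), var_explained*100)
plt.plot(np.arange(1, len(var_explained)+1), np.cumsum(var_explained)*100,
         color='k', marker='o', label='cumulative')
plt.xlabel('Principal component')
plt.ylabel('Explained variance (%)')
plt.legend()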
## 3. use t-SNE to get an embedding of the data in 2D and represent it.
## **bonus:** plot the species names in the embedded space with `plt.text`
tsne_exo_sample = TSNE(n_components=2, perplexity=20, init="pca",
                       learning_rate='auto').fit(df_mam_IS)
X_embedded_exo_sample = tsne_exo_sample.embedding_
plt.figure(figsize=(10,10))
plt.title('a point is a sample', fontsize=20)
sns.scatterplot(x=X_embedded_exo_sample[:, 0],
                y=X_embedded_exo_sample[:, 1], s=0, lw=0)
for i, sp in enumerate(df_mam.index):
    plt.text(X_embedded_exo_sample[i,0], X_embedded_exo_sample[i,1], sp, color='blue')
plt.xlabel('First Dimension')
plt.ylabel('Second Dimension')
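## optional sketch: t-SNE embeddings are sensitive to the perplexity setting, so it can
## be worth comparing a few values side by side (the values below are arbitrary examples)
fig, axes = plt.subplots(1, 3, figsize=(15,5))
for ax, perp in zip(axes, [5, 10, 20]):
    emb = TSNE(n_components=2, perplexity=perp,
               init="pca", learning_rate='auto').fit_transform(df_mam_IS)
    ax.scatter(emb[:, 0], emb[:, 1], s=10)
    ax.set_title('perplexity = {}'.format(perp))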
## 4. perform a K-means, hierarchical, or DBSCAN clustering on the PCA-projected data.
##    What is the best number of clusters according to the silhouette score?
nr_clusters = np.arange(15) + 2  # candidate numbers of clusters: 2 to 16
silhouettes = []
for n in nr_clusters:
    kmeans = cluster.KMeans(n_clusters=n)
    kmeans.fit(x_pca)
    silhouettes.append(silhouette_score(x_pca, kmeans.labels_))
## getting the K with maximum silhouette
bestI = np.argmax( silhouettes )
bestK = nr_clusters[bestI]
print('best K :',bestK)
plt.figure(figsize=(8,5))
plt.plot(nr_clusters, silhouettes, ls="-", lw=2)
plt.xlabel('Number of clusters', fontsize=20)
plt.ylabel('Silhouette score', fontsize=20)
plt.title('k-means clustering', fontsize=20)
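## optional sketch: the question also mentions hierarchical clustering, so the same
## silhouette scan can be run with agglomerative clustering as a point of comparison
silhouettes_hier = []
for n in nr_clusters:
    hier = cluster.AgglomerativeClustering(n_clusters=n)
    labels_hier = hier.fit_predict(x_pca)
    silhouettes_hier.append(silhouette_score(x_pca, labels_hier))
print('best K (hierarchical):', nr_clusters[np.argmax(silhouettes_hier)])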
## 5. plot the t-SNE projected data colored according to the cluster they belong to.
kmeans_mam = cluster.KMeans(n_clusters=bestK)
kmeans_mam.fit(x_pca)
cluster_labels_mam = kmeans_mam.labels_
# reporting the number of observations in each cluster
print(Counter(cluster_labels_mam))
## a handful of colors I like; note this assumes bestK does not exceed the length of this list
cluster_2_colors = ['xkcd:teal', 'xkcd:lavender', 'xkcd:mustard', 'xkcd:sage']
plt.figure(figsize=(10,10))
plt.title('a point is a sample', fontsize=20)
sns.scatterplot(x=X_embedded_exo_sample[:, 0],
                y=X_embedded_exo_sample[:, 1], s=0, lw=0)
## species names written at their t-SNE coordinates, colored by their K-means cluster
for i, sp in enumerate(df_mam.index):
    plt.text(X_embedded_exo_sample[i,0], X_embedded_exo_sample[i,1], sp,
             color=cluster_2_colors[cluster_labels_mam[i]])
plt.xlabel('First Dimension')
plt.ylabel('Second Dimension')
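## optional sketch: listing which species fall into each cluster makes the result
## easier to interpret than the plot alone (the index holds the species names,
## as already used for the text labels above)
for c in range(bestK):
    members = df_mam.index[cluster_labels_mam == c]
    print('cluster', c, '({} species):'.format(len(members)), list(members))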