########################################################################################################################
#
# train_cluster.py
# A Python script to perform PCA transformation and clustering analysis using KMeans
#
#
# Author: Javier Lopatin
# Email: [email protected]
# Date: 07/07/2022
# Version: 1.0
#
# Usage:
#
# python train_cluster.py -i <input CSV data> [-d]
#
# where -d is an optional flag that drops the first column of the input data
#
########################################################################################################################
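# Example invocation (input file name taken from the example in __main__ below;
# -d drops the first column):
#
#   python train_cluster.py -i points_phenoshape.csv -d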
import argparse
import pickle

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from yellowbrick.cluster import KElbowVisualizer

def test_PCA(inData):
    # preprocessing: standardize variables (NaNs are replaced with zeros)
    ss = StandardScaler()
    if isinstance(inData, np.ndarray):
        X_std = ss.fit_transform(np.nan_to_num(inData))
    else:
        X_std = ss.fit_transform(np.nan_to_num(inData.values))
    # full PCA to inspect the cumulative explained variance per component
    pca = PCA()
    pca.fit(X_std)
    variance = pca.explained_variance_ratio_
    print('Cumulative PCA variance: ')
    comp = pd.concat([pd.DataFrame(np.arange(1, inData.shape[1] + 1)),
                      pd.DataFrame(np.cumsum(variance))], axis=1)
    comp.columns = ['n_comp', 'cumsum']
    print(comp)
    # Stop!!! ask for the best number of components. Enter the number in the terminal
    n_components = int(
        input("Please enter your selected number of components: "))
    # refit PCA with the selected number of components
    pca = PCA(n_components=n_components).fit(X_std)
    X_pca = pca.transform(X_std)
    # return the fitted scaler (already fit above) so it can go into the pipeline
    return ss, pca, X_pca

def test_cluster(X_pca):
    # elbow analysis over k = 2..14 using the silhouette score:
    # the best value is 1 and the worst is -1; values near 0 indicate overlapping clusters
    print('Processing clusters...')
    model = KMeans(random_state=42)
    elb_visualizer = KElbowVisualizer(model, k=(2, 15), metric='silhouette')
    elb_visualizer.fit(X_pca)
    elb_visualizer.show()
    # Stop!!! ask for the best n_clusters to be used. Enter the number in the terminal
    n_clusters = int(input("Please enter your selected number of clusters: "))
    return KMeans(n_clusters=n_clusters, random_state=42).fit(X_pca)
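
# A minimal alternative sketch, assuming yellowbrick is unavailable: the same
# silhouette curve can be computed directly with scikit-learn, e.g.
#
#   from sklearn.metrics import silhouette_score
#   for k in range(2, 15):
#       labels = KMeans(n_clusters=k, random_state=42).fit_predict(X_pca)
#       print(k, silhouette_score(X_pca, labels))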

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--inData', help='Input Data', type=str)
    parser.add_argument(
        '-d', '--drop', help='drop first column', action='store_true')
    parser.add_argument('--version', action='version', version='%(prog)s 1.0')
    args = vars(parser.parse_args())

    # e.g. inData = pd.read_csv('points_phenoshape.csv')
    inData = pd.read_csv(args['inData']).astype('float32')
    inData = inData.dropna()
    # drop undesirable columns
    if args['drop']:
        inData.drop(inData.columns[[0]], axis=1, inplace=True)

    # test PCA
    std, pca, X_pca = test_PCA(inData)
    # test clusters
    cluster = test_cluster(X_pca)

    # chain the fitted models into a single pipeline
    pipe = Pipeline([('std', std), ('pca', pca), ('cluster', cluster)])
    # save the pipeline to disk
    filename = 'ClusterPipeline.sav'
    with open(filename, 'wb') as f:
        pickle.dump(pipe, f)
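
# A minimal sketch (not part of the original workflow) of reusing the saved
# pipeline elsewhere to assign clusters to new data with the same columns;
# 'new_points.csv' is a hypothetical file name:
#
#   with open('ClusterPipeline.sav', 'rb') as f:
#       pipe = pickle.load(f)
#   new_data = pd.read_csv('new_points.csv').astype('float32')
#   labels = pipe.predict(np.nan_to_num(new_data.values))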