-
Notifications
You must be signed in to change notification settings - Fork 35
/
feature_reduce.py
140 lines (120 loc) · 5.27 KB
/
feature_reduce.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
'''
AAA lllllll lllllll iiii
A:::A l:::::l l:::::l i::::i
A:::::A l:::::l l:::::l iiii
A:::::::A l:::::l l:::::l
A:::::::::A l::::l l::::l iiiiiii eeeeeeeeeeee
A:::::A:::::A l::::l l::::l i:::::i ee::::::::::::ee
A:::::A A:::::A l::::l l::::l i::::i e::::::eeeee:::::ee
A:::::A A:::::A l::::l l::::l i::::i e::::::e e:::::e
A:::::A A:::::A l::::l l::::l i::::i e:::::::eeeee::::::e
A:::::AAAAAAAAA:::::A l::::l l::::l i::::i e:::::::::::::::::e
A:::::::::::::::::::::A l::::l l::::l i::::i e::::::eeeeeeeeeee
A:::::AAAAAAAAAAAAA:::::A l::::l l::::l i::::i e:::::::e
A:::::A A:::::A l::::::ll::::::li::::::ie::::::::e
A:::::A A:::::A l::::::ll::::::li::::::i e::::::::eeeeeeee
A:::::A A:::::A l::::::ll::::::li::::::i ee:::::::::::::e
AAAAAAA AAAAAAAlllllllllllllllliiiiiiii eeeeeeeeeeeeee
______ _
| ___ \ (_)
| |_/ / __ ___ _ __ _ __ ___ ___ ___ ___ ___ _ _ __ __ _
| __/ '__/ _ \ '_ \| '__/ _ \ / __/ _ \/ __/ __| | '_ \ / _` |
| | | | | __/ |_) | | | (_) | (_| __/\__ \__ \ | | | | (_| |
\_| |_| \___| .__/|_| \___/ \___\___||___/___/_|_| |_|\__, |
| | __/ |
|_| |___/
___ ______ _____
/ _ \ | ___ \_ _|
/ /_\ \| |_/ / | |
| _ || __/ | |
| | | || | _| |_
\_| |_/\_| \___/
Employ dimensionality reduction strategies as part of Allie's preprocessing API.
'''
import json, os, sys
import numpy as np
from sklearn.model_selection import train_test_split
def prev_dir(directory):
    '''
    Return the parent of a '/'-delimited path by dropping its final
    segment (e.g. '/a/b/c' -> '/a/b'; a single segment yields '').

    Mirrors the original loop-based join: everything before the last
    '/'-separated component, re-joined with '/'.
    '''
    segments = directory.split('/')
    return '/'.join(segments[:-1])
def feature_reduce(dimensionality_selector, X_train, y_train, component_num):
    '''
    Build and return a dimensionality-reduction model selected by name.

    Parameters
    ----------
    dimensionality_selector : str
        One of 'autoencoder', 'cca', 'dictionary', 'ica', 'kmeans',
        'lda', 'manifold', 'neighborhood', 'pca', 'pls'.
    X_train : array-like of shape (n_samples, n_features)
        Training features (used only by the 'autoencoder' and 'lda'
        branches, which fit on the data; other branches return an
        unfitted estimator).
    y_train : array-like of shape (n_samples,)
        Training labels (used only by the 'lda' branch).
    component_num : int
        Target dimensionality / number of components or clusters.

    Returns
    -------
    model
        A scikit-learn estimator (unfitted except for 'lda'), or a
        trained Keras encoder model for 'autoencoder'.

    Raises
    ------
    ValueError
        If `dimensionality_selector` is not a recognized option.
        (The original fell through and crashed with NameError on an
        unbound `model`.)
    '''
    if dimensionality_selector == 'autoencoder':
        from keras.layers import Input, Dense
        from keras.models import Model
        # Rebuilt branch: the original referenced undefined names
        # (y_test, X_test, encoded, decoded) and never connected the
        # Dense layer to the input, so it always raised NameError.
        # An autoencoder reconstructs its own input, so X_train is both
        # input and target; labels are not needed here.
        encoding_dim = component_num  # was hard-coded to 32; now parameterized
        input_layer = Input(shape=X_train[0].shape)
        encoded = Dense(encoding_dim, activation='tanh')(input_layer)
        # Linear output layer maps the code back to the feature space.
        decoded = Dense(int(np.shape(X_train)[1]), activation='linear')(encoded)
        autoencoder = Model(input_layer, decoded)
        autoencoder.compile(optimizer='adadelta', loss='binary_crossentropy')
        autoencoder.fit(X_train, X_train,
                        epochs=50,
                        batch_size=256,
                        shuffle=True)
        # Return the encoder half: it maps inputs to the compressed
        # `component_num`-dimensional representation.
        model = Model(input_layer, encoded)
    elif dimensionality_selector == 'cca':
        from sklearn.cross_decomposition import CCA
        # Assign instead of early-return, for consistency with siblings.
        model = CCA(n_components=component_num)
    elif dimensionality_selector == 'dictionary':
        from sklearn.decomposition import MiniBatchDictionaryLearning
        model = MiniBatchDictionaryLearning(n_components=component_num,
                                            alpha=1, n_iter=500)
    elif dimensionality_selector == 'ica':
        from sklearn.decomposition import FastICA
        model = FastICA(n_components=component_num)
    elif dimensionality_selector == 'kmeans':
        from sklearn.cluster import KMeans
        model = KMeans(n_clusters=component_num, random_state=0)
    elif dimensionality_selector == 'lda':
        from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
        # Keep the fitted estimator. The original stored the result of
        # .transform(), handing callers a numpy array where every other
        # branch hands back a model.
        model = LDA(n_components=component_num)
        model.fit(X_train, y_train)
    elif dimensionality_selector == 'manifold':
        from sklearn import manifold
        # Keyword arguments: modern sklearn makes Isomap's parameters
        # keyword-only, so the old positional call breaks.
        model = manifold.Isomap(n_neighbors=10, n_components=component_num)
    elif dimensionality_selector == 'neighborhood':
        from sklearn.neighbors import NeighborhoodComponentsAnalysis
        model = NeighborhoodComponentsAnalysis(random_state=42)
    elif dimensionality_selector == 'pca':
        from sklearn.decomposition import PCA
        model = PCA(n_components=component_num)
    elif dimensionality_selector == 'pls':
        from sklearn.cross_decomposition import PLSRegression
        model = PLSRegression(n_components=component_num)
    else:
        raise ValueError('unknown dimensionality_selector: %r'
                         % (dimensionality_selector,))
    return model