solution_02_FS.py
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
## X_cancer, y_cancer and df_cancer are assumed to be defined earlier in the
## exercise (feature matrix, target vector and the corresponding DataFrame).
# Creating the SelectKBest object and selecting the 5 best features
skb = SelectKBest(f_classif, k=5)
skb.fit(X_cancer, y_cancer)
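## note: SelectKBest can also report its choice directly; a minimal sketch
## (assumes the columns of X_cancer match the feature columns of df_cancer):
best_idx = skb.get_support(indices=True)      # indices of the k selected features
print("SelectKBest picks:", [df_cancer.columns[i] for i in best_idx])
X_cancer_best = skb.transform(X_cancer)       # feature matrix reduced to those columns
print("reduced matrix shape:", X_cancer_best.shape)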
# get the p-value associated with each feature
dico_pval = {df_cancer.columns[i]: v for i, v in enumerate(skb.pvalues_)}
sortedPvals = sorted(dico_pval.items(), key=lambda x: x[1])
print("features F-test scores (p-values):")
for feature, pval in sortedPvals:
    if pval > 0.01:  # let's ignore everything with a p-value > 0.01
        print("\t\trest has pval>0.01")
        break
    print('\t', feature, ':', pval)
selected5 = [x for x, p in sortedPvals[:5]]
print("selected best:", selected5)
sns.pairplot(df_cancer, hue='malignant', vars=selected5)
## that is very nice, but a lot of these are highly correlated...
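## a quick check of that claim, printing the pairwise correlations
## between the 5 selected features (a minimal sketch):
print(df_cancer[selected5].corr().round(2))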
## Let's start transforming our data so we work with independent features:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_cancer_norm = sc.fit_transform(X_cancer)
pca = PCA()
x_pca = pca.fit_transform(X_cancer_norm)
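## sanity check: by construction the principal components are uncorrelated with
## each other, so their correlation matrix should be close to the identity:
pc_corr = np.corrcoef(x_pca, rowvar=False)
print("max |off-diagonal| correlation between PCs:",
      np.abs(pc_corr - np.eye(pc_corr.shape[0])).max())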
## now we can select the best features among the principal components
skb = SelectKBest(f_classif, k=5)
skb.fit(x_pca, y_cancer)
# all the components and the associated F-test p-values. use .pvalues_
dico_pval = {i: v for i, v in enumerate(skb.pvalues_)}
sortedPvals = sorted(dico_pval.items(), key=lambda x: x[1])
significantComponents = []
print("components F-test scores (p-values):")
for feature, pval in sortedPvals:
    if pval > 0.01:  # let's ignore everything with a p-value > 0.01
        print("\t\trest has pval>0.01")
        break
    print('\tPC', feature, ':', pval)
    significantComponents.append(feature)
## Wow, they actually correspond to the components with the highest explained variance ratio!
print("selected components explained variance fractions:\n\t", pca.explained_variance_ratio_[significantComponents])
print("Total :\t", sum(pca.explained_variance_ratio_[significantComponents]))
df_pca = pd.DataFrame(x_pca[:, significantComponents])
df_pca['target'] = y_cancer
sns.pairplot(df_pca, hue="target")
df_comp = pd.DataFrame(pca.components_, columns=list(df_cancer.columns)[:-1])
# pca.components_ : the matrix describing the principal components in the original feature basis.
# It gives the coefficient applied to each original feature to build each PCA component.
plt.figure(figsize=(15, 15))
sns.heatmap(df_comp, cmap='coolwarm', cbar_kws={'label': 'loading (coefficient)'}, linewidths=.05)
plt.yticks(np.arange(0.5, len(df_comp) + 0.5, 1),
           ['PCA axis ' + str(i) for i in range(len(df_comp))], rotation=0)
plt.show()
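## to read the heatmap numerically: for each selected component, list the original
## features with the largest absolute loadings (a minimal sketch using df_comp):
for pc in significantComponents:
    top_features = df_comp.iloc[pc].abs().sort_values(ascending=False).head(5)
    print("PC", pc, "- strongest loadings:")
    print(top_features, "\n")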