This repository has been archived by the owner on Aug 2, 2018. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathsmote_svm.py
115 lines (94 loc) · 4.54 KB
/
smote_svm.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
# -*- coding: utf-8 -*-
"""
smote_svm.py
Created on Tue May 2 08:29:21 2017
@author: jack
找出所有属性约简的最优参数和F-measure Score
"""
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn import neighbors
from svm_tuning import svm_tuning
def smote_svm(reduct, X, y, X_scaled, nfolds):
T = [X, y] # 训练集
# 第一次分类预测,找出错分的少数类样本
T_scaled_reduct = X_scaled.loc[:, reduct]
indices, model = svm_tuning(T_scaled_reduct, T[1], T_scaled_reduct, T[1], nfolds)
FN = [T[0].loc[indices], T[1].loc[indices]] #FN: False Positive samples
# 如果少数类样本少于等于多数类样本,且有错分样本
# 用SMOTE生成新的样本,并加入到原样本集中
while len(FN[1])+sum(T[1]==1) <= sum(T[1]==0) and len(FN[1])!=0:
# print([len(FN[1]), sum(T[1]==1), sum(T[1]==0)])
indices_listed = list(indices)
# 训练knn分类器
knn = neighbors.KNeighborsClassifier(n_neighbors = 6)
knn.fit(T[0][np.array(T[1]==1, dtype=bool)], T[1][np.array(T[1]==1, dtype=bool)])
# 找出错分样本的k临近样本
# 将错分样本和其k临近样本混合为新样本集合
# 作为SMOTE生成样本的样本池
for i in indices:
# print('indices in the fit samples: ',knn.kneighbors([FN[0].loc[i]])[1][0])
# print('lenght of the fit samples: ',len(T[1][np.array(T[1]==1, dtype=bool)]))
Neighbors = T[1][np.array(
T[1]==1, dtype=bool)].index[knn.kneighbors([FN[0].loc[i]])[1][0]]
# print('real indices of neighbors: ',Neighbors)
# print('samples: ',T[1].loc[Neighbors])
diff_set = Neighbors.difference(indices)
for n in diff_set:
indices_listed.append(n)
indices = pd.Int64Index(indices_listed)
FN = [T[0].loc[indices], T[1].loc[indices]]
# print(FN[0].index)
# 生成用于SMOTE的样本池
sample_pool_X = pd.concat([T[0].loc[T[1]==0], FN[0]])
sample_pool_y = pd.concat([T[1].loc[T[1]==0], FN[1]])
length = len(sample_pool_y) # 原样本池的长度
# print([len(FN[1]), sum(T[1]==1), sum(T[1]==0)])
ratio = (sum(T[1]==1)+len(FN[1]))/sum(T[1]==0) # SMOTE 生成样本比例
if ratio > 1:
ratio = 1.0
print('ratio: ', ratio)
# print(sum(sample_pool_y==1), sum(sample_pool_y==0))
smote = SMOTE(ratio = ratio)
sample_pool_X, sample_pool_y = smote.fit_sample(sample_pool_X, sample_pool_y)
print('everything is fine')
new_sample_X = pd.DataFrame(sample_pool_X[length:])
new_sample_y = pd.Series(sample_pool_y[length:])
# 设置新数据的index和新数据集的columns,方便后面使用
new_sample_X.index = pd.RangeIndex(
len(T[1]), len(T[1])+len(new_sample_X))
new_sample_y.index = pd.RangeIndex(
len(T[1]), len(T[1])+len(new_sample_y))
new_sample_X.columns = X.columns
# 将新样本与原样本集混合
T[0] = pd.concat([T[0], new_sample_X])
T[1] = pd.concat([T[1], new_sample_y])
# 新样本集标准化
X_new_scaled = pd.DataFrame(StandardScaler().fit_transform(T[0]))
# 根据属性约简筛选样本集
X_new_scaled_reduct = X_new_scaled.loc[:, reduct]
# 使用新样本集训练SVM
indices, model = svm_tuning(
X_new_scaled_reduct, T[1], T_scaled_reduct, y, nfolds)
FN = [T[0].loc[indices], T[1].loc[indices]] #FN: False Positive samples
# indices_listed = list(indices)
#
# # 找出错分样本的k临近样本
# knn = neighbors.KNeighborsClassifier(n_neighbors = 6)
# knn.fit(X_new[np.array(y_new==1, dtype=bool)], y_new[np.array(y_new==1, dtype=bool)])
#
# for i in indices:
# Neighbors = pd.Int64Index(knn.kneighbors([X_new.loc[i]])[1][0])
# Neighbors = Neighbors.difference(indices)
#
# for n in Neighbors:
# indices_listed.append(n)
# indices = pd.Int64Index(indices_listed)
# print(indices)
# FN_X , FN_y= X_new.loc[indices], y_new.loc[indices]
# # 生成用于SMOTE的样本池
# sample_pool_X = pd.concat([X_new.loc[y_new==0], FN_X])
# sample_pool_y = pd.concat([y_new.loc[y_new==0], FN_y])
return model