-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtrain_multiclass.py
119 lines (96 loc) · 3.78 KB
/
train_multiclass.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
import sys
import os
from tqdm import tqdm
import numpy as np
import pandas as pd
import argparse
from src import predict_multiclass, preprocessing
from collections import Counter
from sklearn.model_selection import train_test_split
from imblearn import over_sampling
from imblearn import combine
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
# set save path
def set_path(basename):
    """Build the output CSV path for *basename*.

    Ensures ./output exists, then returns a tuple of
    (csv_path, stem) where stem is the filename without extension.
    """
    os.makedirs('./output', exist_ok=True)
    stem, _ext = os.path.splitext(basename)
    return 'output/{}.csv'.format(stem), stem
# train data + mndo data
def append_mndo(X_train, y_train, df):
    """Stack synthesized (MNDO) samples from *df* on top of the real
    training data.

    *df* must contain a 'Label' column; the generated rows come first
    in the returned arrays, followed by the original training rows.
    """
    generated_X = df.drop('Label', axis=1)
    generated_y = df.Label
    X_combined = np.concatenate((generated_X, X_train), axis=0)
    y_combined = np.concatenate((generated_y, y_train), axis=0)
    return X_combined, y_combined
if __name__ == '__main__':
    # ---- Load dataset ----
    # argparse itself exits with a usage error when arguments are
    # missing, so only file-access failures need handling here (the
    # old IndexError branch was unreachable).
    parser = argparse.ArgumentParser()
    parser.add_argument('data', help='dataset')
    parser.add_argument('generated', help='generated data')
    args = parser.parse_args()
    try:
        data = pd.read_csv(args.data)
        mndo_generated = pd.read_csv(args.generated)
        save_path, file_name = set_path(os.path.basename(args.data))
    except FileNotFoundError:
        sys.exit('error: No such file or directory')

    # One-hot encode the real data, then align the generated data's
    # columns to it.  DataFrame.ix was removed in pandas 1.0; .loc is
    # the supported replacement for label-based column selection.
    data = pd.get_dummies(data)
    mndo_generated = mndo_generated.loc[:, data.columns]

    # ---- Split features/labels ----
    X = np.array(data.drop('Label', axis=1))
    y = np.array(data.Label)

    RANDOM_STATE = 42
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.4, shuffle=True, random_state=RANDOM_STATE)
    print('y_train: {}'.format(Counter(y_train)))
    print('y_test: {}'.format(Counter(y_test)))

    # -----------------
    # Preprocessing
    # -----------------
    # Multivariate over-sampling: prepend the MNDO-generated samples
    # to the training split, then scale train/test consistently.
    X_mndo, y_mndo = append_mndo(X_train, y_train, mndo_generated)
    print('y_mndo: {}'.format(Counter(y_mndo)))
    os_list = [[X_mndo, y_mndo]]
    os_list, X_test_scaled = preprocessing.normalization(os_list, X_test)
    #os_list, X_test_scaled = preprocessing.standardization(os_list, X_test)

    # -------------
    # Learning
    # -------------
    # Train each classifier on every over-sampled variant and collect
    # its metrics.  The factory order (svm, tree, k-NN) preserves the
    # row order of the original result file.  The SVC keeps
    # probability=True to match the original configuration exactly.
    # NOTE(review): the old code also computed predict_proba(...)[:, 1]
    # for every model but never used it (and [:, 1] is only the class-1
    # column, which is not meaningful for multiclass) — removed.
    k = 5
    classifier_factories = [
        lambda: svm.SVC(kernel='rbf', gamma='auto',
                        random_state=RANDOM_STATE, probability=True),
        lambda: DecisionTreeClassifier(random_state=RANDOM_STATE),
        lambda: KNeighborsClassifier(n_neighbors=k),
    ]
    pred_tmp = []
    for make_clf in classifier_factories:
        fitted = [make_clf().fit(X_os, y_os) for X_os, y_os in os_list]
        for i, clf in enumerate(fitted):
            pred_tmp.append(predict_multiclass.calc_metrics(
                y_test, clf.predict(X_test_scaled[i]), i))

    pred_df = pd.DataFrame(pred_tmp)
    pred_df.columns = ['os', 'Sensitivity', 'Specificity',
                       'Geometric mean', 'F-1', 'MCC']
    # export result
    pred_df.to_csv(save_path, index=False)