-
Notifications
You must be signed in to change notification settings - Fork 3
/
run.py
73 lines (64 loc) · 2.51 KB
/
run.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import os
import pandas as pd
import numpy as np
from joblib import Parallel, delayed
from scipy.io import arff
from sklearn.metrics import roc_auc_score
import tqdm
import setting
import dataset
import feature
from common import *
def compare_models(ds, f, filter, pBar):
result = [os.path.splitext(f)[0]]
fpath = os.path.join(*setting.ARFF_BASE_PATH, ds, f)
data = arff.loadarff(fpath)
df = pd.DataFrame(data[0])
x = df[df.columns[:-1]]
label = df[df.columns[-1:]]
y = pd.DataFrame([int(l == b'Y') for l in label.iloc[:,0]], dtype='int')
x = feature.drop_useless_feature(x, f)
x = filter(x, y)
# May contain all zero column when using KPCA and NMF
useless_feature = np.where(x.nunique().values == 1)[0].tolist()
x = x.drop(x.columns[useless_feature], axis=1)
for M in setting.MODELS:
pred = M(x)
if pred is not None:
pred = pred.T.values.tolist()[0]
invert = list(map(lambda x: 1 if x == 0 else 0, pred))
try:
result.extend([roc_auc_score(pred, y), roc_auc_score(invert, y)])
except Exception as e:
result.extend([None, None])
warning("AUC evaluation error in {} with {}".format(os.path.splitext(f)[0], M.__name__))
else:
result.extend([None, None])
pBar.update(1)
return result
def to_csv(result, ds, filter_used, prefix=None):
df = pd.DataFrame(result)
files = df.iloc[:,0].values.tolist()
df = pd.DataFrame(df.iloc[:, 1:].values, index=files)
name = '{}_{}.csv'.format(ds, filter_used)
if prefix:
name = '{}_{}'.format(prefix, name)
df.to_csv(os.path.join(*setting.REPORT_PATH, name))
def run():
dataset.generate_dataset()
for fs in feature.feature_selection_method:
for ds in setting.DATASET:
path = os.path.join(*setting.ARFF_BASE_PATH, ds)
number_thread = setting.PARALLEL_NUMBER_THREAD.get(ds, -1)
for (dirpath, dirnames, filenames) in os.walk(path):
fnames = filter_arff(filenames)
bar = tqdm.tqdm(total=len(fnames), desc="{} with {}".format(ds, fs.__name__), ascii=True)
result = Parallel(
n_jobs=number_thread,
backend=setting.PARALLEL_BACKEND)(
delayed(compare_models)(ds, f, fs, bar) for f in fnames
)
to_csv(result, ds, fs.__name__, setting.OUTPUT_PREFIX)
bar.close()
if __name__ == "__main__":
run()