-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathRandomForestMain.py
98 lines (84 loc) · 3.21 KB
/
RandomForestMain.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
#! /usr/bin/env python3
import numpy as np
import pandas as pd
import random
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
def get_error_rate(pred, Y):
    """Return the fraction of predictions that differ from the true labels.

    Args:
        pred: Predicted labels, one per sample (any array-like).
        Y: True labels, same shape as ``pred`` (any array-like).

    Returns:
        The misclassification rate, a float in ``[0, 1]``.

    Raises:
        ValueError: If ``pred`` and ``Y`` do not have the same shape.
        ZeroDivisionError: If ``pred`` is empty.
    """
    # Coerce so plain lists compare element-wise instead of as whole objects.
    pred = np.asarray(pred)
    Y = np.asarray(Y)
    # Refuse mismatched shapes: NumPy would otherwise broadcast them and
    # silently produce a meaningless "error rate".
    if pred.shape != Y.shape:
        raise ValueError(
            'pred and Y must have the same shape, got %s and %s'
            % (pred.shape, Y.shape))
    return np.count_nonzero(pred != Y) / len(pred)
class RandomForest:
    """Bagged ensemble of decision trees that predicts by majority vote.

    Each tree is a ``DecisionTreeClassifier(max_features='log2')`` trained on
    its own bootstrap sample of the training data.
    """

    def __init__(self, ntrees=70):
        """Create an unfitted forest.

        Args:
            ntrees: Number of trees that ``fit`` will train.
        """
        # Number of trees to train in fit().
        self.ntrees = ntrees
        # Fitted trees; rebuilt from scratch on every fit().
        self.trees = []
        # Distinct class labels seen by the most recent fit().
        self.labels = set()

    def subsample(self, X, Y):
        """Draw a bootstrap sample (with replacement) the same size as ``X``.

        Args:
            X: Indexable collection of feature rows.
            Y: Indexable collection of labels aligned with ``X``.

        Returns:
            A ``(X_sample, Y_sample)`` tuple of NumPy arrays holding
            ``len(X)`` rows picked uniformly at random with replacement.
        """
        n_samples = len(X)
        # One randrange call per drawn row, using the stdlib `random` module,
        # so results are reproducible under random.seed().
        picks = [random.randrange(n_samples) for _ in range(n_samples)]
        X_sample = np.array([X[i] for i in picks])
        Y_sample = np.array([Y[i] for i in picks])
        return X_sample, Y_sample

    def fit(self, X, y):
        """Train ``self.ntrees`` decision trees, each on its own bootstrap sample.

        Any previously fitted trees are discarded.

        Args:
            X: Training feature rows.
            y: Training labels aligned with ``X``.
        """
        # Remember the label set so predict() can initialise its vote tallies.
        self.labels = set(y)
        self.trees = []
        for _ in range(self.ntrees):
            # Bagging: every tree sees a different resampling of the data.
            X_boot, y_boot = self.subsample(X, y)
            tree = DecisionTreeClassifier(max_features='log2')
            tree.fit(X_boot, y_boot)
            # Append only after a successful fit so self.trees never holds
            # an unfitted estimator.
            self.trees.append(tree)

    def predict(self, X):
        """Predict labels for ``X`` by majority vote over the fitted trees.

        Args:
            X: Feature rows to classify.

        Returns:
            A ``(pred, score)`` tuple of NumPy arrays with one entry per row
            of ``X``. ``pred`` is the label with the most votes. ``score`` is
            the number of votes for label ``1`` divided by ``self.ntrees``;
            it is 0 for every row when label ``1`` was not seen during fit.
        """
        # Collect every tree's predictions for all rows.
        per_tree = [tree.predict(X) for tree in self.trees]

        # One tally per row, keyed by every label seen during fit.
        votes = [dict.fromkeys(self.labels, 0) for _ in range(len(X))]
        for tree_pred in per_tree:
            for tally, label in zip(votes, tree_pred):
                tally[label] += 1

        # Ties go to whichever label comes first in the tally's key order.
        pred = [max(tally, key=tally.get) for tally in votes]
        # .get avoids a KeyError when label 1 is absent from the training set.
        score = [tally.get(1, 0) / self.ntrees for tally in votes]
        return np.array(pred), np.array(score)
def _load_features(path):
    """Read a space-separated, header-less text file into a NumPy array."""
    with open(path) as f:
        return pd.read_table(f, sep=' ', header=None).values


def _load_labels(path):
    """Read a label file into a flat array, remapping label 0 to -1."""
    with open(path) as f:
        labels = pd.read_table(f, sep=' ', header=None).values.ravel()
    labels[labels == 0] = -1
    return labels


if __name__ == '__main__':
    # Train the forest on the adult dataset and report test-set metrics.
    X_train = _load_features('adult_dataset/adult_train_feature.txt')
    Y_train = _load_labels('adult_dataset/adult_train_label.txt')
    X_test = _load_features('adult_dataset/adult_test_feature.txt')
    Y_test = _load_labels('adult_dataset/adult_test_label.txt')

    randomForest = RandomForest()
    randomForest.fit(X_train, Y_train)

    # predict() returns (hard labels, vote fraction for label 1). Unpack it:
    # the fraction is the ranking score for AUC, the labels give error rate.
    pred_test, score_test = randomForest.predict(X_test)
    print('T = %d, AUC on test data: %f, error rate: %f' % (
        randomForest.ntrees,
        roc_auc_score(Y_test, score_test),
        get_error_rate(pred_test, Y_test)))

    # Reference comparison against scikit-learn's own forest (needs
    # `from sklearn.ensemble import RandomForestClassifier`):
    # standard_rf = RandomForestClassifier()
    # standard_rf.fit(X_train, Y_train)
    # pred_test = standard_rf.predict(X_test)
    # print(roc_auc_score(Y_test, pred_test))