montecarlo.py
# coding=utf-8
# Train a group of trees, each one on a different training set.
# When classifying, the votes of all the trees are combined to make the final decision.
# -------------------------------------------------------------------------------------------------
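# Illustrative sketch of the vote aggregation (assumed semantics, inferred from the
# elementwise frame sums further below): each tree produces a per-object vote frame,
# and summing the frames accumulates the votes, e.g.
#   tree 1 votes [1, 0, 0], tree 2 votes [0, 1, 0]  ->  consolidated votes [1, 1, 0]
# -------------------------------------------------------------------------------------------------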
from functools import partial
from multiprocessing import Pool
import argparse
import sys
from sklearn import cross_validation
import pandas as pd
import metrics
import parallel
import utils
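# Note: metrics, parallel and utils are local modules of this repository (pyRF), not
# installable packages. The script targets Python 2 (print statements, xrange) and an
# old scikit-learn API: the cross_validation module was later replaced by model_selection.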
if __name__ == '__main__':
    # Read the parameters from the command line
    print ' '.join(sys.argv)

    parser = argparse.ArgumentParser()
    parser.add_argument('--percentage', required=True, type=str)
    parser.add_argument('--n_processes', required=True, type=int)
    parser.add_argument('--catalog', default='MACHO', choices=['MACHO', 'EROS', 'OGLE'])
    parser.add_argument('--folds', required=True, type=int)
    parser.add_argument('--sets_path', required=True, type=str)
    parser.add_argument('--result_path', required=True, type=str)
    parser.add_argument('--feature_filter', nargs='*', type=str)
    args = parser.parse_args(sys.argv[1:])

    percentage = args.percentage
    catalog = args.catalog
    n_processes = args.n_processes
    folds = args.folds
    sets_path = args.sets_path
    result_path = args.result_path
    feature_filter = args.feature_filter
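    # Example invocation (all argument values here are hypothetical, shown only to
    # document the interface):
    #   python montecarlo.py --percentage 100 --n_processes 4 --catalog MACHO \
    #       --folds 10 --sets_path sets/ --result_path results/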
    # One sampled dataset per Monte Carlo run (the script assumes 100 of them)
    paths = [sets_path + catalog + '_sampled_' + str(i) + '.csv' for i in xrange(100)]

    data = pd.read_csv(paths[0], index_col=0)
    y = data['class']
    skf = cross_validation.StratifiedKFold(y, n_folds=folds)

    resultados = []
    for train_index, test_index in skf:
        print 'Training trees'
        partial_train = partial(parallel.train_tree, feature_filter=feature_filter,
                                train_index=train_index)
        pool = Pool(processes=n_processes, maxtasksperchild=2)
        arboles = pool.map(partial_train, paths)
        pool.close()
        pool.join()
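        # "arboles" now holds one trained tree per sampled dataset; their votes on the
        # held-out fold are consolidated below into the ensemble decision.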
        # print 'Saving trees'
        # count = 0
        # for clf in arboles:
        #     output = open(result_path + "Arboles/arbol_" + str(count) + '.pkl', 'wb+')
        #     pickle.dump(clf, output)  # would require "import pickle" at the top
        #     output.close()
        #     count += 1
        print 'Consolidating results'
        # Store the classification votes for each dataset
        sample_set_result = []
        for path in paths:
            data = pd.read_csv(path, index_col=0)
            data, y = utils.filter_data(data, feature_filter=feature_filter)
            test_X = data.iloc[test_index]
            test_y = y.iloc[test_index]

            # Store each tree's classification of the current dataset
            aux = []
            for clf in arboles:
                result = clf.predict_table(test_X, test_y)
                aux.append(result)

            # Consolidate the trees' votes into a single frame (elementwise sum)
            consolidated_frame = reduce(lambda a, b: a + b, map(metrics.result_to_frame, aux))
            sample_set_result.append(consolidated_frame)

        print 'List length per sample: ' + str(len(sample_set_result))
        resultados.append(metrics.matrix_to_result(reduce(lambda a, b: a + b, sample_set_result),
                                                   test_y))
        print 'List length for folds: ' + str(len(resultados))
        print 'DataFrame memory usage: ' + str(resultados[0].memory_usage(index=True))

    result = pd.concat(resultados)
    result.to_csv(result_path + 'Predicciones/result_' + percentage + '.csv')