-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathDFMapper.py
152 lines (118 loc) · 4.35 KB
/
DFMapper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
import numpy as np
import ipdb
import itertools
def isinstance_func(x):
    """Return True when ``x`` can be invoked like a function.

    Used to tell plain callables apart from objects that are driven through
    ``fit`` / ``transform`` / ``fit_transform`` methods instead.

    The builtin ``callable`` is used rather than ``hasattr(x, '__call__')``:
    an object that merely carries an instance attribute named ``__call__``
    passes the ``hasattr`` test, yet calling it raises ``TypeError``.
    """
    return callable(x)
def row(arr):
    """Reshape a 1-D array into a single-row 2-D array.

    An input whose ``shape`` does not have exactly one dimension is handed
    back as the very same object.
    """
    if len(arr.shape) != 1:
        return arr
    n_items = len(arr)
    return arr.reshape(1, n_items)
def col(arr):
    """Reshape a 1-D array into a single-column 2-D array.

    An input whose ``shape`` does not have exactly one dimension is handed
    back as the very same object.
    """
    if len(arr.shape) != 1:
        return arr
    n_items = len(arr)
    return arr.reshape(n_items, 1)
def explode(matrix, order):
    """Build interaction columns from products of column combinations.

    For every combination of ``order`` distinct columns of ``matrix`` (in the
    order produced by ``itertools.combinations``), the columns are multiplied
    together element-wise and the product becomes one output column.

    Args:
        matrix: 2-D array whose columns are combined. It is not modified;
            each product starts from a copy of the first column.
        order: how many columns go into each product. Must be greater than 1.

    Returns:
        A 2-D array with one column per combination. When ``matrix`` has
        fewer than ``order`` columns there are no combinations and an array
        of shape ``(rows, 0)`` is returned.

    Raises:
        AssertionError: if ``order`` is not greater than 1.
    """
    cols = matrix.shape[1]
    assert order > 1, "order is not greater than 1"

    products = []
    # ``range`` works on both Python 2 and 3; ``xrange`` exists only on 2.
    for combo in itertools.combinations(range(cols), order):
        # Copy first so the in-place multiplications never touch ``matrix``.
        product = np.copy(matrix[:, combo[0]])
        for column_index in combo[1:]:
            product *= matrix[:, column_index]
        products.append(product)

    if not products:
        # Stacking an empty list raises; return a zero-column result that can
        # still be concatenated next to ``matrix``.
        return np.empty((matrix.shape[0], 0), dtype=matrix.dtype)

    # column_stack turns each 1-D product into a column and leaves inputs
    # that are already 2-D as they are.
    return np.column_stack(products)
class DFMapper(object):
    """Turn dataframe columns into X / Y arrays via per-column pipelines.

    Column groups are registered with ``add_X``, ``add_Y`` and ``add_index``.
    Each group carries a pipeline whose steps are either plain callables or
    objects exposing ``fit`` / ``transform`` / ``fit_transform``. Calling
    ``fit``, ``transform`` or ``fit_transform`` runs every registered group
    against a dataframe and horizontally stacks the X and Y results.
    """

    def __init__(self):
        # Registered column groups in registration order: (columns, settings).
        self.dict_list = []
        # Value produced by the most recently evaluated index group, if any.
        self.index = None
        # Free-form options; 'explode' is the only key this class reads.
        self.options = {}

    def _add(self, key, function_list, is_X, is_Y, is_index, as_col=True):
        """Register one column group.

        Args:
            key: a column name (``str``) or a list of column names.
            function_list: one pipeline step or a list of steps. A step is
                either a callable or an object with ``fit`` / ``transform`` /
                ``fit_transform``. ``None`` means an empty pipeline.
            is_X: the result is appended to the X output.
            is_Y: the result is appended to the Y output.
            is_index: the result is stored on ``self.index``.
            as_col: when true the selected data is taken as ``.values`` and
                passed through ``col``; otherwise ``df[key]`` is used as is.
        """
        if function_list is None:
            # A fresh list per registration, so groups never share one.
            function_list = []
        elif not isinstance(function_list, list):
            function_list = [function_list]
        if isinstance(key, str):
            key = [key]
        dict_values = {
            'pipeline': function_list,
            'is_X': is_X,
            'is_Y': is_Y,
            'is_index': is_index,
            'as_col': as_col,
        }
        self.dict_list.append((key, dict_values))

    def add_X(self, key, function_list=None, as_col=True):
        """Register ``key`` as a feature (X) column group."""
        self._add(key, function_list, is_X=True, is_Y=False, is_index=False,
                  as_col=as_col)

    def add_Y(self, key, function_list=None, as_col=True):
        """Register ``key`` as a target (Y) column group."""
        self._add(key, function_list, is_X=False, is_Y=True, is_index=False,
                  as_col=as_col)

    def add_index(self, key, function_list=None, as_col=True):
        """Register ``key`` as the index column group."""
        self._add(key, function_list, is_X=False, is_Y=False, is_index=True,
                  as_col=as_col)

    def add_option(self, key, val=True):
        """Store ``val`` under ``key`` in ``self.options``.

        ``eval_and_coalesce`` reads the 'explode' option and passes its value
        to ``explode`` as the ``order`` argument.
        """
        self.options[key] = val

    def evaluate(self, key, dict_options, df, eval_type):
        """Run one registered column group against ``df``.

        Args:
            key: list of column names to select from ``df``.
            dict_options: the settings dict stored by ``_add``.
            df: the dataframe to read from.
            eval_type: 'fit', 'transform' or 'fit_transform'.

        Returns:
            The pipeline output, or ``None`` when a Y column is missing from
            ``df`` or when a 'fit' pass ended by fitting the last step.

        Raises:
            ValueError: if an X column is missing from ``df``.
            AssertionError: if a non-callable step is reached with an
                unsupported ``eval_type``.
        """
        for el in key:
            if el not in df:
                # A missing feature column cannot be recovered from.
                if dict_options['is_X']:
                    raise ValueError(
                        "The column %s is not in your dataframe" % (el,))
                # A missing target column is expected when transforming a
                # test set, so the group is silently skipped.
                if dict_options['is_Y']:
                    return None
                # Index groups fall through to the lookup below, which
                # reports the missing column itself.

        if dict_options['as_col']:
            cur_val = col(df[key].values)
        else:
            cur_val = df[key]

        pipeline = dict_options['pipeline']
        last_index = len(pipeline) - 1
        for index, f in enumerate(pipeline):
            if isinstance_func(f):
                cur_val = f(cur_val)
            elif eval_type == 'fit_transform':
                cur_val = f.fit_transform(cur_val)
            elif eval_type == 'transform':
                cur_val = f.transform(cur_val)
            elif eval_type == 'fit':
                # Only the final step can stop at ``fit``; earlier steps
                # must produce output to feed the steps after them.
                if index == last_index:
                    f.fit(cur_val)
                    return None
                cur_val = f.fit_transform(cur_val)
            else:
                # Raised explicitly so the check survives ``python -O``.
                raise AssertionError(
                    "Only support options fit, transform and fit_transform")
        return cur_val

    def eval_and_coalesce(self, df, eval_type):
        """Evaluate every registered group and stack the X and Y results.

        Returns:
            A ``(results_X, results_Y)`` tuple. Each side is the horizontal
            stack of its group outputs, or an empty array when no group
            contributed. When the 'explode' option is set and X is non-empty,
            the output of ``explode`` is appended to X.
        """
        results_X = []
        results_Y = []
        for key, dict_options in self.dict_list:
            cur_val = self.evaluate(key, dict_options, df, eval_type)
            # ``None`` marks a skipped group. Identity is required here:
            # ``== None`` on an array compares element-wise.
            if cur_val is None:
                continue
            if dict_options['is_X']:
                results_X.append(cur_val)
            if dict_options['is_Y']:
                results_Y.append(cur_val)
            if dict_options['is_index']:
                self.index = cur_val

        results_X = np.hstack(results_X) if results_X else np.array([])
        results_Y = np.hstack(results_Y) if results_Y else np.array([])

        if ('explode' in self.options) and (len(results_X) > 0):
            order = self.options['explode']
            results_X = np.hstack([results_X, explode(results_X, order)])
        return results_X, results_Y

    def fit(self, df):
        """Fit every pipeline on ``df`` and return ``self``."""
        self.eval_and_coalesce(df, 'fit')
        return self

    def transform(self, df):
        """Apply the pipelines to ``df`` and return ``(X, Y)``."""
        return self.eval_and_coalesce(df, 'transform')

    def fit_transform(self, df):
        """Fit the pipelines on ``df`` and return the resulting ``(X, Y)``."""
        return self.eval_and_coalesce(df, 'fit_transform')