# -*- coding: utf-8 -*-
"""
Examples on how to use the scikit-learn wrapper interface for the GPBoost
algorithm for combining tree-boosting with random effects and Gaussian
process models
Author: Fabio Sigrist
"""
import numpy as np
import gpboost as gpb
import random
import matplotlib.pyplot as plt
print('Simulating data...')
# Simulate data
n = 5000 # number of samples
m = 500 # number of groups
# Simulate grouped random effects
np.random.seed(1)
group = np.arange(n)  # grouping variable
for i in range(m):
    group[int(i * n / m):int((i + 1) * n / m)] = i
b1 = np.random.normal(size=m) # simulate random effects
eps = b1[group]
# simulate fixed effects
def f1d(x):
    """Non-linear function for simulation"""
    return 1.7 * (1 / (1 + np.exp(-(x - 0.5) * 20)) + 0.75 * x)
X = np.random.rand(n, 2)
f = f1d(X[:, 0])
xi = np.sqrt(0.01) * np.random.normal(size=n) # simulate error term
y = f + eps + xi # observed data
#--------------------Training----------------
# define GPModel
gp_model = gpb.GPModel(group_data=group, likelihood="gaussian")
# train
bst = gpb.GPBoostRegressor(max_depth=3, learning_rate=0.01, n_estimators=100)
bst.fit(X, y, gp_model=gp_model)
print("Estimated random effects model: ")
gp_model.summary()
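# Optional sketch: access the estimated variance parameters programmatically.
# This assumes GPModel provides a 'get_cov_pars' method (as in recent GPBoost
# versions); the summary above already prints the same quantities
cov_pars = gp_model.get_cov_pars()
print("Estimated variance parameters:")
print(cov_pars)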
#--------------------Prediction----------------
# predict
group_test = np.arange(m)
Xtest = np.zeros((m, 2))
Xtest[:, 0] = np.linspace(0, 1, m)
pred = bst.predict(X=Xtest, group_data_pred=group_test, raw_score=True)
# Compare true and predicted random effects
plt.figure("Comparison of true and predicted random effects")
plt.scatter(b1, pred['random_effect_mean'])
plt.title("Comparison of true and predicted random effects")
plt.xlabel("truth")
plt.ylabel("predicted")
plt.show()
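# Optional sketch: a quantitative counterpart to the plot above, using only
# numpy on arrays computed before; no additional GPBoost functionality is assumed
corr = np.corrcoef(b1, pred['random_effect_mean'])[0, 1]
rmse_re = np.sqrt(np.mean((b1 - pred['random_effect_mean']) ** 2))
print("Correlation between true and predicted random effects: " + str(corr))
print("RMSE of predicted random effects: " + str(rmse_re))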
# Fixed effect
plt.figure("Comparison of true and fitted fixed effect")
plt.scatter(Xtest[:, 0], pred['fixed_effect'], linewidth=2, color="b", label="fit")
x = np.linspace(0, 1, 200, endpoint=True)
plt.plot(x, f1d(x), linewidth=2, color="r", label="true")
plt.title("Comparison of true and fitted fixed effect")
plt.legend()
plt.show()
# feature importances
print('Feature importances:', list(bst.feature_importances_))
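# Optional sketch: visualize the feature importances with matplotlib (already
# imported above); purely illustrative
plt.figure("Feature importances")
plt.bar(range(X.shape[1]), bst.feature_importances_)
plt.xlabel("feature index")
plt.ylabel("importance")
plt.title("Feature importances")
plt.show()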
#--------------------Choosing tuning parameters using the TPESampler from optuna----------------
# Define search space
# Note: if the best combination found below is close to the boundary for a parameter, you might want to extend the corresponding range
search_space = {'learning_rate': [0.001, 10],
                'min_data_in_leaf': [1, 1000],
                'max_depth': [-1, 10],
                'num_leaves': [2, 1024],
                'lambda_l2': [0, 100],
                'max_bin': [63, np.min([10000, n])],
                'line_search_step_length': [True, False]}
# Define metric
metric = "mse" # Can also use metric = "test_neg_log_likelihood". For more options, see https://github.com/fabsig/GPBoost/blob/master/docs/Parameters.rst#metric-parameters
gp_model = gpb.GPModel(group_data=group, likelihood="gaussian")
# Run parameter optimization using the TPE algorithm and 4-fold CV
opt_params = gpb.tune_pars_TPE_algorithm_optuna(X=X, y=y, search_space=search_space,
                                                nfold=4, gp_model=gp_model, metric=metric, tpe_seed=1,
                                                max_num_boost_round=1000, n_trials=100, early_stopping_rounds=20)
print("Best parameters: " + str(opt_params['best_params']))
print("Best number of iterations: " + str(opt_params['best_iter']))
print("Best score: " + str(opt_params['best_score']))
# Alternatively (and faster): use manually defined validation data instead of cross-validation
np.random.seed(10)
permute_aux = np.random.permutation(n)
train_tune_idx = permute_aux[0:int(0.8 * n)]  # use 80% of the data for training
valid_tune_idx = permute_aux[int(0.8 * n):n]  # and the remaining 20% as validation data
folds = [(train_tune_idx, valid_tune_idx)]
opt_params = gpb.tune_pars_TPE_algorithm_optuna(X=X, y=y, search_space=search_space,
                                                folds=folds, gp_model=gp_model, metric=metric, tpe_seed=1,
                                                max_num_boost_round=1000, n_trials=100, early_stopping_rounds=20)
#--------------------Choosing tuning parameters using random grid search----------------
# Note: scikit-learn's 'GridSearchCV' is not supported, use the GPBoost internal
# function 'grid_search_tune_parameters' instead as shown below
# Define parameter search grid
# Note: if the best combination found below is close to the boundary for a parameter, you might want to extend the corresponding range
param_grid = {'learning_rate': [0.001, 0.01, 0.1, 1, 10],
              'min_data_in_leaf': [1, 10, 100, 1000],
              'max_depth': [-1, 1, 2, 3, 5, 10],
              'num_leaves': 2 ** np.arange(1, 10),
              'lambda_l2': [0, 1, 10, 100],
              'max_bin': [250, 500, 1000, np.min([10000, n])],
              'line_search_step_length': [True, False]}
other_params = {'verbose': 0}
# Define metric
metric = "mse" # Can also use metric = "test_neg_log_likelihood". For more options, see https://github.com/fabsig/GPBoost/blob/master/docs/Parameters.rst#metric-parameters
gp_model = gpb.GPModel(group_data=group, likelihood="gaussian")
data_train = gpb.Dataset(data=X, label=y)
# Run parameter optimization using random grid search and 4-fold CV
# Note: deterministic grid search can be done by setting 'num_try_random=None'
opt_params = gpb.grid_search_tune_parameters(param_grid=param_grid, params=other_params,
                                             num_try_random=100, nfold=4, seed=1000,
                                             train_set=data_train, gp_model=gp_model,
                                             use_gp_model_for_validation=True, verbose_eval=1,
                                             num_boost_round=1000, early_stopping_rounds=20,
                                             metric=metric)
print("Best parameters: " + str(opt_params['best_params']))
print("Best number of iterations: " + str(opt_params['best_iter']))
print("Best score: " + str(opt_params['best_score']))
# Alternatively (and faster): use manually defined validation data instead of cross-validation
np.random.seed(10)
permute_aux = np.random.permutation(n)
train_tune_idx = permute_aux[0:int(0.8 * n)]  # use 80% of the data for training
valid_tune_idx = permute_aux[int(0.8 * n):n]  # and the remaining 20% as validation data
folds = [(train_tune_idx, valid_tune_idx)]
opt_params = gpb.grid_search_tune_parameters(param_grid=param_grid, params=other_params,
                                             num_try_random=100, folds=folds, seed=1000,
                                             train_set=data_train, gp_model=gp_model,
                                             use_gp_model_for_validation=True, verbose_eval=1,
                                             num_boost_round=1000, early_stopping_rounds=20,
                                             metric=metric)
#--------------------Using a validation set for finding number of iterations----------------
# split into training and test data
train_ind = random.sample(range(n), int(n / 2))
test_ind = [x for x in range(n) if (x not in train_ind)]
X_train = X[train_ind, :]
y_train = y[train_ind]
group_train = group[train_ind]
X_test = X[test_ind, :]
y_test = y[test_ind]
group_test = group[test_ind]
# train
gp_model = gpb.GPModel(group_data=group_train, likelihood="gaussian")
gp_model.set_prediction_data(group_data_pred=group_test)
bst = gpb.GPBoostRegressor(max_depth=3, learning_rate=0.01, n_estimators=100)
bst.fit(X_train, y_train, gp_model=gp_model,
        eval_set=[(X_test, y_test)],
        eval_metric='l1',
        early_stopping_rounds=5)
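# Optional sketch: inspect the early-stopping result and evaluate on the test
# data. The 'best_iteration_' attribute is assumed from the LightGBM-style
# sklearn API on which the GPBoost wrapper is based
print("Best number of iterations: " + str(bst.best_iteration_))
pred_test = bst.predict(X=X_test, group_data_pred=group_test, raw_score=True)
y_pred = pred_test['fixed_effect'] + pred_test['random_effect_mean']
print("Test RMSE: " + str(np.sqrt(np.mean((y_test - y_pred) ** 2))))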
#--------------------Saving a model and loading it from a file----------------
# Train model and make prediction
gp_model = gpb.GPModel(group_data=group, likelihood="gaussian")
bst = gpb.GPBoostRegressor(max_depth=3, learning_rate=0.01, n_estimators=50)
bst.fit(X, y, gp_model=gp_model)
group_test = np.array([1, 2, -1])  # '-1' is a group not observed in the training data
Xtest = np.random.rand(len(group_test), 2)
pred = bst.predict(X=Xtest, group_data_pred=group_test,
                   predict_var=True, raw_score=True)
# Save model
import joblib
joblib.dump(bst, 'bst.pkl')
# Load model
bst_loaded = joblib.load('bst.pkl')
pred_loaded = bst_loaded.predict(X=Xtest, group_data_pred=group_test,
                                 predict_var=True, raw_score=True)
# Check equality
print(pred['fixed_effect'] - pred_loaded['fixed_effect'])
print(pred['random_effect_mean'] - pred_loaded['random_effect_mean'])
print(pred['random_effect_cov'] - pred_loaded['random_effect_cov'])
# Alternative way of saving and loading without the use of joblib
# Note: with the following approach, the loaded model is not a "sklearn-API"
# 'GPBoostRegressor' object anymore but a standard 'Booster' object
model_str = bst.booster_.model_to_string()
bst_loaded = gpb.Booster(model_str=model_str)
pred_loaded = bst_loaded.predict(data=Xtest, group_data_pred=group_test,
                                 predict_var=True, pred_latent=True)
print(pred['fixed_effect'] - pred_loaded['fixed_effect'])
print(pred['random_effect_mean'] - pred_loaded['random_effect_mean'])
print(pred['random_effect_cov'] - pred_loaded['random_effect_cov'])
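# Optional sketch: the model string can also be persisted to disk without joblib,
# using only standard file I/O and the 'Booster(model_str=...)' constructor from above
with open('gpboost_model.txt', 'w') as f:
    f.write(model_str)
with open('gpboost_model.txt', 'r') as f:
    bst_loaded_from_file = gpb.Booster(model_str=f.read())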
#--------------------Classification example----------------
y_bin = ((y - np.mean(y)) > 0) * 1.
gp_model = gpb.GPModel(group_data=group, likelihood="bernoulli_probit")  # Bernoulli likelihood for binary data
# train
bst = gpb.GPBoostClassifier(max_depth=3, learning_rate=0.01, n_estimators=50)
bst.fit(X, y_bin, gp_model=gp_model)
print("Estimated random effects model: ")
gp_model.summary()
# predict
group_test = np.arange(m)
Xtest = np.zeros((m, 2))
Xtest[:, 0] = np.linspace(0, 1, m)
pred = bst.predict(X=Xtest, group_data_pred=group_test, raw_score=True)
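# Optional sketch: visualize the fitted latent fixed effect. For the binary model,
# the latent function matches the simulated f1d only up to location and scale,
# so no 'true' curve is overlaid here
plt.figure("Fitted fixed effect (latent scale)")
plt.plot(Xtest[:, 0], pred['fixed_effect'], color="b")
plt.title("Fitted fixed effect (latent scale)")
plt.xlabel("feature")
plt.ylabel("latent fixed effect")
plt.show()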