Xgboost objective function customizing error. #999
Ah, interesting! Thanks for the issue. The xgboost engine itself will return the same predictions given that objective function, but tidymodels then only knows how to post-process predictions from the default objective function to convert them to probabilities. This is why the values you saw were not on the probability scale. In this case, the debugging process may have been aided by `translate()`:

```r
library(parsnip)

boost_tree("classification") %>% set_engine("xgboost") %>% translate()
#> Boosted Tree Model Specification (classification)
#>
#> Computational engine: xgboost
#>
#> Model fit template:
#> parsnip::xgb_train(x = missing_arg(), y = missing_arg(), weights = missing_arg(),
#>     nthread = 1, verbose = 0)
```

It was also helpful to see when xgboost itself knows that the outputted predictions are log-odds and when it doesn't:

```r
library(tidymodels)
library(xgboost)
#>
#> Attaching package: 'xgboost'
#> The following object is masked from 'package:dplyr':
#>
#> slice
data(agaricus.train, package = "xgboost")
data(agaricus.test, package = "xgboost")
dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2))
dtest <- with(agaricus.test, xgb.DMatrix(data, label = label, nthread = 2))
watchlist <- list(train = dtrain, eval = dtest)
# no `objective` argument at all
param <- list(max_depth = 2, eta = 1, nthread = 2, eval_metric = "auc")
bst <- xgb.train(param, dtrain, nrounds = 2, watchlist)
#> [1] train-auc:0.958228 eval-auc:0.960373
#> [2] train-auc:0.987601 eval-auc:0.986900
head(predict(bst, dtest, outputmargin = FALSE))
#> [1] 0.08585284 0.94223028 0.08585284 0.08585284 0.02808682 0.94223028
head(predict(bst, dtest, outputmargin = TRUE))
#> [1] 0.08585284 0.94223028 0.08585284 0.08585284 0.02808682 0.94223028
# `objective` argument is a pre-configured option
param$objective <- "binary:logistic"
bst <- xgb.train(param, dtrain, nrounds = 2, watchlist)
#> [1] train-auc:0.958228 eval-auc:0.960373
#> [2] train-auc:0.981413 eval-auc:0.979930
head(predict(bst, dtest, outputmargin = FALSE))
#> [1] 0.28583017 0.92392391 0.28583017 0.28583017 0.05169873 0.92392391
head(predict(bst, dtest, outputmargin = TRUE))
#> [1] -0.915723 2.496895 -0.915723 -0.915723 -2.909239 2.496895
# `objective` argument is "custom"
param$objective <- function(preds, dtrain) {
labels <- getinfo(dtrain, "label")
preds <- 1/(1 + exp(-preds))
grad <- preds - labels
hess <- preds * (1 - preds)
return(list(grad = grad, hess = hess))
}
bst <- xgb.train(param, dtrain, nrounds = 2, watchlist)
#> [1] train-auc:0.958228 eval-auc:0.960373
#> [2] train-auc:0.981413 eval-auc:0.979930
head(predict(bst, dtest, outputmargin = FALSE))
#> [1] -1.049978 2.575050 -1.049978 -1.049978 -3.019169 2.575050
head(predict(bst, dtest, outputmargin = TRUE))
#> [1] -1.049978 2.575050 -1.049978 -1.049978 -3.019169 2.575050
```

Created on 2023-10-06 with reprex v2.0.2
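A small follow-up check (my addition, not part of the reprex above; it assumes `bst` and `dtest` from the last block are still in the session): with the custom objective, the predictions are raw log-odds whether or not `outputmargin` is set, so applying the inverse logit by hand puts them back on the probability scale.

```r
# With a custom objective, xgboost returns raw margins; plogis() is the
# inverse logit, so this recovers class probabilities by hand.
head(plogis(predict(bst, dtest)))
```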
The method you gave me was really helpful.

```r
# build custom engine -----------------------------------------------------
# boost_tree is already registered in parsnip, so set_new_model() is skipped
#set_new_model("boost_tree")
library(tidymodels)
set_model_mode("boost_tree", "classification")
set_model_engine("boost_tree", "classification", "xgboost_custom")
set_dependency("boost_tree", eng = "xgboost_custom", pkg = "xgboost", mode = "classification")
set_model_arg(
model = "boost_tree",
eng = "xgboost_custom",
parsnip = "tree_depth",
original = "max_depth",
func = list(pkg = "dials", fun = "tree_depth"),
has_submodel = FALSE
)
set_model_arg(
model = "boost_tree",
eng = "xgboost_custom",
parsnip = "trees",
original = "nrounds",
func = list(pkg = "dials", fun = "trees"),
has_submodel = TRUE
)
set_model_arg(
model = "boost_tree",
eng = "xgboost_custom",
parsnip = "learn_rate",
original = "eta",
func = list(pkg = "dials", fun = "learn_rate"),
has_submodel = FALSE
)
set_model_arg(
model = "boost_tree",
eng = "xgboost_custom",
parsnip = "mtry",
original = "colsample_bynode",
func = list(pkg = "dials", fun = "mtry"),
has_submodel = FALSE
)
set_model_arg(
model = "boost_tree",
eng = "xgboost_custom",
parsnip = "min_n",
original = "min_child_weight",
func = list(pkg = "dials", fun = "min_n"),
has_submodel = FALSE
)
set_model_arg(
model = "boost_tree",
eng = "xgboost_custom",
parsnip = "loss_reduction",
original = "gamma",
func = list(pkg = "dials", fun = "loss_reduction"),
has_submodel = FALSE
)
set_model_arg(
model = "boost_tree",
eng = "xgboost_custom",
parsnip = "sample_size",
original = "subsample",
func = list(pkg = "dials", fun = "sample_size"),
has_submodel = FALSE
)
set_model_arg(
model = "boost_tree",
eng = "xgboost_custom",
parsnip = "stop_iter",
original = "early_stop",
func = list(pkg = "dials", fun = "stop_iter"),
has_submodel = FALSE
)
set_encoding(
model = "boost_tree",
eng = "xgboost_custom",
mode = "regression",
options = list(
predictor_indicators = "one_hot",
compute_intercept = FALSE,
remove_intercept = TRUE,
allow_sparse_x = TRUE
)
)
set_fit(
model = "boost_tree",
eng = "xgboost_custom",
mode = "classification",
value = list(
interface = "matrix",
protect = c("x", "y", "weights"),
func = c(pkg = "parsnip", fun = "xgb_train"),
defaults = list(nthread = 1, verbose = 0)
)
)
set_encoding(
model = "boost_tree",
eng = "xgboost_custom",
mode = "classification",
options = list(
predictor_indicators = "one_hot",
compute_intercept = FALSE,
remove_intercept = TRUE,
allow_sparse_x = TRUE
)
)
set_pred(
  model = "boost_tree",
  eng = "xgboost_custom",
  mode = "classification",
  type = "class",
  value = list(
    pre = NULL,
    post = function(x, object) {
      if (is.vector(x)) {
        event_level <- parsnip:::get_event_level(object$spec)
        if (event_level == "first") {
          x <- ifelse(x >= 0.5, object$lvl[1], object$lvl[2])
        } else {
          x <- ifelse(x >= 0.5, object$lvl[2], object$lvl[1])
        }
      } else {
        x <- object$lvl[apply(x, 1, which.max)]
      }
      x
    },
    func = c(pkg = NULL, fun = "xgb_predict_custom"),
    args = list(object = quote(object$fit), new_data = quote(new_data))
  )
)
set_pred(
  model = "boost_tree",
  eng = "xgboost_custom",
  mode = "classification",
  type = "prob",
  value = list(
    pre = NULL,
    post = function(x, object) {
      if (is.vector(x)) {
        event_level <- parsnip:::get_event_level(object$spec)
        if (event_level == "first") {
          x <- tibble(v1 = x, v2 = 1 - x)
        } else {
          x <- tibble(v1 = 1 - x, v2 = x)
        }
      } else {
        x <- as_tibble(x, .name_repair = "minimal")
      }
      colnames(x) <- object$lvl
      x
    },
    func = c(pkg = NULL, fun = "xgb_predict_custom"),
    args = list(object = quote(object$fit), new_data = quote(new_data))
  )
)
set_pred(
  model = "boost_tree",
  eng = "xgboost_custom",
  mode = "classification",
  type = "raw",
  value = list(
    pre = NULL,
    post = NULL,
    func = c(fun = "xgb_predict_custom"),
    args = list(object = quote(object$fit), new_data = quote(new_data))
  )
)
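```

After registering everything, it can be worth a quick sanity check that parsnip picked the engine up before trying to fit with it (my addition, not part of the original post):

```r
# List the registered modes, engines, and arguments for boost_tree;
# "xgboost_custom" should show up under classification.
parsnip::show_model_info("boost_tree")

# Inspect the fit template parsnip will use for the new engine.
boost_tree("classification") %>%
  set_engine("xgboost_custom") %>%
  translate()
```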
```r
##################################################################################
##################################################################################
library(xgboost)
#>
#> Attaching package: 'xgboost'
#> The following object is masked from 'package:dplyr':
#>
#> slice
logregobj <- function(preds, dtrain) {
  labels <- getinfo(dtrain, "label")
  # preds come in as raw margins; map them to probabilities first
  preds <- 1 / (1 + exp(-preds))
  # gradient and hessian of the binary log-loss w.r.t. the raw margin
  grad <- preds - labels
  hess <- preds * (1 - preds)
  return(list(grad = grad, hess = hess))
}
# logit_default -----------------------------------------------------------
logistic_test <- function() {
## link
linkfun <- function(mu){log(mu/(1-mu))}
## inverse link
linkinv <- function(eta) {1/(1+exp(-eta))}
## derivative of invlink wrt eta
mu.eta <- function(eta) { exp(-eta)/(1+exp(-eta))^2 }
valideta <- function(eta) {TRUE}
link <- "logistic_test"
structure(list(linkfun = linkfun, linkinv = linkinv,
mu.eta = mu.eta, valideta = valideta,
name = link),
class = "link-glm")
}
logistic <- logistic_test()

xgb_predict_custom <- function(object, new_data, ...) {
  if (!inherits(new_data, "xgb.DMatrix")) {
    new_data <- maybe_matrix(new_data)
    new_data <- xgboost::xgb.DMatrix(data = new_data, missing = NA)
  }
  res <- predict(object, new_data, ...)
  # With a string objective, pick the matching inverse link; when no objective
  # is stored in params (as with the custom objective here), the `%||% 3L`
  # fallback selects the unnamed default branch below.
  x <- switch(
    object$params$objective %||% 3L,
    "binary:logitraw" = stats::binomial()$linkinv(res),
    "multi:softprob" = matrix(res, ncol = object$params$num_class, byrow = TRUE),
    # Link function matching the custom objective: the inverse logit
    logistic$linkinv(res)
  )
  x
}
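```

As a side note (my addition): the `logistic_test()` inverse link defined above is just the standard logistic function, so it should agree with `plogis()`; a quick check on a few illustrative margins:

```r
# The custom inverse link and plogis() should coincide, mapping raw
# log-odds into (0, 1).
m <- c(-3, -1, 0, 1, 3)
cbind(custom = logistic$linkinv(m), stats = plogis(m))
```

Resampling with both engines on the same bootstrap samples:

```r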
set.seed(100)
res <-
  fit_resamples(
    boost_tree("classification") %>% set_engine("xgboost", objective = "binary:logitraw"),
    Class ~ .,
    control = control_grid(verbose = FALSE,
                           allow_par = FALSE,
                           parallel_over = NULL,
                           save_pred = TRUE),
    bootstraps(two_class_dat, 5)
  )

set.seed(100)
res_custom <-
  fit_resamples(
    boost_tree("classification") %>% set_engine("xgboost_custom", objective = logregobj),
    Class ~ .,
    control = control_grid(verbose = FALSE,
                           allow_par = FALSE,
                           parallel_over = NULL,
                           save_pred = TRUE),
    bootstraps(two_class_dat, 5)
  )
collect_metrics(res)
#> # A tibble: 2 × 6
#> .metric .estimator mean n std_err .config
#> <chr> <chr> <dbl> <int> <dbl> <chr>
#> 1 accuracy binary 0.788 5 0.00719 Preprocessor1_Model1
#> 2 roc_auc binary 0.860 5 0.00995 Preprocessor1_Model1
collect_metrics(res_custom)
#> # A tibble: 2 × 6
#> .metric .estimator mean n std_err .config
#> <chr> <chr> <dbl> <int> <dbl> <chr>
#> 1 accuracy binary 0.788 5 0.00719 Preprocessor1_Model1
#> 2 roc_auc binary 0.860 5 0.00995 Preprocessor1_Model1
origin <- res$.predictions[[1]] |>
  select(contains("pred"), .row, -.config, -.pred_class)
custom <- res_custom$.predictions[[1]] |>
  select(.pred_Class1_custom = .pred_Class1,
         .pred_Class2_custom = .pred_Class2,
         -.config, .row)
left_join(origin, custom, by = ".row") |> head(15)
#> # A tibble: 15 × 5
#> .pred_Class1 .pred_Class2 .row .pred_Class1_custom .pred_Class2_custom
#> <dbl> <dbl> <int> <dbl> <dbl>
#> 1 0.821 0.179 3 0.821 0.179
#> 2 0.0663 0.934 5 0.0663 0.934
#> 3 0.249 0.751 8 0.249 0.751
#> 4 0.981 0.0191 20 0.981 0.0191
#> 5 0.977 0.0228 22 0.977 0.0228
#> 6 0.0636 0.936 25 0.0636 0.936
#> 7 0.981 0.0186 27 0.981 0.0186
#> 8 0.981 0.0193 29 0.981 0.0193
#> 9 0.643 0.357 33 0.643 0.357
#> 10 0.0331 0.967 35 0.0331 0.967
#> 11 0.530 0.470 36 0.530 0.470
#> 12 0.984 0.0155 39 0.984 0.0155
#> 13 0.981 0.0191 40 0.981 0.0191
#> 14 0.196 0.804 41 0.196 0.804
#> 15 0.979 0.0210 42 0.979 0.0210
```

Created on 2023-10-08 with reprex v2.0.2
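As a final check (my addition, not in the original post), the joined predictions can be verified to be proper probabilities:

```r
# Both engines should now return class probabilities in [0, 1] that sum to 1.
joined <- left_join(origin, custom, by = ".row")
all(joined$.pred_Class1 >= 0 & joined$.pred_Class1 <= 1)
all(dplyr::near(joined$.pred_Class1_custom + joined$.pred_Class2_custom, 1))
```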
This issue is a continuation of #873 and #875.
The customized objective function `logregobj` and the model trained with the default objective should produce the same predictions, but they didn't, so I saved the predictions to check.
The predicted probability values sum to 1, but the individual values are not constrained to lie between 0 and 1.
The minimal reproducible code is attached below.
Created on 2023-09-09 with reprex v2.0.2
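To make the symptom concrete (an illustrative sketch with made-up margins, not output from the reprex): if raw log-odds are post-processed as though they were probabilities, the second class is computed as 1 minus the first, so the pair always sums to 1 even though the individual values can fall outside [0, 1]. Applying the inverse logit first restores valid probabilities.

```r
# Hypothetical raw log-odds (what a custom objective returns by default)
raw_margin <- c(-1.05, 2.58, -3.02)

# Treated directly as the "probability" of the first class:
bad <- data.frame(Class1 = raw_margin, Class2 = 1 - raw_margin)
rowSums(bad)   # always 1, but the entries are not in [0, 1]

# Mapping through the inverse logit first gives proper probabilities:
good <- data.frame(Class1 = plogis(raw_margin), Class2 = 1 - plogis(raw_margin))
rowSums(good)  # also 1, with every entry in [0, 1]
```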