From 1135cf30ea4dc1236abb0e07b4583f4715b6157e Mon Sep 17 00:00:00 2001
From: Vadim Khotilovich
Date: Tue, 20 Jun 2017 01:47:03 -0500
Subject: [PATCH 1/2] xgboost: expose watchlist and callbacks; remove silent
 from params; set default lambda=1; add tweedie_variance_power param

---
 R/RLearner_classif_xgboost.R       | 29 ++++++++++++++++++++---------
 R/RLearner_regr_xgboost.R          | 30 +++++++++++++++++++++---------
 tests/testthat/test_regr_xgboost.R |  6 +++---
 3 files changed, 44 insertions(+), 21 deletions(-)

diff --git a/R/RLearner_classif_xgboost.R b/R/RLearner_classif_xgboost.R
index cae749f1ac..47230116c2 100644
--- a/R/RLearner_classif_xgboost.R
+++ b/R/RLearner_classif_xgboost.R
@@ -7,7 +7,7 @@ makeRLearner.classif.xgboost = function() {
       # we pass all of what goes in 'params' directly to ... of xgboost
       # makeUntypedLearnerParam(id = "params", default = list()),
       makeDiscreteLearnerParam(id = "booster", default = "gbtree", values = c("gbtree", "gblinear", "dart")),
-      makeIntegerLearnerParam(id = "silent", default = 0L, tunable = FALSE),
+      makeUntypedLearnerParam(id = "watchlist", default = NULL, tunable = FALSE),
       makeNumericLearnerParam(id = "eta", default = 0.3, lower = 0, upper = 1),
       makeNumericLearnerParam(id = "gamma", default = 0, lower = 0),
       makeIntegerLearnerParam(id = "max_depth", default = 6L, lower = 1L),
@@ -16,7 +16,7 @@ makeRLearner.classif.xgboost = function() {
       makeNumericLearnerParam(id = "colsample_bytree", default = 1, lower = 0, upper = 1),
       makeNumericLearnerParam(id = "colsample_bylevel", default = 1, lower = 0, upper = 1),
       makeIntegerLearnerParam(id = "num_parallel_tree", default = 1L, lower = 1L),
-      makeNumericLearnerParam(id = "lambda", default = 0, lower = 0),
+      makeNumericLearnerParam(id = "lambda", default = 1, lower = 0),
       makeNumericLearnerParam(id = "lambda_bias", default = 0, lower = 0),
       makeNumericLearnerParam(id = "alpha", default = 0, lower = 0),
       makeUntypedLearnerParam(id = "objective", default = "binary:logistic", tunable = FALSE),
@@ -26,6 +26,7 @@
       makeNumericLearnerParam(id = "missing", default = NULL, tunable = FALSE, when = "both",
         special.vals = list(NA, NA_real_, NULL)),
       makeIntegerVectorLearnerParam(id = "monotone_constraints", default = 0, lower = -1, upper = 1),
+      makeNumericLearnerParam(id = "tweedie_variance_power", lower = 1, upper = 2, default = 1.5, requires = quote(objective == "reg:tweedie")),
       makeIntegerLearnerParam(id = "nthread", lower = 1L, tunable = FALSE),
       makeIntegerLearnerParam(id = "nrounds", default = 1L, lower = 1L),
       # FIXME nrounds seems to have no default in xgboost(), if it has 1, par.vals is redundant
@@ -38,7 +39,14 @@
       makeDiscreteLearnerParam(id = "sample_type", default = "uniform", values = c("uniform", "weighted"), requires = quote(booster == "dart")),
       makeDiscreteLearnerParam(id = "normalize_type", default = "tree", values = c("tree", "forest"), requires = quote(booster == "dart")),
       makeNumericLearnerParam(id = "rate_drop", default = 0, lower = 0, upper = 1, requires = quote(booster == "dart")),
-      makeNumericLearnerParam(id = "skip_drop", default = 0, lower = 0, upper = 1, requires = quote(booster == "dart"))
+      makeNumericLearnerParam(id = "skip_drop", default = 0, lower = 0, upper = 1, requires = quote(booster == "dart")),
+      # TODO: uncomment the following after the next CRAN update, and set max_depth's lower = 0L
+      #makeLogicalLearnerParam(id = "one_drop", default = FALSE, requires = quote(booster == "dart")),
+      #makeDiscreteLearnerParam(id = "tree_method", default = "exact", values = c("exact", "hist"), requires = quote(booster != "gblinear")),
+      #makeDiscreteLearnerParam(id = "grow_policy", default = "depthwise", values = c("depthwise", "lossguide"), requires = quote(tree_method == "hist")),
+      #makeIntegerLearnerParam(id = "max_leaves", default = 0L, lower = 0L, requires = quote(grow_policy == "lossguide")),
+      #makeIntegerLearnerParam(id = "max_bin", default = 256L, lower = 2L, requires = quote(tree_method == "hist")),
+      makeUntypedLearnerParam(id = "callbacks", default = list(), tunable = FALSE)
     ),
     par.vals = list(nrounds = 1L, verbose = 0L),
     properties = c("twoclass", "multiclass", "numerics", "prob", "weights", "missings", "featimp"),
@@ -54,8 +62,6 @@ trainLearner.classif.xgboost = function(.learner, .task, .subset, .weights = NUL
   td = getTaskDesc(.task)
   parlist = list(...)
 
-  parlist$data = data.matrix(getTaskData(.task, .subset, target.extra = TRUE)$data)
-  parlist$label = match(as.character(getTaskData(.task, .subset, target.extra = TRUE)$target), td$class.levels) - 1
   nc = length(td$class.levels)
 
   if (is.null(parlist$objective))
@@ -68,10 +74,17 @@ trainLearner.classif.xgboost = function(.learner, .task, .subset, .weights = NUL
   if (parlist$objective %in% c("multi:softprob", "multi:softmax"))
     parlist$num_class = nc
 
+  task.data = getTaskData(.task, .subset, target.extra = TRUE)
+  label = match(as.character(task.data$target), td$class.levels) - 1
+  parlist$data = xgboost::xgb.DMatrix(data = data.matrix(task.data$data), label = label)
+
   if (!is.null(.weights))
-    parlist$data = xgboost::xgb.DMatrix(data = parlist$data, label = parlist$label, weight = .weights)
+    xgboost::setinfo(parlist$data, "weight", .weights)
+
+  if (is.null(parlist$watchlist))
+    parlist$watchlist = list(train = parlist$data)
 
-  do.call(xgboost::xgboost, parlist)
+  do.call(xgboost::xgb.train, parlist)
 }
 
 #' @export
@@ -131,5 +144,3 @@ getFeatureImportanceLearner.classif.xgboost = function(.learner, .model, ...) {
   fiv = imp$Gain
   setNames(fiv, imp$Feature)
 }
-
-
diff --git a/R/RLearner_regr_xgboost.R b/R/RLearner_regr_xgboost.R
index cb7ea7f726..2c843840fc 100644
--- a/R/RLearner_regr_xgboost.R
+++ b/R/RLearner_regr_xgboost.R
@@ -7,7 +7,7 @@ makeRLearner.regr.xgboost = function() {
       # we pass all of what goes in 'params' directly to ... of xgboost
       #makeUntypedLearnerParam(id = "params", default = list()),
       makeDiscreteLearnerParam(id = "booster", default = "gbtree", values = c("gbtree", "gblinear", "dart")),
-      makeIntegerLearnerParam(id = "silent", default = 0L, tunable = FALSE),
+      makeUntypedLearnerParam(id = "watchlist", default = NULL, tunable = FALSE),
       makeNumericLearnerParam(id = "eta", default = 0.3, lower = 0, upper = 1),
       makeNumericLearnerParam(id = "gamma", default = 0, lower = 0),
       makeIntegerLearnerParam(id = "max_depth", default = 6L, lower = 1L),
@@ -16,16 +16,17 @@ makeRLearner.regr.xgboost = function() {
       makeNumericLearnerParam(id = "colsample_bytree", default = 1, lower = 0, upper = 1),
       makeNumericLearnerParam(id = "colsample_bylevel", default = 1, lower = 0, upper = 1),
       makeIntegerLearnerParam(id = "num_parallel_tree", default = 1L, lower = 1L),
-      makeNumericLearnerParam(id = "lambda", default = 0, lower = 0),
+      makeNumericLearnerParam(id = "lambda", default = 1, lower = 0),
       makeNumericLearnerParam(id = "lambda_bias", default = 0, lower = 0),
       makeNumericLearnerParam(id = "alpha", default = 0, lower = 0),
       makeUntypedLearnerParam(id = "objective", default = "reg:linear", tunable = FALSE),
       makeUntypedLearnerParam(id = "eval_metric", default = "rmse", tunable = FALSE),
       makeNumericLearnerParam(id = "base_score", default = 0.5, tunable = FALSE),
-
+      makeNumericLearnerParam(id = "max_delta_step", lower = 0, default = 0),
       makeNumericLearnerParam(id = "missing", default = NULL, tunable = FALSE, when = "both",
         special.vals = list(NA, NA_real_, NULL)),
       makeIntegerVectorLearnerParam(id = "monotone_constraints", default = 0, lower = -1, upper = 1),
+      makeNumericLearnerParam(id = "tweedie_variance_power", lower = 1, upper = 2, default = 1.5, requires = quote(objective == "reg:tweedie")),
       makeIntegerLearnerParam(id = "nthread", lower = 1L, tunable = FALSE),
       makeIntegerLearnerParam(id = "nrounds", default = 1L, lower = 1L),
       # FIXME nrounds seems to have no default in xgboost(), if it has 1, par.vals is redundant
@@ -35,9 +36,17 @@
         requires = quote(verbose == 1L)),
       makeIntegerLearnerParam(id = "early_stopping_rounds", default = NULL, lower = 1L, special.vals = list(NULL), tunable = FALSE),
       makeLogicalLearnerParam(id = "maximize", default = NULL, special.vals = list(NULL), tunable = FALSE),
+      makeDiscreteLearnerParam(id = "sample_type", default = "uniform", values = c("uniform", "weighted"), requires = quote(booster == "dart")),
       makeDiscreteLearnerParam(id = "normalize_type", default = "tree", values = c("tree", "forest"), requires = quote(booster == "dart")),
       makeNumericLearnerParam(id = "rate_drop", default = 0, lower = 0, upper = 1, requires = quote(booster == "dart")),
-      makeNumericLearnerParam(id = "skip_drop", default = 0, lower = 0, upper = 1, requires = quote(booster == "dart"))
+      makeNumericLearnerParam(id = "skip_drop", default = 0, lower = 0, upper = 1, requires = quote(booster == "dart")),
+      # TODO: uncomment the following after the next CRAN update, and set max_depth's lower = 0L
+      #makeLogicalLearnerParam(id = "one_drop", default = FALSE, requires = quote(booster == "dart")),
+      #makeDiscreteLearnerParam(id = "tree_method", default = "exact", values = c("exact", "hist"), requires = quote(booster != "gblinear")),
+      #makeDiscreteLearnerParam(id = "grow_policy", default = "depthwise", values = c("depthwise", "lossguide"), requires = quote(tree_method == "hist")),
+      #makeIntegerLearnerParam(id = "max_leaves", default = 0L, lower = 0L, requires = quote(grow_policy == "lossguide")),
+      #makeIntegerLearnerParam(id = "max_bin", default = 256L, lower = 2L, requires = quote(tree_method == "hist")),
+      makeUntypedLearnerParam(id = "callbacks", default = list(), tunable = FALSE)
     ),
     par.vals = list(nrounds = 1L, verbose = 0L),
     properties = c("numerics", "weights", "featimp", "missings"),
@@ -52,16 +61,19 @@ makeRLearner.regr.xgboost = function() {
 trainLearner.regr.xgboost = function(.learner, .task, .subset, .weights = NULL, ...) {
   parlist = list(...)
 
-  parlist$label = getTaskData(.task, .subset, target.extra = TRUE)$target
-  parlist$data = data.matrix(getTaskData(.task, .subset, target.extra = TRUE)$data)
-
   if (is.null(parlist$objective))
     parlist$objective = "reg:linear"
 
+  task.data = getTaskData(.task, .subset, target.extra = TRUE)
+  parlist$data = xgboost::xgb.DMatrix(data = data.matrix(task.data$data), label = task.data$target)
+
   if (!is.null(.weights))
-    parlist$data = xgboost::xgb.DMatrix(data = parlist$data, label = parlist$label, weight = .weights)
+    xgboost::setinfo(parlist$data, "weight", .weights)
+
+  if (is.null(parlist$watchlist))
+    parlist$watchlist = list(train = parlist$data)
 
-  do.call(xgboost::xgboost, parlist)
+  do.call(xgboost::xgb.train, parlist)
 }
 
 #' @export
diff --git a/tests/testthat/test_regr_xgboost.R b/tests/testthat/test_regr_xgboost.R
index 81bb36171e..aa5bf715a4 100644
--- a/tests/testthat/test_regr_xgboost.R
+++ b/tests/testthat/test_regr_xgboost.R
@@ -31,9 +31,9 @@ test_that("regr_xgboost", {
 })
 
 test_that("xgboost works with different 'missing' arg vals", {
-  lrn = makeLearner("classif.xgboost", missing = NA_real_)
-  lrn = makeLearner("classif.xgboost", missing = NA)
-  lrn = makeLearner("classif.xgboost", missing = NULL)
+  lrn = makeLearner("regr.xgboost", missing = NA_real_)
+  lrn = makeLearner("regr.xgboost", missing = NA)
+  lrn = makeLearner("regr.xgboost", missing = NULL)
 })
 
 

From 44f53fdffc4bab33f9d6c16e32e54a6f41b61fdb Mon Sep 17 00:00:00 2001
From: Vadim Khotilovich
Date: Wed, 21 Jun 2017 22:26:23 -0500
Subject: [PATCH 2/2] disable TODO linter

---
 tests/testthat/helper_lint.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/testthat/helper_lint.R b/tests/testthat/helper_lint.R
index 9a1824d444..46e0a8847a 100644
--- a/tests/testthat/helper_lint.R
+++ b/tests/testthat/helper_lint.R
@@ -266,7 +266,7 @@ if (isLintrVersionOk() && require("lintr", quietly = TRUE) && require("rex", qui
     seq = lintr::seq_linter,
     unneeded.concatenation = lintr::unneeded_concatenation_linter,
     trailing.whitespace = lintr::trailing_whitespace_linter,
-    todo.comment = lintr::todo_comment_linter(todo = "todo"), # is case-insensitive
+    #todo.comment = lintr::todo_comment_linter(todo = "todo"), # is case-insensitive
     spaces.inside = lintr::spaces_inside_linter,
     infix.spaces = infix.spaces.linter,
     object.naming = object.naming.linter)