Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update variance estimation methods for RF learners #1784

Merged
merged 18 commits into from
May 26, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,13 @@

- `classif.liquidSVM` and `regr.liquidSVM` have been removed because `liquidSVM` has been removed from CRAN.
- fixed a bug that caused an incorrect aggregation of probabilities in some cases. The bug had existed for quite some time and was exposed due to the change of `data.table`s default in `rbindlist()`. See #2578 for more information. (@mllg, #2579)
- `regr.randomForest` gains three new methods to estimate the standard error:
- `se.method = "jackknife"`
- `se.method = "bootstrap"`
- `se.method = "sd"`
See `?regr.randomForest` for more details.
  `regr.ranger` relies on the se estimation functions provided by the `ranger` package (`se.method = "jack"` or `se.method = "infjack"` (default))
(@jakob-r, #1784)

## learners - new
- add learner `cluster.MiniBatchKmeans` from package _ClusterR_ (@Prasiddhi, #2554)
Expand Down
64 changes: 36 additions & 28 deletions R/RLearner_regr_randomForest.R
Original file line number Diff line number Diff line change
Expand Up @@ -90,8 +90,8 @@ makeRLearner.regr.randomForest = function() {
trainLearner.regr.randomForest = function(.learner, .task, .subset, .weights = NULL, se.method = "sd", keep.inbag = NULL, se.boot = 50L, se.ntree = 100L, ...) {

data = getTaskData(.task, .subset, target.extra = TRUE)
m = randomForest::randomForest(x = data[["data"]], y = data[["target"]],
keep.inbag = if (is.null(keep.inbag)) TRUE else keep.inbag, ...)
if (is.null(keep.inbag)) keep.inbag = (se.method == "jackknife" && .learner$predict.type == "se")
m = randomForest::randomForest(x = data[["data"]], y = data[["target"]], keep.inbag = keep.inbag, ...)
if (.learner$predict.type == "se" && se.method == "bootstrap") {
base.lrn = setPredictType(.learner, "response")
base.lrn = setHyperPars(base.lrn, ntree = se.ntree)
Expand All @@ -108,24 +108,28 @@ predictLearner.regr.randomForest = function(.learner, .model, .newdata, se.metho
if (se.method == "bootstrap") {
pred = predict(.model$learner.model$single.model, newdata = .newdata, ...)
} else {
pred = predict(.model$learner.model, newdata = .newdata, ...)
pred = predict(.model$learner.model, newdata = .newdata, predict.all = (.learner$predict.type == "se"), ...)
}
if (.learner$predict.type == "se") {
se.fun = switch(se.method,
bootstrap = bootstrapStandardError,
jackknife = jackknifeStandardError,
sd = sdStandardError
)
se = se.fun(.learner, .model, .newdata, ...)
return(cbind(pred, se))
if (se.method == "bootstrap") {
se = bootstrapStandardError(.learner, .model, .newdata, ...)
return(cbind(pred, se))
} else if (se.method == "jackknife") {
se = jacknifeStandardError(
aggregated.predictions = pred$aggregate,
individual.predictions = pred$individual,
bag.counts = .model$learner.model$inbag)
} else if (se.method == "sd") {
se = sdStandardError(individual.predictions = pred$individual)
}
return(cbind(pred$aggregate, se))
} else {
return(pred)
}
}

#' @export
getOOBPredsLearner.regr.randomForest = function(.learner, .model) {
  # randomForest stores its out-of-bag predictions in the `predicted`
  # slot of the fitted model object; unwrap the mlr model to reach it.
  rf.model = getLearnerModel(.model, more.unwrap = TRUE)
  rf.model$predicted
}

Expand All @@ -152,7 +156,7 @@ bootstrapStandardError = function(.learner, .model, .newdata,
# )
# )
bias = rowSums(matrix(vapply(pred.boot.all, function(p) rowSums(p - rowMeans(p))^2, numeric(nrow(pred.boot.all[[1]]))), nrow = nrow(.newdata), ncol = se.boot, byrow = FALSE))
bist = ((1 / se.ntree) - (1 / ntree)) / (se.boot * se.ntree * (se.ntree - 1)) * bias
bias = ((1 / se.ntree) - (1 / ntree)) / (se.boot * se.ntree * (se.ntree - 1)) * bias
pred.boot.aggregated = extractSubList(pred.bagged, "aggregate")
pred.boot.aggregated = matrix(pred.boot.aggregated, nrow = nrow(.newdata), ncol = se.boot, byrow = FALSE)
var.boot = apply(pred.boot.aggregated, 1, var) - bias
Expand All @@ -161,33 +165,37 @@ bootstrapStandardError = function(.learner, .model, .newdata,
}

# Computes the mc bias-corrected jackknife after bootstrap
jackknifeStandardError = function(.learner, .model, .newdata, ...) {

model = .model$learner.model
model$inbag = model$inbag[rowSums(model$inbag == 0) > 0, , drop = FALSE]
n = nrow(model$inbag)
ntree = model$ntree
pred = predict(model, newdata = .newdata, predict.all = TRUE, ...)
oob = model$inbag == 0
jack.n = apply(oob, 1, function(x) rowMeans(pred$individual[, x, drop = FALSE]))
# @param aggregated.predictions `vector(n)`
# Vector of length n of predictions, aggregated over all individual predictions
# @param individual.predictions `matrix`
# The individual predictions. Each row represents one individual and each column represents the predictions of one base learner.
# @param bag.counts `matrix`
#   These are the inbag counts of the model. Each row represents an observation of the training set and each column represents one base learner.
# The number indicates how often this observation exists in the bootstrap sample for the respective base learner.
jacknifeStandardError = function(aggregated.predictions, individual.predictions, bag.counts) {

nbase = ncol(individual.predictions)
bag.counts = bag.counts[rowSums(bag.counts == 0) > 0, , drop = FALSE]
n = nrow(bag.counts)
oob = bag.counts == 0
jack.n = apply(oob, 1, function(x) rowMeans(individual.predictions[, x, drop = FALSE]))
if (is.vector(jack.n)) {
jack.n = t(as.matrix(jack.n))
}
jack = (n - 1) / n * rowSums((jack.n - pred$aggregate)^2)
bias = (exp(1) - 1) * n / ntree^2 * rowSums((pred$individual - pred$aggregate)^2)
jack = (n - 1) / n * rowSums((jack.n - aggregated.predictions)^2)
bias = (exp(1) - 1) * n / nbase^2 * rowSums((individual.predictions - aggregated.predictions)^2)
jab = pmax(jack - bias, 0)
sqrt(jab)
}

# computes the standard deviation across trees
sdStandardError = function(.learner, .model, .newdata, ...) {

pred = predict(.model$learner.model, newdata = .newdata, predict.all = TRUE, ...)
apply(pred$individual, 1, sd)
# Computes the standard error as the standard deviation across the base learners.
# @param individual.predictions `matrix`
#   Matrix of per-base-learner predictions: one row per observation to predict,
#   one column per base learner (tree).
sdStandardError = function(individual.predictions) {
  # Per-observation spread of the ensemble members' predictions.
  apply(individual.predictions, 1L, stats::sd)
}

#' @export
getFeatureImportanceLearner.regr.randomForest = function(.learner, .model, ...) {
  # Importance extraction is identical for regression and classification
  # random forests, so delegate to the classif implementation.
  getFeatureImportanceLearner.classif.randomForest(.learner = .learner, .model = .model, ...)
}
25 changes: 14 additions & 11 deletions R/RLearner_regr_ranger.R
Original file line number Diff line number Diff line change
Expand Up @@ -25,45 +25,48 @@ makeRLearner.regr.ranger = function() {
makeIntegerLearnerParam(id = "num.random.splits", lower = 1L, default = 1L, requires = quote(splitrule == "extratrees")),
makeNumericLearnerParam(id = "alpha", lower = 0L, upper = 1L, default = 0.5, requires = quote(splitrule == "maxstat")),
makeNumericLearnerParam(id = "minprop", lower = 0, upper = 0.5, default = 0.1, requires = quote(splitrule == "maxstat")),
makeLogicalLearnerParam(id = "keep.inbag", default = FALSE, tunable = FALSE)
makeLogicalLearnerParam(id = "keep.inbag", default = FALSE, tunable = FALSE),
makeDiscreteLearnerParam(id = "se.method", default = "infjack", values = c("jack", "infjack"), requires = quote(keep.inbag == TRUE), when = "predict")
),
par.vals = list(num.threads = 1L, verbose = FALSE, respect.unordered.factors = "order"),
properties = c("numerics", "factors", "ordered", "oobpreds", "featimp", "se", "weights"),
name = "Random Forests",
short.name = "ranger",
note = "By default, internal parallelization is switched off (`num.threads = 1`), `verbose` output is disabled, `respect.unordered.factors` is set to `order` for all splitrules.",
note = "By default, internal parallelization is switched off (`num.threads = 1`), `verbose` output is disabled, `respect.unordered.factors` is set to `order` for all splitrules. All settings are changeable. `mtry.perc` sets `mtry` to `mtry.perc*getTaskNFeats(.task)`. Default for `mtry` is the floor of square root of number of features in task. SE estimation is provided by the ranger package (`se.method`: \"jack\" or \"infjack\" (default)); see `?regr.randomForest` for more details.",
callees = "ranger"
)
}

#' @export
trainLearner.regr.ranger = function(.learner, .task, .subset, .weights = NULL, ...) {
trainLearner.regr.ranger = function(.learner, .task, .subset, .weights = NULL, keep.inbag = NULL, ...) {

tn = getTaskTargetNames(.task)
if (is.null(keep.inbag)) keep.inbag = (.learner$predict.type == "se") # needed for jacknife and infjack!
ranger::ranger(formula = NULL, dependent.variable = tn, data = getTaskData(.task, .subset),
case.weights = .weights, ...)
case.weights = .weights, keep.inbag = keep.inbag, ...)
}

#' @export
predictLearner.regr.ranger = function(.learner, .model, .newdata, ...) {
predictLearner.regr.ranger = function(.learner, .model, .newdata, se.method = "sd", ...) {

type = if (.learner$predict.type == "se") "se" else "response"
p = predict(object = .model$learner.model, data = .newdata, type = type, ...)
pred = predict(object = .model$learner.model, data = .newdata, type = ifelse(.learner$predict.type == "se", "se", "response"), ...)
p = pred$predictions
if (is.matrix(p)) { #if someone set predict.all = TRUE for ranger
p = rowMeans(pred$predictions)
}
if (.learner$predict.type == "se") {
return(cbind(p$predictions, p$se))
return(cbind(p, pred$se))
} else {
return(p$predictions)
return(p)
}
}

#' @export
getOOBPredsLearner.regr.ranger = function(.learner, .model) {
  # ranger keeps its out-of-bag predictions in the `predictions` slot
  # of the fitted model object; unwrap the mlr model to reach it.
  ranger.model = getLearnerModel(.model, more.unwrap = TRUE)
  ranger.model$predictions
}

#' @export
getFeatureImportanceLearner.regr.ranger = function(.learner, .model, ...) {
  # Importance extraction is identical for regression and classification
  # ranger forests, so delegate to the classif implementation.
  getFeatureImportanceLearner.classif.ranger(.learner = .learner, .model = .model, ...)
}
2 changes: 1 addition & 1 deletion docs/articles/tutorial/integrated_learners.html

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

11 changes: 11 additions & 0 deletions docs/news/index.html

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

22 changes: 15 additions & 7 deletions tests/testthat/test_regr_ranger.R
Original file line number Diff line number Diff line change
Expand Up @@ -23,25 +23,33 @@ test_that("regr_ranger", {
testSimpleParsets("regr.ranger", regr.df, regr.target, regr.train.inds, old.predicts.list, parset.list)
})

test_that("regr_ranger se", {

test_that("se with se.method = sd", {
requirePackagesOrSkip("ranger", default.method = "load")

parset.list = list(
list(keep.inbag = TRUE),
list(num.trees = 100, keep.inbag = TRUE),
list(num.trees = 250, mtry = 4, keep.inbag = TRUE),
list(num.trees = 500, min.node.size = 2, keep.inbag = TRUE)
list(),
list(num.trees = 100),
list(num.trees = 100, se.method = "jack"),
list(num.trees = 250, mtry = 4),
list(num.trees = 500, min.node.size = 2)
)

old.predicts.list = list()

parset.only.for.predict = "se.method"

for (i in seq_along(parset.list)) {
parset = parset.list[[i]]
parset.list[[i]] = c(parset, predict.type = "se")
parset = c(parset, list(data = regr.train, formula = regr.formula, respect.unordered.factors = "order"))
parset = c(parset, list(data = regr.train, formula = regr.formula, keep.inbag = TRUE))
parset = dropNamed(parset, parset.only.for.predict)
set.seed(getOption("mlr.debug.seed"))
m = do.call(ranger::ranger, parset)
set.seed(getOption("mlr.debug.seed"))
p = predict(m, data = regr.test, type = "se")
predict.parset = parset[parset.only.for.predict]
predict.parset = c(list(object = m, data = regr.test, type = "se"), predict.parset)
p = do.call(predict, predict.parset)
old.predicts.list[[i]] = cbind(p$predictions, p$se)
}

Expand Down