From 12c6b7ceeabdb277af1d100acd42ecdb5229ad85 Mon Sep 17 00:00:00 2001
From: david-cortes
Date: Mon, 26 Aug 2024 22:16:36 +0200
Subject: [PATCH] [R] Remove demos (#10750)

---
 R-package/demo/00Index                    |  14 ---
 R-package/demo/README.md                  |  19 ----
 R-package/demo/basic_walkthrough.R        | 113 ---------------------
 R-package/demo/boost_from_prediction.R    |  26 -----
 R-package/demo/create_sparse_matrix.R     | 117 ----------------------
 R-package/demo/cross_validation.R         |  51 ----------
 R-package/demo/custom_objective.R         |  65 ------------
 R-package/demo/early_stopping.R           |  40 --------
 R-package/demo/generalized_linear_model.R |  33 ------
 R-package/demo/gpu_accelerated.R          |  45 ---------
 R-package/demo/interaction_constraints.R  | 113 ---------------------
 R-package/demo/poisson_regression.R       |   6 --
 R-package/demo/predict_first_ntree.R      |  23 -----
 R-package/demo/predict_leaf_indices.R     |  54 ----------
 R-package/demo/runall.R                   |  13 ---
 R-package/demo/tweedie_regression.R       |  49 ---------
 tests/ci_build/test_r_package.py          |   3 -
 17 files changed, 784 deletions(-)
 delete mode 100644 R-package/demo/00Index
 delete mode 100644 R-package/demo/README.md
 delete mode 100644 R-package/demo/basic_walkthrough.R
 delete mode 100644 R-package/demo/boost_from_prediction.R
 delete mode 100644 R-package/demo/create_sparse_matrix.R
 delete mode 100644 R-package/demo/cross_validation.R
 delete mode 100644 R-package/demo/custom_objective.R
 delete mode 100644 R-package/demo/early_stopping.R
 delete mode 100644 R-package/demo/generalized_linear_model.R
 delete mode 100644 R-package/demo/gpu_accelerated.R
 delete mode 100644 R-package/demo/interaction_constraints.R
 delete mode 100644 R-package/demo/poisson_regression.R
 delete mode 100644 R-package/demo/predict_first_ntree.R
 delete mode 100644 R-package/demo/predict_leaf_indices.R
 delete mode 100644 R-package/demo/runall.R
 delete mode 100644 R-package/demo/tweedie_regression.R

diff --git a/R-package/demo/00Index b/R-package/demo/00Index
deleted file mode 100644
index fa09fa900486..000000000000
--- a/R-package/demo/00Index
+++ /dev/null
@@ -1,14 +0,0 @@
-basic_walkthrough          Basic feature walkthrough
-custom_objective           Customize loss function, and evaluation metric
-boost_from_prediction      Boosting from existing prediction
-predict_first_ntree        Predicting using first n trees
-generalized_linear_model   Generalized Linear Model
-cross_validation           Cross validation
-create_sparse_matrix       Create Sparse Matrix
-predict_leaf_indices       Predicting the corresponding leaves
-early_stopping             Early Stop in training
-poisson_regression         Poisson regression on count data
-tweedie_regression         Tweedie regression
-gpu_accelerated            GPU-accelerated tree building algorithms
-interaction_constraints    Interaction constraints among features
-
diff --git a/R-package/demo/README.md b/R-package/demo/README.md
deleted file mode 100644
index 99a492230d45..000000000000
--- a/R-package/demo/README.md
+++ /dev/null
@@ -1,19 +0,0 @@
-XGBoost R Feature Walkthrough
-====
-* [Basic walkthrough of wrappers](basic_walkthrough.R)
-* [Customize loss function, and evaluation metric](custom_objective.R)
-* [Boosting from existing prediction](boost_from_prediction.R)
-* [Predicting using first n trees](predict_first_ntree.R)
-* [Generalized Linear Model](generalized_linear_model.R)
-* [Cross validation](cross_validation.R)
-* [Create a sparse matrix from a dense one](create_sparse_matrix.R)
-* [Use GPU-accelerated tree building algorithms](gpu_accelerated.R)
-
-Benchmarks
-====
-* [Starter script for Kaggle Higgs Boson](../../demo/kaggle-higgs)
-
-Notes
-====
-*
Contribution of examples, benchmarks is more than welcomed! -* If you like to share how you use xgboost to solve your problem, send a pull request :) diff --git a/R-package/demo/basic_walkthrough.R b/R-package/demo/basic_walkthrough.R deleted file mode 100644 index c65790109fc2..000000000000 --- a/R-package/demo/basic_walkthrough.R +++ /dev/null @@ -1,113 +0,0 @@ -require(xgboost) -require(methods) - -# we load in the agaricus dataset -# In this example, we are aiming to predict whether a mushroom is edible -data(agaricus.train, package = 'xgboost') -data(agaricus.test, package = 'xgboost') -train <- agaricus.train -test <- agaricus.test -# the loaded data is stored in sparseMatrix, and label is a numeric vector in {0,1} -class(train$label) -class(train$data) - -#-------------Basic Training using XGBoost----------------- -# this is the basic usage of xgboost you can put matrix in data field -# note: we are putting in sparse matrix here, xgboost naturally handles sparse input -# use sparse matrix when your feature is sparse(e.g. when you are using one-hot encoding vector) -print("Training xgboost with sparseMatrix") -bst <- xgboost(x = train$data, y = factor(train$label, c(0, 1)), - params = list(max_depth = 2, eta = 1), - nrounds = 2, nthread = 2) -# alternatively, you can put in dense matrix, i.e. basic R-matrix -print("Training xgboost with Matrix") -bst <- xgboost(x = as.matrix(train$data), y = factor(train$label, c(0, 1)), - params = list(max_depth = 2, eta = 1), - nrounds = 2, nthread = 2) - -# you can also put in xgb.DMatrix object, which stores label, data and other meta datas needed for advanced features -print("Training xgboost with xgb.DMatrix") -dtrain <- xgb.DMatrix(data = train$data, label = train$label) -params <- list(max_depth = 2, eta = 1, nthread = 2, objective = "binary:logistic") -bst <- xgb.train(data = dtrain, params = params, nrounds = 2) - -# Verbose = 0,1,2 -print("Train xgboost with verbose 0, no message") -bst <- xgb.train(data = dtrain, params = params, nrounds = 2, verbose = 0) -print("Train xgboost with verbose 1, print evaluation metric") -bst <- xgb.train(data = dtrain, params = params, nrounds = 2, verbose = 1) -print("Train xgboost with verbose 2, also print information about tree") -bst <- xgb.train(data = dtrain, params = params, nrounds = 2, verbose = 2) - -# you can also specify data as file path to a LIBSVM format input -# since we do not have this file with us, the following line is just for illustration -# bst <- xgboost(data = 'agaricus.train.svm', max_depth = 2, eta = 1, nrounds = 2,objective = "binary:logistic") - -#--------------------basic prediction using xgboost-------------- -# you can do prediction using the following line -# you can put in Matrix, sparseMatrix, or xgb.DMatrix -pred <- predict(bst, test$data) -err <- mean(as.numeric(pred > 0.5) != test$label) -print(paste("test-error=", err)) - -#-------------------save and load models------------------------- -# save model to binary local file -xgb.save(bst, "xgboost.model") -# load binary model to R -# Function doesn't take 'nthreads', but can be set like this: -RhpcBLASctl::omp_set_num_threads(1) -bst2 <- xgb.load("xgboost.model") -pred2 <- predict(bst2, test$data) -# pred2 should be identical to pred -print(paste("sum(abs(pred2-pred))=", sum(abs(pred2 - pred)))) - -# save model to R's raw vector -raw <- xgb.save.raw(bst) -# load binary model to R -bst3 <- xgb.load.raw(raw) -pred3 <- predict(bst3, test$data) -# pred3 should be identical to pred -print(paste("sum(abs(pred3-pred))=", 
sum(abs(pred3 - pred)))) - -#----------------Advanced features -------------- -# to use advanced features, we need to put data in xgb.DMatrix -dtrain <- xgb.DMatrix(data = train$data, label = train$label) -dtest <- xgb.DMatrix(data = test$data, label = test$label) -#---------------Using an evaluation set---------------- -# 'evals' is a list of xgb.DMatrix, each of them is tagged with name -evals <- list(train = dtrain, test = dtest) -# to train with an evaluation set, use xgb.train, which contains more advanced features -# 'evals' argument allows us to monitor the evaluation result on all data in the list -print("Train xgboost using xgb.train with evaluation data") -bst <- xgb.train(data = dtrain, max_depth = 2, eta = 1, nrounds = 2, evals = evals, - nthread = 2, objective = "binary:logistic") -# we can change evaluation metrics, or use multiple evaluation metrics -print("train xgboost using xgb.train with evaluation data, watch logloss and error") -bst <- xgb.train(data = dtrain, max_depth = 2, eta = 1, nrounds = 2, evals = evals, - eval_metric = "error", eval_metric = "logloss", - nthread = 2, objective = "binary:logistic") - -# xgb.DMatrix can also be saved using xgb.DMatrix.save -xgb.DMatrix.save(dtrain, "dtrain.buffer") -# to load it in, simply call xgb.DMatrix -dtrain2 <- xgb.DMatrix("dtrain.buffer") -bst <- xgb.train(data = dtrain2, max_depth = 2, eta = 1, nrounds = 2, evals = evals, - nthread = 2, objective = "binary:logistic") -# information can be extracted from xgb.DMatrix using getinfo -label <- getinfo(dtest, "label") -pred <- predict(bst, dtest) -err <- as.numeric(sum(as.integer(pred > 0.5) != label)) / length(label) -print(paste("test-error=", err)) - -# You can dump the tree you learned using xgb.dump into a text file -dump_path <- file.path(tempdir(), 'dump.raw.txt') -xgb.dump(bst, dump_path, with_stats = TRUE) - -# Finally, you can check which features are the most important. 
-print("Most important features (look at column Gain):") -imp_matrix <- xgb.importance(feature_names = colnames(train$data), model = bst) -print(imp_matrix) - -# Feature importance bar plot by gain -print("Feature importance Plot : ") -print(xgb.plot.importance(importance_matrix = imp_matrix)) diff --git a/R-package/demo/boost_from_prediction.R b/R-package/demo/boost_from_prediction.R deleted file mode 100644 index 75af70dba0d7..000000000000 --- a/R-package/demo/boost_from_prediction.R +++ /dev/null @@ -1,26 +0,0 @@ -require(xgboost) -# load in the agaricus dataset -data(agaricus.train, package = 'xgboost') -data(agaricus.test, package = 'xgboost') -dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label) -dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label) - -evals <- list(eval = dtest, train = dtrain) -### -# advanced: start from a initial base prediction -# -print('start running example to start from a initial prediction') -# train xgboost for 1 round -param <- list(max_depth = 2, eta = 1, nthread = 2, objective = 'binary:logistic') -bst <- xgb.train(param, dtrain, 1, evals) -# Note: we need the margin value instead of transformed prediction in set_base_margin -# do predict with output_margin=TRUE, will always give you margin values before logistic transformation -ptrain <- predict(bst, dtrain, outputmargin = TRUE) -ptest <- predict(bst, dtest, outputmargin = TRUE) -# set the base_margin property of dtrain and dtest -# base margin is the base prediction we will boost from -setinfo(dtrain, "base_margin", ptrain) -setinfo(dtest, "base_margin", ptest) - -print('this is result of boost from initial prediction') -bst <- xgb.train(params = param, data = dtrain, nrounds = 1, evals = evals) diff --git a/R-package/demo/create_sparse_matrix.R b/R-package/demo/create_sparse_matrix.R deleted file mode 100644 index 08a40608cdf8..000000000000 --- a/R-package/demo/create_sparse_matrix.R +++ /dev/null @@ -1,117 +0,0 @@ -require(xgboost) -require(Matrix) -require(data.table) -if (!require(vcd)) { - install.packages('vcd') #Available in CRAN. Used for its dataset with categorical values. - require(vcd) -} -# According to its documentation, XGBoost works only on numbers. -# Sometimes the dataset we have to work on have categorical data. -# A categorical variable is one which have a fixed number of values. -# By example, if for each observation a variable called "Colour" can have only -# "red", "blue" or "green" as value, it is a categorical variable. -# -# In R, categorical variable is called Factor. -# Type ?factor in console for more information. -# -# In this demo we will see how to transform a dense dataframe with categorical variables to a sparse matrix -# before analyzing it in XGBoost. -# The method we are going to see is usually called "one hot encoding". - -#load Arthritis dataset in memory. -data(Arthritis) - -# create a copy of the dataset with data.table package -# (data.table is 100% compliant with R dataframe but its syntax is a lot more consistent -# and its performance are really good). -df <- data.table(Arthritis, keep.rownames = FALSE) - -# Let's have a look to the data.table -cat("Print the dataset\n") -print(df) - -# 2 columns have factor type, one has ordinal type -# (ordinal variable is a categorical variable with values which can be ordered, here: None > Some > Marked). -cat("Structure of the dataset\n") -str(df) - -# Let's add some new categorical features to see if it helps. -# Of course these feature are highly correlated to the Age feature. 
-# Usually it's not a good thing in ML, but Tree algorithms (including boosted trees) are able to select the best features, -# even in case of highly correlated features. - -# For the first feature we create groups of age by rounding the real age. -# Note that we transform it to factor (categorical data) so the algorithm treat them as independent values. -df[, AgeDiscret := as.factor(round(Age / 10, 0))] - -# Here is an even stronger simplification of the real age with an arbitrary split at 30 years old. -# I choose this value based on nothing. -# We will see later if simplifying the information based on arbitrary values is a good strategy -# (I am sure you already have an idea of how well it will work!). -df[, AgeCat := as.factor(ifelse(Age > 30, "Old", "Young"))] - -# We remove ID as there is nothing to learn from this feature (it will just add some noise as the dataset is small). -df[, ID := NULL] - -# List the different values for the column Treatment: Placebo, Treated. -cat("Values of the categorical feature Treatment\n") -print(levels(df[, Treatment])) - -# Next step, we will transform the categorical data to dummy variables. -# This method is also called one hot encoding. -# The purpose is to transform each value of each categorical feature in one binary feature. -# -# Let's take, the column Treatment will be replaced by two columns, Placebo, and Treated. -# Each of them will be binary. -# For example an observation which had the value Placebo in column Treatment before the transformation will have, after the transformation, -# the value 1 in the new column Placebo and the value 0 in the new column Treated. -# -# Formulae Improved~.-1 used below means transform all categorical features but column Improved to binary values. -# Column Improved is excluded because it will be our output column, the one we want to predict. -sparse_matrix <- sparse.model.matrix(Improved ~ . - 1, data = df) - -cat("Encoding of the sparse Matrix\n") -print(sparse_matrix) - -# Create the output vector (not sparse) -# 1. Set, for all rows, field in Y column to 0; -# 2. set Y to 1 when Improved == Marked; -# 3. Return Y column -output_vector <- df[, Y := 0][Improved == "Marked", Y := 1][, Y] - -# Following is the same process as other demo -cat("Learning...\n") -bst <- xgb.train(data = xgb.DMatrix(sparse_matrix, label = output_vector), max_depth = 9, - eta = 1, nthread = 2, nrounds = 10, objective = "binary:logistic") - -importance <- xgb.importance(feature_names = colnames(sparse_matrix), model = bst) -print(importance) -# According to the matrix below, the most important feature in this dataset to predict if the treatment will work is the Age. -# The second most important feature is having received a placebo or not. -# The sex is third. -# Then we see our generated features (AgeDiscret). We can see that their contribution is very low (Gain column). - -# Does these result make sense? -# Let's check some Chi2 between each of these features and the outcome. - -print(chisq.test(df$Age, df$Y)) -# Pearson correlation between Age and illness disappearing is 35 - -print(chisq.test(df$AgeDiscret, df$Y)) -# Our first simplification of Age gives a Pearson correlation of 8. - -print(chisq.test(df$AgeCat, df$Y)) -# The perfectly random split I did between young and old at 30 years old have a low correlation of 2. 
-# It's a result we may expect as may be in my mind > 30 years is being old (I am 32 and starting feeling old, this may explain that), -# but for the illness we are studying, the age to be vulnerable is not the same. -# Don't let your "gut" lower the quality of your model. In "data science", there is science :-) - -# As you can see, in general destroying information by simplifying it won't improve your model. -# Chi2 just demonstrates that. -# But in more complex cases, creating a new feature based on existing one which makes link with the outcome -# more obvious may help the algorithm and improve the model. -# The case studied here is not enough complex to show that. Check Kaggle forum for some challenging datasets. -# However it's almost always worse when you add some arbitrary rules. -# Moreover, you can notice that even if we have added some not useful new features highly correlated with -# other features, the boosting tree algorithm have been able to choose the best one, which in this case is the Age. -# Linear model may not be that strong in these scenario. diff --git a/R-package/demo/cross_validation.R b/R-package/demo/cross_validation.R deleted file mode 100644 index cf048c5ed600..000000000000 --- a/R-package/demo/cross_validation.R +++ /dev/null @@ -1,51 +0,0 @@ -require(xgboost) -# load in the agaricus dataset -data(agaricus.train, package = 'xgboost') -data(agaricus.test, package = 'xgboost') -dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label) -dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label) - -nrounds <- 2 -param <- list(max_depth = 2, eta = 1, nthread = 2, objective = 'binary:logistic') - -cat('running cross validation\n') -# do cross validation, this will print result out as -# [iteration] metric_name:mean_value+std_value -# std_value is standard deviation of the metric -xgb.cv(param, dtrain, nrounds, nfold = 5, metrics = 'error') - -cat('running cross validation, disable standard deviation display\n') -# do cross validation, this will print result out as -# [iteration] metric_name:mean_value+std_value -# std_value is standard deviation of the metric -xgb.cv(param, dtrain, nrounds, nfold = 5, - metrics = 'error', showsd = FALSE) - -### -# you can also do cross validation with customized loss function -# See custom_objective.R -## -print('running cross validation, with customized loss function') - -logregobj <- function(preds, dtrain) { - labels <- getinfo(dtrain, "label") - preds <- 1 / (1 + exp(-preds)) - grad <- preds - labels - hess <- preds * (1 - preds) - return(list(grad = grad, hess = hess)) -} -evalerror <- function(preds, dtrain) { - labels <- getinfo(dtrain, "label") - err <- as.numeric(sum(labels != (preds > 0))) / length(labels) - return(list(metric = "error", value = err)) -} - -param <- list(max_depth = 2, eta = 1, - objective = logregobj, eval_metric = evalerror) -# train with customized objective -xgb.cv(params = param, data = dtrain, nrounds = nrounds, nfold = 5) - -# do cross validation with prediction values for each fold -res <- xgb.cv(params = param, data = dtrain, nrounds = nrounds, nfold = 5, prediction = TRUE) -res$evaluation_log -length(res$pred) diff --git a/R-package/demo/custom_objective.R b/R-package/demo/custom_objective.R deleted file mode 100644 index 03d7b346471b..000000000000 --- a/R-package/demo/custom_objective.R +++ /dev/null @@ -1,65 +0,0 @@ -require(xgboost) -# load in the agaricus dataset -data(agaricus.train, package = 'xgboost') -data(agaricus.test, package = 'xgboost') -dtrain <- 
xgb.DMatrix(agaricus.train$data, label = agaricus.train$label) -dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label) - -# note: for customized objective function, we leave objective as default -# note: what we are getting is margin value in prediction -# you must know what you are doing -evals <- list(eval = dtest, train = dtrain) -num_round <- 2 - -# user define objective function, given prediction, return gradient and second order gradient -# this is log likelihood loss -logregobj <- function(preds, dtrain) { - labels <- getinfo(dtrain, "label") - preds <- 1 / (1 + exp(-preds)) - grad <- preds - labels - hess <- preds * (1 - preds) - return(list(grad = grad, hess = hess)) -} - -# user defined evaluation function, return a pair metric_name, result -# NOTE: when you do customized loss function, the default prediction value is margin -# this may make builtin evaluation metric not function properly -# for example, we are doing logistic loss, the prediction is score before logistic transformation -# the builtin evaluation error assumes input is after logistic transformation -# Take this in mind when you use the customization, and maybe you need write customized evaluation function -evalerror <- function(preds, dtrain) { - labels <- getinfo(dtrain, "label") - err <- as.numeric(sum(labels != (preds > 0))) / length(labels) - return(list(metric = "error", value = err)) -} - -param <- list(max_depth = 2, eta = 1, nthread = 2, verbosity = 0, - objective = logregobj, eval_metric = evalerror) -print('start training with user customized objective') -# training with customized objective, we can also do step by step training -# simply look at xgboost.py's implementation of train -bst <- xgb.train(param, dtrain, num_round, evals) - -# -# there can be cases where you want additional information -# being considered besides the property of DMatrix you can get by getinfo -# you can set additional information as attributes if DMatrix - -# set label attribute of dtrain to be label, we use label as an example, it can be anything -attr(dtrain, 'label') <- getinfo(dtrain, 'label') -# this is new customized objective, where you can access things you set -# same thing applies to customized evaluation function -logregobjattr <- function(preds, dtrain) { - # now you can access the attribute in customized function - labels <- attr(dtrain, 'label') - preds <- 1 / (1 + exp(-preds)) - grad <- preds - labels - hess <- preds * (1 - preds) - return(list(grad = grad, hess = hess)) -} -param <- list(max_depth = 2, eta = 1, nthread = 2, verbosity = 0, - objective = logregobjattr, eval_metric = evalerror) -print('start training with user customized objective, with additional attributes in DMatrix') -# training with customized objective, we can also do step by step training -# simply look at xgboost.py's implementation of train -bst <- xgb.train(param, dtrain, num_round, evals) diff --git a/R-package/demo/early_stopping.R b/R-package/demo/early_stopping.R deleted file mode 100644 index 057440882567..000000000000 --- a/R-package/demo/early_stopping.R +++ /dev/null @@ -1,40 +0,0 @@ -require(xgboost) -# load in the agaricus dataset -data(agaricus.train, package = 'xgboost') -data(agaricus.test, package = 'xgboost') -dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label) -dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label) -# note: for customized objective function, we leave objective as default -# note: what we are getting is margin value in prediction -# you must know what you 
are doing -param <- list(max_depth = 2, eta = 1, nthread = 2, verbosity = 0) -evals <- list(eval = dtest) -num_round <- 20 -# user define objective function, given prediction, return gradient and second order gradient -# this is log likelihood loss -logregobj <- function(preds, dtrain) { - labels <- getinfo(dtrain, "label") - preds <- 1 / (1 + exp(-preds)) - grad <- preds - labels - hess <- preds * (1 - preds) - return(list(grad = grad, hess = hess)) -} -# user defined evaluation function, return a pair metric_name, result -# NOTE: when you do customized loss function, the default prediction value is margin -# this may make builtin evaluation metric not function properly -# for example, we are doing logistic loss, the prediction is score before logistic transformation -# the builtin evaluation error assumes input is after logistic transformation -# Take this in mind when you use the customization, and maybe you need write customized evaluation function -evalerror <- function(preds, dtrain) { - labels <- getinfo(dtrain, "label") - err <- as.numeric(sum(labels != (preds > 0))) / length(labels) - return(list(metric = "error", value = err)) -} -print('start training with early Stopping setting') - -bst <- xgb.train(param, dtrain, num_round, evals, - objective = logregobj, eval_metric = evalerror, maximize = FALSE, - early_stopping_round = 3) -bst <- xgb.cv(param, dtrain, num_round, nfold = 5, - objective = logregobj, eval_metric = evalerror, - maximize = FALSE, early_stopping_rounds = 3) diff --git a/R-package/demo/generalized_linear_model.R b/R-package/demo/generalized_linear_model.R deleted file mode 100644 index d29a6dc5be58..000000000000 --- a/R-package/demo/generalized_linear_model.R +++ /dev/null @@ -1,33 +0,0 @@ -require(xgboost) -# load in the agaricus dataset -data(agaricus.train, package = 'xgboost') -data(agaricus.test, package = 'xgboost') -dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label) -dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label) -## -# this script demonstrate how to fit generalized linear model in xgboost -# basically, we are using linear model, instead of tree for our boosters -# you can fit a linear regression, or logistic regression model -## - -# change booster to gblinear, so that we are fitting a linear model -# alpha is the L1 regularizer -# lambda is the L2 regularizer -# you can also set lambda_bias which is L2 regularizer on the bias term -param <- list(objective = "binary:logistic", booster = "gblinear", - nthread = 2, alpha = 0.0001, lambda = 1) - -# normally, you do not need to set eta (step_size) -# XGBoost uses a parallel coordinate descent algorithm (shotgun), -# there could be affection on convergence with parallelization on certain cases -# setting eta to be smaller value, e.g 0.5 can make the optimization more stable - -## -# the rest of settings are the same -## -evals <- list(eval = dtest, train = dtrain) -num_round <- 2 -bst <- xgb.train(param, dtrain, num_round, evals) -ypred <- predict(bst, dtest) -labels <- getinfo(dtest, 'label') -cat('error of preds=', mean(as.numeric(ypred > 0.5) != labels), '\n') diff --git a/R-package/demo/gpu_accelerated.R b/R-package/demo/gpu_accelerated.R deleted file mode 100644 index 617a63e74542..000000000000 --- a/R-package/demo/gpu_accelerated.R +++ /dev/null @@ -1,45 +0,0 @@ -# An example of using GPU-accelerated tree building algorithms -# -# NOTE: it can only run if you have a CUDA-enable GPU and the package was -# specially compiled with GPU support. 
-# -# For the current functionality, see -# https://xgboost.readthedocs.io/en/latest/gpu/index.html -# - -library('xgboost') - -# Simulate N x p random matrix with some binomial response dependent on pp columns -set.seed(111) -N <- 1000000 -p <- 50 -pp <- 25 -X <- matrix(runif(N * p), ncol = p) -betas <- 2 * runif(pp) - 1 -sel <- sort(sample(p, pp)) -m <- X[, sel] %*% betas - 1 + rnorm(N) -y <- rbinom(N, 1, plogis(m)) - -tr <- sample.int(N, N * 0.75) -dtrain <- xgb.DMatrix(X[tr, ], label = y[tr]) -dtest <- xgb.DMatrix(X[-tr, ], label = y[-tr]) -evals <- list(train = dtrain, test = dtest) - -# An example of running 'gpu_hist' algorithm -# which is -# - similar to the 'hist' -# - the fastest option for moderately large datasets -# - current limitations: max_depth < 16, does not implement guided loss -# You can use tree_method = 'gpu_hist' for another GPU accelerated algorithm, -# which is slower, more memory-hungry, but does not use binning. -param <- list(objective = 'reg:logistic', eval_metric = 'auc', subsample = 0.5, nthread = 4, - max_bin = 64, tree_method = 'gpu_hist') -pt <- proc.time() -bst_gpu <- xgb.train(param, dtrain, evals = evals, nrounds = 50) -proc.time() - pt - -# Compare to the 'hist' algorithm: -param$tree_method <- 'hist' -pt <- proc.time() -bst_hist <- xgb.train(param, dtrain, evals = evals, nrounds = 50) -proc.time() - pt diff --git a/R-package/demo/interaction_constraints.R b/R-package/demo/interaction_constraints.R deleted file mode 100644 index 72287513eeeb..000000000000 --- a/R-package/demo/interaction_constraints.R +++ /dev/null @@ -1,113 +0,0 @@ -library(xgboost) -library(data.table) - -set.seed(1024) - -# Function to obtain a list of interactions fitted in trees, requires input of maximum depth -treeInteractions <- function(input_tree, input_max_depth) { - ID_merge <- i.id <- i.feature <- NULL # Suppress warning "no visible binding for global variable" - - trees <- data.table::copy(input_tree) # copy tree input to prevent overwriting - if (input_max_depth < 2) return(list()) # no interactions if max depth < 2 - if (nrow(input_tree) == 1) return(list()) - - # Attach parent nodes - for (i in 2:input_max_depth) { - if (i == 2) trees[, ID_merge := ID] else trees[, ID_merge := get(paste0('parent_', i - 2))] - parents_left <- trees[!is.na(Split), list(i.id = ID, i.feature = Feature, ID_merge = Yes)] - parents_right <- trees[!is.na(Split), list(i.id = ID, i.feature = Feature, ID_merge = No)] - - data.table::setorderv(trees, 'ID_merge') - data.table::setorderv(parents_left, 'ID_merge') - data.table::setorderv(parents_right, 'ID_merge') - - trees <- merge(trees, parents_left, by = 'ID_merge', all.x = TRUE) - trees[!is.na(i.id), c(paste0('parent_', i - 1), paste0('parent_feat_', i - 1)) - := list(i.id, i.feature)] - trees[, c('i.id', 'i.feature') := NULL] - - trees <- merge(trees, parents_right, by = 'ID_merge', all.x = TRUE) - trees[!is.na(i.id), c(paste0('parent_', i - 1), paste0('parent_feat_', i - 1)) - := list(i.id, i.feature)] - trees[, c('i.id', 'i.feature') := NULL] - } - - # Extract nodes with interactions - interaction_trees <- trees[!is.na(Split) & !is.na(parent_1), # nolint: object_usage_linter - c('Feature', paste0('parent_feat_', 1:(input_max_depth - 1))), - with = FALSE] - interaction_trees_split <- split(interaction_trees, seq_len(nrow(interaction_trees))) - interaction_list <- lapply(interaction_trees_split, as.character) - - # Remove NAs (no parent interaction) - interaction_list <- lapply(interaction_list, function(x) x[!is.na(x)]) - - # Remove 
non-interactions (same variable) - interaction_list <- lapply(interaction_list, unique) # remove same variables - interaction_length <- lengths(interaction_list) - interaction_list <- interaction_list[interaction_length > 1] - interaction_list <- unique(lapply(interaction_list, sort)) - return(interaction_list) -} - -# Generate sample data -x <- list() -for (i in 1:10) { - x[[i]] <- i * rnorm(1000, 10) -} -x <- as.data.table(x) - -y <- -1 * x[, rowSums(.SD)] + x[['V1']] * x[['V2']] + x[['V3']] * x[['V4']] * x[['V5']] - + rnorm(1000, 0.001) + 3 * sin(x[['V7']]) - -train <- as.matrix(x) - -# Interaction constraint list (column names form) -interaction_list <- list(c('V1', 'V2'), c('V3', 'V4', 'V5')) - -# Convert interaction constraint list into feature index form -cols2ids <- function(object, col_names) { - LUT <- seq_along(col_names) - 1 - names(LUT) <- col_names - rapply(object, function(x) LUT[x], classes = "character", how = "replace") -} -interaction_list_fid <- cols2ids(interaction_list, colnames(train)) - -# Fit model with interaction constraints -bst <- xgb.train(data = xgb.DMatrix(train, label = y), max_depth = 4, - eta = 0.1, nthread = 2, nrounds = 1000, - interaction_constraints = interaction_list_fid) - -bst_tree <- xgb.model.dt.tree(colnames(train), bst) -bst_interactions <- treeInteractions(bst_tree, 4) - # interactions constrained to combinations of V1*V2 and V3*V4*V5 - -# Fit model without interaction constraints -bst2 <- xgb.train(data = xgb.DMatrix(train, label = y), max_depth = 4, - eta = 0.1, nthread = 2, nrounds = 1000) - -bst2_tree <- xgb.model.dt.tree(colnames(train), bst2) -bst2_interactions <- treeInteractions(bst2_tree, 4) # much more interactions - -# Fit model with both interaction and monotonicity constraints -bst3 <- xgb.train(data = xgb.DMatrix(train, label = y), max_depth = 4, - eta = 0.1, nthread = 2, nrounds = 1000, - interaction_constraints = interaction_list_fid, - monotone_constraints = c(-1, 0, 0, 0, 0, 0, 0, 0, 0, 0)) - -bst3_tree <- xgb.model.dt.tree(colnames(train), bst3) -bst3_interactions <- treeInteractions(bst3_tree, 4) - # interactions still constrained to combinations of V1*V2 and V3*V4*V5 - -# Show monotonic constraints still apply by checking scores after incrementing V1 -x1 <- sort(unique(x[['V1']])) -for (i in seq_along(x1)){ - testdata <- copy(x[, - ('V1')]) - testdata[['V1']] <- x1[i] - testdata <- testdata[, paste0('V', 1:10), with = FALSE] - pred <- predict(bst3, as.matrix(testdata)) - - # Should not print out anything due to monotonic constraints - if (i > 1) if (any(pred > prev_pred)) print(i) - prev_pred <- pred -} diff --git a/R-package/demo/poisson_regression.R b/R-package/demo/poisson_regression.R deleted file mode 100644 index 685314b30e96..000000000000 --- a/R-package/demo/poisson_regression.R +++ /dev/null @@ -1,6 +0,0 @@ -data(mtcars) -head(mtcars) -bst <- xgb.train(data = xgb.DMatrix(as.matrix(mtcars[, -11]), label = mtcars[, 11]), - objective = 'count:poisson', nrounds = 5) -pred <- predict(bst, as.matrix(mtcars[, -11])) -sqrt(mean((pred - mtcars[, 11]) ^ 2)) diff --git a/R-package/demo/predict_first_ntree.R b/R-package/demo/predict_first_ntree.R deleted file mode 100644 index ba15ab39a74f..000000000000 --- a/R-package/demo/predict_first_ntree.R +++ /dev/null @@ -1,23 +0,0 @@ -require(xgboost) -# load in the agaricus dataset -data(agaricus.train, package = 'xgboost') -data(agaricus.test, package = 'xgboost') -dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label) -dtest <- xgb.DMatrix(agaricus.test$data, 
label = agaricus.test$label) - -param <- list(max_depth = 2, eta = 1, objective = 'binary:logistic') -evals <- list(eval = dtest, train = dtrain) -nrounds <- 2 - -# training the model for two rounds -bst <- xgb.train(param, dtrain, nrounds, nthread = 2, evals = evals) -cat('start testing prediction from first n trees\n') -labels <- getinfo(dtest, 'label') - -### predict using first 1 tree -ypred1 <- predict(bst, dtest, iterationrange = c(1, 1)) -# by default, we predict using all the trees -ypred2 <- predict(bst, dtest) - -cat('error of ypred1=', mean(as.numeric(ypred1 > 0.5) != labels), '\n') -cat('error of ypred2=', mean(as.numeric(ypred2 > 0.5) != labels), '\n') diff --git a/R-package/demo/predict_leaf_indices.R b/R-package/demo/predict_leaf_indices.R deleted file mode 100644 index a57baf668896..000000000000 --- a/R-package/demo/predict_leaf_indices.R +++ /dev/null @@ -1,54 +0,0 @@ -require(xgboost) -require(data.table) -require(Matrix) - -set.seed(1982) - -# load in the agaricus dataset -data(agaricus.train, package = 'xgboost') -data(agaricus.test, package = 'xgboost') -dtrain <- xgb.DMatrix(data = agaricus.train$data, label = agaricus.train$label) -dtest <- xgb.DMatrix(data = agaricus.test$data, label = agaricus.test$label) - -param <- list(max_depth = 2, eta = 1, objective = 'binary:logistic') -nrounds <- 4 - -# training the model for two rounds -bst <- xgb.train(params = param, data = dtrain, nrounds = nrounds, nthread = 2) - -# Model accuracy without new features -accuracy.before <- (sum((predict(bst, agaricus.test$data) >= 0.5) == agaricus.test$label) - / length(agaricus.test$label)) - -# by default, we predict using all the trees -pred_with_leaf <- predict(bst, dtest, predleaf = TRUE) -head(pred_with_leaf) - -create.new.tree.features <- function(model, original.features) { - pred_with_leaf <- predict(model, original.features, predleaf = TRUE) - cols <- list() - for (i in 1:xgb.get.num.boosted.rounds(model)) { - # max is not the real max but it s not important for the purpose of adding features - leaf.id <- sort(unique(pred_with_leaf[, i])) - cols[[i]] <- factor(x = pred_with_leaf[, i], level = leaf.id) - } - cbind(original.features, sparse.model.matrix(~ . - 1, as.data.frame(cols))) -} - -# Convert previous features to one hot encoding -new.features.train <- create.new.tree.features(bst, agaricus.train$data) -new.features.test <- create.new.tree.features(bst, agaricus.test$data) -colnames(new.features.test) <- colnames(new.features.train) - -# learning with new features -new.dtrain <- xgb.DMatrix(data = new.features.train, label = agaricus.train$label) -new.dtest <- xgb.DMatrix(data = new.features.test, label = agaricus.test$label) -bst <- xgb.train(params = param, data = new.dtrain, nrounds = nrounds, nthread = 2) - -# Model accuracy with new features -accuracy.after <- (sum((predict(bst, new.dtest) >= 0.5) == agaricus.test$label) - / length(agaricus.test$label)) - -# Here the accuracy was already good and is now perfect. -cat(paste("The accuracy was", accuracy.before, "before adding leaf features and it is now", - accuracy.after, "!\n")) diff --git a/R-package/demo/runall.R b/R-package/demo/runall.R deleted file mode 100644 index ab1822a5b8ad..000000000000 --- a/R-package/demo/runall.R +++ /dev/null @@ -1,13 +0,0 @@ -# running all scripts in demo folder, removed during packaging. 
-demo(basic_walkthrough, package = 'xgboost')
-demo(custom_objective, package = 'xgboost')
-demo(boost_from_prediction, package = 'xgboost')
-demo(predict_first_ntree, package = 'xgboost')
-demo(generalized_linear_model, package = 'xgboost')
-demo(cross_validation, package = 'xgboost')
-demo(create_sparse_matrix, package = 'xgboost')
-demo(predict_leaf_indices, package = 'xgboost')
-demo(early_stopping, package = 'xgboost')
-demo(poisson_regression, package = 'xgboost')
-demo(tweedie_regression, package = 'xgboost')
-#demo(gpu_accelerated, package = 'xgboost') # can only run when built with GPU support
diff --git a/R-package/demo/tweedie_regression.R b/R-package/demo/tweedie_regression.R
deleted file mode 100644
index b07858e761fa..000000000000
--- a/R-package/demo/tweedie_regression.R
+++ /dev/null
@@ -1,49 +0,0 @@
-library(xgboost)
-library(data.table)
-library(cplm)
-
-data(AutoClaim)
-
-# auto insurance dataset analyzed by Yip and Yau (2005)
-dt <- data.table(AutoClaim)
-
-# exclude these columns from the model matrix
-exclude <- c('POLICYNO', 'PLCYDATE', 'CLM_FREQ5', 'CLM_AMT5', 'CLM_FLAG', 'IN_YY')
-
-# retains the missing values
-# NOTE: this dataset is comes ready out of the box
-options(na.action = 'na.pass')
-x <- sparse.model.matrix(~ . - 1, data = dt[, -exclude, with = FALSE])
-options(na.action = 'na.omit')
-
-# response
-y <- dt[, CLM_AMT5]
-
-d_train <- xgb.DMatrix(data = x, label = y, missing = NA)
-
-# the tweedie_variance_power parameter determines the shape of
-# distribution
-# - closer to 1 is more poisson like and the mass
-# is more concentrated near zero
-# - closer to 2 is more gamma like and the mass spreads to the
-# the right with less concentration near zero
-
-params <- list(
-  objective = 'reg:tweedie',
-  eval_metric = 'rmse',
-  tweedie_variance_power = 1.4,
-  max_depth = 6,
-  eta = 1)
-
-bst <- xgb.train(
-  data = d_train,
-  params = params,
-  maximize = FALSE,
-  evals = list(train = d_train),
-  nrounds = 20)
-
-var_imp <- xgb.importance(attr(x, 'Dimnames')[[2]], model = bst)
-
-preds <- predict(bst, d_train)
-
-rmse <- sqrt(sum(mean((y - preds) ^ 2)))
diff --git a/tests/ci_build/test_r_package.py b/tests/ci_build/test_r_package.py
index add31b97313c..735140a8099b 100644
--- a/tests/ci_build/test_r_package.py
+++ b/tests/ci_build/test_r_package.py
@@ -45,7 +45,6 @@ def pkgroot(path: str) -> None:
     )
     shutil.copytree("R-package", dest)
-    os.remove(dest / "demo" / "runall.R")

     # core
     shutil.copytree("src", dest / "src" / "src")
     shutil.copytree("include", dest / "src" / "include")
@@ -221,7 +220,6 @@ def test_with_autotools() -> None:
     subprocess.check_call(
         ["R.exe", "-q", "-e", "library(testthat); setwd('tests'); source('testthat.R')"]
     )
-    subprocess.check_call(["R.exe", "-q", "-e", "demo(runall, package = 'xgboost')"])


 @record_time
@@ -296,7 +294,6 @@ def test_with_cmake(args: argparse.Namespace) -> None:
             "library(testthat); setwd('tests'); source('testthat.R')",
         ]
     )
-    subprocess.check_call([R, "-q", "-e", "demo(runall, package = 'xgboost')"])


 @record_time
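
For readers who used these demos as a starting point, the snippet below is a minimal sketch (not part of the patch) that condenses the basic training-and-prediction flow the removed basic_walkthrough.R covered; every function call and parameter value is taken from the deleted demo code above, and it assumes only an installed xgboost R package.

library(xgboost)

# Mushroom classification data shipped with the package, as used by the removed demos
data(agaricus.train, package = 'xgboost')
data(agaricus.test, package = 'xgboost')
dtrain <- xgb.DMatrix(data = agaricus.train$data, label = agaricus.train$label)
dtest <- xgb.DMatrix(data = agaricus.test$data, label = agaricus.test$label)

# Same parameters the deleted basic_walkthrough.R used
params <- list(max_depth = 2, eta = 1, nthread = 2, objective = "binary:logistic")
bst <- xgb.train(params = params, data = dtrain, nrounds = 2,
                 evals = list(train = dtrain, eval = dtest))

# Held-out error rate, computed as in the removed demo
pred <- predict(bst, dtest)
err <- mean(as.numeric(pred > 0.5) != getinfo(dtest, "label"))
print(paste("test-error=", err))

# Per-feature gain, as in the removed importance example
imp <- xgb.importance(feature_names = colnames(agaricus.train$data), model = bst)
print(imp)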