From a03b92e8a1b68322a385987d74f5d7c1dd2d4437 Mon Sep 17 00:00:00 2001 From: david-cortes Date: Tue, 3 Dec 2024 07:50:34 +0100 Subject: [PATCH 01/16] [R] remove GC safety test (#11043) --- R-package/tests/testthat/test_gc_safety.R | 14 -------------- 1 file changed, 14 deletions(-) delete mode 100644 R-package/tests/testthat/test_gc_safety.R diff --git a/R-package/tests/testthat/test_gc_safety.R b/R-package/tests/testthat/test_gc_safety.R deleted file mode 100644 index 44d8f81a4eda..000000000000 --- a/R-package/tests/testthat/test_gc_safety.R +++ /dev/null @@ -1,14 +0,0 @@ -context("Garbage Collection Safety Check") - -test_that("train and prediction when gctorture is on", { - data(agaricus.train, package = 'xgboost') - data(agaricus.test, package = 'xgboost') - train <- agaricus.train - test <- agaricus.test - gctorture(TRUE) - bst <- xgb.train(data = xgb.DMatrix(train$data, label = train$label), max.depth = 2, - eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic") - pred <- predict(bst, test$data) - gctorture(FALSE) - expect_length(pred, length(test$label)) -}) From e25d56de7bb55ef9dfa2d700d5eb48769d036f77 Mon Sep 17 00:00:00 2001 From: Bobby Wang Date: Wed, 4 Dec 2024 02:39:29 +0800 Subject: [PATCH 02/16] [jvm-packages] LTR: distribute the features with same group into same partition (#11023) --- .../scala/spark/GpuXGBoostPluginSuite.scala | 49 +++++++++++++++++++ .../xgboost4j/scala/spark/XGBoostRanker.scala | 17 +++++++ .../scala/spark/XGBoostRankerSuite.scala | 48 ++++++++++++++++++ 3 files changed, 114 insertions(+) diff --git a/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/spark/GpuXGBoostPluginSuite.scala b/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/spark/GpuXGBoostPluginSuite.scala index 6559d90c7887..a5ff2ba0f589 100644 --- a/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/spark/GpuXGBoostPluginSuite.scala +++ b/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/spark/GpuXGBoostPluginSuite.scala @@ -542,6 +542,55 @@ class GpuXGBoostPluginSuite extends GpuTestSuite { } } + test("Same group must be in the same partition") { + val num_workers = 3 + withGpuSparkSession() { spark => + import spark.implicits._ + val df = spark.createDataFrame(spark.sparkContext.parallelize(Seq( + (0.1, 1, 0), + (0.1, 1, 0), + (0.1, 1, 0), + (0.1, 1, 1), + (0.1, 1, 1), + (0.1, 1, 1), + (0.1, 1, 2), + (0.1, 1, 2), + (0.1, 1, 2)), 1)).toDF("label", "f1", "group") + + // The original pattern will repartition df in a RoundRobin manner + val oriRows = df.repartition(num_workers) + .sortWithinPartitions(df.col("group")) + .select("group") + .mapPartitions { case iter => + val tmp: ArrayBuffer[Int] = ArrayBuffer.empty + while (iter.hasNext) { + val r = iter.next() + tmp.append(r.getInt(0)) + } + Iterator.single(tmp.mkString(",")) + }.collect() + assert(oriRows.length == 3) + assert(oriRows.contains("0,1,2")) + + // The fix has replaced repartition with repartitionByRange which will put the + // instances with same group into the same partition + val ranker = new XGBoostRanker().setGroupCol("group").setNumWorkers(num_workers) + val processedDf = ranker.getPlugin.get.asInstanceOf[GpuXGBoostPlugin].preprocess(ranker, df) + val rows = processedDf + .select("group") + .mapPartitions { case iter => + val tmp: ArrayBuffer[Int] = ArrayBuffer.empty + while (iter.hasNext) { + val r = iter.next() + tmp.append(r.getInt(0)) + } + Iterator.single(tmp.mkString(",")) + }.collect() + + rows.forall(Seq("0,0,0", 
"1,1,1", "2,2,2").contains) + } + } + test("Ranker: XGBoost-Spark should match xgboost4j") { withGpuSparkSession() { spark => import spark.implicits._ diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRanker.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRanker.scala index 14d13e34ff61..0265eac55979 100644 --- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRanker.scala +++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRanker.scala @@ -22,6 +22,7 @@ import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable, MLReadable, MLReader} import org.apache.spark.ml.xgboost.SparkUtils import org.apache.spark.sql.Dataset +import org.apache.spark.sql.functions.col import org.apache.spark.sql.types.{DataType, DoubleType, StructType} import ml.dmlc.xgboost4j.scala.Booster @@ -62,6 +63,22 @@ class XGBoostRanker(override val uid: String, } } + /** + * Repartition the dataset to the numWorkers if needed. + * + * @param dataset to be repartition + * @return the repartitioned dataset + */ + override private[spark] def repartitionIfNeeded(dataset: Dataset[_]) = { + val numPartitions = dataset.rdd.getNumPartitions + if (getForceRepartition || getNumWorkers != numPartitions) { + // Please note that the output of repartitionByRange is not deterministic + dataset.repartitionByRange(getNumWorkers, col(getGroupCol)) + } else { + dataset + } + } + /** * Sort partition for Ranker issue. * diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRankerSuite.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRankerSuite.scala index 81a770bfe327..063836538931 100644 --- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRankerSuite.scala +++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRankerSuite.scala @@ -151,6 +151,54 @@ class XGBoostRankerSuite extends AnyFunSuite with PerTest with TmpFolderPerSuite }} } + test("Same group must be in the same partition") { + val spark = ss + import spark.implicits._ + val num_workers = 3 + val df = ss.createDataFrame(sc.parallelize(Seq( + (0.1, Vectors.dense(1.0, 2.0, 3.0), 0), + (0.1, Vectors.dense(0.0, 0.0, 0.0), 0), + (0.1, Vectors.dense(0.0, 3.0, 0.0), 0), + (0.1, Vectors.dense(2.0, 0.0, 4.0), 1), + (0.1, Vectors.dense(0.2, 1.2, 2.0), 1), + (0.1, Vectors.dense(0.5, 2.2, 1.7), 1), + (0.1, Vectors.dense(0.5, 2.2, 1.7), 2), + (0.1, Vectors.dense(0.5, 2.2, 1.7), 2), + (0.1, Vectors.dense(0.5, 2.2, 1.7), 2)), 1)).toDF("label", "features", "group") + + // The original pattern will repartition df in a RoundRobin manner + val oriRows = df.repartition(num_workers) + .sortWithinPartitions(df.col("group")) + .select("group") + .mapPartitions { case iter => + val tmp: ArrayBuffer[Int] = ArrayBuffer.empty + while (iter.hasNext) { + val r = iter.next() + tmp.append(r.getInt(0)) + } + Iterator.single(tmp.mkString(",")) + }.collect() + assert(oriRows.length == 3) + assert(oriRows.contains("0,1,2")) + + // The fix has replaced repartition with repartitionByRange which will put the + // instances with same group into the same partition + val ranker = new XGBoostRanker().setGroupCol("group").setNumWorkers(num_workers) + val (processedDf, _) = ranker.preprocess(df) + val rows = processedDf + .select("group") + .mapPartitions { case iter => + val tmp: 
ArrayBuffer[Int] = ArrayBuffer.empty + while (iter.hasNext) { + val r = iter.next() + tmp.append(r.getInt(0)) + } + Iterator.single(tmp.mkString(",")) + }.collect() + + rows.forall(Seq("0,0,0", "1,1,1", "2,2,2").contains) + } + private def runLengthEncode(input: Seq[Int]): Seq[Int] = { if (input.isEmpty) return Seq(0) From 91a6bb80a6b73a1aa7862ceeeee89cac6823fd12 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Wed, 4 Dec 2024 15:26:38 +0800 Subject: [PATCH 03/16] Reset the booster. (#11042) --- include/xgboost/c_api.h | 37 ++++++++++++++++++++-------- include/xgboost/learner.h | 4 +++ python-package/xgboost/core.py | 12 ++++++++- python-package/xgboost/training.py | 4 +-- src/c_api/c_api.cc | 7 ++++++ src/learner.cc | 23 ++++++++++++++++++ tests/cpp/test_learner.cu | 39 ++++++++++++++++++++++++++++++ 7 files changed, 112 insertions(+), 14 deletions(-) create mode 100644 tests/cpp/test_learner.cu diff --git a/include/xgboost/c_api.h b/include/xgboost/c_api.h index 7e8ed2f29568..6ae1dea8d3ce 100644 --- a/include/xgboost/c_api.h +++ b/include/xgboost/c_api.h @@ -876,31 +876,48 @@ XGB_DLL int XGDMatrixGetQuantileCut(DMatrixHandle const handle, char const *conf * @defgroup Booster Booster * * @brief The `Booster` class is the gradient-boosted model for XGBoost. + * + * During training, the booster object has many caches for improved performance. In + * addition to gradient and prediction, it also includes runtime buffers like leaf + * partitions. These buffers persist with the Booster object until either XGBoosterReset() + * is called or the booster is deleted by the XGBoosterFree(). + * * @{ */ -/*! - * \brief create xgboost learner - * \param dmats matrices that are set to be cached - * \param len length of dmats - * \param out handle to the result booster - * \return 0 when success, -1 when failure happens +/** + * @brief Create a XGBoost learner (booster) + * + * @param dmats matrices that are set to be cached by the booster. + * @param len length of dmats + * @param out handle to the result booster + * + * @return 0 when success, -1 when failure happens */ XGB_DLL int XGBoosterCreate(const DMatrixHandle dmats[], bst_ulong len, BoosterHandle *out); /** * @example c-api-demo.c */ -/*! - * \brief free obj in handle - * \param handle handle to be freed - * \return 0 when success, -1 when failure happens +/** + * @brief Delete the booster. + * + * @param handle The handle to be freed. + * + * @return 0 when success, -1 when failure happens */ XGB_DLL int XGBoosterFree(BoosterHandle handle); /** * @example c-api-demo.c inference.c external_memory.c */ +/** + * @brief Reset the booster object to release data caches used for training. + * + * @since 3.0.0 + */ +XGB_DLL int XGBoosterReset(BoosterHandle handle); + /*! * \brief Slice a model using boosting index. The slice m:n indicates taking all trees * that were fit during the boosting rounds m, (m+1), (m+2), ..., (n-1). diff --git a/include/xgboost/learner.h b/include/xgboost/learner.h index 939324e4a6c4..1499804c8592 100644 --- a/include/xgboost/learner.h +++ b/include/xgboost/learner.h @@ -249,6 +249,10 @@ class Learner : public Model, public Configurable, public dmlc::Serializable { std::string format) = 0; virtual XGBAPIThreadLocalEntry& GetThreadLocal() const = 0; + /** + * @brief Reset the booster object to release data caches used for training. + */ + virtual void Reset() = 0; /*! * \brief Create a new instance of learner. * \param cache_data The matrix to cache the prediction. 
diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py index c2034652322d..5351bdca1973 100644 --- a/python-package/xgboost/core.py +++ b/python-package/xgboost/core.py @@ -2008,7 +2008,8 @@ def __setstate__(self, state: Dict) -> None: self.__dict__.update(state) def __getitem__(self, val: Union[Integer, tuple, slice, EllipsisType]) -> "Booster": - """Get a slice of the tree-based model. + """Get a slice of the tree-based model. Attributes like `best_iteration` and + `best_score` are removed in the resulting booster. .. versionadded:: 1.3.0 @@ -2107,6 +2108,15 @@ def copy(self) -> "Booster": """ return copy.copy(self) + def reset(self) -> "Booster": + """Reset the booster object to release data caches used for training. + + .. versionadded:: 3.0.0 + + """ + _check_call(_LIB.XGBoosterReset(self.handle)) + return self + def attr(self, key: str) -> Optional[str]: """Get attribute string from the Booster. diff --git a/python-package/xgboost/training.py b/python-package/xgboost/training.py index bb4ebe44e1ed..86370469a400 100644 --- a/python-package/xgboost/training.py +++ b/python-package/xgboost/training.py @@ -187,9 +187,7 @@ def train( if evals_result is not None: evals_result.update(cb_container.history) - # Copy to serialise and unserialise booster to reset state and free - # training memory - return bst.copy() + return bst.reset() class CVPack: diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index 90407fcf58ac..ee99922cdd1c 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -980,6 +980,13 @@ XGB_DLL int XGBoosterFree(BoosterHandle handle) { API_END(); } +XGB_DLL int XGBoosterReset(BoosterHandle handle) { + API_BEGIN(); + CHECK_HANDLE(); + static_cast(handle)->Reset(); + API_END(); +} + XGB_DLL int XGBoosterSetParam(BoosterHandle handle, const char *name, const char *value) { diff --git a/src/learner.cc b/src/learner.cc index e6642b0874ac..1dcd0fcfc7eb 100644 --- a/src/learner.cc +++ b/src/learner.cc @@ -860,6 +860,7 @@ class LearnerIO : public LearnerConfiguration { // Will be removed once JSON takes over. Right now we still loads some RDS files from R. std::string const serialisation_header_ { u8"CONFIG-offset:" }; + protected: void ClearCaches() { this->prediction_container_ = PredictionContainer{}; } public: @@ -1264,6 +1265,28 @@ class LearnerImpl : public LearnerIO { return out_impl; } + void Reset() override { + this->Configure(); + this->CheckModelInitialized(); + // Global data + auto local_map = LearnerAPIThreadLocalStore::Get(); + if (local_map->find(this) != local_map->cend()) { + local_map->erase(this); + } + + // Model + std::string buf; + common::MemoryBufferStream fo(&buf); + this->Save(&fo); + + common::MemoryFixSizeBuffer fs(buf.data(), buf.size()); + this->Load(&fs); + + // Learner self cache. 
Prediction is cleared in the load method + CHECK(this->prediction_container_.Container().empty()); + this->gpair_ = decltype(this->gpair_){}; + } + void UpdateOneIter(int iter, std::shared_ptr train) override { monitor_.Start("UpdateOneIter"); TrainingObserver::Instance().Update(iter); diff --git a/tests/cpp/test_learner.cu b/tests/cpp/test_learner.cu new file mode 100644 index 000000000000..2fde49ca0fdb --- /dev/null +++ b/tests/cpp/test_learner.cu @@ -0,0 +1,39 @@ +/** + * Copyright 2024, XGBoost contributors + */ +#include +#include // for DeviceSym +#include // for GlobalConfigThreadLocalStore +#include + +#include // for int32_t +#include // for unique_ptr + +#include "../../src/common/device_vector.cuh" // for GlobalMemoryLogger +#include "helpers.h" // for RandomDataGenerator + +namespace xgboost { +TEST(Learner, Reset) { + dh::GlobalMemoryLogger().Clear(); + + auto verbosity = GlobalConfigThreadLocalStore::Get()->verbosity; + ConsoleLogger::Configure({{"verbosity", "3"}}); + auto p_fmat = RandomDataGenerator{1024, 32, 0.0}.GenerateDMatrix(true); + std::unique_ptr learner{Learner::Create({p_fmat})}; + learner->SetParam("device", DeviceSym::CUDA()); + learner->Configure(); + for (std::int32_t i = 0; i < 2; ++i) { + learner->UpdateOneIter(i, p_fmat); + } + + auto cur = dh::GlobalMemoryLogger().CurrentlyAllocatedBytes(); + p_fmat.reset(); + auto after_p_fmat_reset = dh::GlobalMemoryLogger().CurrentlyAllocatedBytes(); + ASSERT_LT(after_p_fmat_reset, cur); + learner->Reset(); + auto after_learner_reset = dh::GlobalMemoryLogger().CurrentlyAllocatedBytes(); + ASSERT_LT(after_learner_reset, after_p_fmat_reset); + ASSERT_LE(after_learner_reset, 64); + ConsoleLogger::Configure({{"verbosity", std::to_string(verbosity)}}); +} +} // namespace xgboost From 23aaddad48497d48b0e8254450241c828e8f62a7 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Wed, 4 Dec 2024 15:28:51 +0800 Subject: [PATCH 04/16] [R] Drop support for text inputs. (#11026) --------- Co-authored-by: david-cortes --- R-package/R/xgb.DMatrix.R | 53 +++++++++++------------------------- R-package/man/xgb.DMatrix.Rd | 48 ++++++++++---------------------- 2 files changed, 30 insertions(+), 71 deletions(-) diff --git a/R-package/R/xgb.DMatrix.R b/R-package/R/xgb.DMatrix.R index 429cf3f0422c..280fcf52ee3e 100644 --- a/R-package/R/xgb.DMatrix.R +++ b/R-package/R/xgb.DMatrix.R @@ -9,12 +9,13 @@ #' method (`tree_method = "hist"`, which is the default algorithm), but is not usable for the #' sorted-indices method (`tree_method = "exact"`), nor for the approximate method #' (`tree_method = "approx"`). +#' #' @param data Data from which to create a DMatrix, which can then be used for fitting models or #' for getting predictions out of a fitted model. #' -#' Supported input types are as follows:\itemize{ -#' \item `matrix` objects, with types `numeric`, `integer`, or `logical`. -#' \item `data.frame` objects, with columns of types `numeric`, `integer`, `logical`, or `factor`. +#' Supported input types are as follows: +#' - `matrix` objects, with types `numeric`, `integer`, or `logical`. +#' - `data.frame` objects, with columns of types `numeric`, `integer`, `logical`, or `factor` #' #' Note that xgboost uses base-0 encoding for categorical types, hence `factor` types (which use base-1 #' encoding') will be converted inside the function call. Be aware that the encoding used for `factor` @@ -23,33 +24,14 @@ #' was constructed. #' #' Other column types are not supported. -#' \item CSR matrices, as class `dgRMatrix` from package `Matrix`. 
-#' \item CSC matrices, as class `dgCMatrix` from package `Matrix`. These are **not** supported for -#' 'xgb.QuantileDMatrix'. -#' \item Single-row CSR matrices, as class `dsparseVector` from package `Matrix`, which is interpreted -#' as a single row (only when making predictions from a fitted model). -#' \item Text files in a supported format, passed as a `character` variable containing the URI path to -#' the file, with an optional format specifier. -#' -#' These are **not** supported for `xgb.QuantileDMatrix`. Supported formats are:\itemize{ -#' \item XGBoost's own binary format for DMatrices, as produced by [xgb.DMatrix.save()]. -#' \item SVMLight (a.k.a. LibSVM) format for CSR matrices. This format can be signaled by suffix -#' `?format=libsvm` at the end of the file path. It will be the default format if not -#' otherwise specified. -#' \item CSV files (comma-separated values). This format can be specified by adding suffix -#' `?format=csv` at the end ofthe file path. It will **not** be auto-deduced from file extensions. -#' } +#' - CSR matrices, as class `dgRMatrix` from package `Matrix`. +#' - CSC matrices, as class `dgCMatrix` from package `Matrix`. #' -#' Be aware that the format of the file will not be auto-deduced - for example, if a file is named 'file.csv', -#' it will not look at the extension or file contents to determine that it is a comma-separated value. -#' Instead, the format must be specified following the URI format, so the input to `data` should be passed -#' like this: `"file.csv?format=csv"` (or `"file.csv?format=csv&label_column=0"` if the first column -#' corresponds to the labels). +#' These are **not** supported by `xgb.QuantileDMatrix`. +#' - XGBoost's own binary format for DMatrices, as produced by [xgb.DMatrix.save()]. +#' - Single-row CSR matrices, as class `dsparseVector` from package `Matrix`, which is interpreted +#' as a single row (only when making predictions from a fitted model). #' -#' For more information about passing text files as input, see the articles -#' \href{https://xgboost.readthedocs.io/en/stable/tutorials/input_format.html}{Text Input Format of DMatrix} and -#' \href{https://xgboost.readthedocs.io/en/stable/python/python_intro.html#python-data-interface}{Data Interface}. -#' } #' @param label Label of the training data. For classification problems, should be passed encoded as #' integers with numeration starting at zero. #' @param weight Weight for each instance. @@ -95,15 +77,9 @@ #' @param label_lower_bound Lower bound for survival training. #' @param label_upper_bound Upper bound for survival training. #' @param feature_weights Set feature weights for column sampling. -#' @param data_split_mode When passing a URI (as R `character`) as input, this signals -#' whether to split by row or column. Allowed values are `"row"` and `"col"`. -#' -#' In distributed mode, the file is split accordingly; otherwise this is only an indicator on -#' how the file was split beforehand. Default to row. -#' -#' This is not used when `data` is not a URI. -#' @return An 'xgb.DMatrix' object. If calling 'xgb.QuantileDMatrix', it will have additional -#' subclass 'xgb.QuantileDMatrix'. +#' @param data_split_mode Not used yet. This parameter is for distributed training, which is not yet available for the R package. +#' @return An 'xgb.DMatrix' object. If calling `xgb.QuantileDMatrix`, it will have additional +#' subclass `xgb.QuantileDMatrix`. #' #' @details #' Note that DMatrix objects are not serializable through R functions such as [saveRDS()] or [save()]. 
@@ -145,6 +121,9 @@ xgb.DMatrix <- function( if (!is.null(group) && !is.null(qid)) { stop("Either one of 'group' or 'qid' should be NULL") } + if (data_split_mode != "row") { + stop("'data_split_mode' is not supported yet.") + } nthread <- as.integer(NVL(nthread, -1L)) if (typeof(data) == "character") { if (length(data) > 1) { diff --git a/R-package/man/xgb.DMatrix.Rd b/R-package/man/xgb.DMatrix.Rd index 2cfa2e713038..23a24dec4226 100644 --- a/R-package/man/xgb.DMatrix.Rd +++ b/R-package/man/xgb.DMatrix.Rd @@ -45,9 +45,11 @@ xgb.QuantileDMatrix( \item{data}{Data from which to create a DMatrix, which can then be used for fitting models or for getting predictions out of a fitted model. -Supported input types are as follows:\itemize{ +Supported input types are as follows: +\itemize{ \item \code{matrix} objects, with types \code{numeric}, \code{integer}, or \code{logical}. -\item \code{data.frame} objects, with columns of types \code{numeric}, \code{integer}, \code{logical}, or \code{factor}. +\item \code{data.frame} objects, with columns of types \code{numeric}, \code{integer}, \code{logical}, or \code{factor} +} Note that xgboost uses base-0 encoding for categorical types, hence \code{factor} types (which use base-1 encoding') will be converted inside the function call. Be aware that the encoding used for \code{factor} @@ -56,32 +58,16 @@ responsibility to ensure that factor columns have the same levels as the ones fr was constructed. Other column types are not supported. +\itemize{ \item CSR matrices, as class \code{dgRMatrix} from package \code{Matrix}. -\item CSC matrices, as class \code{dgCMatrix} from package \code{Matrix}. These are \strong{not} supported for -'xgb.QuantileDMatrix'. -\item Single-row CSR matrices, as class \code{dsparseVector} from package \code{Matrix}, which is interpreted -as a single row (only when making predictions from a fitted model). -\item Text files in a supported format, passed as a \code{character} variable containing the URI path to -the file, with an optional format specifier. - -These are \strong{not} supported for \code{xgb.QuantileDMatrix}. Supported formats are:\itemize{ -\item XGBoost's own binary format for DMatrices, as produced by \code{\link[=xgb.DMatrix.save]{xgb.DMatrix.save()}}. -\item SVMLight (a.k.a. LibSVM) format for CSR matrices. This format can be signaled by suffix -\code{?format=libsvm} at the end of the file path. It will be the default format if not -otherwise specified. -\item CSV files (comma-separated values). This format can be specified by adding suffix -\code{?format=csv} at the end ofthe file path. It will \strong{not} be auto-deduced from file extensions. +\item CSC matrices, as class \code{dgCMatrix} from package \code{Matrix}. } -Be aware that the format of the file will not be auto-deduced - for example, if a file is named 'file.csv', -it will not look at the extension or file contents to determine that it is a comma-separated value. -Instead, the format must be specified following the URI format, so the input to \code{data} should be passed -like this: \code{"file.csv?format=csv"} (or \code{"file.csv?format=csv&label_column=0"} if the first column -corresponds to the labels). - -For more information about passing text files as input, see the articles -\href{https://xgboost.readthedocs.io/en/stable/tutorials/input_format.html}{Text Input Format of DMatrix} and -\href{https://xgboost.readthedocs.io/en/stable/python/python_intro.html#python-data-interface}{Data Interface}. 
+These are \strong{not} supported by \code{xgb.QuantileDMatrix}. +\itemize{ +\item XGBoost's own binary format for DMatrices, as produced by \code{\link[=xgb.DMatrix.save]{xgb.DMatrix.save()}}. +\item Single-row CSR matrices, as class \code{dsparseVector} from package \code{Matrix}, which is interpreted +as a single row (only when making predictions from a fitted model). }} \item{label}{Label of the training data. For classification problems, should be passed encoded as @@ -144,13 +130,7 @@ not be saved, so make sure that \code{factor} columns passed to \code{predict} h \item{feature_weights}{Set feature weights for column sampling.} -\item{data_split_mode}{When passing a URI (as R \code{character}) as input, this signals -whether to split by row or column. Allowed values are \code{"row"} and \code{"col"}. - -In distributed mode, the file is split accordingly; otherwise this is only an indicator on -how the file was split beforehand. Default to row. - -This is not used when \code{data} is not a URI.} +\item{data_split_mode}{Not used yet. This parameter is for distributed training, which is not yet available for the R package.} \item{ref}{The training dataset that provides quantile information, needed when creating validation/test dataset with \code{\link[=xgb.QuantileDMatrix]{xgb.QuantileDMatrix()}}. Supplying the training DMatrix @@ -163,8 +143,8 @@ applied to the validation/test data} This is only supported when constructing a QuantileDMatrix.} } \value{ -An 'xgb.DMatrix' object. If calling 'xgb.QuantileDMatrix', it will have additional -subclass 'xgb.QuantileDMatrix'. +An 'xgb.DMatrix' object. If calling \code{xgb.QuantileDMatrix}, it will have additional +subclass \code{xgb.QuantileDMatrix}. } \description{ Construct an 'xgb.DMatrix' object from a given data source, which can then be passed to functions From 337265ae0ea3b521509853591bb68a80df8d1ee4 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Wed, 4 Dec 2024 02:53:57 -0600 Subject: [PATCH 05/16] Adapt to scikit-learn 1.6 estimator tag changes (#11021) --- .gitignore | 2 + python-package/pyproject.toml | 2 + python-package/xgboost/compat.py | 27 +++++--- python-package/xgboost/core.py | 2 +- python-package/xgboost/dask/__init__.py | 1 - python-package/xgboost/sklearn.py | 80 +++++++++++++++++++--- python-package/xgboost/spark/core.py | 4 +- python-package/xgboost/spark/estimator.py | 3 +- python-package/xgboost/spark/params.py | 1 - python-package/xgboost/spark/utils.py | 2 +- python-package/xgboost/testing/data.py | 2 +- tests/python/test_with_sklearn.py | 81 ++++++++++++++++++++++- 12 files changed, 181 insertions(+), 26 deletions(-) diff --git a/.gitignore b/.gitignore index 082e85e2c67f..d53f3f1f255d 100644 --- a/.gitignore +++ b/.gitignore @@ -144,11 +144,13 @@ credentials.csv .bloop # python tests +*.bin demo/**/*.txt *.dmatrix .hypothesis __MACOSX/ model*.json +/tests/python/models/models/ # R tests *.htm diff --git a/python-package/pyproject.toml b/python-package/pyproject.toml index 0420b2672e1e..7d79d6726eec 100644 --- a/python-package/pyproject.toml +++ b/python-package/pyproject.toml @@ -63,6 +63,8 @@ disable = [ "import-error", "attribute-defined-outside-init", "import-outside-toplevel", + "too-few-public-methods", + "too-many-ancestors", "too-many-nested-blocks", "unsubscriptable-object", "useless-object-inheritance" diff --git a/python-package/xgboost/compat.py b/python-package/xgboost/compat.py index 26399f0da2f8..bcd3d8d4ee54 100644 --- a/python-package/xgboost/compat.py +++ b/python-package/xgboost/compat.py @@ 
-43,32 +43,43 @@ def lazy_isinstance(instance: Any, module: str, name: str) -> bool: # sklearn try: + from sklearn import __version__ as _sklearn_version from sklearn.base import BaseEstimator as XGBModelBase from sklearn.base import ClassifierMixin as XGBClassifierBase from sklearn.base import RegressorMixin as XGBRegressorBase - from sklearn.preprocessing import LabelEncoder try: - from sklearn.model_selection import KFold as XGBKFold from sklearn.model_selection import StratifiedKFold as XGBStratifiedKFold except ImportError: - from sklearn.cross_validation import KFold as XGBKFold from sklearn.cross_validation import StratifiedKFold as XGBStratifiedKFold + # sklearn.utils Tags types can be imported unconditionally once + # xgboost's minimum scikit-learn version is 1.6 or higher + try: + from sklearn.utils import Tags as _sklearn_Tags + except ImportError: + _sklearn_Tags = object + SKLEARN_INSTALLED = True except ImportError: SKLEARN_INSTALLED = False # used for compatibility without sklearn - XGBModelBase = object - XGBClassifierBase = object - XGBRegressorBase = object - LabelEncoder = object + class XGBModelBase: # type: ignore[no-redef] + """Dummy class for sklearn.base.BaseEstimator.""" + + class XGBClassifierBase: # type: ignore[no-redef] + """Dummy class for sklearn.base.ClassifierMixin.""" + + class XGBRegressorBase: # type: ignore[no-redef] + """Dummy class for sklearn.base.RegressorMixin.""" - XGBKFold = None XGBStratifiedKFold = None + _sklearn_Tags = object + _sklearn_version = object + _logger = logging.getLogger(__name__) diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py index 5351bdca1973..b21cf80aea56 100644 --- a/python-package/xgboost/core.py +++ b/python-package/xgboost/core.py @@ -430,7 +430,7 @@ def c_array( def from_array_interface(interface: dict) -> NumpyOrCupy: """Convert array interface to numpy or cupy array""" - class Array: # pylint: disable=too-few-public-methods + class Array: """Wrapper type for communicating with numpy and cupy.""" _interface: Optional[dict] = None diff --git a/python-package/xgboost/dask/__init__.py b/python-package/xgboost/dask/__init__.py index 76fcc1a6ad92..e0221310bc51 100644 --- a/python-package/xgboost/dask/__init__.py +++ b/python-package/xgboost/dask/__init__.py @@ -1,7 +1,6 @@ # pylint: disable=too-many-arguments, too-many-locals # pylint: disable=missing-class-docstring, invalid-name # pylint: disable=too-many-lines -# pylint: disable=too-few-public-methods """ Dask extensions for distributed training ---------------------------------------- diff --git a/python-package/xgboost/sklearn.py b/python-package/xgboost/sklearn.py index 25448657c8ad..c337505f7641 100644 --- a/python-package/xgboost/sklearn.py +++ b/python-package/xgboost/sklearn.py @@ -35,6 +35,8 @@ XGBClassifierBase, XGBModelBase, XGBRegressorBase, + _sklearn_Tags, + _sklearn_version, import_cupy, ) from .config import config_context @@ -54,7 +56,7 @@ from .training import train -class XGBRankerMixIn: # pylint: disable=too-few-public-methods +class XGBRankerMixIn: """MixIn for ranking, defines the _estimator_type usually defined in scikit-learn base classes. 
@@ -79,7 +81,7 @@ def _can_use_qdm(tree_method: Optional[str], device: Optional[str]) -> bool: return tree_method in ("hist", "gpu_hist", None, "auto") and not_sycl -class _SklObjWProto(Protocol): # pylint: disable=too-few-public-methods +class _SklObjWProto(Protocol): def __call__( self, y_true: ArrayLike, @@ -805,6 +807,41 @@ def _more_tags(self) -> Dict[str, bool]: tags["non_deterministic"] = True return tags + @staticmethod + def _update_sklearn_tags_from_dict( + *, + tags: _sklearn_Tags, + tags_dict: Dict[str, bool], + ) -> _sklearn_Tags: + """Update ``sklearn.utils.Tags`` inherited from ``scikit-learn`` base classes. + + ``scikit-learn`` 1.6 introduced a dataclass-based interface for estimator tags. + ref: https://github.com/scikit-learn/scikit-learn/pull/29677 + + This method handles updating that instance based on the values in ``self._more_tags()``. + """ + tags.non_deterministic = tags_dict.get("non_deterministic", False) + tags.no_validation = tags_dict["no_validation"] + tags.input_tags.allow_nan = tags_dict["allow_nan"] + return tags + + def __sklearn_tags__(self) -> _sklearn_Tags: + # XGBModelBase.__sklearn_tags__() cannot be called unconditionally, + # because that method isn't defined for scikit-learn<1.6 + if not hasattr(XGBModelBase, "__sklearn_tags__"): + err_msg = ( + "__sklearn_tags__() should not be called when using scikit-learn<1.6. " + f"Detected version: {_sklearn_version}" + ) + raise AttributeError(err_msg) + + # take whatever tags are provided by BaseEstimator, then modify + # them with XGBoost-specific values + return self._update_sklearn_tags_from_dict( + tags=super().__sklearn_tags__(), # pylint: disable=no-member + tags_dict=self._more_tags(), + ) + def __sklearn_is_fitted__(self) -> bool: return hasattr(self, "_Booster") @@ -898,13 +935,27 @@ def get_params(self, deep: bool = True) -> Dict[str, Any]: """Get parameters.""" # Based on: https://stackoverflow.com/questions/59248211 # The basic flow in `get_params` is: - # 0. Return parameters in subclass first, by using inspect. - # 1. Return parameters in `XGBModel` (the base class). + # 0. Return parameters in subclass (self.__class__) first, by using inspect. + # 1. Return parameters in all parent classes (especially `XGBModel`). # 2. Return whatever in `**kwargs`. # 3. Merge them. + # + # This needs to accommodate being called recursively in the following + # inheritance graphs (and similar for classification and ranking): + # + # XGBRFRegressor -> XGBRegressor -> XGBModel -> BaseEstimator + # XGBRegressor -> XGBModel -> BaseEstimator + # XGBModel -> BaseEstimator + # params = super().get_params(deep) cp = copy.copy(self) - cp.__class__ = cp.__class__.__bases__[0] + # If the immediate parent defines get_params(), use that. + if callable(getattr(cp.__class__.__bases__[0], "get_params", None)): + cp.__class__ = cp.__class__.__bases__[0] + # Otherwise, skip it and assume the next class will have it. + # This is here primarily for cases where the first class in MRO is a scikit-learn mixin. + else: + cp.__class__ = cp.__class__.__bases__[1] params.update(cp.__class__.get_params(cp, deep)) # if kwargs is a dict, update params accordingly if hasattr(self, "kwargs") and isinstance(self.kwargs, dict): @@ -1481,7 +1532,7 @@ def _cls_predict_proba(n_classes: int, prediction: PredtT, vstack: Callable) -> Number of boosting rounds. 
""", ) -class XGBClassifier(XGBModel, XGBClassifierBase): +class XGBClassifier(XGBClassifierBase, XGBModel): # pylint: disable=missing-docstring,invalid-name,too-many-instance-attributes @_deprecate_positional_args def __init__( @@ -1497,6 +1548,12 @@ def _more_tags(self) -> Dict[str, bool]: tags["multilabel"] = True return tags + def __sklearn_tags__(self) -> _sklearn_Tags: + tags = super().__sklearn_tags__() + tags_dict = self._more_tags() + tags.classifier_tags.multi_label = tags_dict["multilabel"] + return tags + @_deprecate_positional_args def fit( self, @@ -1769,7 +1826,7 @@ def fit( "Implementation of the scikit-learn API for XGBoost regression.", ["estimators", "model", "objective"], ) -class XGBRegressor(XGBModel, XGBRegressorBase): +class XGBRegressor(XGBRegressorBase, XGBModel): # pylint: disable=missing-docstring @_deprecate_positional_args def __init__( @@ -1783,6 +1840,13 @@ def _more_tags(self) -> Dict[str, bool]: tags["multioutput_only"] = False return tags + def __sklearn_tags__(self) -> _sklearn_Tags: + tags = super().__sklearn_tags__() + tags_dict = self._more_tags() + tags.target_tags.multi_output = tags_dict["multioutput"] + tags.target_tags.single_output = not tags_dict["multioutput_only"] + return tags + @xgboost_model_doc( "scikit-learn API for XGBoost random forest regression.", @@ -1910,7 +1974,7 @@ def _get_qid( `qid` can be a special column of input `X` instead of a separated parameter, see :py:meth:`fit` for more info.""", ) -class XGBRanker(XGBModel, XGBRankerMixIn): +class XGBRanker(XGBRankerMixIn, XGBModel): # pylint: disable=missing-docstring,too-many-arguments,invalid-name @_deprecate_positional_args def __init__(self, *, objective: str = "rank:ndcg", **kwargs: Any): diff --git a/python-package/xgboost/spark/core.py b/python-package/xgboost/spark/core.py index 166acbe1764b..32d7c1e490c8 100644 --- a/python-package/xgboost/spark/core.py +++ b/python-package/xgboost/spark/core.py @@ -2,8 +2,8 @@ import base64 -# pylint: disable=fixme, too-many-ancestors, protected-access, no-member, invalid-name -# pylint: disable=too-few-public-methods, too-many-lines, too-many-branches +# pylint: disable=fixme, protected-access, no-member, invalid-name +# pylint: disable=too-many-lines, too-many-branches import json import logging import os diff --git a/python-package/xgboost/spark/estimator.py b/python-package/xgboost/spark/estimator.py index f53ef72eb99e..011f7ea0b715 100644 --- a/python-package/xgboost/spark/estimator.py +++ b/python-package/xgboost/spark/estimator.py @@ -1,7 +1,6 @@ """Xgboost pyspark integration submodule for estimator API.""" -# pylint: disable=too-many-ancestors -# pylint: disable=fixme, too-many-ancestors, protected-access, no-member, invalid-name +# pylint: disable=fixme, protected-access, no-member, invalid-name # pylint: disable=unused-argument, too-many-locals import warnings diff --git a/python-package/xgboost/spark/params.py b/python-package/xgboost/spark/params.py index a177c73fe413..f173d3301286 100644 --- a/python-package/xgboost/spark/params.py +++ b/python-package/xgboost/spark/params.py @@ -2,7 +2,6 @@ from typing import Dict -# pylint: disable=too-few-public-methods from pyspark.ml.param import TypeConverters from pyspark.ml.param.shared import Param, Params diff --git a/python-package/xgboost/spark/utils.py b/python-package/xgboost/spark/utils.py index c96ec284abe3..e0d3e094a805 100644 --- a/python-package/xgboost/spark/utils.py +++ b/python-package/xgboost/spark/utils.py @@ -47,7 +47,7 @@ def _get_default_params_from_func( return 
filtered_params_dict -class CommunicatorContext(CCtx): # pylint: disable=too-few-public-methods +class CommunicatorContext(CCtx): """Context with PySpark specific task ID.""" def __init__(self, context: BarrierTaskContext, **args: CollArgsVals) -> None: diff --git a/python-package/xgboost/testing/data.py b/python-package/xgboost/testing/data.py index d9a4c85af326..34f55c077a85 100644 --- a/python-package/xgboost/testing/data.py +++ b/python-package/xgboost/testing/data.py @@ -566,7 +566,7 @@ def is_binary(self) -> bool: return self.max_rel == 1 -class PBM: # pylint: disable=too-few-public-methods +class PBM: """Simulate click data with position bias model. There are other models available in `ULTRA `_ like the cascading model. diff --git a/tests/python/test_with_sklearn.py b/tests/python/test_with_sklearn.py index 6c4540301432..937e59095863 100644 --- a/tests/python/test_with_sklearn.py +++ b/tests/python/test_with_sklearn.py @@ -2,6 +2,7 @@ import os import pickle import random +import re import tempfile import warnings from typing import Callable, Optional @@ -825,6 +826,32 @@ def get_tm(clf: xgb.XGBClassifier) -> str: assert clf.get_params()["tree_method"] is None +def test_get_params_works_as_expected(): + # XGBModel -> BaseEstimator + params = xgb.XGBModel(max_depth=2).get_params() + assert params["max_depth"] == 2 + # 'objective' defaults to None in the signature of XGBModel + assert params["objective"] is None + + # XGBRegressor -> XGBModel -> BaseEstimator + params = xgb.XGBRegressor(max_depth=3).get_params() + assert params["max_depth"] == 3 + # 'objective' defaults to 'reg:squarederror' in the signature of XGBRegressor + assert params["objective"] == "reg:squarederror" + # 'colsample_bynode' defaults to 'None' for XGBModel (which XGBRegressor inherits from), so it + # should be in get_params() output + assert params["colsample_bynode"] is None + + # XGBRFRegressor -> XGBRegressor -> XGBModel -> BaseEstimator + params = xgb.XGBRFRegressor(max_depth=4, objective="reg:tweedie").get_params() + assert params["max_depth"] == 4 + # 'objective' is a keyword argument for XGBRegressor, so it should be in get_params() output + # ... 
but values passed through kwargs should override the default from the signature of XGBRegressor + assert params["objective"] == "reg:tweedie" + # 'colsample_bynode' defaults to 0.8 for XGBRFRegressor...that should be preferred to the None from XGBRegressor + assert params["colsample_bynode"] == 0.8 + + def test_kwargs_error(): params = {'updater': 'grow_gpu_hist', 'subsample': .5, 'n_jobs': -1} with pytest.raises(TypeError): @@ -1517,7 +1544,7 @@ def test_tags() -> None: assert tags["multioutput"] is True assert tags["multioutput_only"] is False - for clf in [xgb.XGBClassifier()]: + for clf in [xgb.XGBClassifier(), xgb.XGBRFClassifier()]: tags = clf._more_tags() assert "multioutput" not in tags assert tags["multilabel"] is True @@ -1526,6 +1553,58 @@ def test_tags() -> None: assert "multioutput" not in tags +# the try-excepts in this test should be removed once xgboost's +# minimum supported scikit-learn version is at least 1.6 +def test_sklearn_tags(): + + def _assert_has_xgbmodel_tags(tags): + # values set by XGBModel.__sklearn_tags__() + assert tags.non_deterministic is False + assert tags.no_validation is True + assert tags.input_tags.allow_nan is True + + for reg in [xgb.XGBRegressor(), xgb.XGBRFRegressor()]: + try: + # if no AttributeError was thrown, we must be using scikit-learn>=1.6, + # and so the actual effects of __sklearn_tags__() should be tested + tags = reg.__sklearn_tags__() + _assert_has_xgbmodel_tags(tags) + # regressor-specific values + assert tags.estimator_type == "regressor" + assert tags.regressor_tags is not None + assert tags.classifier_tags is None + assert tags.target_tags.multi_output is True + assert tags.target_tags.single_output is True + except AttributeError as err: + # only the exact error we expected to be raised should be raised + assert bool(re.search(r"__sklearn_tags__.* should not be called", str(err))) + + for clf in [xgb.XGBClassifier(), xgb.XGBRFClassifier()]: + try: + # if no AttributeError was thrown, we must be using scikit-learn>=1.6, + # and so the actual effects of __sklearn_tags__() should be tested + tags = clf.__sklearn_tags__() + _assert_has_xgbmodel_tags(tags) + # classifier-specific values + assert tags.estimator_type == "classifier" + assert tags.regressor_tags is None + assert tags.classifier_tags is not None + assert tags.classifier_tags.multi_label is True + except AttributeError as err: + # only the exact error we expected to be raised should be raised + assert bool(re.search(r"__sklearn_tags__.* should not be called", str(err))) + + for rnk in [xgb.XGBRanker(),]: + try: + # if no AttributeError was thrown, we must be using scikit-learn>=1.6, + # and so the actual effects of __sklearn_tags__() should be tested + tags = rnk.__sklearn_tags__() + _assert_has_xgbmodel_tags(tags) + except AttributeError as err: + # only the exact error we expected to be raised should be raised + assert bool(re.search(r"__sklearn_tags__.* should not be called", str(err))) + + def test_doc_link() -> None: for est in [ xgb.XGBRegressor(), From d5693bd24909fba5ff6c66c9b26e9066e79a6c41 Mon Sep 17 00:00:00 2001 From: david-cortes Date: Wed, 4 Dec 2024 15:28:36 +0100 Subject: [PATCH 06/16] [R] Add predict method for new `xgboost()` (#11041) --- .gitignore | 1 + R-package/NAMESPACE | 1 + R-package/R/xgb.Booster.R | 15 +- R-package/R/xgboost.R | 237 +++++++++++++++++ R-package/man/predict.xgb.Booster.Rd | 17 +- R-package/man/predict.xgboost.Rd | 138 ++++++++++ R-package/man/print.xgboost.Rd | 19 ++ R-package/tests/testthat/test_xgboost.R | 324 
++++++++++++++++++++++++ 8 files changed, 747 insertions(+), 5 deletions(-) create mode 100644 R-package/man/predict.xgboost.Rd create mode 100644 R-package/man/print.xgboost.Rd diff --git a/.gitignore b/.gitignore index d53f3f1f255d..c29dcc43d9d3 100644 --- a/.gitignore +++ b/.gitignore @@ -52,6 +52,7 @@ Debug *.bak #.Rbuildignore R-package.Rproj +R-package/build/* *.cache* .mypy_cache/ doxygen diff --git a/R-package/NAMESPACE b/R-package/NAMESPACE index 035f4ae45f47..8bd8caabc20f 100644 --- a/R-package/NAMESPACE +++ b/R-package/NAMESPACE @@ -10,6 +10,7 @@ S3method(getinfo,xgb.Booster) S3method(getinfo,xgb.DMatrix) S3method(length,xgb.Booster) S3method(predict,xgb.Booster) +S3method(predict,xgboost) S3method(print,xgb.Booster) S3method(print,xgb.DMatrix) S3method(print,xgb.cv.synchronous) diff --git a/R-package/R/xgb.Booster.R b/R-package/R/xgb.Booster.R index 808289b63de3..b38cd42bcef3 100644 --- a/R-package/R/xgb.Booster.R +++ b/R-package/R/xgb.Booster.R @@ -126,6 +126,8 @@ xgb.get.handle <- function(object) { #' of the iterations (rounds) otherwise. #' #' If passing "all", will use all of the rounds regardless of whether the model had early stopping or not. +#' +#' Not applicable to `gblinear` booster. #' @param strict_shape Whether to always return an array with the same dimensions for the given prediction mode #' regardless of the model type - meaning that, for example, both a multi-class and a binary classification #' model would generate output arrays with the same number of dimensions, with the 'class' dimension having @@ -144,7 +146,13 @@ xgb.get.handle <- function(object) { #' #' If passing `TRUE`, then the result will have dimensions in reverse order - for example, rows #' will be the last dimensions instead of the first dimension. -#' @param base_margin Base margin used for boosting from existing model. +#' @param base_margin Base margin used for boosting from existing model (raw score that gets added to +#' all observations independently of the trees in the model). +#' +#' If supplied, should be either a vector with length equal to the number of rows in `newdata` +#' (for objectives which produces a single score per observation), or a matrix with number of +#' rows matching to the number rows in `newdata` and number of columns matching to the number +#' of scores estimated by the model (e.g. number of classes for multi-class classification). #' #' Note that, if `newdata` is an `xgb.DMatrix` object, this argument will #' be ignored as it needs to be added to the DMatrix instead (e.g. by passing it as @@ -206,6 +214,9 @@ xgb.get.handle <- function(object) { #' For multi-class / multi-target, they will be arranged so that columns in the output will have #' the leafs from one group followed by leafs of the other group (e.g. order will be `group1:feat1`, #' `group1:feat2`, ..., `group2:feat1`, `group2:feat2`, ...). +#' +#' If there is more than one parallel tree (e.g. random forests), the parallel trees will be the +#' last grouping in the resulting order, which will still be 2D. #' \item For `predcontrib`: when not multi-class / multi-target, a matrix with dimensions #' `[nrows, nfeats+1]`. The last "+ 1" column corresponds to the baseline value. 
#' @@ -222,7 +233,7 @@ xgb.get.handle <- function(object) { #' For multi-class and multi-target, will be a 4D array with dimensions `[nrows, ngroups, nfeats+1, nfeats+1]` #' } #' -#' If passing `strict_shape=FALSE`, the result is always an array: +#' If passing `strict_shape=TRUE`, the result is always a matrix (if 2D) or array (if 3D or higher): #' - For normal predictions, the dimension is `[nrows, ngroups]`. #' - For `predcontrib=TRUE`, the dimension is `[nrows, ngroups, nfeats+1]`. #' - For `predinteraction=TRUE`, the dimension is `[nrows, ngroups, nfeats+1, nfeats+1]`. diff --git a/R-package/R/xgboost.R b/R-package/R/xgboost.R index 48a81fab34d8..c22752a3f506 100644 --- a/R-package/R/xgboost.R +++ b/R-package/R/xgboost.R @@ -949,6 +949,243 @@ xgboost <- function( return(model) } +#' @title Compute predictions from XGBoost model on new data +#' @description Predict values on data based on XGBoost model. +#' @param object An XGBoost model object of class `xgboost`, as produced by function [xgboost()]. +#' +#' Note that there is also a lower-level [predict.xgb.Booster()] method for models of class +#' `xgb.Booster` as produced by [xgb.train()], which can also be used for `xgboost` class models as +#' an alternative that performs fewer validations and post-processings. +#' @param newdata Data on which to compute predictions from the model passed in `object`. Supported +#' input classes are: +#' - Data Frames (class `data.frame` from base R and subclasses like `data.table`). +#' - Matrices (class `matrix` from base R). +#' - Sparse matrices from package `Matrix`, either as class `dgRMatrix` (CSR) or `dgCMatrix` (CSC). +#' - Sparse vectors from package `Matrix`, which will be interpreted as containing a single +#' observation. +#' +#' In the case of data frames, if there are any categorical features, they should be of class +#' `factor` and should have the same levels as the `factor` columns of the data from which the model +#' was constructed. +#' +#' If there are named columns and the model was fitted to data with named columns, they will be +#' matched by name by default (see `validate_features`). +#' @param type Type of prediction to make. Supported options are: +#' - `"response"`: will output model predictions on the scale of the response variable (e.g. +#' probabilities of belonging to the last class in the case of binary classification). Result will +#' be either a numeric vector with length matching to rows in `newdata`, or a numeric matrix with +#' shape `[nrows(newdata), nscores]` (for objectives that produce more than one score per +#' observation such as multi-class classification or multi-quantile regression). +#' - `"raw"`: will output the unprocessed boosting scores (e.g. log-odds in the case of objective +#' `binary:logistic`). Same output shape and type as for `"response"`. +#' - `"class"`: will output the class with the highest predicted probability, returned as a `factor` +#' (only applicable to classification objectives) with length matching to rows in `newdata`. +#' - `"leaf"`: will output the terminal node indices of each observation across each tree, as an +#' integer matrix of shape `[nrows(newdata), ntrees]`, or as an integer array with an extra one or +#' two dimensions, up to `[nrows(newdata), ntrees, nscores, n_parallel_trees]` for models that +#' produce more than one score per tree and/or which have more than one parallel tree (e.g. +#' random forests). +#' +#' Only applicable to tree-based boosters (not `gblinear`). 
+#' - `"contrib"`: will produce per-feature contribution estimates towards the model score for a +#' given observation, based on SHAP values. The contribution values are on the scale of +#' untransformed margin (e.g., for binary classification, the values are log-odds deviations from +#' the baseline). +#' +#' Output will be a numeric matrix with shape `[nrows, nfeatures+1]`, with the intercept being the +#' last feature, or a numeric array with shape `[nrows, nscores, nfeatures+1]` if the model +#' produces more than one score per observation. +#' - `"interaction"`: similar to `"contrib"`, but computing SHAP values of contributions of +#' interaction of each pair of features. Note that this operation might be rather expensive in +#' terms of compute and memory. +#' +#' Since it quadratically depends on the number of features, it is recommended to perform +#' selection of the most important features first. +#' +#' Output will be a numeric array of shape `[nrows, nfeatures+1, nfeatures+1]`, or shape +#' `[nrows, nscores, nfeatures+1, nfeatures+1]` (for objectives that produce more than one score +#' per observation). +#' @param base_margin Base margin used for boosting from existing model (raw score that gets added to +#' all observations independently of the trees in the model). +#' +#' If supplied, should be either a vector with length equal to the number of rows in `newdata` +#' (for objectives which produces a single score per observation), or a matrix with number of +#' rows matching to the number rows in `newdata` and number of columns matching to the number +#' of scores estimated by the model (e.g. number of classes for multi-class classification). +#' @param iteration_range Sequence of rounds/iterations from the model to use for prediction, specified by passing +#' a two-dimensional vector with the start and end numbers in the sequence (same format as R's `seq` - i.e. +#' base-1 indexing, and inclusive of both ends). +#' +#' For example, passing `c(1,20)` will predict using the first twenty iterations, while passing `c(1,1)` will +#' predict using only the first one. +#' +#' If passing `NULL`, will either stop at the best iteration if the model used early stopping, or use all +#' of the iterations (rounds) otherwise. +#' +#' If passing "all", will use all of the rounds regardless of whether the model had early stopping or not. +#' +#' Not applicable to `gblinear` booster. +#' @param validate_features Validate that the feature names in the data match to the feature names +#' in the column, and reorder them in the data otherwise. +#' +#' If passing `FALSE`, it is assumed that the feature names and types are the same, +#' and come in the same order as in the training data. +#' +#' Be aware that this only applies to column names and not to factor levels in categorical columns. +#' +#' Note that this check might add some sizable latency to the predictions, so it's +#' recommended to disable it for performance-sensitive applications. +#' @param ... Not used. +#' @return Either a numeric vector (for 1D outputs), numeric matrix (for 2D outputs), numeric array +#' (for 3D and higher), or `factor` (for class predictions). See documentation for parameter `type` +#' for details about what the output type and shape will be. 
+#' @method predict xgboost +#' @export +#' @examples +#' data("ToothGrowth") +#' y <- ToothGrowth$supp +#' x <- ToothGrowth[, -2L] +#' model <- xgboost(x, y, nthreads = 1L, nrounds = 3L, max_depth = 2L) +#' pred_prob <- predict(model, x[1:5, ], type = "response") +#' pred_raw <- predict(model, x[1:5, ], type = "raw") +#' pred_class <- predict(model, x[1:5, ], type = "class") +#' +#' # Relationships between these +#' manual_probs <- 1 / (1 + exp(-pred_raw)) +#' manual_class <- ifelse(manual_probs < 0.5, levels(y)[1], levels(y)[2]) +#' +#' # They should match up to numerical precision +#' round(pred_prob, 6) == round(manual_probs, 6) +#' pred_class == manual_class +predict.xgboost <- function( + object, + newdata, + type = "response", + base_margin = NULL, + iteration_range = NULL, + validate_features = TRUE, + ... +) { + if (inherits(newdata, "xgb.DMatrix")) { + stop( + "Predictions on 'xgb.DMatrix' objects are not supported with 'xgboost' class.", + " Try 'xgb.train' or 'predict.xgb.Booster'." + ) + } + + outputmargin <- FALSE + predleaf <- FALSE + predcontrib <- FALSE + predinteraction <- FALSE + pred_class <- FALSE + strict_shape <- FALSE + allowed_types <- c( + "response", + "raw", + "class", + "leaf", + "contrib", + "interaction" + ) + type <- head(type, 1L) + if (!is.character(type) || !(type %in% allowed_types)) { + stop("'type' must be one of: ", paste(allowed_types, collapse = ", ")) + } + + if (type != "response") { + switch( + type, + "raw" = { + outputmargin <- TRUE + }, "class" = { + if (is.null(attributes(object)$metadata$y_levels)) { + stop("Prediction type 'class' is only for classification objectives.") + } + pred_class <- TRUE + outputmargin <- TRUE + }, "leaf" = { + predleaf <- TRUE + strict_shape <- TRUE # required for 3D and 4D outputs + }, "contrib" = { + predcontrib <- TRUE + }, "interaction" = { + predinteraction <- TRUE + } + ) + } + out <- predict.xgb.Booster( + object, + newdata, + outputmargin = outputmargin, + predleaf = predleaf, + predcontrib = predcontrib, + predinteraction = predinteraction, + iterationrange = iteration_range, + strict_shape = strict_shape, + validate_features = validate_features, + base_margin = base_margin + ) + + if (strict_shape) { + # Should only end up here for leaf predictions + out_dims <- dim(out) + dims_remove <- integer() + if (out_dims[3L] == 1L) { + dims_remove <- c(dims_remove, -3L) + } + if (length(out_dims) >= 4L && out_dims[4L] == 1L) { + dims_remove <- c(dims_remove, -4L) + } + if (length(dims_remove)) { + new_dimnames <- dimnames(out)[dims_remove] + dim(out) <- out_dims[dims_remove] + dimnames(out) <- new_dimnames + } + } + + if (pred_class) { + + if (is.null(dim(out))) { + out <- as.integer(out >= 0) + 1L + } else { + out <- max.col(out, ties.method = "first") + } + attr_out <- attributes(out) + attr_out$class <- "factor" + attr_out$levels <- attributes(object)$metadata$y_levels + attributes(out) <- attr_out + + } else if (NCOL(out) > 1L || (strict_shape && length(dim(out)) >= 3L)) { + + names_use <- NULL + if (NROW(attributes(object)$metadata$y_levels) > 2L) { + names_use <- attributes(object)$metadata$y_levels + } else if (NROW(attributes(object)$metadata$y_names)) { + names_use <- attributes(object)$metadata$y_names + } else if (NROW(attributes(object)$params$quantile_alpha) > 1L) { + names_use <- paste0("q", attributes(object)$params$quantile_alpha) + if (anyDuplicated(names_use)) { + warning("Cannot add quantile names to output due to clashes in their character conversions") + names_use <- NULL + } + } + if 
(NROW(names_use)) {
+ dimnames_out <- dimnames(out)
+ dim_with_names <- if (type == "leaf") 3L else 2L
+ dimnames_out[[dim_with_names]] <- names_use
+ .Call(XGSetArrayDimNamesInplace_R, out, dimnames_out)
+ }
+
+ }
+
+ return(out)
+}
+
+#' @title Print info from XGBoost model
+#' @description Prints basic properties of an XGBoost model object.
+#' @param x An XGBoost model object of class `xgboost`, as produced by function [xgboost()].
+#' @param ... Not used.
+#' @return Same object `x`, after printing its info.
 #' @method print xgboost
 #' @export
 print.xgboost <- function(x, ...) {
diff --git a/R-package/man/predict.xgb.Booster.Rd b/R-package/man/predict.xgb.Booster.Rd
index d97984e7fa48..5cdfed97f504 100644
--- a/R-package/man/predict.xgb.Booster.Rd
+++ b/R-package/man/predict.xgb.Booster.Rd
@@ -80,7 +80,9 @@ predict using only the first one.
 If passing \code{NULL}, will either stop at the best iteration if the model used early stopping, or use all
 of the iterations (rounds) otherwise.
 
-If passing "all", will use all of the rounds regardless of whether the model had early stopping or not.}
+If passing "all", will use all of the rounds regardless of whether the model had early stopping or not.
+
+Not applicable to \code{gblinear} booster.}
 
 \item{strict_shape}{Whether to always return an array with the same dimensions for the given prediction mode
 regardless of the model type - meaning that, for example, both a multi-class and a binary classification
@@ -118,7 +120,13 @@ and come in the same order as in the training data.
 Note that this check might add some sizable latency to the predictions, so it's
 recommended to disable it for performance-sensitive applications.}
 
-\item{base_margin}{Base margin used for boosting from existing model.
+\item{base_margin}{Base margin used for boosting from existing model (raw score that gets added to
+all observations independently of the trees in the model).
+
+If supplied, should be either a vector with length equal to the number of rows in \code{newdata}
+(for objectives that produce a single score per observation), or a matrix with the number of
+rows matching the number of rows in \code{newdata} and the number of columns matching the number
+of scores estimated by the model (e.g. number of classes for multi-class classification).
 
 Note that, if \code{newdata} is an \code{xgb.DMatrix} object, this argument will be ignored as
 it needs to be added to the DMatrix instead (e.g. by passing it as
@@ -141,6 +149,9 @@ Note that objective variant \code{multi:softmax} defaults towards predicting mos
 For multi-class / multi-target, they will be arranged so that columns in the output will have the
 leafs from one group followed by leafs of the other group (e.g. order will be \code{group1:feat1},
 \code{group1:feat2}, ..., \code{group2:feat1}, \code{group2:feat2}, ...).
+
+If there is more than one parallel tree (e.g. random forests), the parallel trees will be the
+last grouping in the resulting order, which will still be 2D.
 \item For \code{predcontrib}: when not multi-class / multi-target, a matrix with dimensions
 \verb{[nrows, nfeats+1]}. The last "+ 1" column corresponds to the baseline value.
@@ -157,7 +168,7 @@ dimension should produce practically the same result as \code{predcontrib = TRUE For multi-class and multi-target, will be a 4D array with dimensions \verb{[nrows, ngroups, nfeats+1, nfeats+1]} } -If passing \code{strict_shape=FALSE}, the result is always an array: +If passing \code{strict_shape=TRUE}, the result is always a matrix (if 2D) or array (if 3D or higher): \itemize{ \item For normal predictions, the dimension is \verb{[nrows, ngroups]}. \item For \code{predcontrib=TRUE}, the dimension is \verb{[nrows, ngroups, nfeats+1]}. diff --git a/R-package/man/predict.xgboost.Rd b/R-package/man/predict.xgboost.Rd new file mode 100644 index 000000000000..15e75965aaa6 --- /dev/null +++ b/R-package/man/predict.xgboost.Rd @@ -0,0 +1,138 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/xgboost.R +\name{predict.xgboost} +\alias{predict.xgboost} +\title{Compute predictions from XGBoost model on new data} +\usage{ +\method{predict}{xgboost}( + object, + newdata, + type = "response", + base_margin = NULL, + iteration_range = NULL, + validate_features = TRUE, + ... +) +} +\arguments{ +\item{object}{An XGBoost model object of class \code{xgboost}, as produced by function \code{\link[=xgboost]{xgboost()}}. + +Note that there is also a lower-level \code{\link[=predict.xgb.Booster]{predict.xgb.Booster()}} method for models of class +\code{xgb.Booster} as produced by \code{\link[=xgb.train]{xgb.train()}}, which can also be used for \code{xgboost} class models as +an alternative that performs fewer validations and post-processings.} + +\item{newdata}{Data on which to compute predictions from the model passed in \code{object}. Supported +input classes are: +\itemize{ +\item Data Frames (class \code{data.frame} from base R and subclasses like \code{data.table}). +\item Matrices (class \code{matrix} from base R). +\item Sparse matrices from package \code{Matrix}, either as class \code{dgRMatrix} (CSR) or \code{dgCMatrix} (CSC). +\item Sparse vectors from package \code{Matrix}, which will be interpreted as containing a single +observation. +} + +In the case of data frames, if there are any categorical features, they should be of class +\code{factor} and should have the same levels as the \code{factor} columns of the data from which the model +was constructed. + +If there are named columns and the model was fitted to data with named columns, they will be +matched by name by default (see \code{validate_features}).} + +\item{type}{Type of prediction to make. Supported options are: +\itemize{ +\item \code{"response"}: will output model predictions on the scale of the response variable (e.g. +probabilities of belonging to the last class in the case of binary classification). Result will +be either a numeric vector with length matching to rows in \code{newdata}, or a numeric matrix with +shape \verb{[nrows(newdata), nscores]} (for objectives that produce more than one score per +observation such as multi-class classification or multi-quantile regression). +\item \code{"raw"}: will output the unprocessed boosting scores (e.g. log-odds in the case of objective +\code{binary:logistic}). Same output shape and type as for \code{"response"}. +\item \code{"class"}: will output the class with the highest predicted probability, returned as a \code{factor} +(only applicable to classification objectives) with length matching to rows in \code{newdata}. 
+\item \code{"leaf"}: will output the terminal node indices of each observation across each tree, as an
+integer matrix of shape \verb{[nrows(newdata), ntrees]}, or as an integer array with an extra one or
+two dimensions, up to \verb{[nrows(newdata), ntrees, nscores, n_parallel_trees]} for models that
+produce more than one score per tree and/or which have more than one parallel tree (e.g.
+random forests).
+
+Only applicable to tree-based boosters (not \code{gblinear}).
+\item \code{"contrib"}: will produce per-feature contribution estimates towards the model score for a
+given observation, based on SHAP values. The contribution values are on the scale of
+untransformed margin (e.g., for binary classification, the values are log-odds deviations from
+the baseline).
+
+Output will be a numeric matrix with shape \verb{[nrows, nfeatures+1]}, with the intercept being the
+last feature, or a numeric array with shape \verb{[nrows, nscores, nfeatures+1]} if the model
+produces more than one score per observation.
+\item \code{"interaction"}: similar to \code{"contrib"}, but computing SHAP values of contributions of
+interaction of each pair of features. Note that this operation might be rather expensive in
+terms of compute and memory.
+
+Since it quadratically depends on the number of features, it is recommended to perform
+selection of the most important features first.
+
+Output will be a numeric array of shape \verb{[nrows, nfeatures+1, nfeatures+1]}, or shape
+\verb{[nrows, nscores, nfeatures+1, nfeatures+1]} (for objectives that produce more than one score
+per observation).
+}}
+
+\item{base_margin}{Base margin used for boosting from existing model (raw score that gets added to
+all observations independently of the trees in the model).
+
+If supplied, should be either a vector with length equal to the number of rows in \code{newdata}
+(for objectives that produce a single score per observation), or a matrix with the number of
+rows matching the number of rows in \code{newdata} and the number of columns matching the number
+of scores estimated by the model (e.g. number of classes for multi-class classification).}
+
+\item{iteration_range}{Sequence of rounds/iterations from the model to use for prediction, specified by passing
+a vector of length two with the start and end numbers in the sequence (same format as R's \code{seq} - i.e.
+base-1 indexing, and inclusive of both ends).
+
+For example, passing \code{c(1,20)} will predict using the first twenty iterations, while passing \code{c(1,1)} will
+predict using only the first one.
+
+If passing \code{NULL}, will either stop at the best iteration if the model used early stopping, or use all
+of the iterations (rounds) otherwise.
+
+If passing "all", will use all of the rounds regardless of whether the model had early stopping or not.
+
+Not applicable to \code{gblinear} booster.}
+
+\item{validate_features}{Validate that the feature names in the data match the feature names
+in the model, and reorder them in the data otherwise.
+
+If passing \code{FALSE}, it is assumed that the feature names and types are the same,
+and come in the same order as in the training data.
+
+Be aware that this only applies to column names and not to factor levels in categorical columns.
+ +Note that this check might add some sizable latency to the predictions, so it's +recommended to disable it for performance-sensitive applications.} + +\item{...}{Not used.} +} +\value{ +Either a numeric vector (for 1D outputs), numeric matrix (for 2D outputs), numeric array +(for 3D and higher), or \code{factor} (for class predictions). See documentation for parameter \code{type} +for details about what the output type and shape will be. +} +\description{ +Predict values on data based on XGBoost model. +} +\examples{ +data("ToothGrowth") +y <- ToothGrowth$supp +x <- ToothGrowth[, -2L] +model <- xgboost(x, y, nthreads = 1L, nrounds = 3L, max_depth = 2L) +pred_prob <- predict(model, x[1:5, ], type = "response") +pred_raw <- predict(model, x[1:5, ], type = "raw") +pred_class <- predict(model, x[1:5, ], type = "class") + +# Relationships between these +manual_probs <- 1 / (1 + exp(-pred_raw)) +manual_class <- ifelse(manual_probs < 0.5, levels(y)[1], levels(y)[2]) + +# They should match up to numerical precision +round(pred_prob, 6) == round(manual_probs, 6) +pred_class == manual_class +} diff --git a/R-package/man/print.xgboost.Rd b/R-package/man/print.xgboost.Rd new file mode 100644 index 000000000000..235f3e36bdd0 --- /dev/null +++ b/R-package/man/print.xgboost.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/xgboost.R +\name{print.xgboost} +\alias{print.xgboost} +\title{Print info from XGBoost model} +\usage{ +\method{print}{xgboost}(x, ...) +} +\arguments{ +\item{x}{An XGBoost model object of class \code{xgboost}, as produced by function \code{\link[=xgboost]{xgboost()}}.} + +\item{...}{Not used.} +} +\value{ +Same object \code{x}, after printing its info. +} +\description{ +Prints basic properties of an XGBoost model object. 
+} diff --git a/R-package/tests/testthat/test_xgboost.R b/R-package/tests/testthat/test_xgboost.R index a4ac658a11b8..8f0c1e7ba9a7 100644 --- a/R-package/tests/testthat/test_xgboost.R +++ b/R-package/tests/testthat/test_xgboost.R @@ -1,5 +1,8 @@ library(survival) library(data.table) +data("iris") +data("mtcars") +data("ToothGrowth") test_that("Auto determine objective", { y_num <- seq(1, 10) @@ -621,3 +624,324 @@ test_that("Whole function works", { expect_true(any(grepl("Number of iterations: 5", txt, fixed = TRUE))) expect_true(any(grepl("Number of features: 8", txt, fixed = TRUE))) }) + +test_that("Can predict probabilities and raw scores", { + y <- ToothGrowth$supp + x <- ToothGrowth[, -2L] + model <- xgboost(x, y, nthreads = 1L, nrounds = 3L, max_depth = 2L) + pred_prob <- predict(model, x, type = "response") + pred_raw <- predict(model, x, type = "raw") + expect_true(is.vector(pred_prob)) + expect_equal(length(pred_prob), nrow(x)) + expect_true(min(pred_prob) >= 0) + expect_true(max(pred_prob) <= 1) + + expect_equal(length(pred_raw), nrow(x)) + expect_true(is.vector(pred_raw)) + expect_true(min(pred_raw) < 0) + expect_true(max(pred_raw) > 0) + + expect_equal( + pred_prob, + 1 / (1 + exp(-pred_raw)), + tolerance = 1e-6 + ) +}) + +test_that("Can predict class", { + y <- iris$Species + x <- iris[, -5L] + model <- xgboost(x, y, nthreads = 1L, nrounds = 3L, max_depth = 2L) + pred_class <- predict(model, x, type = "class") + expect_true(is.factor(pred_class)) + expect_equal(levels(pred_class), levels(y)) + + y <- ToothGrowth$supp + x <- ToothGrowth[, -2L] + model <- xgboost(x, y, nthreads = 1L, nrounds = 3L, max_depth = 2L) + pred_class <- predict(model, x, type = "class") + expect_true(is.factor(pred_class)) + expect_equal(levels(pred_class), levels(y)) + + probs <- predict(model, x, type = "response") + expect_true(all(pred_class[probs >= 0.5] == levels(y)[[2L]])) + expect_true(all(pred_class[probs < 0.5] == levels(y)[[1L]])) + + # Check that it fails for regression models + y <- mtcars$mpg + x <- mtcars[, -1L] + model <- xgboost(x, y, nthreads = 1L, nrounds = 3L, max_depth = 2L) + expect_error({ + predict(model, x, type = "class") + }) +}) + +test_that("Metadata survives serialization", { + y <- iris$Species + x <- iris[, -5L] + model_fresh <- xgboost(x, y, nthreads = 1L, nrounds = 3L, max_depth = 2L) + temp_file <- file.path(tempdir(), "xgb_model.Rds") + saveRDS(model_fresh, temp_file) + model <- readRDS(temp_file) + pred_class <- predict(model, x, type = "class") + expect_true(is.factor(pred_class)) + expect_equal(levels(pred_class), levels(y)) +}) + +test_that("Column names aren't added when not appropriate", { + pred_types <- c( + "response", + "raw", + "leaf" + ) + for (pred_type in pred_types) { + y <- mtcars$mpg + x <- mtcars[, -1L] + model <- xgboost( + x, + y, + nthreads = 1L, + nrounds = 3L, + max_depth = 2L, + objective = "reg:quantileerror", + quantile_alpha = 0.5 + ) + pred <- predict(model, x, type = pred_type) + if (pred_type %in% c("raw", "response")) { + expect_true(is.vector(pred)) + } else { + expect_true(length(dim(pred)) >= 2L) + expect_null(colnames(pred)) + } + + y <- ToothGrowth$supp + x <- ToothGrowth[, -2L] + model <- xgboost(x, y, nthreads = 1L, nrounds = 3L, max_depth = 2L) + pred <- predict(model, x, type = pred_type) + if (pred_type %in% c("raw", "response")) { + expect_true(is.vector(pred)) + } else { + expect_true(length(dim(pred)) >= 2L) + expect_null(colnames(pred)) + } + } +}) + +test_that("Column names from multiclass are added to non-class 
predictions", { + y <- iris$Species + x <- iris[, -5L] + model <- xgboost(x, y, nthreads = 1L, nrounds = 3L, max_depth = 2L) + + pred_types_with_colnames <- c( + "response", + "raw", + "contrib", + "interaction" + ) + + for (pred_type in pred_types_with_colnames) { + pred <- predict(model, x, type = pred_type) + expect_equal(nrow(pred), nrow(x)) + expect_equal(ncol(pred), 3L) + expect_equal(colnames(pred), levels(y)) + } +}) + +test_that("Column names from multitarget are added to predictions", { + y <- data.frame( + ylog = log(mtcars$mpg), + ysqrt = sqrt(mtcars$mpg) + ) + x <- mtcars[, -1L] + model <- xgboost(x, y, nthreads = 1L, nrounds = 3L, max_depth = 2L) + + pred_types_with_colnames <- c( + "response", + "raw", + "contrib", + "interaction" + ) + + for (pred_type in pred_types_with_colnames) { + pred <- predict(model, x, type = pred_type) + expect_equal(nrow(pred), nrow(x)) + expect_equal(ncol(pred), 2L) + expect_equal(colnames(pred), colnames(y)) + } +}) + +test_that("Column names from multiquantile are added to predictions", { + y <- mtcars$mpg + x <- mtcars[, -1L] + model <- xgboost( + x, + y, + nthreads = 1L, + nrounds = 3L, + max_depth = 2L, + objective = "reg:quantileerror", + quantile_alpha = c(0.25, 0.5, 0.75) + ) + + pred_types_with_colnames <- c( + "response", + "raw", + "contrib", + "interaction" + ) + + for (pred_type in pred_types_with_colnames) { + pred <- predict(model, x, type = pred_type) + expect_equal(nrow(pred), nrow(x)) + expect_equal(ncol(pred), 3L) + expect_equal(colnames(pred), c("q0.25", "q0.5", "q0.75")) + } +}) + +test_that("Leaf predictions have multiple dimensions when needed", { + # single score, multiple trees + y <- mtcars$mpg + x <- mtcars[, -1L] + model <- xgboost( + x, + y, + nthreads = 1L, + nrounds = 4L, + max_depth = 2L, + objective = "reg:quantileerror", + quantile_alpha = 0.5 + ) + pred <- predict(model, x, type = "leaf") + expect_equal(dim(pred), c(nrow(x), 4L)) + expect_equal(row.names(pred), row.names(x)) + expect_null(colnames(pred)) + + # single score, single tree + model <- xgboost( + x, + y, + nthreads = 1L, + nrounds = 1L, + max_depth = 2L, + objective = "reg:quantileerror", + quantile_alpha = 0.5 + ) + pred <- predict(model, x, type = "leaf") + expect_equal(dim(pred), c(nrow(x), 1L)) + expect_equal(row.names(pred), row.names(x)) + expect_null(colnames(pred)) + + # multiple score, multiple trees + model <- xgboost( + x, + y, + nthreads = 1L, + nrounds = 4L, + max_depth = 2L, + objective = "reg:quantileerror", + quantile_alpha = c(0.25, 0.5, 0.75) + ) + pred <- predict(model, x, type = "leaf") + expect_equal(dim(pred), c(nrow(x), 4L, 3L)) + expect_equal(row.names(pred), row.names(x)) + expect_null(colnames(pred)) + expect_equal(dimnames(pred)[[3L]], c("q0.25", "q0.5", "q0.75")) + + # multiple score, single tree + model <- xgboost( + x, + y, + nthreads = 1L, + nrounds = 1L, + max_depth = 2L, + objective = "reg:quantileerror", + quantile_alpha = c(0.25, 0.5, 0.75) + ) + pred <- predict(model, x, type = "leaf") + expect_equal(dim(pred), c(nrow(x), 1L, 3L)) + expect_equal(row.names(pred), row.names(x)) + expect_null(colnames(pred)) + expect_equal(dimnames(pred)[[3L]], c("q0.25", "q0.5", "q0.75")) + + # parallel trees, single tree, single score + model <- xgboost( + x, + y, + nthreads = 1L, + nrounds = 1L, + max_depth = 2L, + objective = "count:poisson", + num_parallel_tree = 2L + ) + pred <- predict(model, x, type = "leaf") + expect_equal(dim(pred), c(nrow(x), 1L, 2L)) + expect_equal(row.names(pred), row.names(x)) + 
expect_null(colnames(pred)) + expect_null(dimnames(pred)[[3L]]) + + # num_parallel_tree>1 + multiple scores is not supported at the moment so no test for it. +}) + +test_that("Column names from multiclass are added to leaf predictions", { + y <- iris$Species + x <- iris[, -5L] + model <- xgboost(x, y, nthreads = 1L, nrounds = 4L, max_depth = 2L) + pred <- predict(model, x, type = "leaf") + expect_equal(dim(pred), c(nrow(x), 4L, 3L)) + expect_equal(dimnames(pred)[[3L]], levels(y)) + + # Check also for a single tree + model <- xgboost(x, y, nthreads = 1L, nrounds = 1L, max_depth = 2L) + pred <- predict(model, x, type = "leaf") + expect_equal(dim(pred), c(nrow(x), 1L, 3L)) + expect_equal(dimnames(pred)[[3L]], levels(y)) +}) + +test_that("Column names from multitarget are added to leaf predictions", { + y <- data.frame( + ylog = log(mtcars$mpg), + ysqrt = sqrt(mtcars$mpg) + ) + x <- mtcars[, -1L] + model <- xgboost(x, y, nthreads = 1L, nrounds = 4L, max_depth = 2L) + pred <- predict(model, x, type = "leaf") + expect_equal(dim(pred), c(nrow(x), 4L, 2L)) + expect_equal(dimnames(pred)[[3L]], colnames(y)) + + # Check also for a single tree + model <- xgboost(x, y, nthreads = 1L, nrounds = 1L, max_depth = 2L) + pred <- predict(model, x, type = "leaf") + expect_equal(dim(pred), c(nrow(x), 1L, 2L)) + expect_equal(dimnames(pred)[[3L]], colnames(y)) +}) + +test_that("Column names from multiquantile are added to leaf predictions", { + y <- mtcars$mpg + x <- mtcars[, -1L] + model <- xgboost( + x, + y, + nthreads = 1L, + nrounds = 4L, + max_depth = 2L, + objective = "reg:quantileerror", + quantile_alpha = c(0.25, 0.5, 0.75) + ) + pred <- predict(model, x, type = "leaf") + expect_equal(dim(pred), c(nrow(x), 4L, 3L)) + expect_equal(dimnames(pred)[[3L]], c("q0.25", "q0.5", "q0.75")) + + # Check also for a single tree + model <- xgboost( + x, + y, + nthreads = 1L, + nrounds = 1L, + max_depth = 2L, + objective = "reg:quantileerror", + quantile_alpha = c(0.25, 0.5, 0.75) + ) + pred <- predict(model, x, type = "leaf") + expect_equal(dim(pred), c(nrow(x), 1L, 3L)) + expect_equal(dimnames(pred)[[3L]], c("q0.25", "q0.5", "q0.75")) +}) From 544a52e6ae438871024123b5de92b2314ddb3f78 Mon Sep 17 00:00:00 2001 From: Bobby Wang Date: Wed, 4 Dec 2024 23:53:58 +0800 Subject: [PATCH 07/16] [pyspark] LTR: distribute the features with same group into same partition (#11047) --- python-package/xgboost/spark/core.py | 55 ++++++++----------- .../test_with_spark/test_spark_local.py | 15 ++++- 2 files changed, 38 insertions(+), 32 deletions(-) diff --git a/python-package/xgboost/spark/core.py b/python-package/xgboost/spark/core.py index 32d7c1e490c8..689e747e8a5c 100644 --- a/python-package/xgboost/spark/core.py +++ b/python-package/xgboost/spark/core.py @@ -475,10 +475,7 @@ def _validate_params(self) -> None: ) if self.getOrDefault("early_stopping_rounds") is not None: - if not ( - self.isDefined(self.validationIndicatorCol) - and self.getOrDefault(self.validationIndicatorCol) != "" - ): + if not self._col_is_defined_not_empty(self.validationIndicatorCol): raise ValueError( "If 'early_stopping_rounds' param is set, you need to set " "'validation_indicator_col' param as well." 
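From the user's side, the behaviour this commit adjusts is driven by the ranker's `qid_col`; a minimal, hypothetical sketch (the `spark` session and the column names are assumptions made for illustration, not taken from the patch) looks like:

    from pyspark.ml.feature import VectorAssembler
    from xgboost.spark import SparkXGBRanker

    # Toy ranking data: two query groups with two numeric features each.
    df = spark.createDataFrame(
        [(1.0, 1.0, 2.0, 0), (0.0, 2.0, 1.0, 0), (1.0, 0.5, 0.5, 1), (0.0, 0.1, 0.9, 1)],
        ["label", "f1", "f2", "qid"],
    )
    df = VectorAssembler(inputCols=["f1", "f2"], outputCol="features").transform(df)

    # With qid_col set, the estimator repartitions the input by query ID so that rows
    # belonging to the same group land in the same partition before building the DMatrix.
    ranker = SparkXGBRanker(qid_col="qid", num_workers=2)
    model = ranker.fit(df)
    preds = model.transform(df)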
@@ -517,6 +514,9 @@ def _run_on_gpu(self) -> bool: or self.getOrDefault(self.getParam("tree_method")) == "gpu_hist" ) + def _col_is_defined_not_empty(self, param: "Param[str]") -> bool: + return self.isDefined(param) and self.getOrDefault(param) != "" + def _validate_and_convert_feature_col_as_float_col_list( dataset: DataFrame, features_col_names: List[str] @@ -805,16 +805,13 @@ def _prepare_input_columns_and_feature_prop( ) select_cols.append(features_array_col) - if self.isDefined(self.weightCol) and self.getOrDefault(self.weightCol) != "": + if self._col_is_defined_not_empty(self.weightCol): select_cols.append( col(self.getOrDefault(self.weightCol)).alias(alias.weight) ) has_validation_col = False - if ( - self.isDefined(self.validationIndicatorCol) - and self.getOrDefault(self.validationIndicatorCol) != "" - ): + if self._col_is_defined_not_empty(self.validationIndicatorCol): select_cols.append( col(self.getOrDefault(self.validationIndicatorCol)).alias(alias.valid) ) @@ -823,15 +820,12 @@ def _prepare_input_columns_and_feature_prop( # which will cause exception or hanging issue when creating DMatrix. has_validation_col = True - if ( - self.isDefined(self.base_margin_col) - and self.getOrDefault(self.base_margin_col) != "" - ): + if self._col_is_defined_not_empty(self.base_margin_col): select_cols.append( col(self.getOrDefault(self.base_margin_col)).alias(alias.margin) ) - if self.isDefined(self.qid_col) and self.getOrDefault(self.qid_col) != "": + if self._col_is_defined_not_empty(self.qid_col): select_cols.append(col(self.getOrDefault(self.qid_col)).alias(alias.qid)) feature_prop = FeatureProp( @@ -862,17 +856,22 @@ def _prepare_input(self, dataset: DataFrame) -> Tuple[DataFrame, FeatureProp]: ) if self._repartition_needed(dataset): - # If validationIndicatorCol defined, and if user unionise train and validation - # dataset, users must set force_repartition to true to force repartition. - # Or else some partitions might contain only train or validation dataset. - if self.getOrDefault(self.repartition_random_shuffle): - # In some cases, spark round-robin repartition might cause data skew - # use random shuffle can address it. - dataset = dataset.repartition(num_workers, rand(1)) + if self._col_is_defined_not_empty(self.qid_col): + # For ranking problem, we need to try best the put the instances with + # same group into the same partition + dataset = dataset.repartitionByRange(num_workers, alias.qid) else: - dataset = dataset.repartition(num_workers) + # If validationIndicatorCol defined, and if user unionise train and validation + # dataset, users must set force_repartition to true to force repartition. + # Or else some partitions might contain only train or validation dataset. + if self.getOrDefault(self.repartition_random_shuffle): + # In some cases, spark round-robin repartition might cause data skew + # use random shuffle can address it. 
+ dataset = dataset.repartition(num_workers, rand(1)) + else: + dataset = dataset.repartition(num_workers) - if self.isDefined(self.qid_col) and self.getOrDefault(self.qid_col) != "": + if self._col_is_defined_not_empty(self.qid_col): # XGBoost requires qid to be sorted for each partition dataset = dataset.sortWithinPartitions(alias.qid, ascending=True) @@ -1306,10 +1305,7 @@ def _get_feature_col( def _get_pred_contrib_col_name(self) -> Optional[str]: """Return the pred_contrib_col col name""" pred_contrib_col_name = None - if ( - self.isDefined(self.pred_contrib_col) - and self.getOrDefault(self.pred_contrib_col) != "" - ): + if self._col_is_defined_not_empty(self.pred_contrib_col): pred_contrib_col_name = self.getOrDefault(self.pred_contrib_col) return pred_contrib_col_name @@ -1413,10 +1409,7 @@ def _transform(self, dataset: DataFrame) -> DataFrame: xgb_sklearn_model = self._xgb_sklearn_model base_margin_col = None - if ( - self.isDefined(self.base_margin_col) - and self.getOrDefault(self.base_margin_col) != "" - ): + if self._col_is_defined_not_empty(self.base_margin_col): base_margin_col = col(self.getOrDefault(self.base_margin_col)).alias( alias.margin ) diff --git a/tests/test_distributed/test_with_spark/test_spark_local.py b/tests/test_distributed/test_with_spark/test_spark_local.py index 1f8374e06d11..79569c7fd373 100644 --- a/tests/test_distributed/test_with_spark/test_spark_local.py +++ b/tests/test_distributed/test_with_spark/test_spark_local.py @@ -4,7 +4,7 @@ import tempfile import uuid from collections import namedtuple -from typing import Generator, Sequence +from typing import Generator, Iterable, List, Sequence import numpy as np import pytest @@ -1794,3 +1794,16 @@ def test_ranker_qid_sorted(self, ltr_data: LTRData) -> None: assert ranker.getOrDefault(ranker.objective) == "rank:ndcg" model = ranker.fit(ltr_data.df_train_1) model.transform(ltr_data.df_test).collect() + + def test_ranker_same_qid_in_same_partition(self, ltr_data: LTRData) -> None: + ranker = SparkXGBRanker(qid_col="qid", num_workers=4, force_repartition=True) + df, _ = ranker._prepare_input(ltr_data.df_train_1) + + def f(iterator: Iterable) -> List[int]: + yield list(set(iterator)) + + rows = df.select("qid").rdd.mapPartitions(f).collect() + assert len(rows) == 4 + for row in rows: + assert len(row) == 1 + assert row[0].qid in [6, 7, 8, 9] From bb2e701bf30fe75a0a4b48e599a7d64020325727 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Thu, 5 Dec 2024 03:27:42 +0800 Subject: [PATCH 08/16] [Dask] Sort and partition for ranking. (#11007) - Implement automatic local sort. - Implement partitioning by query ID. - Document for distributed ranking. 
--- demo/dask/dask_learning_to_rank.py | 201 ++++++++++++++++++ demo/guide-python/learning_to_rank.py | 10 +- doc/tutorials/dask.rst | 62 +++++- doc/tutorials/learning_to_rank.rst | 18 +- python-package/xgboost/core.py | 6 +- python-package/xgboost/dask/__init__.py | 134 +++++++++++- python-package/xgboost/dask/data.py | 171 ++++++++++++++- python-package/xgboost/testing/__init__.py | 13 +- python-package/xgboost/testing/dask.py | 79 ++++++- src/data/data.cc | 4 +- src/objective/lambdarank_obj.cc | 3 +- src/objective/lambdarank_obj.cuh | 11 +- tests/ci_build/lint_python.py | 3 +- .../test_gpu_with_dask/conftest.py | 6 +- .../test_gpu_with_dask/test_gpu_demos.py | 6 +- .../test_gpu_with_dask/test_gpu_ranking.py | 18 ++ .../test_with_dask/test_ranking.py | 11 +- 17 files changed, 699 insertions(+), 57 deletions(-) create mode 100644 demo/dask/dask_learning_to_rank.py create mode 100644 tests/test_distributed/test_gpu_with_dask/test_gpu_ranking.py diff --git a/demo/dask/dask_learning_to_rank.py b/demo/dask/dask_learning_to_rank.py new file mode 100644 index 000000000000..c08450fec56e --- /dev/null +++ b/demo/dask/dask_learning_to_rank.py @@ -0,0 +1,201 @@ +""" +Learning to rank with the Dask Interface +======================================== + + .. versionadded:: 3.0.0 + +This is a demonstration of using XGBoost for learning to rank tasks using the +MSLR_10k_letor dataset. For more infomation about the dataset, please visit its +`description page `_. + +See :ref:`ltr-dist` for a general description for distributed learning to rank and +:ref:`ltr-dask` for Dask-specific features. + +""" + +from __future__ import annotations + +import argparse +import os +from contextlib import contextmanager +from typing import Generator + +import dask +import numpy as np +from dask import dataframe as dd +from distributed import Client, LocalCluster, wait +from sklearn.datasets import load_svmlight_file + +from xgboost import dask as dxgb + + +def load_mslr_10k( + device: str, data_path: str, cache_path: str +) -> tuple[dd.DataFrame, dd.DataFrame, dd.DataFrame]: + """Load the MSLR10k dataset from data_path and save parquet files in the cache_path.""" + root_path = os.path.expanduser(args.data) + cache_path = os.path.expanduser(args.cache) + + # Use only the Fold1 for demo: + # Train, Valid, Test + # {S1,S2,S3}, S4, S5 + fold = 1 + + if not os.path.exists(cache_path): + os.mkdir(cache_path) + fold_path = os.path.join(root_path, f"Fold{fold}") + train_path = os.path.join(fold_path, "train.txt") + valid_path = os.path.join(fold_path, "vali.txt") + test_path = os.path.join(fold_path, "test.txt") + + X_train, y_train, qid_train = load_svmlight_file( + train_path, query_id=True, dtype=np.float32 + ) + columns = [f"f{i}" for i in range(X_train.shape[1])] + X_train = dd.from_array(X_train.toarray(), columns=columns) + y_train = y_train.astype(np.int32) + qid_train = qid_train.astype(np.int32) + + X_train["y"] = dd.from_array(y_train) + X_train["qid"] = dd.from_array(qid_train) + X_train.to_parquet(os.path.join(cache_path, "train"), engine="pyarrow") + + X_valid, y_valid, qid_valid = load_svmlight_file( + valid_path, query_id=True, dtype=np.float32 + ) + X_valid = dd.from_array(X_valid.toarray(), columns=columns) + y_valid = y_valid.astype(np.int32) + qid_valid = qid_valid.astype(np.int32) + + X_valid["y"] = dd.from_array(y_valid) + X_valid["qid"] = dd.from_array(qid_valid) + X_valid.to_parquet(os.path.join(cache_path, "valid"), engine="pyarrow") + + X_test, y_test, qid_test = load_svmlight_file( + test_path, 
query_id=True, dtype=np.float32 + ) + + X_test = dd.from_array(X_test.toarray(), columns=columns) + y_test = y_test.astype(np.int32) + qid_test = qid_test.astype(np.int32) + + X_test["y"] = dd.from_array(y_test) + X_test["qid"] = dd.from_array(qid_test) + X_test.to_parquet(os.path.join(cache_path, "test"), engine="pyarrow") + + df_train = dd.read_parquet( + os.path.join(cache_path, "train"), calculate_divisions=True + ) + df_valid = dd.read_parquet( + os.path.join(cache_path, "valid"), calculate_divisions=True + ) + df_test = dd.read_parquet( + os.path.join(cache_path, "test"), calculate_divisions=True + ) + + return df_train, df_valid, df_test + + +def ranking_demo(client: Client, args: argparse.Namespace) -> None: + """Learning to rank with data sorted locally.""" + df_tr, df_va, _ = load_mslr_10k(args.device, args.data, args.cache) + + X_train: dd.DataFrame = df_tr[df_tr.columns.difference(["y", "qid"])] + y_train = df_tr[["y", "qid"]] + Xy_train = dxgb.DaskQuantileDMatrix(client, X_train, y_train.y, qid=y_train.qid) + + X_valid: dd.DataFrame = df_va[df_va.columns.difference(["y", "qid"])] + y_valid = df_va[["y", "qid"]] + Xy_valid = dxgb.DaskQuantileDMatrix( + client, X_valid, y_valid.y, qid=y_valid.qid, ref=Xy_train + ) + # Upon training, you will see a performance warning about sorting data based on + # query groups. + dxgb.train( + client, + {"objective": "rank:ndcg", "device": args.device}, + Xy_train, + evals=[(Xy_train, "Train"), (Xy_valid, "Valid")], + num_boost_round=100, + ) + + +def ranking_wo_split_demo(client: Client, args: argparse.Namespace) -> None: + """Learning to rank with data partitioned according to query groups.""" + df_tr, df_va, df_te = load_mslr_10k(args.device, args.data, args.cache) + + X_tr = df_tr[df_tr.columns.difference(["y", "qid"])] + X_va = df_va[df_va.columns.difference(["y", "qid"])] + + # `allow_group_split=False` makes sure data is partitioned according to the query + # groups. + ltr = dxgb.DaskXGBRanker(allow_group_split=False, device=args.device) + ltr.client = client + ltr = ltr.fit( + X_tr, + df_tr.y, + qid=df_tr.qid, + eval_set=[(X_tr, df_tr.y), (X_va, df_va.y)], + eval_qid=[df_tr.qid, df_va.qid], + verbose=True, + ) + + df_te = df_te.persist() + wait([df_te]) + + X_te = df_te[df_te.columns.difference(["y", "qid"])] + predt = ltr.predict(X_te) + y = client.compute(df_te.y) + wait([predt, y]) + + +@contextmanager +def gen_client(device: str) -> Generator[Client, None, None]: + match device: + case "cuda": + from dask_cuda import LocalCUDACluster + + with LocalCUDACluster() as cluster: + with Client(cluster) as client: + with dask.config.set( + { + "array.backend": "cupy", + "dataframe.backend": "cudf", + } + ): + yield client + case "cpu": + with LocalCluster() as cluster: + with Client(cluster) as client: + yield client + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Demonstration of learning to rank using XGBoost." 
+ ) + parser.add_argument( + "--data", + type=str, + help="Root directory of the MSLR-WEB10K data.", + required=True, + ) + parser.add_argument( + "--cache", + type=str, + help="Directory for caching processed data.", + required=True, + ) + parser.add_argument("--device", choices=["cpu", "cuda"], default="cpu") + parser.add_argument( + "--no-split", + action="store_true", + help="Flag to indicate query groups should not be split.", + ) + args = parser.parse_args() + + with gen_client(args.device) as client: + if args.no_split: + ranking_wo_split_demo(client, args) + else: + ranking_demo(client, args) diff --git a/demo/guide-python/learning_to_rank.py b/demo/guide-python/learning_to_rank.py index b131b31f76f6..fbc1f44baf50 100644 --- a/demo/guide-python/learning_to_rank.py +++ b/demo/guide-python/learning_to_rank.py @@ -12,8 +12,8 @@ train on relevance degree, and the second part simulates click data and enable the position debiasing training. -For an overview of learning to rank in XGBoost, please see -:doc:`Learning to Rank `. +For an overview of learning to rank in XGBoost, please see :doc:`Learning to Rank +`. """ from __future__ import annotations @@ -31,7 +31,7 @@ from xgboost.testing.data import RelDataCV, simulate_clicks, sort_ltr_samples -def load_mlsr_10k(data_path: str, cache_path: str) -> RelDataCV: +def load_mslr_10k(data_path: str, cache_path: str) -> RelDataCV: """Load the MSLR10k dataset from data_path and cache a pickle object in cache_path. Returns @@ -89,7 +89,7 @@ def load_mlsr_10k(data_path: str, cache_path: str) -> RelDataCV: def ranking_demo(args: argparse.Namespace) -> None: """Demonstration for learning to rank with relevance degree.""" - data = load_mlsr_10k(args.data, args.cache) + data = load_mslr_10k(args.data, args.cache) # Sort data according to query index X_train, y_train, qid_train = data.train @@ -123,7 +123,7 @@ def ranking_demo(args: argparse.Namespace) -> None: def click_data_demo(args: argparse.Namespace) -> None: """Demonstration for learning to rank with click data.""" - data = load_mlsr_10k(args.data, args.cache) + data = load_mslr_10k(args.data, args.cache) train, test = simulate_clicks(data) assert test is not None diff --git a/doc/tutorials/dask.rst b/doc/tutorials/dask.rst index 6e68d83a0083..036b1e725d47 100644 --- a/doc/tutorials/dask.rst +++ b/doc/tutorials/dask.rst @@ -355,15 +355,18 @@ Working with asyncio .. versionadded:: 1.2.0 -XGBoost's dask interface supports the new ``asyncio`` in Python and can be integrated into -asynchronous workflows. For using dask with asynchronous operations, please refer to -`this dask example `_ and document in -`distributed `_. To use XGBoost's -dask interface asynchronously, the ``client`` which is passed as an argument for training and -prediction must be operating in asynchronous mode by specifying ``asynchronous=True`` when the -``client`` is created (example below). All functions (including ``DaskDMatrix``) provided -by the functional interface will then return coroutines which can then be awaited to retrieve -their result. +XGBoost's dask interface supports the new :py:mod:`asyncio` in Python and can be +integrated into asynchronous workflows. For using dask with asynchronous operations, +please refer to `this dask example +`_ and document in `distributed +`_. 
To use XGBoost's Dask
+interface asynchronously, the ``client`` which is passed as an argument for training and
+prediction must be operating in asynchronous mode by specifying ``asynchronous=True`` when
+the ``client`` is created (example below). All functions (including ``DaskDMatrix``)
+provided by the functional interface will then return coroutines which can then be awaited
+to retrieve their result. Please note that XGBoost is a compute-bound application, where
+parallelism is more important than concurrency. The support for `asyncio` is more about
+compatibility than performance gain.
 
 Functional interface:
 
@@ -526,6 +529,47 @@ See https://github.com/coiled/dask-xgboost-nyctaxi for a set of examples of usin
 with dask and optuna.
 
 
+.. _ltr-dask:
+
+****************
+Learning to Rank
+****************
+
+  .. versionadded:: 3.0.0
+
+  .. note::
+
+     Position debiasing is not yet supported.
+
+There are two operation modes in the Dask learning to rank implementation, for performance
+reasons. The difference is whether a distributed global sort is needed. Please see
+:ref:`ltr-dist` for how ranking works with distributed training in general. Below we will
+discuss some of the Dask-specific features.
+
+First, if you use the :py:class:`~xgboost.dask.DaskQuantileDMatrix` interface or the
+:py:class:`~xgboost.dask.DaskXGBRanker` with ``allow_group_split`` set to ``True``,
+XGBoost will try to sort and group the samples for each worker based on the query ID. This
+mode tries to skip the global sort and sort only worker-local data, and hence requires no
+inter-worker data shuffle. Please note that even a worker-local sort is costly, particularly
+in terms of memory usage as there's no spilling when
+:py:meth:`~pandas.DataFrame.sort_values` is used, and we need to concatenate the
+data. XGBoost first checks whether the QID is already sorted before actually performing
+the sorting operation. One can choose this if the query groups are relatively consecutive,
+meaning most of the samples within a query group are close to each other and are likely to
+reside on the same worker. Don't use this if you have performed a random shuffle on
+your data.
+
+If the input data is random, then there's no way we can guarantee that most of the data
+within the same group stays on the same worker. For large query groups, this might not be
+an issue. But for small query groups, it's possible that each worker gets only one or two
+samples from each of its groups, which can lead to disastrous performance. In that
+case, we can partition the data according to query group, which is the default behavior of
+the :py:class:`~xgboost.dask.DaskXGBRanker` unless ``allow_group_split`` is set to
+``True``. This mode performs a sort and a groupby on the entire dataset in addition to an
+encoding operation for the query group IDs. Along with partition fragmentation, this
+option can lead to slow performance. See
+:ref:`sphx_glr_python_dask-examples_dask_learning_to_rank.py` for a worked example.
+
 .. _tracker-ip:
 
 ***************
diff --git a/doc/tutorials/learning_to_rank.rst b/doc/tutorials/learning_to_rank.rst
index 4d2cbad4aa47..8743a672d219 100644
--- a/doc/tutorials/learning_to_rank.rst
+++ b/doc/tutorials/learning_to_rank.rst
@@ -165,10 +165,26 @@ On the other hand, if you have comparatively small amount of training data:
 For any method chosen, you can modify ``lambdarank_num_pair_per_sample`` to control the
 amount of pairs generated.
 
+.. _ltr-dist:
+
 ********************
 Distributed Training
 ********************
-XGBoost implements distributed learning-to-rank with integration of multiple frameworks including Dask, Spark, and PySpark. The interface is similar to the single-node counterpart. Please refer to document of the respective XGBoost interface for details. Scattering a query group onto multiple workers is theoretically sound but can affect the model accuracy. For most of the use cases, the small discrepancy is not an issue, as the amount of training data is usually large when distributed training is used. As a result, users don't need to partition the data based on query groups. As long as each data partition is correctly sorted by query IDs, XGBoost can aggregate sample gradients accordingly.
+
+XGBoost implements distributed learning-to-rank with integration of multiple frameworks
+including :doc:`Dask `, :doc:`Spark `, and
+:doc:`PySpark `. The interface is similar to the single-node
+counterpart. Please refer to the documentation of the respective XGBoost interface for details.
+
+.. warning::
+
+   Position-debiasing is not yet supported for existing distributed interfaces.
+
+XGBoost works with collective operations, which means data is scattered to multiple workers. We can divide the data partitions by query group and ensure no query group is split among workers. However, this requires a costly sort and groupby operation and might only be necessary for selected use cases. Splitting and scattering a query group to multiple workers is theoretically sound but can affect the model's accuracy. If there are only a small number of groups sitting at the boundaries of workers, the small discrepancy is not an issue, as the amount of training data is usually large when distributed training is used.
+
+For a longer explanation, assuming the pairwise ranking method is used, we calculate the gradient based on relevance degree by constructing pairs within a query group. If a single query group is split among workers and we use worker-local data for gradient calculation, then we are simply sampling pairs from a smaller group for each worker to calculate the gradient and the evaluation metric. The comparison between each pair doesn't change because a group is split into sub-groups; what changes is the number of total and effective pairs and normalizers like `IDCG`. One can generate more pairs from a large group than from two smaller subgroups. As a result, the obtained gradient is still valid from a theoretical standpoint but might not be optimal. As long as the data partitions within a worker are correctly sorted by query IDs, XGBoost can aggregate sample gradients accordingly. Both the (Py)Spark interface and the Dask interface can sort the data according to query ID; please see the respective tutorials for more information.
+
+However, it's possible that a distributed framework shuffles the data during map reduce and splits every query group across multiple workers. In that case, the performance would be disastrous. As a result, whether a sorted groupby is needed depends on the data and the framework.
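+
+The following is a rough, illustrative sketch rather than a complete program: it assumes a
+running Dask ``client`` and a Dask dataframe ``df`` whose label is stored in column ``y``
+and whose query ID is stored in column ``qid``. Keeping query groups intact with the Dask
+interface then looks like this:
+
+.. code-block:: python
+
+   from xgboost import dask as dxgb
+
+   X = df[df.columns.difference(["y", "qid"])]
+   # allow_group_split=False partitions the data by query ID so that no group is
+   # split across workers; inputs must be Dask dataframes/series in this mode.
+   ranker = dxgb.DaskXGBRanker(objective="rank:ndcg", allow_group_split=False)
+   ranker.client = client
+   ranker = ranker.fit(X, df.y, qid=df.qid)
+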
******************* Reproducible Result diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py index b21cf80aea56..07924623955d 100644 --- a/python-package/xgboost/core.py +++ b/python-package/xgboost/core.py @@ -3215,11 +3215,7 @@ def trees_to_dataframe(self, fmap: PathLike = "") -> DataFrame: } ) - if callable(getattr(df, "sort_values", None)): - # pylint: disable=no-member - return df.sort_values(["Tree", "Node"]).reset_index(drop=True) - # pylint: disable=no-member - return df.sort(["Tree", "Node"]).reset_index(drop=True) + return df.sort_values(["Tree", "Node"]).reset_index(drop=True) def _assign_dmatrix_features(self, data: DMatrix) -> None: if data.num_row() == 0: diff --git a/python-package/xgboost/dask/__init__.py b/python-package/xgboost/dask/__init__.py index e0221310bc51..6c92e9205dc9 100644 --- a/python-package/xgboost/dask/__init__.py +++ b/python-package/xgboost/dask/__init__.py @@ -72,6 +72,7 @@ Tuple, TypeAlias, TypedDict, + TypeGuard, TypeVar, Union, ) @@ -117,7 +118,7 @@ ) from ..tracker import RabitTracker from ..training import train as worker_train -from .data import _create_dmatrix, _create_quantile_dmatrix +from .data import _create_dmatrix, _create_quantile_dmatrix, no_group_split from .utils import get_address_from_user, get_n_threads _DaskCollection: TypeAlias = Union[da.Array, dd.DataFrame, dd.Series] @@ -1898,10 +1899,21 @@ def _argmax(x: Any) -> Any: """, ["estimators", "model"], + extra_parameters=""" + allow_group_split : + + .. versionadded:: 3.0.0 + + Whether a query group can be split among multiple workers. When set to `False`, + inputs must be Dask dataframes or series. If you have many small query groups, + this can significantly increase the fragmentation of the data, and the internal + DMatrix construction can take longer. + +""", end_note=""" .. note:: - For dask implementation, group is not supported, use qid instead. + For the dask implementation, group is not supported, use qid instead. """, ) class DaskXGBRanker(DaskScikitLearnBase, XGBRankerMixIn): @@ -1910,36 +1922,36 @@ def __init__( self, *, objective: str = "rank:pairwise", + allow_group_split: bool = False, coll_cfg: Optional[CollConfig] = None, **kwargs: Any, ) -> None: if callable(objective): raise ValueError("Custom objective function not supported by XGBRanker.") + self.allow_group_split = allow_group_split super().__init__(objective=objective, coll_cfg=coll_cfg, **kwargs) + def _wrapper_params(self) -> Set[str]: + params = super()._wrapper_params() + params.add("allow_group_split") + return params + async def _fit_async( self, X: _DataT, y: _DaskCollection, *, - group: Optional[_DaskCollection], qid: Optional[_DaskCollection], sample_weight: Optional[_DaskCollection], base_margin: Optional[_DaskCollection], eval_set: Optional[Sequence[Tuple[_DaskCollection, _DaskCollection]]], sample_weight_eval_set: Optional[Sequence[_DaskCollection]], base_margin_eval_set: Optional[Sequence[_DaskCollection]], - eval_group: Optional[Sequence[_DaskCollection]], eval_qid: Optional[Sequence[_DaskCollection]], verbose: Union[int, bool], xgb_model: Optional[Union[XGBModel, Booster]], feature_weights: Optional[_DaskCollection], ) -> "DaskXGBRanker": - msg = "Use the `qid` instead of the `group` with the dask interface." 
- if not (group is None and eval_group is None): - raise ValueError(msg) - if qid is None: - raise ValueError("`qid` is required for ranking.") params = self.get_xgb_params() dtrain, evals = await _async_wrap_evaluation_matrices( self.client, @@ -2006,8 +2018,108 @@ def fit( base_margin_eval_set: Optional[Sequence[_DaskCollection]] = None, feature_weights: Optional[_DaskCollection] = None, ) -> "DaskXGBRanker": - args = {k: v for k, v in locals().items() if k not in ("self", "__class__")} - return self._client_sync(self._fit_async, **args) + msg = "Use the `qid` instead of the `group` with the dask interface." + if not (group is None and eval_group is None): + raise ValueError(msg) + if qid is None: + raise ValueError("`qid` is required for ranking.") + + def check_df(X: _DaskCollection) -> TypeGuard[dd.DataFrame]: + if not isinstance(X, dd.DataFrame): + raise TypeError( + "When `allow_group_split` is set to False, X is required to be" + " a dataframe." + ) + return True + + def check_ser( + qid: Optional[_DaskCollection], name: str + ) -> TypeGuard[Optional[dd.Series]]: + if not isinstance(qid, dd.Series) and qid is not None: + raise TypeError( + f"When `allow_group_split` is set to False, {name} is required to be" + " a series." + ) + return True + + if not self.allow_group_split: + assert ( + check_df(X) + and check_ser(qid, "qid") + and check_ser(y, "y") + and check_ser(sample_weight, "sample_weight") + and check_ser(base_margin, "base_margin") + ) + assert qid is not None and y is not None + X_id = id(X) + X, qid, y, sample_weight, base_margin = no_group_split( + self.device, + X, + qid, + y=y, + sample_weight=sample_weight, + base_margin=base_margin, + ) + + if eval_set is not None: + new_eval_set = [] + new_eval_qid = [] + new_sample_weight_eval_set = [] + new_base_margin_eval_set = [] + assert eval_qid + for i, (Xe, ye) in enumerate(eval_set): + we = sample_weight_eval_set[i] if sample_weight_eval_set else None + be = base_margin_eval_set[i] if base_margin_eval_set else None + assert check_df(Xe) + assert eval_qid + qe = eval_qid[i] + assert ( + eval_qid + and check_ser(qe, "qid") + and check_ser(ye, "y") + and check_ser(we, "sample_weight") + and check_ser(be, "base_margin") + ) + assert qe is not None and ye is not None + if id(Xe) != X_id: + Xe, qe, ye, we, be = no_group_split( + self.device, Xe, qe, ye, we, be + ) + else: + Xe, qe, ye, we, be = X, qid, y, sample_weight, base_margin + + new_eval_set.append((Xe, ye)) + new_eval_qid.append(qe) + + if we is not None: + new_sample_weight_eval_set.append(we) + if be is not None: + new_base_margin_eval_set.append(be) + + eval_set = new_eval_set + eval_qid = new_eval_qid + sample_weight_eval_set = ( + new_sample_weight_eval_set if new_sample_weight_eval_set else None + ) + base_margin_eval_set = ( + new_base_margin_eval_set if new_base_margin_eval_set else None + ) + + return self._client_sync( + self._fit_async, + X=X, + y=y, + qid=qid, + sample_weight=sample_weight, + base_margin=base_margin, + eval_set=eval_set, + eval_qid=eval_qid, + verbose=verbose, + xgb_model=xgb_model, + sample_weight_eval_set=sample_weight_eval_set, + base_margin_eval_set=base_margin_eval_set, + feature_weights=feature_weights, + ) # FIXME(trivialfis): arguments differ due to additional parameters like group and # qid. 
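The `no_group_split` helper invoked by the ranker's `fit` above is added to `dask/data.py` in the next file. As a rough, standalone sketch of the same idea (toy data; the names below are illustrative assumptions), one can keep each query group inside a single partition by densely encoding the query IDs, sorting on them, and re-indexing with one division per group:

    import dask.dataframe as dd
    import pandas as pd

    # Toy frame: three query groups spread over two partitions.
    pdf = pd.DataFrame({"qid": [2, 0, 1, 2, 0, 1], "f0": range(6), "y": [1, 0, 1, 0, 1, 0]})
    df = dd.from_pandas(pdf, npartitions=2)

    # Encode qid densely, sort by it, then set it as a sorted index with one
    # division per group so that a group never straddles a partition boundary.
    df["qid"] = df["qid"].astype("category").cat.as_known().cat.codes.astype("int64")
    df = df.sort_values("qid")
    divisions = sorted(df["qid"].unique().compute().tolist())
    df = df.set_index("qid", drop=False, divisions=tuple(divisions + [divisions[-1] + 1]))

    for part in df.partitions:  # each partition now holds exactly one query group
        print(part["qid"].unique().compute())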
diff --git a/python-package/xgboost/dask/data.py b/python-package/xgboost/dask/data.py index c4f0f138b298..f92f1666499f 100644 --- a/python-package/xgboost/dask/data.py +++ b/python-package/xgboost/dask/data.py @@ -3,15 +3,30 @@ import logging from collections.abc import Sequence -from typing import Any, Callable, Dict, List, Optional, TypeVar, Union - +from typing import ( + Any, + Callable, + Dict, + List, + Optional, + Tuple, + TypeVar, + Union, + cast, + overload, +) + +import dask import distributed import numpy as np +import pandas as pd from dask import dataframe as dd +from .. import collective as coll from .._typing import _T, FeatureNames -from ..compat import concat +from ..compat import concat, import_cupy from ..core import DataIter, DMatrix, QuantileDMatrix +from ..data import is_on_cuda LOGGER = logging.getLogger("[xgboost.dask]") @@ -96,6 +111,153 @@ def next(self, input_data: Callable) -> bool: return True +@overload +def _add_column(df: dd.DataFrame, col: dd.Series) -> Tuple[dd.DataFrame, str]: ... + + +@overload +def _add_column(df: dd.DataFrame, col: None) -> Tuple[dd.DataFrame, None]: ... + + +def _add_column( + df: dd.DataFrame, col: Optional[dd.Series] +) -> Tuple[dd.DataFrame, Optional[str]]: + if col is None: + return df, col + + trails = 0 + uid = f"{col.name}_{trails}" + while uid in df.columns: + trails += 1 + uid = f"{col.name}_{trails}" + + df = df.assign(**{uid: col}) + return df, uid + + +def no_group_split( # pylint: disable=too-many-positional-arguments + device: str | None, + df: dd.DataFrame, + qid: dd.Series, + y: dd.Series, + sample_weight: Optional[dd.Series], + base_margin: Optional[dd.Series], +) -> Tuple[ + dd.DataFrame, dd.Series, dd.Series, Optional[dd.Series], Optional[dd.Series] +]: + """A function to prevent query group from being scattered to different + workers. Please see the tutorial in the document for the implication for not having + partition boundary based on query groups. + + """ + + df, qid_uid = _add_column(df, qid) + df, y_uid = _add_column(df, y) + df, w_uid = _add_column(df, sample_weight) + df, bm_uid = _add_column(df, base_margin) + + # `tasks` shuffle is required as of rapids 24.12 + shuffle = "p2p" if device is None or device == "cpu" else "tasks" + with dask.config.set({"dataframe.shuffle.method": shuffle}): + df = df.persist() + # Encode the QID to make it dense. + df[qid_uid] = df[qid_uid].astype("category").cat.as_known().cat.codes + # The shuffle here is costly. 
+ df = df.sort_values(by=qid_uid) + cnt = df.groupby(qid_uid)[qid_uid].count() + div = cnt.index.compute().values.tolist() + div = sorted(div) + div = tuple(div + [div[-1] + 1]) + + df = df.set_index( + qid_uid, + drop=False, + divisions=div, + ).persist() + + qid = df[qid_uid] + y = df[y_uid] + sample_weight, base_margin = ( + cast(dd.Series, df[uid]) if uid is not None else None for uid in (w_uid, bm_uid) + ) + + uids = [uid for uid in [qid_uid, y_uid, w_uid, bm_uid] if uid is not None] + df = df.drop(uids, axis=1).persist() + return df, qid, y, sample_weight, base_margin + + +def sort_data_by_qid(**kwargs: List[Any]) -> Dict[str, List[Any]]: + """Sort worker-local data by query ID for learning to rank tasks.""" + data_parts = kwargs.get("data") + assert data_parts is not None + n_parts = len(data_parts) + + if is_on_cuda(data_parts[0]): + from cudf import DataFrame + else: + from pandas import DataFrame + + def get_dict(i: int) -> Dict[str, list]: + """Return a dictionary containing all the meta info and all partitions.""" + + def _get(attr: Optional[List[Any]]) -> Optional[list]: + if attr is not None: + return attr[i] + return None + + data_opt = {name: _get(kwargs.get(name, None)) for name in meta} + # Filter out None values. + data = {k: v for k, v in data_opt.items() if v is not None} + return data + + def map_fn(i: int) -> pd.DataFrame: + data = get_dict(i) + return DataFrame(data) + + meta_parts = [map_fn(i) for i in range(n_parts)] + dfq = concat(meta_parts) + if dfq.qid.is_monotonic_increasing: + return kwargs + + LOGGER.warning( + "[r%d]: Sorting data with %d partitions for ranking. " + "This is a costly operation and will increase the memory usage significantly. " + "To avoid this warning, sort the data based on qid before passing it into " + "XGBoost. Alternatively, you can use set the `allow_group_split` to False.", + coll.get_rank(), + n_parts, + ) + # I tried to construct a new dask DF to perform the sort, but it's quite difficult + # to get the partition alignment right. Along with the still maturing shuffle + # implementation and GPU compatibility, a simple concat is used. + # + # In case it might become useful one day, I managed to get a CPU version working, + # albeit qutie slow (much slower than concatenated sort). The implementation merges + # everything into a single Dask DF and runs `DF.sort_values`, then retrieve the + # individual X,y,qid, ... from calculated partition values `client.compute([p for p + # in df.partitions])`. It was to avoid creating mismatched partitions. 
+ dfx = concat(data_parts) + + if is_on_cuda(dfq): + cp = import_cupy() + sorted_idx = cp.argsort(dfq.qid) + else: + sorted_idx = np.argsort(dfq.qid) + dfq = dfq.iloc[sorted_idx, :] + + if hasattr(dfx, "iloc"): + dfx = dfx.iloc[sorted_idx, :] + else: + dfx = dfx[sorted_idx, :] + + kwargs.update({"data": [dfx]}) + for i, c in enumerate(dfq.columns): + assert c in kwargs + kwargs.update({c: [dfq[c]]}) + + return kwargs + + def _get_worker_parts(list_of_parts: _DataParts) -> Dict[str, List[Any]]: assert isinstance(list_of_parts, list) result: Dict[str, List[Any]] = {} @@ -115,6 +277,9 @@ def append(i: int, name: str) -> None: for k in meta: append(i, k) + qid = result.get("qid", None) + if qid is not None: + result = sort_data_by_qid(**result) return result diff --git a/python-package/xgboost/testing/__init__.py b/python-package/xgboost/testing/__init__.py index 5fbafd6ec58f..80e0ad2db1f5 100644 --- a/python-package/xgboost/testing/__init__.py +++ b/python-package/xgboost/testing/__init__.py @@ -457,7 +457,11 @@ def make_categorical( def make_ltr( - n_samples: int, n_features: int, n_query_groups: int, max_rel: int + n_samples: int, + n_features: int, + n_query_groups: int, + max_rel: int, + sort_qid: bool = True, ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: """Make a dataset for testing LTR.""" rng = np.random.default_rng(1994) @@ -470,7 +474,8 @@ def make_ltr( w = rng.normal(0, 1.0, size=n_query_groups) w -= np.min(w) w /= np.max(w) - qid = np.sort(qid) + if sort_qid: + qid = np.sort(qid) return X, y, qid, w @@ -637,6 +642,10 @@ def non_increasing(L: Sequence[float], tolerance: float = 1e-4) -> bool: return all((y - x) < tolerance for x, y in zip(L, L[1:])) +def non_decreasing(L: Sequence[float], tolerance: float = 1e-4) -> bool: + return all((y - x) >= -tolerance for x, y in zip(L, L[1:])) + + def predictor_equal(lhs: xgb.DMatrix, rhs: xgb.DMatrix) -> bool: """Assert whether two DMatrices contain the same predictors.""" lcsr = lhs.get_data() diff --git a/python-package/xgboost/testing/dask.py b/python-package/xgboost/testing/dask.py index 541009a73c85..af0fc8bf0397 100644 --- a/python-package/xgboost/testing/dask.py +++ b/python-package/xgboost/testing/dask.py @@ -1,6 +1,6 @@ """Tests for dask shared by different test modules.""" -from typing import Any, List, Literal, cast +from typing import Any, List, Literal, Tuple, cast import numpy as np import pandas as pd @@ -175,7 +175,82 @@ def get_rabit_args(client: Client, n_workers: int) -> Any: return client.sync(_get_rabit_args, client, n_workers) -def get_client_workers(client: Any) -> List[str]: +def get_client_workers(client: Client) -> List[str]: "Get workers from a dask client." 
workers = client.scheduler_info()["workers"] return list(workers.keys()) + + +def make_ltr( # pylint: disable=too-many-locals,too-many-arguments + client: Client, + n_samples: int, + n_features: int, + *, + n_query_groups: int, + max_rel: int, + device: str, +) -> Tuple[dd.DataFrame, dd.Series, dd.Series]: + """Synthetic dataset for learning to rank.""" + workers = get_client_workers(client) + n_samples_per_worker = n_samples // len(workers) + + if device == "cpu": + from pandas import DataFrame as DF + else: + from cudf import DataFrame as DF + + def make(n: int, seed: int) -> pd.DataFrame: + rng = np.random.default_rng(seed) + X, y = make_classification( + n, n_features, n_informative=n_features, n_redundant=0, n_classes=max_rel + ) + qid = rng.integers(size=(n,), low=0, high=n_query_groups) + df = DF(X, columns=[f"f{i}" for i in range(n_features)]) + df["qid"] = qid + df["y"] = y + return df + + futures = [] + i = 0 + for k in range(0, n_samples, n_samples_per_worker): + fut = client.submit( + make, n=n_samples_per_worker, seed=k, workers=[workers[i % len(workers)]] + ) + futures.append(fut) + i += 1 + + last = n_samples - (n_samples_per_worker * len(workers)) + if last != 0: + fut = client.submit(make, n=last, seed=n_samples_per_worker * len(workers)) + futures.append(fut) + + meta = make(1, 0) + df = dd.from_delayed(futures, meta=meta) + assert isinstance(df, dd.DataFrame) + return df.drop(["qid", "y"], axis=1), df.y, df.qid + + +def check_no_group_split(client: Client, device: str) -> None: + """Test for the allow_group_split parameter.""" + X_tr, q_tr, y_tr = make_ltr( + client, 4096, 128, n_query_groups=4, max_rel=5, device=device + ) + X_va, q_va, y_va = make_ltr( + client, 1024, 128, n_query_groups=4, max_rel=5, device=device + ) + + ltr = dxgb.DaskXGBRanker(allow_group_split=False, n_estimators=32, device=device) + ltr.fit( + X_tr, + y_tr, + qid=q_tr, + eval_set=[(X_tr, y_tr), (X_va, y_va)], + eval_qid=[q_tr, q_va], + verbose=True, + ) + + assert ltr.n_features_in_ == 128 + assert X_tr.shape[1] == ltr.n_features_in_ # no change + ndcg = ltr.evals_result()["validation_0"]["ndcg@32"] + assert tm.non_decreasing(ndcg[:16], tolerance=1e-2), ndcg + np.testing.assert_allclose(ndcg[-1], 1.0, rtol=1e-2) diff --git a/src/data/data.cc b/src/data/data.cc index 47836bb5134b..713ad4a1a514 100644 --- a/src/data/data.cc +++ b/src/data/data.cc @@ -539,7 +539,9 @@ void MetaInfo::SetInfoFromHost(Context const* ctx, StringView key, Json arr) { } else if (key == "label") { CopyTensorInfoImpl(ctx, arr, &this->labels); if (this->num_row_ != 0 && this->labels.Shape(0) != this->num_row_) { - CHECK_EQ(this->labels.Size() % this->num_row_, 0) << "Incorrect size for labels."; + CHECK_EQ(this->labels.Size() % this->num_row_, 0) + << "Incorrect size for labels: (" << this->labels.Shape(0) << "," << this->labels.Shape(1) + << ") v.s. " << this->num_row_; size_t n_targets = this->labels.Size() / this->num_row_; this->labels.Reshape(this->num_row_, n_targets); } diff --git a/src/objective/lambdarank_obj.cc b/src/objective/lambdarank_obj.cc index 94acf5a238d9..c50a55b3a17c 100644 --- a/src/objective/lambdarank_obj.cc +++ b/src/objective/lambdarank_obj.cc @@ -1,5 +1,5 @@ /** - * Copyright (c) 2023, XGBoost contributors + * Copyright 2023-2024, XGBoost contributors */ #include "lambdarank_obj.h" @@ -23,7 +23,6 @@ #include "../common/optional_weight.h" // for MakeOptionalWeights, OptionalWeights #include "../common/ranking_utils.h" // for RankingCache, LambdaRankParam, MAPCache, NDCGC... 
#include "../common/threading_utils.h" // for ParallelFor, Sched -#include "../common/transform_iterator.h" // for IndexTransformIter #include "init_estimation.h" // for FitIntercept #include "xgboost/base.h" // for bst_group_t, GradientPair, kRtEps, GradientPai... #include "xgboost/context.h" // for Context diff --git a/src/objective/lambdarank_obj.cuh b/src/objective/lambdarank_obj.cuh index 2e5724f7f1fd..e1a78f905434 100644 --- a/src/objective/lambdarank_obj.cuh +++ b/src/objective/lambdarank_obj.cuh @@ -1,5 +1,5 @@ /** - * Copyright 2023 XGBoost contributors + * Copyright 2023-2024, XGBoost contributors */ #ifndef XGBOOST_OBJECTIVE_LAMBDARANK_OBJ_CUH_ #define XGBOOST_OBJECTIVE_LAMBDARANK_OBJ_CUH_ @@ -71,13 +71,13 @@ struct KernelInputs { std::int32_t iter; }; /** - * \brief Functor for generating pairs + * @brief Functor for generating pairs */ template struct MakePairsOp { KernelInputs args; /** - * \brief Make pair for the topk pair method. + * @brief Make pair for the topk pair method. */ [[nodiscard]] XGBOOST_DEVICE std::tuple WithTruncation( std::size_t idx, bst_group_t g) const { @@ -86,9 +86,6 @@ struct MakePairsOp { auto data_group_begin = static_cast(args.d_group_ptr[g]); std::size_t n_data = args.d_group_ptr[g + 1] - data_group_begin; - // obtain group segment data. - auto g_label = args.labels.Slice(linalg::Range(data_group_begin, data_group_begin + n_data), 0); - auto g_sorted_idx = args.d_sorted_idx.subspan(data_group_begin, n_data); std::size_t i = 0, j = 0; common::UnravelTrapeziodIdx(idx_in_thread_group, n_data, &i, &j); @@ -97,7 +94,7 @@ struct MakePairsOp { return std::make_tuple(rank_high, rank_low); } /** - * \brief Make pair for the mean pair method + * @brief Make pair for the mean pair method */ XGBOOST_DEVICE std::tuple WithSampling(std::size_t idx, bst_group_t g) const { diff --git a/tests/ci_build/lint_python.py b/tests/ci_build/lint_python.py index 76860d9d1e35..dfa67e757059 100644 --- a/tests/ci_build/lint_python.py +++ b/tests/ci_build/lint_python.py @@ -111,8 +111,7 @@ class LintersPaths: "tests/test_distributed/test_with_dask/test_external_memory.py", "tests/test_distributed/test_with_spark/test_data.py", "tests/test_distributed/test_gpu_with_spark/test_data.py", - "tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py", - "tests/test_distributed/test_gpu_with_dask/test_gpu_external_memory.py", + "tests/test_distributed/test_gpu_with_dask/", # demo "demo/dask/", "demo/json-model/json_parser.py", diff --git a/tests/test_distributed/test_gpu_with_dask/conftest.py b/tests/test_distributed/test_gpu_with_dask/conftest.py index 0332dd945651..a066461303d3 100644 --- a/tests/test_distributed/test_gpu_with_dask/conftest.py +++ b/tests/test_distributed/test_gpu_with_dask/conftest.py @@ -1,4 +1,4 @@ -from typing import Generator, Sequence +from typing import Any, Generator, Sequence import pytest @@ -6,12 +6,12 @@ @pytest.fixture(scope="session", autouse=True) -def setup_rmm_pool(request, pytestconfig: pytest.Config) -> None: +def setup_rmm_pool(request: Any, pytestconfig: pytest.Config) -> None: tm.setup_rmm_pool(request, pytestconfig) @pytest.fixture(scope="class") -def local_cuda_client(request, pytestconfig: pytest.Config) -> Generator: +def local_cuda_client(request: Any, pytestconfig: pytest.Config) -> Generator: kwargs = {} if hasattr(request, "param"): kwargs.update(request.param) diff --git a/tests/test_distributed/test_gpu_with_dask/test_gpu_demos.py b/tests/test_distributed/test_gpu_with_dask/test_gpu_demos.py index 553b8746f0d0..848321ae4613 
100644 --- a/tests/test_distributed/test_gpu_with_dask/test_gpu_demos.py +++ b/tests/test_distributed/test_gpu_with_dask/test_gpu_demos.py @@ -14,14 +14,14 @@ @pytest.mark.skipif(**tm.no_cupy()) @pytest.mark.mgpu -def test_dask_training(): +def test_dask_training() -> None: script = os.path.join(tm.demo_dir(__file__), "dask", "gpu_training.py") cmd = ["python", script] subprocess.check_call(cmd) @pytest.mark.mgpu -def test_dask_sklearn_demo(): +def test_dask_sklearn_demo() -> None: script = os.path.join(tm.demo_dir(__file__), "dask", "sklearn_gpu_training.py") cmd = ["python", script] subprocess.check_call(cmd) @@ -29,7 +29,7 @@ def test_dask_sklearn_demo(): @pytest.mark.mgpu @pytest.mark.skipif(**tm.no_cupy()) -def test_forward_logging_demo(): +def test_forward_logging_demo() -> None: script = os.path.join(tm.demo_dir(__file__), "dask", "forward_logging.py") cmd = ["python", script] subprocess.check_call(cmd) diff --git a/tests/test_distributed/test_gpu_with_dask/test_gpu_ranking.py b/tests/test_distributed/test_gpu_with_dask/test_gpu_ranking.py new file mode 100644 index 000000000000..f8f586e39746 --- /dev/null +++ b/tests/test_distributed/test_gpu_with_dask/test_gpu_ranking.py @@ -0,0 +1,18 @@ +"""Copyright 2024, XGBoost contributors""" + +import dask +import pytest +from distributed import Client + +from xgboost.testing import dask as dtm + + +@pytest.mark.filterwarnings("error") +def test_no_group_split(local_cuda_client: Client) -> None: + with dask.config.set( + { + "array.backend": "cupy", + "dataframe.backend": "cudf", + } + ): + dtm.check_no_group_split(local_cuda_client, "cuda") diff --git a/tests/test_distributed/test_with_dask/test_ranking.py b/tests/test_distributed/test_with_dask/test_ranking.py index 0b2ea404fde1..f806d61d2592 100644 --- a/tests/test_distributed/test_with_dask/test_ranking.py +++ b/tests/test_distributed/test_with_dask/test_ranking.py @@ -11,6 +11,7 @@ from xgboost import dask as dxgb from xgboost import testing as tm +from xgboost.testing import dask as dtm @pytest.fixture(scope="module") @@ -59,7 +60,10 @@ def test_dask_ranking(client: Client) -> None: qid_test = qid_test.astype(np.uint32) rank = dxgb.DaskXGBRanker( - n_estimators=2500, eval_metric=["ndcg"], early_stopping_rounds=10 + n_estimators=2500, + eval_metric=["ndcg"], + early_stopping_rounds=10, + allow_group_split=True, ) rank.fit( x_train, @@ -71,3 +75,8 @@ def test_dask_ranking(client: Client) -> None: ) assert rank.n_features_in_ == 46 assert rank.best_score > 0.98 + + +@pytest.mark.filterwarnings("error") +def test_no_group_split(client: Client) -> None: + dtm.check_no_group_split(client, "cpu") From 18f53713f5e9f5ad7347ec30c888c7d174702b7d Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Thu, 5 Dec 2024 14:39:43 +0800 Subject: [PATCH 09/16] [ci] Rename src tarball. [skip ci] (#11055) - Add `src` to the name. 
--- dev/release-artifacts.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dev/release-artifacts.py b/dev/release-artifacts.py index 08bb2cbfaff2..a0f8f796130e 100644 --- a/dev/release-artifacts.py +++ b/dev/release-artifacts.py @@ -234,7 +234,7 @@ def check_path() -> None: def make_src_tarball(release: str, outdir: Path) -> Tuple[str, str]: - tarball_name = f"xgboost-{release}.tar.gz" + tarball_name = f"xgboost-src-{release}.tar.gz" tarball_path = outdir / tarball_name if tarball_path.exists(): tarball_path.unlink() @@ -301,7 +301,7 @@ def release_note( * xgboost_r_gpu_linux_{release}.tar.gz: [Download]({r_gpu_linux_url}) **Source tarball** -* xgboost.tar.gz: [Download]({src_tarball})""" +* {tarball_name}: [Download]({src_tarball})""" print(end_note) with open(outdir / "end_note.md", "w") as f: f.write(end_note) From 0eacc37ae3b8479b6f2f4db022f606b994ec92e0 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Thu, 5 Dec 2024 14:40:21 +0800 Subject: [PATCH 10/16] Update news for 2.1.3 [skip ci] (#11056) --- doc/changes/v2.1.0.rst | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/doc/changes/v2.1.0.rst b/doc/changes/v2.1.0.rst index 4a657c3a403f..3e2297c8a89d 100644 --- a/doc/changes/v2.1.0.rst +++ b/doc/changes/v2.1.0.rst @@ -1,3 +1,13 @@ +################################# +2.1.3 Patch Release (2024 Nov 26) +################################# + +The 2.1.3 patch release makes the following bug fixes: + +- [pyspark] Support large model size (#10984). +- Fix rng for the column sampler (#10998). +- Handle `cudf.pandas` proxy objects properly (#11014). + ################################# 2.1.2 Patch Release (2024 Oct 23) ################################# From fcb9c2fb6159dc820d3da86da8fd7a24599baa61 Mon Sep 17 00:00:00 2001 From: Philip Hyunsu Cho Date: Wed, 4 Dec 2024 23:04:58 -0800 Subject: [PATCH 11/16] Update ISSUE_TEMPLATE.md (#11061) --- .github/ISSUE_TEMPLATE.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index ed1a2d304916..d4f028e33f93 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -1,4 +1,4 @@ -Thanks for participating in the XGBoost community! We use https://discuss.xgboost.ai for any general usage questions and discussions. The issue tracker is used for actionable items such as feature proposals discussion, roadmaps, and bug tracking. You are always welcomed to post on the forum first :) +Thanks for participating in the XGBoost community! The issue tracker is used for actionable items such as feature proposals discussion, roadmaps, and bug tracking. Issues that are inactive for a period of time may get closed. We adopt this policy so that we won't lose track of actionable issues that may fall at the bottom of the pile. Feel free to reopen a new one if you feel there is an additional problem that needs attention when an old one gets closed. From 96952fcb31bee9c01e51fe6e1abb3f9b0bbec185 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Thu, 5 Dec 2024 15:58:37 +0800 Subject: [PATCH 12/16] Reduce pandas dataframe overhead. 
(#11058) --- python-package/xgboost/data.py | 144 ++++++++++++++++++--------------- 1 file changed, 80 insertions(+), 64 deletions(-) diff --git a/python-package/xgboost/data.py b/python-package/xgboost/data.py index 29647f88a893..7b37ca50e4e9 100644 --- a/python-package/xgboost/data.py +++ b/python-package/xgboost/data.py @@ -2,6 +2,7 @@ # pylint: disable=too-many-return-statements """Data dispatching for DMatrix.""" import ctypes +import functools import json import os import warnings @@ -21,7 +22,9 @@ TransformedData, c_bst_ulong, ) -from .compat import DataFrame, lazy_isinstance +from .compat import DataFrame +from .compat import Series as PdSeries +from .compat import lazy_isinstance from .core import ( _LIB, DataIter, @@ -377,23 +380,39 @@ def pandas_feature_info( else: feature_names = list(data.columns.map(str)) - # handle feature types + # handle feature types and dtype validation + new_feature_types = [] + need_sparse_extension_warn = True + for dtype in data.dtypes: + if is_pd_sparse_dtype(dtype): + new_feature_types.append(_pandas_dtype_mapper[dtype.subtype.name]) + if need_sparse_extension_warn: + warnings.warn("Sparse arrays from pandas are converted into dense.") + need_sparse_extension_warn = False + elif ( + is_pd_cat_dtype(dtype) or is_pa_ext_categorical_dtype(dtype) + ) and enable_categorical: + new_feature_types.append(CAT_T) + else: + try: + new_feature_types.append(_pandas_dtype_mapper[dtype.name]) + except KeyError: + _invalid_dataframe_dtype(data) + if feature_types is None and meta is None: - feature_types = [] - for dtype in data.dtypes: - if is_pd_sparse_dtype(dtype): - feature_types.append(_pandas_dtype_mapper[dtype.subtype.name]) - elif ( - is_pd_cat_dtype(dtype) or is_pa_ext_categorical_dtype(dtype) - ) and enable_categorical: - feature_types.append(CAT_T) - else: - feature_types.append(_pandas_dtype_mapper[dtype.name]) + feature_types = new_feature_types + return feature_names, feature_types def is_nullable_dtype(dtype: PandasDType) -> bool: """Whether dtype is a pandas nullable type.""" + + from pandas.api.extensions import ExtensionDtype + + if not isinstance(dtype, ExtensionDtype): + return False + from pandas.api.types import is_bool_dtype, is_float_dtype, is_integer_dtype is_int = is_integer_dtype(dtype) and dtype.name in pandas_nullable_mapper @@ -415,8 +434,8 @@ def is_pa_ext_categorical_dtype(dtype: Any) -> bool: ) -def is_pd_cat_dtype(dtype: PandasDType) -> bool: - """Wrapper for testing pandas category type.""" +@functools.cache +def _lazy_load_pd_is_cat() -> Callable[[PandasDType], bool]: import pandas as pd if hasattr(pd.util, "version") and hasattr(pd.util.version, "Version"): @@ -424,15 +443,23 @@ def is_pd_cat_dtype(dtype: PandasDType) -> bool: if Version(pd.__version__) >= Version("2.1.0"): from pandas import CategoricalDtype - return isinstance(dtype, CategoricalDtype) + def pd_is_cat_210(dtype: PandasDType) -> bool: + return isinstance(dtype, CategoricalDtype) + return pd_is_cat_210 from pandas.api.types import is_categorical_dtype # type: ignore - return is_categorical_dtype(dtype) + return is_categorical_dtype -def is_pd_sparse_dtype(dtype: PandasDType) -> bool: - """Wrapper for testing pandas sparse type.""" +def is_pd_cat_dtype(dtype: PandasDType) -> bool: + """Wrapper for testing pandas category type.""" + is_cat = _lazy_load_pd_is_cat() + return is_cat(dtype) + + +@functools.cache +def _lazy_load_pd_is_sparse() -> Callable[[PandasDType], bool]: import pandas as pd if hasattr(pd.util, "version") and hasattr(pd.util.version, "Version"): 
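The `functools.cache` helpers introduced in this hunk (`_lazy_load_pd_is_cat`,
`_lazy_load_pd_is_sparse`) all follow the same shape: the pandas version probe and
the associated imports run once, and later dtype checks reuse the cached checker
function instead of repeating the probe per call. A minimal standalone sketch of
that pattern, using hypothetical names rather than the exact helpers from this
patch, might look like:

    import functools
    from typing import Callable

    @functools.cache
    def _load_sparse_checker() -> Callable[[object], bool]:
        # The import and version probe run only on the first call; the returned
        # function is cached by functools.cache and reused afterwards.
        import pandas as pd

        def _is_sparse(dtype: object) -> bool:
            return isinstance(dtype, pd.SparseDtype)

        return _is_sparse

    def is_sparse_dtype(dtype: object) -> bool:
        return _load_sparse_checker()(dtype)
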
@@ -440,10 +467,20 @@ def is_pd_sparse_dtype(dtype: PandasDType) -> bool: if Version(pd.__version__) >= Version("2.1.0"): from pandas import SparseDtype - return isinstance(dtype, SparseDtype) + def pd_is_sparse_210(dtype: PandasDType) -> bool: + return isinstance(dtype, SparseDtype) + + return pd_is_sparse_210 from pandas.api.types import is_sparse # type: ignore + return is_sparse + + +def is_pd_sparse_dtype(dtype: PandasDType) -> bool: + """Wrapper for testing pandas sparse type.""" + is_sparse = _lazy_load_pd_is_sparse() + return is_sparse(dtype) @@ -474,33 +511,34 @@ def pandas_pa_type(ser: Any) -> np.ndarray: return arr +@functools.cache +def _lazy_has_npdtypes() -> bool: + return np.lib.NumpyVersion(np.__version__) > np.lib.NumpyVersion("1.25.0") + + +@functools.cache +def _lazy_load_pd_floats() -> tuple: + from pandas import Float32Dtype, Float64Dtype + + return Float32Dtype, Float64Dtype + + def pandas_transform_data(data: DataFrame) -> List[np.ndarray]: """Handle categorical dtype and extension types from pandas.""" - import pandas as pd - from pandas import Float32Dtype, Float64Dtype + Float32Dtype, Float64Dtype = _lazy_load_pd_floats() result: List[np.ndarray] = [] + np_dtypes = _lazy_has_npdtypes() - def cat_codes(ser: pd.Series) -> np.ndarray: - if is_pd_cat_dtype(ser.dtype): - return _ensure_np_dtype( - ser.cat.codes.astype(np.float32) - .replace(-1.0, np.nan) - .to_numpy(na_value=np.nan), - np.float32, - )[0] - # Not yet supported, the index is not ordered for some reason. Alternately: - # `combine_chunks().to_pandas().cat.codes`. The result is the same. - assert is_pa_ext_categorical_dtype(ser.dtype) - return ( - ser.array.__arrow_array__() - .combine_chunks() - .dictionary_encode() - .indices.astype(np.float32) + def cat_codes(ser: PdSeries) -> np.ndarray: + return _ensure_np_dtype( + ser.cat.codes.astype(np.float32) .replace(-1.0, np.nan) - ) + .to_numpy(na_value=np.nan), + np.float32, + )[0] - def nu_type(ser: pd.Series) -> np.ndarray: + def nu_type(ser: PdSeries) -> np.ndarray: # Avoid conversion when possible if isinstance(dtype, Float32Dtype): res_dtype: NumpyDType = np.float32 @@ -512,10 +550,9 @@ def nu_type(ser: pd.Series) -> np.ndarray: ser.to_numpy(dtype=res_dtype, na_value=np.nan), res_dtype )[0] - def oth_type(ser: pd.Series) -> np.ndarray: + def oth_type(ser: PdSeries) -> np.ndarray: # The dtypes module is added in 1.25. 
- npdtypes = np.lib.NumpyVersion(np.__version__) > np.lib.NumpyVersion("1.25.0") - npdtypes = npdtypes and isinstance( + npdtypes = np_dtypes and isinstance( ser.dtype, ( # pylint: disable=no-member @@ -545,7 +582,7 @@ def oth_type(ser: pd.Series) -> np.ndarray: elif is_nullable_dtype(dtype): result.append(nu_type(data[col])) elif is_pd_sparse_dtype(dtype): - arr = cast(pd.arrays.SparseArray, data[col].values) + arr = data[col].values arr = arr.to_dense() if _is_np_array_like(arr): arr, _ = _ensure_np_dtype(arr, arr.dtype) @@ -559,26 +596,6 @@ def oth_type(ser: pd.Series) -> np.ndarray: return result -def pandas_check_dtypes(data: DataFrame, enable_categorical: bool) -> None: - """Validate the input types, returns True if the dataframe is backed by arrow.""" - sparse_extension = False - - for dtype in data.dtypes: - if not ( - (dtype.name in _pandas_dtype_mapper) - or is_pd_sparse_dtype(dtype) - or (is_pd_cat_dtype(dtype) and enable_categorical) - or is_pa_ext_dtype(dtype) - ): - _invalid_dataframe_dtype(data) - - if is_pd_sparse_dtype(dtype): - sparse_extension = True - - if sparse_extension: - warnings.warn("Sparse arrays from pandas are converted into dense.") - - class PandasTransformed: """A storage class for transformed pandas DataFrame.""" @@ -604,7 +621,6 @@ def _transform_pandas_df( feature_types: Optional[FeatureTypes] = None, meta: Optional[str] = None, ) -> Tuple[PandasTransformed, Optional[FeatureNames], Optional[FeatureTypes]]: - pandas_check_dtypes(data, enable_categorical) if meta and len(data.columns) > 1 and meta not in _matrix_meta: raise ValueError(f"DataFrame for {meta} cannot have multiple columns") From c0f1f568996a059abfa799d2fa4b6433de894a09 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Thu, 5 Dec 2024 18:24:00 +0800 Subject: [PATCH 13/16] [doc] Move model parser demo into Python demo. (#11052) - It will be included in the sphinx gallery once moved. - Remove the outdated readme. --- .../json_parser.py => guide-python/model_parser.py} | 7 ++++++- demo/json-model/README.md | 3 --- tests/ci_build/lint_python.py | 4 ++-- tests/python/test_demos.py | 2 +- tests/python/test_with_sklearn.py | 2 +- tests/test_distributed/test_with_dask/test_with_dask.py | 2 +- 6 files changed, 11 insertions(+), 9 deletions(-) rename demo/{json-model/json_parser.py => guide-python/model_parser.py} (98%) delete mode 100644 demo/json-model/README.md diff --git a/demo/json-model/json_parser.py b/demo/guide-python/model_parser.py similarity index 98% rename from demo/json-model/json_parser.py rename to demo/guide-python/model_parser.py index b744d9569aea..39a459613409 100644 --- a/demo/json-model/json_parser.py +++ b/demo/guide-python/model_parser.py @@ -1,4 +1,9 @@ -"""Demonstration for parsing JSON/UBJSON tree model file generated by XGBoost. +""" +Demonstration for parsing JSON/UBJSON tree model files +====================================================== + +See :doc:`/tutorials/saving_model` for details about the model serialization. + """ import argparse diff --git a/demo/json-model/README.md b/demo/json-model/README.md deleted file mode 100644 index 065d854f476a..000000000000 --- a/demo/json-model/README.md +++ /dev/null @@ -1,3 +0,0 @@ -We introduced initial support for saving XGBoost model in JSON format in 1.0.0. Note that -it's still experimental and under development, output schema is subject to change due to -bug fixes or further refactoring. For an overview, see https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html . 
\ No newline at end of file diff --git a/tests/ci_build/lint_python.py b/tests/ci_build/lint_python.py index dfa67e757059..8ee0b4e8e692 100644 --- a/tests/ci_build/lint_python.py +++ b/tests/ci_build/lint_python.py @@ -49,13 +49,13 @@ class LintersPaths: # demo "demo/dask/", "demo/rmm_plugin", - "demo/json-model/json_parser.py", "demo/guide-python/continuation.py", "demo/guide-python/cat_in_the_dat.py", "demo/guide-python/callbacks.py", "demo/guide-python/categorical.py", "demo/guide-python/cat_pipeline.py", "demo/guide-python/feature_weights.py", + "demo/guide-python/model_parser.py", "demo/guide-python/sklearn_parallel.py", "demo/guide-python/sklearn_examples.py", "demo/guide-python/sklearn_evals_result.py", @@ -114,7 +114,6 @@ class LintersPaths: "tests/test_distributed/test_gpu_with_dask/", # demo "demo/dask/", - "demo/json-model/json_parser.py", "demo/guide-python/external_memory.py", "demo/guide-python/distributed_extmem_basic.py", "demo/guide-python/sklearn_examples.py", @@ -124,6 +123,7 @@ class LintersPaths: "demo/guide-python/categorical.py", "demo/guide-python/cat_pipeline.py", "demo/guide-python/feature_weights.py", + "demo/guide-python/model_parser.py", "demo/guide-python/individual_trees.py", "demo/guide-python/quantile_regression.py", "demo/guide-python/multioutput_regression.py", diff --git a/tests/python/test_demos.py b/tests/python/test_demos.py index 01634af2924d..d20e5bc384cc 100644 --- a/tests/python/test_demos.py +++ b/tests/python/test_demos.py @@ -174,7 +174,7 @@ def test_quantile_reg() -> None: @pytest.mark.skipif(**tm.no_ubjson()) def test_json_model() -> None: - script = os.path.join(DEMO_DIR, "json-model", "json_parser.py") + script = os.path.join(PYTHON_DEMO_DIR, "model_parser.py") def run_test(reg: xgboost.XGBRegressor) -> None: with tempfile.TemporaryDirectory() as tmpdir: diff --git a/tests/python/test_with_sklearn.py b/tests/python/test_with_sklearn.py index 937e59095863..3f2b13038c34 100644 --- a/tests/python/test_with_sklearn.py +++ b/tests/python/test_with_sklearn.py @@ -1186,7 +1186,7 @@ def test_feature_weights(tree_method): for i in range(kCols): fw[i] *= float(i) - parser_path = os.path.join(tm.demo_dir(__file__), "json-model", "json_parser.py") + parser_path = os.path.join(tm.demo_dir(__file__), "guide-python", "model_parser.py") poly_increasing = get_feature_weights( X=X, y=y, diff --git a/tests/test_distributed/test_with_dask/test_with_dask.py b/tests/test_distributed/test_with_dask/test_with_dask.py index 77db640c2a78..53e263b5e06e 100644 --- a/tests/test_distributed/test_with_dask/test_with_dask.py +++ b/tests/test_distributed/test_with_dask/test_with_dask.py @@ -1633,7 +1633,7 @@ def test_feature_weights(self, client: "Client") -> None: for i in range(kCols): fw[i] *= float(i) fw = da.from_array(fw) - parser = os.path.join(tm.demo_dir(__file__), "json-model", "json_parser.py") + parser = os.path.join(tm.demo_dir(__file__), "guide-python", "model_parser.py") poly_increasing = get_feature_weights( X=X, y=y, From 172de1efa15336a7c414b56f664825b0b5651fa0 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Fri, 6 Dec 2024 00:52:17 +0800 Subject: [PATCH 14/16] [ci] Update macos for cpp tests. 
(#11063) --- .github/workflows/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index d1395c15f77e..b75456f04b4a 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -21,7 +21,7 @@ jobs: strategy: fail-fast: false matrix: - os: [macos-12] + os: [macos-13] steps: - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 with: From 5be8083314da3315c1ac9e028142f8c408c94265 Mon Sep 17 00:00:00 2001 From: david-cortes Date: Thu, 5 Dec 2024 19:53:48 +0100 Subject: [PATCH 15/16] [R] Update serialization docs (#11059) --- R-package/R/utils.R | 17 +++++++++-------- .../a-compatibility-note-for-saveRDS-save.Rd | 17 +++++++++-------- 2 files changed, 18 insertions(+), 16 deletions(-) diff --git a/R-package/R/utils.R b/R-package/R/utils.R index 78249a53f18d..008a88dcd715 100644 --- a/R-package/R/utils.R +++ b/R-package/R/utils.R @@ -423,7 +423,7 @@ NULL #' #' @description #' When it comes to serializing XGBoost models, it's possible to use R serializers such as -#' [save()] or [saveRDS()] to serialize an XGBoost R model, but XGBoost also provides +#' [save()] or [saveRDS()] to serialize an XGBoost model object, but XGBoost also provides #' its own serializers with better compatibility guarantees, which allow loading #' said models in other language bindings of XGBoost. #' @@ -451,14 +451,15 @@ NULL #' not used for prediction / importance / plotting / etc. #' These R attributes are only preserved when using R's serializers. #' -#' In addition to the regular `xgb.Booster` objects producted by [xgb.train()], the -#' function [xgboost()] produces a different subclass `xgboost`, which keeps other -#' additional metadata as R attributes such as class names in classification problems, -#' and which has a dedicated `predict` method that uses different defaults. XGBoost's +#' In addition to the regular `xgb.Booster` objects produced by [xgb.train()], the +#' function [xgboost()] produces objects with a different subclass `xgboost` (which +#' inherits from `xgb.Booster`), which keeps other additional metadata as R attributes +#' such as class names in classification problems, and which has a dedicated `predict` +#' method that uses different defaults and takes different argument names. XGBoost's #' own serializers can work with this `xgboost` class, but as they do not keep R #' attributes, the resulting object, when deserialized, is downcasted to the regular #' `xgb.Booster` class (i.e. it loses the metadata, and the resulting object will use -#' `predict.xgb.Booster` instead of `predict.xgboost`) - for these `xgboost` objects, +#' [predict.xgb.Booster()] instead of [predict.xgboost()]) - for these `xgboost` objects, #' `saveRDS` might thus be a better option if the extra functionalities are needed. #' #' Note that XGBoost models in R starting from version `2.1.0` and onwards, and @@ -466,8 +467,8 @@ NULL #' are incompatible with each other. Hence, models that were saved with R serializers #' like [saveRDS()] or [save()] before version `2.1.0` will not work with latter #' `xgboost` versions and vice versa. Be aware that the structure of R model objects -#' could in theory change again in the future, so XGBoost's serializers -#' should be preferred for long-term storage. +#' could in theory change again in the future, so XGBoost's serializers should be +#' preferred for long-term storage. 
#' #' Furthermore, note that using the package `qs` for serialization will require #' version 0.26 or higher of said package, and will have the same compatibility diff --git a/R-package/man/a-compatibility-note-for-saveRDS-save.Rd b/R-package/man/a-compatibility-note-for-saveRDS-save.Rd index af90ddded197..4ce043799436 100644 --- a/R-package/man/a-compatibility-note-for-saveRDS-save.Rd +++ b/R-package/man/a-compatibility-note-for-saveRDS-save.Rd @@ -5,7 +5,7 @@ \title{Model Serialization and Compatibility} \description{ When it comes to serializing XGBoost models, it's possible to use R serializers such as -\code{\link[=save]{save()}} or \code{\link[=saveRDS]{saveRDS()}} to serialize an XGBoost R model, but XGBoost also provides +\code{\link[=save]{save()}} or \code{\link[=saveRDS]{saveRDS()}} to serialize an XGBoost model object, but XGBoost also provides its own serializers with better compatibility guarantees, which allow loading said models in other language bindings of XGBoost. @@ -35,14 +35,15 @@ the model was fit, or saving the R call that produced the model, but are otherwi not used for prediction / importance / plotting / etc. These R attributes are only preserved when using R's serializers. -In addition to the regular \code{xgb.Booster} objects producted by \code{\link[=xgb.train]{xgb.train()}}, the -function \code{\link[=xgboost]{xgboost()}} produces a different subclass \code{xgboost}, which keeps other -additional metadata as R attributes such as class names in classification problems, -and which has a dedicated \code{predict} method that uses different defaults. XGBoost's +In addition to the regular \code{xgb.Booster} objects produced by \code{\link[=xgb.train]{xgb.train()}}, the +function \code{\link[=xgboost]{xgboost()}} produces objects with a different subclass \code{xgboost} (which +inherits from \code{xgb.Booster}), which keeps other additional metadata as R attributes +such as class names in classification problems, and which has a dedicated \code{predict} +method that uses different defaults and takes different argument names. XGBoost's own serializers can work with this \code{xgboost} class, but as they do not keep R attributes, the resulting object, when deserialized, is downcasted to the regular \code{xgb.Booster} class (i.e. it loses the metadata, and the resulting object will use -\code{predict.xgb.Booster} instead of \code{predict.xgboost}) - for these \code{xgboost} objects, +\code{\link[=predict.xgb.Booster]{predict.xgb.Booster()}} instead of \code{\link[=predict.xgboost]{predict.xgboost()}}) - for these \code{xgboost} objects, \code{saveRDS} might thus be a better option if the extra functionalities are needed. Note that XGBoost models in R starting from version \verb{2.1.0} and onwards, and @@ -50,8 +51,8 @@ XGBoost models before version \verb{2.1.0}; have a very different R object struc are incompatible with each other. Hence, models that were saved with R serializers like \code{\link[=saveRDS]{saveRDS()}} or \code{\link[=save]{save()}} before version \verb{2.1.0} will not work with latter \code{xgboost} versions and vice versa. Be aware that the structure of R model objects -could in theory change again in the future, so XGBoost's serializers -should be preferred for long-term storage. +could in theory change again in the future, so XGBoost's serializers should be +preferred for long-term storage. 
Furthermore, note that using the package \code{qs} for serialization will require version 0.26 or higher of said package, and will have the same compatibility From 54930ec2e374c460c3ba3c44c1da850afc0f81f8 Mon Sep 17 00:00:00 2001 From: david-cortes Date: Thu, 5 Dec 2024 19:59:52 +0100 Subject: [PATCH 16/16] [R] use predict.xgb.Booster internally when needed (#11060) --- R-package/R/xgb.create.features.R | 2 +- R-package/R/xgb.plot.shap.R | 4 ++-- R-package/man/xgb.plot.shap.Rd | 2 +- R-package/man/xgb.plot.shap.summary.Rd | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/R-package/R/xgb.create.features.R b/R-package/R/xgb.create.features.R index f9d892caa1e5..2c4015c5f2de 100644 --- a/R-package/R/xgb.create.features.R +++ b/R-package/R/xgb.create.features.R @@ -86,7 +86,7 @@ #' @export xgb.create.features <- function(model, data, ...) { check.deprecation(...) - pred_with_leaf <- predict(model, data, predleaf = TRUE) + pred_with_leaf <- predict.xgb.Booster(model, data, predleaf = TRUE) cols <- lapply(as.data.frame(pred_with_leaf), factor) cbind(data, sparse.model.matrix(~ . -1, cols)) # nolint } diff --git a/R-package/R/xgb.plot.shap.R b/R-package/R/xgb.plot.shap.R index 443020e1ac7e..4184c6f5ea6a 100644 --- a/R-package/R/xgb.plot.shap.R +++ b/R-package/R/xgb.plot.shap.R @@ -16,7 +16,7 @@ #' @param target_class Only relevant for multiclass models. The default (`NULL`) #' averages the SHAP values over all classes. Pass a (0-based) class index #' to show only SHAP values of that class. -#' @param approxcontrib Passed to `predict()` when `shap_contrib = NULL`. +#' @param approxcontrib Passed to [predict.xgb.Booster()] when `shap_contrib = NULL`. #' @param subsample Fraction of data points randomly picked for plotting. #' The default (`NULL`) will use up to 100k data points. #' @param n_col Number of columns in a grid of plots. @@ -353,7 +353,7 @@ xgb.shap.data <- function(data, shap_contrib = NULL, features = NULL, top_n = 1, } if (is.null(shap_contrib)) { - shap_contrib <- predict( + shap_contrib <- predict.xgb.Booster( model, newdata = data, predcontrib = TRUE, diff --git a/R-package/man/xgb.plot.shap.Rd b/R-package/man/xgb.plot.shap.Rd index f4f51059d653..969a7d103c62 100644 --- a/R-package/man/xgb.plot.shap.Rd +++ b/R-package/man/xgb.plot.shap.Rd @@ -54,7 +54,7 @@ Only used when \code{features = NULL}.} averages the SHAP values over all classes. Pass a (0-based) class index to show only SHAP values of that class.} -\item{approxcontrib}{Passed to \code{predict()} when \code{shap_contrib = NULL}.} +\item{approxcontrib}{Passed to \code{\link[=predict.xgb.Booster]{predict.xgb.Booster()}} when \code{shap_contrib = NULL}.} \item{subsample}{Fraction of data points randomly picked for plotting. The default (\code{NULL}) will use up to 100k data points.} diff --git a/R-package/man/xgb.plot.shap.summary.Rd b/R-package/man/xgb.plot.shap.summary.Rd index f6df2daca758..b72c560b3769 100644 --- a/R-package/man/xgb.plot.shap.summary.Rd +++ b/R-package/man/xgb.plot.shap.summary.Rd @@ -51,7 +51,7 @@ Only used when \code{features = NULL}.} averages the SHAP values over all classes. Pass a (0-based) class index to show only SHAP values of that class.} -\item{approxcontrib}{Passed to \code{predict()} when \code{shap_contrib = NULL}.} +\item{approxcontrib}{Passed to \code{\link[=predict.xgb.Booster]{predict.xgb.Booster()}} when \code{shap_contrib = NULL}.} \item{subsample}{Fraction of data points randomly picked for plotting. The default (\code{NULL}) will use up to 100k data points.}