diff --git a/.appveyor.yml b/.appveyor.yml index f8cd85f17dcb..20807e36e6f5 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -7,6 +7,8 @@ configuration: # a trick to construct a build matrix with multiple Python versi environment: matrix: + - COMPILER: MINGW + TASK: r-package - COMPILER: MSVC TASK: python - COMPILER: MINGW diff --git a/.ci/setup.sh b/.ci/setup.sh index 39370fa27604..0c4101189346 100755 --- a/.ci/setup.sh +++ b/.ci/setup.sh @@ -17,7 +17,7 @@ if [[ $OS_NAME == "macos" ]]; then if [[ $AZURE == "true" ]] && [[ $TASK == "sdist" ]]; then brew install https://raw.githubusercontent.com/Homebrew/homebrew-core/f3544543a3115023fc7ca962c21d14b443f419d0/Formula/swig.rb # swig 3.0.12 fi - wget -q -O conda.sh https://repo.continuum.io/miniconda/Miniconda${PYTHON_VERSION:0:1}-latest-MacOSX-x86_64.sh + wget -q -O conda.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh else # Linux if [[ $TASK == "mpi" ]]; then sudo apt-get update @@ -37,7 +37,7 @@ else # Linux echo libamdocl64.so > $OPENCL_VENDOR_PATH/amdocl64.icd fi if [[ $TRAVIS == "true" ]]; then - wget -q -O conda.sh https://repo.continuum.io/miniconda/Miniconda${PYTHON_VERSION:0:1}-latest-Linux-x86_64.sh + wget -q -O conda.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh fi fi diff --git a/.ci/test.sh b/.ci/test.sh index 2b8b48daa6c0..3cb25ee10d14 100755 --- a/.ci/test.sh +++ b/.ci/test.sh @@ -56,7 +56,7 @@ if [[ $TRAVIS == "true" ]] && [[ $TASK == "lint" ]]; then echo "Linting R code" Rscript ${BUILD_DIRECTORY}/.ci/lint_r_code.R ${BUILD_DIRECTORY} || exit -1 echo "Linting C++ code" - cpplint --filter=-build/c++11,-build/include_subdir,-build/header_guard,-whitespace/line_length --recursive ./src ./include || exit 0 + cpplint --filter=-build/c++11,-build/include_subdir,-build/header_guard,-whitespace/line_length --recursive ./src ./include || exit -1 exit 0 fi diff --git a/.ci/test_r_package.sh b/.ci/test_r_package.sh index 4dddd36225f4..12e3b4e5724b 100755 --- 
a/.ci/test_r_package.sh +++ b/.ci/test_r_package.sh @@ -34,11 +34,6 @@ fi # Installing R precompiled for Mac OS 10.11 or higher if [[ $OS_NAME == "macos" ]]; then - - # temp fix for basictex - if [[ $AZURE == "true" ]]; then - brew update - fi brew install qpdf brew cask install basictex export PATH="/Library/TeX/texbin:$PATH" @@ -86,9 +81,17 @@ export _R_CHECK_FORCE_SUGGESTS_=0 # fails tests if either ERRORs or WARNINGs are thrown by # R CMD CHECK +check_succeeded="yes" R CMD check ${PKG_TARBALL} \ --as-cran \ -|| exit -1 +|| check_succeeded="no" + +echo "R CMD check build logs:" +cat ${BUILD_DIRECTORY}/lightgbm.Rcheck/00install.out + +if [[ $check_succeeded == "no" ]]; then + exit -1 +fi if grep -q -R "WARNING" "$LOG_FILE_NAME"; then echo "WARNINGS have been found by R CMD check!" @@ -105,5 +108,3 @@ if [[ ${NUM_CHECK_NOTES} -gt ${ALLOWED_CHECK_NOTES} ]]; then echo "Found ${NUM_CHECK_NOTES} NOTEs from R CMD check. Only ${ALLOWED_CHECK_NOTES} are allowed" exit -1 fi - -exit 0 diff --git a/.ci/test_r_package_windows.ps1 b/.ci/test_r_package_windows.ps1 new file mode 100644 index 000000000000..5bd65d28408b --- /dev/null +++ b/.ci/test_r_package_windows.ps1 @@ -0,0 +1,108 @@ +# Download a file and retry upon failure. 
This looks like +# an infinite loop but CI-level timeouts will kill it +function Download-File-With-Retries { + param( + [string]$url, + [string]$destfile + ) + do { + Write-Output "Downloading ${url}" + sleep 5; + (New-Object System.Net.WebClient).DownloadFile($url, $destfile) + } while(!$?); +} + +$env:R_WINDOWS_VERSION = "3.6.3" +$env:R_LIB_PATH = "$env:BUILD_SOURCESDIRECTORY/RLibrary" -replace '[\\]', '/' +$env:PATH = "$env:R_LIB_PATH/Rtools/bin;" + "$env:R_LIB_PATH/R/bin/x64;" + "$env:R_LIB_PATH/miktex/texmfs/install/miktex/bin/x64;" + $env:PATH +$env:CRAN_MIRROR = "https://cloud.r-project.org/" +$env:CTAN_MIRROR = "https://ctan.math.illinois.edu/systems/win32/miktex/tm/packages/" + +if ($env:COMPILER -eq "MINGW") { + $env:CXX = "$env:R_LIB_PATH/Rtools/mingw_64/bin/g++.exe" + $env:CC = "$env:R_LIB_PATH/Rtools/mingw_64/bin/gcc.exe" +} + +cd $env:BUILD_SOURCESDIRECTORY +tzutil /s "GMT Standard Time" +[Void][System.IO.Directory]::CreateDirectory($env:R_LIB_PATH) + +if ($env:COMPILER -eq "MINGW") { + Write-Output "Telling R to use MinGW" + $install_libs = "$env:BUILD_SOURCESDIRECTORY/R-package/src/install.libs.R" + ((Get-Content -path $install_libs -Raw) -replace 'use_mingw <- FALSE','use_mingw <- TRUE') | Set-Content -Path $install_libs +} + +# download R and RTools +Write-Output "Downloading R and Rtools" +Download-File-With-Retries -url "https://cloud.r-project.org/bin/windows/base/old/$env:R_WINDOWS_VERSION/R-$env:R_WINDOWS_VERSION-win.exe" -destfile "R-win.exe" +Download-File-With-Retries -url "https://cloud.r-project.org/bin/windows/Rtools/Rtools35.exe" -destfile "Rtools.exe" + +# Install R +Write-Output "Installing R" +Start-Process -FilePath R-win.exe -NoNewWindow -Wait -ArgumentList "/VERYSILENT /DIR=$env:R_LIB_PATH/R /COMPONENTS=main,x64" ; Check-Output $? +Write-Output "Done installing R" + +Write-Output "Installing Rtools" +Start-Process -FilePath Rtools.exe -NoNewWindow -Wait -ArgumentList "/VERYSILENT /DIR=$env:R_LIB_PATH/Rtools" ; Check-Output $? 
+Write-Output "Done installing Rtools" + +# MiKTeX and pandoc can be skipped on non-MINGW builds, since we don't +# build the package documentation for those +if ($env:COMPILER -eq "MINGW") { + Write-Output "Downloading MiKTeX" + Download-File-With-Retries -url "https://miktex.org/download/win/miktexsetup-x64.zip" -destfile "miktexsetup-x64.zip" + Add-Type -AssemblyName System.IO.Compression.FileSystem + [System.IO.Compression.ZipFile]::ExtractToDirectory("miktexsetup-x64.zip", "miktex") + Write-Output "Setting up MiKTeX" + .\miktex\miktexsetup.exe --remote-package-repository="$env:CTAN_MIRROR" --local-package-repository=./miktex/download --package-set=essential --quiet download ; Check-Output $? + Write-Output "Installing MiKTeX" + .\miktex\download\miktexsetup.exe --remote-package-repository="$env:CTAN_MIRROR" --portable="$env:R_LIB_PATH/miktex" --quiet install ; Check-Output $? + Write-Output "Done installing MiKTeX" + + initexmf --set-config-value [MPM]AutoInstall=1 + conda install -q -y --no-deps pandoc +} + +Add-Content .Renviron "R_LIBS=$env:R_LIB_PATH" + +Write-Output "Installing dependencies" +$packages = "c('data.table', 'jsonlite', 'Matrix', 'R6', 'testthat'), dependencies = c('Imports', 'Depends', 'LinkingTo')" +Rscript --vanilla -e "options(install.packages.check.source = 'no'); install.packages($packages, repos = '$env:CRAN_MIRROR', type = 'binary', lib = '$env:R_LIB_PATH')" ; Check-Output $? + +Write-Output "Building R package" +Rscript build_r.R --skip-install ; Check-Output $? + +$PKG_FILE_NAME = Get-Item *.tar.gz +$LOG_FILE_NAME = "lightgbm.Rcheck/00check.log" + +$env:_R_CHECK_FORCE_SUGGESTS_ = 0 +if ($env:COMPILER -ne "MINGW") { + Write-Output "Running R CMD check without checking documentation" + R.exe CMD check --no-multiarch --no-examples --no-manual --ignore-vignettes ${PKG_FILE_NAME} ; $check_succeeded = $? 
+} else { + Write-Output "Running R CMD check as CRAN" + R.exe CMD check --no-multiarch --as-cran ${PKG_FILE_NAME} ; $check_succeeded = $? +} + +Write-Output "R CMD check build logs:" +Get-Content -Path $env:BUILD_SOURCESDIRECTORY\lightgbm.Rcheck\00install.out + +Check-Output $check_succeeded + +Write-Output "Looking for issues with R CMD check results" +if (Get-Content "$LOG_FILE_NAME" | Select-String -Pattern "WARNING" -Quiet) { + echo "WARNINGS have been found by R CMD check!" + Check-Output $False +} + +$note_str = Get-Content "${LOG_FILE_NAME}" | Select-String -Pattern ' NOTE' | Out-String ; Check-Output $? +$relevant_line = $note_str -match '.*Status: (\d+) NOTE.*' +$NUM_CHECK_NOTES = $matches[1] +$ALLOWED_CHECK_NOTES = 3 +if ([int]$NUM_CHECK_NOTES -gt $ALLOWED_CHECK_NOTES) { + Write-Output "Found ${NUM_CHECK_NOTES} NOTEs from R CMD check. Only ${ALLOWED_CHECK_NOTES} are allowed" + Check-Output $False +} + +Write-Output "No issues were found checking the R package" diff --git a/.ci/test_windows.ps1 b/.ci/test_windows.ps1 index 82849577152d..fd0e9f95a5e4 100644 --- a/.ci/test_windows.ps1 +++ b/.ci/test_windows.ps1 @@ -12,6 +12,11 @@ if (Test-Path env:APPVEYOR) { $env:BUILD_SOURCESDIRECTORY = $env:APPVEYOR_BUILD_FOLDER } +if ($env:TASK -eq "r-package") { + & $env:BUILD_SOURCESDIRECTORY\.ci\test_r_package_windows.ps1 ; Check-Output $? 
+ Exit 0 +} + # setup for Python conda init powershell conda activate diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 6002b2dafaa5..fdcce7242e00 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -10,9 +10,9 @@ * @guolinke @StrikerRUS @jameslamb @Laurae2 # main C++ code -include/ @guolinke @chivee -src/ @guolinke @chivee -CmakeLists.txt @guolinke @chivee @Laurae2 @jameslamb @wxchan @henry0312 @StrikerRUS @huanzhang12 +include/ @guolinke @chivee @btrotta +src/ @guolinke @chivee @btrotta +CmakeLists.txt @guolinke @chivee @Laurae2 @jameslamb @wxchan @henry0312 @StrikerRUS @huanzhang12 @btrotta # R code include/LightGBM/lightgbm_R.h @Laurae2 @jameslamb diff --git a/.gitignore b/.gitignore index 283349aee0fa..5a06037ed68f 100644 --- a/.gitignore +++ b/.gitignore @@ -406,6 +406,14 @@ lightgbm_r/* lightgbm*.tar.gz lightgbm.Rcheck/ +# Files created by R examples and tests +**/lgb-Dataset.data +**/lgb-model.rds +**/lgb.Dataset.data +**/model.rds +**/model.txt +**/lgb-model.txt + # Files from interactive R sessions .Rproj.user **/.Rhistory diff --git a/.readthedocs.yml b/.readthedocs.yml index c2e19847a4fc..fe005bc307a8 100644 --- a/.readthedocs.yml +++ b/.readthedocs.yml @@ -4,7 +4,7 @@ formats: python: version: 3 install: - - requirements: docs/requirements.txt + - requirements: docs/requirements_rtd.txt sphinx: builder: html configuration: docs/conf.py diff --git a/.travis.yml b/.travis.yml index acba8bfc3ad0..3f13504929c7 100644 --- a/.travis.yml +++ b/.travis.yml @@ -8,7 +8,7 @@ os: - linux - osx dist: bionic -osx_image: xcode11.3 +osx_image: xcode11.4 env: global: # default values diff --git a/.vsts-ci.yml b/.vsts-ci.yml index a39dbc0e105b..a533ab1659e4 100644 --- a/.vsts-ci.yml +++ b/.vsts-ci.yml @@ -22,7 +22,7 @@ jobs: vmImage: 'ubuntu-latest' container: ubuntu1404 strategy: - maxParallel: 6 + maxParallel: 7 matrix: regular: TASK: regular @@ -76,7 +76,7 @@ jobs: pool: vmImage: 'macOS-10.14' strategy: - maxParallel: 3 + maxParallel: 4 matrix: 
regular: TASK: regular @@ -117,8 +117,11 @@ jobs: pool: vmImage: 'vs2017-win2016' strategy: - maxParallel: 3 + maxParallel: 4 matrix: + r_package: + TASK: r-package + COMPILER: MINGW regular: TASK: regular PYTHON_VERSION: 3.6 diff --git a/R-package/DESCRIPTION b/R-package/DESCRIPTION index 6720c418389b..1924b6a1d666 100755 --- a/R-package/DESCRIPTION +++ b/R-package/DESCRIPTION @@ -4,11 +4,11 @@ Title: Light Gradient Boosting Machine Version: 2.3.2 Date: 2019-11-26 Authors@R: c( - person("Guolin", "Ke", email = "guolin.ke@microsoft.com", role = c("aut", "cre")), - person("Damien", "Soukhavong", email = "damien.soukhavong@skema.edu", role = c("ctb")), - person("Yachen", "Yan", role = c("ctb")), - person("James", "Lamb", email="jaylamb20@gmail.com", role = c("ctb")) - ) + person("Guolin", "Ke", email = "guolin.ke@microsoft.com", role = c("aut", "cre")), + person("Damien", "Soukhavong", email = "damien.soukhavong@skema.edu", role = c("ctb")), + person("Yachen", "Yan", role = c("ctb")), + person("James", "Lamb", email="jaylamb20@gmail.com", role = c("ctb")) + ) Description: Tree based algorithms can be improved by introducing boosting frameworks. LightGBM is one such framework, and this package offers an R interface to work with it. It is designed to be distributed and efficient with the following advantages: 1. Faster training speed and higher efficiency. @@ -16,12 +16,14 @@ Description: Tree based algorithms can be improved by introducing boosting frame 3. Better accuracy. 4. Parallel learning supported. 5. Capable of handling large-scale data. - In recognition of these advantages, LightGBM has being widely-used in many winning solutions of machine learning competitions. + In recognition of these advantages, LightGBM has been widely-used in many winning solutions of machine learning competitions. 
Comparison experiments on public datasets suggest that LightGBM can outperform existing boosting frameworks on both efficiency and accuracy, with significantly lower memory consumption. In addition, parallel experiments suggest that in certain circumstances, LightGBM can achieve a linear speed-up in training time by using multiple machines. Encoding: UTF-8 License: MIT + file LICENSE URL: https://github.com/Microsoft/LightGBM BugReports: https://github.com/Microsoft/LightGBM/issues +NeedsCompilation: yes +Biarch: false Suggests: ggplot2 (>= 1.0.1), knitr, @@ -37,4 +39,6 @@ Imports: Matrix (>= 1.1-0), methods, utils -RoxygenNote: 7.0.2 +SystemRequirements: + C++11 +RoxygenNote: 7.1.0 diff --git a/R-package/NAMESPACE b/R-package/NAMESPACE index 7aef39a1f6de..14db7ba98d6c 100644 --- a/R-package/NAMESPACE +++ b/R-package/NAMESPACE @@ -36,6 +36,7 @@ export(saveRDS.lgb.Booster) export(setinfo) export(slice) import(methods) +importFrom(Matrix,Matrix) importFrom(R6,R6Class) importFrom(data.table,":=") importFrom(data.table,as.data.table) diff --git a/R-package/R/callback.R b/R-package/R/callback.R index 3c8bb243783b..d5495c9d56cd 100644 --- a/R-package/R/callback.R +++ b/R-package/R/callback.R @@ -1,3 +1,11 @@ +# constants that control naming in lists +.EVAL_KEY <- function() { + return("eval") +} +.EVAL_ERR_KEY <- function() { + return("eval_err") +} + #' @importFrom R6 R6Class CB_ENV <- R6::R6Class( "lgb.cb_env", @@ -216,8 +224,8 @@ cb.record.evaluation <- function() { # Create dummy lists env$model$record_evals[[data_name]][[name]] <- list() - env$model$record_evals[[data_name]][[name]]$eval <- list() - env$model$record_evals[[data_name]][[name]]$eval_err <- list() + env$model$record_evals[[data_name]][[name]][[.EVAL_KEY()]] <- list() + env$model$record_evals[[data_name]][[name]][[.EVAL_ERR_KEY()]] <- list() } @@ -238,12 +246,12 @@ cb.record.evaluation <- function() { name <- eval_res$name # Store evaluation data - env$model$record_evals[[data_name]][[name]]$eval <- c( 
- env$model$record_evals[[data_name]][[name]]$eval + env$model$record_evals[[data_name]][[name]][[.EVAL_KEY()]] <- c( + env$model$record_evals[[data_name]][[name]][[.EVAL_KEY()]] , eval_res$value ) - env$model$record_evals[[data_name]][[name]]$eval_err <- c( - env$model$record_evals[[data_name]][[name]]$eval_err + env$model$record_evals[[data_name]][[name]][[.EVAL_ERR_KEY()]] <- c( + env$model$record_evals[[data_name]][[name]][[.EVAL_ERR_KEY()]] , eval_err ) diff --git a/R-package/R/lgb.Booster.R b/R-package/R/lgb.Booster.R index f7aa4d10f49d..c9eb81f4d3c7 100644 --- a/R-package/R/lgb.Booster.R +++ b/R-package/R/lgb.Booster.R @@ -5,7 +5,7 @@ Booster <- R6::R6Class( public = list( best_iter = -1L, - best_score = NA, + best_score = NA_real_, record_evals = list(), # Finalize will free up the handles @@ -711,7 +711,6 @@ Booster <- R6::R6Class( #' number of columns corresponding to the number of trees. #' #' @examples -#' library(lightgbm) #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train #' dtrain <- lgb.Dataset(train$data, label = train$label) @@ -723,11 +722,10 @@ Booster <- R6::R6Class( #' model <- lgb.train( #' params = params #' , data = dtrain -#' , nrounds = 10L +#' , nrounds = 5L #' , valids = valids #' , min_data = 1L #' , learning_rate = 1.0 -#' , early_stopping_rounds = 5L #' ) #' preds <- predict(model, test$data) #' @export @@ -769,7 +767,7 @@ predict.lgb.Booster <- function(object, #' @return lgb.Booster #' #' @examples -#' library(lightgbm) +#' \donttest{ #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train #' dtrain <- lgb.Dataset(train$data, label = train$label) @@ -781,17 +779,17 @@ predict.lgb.Booster <- function(object, #' model <- lgb.train( #' params = params #' , data = dtrain -#' , nrounds = 10L +#' , nrounds = 5L #' , valids = valids #' , min_data = 1L #' , learning_rate = 1.0 -#' , early_stopping_rounds = 5L +#' , early_stopping_rounds = 3L #' ) #' lgb.save(model, "model.txt") #' load_booster <- 
lgb.load(filename = "model.txt") #' model_string <- model$save_model_to_string(NULL) # saves best iteration #' load_booster_from_str <- lgb.load(model_str = model_string) -#' +#' } #' @export lgb.load <- function(filename = NULL, model_str = NULL) { @@ -828,6 +826,7 @@ lgb.load <- function(filename = NULL, model_str = NULL) { #' @return lgb.Booster #' #' @examples +#' \donttest{ #' library(lightgbm) #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train @@ -846,7 +845,8 @@ lgb.load <- function(filename = NULL, model_str = NULL) { #' , learning_rate = 1.0 #' , early_stopping_rounds = 5L #' ) -#' lgb.save(model, "model.txt") +#' lgb.save(model, "lgb-model.txt") +#' } #' @export lgb.save <- function(booster, filename, num_iteration = NULL) { @@ -874,6 +874,7 @@ lgb.save <- function(booster, filename, num_iteration = NULL) { #' @return json format of model #' #' @examples +#' \donttest{ #' library(lightgbm) #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train @@ -893,7 +894,7 @@ lgb.save <- function(booster, filename, num_iteration = NULL) { #' , early_stopping_rounds = 5L #' ) #' json_model <- lgb.dump(model) -#' +#' } #' @export lgb.dump <- function(booster, num_iteration = NULL) { @@ -922,7 +923,6 @@ lgb.dump <- function(booster, num_iteration = NULL) { #' #' @examples #' # train a regression model -#' library(lightgbm) #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train #' dtrain <- lgb.Dataset(train$data, label = train$label) @@ -934,11 +934,10 @@ lgb.dump <- function(booster, num_iteration = NULL) { #' model <- lgb.train( #' params = params #' , data = dtrain -#' , nrounds = 10L +#' , nrounds = 5L #' , valids = valids #' , min_data = 1L #' , learning_rate = 1.0 -#' , early_stopping_rounds = 5L #' ) #' #' # Examine valid data_name values @@ -990,11 +989,11 @@ lgb.get.eval.result <- function(booster, data_name, eval_name, iters = NULL, is_ } # Create result - result <- 
booster$record_evals[[data_name]][[eval_name]]$eval + result <- booster$record_evals[[data_name]][[eval_name]][[.EVAL_KEY()]] # Check if error is requested if (is_err) { - result <- booster$record_evals[[data_name]][[eval_name]]$eval_err + result <- booster$record_evals[[data_name]][[eval_name]][[.EVAL_ERR_KEY()]] } # Check if iteration is non existant diff --git a/R-package/R/lgb.Dataset.R b/R-package/R/lgb.Dataset.R index c361a6c423c3..fed95913d2d6 100644 --- a/R-package/R/lgb.Dataset.R +++ b/R-package/R/lgb.Dataset.R @@ -725,7 +725,6 @@ Dataset <- R6::R6Class( #' @return constructed dataset #' #' @examples -#' library(lightgbm) #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train #' dtrain <- lgb.Dataset(train$data, label = train$label) @@ -770,7 +769,6 @@ lgb.Dataset <- function(data, #' @return constructed dataset #' #' @examples -#' library(lightgbm) #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train #' dtrain <- lgb.Dataset(train$data, label = train$label) @@ -797,7 +795,6 @@ lgb.Dataset.create.valid <- function(dataset, data, info = list(), ...) { #' @param dataset Object of class \code{lgb.Dataset} #' #' @examples -#' library(lightgbm) #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train #' dtrain <- lgb.Dataset(train$data, label = train$label) @@ -828,7 +825,6 @@ lgb.Dataset.construct <- function(dataset) { #' be directly used with an \code{lgb.Dataset} object. #' #' @examples -#' library(lightgbm) #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train #' dtrain <- lgb.Dataset(train$data, label = train$label) @@ -863,7 +859,6 @@ dim.lgb.Dataset <- function(x, ...) { #' Since row names are irrelevant, it is recommended to use \code{colnames} directly. 
#' #' @examples -#' library(lightgbm) #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train #' dtrain <- lgb.Dataset(train$data, label = train$label) @@ -936,7 +931,6 @@ dimnames.lgb.Dataset <- function(x) { #' @return constructed sub dataset #' #' @examples -#' library(lightgbm) #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train #' dtrain <- lgb.Dataset(train$data, label = train$label) @@ -983,7 +977,6 @@ slice.lgb.Dataset <- function(dataset, idxset, ...) { #' } #' #' @examples -#' library(lightgbm) #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train #' dtrain <- lgb.Dataset(train$data, label = train$label) @@ -1037,7 +1030,6 @@ getinfo.lgb.Dataset <- function(dataset, name, ...) { #' } #' #' @examples -#' library(lightgbm) #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train #' dtrain <- lgb.Dataset(train$data, label = train$label) @@ -1078,12 +1070,11 @@ setinfo.lgb.Dataset <- function(dataset, name, info, ...) 
{ #' @return passed dataset #' #' @examples -#' library(lightgbm) #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train #' dtrain <- lgb.Dataset(train$data, label = train$label) -#' lgb.Dataset.save(dtrain, "lgb.Dataset.data") -#' dtrain <- lgb.Dataset("lgb.Dataset.data") +#' lgb.Dataset.save(dtrain, "lgb-Dataset.data") +#' dtrain <- lgb.Dataset("lgb-Dataset.data") #' lgb.Dataset.set.categorical(dtrain, 1L:2L) #' #' @rdname lgb.Dataset.set.categorical @@ -1109,7 +1100,6 @@ lgb.Dataset.set.categorical <- function(dataset, categorical_feature) { #' @return passed dataset #' #' @examples -#' library(lightgbm) #' data(agaricus.train, package ="lightgbm") #' train <- agaricus.train #' dtrain <- lgb.Dataset(train$data, label = train$label) @@ -1141,7 +1131,6 @@ lgb.Dataset.set.reference <- function(dataset, reference) { #' @return passed dataset #' #' @examples -#' library(lightgbm) #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train #' dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/R/lgb.cv.R b/R-package/R/lgb.cv.R index 3433aade6594..6fa0a30c606f 100644 --- a/R-package/R/lgb.cv.R +++ b/R-package/R/lgb.cv.R @@ -56,7 +56,6 @@ CVBooster <- R6::R6Class( #' @return a trained model \code{lgb.CVBooster}. 
#' #' @examples -#' library(lightgbm) #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train #' dtrain <- lgb.Dataset(train$data, label = train$label) @@ -64,11 +63,10 @@ CVBooster <- R6::R6Class( #' model <- lgb.cv( #' params = params #' , data = dtrain -#' , nrounds = 10L +#' , nrounds = 5L #' , nfold = 3L #' , min_data = 1L #' , learning_rate = 1.0 -#' , early_stopping_rounds = 5L #' ) #' @importFrom data.table data.table setorderv #' @export @@ -372,14 +370,22 @@ lgb.cv <- function(params = list() } + # When early stopping is not activated, we compute the best iteration / score ourselves + # based on the first first metric if (record && is.na(env$best_score)) { - if (env$eval_list[[1L]]$higher_better[1L] == TRUE) { - cv_booster$best_iter <- unname(which.max(unlist(cv_booster$record_evals[[2L]][[1L]][[1L]]))) - cv_booster$best_score <- cv_booster$record_evals[[2L]][[1L]][[1L]][[cv_booster$best_iter]] - } else { - cv_booster$best_iter <- unname(which.min(unlist(cv_booster$record_evals[[2L]][[1L]][[1L]]))) - cv_booster$best_score <- cv_booster$record_evals[[2L]][[1L]][[1L]][[cv_booster$best_iter]] + first_metric <- cv_booster$boosters[[1L]][[1L]]$.__enclos_env__$private$eval_names[1L] + .find_best <- which.min + if (isTRUE(env$eval_list[[1L]]$higher_better[1L])) { + .find_best <- which.max } + cv_booster$best_iter <- unname( + .find_best( + unlist( + cv_booster$record_evals[["valid"]][[first_metric]][[.EVAL_KEY()]] + ) + ) + ) + cv_booster$best_score <- cv_booster$record_evals[["valid"]][[first_metric]][[.EVAL_KEY()]][[cv_booster$best_iter]] } if (reset_data) { diff --git a/R-package/R/lgb.importance.R b/R-package/R/lgb.importance.R index acef193ebbc4..3064673f664a 100644 --- a/R-package/R/lgb.importance.R +++ b/R-package/R/lgb.importance.R @@ -13,20 +13,22 @@ #' } #' #' @examples -#' library(lightgbm) #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train #' dtrain <- lgb.Dataset(train$data, label = train$label) #' #' params <- 
list( #' objective = "binary" -#' , learning_rate = 0.01 -#' , num_leaves = 63L +#' , learning_rate = 0.1 #' , max_depth = -1L #' , min_data_in_leaf = 1L #' , min_sum_hessian_in_leaf = 1.0 #' ) -#' model <- lgb.train(params, dtrain, 10L) +#' model <- lgb.train( +#' params = params +#' , data = dtrain +#' , nrounds = 5L +#' ) #' #' tree_imp1 <- lgb.importance(model, percentage = TRUE) #' tree_imp2 <- lgb.importance(model, percentage = FALSE) diff --git a/R-package/R/lgb.interprete.R b/R-package/R/lgb.interprete.R index eb0ecd94a6a1..e97fb1b590a1 100644 --- a/R-package/R/lgb.interprete.R +++ b/R-package/R/lgb.interprete.R @@ -16,7 +16,6 @@ #' Contribution columns to each class. #' #' @examples -#' Sigmoid <- function(x) 1.0 / (1.0 + exp(-x)) #' Logit <- function(x) log(x / (1.0 - x)) #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train @@ -27,13 +26,16 @@ #' #' params <- list( #' objective = "binary" -#' , learning_rate = 0.01 -#' , num_leaves = 63L +#' , learning_rate = 0.1 #' , max_depth = -1L #' , min_data_in_leaf = 1L #' , min_sum_hessian_in_leaf = 1.0 #' ) -#' model <- lgb.train(params, dtrain, 10L) +#' model <- lgb.train( +#' params = params +#' , data = dtrain +#' , nrounds = 3L +#' ) #' #' tree_interpretation <- lgb.interprete(model, test$data, 1L:5L) #' diff --git a/R-package/R/lgb.plot.importance.R b/R-package/R/lgb.plot.importance.R index b05dac39b91a..ec496c4213f3 100644 --- a/R-package/R/lgb.plot.importance.R +++ b/R-package/R/lgb.plot.importance.R @@ -24,17 +24,19 @@ #' #' params <- list( #' objective = "binary" -#' , learning_rate = 0.01 -#' , num_leaves = 63L -#' , max_depth = -1L +#' , learning_rate = 0.1 #' , min_data_in_leaf = 1L #' , min_sum_hessian_in_leaf = 1.0 #' ) #' -#' model <- lgb.train(params, dtrain, 10L) +#' model <- lgb.train( +#' params = params +#' , data = dtrain +#' , nrounds = 5L +#' ) #' #' tree_imp <- lgb.importance(model, percentage = TRUE) -#' lgb.plot.importance(tree_imp, top_n = 10L, measure = "Gain") +#' 
lgb.plot.importance(tree_imp, top_n = 5L, measure = "Gain") #' @importFrom graphics barplot par #' @export lgb.plot.importance <- function(tree_imp, diff --git a/R-package/R/lgb.plot.interpretation.R b/R-package/R/lgb.plot.interpretation.R index 2914ddf94f97..486b80dd46dd 100644 --- a/R-package/R/lgb.plot.interpretation.R +++ b/R-package/R/lgb.plot.interpretation.R @@ -15,28 +15,43 @@ #' The \code{lgb.plot.interpretation} function creates a \code{barplot}. #' #' @examples -#' library(lightgbm) -#' Sigmoid <- function(x) {1.0 / (1.0 + exp(-x))} -#' Logit <- function(x) {log(x / (1.0 - x))} +#' \donttest{ +#' Logit <- function(x) { +#' log(x / (1.0 - x)) +#' } #' data(agaricus.train, package = "lightgbm") -#' train <- agaricus.train -#' dtrain <- lgb.Dataset(train$data, label = train$label) -#' setinfo(dtrain, "init_score", rep(Logit(mean(train$label)), length(train$label))) +#' labels <- agaricus.train$label +#' dtrain <- lgb.Dataset( +#' agaricus.train$data +#' , label = labels +#' ) +#' setinfo(dtrain, "init_score", rep(Logit(mean(labels)), length(labels))) +#' #' data(agaricus.test, package = "lightgbm") -#' test <- agaricus.test #' #' params <- list( #' objective = "binary" -#' , learning_rate = 0.01 -#' , num_leaves = 63L +#' , learning_rate = 0.1 #' , max_depth = -1L #' , min_data_in_leaf = 1L #' , min_sum_hessian_in_leaf = 1.0 #' ) -#' model <- lgb.train(params, dtrain, 10L) +#' model <- lgb.train( +#' params = params +#' , data = dtrain +#' , nrounds = 5L +#' ) #' -#' tree_interpretation <- lgb.interprete(model, test$data, 1L:5L) -#' lgb.plot.interpretation(tree_interpretation[[1L]], top_n = 10L) +#' tree_interpretation <- lgb.interprete( +#' model = model +#' , data = agaricus.test$data +#' , idxset = 1L:5L +#' ) +#' lgb.plot.interpretation( +#' tree_interpretation_dt = tree_interpretation[[1L]] +#' , top_n = 3L +#' ) +#' } #' @importFrom data.table setnames #' @importFrom graphics barplot par #' @export @@ -126,7 +141,7 @@ multiple.tree.plot.interpretation 
<- function(tree_interpretation, } # create plot - tree_interpretation[Contribution > 0.0, bar_color := "firebrick"] + tree_interpretation[abs(Contribution) > 0.0, bar_color := "firebrick"] tree_interpretation[Contribution == 0.0, bar_color := "steelblue"] tree_interpretation[.N:1L, graphics::barplot( diff --git a/R-package/R/lgb.prepare.R b/R-package/R/lgb.prepare.R index 42a9daa0d434..863271e06040 100644 --- a/R-package/R/lgb.prepare.R +++ b/R-package/R/lgb.prepare.R @@ -8,7 +8,6 @@ #' for input in \code{lgb.Dataset}. #' #' @examples -#' library(lightgbm) #' data(iris) #' #' str(iris) diff --git a/R-package/R/lgb.prepare2.R b/R-package/R/lgb.prepare2.R index f2fdc89d0c2a..0d7179ed9496 100644 --- a/R-package/R/lgb.prepare2.R +++ b/R-package/R/lgb.prepare2.R @@ -11,7 +11,6 @@ #' for input in \code{lgb.Dataset}. #' #' @examples -#' library(lightgbm) #' data(iris) #' #' str(iris) diff --git a/R-package/R/lgb.prepare_rules.R b/R-package/R/lgb.prepare_rules.R index e6efe89ab25f..3eda16672ce7 100644 --- a/R-package/R/lgb.prepare_rules.R +++ b/R-package/R/lgb.prepare_rules.R @@ -10,7 +10,6 @@ #' in \code{lgb.Dataset}. #' #' @examples -#' library(lightgbm) #' data(iris) #' #' str(iris) @@ -37,9 +36,13 @@ #' data(iris) # Erase iris dataset #' #' # We remapped values differently -#' personal_rules <- list(Species = c("setosa" = 3L, -#' "versicolor" = 2L, -#' "virginica" = 1L)) +#' personal_rules <- list( +#' Species = c( +#' "setosa" = 3L +#' , "versicolor" = 2L +#' , "virginica" = 1L +#' ) +#' ) #' newest_iris <- lgb.prepare_rules(data = iris, rules = personal_rules) #' str(newest_iris$data) # SUCCESS! #' diff --git a/R-package/R/lgb.prepare_rules2.R b/R-package/R/lgb.prepare_rules2.R index dab2ae5f5271..62688a765b47 100644 --- a/R-package/R/lgb.prepare_rules2.R +++ b/R-package/R/lgb.prepare_rules2.R @@ -13,7 +13,6 @@ #' \code{lgb.Dataset}. 
#' #' @examples -#' library(lightgbm) #' data(iris) #' #' str(iris) diff --git a/R-package/R/lgb.train.R b/R-package/R/lgb.train.R index d0dacecc0bd1..e6cea8076b9f 100644 --- a/R-package/R/lgb.train.R +++ b/R-package/R/lgb.train.R @@ -29,7 +29,6 @@ #' @return a trained booster model \code{lgb.Booster}. #' #' @examples -#' library(lightgbm) #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train #' dtrain <- lgb.Dataset(train$data, label = train$label) @@ -41,11 +40,11 @@ #' model <- lgb.train( #' params = params #' , data = dtrain -#' , nrounds = 10L +#' , nrounds = 5L #' , valids = valids #' , min_data = 1L #' , learning_rate = 1.0 -#' , early_stopping_rounds = 5L +#' , early_stopping_rounds = 3L #' ) #' @export lgb.train <- function(params = list(), @@ -238,6 +237,7 @@ lgb.train <- function(params = list(), if (valid_contain_train) { booster$set_train_data_name(train_data_name) } + for (key in names(reduced_valid_sets)) { booster$add_valid(reduced_valid_sets[[key]], key) } @@ -291,16 +291,26 @@ lgb.train <- function(params = list(), } + # check if any valids were given other than the training data + non_train_valid_names <- names(valids)[!(names(valids) == train_data_name)] + first_valid_name <- non_train_valid_names[1L] + # When early stopping is not activated, we compute the best iteration / score ourselves by # selecting the first metric and the first dataset - if (record && length(valids) > 0L && is.na(env$best_score)) { - if (env$eval_list[[1L]]$higher_better[1L] == TRUE) { - booster$best_iter <- unname(which.max(unlist(booster$record_evals[[2L]][[1L]][[1L]]))) - booster$best_score <- booster$record_evals[[2L]][[1L]][[1L]][[booster$best_iter]] - } else { - booster$best_iter <- unname(which.min(unlist(booster$record_evals[[2L]][[1L]][[1L]]))) - booster$best_score <- booster$record_evals[[2L]][[1L]][[1L]][[booster$best_iter]] + if (record && length(non_train_valid_names) > 0L && is.na(env$best_score)) { + first_metric <- 
booster$.__enclos_env__$private$eval_names[1L] + .find_best <- which.min + if (isTRUE(env$eval_list[[1L]]$higher_better[1L])) { + .find_best <- which.max } + booster$best_iter <- unname( + .find_best( + unlist( + booster$record_evals[[first_valid_name]][[first_metric]][[.EVAL_KEY()]] + ) + ) + ) + booster$best_score <- booster$record_evals[[first_valid_name]][[first_metric]][[.EVAL_KEY()]][[booster$best_iter]] } # Check for booster model conversion to predictor model diff --git a/R-package/R/lgb.unloader.R b/R-package/R/lgb.unloader.R index cb80e2f01ff6..aaafca019358 100644 --- a/R-package/R/lgb.unloader.R +++ b/R-package/R/lgb.unloader.R @@ -14,7 +14,6 @@ #' @return NULL invisibly. #' #' @examples -#' library(lightgbm) #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train #' dtrain <- lgb.Dataset(train$data, label = train$label) @@ -26,11 +25,10 @@ #' model <- lgb.train( #' params = params #' , data = dtrain -#' , nrounds = 10L +#' , nrounds = 5L #' , valids = valids #' , min_data = 1L #' , learning_rate = 1.0 -#' , early_stopping_rounds = 5L #' ) #' #' \dontrun{ diff --git a/R-package/R/lightgbm.R b/R-package/R/lightgbm.R index 9459ccc4d5c4..12640ecfb0af 100644 --- a/R-package/R/lightgbm.R +++ b/R-package/R/lightgbm.R @@ -178,6 +178,7 @@ NULL # Various imports #' @import methods +#' @importFrom Matrix Matrix #' @importFrom R6 R6Class #' @useDynLib lib_lightgbm , .registration = TRUE NULL diff --git a/R-package/R/readRDS.lgb.Booster.R b/R-package/R/readRDS.lgb.Booster.R index e68dd8c963de..f0c862f33c74 100644 --- a/R-package/R/readRDS.lgb.Booster.R +++ b/R-package/R/readRDS.lgb.Booster.R @@ -7,6 +7,7 @@ #' @return \code{lgb.Booster}. 
#' #' @examples +#' \donttest{ #' library(lightgbm) #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train @@ -27,7 +28,7 @@ #' ) #' saveRDS.lgb.Booster(model, "model.rds") #' new_model <- readRDS.lgb.Booster("model.rds") -#' +#' } #' @export readRDS.lgb.Booster <- function(file = "", refhook = NULL) { diff --git a/R-package/R/saveRDS.lgb.Booster.R b/R-package/R/saveRDS.lgb.Booster.R index 21bd8483628f..185186c9bc25 100644 --- a/R-package/R/saveRDS.lgb.Booster.R +++ b/R-package/R/saveRDS.lgb.Booster.R @@ -18,6 +18,7 @@ #' @return NULL invisibly. #' #' @examples +#' \donttest{ #' library(lightgbm) #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train @@ -36,7 +37,8 @@ #' , learning_rate = 1.0 #' , early_stopping_rounds = 5L #' ) -#' saveRDS.lgb.Booster(model, "model.rds") +#' saveRDS.lgb.Booster(model, "lgb-model.rds") +#' } #' @export saveRDS.lgb.Booster <- function(object, file = "", diff --git a/R-package/R/utils.R b/R-package/R/utils.R index 9b036f91db8d..1e0e759d653b 100644 --- a/R-package/R/utils.R +++ b/R-package/R/utils.R @@ -19,6 +19,8 @@ lgb.encode.char <- function(arr, len) { } +# [description] Raise an error. Before raising that error, check for any error message +# stored in a buffer on the C++ side. 
lgb.last_error <- function() { # Perform text error buffering buf_len <- 200L diff --git a/R-package/README.md b/R-package/README.md index 6e4a6eb33050..c9474cf38107 100644 --- a/R-package/README.md +++ b/R-package/README.md @@ -147,7 +147,7 @@ The example below shows how to generate code coverage for the R package on a mac # Install export CXX=/usr/local/bin/g++-8 export CC=/usr/local/bin/gcc-8 -Rscript build_r.R +Rscript build_r.R --skip-install # Get coverage Rscript -e " \ diff --git a/R-package/man/agaricus.test.Rd b/R-package/man/agaricus.test.Rd index c713cbde6a9a..aad4627eedae 100644 --- a/R-package/man/agaricus.test.Rd +++ b/R-package/man/agaricus.test.Rd @@ -4,8 +4,10 @@ \name{agaricus.test} \alias{agaricus.test} \title{Test part from Mushroom Data Set} -\format{A list containing a label vector, and a dgCMatrix object with 1611 -rows and 126 variables} +\format{ +A list containing a label vector, and a dgCMatrix object with 1611 +rows and 126 variables +} \usage{ data(agaricus.test) } diff --git a/R-package/man/agaricus.train.Rd b/R-package/man/agaricus.train.Rd index 9465a6efea29..62b7d05a0bc3 100644 --- a/R-package/man/agaricus.train.Rd +++ b/R-package/man/agaricus.train.Rd @@ -4,8 +4,10 @@ \name{agaricus.train} \alias{agaricus.train} \title{Training part from Mushroom Data Set} -\format{A list containing a label vector, and a dgCMatrix object with 6513 -rows and 127 variables} +\format{ +A list containing a label vector, and a dgCMatrix object with 6513 +rows and 127 variables +} \usage{ data(agaricus.train) } diff --git a/R-package/man/bank.Rd b/R-package/man/bank.Rd index fd1382eb87d8..3b4c13b24d54 100644 --- a/R-package/man/bank.Rd +++ b/R-package/man/bank.Rd @@ -4,7 +4,9 @@ \name{bank} \alias{bank} \title{Bank Marketing Data Set} -\format{A data.table with 4521 rows and 17 variables} +\format{ +A data.table with 4521 rows and 17 variables +} \usage{ data(bank) } diff --git a/R-package/man/dim.Rd b/R-package/man/dim.Rd index 
4fdb64252f7e..55fde26d6a5b 100644 --- a/R-package/man/dim.Rd +++ b/R-package/man/dim.Rd @@ -22,7 +22,6 @@ Note: since \code{nrow} and \code{ncol} internally use \code{dim}, they can also be directly used with an \code{lgb.Dataset} object. } \examples{ -library(lightgbm) data(agaricus.train, package = "lightgbm") train <- agaricus.train dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/man/dimnames.lgb.Dataset.Rd b/R-package/man/dimnames.lgb.Dataset.Rd index 5f85ff12bd6f..22be85149646 100644 --- a/R-package/man/dimnames.lgb.Dataset.Rd +++ b/R-package/man/dimnames.lgb.Dataset.Rd @@ -24,7 +24,6 @@ Generic \code{dimnames} methods are used by \code{colnames}. Since row names are irrelevant, it is recommended to use \code{colnames} directly. } \examples{ -library(lightgbm) data(agaricus.train, package = "lightgbm") train <- agaricus.train dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/man/getinfo.Rd b/R-package/man/getinfo.Rd index 29254d8622f3..2925308ed7e9 100644 --- a/R-package/man/getinfo.Rd +++ b/R-package/man/getinfo.Rd @@ -33,7 +33,6 @@ The \code{name} field can be one of the following: } } \examples{ -library(lightgbm) data(agaricus.train, package = "lightgbm") train <- agaricus.train dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/man/lgb.Dataset.Rd b/R-package/man/lgb.Dataset.Rd index b6cbb5327a14..fb1d1067a53e 100644 --- a/R-package/man/lgb.Dataset.Rd +++ b/R-package/man/lgb.Dataset.Rd @@ -40,7 +40,6 @@ Construct \code{lgb.Dataset} object from dense matrix, sparse matrix or local file (that was created previously by saving an \code{lgb.Dataset}). 
} \examples{ -library(lightgbm) data(agaricus.train, package = "lightgbm") train <- agaricus.train dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/man/lgb.Dataset.construct.Rd b/R-package/man/lgb.Dataset.construct.Rd index 23dfc0e9f67b..4338f84b669c 100644 --- a/R-package/man/lgb.Dataset.construct.Rd +++ b/R-package/man/lgb.Dataset.construct.Rd @@ -13,7 +13,6 @@ lgb.Dataset.construct(dataset) Construct Dataset explicitly } \examples{ -library(lightgbm) data(agaricus.train, package = "lightgbm") train <- agaricus.train dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/man/lgb.Dataset.create.valid.Rd b/R-package/man/lgb.Dataset.create.valid.Rd index e48c93772a39..0669f1887171 100644 --- a/R-package/man/lgb.Dataset.create.valid.Rd +++ b/R-package/man/lgb.Dataset.create.valid.Rd @@ -22,7 +22,6 @@ constructed dataset Construct validation data according to training data } \examples{ -library(lightgbm) data(agaricus.train, package = "lightgbm") train <- agaricus.train dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/man/lgb.Dataset.save.Rd b/R-package/man/lgb.Dataset.save.Rd index d8446f030936..26895999d11a 100644 --- a/R-package/man/lgb.Dataset.save.Rd +++ b/R-package/man/lgb.Dataset.save.Rd @@ -19,7 +19,6 @@ Please note that \code{init_score} is not saved in binary file. If you need it, please set it again after loading Dataset. } \examples{ -library(lightgbm) data(agaricus.train, package = "lightgbm") train <- agaricus.train dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/man/lgb.Dataset.set.categorical.Rd b/R-package/man/lgb.Dataset.set.categorical.Rd index e1f03cfbf9e6..097eea02b465 100644 --- a/R-package/man/lgb.Dataset.set.categorical.Rd +++ b/R-package/man/lgb.Dataset.set.categorical.Rd @@ -21,12 +21,11 @@ Set the categorical features of an \code{lgb.Dataset} object. Use this function to tell LightGBM which features should be treated as categorical. 
} \examples{ -library(lightgbm) data(agaricus.train, package = "lightgbm") train <- agaricus.train dtrain <- lgb.Dataset(train$data, label = train$label) -lgb.Dataset.save(dtrain, "lgb.Dataset.data") -dtrain <- lgb.Dataset("lgb.Dataset.data") +lgb.Dataset.save(dtrain, "lgb-Dataset.data") +dtrain <- lgb.Dataset("lgb-Dataset.data") lgb.Dataset.set.categorical(dtrain, 1L:2L) } diff --git a/R-package/man/lgb.Dataset.set.reference.Rd b/R-package/man/lgb.Dataset.set.reference.Rd index fabe7c03e6fd..e8bd41820286 100644 --- a/R-package/man/lgb.Dataset.set.reference.Rd +++ b/R-package/man/lgb.Dataset.set.reference.Rd @@ -18,7 +18,6 @@ passed dataset If you want to use validation data, you should set reference to training data } \examples{ -library(lightgbm) data(agaricus.train, package ="lightgbm") train <- agaricus.train dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/man/lgb.cv.Rd b/R-package/man/lgb.cv.Rd index 01473339cf6f..673392f54568 100644 --- a/R-package/man/lgb.cv.Rd +++ b/R-package/man/lgb.cv.Rd @@ -100,7 +100,6 @@ a trained model \code{lgb.CVBooster}. 
Cross validation logic used by LightGBM } \examples{ -library(lightgbm) data(agaricus.train, package = "lightgbm") train <- agaricus.train dtrain <- lgb.Dataset(train$data, label = train$label) @@ -108,10 +107,9 @@ params <- list(objective = "regression", metric = "l2") model <- lgb.cv( params = params , data = dtrain - , nrounds = 10L + , nrounds = 5L , nfold = 3L , min_data = 1L , learning_rate = 1.0 - , early_stopping_rounds = 5L ) } diff --git a/R-package/man/lgb.dump.Rd b/R-package/man/lgb.dump.Rd index 828ba4ac7ea9..6fbc5cbe9b43 100644 --- a/R-package/man/lgb.dump.Rd +++ b/R-package/man/lgb.dump.Rd @@ -18,6 +18,7 @@ json format of model Dump LightGBM model to json } \examples{ +\donttest{ library(lightgbm) data(agaricus.train, package = "lightgbm") train <- agaricus.train @@ -37,5 +38,5 @@ model <- lgb.train( , early_stopping_rounds = 5L ) json_model <- lgb.dump(model) - +} } diff --git a/R-package/man/lgb.get.eval.result.Rd b/R-package/man/lgb.get.eval.result.Rd index c5473825e61a..5707d8ccb6c4 100644 --- a/R-package/man/lgb.get.eval.result.Rd +++ b/R-package/man/lgb.get.eval.result.Rd @@ -33,7 +33,6 @@ Given a \code{lgb.Booster}, return evaluation results for a } \examples{ # train a regression model -library(lightgbm) data(agaricus.train, package = "lightgbm") train <- agaricus.train dtrain <- lgb.Dataset(train$data, label = train$label) @@ -45,11 +44,10 @@ valids <- list(test = dtest) model <- lgb.train( params = params , data = dtrain - , nrounds = 10L + , nrounds = 5L , valids = valids , min_data = 1L , learning_rate = 1.0 - , early_stopping_rounds = 5L ) # Examine valid data_name values diff --git a/R-package/man/lgb.importance.Rd b/R-package/man/lgb.importance.Rd index 3d6c1fae5217..5a269407859f 100644 --- a/R-package/man/lgb.importance.Rd +++ b/R-package/man/lgb.importance.Rd @@ -24,20 +24,22 @@ For a tree model, a \code{data.table} with the following columns: Creates a \code{data.table} of feature importances in a model. 
} \examples{ -library(lightgbm) data(agaricus.train, package = "lightgbm") train <- agaricus.train dtrain <- lgb.Dataset(train$data, label = train$label) params <- list( objective = "binary" - , learning_rate = 0.01 - , num_leaves = 63L + , learning_rate = 0.1 , max_depth = -1L , min_data_in_leaf = 1L , min_sum_hessian_in_leaf = 1.0 ) -model <- lgb.train(params, dtrain, 10L) +model <- lgb.train( + params = params + , data = dtrain + , nrounds = 5L +) tree_imp1 <- lgb.importance(model, percentage = TRUE) tree_imp2 <- lgb.importance(model, percentage = FALSE) diff --git a/R-package/man/lgb.interprete.Rd b/R-package/man/lgb.interprete.Rd index aa8aedf156f4..86fb8ecb515b 100644 --- a/R-package/man/lgb.interprete.Rd +++ b/R-package/man/lgb.interprete.Rd @@ -29,7 +29,6 @@ For regression, binary classification and lambdarank model, a \code{list} of \co Computes feature contribution components of rawscore prediction. } \examples{ -Sigmoid <- function(x) 1.0 / (1.0 + exp(-x)) Logit <- function(x) log(x / (1.0 - x)) data(agaricus.train, package = "lightgbm") train <- agaricus.train @@ -40,13 +39,16 @@ test <- agaricus.test params <- list( objective = "binary" - , learning_rate = 0.01 - , num_leaves = 63L + , learning_rate = 0.1 , max_depth = -1L , min_data_in_leaf = 1L , min_sum_hessian_in_leaf = 1.0 ) -model <- lgb.train(params, dtrain, 10L) +model <- lgb.train( + params = params + , data = dtrain + , nrounds = 3L +) tree_interpretation <- lgb.interprete(model, test$data, 1L:5L) diff --git a/R-package/man/lgb.load.Rd b/R-package/man/lgb.load.Rd index c2b1500e9bc8..5f7c2354733e 100644 --- a/R-package/man/lgb.load.Rd +++ b/R-package/man/lgb.load.Rd @@ -19,7 +19,7 @@ Load LightGBM takes in either a file path or model string. 
If both are provided, Load will default to loading from file } \examples{ -library(lightgbm) +\donttest{ data(agaricus.train, package = "lightgbm") train <- agaricus.train dtrain <- lgb.Dataset(train$data, label = train$label) @@ -31,15 +31,15 @@ valids <- list(test = dtest) model <- lgb.train( params = params , data = dtrain - , nrounds = 10L + , nrounds = 5L , valids = valids , min_data = 1L , learning_rate = 1.0 - , early_stopping_rounds = 5L + , early_stopping_rounds = 3L ) lgb.save(model, "model.txt") load_booster <- lgb.load(filename = "model.txt") model_string <- model$save_model_to_string(NULL) # saves best iteration load_booster_from_str <- lgb.load(model_str = model_string) - +} } diff --git a/R-package/man/lgb.plot.importance.Rd b/R-package/man/lgb.plot.importance.Rd index 97775efd704d..024077a08409 100644 --- a/R-package/man/lgb.plot.importance.Rd +++ b/R-package/man/lgb.plot.importance.Rd @@ -43,15 +43,17 @@ dtrain <- lgb.Dataset(train$data, label = train$label) params <- list( objective = "binary" - , learning_rate = 0.01 - , num_leaves = 63L - , max_depth = -1L + , learning_rate = 0.1 , min_data_in_leaf = 1L , min_sum_hessian_in_leaf = 1.0 ) -model <- lgb.train(params, dtrain, 10L) +model <- lgb.train( + params = params + , data = dtrain + , nrounds = 5L +) tree_imp <- lgb.importance(model, percentage = TRUE) -lgb.plot.importance(tree_imp, top_n = 10L, measure = "Gain") +lgb.plot.importance(tree_imp, top_n = 5L, measure = "Gain") } diff --git a/R-package/man/lgb.plot.interpretation.Rd b/R-package/man/lgb.plot.interpretation.Rd index f5fa6497f2c0..f8266308552d 100644 --- a/R-package/man/lgb.plot.interpretation.Rd +++ b/R-package/man/lgb.plot.interpretation.Rd @@ -34,26 +34,41 @@ The graph represents each feature as a horizontal bar of length proportional to contribution of a feature. Features are shown ranked in a decreasing contribution order. 
} \examples{ -library(lightgbm) -Sigmoid <- function(x) {1.0 / (1.0 + exp(-x))} -Logit <- function(x) {log(x / (1.0 - x))} +\donttest{ +Logit <- function(x) { + log(x / (1.0 - x)) +} data(agaricus.train, package = "lightgbm") -train <- agaricus.train -dtrain <- lgb.Dataset(train$data, label = train$label) -setinfo(dtrain, "init_score", rep(Logit(mean(train$label)), length(train$label))) +labels <- agaricus.train$label +dtrain <- lgb.Dataset( + agaricus.train$data + , label = labels +) +setinfo(dtrain, "init_score", rep(Logit(mean(labels)), length(labels))) + data(agaricus.test, package = "lightgbm") -test <- agaricus.test params <- list( objective = "binary" - , learning_rate = 0.01 - , num_leaves = 63L + , learning_rate = 0.1 , max_depth = -1L , min_data_in_leaf = 1L , min_sum_hessian_in_leaf = 1.0 ) -model <- lgb.train(params, dtrain, 10L) +model <- lgb.train( + params = params + , data = dtrain + , nrounds = 5L +) -tree_interpretation <- lgb.interprete(model, test$data, 1L:5L) -lgb.plot.interpretation(tree_interpretation[[1L]], top_n = 10L) +tree_interpretation <- lgb.interprete( + model = model + , data = agaricus.test$data + , idxset = 1L:5L +) +lgb.plot.interpretation( + tree_interpretation_dt = tree_interpretation[[1L]] + , top_n = 3L +) +} } diff --git a/R-package/man/lgb.prepare.Rd b/R-package/man/lgb.prepare.Rd index dc1fed72e698..db726b15d36a 100644 --- a/R-package/man/lgb.prepare.Rd +++ b/R-package/man/lgb.prepare.Rd @@ -19,7 +19,6 @@ Attempts to prepare a clean dataset to prepare to put in a \code{lgb.Dataset}. \code{\link{lgb.prepare_rules}} if you want to apply this transformation to other datasets. } \examples{ -library(lightgbm) data(iris) str(iris) diff --git a/R-package/man/lgb.prepare2.Rd b/R-package/man/lgb.prepare2.Rd index e4eaf53df2f6..eef44758f42b 100644 --- a/R-package/man/lgb.prepare2.Rd +++ b/R-package/man/lgb.prepare2.Rd @@ -22,7 +22,6 @@ Attempts to prepare a clean dataset to prepare to put in a \code{lgb.Dataset}. input. 
Consider this as a half memory technique which is dangerous, especially for LightGBM. } \examples{ -library(lightgbm) data(iris) str(iris) diff --git a/R-package/man/lgb.prepare_rules.Rd b/R-package/man/lgb.prepare_rules.Rd index a766b7f26af6..cc34bf11ca5c 100644 --- a/R-package/man/lgb.prepare_rules.Rd +++ b/R-package/man/lgb.prepare_rules.Rd @@ -22,7 +22,6 @@ Attempts to prepare a clean dataset to prepare to put in a \code{lgb.Dataset}. so you can convert other datasets using this converter. } \examples{ -library(lightgbm) data(iris) str(iris) @@ -49,9 +48,13 @@ all.equal(new_iris$data, newer_iris$data) data(iris) # Erase iris dataset # We remapped values differently -personal_rules <- list(Species = c("setosa" = 3L, - "versicolor" = 2L, - "virginica" = 1L)) +personal_rules <- list( + Species = c( + "setosa" = 3L + , "versicolor" = 2L + , "virginica" = 1L + ) +) newest_iris <- lgb.prepare_rules(data = iris, rules = personal_rules) str(newest_iris$data) # SUCCESS! diff --git a/R-package/man/lgb.prepare_rules2.Rd b/R-package/man/lgb.prepare_rules2.Rd index b19f275f2009..7fe3de7c4929 100644 --- a/R-package/man/lgb.prepare_rules2.Rd +++ b/R-package/man/lgb.prepare_rules2.Rd @@ -25,7 +25,6 @@ Attempts to prepare a clean dataset to prepare to put in a \code{lgb.Dataset}. Consider this as a half memory technique which is dangerous, especially for LightGBM. 
} \examples{ -library(lightgbm) data(iris) str(iris) diff --git a/R-package/man/lgb.save.Rd b/R-package/man/lgb.save.Rd index 70bd098a0913..f1ffd48355ee 100644 --- a/R-package/man/lgb.save.Rd +++ b/R-package/man/lgb.save.Rd @@ -20,6 +20,7 @@ lgb.Booster Save LightGBM model } \examples{ +\donttest{ library(lightgbm) data(agaricus.train, package = "lightgbm") train <- agaricus.train @@ -38,5 +39,6 @@ model <- lgb.train( , learning_rate = 1.0 , early_stopping_rounds = 5L ) -lgb.save(model, "model.txt") +lgb.save(model, "lgb-model.txt") +} } diff --git a/R-package/man/lgb.train.Rd b/R-package/man/lgb.train.Rd index 98298ab6f954..b471e0c7601f 100644 --- a/R-package/man/lgb.train.Rd +++ b/R-package/man/lgb.train.Rd @@ -83,7 +83,6 @@ a trained booster model \code{lgb.Booster}. Logic to train with LightGBM } \examples{ -library(lightgbm) data(agaricus.train, package = "lightgbm") train <- agaricus.train dtrain <- lgb.Dataset(train$data, label = train$label) @@ -95,10 +94,10 @@ valids <- list(test = dtest) model <- lgb.train( params = params , data = dtrain - , nrounds = 10L + , nrounds = 5L , valids = valids , min_data = 1L , learning_rate = 1.0 - , early_stopping_rounds = 5L + , early_stopping_rounds = 3L ) } diff --git a/R-package/man/lgb.unloader.Rd b/R-package/man/lgb.unloader.Rd index 758a831ee3dd..ca69c08b602c 100644 --- a/R-package/man/lgb.unloader.Rd +++ b/R-package/man/lgb.unloader.Rd @@ -26,7 +26,6 @@ Attempts to unload LightGBM packages so you can remove objects cleanly without apparent reason and you do not want to restart R to fix the lost object. 
} \examples{ -library(lightgbm) data(agaricus.train, package = "lightgbm") train <- agaricus.train dtrain <- lgb.Dataset(train$data, label = train$label) @@ -38,11 +37,10 @@ valids <- list(test = dtest) model <- lgb.train( params = params , data = dtrain - , nrounds = 10L + , nrounds = 5L , valids = valids , min_data = 1L , learning_rate = 1.0 - , early_stopping_rounds = 5L ) \dontrun{ diff --git a/R-package/man/lgb_shared_params.Rd b/R-package/man/lgb_shared_params.Rd index ae2f61a86256..a7143179495e 100644 --- a/R-package/man/lgb_shared_params.Rd +++ b/R-package/man/lgb_shared_params.Rd @@ -4,7 +4,8 @@ \alias{lgb_shared_params} \title{Shared parameter docs} \arguments{ -\item{callbacks}{List of callback functions that are applied at each iteration.} +\item{callbacks}{list of callback functions +List of callback functions that are applied at each iteration.} \item{data}{a \code{lgb.Dataset} object, used for training. Some functions, such as \code{\link{lgb.cv}}, may allow you to pass other types of data like \code{matrix} and then separately supply diff --git a/R-package/man/lightgbm.Rd b/R-package/man/lightgbm.Rd index 88d98d13525d..256a7dc6e8e9 100644 --- a/R-package/man/lightgbm.Rd +++ b/R-package/man/lightgbm.Rd @@ -45,7 +45,8 @@ If early stopping occurs, the model will have 'best_iter' field.} \item{init_model}{path of model file of \code{lgb.Booster} object, will continue training from this model} -\item{callbacks}{List of callback functions that are applied at each iteration.} +\item{callbacks}{list of callback functions +List of callback functions that are applied at each iteration.} \item{...}{Additional arguments passed to \code{\link{lgb.train}}. 
For example \itemize{ diff --git a/R-package/man/predict.lgb.Booster.Rd b/R-package/man/predict.lgb.Booster.Rd index 985cd763689a..40444cbff7be 100644 --- a/R-package/man/predict.lgb.Booster.Rd +++ b/R-package/man/predict.lgb.Booster.Rd @@ -52,7 +52,6 @@ For regression or binary classification, it returns a vector of length \code{nro Predicted values based on class \code{lgb.Booster} } \examples{ -library(lightgbm) data(agaricus.train, package = "lightgbm") train <- agaricus.train dtrain <- lgb.Dataset(train$data, label = train$label) @@ -64,11 +63,10 @@ valids <- list(test = dtest) model <- lgb.train( params = params , data = dtrain - , nrounds = 10L + , nrounds = 5L , valids = valids , min_data = 1L , learning_rate = 1.0 - , early_stopping_rounds = 5L ) preds <- predict(model, test$data) } diff --git a/R-package/man/readRDS.lgb.Booster.Rd b/R-package/man/readRDS.lgb.Booster.Rd index 809333aef0dc..be03fd1cfcb8 100644 --- a/R-package/man/readRDS.lgb.Booster.Rd +++ b/R-package/man/readRDS.lgb.Booster.Rd @@ -18,6 +18,7 @@ readRDS.lgb.Booster(file = "", refhook = NULL) Attempts to load a model stored in a \code{.rds} file, using \code{\link[base]{readRDS}} } \examples{ +\donttest{ library(lightgbm) data(agaricus.train, package = "lightgbm") train <- agaricus.train @@ -38,5 +39,5 @@ model <- lgb.train( ) saveRDS.lgb.Booster(model, "model.rds") new_model <- readRDS.lgb.Booster("model.rds") - +} } diff --git a/R-package/man/saveRDS.lgb.Booster.Rd b/R-package/man/saveRDS.lgb.Booster.Rd index 2d1fbb636a93..66afa861db9f 100644 --- a/R-package/man/saveRDS.lgb.Booster.Rd +++ b/R-package/man/saveRDS.lgb.Booster.Rd @@ -42,6 +42,7 @@ Attempts to save a model using RDS. Has an additional parameter (\code{raw}) which decides whether to save the raw model or not. 
} \examples{ +\donttest{ library(lightgbm) data(agaricus.train, package = "lightgbm") train <- agaricus.train @@ -60,5 +61,6 @@ model <- lgb.train( , learning_rate = 1.0 , early_stopping_rounds = 5L ) -saveRDS.lgb.Booster(model, "model.rds") +saveRDS.lgb.Booster(model, "lgb-model.rds") +} } diff --git a/R-package/man/setinfo.Rd b/R-package/man/setinfo.Rd index 74d18673a8a1..344f79cc4621 100644 --- a/R-package/man/setinfo.Rd +++ b/R-package/man/setinfo.Rd @@ -38,7 +38,6 @@ The \code{name} field can be one of the following: } } \examples{ -library(lightgbm) data(agaricus.train, package = "lightgbm") train <- agaricus.train dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/man/slice.Rd b/R-package/man/slice.Rd index 6eaec3daeab2..90c837f222ab 100644 --- a/R-package/man/slice.Rd +++ b/R-package/man/slice.Rd @@ -24,7 +24,6 @@ Get a new \code{lgb.Dataset} containing the specified rows of original \code{lgb.Dataset} object } \examples{ -library(lightgbm) data(agaricus.train, package = "lightgbm") train <- agaricus.train dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/src/cmake/modules/FindLibR.cmake b/R-package/src/cmake/modules/FindLibR.cmake index e8e6fb45e12e..6a613925af5f 100644 --- a/R-package/src/cmake/modules/FindLibR.cmake +++ b/R-package/src/cmake/modules/FindLibR.cmake @@ -171,13 +171,26 @@ set(LIBR_HOME ${LIBR_HOME} CACHE PATH "R home directory") set(LIBR_EXECUTABLE ${LIBR_EXECUTABLE} CACHE PATH "R executable") set(LIBR_INCLUDE_DIRS ${LIBR_INCLUDE_DIRS} CACHE PATH "R include directory") +# where is R.so / R.dll / libR.so likely to be found? 
+set(LIBR_PATH_HINTS "${CMAKE_CURRENT_BINARY_DIR}" "${LIBR_HOME}/bin/${R_ARCH}" "${LIBR_HOME}/bin" "${LIBR_LIBRARIES}") + # look for the core R library find_library( LIBR_CORE_LIBRARY - NAMES R - HINTS "${CMAKE_CURRENT_BINARY_DIR}" "${LIBR_HOME}/lib" "${LIBR_HOME}/bin/${R_ARCH}" "${LIBR_HOME}/bin" "${LIBR_LIBRARIES}" + NAMES R R.dll + HINTS ${LIBR_PATH_HINTS} ) +# starting from CMake 3.17, find_library() will not find .dll files by default +# https://cmake.org/cmake/help/v3.17/release/3.17.html#other-changes +if (WIN32 AND NOT LIBR_CORE_LIBRARY) + find_file( + LIBR_CORE_LIBRARY + NAME R.dll + HINTS ${LIBR_PATH_HINTS} + ) +endif() + set(LIBR_CORE_LIBRARY ${LIBR_CORE_LIBRARY} CACHE PATH "R core shared library") if(WIN32 AND MSVC) diff --git a/R-package/src/install.libs.R b/R-package/src/install.libs.R index a79c93d846b7..339e8417ee70 100644 --- a/R-package/src/install.libs.R +++ b/R-package/src/install.libs.R @@ -67,6 +67,7 @@ if (!use_precompile) { # Check if Windows installation (for gcc vs Visual Studio) if (WINDOWS) { if (use_mingw) { + print("Trying to build with MinGW") cmake_cmd <- paste0(cmake_cmd, " -G \"MinGW Makefiles\" ") build_cmd <- "mingw32-make.exe _lightgbm" system(paste0(cmake_cmd, " ..")) # Must build twice for Windows due sh.exe in Rtools diff --git a/R-package/tests/testthat/test_basic.R b/R-package/tests/testthat/test_basic.R index e3f1030d7755..3455c17cad31 100644 --- a/R-package/tests/testthat/test_basic.R +++ b/R-package/tests/testthat/test_basic.R @@ -250,6 +250,37 @@ test_that("lgb.cv() throws an informative error is 'data' is not an lgb.Dataset } }) +test_that("lightgbm.cv() gives the correct best_score and best_iter for a metric where higher values are better", { + set.seed(708L) + dtrain <- lgb.Dataset( + data = as.matrix(runif(n = 500L, min = 0.0, max = 15.0), drop = FALSE) + , label = rep(c(0L, 1L), 250L) + ) + nrounds <- 10L + cv_bst <- lgb.cv( + data = dtrain + , nfold = 5L + , nrounds = nrounds + , num_leaves = 5L + , params = list( 
+ objective = "binary" + , metric = "auc,binary_error" + , learning_rate = 1.5 + ) + ) + expect_is(cv_bst, "lgb.CVBooster") + expect_named( + cv_bst$record_evals + , c("start_iter", "valid") + , ignore.order = FALSE + , ignore.case = FALSE + ) + auc_scores <- unlist(cv_bst$record_evals[["valid"]][["auc"]][["eval"]]) + expect_length(auc_scores, nrounds) + expect_identical(cv_bst$best_iter, which.max(auc_scores)) + expect_identical(cv_bst$best_score, auc_scores[which.max(auc_scores)]) +}) + context("lgb.train()") test_that("lgb.train() works as expected with multiple eval metrics", { @@ -595,3 +626,266 @@ test_that("lgb.train() supports non-ASCII feature names", { , feature_names ) }) + +test_that("when early stopping is not activated, best_iter and best_score come from valids and not training data", { + set.seed(708L) + trainDF <- data.frame( + "feat1" = rep(c(10.0, 100.0), 500L) + , "target" = rep(c(-50.0, 50.0), 500L) + ) + validDF <- data.frame( + "feat1" = rep(50.0, 4L) + , "target" = rep(50.0, 4L) + ) + dtrain <- lgb.Dataset( + data = as.matrix(trainDF[["feat1"]], drop = FALSE) + , label = trainDF[["target"]] + ) + dvalid1 <- lgb.Dataset( + data = as.matrix(validDF[["feat1"]], drop = FALSE) + , label = validDF[["target"]] + ) + dvalid2 <- lgb.Dataset( + data = as.matrix(validDF[1L:10L, "feat1"], drop = FALSE) + , label = validDF[1L:10L, "target"] + ) + nrounds <- 10L + train_params <- list( + objective = "regression" + , metric = "rmse" + , learning_rate = 1.5 + ) + + # example 1: two valids, neither are the training data + bst <- lgb.train( + data = dtrain + , nrounds = nrounds + , num_leaves = 5L + , valids = list( + "valid1" = dvalid1 + , "valid2" = dvalid2 + ) + , params = train_params + ) + expect_named( + bst$record_evals + , c("start_iter", "valid1", "valid2") + , ignore.order = FALSE + , ignore.case = FALSE + ) + rmse_scores <- unlist(bst$record_evals[["valid1"]][["rmse"]][["eval"]]) + expect_length(rmse_scores, nrounds) + 
expect_identical(bst$best_iter, which.min(rmse_scores)) + expect_identical(bst$best_score, rmse_scores[which.min(rmse_scores)]) + + # example 2: train first (called "train") and two valids + bst <- lgb.train( + data = dtrain + , nrounds = nrounds + , num_leaves = 5L + , valids = list( + "train" = dtrain + , "valid1" = dvalid1 + , "valid2" = dvalid2 + ) + , params = train_params + ) + expect_named( + bst$record_evals + , c("start_iter", "train", "valid1", "valid2") + , ignore.order = FALSE + , ignore.case = FALSE + ) + rmse_scores <- unlist(bst$record_evals[["valid1"]][["rmse"]][["eval"]]) + expect_length(rmse_scores, nrounds) + expect_identical(bst$best_iter, which.min(rmse_scores)) + expect_identical(bst$best_score, rmse_scores[which.min(rmse_scores)]) + + # example 3: train second (called "train") and two valids + bst <- lgb.train( + data = dtrain + , nrounds = nrounds + , num_leaves = 5L + , valids = list( + "valid1" = dvalid1 + , "train" = dtrain + , "valid2" = dvalid2 + ) + , params = train_params + ) + # note that "train" still ends up as the first one + expect_named( + bst$record_evals + , c("start_iter", "train", "valid1", "valid2") + , ignore.order = FALSE + , ignore.case = FALSE + ) + rmse_scores <- unlist(bst$record_evals[["valid1"]][["rmse"]][["eval"]]) + expect_length(rmse_scores, nrounds) + expect_identical(bst$best_iter, which.min(rmse_scores)) + expect_identical(bst$best_score, rmse_scores[which.min(rmse_scores)]) + + # example 4: train third (called "train") and two valids + bst <- lgb.train( + data = dtrain + , nrounds = nrounds + , num_leaves = 5L + , valids = list( + "valid1" = dvalid1 + , "valid2" = dvalid2 + , "train" = dtrain + ) + , params = train_params + ) + # note that "train" still ends up as the first one + expect_named( + bst$record_evals + , c("start_iter", "train", "valid1", "valid2") + , ignore.order = FALSE + , ignore.case = FALSE + ) + rmse_scores <- unlist(bst$record_evals[["valid1"]][["rmse"]][["eval"]]) + 
expect_length(rmse_scores, nrounds) + expect_identical(bst$best_iter, which.min(rmse_scores)) + expect_identical(bst$best_score, rmse_scores[which.min(rmse_scores)]) + + # example 5: train second (called "something-random-we-would-not-hardcode") and two valids + bst <- lgb.train( + data = dtrain + , nrounds = nrounds + , num_leaves = 5L + , valids = list( + "valid1" = dvalid1 + , "something-random-we-would-not-hardcode" = dtrain + , "valid2" = dvalid2 + ) + , params = train_params + ) + # note that "something-random-we-would-not-hardcode" was recognized as the training + # data even though it isn't named "train" + expect_named( + bst$record_evals + , c("start_iter", "something-random-we-would-not-hardcode", "valid1", "valid2") + , ignore.order = FALSE + , ignore.case = FALSE + ) + rmse_scores <- unlist(bst$record_evals[["valid1"]][["rmse"]][["eval"]]) + expect_length(rmse_scores, nrounds) + expect_identical(bst$best_iter, which.min(rmse_scores)) + expect_identical(bst$best_score, rmse_scores[which.min(rmse_scores)]) + + # example 6: the only valid supplied is the training data + bst <- lgb.train( + data = dtrain + , nrounds = nrounds + , num_leaves = 5L + , valids = list( + "train" = dtrain + ) + , params = train_params + ) + expect_identical(bst$best_iter, -1L) + expect_identical(bst$best_score, NA_real_) +}) + +test_that("lightgbm.train() gives the correct best_score and best_iter for a metric where higher values are better", { + set.seed(708L) + trainDF <- data.frame( + "feat1" = runif(n = 500L, min = 0.0, max = 15.0) + , "target" = rep(c(0L, 1L), 500L) + ) + validDF <- data.frame( + "feat1" = runif(n = 50L, min = 0.0, max = 15.0) + , "target" = rep(c(0L, 1L), 50L) + ) + dtrain <- lgb.Dataset( + data = as.matrix(trainDF[["feat1"]], drop = FALSE) + , label = trainDF[["target"]] + ) + dvalid1 <- lgb.Dataset( + data = as.matrix(validDF[1L:25L, "feat1"], drop = FALSE) + , label = validDF[1L:25L, "target"] + ) + nrounds <- 10L + bst <- lgb.train( + data = dtrain + , 
nrounds = nrounds + , num_leaves = 5L + , valids = list( + "valid1" = dvalid1 + , "something-random-we-would-not-hardcode" = dtrain + ) + , params = list( + objective = "binary" + , metric = "auc" + , learning_rate = 1.5 + ) + ) + # note that "something-random-we-would-not-hardcode" was recognized as the training + # data even though it isn't named "train" + expect_named( + bst$record_evals + , c("start_iter", "something-random-we-would-not-hardcode", "valid1") + , ignore.order = FALSE + , ignore.case = FALSE + ) + auc_scores <- unlist(bst$record_evals[["valid1"]][["auc"]][["eval"]]) + expect_length(auc_scores, nrounds) + expect_identical(bst$best_iter, which.max(auc_scores)) + expect_identical(bst$best_score, auc_scores[which.max(auc_scores)]) +}) + +test_that("using lightgbm() without early stopping, best_iter and best_score come from valids and not training data", { + set.seed(708L) + # example: train second (called "something-random-we-would-not-hardcode"), two valids, + # and a metric where higher values are better ("auc") + trainDF <- data.frame( + "feat1" = runif(n = 500L, min = 0.0, max = 15.0) + , "target" = rep(c(0L, 1L), 500L) + ) + validDF <- data.frame( + "feat1" = runif(n = 50L, min = 0.0, max = 15.0) + , "target" = rep(c(0L, 1L), 50L) + ) + dtrain <- lgb.Dataset( + data = as.matrix(trainDF[["feat1"]], drop = FALSE) + , label = trainDF[["target"]] + ) + dvalid1 <- lgb.Dataset( + data = as.matrix(validDF[1L:25L, "feat1"], drop = FALSE) + , label = validDF[1L:25L, "target"] + ) + dvalid2 <- lgb.Dataset( + data = as.matrix(validDF[26L:50L, "feat1"], drop = FALSE) + , label = validDF[26L:50L, "target"] + ) + nrounds <- 10L + bst <- lightgbm( + data = dtrain + , nrounds = nrounds + , num_leaves = 5L + , valids = list( + "valid1" = dvalid1 + , "something-random-we-would-not-hardcode" = dtrain + , "valid2" = dvalid2 + ) + , params = list( + objective = "binary" + , metric = "auc" + , learning_rate = 1.5 + ) + , verbose = -7L + ) + # when verbose <= 0 is 
passed to lightgbm(), 'valids' is passed through to lgb.train() + # untouched. If you set verbose to > 0, the training data will still be first but called "train" + expect_named( + bst$record_evals + , c("start_iter", "something-random-we-would-not-hardcode", "valid1", "valid2") + , ignore.order = FALSE + , ignore.case = FALSE + ) + auc_scores <- unlist(bst$record_evals[["valid1"]][["auc"]][["eval"]]) + expect_length(auc_scores, nrounds) + expect_identical(bst$best_iter, which.max(auc_scores)) + expect_identical(bst$best_score, auc_scores[which.max(auc_scores)]) +}) diff --git a/R-package/tests/testthat/test_lgb.Booster.R b/R-package/tests/testthat/test_lgb.Booster.R index 8ccb357626ce..6b1cc20f957a 100644 --- a/R-package/tests/testthat/test_lgb.Booster.R +++ b/R-package/tests/testthat/test_lgb.Booster.R @@ -227,3 +227,87 @@ test_that("If a string and a file are both passed to lgb.load() the file is used pred2 <- predict(bst2, test$data) expect_identical(pred, pred2) }) + +context("Booster") + +test_that("Creating a Booster from a Dataset should work", { + set.seed(708L) + data(agaricus.train, package = "lightgbm") + data(agaricus.test, package = "lightgbm") + dtrain <- lgb.Dataset( + agaricus.train$data + , label = agaricus.train$label + ) + bst <- Booster$new( + params = list( + objective = "binary" + ), + train_set = dtrain + ) + expect_true(lgb.is.Booster(bst)) + expect_equal(bst$current_iter(), 0L) + expect_true(is.na(bst$best_score)) + expect_true(all(bst$predict(agaricus.train$data) == 0.5)) +}) + +test_that("Creating a Booster from a Dataset with an existing predictor should work", { + set.seed(708L) + data(agaricus.train, package = "lightgbm") + nrounds <- 2L + bst <- lightgbm( + data = as.matrix(agaricus.train$data) + , label = agaricus.train$label + , num_leaves = 4L + , learning_rate = 1.0 + , nrounds = nrounds + , objective = "binary" + ) + data(agaricus.test, package = "lightgbm") + dtest <- Dataset$new( + data = agaricus.test$data + , label = 
agaricus.test$label + , predictor = bst$to_predictor() + ) + bst_from_ds <- Booster$new( + train_set = dtest + ) + expect_true(lgb.is.Booster(bst)) + expect_equal(bst$current_iter(), nrounds) + expect_equal(bst$eval_train()[[1L]][["value"]], 0.1115352) + expect_equal(bst_from_ds$current_iter(), nrounds) + dumped_model <- jsonlite::fromJSON(bst$dump_model()) + expect_identical(bst_from_ds$eval_train(), list()) + expect_equal(bst_from_ds$current_iter(), nrounds) +}) + +test_that("Booster$rollback_one_iter() should work as expected", { + set.seed(708L) + data(agaricus.train, package = "lightgbm") + data(agaricus.test, package = "lightgbm") + train <- agaricus.train + test <- agaricus.test + nrounds <- 5L + bst <- lightgbm( + data = as.matrix(train$data) + , label = train$label + , num_leaves = 4L + , learning_rate = 1.0 + , nrounds = nrounds + , objective = "binary" + ) + expect_equal(bst$current_iter(), nrounds) + expect_true(lgb.is.Booster(bst)) + logloss <- bst$eval_train()[[1L]][["value"]] + expect_equal(logloss, 0.01904786) + + x <- bst$rollback_one_iter() + + # rollback_one_iter() should return a booster and modify the original + # booster in place + expect_true(lgb.is.Booster(x)) + expect_equal(bst$current_iter(), nrounds - 1L) + + # score should now come from the model as of 4 iterations + logloss <- bst$eval_train()[[1L]][["value"]] + expect_equal(logloss, 0.027915146) +}) diff --git a/R-package/tests/testthat/test_utils.R b/R-package/tests/testthat/test_utils.R index a16e6f742061..9765356a0df8 100644 --- a/R-package/tests/testthat/test_utils.R +++ b/R-package/tests/testthat/test_utils.R @@ -48,3 +48,23 @@ test_that("lgb.params2str() works as expected for a key in params with multiple , "objective=magic metric=a,ab,abc,abcdefg nrounds=10 learning_rate=0.0000001" ) }) + +context("lgb.last_error") + +test_that("lgb.last_error() throws an error if there are no errors", { + expect_error({ + lgb.last_error() + }, regexp = "Everything is fine") +}) + 
+test_that("lgb.last_error() correctly returns errors from the C++ side", { + data(agaricus.train, package = "lightgbm") + train <- agaricus.train + dvalid1 <- lgb.Dataset( + data = train$data + , label = as.matrix(rnorm(5L)) + ) + expect_error({ + dvalid1$construct() + }, regexp = "[LightGBM] [Fatal] Length of label is not same with #data", fixed = TRUE) +}) diff --git a/README.md b/README.md index 290590d80e9e..a6bc3916b5c5 100644 --- a/README.md +++ b/README.md @@ -56,7 +56,7 @@ External (Unofficial) Repositories Optuna (hyperparameter optimization framework): https://github.com/optuna/optuna -Julia-package: https://github.com/Allardvm/LightGBM.jl +Julia-package: https://github.com/IQVIA-ML/LightGBM.jl JPMML (Java PMML converter): https://github.com/jpmml/jpmml-lightgbm diff --git a/docker/dockerfile-python b/docker/dockerfile-python index b157b41117ba..29fc4ece5b41 100644 --- a/docker/dockerfile-python +++ b/docker/dockerfile-python @@ -13,7 +13,7 @@ RUN apt-get update && \ git \ wget && \ # python environment - wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ + wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ /bin/bash Miniconda3-latest-Linux-x86_64.sh -f -b -p $CONDA_DIR && \ export PATH="$CONDA_DIR/bin:$PATH" && \ conda config --set always_yes yes --set changeps1 no && \ diff --git a/docker/gpu/dockerfile.gpu b/docker/gpu/dockerfile.gpu index c4801d6e462f..08c243a57bce 100644 --- a/docker/gpu/dockerfile.gpu +++ b/docker/gpu/dockerfile.gpu @@ -70,7 +70,7 @@ ENV PATH $CONDA_DIR/bin:$PATH # Install miniconda RUN echo "export PATH=$CONDA_DIR/bin:"'$PATH' > /etc/profile.d/conda.sh && \ - wget --quiet https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh && \ + wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh && \ /bin/bash ~/miniconda.sh -b -p $CONDA_DIR && \ rm ~/miniconda.sh diff --git a/docs/FAQ.rst b/docs/FAQ.rst 
index 8816a4430f97..71d9ad38bd19 100644 --- a/docs/FAQ.rst +++ b/docs/FAQ.rst @@ -22,6 +22,7 @@ You may also ping a member of the core team according to the relevant area of ex - `@guolinke `__ **Guolin Ke** (C++ code / R-package / Python-package) - `@chivee `__ **Qiwei Ye** (C++ code / Python-package) +- `@btrotta `__ **Belinda Trotta** (C++ code) - `@Laurae2 `__ **Damien Soukhavong** (R-package) - `@jameslamb `__ **James Lamb** (R-package) - `@wxchan `__ **Wenxuan Chen** (Python-package) @@ -210,7 +211,7 @@ This is a known bug: `Microsoft/LightGBM#539 `_. If you use ``lgb.dl()`` to build from source (i.e. not using pre-compiled dll), you need to upgrade your version of ``data.table`` to at least version 1.12.0. +If you are experiencing this error when running ``lightgbm``, you may be facing the same issue reported in `#2715 `_ and later in `#2989 `_. We have seen that in some situations, using ``data.table`` 1.11.x results in this error. To get around this, you can upgrade your version of ``data.table`` to at least version 1.12.0. 
------ diff --git a/docs/conf.py b/docs/conf.py index c157566d3805..6bbf7d52f498 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -139,6 +139,7 @@ def run(self): "h": "c", } breathe_show_define_initializer = True + c_id_attributes = ['LIGHTGBM_C_EXPORT'] # -- Options for HTML output ---------------------------------------------- @@ -227,13 +228,13 @@ def generate_r_docs(app): /home/docs/.conda/bin/conda create -q -y -n r_env \ r-base=3.5.1=h1e0a451_2 \ r-devtools=1.13.6=r351h6115d3f_0 \ - r-data.table=1.11.4=r351h96ca727_0 \ r-jsonlite=1.5=r351h96ca727_0 \ r-matrix=1.2_14=r351h96ca727_0 \ r-testthat=2.0.0=r351h29659fb_0 \ cmake=3.14.0=h52cb24c_0 \ ca-certificates=2019.11.27=0 /home/docs/.conda/bin/conda install -q -y -n r_env -c conda-forge \ + r-data.table=1.12.8=r35hcdcec82_0 \ r-pkgdown=1.3.0=r35h6115d3f_1001 \ r-roxygen2=6.1.1=r35h0357c0b_1001 source /home/docs/.conda/bin/activate r_env diff --git a/docs/requirements.txt b/docs/requirements.txt index 2fb1ed05cb53..17896e0c7283 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,2 +1,2 @@ -r requirements_base.txt -breathe < 4.15 +breathe diff --git a/docs/requirements_base.txt b/docs/requirements_base.txt index 23bd0e5b8c86..9c3dfc2a5b90 100644 --- a/docs/requirements_base.txt +++ b/docs/requirements_base.txt @@ -1,3 +1,3 @@ -sphinx < 3.0 +sphinx sphinx_rtd_theme >= 0.3 mock; python_version < '3' diff --git a/docs/requirements_rtd.txt b/docs/requirements_rtd.txt new file mode 100644 index 000000000000..91ce2a1b1e6b --- /dev/null +++ b/docs/requirements_rtd.txt @@ -0,0 +1,4 @@ +sphinx >= 3.0.2 +sphinx_rtd_theme >= 0.3 +mock; python_version < '3' +breathe diff --git a/include/LightGBM/R_object_helper.h b/include/LightGBM/R_object_helper.h index 1464c0a7083f..5be578e957f2 100644 --- a/include/LightGBM/R_object_helper.h +++ b/include/LightGBM/R_object_helper.h @@ -15,7 +15,6 @@ #include -#define TYPE_BITS 5 // use .Internal(internalsID()) to uuid #define R_INTERNALS_UUID 
"2fdf6c18-697a-4ba7-b8ef-11c0d92f1327" diff --git a/include/LightGBM/c_api.h b/include/LightGBM/c_api.h index 8e9a6fc2500c..6a30fce495c5 100644 --- a/include/LightGBM/c_api.h +++ b/include/LightGBM/c_api.h @@ -16,6 +16,7 @@ #include #include +#include #include @@ -1074,7 +1075,8 @@ static char* LastErrorMsg() { static THREAD_LOCAL char err_msg[512] = "Everythin * \param msg Error message */ inline void LGBM_SetLastError(const char* msg) { - std::strcpy(LastErrorMsg(), msg); + const int err_buf_len = 512; + snprintf(LastErrorMsg(), err_buf_len, "%s", msg); } #endif // LIGHTGBM_C_API_H_ diff --git a/swig/lightgbmlib.i b/swig/lightgbmlib.i index 5fba17e12adf..985dfb481f2a 100644 --- a/swig/lightgbmlib.i +++ b/swig/lightgbmlib.i @@ -233,11 +233,6 @@ %pointer_cast(int32_t *, void *, int32_t_to_voidp_ptr) %pointer_cast(int64_t *, void *, int64_t_to_voidp_ptr) -%array_functions(double, doubleArray) -%array_functions(float, floatArray) -%array_functions(int, intArray) -%array_functions(long, longArray) - /* Custom pointer manipulation template */ %define %pointer_manipulation(TYPE, NAME) %{ @@ -278,6 +273,36 @@ TYPE *NAME##_handle(); %enddef +%define %long_array_functions(TYPE,NAME) +%{ + static TYPE *new_##NAME(int64_t nelements) { %} + %{ return new TYPE[nelements](); %} + %{} + + static void delete_##NAME(TYPE *ary) { %} + %{ delete [] ary; %} + %{} + + static TYPE NAME##_getitem(TYPE *ary, int64_t index) { + return ary[index]; + } + static void NAME##_setitem(TYPE *ary, int64_t index, TYPE value) { + ary[index] = value; + } + %} + +TYPE *new_##NAME(int64_t nelements); +void delete_##NAME(TYPE *ary); +TYPE NAME##_getitem(TYPE *ary, int64_t index); +void NAME##_setitem(TYPE *ary, int64_t index, TYPE value); + +%enddef + +%long_array_functions(double, doubleArray) +%long_array_functions(float, floatArray) +%long_array_functions(int, intArray) +%long_array_functions(long, longArray) + %pointer_manipulation(void*, voidpp) /* Allow dereferencing of void** to void* */ diff 
--git a/tests/c_api_test/test_.py b/tests/c_api_test/test_.py index b138de4a0ef4..20593e5fe210 100644 --- a/tests/c_api_test/test_.py +++ b/tests/c_api_test/test_.py @@ -1,7 +1,6 @@ # coding: utf-8 import ctypes import os -import sys from platform import system diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index c5348e9858c4..3be0568e622a 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -1857,7 +1857,6 @@ def metrics_combination_cv_regression(metric_list, assumed_iteration, self.assertEqual(len(set([iter_valid1_l1, iter_valid1_l2, iter_valid2_l1, iter_valid2_l2])), 4) iter_min_l1 = min([iter_valid1_l1, iter_valid2_l1]) iter_min_l2 = min([iter_valid1_l2, iter_valid2_l2]) - iter_min = min([iter_min_l1, iter_min_l2]) iter_min_valid1 = min([iter_valid1_l1, iter_valid1_l2]) iter_cv_l1 = 4 diff --git a/tests/python_package_test/test_sklearn.py b/tests/python_package_test/test_sklearn.py index 1fb44d0dc12c..096d36a31171 100644 --- a/tests/python_package_test/test_sklearn.py +++ b/tests/python_package_test/test_sklearn.py @@ -4,6 +4,7 @@ import math import os import unittest +import warnings import lightgbm as lgb import numpy as np