From 90b129252cf2e84864d07bbc25f931111a15689f Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Thu, 3 Jun 2021 10:49:44 -0500 Subject: [PATCH 01/11] FIX fix kernel and line info in cmake --- ci/gpu/build.sh | 2 +- cpp/CMakeLists.txt | 7 ++++--- cpp/cmake/modules/ConfigureCUDA.cmake | 10 +++++++--- 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index 0ba03400a0..02b2e63363 100755 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -55,7 +55,7 @@ gpuci_conda_retry install -c conda-forge -c rapidsai -c rapidsai-nightly -c nvid "dask-cuda=${MINOR_VERSION}" \ "ucx-py=${MINOR_VERSION}" \ "ucx-proc=*=gpu" \ - "xgboost=1.4.2dev.rapidsai${MINOR_VERSION}" \ + "xgboost=1.4.2dev.rapidsai21.06" \ "rapids-build-env=${MINOR_VERSION}.*" \ "rapids-notebook-env=${MINOR_VERSION}.*" \ "rapids-doc-env=${MINOR_VERSION}.*" \ diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 8936c6d41f..05bdd21f81 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -58,12 +58,12 @@ option(BUILD_CUML_BENCH "Build cuML C++ benchmark tests" ON) option(BUILD_CUML_PRIMS_BENCH "Build ml-prims C++ benchmark tests" ON) option(BUILD_CUML_STD_COMMS "Build the standard NCCL+UCX Communicator" ON) option(BUILD_CUML_MPI_COMMS "Build the MPI+NCCL Communicator (used for testing)" OFF) +option(CUDA_ENABLE_KERNEL_INFO "Enable kernel resource usage info" OFF) +option(CUDA_ENABLE_LINE_INFO "Enable lineinfo in nvcc" OFF) option(DETECT_CONDA_ENV "Enable detection of conda environment for dependencies" ON) option(DISABLE_DEPRECATION_WARNINGS "Disable depreaction warnings " ON) option(DISABLE_OPENMP "Disable OpenMP" OFF) option(ENABLE_CUMLPRIMS_MG "Enable algorithms that use libcumlprims_mg" ON) -option(KERNEL_INFO "Enable kernel resource usage info" OFF) -option(LINE_INFO "Enable lineinfo in nvcc" OFF) option(NVTX "Enable nvtx markers" OFF) option(SINGLEGPU "Disable all mnmg components and comms libraries" OFF) option(USE_CCACHE "Cache build artifacts with ccache" OFF) @@ -82,7 +82,8 @@ message(VERBOSE "CUML: Enabling detection of conda environment for dependencies: message(VERBOSE "CUML: Disabling OpenMP: ${DISABLE_OPENMP}") message(VERBOSE "CUML: Enabling algorithms that use libcumlprims_mg: ${ENABLE_CUMLPRIMS_MG}") message(VERBOSE "CUML: Enabling kernel resource usage info: ${KERNEL_INFO}") -message(VERBOSE "CUML: Enabling lineinfo in nvcc: ${LINE_INFO}") +message(VERBOSE "CUML: Enabling kernelinfo in nvcc: ${CUDA_ENABLE_KERNEL_INFO}") +message(VERBOSE "CUML: Enabling lineinfo in nvcc: ${CUDA_ENABLE_LINE_INFO}") message(VERBOSE "CUML: Enabling nvtx markers: ${NVTX}") message(VERBOSE "CUML: Disabling all mnmg components and comms libraries: ${SINGLEGPU}") message(VERBOSE "CUML: Cache build artifacts with ccache: ${USE_CCACHE}") diff --git a/cpp/cmake/modules/ConfigureCUDA.cmake b/cpp/cmake/modules/ConfigureCUDA.cmake index b9f6795a67..b1c3ae83de 100644 --- a/cpp/cmake/modules/ConfigureCUDA.cmake +++ b/cpp/cmake/modules/ConfigureCUDA.cmake @@ -34,9 +34,13 @@ if(DISABLE_DEPRECATION_WARNING) endif() # Option to enable line info in CUDA device compilation to allow introspection when profiling / memchecking -if(CUDA_ENABLE_LINEINFO) - list(APPEND CUML_CUDA_FLAGS -lineinfo) -endif() +if(CUDA_ENABLE_LINE_INFO) + list(APPEND CUML_CUDA_FLAGS -lineinfo) +endif(LINE_INFO) + +if(CUDA_ENABLE_KERNEL_INFO) + list(APPEND CUML_CUDA_FLAGS -Xptxas=-v) +endif(KERNEL_INFO) # Debug options if(CMAKE_BUILD_TYPE MATCHES Debug) From 36c80a253311c30a5d93f088aff7fdf87cb42bc0 Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Thu, 3 Jun 2021 11:33:35 -0500 Subject: [PATCH 02/11] FIX Use ucx-py 0.21 --- ci/gpu/build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index 02b2e63363..a986541584 100755 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -53,7 +53,7 @@ gpuci_conda_retry install -c conda-forge -c rapidsai -c rapidsai-nightly -c nvid "libcumlprims=${MINOR_VERSION}" \ "dask-cudf=${MINOR_VERSION}" \ "dask-cuda=${MINOR_VERSION}" \ - "ucx-py=${MINOR_VERSION}" \ + "ucx-py=0.21" \ "ucx-proc=*=gpu" \ "xgboost=1.4.2dev.rapidsai21.06" \ "rapids-build-env=${MINOR_VERSION}.*" \ From 8594edaed047a6e089f4f8ecc5accebe36d67290 Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Sat, 5 Jun 2021 14:07:45 -0500 Subject: [PATCH 03/11] DBG Try installing xgb 21.06 after the big conda install --- ci/gpu/build.sh | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index a986541584..5afe84bda8 100755 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -55,7 +55,6 @@ gpuci_conda_retry install -c conda-forge -c rapidsai -c rapidsai-nightly -c nvid "dask-cuda=${MINOR_VERSION}" \ "ucx-py=0.21" \ "ucx-proc=*=gpu" \ - "xgboost=1.4.2dev.rapidsai21.06" \ "rapids-build-env=${MINOR_VERSION}.*" \ "rapids-notebook-env=${MINOR_VERSION}.*" \ "rapids-doc-env=${MINOR_VERSION}.*" \ @@ -65,6 +64,10 @@ gpuci_conda_retry install -c conda-forge -c rapidsai -c rapidsai-nightly -c nvid # gpuci_conda_retry remove --force rapids-build-env rapids-notebook-env # gpuci_conda_retry install -y "your-pkg=1.0.0" +# Installing xgboost 21.06 in the install above was causing conflicts +gpuci_conda_retry remove --force rapids-build-env rapids-notebook-env +gpuci_conda_retry install -y -c conda-forge -c rapidsai -c rapidsai-nightly -c nvidia "xgboost=1.4.2dev.rapidsai21.06" + gpuci_logger "Install contextvars if needed" py_ver=$(python -c "import sys; print('.'.join(map(str, sys.version_info[:2])))") if [ "$py_ver" == "3.6" ];then From 04029757ee2f02d4bc8f8a905d4980a7e6173c4e Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Sun, 6 Jun 2021 14:03:05 -0500 Subject: [PATCH 04/11] DBG Try using mamba to avoid timeouts --- ci/gpu/build.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index 5afe84bda8..23250a4eea 100755 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -45,8 +45,10 @@ gpuci_logger "Activate conda env" . /opt/conda/etc/profile.d/conda.sh conda activate rapids +gpuci_conda_retry install -c conda-forge mamba + gpuci_logger "Install dependencies" -gpuci_conda_retry install -c conda-forge -c rapidsai -c rapidsai-nightly -c nvidia \ +mamba install -c conda-forge -c rapidsai -c rapidsai-nightly -c nvidia \ "cudatoolkit=${CUDA_REL}" \ "cudf=${MINOR_VERSION}" \ "rmm=${MINOR_VERSION}" \ @@ -66,7 +68,7 @@ gpuci_conda_retry install -c conda-forge -c rapidsai -c rapidsai-nightly -c nvid # Installing xgboost 21.06 in the install above was causing conflicts gpuci_conda_retry remove --force rapids-build-env rapids-notebook-env -gpuci_conda_retry install -y -c conda-forge -c rapidsai -c rapidsai-nightly -c nvidia "xgboost=1.4.2dev.rapidsai21.06" +mamba install -y -c conda-forge -c rapidsai -c rapidsai-nightly -c nvidia "xgboost=1.4.2dev.rapidsai21.06" gpuci_logger "Install contextvars if needed" py_ver=$(python -c "import sys; print('.'.join(map(str, sys.version_info[:2])))") From 5a5d184635ba2e17a6ea38facee01f15a0d243f4 Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Sun, 6 Jun 2021 15:48:34 -0500 Subject: [PATCH 05/11] DBG install xgboost after libcuml artifact --- ci/gpu/build.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index 23250a4eea..661e71ca3d 100755 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -66,10 +66,6 @@ mamba install -c conda-forge -c rapidsai -c rapidsai-nightly -c nvidia \ # gpuci_conda_retry remove --force rapids-build-env rapids-notebook-env # gpuci_conda_retry install -y "your-pkg=1.0.0" -# Installing xgboost 21.06 in the install above was causing conflicts -gpuci_conda_retry remove --force rapids-build-env rapids-notebook-env -mamba install -y -c conda-forge -c rapidsai -c rapidsai-nightly -c nvidia "xgboost=1.4.2dev.rapidsai21.06" - gpuci_logger "Install contextvars if needed" py_ver=$(python -c "import sys; print('.'.join(map(str, sys.version_info[:2])))") if [ "$py_ver" == "3.6" ];then @@ -198,6 +194,10 @@ else gpuci_logger "Installing $CONDA_FILE" conda install -c ${CONDA_ARTIFACT_PATH} "$CONDA_FILE" + # Installing xgboost 21.06 in the install above was causing conflicts + gpuci_conda_retry remove --force rapids-build-env rapids-notebook-env + mamba install -y -c conda-forge -c rapidsai -c rapidsai-nightly -c nvidia "xgboost=1.4.2dev.rapidsai21.06" + gpuci_logger "Install the main version of dask and distributed" set -x pip install "git+https://github.com/dask/distributed.git@main" --upgrade --no-deps From e1bca087b93510bbcc28d83f88f2282f5c07a183 Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Sun, 6 Jun 2021 17:16:35 -0500 Subject: [PATCH 06/11] DBG Playing with using mamba a little bit more --- ci/gpu/build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index 661e71ca3d..646d9ef410 100755 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -192,7 +192,7 @@ else CONDA_FILE=`basename "$CONDA_FILE" .tar.bz2` #get filename without extension CONDA_FILE=${CONDA_FILE//-/=} #convert to conda install gpuci_logger "Installing $CONDA_FILE" - conda install -c ${CONDA_ARTIFACT_PATH} "$CONDA_FILE" + mamba install -c ${CONDA_ARTIFACT_PATH} "$CONDA_FILE" # Installing xgboost 21.06 in the install above was causing conflicts gpuci_conda_retry remove --force rapids-build-env rapids-notebook-env From 24b3198b30bdd912e84e4dd30e0c70bfb7b4dd97 Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Sun, 6 Jun 2021 23:05:11 -0500 Subject: [PATCH 07/11] DBG Remove xgboost instead of all of the above --- ci/gpu/build.sh | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index 646d9ef410..cf2ed8e231 100755 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -45,18 +45,17 @@ gpuci_logger "Activate conda env" . /opt/conda/etc/profile.d/conda.sh conda activate rapids -gpuci_conda_retry install -c conda-forge mamba - gpuci_logger "Install dependencies" -mamba install -c conda-forge -c rapidsai -c rapidsai-nightly -c nvidia \ +gpuci_conda_retry install -c conda-forge -c rapidsai -c rapidsai-nightly -c nvidia \ "cudatoolkit=${CUDA_REL}" \ "cudf=${MINOR_VERSION}" \ "rmm=${MINOR_VERSION}" \ "libcumlprims=${MINOR_VERSION}" \ "dask-cudf=${MINOR_VERSION}" \ "dask-cuda=${MINOR_VERSION}" \ - "ucx-py=0.21" \ + "ucx-py=${MINOR_VERSION}" \ "ucx-proc=*=gpu" \ + # "xgboost=1.4.2dev.rapidsai${MINOR_VERSION}" \ "rapids-build-env=${MINOR_VERSION}.*" \ "rapids-notebook-env=${MINOR_VERSION}.*" \ "rapids-doc-env=${MINOR_VERSION}.*" \ @@ -192,11 +191,7 @@ else CONDA_FILE=`basename "$CONDA_FILE" .tar.bz2` #get filename without extension CONDA_FILE=${CONDA_FILE//-/=} #convert to conda install gpuci_logger "Installing $CONDA_FILE" - mamba install -c ${CONDA_ARTIFACT_PATH} "$CONDA_FILE" - - # Installing xgboost 21.06 in the install above was causing conflicts - gpuci_conda_retry remove --force rapids-build-env rapids-notebook-env - mamba install -y -c conda-forge -c rapidsai -c rapidsai-nightly -c nvidia "xgboost=1.4.2dev.rapidsai21.06" + conda install -c ${CONDA_ARTIFACT_PATH} "$CONDA_FILE" gpuci_logger "Install the main version of dask and distributed" set -x From 9692b43ab3611ea727dc6446786e3aed7576404e Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Mon, 7 Jun 2021 07:57:43 -0500 Subject: [PATCH 08/11] DBG correct ucx-py version --- ci/gpu/build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index cf2ed8e231..a56bc6e1c3 100755 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -53,7 +53,7 @@ gpuci_conda_retry install -c conda-forge -c rapidsai -c rapidsai-nightly -c nvid "libcumlprims=${MINOR_VERSION}" \ "dask-cudf=${MINOR_VERSION}" \ "dask-cuda=${MINOR_VERSION}" \ - "ucx-py=${MINOR_VERSION}" \ + "ucx-py=0.21" \ "ucx-proc=*=gpu" \ # "xgboost=1.4.2dev.rapidsai${MINOR_VERSION}" \ "rapids-build-env=${MINOR_VERSION}.*" \ From 0f28f3139177db8994fa819b1a0bc9cc8eacafa3 Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Mon, 7 Jun 2021 10:34:09 -0500 Subject: [PATCH 09/11] FIX Change order of commented code to make the script happy --- ci/gpu/build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index a56bc6e1c3..2b08981adb 100755 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -55,11 +55,11 @@ gpuci_conda_retry install -c conda-forge -c rapidsai -c rapidsai-nightly -c nvid "dask-cuda=${MINOR_VERSION}" \ "ucx-py=0.21" \ "ucx-proc=*=gpu" \ - # "xgboost=1.4.2dev.rapidsai${MINOR_VERSION}" \ "rapids-build-env=${MINOR_VERSION}.*" \ "rapids-notebook-env=${MINOR_VERSION}.*" \ "rapids-doc-env=${MINOR_VERSION}.*" \ "shap>=0.37,<=0.39" + # "xgboost=1.4.2dev.rapidsai${MINOR_VERSION}" \ # https://docs.rapids.ai/maintainers/depmgmt/ # gpuci_conda_retry remove --force rapids-build-env rapids-notebook-env From 8d20def233e1e60170f89233c7c80667ee14ec0a Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Mon, 7 Jun 2021 15:17:50 -0500 Subject: [PATCH 10/11] FIX Merge main and use dask main --- ci/gpu/build.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index eb45126dad..a70173a615 100755 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -195,8 +195,8 @@ else gpuci_logger "Install the main version of dask and distributed" set -x - pip install "git+https://github.com/dask/distributed.git@2021.05.1" --upgrade --no-deps - pip install "git+https://github.com/dask/dask.git@2021.05.1" --upgrade --no-deps + pip install "git+https://github.com/dask/distributed.git@main" --upgrade --no-deps + pip install "git+https://github.com/dask/dask.git@main" --upgrade --no-deps set +x gpuci_logger "Building cuml" From 71e06857fa89260827ff390b9c9e1150b4416618 Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Tue, 8 Jun 2021 17:01:50 -0500 Subject: [PATCH 11/11] FIX add back xgboost now that package is published --- ci/gpu/build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index a70173a615..9d8feebb14 100755 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -55,11 +55,11 @@ gpuci_conda_retry install -c conda-forge -c rapidsai -c rapidsai-nightly -c nvid "dask-cuda=${MINOR_VERSION}" \ "ucx-py=0.21" \ "ucx-proc=*=gpu" \ + "xgboost=1.4.2dev.rapidsai${MINOR_VERSION}" \ "rapids-build-env=${MINOR_VERSION}.*" \ "rapids-notebook-env=${MINOR_VERSION}.*" \ "rapids-doc-env=${MINOR_VERSION}.*" \ "shap>=0.37,<=0.39" - # "xgboost=1.4.2dev.rapidsai${MINOR_VERSION}" \ # https://docs.rapids.ai/maintainers/depmgmt/ # gpuci_conda_retry remove --force rapids-build-env rapids-notebook-env