From 1fa8c70f1c5a866f2fe0d5bc06c289a60d39b3ab Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Thu, 17 Oct 2024 01:04:29 -0700 Subject: [PATCH 01/86] Remove existing GH workflows --- .github/dependabot.yml | 35 --- .github/lock.yml | 32 --- .github/workflows/freebsd.yml | 34 --- .github/workflows/i386.yml | 43 ---- .github/workflows/jvm_tests.yml | 100 -------- .github/workflows/main.yml | 193 --------------- .github/workflows/python_tests.yml | 348 ---------------------------- .github/workflows/python_wheels.yml | 55 ----- .github/workflows/r_nold.yml | 44 ---- .github/workflows/r_tests.yml | 150 ------------ .github/workflows/scorecards.yml | 54 ----- .github/workflows/update_rapids.yml | 44 ---- 12 files changed, 1132 deletions(-) delete mode 100644 .github/dependabot.yml delete mode 100644 .github/lock.yml delete mode 100644 .github/workflows/freebsd.yml delete mode 100644 .github/workflows/i386.yml delete mode 100644 .github/workflows/jvm_tests.yml delete mode 100644 .github/workflows/main.yml delete mode 100644 .github/workflows/python_tests.yml delete mode 100644 .github/workflows/python_wheels.yml delete mode 100644 .github/workflows/r_nold.yml delete mode 100644 .github/workflows/r_tests.yml delete mode 100644 .github/workflows/scorecards.yml delete mode 100644 .github/workflows/update_rapids.yml diff --git a/.github/dependabot.yml b/.github/dependabot.yml deleted file mode 100644 index 1a8098071ba3..000000000000 --- a/.github/dependabot.yml +++ /dev/null @@ -1,35 +0,0 @@ -# To get started with Dependabot version updates, you'll need to specify which -# package ecosystems to update and where the package manifests are located. -# Please see the documentation for all configuration options: -# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates - -version: 2 -updates: - - package-ecosystem: "maven" - directory: "/jvm-packages" - schedule: - interval: "monthly" - - package-ecosystem: "maven" - directory: "/jvm-packages/xgboost4j" - schedule: - interval: "monthly" - - package-ecosystem: "maven" - directory: "/jvm-packages/xgboost4j-gpu" - schedule: - interval: "monthly" - - package-ecosystem: "maven" - directory: "/jvm-packages/xgboost4j-example" - schedule: - interval: "monthly" - - package-ecosystem: "maven" - directory: "/jvm-packages/xgboost4j-spark" - schedule: - interval: "monthly" - - package-ecosystem: "maven" - directory: "/jvm-packages/xgboost4j-spark-gpu" - schedule: - interval: "monthly" - - package-ecosystem: "github-actions" - directory: / - schedule: - interval: "monthly" diff --git a/.github/lock.yml b/.github/lock.yml deleted file mode 100644 index f916abe5a367..000000000000 --- a/.github/lock.yml +++ /dev/null @@ -1,32 +0,0 @@ -# Configuration for lock-threads - https://github.com/dessant/lock-threads - -# Number of days of inactivity before a closed issue or pull request is locked -daysUntilLock: 90 - -# Issues and pull requests with these labels will not be locked. Set to `[]` to disable -exemptLabels: - - feature-request - -# Label to add before locking, such as `outdated`. Set to `false` to disable -lockLabel: false - -# Comment to post before locking. Set to `false` to disable -lockComment: false - -# Assign `resolved` as the reason for locking. 
Set to `false` to disable -setLockReason: true - -# Limit to only `issues` or `pulls` -# only: issues - -# Optionally, specify configuration settings just for `issues` or `pulls` -# issues: -# exemptLabels: -# - help-wanted -# lockLabel: outdated - -# pulls: -# daysUntilLock: 30 - -# Repository to extend settings from -# _extends: repo diff --git a/.github/workflows/freebsd.yml b/.github/workflows/freebsd.yml deleted file mode 100644 index d3208a1294d1..000000000000 --- a/.github/workflows/freebsd.yml +++ /dev/null @@ -1,34 +0,0 @@ -name: FreeBSD - -on: [push, pull_request] - -permissions: - contents: read # to fetch code (actions/checkout) - -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: true - -jobs: - test: - runs-on: ubuntu-latest - timeout-minutes: 20 - name: A job to run test in FreeBSD - steps: - - uses: actions/checkout@v4 - with: - submodules: 'true' - - name: Test in FreeBSD - id: test - uses: vmactions/freebsd-vm@v1 - with: - usesh: true - prepare: | - pkg install -y cmake git ninja googletest - - run: | - mkdir build - cd build - cmake .. -GNinja -DGOOGLE_TEST=ON - ninja -v - ./testxgboost diff --git a/.github/workflows/i386.yml b/.github/workflows/i386.yml deleted file mode 100644 index 72618dc697a6..000000000000 --- a/.github/workflows/i386.yml +++ /dev/null @@ -1,43 +0,0 @@ -name: XGBoost-i386-test - -on: [push, pull_request] - -permissions: - contents: read # to fetch code (actions/checkout) - -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: true - -jobs: - build-32bit: - name: Build 32-bit - runs-on: ubuntu-latest - services: - registry: - image: registry:2 - ports: - - 5000:5000 - steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3.6.1 - with: - driver-opts: network=host - - name: Build and push container - uses: docker/build-push-action@v6 - with: - context: . 
- file: tests/ci_build/Dockerfile.i386 - push: true - tags: localhost:5000/xgboost/build-32bit:latest - cache-from: type=gha - cache-to: type=gha,mode=max - - name: Build XGBoost - run: | - docker run --rm -v $PWD:/workspace -w /workspace \ - -e CXXFLAGS='-Wno-error=overloaded-virtual -Wno-error=maybe-uninitialized -Wno-error=redundant-move' \ - localhost:5000/xgboost/build-32bit:latest \ - tests/ci_build/build_via_cmake.sh diff --git a/.github/workflows/jvm_tests.yml b/.github/workflows/jvm_tests.yml deleted file mode 100644 index 1281c5d5fe56..000000000000 --- a/.github/workflows/jvm_tests.yml +++ /dev/null @@ -1,100 +0,0 @@ -name: XGBoost-JVM-Tests - -on: [push, pull_request] - -permissions: - contents: read # to fetch code (actions/checkout) - -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: true - -jobs: - test-with-jvm: - name: Test JVM on OS ${{ matrix.os }} - timeout-minutes: 30 - runs-on: ${{ matrix.os }} - strategy: - fail-fast: false - matrix: - os: [windows-latest, ubuntu-latest, macos-13] - - steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - - uses: actions/setup-java@6a0805fcefea3d4657a47ac4c165951e33482018 # v4.2.2 - with: - distribution: 'temurin' - java-version: '8' - - - uses: conda-incubator/setup-miniconda@a4260408e20b96e80095f42ff7f1a15b27dd94ca # v3.0.4 - with: - miniforge-variant: Mambaforge - miniforge-version: latest - activate-environment: jvm_tests - environment-file: tests/ci_build/conda_env/jvm_tests.yml - use-mamba: true - - - name: Cache Maven packages - uses: actions/cache@0c45773b623bea8c8e75f6c82b208c3cf94ea4f9 # v4.0.2 - with: - path: ~/.m2 - key: ${{ runner.os }}-m2-${{ hashFiles('./jvm-packages/pom.xml') }} - restore-keys: ${{ runner.os }}-m2-${{ hashFiles('./jvm-packages/pom.xml') }} - - - name: Test XGBoost4J (Core) - run: | - cd jvm-packages - mvn test -B -pl :xgboost4j_2.12 - - - name: Test XGBoost4J (Core, Spark, Examples) - run: | - rm -rfv build/ - cd jvm-packages - mvn -B test - if: matrix.os == 'ubuntu-latest' # Distributed training doesn't work on Windows - - - name: Extract branch name - shell: bash - run: | - echo "branch=${GITHUB_REF#refs/heads/}" >> "$GITHUB_OUTPUT" - id: extract_branch - if: | - (github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')) && - (matrix.os == 'windows-latest' || matrix.os == 'macos-13') - - - name: Publish artifact xgboost4j.dll to S3 - run: | - cd lib/ - Rename-Item -Path xgboost4j.dll -NewName xgboost4j_${{ github.sha }}.dll - dir - python -m awscli s3 cp xgboost4j_${{ github.sha }}.dll s3://xgboost-nightly-builds/${{ steps.extract_branch.outputs.branch }}/libxgboost4j/ --acl public-read --region us-west-2 - if: | - (github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')) && - matrix.os == 'windows-latest' - env: - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID_IAM_S3_UPLOADER }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY_IAM_S3_UPLOADER }} - - - name: Publish artifact libxgboost4j.dylib to S3 - shell: bash -l {0} - run: | - cd lib/ - mv -v libxgboost4j.dylib libxgboost4j_${{ github.sha }}.dylib - ls - python -m awscli s3 cp libxgboost4j_${{ github.sha }}.dylib s3://xgboost-nightly-builds/${{ steps.extract_branch.outputs.branch }}/libxgboost4j/ --acl public-read --region us-west-2 - if: | - (github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')) && - matrix.os == 'macos-13' - 
env: - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID_IAM_S3_UPLOADER }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY_IAM_S3_UPLOADER }} - - - name: Build and Test XGBoost4J with scala 2.13 - run: | - rm -rfv build/ - cd jvm-packages - mvn -B clean install test -Pdefault,scala-2.13 - if: matrix.os == 'ubuntu-latest' # Distributed training doesn't work on Windows diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml deleted file mode 100644 index e515f97296fa..000000000000 --- a/.github/workflows/main.yml +++ /dev/null @@ -1,193 +0,0 @@ -# This is a basic workflow to help you get started with Actions - -name: XGBoost-CI - -# Controls when the action will run. Triggers the workflow on push or pull request -# events but only for the master branch -on: [push, pull_request] - -permissions: - contents: read # to fetch code (actions/checkout) - -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: true - -# A workflow run is made up of one or more jobs that can run sequentially or in parallel -jobs: - gtest-cpu: - name: Test Google C++ test (CPU) - runs-on: ${{ matrix.os }} - strategy: - fail-fast: false - matrix: - os: [macos-12] - steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - name: Install system packages - run: | - brew install ninja libomp - - name: Build gtest binary - run: | - mkdir build - cd build - cmake .. -DGOOGLE_TEST=ON -DUSE_OPENMP=ON -DUSE_DMLC_GTEST=ON -GNinja -DBUILD_DEPRECATED_CLI=ON -DUSE_SANITIZER=ON -DENABLED_SANITIZERS=address -DCMAKE_BUILD_TYPE=RelWithDebInfo - ninja -v - - name: Run gtest binary - run: | - cd build - ./testxgboost - ctest -R TestXGBoostCLI --extra-verbose - - gtest-cpu-nonomp: - name: Test Google C++ unittest (CPU Non-OMP) - runs-on: ${{ matrix.os }} - strategy: - fail-fast: false - matrix: - os: [ubuntu-latest] - steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - name: Install system packages - run: | - sudo apt-get install -y --no-install-recommends ninja-build - - name: Build and install XGBoost - shell: bash -l {0} - run: | - mkdir build - cd build - cmake .. -GNinja -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DUSE_OPENMP=OFF -DBUILD_DEPRECATED_CLI=ON - ninja -v - - name: Run gtest binary - run: | - cd build - ctest --extra-verbose - - gtest-cpu-sycl: - name: Test Google C++ unittest (CPU SYCL) - runs-on: ${{ matrix.os }} - strategy: - fail-fast: false - matrix: - os: [ubuntu-latest] - python-version: ["3.10"] - steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - uses: conda-incubator/setup-miniconda@a4260408e20b96e80095f42ff7f1a15b27dd94ca # v3.0.4 - with: - miniforge-variant: Mambaforge - miniforge-version: latest - activate-environment: linux_sycl_test - environment-file: tests/ci_build/conda_env/linux_sycl_test.yml - use-mamba: true - - name: Display Conda env - run: | - conda info - conda list - - name: Build and install XGBoost - shell: bash -l {0} - run: | - mkdir build - cd build - cmake .. 
-DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DPLUGIN_SYCL=ON -DCMAKE_CXX_COMPILER=g++ -DCMAKE_C_COMPILER=gcc -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX - make -j$(nproc) - - name: Run gtest binary for SYCL - run: | - cd build - ./testxgboost --gtest_filter=Sycl* - - name: Run gtest binary for non SYCL - run: | - cd build - ./testxgboost --gtest_filter=-Sycl* - - c-api-demo: - name: Test installing XGBoost lib + building the C API demo - runs-on: ${{ matrix.os }} - defaults: - run: - shell: bash -l {0} - strategy: - fail-fast: false - matrix: - os: ["ubuntu-latest"] - python-version: ["3.10"] - steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - uses: conda-incubator/setup-miniconda@a4260408e20b96e80095f42ff7f1a15b27dd94ca # v3.0.4 - with: - miniforge-variant: Mambaforge - miniforge-version: latest - activate-environment: cpp_test - environment-file: tests/ci_build/conda_env/cpp_test.yml - use-mamba: true - - name: Display Conda env - run: | - conda info - conda list - - - name: Build and install XGBoost static library - run: | - mkdir build - cd build - cmake .. -DBUILD_STATIC_LIB=ON -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX -GNinja - ninja -v install - cd - - - name: Build and run C API demo with static - run: | - pushd . - cd demo/c-api/ - mkdir build - cd build - cmake .. -GNinja -DCMAKE_PREFIX_PATH=$CONDA_PREFIX - ninja -v - ctest - cd .. - rm -rf ./build - popd - - - name: Build and install XGBoost shared library - run: | - cd build - cmake .. -DBUILD_STATIC_LIB=OFF -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX -GNinja -DPLUGIN_FEDERATED=ON -DGOOGLE_TEST=ON - ninja -v install - ./testxgboost - cd - - - name: Build and run C API demo with shared - run: | - pushd . - cd demo/c-api/ - mkdir build - cd build - cmake .. 
-GNinja -DCMAKE_PREFIX_PATH=$CONDA_PREFIX - ninja -v - ctest - popd - ./tests/ci_build/verify_link.sh ./demo/c-api/build/basic/api-demo - ./tests/ci_build/verify_link.sh ./demo/c-api/build/external-memory/external-memory-demo - - cpp-lint: - runs-on: ubuntu-latest - name: Code linting for C++ - steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 - with: - python-version: "3.10" - architecture: 'x64' - - name: Install Python packages - run: | - python -m pip install wheel setuptools cmakelint cpplint==1.6.1 pylint - - name: Run lint - run: | - python3 tests/ci_build/lint_cpp.py - sh ./tests/ci_build/lint_cmake.sh diff --git a/.github/workflows/python_tests.yml b/.github/workflows/python_tests.yml deleted file mode 100644 index c8d2aba55507..000000000000 --- a/.github/workflows/python_tests.yml +++ /dev/null @@ -1,348 +0,0 @@ -name: XGBoost-Python-Tests - -on: [push, pull_request] - -permissions: - contents: read # to fetch code (actions/checkout) - -defaults: - run: - shell: bash -l {0} - -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: true - -jobs: - python-mypy-lint: - runs-on: ubuntu-latest - name: Type and format checks for the Python package - strategy: - matrix: - os: [ubuntu-latest] - steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - uses: conda-incubator/setup-miniconda@a4260408e20b96e80095f42ff7f1a15b27dd94ca # v3.0.4 - with: - miniforge-variant: Mambaforge - miniforge-version: latest - activate-environment: python_lint - environment-file: tests/ci_build/conda_env/python_lint.yml - use-mamba: true - - name: Display Conda env - run: | - conda info - conda list - - name: Run mypy - run: | - python tests/ci_build/lint_python.py --format=0 --type-check=1 --pylint=0 - - name: Run formatter - run: | - python tests/ci_build/lint_python.py --format=1 --type-check=0 --pylint=0 - - name: Run pylint - run: | - python tests/ci_build/lint_python.py --format=0 --type-check=0 --pylint=1 - - python-sdist-test-on-Linux: - # Mismatched glibcxx version between system and conda forge. - runs-on: ${{ matrix.os }} - name: Test installing XGBoost Python source package on ${{ matrix.os }} - strategy: - matrix: - os: [ubuntu-latest] - steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - uses: conda-incubator/setup-miniconda@a4260408e20b96e80095f42ff7f1a15b27dd94ca # v3.0.4 - with: - miniforge-variant: Mambaforge - miniforge-version: latest - activate-environment: sdist_test - environment-file: tests/ci_build/conda_env/sdist_test.yml - use-mamba: true - - name: Display Conda env - run: | - conda info - conda list - - name: Build and install XGBoost - run: | - cd python-package - python --version - python -m build --sdist - pip install -v ./dist/xgboost-*.tar.gz --config-settings use_openmp=False - cd .. - python -c 'import xgboost' - - python-sdist-test: - # Use system toolchain instead of conda toolchain for macos and windows. 
- # MacOS has linker error if clang++ from conda-forge is used - runs-on: ${{ matrix.os }} - name: Test installing XGBoost Python source package on ${{ matrix.os }} - strategy: - matrix: - os: [macos-13, windows-latest] - python-version: ["3.10"] - steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - name: Install osx system dependencies - if: matrix.os == 'macos-13' - run: | - brew install ninja libomp - - uses: conda-incubator/setup-miniconda@a4260408e20b96e80095f42ff7f1a15b27dd94ca # v3.0.4 - with: - auto-update-conda: true - python-version: ${{ matrix.python-version }} - activate-environment: test - - name: Install build - run: | - conda install -c conda-forge python-build - - name: Display Conda env - run: | - conda info - conda list - - name: Build and install XGBoost - run: | - cd python-package - python --version - python -m build --sdist - pip install -v ./dist/xgboost-*.tar.gz - cd .. - python -c 'import xgboost' - - python-tests-on-macos: - name: Test XGBoost Python package on ${{ matrix.config.os }} - runs-on: ${{ matrix.config.os }} - timeout-minutes: 60 - strategy: - matrix: - config: - - {os: macos-13} - - steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - - uses: conda-incubator/setup-miniconda@a4260408e20b96e80095f42ff7f1a15b27dd94ca # v3.0.4 - with: - miniforge-variant: Mambaforge - miniforge-version: latest - activate-environment: macos_cpu_test - environment-file: tests/ci_build/conda_env/macos_cpu_test.yml - use-mamba: true - - - name: Display Conda env - run: | - conda info - conda list - - - name: Build XGBoost on macos - run: | - brew install ninja - - mkdir build - cd build - # Set prefix, to use OpenMP library from Conda env - # See https://github.com/dmlc/xgboost/issues/7039#issuecomment-1025038228 - # to learn why we don't use libomp from Homebrew. - cmake .. -GNinja -DCMAKE_PREFIX_PATH=$CONDA_PREFIX -DBUILD_DEPRECATED_CLI=ON - ninja - - - name: Install Python package - run: | - cd python-package - python --version - pip install -v . - - - name: Test Python package - run: | - pytest -s -v -rxXs --durations=0 ./tests/python - - - name: Test Dask Interface - run: | - pytest -s -v -rxXs --durations=0 ./tests/test_distributed/test_with_dask - - python-tests-on-win: - name: Test XGBoost Python package on ${{ matrix.config.os }} - runs-on: ${{ matrix.config.os }} - timeout-minutes: 60 - strategy: - matrix: - config: - - {os: windows-latest, python-version: '3.10'} - - steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - - uses: conda-incubator/setup-miniconda@a4260408e20b96e80095f42ff7f1a15b27dd94ca # v3.0.4 - with: - auto-update-conda: true - python-version: ${{ matrix.config.python-version }} - activate-environment: win64_env - environment-file: tests/ci_build/conda_env/win64_cpu_test.yml - - - name: Display Conda env - run: | - conda info - conda list - - - name: Build XGBoost on Windows - run: | - mkdir build_msvc - cd build_msvc - cmake .. -G"Visual Studio 17 2022" -DCMAKE_CONFIGURATION_TYPES="Release" -A x64 -DBUILD_DEPRECATED_CLI=ON - cmake --build . --config Release --parallel $(nproc) - - - name: Install Python package - run: | - cd python-package - python --version - pip wheel -v . 
--wheel-dir dist/ - pip install ./dist/*.whl - - - name: Test Python package - run: | - pytest -s -v -rxXs --durations=0 ./tests/python - - python-tests-on-ubuntu: - name: Test XGBoost Python package on ${{ matrix.config.os }} - runs-on: ${{ matrix.config.os }} - timeout-minutes: 90 - strategy: - matrix: - config: - - {os: ubuntu-latest, python-version: "3.10"} - - steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - - uses: conda-incubator/setup-miniconda@a4260408e20b96e80095f42ff7f1a15b27dd94ca # v3.0.4 - with: - miniforge-variant: Mambaforge - miniforge-version: latest - activate-environment: linux_cpu_test - environment-file: tests/ci_build/conda_env/linux_cpu_test.yml - use-mamba: true - - - name: Display Conda env - run: | - conda info - conda list - - - name: Build XGBoost on Ubuntu - run: | - mkdir build - cd build - cmake .. -GNinja -DCMAKE_PREFIX_PATH=$CONDA_PREFIX -DBUILD_DEPRECATED_CLI=ON - ninja - - - name: Install Python package - run: | - cd python-package - python --version - pip install -v . - - - name: Test Python package - run: | - pytest -s -v -rxXs --durations=0 ./tests/python - - - name: Test Dask Interface - run: | - pytest -s -v -rxXs --durations=0 ./tests/test_distributed/test_with_dask - - - name: Test PySpark Interface - shell: bash -l {0} - run: | - pytest -s -v -rxXs --durations=0 ./tests/test_distributed/test_with_spark - - python-sycl-tests-on-ubuntu: - name: Test XGBoost Python package with SYCL on ${{ matrix.config.os }} - runs-on: ${{ matrix.config.os }} - timeout-minutes: 90 - strategy: - matrix: - config: - - {os: ubuntu-latest, python-version: "3.10"} - - steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - - uses: conda-incubator/setup-miniconda@a4260408e20b96e80095f42ff7f1a15b27dd94ca # v3.0.4 - with: - miniforge-variant: Mambaforge - miniforge-version: latest - activate-environment: linux_sycl_test - environment-file: tests/ci_build/conda_env/linux_sycl_test.yml - use-mamba: true - - - name: Display Conda env - run: | - conda info - conda list - - name: Build XGBoost on Ubuntu - run: | - mkdir build - cd build - cmake .. -DPLUGIN_SYCL=ON -DCMAKE_CXX_COMPILER=g++ -DCMAKE_C_COMPILER=gcc -DCMAKE_PREFIX_PATH=$CONDA_PREFIX - make -j$(nproc) - - name: Install Python package - run: | - cd python-package - python --version - pip install -v . - - name: Test Python package - run: | - pytest -s -v -rxXs --durations=0 ./tests/python-sycl/ - - - python-system-installation-on-ubuntu: - name: Test XGBoost Python package System Installation on ${{ matrix.os }} - runs-on: ${{ matrix.os }} - strategy: - matrix: - os: [ubuntu-latest] - - steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - - name: Set up Python 3.10 - uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 - with: - python-version: "3.10" - - - name: Install ninja - run: | - sudo apt-get update && sudo apt-get install -y ninja-build - - - name: Build XGBoost on Ubuntu - run: | - mkdir build - cd build - cmake .. -GNinja - ninja - - - name: Copy lib to system lib - run: | - cp lib/* "$(python -c 'import sys; print(sys.base_prefix)')/lib" - - - name: Install XGBoost in Virtual Environment - run: | - cd python-package - pip install virtualenv - virtualenv venv - source venv/bin/activate && \ - pip install -v . 
--config-settings use_system_libxgboost=True && \ - python -c 'import xgboost' diff --git a/.github/workflows/python_wheels.yml b/.github/workflows/python_wheels.yml deleted file mode 100644 index 235942713287..000000000000 --- a/.github/workflows/python_wheels.yml +++ /dev/null @@ -1,55 +0,0 @@ -name: XGBoost-Python-Wheels - -on: [push, pull_request] - -permissions: - contents: read # to fetch code (actions/checkout) - -defaults: - run: - shell: bash -l {0} - -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: true - -jobs: - python-wheels: - name: Build wheel for ${{ matrix.platform_id }} - runs-on: ${{ matrix.os }} - strategy: - matrix: - include: - - os: macos-13 - platform_id: macosx_x86_64 - - os: macos-14 - platform_id: macosx_arm64 - steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - name: Set up homebrew - uses: Homebrew/actions/setup-homebrew@68fa6aeb1ccb0596d311f2b34ec74ec21ee68e54 - - name: Install libomp - run: brew install libomp - - uses: conda-incubator/setup-miniconda@a4260408e20b96e80095f42ff7f1a15b27dd94ca # v3.0.4 - with: - miniforge-variant: Mambaforge - miniforge-version: latest - python-version: "3.10" - use-mamba: true - - name: Build wheels - run: bash tests/ci_build/build_python_wheels.sh ${{ matrix.platform_id }} ${{ github.sha }} - - name: Extract branch name - run: | - echo "branch=${GITHUB_REF#refs/heads/}" >> "$GITHUB_OUTPUT" - id: extract_branch - if: github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_') - - name: Upload Python wheel - if: github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_') - run: | - python -m pip install awscli - python -m awscli s3 cp wheelhouse/*.whl s3://xgboost-nightly-builds/${{ steps.extract_branch.outputs.branch }}/ --acl public-read --region us-west-2 - env: - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID_IAM_S3_UPLOADER }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY_IAM_S3_UPLOADER }} diff --git a/.github/workflows/r_nold.yml b/.github/workflows/r_nold.yml deleted file mode 100644 index 4b506927e06c..000000000000 --- a/.github/workflows/r_nold.yml +++ /dev/null @@ -1,44 +0,0 @@ -# Run expensive R tests with the help of rhub. 
Only triggered by a pull request review -# See discussion at https://github.com/dmlc/xgboost/pull/6378 - -name: XGBoost-R-noLD - -on: - pull_request_review_comment: - types: [created] - -permissions: - contents: read # to fetch code (actions/checkout) - -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: true - -jobs: - test-R-noLD: - if: github.event.comment.body == '/gha run r-nold-test' && contains('OWNER,MEMBER,COLLABORATOR', github.event.comment.author_association) - timeout-minutes: 120 - runs-on: ubuntu-latest - container: - image: rhub/debian-gcc-devel-nold - steps: - - name: Install git and system packages - shell: bash - run: | - apt update && apt install libcurl4-openssl-dev libssl-dev libssh2-1-dev libgit2-dev libglpk-dev libxml2-dev libharfbuzz-dev libfribidi-dev git -y - - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - - name: Install dependencies - shell: bash -l {0} - run: | - /tmp/R-devel/bin/Rscript -e "source('./R-package/tests/helper_scripts/install_deps.R')" - - - name: Run R tests - shell: bash - run: | - cd R-package && \ - /tmp/R-devel/bin/R CMD INSTALL . && \ - /tmp/R-devel/bin/R -q -e "library(testthat); setwd('tests'); source('testthat.R')" diff --git a/.github/workflows/r_tests.yml b/.github/workflows/r_tests.yml deleted file mode 100644 index 27ae4bee1166..000000000000 --- a/.github/workflows/r_tests.yml +++ /dev/null @@ -1,150 +0,0 @@ -name: XGBoost-R-Tests - -on: [push, pull_request] - -env: - GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} - -permissions: - contents: read # to fetch code (actions/checkout) - -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: true - -jobs: - lintr: - runs-on: ${{ matrix.config.os }} - name: Run R linters on OS ${{ matrix.config.os }}, R ${{ matrix.config.r }}, Compiler ${{ matrix.config.compiler }}, Build ${{ matrix.config.build }} - strategy: - matrix: - config: - - {os: ubuntu-latest, r: 'release'} - env: - R_REMOTES_NO_ERRORS_FROM_WARNINGS: true - RSPM: ${{ matrix.config.rspm }} - - steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - - uses: r-lib/actions/setup-r@929c772977a3a13c8733b363bf5a2f685c25dd91 # v2.9.0 - with: - r-version: ${{ matrix.config.r }} - - - name: Cache R packages - uses: actions/cache@0c45773b623bea8c8e75f6c82b208c3cf94ea4f9 # v4.0.2 - with: - path: ${{ env.R_LIBS_USER }} - key: ${{ runner.os }}-r-${{ matrix.config.r }}-7-${{ hashFiles('R-package/DESCRIPTION') }} - restore-keys: ${{ runner.os }}-r-${{ matrix.config.r }}-7-${{ hashFiles('R-package/DESCRIPTION') }} - - - name: Install dependencies - shell: Rscript {0} - run: | - source("./R-package/tests/helper_scripts/install_deps.R") - - - name: Run lintr - run: | - MAKEFLAGS="-j$(nproc)" R CMD INSTALL R-package/ - Rscript tests/ci_build/lint_r.R $(pwd) - - test-Rpkg: - runs-on: ${{ matrix.config.os }} - name: Test R on OS ${{ matrix.config.os }}, R ${{ matrix.config.r }}, Compiler ${{ matrix.config.compiler }}, Build ${{ matrix.config.build }} - strategy: - fail-fast: false - matrix: - config: - - {os: windows-latest, r: 'release', compiler: 'mingw', build: 'autotools'} - - {os: ubuntu-latest, r: 'release', compiler: 'none', build: 'cmake'} - env: - R_REMOTES_NO_ERRORS_FROM_WARNINGS: true - RSPM: ${{ matrix.config.rspm }} - - steps: - - name: Install system dependencies - run: | - sudo apt 
update - sudo apt install libcurl4-openssl-dev libssl-dev libssh2-1-dev libgit2-dev libglpk-dev libxml2-dev libharfbuzz-dev libfribidi-dev - if: matrix.config.os == 'ubuntu-latest' - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - - uses: r-lib/actions/setup-r@929c772977a3a13c8733b363bf5a2f685c25dd91 # v2.9.0 - with: - r-version: ${{ matrix.config.r }} - - - name: Cache R packages - uses: actions/cache@0c45773b623bea8c8e75f6c82b208c3cf94ea4f9 # v4.0.2 - with: - path: ${{ env.R_LIBS_USER }} - key: ${{ runner.os }}-r-${{ matrix.config.r }}-7-${{ hashFiles('R-package/DESCRIPTION') }} - restore-keys: ${{ runner.os }}-r-${{ matrix.config.r }}-7-${{ hashFiles('R-package/DESCRIPTION') }} - - - uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 - with: - python-version: "3.10" - architecture: 'x64' - - - uses: r-lib/actions/setup-tinytex@v2 - - - name: Install dependencies - shell: Rscript {0} - run: | - source("./R-package/tests/helper_scripts/install_deps.R") - - - name: Test R - run: | - python tests/ci_build/test_r_package.py --compiler='${{ matrix.config.compiler }}' --build-tool="${{ matrix.config.build }}" --task=check - if: matrix.config.compiler != 'none' - - - name: Test R - run: | - python tests/ci_build/test_r_package.py --build-tool="${{ matrix.config.build }}" --task=check - if: matrix.config.compiler == 'none' - - test-R-on-Debian: - name: Test R package on Debian - runs-on: ubuntu-latest - container: - image: rhub/debian-gcc-release - - steps: - - name: Install system dependencies - run: | - # Must run before checkout to have the latest git installed. - # No need to add pandoc, the container has it figured out. - apt update && apt install libcurl4-openssl-dev libssl-dev libssh2-1-dev libgit2-dev libglpk-dev libxml2-dev libharfbuzz-dev libfribidi-dev git -y - - - name: Trust git cloning project sources - run: | - git config --global --add safe.directory "${GITHUB_WORKSPACE}" - - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - - name: Install dependencies - shell: bash -l {0} - run: | - Rscript -e "source('./R-package/tests/helper_scripts/install_deps.R')" - - - name: Test R - shell: bash -l {0} - run: | - python3 tests/ci_build/test_r_package.py --r=/usr/bin/R --build-tool=autotools --task=check - - - uses: dorny/paths-filter@v3 - id: changes - with: - filters: | - r_package: - - 'R-package/**' - - - name: Run document check - if: steps.changes.outputs.r_package == 'true' - run: | - python3 tests/ci_build/test_r_package.py --r=/usr/bin/R --task=doc diff --git a/.github/workflows/scorecards.yml b/.github/workflows/scorecards.yml deleted file mode 100644 index 1881c0eba274..000000000000 --- a/.github/workflows/scorecards.yml +++ /dev/null @@ -1,54 +0,0 @@ -name: Scorecards supply-chain security -on: - # Only the default branch is supported. - branch_protection_rule: - schedule: - - cron: '17 2 * * 6' - push: - branches: [ "master" ] - -# Declare default permissions as read only. -permissions: read-all - -jobs: - analysis: - name: Scorecards analysis - runs-on: ubuntu-latest - permissions: - # Needed to upload the results to code-scanning dashboard. - security-events: write - # Used to receive a badge. 
- id-token: write - - steps: - - name: "Checkout code" - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - persist-credentials: false - - - name: "Run analysis" - uses: ossf/scorecard-action@62b2cac7ed8198b15735ed49ab1e5cf35480ba46 # v2.4.0 - with: - results_file: results.sarif - results_format: sarif - - # Publish the results for public repositories to enable scorecard badges. For more details, see - # https://github.com/ossf/scorecard-action#publishing-results. - # For private repositories, `publish_results` will automatically be set to `false`, regardless - # of the value entered here. - publish_results: true - - # Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF - # format to the repository Actions tab. - - name: "Upload artifact" - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 - with: - name: SARIF file - path: results.sarif - retention-days: 5 - - # Upload the results to GitHub's code scanning dashboard. - - name: "Upload to code-scanning" - uses: github/codeql-action/upload-sarif@83a02f7883b12e0e4e1a146174f5e2292a01e601 # v2.16.4 - with: - sarif_file: results.sarif diff --git a/.github/workflows/update_rapids.yml b/.github/workflows/update_rapids.yml deleted file mode 100644 index 9490926cfcaf..000000000000 --- a/.github/workflows/update_rapids.yml +++ /dev/null @@ -1,44 +0,0 @@ -name: update-rapids - -on: - workflow_dispatch: - schedule: - - cron: "0 20 * * 1" # Run once weekly - -permissions: - pull-requests: write - contents: write - -defaults: - run: - shell: bash -l {0} - -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: true - -env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # To use GitHub CLI - -jobs: - update-rapids: - name: Check latest RAPIDS - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - name: Check latest RAPIDS and update conftest.sh - run: | - bash tests/buildkite/update-rapids.sh - - name: Create Pull Request - uses: peter-evans/create-pull-request@v6 - if: github.ref == 'refs/heads/master' - with: - add-paths: | - tests/buildkite - branch: create-pull-request/update-rapids - base: master - title: "[CI] Update RAPIDS to latest stable" - commit-message: "[CI] Update RAPIDS to latest stable" - From 78c4d8ca82136a428ed71e93b583658b7d0de68f Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Thu, 17 Oct 2024 01:10:31 -0700 Subject: [PATCH 02/86] First RunsOn example --- .github/workflows/main.yml | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 .github/workflows/main.yml diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml new file mode 100644 index 000000000000..ed02bfb2ad53 --- /dev/null +++ b/.github/workflows/main.yml @@ -0,0 +1,22 @@ +name: Nextgen XGBoost CI + +on: [push, pull_request] + +permissions: + contents: read # to fetch code (actions/checkout) + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +jobs: + build: + name: Build + runs-on: + - runs-on=${{ github.run_id }} + - runner=2cpu-linux-x64 + steps: + - uses: actions/checkout@v4 + with: + submodules: "true" + - run: echo "Hello from x64!" 
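
[Editor-added example] The two `runs-on` entries in the job above are RunsOn's label convention rather than a stock GitHub runner name: `runs-on=${{ github.run_id }}` ties a fresh, ephemeral EC2 runner to this particular workflow run, while `runner=2cpu-linux-x64` picks one of the service's built-in runner sizes (the following patches switch this to custom definitions in `.github/runs-on.yml`). A minimal sketch for watching such a job get picked up, assuming the GitHub CLI (`gh`) is installed and authenticated against the repository; the `<run-id>` placeholder is hypothetical and is copied from the list output:

    gh run list --workflow main.yml --limit 5   # most recent runs of this workflow
    gh run watch <run-id> --exit-status         # stream job progress; non-zero exit if the run fails
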
From 92fd6a29b9b5419adb9a94ecbc9648d942caf42c Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Thu, 17 Oct 2024 11:29:00 -0700 Subject: [PATCH 03/86] Test custom runner config --- .github/runs-on.yml | 6 ++++++ .github/workflows/main.yml | 10 ++++++++-- 2 files changed, 14 insertions(+), 2 deletions(-) create mode 100644 .github/runs-on.yml diff --git a/.github/runs-on.yml b/.github/runs-on.yml new file mode 100644 index 000000000000..24a9caa0073b --- /dev/null +++ b/.github/runs-on.yml @@ -0,0 +1,6 @@ +runners: + linux-amd64-cpu: + cpu: 16 + hdd: 40 + family: ["c7i-flex", "c7i", "c7a", "c5", "c5a"] + image: ubuntu24-full-x64 diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index ed02bfb2ad53..af7fc1fa6435 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -14,9 +14,15 @@ jobs: name: Build runs-on: - runs-on=${{ github.run_id }} - - runner=2cpu-linux-x64 + - runner=linux-amd64-cpu steps: - uses: actions/checkout@v4 with: submodules: "true" - - run: echo "Hello from x64!" + - run: | + sudo apt update && sudo apt install ninja-build + mkdir build + cd build + cmake .. -GNinja -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DBUILD_DEPRECATED_CLI=ON + ninja -v + ./testxgboost From d6761cb9623f84187dc7109bba5d8cbd7229d8eb Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Thu, 17 Oct 2024 19:17:45 -0700 Subject: [PATCH 04/86] Try out GPU support --- .github/runs-on.yml | 10 ++++++++++ .github/workflows/main.yml | 21 ++++++++++++++++++++- 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/.github/runs-on.yml b/.github/runs-on.yml index 24a9caa0073b..fd9be8996a9e 100644 --- a/.github/runs-on.yml +++ b/.github/runs-on.yml @@ -1,6 +1,16 @@ +images: + dlami-amd64: + platform: "linux" + arch: "x64" + owner: "898082745236" # AWS + name: "Deep Learning Base OSS Nvidia Driver GPU AMI (Ubuntu 22.04)*" + runners: linux-amd64-cpu: cpu: 16 hdd: 40 family: ["c7i-flex", "c7i", "c7a", "c5", "c5a"] image: ubuntu24-full-x64 + linux-amd64-gpu: + family: ["g4dn.xlarge"] + image: dlami-amd64 diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index af7fc1fa6435..64db9583b195 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -19,10 +19,29 @@ jobs: - uses: actions/checkout@v4 with: submodules: "true" - - run: | + - name: Build and run gtest + run: | sudo apt update && sudo apt install ninja-build mkdir build cd build cmake .. -GNinja -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DBUILD_DEPRECATED_CLI=ON ninja -v ./testxgboost + build-gpu: + name: Build GPU + runs-on: + - runs-on=${{ github.run_id }} + - runner=linux-amd64-gpu + steps: + - uses: actions/checkout@v4 + with: + submodules: "true" + - name: Build and run gtest + run: | + nvidia-smi + sudo apt update && sudo apt install ninja-build + mkdir build + cd build + cmake .. 
-GNinja -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DBUILD_DEPRECATED_CLI=ON -DUSE_CUDA=ON -DCMAKE_CUDA_COMPILER=/usr/local/cuda-12.4/bin/nvcc -DCMAKE_CUDA_ARCHITECTURES=75 + ninja -v + ./testxgboost From 3331cd6ae307b993456e0334893a80ba30a8cb9e Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Tue, 22 Oct 2024 12:52:40 -0700 Subject: [PATCH 05/86] Add Windows --- .github/runs-on.yml | 19 +++++- .github/workflows/main.yml | 22 +++++++ ops/packer/windows/bootstrap.ps1 | 73 ++++++++++++++++++++++ ops/packer/windows/install_choco.ps1 | 14 +++++ ops/packer/windows/setup_ssh.ps1 | 58 ++++++++++++++++++ ops/packer/windows/sysprep.ps1 | 14 +++++ ops/packer/windows/windows.pkr.hcl | 90 ++++++++++++++++++++++++++++ 7 files changed, 289 insertions(+), 1 deletion(-) create mode 100644 ops/packer/windows/bootstrap.ps1 create mode 100644 ops/packer/windows/install_choco.ps1 create mode 100644 ops/packer/windows/setup_ssh.ps1 create mode 100644 ops/packer/windows/sysprep.ps1 create mode 100644 ops/packer/windows/windows.pkr.hcl diff --git a/.github/runs-on.yml b/.github/runs-on.yml index fd9be8996a9e..f8de09feb553 100644 --- a/.github/runs-on.yml +++ b/.github/runs-on.yml @@ -4,13 +4,30 @@ images: arch: "x64" owner: "898082745236" # AWS name: "Deep Learning Base OSS Nvidia Driver GPU AMI (Ubuntu 22.04)*" + windows-amd64: + platform: "windows" + arch: "x64" + owner: "492475357299" # XGBooost CI + name: "xgboost-ci-runs-on-windows-*" runners: linux-amd64-cpu: cpu: 16 - hdd: 40 family: ["c7i-flex", "c7i", "c7a", "c5", "c5a"] image: ubuntu24-full-x64 linux-amd64-gpu: family: ["g4dn.xlarge"] image: dlami-amd64 + linux-amd64-mgpu: + family: ["g4dn.12xlarge"] + image: dlami-amd64 + linux-arm64-cpu: + family: ["c6g", "c7g"] + image: ubuntu24-full-arm64 + windows-gpu: + family: ["g4dn.2xlarge"] + image: windows-amd64 + windows-cpu: + cpu: 16 + family: ["c7i-flex", "c7i", "c7a", "c5", "c5a"] + image: windows-amd64 diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 64db9583b195..ead80a7dadac 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -45,3 +45,25 @@ jobs: cmake .. -GNinja -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DBUILD_DEPRECATED_CLI=ON -DUSE_CUDA=ON -DCMAKE_CUDA_COMPILER=/usr/local/cuda-12.4/bin/nvcc -DCMAKE_CUDA_ARCHITECTURES=75 ninja -v ./testxgboost + build-gpu-win64: + name: Build GPU (Windows) + runs-on: + - runs-on=${{ github.run_id }} + - runner=windows-gpu + steps: + - uses: actions/checkout@v4 + with: + submodules: "true" + - name: Build and run gtest + shell: powershell + run: | + nvcc --version + git clone https://github.com/NVIDIA/cccl.git -b v2.6.1 --quiet + mkdir build + cd build + cmake .. -G"Visual Studio 17 2022" -A x64 -DUSE_CUDA=ON -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DBUILD_DEPRECATED_CLI=ON -DCMAKE_CUDA_ARCHITECTURES=75 -DCMAKE_PREFIX_PATH="$(Get-Location)/../cccl" + if ($LASTEXITCODE -ne 0) { throw "Last command failed" } + cmake --build . --config Release -- /m /nodeReuse:false "/consoleloggerparameters:ShowCommandLine;Verbosity=minimal" + if ($LASTEXITCODE -ne 0) { throw "Last command failed" } + build/testxgboost.exe + if ($LASTEXITCODE -ne 0) { throw "Last command failed" } diff --git a/ops/packer/windows/bootstrap.ps1 b/ops/packer/windows/bootstrap.ps1 new file mode 100644 index 000000000000..c67f3b73fb9a --- /dev/null +++ b/ops/packer/windows/bootstrap.ps1 @@ -0,0 +1,73 @@ +## Install packages from Chocolatey + +# jq & yq +Write-Output "Installing jq and yq..." 
+choco install jq --version=1.7.1
+if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
+choco install yq --version=4.40.2
+if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
+
+# AWS CLI
+Write-Output "Installing AWS CLI..."
+choco install awscli --version=2.18.11
+if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
+
+# Git
+Write-Host '>>> Installing Git...'
+choco install git --version=2.47.0
+if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
+
+# CMake
+Write-Host '>>> Installing CMake 3.30.5...'
+choco install cmake --version 3.30.5 --installargs "ADD_CMAKE_TO_PATH=System"
+if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
+
+# Notepad++
+Write-Host '>>> Installing Notepad++...'
+choco install notepadplusplus
+if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
+
+# Miniforge3
+Write-Host '>>> Installing Miniforge3...'
+choco install miniforge3 --params="'/InstallationType:AllUsers /RegisterPython:1 /D:C:\tools\miniforge3'"
+C:\tools\miniforge3\Scripts\conda.exe init --user --system
+if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
+. "C:\Windows\System32\WindowsPowerShell\v1.0\profile.ps1"
+if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
+conda config --set auto_activate_base false
+
+# Java 11
+Write-Host '>>> Installing Java 11...'
+choco install openjdk11
+if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
+
+# Maven
+Write-Host '>>> Installing Maven...'
+choco install maven
+if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
+
+# GraphViz
+Write-Host '>>> Installing GraphViz...'
+choco install graphviz
+if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
+
+# Visual Studio 2022 Community
+Write-Host '>>> Installing Visual Studio 2022 Community...'
+choco install visualstudio2022community `
+  --params "--wait --passive --norestart"
+if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
+choco install visualstudio2022-workload-nativedesktop --params `
+  "--wait --passive --norestart --includeOptional"
+if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
+
+# CUDA 12.5
+Write-Host '>>> Installing CUDA 12.5...'
+choco install cuda --version=12.5.1.555
+if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
+
+# R 4.3
+Write-Host '>>> Installing R...'
+choco install r.project --version=4.3.2
+if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
+choco install rtools --version=4.3.5550
+if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
diff --git a/ops/packer/windows/install_choco.ps1 b/ops/packer/windows/install_choco.ps1
new file mode 100644
index 000000000000..131e8129feaa
--- /dev/null
+++ b/ops/packer/windows/install_choco.ps1
@@ -0,0 +1,14 @@
+## Adopted from https://github.com/chorrell/packer-aws-windows-openssh/blob/20c40aa60b54469b3d85650a2e2e45e35ed83bc7/files/InstallChoco.ps1
+## Author: Christopher Horrell (https://github.com/chorrell)
+
+$ErrorActionPreference = "Stop"
+
+# Install Chocolatey
+# See https://chocolatey.org/install#individual
+Set-ExecutionPolicy Bypass -Scope Process -Force
+[System.Net.ServicePointManager]::SecurityProtocol = [System.Net.ServicePointManager]::SecurityProtocol -bor 3072
+Invoke-Expression ((New-Object System.Net.WebClient).DownloadString("https://community.chocolatey.org/install.ps1"))
+
+# Globally Auto confirm every action
+# See: https://docs.chocolatey.org/en-us/faqs#why-do-i-have-to-confirm-packages-now-is-there-a-way-to-remove-this
+choco feature enable -n allowGlobalConfirmation
diff --git a/ops/packer/windows/setup_ssh.ps1 b/ops/packer/windows/setup_ssh.ps1
new file mode 100644
index 000000000000..a7bdee898002
--- /dev/null
+++ b/ops/packer/windows/setup_ssh.ps1
@@ -0,0 +1,58 @@
+
+## Adopted from https://github.com/chorrell/packer-aws-windows-openssh/blob/20c40aa60b54469b3d85650a2e2e45e35ed83bc7/files/SetupSsh.ps1
+## Author: Christopher Horrell (https://github.com/chorrell)
+
+# Don't display progress bars
+# See: https://learn.microsoft.com/en-us/powershell/module/microsoft.powershell.core/about/about_preference_variables?view=powershell-7.3#progresspreference
+$ProgressPreference = "SilentlyContinue"
+$ErrorActionPreference = "Stop"
+
+# Install OpenSSH using Add-WindowsCapability
+# See: https://learn.microsoft.com/en-us/windows-server/administration/openssh/openssh_install_firstuse?tabs=powershell#install-openssh-for-windows
+
+Write-Host "Installing and starting ssh-agent"
+Add-WindowsCapability -Online -Name OpenSSH.Client~~~~0.0.1.0
+Set-Service -Name ssh-agent -StartupType Automatic
+Start-Service ssh-agent
+
+Write-Host "Installing and starting sshd"
+Add-WindowsCapability -Online -Name OpenSSH.Server~~~~0.0.1.0
+Set-Service -Name sshd -StartupType Automatic
+Start-Service sshd
+
+# Confirm the Firewall rule is configured. It should be created automatically by setup. Run the following to verify
+if (!(Get-NetFirewallRule -Name "OpenSSH-Server-In-TCP" -ErrorAction SilentlyContinue | Select-Object Name, Enabled)) {
+    Write-Output "Firewall Rule 'OpenSSH-Server-In-TCP' does not exist, creating it..."
+    New-NetFirewallRule -Name "OpenSSH-Server-In-TCP" -DisplayName "OpenSSH Server (sshd)" -Enabled True -Direction Inbound -Protocol TCP -Action Allow -LocalPort 22
+} else {
+    Write-Output "Firewall rule 'OpenSSH-Server-In-TCP' has been created and exists."
+}
+
+# Set default shell to Powershell
+New-ItemProperty -Path "HKLM:\SOFTWARE\OpenSSH" -Name DefaultShell -Value "C:\Windows\System32\WindowsPowerShell\v1.0\powershell.exe" -PropertyType String -Force
+
+$keyDownloadScript = Join-Path $env:ProgramData "ssh\download-key.ps1"
+
+@'
+# Download private key to $env:ProgramData\ssh\administrators_authorized_keys
+$openSSHAuthorizedKeys = Join-Path $env:ProgramData "ssh\administrators_authorized_keys"
+
+$keyUrl = "http://169.254.169.254/latest/meta-data/public-keys/0/openssh-key"
+Invoke-WebRequest $keyUrl -OutFile $openSSHAuthorizedKeys
+
+# Ensure ACL for administrators_authorized_keys is correct
+# See https://learn.microsoft.com/en-us/windows-server/administration/openssh/openssh_server_configuration#authorizedkeysfile
+icacls.exe $openSSHAuthorizedKeys /inheritance:r /grant "Administrators:F" /grant "SYSTEM:F"
+'@ | Out-File $keyDownloadScript
+
+# Create Task
+$taskName = "DownloadKey"
+$principal = New-ScheduledTaskPrincipal -UserID "NT AUTHORITY\SYSTEM" -LogonType ServiceAccount -RunLevel Highest
+$action = New-ScheduledTaskAction -Execute "Powershell.exe" -Argument "-NoProfile -File ""$keyDownloadScript"""
+$trigger = New-ScheduledTaskTrigger -AtStartup
+Register-ScheduledTask -Action $action -Trigger $trigger -Principal $principal -TaskName $taskName -Description $taskName
+
+# Fetch key via $keyDownloadScript
+& Powershell.exe -ExecutionPolicy Bypass -File $keyDownloadScript
+
+
diff --git a/ops/packer/windows/sysprep.ps1 b/ops/packer/windows/sysprep.ps1
new file mode 100644
index 000000000000..a0470309f9da
--- /dev/null
+++ b/ops/packer/windows/sysprep.ps1
@@ -0,0 +1,14 @@
+## Adopted from https://github.com/chorrell/packer-aws-windows-openssh/blob/20c40aa60b54469b3d85650a2e2e45e35ed83bc7/files/PrepareImage.ps1
+## Author: Christopher Horrell (https://github.com/chorrell)
+
+$ErrorActionPreference = "Stop"
+
+Write-Output "Cleaning up keys"
+$openSSHAuthorizedKeys = Join-Path $env:ProgramData "ssh\administrators_authorized_keys"
+Remove-Item -Recurse -Force -Path $openSSHAuthorizedKeys
+
+# Make sure task is enabled
+Enable-ScheduledTask "DownloadKey"
+
+Write-Output "Running Sysprep"
+& "$Env:Programfiles\Amazon\EC2Launch\ec2launch.exe" sysprep
diff --git a/ops/packer/windows/windows.pkr.hcl b/ops/packer/windows/windows.pkr.hcl
new file mode 100644
index 000000000000..4c14b7b75806
--- /dev/null
+++ b/ops/packer/windows/windows.pkr.hcl
@@ -0,0 +1,90 @@
+packer {
+  required_plugins {
+    amazon = {
+      source  = "github.com/hashicorp/amazon"
+      version = "~> 1"
+    }
+    windows-update = {
+      version = "0.15.0"
+      source  = "github.com/rgl/windows-update"
+    }
+  }
+}
+
+locals {
+  ami_name_prefix = "xgboost-ci"
+  image_name      = "RunsOn worker with Windows Server 2022 + ssh + CUDA driver"
+  region          = "us-west-2"
+  timestamp       = regex_replace(timestamp(), "[- TZ:]", "")
+  volume_size     = 120
+}
+
+data "amazon-ami" "aws-windows-x64" {
+  filters = {
+    name                = "Windows_Server-2022-English-Full-Base-*"
+    root-device-type    = "ebs"
+    virtualization-type = "hvm"
+  }
+  most_recent = true
+  owners      = ["amazon"]
+}
+
+source "amazon-ebs" "runs-on-windows" {
+  source_ami                  = "${data.amazon-ami.aws-windows-x64.id}"
+  ami_name                    = "${local.ami_name_prefix}-runs-on-windows-${local.timestamp}"
+  ami_description             = "${local.image_name}"
+  ami_regions                 = ["${local.region}"]
+  ami_virtualization_type     = "hvm"
+  associate_public_ip_address = true
+  communicator                = "ssh"
+  instance_type               = "g4dn.xlarge"
+  region                      = "${local.region}"
+  ssh_timeout                 = "10m"
+ ssh_username = "Administrator" + ssh_file_transfer_method = "sftp" + user_data_file = "setup_ssh.ps1" + launch_block_device_mappings { + device_name = "/dev/sda1" + volume_size = "${local.volume_size}" + volume_type = "gp3" + delete_on_termination = true + } + aws_polling { # Wait up to 2.5 hours until the AMI is ready + delay_seconds = 15 + max_attempts = 600 + } + fast_launch { + enable_fast_launch = true + target_resource_count = 10 + } + snapshot_tags = { + Name = "${local.image_name}" + BuildTime = "${local.timestamp}" + } + tags = { + Name = "${local.image_name}" + BuildTime = "${local.timestamp}" + } +} + +build { + sources = ["source.amazon-ebs.runs-on-windows"] + + provisioner "windows-update" {} + + provisioner "powershell" { + script = "install_choco.ps1" + } + + provisioner "windows-restart" { + max_retries = 3 + } + + provisioner "powershell" { + script = "bootstrap.ps1" + } + + provisioner "powershell" { # Sysprep should run the last + script = "sysprep.ps1" + } +} From 61f207dec6b5001127b4f5cd5e7ee0b2381bf980 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Wed, 23 Oct 2024 00:21:50 -0700 Subject: [PATCH 06/86] Fix Windows build --- .github/workflows/main.yml | 2 +- CMakeLists.txt | 8 +++++++- include/xgboost/collective/socket.h | 3 ++- include/xgboost/windefs.h | 7 +++++++ tests/cpp/common/test_device_vector.cu | 5 +++++ 5 files changed, 22 insertions(+), 3 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index ead80a7dadac..a523ec7a1ba3 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -65,5 +65,5 @@ jobs: if ($LASTEXITCODE -ne 0) { throw "Last command failed" } cmake --build . --config Release -- /m /nodeReuse:false "/consoleloggerparameters:ShowCommandLine;Verbosity=minimal" if ($LASTEXITCODE -ne 0) { throw "Last command failed" } - build/testxgboost.exe + & .\testxgboost.exe if ($LASTEXITCODE -ne 0) { throw "Last command failed" } diff --git a/CMakeLists.txt b/CMakeLists.txt index 22fe4a3eb977..9bfaedb1e16f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -221,7 +221,9 @@ if(USE_CUDA) find_package(CUDAToolkit REQUIRED) find_package(CCCL CONFIG) - if(NOT CCCL_FOUND) + if(CCCL_FOUND) + message(STATUS "Standalone CCCL found.") + else() message(STATUS "Standalone CCCL not found. 
Attempting to use CCCL from CUDA Toolkit...") find_package(CCCL CONFIG HINTS ${CUDAToolkit_LIBRARY_DIR}/cmake) @@ -238,6 +240,10 @@ if(USE_CUDA) target_link_libraries(CCCL::CCCL INTERFACE libcudacxx::libcudacxx CUB::CUB Thrust) endif() endif() + # Define guard macros to prevent windows.h from conflicting with winsock2.h + if(WIN32) + target_compile_definitions(CCCL::CCCL INTERFACE NOMINMAX WIN32_LEAN_AND_MEAN _WINSOCKAPI_) + endif() endif() if(FORCE_COLORED_OUTPUT AND (CMAKE_GENERATOR STREQUAL "Ninja") AND diff --git a/include/xgboost/collective/socket.h b/include/xgboost/collective/socket.h index 4bc285a515c5..57882ee26844 100644 --- a/include/xgboost/collective/socket.h +++ b/include/xgboost/collective/socket.h @@ -99,6 +99,7 @@ inline auto ThrowAtError(StringView fn_name, std::int32_t errsv = LastError()) { using SocketT = SOCKET; #else using SocketT = int; +#define INVALID_SOCKET -1 #endif // defined(_WIN32) #if !defined(xgboost_CHECK_SYS_CALL) @@ -276,7 +277,7 @@ class TCPSocket { SockDomain domain_{SockDomain::kV4}; #endif - constexpr static HandleT InvalidSocket() { return -1; } + constexpr static HandleT InvalidSocket() { return INVALID_SOCKET; } explicit TCPSocket(HandleT newfd) : handle_{newfd} {} diff --git a/include/xgboost/windefs.h b/include/xgboost/windefs.h index 99bf11d09b17..b0e012994e4a 100644 --- a/include/xgboost/windefs.h +++ b/include/xgboost/windefs.h @@ -20,7 +20,14 @@ #endif // !defined(NOMINMAX) // A macro used inside `windows.h` to avoid conflicts with `winsock2.h` +#if !defined(WIN32_LEAN_AND_MEAN) #define WIN32_LEAN_AND_MEAN +#endif // !defined(WIN32_LEAN_AND_MEAN) + +// Stop windows.h from including winsock.h +#if !defined(_WINSOCKAPI_) +#define _WINSOCKAPI_ +#endif // !defined(_WINSOCKAPI_) #if !defined(xgboost_IS_MINGW) diff --git a/tests/cpp/common/test_device_vector.cu b/tests/cpp/common/test_device_vector.cu index 9dff9c691c15..d7a03e41a64b 100644 --- a/tests/cpp/common/test_device_vector.cu +++ b/tests/cpp/common/test_device_vector.cu @@ -10,6 +10,7 @@ #include "../../../src/common/device_helpers.cuh" // for CachingThrustPolicy, PinnedMemory #include "../../../src/common/device_vector.cuh" #include "xgboost/global_config.h" // for GlobalConfigThreadLocalStore +#include "xgboost/windefs.h" // for xgboost_IS_WIN namespace dh { TEST(DeviceUVector, Basic) { @@ -109,10 +110,14 @@ TEST(TestVirtualMem, Version) { xgboost::curt::DrVersion(&major, &minor); LOG(INFO) << "Latest supported CUDA version by the driver:" << major << "." 
<< minor;
   PinnedMemory pinned;
+#if defined(xgboost_IS_WIN)
+  ASSERT_FALSE(pinned.IsVm());
+#else  // defined(xgboost_IS_WIN)
   if (major >= 12 && minor >= 5) {
     ASSERT_TRUE(pinned.IsVm());
   } else {
     ASSERT_FALSE(pinned.IsVm());
   }
+#endif  // defined(xgboost_IS_WIN)
 }
 }  // namespace dh

From 6b78a12245450eabbe226d3a9df4a986373e258e Mon Sep 17 00:00:00 2001
From: Hyunsu Cho
Date: Wed, 23 Oct 2024 16:06:30 -0700
Subject: [PATCH 07/86] Custom image for Linux

---
 .github/runs-on.yml            | 12 +++---
 .github/workflows/main.yml     | 11 +++++-
 ops/packer/linux/bootstrap.sh  | 42 +++++++++++++++++++++
 ops/packer/linux/linux.pkr.hcl | 68 ++++++++++++++++++++++++++++++++++
 ops/packer/linux/setup_ssh.sh  |  2 +
 5 files changed, 129 insertions(+), 6 deletions(-)
 create mode 100644 ops/packer/linux/bootstrap.sh
 create mode 100644 ops/packer/linux/linux.pkr.hcl
 create mode 100644 ops/packer/linux/setup_ssh.sh

diff --git a/.github/runs-on.yml b/.github/runs-on.yml
index f8de09feb553..3fa13de66a25 100644
--- a/.github/runs-on.yml
+++ b/.github/runs-on.yml
@@ -1,9 +1,11 @@
+# Custom images with CUDA toolkit installed
+# See ops/packer for instructions for building the images
 images:
-  dlami-amd64:
+  linux-amd64:
     platform: "linux"
     arch: "x64"
-    owner: "898082745236" # AWS
-    name: "Deep Learning Base OSS Nvidia Driver GPU AMI (Ubuntu 22.04)*"
+    owner: "492475357299" # XGBoost CI
+    name: "xgboost-ci-runs-on-linux-*"
   windows-amd64:
     platform: "windows"
     arch: "x64"
@@ -17,10 +19,10 @@ runners:
     image: ubuntu24-full-x64
   linux-amd64-gpu:
     family: ["g4dn.xlarge"]
-    image: dlami-amd64
+    image: linux-amd64
   linux-amd64-mgpu:
     family: ["g4dn.12xlarge"]
-    image: dlami-amd64
+    image: linux-amd64
   linux-arm64-cpu:
     family: ["c6g", "c7g"]
     image: ubuntu24-full-arm64
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 64db9583b195..7fb80e21299a 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -38,11 +38,19 @@ jobs:
           submodules: "true"
       - name: Build and run gtest
         run: |
+          cat >> $HOME/.bashrc <<- EOM
+          export PATH=/usr/local/cuda/bin${PATH:+:${PATH}}
+          export LD_LIBRARY_PATH=/usr/local/cuda/lib64\
+          ${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
+          EOM
+          source $HOME/.bashrc
           nvidia-smi
+          nvcc --version
           sudo apt update && sudo apt install ninja-build
+          git clone https://github.com/NVIDIA/cccl.git -b v2.6.1 --quiet
           mkdir build
           cd build
-          cmake .. -GNinja -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DBUILD_DEPRECATED_CLI=ON -DUSE_CUDA=ON -DCMAKE_CUDA_COMPILER=/usr/local/cuda-12.4/bin/nvcc -DCMAKE_CUDA_ARCHITECTURES=75
+          cmake .. -GNinja -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DBUILD_DEPRECATED_CLI=ON -DUSE_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=75 -DCMAKE_PREFIX_PATH="$PWD/../cccl"
           ninja -v
           ./testxgboost
diff --git a/ops/packer/linux/bootstrap.sh b/ops/packer/linux/bootstrap.sh
new file mode 100644
index 000000000000..9cf0edfe7fe8
--- /dev/null
+++ b/ops/packer/linux/bootstrap.sh
@@ -0,0 +1,42 @@
+#!/bin/bash
+set -euo pipefail
+
+## Install basic tools
+echo 'debconf debconf/frontend select Noninteractive' | sudo debconf-set-selections
+sudo apt-get update
+sudo apt-get install -y cmake git build-essential wget ca-certificates curl
+
+## Install CUDA 12.5 + driver
+echo "Installing CUDA and driver..."
+wget -nv https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/cuda-ubuntu2404.pin
+sudo mv cuda-ubuntu2404.pin /etc/apt/preferences.d/cuda-repository-pin-600
+wget -nv https://developer.download.nvidia.com/compute/cuda/12.5.1/local_installers/cuda-repo-ubuntu2404-12-5-local_12.5.1-555.42.06-1_amd64.deb
+sudo dpkg -i cuda-repo-ubuntu2404-12-5-local_12.5.1-555.42.06-1_amd64.deb
+sudo cp /var/cuda-repo-ubuntu2404-12-5-local/cuda-*-keyring.gpg /usr/share/keyrings/
+sudo apt-get update
+sudo apt-get install -y cuda-toolkit-12-5 nvidia-driver-555-open cuda-drivers-555
+
+## Install Docker
+# Add Docker's official GPG key:
+sudo install -m 0755 -d /etc/apt/keyrings
+sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc
+sudo chmod a+r /etc/apt/keyrings/docker.asc
+# Add the repository to Apt sources:
+echo \
+  "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \
+  $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \
+  sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
+sudo apt-get update
+sudo apt-get install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin
+# Allow users to use Docker without sudo
+sudo usermod -aG docker ubuntu
+
+## Install NVIDIA Container Toolkit
+curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \
+  && curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \
+    sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \
+    sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
+sudo apt-get update
+sudo apt-get install -y nvidia-container-toolkit
+sudo nvidia-ctk runtime configure --runtime=docker
+sudo systemctl restart docker
diff --git a/ops/packer/linux/linux.pkr.hcl b/ops/packer/linux/linux.pkr.hcl
new file mode 100644
index 000000000000..1dc11f9bac03
--- /dev/null
+++ b/ops/packer/linux/linux.pkr.hcl
@@ -0,0 +1,68 @@
+packer {
+  required_plugins {
+    amazon = {
+      source  = "github.com/hashicorp/amazon"
+      version = "~> 1"
+    }
+  }
+}
+
+locals {
+  ami_name_prefix = "xgboost-ci"
+  image_name      = "RunsOn worker with Ubuntu 24.04 + CUDA driver"
+  region          = "us-west-2"
+  timestamp       = regex_replace(timestamp(), "[- TZ:]", "")
+  volume_size     = 40
+}
+
+data "amazon-ami" "aws-ubuntu-x64" {
+  filters = {
+    name                = "ubuntu/images/hvm-ssd-gp3/ubuntu-noble-24.04-amd64-server-*"
+    root-device-type    = "ebs"
+    virtualization-type = "hvm"
+  }
+  most_recent = true
+  owners      = ["amazon"]
+}
+
+source "amazon-ebs" "runs-on-linux" {
+  source_ami                  = "${data.amazon-ami.aws-ubuntu-x64.id}"
+  ami_name                    = "${local.ami_name_prefix}-runs-on-linux-${local.timestamp}"
+  ami_description             = "${local.image_name}"
+  ami_regions                 = ["${local.region}"]
+  ami_virtualization_type     = "hvm"
+  associate_public_ip_address = true
+  communicator                = "ssh"
+  instance_type               = "g4dn.xlarge"
+  region                      = "${local.region}"
+  ssh_timeout                 = "10m"
+  ssh_username                = "ubuntu"
+  ssh_file_transfer_method    = "sftp"
+  user_data_file              = "setup_ssh.sh"
+  launch_block_device_mappings {
+    device_name           = "/dev/sda1"
+    volume_size           = "${local.volume_size}"
+    volume_type           = "gp3"
+    delete_on_termination = true
+  }
+  aws_polling { # Wait up to 1 hour until the AMI is ready
+    delay_seconds = 15
+    max_attempts  = 240
+  }
+  snapshot_tags = {
+    Name      = "${local.image_name}"
+    BuildTime = 
"${local.timestamp}" + } + tags = { + Name = "${local.image_name}" + BuildTime = "${local.timestamp}" + } +} + +build { + sources = ["source.amazon-ebs.runs-on-linux"] + + provisioner "shell" { + script = "bootstrap.sh" + } +} diff --git a/ops/packer/linux/setup_ssh.sh b/ops/packer/linux/setup_ssh.sh new file mode 100644 index 000000000000..501b4da455f5 --- /dev/null +++ b/ops/packer/linux/setup_ssh.sh @@ -0,0 +1,2 @@ +#!/bin/bash +systemctl start ssh From 000be18bfe35a0acdf071d6f09fc9dd413bd353d Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Wed, 23 Oct 2024 21:22:29 -0700 Subject: [PATCH 08/86] New Docker wrapper with S3 caching --- .github/runs-on.yml | 2 +- .github/workflows/main.yml | 99 ++++++---- .../docker}/Dockerfile.aarch64 | 0 .../docker}/Dockerfile.clang_tidy | 2 +- {tests/ci_build => ops/docker}/Dockerfile.cpu | 0 {tests/ci_build => ops/docker}/Dockerfile.gpu | 10 +- .../Dockerfile.gpu_build_r_rockylinux8 | 2 +- .../docker}/Dockerfile.gpu_build_rockylinux8 | 2 +- .../ci_build => ops/docker}/Dockerfile.i386 | 0 {tests/ci_build => ops/docker}/Dockerfile.jvm | 0 .../docker}/Dockerfile.jvm_cross | 16 +- .../docker}/Dockerfile.jvm_gpu_build | 2 +- .../Dockerfile.jvm_manylinux2014_aarch64 | 0 .../Dockerfile.jvm_manylinux2014_x86_64 | 0 .../docker}/Dockerfile.manylinux2014_aarch64 | 0 .../docker}/Dockerfile.manylinux2014_x86_64 | 0 .../docker}/Dockerfile.manylinux_2_28_x86_64 | 0 ops/docker/entrypoint.sh | 43 +++++ ops/docker_build.py | 134 +++++++++++++ ops/docker_run.py | 181 ++++++++++++++++++ ops/packer/linux/bootstrap.sh | 7 +- tests/ci_build/Dockerfile.gpu_dev_ver | 54 ------ 22 files changed, 450 insertions(+), 104 deletions(-) rename {tests/ci_build => ops/docker}/Dockerfile.aarch64 (100%) rename {tests/ci_build => ops/docker}/Dockerfile.clang_tidy (98%) rename {tests/ci_build => ops/docker}/Dockerfile.cpu (100%) rename {tests/ci_build => ops/docker}/Dockerfile.gpu (81%) rename {tests/ci_build => ops/docker}/Dockerfile.gpu_build_r_rockylinux8 (98%) rename {tests/ci_build => ops/docker}/Dockerfile.gpu_build_rockylinux8 (99%) rename {tests/ci_build => ops/docker}/Dockerfile.i386 (100%) rename {tests/ci_build => ops/docker}/Dockerfile.jvm (100%) rename {tests/ci_build => ops/docker}/Dockerfile.jvm_cross (74%) rename {tests/ci_build => ops/docker}/Dockerfile.jvm_gpu_build (98%) rename {tests/ci_build => ops/docker}/Dockerfile.jvm_manylinux2014_aarch64 (100%) rename {tests/ci_build => ops/docker}/Dockerfile.jvm_manylinux2014_x86_64 (100%) rename {tests/ci_build => ops/docker}/Dockerfile.manylinux2014_aarch64 (100%) rename {tests/ci_build => ops/docker}/Dockerfile.manylinux2014_x86_64 (100%) rename {tests/ci_build => ops/docker}/Dockerfile.manylinux_2_28_x86_64 (100%) create mode 100755 ops/docker/entrypoint.sh create mode 100644 ops/docker_build.py create mode 100644 ops/docker_run.py delete mode 100644 tests/ci_build/Dockerfile.gpu_dev_ver diff --git a/.github/runs-on.yml b/.github/runs-on.yml index 3fa13de66a25..6ae28d1e9c6b 100644 --- a/.github/runs-on.yml +++ b/.github/runs-on.yml @@ -16,7 +16,7 @@ runners: linux-amd64-cpu: cpu: 16 family: ["c7i-flex", "c7i", "c7a", "c5", "c5a"] - image: ubuntu24-full-x64 + image: linux-amd64 linux-amd64-gpu: family: ["g4dn.xlarge"] image: linux-amd64 diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 7fb80e21299a..8af6571a99bb 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -10,50 +10,85 @@ concurrency: cancel-in-progress: true jobs: - build: - name: Build + build-gpu: + name: Build GPU 
runs-on: - runs-on=${{ github.run_id }} - runner=linux-amd64-cpu steps: + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 - uses: actions/checkout@v4 with: submodules: "true" - - name: Build and run gtest + - name: Build container run: | - sudo apt update && sudo apt install ninja-build - mkdir build - cd build - cmake .. -GNinja -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DBUILD_DEPRECATED_CLI=ON - ninja -v - ./testxgboost - build-gpu: - name: Build GPU + python3 ops/docker_build.py \ + --container-def gpu_build_rockylinux8 \ + --container-id xgb-ci.gpu_build_rockylinux8 \ + --build-arg CUDA_VERSION_ARG=12.5.1 \ + --build-arg NCCL_VERSION_ARG=2.22.3-1 \ + --build-arg RAPIDS_VERSION_ARG=24.10 \ + --cache-from type=s3,blobs_prefix=cache/${{ github.repository }}/,manifests_prefix=cache/${{ github.repository }}/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }} \ + --cache-to type=s3,blobs_prefix=cache/${{ github.repository }}/,manifests_prefix=cache/${{ github.repository }}/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }},mode=max + - name: Build gtest + run: | + git clone https://github.com/NVIDIA/cccl.git -b v2.6.1 --quiet + python3 ops/docker_run.py \ + --container-id xgb-ci.gpu_build_rockylinux8 \ + -- tests/ci_build/build_via_cmake.sh \ + -DCMAKE_PREFIX_PATH="/opt/grpc;/workspace/cccl" \ + -DUSE_CUDA=ON \ + -DUSE_OPENMP=ON \ + -DHIDE_CXX_SYMBOLS=ON \ + -DPLUGIN_FEDERATED=ON \ + -DUSE_NCCL=ON \ + -DUSE_NCCL_LIB_PATH=ON \ + -DNCCL_INCLUDE_DIR=/usr/include \ + -DUSE_DLOPEN_NCCL=ON \ + -DGPU_COMPUTE_VER=75 + - name: Stash testxgboost + run: | + aws s3 cp ./build/testxgboost \ + s3://${{ env.RUNS_ON_S3_BUCKET_CACHE }}/cache/${{ github.repository }}/stash/${{ github.run_id }}/testxgboost + test-gpu: + name: Test GPU + needs: build-gpu runs-on: - runs-on=${{ github.run_id }} - runner=linux-amd64-gpu steps: - - uses: actions/checkout@v4 - with: - submodules: "true" - - name: Build and run gtest - run: | - cat >> $HOME/.bashrc <<- EOM - export PATH=/usr/local/cuda/bin${PATH:+:${PATH}} - export LD_LIBRARY_PATH=/usr/local/cuda/lib64\ - ${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}} - EOM - source $HOME/.bashrc - nvidia-smi - nvcc --version - sudo apt update && sudo apt install ninja-build - git clone https://github.com/NVIDIA/cccl.git -b v2.6.1 --quiet - mkdir build - cd build - cmake .. 
-GNinja -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DBUILD_DEPRECATED_CLI=ON -DUSE_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=75 -DCMAKE_PREFIX_PATH="$PWD/../cccl" - ninja -v - ./testxgboost - build-gpu-win64: + # Restart Docker daemon so that it recognized the ephemeral disks + - run: sudo systemctl restart docker + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + - uses: actions/checkout@v4 + with: + submodules: "true" + - name: Unstash testxgboost + run: | + aws s3 cp \ + s3://${{ env.RUNS_ON_S3_BUCKET_CACHE }}/cache/${{ github.repository }}/stash/${{ github.run_id }}/testxgboost \ + ./testxgboost + chmod +x testxgboost + - name: Build container + run: | + python3 ops/docker_build.py \ + --container-def gpu \ + --container-id xgb-ci.gpu \ + --build-arg CUDA_VERSION_ARG=12.5.1 \ + --build-arg NCCL_VERSION_ARG=2.22.3-1 \ + --build-arg RAPIDS_VERSION_ARG=24.10 \ + --cache-from type=s3,blobs_prefix=cache/${{ github.repository }}/,manifests_prefix=cache/${{ github.repository }}/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }} \ + --cache-to type=s3,blobs_prefix=cache/${{ github.repository }}/,manifests_prefix=cache/${{ github.repository }}/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }},mode=max + - name: Run gtest + run: | + python3 ops/docker_run.py \ + --container-id xgb-ci.gpu \ + --use-gpus \ + -- ./testxgboost + + build-test-gpu-win64: name: Build GPU (Windows) runs-on: - runs-on=${{ github.run_id }} diff --git a/tests/ci_build/Dockerfile.aarch64 b/ops/docker/Dockerfile.aarch64 similarity index 100% rename from tests/ci_build/Dockerfile.aarch64 rename to ops/docker/Dockerfile.aarch64 diff --git a/tests/ci_build/Dockerfile.clang_tidy b/ops/docker/Dockerfile.clang_tidy similarity index 98% rename from tests/ci_build/Dockerfile.clang_tidy rename to ops/docker/Dockerfile.clang_tidy index 2e7751a20185..c9528015c17e 100644 --- a/tests/ci_build/Dockerfile.clang_tidy +++ b/ops/docker/Dockerfile.clang_tidy @@ -1,4 +1,4 @@ -ARG CUDA_VERSION_ARG +ARG CUDA_VERSION_ARG=notset FROM nvidia/cuda:$CUDA_VERSION_ARG-devel-ubuntu22.04 ARG CUDA_VERSION_ARG diff --git a/tests/ci_build/Dockerfile.cpu b/ops/docker/Dockerfile.cpu similarity index 100% rename from tests/ci_build/Dockerfile.cpu rename to ops/docker/Dockerfile.cpu diff --git a/tests/ci_build/Dockerfile.gpu b/ops/docker/Dockerfile.gpu similarity index 81% rename from tests/ci_build/Dockerfile.gpu rename to ops/docker/Dockerfile.gpu index 884fc924cba8..461f1d99dd54 100644 --- a/tests/ci_build/Dockerfile.gpu +++ b/ops/docker/Dockerfile.gpu @@ -1,8 +1,10 @@ -ARG CUDA_VERSION_ARG +ARG CUDA_VERSION_ARG=notset FROM nvidia/cuda:$CUDA_VERSION_ARG-runtime-ubuntu22.04 ARG CUDA_VERSION_ARG ARG RAPIDS_VERSION_ARG + # Should be first 4 digits (e.g. 
24.06) ARG NCCL_VERSION_ARG +ARG RAPIDSAI_CONDA_CHANNEL_ARG="rapidsai" # Environment ENV DEBIAN_FRONTEND=noninteractive @@ -23,11 +25,11 @@ ENV PATH=/opt/miniforge/bin:$PATH RUN \ export NCCL_SHORT_VER=$(echo "$NCCL_VERSION_ARG" | cut -d "-" -f 1) && \ export CUDA_SHORT_VER=$(echo "$CUDA_VERSION_ARG" | grep -o -E '[0-9]+\.[0-9]') && \ - mamba create -y -n gpu_test -c rapidsai -c conda-forge -c nvidia \ - python=3.10 cudf=$RAPIDS_VERSION_ARG* rmm=$RAPIDS_VERSION_ARG* cuda-version=$CUDA_SHORT_VER \ + mamba create -y -n gpu_test -c ${RAPIDSAI_CONDA_CHANNEL_ARG} -c conda-forge -c nvidia \ + python=3.10 "cudf=$RAPIDS_VERSION_ARG.*" "rmm=$RAPIDS_VERSION_ARG.*" cuda-version=$CUDA_SHORT_VER \ "nccl>=${NCCL_SHORT_VER}" \ dask \ - dask-cuda=$RAPIDS_VERSION_ARG* dask-cudf=$RAPIDS_VERSION_ARG* cupy \ + "dask-cuda=$RAPIDS_VERSION_ARG.*" "dask-cudf=$RAPIDS_VERSION_ARG.*" cupy \ numpy pytest pytest-timeout scipy scikit-learn pandas matplotlib wheel \ python-kubernetes urllib3 graphviz hypothesis loky \ "pyspark>=3.4.0" cloudpickle cuda-python && \ diff --git a/tests/ci_build/Dockerfile.gpu_build_r_rockylinux8 b/ops/docker/Dockerfile.gpu_build_r_rockylinux8 similarity index 98% rename from tests/ci_build/Dockerfile.gpu_build_r_rockylinux8 rename to ops/docker/Dockerfile.gpu_build_r_rockylinux8 index 159e5d776c16..7c1d4e8ef642 100644 --- a/tests/ci_build/Dockerfile.gpu_build_r_rockylinux8 +++ b/ops/docker/Dockerfile.gpu_build_r_rockylinux8 @@ -1,4 +1,4 @@ -ARG CUDA_VERSION_ARG +ARG CUDA_VERSION_ARG=notset FROM nvcr.io/nvidia/cuda:$CUDA_VERSION_ARG-devel-rockylinux8 ARG CUDA_VERSION_ARG ARG R_VERSION_ARG diff --git a/tests/ci_build/Dockerfile.gpu_build_rockylinux8 b/ops/docker/Dockerfile.gpu_build_rockylinux8 similarity index 99% rename from tests/ci_build/Dockerfile.gpu_build_rockylinux8 rename to ops/docker/Dockerfile.gpu_build_rockylinux8 index 8869fb468e12..d021190b6744 100644 --- a/tests/ci_build/Dockerfile.gpu_build_rockylinux8 +++ b/ops/docker/Dockerfile.gpu_build_rockylinux8 @@ -1,4 +1,4 @@ -ARG CUDA_VERSION_ARG +ARG CUDA_VERSION_ARG=notset FROM nvcr.io/nvidia/cuda:$CUDA_VERSION_ARG-devel-rockylinux8 ARG CUDA_VERSION_ARG ARG NCCL_VERSION_ARG diff --git a/tests/ci_build/Dockerfile.i386 b/ops/docker/Dockerfile.i386 similarity index 100% rename from tests/ci_build/Dockerfile.i386 rename to ops/docker/Dockerfile.i386 diff --git a/tests/ci_build/Dockerfile.jvm b/ops/docker/Dockerfile.jvm similarity index 100% rename from tests/ci_build/Dockerfile.jvm rename to ops/docker/Dockerfile.jvm diff --git a/tests/ci_build/Dockerfile.jvm_cross b/ops/docker/Dockerfile.jvm_cross similarity index 74% rename from tests/ci_build/Dockerfile.jvm_cross rename to ops/docker/Dockerfile.jvm_cross index 2f2b5b77ede8..3ebdb3c6686d 100644 --- a/tests/ci_build/Dockerfile.jvm_cross +++ b/ops/docker/Dockerfile.jvm_cross @@ -1,6 +1,6 @@ FROM ubuntu:22.04 -ARG JDK_VERSION=8 -ARG SPARK_VERSION=3.5.1 +ARG JDK_VERSION_ARG=8 +ARG SPARK_VERSION_ARG=3.5.1 # Environment ENV DEBIAN_FRONTEND=noninteractive @@ -11,7 +11,7 @@ RUN \ apt-get install -y software-properties-common && \ add-apt-repository ppa:openjdk-r/ppa && \ apt-get update && \ - apt-get install -y tar unzip wget openjdk-$JDK_VERSION-jdk libgomp1 && \ + apt-get install -y tar unzip wget openjdk-$JDK_VERSION_ARG-jdk libgomp1 && \ # Python wget -nv -O conda.sh https://github.com/conda-forge/miniforge/releases/download/24.3.0-0/Miniforge3-24.3.0-0-Linux-x86_64.sh && \ bash conda.sh -b -p /opt/miniforge && \ @@ -22,12 +22,12 @@ RUN \ ln -s /opt/apache-maven-3.9.7/ /opt/maven 
&& \ # Spark with scala 2.12 mkdir -p /opt/spark-scala-2.12 && \ - wget -nv https://archive.apache.org/dist/spark/spark-$SPARK_VERSION/spark-$SPARK_VERSION-bin-hadoop3.tgz && \ - tar xvf spark-$SPARK_VERSION-bin-hadoop3.tgz --strip-components=1 -C /opt/spark-scala-2.12 && \ + wget -nv https://archive.apache.org/dist/spark/spark-$SPARK_VERSION_ARG/spark-$SPARK_VERSION_ARG-bin-hadoop3.tgz && \ + tar xvf spark-$SPARK_VERSION_ARG-bin-hadoop3.tgz --strip-components=1 -C /opt/spark-scala-2.12 && \ # Spark with scala 2.13 mkdir -p /opt/spark-scala-2.13 && \ - wget -nv https://archive.apache.org/dist/spark/spark-$SPARK_VERSION/spark-$SPARK_VERSION-bin-hadoop3-scala2.13.tgz && \ - tar xvf spark-$SPARK_VERSION-bin-hadoop3-scala2.13.tgz --strip-components=1 -C /opt/spark-scala-2.13 + wget -nv https://archive.apache.org/dist/spark/spark-$SPARK_VERSION_ARG/spark-$SPARK_VERSION_ARG-bin-hadoop3-scala2.13.tgz && \ + tar xvf spark-$SPARK_VERSION_ARG-bin-hadoop3-scala2.13.tgz --strip-components=1 -C /opt/spark-scala-2.13 ENV PATH=/opt/miniforge/bin:/opt/spark/bin:/opt/maven/bin:$PATH @@ -43,7 +43,7 @@ RUN set -ex; \ gosu nobody true # Set default JDK version -RUN update-java-alternatives -v -s java-1.$JDK_VERSION.0-openjdk-amd64 +RUN update-java-alternatives -v -s java-1.$JDK_VERSION_ARG.0-openjdk-amd64 # Default entry-point to use if running locally # It will preserve attributes of created files diff --git a/tests/ci_build/Dockerfile.jvm_gpu_build b/ops/docker/Dockerfile.jvm_gpu_build similarity index 98% rename from tests/ci_build/Dockerfile.jvm_gpu_build rename to ops/docker/Dockerfile.jvm_gpu_build index edb5918b8bbc..7f0168df467f 100644 --- a/tests/ci_build/Dockerfile.jvm_gpu_build +++ b/ops/docker/Dockerfile.jvm_gpu_build @@ -1,4 +1,4 @@ -ARG CUDA_VERSION_ARG +ARG CUDA_VERSION_ARG=notset FROM nvcr.io/nvidia/cuda:$CUDA_VERSION_ARG-devel-rockylinux8 ARG CUDA_VERSION_ARG ARG NCCL_VERSION_ARG diff --git a/tests/ci_build/Dockerfile.jvm_manylinux2014_aarch64 b/ops/docker/Dockerfile.jvm_manylinux2014_aarch64 similarity index 100% rename from tests/ci_build/Dockerfile.jvm_manylinux2014_aarch64 rename to ops/docker/Dockerfile.jvm_manylinux2014_aarch64 diff --git a/tests/ci_build/Dockerfile.jvm_manylinux2014_x86_64 b/ops/docker/Dockerfile.jvm_manylinux2014_x86_64 similarity index 100% rename from tests/ci_build/Dockerfile.jvm_manylinux2014_x86_64 rename to ops/docker/Dockerfile.jvm_manylinux2014_x86_64 diff --git a/tests/ci_build/Dockerfile.manylinux2014_aarch64 b/ops/docker/Dockerfile.manylinux2014_aarch64 similarity index 100% rename from tests/ci_build/Dockerfile.manylinux2014_aarch64 rename to ops/docker/Dockerfile.manylinux2014_aarch64 diff --git a/tests/ci_build/Dockerfile.manylinux2014_x86_64 b/ops/docker/Dockerfile.manylinux2014_x86_64 similarity index 100% rename from tests/ci_build/Dockerfile.manylinux2014_x86_64 rename to ops/docker/Dockerfile.manylinux2014_x86_64 diff --git a/tests/ci_build/Dockerfile.manylinux_2_28_x86_64 b/ops/docker/Dockerfile.manylinux_2_28_x86_64 similarity index 100% rename from tests/ci_build/Dockerfile.manylinux_2_28_x86_64 rename to ops/docker/Dockerfile.manylinux_2_28_x86_64 diff --git a/ops/docker/entrypoint.sh b/ops/docker/entrypoint.sh new file mode 100755 index 000000000000..a0c5f56bb52d --- /dev/null +++ b/ops/docker/entrypoint.sh @@ -0,0 +1,43 @@ +#!/usr/bin/env bash + +# This script is a wrapper creating the same user inside container as the one +# running the ci_build.sh outside the container. 
It also sets the home directory
+# for the user inside the container to match the same absolute path as the
+# workspace outside of the container. Do not run this script manually; it is
+# intended to be called only by the CI wrapper scripts (such as ops/docker_run.py).
+
+set -e
+
+COMMAND=("$@")
+
+if ! touch /this_is_writable_file_system; then
+  echo "You can't write to your filesystem!"
+  echo "If you are in Docker you should check you do not have too many images" \
+       "with too many files in them. Docker has some issue with it."
+  exit 1
+else
+  rm /this_is_writable_file_system
+fi
+
+if [[ -n $CI_BUILD_UID ]] && [[ -n $CI_BUILD_GID ]]; then
+  groupadd -o -g "${CI_BUILD_GID}" "${CI_BUILD_GROUP}" || true
+  useradd -o -m -g "${CI_BUILD_GID}" -u "${CI_BUILD_UID}" \
+    "${CI_BUILD_USER}" || true
+  export HOME="/home/${CI_BUILD_USER}"
+  shopt -s dotglob
+  cp -r /root/* "$HOME/"
+  chown -R "${CI_BUILD_UID}:${CI_BUILD_GID}" "$HOME"
+
+  # Allows project-specific customization
+  if [[ -e "/workspace/.pre_entry.sh" ]]; then
+    gosu "${CI_BUILD_UID}:${CI_BUILD_GID}" /workspace/.pre_entry.sh
+  fi
+
+  # Enable passwordless sudo capabilities for the user
+  chown root:"${CI_BUILD_GID}" "$(which gosu)"
+  chmod +s "$(which gosu)"; sync
+
+  exec gosu "${CI_BUILD_UID}:${CI_BUILD_GID}" "${COMMAND[@]}"
+else
+  exec "${COMMAND[@]}"
+fi
diff --git a/ops/docker_build.py b/ops/docker_build.py
new file mode 100644
index 000000000000..a7276cd65b76
--- /dev/null
+++ b/ops/docker_build.py
@@ -0,0 +1,134 @@
+"""
+Wrapper script to build a Docker container with layer caching
+"""
+
+import argparse
+import itertools
+import pathlib
+import subprocess
+import sys
+from typing import Optional
+
+from docker_run import SCRIPT_DIR, fancy_print_cli_args
+
+
+def parse_build_args(raw_build_args: list[str]) -> dict[str, str]:
+    parsed_build_args = dict()
+    for arg in raw_build_args:
+        try:
+            key, value = arg.split("=", maxsplit=1)
+        except ValueError as e:
+            raise ValueError(
+                f"Build argument must be of form KEY=VALUE. Got: {arg}"
+            ) from e
+        parsed_build_args[key] = value
+    return parsed_build_args
+
+
+def docker_build(
+    container_id: str,
+    *,
+    build_args: dict[str, str],
+    dockerfile_path: pathlib.Path,
+    docker_context_path: pathlib.Path,
+    cache_from: Optional[str],
+    cache_to: Optional[str],
+) -> None:
+    ## Set up command-line arguments to be passed to `docker build`
+    # Build args
+    docker_build_cli_args = list(
+        itertools.chain.from_iterable(
+            [["--build-arg", f"{k}={v}"] for k, v in build_args.items()]
+        )
+    )
+    # When building an image using a non-default driver, we need to specify
+    # `--load` to load it to the image store.
+ # See https://docs.docker.com/build/builders/drivers/ + docker_build_cli_args.append("--load") + # Layer caching + if cache_from: + docker_build_cli_args.extend(["--cache-from", cache_from]) + if cache_to: + docker_build_cli_args.extend(["--cache-to", cache_to]) + # Remaining CLI args + docker_build_cli_args.extend( + [ + "--progress=plain", + "--ulimit", + "nofile=1024000:1024000", + "-t", + container_id, + "-f", + str(dockerfile_path), + str(docker_context_path), + ] + ) + cli_args = ["docker", "buildx", "build"] + docker_build_cli_args + fancy_print_cli_args(cli_args) + subprocess.run(cli_args, check=True, encoding="utf-8") + + +def main(args: argparse.Namespace) -> None: + # Dockerfile to be used in docker build + dockerfile_path = SCRIPT_DIR / "docker" / f"Dockerfile.{args.container_def}" + docker_context_path = SCRIPT_DIR / "docker" + + build_args = parse_build_args(args.build_arg) + + docker_build( + args.container_id, + build_args=build_args, + dockerfile_path=dockerfile_path, + docker_context_path=docker_context_path, + cache_from=args.cache_from, + cache_to=args.cache_to, + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Build a Docker container") + parser.add_argument( + "--container-def", + type=str, + required=True, + help=( + "String uniquely identifying the container definition. The container " + "definition will be fetched from docker/Dockerfile.CONTAINER_DEF." + ), + ) + parser.add_argument( + "--container-id", + type=str, + required=True, + help="String ID to assign to the newly built container", + ) + parser.add_argument( + "--build-arg", + type=str, + default=[], + action="append", + help=( + "Build-time variable(s) to be passed to `docker build`. Each variable " + "should be specified as a key-value pair in the form KEY=VALUE. " + "The variables should match the ARG instructions in the Dockerfile. " + "When passing multiple variables, specify --build-arg multiple times. 
" + "Example: --build-arg CUDA_VERSION_ARG=12.5 --build-arg RAPIDS_VERSION_ARG=24.10'" + ), + ) + parser.add_argument( + "--cache-from", + type=str, + help="Use an external cache source for the Docker build", + ) + parser.add_argument( + "--cache-to", + type=str, + help="Export layers from the container to an external cache destination", + ) + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(1) + + parsed_args = parser.parse_args() + main(parsed_args) diff --git a/ops/docker_run.py b/ops/docker_run.py new file mode 100644 index 000000000000..4fc6356a90a1 --- /dev/null +++ b/ops/docker_run.py @@ -0,0 +1,181 @@ +""" +Wrapper script to run a command inside a Docker container +""" + +import argparse +import grp +import itertools +import os +import pathlib +import pwd +import subprocess +import sys +import textwrap + +SCRIPT_DIR = pathlib.Path(__file__).expanduser().resolve().parent +PROJECT_ROOT_DIR = SCRIPT_DIR.parent +LINEWIDTH = 88 +TEXT_WRAPPER = textwrap.TextWrapper( + width=LINEWIDTH, + initial_indent="", + subsequent_indent=" ", + break_long_words=False, + break_on_hyphens=False, +) + + +def parse_run_args(raw_run_args: str) -> list[str]: + return [x for x in raw_run_args.split(" ") if x] + + +def compute_container_id(container_name: str, build_args: list[dict[str, str]]) -> str: + container_id = f"xgb-ci.{container_name}" + # For some build arguments, append special suffixies + for arg_name, suffix in [ + ("CUDA_VERSION_ARG", "cuda"), + ("RAPIDS_VERSION_ARG", "rapids"), + ("JDK_VERSION_ARG", "jdk"), + ]: + if arg_name in build_args: + container_id += f"_{suffix}{build_args[arg_name]}" + return container_id + + +def get_user_ids() -> dict[str, str]: + uid = os.getuid() + gid = os.getgid() + return { + "CI_BUILD_UID": str(uid), + "CI_BUILD_USER": pwd.getpwuid(uid).pw_name, + "CI_BUILD_GID": str(gid), + "CI_BUILD_GROUP": grp.getgrgid(gid).gr_name, + } + + +def fancy_print_cli_args(cli_args: list[str]) -> None: + print( + "=" * LINEWIDTH + + "\n" + + " \\\n".join(TEXT_WRAPPER.wrap(" ".join(cli_args))) + + "\n" + + "=" * LINEWIDTH + + "\n", + flush=True, + ) + + +def docker_run( + container_id: str, + command_args: list[str], + *, + use_gpus: bool, + workdir: pathlib.Path, + user_ids: dict[str, str], + extra_args: list[str], +) -> None: + # Command-line arguments to be passed to `docker run` + docker_run_cli_args = ["--rm", "--pid=host"] + + if use_gpus: + docker_run_cli_args.extend(["--gpus", "all"]) + + docker_run_cli_args.extend(["-v", f"{workdir}:/workspace", "-w", "/workspace"]) + docker_run_cli_args.extend( + itertools.chain.from_iterable([["-e", f"{k}={v}"] for k, v in user_ids.items()]) + ) + docker_run_cli_args.extend(extra_args) + docker_run_cli_args.append(container_id) + docker_run_cli_args.extend(command_args) + + cli_args = ["docker", "run"] + docker_run_cli_args + fancy_print_cli_args(cli_args) + subprocess.run(cli_args, check=True, encoding="utf-8") + + +def main(args: argparse.Namespace) -> None: + run_args = parse_run_args(args.run_args) + user_ids = get_user_ids() + + if args.use_gpus: + print("Using NVIDIA GPUs for `docker run`") + if args.interactive: + print("Using interactive mode for `docker run`") + run_args.append("-it") + + docker_run( + args.container_id, + args.command_args, + use_gpus=args.use_gpus, + workdir=args.workdir, + user_ids=user_ids, + extra_args=run_args, + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + usage=( + f"{sys.argv[0]} --container-id CONTAINER_ID [--use-gpus] [--interactive] " + "[--workdir WORKDIR] 
+
+import argparse
+import grp
+import itertools
+import os
+import pathlib
+import pwd
+import subprocess
+import sys
+import textwrap
+
+SCRIPT_DIR = pathlib.Path(__file__).expanduser().resolve().parent
+PROJECT_ROOT_DIR = SCRIPT_DIR.parent
+LINEWIDTH = 88
+TEXT_WRAPPER = textwrap.TextWrapper(
+    width=LINEWIDTH,
+    initial_indent="",
+    subsequent_indent="  ",
+    break_long_words=False,
+    break_on_hyphens=False,
+)
+
+
+def parse_run_args(raw_run_args: str) -> list[str]:
+    return [x for x in raw_run_args.split(" ") if x]
+
+
+def compute_container_id(container_name: str, build_args: dict[str, str]) -> str:
+    container_id = f"xgb-ci.{container_name}"
+    # For some build arguments, append special suffixes
+    for arg_name, suffix in [
+        ("CUDA_VERSION_ARG", "cuda"),
+        ("RAPIDS_VERSION_ARG", "rapids"),
+        ("JDK_VERSION_ARG", "jdk"),
+    ]:
+        if arg_name in build_args:
+            container_id += f"_{suffix}{build_args[arg_name]}"
+    return container_id
+
+
+def get_user_ids() -> dict[str, str]:
+    uid = os.getuid()
+    gid = os.getgid()
+    return {
+        "CI_BUILD_UID": str(uid),
+        "CI_BUILD_USER": pwd.getpwuid(uid).pw_name,
+        "CI_BUILD_GID": str(gid),
+        "CI_BUILD_GROUP": grp.getgrgid(gid).gr_name,
+    }
+
+
+def fancy_print_cli_args(cli_args: list[str]) -> None:
+    print(
+        "=" * LINEWIDTH
+        + "\n"
+        + " \\\n".join(TEXT_WRAPPER.wrap(" ".join(cli_args)))
+        + "\n"
+        + "=" * LINEWIDTH
+        + "\n",
+        flush=True,
+    )
+
+
+def docker_run(
+    container_id: str,
+    command_args: list[str],
+    *,
+    use_gpus: bool,
+    workdir: pathlib.Path,
+    user_ids: dict[str, str],
+    extra_args: list[str],
+) -> None:
+    # Command-line arguments to be passed to `docker run`
+    docker_run_cli_args = ["--rm", "--pid=host"]
+
+    if use_gpus:
+        docker_run_cli_args.extend(["--gpus", "all"])
+
+    docker_run_cli_args.extend(["-v", f"{workdir}:/workspace", "-w", "/workspace"])
+    docker_run_cli_args.extend(
+        itertools.chain.from_iterable([["-e", f"{k}={v}"] for k, v in user_ids.items()])
+    )
+    docker_run_cli_args.extend(extra_args)
+    docker_run_cli_args.append(container_id)
+    docker_run_cli_args.extend(command_args)
+
+    cli_args = ["docker", "run"] + docker_run_cli_args
+    fancy_print_cli_args(cli_args)
+    subprocess.run(cli_args, check=True, encoding="utf-8")
+
+
+def main(args: argparse.Namespace) -> None:
+    run_args = parse_run_args(args.run_args)
+    user_ids = get_user_ids()
+
+    if args.use_gpus:
+        print("Using NVIDIA GPUs for `docker run`")
+    if args.interactive:
+        print("Using interactive mode for `docker run`")
+        run_args.append("-it")
+
+    docker_run(
+        args.container_id,
+        args.command_args,
+        use_gpus=args.use_gpus,
+        workdir=args.workdir,
+        user_ids=user_ids,
+        extra_args=run_args,
+    )
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        usage=(
+            f"{sys.argv[0]} --container-id CONTAINER_ID [--use-gpus] [--interactive] "
+            "[--workdir WORKDIR] [--run-args RUN_ARGS] -- COMMAND_ARG "
+            "[COMMAND_ARG ...]"
+        ),
+        description="Run tasks inside a Docker container",
+    )
+    parser.add_argument(
+        "--container-id",
+        type=str,
+        required=True,
+        help="String ID of the container to run.",
+    )
+    parser.add_argument(
+        "--use-gpus",
+        action="store_true",
+        help=(
+            "Grant the container access to NVIDIA GPUs; requires the NVIDIA "
+            "Container Toolkit."
+        ),
+    )
+    parser.add_argument(
+        "--interactive",
+        action="store_true",
+        help=(
+            "Run the container in interactive mode; requires an interactive shell "
+            "(TTY). With this flag, you can use Ctrl-C to interrupt a long-running "
+            "command."
+        ),
+    )
+    parser.add_argument(
+        "--workdir",
+        type=lambda p: pathlib.Path(p).expanduser().resolve(),
+        default=PROJECT_ROOT_DIR,
+        help="Path to working directory; if unset, use the project's root",
+    )
+    parser.add_argument(
+        "--run-args",
+        type=str,
+        default="",
+        help=(
+            "Argument(s) to be passed to `docker run`. When passing multiple "
+            "arguments, use single quotes to wrap them. Example: "
+            "--run-args '--cap-add SYS_PTRACE --shm-size=4g'"
+        ),
+    )
+    parser.add_argument(
+        "command_args",
+        metavar="COMMAND_ARG",
+        type=str,
+        nargs="+",
+        help=(
+            "Argument(s) for the command to execute. NOTE. Make sure to specify "
+            "double-dash (--) to clearly distinguish between the command and the "
+            "preceding parameters. Example: --run-args '--cap-add SYS_PTRACE "
+            "--shm-size=4g' -- ./myprog"
+        ),
+    )
+
+    if len(sys.argv) == 1:
+        parser.print_help()
+        sys.exit(1)
+
+    parsed_args = parser.parse_args()
+    main(parsed_args)
diff --git a/ops/packer/linux/bootstrap.sh b/ops/packer/linux/bootstrap.sh
index 9cf0edfe7fe8..fac5c20f7146 100644
--- a/ops/packer/linux/bootstrap.sh
+++ b/ops/packer/linux/bootstrap.sh
@@ -4,7 +4,7 @@ set -euo pipefail
 ## Install basic tools
 echo 'debconf debconf/frontend select Noninteractive' | sudo debconf-set-selections
 sudo apt-get update
-sudo apt-get install -y cmake git build-essential wget ca-certificates curl
+sudo apt-get install -y cmake git build-essential wget ca-certificates curl unzip
 
 ## Install CUDA 12.5 + driver
 echo "Installing CUDA and driver..."
@@ -40,3 +40,8 @@ sudo apt-get update
 sudo apt-get install -y nvidia-container-toolkit
 sudo nvidia-ctk runtime configure --runtime=docker
 sudo systemctl restart docker
+
+## Install AWS CLI v2
+wget -nv https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip -O awscliv2.zip
+unzip awscliv2.zip
+sudo ./aws/install
diff --git a/tests/ci_build/Dockerfile.gpu_dev_ver b/tests/ci_build/Dockerfile.gpu_dev_ver
deleted file mode 100644
index d23c5e83c2c7..000000000000
--- a/tests/ci_build/Dockerfile.gpu_dev_ver
+++ /dev/null
@@ -1,54 +0,0 @@
-# Container to test XGBoost against dev versions of dependencies
-
-ARG CUDA_VERSION_ARG
-FROM nvidia/cuda:$CUDA_VERSION_ARG-runtime-ubuntu22.04
-ARG CUDA_VERSION_ARG
-ARG RAPIDS_VERSION_ARG
-  # Should be first 4 digits of the dev version (e.g. 24.06)
-ARG NCCL_VERSION_ARG
-
-# Environment
-ENV DEBIAN_FRONTEND=noninteractive
-SHELL ["/bin/bash", "-c"]  # Use Bash as shell
-
-# Install all basic requirements
-RUN \
-    apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/3bf863cc.pub && \
-    apt-get update && \
-    apt-get install -y wget unzip bzip2 libgomp1 build-essential openjdk-8-jdk-headless && \
-    # Python
-    wget -nv -O conda.sh https://github.com/conda-forge/miniforge/releases/download/24.3.0-0/Miniforge3-24.3.0-0-Linux-x86_64.sh && \
-    bash conda.sh -b -p /opt/miniforge
-
-ENV PATH=/opt/miniforge/bin:$PATH
-
-# Create new Conda environment with dev versions of cuDF, Dask, and cuPy
-RUN \
-    export NCCL_SHORT_VER=$(echo "$NCCL_VERSION_ARG" | cut -d "-" -f 1) && \
-    export CUDA_SHORT_VER=$(echo "$CUDA_VERSION_ARG" | grep -o -E '[0-9]+\.[0-9]') && \
-    mamba create -y -n gpu_test -c rapidsai-nightly -c conda-forge -c nvidia \
-        python=3.10 "cudf=$RAPIDS_VERSION_ARG.*" "rmm=$RAPIDS_VERSION_ARG.*" cuda-version=$CUDA_SHORT_VER \
-        "nccl>=${NCCL_SHORT_VER}" \
-        dask \
-        "dask-cuda=$RAPIDS_VERSION_ARG.*" "dask-cudf=$RAPIDS_VERSION_ARG.*" cupy \
-        numpy pytest pytest-timeout scipy scikit-learn pandas matplotlib wheel \
-        python-kubernetes urllib3 graphviz hypothesis loky \
-        "pyspark>=3.4.0" cloudpickle cuda-python && \
-    mamba clean --all --yes && \
-    conda run --no-capture-output -n gpu_test pip install buildkite-test-collector
-
-ENV GOSU_VERSION=1.10
-ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/
-
-# Install lightweight sudo (not bound to TTY)
-RUN set -ex; \
-    wget -nv -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \
-    chmod +x /usr/local/bin/gosu && \
-    gosu nobody true
-
-# Default entry-point to use if running locally
-# It will preserve attributes of created files
-COPY entrypoint.sh /scripts/
-
-WORKDIR /workspace
-ENTRYPOINT ["/scripts/entrypoint.sh"]

From e1e3b41ed2df12a2d0e431d28df82da1b1c16916 Mon Sep 17 00:00:00 2001
From: Hyunsu Cho
Date: Thu, 24 Oct 2024 15:20:02 -0700
Subject: [PATCH 09/86] Create utility script to build CI container

---
 .github/workflows/main.yml       | 143 +++++++++++++++++--------------
 ops/docker_build.py              |   2 +-
 ops/docker_build.sh              | 141 ++++++++++++++++++++++++++++++
 ops/matrix/ci_container.yml      |  18 ++++
 ops/matrix/docker_cache_ecr.yml  |   4 +
 ops/matrix/extract_build_args.jq |   6 ++
 ops/matrix/extract_build_args.sh |  21 +++++
 ops/packer/linux/bootstrap.sh    |   5 ++
 8 files changed, 275 insertions(+), 65 deletions(-)
 create mode 100755 ops/docker_build.sh
 create mode 100644 ops/matrix/ci_container.yml
 create mode 100644 ops/matrix/docker_cache_ecr.yml
 create mode 100644 ops/matrix/extract_build_args.jq
 create mode 100755 ops/matrix/extract_build_args.sh

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 8af6571a99bb..18f997e5c52e 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -10,47 +10,66 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
+  build-containers:
+    name: Build CI containers
+    runs-on:
+      - runs-on=${{ github.run_id }}
+      - runner=linux-amd64-cpu
+    strategy:
+      matrix:
+        container_id:
+          - xgb-ci.gpu_build_rockylinux8
+          - xgb-ci.gpu
+    steps:
+      # Restart Docker daemon so that it recognizes the ephemeral disks
+      - run: sudo systemctl restart docker
+      - uses: actions/checkout@v4
+        with:
+          submodules: "true"
+      - name: Build ${{ matrix.container_id }}
+        run: bash ops/docker_build.sh
+        env:
+          CONTAINER_ID: ${{ matrix.container_id }}
+          BRANCH_NAME: ${{ github.event.pull_request.number || github.ref_name }}
+          USE_DOCKER_CACHE: 1
   build-gpu:
     name: Build GPU
+    needs: build-containers
     runs-on:
       - runs-on=${{ github.run_id }}
       - runner=linux-amd64-cpu
     steps:
-      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
+      # Restart Docker daemon so that it recognizes the ephemeral disks
+      - run: sudo systemctl restart docker
       - uses: actions/checkout@v4
         with:
          submodules: "true"
-      - name: Build container
-        run: |
-          python3 ops/docker_build.py \
-            --container-def gpu_build_rockylinux8 \
-            --container-id xgb-ci.gpu_build_rockylinux8 \
-            --build-arg CUDA_VERSION_ARG=12.5.1 \
-            --build-arg NCCL_VERSION_ARG=2.22.3-1 \
-            --build-arg RAPIDS_VERSION_ARG=24.10 \
-            --cache-from type=s3,blobs_prefix=cache/${{ github.repository }}/,manifests_prefix=cache/${{ github.repository }}/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }} \
-            --cache-to type=s3,blobs_prefix=cache/${{ github.repository }}/,manifests_prefix=cache/${{ github.repository }}/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }},mode=max
+      - name: Fetch container from cache
+        run: bash ops/docker_build.sh
+        env:
+          CONTAINER_ID: xgb-ci.gpu_build_rockylinux8
+          BRANCH_NAME: ${{ github.event.pull_request.number || github.ref_name }}
+          USE_DOCKER_CACHE: 1
       - name: Build gtest
         run: |
           git clone https://github.com/NVIDIA/cccl.git -b v2.6.1 --quiet
           python3 ops/docker_run.py \
             --container-id xgb-ci.gpu_build_rockylinux8 \
             -- tests/ci_build/build_via_cmake.sh \
             -DCMAKE_PREFIX_PATH="/opt/grpc;/workspace/cccl" \
             -DUSE_CUDA=ON \
             -DUSE_OPENMP=ON \
             -DHIDE_CXX_SYMBOLS=ON \
             -DPLUGIN_FEDERATED=ON \
             -DUSE_NCCL=ON \
             -DUSE_NCCL_LIB_PATH=ON \
             -DNCCL_INCLUDE_DIR=/usr/include \
             -DUSE_DLOPEN_NCCL=ON \
             -DGPU_COMPUTE_VER=75
       - name: Stash testxgboost
         run: |
           aws s3 cp ./build/testxgboost \
             s3://${{ env.RUNS_ON_S3_BUCKET_CACHE }}/cache/${{ github.repository }}/stash/${{ github.run_id }}/testxgboost
   test-gpu:
     name: Test GPU
     needs: build-gpu
@@ -60,8 +79,6 @@ jobs:
     steps:
       # Restart Docker daemon so that it recognizes the ephemeral disks
       - run: sudo systemctl restart docker
-      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
       - uses: actions/checkout@v4
         with:
           submodules: "true"
@@ -71,21 +88,19 @@ jobs:
             s3://${{ env.RUNS_ON_S3_BUCKET_CACHE }}/cache/${{ github.repository }}/stash/${{ github.run_id }}/testxgboost \
             ./testxgboost
           chmod +x testxgboost
-      - name: Build container
-        run: |
-          python3 ops/docker_build.py \
-            --container-def gpu \
-            --container-id xgb-ci.gpu \
-            --build-arg CUDA_VERSION_ARG=12.5.1 \
-            --build-arg NCCL_VERSION_ARG=2.22.3-1 \
-            --build-arg RAPIDS_VERSION_ARG=24.10 \
-            --cache-from type=s3,blobs_prefix=cache/${{ github.repository }}/,manifests_prefix=cache/${{ github.repository }}/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }} \
-            --cache-to type=s3,blobs_prefix=cache/${{ github.repository }}/,manifests_prefix=cache/${{ github.repository }}/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }},mode=max
+      - name: Fetch container from cache
+        run: bash ops/docker_build.sh
+        env:
+          CONTAINER_ID: xgb-ci.gpu
+          BRANCH_NAME: ${{ github.event.pull_request.number || github.ref_name }}
+          USE_DOCKER_CACHE: 1
       - name: Run gtest
         run: |
+          nvidia-smi
           python3 ops/docker_run.py \
             --container-id xgb-ci.gpu \
             --use-gpus \
+            --run-args='--privileged' \
             -- ./testxgboost
 
   build-test-gpu-win64:
@@ -94,20 +109,20 @@ jobs:
       - runs-on=${{ github.run_id }}
       - runner=windows-gpu
     steps:
-    - uses: actions/checkout@v4
-      with:
-        submodules: "true"
-    - name: Build and run gtest
-      shell: powershell
-      run: |
-        nvidia-smi
-        nvcc --version
-        git clone https://github.com/NVIDIA/cccl.git -b v2.6.1 --quiet
-        mkdir build
-        cd build
-        cmake .. -G"Visual Studio 17 2022" -A x64 -DUSE_CUDA=ON -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DBUILD_DEPRECATED_CLI=ON -DCMAKE_CUDA_ARCHITECTURES=75 -DCMAKE_PREFIX_PATH="$(Get-Location)/../cccl"
-        if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
-        cmake --build . --config Release -- /m /nodeReuse:false "/consoleloggerparameters:ShowCommandLine;Verbosity=minimal"
-        if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
-        & .\testxgboost.exe
-        if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
+      - uses: actions/checkout@v4
+        with:
+          submodules: "true"
+      - name: Build and run gtest
+        shell: powershell
+        run: |
+          nvidia-smi
+          nvcc --version
+          git clone https://github.com/NVIDIA/cccl.git -b v2.6.1 --quiet
+          mkdir build
+          cd build
+          cmake .. -G"Visual Studio 17 2022" -A x64 -DUSE_CUDA=ON -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DBUILD_DEPRECATED_CLI=ON -DCMAKE_CUDA_ARCHITECTURES=75 -DCMAKE_PREFIX_PATH="$(Get-Location)/../cccl"
+          if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
+          cmake --build . --config Release -- /m /nodeReuse:false "/consoleloggerparameters:ShowCommandLine;Verbosity=minimal"
+          if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
+          & .\testxgboost.exe
+          if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
diff --git a/ops/docker_build.py b/ops/docker_build.py
index a7276cd65b76..dd2871c3a6ed 100644
--- a/ops/docker_build.py
+++ b/ops/docker_build.py
@@ -63,7 +63,7 @@ def docker_build(
             str(docker_context_path),
         ]
     )
-    cli_args = ["docker", "buildx", "build"] + docker_build_cli_args
+    cli_args = ["docker", "build"] + docker_build_cli_args
     fancy_print_cli_args(cli_args)
     subprocess.run(cli_args, check=True, encoding="utf-8")
diff --git a/ops/docker_build.sh b/ops/docker_build.sh
new file mode 100755
index 000000000000..c8c0680aea05
--- /dev/null
+++ b/ops/docker_build.sh
@@ -0,0 +1,141 @@
+#!/bin/bash
+## Build a CI container and cache the layers in AWS ECR (Elastic Container Registry).
+## This script provides a convenient wrapper for ops/docker_build.py.
+## Build-time variables (--build-arg) and container definition are fetched from
+## ops/matrix/ci_container.yml.
+##
+## Note. This script takes in all inputs via environment variables.
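+##
+## Example invocation (values are illustrative; any container ID from
+## ops/matrix/ci_container.yml may be used):
+##   CONTAINER_ID=xgb-ci.gpu BRANCH_NAME=master USE_DOCKER_CACHE=1 \
+##     bash ops/docker_build.sh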
+ +INPUT_DOC=$( +cat <<-EOF +Inputs + - CONTAINER_ID: String ID uniquely identifying the container (Required) + - BRANCH_NAME: Name of the current git branch or pull request (Required) + - USE_DOCKER_CACHE: If set to 1, enable caching +EOF +) + +ECR_LIFECYCLE_RULE=$( +cat <<-EOF +{ + "rules": [ + { + "rulePriority": 1, + "selection": { + "tagStatus": "any", + "countType": "sinceImagePushed", + "countUnit": "days", + "countNumber": 30 + }, + "action": { + "type": "expire" + } + } + ] +} +EOF +) + +set -euo pipefail + +for arg in "CONTAINER_ID" "BRANCH_NAME" +do + if [[ -z "${!arg:-}" ]] + then + echo -e "Error: $arg must be set.\n${INPUT_DOC}" + exit 1 + fi +done + +# Fetch CONTAINER_DEF and BUILD_ARGS +source <(ops/matrix/extract_build_args.sh ${CONTAINER_ID} | tee /dev/stderr) 2>&1 + +if [[ "${USE_DOCKER_CACHE:-}" != "1" ]] # Any value other than 1 is considered false +then + USE_DOCKER_CACHE=0 +fi + +if [[ ${USE_DOCKER_CACHE} -eq 0 ]] +then + echo "USE_DOCKER_CACHE not set; caching disabled" +else + DOCKER_CACHE_ECR_ID=$(yq ".DOCKER_CACHE_ECR_ID" ops/matrix/docker_cache_ecr.yml) + DOCKER_CACHE_ECR_REGION=$(yq ".DOCKER_CACHE_ECR_REGION" ops/matrix/docker_cache_ecr.yml) + DOCKER_CACHE_REPO="${DOCKER_CACHE_ECR_ID}.dkr.ecr.${DOCKER_CACHE_ECR_REGION}.amazonaws.com" + echo "Using AWS ECR; repo URL = ${DOCKER_CACHE_REPO}" + # Login for Docker registry + echo "aws ecr get-login-password --region ${DOCKER_CACHE_ECR_REGION} |" \ + "docker login --username AWS --password-stdin ${DOCKER_CACHE_REPO}" + aws ecr get-login-password --region ${DOCKER_CACHE_ECR_REGION} \ + | docker login --username AWS --password-stdin ${DOCKER_CACHE_REPO} +fi + +# Pull pre-built container from the cache +# First try locating one for the particular branch or pull request +CACHE_FROM_CMD="" +IS_CACHED=0 +if [[ ${USE_DOCKER_CACHE} -eq 1 ]] +then + DOCKER_TAG="${BRANCH_NAME//\//-}" # Slashes are not allowed in Docker tag + DOCKER_URL="${DOCKER_CACHE_REPO}/${CONTAINER_ID}:${DOCKER_TAG}" + echo "docker pull --quiet ${DOCKER_URL}" + if time docker pull --quiet "${DOCKER_URL}" + then + echo "Found a cached container for the branch ${BRANCH_NAME}: ${DOCKER_URL}" + IS_CACHED=1 + else + # If there's no pre-built container from the cache, + # use the pre-built container from the master branch. + DOCKER_URL="${DOCKER_CACHE_REPO}/${CONTAINER_ID}:master" + echo "Could not find a cached container for the branch ${BRANCH_NAME}." \ + "Using a cached container from the master branch: ${DOCKER_URL}" + echo "docker pull --quiet ${DOCKER_URL}" + if time docker pull --quiet "${DOCKER_URL}" + then + IS_CACHED=1 + else + echo "Could not find a cached container for the master branch either." 
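+      # No cache hit anywhere: fall back to a full, uncached build below.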
+ IS_CACHED=0 + fi + fi + if [[ $IS_CACHED -eq 1 ]] + then + CACHE_FROM_CMD="--cache-from type=registry,ref=${DOCKER_URL}" + fi +fi + +# Run Docker build +set -x +python3 ops/docker_build.py \ + --container-def ${CONTAINER_DEF} \ + --container-id ${CONTAINER_ID} \ + ${BUILD_ARGS} \ + --cache-to type=inline \ + ${CACHE_FROM_CMD} +set +x + +# Now cache the new container +if [[ ${USE_DOCKER_CACHE} -eq 1 ]] +then + DOCKER_URL="${DOCKER_CACHE_REPO}/${CONTAINER_ID}:${DOCKER_TAG}" + echo "docker tag ${CONTAINER_ID} ${DOCKER_URL}" + docker tag "${CONTAINER_ID}" "${DOCKER_URL}" + + # Attempt to create Docker repository; it will fail if the repository already exists + echo "aws ecr create-repository --repository-name ${CONTAINER_ID} --region ${DOCKER_CACHE_ECR_REGION}" + if aws ecr create-repository --repository-name ${CONTAINER_ID} --region ${DOCKER_CACHE_ECR_REGION} + then + # Repository was created. Now set expiration policy + echo "aws ecr put-lifecycle-policy --repository-name ${CONTAINER_ID}" \ + "--region ${DOCKER_CACHE_ECR_REGION} --lifecycle-policy-text file:///dev/stdin" + echo "${ECR_LIFECYCLE_RULE}" | aws ecr put-lifecycle-policy --repository-name ${CONTAINER_ID} \ + --region ${DOCKER_CACHE_ECR_REGION} --lifecycle-policy-text file:///dev/stdin + fi + + echo "docker push --quiet ${DOCKER_URL}" + if ! time docker push --quiet "${DOCKER_URL}" + then + echo "ERROR: could not update Docker cache ${DOCKER_URL}" + exit 1 + fi +fi diff --git a/ops/matrix/ci_container.yml b/ops/matrix/ci_container.yml new file mode 100644 index 000000000000..e01431b463a5 --- /dev/null +++ b/ops/matrix/ci_container.yml @@ -0,0 +1,18 @@ +## List of CI containers with definitions and build arguments + +# Each container will be built using the definition from +# ops/docker/Dockerfile.CONTAINER_DEF + +xgb-ci.gpu_build_rockylinux8: + container_def: gpu_build_rockylinux8 + build_args: + CUDA_VERSION_ARG: "12.5.1" + NCCL_VERSION_ARG: "2.22.3-1" + RAPIDS_VERSION_ARG: "24.10" + +xgb-ci.gpu: + container_def: gpu + build_args: + CUDA_VERSION_ARG: "12.5.1" + NCCL_VERSION_ARG: "2.22.3-1" + RAPIDS_VERSION_ARG: "24.10" diff --git a/ops/matrix/docker_cache_ecr.yml b/ops/matrix/docker_cache_ecr.yml new file mode 100644 index 000000000000..e20f35fc8020 --- /dev/null +++ b/ops/matrix/docker_cache_ecr.yml @@ -0,0 +1,4 @@ +## Constants for AWS ECR (Elastic Container Registry), used for the Docker cache + +DOCKER_CACHE_ECR_ID: "492475357299" +DOCKER_CACHE_ECR_REGION: "us-west-2" diff --git a/ops/matrix/extract_build_args.jq b/ops/matrix/extract_build_args.jq new file mode 100644 index 000000000000..0453e2a7c081 --- /dev/null +++ b/ops/matrix/extract_build_args.jq @@ -0,0 +1,6 @@ +def compute_build_args($input; $container_id): + $input | + .[$container_id].build_args | + to_entries | + map("--build-arg " + .key + "=" + .value) | + join(" "); diff --git a/ops/matrix/extract_build_args.sh b/ops/matrix/extract_build_args.sh new file mode 100755 index 000000000000..ec4621bc42b2 --- /dev/null +++ b/ops/matrix/extract_build_args.sh @@ -0,0 +1,21 @@ +#!/bin/bash +## Extract container definition and build args from ops/matrix/ci_container.yml, +## given the container ID. 
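+## Example (for the xgb-ci.gpu entry defined in ci_container.yml, this should
+## print):
+##   $ ops/matrix/extract_build_args.sh xgb-ci.gpu
+##   CONTAINER_DEF='gpu' BUILD_ARGS='--build-arg CUDA_VERSION_ARG=12.5.1 --build-arg NCCL_VERSION_ARG=2.22.3-1 --build-arg RAPIDS_VERSION_ARG=24.10'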
+ +if [ "$#" -ne 1 ]; then + echo "Usage: $0 [container_id]" + exit 1 +fi + +CONTAINER_ID="$1" +CONTAINER_DEF=$( + yq -o json ops/matrix/ci_container.yml | + jq -r --arg container_id "${CONTAINER_ID}" '.[$container_id].container_def' +) +BUILD_ARGS=$( + yq -o json ops/matrix/ci_container.yml | + jq -r --arg container_id "${CONTAINER_ID}" \ + 'include "ops/matrix/extract_build_args"; + compute_build_args(.; $container_id)' +) +echo "CONTAINER_DEF='${CONTAINER_DEF}' BUILD_ARGS='${BUILD_ARGS}'" diff --git a/ops/packer/linux/bootstrap.sh b/ops/packer/linux/bootstrap.sh index fac5c20f7146..9dbda19c3baa 100644 --- a/ops/packer/linux/bootstrap.sh +++ b/ops/packer/linux/bootstrap.sh @@ -45,3 +45,8 @@ sudo systemctl restart docker wget -nv https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip -O awscliv2.zip unzip awscliv2.zip sudo ./aws/install + +## Install jq and yq +sudo apt update && sudo apt install jq +wget -nv https://github.com/mikefarah/yq/releases/download/v4.44.3/yq_linux_amd64.tar.gz -O - | \ + tar xz && sudo mv ./yq_linux_amd64 /usr/bin/yq From 5373276a364037d32a7eb1f7df98d3473431bb29 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Fri, 25 Oct 2024 16:32:48 -0700 Subject: [PATCH 10/86] Migrate some tasks from BuildKite --- .github/runs-on.yml | 5 +- .github/workflows/main.yml | 219 +++++++++++++++--- .../Dockerfile.jvm_manylinux2014_aarch64 | 17 -- .../Dockerfile.jvm_manylinux2014_x86_64 | 17 -- ops/docker/Dockerfile.manylinux2014_aarch64 | 2 + ops/docker/Dockerfile.manylinux2014_x86_64 | 2 + .../docker}/conda_env/aarch64_test.yml | 0 .../docker}/conda_env/cpp_test.yml | 0 .../docker}/conda_env/jvm_tests.yml | 0 .../docker}/conda_env/linux_cpu_test.yml | 0 .../docker}/conda_env/linux_sycl_test.yml | 0 .../docker}/conda_env/macos_cpu_test.yml | 0 .../docker}/conda_env/python_lint.yml | 0 .../docker}/conda_env/sdist_test.yml | 0 .../docker}/conda_env/win64_cpu_test.yml | 0 .../docker}/conda_env/win64_test.yml | 0 ops/docker_run.py | 2 +- {tests/ci_build => ops}/format_wheel_meta.py | 3 +- ops/matrix/ci_container.yml | 20 ++ ops/matrix/extract_build_args.jq | 4 +- {tests/ci_build => ops}/rename_whl.py | 0 ops/stash_artifacts.sh | 66 ++++++ .../buildkite => ops/task}/build-cpu-arm64.sh | 44 ++-- ops/task/build-cpu.sh | 43 ++++ ops/task/build-cuda-with-rmm.sh | 65 ++++++ {tests/buildkite => ops/task}/build-cuda.sh | 64 ++--- .../task/build-jvm-manylinux2014.sh | 18 +- .../task}/build-manylinux2014.sh | 46 ++-- .../task/build-via-cmake.sh | 17 +- ops/task/enforce-ci.sh | 42 ++++ .../task/patches}/cpu_only_pypkg.patch | 0 .../task/patches}/manylinux2014_warning.patch | 0 .../task/patches}/remove_nccl_dep.patch | 0 ops/task/run-clang-tidy.sh | 11 + tests/buildkite/build-cpu.sh | 34 --- tests/buildkite/build-cuda-with-rmm.sh | 65 ------ .../build-jvm-linux-x86_64-manylinux2014.sh | 29 --- tests/buildkite/conftest.sh | 64 ----- tests/buildkite/run-clang-tidy.sh | 11 - 39 files changed, 556 insertions(+), 354 deletions(-) delete mode 100644 ops/docker/Dockerfile.jvm_manylinux2014_aarch64 delete mode 100644 ops/docker/Dockerfile.jvm_manylinux2014_x86_64 rename {tests/ci_build => ops/docker}/conda_env/aarch64_test.yml (100%) rename {tests/ci_build => ops/docker}/conda_env/cpp_test.yml (100%) rename {tests/ci_build => ops/docker}/conda_env/jvm_tests.yml (100%) rename {tests/ci_build => ops/docker}/conda_env/linux_cpu_test.yml (100%) rename {tests/ci_build => ops/docker}/conda_env/linux_sycl_test.yml (100%) rename {tests/ci_build => ops/docker}/conda_env/macos_cpu_test.yml (100%) rename 
{tests/ci_build => ops/docker}/conda_env/python_lint.yml (100%) rename {tests/ci_build => ops/docker}/conda_env/sdist_test.yml (100%) rename {tests/ci_build => ops/docker}/conda_env/win64_cpu_test.yml (100%) rename {tests/ci_build => ops/docker}/conda_env/win64_test.yml (100%) rename {tests/ci_build => ops}/format_wheel_meta.py (96%) rename {tests/ci_build => ops}/rename_whl.py (100%) create mode 100755 ops/stash_artifacts.sh rename {tests/buildkite => ops/task}/build-cpu-arm64.sh (54%) create mode 100755 ops/task/build-cpu.sh create mode 100755 ops/task/build-cuda-with-rmm.sh rename {tests/buildkite => ops/task}/build-cuda.sh (50%) rename tests/buildkite/build-jvm-linux-arm64-manylinux2014.sh => ops/task/build-jvm-manylinux2014.sh (69%) rename {tests/buildkite => ops/task}/build-manylinux2014.sh (60%) rename tests/ci_build/build_via_cmake.sh => ops/task/build-via-cmake.sh (70%) create mode 100755 ops/task/enforce-ci.sh rename {tests/buildkite => ops/task/patches}/cpu_only_pypkg.patch (100%) rename {tests/buildkite => ops/task/patches}/manylinux2014_warning.patch (100%) rename {tests/buildkite => ops/task/patches}/remove_nccl_dep.patch (100%) create mode 100755 ops/task/run-clang-tidy.sh delete mode 100755 tests/buildkite/build-cpu.sh delete mode 100755 tests/buildkite/build-cuda-with-rmm.sh delete mode 100644 tests/buildkite/build-jvm-linux-x86_64-manylinux2014.sh delete mode 100755 tests/buildkite/conftest.sh delete mode 100755 tests/buildkite/run-clang-tidy.sh diff --git a/.github/runs-on.yml b/.github/runs-on.yml index 6ae28d1e9c6b..720ba76bb836 100644 --- a/.github/runs-on.yml +++ b/.github/runs-on.yml @@ -14,7 +14,7 @@ images: runners: linux-amd64-cpu: - cpu: 16 + cpu: 32 family: ["c7i-flex", "c7i", "c7a", "c5", "c5a"] image: linux-amd64 linux-amd64-gpu: @@ -24,12 +24,13 @@ runners: family: ["g4dn.12xlarge"] image: linux-amd64 linux-arm64-cpu: + cpu: 32 family: ["c6g", "c7g"] image: ubuntu24-full-arm64 windows-gpu: family: ["g4dn.2xlarge"] image: windows-amd64 windows-cpu: - cpu: 16 + cpu: 32 family: ["c7i-flex", "c7i", "c7a", "c5", "c5a"] image: windows-amd64 diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 18f997e5c52e..84967f0684a2 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -9,17 +9,31 @@ concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} cancel-in-progress: true +env: + BRANCH_NAME: >- + ${{ github.event.pull_request.number && 'PR-' }}${{ github.event.pull_request.number || github.ref_name }} + USE_DOCKER_CACHE: 1 + ARTIFACT_STASH_PREFIX: cache/${{ github.repository }}/stash/${{ github.run_id }} + jobs: build-containers: name: Build CI containers runs-on: - runs-on=${{ github.run_id }} - - runner=linux-amd64-cpu + - runner=${{ matrix.runner }} strategy: matrix: container_id: - xgb-ci.gpu_build_rockylinux8 - xgb-ci.gpu + - xgb-ci.cpu + - xgb-ci.clang_tidy + - xgb-ci.manylinux_2_28_x86_64 + - xgb-ci.manylinux2014_x86_64 + runner: [linux-amd64-cpu] + include: + - container_id: xgb-ci.manylinux2014_aarch64 + runner: linux-arm64-cpu steps: # Restart Docker daemon so that it recognized the ephemeral disks - run: sudo systemctl restart docker @@ -30,16 +44,80 @@ jobs: run: bash ops/docker_build.sh env: CONTAINER_ID: ${{ matrix.container_id }} - BRANCH_NAME: ${{ github.event.pull_request.number || github.ref_name }} - USE_DOCKER_CACHE: 1 - build-gpu: - name: Build GPU + + clang-tidy: + name: Run clang-tidy needs: build-containers runs-on: - runs-on=${{ github.run_id }} - 
runner=linux-amd64-cpu steps: - # Restart Docker daemon so that it recognized the ephemeral disks + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4 + with: + submodules: "true" + - name: Fetch container from cache + run: bash ops/docker_build.sh + env: + CONTAINER_ID: xgb-ci.clang_tidy + - run: bash ops/task/run-clang-tidy.sh + + build-cpu: + name: Build CPU + needs: build-containers + runs-on: + - runs-on=${{ github.run_id }} + - runner=linux-amd64-cpu + steps: + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4 + with: + submodules: "true" + - name: Fetch container from cache + run: bash ops/docker_build.sh + env: + CONTAINER_ID: xgb-ci.cpu + - run: bash ops/task/build-cpu.sh + - name: Stash CLI executable + run: bash ops/stash_artifacts.sh ./xgboost + env: + COMMAND: upload + S3_BUCKET: ${{ env.RUNS_ON_S3_BUCKET_CACHE }} + PREFIX: ${{ env.ARTIFACT_STASH_PREFIX }}/build-cpu + + build-cpu-arm64: + name: Build CPU ARM64 + manylinux_2_28_aarch64 wheel + runs-on: + - runs-on=${{ github.run_id }} + - runner=linux-arm64-cpu + steps: + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4 + with: + submodules: "true" + - name: Fetch container from cache + run: bash ops/docker_build.sh + env: + CONTAINER_ID: xgb-ci.aarch64 + - run: bash ops/task/build-cpu-arm64.sh + - name: Stash files + run: bash ops/stash_artifacts.sh ./xgboost python-package/dist/*.whl + env: + COMMAND: upload + S3_BUCKET: ${{ env.RUNS_ON_S3_BUCKET_CACHE }} + PREFIX: ${{ env.ARTIFACT_STASH_PREFIX }}/build-cpu-arm64 + + build-cuda: + name: Build CUDA + manylinux_2_28_x86_64 wheel + needs: build-containers + runs-on: + - runs-on=${{ github.run_id }} + - runner=linux-amd64-cpu + steps: + # Restart Docker daemon so that it recognizes the ephemeral disks - run: sudo systemctl restart docker - uses: actions/checkout@v4 with: @@ -48,31 +126,100 @@ jobs: run: bash ops/docker_build.sh env: CONTAINER_ID: xgb-ci.gpu_build_rockylinux8 - BRANCH_NAME: ${{ github.event.pull_request.number || github.ref_name }} - USE_DOCKER_CACHE: 1 - - name: Build gtest - run: | - git clone https://github.com/NVIDIA/cccl.git -b v2.6.1 --quiet - python3 ops/docker_run.py \ - --container-id xgb-ci.gpu_build_rockylinux8 \ - -- tests/ci_build/build_via_cmake.sh \ - -DCMAKE_PREFIX_PATH="/opt/grpc;/workspace/cccl" \ - -DUSE_CUDA=ON \ - -DUSE_OPENMP=ON \ - -DHIDE_CXX_SYMBOLS=ON \ - -DPLUGIN_FEDERATED=ON \ - -DUSE_NCCL=ON \ - -DUSE_NCCL_LIB_PATH=ON \ - -DNCCL_INCLUDE_DIR=/usr/include \ - -DUSE_DLOPEN_NCCL=ON \ - -DGPU_COMPUTE_VER=75 - - name: Stash testxgboost - run: | - aws s3 cp ./build/testxgboost \ - s3://${{ env.RUNS_ON_S3_BUCKET_CACHE }}/cache/${{ github.repository }}/stash/${{ github.run_id }}/testxgboost + - name: Fetch container from cache + run: bash ops/docker_build.sh + env: + CONTAINER_ID: xgb-ci.manylinux_2_28_x86_64 + - run: bash ops/task/build-cuda.sh + - name: Stash files + run: bash ops/stash_artifacts.sh build/testxgboost python-package/dist/*.whl + env: + COMMAND: upload + S3_BUCKET: ${{ env.RUNS_ON_S3_BUCKET_CACHE }} + PREFIX: ${{ env.ARTIFACT_STASH_PREFIX }}/build-cuda + + build-cuda-with-rmm: + name: Build CUDA with RMM + needs: build-containers + runs-on: + - runs-on=${{ github.run_id }} + - runner=linux-amd64-cpu + steps: + # Restart Docker daemon so that it recognizes the ephemeral 
disks
+      - run: sudo systemctl restart docker
       - uses: actions/checkout@v4
         with:
@@ -48,31 +126,100 @@
         run: bash ops/docker_build.sh
         env:
           CONTAINER_ID: xgb-ci.gpu_build_rockylinux8
-          BRANCH_NAME: ${{ github.event.pull_request.number || github.ref_name }}
-          USE_DOCKER_CACHE: 1
-      - name: Build gtest
-        run: |
-          git clone https://github.com/NVIDIA/cccl.git -b v2.6.1 --quiet
-          python3 ops/docker_run.py \
-            --container-id xgb-ci.gpu_build_rockylinux8 \
-            -- tests/ci_build/build_via_cmake.sh \
-            -DCMAKE_PREFIX_PATH="/opt/grpc;/workspace/cccl" \
-            -DUSE_CUDA=ON \
-            -DUSE_OPENMP=ON \
-            -DHIDE_CXX_SYMBOLS=ON \
-            -DPLUGIN_FEDERATED=ON \
-            -DUSE_NCCL=ON \
-            -DUSE_NCCL_LIB_PATH=ON \
-            -DNCCL_INCLUDE_DIR=/usr/include \
-            -DUSE_DLOPEN_NCCL=ON \
-            -DGPU_COMPUTE_VER=75
-      - name: Stash testxgboost
-        run: |
-          aws s3 cp ./build/testxgboost \
-            s3://${{ env.RUNS_ON_S3_BUCKET_CACHE }}/cache/${{ github.repository }}/stash/${{ github.run_id }}/testxgboost
+      - name: Fetch container from cache
+        run: bash ops/docker_build.sh
+        env:
+          CONTAINER_ID: xgb-ci.manylinux_2_28_x86_64
+      - run: bash ops/task/build-cuda.sh
+      - name: Stash files
+        run: bash ops/stash_artifacts.sh build/testxgboost python-package/dist/*.whl
+        env:
+          COMMAND: upload
+          S3_BUCKET: ${{ env.RUNS_ON_S3_BUCKET_CACHE }}
+          PREFIX: ${{ env.ARTIFACT_STASH_PREFIX }}/build-cuda
+
+  build-cuda-with-rmm:
+    name: Build CUDA with RMM
+    needs: build-containers
+    runs-on:
+      - runs-on=${{ github.run_id }}
+      - runner=linux-amd64-cpu
+    steps:
+      # Restart Docker daemon so that it recognizes the ephemeral disks
+      - run: sudo systemctl restart docker
+      - uses: actions/checkout@v4
+        with:
+          submodules: "true"
+      - name: Fetch container from cache
+        run: bash ops/docker_build.sh
+        env:
+          CONTAINER_ID: xgb-ci.gpu_build_rockylinux8
+      - name: Fetch container from cache
+        run: bash ops/docker_build.sh
+        env:
+          CONTAINER_ID: xgb-ci.manylinux_2_28_x86_64
+      - run: bash ops/task/build-cuda-with-rmm.sh
+      - name: Stash files
+        run: bash ops/stash_artifacts.sh build/testxgboost python-package/dist/*.whl
+        env:
+          COMMAND: upload
+          S3_BUCKET: ${{ env.RUNS_ON_S3_BUCKET_CACHE }}
+          PREFIX: ${{ env.ARTIFACT_STASH_PREFIX }}/build-cuda-with-rmm
+
+  build-jvm-manylinux2014:
+    name: Build libxgboost4j.so targeting glibc 2.17
+    needs: build-containers
+    runs-on:
+      - runs-on=${{ github.run_id }}
+      - runner=${{ matrix.runner }}
+    strategy:
+      matrix:
+        include:
+          - arch: aarch64
+            runner: linux-arm64-cpu
+          - arch: x86_64
+            runner: linux-amd64-cpu
+
+    steps:
+      # Restart Docker daemon so that it recognizes the ephemeral disks
+      - run: sudo systemctl restart docker
+      - uses: actions/checkout@v4
+        with:
+          submodules: "true"
+      - name: Fetch container from cache
+        run: bash ops/docker_build.sh
+        env:
+          CONTAINER_ID: xgb-ci.manylinux2014_${{ matrix.arch }}
+      - run: bash ops/task/build-jvm-manylinux2014.sh ${{ matrix.arch }}
+
+  build-manylinux2014:
+    name: Build manylinux2014_${{ matrix.arch }} wheel
+    needs: build-containers
+    runs-on:
+      - runs-on=${{ github.run_id }}
+      - runner=${{ matrix.runner }}
+    strategy:
+      matrix:
+        include:
+          - arch: aarch64
+            runner: linux-arm64-cpu
+          - arch: x86_64
+            runner: linux-amd64-cpu
+    steps:
+      # Restart Docker daemon so that it recognizes the ephemeral disks
+      - run: sudo systemctl restart docker
+      - uses: actions/checkout@v4
+        with:
+          submodules: "true"
+      - name: Fetch container from cache
+        run: bash ops/docker_build.sh
+        env:
+          CONTAINER_ID: xgb-ci.manylinux2014_${{ matrix.arch }}
+      - run: bash ops/task/build-manylinux2014.sh ${{ matrix.arch }}
+
   test-gpu:
     name: Test GPU
-    needs: build-gpu
+    needs: build-cuda
     runs-on:
       - runs-on=${{ github.run_id }}
       - runner=linux-amd64-gpu
@@ -82,18 +229,18 @@ jobs:
       - uses: actions/checkout@v4
         with:
           submodules: "true"
-      - name: Unstash testxgboost
+      - name: Unstash gtest executable
         run: |
-          aws s3 cp \
-            s3://${{ env.RUNS_ON_S3_BUCKET_CACHE }}/cache/${{ github.repository }}/stash/${{ github.run_id }}/testxgboost \
-            ./testxgboost
-          chmod +x testxgboost
+          bash ops/stash_artifacts.sh ./testxgboost
+          chmod +x ./testxgboost
+        env:
+          COMMAND: download
+          S3_BUCKET: ${{ env.RUNS_ON_S3_BUCKET_CACHE }}
+          PREFIX: ${{ env.ARTIFACT_STASH_PREFIX }}/build-cuda
       - name: Fetch container from cache
         run: bash ops/docker_build.sh
         env:
           CONTAINER_ID: xgb-ci.gpu
-          BRANCH_NAME: ${{ github.event.pull_request.number || github.ref_name }}
-          USE_DOCKER_CACHE: 1
       - name: Run gtest
         run: |
           nvidia-smi
diff --git a/ops/docker/Dockerfile.jvm_manylinux2014_aarch64 b/ops/docker/Dockerfile.jvm_manylinux2014_aarch64
deleted file mode 100644
index 52baff43bb6f..000000000000
--- a/ops/docker/Dockerfile.jvm_manylinux2014_aarch64
+++ /dev/null
@@ -1,17 +0,0 @@
-FROM quay.io/pypa/manylinux2014_aarch64
-
-RUN yum update -y && yum install -y java-1.8.0-openjdk-devel
-
-# Install lightweight sudo (not bound to TTY)
-ENV GOSU_VERSION=1.10
-RUN set -ex; \
-    curl -o /usr/local/bin/gosu -L "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-arm64" && \
-    chmod +x /usr/local/bin/gosu && \
-    gosu nobody true
-
-# Default entry-point to use if running locally
-# It will preserve 
attributes of created files -COPY entrypoint.sh /scripts/ - -WORKDIR /workspace -ENTRYPOINT ["/scripts/entrypoint.sh"] diff --git a/ops/docker/Dockerfile.jvm_manylinux2014_x86_64 b/ops/docker/Dockerfile.jvm_manylinux2014_x86_64 deleted file mode 100644 index 578b85618776..000000000000 --- a/ops/docker/Dockerfile.jvm_manylinux2014_x86_64 +++ /dev/null @@ -1,17 +0,0 @@ -FROM quay.io/pypa/manylinux2014_x86_64 - -RUN yum update -y && yum install -y java-1.8.0-openjdk-devel ninja-build - -# Install lightweight sudo (not bound to TTY) -ENV GOSU_VERSION=1.10 -RUN set -ex; \ - curl -o /usr/local/bin/gosu -L "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \ - chmod +x /usr/local/bin/gosu && \ - gosu nobody true - -# Default entry-point to use if running locally -# It will preserve attributes of created files -COPY entrypoint.sh /scripts/ - -WORKDIR /workspace -ENTRYPOINT ["/scripts/entrypoint.sh"] diff --git a/ops/docker/Dockerfile.manylinux2014_aarch64 b/ops/docker/Dockerfile.manylinux2014_aarch64 index 9627e15c64a0..52baff43bb6f 100644 --- a/ops/docker/Dockerfile.manylinux2014_aarch64 +++ b/ops/docker/Dockerfile.manylinux2014_aarch64 @@ -1,5 +1,7 @@ FROM quay.io/pypa/manylinux2014_aarch64 +RUN yum update -y && yum install -y java-1.8.0-openjdk-devel + # Install lightweight sudo (not bound to TTY) ENV GOSU_VERSION=1.10 RUN set -ex; \ diff --git a/ops/docker/Dockerfile.manylinux2014_x86_64 b/ops/docker/Dockerfile.manylinux2014_x86_64 index 11beb116ee43..fdfcbd277360 100644 --- a/ops/docker/Dockerfile.manylinux2014_x86_64 +++ b/ops/docker/Dockerfile.manylinux2014_x86_64 @@ -1,5 +1,7 @@ FROM quay.io/pypa/manylinux2014_x86_64 +RUN yum update -y && yum install -y java-1.8.0-openjdk-devel + # Install lightweight sudo (not bound to TTY) ENV GOSU_VERSION=1.10 RUN set -ex; \ diff --git a/tests/ci_build/conda_env/aarch64_test.yml b/ops/docker/conda_env/aarch64_test.yml similarity index 100% rename from tests/ci_build/conda_env/aarch64_test.yml rename to ops/docker/conda_env/aarch64_test.yml diff --git a/tests/ci_build/conda_env/cpp_test.yml b/ops/docker/conda_env/cpp_test.yml similarity index 100% rename from tests/ci_build/conda_env/cpp_test.yml rename to ops/docker/conda_env/cpp_test.yml diff --git a/tests/ci_build/conda_env/jvm_tests.yml b/ops/docker/conda_env/jvm_tests.yml similarity index 100% rename from tests/ci_build/conda_env/jvm_tests.yml rename to ops/docker/conda_env/jvm_tests.yml diff --git a/tests/ci_build/conda_env/linux_cpu_test.yml b/ops/docker/conda_env/linux_cpu_test.yml similarity index 100% rename from tests/ci_build/conda_env/linux_cpu_test.yml rename to ops/docker/conda_env/linux_cpu_test.yml diff --git a/tests/ci_build/conda_env/linux_sycl_test.yml b/ops/docker/conda_env/linux_sycl_test.yml similarity index 100% rename from tests/ci_build/conda_env/linux_sycl_test.yml rename to ops/docker/conda_env/linux_sycl_test.yml diff --git a/tests/ci_build/conda_env/macos_cpu_test.yml b/ops/docker/conda_env/macos_cpu_test.yml similarity index 100% rename from tests/ci_build/conda_env/macos_cpu_test.yml rename to ops/docker/conda_env/macos_cpu_test.yml diff --git a/tests/ci_build/conda_env/python_lint.yml b/ops/docker/conda_env/python_lint.yml similarity index 100% rename from tests/ci_build/conda_env/python_lint.yml rename to ops/docker/conda_env/python_lint.yml diff --git a/tests/ci_build/conda_env/sdist_test.yml b/ops/docker/conda_env/sdist_test.yml similarity index 100% rename from tests/ci_build/conda_env/sdist_test.yml rename to 
ops/docker/conda_env/sdist_test.yml diff --git a/tests/ci_build/conda_env/win64_cpu_test.yml b/ops/docker/conda_env/win64_cpu_test.yml similarity index 100% rename from tests/ci_build/conda_env/win64_cpu_test.yml rename to ops/docker/conda_env/win64_cpu_test.yml diff --git a/tests/ci_build/conda_env/win64_test.yml b/ops/docker/conda_env/win64_test.yml similarity index 100% rename from tests/ci_build/conda_env/win64_test.yml rename to ops/docker/conda_env/win64_test.yml diff --git a/ops/docker_run.py b/ops/docker_run.py index 4fc6356a90a1..161c81b477b0 100644 --- a/ops/docker_run.py +++ b/ops/docker_run.py @@ -25,7 +25,7 @@ def parse_run_args(raw_run_args: str) -> list[str]: - return [x for x in raw_run_args.split(" ") if x] + return [x for x in raw_run_args.split() if x] def compute_container_id(container_name: str, build_args: list[dict[str, str]]) -> str: diff --git a/tests/ci_build/format_wheel_meta.py b/ops/format_wheel_meta.py similarity index 96% rename from tests/ci_build/format_wheel_meta.py rename to ops/format_wheel_meta.py index 9e7bad907687..570f7854cf62 100644 --- a/tests/ci_build/format_wheel_meta.py +++ b/ops/format_wheel_meta.py @@ -2,6 +2,7 @@ Script to generate meta.json to store metadata for a nightly build of XGBoost Python package. """ + import json import pathlib from argparse import ArgumentParser @@ -13,7 +14,7 @@ def main(args): raise ValueError(f"Wheel cannot be found at path {wheel_path}") if not wheel_path.is_file(): raise ValueError(f"Path {wheel_path} is not a valid file") - wheel_dir, wheel_name = wheel_path.parent, wheel_path.name + wheel_name = wheel_path.name meta_path = pathlib.Path(args.meta_path) if not meta_path.exists(): diff --git a/ops/matrix/ci_container.yml b/ops/matrix/ci_container.yml index e01431b463a5..d57d63d99e5c 100644 --- a/ops/matrix/ci_container.yml +++ b/ops/matrix/ci_container.yml @@ -16,3 +16,23 @@ xgb-ci.gpu: CUDA_VERSION_ARG: "12.5.1" NCCL_VERSION_ARG: "2.22.3-1" RAPIDS_VERSION_ARG: "24.10" + +xgb-ci.cpu: + container_def: cpu + +xgb-ci.clang_tidy: + container_def: clang_tidy + build_args: + CUDA_VERSION_ARG: "12.5.1" + +xgb-ci.aarch64: + container_def: aarch64 + +xgb-ci.manylinux_2_28_x86_64: + container_def: manylinux_2_28_x86_64 + +xgb-ci.manylinux2014_x86_64: + container_def: manylinux2014_x86_64 + +xgb-ci.manylinux2014_aarch64: + container_def: manylinux2014_aarch64 diff --git a/ops/matrix/extract_build_args.jq b/ops/matrix/extract_build_args.jq index 0453e2a7c081..682b62cb63cb 100644 --- a/ops/matrix/extract_build_args.jq +++ b/ops/matrix/extract_build_args.jq @@ -1,6 +1,8 @@ def compute_build_args($input; $container_id): $input | - .[$container_id].build_args | + .[$container_id] | + select(.build_args != null) | + .build_args | to_entries | map("--build-arg " + .key + "=" + .value) | join(" "); diff --git a/tests/ci_build/rename_whl.py b/ops/rename_whl.py similarity index 100% rename from tests/ci_build/rename_whl.py rename to ops/rename_whl.py diff --git a/ops/stash_artifacts.sh b/ops/stash_artifacts.sh new file mode 100755 index 000000000000..f091af3cf50b --- /dev/null +++ b/ops/stash_artifacts.sh @@ -0,0 +1,66 @@ +#!/bin/bash + +## Stash an artifact in an S3 bucket for later use +## +## Note. This script takes in all inputs via environment variables +## except the path to the artifact(s). + +set -euo pipefail + +ENV_VAR_DOC=$( +cat <<-EOF +Inputs + - COMMAND: Whether to upload or download the artifact. 
Either "upload" or + "download" + - S3_BUCKET: Name of the S3 bucket to store the artifact + - PREFIX: Where the artifact would be stored. The artifact will be stored + in s3://{S3_BUCKET}/{prefix}/. +EOF +) + +if [ "$#" -lt 1 ]; then + echo "Usage: $0 [artifact] [artifact ...]" + exit 1 +fi + +for arg in "COMMAND" "S3_BUCKET" "PREFIX" +do + if [[ -z "${!arg:-}" ]] + then + echo -e "Error: $arg must be set.\n${ENV_VAR_DOC}" + exit 1 + fi +done + +compute_s3_url() { # args: artifact + S3_URL="s3://${S3_BUCKET}/${PREFIX}/"$(basename "$1") +} + +aws_s3_cp() { # args: src, dest + set -x + aws s3 cp --no-progress "$1" "$2" + set +x + return 0 +} + +if [[ "$COMMAND" == "upload" ]] +then + echo "Uploading artifacts with prefix $PREFIX..." + for artifact in "$@" + do + compute_s3_url "${artifact}" + aws_s3_cp "${artifact}" "${S3_URL}" + done +elif [[ "$COMMAND" == "download" ]] +then + echo "Downloading artifacts with prefix $PREFIX..." + for artifact in "$@" + do + compute_s3_url "${artifact}" + aws_s3_cp "${S3_URL}" "${artifact}" + done +else + echo "Unrecognized command: $COMMAND" + exit 2 +fi + diff --git a/tests/buildkite/build-cpu-arm64.sh b/ops/task/build-cpu-arm64.sh similarity index 54% rename from tests/buildkite/build-cpu-arm64.sh rename to ops/task/build-cpu-arm64.sh index 8b3847ed58b9..4a8c96e0e941 100755 --- a/tests/buildkite/build-cpu-arm64.sh +++ b/ops/task/build-cpu-arm64.sh @@ -1,47 +1,55 @@ #!/bin/bash -set -euo pipefail +set -euox pipefail WHEEL_TAG=manylinux_2_28_aarch64 echo "--- Build CPU code targeting ARM64" -source tests/buildkite/conftest.sh - -command_wrapper="tests/ci_build/ci_build.sh aarch64" +source ops/task/enforce-ci.sh echo "--- Build libxgboost from the source" -$command_wrapper tests/ci_build/build_via_cmake.sh --conda-env=aarch64_test \ - -DUSE_OPENMP=ON -DHIDE_CXX_SYMBOL=ON +python3 ops/docker_run.py \ + --container-id xgb-ci.aarch64 \ + -- ops/task/build-via-cmake.sh \ + --conda-env=aarch64_test \ + -DUSE_OPENMP=ON \ + -DHIDE_CXX_SYMBOL=ON + echo "--- Run Google Test" -$command_wrapper bash -c "cd build && ctest --extra-verbose" +python3 ops/docker_run.py \ + --container-id xgb-ci.aarch64 \ + -- bash -c "cd build && ctest --extra-verbose" echo "--- Build binary wheel" -$command_wrapper bash -c \ +python3 ops/docker_run.py \ + --container-id xgb-ci.aarch64 \ + -- bash -c \ "cd python-package && rm -rf dist/* && pip wheel --no-deps -v . 
--wheel-dir dist/" -$command_wrapper python tests/ci_build/rename_whl.py \ +python3 ops/rename_whl.py \ --wheel-path python-package/dist/*.whl \ - --commit-hash ${BUILDKITE_COMMIT} \ + --commit-hash ${GITHUB_SHA} \ --platform-tag ${WHEEL_TAG} echo "--- Audit binary wheel to ensure it's compliant with ${WHEEL_TAG} standard" -$command_wrapper auditwheel repair --plat ${WHEEL_TAG} python-package/dist/*.whl -$command_wrapper python tests/ci_build/rename_whl.py \ +python3 ops/docker_run.py \ + --container-id xgb-ci.aarch64 \ + -- auditwheel repair --plat ${WHEEL_TAG} python-package/dist/*.whl +python3 ops/rename_whl.py \ --wheel-path wheelhouse/*.whl \ - --commit-hash ${BUILDKITE_COMMIT} \ + --commit-hash ${GITHUB_SHA} \ --platform-tag ${WHEEL_TAG} mv -v wheelhouse/*.whl python-package/dist/ + # Make sure that libgomp.so is vendored in the wheel -$command_wrapper bash -c \ +python3 ops/docker_run.py \ + --container-id xgb-ci.aarch64 \ + -- bash -c \ "unzip -l python-package/dist/*.whl | grep libgomp || exit -1" echo "--- Upload Python wheel" -buildkite-agent artifact upload "python-package/dist/*.whl" if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] then aws s3 cp python-package/dist/*.whl s3://xgboost-nightly-builds/${BRANCH_NAME}/ \ --acl public-read --no-progress fi - -echo "--- Stash XGBoost CLI executable" -buildkite-agent artifact upload ./xgboost diff --git a/ops/task/build-cpu.sh b/ops/task/build-cpu.sh new file mode 100755 index 000000000000..7f8c69cd43bf --- /dev/null +++ b/ops/task/build-cpu.sh @@ -0,0 +1,43 @@ +#!/bin/bash + +set -euox pipefail + +source ops/task/enforce-ci.sh + +echo "--- Build CPU code" + +# This step is not necessary, but here we include it, to ensure that +# DMLC_CORE_USE_CMAKE flag is correctly propagated. We want to make sure that we use +# the configured header build/dmlc/build_config.h instead of +# include/dmlc/build_config_default.h. 
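+# A quick way to verify this locally (sketch; the paths are the ones named in
+# the comment above): after the CMake build below, the configured header
+# should exist while the default header stays absent:
+#   test -f build/dmlc/build_config.h
+#   test ! -e dmlc-core/include/dmlc/build_config_default.h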
+rm -fv dmlc-core/include/dmlc/build_config_default.h + +# Sanitizer tests +echo "--- Run Google Test with sanitizer enabled" +# Work around https://github.com/google/sanitizers/issues/1614 +sudo sysctl vm.mmap_rnd_bits=28 +python3 ops/docker_run.py \ + --container-id xgb-ci.cpu \ + -- ops/task/build-via-cmake.sh \ + -DUSE_SANITIZER=ON \ + -DENABLED_SANITIZERS="address;leak;undefined" \ + -DCMAKE_BUILD_TYPE=Debug \ + -DSANITIZER_PATH=/usr/lib/x86_64-linux-gnu/ +python3 ops/docker_run.py \ + --container-id xgb-ci.cpu \ + --run-args '-e ASAN_SYMBOLIZER_PATH=/usr/bin/llvm-symbolizer + -e ASAN_OPTIONS=symbolize=1 + -e UBSAN_OPTIONS=print_stacktrace=1:log_path=ubsan_error.log + --cap-add SYS_PTRACE' \ + -- bash -c \ + "cd build && ./testxgboost --gtest_filter=-*DeathTest*" + +echo "--- Run Google Test" +python3 ops/docker_run.py \ + --container-id xgb-ci.cpu \ + -- ops/task/build-via-cmake.sh \ + -DCMAKE_PREFIX_PATH=/opt/grpc \ + -DPLUGIN_FEDERATED=ON +python3 ops/docker_run.py \ + --container-id xgb-ci.cpu \ + -- bash -c "cd build && ctest --extra-verbose" diff --git a/ops/task/build-cuda-with-rmm.sh b/ops/task/build-cuda-with-rmm.sh new file mode 100755 index 000000000000..901e66a8f649 --- /dev/null +++ b/ops/task/build-cuda-with-rmm.sh @@ -0,0 +1,65 @@ +#!/bin/bash + +set -euo pipefail + +WHEEL_TAG=manylinux_2_28_x86_64 + +source ops/task/enforce-ci.sh + +echo "--- Build with CUDA with RMM" + +if [[ ($is_pull_request == 1) || ($is_release_branch == 0) ]] +then + arch_flag="-DGPU_COMPUTE_VER=75" +else + arch_flag="" +fi + +echo "--- Build libxgboost from the source" +python3 ops/docker_run.py \ + --container-id xgb-ci.gpu_build_rockylinux8 \ + -- ops/task/build-via-cmake.sh \ + -DCMAKE_PREFIX_PATH="/opt/grpc;/opt/rmm;/opt/rmm/lib64/rapids/cmake" \ + -DUSE_CUDA=ON \ + -DUSE_OPENMP=ON \ + -DHIDE_CXX_SYMBOLS=ON \ + -DPLUGIN_FEDERATED=ON \ + -DPLUGIN_RMM=ON \ + -DUSE_NCCL=ON \ + -DUSE_NCCL_LIB_PATH=ON \ + -DNCCL_INCLUDE_DIR=/usr/include \ + -DUSE_DLOPEN_NCCL=ON \ + ${arch_flag} + +echo "--- Build binary wheel" +python3 ops/docker_run.py \ + --container-id xgb-ci.gpu_build_rockylinux8 \ + -- bash -c \ + "cd python-package && rm -rf dist/* && pip wheel --no-deps -v . 
--wheel-dir dist/" +python3 ops/rename_whl.py \ + --wheel-path python-package/dist/*.whl \ + --commit-hash ${GITHUB_SHA} \ + --platform-tag ${WHEEL_TAG} + +echo "--- Audit binary wheel to ensure it's compliant with ${WHEEL_TAG} standard" +python3 ops/docker_run.py \ + --container-id xgb-ci.$WHEEL_TAG \ + -- auditwheel repair \ + --plat ${WHEEL_TAG} python-package/dist/*.whl +python3 ops/rename_whl.py \ + --wheel-path wheelhouse/*.whl \ + --commit-hash ${GITHUB_SHA} \ + --platform-tag ${WHEEL_TAG} +mv -v wheelhouse/*.whl python-package/dist/ +# Make sure that libgomp.so is vendored in the wheel +python3 ops/docker_run.py \ + --container-id xgb-ci.$WHEEL_TAG \ + -- bash -c \ + "unzip -l python-package/dist/*.whl | grep libgomp || exit -1" + +echo "--- Upload Python wheel" +if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] +then + aws s3 cp python-package/dist/*.whl s3://xgboost-nightly-builds/experimental_build_with_rmm/ \ + --acl public-read --no-progress +fi diff --git a/tests/buildkite/build-cuda.sh b/ops/task/build-cuda.sh similarity index 50% rename from tests/buildkite/build-cuda.sh rename to ops/task/build-cuda.sh index 03d2cc8a6a24..c98c041d8187 100755 --- a/tests/buildkite/build-cuda.sh +++ b/ops/task/build-cuda.sh @@ -4,9 +4,9 @@ set -euo pipefail WHEEL_TAG=manylinux_2_28_x86_64 -source tests/buildkite/conftest.sh +source ops/task/enforce-ci.sh -echo "--- Build with CUDA ${CUDA_VERSION}" +echo "--- Build with CUDA" if [[ ($is_pull_request == 1) || ($is_release_branch == 0) ]] then @@ -15,58 +15,62 @@ else arch_flag="" fi -command_wrapper="tests/ci_build/ci_build.sh gpu_build_rockylinux8 --build-arg "` - `"CUDA_VERSION_ARG=$CUDA_VERSION --build-arg "` - `"NCCL_VERSION_ARG=$NCCL_VERSION --build-arg "` - `"RAPIDS_VERSION_ARG=$RAPIDS_VERSION" - echo "--- Build libxgboost from the source" -$command_wrapper tests/ci_build/build_via_cmake.sh \ - -DCMAKE_PREFIX_PATH="/opt/grpc" \ - -DUSE_CUDA=ON \ - -DUSE_OPENMP=ON \ - -DHIDE_CXX_SYMBOLS=ON \ - -DPLUGIN_FEDERATED=ON \ - -DUSE_NCCL=ON \ - -DUSE_NCCL_LIB_PATH=ON \ - -DNCCL_INCLUDE_DIR=/usr/include \ - -DUSE_DLOPEN_NCCL=ON \ - ${arch_flag} +git clone https://github.com/NVIDIA/cccl.git -b v2.6.1 --quiet +python3 ops/docker_run.py \ + --container-id xgb-ci.gpu_build_rockylinux8 \ + -- ops/task/build-via-cmake.sh \ + -DCMAKE_PREFIX_PATH="/opt/grpc;/workspace/cccl" \ + -DUSE_CUDA=ON \ + -DUSE_OPENMP=ON \ + -DHIDE_CXX_SYMBOLS=ON \ + -DPLUGIN_FEDERATED=ON \ + -DUSE_NCCL=ON \ + -DUSE_NCCL_LIB_PATH=ON \ + -DNCCL_INCLUDE_DIR=/usr/include \ + -DUSE_DLOPEN_NCCL=ON \ + ${arch_flag} + echo "--- Build binary wheel" -$command_wrapper bash -c \ +python3 ops/docker_run.py \ + --container-id xgb-ci.gpu_build_rockylinux8 \ + -- bash -c \ "cd python-package && rm -rf dist/* && pip wheel --no-deps -v . 
--wheel-dir dist/" -$command_wrapper python tests/ci_build/rename_whl.py \ +python3 ops/rename_whl.py \ --wheel-path python-package/dist/*.whl \ - --commit-hash ${BUILDKITE_COMMIT} \ + --commit-hash ${GITHUB_SHA} \ --platform-tag ${WHEEL_TAG} echo "--- Audit binary wheel to ensure it's compliant with ${WHEEL_TAG} standard" -tests/ci_build/ci_build.sh manylinux_2_28_x86_64 auditwheel repair \ +python3 ops/docker_run.py \ + --container-id xgb-ci.manylinux_2_28_x86_64 \ + -- auditwheel repair \ --plat ${WHEEL_TAG} python-package/dist/*.whl -$command_wrapper python tests/ci_build/rename_whl.py \ +python3 ops/rename_whl.py \ --wheel-path wheelhouse/*.whl \ - --commit-hash ${BUILDKITE_COMMIT} \ + --commit-hash ${GITHUB_SHA} \ --platform-tag ${WHEEL_TAG} mv -v wheelhouse/*.whl python-package/dist/ # Make sure that libgomp.so is vendored in the wheel -tests/ci_build/ci_build.sh manylinux_2_28_x86_64 bash -c \ - "unzip -l python-package/dist/*.whl | grep libgomp || exit -1" +python3 ops/docker_run.py \ + --container-id xgb-ci.manylinux_2_28_x86_64 \ + -- bash -c "unzip -l python-package/dist/*.whl | grep libgomp || exit -1" echo "--- Upload Python wheel" -buildkite-agent artifact upload python-package/dist/*.whl if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] then aws s3 cp python-package/dist/*.whl s3://xgboost-nightly-builds/${BRANCH_NAME}/ \ --acl public-read --no-progress # Generate the meta info which includes xgboost version and the commit info - $command_wrapper python tests/ci_build/format_wheel_meta.py \ + python3 ops/docker_run.py \ + --container-id xgb-ci.gpu_build_rockylinux8 \ + -- python ops/format_wheel_meta.py \ --wheel-path python-package/dist/*.whl \ - --commit-hash ${BUILDKITE_COMMIT} \ + --commit-hash ${GITHUB_SHA} \ --platform-tag ${WHEEL_TAG} \ --meta-path python-package/dist/ aws s3 cp python-package/dist/meta.json s3://xgboost-nightly-builds/${BRANCH_NAME}/ \ --acl public-read --no-progress fi echo "-- Stash C++ test executable (testxgboost)" -buildkite-agent artifact upload build/testxgboost diff --git a/tests/buildkite/build-jvm-linux-arm64-manylinux2014.sh b/ops/task/build-jvm-manylinux2014.sh similarity index 69% rename from tests/buildkite/build-jvm-linux-arm64-manylinux2014.sh rename to ops/task/build-jvm-manylinux2014.sh index e7fec780b956..88bdb256821f 100644 --- a/tests/buildkite/build-jvm-linux-arm64-manylinux2014.sh +++ b/ops/task/build-jvm-manylinux2014.sh @@ -2,24 +2,32 @@ set -euo pipefail -source tests/buildkite/conftest.sh +source ops/task/enforce-ci.sh -command_wrapper="tests/ci_build/ci_build.sh jvm_manylinux2014_aarch64" +if [ $# -ne 1 ]; then + echo "Usage: $0 {x86_64,aarch64}" + exit 1 +fi + +arch=$1 + +image="xgb-ci.manylinux2014_${arch}" # Build XGBoost4J binary echo "--- Build libxgboost4j.so (targeting glibc 2.17)" set -x mkdir build -$command_wrapper bash -c \ +python3 ops/docker_run.py \ + --container-id ${image} \ + -- bash -c \ "cd build && cmake .. 
-DJVM_BINDINGS=ON -DUSE_OPENMP=ON && make -j$(nproc)" ldd lib/libxgboost4j.so objdump -T lib/libxgboost4j.so | grep GLIBC_ | sed 's/.*GLIBC_\([.0-9]*\).*/\1/g' | sort -Vu echo "--- Upload libxgboost4j.so" pushd lib -libname=libxgboost4j_linux_arm64_${BUILDKITE_COMMIT}.so +libname=libxgboost4j_linux_${arch}_${GITHUB_SHA}.so mv -v libxgboost4j.so ${libname} -buildkite-agent artifact upload ${libname} if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] then aws s3 cp ${libname} \ diff --git a/tests/buildkite/build-manylinux2014.sh b/ops/task/build-manylinux2014.sh similarity index 60% rename from tests/buildkite/build-manylinux2014.sh rename to ops/task/build-manylinux2014.sh index 426d32b5c361..7b71b51a0587 100755 --- a/tests/buildkite/build-manylinux2014.sh +++ b/ops/task/build-manylinux2014.sh @@ -2,6 +2,8 @@ set -euo pipefail +source ops/task/enforce-ci.sh + if [ $# -ne 1 ]; then echo "Usage: $0 {x86_64,aarch64}" exit 1 @@ -9,24 +11,28 @@ fi arch=$1 -source tests/buildkite/conftest.sh - WHEEL_TAG="manylinux2014_${arch}" -command_wrapper="tests/ci_build/ci_build.sh ${WHEEL_TAG}" +image="xgb-ci.$WHEEL_TAG" + python_bin="/opt/python/cp310-cp310/bin/python" echo "--- Build binary wheel for ${WHEEL_TAG}" # Patch to add warning about manylinux2014 variant -patch -p0 < tests/buildkite/remove_nccl_dep.patch -patch -p0 < tests/buildkite/manylinux2014_warning.patch -$command_wrapper bash -c \ +patch -p0 < ops/task/patches/remove_nccl_dep.patch +patch -p0 < ops/task/patches/manylinux2014_warning.patch +python3 ops/docker_run.py \ + --container-id ${image} \ + -- bash -c \ "cd python-package && ${python_bin} -m pip wheel --no-deps -v . --wheel-dir dist/" -git checkout python-package/pyproject.toml python-package/xgboost/core.py # discard the patch +git checkout python-package/pyproject.toml python-package/xgboost/core.py + # discard the patch -$command_wrapper auditwheel repair --plat ${WHEEL_TAG} python-package/dist/*.whl -$command_wrapper ${python_bin} tests/ci_build/rename_whl.py \ +python3 ops/docker_run.py \ + --container-id ${image} \ + -- auditwheel repair --plat ${WHEEL_TAG} python-package/dist/*.whl +python3 ops/rename_whl.py \ --wheel-path wheelhouse/*.whl \ - --commit-hash ${BUILDKITE_COMMIT} \ + --commit-hash ${GITHUB_SHA} \ --platform-tag ${WHEEL_TAG} rm -rf python-package/dist/ mkdir python-package/dist/ @@ -34,25 +40,25 @@ mv -v wheelhouse/*.whl python-package/dist/ echo "--- Build binary wheel for ${WHEEL_TAG} (CPU only)" # Patch to rename pkg to xgboost-cpu -patch -p0 < tests/buildkite/remove_nccl_dep.patch -patch -p0 < tests/buildkite/cpu_only_pypkg.patch -$command_wrapper bash -c \ +patch -p0 < ops/task/patches/remove_nccl_dep.patch +patch -p0 < ops/task/patches/cpu_only_pypkg.patch +python3 ops/docker_run.py \ + --container-id ${image} \ + -- bash -c \ "cd python-package && ${python_bin} -m pip wheel --no-deps -v . 
--wheel-dir dist/" git checkout python-package/pyproject.toml # discard the patch -$command_wrapper auditwheel repair --plat ${WHEEL_TAG} python-package/dist/xgboost_cpu-*.whl -$command_wrapper ${python_bin} tests/ci_build/rename_whl.py \ +python3 ops/docker_run.py \ + --container-id ${image} \ + -- auditwheel repair --plat ${WHEEL_TAG} python-package/dist/xgboost_cpu-*.whl +python3 ops/rename_whl.py \ --wheel-path wheelhouse/xgboost_cpu-*.whl \ - --commit-hash ${BUILDKITE_COMMIT} \ + --commit-hash ${GITHUB_SHA} \ --platform-tag ${WHEEL_TAG} rm -v python-package/dist/xgboost_cpu-*.whl mv -v wheelhouse/xgboost_cpu-*.whl python-package/dist/ echo "--- Upload Python wheel" -for wheel in python-package/dist/*.whl -do - buildkite-agent artifact upload "${wheel}" -done if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] then for wheel in python-package/dist/*.whl diff --git a/tests/ci_build/build_via_cmake.sh b/ops/task/build-via-cmake.sh similarity index 70% rename from tests/ci_build/build_via_cmake.sh rename to ops/task/build-via-cmake.sh index 3238c41e1bcb..857ebbbec0c2 100755 --- a/tests/ci_build/build_via_cmake.sh +++ b/ops/task/build-via-cmake.sh @@ -1,5 +1,6 @@ -#!/usr/bin/env bash -set -e +#!/bin/bash + +set -euo pipefail if [[ "$1" == --conda-env=* ]] then @@ -26,7 +27,17 @@ mkdir build cd build # Disable CMAKE_COMPILE_WARNING_AS_ERROR option temporarily until # https://github.com/dmlc/xgboost/issues/10400 is fixed -cmake .. ${cmake_args} -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DCMAKE_VERBOSE_MAKEFILE=ON -DENABLE_ALL_WARNINGS=ON -DCMAKE_COMPILE_WARNING_AS_ERROR=OFF -GNinja ${cmake_prefix_flag} -DHIDE_CXX_SYMBOLS=ON -DBUILD_DEPRECATED_CLI=ON +set -x +cmake .. ${cmake_args} \ + -DGOOGLE_TEST=ON \ + -DUSE_DMLC_GTEST=ON \ + -DENABLE_ALL_WARNINGS=ON \ + -DCMAKE_COMPILE_WARNING_AS_ERROR=OFF \ + -GNinja \ + ${cmake_prefix_flag} \ + -DHIDE_CXX_SYMBOLS=ON \ + -DBUILD_DEPRECATED_CLI=ON ninja clean time ninja -v cd .. +set +x diff --git a/ops/task/enforce-ci.sh b/ops/task/enforce-ci.sh new file mode 100755 index 000000000000..1e50dc045cb1 --- /dev/null +++ b/ops/task/enforce-ci.sh @@ -0,0 +1,42 @@ +#!/bin/bash + +## Ensure that a script is running inside the CI. +## Usage: source ops/task/enforce-ci.sh + +set -euo pipefail + +set -x + +if [[ -z ${GITHUB_ACTION:-} ]] +then + echo "$0 is not meant to run locally; it should run inside GitHub Actions." + echo "Please inspect the content of $0 and locate the desired command manually." + exit 1 +fi + +if [[ -z ${BRANCH_NAME:-} ]] +then + echo "Make sure to define environment variable BRANCH_NAME." 
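+  # BRANCH_NAME is expected from the workflow env block in
+  # .github/workflows/main.yml (PR-<number> for pull requests, otherwise the
+  # branch name).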
+ exit 2 +fi + +if [[ -n ${GITHUB_BASE_REF:-} ]] +then + is_pull_request=1 +else + is_pull_request=0 +fi + +if [[ $BRANCH_NAME == "master" || $BRANCH_NAME == "release_"* || $BRANCH_NAME == "federated-secure" ]] +then + is_release_branch=1 +else + is_release_branch=0 +fi + +if [[ -n ${DISABLE_RELEASE:-} ]] +then + is_release_branch=0 +fi + +set +x diff --git a/tests/buildkite/cpu_only_pypkg.patch b/ops/task/patches/cpu_only_pypkg.patch similarity index 100% rename from tests/buildkite/cpu_only_pypkg.patch rename to ops/task/patches/cpu_only_pypkg.patch diff --git a/tests/buildkite/manylinux2014_warning.patch b/ops/task/patches/manylinux2014_warning.patch similarity index 100% rename from tests/buildkite/manylinux2014_warning.patch rename to ops/task/patches/manylinux2014_warning.patch diff --git a/tests/buildkite/remove_nccl_dep.patch b/ops/task/patches/remove_nccl_dep.patch similarity index 100% rename from tests/buildkite/remove_nccl_dep.patch rename to ops/task/patches/remove_nccl_dep.patch diff --git a/ops/task/run-clang-tidy.sh b/ops/task/run-clang-tidy.sh new file mode 100755 index 000000000000..da12a8808a2a --- /dev/null +++ b/ops/task/run-clang-tidy.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +set -euox pipefail + +echo "--- Run clang-tidy" + +source ops/task/enforce-ci.sh + +python3 ops/docker_run.py \ + --container-id xgb-ci.clang_tidy \ + -- python3 tests/ci_build/tidy.py --cuda-archs 75 diff --git a/tests/buildkite/build-cpu.sh b/tests/buildkite/build-cpu.sh deleted file mode 100755 index 11679d644de1..000000000000 --- a/tests/buildkite/build-cpu.sh +++ /dev/null @@ -1,34 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -echo "--- Build CPU code" - -source tests/buildkite/conftest.sh - -command_wrapper="tests/ci_build/ci_build.sh cpu" - -$command_wrapper rm -fv dmlc-core/include/dmlc/build_config_default.h - # This step is not necessary, but here we include it, to ensure that - # DMLC_CORE_USE_CMAKE flag is correctly propagated. We want to make sure that we use - # the configured header build/dmlc/build_config.h instead of - # include/dmlc/build_config_default.h. 
-echo "--- Build libxgboost from the source" -$command_wrapper tests/ci_build/build_via_cmake.sh -DCMAKE_PREFIX_PATH=/opt/grpc \ - -DPLUGIN_FEDERATED=ON -echo "--- Run Google Test" -$command_wrapper bash -c "cd build && ctest --extra-verbose" -echo "--- Stash XGBoost CLI executable" -buildkite-agent artifact upload ./xgboost - -# Sanitizer test -echo "--- Run Google Test with sanitizer enabled" -$command_wrapper tests/ci_build/build_via_cmake.sh -DUSE_SANITIZER=ON \ - -DENABLED_SANITIZERS="address;leak;undefined" -DCMAKE_BUILD_TYPE=Debug \ - -DSANITIZER_PATH=/usr/lib/x86_64-linux-gnu/ -CI_DOCKER_EXTRA_PARAMS_INIT="-e ASAN_SYMBOLIZER_PATH=/usr/bin/llvm-symbolizer "` - `"-e ASAN_OPTIONS=symbolize=1 "` - `"-e UBSAN_OPTIONS=print_stacktrace=1:log_path=ubsan_error.log "` - `"--cap-add SYS_PTRACE" \ - $command_wrapper bash -c "cd build && ctest --exclude-regex AllTestsInDMLCUnitTests "` - `"--extra-verbose" diff --git a/tests/buildkite/build-cuda-with-rmm.sh b/tests/buildkite/build-cuda-with-rmm.sh deleted file mode 100755 index f1d3f1b1c91a..000000000000 --- a/tests/buildkite/build-cuda-with-rmm.sh +++ /dev/null @@ -1,65 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -WHEEL_TAG=manylinux_2_28_x86_64 - -source tests/buildkite/conftest.sh - -echo "--- Build with CUDA ${CUDA_VERSION} with RMM" - -if [[ ($is_pull_request == 1) || ($is_release_branch == 0) ]] -then - arch_flag="-DGPU_COMPUTE_VER=75" -else - arch_flag="" -fi - -command_wrapper="tests/ci_build/ci_build.sh gpu_build_rockylinux8 --build-arg "` - `"CUDA_VERSION_ARG=$CUDA_VERSION --build-arg "` - `"NCCL_VERSION_ARG=$NCCL_VERSION --build-arg "` - `"RAPIDS_VERSION_ARG=$RAPIDS_VERSION" - -echo "--- Build libxgboost from the source" -$command_wrapper tests/ci_build/build_via_cmake.sh \ - -DCMAKE_PREFIX_PATH="/opt/grpc;/opt/rmm;/opt/rmm/lib64/rapids/cmake" \ - -DUSE_CUDA=ON \ - -DUSE_OPENMP=ON \ - -DHIDE_CXX_SYMBOLS=ON \ - -DPLUGIN_FEDERATED=ON \ - -DPLUGIN_RMM=ON \ - -DUSE_NCCL=ON \ - -DUSE_NCCL_LIB_PATH=ON \ - -DNCCL_INCLUDE_DIR=/usr/include \ - -DUSE_DLOPEN_NCCL=ON \ - ${arch_flag} -echo "--- Build binary wheel" -$command_wrapper bash -c \ - "cd python-package && rm -rf dist/* && pip wheel --no-deps -v . 
--wheel-dir dist/" -$command_wrapper python tests/ci_build/rename_whl.py \ - --wheel-path python-package/dist/*.whl \ - --commit-hash ${BUILDKITE_COMMIT} \ - --platform-tag ${WHEEL_TAG} - -echo "--- Audit binary wheel to ensure it's compliant with ${WHEEL_TAG} standard" -tests/ci_build/ci_build.sh manylinux_2_28_x86_64 auditwheel repair \ - --plat ${WHEEL_TAG} python-package/dist/*.whl -$command_wrapper python tests/ci_build/rename_whl.py \ - --wheel-path wheelhouse/*.whl \ - --commit-hash ${BUILDKITE_COMMIT} \ - --platform-tag ${WHEEL_TAG} -mv -v wheelhouse/*.whl python-package/dist/ -# Make sure that libgomp.so is vendored in the wheel -tests/ci_build/ci_build.sh manylinux_2_28_x86_64 bash -c \ - "unzip -l python-package/dist/*.whl | grep libgomp || exit -1" - -echo "--- Upload Python wheel" -buildkite-agent artifact upload python-package/dist/*.whl -if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] -then - aws s3 cp python-package/dist/*.whl s3://xgboost-nightly-builds/experimental_build_with_rmm/ \ - --acl public-read --no-progress -fi - -echo "-- Stash C++ test executable (testxgboost)" -buildkite-agent artifact upload build/testxgboost diff --git a/tests/buildkite/build-jvm-linux-x86_64-manylinux2014.sh b/tests/buildkite/build-jvm-linux-x86_64-manylinux2014.sh deleted file mode 100644 index 46a819a016d3..000000000000 --- a/tests/buildkite/build-jvm-linux-x86_64-manylinux2014.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -source tests/buildkite/conftest.sh - -command_wrapper="tests/ci_build/ci_build.sh jvm_manylinux2014_x86_64" - -# Build XGBoost4J binary -echo "--- Build libxgboost4j.so (targeting glibc 2.17)" -set -x -mkdir build -$command_wrapper bash -c \ - "cd build && cmake .. -GNinja -DJVM_BINDINGS=ON -DUSE_OPENMP=ON && ninja -v" -ldd lib/libxgboost4j.so -objdump -T lib/libxgboost4j.so | grep GLIBC_ | sed 's/.*GLIBC_\([.0-9]*\).*/\1/g' | sort -Vu - -echo "--- Upload libxgboost4j.so" -pushd lib -libname=libxgboost4j_linux_x86_64_${BUILDKITE_COMMIT}.so -mv -v libxgboost4j.so ${libname} -buildkite-agent artifact upload ${libname} -if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] -then - aws s3 cp ${libname} \ - s3://xgboost-nightly-builds/${BRANCH_NAME}/libxgboost4j/ \ - --acl public-read --no-progress -fi -popd diff --git a/tests/buildkite/conftest.sh b/tests/buildkite/conftest.sh deleted file mode 100755 index 185b4a356d7e..000000000000 --- a/tests/buildkite/conftest.sh +++ /dev/null @@ -1,64 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -function get_aws_secret { - if [[ $# -ne 1 ]] - then - echo "Usage: get_aws_secret [Name of secret]" - return 1 - fi - aws secretsmanager get-secret-value --secret-id $1 --output text --region us-west-2 --query SecretString -} - -function set_buildkite_env_vars_in_container { - # Pass all Buildkite-specific env vars to Docker containers. - # This is to be used with tests/ci_build/ci_build.sh - export CI_DOCKER_EXTRA_PARAMS_INIT="${CI_DOCKER_EXTRA_PARAMS_INIT:-} "` - `"--env BUILDKITE_ANALYTICS_TOKEN --env BUILDKITE_BUILD_ID --env BUILDKITE_BUILD_NUMBER "` - `"--env BUILDKITE_JOB_ID --env BUILDKITE_BRANCH --env BUILDKITE_COMMIT "` - `"--env BUILDKITE_MESSAGE --env BUILDKITE_BUILD_URL" -} - -set -x - -CUDA_VERSION=12.4.1 -NCCL_VERSION=2.22.3-1 -RAPIDS_VERSION=24.08 -DEV_RAPIDS_VERSION=24.10 -SPARK_VERSION=3.5.1 -JDK_VERSION=8 -R_VERSION=4.3.2 - -if [[ -z ${BUILDKITE:-} ]] -then - echo "$0 is not meant to run locally; it should run inside BuildKite." 
- echo "Please inspect the content of $0 and locate the desired command manually." - exit 1 -fi - -if [[ -n $BUILDKITE_PULL_REQUEST && $BUILDKITE_PULL_REQUEST != "false" ]] -then - is_pull_request=1 - BRANCH_NAME=PR-$BUILDKITE_PULL_REQUEST -else - is_pull_request=0 - BRANCH_NAME=$BUILDKITE_BRANCH -fi -export BRANCH_NAME=${BRANCH_NAME//\//-} - -if [[ $BRANCH_NAME == "master" || $BRANCH_NAME == "release_"* || $BRANCH_NAME == "federated-secure" ]] -then - is_release_branch=1 - enforce_daily_budget=0 -else - is_release_branch=0 - enforce_daily_budget=1 -fi - -if [[ -n ${DISABLE_RELEASE:-} ]] -then - is_release_branch=0 -fi - -set +x diff --git a/tests/buildkite/run-clang-tidy.sh b/tests/buildkite/run-clang-tidy.sh deleted file mode 100755 index 95ff010c20f1..000000000000 --- a/tests/buildkite/run-clang-tidy.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -echo "--- Run clang-tidy" - -source tests/buildkite/conftest.sh - -tests/ci_build/ci_build.sh clang_tidy \ - --build-arg CUDA_VERSION_ARG=${CUDA_VERSION} \ - python3 tests/ci_build/tidy.py --cuda-archs 75 From fa96af191afd9bfbae855ae8c3f88771cafdd342 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Mon, 28 Oct 2024 23:24:38 -0700 Subject: [PATCH 11/86] Move more pipelines; refine stash_artifacts --- .github/workflows/main.yml | 105 ++++++++--------- .github/workflows/windows.yml | 61 ++++++++++ ops/docker/conda_env/win64_cpu_test.yml | 22 ---- ops/stash_artifacts.py | 144 ++++++++++++++++++++++++ ops/stash_artifacts.sh | 66 ----------- ops/task/build-win64-gpu.ps1 | 44 ++++++++ ops/task/enforce-ci.ps1 | 28 +++++ ops/task/enforce-ci.sh | 2 + ops/task/test-cpp-gpu.sh | 33 ++++++ ops/task/test-win64-gpu.ps1 | 28 +++++ tests/buildkite/build-containers.sh | 47 -------- tests/buildkite/build-win64-gpu.ps1 | 55 --------- tests/buildkite/conftest.ps1 | 13 --- tests/buildkite/test-cpp-gpu.sh | 24 ---- tests/buildkite/test-win64-gpu.ps1 | 39 ------- 15 files changed, 385 insertions(+), 326 deletions(-) create mode 100644 .github/workflows/windows.yml delete mode 100644 ops/docker/conda_env/win64_cpu_test.yml create mode 100644 ops/stash_artifacts.py delete mode 100755 ops/stash_artifacts.sh create mode 100644 ops/task/build-win64-gpu.ps1 create mode 100644 ops/task/enforce-ci.ps1 create mode 100755 ops/task/test-cpp-gpu.sh create mode 100644 ops/task/test-win64-gpu.ps1 delete mode 100755 tests/buildkite/build-containers.sh delete mode 100644 tests/buildkite/build-win64-gpu.ps1 delete mode 100644 tests/buildkite/conftest.ps1 delete mode 100755 tests/buildkite/test-cpp-gpu.sh delete mode 100644 tests/buildkite/test-win64-gpu.ps1 diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 84967f0684a2..d1f1d2e3f0b6 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -34,6 +34,8 @@ jobs: include: - container_id: xgb-ci.manylinux2014_aarch64 runner: linux-arm64-cpu + - container_id: xgb-ci.aarch64 + runner: linux-arm64-cpu steps: # Restart Docker daemon so that it recognized the ephemeral disks - run: sudo systemctl restart docker @@ -81,14 +83,16 @@ jobs: CONTAINER_ID: xgb-ci.cpu - run: bash ops/task/build-cpu.sh - name: Stash CLI executable - run: bash ops/stash_artifacts.sh ./xgboost - env: - COMMAND: upload - S3_BUCKET: ${{ env.RUNS_ON_S3_BUCKET_CACHE }} - PREFIX: ${{ env.ARTIFACT_STASH_PREFIX }}/build-cpu + run: | + python3 ops/stash_artifacts.py \ + --command upload \ + --s3-bucket ${{ env.RUNS_ON_S3_BUCKET_CACHE }} \ + --prefix ${{ env.ARTIFACT_STASH_PREFIX }}/build-cpu \ + -- ./xgboost 
build-cpu-arm64: name: Build CPU ARM64 + manylinux_2_28_aarch64 wheel + needs: build-containers runs-on: - runs-on=${{ github.run_id }} - runner=linux-arm64-cpu @@ -104,11 +108,12 @@ jobs: CONTAINER_ID: xgb-ci.aarch64 - run: bash ops/task/build-cpu-arm64.sh - name: Stash files - run: bash ops/stash_artifacts.sh ./xgboost python-package/dist/*.whl - env: - COMMAND: upload - S3_BUCKET: ${{ env.RUNS_ON_S3_BUCKET_CACHE }} - PREFIX: ${{ env.ARTIFACT_STASH_PREFIX }}/build-cpu-arm64 + run: | + python3 ops/stash_artifacts.py \ + --command upload \ + --s3-bucket ${{ env.RUNS_ON_S3_BUCKET_CACHE }} \ + --prefix ${{ env.ARTIFACT_STASH_PREFIX }}/build-cpu-arm64 \ + -- ./xgboost python-package/dist/*.whl build-cuda: name: Build CUDA + manylinux_2_28_x86_64 wheel @@ -132,11 +137,12 @@ jobs: CONTAINER_ID: xgb-ci.manylinux_2_28_x86_64 - run: bash ops/task/build-cuda.sh - name: Stash files - run: bash ops/stash_artifacts.sh build/testxgboost python-package/dist/*.whl - env: - COMMAND: upload - S3_BUCKET: ${{ env.RUNS_ON_S3_BUCKET_CACHE }} - PREFIX: ${{ env.ARTIFACT_STASH_PREFIX }}/build-cuda + run: | + python3 ops/stash_artifacts.py \ + --command upload \ + --s3-bucket ${{ env.RUNS_ON_S3_BUCKET_CACHE }} \ + --prefix ${{ env.ARTIFACT_STASH_PREFIX }}/build-cuda \ + -- build/testxgboost python-package/dist/*.whl build-cuda-with-rmm: name: Build CUDA with RMM @@ -160,11 +166,12 @@ jobs: CONTAINER_ID: xgb-ci.manylinux_2_28_x86_64 - run: bash ops/task/build-cuda-with-rmm.sh - name: Stash files - run: bash ops/stash_artifacts.sh build/testxgboost python-package/dist/*.whl - env: - COMMAND: upload - S3_BUCKET: ${{ env.RUNS_ON_S3_BUCKET_CACHE }} - PREFIX: ${{ env.ARTIFACT_STASH_PREFIX }}/build-cuda-with-rmm + run: | + python3 ops/stash_artifacts.py \ + --command upload \ + --s3-bucket ${{ env.RUNS_ON_S3_BUCKET_CACHE }} \ + --prefix ${{ env.ARTIFACT_STASH_PREFIX }}/build-cuda-with-rmm \ + -- build/testxgboost build-jvm-manylinux2014: name: Build libxgboost4j.so targeting gblic 2.17 @@ -217,8 +224,8 @@ jobs: CONTAINER_ID: xgb-ci.manylinux2014_${{ matrix.arch }} - run: bash ops/task/build-manylinux2014.sh ${{ matrix.arch }} - test-gpu: - name: Test GPU + test-cpp-gpu: + name: Run Google Tests needs: build-cuda runs-on: - runs-on=${{ github.run_id }} @@ -229,47 +236,25 @@ jobs: - uses: actions/checkout@v4 with: submodules: "true" - - name: Unstash gtest executable - run: | - bash ops/stash_artifacts.sh ./testxgboost - chmod +x ./testxgboost - env: - COMMAND: download - S3_BUCKET: ${{ env.RUNS_ON_S3_BUCKET_CACHE }} - PREFIX: ${{ env.ARTIFACT_STASH_PREFIX }}/build-cuda - name: Fetch container from cache run: bash ops/docker_build.sh env: CONTAINER_ID: xgb-ci.gpu - - name: Run gtest + - name: Unstash gtest run: | - nvidia-smi - python3 ops/docker_run.py \ - --container-id xgb-ci.gpu \ - --use-gpus \ - --run-args='--privileged' \ - -- ./testxgboost - - build-test-gpu-win64: - name: Build GPU (Windows) - runs-on: - - runs-on=${{ github.run_id }} - - runner=windows-gpu - steps: - - uses: actions/checkout@v4 - with: - submodules: "true" - - name: Build and run gtest - shell: powershell + python3 ops/stash_artifacts.py \ + --command download \ + --s3-bucket ${{ env.RUNS_ON_S3_BUCKET_CACHE }} \ + --prefix ${{ env.ARTIFACT_STASH_PREFIX }}/build-cuda \ + -- build/testxgboost + chmod +x build/testxgboost + - run: bash ops/task/test-cpp-gpu.sh build-cuda + - name: Unstash gtest run: | - nvidia-smi - nvcc --version - git clone https://github.com/NVIDIA/cccl.git -b v2.6.1 --quiet - mkdir build - cd build - cmake .. 
-G"Visual Studio 17 2022" -A x64 -DUSE_CUDA=ON -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DBUILD_DEPRECATED_CLI=ON -DCMAKE_CUDA_ARCHITECTURES=75 -DCMAKE_PREFIX_PATH="$(Get-Location)/../cccl" - if ($LASTEXITCODE -ne 0) { throw "Last command failed" } - cmake --build . --config Release -- /m /nodeReuse:false "/consoleloggerparameters:ShowCommandLine;Verbosity=minimal" - if ($LASTEXITCODE -ne 0) { throw "Last command failed" } - & .\testxgboost.exe - if ($LASTEXITCODE -ne 0) { throw "Last command failed" } + python3 ops/stash_artifacts.py \ + --command download \ + --s3-bucket ${{ env.RUNS_ON_S3_BUCKET_CACHE }} \ + --prefix ${{ env.ARTIFACT_STASH_PREFIX }}/build-cuda-with-rmm \ + -- build/testxgboost + chmod +x build/testxgboost + - run: bash ops/task/test-cpp-gpu.sh build-cuda-with-rmm diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml new file mode 100644 index 000000000000..6edc14711258 --- /dev/null +++ b/.github/workflows/windows.yml @@ -0,0 +1,61 @@ +name: Nextgen XGBoost CI Windows + +on: [push, pull_request] + +permissions: + contents: read # to fetch code (actions/checkout) + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +defaults: + run: + shell: powershell + +env: + BRANCH_NAME: >- + ${{ github.event.pull_request.number && 'PR-' }}${{ github.event.pull_request.number || github.ref_name }} + ARTIFACT_STASH_PREFIX: cache/${{ github.repository }}/stash/${{ github.run_id }} + # TODO(hcho3): Remove + RUNS_ON_S3_BUCKET_CACHE: runs-on-s3bucketcache-m3ikdpczirva + +jobs: + build-win64-gpu: + name: Build XGBoost for Windows with CUDA + runs-on: + - runs-on=${{ github.run_id }} + - runner=windows-cpu + steps: + - uses: actions/checkout@v4 + with: + submodules: "true" + - run: powershell ops/task/build-win64-gpu.ps1 + - name: Stash files + run: | + conda activate + python ops/stash_artifacts.py ` + --command upload ` + --s3-bucket ${{ env.RUNS_ON_S3_BUCKET_CACHE }} ` + --prefix ${{ env.ARTIFACT_STASH_PREFIX }}/build-win64-gpu ` + -- build/testxgboost.exe xgboost.exe ` + (Get-ChildItem python-package/dist/*.whl | Select-Object -Expand FullName) + test-win64-gpu: + name: Test XGBoost on Windows + needs: build-win64-gpu + runs-on: + - runs-on=${{ github.run_id }} + - runner=windows-gpu + steps: + - uses: actions/checkout@v4 + with: + submodules: "true" + - name: Unstash files + run: | + conda activate + python ops/stash_artifacts.py ` + --command download ` + --s3-bucket ${{ env.RUNS_ON_S3_BUCKET_CACHE }} ` + --prefix ${{ env.ARTIFACT_STASH_PREFIX }}/build-win64-gpu ` + -- build/testxgboost.exe xgboost.exe python-package/dist/*.whl + - run: powershell ops/task/test-win64-gpu.ps1 diff --git a/ops/docker/conda_env/win64_cpu_test.yml b/ops/docker/conda_env/win64_cpu_test.yml deleted file mode 100644 index d69dd2a6ef85..000000000000 --- a/ops/docker/conda_env/win64_cpu_test.yml +++ /dev/null @@ -1,22 +0,0 @@ -name: win64_env -channels: -- conda-forge -dependencies: -- python=3.10 -- wheel -- numpy -- scipy -- scikit-learn>=1.4.1 -- pandas -- matplotlib -- dask -- distributed -- python-graphviz -- pytest -- jsonschema -- hypothesis -- python-graphviz -- pip -- py-ubjson -- loky -- pyarrow diff --git a/ops/stash_artifacts.py b/ops/stash_artifacts.py new file mode 100644 index 000000000000..405804b499c6 --- /dev/null +++ b/ops/stash_artifacts.py @@ -0,0 +1,144 @@ +""" +Stash an artifact in an S3 bucket for later use + +Note. 
This script takes in all inputs via command-line arguments;
+      the artifact path(s) are passed as positional arguments.
+"""
+
+import argparse
+import os
+import subprocess
+from pathlib import Path
+from urllib.parse import SplitResult, urlsplit, urlunsplit
+
+
+def resolve(x: Path) -> Path:
+    return x.expanduser().resolve()
+
+
+def path_equals(a: Path, b: Path) -> bool:
+    return resolve(a) == resolve(b)
+
+
+def compute_s3_url(s3_bucket: str, prefix: str, artifact: Path) -> str:
+    filename = artifact.name
+    relative_path = resolve(artifact).relative_to(Path.cwd())
+    if resolve(artifact.parent) == resolve(Path.cwd()):
+        full_prefix = prefix
+    else:
+        full_prefix = f"{prefix}/{str(relative_path.parent)}"
+    return f"s3://{s3_bucket}/{full_prefix}/{filename}"
+
+
+def aws_s3_upload(src: Path, dest: str) -> None:
+    cli_args = ["aws", "s3", "cp", "--no-progress", str(src), dest]
+    print(" ".join(cli_args))
+    subprocess.run(
+        cli_args,
+        check=True,
+        encoding="utf-8",
+    )
+
+
+def aws_s3_download(src: str, dest: Path) -> None:
+    cli_args = ["aws", "s3", "cp", "--no-progress", src, str(dest)]
+    print(" ".join(cli_args))
+    subprocess.run(
+        cli_args,
+        check=True,
+        encoding="utf-8",
+    )
+
+
+def aws_s3_download_with_wildcard(src: str, dest: Path) -> None:
+    parsed_src = urlsplit(src)
+    src_dir = urlunsplit(
+        SplitResult(
+            scheme="s3",
+            netloc=parsed_src.netloc,
+            path=os.path.dirname(parsed_src.path),
+            query="",
+            fragment="",
+        )
+    )
+    dest_dir = dest.parent
+    src_glob = os.path.basename(parsed_src.path)
+    cli_args = [
+        "aws",
+        "s3",
+        "cp",
+        "--recursive",
+        "--no-progress",
+        "--exclude",
+        "'*'",
+        "--include",
+        src_glob,
+        src_dir,
+        str(dest_dir),
+    ]
+    print(" ".join(cli_args))
+    subprocess.run(
+        cli_args,
+        check=True,
+        encoding="utf-8",
+    )
+
+
+def upload(args):
+    print(f"Uploading artifacts with prefix {args.prefix}...")
+    for artifact in args.artifacts:
+        artifact_path = Path(artifact)
+        s3_url = compute_s3_url(args.s3_bucket, args.prefix, artifact_path)
+        aws_s3_upload(artifact_path, s3_url)
+
+
+def download(args):
+    print(f"Downloading artifacts with prefix {args.prefix}...")
+    for artifact in args.artifacts:
+        artifact_path = Path(artifact)
+        print(f"mkdir -p {str(artifact_path.parent)}")
+        artifact_path.parent.mkdir(parents=True, exist_ok=True)
+        s3_url = compute_s3_url(args.s3_bucket, args.prefix, artifact_path)
+        if "*" in artifact:
+            aws_s3_download_with_wildcard(s3_url, artifact_path)
+        else:
+            aws_s3_download(s3_url, artifact_path)
+
+
+if __name__ == "__main__":
+    # Ensure that the current working directory is the project root
+    if not (Path.cwd() / "ops").is_dir() or not path_equals(
+        Path(__file__).parent, Path.cwd() / "ops"
+    ):
+        x = Path(__file__).name
+        raise RuntimeError(f"Script {x} must be run at the project's root directory")
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--command",
+        type=str,
+        choices=["upload", "download"],
+        required=True,
+        help="Whether to upload or download the artifact (upload/download)",
+    )
+    parser.add_argument(
+        "--s3-bucket",
+        type=str,
+        required=True,
+        help="Name of the S3 bucket to store the artifact",
+    )
+    parser.add_argument(
+        "--prefix",
+        type=str,
+        required=True,
+        help=(
+            "Where the artifact would be stored. The artifact will be stored in "
+            "s3://[s3-bucket]/[prefix]." 
+ ), + ) + parser.add_argument("artifacts", type=str, nargs="+", metavar="artifact") + parsed_args = parser.parse_args() + if parsed_args.command == "upload": + upload(parsed_args) + elif parsed_args.command == "download": + download(parsed_args) diff --git a/ops/stash_artifacts.sh b/ops/stash_artifacts.sh deleted file mode 100755 index f091af3cf50b..000000000000 --- a/ops/stash_artifacts.sh +++ /dev/null @@ -1,66 +0,0 @@ -#!/bin/bash - -## Stash an artifact in an S3 bucket for later use -## -## Note. This script takes in all inputs via environment variables -## except the path to the artifact(s). - -set -euo pipefail - -ENV_VAR_DOC=$( -cat <<-EOF -Inputs - - COMMAND: Whether to upload or download the artifact. Either "upload" or - "download" - - S3_BUCKET: Name of the S3 bucket to store the artifact - - PREFIX: Where the artifact would be stored. The artifact will be stored - in s3://{S3_BUCKET}/{prefix}/. -EOF -) - -if [ "$#" -lt 1 ]; then - echo "Usage: $0 [artifact] [artifact ...]" - exit 1 -fi - -for arg in "COMMAND" "S3_BUCKET" "PREFIX" -do - if [[ -z "${!arg:-}" ]] - then - echo -e "Error: $arg must be set.\n${ENV_VAR_DOC}" - exit 1 - fi -done - -compute_s3_url() { # args: artifact - S3_URL="s3://${S3_BUCKET}/${PREFIX}/"$(basename "$1") -} - -aws_s3_cp() { # args: src, dest - set -x - aws s3 cp --no-progress "$1" "$2" - set +x - return 0 -} - -if [[ "$COMMAND" == "upload" ]] -then - echo "Uploading artifacts with prefix $PREFIX..." - for artifact in "$@" - do - compute_s3_url "${artifact}" - aws_s3_cp "${artifact}" "${S3_URL}" - done -elif [[ "$COMMAND" == "download" ]] -then - echo "Downloading artifacts with prefix $PREFIX..." - for artifact in "$@" - do - compute_s3_url "${artifact}" - aws_s3_cp "${S3_URL}" "${artifact}" - done -else - echo "Unrecognized command: $COMMAND" - exit 2 -fi - diff --git a/ops/task/build-win64-gpu.ps1 b/ops/task/build-win64-gpu.ps1 new file mode 100644 index 000000000000..0b49d143dd5b --- /dev/null +++ b/ops/task/build-win64-gpu.ps1 @@ -0,0 +1,44 @@ +$ErrorActionPreference = "Stop" + +. ops/task/enforce-ci.ps1 + +Write-Host "--- Build libxgboost on Windows with CUDA" + +nvcc --version +if ( $is_release_branch -eq 0 ) { + $arch_flag = "-DGPU_COMPUTE_VER=75" +} else { + $arch_flag = "" +} + +git clone https://github.com/NVIDIA/cccl.git -b v2.6.1 --quiet +mkdir build +cd build +cmake .. -G"Visual Studio 17 2022" -A x64 -DUSE_CUDA=ON ` + -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DBUILD_DEPRECATED_CLI=ON ` + -DCMAKE_PREFIX_PATH="$(Get-Location)/../cccl" ${arch_flag} +if ($LASTEXITCODE -ne 0) { throw "Last command failed" } +cmake --build . --config Release -- /m /nodeReuse:false ` + "/consoleloggerparameters:ShowCommandLine;Verbosity=minimal" +if ($LASTEXITCODE -ne 0) { throw "Last command failed" } + +Write-Host "--- Build binary wheel" +cd ../python-package +conda activate +pip install --user -v "pip>=23" +pip --version +pip wheel --no-deps -v . --wheel-dir dist/ +if ($LASTEXITCODE -ne 0) { throw "Last command failed" } +python ../ops/rename_whl.py ` + --wheel-path (Get-ChildItem dist/*.whl | Select-Object -Expand FullName) ` + --commit-hash $Env:GITHUB_SHA ` + --platform-tag win_amd64 +if ($LASTEXITCODE -ne 0) { throw "Last command failed" } + +Write-Host "--- Upload Python wheel" +cd .. 
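+# Per ops/task/enforce-ci.ps1, $is_release_branch is 1 only on master and
+# release_* branches, so pull-request builds stop here without uploading.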
+if ( $is_release_branch -eq 1 ) { + aws s3 cp (Get-ChildItem python-package/dist/*.whl | Select-Object -Expand FullName) ` + s3://xgboost-nightly-builds/$Env:BRANCH_NAME/ --acl public-read --no-progress + if ($LASTEXITCODE -ne 0) { throw "Last command failed" } +} diff --git a/ops/task/enforce-ci.ps1 b/ops/task/enforce-ci.ps1 new file mode 100644 index 000000000000..9183764b9a13 --- /dev/null +++ b/ops/task/enforce-ci.ps1 @@ -0,0 +1,28 @@ +## Ensure that a script is running inside the CI. +## Usage: . ops/task/enforce-ci.ps1 + +if ( -Not $Env:GITHUB_ACTION ) { + $script_name = (Split-Path -Path $PSCommandPath -Leaf) + Write-Host "$script_name is not meant to run locally; it should run inside GitHub Actions." + Write-Host "Please inspect the content of $script_name and locate the desired command manually." + exit 1 +} + +if ( -Not $Env:BRANCH_NAME ) { + Write-Host "Make sure to define environment variable BRANCH_NAME." + exit 2 +} + +if ( $Env:GITHUB_BASE_REF ) { + $is_pull_request = 1 +} else { + $is_pull_request = 0 +} + +if ( ($Env:BRANCH_NAME -eq "master") -or ($Env:BRANCH_NAME -match "release_.+") ) { + $is_release_branch = 1 + $enforce_daily_budget = 0 +} else { + $is_release_branch = 0 + $enforce_daily_budget = 1 +} diff --git a/ops/task/enforce-ci.sh b/ops/task/enforce-ci.sh index 1e50dc045cb1..dfed11914c9a 100755 --- a/ops/task/enforce-ci.sh +++ b/ops/task/enforce-ci.sh @@ -30,8 +30,10 @@ fi if [[ $BRANCH_NAME == "master" || $BRANCH_NAME == "release_"* || $BRANCH_NAME == "federated-secure" ]] then is_release_branch=1 + enforce_daily_budget=0 else is_release_branch=0 + enforce_daily_budget=1 fi if [[ -n ${DISABLE_RELEASE:-} ]] diff --git a/ops/task/test-cpp-gpu.sh b/ops/task/test-cpp-gpu.sh new file mode 100755 index 000000000000..57090551ecad --- /dev/null +++ b/ops/task/test-cpp-gpu.sh @@ -0,0 +1,33 @@ +#!/bin/bash + +set -euo pipefail + +source ops/task/enforce-ci.sh + +if [[ "$#" -lt 1 ]] +then + echo "Usage: $0 {build-cuda,build-cuda-with-rmm}" + exit 1 +fi +arg=$1 + +case "${arg}" in + build-cuda) + echo "--- Run Google Tests with CUDA, using a GPU" + python3 ops/docker_run.py --container-id xgb-ci.gpu --use-gpus \ + --run-args='--privileged' \ + -- build/testxgboost + ;; + + build-cuda-with-rmm) + echo "--- Run Google Tests with CUDA, using a GPU, RMM enabled" + python3 ops/docker_run.py --container-id xgb-ci.gpu --use-gpus \ + --run-args='--privileged' \ + -- build/testxgboost --use-rmm-pool + ;; + + *) + echo "Unrecognized arg: ${arg}" + exit 2 + ;; +esac diff --git a/ops/task/test-win64-gpu.ps1 b/ops/task/test-win64-gpu.ps1 new file mode 100644 index 000000000000..21d8f6e7b533 --- /dev/null +++ b/ops/task/test-win64-gpu.ps1 @@ -0,0 +1,28 @@ +$ErrorActionPreference = "Stop" + +. 
ops/task/enforce-ci.ps1 + +Write-Host "--- Test XGBoost on Windows with CUDA" + +nvcc --version + +Write-Host "--- Run Google Tests" +build/testxgboost.exe +if ($LASTEXITCODE -ne 0) { throw "Last command failed" } + +Write-Host "--- Set up Python env" +conda activate +$env_name = -join("win64_", (New-Guid).ToString().replace("-", "")) +mamba env create -n ${env_name} --file=ops/docker/conda_env/win64_test.yml +conda activate ${env_name} +python -m pip install ` + (Get-ChildItem python-package/dist/*.whl | Select-Object -Expand FullName) +if ($LASTEXITCODE -ne 0) { throw "Last command failed" } + +Write-Host "--- Run Python tests" +python -X faulthandler -m pytest -v -s -rxXs --fulltrace tests/python +if ($LASTEXITCODE -ne 0) { throw "Last command failed" } +Write-Host "--- Run Python tests with GPU" +python -X faulthandler -m pytest -v -s -rxXs --fulltrace -m "(not slow) and (not mgpu)"` + tests/python-gpu +if ($LASTEXITCODE -ne 0) { throw "Last command failed" } diff --git a/tests/buildkite/build-containers.sh b/tests/buildkite/build-containers.sh deleted file mode 100755 index aa8f572483a3..000000000000 --- a/tests/buildkite/build-containers.sh +++ /dev/null @@ -1,47 +0,0 @@ -#!/bin/bash - -set -euo pipefail -set -x - -if [ "$#" -lt 1 ] -then - echo "Usage: $0 [container to build]" - exit 1 -fi -container=$1 - -source tests/buildkite/conftest.sh - -echo "--- Build container ${container}" - -BUILD_ARGS="" - -case "${container}" in - cpu) - ;; - - gpu|gpu_build_rockylinux8) - BUILD_ARGS="$BUILD_ARGS --build-arg CUDA_VERSION_ARG=$CUDA_VERSION" - BUILD_ARGS="$BUILD_ARGS --build-arg NCCL_VERSION_ARG=$NCCL_VERSION" - BUILD_ARGS="$BUILD_ARGS --build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION" - ;; - - gpu_dev_ver) - BUILD_ARGS="$BUILD_ARGS --build-arg CUDA_VERSION_ARG=$CUDA_VERSION" - BUILD_ARGS="$BUILD_ARGS --build-arg NCCL_VERSION_ARG=$NCCL_VERSION" - BUILD_ARGS="$BUILD_ARGS --build-arg RAPIDS_VERSION_ARG=$DEV_RAPIDS_VERSION" - ;; - - jvm_gpu_build) - BUILD_ARGS="$BUILD_ARGS --build-arg CUDA_VERSION_ARG=$CUDA_VERSION" - BUILD_ARGS="$BUILD_ARGS --build-arg NCCL_VERSION_ARG=$NCCL_VERSION" - ;; - - *) - echo "Unrecognized container ID: ${container}" - exit 2 - ;; -esac - -# Run a no-op command. This will simply build the container and push it to the private registry -tests/ci_build/ci_build.sh ${container} ${BUILD_ARGS} bash diff --git a/tests/buildkite/build-win64-gpu.ps1 b/tests/buildkite/build-win64-gpu.ps1 deleted file mode 100644 index 9114d3237751..000000000000 --- a/tests/buildkite/build-win64-gpu.ps1 +++ /dev/null @@ -1,55 +0,0 @@ -$ErrorActionPreference = "Stop" - -. tests/buildkite/conftest.ps1 - -Write-Host "--- Build libxgboost on Windows with CUDA" - -nvcc --version -if ( $is_release_branch -eq 0 ) { - $arch_flag = "-DGPU_COMPUTE_VER=75" -} else { - $arch_flag = "" -} -mkdir build -cd build -cmake .. -G"Visual Studio 17 2022" -A x64 -DUSE_CUDA=ON ` - -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DBUILD_DEPRECATED_CLI=ON ${arch_flag} -if ($LASTEXITCODE -ne 0) { throw "Last command failed" } -cmake --build . --config Release -- /m /nodeReuse:false ` - "/consoleloggerparameters:ShowCommandLine;Verbosity=minimal" -if ($LASTEXITCODE -ne 0) { throw "Last command failed" } - -Write-Host "--- Build binary wheel" -cd ../python-package -conda activate -& pip install --user -v "pip>=23" -& pip --version -& pip wheel --no-deps -v . --wheel-dir dist/ -Get-ChildItem . 
-Filter dist/*.whl | -Foreach-Object { - & python ../tests/ci_build/rename_whl.py ` - --wheel-path $_.FullName ` - --commit-hash $Env:BUILDKITE_COMMIT ` - --platform-tag win_amd64 - if ($LASTEXITCODE -ne 0) { throw "Last command failed" } -} - -Write-Host "--- Upload Python wheel" -cd .. -Get-ChildItem . -Filter python-package/dist/*.whl | -Foreach-Object { - & buildkite-agent artifact upload python-package/dist/$_ - if ($LASTEXITCODE -ne 0) { throw "Last command failed" } -} -if ( $is_release_branch -eq 1 ) { - Get-ChildItem . -Filter python-package/dist/*.whl | - Foreach-Object { - & aws s3 cp python-package/dist/$_ s3://xgboost-nightly-builds/$Env:BUILDKITE_BRANCH/ ` - --acl public-read --no-progress - if ($LASTEXITCODE -ne 0) { throw "Last command failed" } - } -} - -Write-Host "--- Stash C++ test executables" -& buildkite-agent artifact upload build/testxgboost.exe -& buildkite-agent artifact upload xgboost.exe diff --git a/tests/buildkite/conftest.ps1 b/tests/buildkite/conftest.ps1 deleted file mode 100644 index bd623caf0c03..000000000000 --- a/tests/buildkite/conftest.ps1 +++ /dev/null @@ -1,13 +0,0 @@ -if ( $Env:BUILDKITE_PULL_REQUEST -and ($Env:BUILDKITE_PULL_REQUEST -ne "false") ) { - $is_pull_request = 1 -} else { - $is_pull_request = 0 -} - -if ( ($Env:BUILDKITE_BRANCH -eq "master") -or ($Env:BUILDKITE_BRANCH -match "release_.+") ) { - $is_release_branch = 1 - $enforce_daily_budget = 0 -} else { - $is_release_branch = 0 - $enforce_daily_budget = 1 -} diff --git a/tests/buildkite/test-cpp-gpu.sh b/tests/buildkite/test-cpp-gpu.sh deleted file mode 100755 index d7197db2efce..000000000000 --- a/tests/buildkite/test-cpp-gpu.sh +++ /dev/null @@ -1,24 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -source tests/buildkite/conftest.sh - -echo "--- Run Google Tests with CUDA, using a GPU" -buildkite-agent artifact download "build/testxgboost" . --step build-cuda -chmod +x build/testxgboost -tests/ci_build/ci_build.sh gpu --use-gpus \ - --build-arg CUDA_VERSION_ARG=$CUDA_VERSION \ - --build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION \ - --build-arg NCCL_VERSION_ARG=$NCCL_VERSION \ - build/testxgboost - -echo "--- Run Google Tests with CUDA, using a GPU, RMM enabled" -rm -rfv build/ -buildkite-agent artifact download "build/testxgboost" . --step build-cuda-with-rmm -chmod +x build/testxgboost -tests/ci_build/ci_build.sh gpu --use-gpus \ - --build-arg CUDA_VERSION_ARG=$CUDA_VERSION \ - --build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION \ - --build-arg NCCL_VERSION_ARG=$NCCL_VERSION \ - build/testxgboost --use-rmm-pool diff --git a/tests/buildkite/test-win64-gpu.ps1 b/tests/buildkite/test-win64-gpu.ps1 deleted file mode 100644 index 95a51b50228d..000000000000 --- a/tests/buildkite/test-win64-gpu.ps1 +++ /dev/null @@ -1,39 +0,0 @@ -$ErrorActionPreference = "Stop" - -. tests/buildkite/conftest.ps1 - -Write-Host "--- Test XGBoost on Windows with CUDA" - -New-Item python-package/dist -ItemType Directory -ea 0 -New-Item build -ItemType Directory -ea 0 -buildkite-agent artifact download "python-package/dist/*.whl" . --step build-win64-gpu -if ($LASTEXITCODE -ne 0) { throw "Last command failed" } -buildkite-agent artifact download "build/testxgboost.exe" . --step build-win64-gpu -if ($LASTEXITCODE -ne 0) { throw "Last command failed" } -buildkite-agent artifact download "xgboost.exe" . 
--step build-win64-gpu -if ($LASTEXITCODE -ne 0) { throw "Last command failed" } - -nvcc --version - -Write-Host "--- Run Google Tests" -& build/testxgboost.exe -if ($LASTEXITCODE -ne 0) { throw "Last command failed" } - -Write-Host "--- Set up Python env" -conda activate -$env_name = -join("win64_", (New-Guid).ToString().replace("-", "")) -mamba env create -n ${env_name} --file=tests/ci_build/conda_env/win64_test.yml -conda activate ${env_name} -Get-ChildItem . -Filter python-package/dist/*.whl | -Foreach-Object { - & python -m pip install python-package/dist/$_ - if ($LASTEXITCODE -ne 0) { throw "Last command failed" } -} - -Write-Host "--- Run Python tests" -python -X faulthandler -m pytest -v -s -rxXs --fulltrace tests/python -if ($LASTEXITCODE -ne 0) { throw "Last command failed" } -Write-Host "--- Run Python tests with GPU" -python -X faulthandler -m pytest -v -s -rxXs --fulltrace -m "(not slow) and (not mgpu)"` - tests/python-gpu -if ($LASTEXITCODE -ne 0) { throw "Last command failed" } From f387555cbe8dbfa384ca2ed294176dc4fff96dae Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Wed, 30 Oct 2024 10:42:47 -0700 Subject: [PATCH 12/86] Simplify stash_artifacts --- .github/workflows/main.yml | 59 ++++++++++++++--------------------- .github/workflows/windows.yml | 22 ++++++------- ops/stash_artifacts.ps1 | 47 ++++++++++++++++++++++++++++ ops/stash_artifacts.sh | 39 +++++++++++++++++++++++ 4 files changed, 120 insertions(+), 47 deletions(-) create mode 100644 ops/stash_artifacts.ps1 create mode 100755 ops/stash_artifacts.sh diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index d1f1d2e3f0b6..d0e33f87d70a 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -13,7 +13,6 @@ env: BRANCH_NAME: >- ${{ github.event.pull_request.number && 'PR-' }}${{ github.event.pull_request.number || github.ref_name }} USE_DOCKER_CACHE: 1 - ARTIFACT_STASH_PREFIX: cache/${{ github.repository }}/stash/${{ github.run_id }} jobs: build-containers: @@ -83,12 +82,10 @@ jobs: CONTAINER_ID: xgb-ci.cpu - run: bash ops/task/build-cpu.sh - name: Stash CLI executable - run: | - python3 ops/stash_artifacts.py \ - --command upload \ - --s3-bucket ${{ env.RUNS_ON_S3_BUCKET_CACHE }} \ - --prefix ${{ env.ARTIFACT_STASH_PREFIX }}/build-cpu \ - -- ./xgboost + run: bash ops/stash_artifacts.sh ./xgboost + env: + COMMAND: upload + KEY: build-cpu build-cpu-arm64: name: Build CPU ARM64 + manylinux_2_28_aarch64 wheel @@ -108,12 +105,10 @@ jobs: CONTAINER_ID: xgb-ci.aarch64 - run: bash ops/task/build-cpu-arm64.sh - name: Stash files - run: | - python3 ops/stash_artifacts.py \ - --command upload \ - --s3-bucket ${{ env.RUNS_ON_S3_BUCKET_CACHE }} \ - --prefix ${{ env.ARTIFACT_STASH_PREFIX }}/build-cpu-arm64 \ - -- ./xgboost python-package/dist/*.whl + run: bash ops/stash_artifacts.sh ./xgboost python-package/dist/*.whl + env: + COMMAND: upload + KEY: build-cpu-arm64 build-cuda: name: Build CUDA + manylinux_2_28_x86_64 wheel @@ -137,12 +132,10 @@ jobs: CONTAINER_ID: xgb-ci.manylinux_2_28_x86_64 - run: bash ops/task/build-cuda.sh - name: Stash files - run: | - python3 ops/stash_artifacts.py \ - --command upload \ - --s3-bucket ${{ env.RUNS_ON_S3_BUCKET_CACHE }} \ - --prefix ${{ env.ARTIFACT_STASH_PREFIX }}/build-cuda \ - -- build/testxgboost python-package/dist/*.whl + run: bash ops/stash_artifacts.sh build/testxgboost python-package/dist/*.whl + env: + COMMAND: upload + KEY: build-cuda build-cuda-with-rmm: name: Build CUDA with RMM @@ -166,12 +159,10 @@ jobs: CONTAINER_ID: 
xgb-ci.manylinux_2_28_x86_64
       - run: bash ops/task/build-cuda-with-rmm.sh
       - name: Stash files
-        run: |
-          python3 ops/stash_artifacts.py \
-            --command upload \
-            --s3-bucket ${{ env.RUNS_ON_S3_BUCKET_CACHE }} \
-            --prefix ${{ env.ARTIFACT_STASH_PREFIX }}/build-cuda-with-rmm \
-            -- build/testxgboost
+        run: bash ops/stash_artifacts.sh build/testxgboost
+        env:
+          COMMAND: upload
+          KEY: build-cuda-with-rmm

   build-jvm-manylinux2014:
     name: Build libxgboost4j.so targeting glibc 2.17
@@ -242,19 +233,17 @@ jobs:
           CONTAINER_ID: xgb-ci.gpu
       - name: Unstash gtest
         run: |
-          python3 ops/stash_artifacts.py \
-            --command download \
-            --s3-bucket ${{ env.RUNS_ON_S3_BUCKET_CACHE }} \
-            --prefix ${{ env.ARTIFACT_STASH_PREFIX }}/build-cuda \
-            -- build/testxgboost
+          bash ops/stash_artifacts.sh build/testxgboost
           chmod +x build/testxgboost
+        env:
+          COMMAND: download
+          KEY: build-cuda
       - run: bash ops/task/test-cpp-gpu.sh build-cuda
       - name: Unstash gtest
         run: |
-          python3 ops/stash_artifacts.py \
-            --command download \
-            --s3-bucket ${{ env.RUNS_ON_S3_BUCKET_CACHE }} \
-            --prefix ${{ env.ARTIFACT_STASH_PREFIX }}/build-cuda-with-rmm \
-            -- build/testxgboost
+          bash ops/stash_artifacts.sh build/testxgboost
           chmod +x build/testxgboost
+        env:
+          COMMAND: download
+          KEY: build-cuda-with-rmm
       - run: bash ops/task/test-cpp-gpu.sh build-cuda-with-rmm
diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml
index 6edc14711258..76388302f49f 100644
--- a/.github/workflows/windows.yml
+++ b/.github/workflows/windows.yml
@@ -33,13 +33,12 @@ jobs:
       - run: powershell ops/task/build-win64-gpu.ps1
       - name: Stash files
         run: |
-          conda activate
-          python ops/stash_artifacts.py `
-            --command upload `
-            --s3-bucket ${{ env.RUNS_ON_S3_BUCKET_CACHE }} `
-            --prefix ${{ env.ARTIFACT_STASH_PREFIX }}/build-win64-gpu `
-            -- build/testxgboost.exe xgboost.exe `
+          powershell ops/stash_artifacts.ps1 `
+            build/testxgboost.exe xgboost.exe `
             (Get-ChildItem python-package/dist/*.whl | Select-Object -Expand FullName)
+        env:
+          COMMAND: upload
+          KEY: build-win64-gpu
   test-win64-gpu:
     name: Test XGBoost on Windows
     needs: build-win64-gpu
@@ -52,10 +51,9 @@ jobs:
           submodules: "true"
       - name: Unstash files
         run: |
-          conda activate
-          python ops/stash_artifacts.py `
-            --command download `
-            --s3-bucket ${{ env.RUNS_ON_S3_BUCKET_CACHE }} `
-            --prefix ${{ env.ARTIFACT_STASH_PREFIX }}/build-win64-gpu `
-            -- build/testxgboost.exe xgboost.exe python-package/dist/*.whl
+          powershell ops/stash_artifacts.ps1 `
+            build/testxgboost.exe xgboost.exe python-package/dist/*.whl
+        env:
+          COMMAND: download
+          KEY: build-win64-gpu
       - run: powershell ops/task/test-win64-gpu.ps1
diff --git a/ops/stash_artifacts.ps1 b/ops/stash_artifacts.ps1
new file mode 100644
index 000000000000..2f8cbaf0a855
--- /dev/null
+++ b/ops/stash_artifacts.ps1
@@ -0,0 +1,47 @@
+[CmdletBinding()]
+Param(
+  [Parameter(
+    Mandatory=$true,
+    Position=0,
+    ValueFromRemainingArguments=$true
+  )][string[]]$artifacts
+)
+
+## Convenience wrapper for ops/stash_artifacts.py
+## Meant to be used inside GitHub Actions
+
+$ENV_VAR_DOC = @'
+Inputs
+  - COMMAND: Either "upload" or "download"
+  - KEY: Unique string to identify a group of artifacts
+'@
+
+$ErrorActionPreference = "Stop"
+
+. ops/task/enforce-ci.ps1
+
+foreach ($env in "COMMAND", "KEY", "GITHUB_REPOSITORY", "GITHUB_RUN_ID",
+         "RUNS_ON_S3_BUCKET_CACHE") {
+  $val = [Environment]::GetEnvironmentVariable($env)
+  if ($val -eq $null) {
+    Write-Host "Error: $env must be set.`n${ENV_VAR_DOC}"
+    exit 1
+  }
+}
+
+$artifact_stash_prefix = "cache/${Env:GITHUB_REPOSITORY}/stash/${Env:GITHUB_RUN_ID}"
+
+conda activate
+
+Write-Host @"
+python ops/stash_artifacts.py `
+  --command "${Env:COMMAND}" `
+  --s3-bucket "${Env:RUNS_ON_S3_BUCKET_CACHE}" `
+  --prefix "${artifact_stash_prefix}/${Env:KEY}" `
+  -- $artifacts
+"@
+python ops/stash_artifacts.py `
+  --command "${Env:COMMAND}" `
+  --s3-bucket "${Env:RUNS_ON_S3_BUCKET_CACHE}" `
+  --prefix "${artifact_stash_prefix}/${Env:KEY}" `
+  -- $artifacts
diff --git a/ops/stash_artifacts.sh b/ops/stash_artifacts.sh
new file mode 100755
index 000000000000..c796831a963d
--- /dev/null
+++ b/ops/stash_artifacts.sh
@@ -0,0 +1,39 @@
+#!/bin/bash
+
+## Convenience wrapper for ops/stash_artifacts.py
+## Meant to be used inside GitHub Actions
+
+ENV_VAR_DOC=$(
+cat <<-EOF
+Inputs
+  - COMMAND: Either "upload" or "download"
+  - KEY: Unique string to identify a group of artifacts
+EOF
+)
+
+set -euo pipefail
+
+source ops/task/enforce-ci.sh
+
+if [ "$#" -lt 1 ]; then
+  echo "Usage: $0 [artifact] [artifact ...]"
+  exit 1
+fi
+
+for arg in "COMMAND" "KEY" "GITHUB_REPOSITORY" "GITHUB_RUN_ID" "RUNS_ON_S3_BUCKET_CACHE"
+do
+  if [[ -z "${!arg:-}" ]]
+  then
+    echo -e "Error: $arg must be set.\n${ENV_VAR_DOC}"
+    exit 1
+  fi
+done
+
+artifact_stash_prefix="cache/${GITHUB_REPOSITORY}/stash/${GITHUB_RUN_ID}"
+
+set -x
+python3 ops/stash_artifacts.py \
+  --command "${COMMAND}" \
+  --s3-bucket "${RUNS_ON_S3_BUCKET_CACHE}" \
+  --prefix "${artifact_stash_prefix}/${KEY}" \
+  -- "$@"
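[Note] Together with the Python implementation, the two wrappers above give every job the same two-knob interface: COMMAND selects upload or download, KEY names the artifact group, and the S3 prefix is derived from the repository and run ID. A minimal sketch of the intended call pattern, assuming GITHUB_REPOSITORY, GITHUB_RUN_ID, and RUNS_ON_S3_BUCKET_CACHE are already provided by the runner:

    ## Stash artifacts under the key "build-cuda" in one job...
    COMMAND=upload KEY=build-cuda \
      bash ops/stash_artifacts.sh build/testxgboost python-package/dist/*.whl

    ## ...and restore them in a later job of the same workflow run.
    COMMAND=download KEY=build-cuda \
      bash ops/stash_artifacts.sh build/testxgboost python-package/dist/*.whl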
From 91eee2d761b852eab47d1c71a86db5afb0ef5419 Mon Sep 17 00:00:00 2001
From: Hyunsu Cho
Date: Wed, 30 Oct 2024 12:10:05 -0700
Subject: [PATCH 13/86] Migrate Python tests

---
 .github/workflows/main.yml               |  83 ++++++++++++++---
 ops/matrix/ci_container.yml              |  12 ++-
 ops/task/test-cpp-gpu.sh                 |  17 +++-
 ops/task/test-python.sh                  |  82 +++++++++++++++++
 tests/buildkite/test-cpp-mgpu.sh         |  17 ----
 tests/buildkite/test-python-cpu-arm64.sh |  11 ---
 tests/buildkite/test-python-cpu.sh       |  16 ----
 tests/buildkite/test-python-gpu.sh       |  59 ------------
 tests/ci_build/test_python.sh            | 111 -----------------------
 9 files changed, 176 insertions(+), 232 deletions(-)
 create mode 100755 ops/task/test-python.sh
 delete mode 100755 tests/buildkite/test-cpp-mgpu.sh
 delete mode 100755 tests/buildkite/test-python-cpu-arm64.sh
 delete mode 100755 tests/buildkite/test-python-cpu.sh
 delete mode 100755 tests/buildkite/test-python-gpu.sh
 delete mode 100755 tests/ci_build/test_python.sh

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index d0e33f87d70a..00e9b5abb844 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -216,11 +216,23 @@ jobs:
       - run: bash ops/task/build-manylinux2014.sh ${{ matrix.arch }}

   test-cpp-gpu:
-    name: Run Google Tests
-    needs: build-cuda
+    name: Run Google Tests with GPU(s)
+    needs: [build-cuda, build-cuda-with-rmm]
     runs-on:
       - runs-on=${{ github.run_id }}
-      - runner=linux-amd64-gpu
+      - runner=${{ matrix.runner }}
+    strategy:
+      matrix:
+        include:
+          - suite: gpu
+            runner: linux-amd64-gpu
+            artifact_from: build-cuda
+          - suite: gpu-rmm
+            runner: linux-amd64-gpu
+            artifact_from: build-cuda-with-rmm
+          - suite: mgpu
+            runner: linux-amd64-mgpu
+            artifact_from: build-cuda
     steps:
       # Restart Docker daemon so that it recognizes the ephemeral disks
       - run: sudo systemctl restart docker
       - uses: actions/checkout@v4
         with:
           submodules: "true"
       - name: Fetch container from cache
         run: bash ops/docker_build.sh
         env:
           CONTAINER_ID: xgb-ci.gpu
       - name: Unstash gtest
         run: |
           bash ops/stash_artifacts.sh build/testxgboost
           chmod +x build/testxgboost
         env:
           COMMAND: download
-          KEY: build-cuda
-      - run: bash ops/task/test-cpp-gpu.sh build-cuda
-      - name: Unstash gtest
-        run: |
-          bash ops/stash_artifacts.sh build/testxgboost
-          chmod +x build/testxgboost
-        env:
-          COMMAND: download
-          KEY: build-cuda-with-rmm
-      - run: bash ops/task/test-cpp-gpu.sh build-cuda-with-rmm
+          KEY: ${{ matrix.artifact_from }}
+      - run: bash ops/task/test-cpp-gpu.sh ${{ matrix.suite }}
+
+  test-python:
+    name: Run Python tests
+    needs: [build-cuda]
+    runs-on:
+      - runs-on=${{ github.run_id }}
+      - runner=${{ matrix.runner }}
+    strategy:
+      matrix:
+        include:
+          - description: "single GPU"
+            container: xgb-ci.gpu
+            suite: gpu
+            runner: linux-amd64-gpu
+            artifact_from: build-cuda
+          - description: "single GPU, nightly deps"
+            container: xgb-ci.gpu_dev_ver
+            suite: gpu
+            runner: linux-amd64-gpu
+            artifact_from: build-cuda
+          - description: "multiple GPUs"
+            container: xgb-ci.gpu
+            suite: mgpu
+            runner: linux-amd64-mgpu
+            artifact_from: build-cuda
+          - description: "multiple GPUs, nightly deps"
+            container: xgb-ci.gpu_dev_ver
+            suite: mgpu
+            runner: linux-amd64-mgpu
+            artifact_from: build-cuda
+          - description: "CPU"
+            container: xgb-ci.cpu
+            suite: cpu
+            runner: linux-amd64-cpu
+            artifact_from: build-cuda
+          - description: "CPU ARM64"
+            container: xgb-ci.aarch64
+            suite: cpu-arm64
+            runner: linux-arm64-cpu
+            artifact_from: build-cpu-arm64
+    steps:
+      # Restart Docker daemon so that it recognizes the ephemeral disks
+      - run: sudo systemctl restart docker
+      - uses: actions/checkout@v4
+        with:
+          submodules: "true"
+      - name: Fetch container from cache
+        run: bash ops/docker_build.sh
+        env:
+          CONTAINER_ID: ${{ matrix.container }}
+      - name: Unstash Python wheel
+        run: bash ops/stash_artifacts.sh python-package/dist/*.whl
+        env:
+          COMMAND: download
+          KEY: build-cuda
+      - name: Run Python tests, ${{ matrix.description }}
+        run: bash ops/task/test-python.sh ${{ matrix.suite }} ${{ matrix.container }}
diff --git a/ops/matrix/ci_container.yml b/ops/matrix/ci_container.yml
index d57d63d99e5c..fb0ae62325cd 100644
--- a/ops/matrix/ci_container.yml
+++ b/ops/matrix/ci_container.yml
@@ -17,14 +17,22 @@ xgb-ci.gpu:
     NCCL_VERSION_ARG: "2.22.3-1"
     RAPIDS_VERSION_ARG: "24.10"

-xgb-ci.cpu:
-  container_def: cpu
+xgb-ci.gpu_dev_ver:
+  container_def: gpu
+  build_args:
+    CUDA_VERSION_ARG: "12.5.1"
+    NCCL_VERSION_ARG: "2.22.3-1"
+    RAPIDS_VERSION_ARG: "24.12"
+    RAPIDSAI_CONDA_CHANNEL_ARG: "rapidsai-nightly"

 xgb-ci.clang_tidy:
   container_def: clang_tidy
   build_args:
     CUDA_VERSION_ARG: "12.5.1"

+xgb-ci.cpu:
+  container_def: cpu
+
 xgb-ci.aarch64:
   container_def: aarch64
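[Note] Suite names now travel from the workflow matrix straight into positional arguments, so each matrix leg reduces to one command. A sketch of what the legs defined above end up executing (container IDs resolve through ops/matrix/ci_container.yml; the enforce-ci guard stops these invocations outside GitHub Actions):

    ## C++ legs: Google Tests in the xgb-ci.gpu container
    bash ops/task/test-cpp-gpu.sh gpu        # single GPU
    bash ops/task/test-cpp-gpu.sh gpu-rmm    # single GPU, RMM pool enabled
    bash ops/task/test-cpp-gpu.sh mgpu       # multiple GPUs

    ## Python legs: suite name plus the container to run it in
    bash ops/task/test-python.sh gpu xgb-ci.gpu_dev_ver    # nightly RAPIDS deps
    bash ops/task/test-python.sh cpu-arm64 xgb-ci.aarch64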
diff --git a/ops/task/test-cpp-gpu.sh b/ops/task/test-cpp-gpu.sh
index 57090551ecad..96d11bc9940b 100755
--- a/ops/task/test-cpp-gpu.sh
+++ b/ops/task/test-cpp-gpu.sh
@@ -6,26 +6,33 @@ source ops/task/enforce-ci.sh

 if [[ "$#" -lt 1 ]]
 then
-  echo "Usage: $0 {build-cuda,build-cuda-with-rmm}"
+  echo "Usage: $0 {gpu,gpu-rmm,mgpu}"
   exit 1
 fi
 arg=$1

 case "${arg}" in
-  build-cuda)
-    echo "--- Run Google Tests with CUDA, using a GPU"
+  gpu)
+    echo "--- Run Google Tests, using a single GPU"
     python3 ops/docker_run.py --container-id xgb-ci.gpu --use-gpus \
       --run-args='--privileged' \
       -- build/testxgboost
     ;;

-  build-cuda-with-rmm)
-    echo "--- Run Google Tests with CUDA, using a GPU, RMM enabled"
+  gpu-rmm)
+    echo "--- Run Google Tests, using a single GPU, RMM enabled"
     python3 ops/docker_run.py --container-id xgb-ci.gpu --use-gpus \
       --run-args='--privileged' \
       -- build/testxgboost --use-rmm-pool
     ;;

+  mgpu)
+    echo "--- Run Google Tests, using multiple GPUs"
+    python3 ops/docker_run.py --container-id xgb-ci.gpu --use-gpus \
+      --run-args='--privileged --shm-size=4g' \
+      -- build/testxgboost --gtest_filter=*MGPU*
+    ;;
+
   *)
     echo "Unrecognized arg: ${arg}"
     exit 2
diff --git a/ops/task/test-python.sh b/ops/task/test-python.sh
new file mode 100755
index 000000000000..99f8b0b42277
--- /dev/null
+++ b/ops/task/test-python.sh
@@ -0,0 +1,82 @@
+#!/bin/bash
+
+set -euo pipefail
+
+source ops/task/enforce-ci.sh
+
+if [[ "$#" -lt 2 ]]
+then
+  echo "Usage: $0 {gpu|mgpu|cpu|cpu-arm64} {container_id}"
+  exit 1
+fi
+
+suite="$1"
+container_id="$2"
+
+tee test-python-wrapper.sh <<-'EOF'
+#!/bin/bash
+set -euox pipefail
+
+source activate "$1"
+export PYSPARK_DRIVER_PYTHON=$(which python)
+export PYSPARK_PYTHON=$(which python)
+export SPARK_TESTING=1
+
+pip install -v ./python-package/dist/*.whl
+EOF
+
+case "$suite" in
+  gpu)
+    echo "--- Run Python tests, using a single GPU"
+    echo "
+      python -c 'from cupy.cuda import jitify; jitify._init_module()'
+      pytest -v -s -rxXs --fulltrace --durations=0 -m 'not mgpu' tests/python-gpu
+    " | tee -a test-python-wrapper.sh
+    python3 ops/docker_run.py --container-id "${container_id}" --use-gpus \
+      --run-args='--privileged' \
+      -- bash test-python-wrapper.sh gpu_test
+    ;;
+
+  mgpu)
+    echo "--- Run Python tests, using multiple GPUs"
+    echo "
+      python -c 'from cupy.cuda import jitify; jitify._init_module()'
+      pytest -v -s -rxXs --fulltrace --durations=0 -m 'mgpu' tests/python-gpu
+      pytest -v -s -rxXs --fulltrace --durations=0 -m 'mgpu' tests/test_distributed/test_gpu_with_dask
+      pytest -v -s -rxXs --fulltrace --durations=0 -m 'mgpu' tests/test_distributed/test_gpu_with_spark
+      pytest -v -s -rxXs --fulltrace --durations=0 -m 'mgpu' tests/test_distributed/test_gpu_federated
+    " | tee -a test-python-wrapper.sh
+    python3 ops/docker_run.py --container-id "${container_id}" --use-gpus \
+      --run-args='--privileged --shm-size=4g' \
+      -- bash test-python-wrapper.sh gpu_test
+    ;;
+
+  cpu)
+    echo "--- Run Python tests (CPU)"
+    echo "
+      export RAY_OBJECT_STORE_ALLOW_SLOW_STORAGE=1
+      pytest -v -s -rxXs --fulltrace --durations=0 tests/python
+      pytest -v -s -rxXs --fulltrace --durations=0 tests/test_distributed/test_with_dask
+      pytest -v -s -rxXs --fulltrace --durations=0 tests/test_distributed/test_with_spark
+      pytest -v -s -rxXs --fulltrace --durations=0 tests/test_distributed/test_federated
+    " | tee -a test-python-wrapper.sh
+    python3 ops/docker_run.py --container-id "${container_id}" \
+      -- bash test-python-wrapper.sh linux_cpu_test
+    ;;
+
+  cpu-arm64)
+    echo "--- Run Python tests (CPU, ARM64)"
+    echo "
+      pytest -v -s -rxXs --fulltrace --durations=0 \\
+        tests/python/test_basic.py tests/python/test_basic_models.py \\
+        tests/python/test_model_compatibility.py
+    " | tee -a test-python-wrapper.sh
+    python3 ops/docker_run.py --container-id "${container_id}" \
+      -- bash test-python-wrapper.sh aarch64_test
+    ;;
+
+  *)
+    echo "Unrecognized argument: $suite"
+    exit 1
+    ;;
+esac
diff --git a/tests/buildkite/test-cpp-mgpu.sh b/tests/buildkite/test-cpp-mgpu.sh
deleted file mode 100755
index 65614b191d04..000000000000
--- a/tests/buildkite/test-cpp-mgpu.sh
+++ /dev/null
@@ -1,17 +0,0 @@
-#!/bin/bash
-
-set -euo pipefail
-
-source tests/buildkite/conftest.sh
-
-# Allocate extra space in /dev/shm to enable NCCL
-export CI_DOCKER_EXTRA_PARAMS_INIT='--shm-size=4g'
-
-echo "--- Run Google Tests with CUDA, using multiple GPUs"
-buildkite-agent artifact download 
"build/testxgboost" . --step build-cuda -chmod +x build/testxgboost -tests/ci_build/ci_build.sh gpu --use-gpus \ - --build-arg CUDA_VERSION_ARG=$CUDA_VERSION \ - --build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION \ - --build-arg NCCL_VERSION_ARG=$NCCL_VERSION \ - build/testxgboost --gtest_filter=*MGPU* diff --git a/tests/buildkite/test-python-cpu-arm64.sh b/tests/buildkite/test-python-cpu-arm64.sh deleted file mode 100755 index 68a428034073..000000000000 --- a/tests/buildkite/test-python-cpu-arm64.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -source tests/buildkite/conftest.sh - -echo "--- Test Python CPU ARM64" -buildkite-agent artifact download "python-package/dist/*.whl" . --step build-cpu-arm64 -buildkite-agent artifact download "xgboost" . --step build-cpu-arm64 -chmod +x ./xgboost -tests/ci_build/ci_build.sh aarch64 tests/ci_build/test_python.sh cpu-arm64 diff --git a/tests/buildkite/test-python-cpu.sh b/tests/buildkite/test-python-cpu.sh deleted file mode 100755 index 6c53dc2821bc..000000000000 --- a/tests/buildkite/test-python-cpu.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -echo "--- Test CPU code in Python env" - -source tests/buildkite/conftest.sh - -mkdir -pv python-package/dist -buildkite-agent artifact download "python-package/dist/*.whl" . --step build-cuda -buildkite-agent artifact download "xgboost" . --step build-cpu -chmod +x ./xgboost - -export BUILDKITE_ANALYTICS_TOKEN=$(get_aws_secret buildkite/test_analytics/cpu) -set_buildkite_env_vars_in_container -tests/ci_build/ci_build.sh cpu tests/ci_build/test_python.sh cpu diff --git a/tests/buildkite/test-python-gpu.sh b/tests/buildkite/test-python-gpu.sh deleted file mode 100755 index d7bd729a2e01..000000000000 --- a/tests/buildkite/test-python-gpu.sh +++ /dev/null @@ -1,59 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -if [ "$#" -lt 1 ] -then - suite='' - args='' -else - suite=$1 - shift 1 - args="$@" -fi - -source tests/buildkite/conftest.sh - -echo "--- Fetch build artifacts" -buildkite-agent artifact download "python-package/dist/*.whl" . --step build-cuda -buildkite-agent artifact download "build/testxgboost" . 
--step build-cuda -chmod +x build/testxgboost - -# Allocate extra space in /dev/shm to enable NCCL -export CI_DOCKER_EXTRA_PARAMS_INIT='--shm-size=4g' - -if [[ -z "${USE_DEPS_DEV_VER-}" ]] -then - container_tag='gpu' - rapids_version=${RAPIDS_VERSION} -else - container_tag='gpu_dev_ver' - rapids_version=${DEV_RAPIDS_VERSION} -fi - -command_wrapper="tests/ci_build/ci_build.sh ${container_tag} --use-gpus --build-arg "` - `"CUDA_VERSION_ARG=$CUDA_VERSION --build-arg "` - `"RAPIDS_VERSION_ARG=${rapids_version} --build-arg "` - `"NCCL_VERSION_ARG=$NCCL_VERSION" - -# Run specified test suite -case "$suite" in - gpu) - export BUILDKITE_ANALYTICS_TOKEN=$(get_aws_secret buildkite/test_analytics/gpu) - set_buildkite_env_vars_in_container - echo "--- Test XGBoost Python package, single GPU" - $command_wrapper tests/ci_build/test_python.sh $suite - ;; - - mgpu) - export BUILDKITE_ANALYTICS_TOKEN=$(get_aws_secret buildkite/test_analytics/mgpu) - set_buildkite_env_vars_in_container - echo "--- Test XGBoost Python package, 4 GPUs" - $command_wrapper tests/ci_build/test_python.sh $suite - ;; - - *) - echo "Usage: $0 {gpu|mgpu} [extra args to pass to pytest]" - exit 1 - ;; -esac diff --git a/tests/ci_build/test_python.sh b/tests/ci_build/test_python.sh deleted file mode 100755 index a1a023046e5b..000000000000 --- a/tests/ci_build/test_python.sh +++ /dev/null @@ -1,111 +0,0 @@ -#!/bin/bash -set -e - -if [ "$#" -lt 1 ] -then - suite='' - args='' -else - suite=$1 - shift 1 - args="$@" -fi - -# Install XGBoost Python package -function install_xgboost { - wheel_found=0 - pip install --upgrade pip --user - for file in python-package/dist/*.whl - do - if [ -e "${file}" ] - then - pip install --user "${file}" - wheel_found=1 - break # need just one - fi - done - if [ "$wheel_found" -eq 0 ] - then - pushd . - cd python-package - pip install --user -v . 
- popd - fi -} - -function setup_pyspark_envs { - export PYSPARK_DRIVER_PYTHON=`which python` - export PYSPARK_PYTHON=`which python` - export SPARK_TESTING=1 -} - -function unset_pyspark_envs { - unset PYSPARK_DRIVER_PYTHON - unset PYSPARK_PYTHON - unset SPARK_TESTING -} - -function uninstall_xgboost { - pip uninstall -y xgboost -} - -# Run specified test suite -case "$suite" in - gpu) - source activate gpu_test - set -x - install_xgboost - setup_pyspark_envs - python -c 'from cupy.cuda import jitify; jitify._init_module()' - pytest -v -s -rxXs --fulltrace --durations=0 -m "not mgpu" ${args} tests/python-gpu - unset_pyspark_envs - uninstall_xgboost - set +x - ;; - - mgpu) - source activate gpu_test - set -x - install_xgboost - setup_pyspark_envs - python -c 'from cupy.cuda import jitify; jitify._init_module()' - pytest -v -s -rxXs --fulltrace --durations=0 -m "mgpu" ${args} tests/python-gpu - pytest -v -s -rxXs --fulltrace --durations=0 -m "mgpu" ${args} tests/test_distributed/test_gpu_with_dask - pytest -v -s -rxXs --fulltrace --durations=0 -m "mgpu" ${args} tests/test_distributed/test_gpu_with_spark - pytest -v -s -rxXs --fulltrace --durations=0 -m "mgpu" ${args} tests/test_distributed/test_gpu_federated - unset_pyspark_envs - uninstall_xgboost - set +x - ;; - - cpu) - source activate linux_cpu_test - set -x - install_xgboost - export RAY_OBJECT_STORE_ALLOW_SLOW_STORAGE=1 - setup_pyspark_envs - pytest -v -s -rxXs --fulltrace --durations=0 ${args} tests/python - pytest -v -s -rxXs --fulltrace --durations=0 ${args} tests/test_distributed/test_with_dask - pytest -v -s -rxXs --fulltrace --durations=0 ${args} tests/test_distributed/test_with_spark - pytest -v -s -rxXs --fulltrace --durations=0 ${args} tests/test_distributed/test_federated - unset_pyspark_envs - uninstall_xgboost - set +x - ;; - - cpu-arm64) - source activate aarch64_test - set -x - install_xgboost - setup_pyspark_envs - pytest -v -s -rxXs --fulltrace --durations=0 ${args} tests/python/test_basic.py tests/python/test_basic_models.py tests/python/test_model_compatibility.py - unset_pyspark_envs - uninstall_xgboost - set +x - ;; - - *) - echo "Usage: $0 {gpu|mgpu|cpu|cpu-arm64} [extra args to pass to pytest]" - exit 1 - ;; -esac From 26fff3826da530931ea48833a018ef3bf686def7 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Wed, 30 Oct 2024 16:59:57 -0700 Subject: [PATCH 14/86] Mass renaming; Migrate R GPU pkg build + MacOS --- .github/runs-on.yml | 4 +- .github/workflows/macos.yml | 24 ++ .github/workflows/main.yml | 31 +- .github/workflows/windows.yml | 5 +- dev/prepare_jvm_release.py | 2 +- {tests/ci_build => ops}/build_jvm_doc.sh | 11 +- .../ci_build => ops}/build_r_pkg_with_cuda.sh | 12 +- .../build-via-cmake.sh => build_via_cmake.sh} | 0 {dev => ops}/change_scala_version.py | 0 {tests/ci_build => ops}/change_version.py | 0 ops/{matrix => docker}/ci_container.yml | 2 +- ops/{matrix => docker}/docker_cache_ecr.yml | 0 .../{ => dockerfile}/Dockerfile.aarch64 | 0 .../{ => dockerfile}/Dockerfile.clang_tidy | 0 ops/docker/{ => dockerfile}/Dockerfile.cpu | 0 ops/docker/{ => dockerfile}/Dockerfile.gpu | 0 .../Dockerfile.gpu_build_r_rockylinux8 | 0 .../Dockerfile.gpu_build_rockylinux8 | 0 ops/docker/{ => dockerfile}/Dockerfile.i386 | 0 ops/docker/{ => dockerfile}/Dockerfile.jvm | 0 .../{ => dockerfile}/Dockerfile.jvm_cross | 0 .../{ => dockerfile}/Dockerfile.jvm_gpu_build | 0 .../Dockerfile.manylinux2014_aarch64 | 0 .../Dockerfile.manylinux2014_x86_64 | 0 .../Dockerfile.manylinux_2_28_x86_64 | 0 ops/docker/entrypoint.sh | 11 +- 
ops/{matrix => docker}/extract_build_args.jq | 0 ops/{matrix => docker}/extract_build_args.sh | 8 +- ops/docker_build.py | 7 +- ops/docker_build.sh | 8 +- {tests/ci_build => ops}/lint_cmake.sh | 0 {tests/ci_build => ops}/lint_cpp.py | 0 {tests/ci_build => ops}/lint_python.py | 0 {tests/ci_build => ops}/lint_r.R | 0 .../patches => patch}/cpu_only_pypkg.patch | 0 .../manylinux2014_warning.patch | 0 .../patches => patch}/remove_nccl_dep.patch | 0 ops/{task => pipeline}/build-cpu-arm64.sh | 4 +- ops/{task => pipeline}/build-cpu.sh | 6 +- ops/{task => pipeline}/build-cuda-with-rmm.sh | 14 +- ops/{task => pipeline}/build-cuda.sh | 14 +- .../pipeline}/build-gpu-rpkg.sh | 12 +- .../pipeline}/build-jvm-doc.sh | 6 +- .../pipeline}/build-jvm-macos-m1.sh | 8 +- .../build-jvm-manylinux2014.sh | 2 +- ops/{task => pipeline}/build-manylinux2014.sh | 10 +- ops/{task => pipeline}/build-win64-gpu.ps1 | 10 +- ops/{task => pipeline}/enforce-ci.ps1 | 2 +- ops/{task => pipeline}/enforce-ci.sh | 2 +- ops/{task => pipeline}/run-clang-tidy.sh | 2 +- ops/{task => pipeline}/test-cpp-gpu.sh | 2 +- ops/{task => pipeline}/test-python.sh | 5 +- ops/{task => pipeline}/test-win64-gpu.ps1 | 2 +- ops/stash_artifacts.ps1 | 2 +- ops/stash_artifacts.sh | 2 +- {tests/ci_build => ops}/test_r_package.py | 0 {tests/ci_build => ops}/test_tidy.cc | 0 {tests/ci_build => ops}/test_utils.py | 0 {tests/ci_build => ops}/tidy.py | 0 .../update-rapids.sh => ops/update_rapids.sh | 0 {tests/ci_build => ops}/verify_link.sh | 0 tests/buildkite/infrastructure/README.md | 106 ------ .../agent-iam-policy-template.yml | 32 -- .../aws-stack-creator/create_stack.py | 127 ------- .../aws-stack-creator/metadata.py | 114 ------ .../infrastructure/common_blocks/utils.py | 97 ----- .../buildkite/infrastructure/requirements.txt | 2 - .../service-user/create_service_user.py | 44 --- .../service-user/service-user-template.yml | 349 ------------------ .../create_worker_image_pipelines.py | 85 ----- .../ec2-image-builder-pipeline-template.yml | 108 ------ .../linux-amd64-gpu-bootstrap.yml | 24 -- .../worker-image-pipeline/metadata.py | 18 - .../worker-image-pipeline/run_pipelines.py | 22 -- .../windows-gpu-bootstrap.yml | 71 ---- tests/buildkite/pipeline-mac-m1.yml | 13 - tests/buildkite/pipeline-mgpu.yml | 16 - tests/buildkite/pipeline-nightly.yml | 37 -- tests/buildkite/pipeline-win64.yml | 24 -- tests/buildkite/pipeline.yml | 70 ---- .../test-integration-jvm-packages.sh | 13 - tests/buildkite/test-macos-m1-clang11.sh | 25 -- tests/ci_build/build_jvm_packages.sh | 9 +- tests/ci_build/ci_build.sh | 248 ------------- tests/ci_build/deploy_jvm_packages.sh | 2 +- tests/ci_build/entrypoint.sh | 43 --- tests/ci_build/initialize_maven.sh | 19 - tests/ci_build/jenkins_tools.Groovy | 38 -- tests/ci_build/test_jvm_cross.sh | 62 ---- 89 files changed, 137 insertions(+), 1911 deletions(-) create mode 100644 .github/workflows/macos.yml rename {tests/ci_build => ops}/build_jvm_doc.sh (88%) rename {tests/ci_build => ops}/build_r_pkg_with_cuda.sh (73%) rename ops/{task/build-via-cmake.sh => build_via_cmake.sh} (100%) rename {dev => ops}/change_scala_version.py (100%) rename {tests/ci_build => ops}/change_version.py (100%) rename ops/{matrix => docker}/ci_container.yml (95%) rename ops/{matrix => docker}/docker_cache_ecr.yml (100%) rename ops/docker/{ => dockerfile}/Dockerfile.aarch64 (100%) rename ops/docker/{ => dockerfile}/Dockerfile.clang_tidy (100%) rename ops/docker/{ => dockerfile}/Dockerfile.cpu (100%) rename ops/docker/{ => dockerfile}/Dockerfile.gpu (100%) rename 
ops/docker/{ => dockerfile}/Dockerfile.gpu_build_r_rockylinux8 (100%) rename ops/docker/{ => dockerfile}/Dockerfile.gpu_build_rockylinux8 (100%) rename ops/docker/{ => dockerfile}/Dockerfile.i386 (100%) rename ops/docker/{ => dockerfile}/Dockerfile.jvm (100%) rename ops/docker/{ => dockerfile}/Dockerfile.jvm_cross (100%) rename ops/docker/{ => dockerfile}/Dockerfile.jvm_gpu_build (100%) rename ops/docker/{ => dockerfile}/Dockerfile.manylinux2014_aarch64 (100%) rename ops/docker/{ => dockerfile}/Dockerfile.manylinux2014_x86_64 (100%) rename ops/docker/{ => dockerfile}/Dockerfile.manylinux_2_28_x86_64 (100%) rename ops/{matrix => docker}/extract_build_args.jq (100%) rename ops/{matrix => docker}/extract_build_args.sh (68%) rename {tests/ci_build => ops}/lint_cmake.sh (100%) rename {tests/ci_build => ops}/lint_cpp.py (100%) rename {tests/ci_build => ops}/lint_python.py (100%) rename {tests/ci_build => ops}/lint_r.R (100%) rename ops/{task/patches => patch}/cpu_only_pypkg.patch (100%) rename ops/{task/patches => patch}/manylinux2014_warning.patch (100%) rename ops/{task/patches => patch}/remove_nccl_dep.patch (100%) rename ops/{task => pipeline}/build-cpu-arm64.sh (96%) rename ops/{task => pipeline}/build-cpu.sh (92%) rename ops/{task => pipeline}/build-cuda-with-rmm.sh (91%) rename ops/{task => pipeline}/build-cuda.sh (93%) rename {tests/buildkite => ops/pipeline}/build-gpu-rpkg.sh (53%) rename {tests/buildkite => ops/pipeline}/build-jvm-doc.sh (70%) rename {tests/buildkite => ops/pipeline}/build-jvm-macos-m1.sh (85%) rename ops/{task => pipeline}/build-jvm-manylinux2014.sh (96%) rename ops/{task => pipeline}/build-manylinux2014.sh (88%) rename ops/{task => pipeline}/build-win64-gpu.ps1 (93%) rename ops/{task => pipeline}/enforce-ci.ps1 (94%) rename ops/{task => pipeline}/enforce-ci.sh (94%) rename ops/{task => pipeline}/run-clang-tidy.sh (83%) rename ops/{task => pipeline}/test-cpp-gpu.sh (96%) rename ops/{task => pipeline}/test-python.sh (98%) rename ops/{task => pipeline}/test-win64-gpu.ps1 (96%) rename {tests/ci_build => ops}/test_r_package.py (100%) rename {tests/ci_build => ops}/test_tidy.cc (100%) rename {tests/ci_build => ops}/test_utils.py (100%) rename {tests/ci_build => ops}/tidy.py (100%) rename tests/buildkite/update-rapids.sh => ops/update_rapids.sh (100%) rename {tests/ci_build => ops}/verify_link.sh (100%) delete mode 100644 tests/buildkite/infrastructure/README.md delete mode 100644 tests/buildkite/infrastructure/aws-stack-creator/agent-iam-policy-template.yml delete mode 100644 tests/buildkite/infrastructure/aws-stack-creator/create_stack.py delete mode 100644 tests/buildkite/infrastructure/aws-stack-creator/metadata.py delete mode 100644 tests/buildkite/infrastructure/common_blocks/utils.py delete mode 100644 tests/buildkite/infrastructure/requirements.txt delete mode 100644 tests/buildkite/infrastructure/service-user/create_service_user.py delete mode 100644 tests/buildkite/infrastructure/service-user/service-user-template.yml delete mode 100644 tests/buildkite/infrastructure/worker-image-pipeline/create_worker_image_pipelines.py delete mode 100644 tests/buildkite/infrastructure/worker-image-pipeline/ec2-image-builder-pipeline-template.yml delete mode 100644 tests/buildkite/infrastructure/worker-image-pipeline/linux-amd64-gpu-bootstrap.yml delete mode 100644 tests/buildkite/infrastructure/worker-image-pipeline/metadata.py delete mode 100644 tests/buildkite/infrastructure/worker-image-pipeline/run_pipelines.py delete mode 100644 
tests/buildkite/infrastructure/worker-image-pipeline/windows-gpu-bootstrap.yml delete mode 100644 tests/buildkite/pipeline-mac-m1.yml delete mode 100644 tests/buildkite/pipeline-nightly.yml delete mode 100644 tests/buildkite/pipeline-win64.yml delete mode 100755 tests/buildkite/test-integration-jvm-packages.sh delete mode 100755 tests/buildkite/test-macos-m1-clang11.sh delete mode 100755 tests/ci_build/ci_build.sh delete mode 100755 tests/ci_build/entrypoint.sh delete mode 100755 tests/ci_build/initialize_maven.sh delete mode 100644 tests/ci_build/jenkins_tools.Groovy delete mode 100755 tests/ci_build/test_jvm_cross.sh diff --git a/.github/runs-on.yml b/.github/runs-on.yml index 720ba76bb836..e21895ee8c3b 100644 --- a/.github/runs-on.yml +++ b/.github/runs-on.yml @@ -14,7 +14,7 @@ images: runners: linux-amd64-cpu: - cpu: 32 + cpu: 16 family: ["c7i-flex", "c7i", "c7a", "c5", "c5a"] image: linux-amd64 linux-amd64-gpu: @@ -24,7 +24,7 @@ runners: family: ["g4dn.12xlarge"] image: linux-amd64 linux-arm64-cpu: - cpu: 32 + cpu: 16 family: ["c6g", "c7g"] image: ubuntu24-full-arm64 windows-gpu: diff --git a/.github/workflows/macos.yml b/.github/workflows/macos.yml new file mode 100644 index 000000000000..2bb3e1aba46c --- /dev/null +++ b/.github/workflows/macos.yml @@ -0,0 +1,24 @@ +name: Nextgen XGBoost CI, MacOS + +on: [push, pull_request] + +permissions: + contents: read # to fetch code (actions/checkout) + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +env: + BRANCH_NAME: >- + ${{ github.event.pull_request.number && 'PR-' }}${{ github.event.pull_request.number || github.ref_name }} + +jobs: + mac-m1-jvm: + name: "Build libxgboost4j.dylib for MacOS M1" + runs-on: macos-14 + steps: + - uses: actions/checkout@v4 + with: + submodules: "true" + - run: bash ops/pipeline/build-jvm-macos-m1.sh diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 00e9b5abb844..276fa45ba533 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -20,11 +20,13 @@ jobs: runs-on: - runs-on=${{ github.run_id }} - runner=${{ matrix.runner }} + - spot=false strategy: matrix: container_id: - xgb-ci.gpu_build_rockylinux8 - xgb-ci.gpu + - xgb-ci.gpu_dev_ver - xgb-ci.cpu - xgb-ci.clang_tidy - xgb-ci.manylinux_2_28_x86_64 @@ -62,7 +64,7 @@ jobs: run: bash ops/docker_build.sh env: CONTAINER_ID: xgb-ci.clang_tidy - - run: bash ops/task/run-clang-tidy.sh + - run: bash ops/pipeline/run-clang-tidy.sh build-cpu: name: Build CPU @@ -80,7 +82,7 @@ jobs: run: bash ops/docker_build.sh env: CONTAINER_ID: xgb-ci.cpu - - run: bash ops/task/build-cpu.sh + - run: bash ops/pipeline/build-cpu.sh - name: Stash CLI executable run: bash ops/stash_artifacts.sh ./xgboost env: @@ -103,7 +105,7 @@ jobs: run: bash ops/docker_build.sh env: CONTAINER_ID: xgb-ci.aarch64 - - run: bash ops/task/build-cpu-arm64.sh + - run: bash ops/pipeline/build-cpu-arm64.sh - name: Stash files run: bash ops/stash_artifacts.sh ./xgboost python-package/dist/*.whl env: @@ -116,6 +118,7 @@ jobs: runs-on: - runs-on=${{ github.run_id }} - runner=linux-amd64-cpu + - spot=false steps: # Restart Docker daemon so that it recognizes the ephemeral disks - run: sudo systemctl restart docker @@ -130,9 +133,11 @@ jobs: run: bash ops/docker_build.sh env: CONTAINER_ID: xgb-ci.manylinux_2_28_x86_64 - - run: bash ops/task/build-cuda.sh + - run: bash ops/pipeline/build-cuda.sh - name: Stash files - run: bash ops/stash_artifacts.sh build/testxgboost python-package/dist/*.whl + 
run: |
+          bash ops/stash_artifacts.sh \
+            build/testxgboost ./xgboost python-package/dist/*.whl
         env:
           COMMAND: upload
           KEY: build-cuda
@@ -157,7 +162,7 @@ jobs:
         run: bash ops/docker_build.sh
         env:
           CONTAINER_ID: xgb-ci.manylinux_2_28_x86_64
-      - run: bash ops/task/build-cuda-with-rmm.sh
+      - run: bash ops/pipeline/build-cuda-with-rmm.sh
       - name: Stash files
         run: bash ops/stash_artifacts.sh build/testxgboost
         env:
@@ -188,7 +193,7 @@ jobs:
         run: bash ops/docker_build.sh
         env:
           CONTAINER_ID: xgb-ci.manylinux2014_${{ matrix.arch }}
-      - run: bash ops/task/build-jvm-manylinux2014.sh ${{ matrix.arch }}
+      - run: bash ops/pipeline/build-jvm-manylinux2014.sh ${{ matrix.arch }}

   build-manylinux2014:
     name: Build manylinux2014_${{ matrix.arch }} wheel
@@ -213,7 +218,7 @@ jobs:
         run: bash ops/docker_build.sh
         env:
           CONTAINER_ID: xgb-ci.manylinux2014_${{ matrix.arch }}
-      - run: bash ops/task/build-manylinux2014.sh ${{ matrix.arch }}
+      - run: bash ops/pipeline/build-manylinux2014.sh ${{ matrix.arch }}

   test-cpp-gpu:
     name: Run Google Tests with GPU(s)
@@ -250,7 +255,7 @@ jobs:
         env:
           COMMAND: download
           KEY: ${{ matrix.artifact_from }}
-      - run: bash ops/task/test-cpp-gpu.sh ${{ matrix.suite }}
+      - run: bash ops/pipeline/test-cpp-gpu.sh ${{ matrix.suite }}

   test-python:
     name: Run Python tests
@@ -302,9 +307,11 @@ jobs:
         env:
           CONTAINER_ID: ${{ matrix.container }}
       - name: Unstash Python wheel
-        run: bash ops/stash_artifacts.sh python-package/dist/*.whl
+        run: |
+          bash ops/stash_artifacts.sh python-package/dist/*.whl ./xgboost
+          chmod +x ./xgboost
         env:
           COMMAND: download
-          KEY: build-cuda
+          KEY: ${{ matrix.artifact_from }}
       - name: Run Python tests, ${{ matrix.description }}
-        run: bash ops/task/test-python.sh ${{ matrix.suite }} ${{ matrix.container }}
+        run: bash ops/pipeline/test-python.sh ${{ matrix.suite }} ${{ matrix.container }}
diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml
index 76388302f49f..0fc50815d683 100644
--- a/.github/workflows/windows.yml
+++ b/.github/workflows/windows.yml
@@ -26,11 +26,12 @@ jobs:
     runs-on:
       - runs-on=${{ github.run_id }}
       - runner=windows-cpu
+      - spot=false
     steps:
       - uses: actions/checkout@v4
         with:
           submodules: "true"
-      - run: powershell ops/task/build-win64-gpu.ps1
+      - run: powershell ops/pipeline/build-win64-gpu.ps1
       - name: Stash files
         run: |
           powershell ops/stash_artifacts.ps1 `
@@ -56,4 +57,4 @@ jobs:
         env:
           COMMAND: download
           KEY: build-win64-gpu
-      - run: powershell ops/task/test-win64-gpu.ps1
+      - run: powershell ops/pipeline/test-win64-gpu.ps1
diff --git a/dev/prepare_jvm_release.py b/dev/prepare_jvm_release.py
index 0b4594e2d2c0..927cb4945950 100644
--- a/dev/prepare_jvm_release.py
+++ b/dev/prepare_jvm_release.py
@@ -203,7 +203,7 @@ def main():
     )
     print(
         "5. 
Remove the Scala 2.12 artifacts and build Scala 2.13 artifacts:\n" - " python dev/change_scala_version.py --scala-version 2.13 --purge-artifacts\n" + " python ops/change_scala_version.py --scala-version 2.13 --purge-artifacts\n" " GPG_TTY=$(tty) mvn deploy -Prelease -DskipTests -Dskip.native.build=true" ) print( diff --git a/tests/ci_build/build_jvm_doc.sh b/ops/build_jvm_doc.sh similarity index 88% rename from tests/ci_build/build_jvm_doc.sh rename to ops/build_jvm_doc.sh index a536b0efeeb3..6f785f488027 100755 --- a/tests/ci_build/build_jvm_doc.sh +++ b/ops/build_jvm_doc.sh @@ -1,15 +1,14 @@ #!/bin/bash -if [ $# -ne 1 ]; then +## Build docs for the JVM packages and package it in a tarball + +if [[ $# -ne 1 ]] +then echo "Usage: $0 [branch name]" exit 1 fi -set -e -set -x - -# Initialize local Maven repository -./tests/ci_build/initialize_maven.sh +set -euo pipefail rm -rf build/ cd jvm-packages diff --git a/tests/ci_build/build_r_pkg_with_cuda.sh b/ops/build_r_pkg_with_cuda.sh similarity index 73% rename from tests/ci_build/build_r_pkg_with_cuda.sh rename to ops/build_r_pkg_with_cuda.sh index 78a2afc1cdf7..d0a7c9295195 100755 --- a/tests/ci_build/build_r_pkg_with_cuda.sh +++ b/ops/build_r_pkg_with_cuda.sh @@ -1,8 +1,12 @@ #!/bin/bash -set -e -set -x -if [ "$#" -ne 1 ] +## Build XGBoost R package with GPU support and package it in a tarball. +## Users will be able to install it without having CTK installed +## (only a compatible NVIDIA driver is needed). + +set -euo pipefail + +if [[ "$#" -ne 1 ]] then echo "Build the R package tarball with CUDA code. Usage: $0 [commit hash]" exit 1 @@ -10,7 +14,7 @@ fi commit_hash="$1" -python tests/ci_build/test_r_package.py --task=pack +python3 ops/test_r_package.py --task=pack mv xgboost/ xgboost_rpack/ mkdir build diff --git a/ops/task/build-via-cmake.sh b/ops/build_via_cmake.sh similarity index 100% rename from ops/task/build-via-cmake.sh rename to ops/build_via_cmake.sh diff --git a/dev/change_scala_version.py b/ops/change_scala_version.py similarity index 100% rename from dev/change_scala_version.py rename to ops/change_scala_version.py diff --git a/tests/ci_build/change_version.py b/ops/change_version.py similarity index 100% rename from tests/ci_build/change_version.py rename to ops/change_version.py diff --git a/ops/matrix/ci_container.yml b/ops/docker/ci_container.yml similarity index 95% rename from ops/matrix/ci_container.yml rename to ops/docker/ci_container.yml index fb0ae62325cd..f21122231c0b 100644 --- a/ops/matrix/ci_container.yml +++ b/ops/docker/ci_container.yml @@ -1,7 +1,7 @@ ## List of CI containers with definitions and build arguments # Each container will be built using the definition from -# ops/docker/Dockerfile.CONTAINER_DEF +# ops/docker/dockerfile/Dockerfile.CONTAINER_DEF xgb-ci.gpu_build_rockylinux8: container_def: gpu_build_rockylinux8 diff --git a/ops/matrix/docker_cache_ecr.yml b/ops/docker/docker_cache_ecr.yml similarity index 100% rename from ops/matrix/docker_cache_ecr.yml rename to ops/docker/docker_cache_ecr.yml diff --git a/ops/docker/Dockerfile.aarch64 b/ops/docker/dockerfile/Dockerfile.aarch64 similarity index 100% rename from ops/docker/Dockerfile.aarch64 rename to ops/docker/dockerfile/Dockerfile.aarch64 diff --git a/ops/docker/Dockerfile.clang_tidy b/ops/docker/dockerfile/Dockerfile.clang_tidy similarity index 100% rename from ops/docker/Dockerfile.clang_tidy rename to ops/docker/dockerfile/Dockerfile.clang_tidy diff --git a/ops/docker/Dockerfile.cpu b/ops/docker/dockerfile/Dockerfile.cpu similarity index 100% 
rename from ops/docker/Dockerfile.cpu rename to ops/docker/dockerfile/Dockerfile.cpu diff --git a/ops/docker/Dockerfile.gpu b/ops/docker/dockerfile/Dockerfile.gpu similarity index 100% rename from ops/docker/Dockerfile.gpu rename to ops/docker/dockerfile/Dockerfile.gpu diff --git a/ops/docker/Dockerfile.gpu_build_r_rockylinux8 b/ops/docker/dockerfile/Dockerfile.gpu_build_r_rockylinux8 similarity index 100% rename from ops/docker/Dockerfile.gpu_build_r_rockylinux8 rename to ops/docker/dockerfile/Dockerfile.gpu_build_r_rockylinux8 diff --git a/ops/docker/Dockerfile.gpu_build_rockylinux8 b/ops/docker/dockerfile/Dockerfile.gpu_build_rockylinux8 similarity index 100% rename from ops/docker/Dockerfile.gpu_build_rockylinux8 rename to ops/docker/dockerfile/Dockerfile.gpu_build_rockylinux8 diff --git a/ops/docker/Dockerfile.i386 b/ops/docker/dockerfile/Dockerfile.i386 similarity index 100% rename from ops/docker/Dockerfile.i386 rename to ops/docker/dockerfile/Dockerfile.i386 diff --git a/ops/docker/Dockerfile.jvm b/ops/docker/dockerfile/Dockerfile.jvm similarity index 100% rename from ops/docker/Dockerfile.jvm rename to ops/docker/dockerfile/Dockerfile.jvm diff --git a/ops/docker/Dockerfile.jvm_cross b/ops/docker/dockerfile/Dockerfile.jvm_cross similarity index 100% rename from ops/docker/Dockerfile.jvm_cross rename to ops/docker/dockerfile/Dockerfile.jvm_cross diff --git a/ops/docker/Dockerfile.jvm_gpu_build b/ops/docker/dockerfile/Dockerfile.jvm_gpu_build similarity index 100% rename from ops/docker/Dockerfile.jvm_gpu_build rename to ops/docker/dockerfile/Dockerfile.jvm_gpu_build diff --git a/ops/docker/Dockerfile.manylinux2014_aarch64 b/ops/docker/dockerfile/Dockerfile.manylinux2014_aarch64 similarity index 100% rename from ops/docker/Dockerfile.manylinux2014_aarch64 rename to ops/docker/dockerfile/Dockerfile.manylinux2014_aarch64 diff --git a/ops/docker/Dockerfile.manylinux2014_x86_64 b/ops/docker/dockerfile/Dockerfile.manylinux2014_x86_64 similarity index 100% rename from ops/docker/Dockerfile.manylinux2014_x86_64 rename to ops/docker/dockerfile/Dockerfile.manylinux2014_x86_64 diff --git a/ops/docker/Dockerfile.manylinux_2_28_x86_64 b/ops/docker/dockerfile/Dockerfile.manylinux_2_28_x86_64 similarity index 100% rename from ops/docker/Dockerfile.manylinux_2_28_x86_64 rename to ops/docker/dockerfile/Dockerfile.manylinux_2_28_x86_64 diff --git a/ops/docker/entrypoint.sh b/ops/docker/entrypoint.sh index a0c5f56bb52d..babe4359e8e1 100755 --- a/ops/docker/entrypoint.sh +++ b/ops/docker/entrypoint.sh @@ -1,12 +1,8 @@ #!/usr/bin/env bash -# This script is a wrapper creating the same user inside container as the one -# running the ci_build.sh outside the container. It also set the home directory -# for the user inside container to match the same absolute path as the workspace -# outside of container. Do not run this manually. It does not make sense. It is -# intended to be called by ci_build.sh only. 
+# This wrapper script creates the same user inside the container as the one invoking it on the host, so that output files are owned by the host user.

-set -e
+set -euo pipefail

 COMMAND=("$@")

@@ -19,7 +15,8 @@ else
   rm /this_is_writable_file_system
 fi

-if [[ -n $CI_BUILD_UID ]] && [[ -n $CI_BUILD_GID ]]; then
+if [[ -n ${CI_BUILD_UID:-} ]] && [[ -n ${CI_BUILD_GID:-} ]]
+then
   groupadd -o -g "${CI_BUILD_GID}" "${CI_BUILD_GROUP}" || true
   useradd -o -m -g "${CI_BUILD_GID}" -u "${CI_BUILD_UID}" \
     "${CI_BUILD_USER}" || true
diff --git a/ops/matrix/extract_build_args.jq b/ops/docker/extract_build_args.jq
similarity index 100%
rename from ops/matrix/extract_build_args.jq
rename to ops/docker/extract_build_args.jq
diff --git a/ops/matrix/extract_build_args.sh b/ops/docker/extract_build_args.sh
similarity index 68%
rename from ops/matrix/extract_build_args.sh
rename to ops/docker/extract_build_args.sh
index ec4621bc42b2..0fa7b132b760 100755
--- a/ops/matrix/extract_build_args.sh
+++ b/ops/docker/extract_build_args.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-## Extract container definition and build args from ops/matrix/ci_container.yml,
+## Extract container definition and build args from ops/docker/ci_container.yml,
 ## given the container ID.

 if [ "$#" -ne 1 ]; then
@@ -9,13 +9,13 @@ fi
 CONTAINER_ID="$1"
 CONTAINER_DEF=$(
-  yq -o json ops/matrix/ci_container.yml |
+  yq -o json ops/docker/ci_container.yml |
   jq -r --arg container_id "${CONTAINER_ID}" '.[$container_id].container_def'
 )
 BUILD_ARGS=$(
-  yq -o json ops/matrix/ci_container.yml |
+  yq -o json ops/docker/ci_container.yml |
   jq -r --arg container_id "${CONTAINER_ID}" \
-    'include "ops/matrix/extract_build_args";
+    'include "ops/docker/extract_build_args";
      compute_build_args(.; $container_id)'
 )
 echo "CONTAINER_DEF='${CONTAINER_DEF}' BUILD_ARGS='${BUILD_ARGS}'"
diff --git a/ops/docker_build.py b/ops/docker_build.py
index dd2871c3a6ed..922d528814a4 100644
--- a/ops/docker_build.py
+++ b/ops/docker_build.py
@@ -70,7 +70,9 @@ def docker_build(

 def main(args: argparse.Namespace) -> None:
     # Dockerfile to be used in docker build
-    dockerfile_path = SCRIPT_DIR / "docker" / f"Dockerfile.{args.container_def}"
+    dockerfile_path = (
+        SCRIPT_DIR / "docker" / "dockerfile" / f"Dockerfile.{args.container_def}"
+    )
     docker_context_path = SCRIPT_DIR / "docker"

     build_args = parse_build_args(args.build_arg)
@@ -93,7 +95,8 @@ def main(args: argparse.Namespace) -> None:
         required=True,
         help=(
             "String uniquely identifying the container definition. The container "
-            "definition will be fetched from docker/Dockerfile.CONTAINER_DEF."
+            "definition will be fetched from "
+            "docker/dockerfile/Dockerfile.CONTAINER_DEF."
         ),
     )
     parser.add_argument(
diff --git a/ops/docker_build.sh b/ops/docker_build.sh
index c8c0680aea05..0539f817ba8e 100755
--- a/ops/docker_build.sh
+++ b/ops/docker_build.sh
@@ -2,7 +2,7 @@
 ## Build a CI container and cache the layers in AWS ECR (Elastic Container Registry).
 ## This script provides a convenient wrapper for ops/docker_build.py.
 ## Build-time variables (--build-arg) and container definition are fetched from
-## ops/matrix/ci_container.yml.
+## ops/docker/ci_container.yml.
 ##
 ## Note. This script takes in all inputs via environment variables. 
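[Note] Because docker_build.sh takes all of its inputs from the environment, reproducing a CI container build outside the workflow is a two-variable affair. A minimal sketch, assuming Docker is installed and the ID is defined in ops/docker/ci_container.yml:

    ## Build one CI container without the ECR layer cache.
    export CONTAINER_ID=xgb-ci.cpu   # resolved via ops/docker/ci_container.yml
    export USE_DOCKER_CACHE=0        # any value other than 1 disables caching
    bash ops/docker_build.sh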
@@ -48,7 +48,7 @@ do done # Fetch CONTAINER_DEF and BUILD_ARGS -source <(ops/matrix/extract_build_args.sh ${CONTAINER_ID} | tee /dev/stderr) 2>&1 +source <(ops/docker/extract_build_args.sh ${CONTAINER_ID} | tee /dev/stderr) 2>&1 if [[ "${USE_DOCKER_CACHE:-}" != "1" ]] # Any value other than 1 is considered false then @@ -59,8 +59,8 @@ if [[ ${USE_DOCKER_CACHE} -eq 0 ]] then echo "USE_DOCKER_CACHE not set; caching disabled" else - DOCKER_CACHE_ECR_ID=$(yq ".DOCKER_CACHE_ECR_ID" ops/matrix/docker_cache_ecr.yml) - DOCKER_CACHE_ECR_REGION=$(yq ".DOCKER_CACHE_ECR_REGION" ops/matrix/docker_cache_ecr.yml) + DOCKER_CACHE_ECR_ID=$(yq ".DOCKER_CACHE_ECR_ID" ops/docker/docker_cache_ecr.yml) + DOCKER_CACHE_ECR_REGION=$(yq ".DOCKER_CACHE_ECR_REGION" ops/docker/docker_cache_ecr.yml) DOCKER_CACHE_REPO="${DOCKER_CACHE_ECR_ID}.dkr.ecr.${DOCKER_CACHE_ECR_REGION}.amazonaws.com" echo "Using AWS ECR; repo URL = ${DOCKER_CACHE_REPO}" # Login for Docker registry diff --git a/tests/ci_build/lint_cmake.sh b/ops/lint_cmake.sh similarity index 100% rename from tests/ci_build/lint_cmake.sh rename to ops/lint_cmake.sh diff --git a/tests/ci_build/lint_cpp.py b/ops/lint_cpp.py similarity index 100% rename from tests/ci_build/lint_cpp.py rename to ops/lint_cpp.py diff --git a/tests/ci_build/lint_python.py b/ops/lint_python.py similarity index 100% rename from tests/ci_build/lint_python.py rename to ops/lint_python.py diff --git a/tests/ci_build/lint_r.R b/ops/lint_r.R similarity index 100% rename from tests/ci_build/lint_r.R rename to ops/lint_r.R diff --git a/ops/task/patches/cpu_only_pypkg.patch b/ops/patch/cpu_only_pypkg.patch similarity index 100% rename from ops/task/patches/cpu_only_pypkg.patch rename to ops/patch/cpu_only_pypkg.patch diff --git a/ops/task/patches/manylinux2014_warning.patch b/ops/patch/manylinux2014_warning.patch similarity index 100% rename from ops/task/patches/manylinux2014_warning.patch rename to ops/patch/manylinux2014_warning.patch diff --git a/ops/task/patches/remove_nccl_dep.patch b/ops/patch/remove_nccl_dep.patch similarity index 100% rename from ops/task/patches/remove_nccl_dep.patch rename to ops/patch/remove_nccl_dep.patch diff --git a/ops/task/build-cpu-arm64.sh b/ops/pipeline/build-cpu-arm64.sh similarity index 96% rename from ops/task/build-cpu-arm64.sh rename to ops/pipeline/build-cpu-arm64.sh index 4a8c96e0e941..8a5db56d9eeb 100755 --- a/ops/task/build-cpu-arm64.sh +++ b/ops/pipeline/build-cpu-arm64.sh @@ -6,12 +6,12 @@ WHEEL_TAG=manylinux_2_28_aarch64 echo "--- Build CPU code targeting ARM64" -source ops/task/enforce-ci.sh +source ops/pipeline/enforce-ci.sh echo "--- Build libxgboost from the source" python3 ops/docker_run.py \ --container-id xgb-ci.aarch64 \ - -- ops/task/build-via-cmake.sh \ + -- ops/build_via_cmake.sh \ --conda-env=aarch64_test \ -DUSE_OPENMP=ON \ -DHIDE_CXX_SYMBOL=ON diff --git a/ops/task/build-cpu.sh b/ops/pipeline/build-cpu.sh similarity index 92% rename from ops/task/build-cpu.sh rename to ops/pipeline/build-cpu.sh index 7f8c69cd43bf..60346203d85f 100755 --- a/ops/task/build-cpu.sh +++ b/ops/pipeline/build-cpu.sh @@ -2,7 +2,7 @@ set -euox pipefail -source ops/task/enforce-ci.sh +source ops/pipeline/enforce-ci.sh echo "--- Build CPU code" @@ -18,7 +18,7 @@ echo "--- Run Google Test with sanitizer enabled" sudo sysctl vm.mmap_rnd_bits=28 python3 ops/docker_run.py \ --container-id xgb-ci.cpu \ - -- ops/task/build-via-cmake.sh \ + -- ops/build_via_cmake.sh \ -DUSE_SANITIZER=ON \ -DENABLED_SANITIZERS="address;leak;undefined" \ -DCMAKE_BUILD_TYPE=Debug \ @@ 
-35,7 +35,7 @@ python3 ops/docker_run.py \
 echo "--- Run Google Test"
 python3 ops/docker_run.py \
   --container-id xgb-ci.cpu \
-  -- ops/task/build-via-cmake.sh \
+  -- ops/build_via_cmake.sh \
   -DCMAKE_PREFIX_PATH=/opt/grpc \
   -DPLUGIN_FEDERATED=ON
 python3 ops/docker_run.py \
diff --git a/ops/task/build-cuda-with-rmm.sh b/ops/pipeline/build-cuda-with-rmm.sh
similarity index 91%
rename from ops/task/build-cuda-with-rmm.sh
rename to ops/pipeline/build-cuda-with-rmm.sh
index 901e66a8f649..ab5420002f46 100755
--- a/ops/task/build-cuda-with-rmm.sh
+++ b/ops/pipeline/build-cuda-with-rmm.sh
@@ -4,21 +4,21 @@ set -euo pipefail

 WHEEL_TAG=manylinux_2_28_x86_64

-source ops/task/enforce-ci.sh
+source ops/pipeline/enforce-ci.sh

 echo "--- Build with CUDA with RMM"

-if [[ ($is_pull_request == 1) || ($is_release_branch == 0) ]]
-then
+#if [[ ($is_pull_request == 1) || ($is_release_branch == 0) ]]
+#then
   arch_flag="-DGPU_COMPUTE_VER=75"
-else
-  arch_flag=""
-fi
+#else
+#  arch_flag=""
+#fi

 echo "--- Build libxgboost from the source"
 python3 ops/docker_run.py \
   --container-id xgb-ci.gpu_build_rockylinux8 \
-  -- ops/task/build-via-cmake.sh \
+  -- ops/build_via_cmake.sh \
   -DCMAKE_PREFIX_PATH="/opt/grpc;/opt/rmm;/opt/rmm/lib64/rapids/cmake" \
   -DUSE_CUDA=ON \
   -DUSE_OPENMP=ON \
diff --git a/ops/task/build-cuda.sh b/ops/pipeline/build-cuda.sh
similarity index 93%
rename from ops/task/build-cuda.sh
rename to ops/pipeline/build-cuda.sh
index c98c041d8187..690c7f25f69e 100755
--- a/ops/task/build-cuda.sh
+++ b/ops/pipeline/build-cuda.sh
@@ -4,22 +4,22 @@ set -euo pipefail

 WHEEL_TAG=manylinux_2_28_x86_64

-source ops/task/enforce-ci.sh
+source ops/pipeline/enforce-ci.sh

 echo "--- Build with CUDA"

-if [[ ($is_pull_request == 1) || ($is_release_branch == 0) ]]
-then
+# if [[ ($is_pull_request == 1) || ($is_release_branch == 0) ]]
+#then
   arch_flag="-DGPU_COMPUTE_VER=75"
-else
-  arch_flag=""
-fi
+#else
+#  arch_flag=""
+#fi

 echo "--- Build libxgboost from the source"
 git clone https://github.com/NVIDIA/cccl.git -b v2.6.1 --quiet
 python3 ops/docker_run.py \
   --container-id xgb-ci.gpu_build_rockylinux8 \
-  -- ops/task/build-via-cmake.sh \
+  -- ops/build_via_cmake.sh \
   -DCMAKE_PREFIX_PATH="/opt/grpc;/workspace/cccl" \
   -DUSE_CUDA=ON \
   -DUSE_OPENMP=ON \
diff --git a/tests/buildkite/build-gpu-rpkg.sh b/ops/pipeline/build-gpu-rpkg.sh
similarity index 53%
rename from tests/buildkite/build-gpu-rpkg.sh
rename to ops/pipeline/build-gpu-rpkg.sh
index 83bcd9eb9c7b..4df0c029568c 100755
--- a/tests/buildkite/build-gpu-rpkg.sh
+++ b/ops/pipeline/build-gpu-rpkg.sh
@@ -2,15 +2,13 @@

 set -euo pipefail

-source tests/buildkite/conftest.sh
+source ops/pipeline/enforce-ci.sh

 echo "--- Build XGBoost R package with CUDA"
-
-tests/ci_build/ci_build.sh gpu_build_r_rockylinux8 \
-  --build-arg CUDA_VERSION_ARG=${CUDA_VERSION} \
-  --build-arg R_VERSION_ARG=${R_VERSION} \
-  tests/ci_build/build_r_pkg_with_cuda.sh \
-  ${BUILDKITE_COMMIT}
+python3 ops/docker_run.py \
+  --container-id xgb-ci.gpu_build_r_rockylinux8 \
+  -- ops/build_r_pkg_with_cuda.sh \
+    ${GITHUB_SHA}

 if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]]
 then
diff --git a/tests/buildkite/build-jvm-doc.sh b/ops/pipeline/build-jvm-doc.sh
similarity index 70%
rename from tests/buildkite/build-jvm-doc.sh
rename to ops/pipeline/build-jvm-doc.sh
index d168eb8cc58d..7f5eb0ac7b8a 100755
--- a/tests/buildkite/build-jvm-doc.sh
+++ b/ops/pipeline/build-jvm-doc.sh
@@ -2,10 +2,12 @@

 set -euo pipefail

-source tests/buildkite/conftest.sh
+source ops/pipeline/enforce-ci.sh

 echo "--- 
Build JVM packages doc" -tests/ci_build/ci_build.sh jvm tests/ci_build/build_jvm_doc.sh ${BRANCH_NAME} +python3 ops/docker_run.py \ + --container-id jvm \ + -- ops/build_jvm_doc.sh ${BRANCH_NAME} if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] then echo "--- Upload JVM packages doc" diff --git a/tests/buildkite/build-jvm-macos-m1.sh b/ops/pipeline/build-jvm-macos-m1.sh similarity index 85% rename from tests/buildkite/build-jvm-macos-m1.sh rename to ops/pipeline/build-jvm-macos-m1.sh index 1d2e5e8703bc..d50c1a1a1b1d 100644 --- a/tests/buildkite/build-jvm-macos-m1.sh +++ b/ops/pipeline/build-jvm-macos-m1.sh @@ -2,7 +2,7 @@ set -euo pipefail -source tests/buildkite/conftest.sh +source ops/pipeline/enforce-ci.sh # Display system info echo "--- Display system information" @@ -12,6 +12,8 @@ sysctl -n machdep.cpu.brand_string uname -m set +x +brew install ninja libomp + # Build XGBoost4J binary echo "--- Build libxgboost4j.dylib" set -x @@ -28,9 +30,9 @@ set +x echo "--- Upload libxgboost4j.dylib" set -x pushd lib -libname=libxgboost4j_m1_${BUILDKITE_COMMIT}.dylib +libname=libxgboost4j_m1_${GITHUB_SHA}.dylib mv -v libxgboost4j.dylib ${libname} -buildkite-agent artifact upload ${libname} + if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] then aws s3 cp ${libname} \ diff --git a/ops/task/build-jvm-manylinux2014.sh b/ops/pipeline/build-jvm-manylinux2014.sh similarity index 96% rename from ops/task/build-jvm-manylinux2014.sh rename to ops/pipeline/build-jvm-manylinux2014.sh index 88bdb256821f..c009de93e62c 100644 --- a/ops/task/build-jvm-manylinux2014.sh +++ b/ops/pipeline/build-jvm-manylinux2014.sh @@ -2,7 +2,7 @@ set -euo pipefail -source ops/task/enforce-ci.sh +source ops/pipeline/enforce-ci.sh if [ $# -ne 1 ]; then echo "Usage: $0 {x86_64,aarch64}" diff --git a/ops/task/build-manylinux2014.sh b/ops/pipeline/build-manylinux2014.sh similarity index 88% rename from ops/task/build-manylinux2014.sh rename to ops/pipeline/build-manylinux2014.sh index 7b71b51a0587..5b1935097d9d 100755 --- a/ops/task/build-manylinux2014.sh +++ b/ops/pipeline/build-manylinux2014.sh @@ -2,7 +2,7 @@ set -euo pipefail -source ops/task/enforce-ci.sh +source ops/pipeline/enforce-ci.sh if [ $# -ne 1 ]; then echo "Usage: $0 {x86_64,aarch64}" @@ -18,8 +18,8 @@ python_bin="/opt/python/cp310-cp310/bin/python" echo "--- Build binary wheel for ${WHEEL_TAG}" # Patch to add warning about manylinux2014 variant -patch -p0 < ops/task/patches/remove_nccl_dep.patch -patch -p0 < ops/task/patches/manylinux2014_warning.patch +patch -p0 < ops/patch/remove_nccl_dep.patch +patch -p0 < ops/patch/manylinux2014_warning.patch python3 ops/docker_run.py \ --container-id ${image} \ -- bash -c \ @@ -40,8 +40,8 @@ mv -v wheelhouse/*.whl python-package/dist/ echo "--- Build binary wheel for ${WHEEL_TAG} (CPU only)" # Patch to rename pkg to xgboost-cpu -patch -p0 < ops/task/patches/remove_nccl_dep.patch -patch -p0 < ops/task/patches/cpu_only_pypkg.patch +patch -p0 < ops/patch/remove_nccl_dep.patch +patch -p0 < ops/patch/cpu_only_pypkg.patch python3 ops/docker_run.py \ --container-id ${image} \ -- bash -c \ diff --git a/ops/task/build-win64-gpu.ps1 b/ops/pipeline/build-win64-gpu.ps1 similarity index 93% rename from ops/task/build-win64-gpu.ps1 rename to ops/pipeline/build-win64-gpu.ps1 index 0b49d143dd5b..48863528684a 100644 --- a/ops/task/build-win64-gpu.ps1 +++ b/ops/pipeline/build-win64-gpu.ps1 @@ -1,15 +1,15 @@ $ErrorActionPreference = "Stop" -. ops/task/enforce-ci.ps1 +. 
ops/pipeline/enforce-ci.ps1 Write-Host "--- Build libxgboost on Windows with CUDA" nvcc --version -if ( $is_release_branch -eq 0 ) { +#if ( $is_release_branch -eq 0 ) { $arch_flag = "-DGPU_COMPUTE_VER=75" -} else { - $arch_flag = "" -} +#} else { +# $arch_flag = "" +#} git clone https://github.com/NVIDIA/cccl.git -b v2.6.1 --quiet mkdir build diff --git a/ops/task/enforce-ci.ps1 b/ops/pipeline/enforce-ci.ps1 similarity index 94% rename from ops/task/enforce-ci.ps1 rename to ops/pipeline/enforce-ci.ps1 index 9183764b9a13..0528472be6cb 100644 --- a/ops/task/enforce-ci.ps1 +++ b/ops/pipeline/enforce-ci.ps1 @@ -1,5 +1,5 @@ ## Ensure that a script is running inside the CI. -## Usage: . ops/task/enforce-ci.ps1 +## Usage: . ops/pipeline/enforce-ci.ps1 if ( -Not $Env:GITHUB_ACTION ) { $script_name = (Split-Path -Path $PSCommandPath -Leaf) diff --git a/ops/task/enforce-ci.sh b/ops/pipeline/enforce-ci.sh similarity index 94% rename from ops/task/enforce-ci.sh rename to ops/pipeline/enforce-ci.sh index dfed11914c9a..48a48f2dc730 100755 --- a/ops/task/enforce-ci.sh +++ b/ops/pipeline/enforce-ci.sh @@ -1,7 +1,7 @@ #!/bin/bash ## Ensure that a script is running inside the CI. -## Usage: source ops/task/enforce-ci.sh +## Usage: source ops/pipeline/enforce-ci.sh set -euo pipefail diff --git a/ops/task/run-clang-tidy.sh b/ops/pipeline/run-clang-tidy.sh similarity index 83% rename from ops/task/run-clang-tidy.sh rename to ops/pipeline/run-clang-tidy.sh index da12a8808a2a..9af3273b0dbe 100755 --- a/ops/task/run-clang-tidy.sh +++ b/ops/pipeline/run-clang-tidy.sh @@ -4,7 +4,7 @@ set -euox pipefail echo "--- Run clang-tidy" -source ops/task/enforce-ci.sh +source ops/pipeline/enforce-ci.sh python3 ops/docker_run.py \ --container-id xgb-ci.clang_tidy \ diff --git a/ops/task/test-cpp-gpu.sh b/ops/pipeline/test-cpp-gpu.sh similarity index 96% rename from ops/task/test-cpp-gpu.sh rename to ops/pipeline/test-cpp-gpu.sh index 96d11bc9940b..51d097fbdbdf 100755 --- a/ops/task/test-cpp-gpu.sh +++ b/ops/pipeline/test-cpp-gpu.sh @@ -2,7 +2,7 @@ set -euo pipefail -source ops/task/enforce-ci.sh +source ops/pipeline/enforce-ci.sh if [[ "$#" -lt 1 ]] then diff --git a/ops/task/test-python.sh b/ops/pipeline/test-python.sh similarity index 98% rename from ops/task/test-python.sh rename to ops/pipeline/test-python.sh index 99f8b0b42277..f0c9c81cb554 100755 --- a/ops/task/test-python.sh +++ b/ops/pipeline/test-python.sh @@ -2,7 +2,7 @@ set -euo pipefail -source ops/task/enforce-ci.sh +source ops/pipeline/enforce-ci.sh if [[ "$#" -lt 2 ]] then @@ -15,9 +15,10 @@ container_id="$2" tee test-python-wrapper.sh <<-'EOF' #!/bin/bash +source activate "$1" + set -euox pipefail -source activate "$1" export PYSPARK_DRIVER_PYTHON=$(which python) export PYSPARK_PYTHON=$(which python) export SPARK_TESTING=1 diff --git a/ops/task/test-win64-gpu.ps1 b/ops/pipeline/test-win64-gpu.ps1 similarity index 96% rename from ops/task/test-win64-gpu.ps1 rename to ops/pipeline/test-win64-gpu.ps1 index 21d8f6e7b533..e4a55c77b2bd 100644 --- a/ops/task/test-win64-gpu.ps1 +++ b/ops/pipeline/test-win64-gpu.ps1 @@ -1,6 +1,6 @@ $ErrorActionPreference = "Stop" -. ops/task/enforce-ci.ps1 +. ops/pipeline/enforce-ci.ps1 Write-Host "--- Test XGBoost on Windows with CUDA" diff --git a/ops/stash_artifacts.ps1 b/ops/stash_artifacts.ps1 index 2f8cbaf0a855..57a58d884226 100644 --- a/ops/stash_artifacts.ps1 +++ b/ops/stash_artifacts.ps1 @@ -18,7 +18,7 @@ Inputs $ErrorActionPreference = "Stop" -. ops/task/enforce-ci.ps1 +. 
ops/pipeline/enforce-ci.ps1
 
 foreach ($env in "COMMAND", "KEY", "GITHUB_REPOSITORY", "GITHUB_RUN_ID", "RUNS_ON_S3_BUCKET_CACHE") {
diff --git a/ops/stash_artifacts.sh b/ops/stash_artifacts.sh
index c796831a963d..c2a16f42a26c 100755
--- a/ops/stash_artifacts.sh
+++ b/ops/stash_artifacts.sh
@@ -13,7 +13,7 @@ EOF
 
 set -euo pipefail
 
-source ops/task/enforce-ci.sh
+source ops/pipeline/enforce-ci.sh
 
 if [ "$#" -lt 1 ]; then
   echo "Usage: $0 [artifact] [artifact ...]"
diff --git a/tests/ci_build/test_r_package.py b/ops/test_r_package.py
similarity index 100%
rename from tests/ci_build/test_r_package.py
rename to ops/test_r_package.py
diff --git a/tests/ci_build/test_tidy.cc b/ops/test_tidy.cc
similarity index 100%
rename from tests/ci_build/test_tidy.cc
rename to ops/test_tidy.cc
diff --git a/tests/ci_build/test_utils.py b/ops/test_utils.py
similarity index 100%
rename from tests/ci_build/test_utils.py
rename to ops/test_utils.py
diff --git a/tests/ci_build/tidy.py b/ops/tidy.py
similarity index 100%
rename from tests/ci_build/tidy.py
rename to ops/tidy.py
diff --git a/tests/buildkite/update-rapids.sh b/ops/update_rapids.sh
similarity index 100%
rename from tests/buildkite/update-rapids.sh
rename to ops/update_rapids.sh
diff --git a/tests/ci_build/verify_link.sh b/ops/verify_link.sh
similarity index 100%
rename from tests/ci_build/verify_link.sh
rename to ops/verify_link.sh
diff --git a/tests/buildkite/infrastructure/README.md b/tests/buildkite/infrastructure/README.md
deleted file mode 100644
index cc3e552e70ff..000000000000
--- a/tests/buildkite/infrastructure/README.md
+++ /dev/null
@@ -1,106 +0,0 @@
-BuildKite CI Infrastructure
-===========================
-
-# Worker image builder (`worker-image-pipeline/`)
-
-Use EC2 Image Builder to build machine images in a deterministic fashion.
-The machine images are used to initialize workers in the CI/CD pipelines.
-
-## Editing bootstrap scripts
-
-Currently, we create two pipelines for machine images: one for Linux workers and another
-for Windows workers.
-You can edit the bootstrap scripts to change how the worker machines are initialized.
-
-* `linux-amd64-gpu-bootstrap.yml`: Bootstrap script for Linux worker machines
-* `windows-gpu-bootstrap.yml`: Bootstrap script for Windows worker machines
-
-## Creating and running Image Builder pipelines
-
-Run the following commands to create and run pipelines in the EC2 Image Builder service:
-```bash
-python worker-image-pipeline/create_worker_image_pipelines.py --aws-region us-west-2
-python worker-image-pipeline/run_pipelines.py --aws-region us-west-2
-```
-Go to the AWS CloudFormation console and verify the existence of two CloudFormation stacks:
-* `buildkite-windows-gpu-worker`
-* `buildkite-linux-amd64-gpu-worker`
-
-Then go to the EC2 Image Builder console to check the status of the image builds. You may
-want to inspect the log output should a build fail.
-Once the new machine images are done building, see the next section to deploy the new
-images to the worker machines.
-
-# Elastic CI Stack for AWS (`aws-stack-creator/`)
-
-Use EC2 Autoscaling groups to launch worker machines in EC2. BuildKite periodically sends
-messages to the Autoscaling groups to increase or decrease the number of workers according
-to the number of outstanding testing jobs.
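As a rough sketch of what that scaling amounts to (assuming AWS credentials configured for `us-west-2`; the group name is one of the autoscaling stacks listed further below), the same adjustment could be made by hand with the standard AWS CLI:

```bash
# Hypothetical manual scale-out of one worker pool. BuildKite's scaler
# normally adjusts DesiredCapacity automatically from the depth of the
# job queue; this command is illustrative only.
aws autoscaling set-desired-capacity \
  --auto-scaling-group-name buildkite-linux-amd64-cpu-autoscaling-group \
  --desired-capacity 4 \
  --region us-west-2
```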
- -## Deploy an updated CI stack with new machine images - -First, edit `aws-stack-creator/metadata.py` to update the `AMI_ID` fields: -```python -AMI_ID = { - # Managed by XGBoost team - "linux-amd64-gpu": { - "us-west-2": "...", - }, - "linux-amd64-mgpu": { - "us-west-2": "...", - }, - "windows-gpu": { - "us-west-2": "...", - }, - "windows-cpu": { - "us-west-2": "...", - }, - # Managed by BuildKite - # from https://s3.amazonaws.com/buildkite-aws-stack/latest/aws-stack.yml - "linux-amd64-cpu": { - "us-west-2": "...", - }, - "pipeline-loader": { - "us-west-2": "...", - }, - "linux-arm64-cpu": { - "us-west-2": "...", - }, -} -``` -AMI IDs uniquely identify the machine images in the EC2 service. -Go to the EC2 Image Builder console to find the AMI IDs for the new machine images -(see the previous section), and update the following fields: - -* `AMI_ID["linux-amd64-gpu"]["us-west-2"]`: - Use the latest output from the `buildkite-linux-amd64-gpu-worker` pipeline -* `AMI_ID["linux-amd64-mgpu"]["us-west-2"]`: - Should be identical to `AMI_ID["linux-amd64-gpu"]["us-west-2"]` -* `AMI_ID["windows-gpu"]["us-west-2"]`: - Use the latest output from the `buildkite-windows-gpu-worker` pipeline -* `AMI_ID["windows-cpu"]["us-west-2"]`: - Should be identical to `AMI_ID["windows-gpu"]["us-west-2"]` - -Next, visit https://s3.amazonaws.com/buildkite-aws-stack/latest/aws-stack.yml -to look up the AMI IDs for the following fields: - -* `AMI_ID["linux-amd64-cpu"]["us-west-2"]`: Copy and paste the AMI ID from the field - `Mappings/AWSRegion2AMI/us-west-2/linuxamd64` -* `AMI_ID["pipeline-loader"]["us-west-2"]`: - Should be identical to `AMI_ID["linux-amd64-cpu"]["us-west-2"]` -* `AMI_ID["linux-arm64-cpu"]["us-west-2"]`: Copy and paste the AMI ID from the field - `Mappings/AWSRegion2AMI/us-west-2/linuxarm64` - -Finally, run the following commands to deploy the new machine images: -``` -python aws-stack-creator/create_stack.py --aws-region us-west-2 --agent-token AGENT_TOKEN -``` -Go to the AWS CloudFormation console and verify the existence of the following -CloudFormation stacks: -* `buildkite-pipeline-loader-autoscaling-group` -* `buildkite-linux-amd64-cpu-autoscaling-group` -* `buildkite-linux-amd64-gpu-autoscaling-group` -* `buildkite-linux-amd64-mgpu-autoscaling-group` -* `buildkite-linux-arm64-cpu-autoscaling-group` -* `buildkite-windows-cpu-autoscaling-group` -* `buildkite-windows-gpu-autoscaling-group` diff --git a/tests/buildkite/infrastructure/aws-stack-creator/agent-iam-policy-template.yml b/tests/buildkite/infrastructure/aws-stack-creator/agent-iam-policy-template.yml deleted file mode 100644 index 7f15b1fbcd4f..000000000000 --- a/tests/buildkite/infrastructure/aws-stack-creator/agent-iam-policy-template.yml +++ /dev/null @@ -1,32 +0,0 @@ ---- -AWSTemplateFormatVersion: "2010-09-09" -Description: "Buildkite agent's IAM policy" - -Resources: - BuildkiteAgentManagedPolicy: - Type: AWS::IAM::ManagedPolicy - Properties: - PolicyDocument: - { - "Version": "2012-10-17", - "Statement": [ - { - "Effect": "Allow", - "Action": [ - "s3:*", - "s3-object-lambda:*" - ], - "Resource": "*" - }, - { - "Effect": "Allow", - "Action": "lambda:InvokeFunction", - "Resource": "*" - }, - { - "Effect": "Allow", - "Action": "secretsmanager:GetSecretValue", - "Resource": "*" - } - ] - } diff --git a/tests/buildkite/infrastructure/aws-stack-creator/create_stack.py b/tests/buildkite/infrastructure/aws-stack-creator/create_stack.py deleted file mode 100644 index 8f8db348a073..000000000000 --- 
a/tests/buildkite/infrastructure/aws-stack-creator/create_stack.py +++ /dev/null @@ -1,127 +0,0 @@ -import argparse -import copy -import os -import re -import sys - -import boto3 -import botocore -from metadata import AMI_ID, COMMON_STACK_PARAMS, STACK_PARAMS - -current_dir = os.path.dirname(__file__) -sys.path.append(os.path.join(current_dir, "..")) - -from common_blocks.utils import create_or_update_stack, wait - -TEMPLATE_URL = "https://s3.amazonaws.com/buildkite-aws-stack/latest/aws-stack.yml" - - -def get_availability_zones(*, aws_region): - client = boto3.client("ec2", region_name=aws_region) - r = client.describe_availability_zones( - Filters=[ - {"Name": "region-name", "Values": [aws_region]}, - {"Name": "zone-type", "Values": ["availability-zone"]}, - ] - ) - return sorted([x["ZoneName"] for x in r["AvailabilityZones"]]) - - -def get_default_vpc(*, aws_region): - ec2 = boto3.resource("ec2", region_name=aws_region) - default_vpc_id = None - for x in ec2.vpcs.filter(Filters=[{"Name": "is-default", "Values": ["true"]}]): - return x - - # Create default VPC if not exist - client = boto3.client("ec2", region_name=aws_region) - r = client.create_default_vpc() - default_vpc_id = r["Vpc"]["VpcId"] - - return ec2.Vpc(default_vpc_id) - - -def format_params(args, *, stack_id, agent_iam_policy): - default_vpc = get_default_vpc(aws_region=args.aws_region) - azs = get_availability_zones(aws_region=args.aws_region) - # For each of the first two availability zones (AZs), choose the default subnet - subnets = [ - x.id - for x in default_vpc.subnets.filter( - Filters=[ - {"Name": "default-for-az", "Values": ["true"]}, - {"Name": "availability-zone", "Values": azs[:2]}, - ] - ) - ] - assert len(subnets) == 2 - - params = copy.deepcopy(STACK_PARAMS[stack_id]) - params["ImageId"] = AMI_ID[stack_id][args.aws_region] - params["BuildkiteQueue"] = stack_id - params["CostAllocationTagValue"] = f"buildkite-{stack_id}" - params["BuildkiteAgentToken"] = args.agent_token - params["VpcId"] = default_vpc.id - params["Subnets"] = ",".join(subnets) - params["ManagedPolicyARNs"] = agent_iam_policy - params.update(COMMON_STACK_PARAMS) - return [{"ParameterKey": k, "ParameterValue": v} for k, v in params.items()] - - -def get_full_stack_id(stack_id): - return f"buildkite-{stack_id}-autoscaling-group" - - -def create_agent_iam_policy(args, *, client): - policy_stack_name = "buildkite-agent-iam-policy" - print(f"Creating stack {policy_stack_name} for agent IAM policy...") - with open( - os.path.join(current_dir, "agent-iam-policy-template.yml"), - encoding="utf-8", - ) as f: - policy_template = f.read() - promise = create_or_update_stack( - args, client=client, stack_name=policy_stack_name, template_body=policy_template - ) - wait(promise, client=client) - - cf = boto3.resource("cloudformation", region_name=args.aws_region) - policy = cf.StackResource(policy_stack_name, "BuildkiteAgentManagedPolicy") - return policy.physical_resource_id - - -def main(args): - client = boto3.client("cloudformation", region_name=args.aws_region) - - agent_iam_policy = create_agent_iam_policy(args, client=client) - - promises = [] - - for stack_id in AMI_ID: - stack_id_full = get_full_stack_id(stack_id) - print(f"Creating elastic CI stack {stack_id_full}...") - - params = format_params( - args, stack_id=stack_id, agent_iam_policy=agent_iam_policy - ) - - promise = create_or_update_stack( - args, - client=client, - stack_name=stack_id_full, - template_url=TEMPLATE_URL, - params=params, - ) - promises.append(promise) - print(f"CI stack 
{stack_id_full} is in progress in the background") - - for promise in promises: - wait(promise, client=client) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--aws-region", type=str, required=True) - parser.add_argument("--agent-token", type=str, required=True) - args = parser.parse_args() - main(args) diff --git a/tests/buildkite/infrastructure/aws-stack-creator/metadata.py b/tests/buildkite/infrastructure/aws-stack-creator/metadata.py deleted file mode 100644 index 5012aa738854..000000000000 --- a/tests/buildkite/infrastructure/aws-stack-creator/metadata.py +++ /dev/null @@ -1,114 +0,0 @@ -AMI_ID = { - # Managed by XGBoost team - "linux-amd64-gpu": { - "us-west-2": "ami-0b4079c15bbbd0faf", - }, - "linux-amd64-mgpu": { - "us-west-2": "ami-0b4079c15bbbd0faf", - }, - "windows-gpu": { - "us-west-2": "ami-0123456bcf4cdfb82", - }, - "windows-cpu": { - "us-west-2": "ami-0123456bcf4cdfb82", - }, - # Managed by BuildKite - # from https://s3.amazonaws.com/buildkite-aws-stack/latest/aws-stack.yml - "linux-amd64-cpu": { - "us-west-2": "ami-0083e0ae73c175ec6", - }, - "pipeline-loader": { - "us-west-2": "ami-0083e0ae73c175ec6", - }, - "linux-arm64-cpu": { - "us-west-2": "ami-0dbf1f9da54222f21", - }, -} - -STACK_PARAMS = { - "linux-amd64-gpu": { - "InstanceOperatingSystem": "linux", - "InstanceTypes": "g4dn.xlarge", - "AgentsPerInstance": "1", - "MinSize": "0", - "MaxSize": "8", - "OnDemandPercentage": "100", - "ScaleOutFactor": "1.0", - "ScaleInIdlePeriod": "60", # in seconds - }, - "linux-amd64-mgpu": { - "InstanceOperatingSystem": "linux", - "InstanceTypes": "g4dn.12xlarge", - "AgentsPerInstance": "1", - "MinSize": "0", - "MaxSize": "1", - "OnDemandPercentage": "100", - "ScaleOutFactor": "1.0", - "ScaleInIdlePeriod": "60", # in seconds - }, - "windows-gpu": { - "InstanceOperatingSystem": "windows", - "InstanceTypes": "g4dn.2xlarge", - "AgentsPerInstance": "1", - "MinSize": "0", - "MaxSize": "2", - "OnDemandPercentage": "100", - "ScaleOutFactor": "1.0", - "ScaleInIdlePeriod": "60", # in seconds - }, - "windows-cpu": { - "InstanceOperatingSystem": "windows", - "InstanceTypes": "c5a.2xlarge", - "AgentsPerInstance": "1", - "MinSize": "0", - "MaxSize": "2", - "OnDemandPercentage": "100", - "ScaleOutFactor": "1.0", - "ScaleInIdlePeriod": "60", # in seconds - }, - "linux-amd64-cpu": { - "InstanceOperatingSystem": "linux", - "InstanceTypes": "c5a.4xlarge", - "AgentsPerInstance": "1", - "MinSize": "0", - "MaxSize": "16", - "OnDemandPercentage": "100", - "ScaleOutFactor": "1.0", - "ScaleInIdlePeriod": "60", # in seconds - }, - "pipeline-loader": { - "InstanceOperatingSystem": "linux", - "InstanceTypes": "t3a.micro", - "AgentsPerInstance": "1", - "MinSize": "2", - "MaxSize": "2", - "OnDemandPercentage": "100", - "ScaleOutFactor": "1.0", - "ScaleInIdlePeriod": "60", # in seconds - }, - "linux-arm64-cpu": { - "InstanceOperatingSystem": "linux", - "InstanceTypes": "c6g.4xlarge", - "AgentsPerInstance": "1", - "MinSize": "0", - "MaxSize": "8", - "OnDemandPercentage": "100", - "ScaleOutFactor": "1.0", - "ScaleInIdlePeriod": "60", # in seconds - }, -} - -COMMON_STACK_PARAMS = { - "BuildkiteAgentTimestampLines": "false", - "BuildkiteWindowsAdministrator": "true", - "AssociatePublicIpAddress": "true", - "ScaleOutForWaitingJobs": "false", - "EnableCostAllocationTags": "true", - "CostAllocationTagName": "CreatedBy", - "ECRAccessPolicy": "full", - "EnableSecretsPlugin": "false", - "EnableECRPlugin": "false", - "EnableDockerLoginPlugin": "false", - 
"EnableDockerUserNamespaceRemap": "false", - "BuildkiteAgentExperiments": "normalised-upload-paths,resolve-commit-after-checkout", -} diff --git a/tests/buildkite/infrastructure/common_blocks/utils.py b/tests/buildkite/infrastructure/common_blocks/utils.py deleted file mode 100644 index 27a0835e8dc0..000000000000 --- a/tests/buildkite/infrastructure/common_blocks/utils.py +++ /dev/null @@ -1,97 +0,0 @@ -import re - -import boto3 -import botocore - - -def stack_exists(args, *, stack_name): - client = boto3.client("cloudformation", region_name=args.aws_region) - waiter = client.get_waiter("stack_exists") - try: - waiter.wait(StackName=stack_name, WaiterConfig={"MaxAttempts": 1}) - return True - except botocore.exceptions.WaiterError as e: - return False - - -def create_or_update_stack( - args, *, client, stack_name, template_url=None, template_body=None, params=None -): - kwargs = { - "StackName": stack_name, - "Capabilities": [ - "CAPABILITY_IAM", - "CAPABILITY_NAMED_IAM", - "CAPABILITY_AUTO_EXPAND", - ], - } - if template_url: - kwargs["TemplateURL"] = template_url - if template_body: - kwargs["TemplateBody"] = template_body - if params: - kwargs["Parameters"] = params - - if stack_exists(args, stack_name=stack_name): - print(f"Stack {stack_name} already exists. Updating...") - try: - response = client.update_stack(**kwargs) - return {"StackName": stack_name, "Action": "update"} - except botocore.exceptions.ClientError as e: - if e.response["Error"]["Code"] == "ValidationError" and re.search( - "No updates are to be performed", e.response["Error"]["Message"] - ): - print(f"No update was made to {stack_name}") - return {"StackName": stack_name, "Action": "noop"} - else: - raise e - else: - kwargs.update({"OnFailure": "ROLLBACK", "EnableTerminationProtection": False}) - response = client.create_stack(**kwargs) - return {"StackName": stack_name, "Action": "create"} - - -def replace_stack( - args, *, client, stack_name, template_url=None, template_body=None, params=None -): - """Delete an existing stack and create a new stack with identical name""" - - if not stack_exists(args, stack_name=stack_name): - raise ValueError(f"Stack {stack_name} does not exist") - r = client.delete_stack(StackName=stack_name) - delete_waiter = client.get_waiter("stack_delete_complete") - delete_waiter.wait(StackName=stack_name) - - kwargs = { - "StackName": stack_name, - "Capabilities": [ - "CAPABILITY_IAM", - "CAPABILITY_NAMED_IAM", - "CAPABILITY_AUTO_EXPAND", - ], - "OnFailure": "ROLLBACK", - "EnableTerminationProtection": False, - } - if template_url: - kwargs["TemplateURL"] = template_url - if template_body: - kwargs["TemplateBody"] = template_body - if params: - kwargs["Parameters"] = params - response = client.create_stack(**kwargs) - return {"StackName": stack_name, "Action": "create"} - - -def wait(promise, *, client): - stack_name = promise["StackName"] - print(f"Waiting for {stack_name}...") - if promise["Action"] == "create": - waiter = client.get_waiter("stack_create_complete") - waiter.wait(StackName=stack_name) - print(f"Finished creating stack {stack_name}") - elif promise["Action"] == "update": - waiter = client.get_waiter("stack_update_complete") - waiter.wait(StackName=stack_name) - print(f"Finished updating stack {stack_name}") - elif promise["Action"] != "noop": - raise ValueError(f"Invalid promise {promise}") diff --git a/tests/buildkite/infrastructure/requirements.txt b/tests/buildkite/infrastructure/requirements.txt deleted file mode 100644 index 3ce271ebbdd6..000000000000 --- 
a/tests/buildkite/infrastructure/requirements.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-boto3
-cfn_tools
diff --git a/tests/buildkite/infrastructure/service-user/create_service_user.py b/tests/buildkite/infrastructure/service-user/create_service_user.py
deleted file mode 100644
index ba08779bd159..000000000000
--- a/tests/buildkite/infrastructure/service-user/create_service_user.py
+++ /dev/null
@@ -1,44 +0,0 @@
-import argparse
-import os
-
-import boto3
-
-current_dir = os.path.dirname(__file__)
-
-
-def main(args):
-    with open(
-        os.path.join(current_dir, "service-user-template.yml"), encoding="utf-8"
-    ) as f:
-        service_user_template = f.read()
-
-    stack_id = "buildkite-elastic-ci-stack-service-user"
-
-    print("Create a new IAM user with suitable permissions...")
-    client = boto3.client("cloudformation", region_name=args.aws_region)
-    response = client.create_stack(
-        StackName=stack_id,
-        TemplateBody=service_user_template,
-        Capabilities=[
-            "CAPABILITY_IAM",
-            "CAPABILITY_NAMED_IAM",
-        ],
-        Parameters=[{"ParameterKey": "UserName", "ParameterValue": args.user_name}],
-    )
-    waiter = client.get_waiter("stack_create_complete")
-    waiter.wait(StackName=stack_id)
-    user = boto3.resource("iam", region_name=args.aws_region).User(args.user_name)
-    key_pair = user.create_access_key_pair()
-    print("Finished creating an IAM user with suitable permissions.")
-    print(f"Access Key ID: {key_pair.access_key_id}")
-    print(f"Secret Access Key: {key_pair.secret_access_key}")
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--aws-region", type=str, required=True)
-    parser.add_argument(
-        "--user-name", type=str, default="buildkite-elastic-ci-stack-user"
-    )
-    args = parser.parse_args()
-    main(args)
diff --git a/tests/buildkite/infrastructure/service-user/service-user-template.yml b/tests/buildkite/infrastructure/service-user/service-user-template.yml
deleted file mode 100644
index 2077cfe7b148..000000000000
--- a/tests/buildkite/infrastructure/service-user/service-user-template.yml
+++ /dev/null
@@ -1,349 +0,0 @@
----
-AWSTemplateFormatVersion: "2010-09-09"
-Description: "Buildkite Elastic CI Stack CloudFormation service user"
-
-Parameters:
-  UserName:
-    Type: String
-    Default: buildkite-elastic-ci-stack-user
-    Description: Name of user to create
-
-Outputs:
-  UserNameOutput:
-    Value: !Ref CloudFormationServiceUser
-  UserArnOutput:
-    Value: !GetAtt CloudFormationServiceUser.Arn
-
-Resources:
-  CloudFormationServiceUser:
-    Type: AWS::IAM::User
-    Properties:
-      ManagedPolicyArns:
-        - !Ref SubstackCrudPolicy
-        - !Ref CrudPolicy
-        - !Ref ImageBuilderPolicy
-      UserName: !Ref UserName
-
-  SubstackCrudPolicy:
-    Type: AWS::IAM::ManagedPolicy
-    Properties:
-      PolicyDocument:
-        {
-          "Version": "2012-10-17",
-          "Statement": [
-            {
-              "Effect": "Allow",
-              "Action": "cloudformation:*",
-              "Resource": "*"
-            },
-            {
-              "Effect": "Allow",
-              "Action": [
-                "serverlessrepo:GetApplication",
-                "serverlessrepo:GetCloudFormationTemplate",
-                "serverlessrepo:CreateCloudFormationTemplate"
-              ],
-              "Resource": "*"
-            }
-          ]
-        }
-
-  CrudPolicy:
-    Type: AWS::IAM::ManagedPolicy
-    Properties:
-      PolicyDocument:
-        {
-          "Version": "2012-10-17",
-          "Statement": [
-            {
-              "Effect": "Allow",
-              "Action": [
-                "ec2:DescribeAccountAttributes",
-                "ec2:DescribeAvailabilityZones",
-                "ec2:DescribeInstances",
-                "ec2:DescribeInternetGateways",
-                "ec2:DescribeLaunchTemplateVersions",
-                "ec2:DescribeLaunchTemplates",
-                "ec2:DescribeNetworkInterfaces",
-                "ec2:DescribeRouteTables",
-                "ec2:DescribeSecurityGroups",
-                
"ec2:DescribeSubnets", - "ec2:DescribeVpcs", - "ec2:CreateTags" - ], - "Resource": "*" - }, - { - "Effect": "Allow", - "Action": [ - "ec2:CreateInternetGateway", - "ec2:AttachInternetGateway", - "ec2:DetachInternetGateway", - "ec2:DeleteInternetGateway" - ], - "Resource": "arn:aws:ec2:*:*:internet-gateway/*" - }, - { - "Effect": "Allow", - "Action": [ - "ec2:CreateLaunchTemplate", - "ec2:CreateLaunchTemplateVersion", - "ec2:DeleteLaunchTemplate" - ], - "Resource": "arn:aws:ec2:*:*:launch-template/*" - }, - { - "Effect": "Allow", - "Action": [ - "ec2:AssociateRouteTable", - "ec2:DisassociateRouteTable", - "ec2:CreateRoute", - "ec2:CreateRouteTable", - "ec2:DeleteRoute", - "ec2:DeleteRouteTable" - ], - "Resource": "arn:aws:ec2:*:*:route-table/*" - }, - { - "Effect": "Allow", - "Action": [ - "ec2:AuthorizeSecurityGroupIngress", - "ec2:RevokeSecurityGroupIngress", - "ec2:CreateSecurityGroup", - "ec2:DeleteSecurityGroup" - ], - "Resource": "arn:aws:ec2:*:*:security-group/*" - }, - { - "Effect": "Allow", - "Action": "ec2:RunInstances", - "Resource": "*" - }, - { - "Effect": "Allow", - "Action": [ - "ec2:CreateSubnet", - "ec2:DeleteSubnet", - "ec2:AssociateRouteTable", - "ec2:DisassociateRouteTable" - ], - "Resource": "arn:aws:ec2:*:*:subnet/*" - }, - { - "Effect": "Allow", - "Action": [ - "ec2:CreateVpc", - "ec2:CreateSecurityGroup", - "ec2:ModifyVpcAttribute", - "ec2:AttachInternetGateway", - "ec2:DetachInternetGateway", - "ec2:CreateSubnet", - "ec2:CreateRouteTable", - "ec2:DeleteVpc" - ], - "Resource": "arn:aws:ec2:*:*:vpc/*" - }, - { - "Effect": "Allow", - "Action": [ - "ec2:CreateDefaultVpc", - "ec2:CreateDefaultSubnet" - ], - "Resource": "*" - }, - { - "Effect": "Allow", - "Action": [ - "iam:CreateInstanceProfile", - "iam:GetInstanceProfile", - "iam:AddRoleToInstanceProfile", - "iam:RemoveRoleFromInstanceProfile", - "iam:DeleteInstanceProfile" - ], - "Resource": "arn:aws:iam::*:instance-profile/*" - }, - { - "Effect": "Allow", - "Action": [ - "kms:DescribeKey", - "kms:CreateGrant", - "kms:Decrypt", - "kms:Encrypt" - ], - "Resource": "arn:aws:kms:*:*:key/*" - }, - { - "Effect": "Allow", - "Action": [ - "lambda:CreateFunction", - "lambda:GetFunction", - "lambda:GetFunctionCodeSigningConfig", - "lambda:AddPermission", - "lambda:RemovePermission", - "lambda:DeleteFunction", - "lambda:InvokeFunction", - "lambda:TagResource" - ], - "Resource": "arn:aws:lambda:*:*:function:*" - }, - { - "Effect": "Allow", - "Action": [ - "logs:CreateLogGroup", - "logs:PutRetentionPolicy", - "logs:DeleteLogGroup" - ], - "Resource": "arn:aws:logs:*:*:log-group:*" - }, - { - "Effect": "Allow", - "Action": [ - "s3:GetObject", - "s3:CreateBucket", - "s3:PutBucketAcl", - "s3:PutBucketLogging", - "s3:PutBucketTagging", - "s3:PutBucketVersioning" - ], - "Resource": "arn:aws:s3:::*" - }, - { - "Effect": "Allow", - "Action": [ - "ssm:GetParameter", - "ssm:PutParameter", - "ssm:DeleteParameter" - ], - "Resource": "arn:aws:ssm:*:*:parameter/*" - }, - { - "Effect": "Allow", - "Action": [ - "iam:ListPolicies", - "iam:ListInstanceProfiles", - "iam:ListRoles", - "iam:ListPolicyVersions", - "iam:ListRolePolicies", - "iam:ListAttachedRolePolicies", - "iam:ListInstanceProfileTags", - "iam:ListRoleTags", - "iam:ListInstanceProfilesForRole", - "iam:GetPolicyVersion", - "iam:GetPolicy", - "iam:GetInstanceProfile", - "iam:GetRole", - "iam:GetRolePolicy", - "iam:TagPolicy", - "iam:UntagPolicy", - "iam:TagInstanceProfile", - "iam:UntagInstanceProfile", - "iam:TagRole", - "iam:UntagRole", - "iam:CreateRole", - "iam:PassRole", - 
"iam:DeleteRole", - "iam:UpdateRoleDescription", - "iam:UpdateRole", - "iam:AddRoleToInstanceProfile", - "iam:RemoveRoleFromInstanceProfile", - "iam:CreateInstanceProfile", - "iam:DeleteInstanceProfile", - "iam:DetachRolePolicy", - "iam:SetDefaultPolicyVersion", - "iam:AttachRolePolicy", - "iam:UpdateAssumeRolePolicy", - "iam:PutRolePermissionsBoundary", - "iam:DeleteRolePermissionsBoundary", - "iam:CreatePolicy", - "iam:DeletePolicyVersion", - "iam:DeletePolicy", - "iam:PutRolePolicy", - "iam:DeleteRolePolicy" - ], - "Resource": "*" - }, - { - "Effect": "Allow", - "Action": [ - "autoscaling:DescribeLifecycleHookTypes", - "autoscaling:DescribeTerminationPolicyTypes", - "autoscaling:DescribePolicies", - "autoscaling:DescribeWarmPool", - "autoscaling:DescribeScalingActivities", - "autoscaling:DescribeScalingProcessTypes", - "autoscaling:DescribeScheduledActions", - "autoscaling:DescribeAutoScalingGroups", - "autoscaling:DescribeAutoScalingInstances", - "autoscaling:DescribeLifecycleHooks", - "autoscaling:SetDesiredCapacity", - "autoscaling:PutLifecycleHook", - "autoscaling:DeleteLifecycleHook", - "autoscaling:SetInstanceProtection", - "autoscaling:CreateAutoScalingGroup", - "autoscaling:EnableMetricsCollection", - "autoscaling:UpdateAutoScalingGroup", - "autoscaling:DeleteAutoScalingGroup", - "autoscaling:PutScalingPolicy", - "autoscaling:DeletePolicy", - "autoscaling:BatchPutScheduledUpdateGroupAction", - "autoscaling:PutScheduledUpdateGroupAction", - "autoscaling:DeleteScheduledAction", - "autoscaling:PutWarmPool", - "autoscaling:DeleteWarmPool", - "autoscaling:TerminateInstanceInAutoScalingGroup", - "autoscaling:AttachInstances" - ], - "Resource": "*" - }, - { - "Effect": "Allow", - "Action": [ - "events:DescribeRule", - "events:PutRule", - "events:PutTargets", - "events:RemoveTargets", - "events:DeleteRule" - ], - "Resource": "arn:aws:events:*:*:rule/*" - } - ] - } - - ImageBuilderPolicy: - Type: AWS::IAM::ManagedPolicy - Properties: - PolicyDocument: - { - "Version": "2012-10-17", - "Statement": [ - { - "Effect": "Allow", - "Action": [ - "imagebuilder:CreateComponent", - "imagebuilder:GetComponent", - "imagebuilder:DeleteComponent", - "imagebuilder:CreateImageRecipe", - "imagebuilder:GetImageRecipe", - "imagebuilder:DeleteImageRecipe", - "imagebuilder:CreateImagePipeline", - "imagebuilder:GetImagePipeline", - "imagebuilder:DeleteImagePipeline", - "imagebuilder:CreateInfrastructureConfiguration", - "imagebuilder:GetInfrastructureConfiguration", - "imagebuilder:DeleteInfrastructureConfiguration", - "imagebuilder:CreateDistributionConfiguration", - "imagebuilder:GetDistributionConfiguration", - "imagebuilder:DeleteDistributionConfiguration", - "imagebuilder:TagResource", - "imagebuilder:StartImagePipelineExecution", - "ec2:DescribeImages", - "ec2:DescribeSnapshots", - "ec2:DescribeRegions", - "ec2:DescribeVolumes", - "ec2:DescribeKeyPairs", - "ec2:DescribeInstanceTypeOfferings" - ], - "Resource": "*" - } - ] - } diff --git a/tests/buildkite/infrastructure/worker-image-pipeline/create_worker_image_pipelines.py b/tests/buildkite/infrastructure/worker-image-pipeline/create_worker_image_pipelines.py deleted file mode 100644 index 8051b991da51..000000000000 --- a/tests/buildkite/infrastructure/worker-image-pipeline/create_worker_image_pipelines.py +++ /dev/null @@ -1,85 +0,0 @@ -import argparse -import copy -import json -import os -import sys -from urllib.request import urlopen - -import boto3 -import cfn_flip -from metadata import IMAGE_PARAMS - -current_dir = os.path.dirname(__file__) 
-sys.path.append(os.path.join(current_dir, "..")) - -from common_blocks.utils import replace_stack, wait - -BUILDKITE_CF_TEMPLATE_URL = ( - "https://s3.amazonaws.com/buildkite-aws-stack/latest/aws-stack.yml" -) - - -def format_params(*, stack_id, aws_region, ami_mapping): - params = copy.deepcopy(IMAGE_PARAMS[stack_id]) - with open( - os.path.join(current_dir, params["BootstrapScript"]), - encoding="utf-8", - ) as f: - bootstrap_script = f.read() - params["BaseImageId"] = ami_mapping[aws_region][params["BaseImageId"]] - params["BootstrapScript"] = bootstrap_script - return [{"ParameterKey": k, "ParameterValue": v} for k, v in params.items()] - - -def get_ami_mapping(): - with urlopen(BUILDKITE_CF_TEMPLATE_URL) as response: - buildkite_cf_template = response.read().decode("utf-8") - cfn_obj = json.loads(cfn_flip.to_json(buildkite_cf_template)) - return cfn_obj["Mappings"]["AWSRegion2AMI"] - - -def get_full_stack_id(stack_id): - return f"buildkite-{stack_id}-worker" - - -def main(args): - with open( - os.path.join(current_dir, "ec2-image-builder-pipeline-template.yml"), - encoding="utf-8", - ) as f: - ec2_image_pipeline_template = f.read() - - ami_mapping = get_ami_mapping() - - client = boto3.client("cloudformation", region_name=args.aws_region) - promises = [] - - for stack_id in IMAGE_PARAMS: - stack_id_full = get_full_stack_id(stack_id) - print(f"Creating EC2 image builder stack {stack_id_full}...") - - params = format_params( - stack_id=stack_id, aws_region=args.aws_region, ami_mapping=ami_mapping - ) - - promise = replace_stack( - args, - client=client, - stack_name=stack_id_full, - template_body=ec2_image_pipeline_template, - params=params, - ) - promises.append(promise) - print( - f"EC2 image builder stack {stack_id_full} is in progress in the background" - ) - - for promise in promises: - wait(promise, client=client) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--aws-region", type=str, required=True) - args = parser.parse_args() - main(args) diff --git a/tests/buildkite/infrastructure/worker-image-pipeline/ec2-image-builder-pipeline-template.yml b/tests/buildkite/infrastructure/worker-image-pipeline/ec2-image-builder-pipeline-template.yml deleted file mode 100644 index 8d3bafa72f08..000000000000 --- a/tests/buildkite/infrastructure/worker-image-pipeline/ec2-image-builder-pipeline-template.yml +++ /dev/null @@ -1,108 +0,0 @@ ---- -AWSTemplateFormatVersion: "2010-09-09" -Description: "EC2 Image Builder pipelines to build workers" - -Parameters: - BaseImageId: - Type: String - Description: Base AMI to build a new image on top of. - - BootstrapScript: - Type: String - Description: Content of AMI customization script - - InstanceType: - Type: String - Description: Instance type for the Image Builder instances. 
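# The InstanceOperatingSystem parameter below also feeds the IsInstanceWindows
# condition further down in this template, which selects the root device name
# (/dev/sda1 on Windows, /dev/xvda on Linux) for the EBS block device mapping.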
- - InstanceOperatingSystem: - Type: String - Description: The operating system to run on the instance - AllowedValues: - - Linux - - Windows - Default: "Linux" - - VolumeSize: - Type: Number - Description: Size of EBS volume, in GiBs - -Conditions: - IsInstanceWindows: - !Equals [ !Ref InstanceOperatingSystem, "Windows" ] - -Resources: - # IAM role for the image builder instance - InstanceRole: - Type: AWS::IAM::Role - Properties: - AssumeRolePolicyDocument: - Version: "2012-10-17" - Statement: - - Effect: "Allow" - Principal: - Service: "ec2.amazonaws.com" - Action: "sts:AssumeRole" - ManagedPolicyArns: - - arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore - - arn:aws:iam::aws:policy/EC2InstanceProfileForImageBuilder - - arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess - - InstanceProfile: - Type: AWS::IAM::InstanceProfile - Properties: - Roles: - - !Ref InstanceRole - - # Component that runs the bootstrap script - BootstrapComponent: - Type: AWS::ImageBuilder::Component - Properties: - Name: !Join ["-", [!Ref AWS::StackName, "bootstrap-component", !Select [2, !Split ['/', !Ref AWS::StackId]]]] - Platform: !Ref InstanceOperatingSystem - Version: "1.0.0" - Description: Execute a bootstrap script. - Data: !Ref BootstrapScript - - Recipe: - Type: AWS::ImageBuilder::ImageRecipe - Properties: - Name: !Join ["-", [!Ref AWS::StackName, "image", !Select [2, !Split ['/', !Ref AWS::StackId]]]] - Components: - - ComponentArn: !Ref BootstrapComponent - ParentImage: !Ref BaseImageId - BlockDeviceMappings: - - DeviceName: !If [IsInstanceWindows, "/dev/sda1", "/dev/xvda"] - Ebs: - DeleteOnTermination: true - Encrypted: false - VolumeSize: !Ref VolumeSize - VolumeType: gp2 - Version: "1.0.0" - - Infrastructure: - Type: AWS::ImageBuilder::InfrastructureConfiguration - Properties: - Name: !Join ["-", [!Ref AWS::StackName, "image-pipeline-infrastructure", !Select [2, !Split ['/', !Ref AWS::StackId]]]] - InstanceProfileName: !Ref InstanceProfile - InstanceTypes: - - !Ref InstanceType - TerminateInstanceOnFailure: true - - # Copy to this region only - Distribution: - Type: AWS::ImageBuilder::DistributionConfiguration - Properties: - Name: !Join ["-", [!Ref AWS::StackName, "image-pipeline-distribution-config", !Select [2, !Split ['/', !Ref AWS::StackId]]]] - Distributions: - - Region: !Ref AWS::Region - AmiDistributionConfiguration: {} - - # Composition of the above elements - Pipeline: - Type: AWS::ImageBuilder::ImagePipeline - Properties: - Name: !Join ["-", [!Ref AWS::StackName, "image-pipeline", !Select [2, !Split ['/', !Ref AWS::StackId]]]] - DistributionConfigurationArn: !Ref Distribution - ImageRecipeArn: !Ref Recipe - InfrastructureConfigurationArn: !Ref Infrastructure diff --git a/tests/buildkite/infrastructure/worker-image-pipeline/linux-amd64-gpu-bootstrap.yml b/tests/buildkite/infrastructure/worker-image-pipeline/linux-amd64-gpu-bootstrap.yml deleted file mode 100644 index 88403911cbc6..000000000000 --- a/tests/buildkite/infrastructure/worker-image-pipeline/linux-amd64-gpu-bootstrap.yml +++ /dev/null @@ -1,24 +0,0 @@ -name: BuildKiteLinuxAMD64GPUBootstrap -description: Set up worker image for linux-amd64-gpu pipeline -schemaVersion: 1.0 - -phases: - - name: build - steps: - - name: SetupStep - action: ExecuteBash - inputs: - commands: - - | - yum groupinstall -y "Development tools" - yum install -y kernel-devel-$(uname -r) - dnf install -y kernel-modules-extra - aws s3 cp --recursive s3://ec2-linux-nvidia-drivers/latest/ . 
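# The bucket above (ec2-linux-nvidia-drivers) is the AWS-hosted source of
# NVIDIA drivers for EC2 GPU instances; the downloaded .run installer is
# made executable and run non-interactively in the next two steps.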
- chmod +x NVIDIA-Linux-x86_64*.run - ./NVIDIA-Linux-x86_64*.run --silent - - curl -s -L https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo | tee /etc/yum.repos.d/nvidia-container-toolkit.repo - yum install -y nvidia-container-toolkit - yum clean expire-cache - nvidia-ctk runtime configure --runtime=docker - systemctl restart docker diff --git a/tests/buildkite/infrastructure/worker-image-pipeline/metadata.py b/tests/buildkite/infrastructure/worker-image-pipeline/metadata.py deleted file mode 100644 index 37100209fe2e..000000000000 --- a/tests/buildkite/infrastructure/worker-image-pipeline/metadata.py +++ /dev/null @@ -1,18 +0,0 @@ -IMAGE_PARAMS = { - "linux-amd64-gpu": { - "BaseImageId": "linuxamd64", - # AMI ID is looked up from Buildkite's CloudFormation template - "BootstrapScript": "linux-amd64-gpu-bootstrap.yml", - "InstanceType": "g4dn.xlarge", - "InstanceOperatingSystem": "Linux", - "VolumeSize": "40", # in GiBs - }, - "windows-gpu": { - "BaseImageId": "windows", - # AMI ID is looked up from Buildkite's CloudFormation template - "BootstrapScript": "windows-gpu-bootstrap.yml", - "InstanceType": "g4dn.2xlarge", - "InstanceOperatingSystem": "Windows", - "VolumeSize": "120", # in GiBs - }, -} diff --git a/tests/buildkite/infrastructure/worker-image-pipeline/run_pipelines.py b/tests/buildkite/infrastructure/worker-image-pipeline/run_pipelines.py deleted file mode 100644 index 9edb8b1a7c24..000000000000 --- a/tests/buildkite/infrastructure/worker-image-pipeline/run_pipelines.py +++ /dev/null @@ -1,22 +0,0 @@ -import argparse - -import boto3 -from create_worker_image_pipelines import get_full_stack_id -from metadata import IMAGE_PARAMS - - -def main(args): - cf = boto3.resource("cloudformation", region_name=args.aws_region) - builder_client = boto3.client("imagebuilder", region_name=args.aws_region) - for stack_id in IMAGE_PARAMS: - stack_id_full = get_full_stack_id(stack_id) - pipeline_arn = cf.Stack(stack_id_full).Resource("Pipeline").physical_resource_id - print(f"Running pipeline {pipeline_arn} to generate a new AMI...") - r = builder_client.start_image_pipeline_execution(imagePipelineArn=pipeline_arn) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--aws-region", type=str, required=True) - args = parser.parse_args() - main(args) diff --git a/tests/buildkite/infrastructure/worker-image-pipeline/windows-gpu-bootstrap.yml b/tests/buildkite/infrastructure/worker-image-pipeline/windows-gpu-bootstrap.yml deleted file mode 100644 index 0348e28c8709..000000000000 --- a/tests/buildkite/infrastructure/worker-image-pipeline/windows-gpu-bootstrap.yml +++ /dev/null @@ -1,71 +0,0 @@ -name: BuildKiteWindowsGPUBootstrap -description: Set up worker image for windows-gpu pipeline -schemaVersion: 1.0 - -phases: - - name: build - steps: - - name: SetupStep - action: ExecutePowerShell - inputs: - commands: - - | - $ErrorActionPreference = "Stop" - - choco --version - choco feature enable -n=allowGlobalConfirmation - - # CMake 3.29.2 - Write-Host '>>> Installing CMake 3.29.2...' - choco install cmake --version 3.29.2 --installargs "ADD_CMAKE_TO_PATH=System" - if ($LASTEXITCODE -ne 0) { throw "Last command failed" } - - # Notepad++ - Write-Host '>>> Installing Notepad++...' - choco install notepadplusplus - if ($LASTEXITCODE -ne 0) { throw "Last command failed" } - - # Mambaforge - Write-Host '>>> Installing Mambaforge...' 
- choco install mambaforge /RegisterPython:1 /D:C:\tools\mambaforge - C:\tools\mambaforge\Scripts\conda.exe init --user --system - if ($LASTEXITCODE -ne 0) { throw "Last command failed" } - . "C:\Windows\System32\WindowsPowerShell\v1.0\profile.ps1" - if ($LASTEXITCODE -ne 0) { throw "Last command failed" } - conda config --set auto_activate_base false - - # Install Java 11 - Write-Host '>>> Installing Java 11...' - choco install openjdk11 - if ($LASTEXITCODE -ne 0) { throw "Last command failed" } - - # Install Maven - Write-Host '>>> Installing Maven...' - choco install maven - if ($LASTEXITCODE -ne 0) { throw "Last command failed" } - - # Install GraphViz - Write-Host '>>> Installing GraphViz...' - choco install graphviz - if ($LASTEXITCODE -ne 0) { throw "Last command failed" } - - # Install Visual Studio 2022 Community - Write-Host '>>> Installing Visual Studio 2022 Community...' - choco install visualstudio2022community ` - --params "--wait --passive --norestart" - if ($LASTEXITCODE -ne 0) { throw "Last command failed" } - choco install visualstudio2022-workload-nativedesktop --params ` - "--wait --passive --norestart --includeOptional" - if ($LASTEXITCODE -ne 0) { throw "Last command failed" } - - # Install CUDA 12.4 - Write-Host '>>> Installing CUDA 12.4...' - choco install cuda --version=12.4.1.551 - if ($LASTEXITCODE -ne 0) { throw "Last command failed" } - - # Install R - Write-Host '>>> Installing R...' - choco install r.project --version=4.3.2 - if ($LASTEXITCODE -ne 0) { throw "Last command failed" } - choco install rtools --version=4.3.5550 - if ($LASTEXITCODE -ne 0) { throw "Last command failed" } diff --git a/tests/buildkite/pipeline-mac-m1.yml b/tests/buildkite/pipeline-mac-m1.yml deleted file mode 100644 index 57b1b1d12010..000000000000 --- a/tests/buildkite/pipeline-mac-m1.yml +++ /dev/null @@ -1,13 +0,0 @@ -steps: - - block: ":rocket: Run this test job" - if: build.pull_request.id != null || build.branch =~ /^dependabot\// - - label: ":macos: Build libxgboost4j.dylib for MacOS M1" - command: "tests/buildkite/build-jvm-macos-m1.sh" - key: mac-m1-jvm - agents: - queue: mac-mini-m1 - - label: ":macos: Build and Test XGBoost for MacOS M1 with Clang 11" - command: "tests/buildkite/test-macos-m1-clang11.sh" - key: mac-m1-appleclang11 - agents: - queue: mac-mini-m1 diff --git a/tests/buildkite/pipeline-mgpu.yml b/tests/buildkite/pipeline-mgpu.yml index cbb573c3682c..4246425de0ca 100644 --- a/tests/buildkite/pipeline-mgpu.yml +++ b/tests/buildkite/pipeline-mgpu.yml @@ -24,25 +24,9 @@ steps: queue: linux-amd64-cpu - wait #### -------- BUILD -------- - - label: ":console: Build CUDA" - command: "tests/buildkite/build-cuda.sh" - key: build-cuda - agents: - queue: linux-amd64-cpu - label: ":console: Build and test JVM packages with CUDA" command: "tests/buildkite/build-jvm-packages-gpu.sh" key: build-jvm-packages-gpu agents: queue: linux-amd64-mgpu - wait - #### -------- TEST -------- - - label: ":console: Run Google Tests" - command: "tests/buildkite/test-cpp-mgpu.sh" - key: test-cpp-mgpu - agents: - queue: linux-amd64-mgpu - - label: ":console: Test Python package, 4 GPUs" - command: "tests/buildkite/test-python-gpu.sh mgpu" - key: test-python-mgpu - agents: - queue: linux-amd64-mgpu diff --git a/tests/buildkite/pipeline-nightly.yml b/tests/buildkite/pipeline-nightly.yml deleted file mode 100644 index c8cc459acc1e..000000000000 --- a/tests/buildkite/pipeline-nightly.yml +++ /dev/null @@ -1,37 +0,0 @@ -# Nightly CI pipeline, to test against dev versions of dependencies - -env: - 
DOCKER_CACHE_ECR_ID: "492475357299" - DOCKER_CACHE_ECR_REGION: "us-west-2" - DISABLE_RELEASE: "1" - # Skip uploading artifacts to S3 bucket - # Also, don't build all CUDA archs; just build sm_75 - USE_DEPS_DEV_VER: "1" - # Use dev versions of RAPIDS and other dependencies -steps: - #### -------- CONTAINER BUILD -------- - - label: ":docker: Build containers" - commands: - - "tests/buildkite/build-containers.sh gpu_build_rockylinux8" - - "tests/buildkite/build-containers.sh gpu_dev_ver" - key: build-containers - agents: - queue: linux-amd64-cpu - - wait - - - label: ":console: Build CUDA" - command: "tests/buildkite/build-cuda.sh" - key: build-cuda - agents: - queue: linux-amd64-cpu - - wait - - label: ":console: Test Python package, single GPU" - command: "tests/buildkite/test-python-gpu.sh gpu" - key: test-python-gpu - agents: - queue: linux-amd64-gpu - - label: ":console: Test Python package, 4 GPUs" - command: "tests/buildkite/test-python-gpu.sh mgpu" - key: test-python-mgpu - agents: - queue: linux-amd64-mgpu diff --git a/tests/buildkite/pipeline-win64.yml b/tests/buildkite/pipeline-win64.yml deleted file mode 100644 index 83a61981e716..000000000000 --- a/tests/buildkite/pipeline-win64.yml +++ /dev/null @@ -1,24 +0,0 @@ -steps: - - label: ":moneybag: Enforce daily budget" - command: "tests/buildkite/enforce_daily_budget.sh" - key: enforce-daily-budget - agents: - queue: pipeline-loader - - wait - - block: ":rocket: Run this test job" - if: build.pull_request.id != null || build.branch =~ /^dependabot\// - #### -------- BUILD -------- - - label: ":windows: Build XGBoost for Windows with CUDA" - command: "tests/buildkite/build-win64-gpu.ps1" - key: build-win64-gpu - agents: - queue: windows-cpu - - - wait - - #### -------- TEST -------- - - label: ":windows: Test XGBoost on Windows" - command: "tests/buildkite/test-win64-gpu.ps1" - key: test-win64-gpu - agents: - queue: windows-gpu diff --git a/tests/buildkite/pipeline.yml b/tests/buildkite/pipeline.yml index 19e9c6e2b9e5..65225649a3af 100644 --- a/tests/buildkite/pipeline.yml +++ b/tests/buildkite/pipeline.yml @@ -21,89 +21,19 @@ steps: queue: linux-amd64-cpu - wait #### -------- BUILD -------- - - label: ":console: Run clang-tidy" - command: "tests/buildkite/run-clang-tidy.sh" - key: run-clang-tidy - agents: - queue: linux-amd64-cpu - - label: ":console: Build CPU" - command: "tests/buildkite/build-cpu.sh" - key: build-cpu - agents: - queue: linux-amd64-cpu - - label: ":console: Build CPU ARM64 + manylinux_2_28_aarch64 wheel" - command: "tests/buildkite/build-cpu-arm64.sh" - key: build-cpu-arm64 - agents: - queue: linux-arm64-cpu - - label: ":console: Build CUDA + manylinux_2_28_x86_64 wheel" - command: "tests/buildkite/build-cuda.sh" - key: build-cuda - agents: - queue: linux-amd64-cpu - - label: ":console: Build CUDA with RMM" - command: "tests/buildkite/build-cuda-with-rmm.sh" - key: build-cuda-with-rmm - agents: - queue: linux-amd64-cpu - - label: ":console: Build R package with CUDA" - command: "tests/buildkite/build-gpu-rpkg.sh" - key: build-gpu-rpkg - agents: - queue: linux-amd64-cpu - label: ":console: Build JVM packages" timeout_in_minutes: 30 command: "tests/buildkite/build-jvm-packages.sh" key: build-jvm-packages agents: queue: linux-amd64-cpu - - label: ":console: Build libxgboost4j.so for Linux ARM64 (targeting glibc 2.17)" - command: "tests/buildkite/build-jvm-linux-arm64-manylinux2014.sh" - key: build-jvm-linux-arm64-manylinux2014 - agents: - queue: linux-arm64-cpu - - label: ":console: Build libxgboost4j.so for Linux 
x86_64 (targeting glibc 2.17)" - command: "tests/buildkite/build-jvm-linux-x86_64-manylinux2014.sh" - key: build-jvm-linux-x86_64-manylinux2014 - agents: - queue: linux-amd64-cpu - label: ":console: Build JVM package doc" command: "tests/buildkite/build-jvm-doc.sh" key: build-jvm-doc agents: queue: linux-amd64-cpu - - label: ":console: Build manylinux2014_x86_64 wheel" - command: "tests/buildkite/build-manylinux2014.sh x86_64" - key: build-manylinux2014-x86_64 - agents: - queue: linux-amd64-cpu - - label: ":console: Build manylinux2014_aarch64 wheel" - command: "tests/buildkite/build-manylinux2014.sh aarch64" - key: build-manylinux2014-aarch64 - agents: - queue: linux-arm64-cpu - wait #### -------- TEST -------- - - label: ":console: Test Python package, CPU" - command: "tests/buildkite/test-python-cpu.sh" - key: test-python-cpu - agents: - queue: linux-amd64-cpu - - label: ":console: Test Python package, CPU ARM64" - command: "tests/buildkite/test-python-cpu-arm64.sh" - key: test-python-cpu-arm64 - agents: - queue: linux-arm64-cpu - - label: ":console: Test Python package, single GPU" - command: "tests/buildkite/test-python-gpu.sh gpu" - key: test-python-gpu - agents: - queue: linux-amd64-gpu - - label: ":console: Run Google Tests" - command: "tests/buildkite/test-cpp-gpu.sh" - key: test-cpp-gpu - agents: - queue: linux-amd64-gpu - label: ":console: Run integration tests with JVM packages" command: "tests/buildkite/test-integration-jvm-packages.sh" key: test-integration-jvm-packages diff --git a/tests/buildkite/test-integration-jvm-packages.sh b/tests/buildkite/test-integration-jvm-packages.sh deleted file mode 100755 index 51f74afe9006..000000000000 --- a/tests/buildkite/test-integration-jvm-packages.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -source tests/buildkite/conftest.sh - -echo "--- Test XGBoost4J on a machine with JDK ${JDK_VERSION}, Spark ${SPARK_VERSION}" -buildkite-agent artifact download "jvm-packages/xgboost4j/target/*.jar" . --step build-jvm-packages -buildkite-agent artifact download "jvm-packages/xgboost4j-spark/target/*.jar" . --step build-jvm-packages -buildkite-agent artifact download "jvm-packages/xgboost4j-example/target/*.jar" . --step build-jvm-packages -export CI_DOCKER_EXTRA_PARAMS_INIT='-e RUN_INTEGRATION_TEST=1' -tests/ci_build/ci_build.sh jvm_cross --build-arg JDK_VERSION=${JDK_VERSION} \ - --build-arg SPARK_VERSION=${SPARK_VERSION} tests/ci_build/test_jvm_cross.sh diff --git a/tests/buildkite/test-macos-m1-clang11.sh b/tests/buildkite/test-macos-m1-clang11.sh deleted file mode 100755 index 6824cb7b14b4..000000000000 --- a/tests/buildkite/test-macos-m1-clang11.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -source tests/buildkite/conftest.sh - -# Display system info -echo "--- Display system information" -set -x -system_profiler SPSoftwareDataType -sysctl -n machdep.cpu.brand_string -uname -m -set +x - -# Ensure that XGBoost can be built with Clang 11 -echo "--- Build and Test XGBoost with MacOS M1, Clang 11" -set -x -LLVM11_PATH=$(brew --prefix llvm\@11) -mkdir build -pushd build -cmake .. 
-GNinja -DCMAKE_C_COMPILER=${LLVM11_PATH}/bin/clang \
-  -DCMAKE_CXX_COMPILER=${LLVM11_PATH}/bin/clang++ -DGOOGLE_TEST=ON \
-  -DUSE_DMLC_GTEST=ON
-ninja -v
-./testxgboost
diff --git a/tests/ci_build/build_jvm_packages.sh b/tests/ci_build/build_jvm_packages.sh
index 97c056403f0a..23811f817bd7 100755
--- a/tests/ci_build/build_jvm_packages.sh
+++ b/tests/ci_build/build_jvm_packages.sh
@@ -1,7 +1,6 @@
 #!/bin/bash
 
-set -e
-set -x
+set -euo pipefail
 
 spark_version=$1
 use_cuda=$2
@@ -13,9 +12,6 @@ if [ "x$use_cuda" == "x-Duse.cuda=ON" ]; then
   gpu_options="$use_cuda -Pgpu"
 fi
 
-# Initialize local Maven repository
-./tests/ci_build/initialize_maven.sh
-
 rm -rf build/
 cd jvm-packages
 
@@ -25,11 +21,10 @@ fi
 
 if [ "x$use_scala213" != "x" ]; then
   cd ..
-  python dev/change_scala_version.py --scala-version 2.13 --purge-artifacts
+  python ops/change_scala_version.py --scala-version 2.13 --purge-artifacts
   cd jvm-packages
 fi
 
 mvn --no-transfer-progress package -Dspark.version=${spark_version} $gpu_options
 
 set +x
-set +e
diff --git a/tests/ci_build/ci_build.sh b/tests/ci_build/ci_build.sh
deleted file mode 100755
index a2f2d6063160..000000000000
--- a/tests/ci_build/ci_build.sh
+++ /dev/null
@@ -1,248 +0,0 @@
-#!/usr/bin/env bash
-#
-# Execute command within a docker container
-#
-# Usage: ci_build.sh <CONTAINER_TYPE> [--use-gpus]
-#                    [--dockerfile <DOCKERFILE_PATH>] [-it]
-#                    [--build-arg <BUILD_ARG>] <COMMAND>
-#
-# CONTAINER_TYPE: Type of the docker container used to run the build: e.g.,
-#                 (cpu | gpu)
-#
-# --use-gpus: Whether to grant the container access to NVIDIA GPUs.
-#
-# DOCKERFILE_PATH: (Optional) Path to the Dockerfile used for docker build. If
-#                  this optional value is not supplied (via the --dockerfile
-#                  flag), will use Dockerfile.CONTAINER_TYPE by default
-#
-# BUILD_ARG: (Optional) an argument to be passed to docker build
-#
-# COMMAND: Command to be executed in the docker container
-#
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-
-# Get the command line arguments.
-CONTAINER_TYPE=$( echo "$1" | tr '[:upper:]' '[:lower:]' )
-shift 1
-
-# Dockerfile to be used in docker build
-DOCKERFILE_PATH="${SCRIPT_DIR}/Dockerfile.${CONTAINER_TYPE}"
-DOCKER_CONTEXT_PATH="${SCRIPT_DIR}"
-
-GPU_FLAG=''
-if [[ "$1" == "--use-gpus" ]]; then
-  echo "Using NVIDIA GPUs"
-  GPU_FLAG='--gpus all'
-  shift 1
-fi
-
-if [[ "$1" == "--dockerfile" ]]; then
-  DOCKERFILE_PATH="$2"
-  DOCKER_CONTEXT_PATH=$(dirname "${DOCKERFILE_PATH}")
-  echo "Using custom Dockerfile path: ${DOCKERFILE_PATH}"
-  echo "Using custom docker build context path: ${DOCKER_CONTEXT_PATH}"
-  shift 2
-fi
-
-if [[ -n "${CI_DOCKER_EXTRA_PARAMS_INIT}" ]]
-then
-  IFS=' ' read -r -a CI_DOCKER_EXTRA_PARAMS <<< "${CI_DOCKER_EXTRA_PARAMS_INIT}"
-fi
-
-if [[ "$1" == "-it" ]]; then
-  CI_DOCKER_EXTRA_PARAMS+=('-it')
-  shift 1
-fi
-
-while [[ "$1" == "--build-arg" ]]; do
-  CI_DOCKER_BUILD_ARG+=" $1"
-  CI_DOCKER_BUILD_ARG+=" $2"
-  shift 2
-done
-
-if [[ ! -f "${DOCKERFILE_PATH}" ]]; then
-  echo "Invalid Dockerfile path: \"${DOCKERFILE_PATH}\""
-  exit 1
-fi
-
-COMMAND=("$@")
-
-# Validate command line arguments.
-if [ "$#" -lt 1 ] || [ ! -e "${SCRIPT_DIR}/Dockerfile.${CONTAINER_TYPE}" ]; then
-  supported_container_types=$( ls -1 ${SCRIPT_DIR}/Dockerfile.* | \
-      sed -n 's/.*Dockerfile\.\([^\/]*\)/\1/p' | tr '\n' ' ' )
-  echo "Usage: $(basename $0) CONTAINER_TYPE COMMAND"
-  echo "       CONTAINER_TYPE can be one of [${supported_container_types}]"
-  echo "       COMMAND is a command (with arguments) to run inside"
-  echo "       the container."
- exit 1 -fi - -# Helper function to traverse directories up until given file is found. -function upsearch () { - test / == "$PWD" && return || \ - test -e "$1" && echo "$PWD" && return || \ - cd .. && upsearch "$1" -} - -# Set up WORKSPACE. Jenkins will set them for you or we pick -# reasonable defaults if you run it outside of Jenkins. -WORKSPACE="${WORKSPACE:-${SCRIPT_DIR}/../../}" - -# Determine the docker image name -DOCKER_IMG_NAME="xgb-ci.${CONTAINER_TYPE}" - -# Append cuda version if available -CUDA_VERSION=$(echo "${CI_DOCKER_BUILD_ARG}" | grep -o -E 'CUDA_VERSION_ARG=[0-9]+\.[0-9]+' | grep -o -E '[0-9]+\.[0-9]+') -# Append jdk version if available -JDK_VERSION=$(echo "${CI_DOCKER_BUILD_ARG}" | grep -o -E 'JDK_VERSION=[0-9]+' | grep -o -E '[0-9]+') -# Append cmake version if available -CMAKE_VERSION=$(echo "${CI_DOCKER_BUILD_ARG}" | grep -o -E 'CMAKE_VERSION=[0-9]+\.[0-9]+' | grep -o -E '[0-9]+\.[0-9]+') -# Append R version if available -USE_R35=$(echo "${CI_DOCKER_BUILD_ARG}" | grep -o -E 'USE_R35=[0-9]+' | grep -o -E '[0-9]+$') -if [[ ${USE_R35} == "1" ]]; then - USE_R35="_r35" -elif [[ ${USE_R35} == "0" ]]; then - USE_R35="_no_r35" -fi -DOCKER_IMG_NAME=$DOCKER_IMG_NAME$CUDA_VERSION$JDK_VERSION$CMAKE_VERSION$USE_R35 - -# Under Jenkins matrix build, the build tag may contain characters such as -# commas (,) and equal signs (=), which are not valid inside docker image names. -DOCKER_IMG_NAME=$(echo "${DOCKER_IMG_NAME}" | sed -e 's/=/_/g' -e 's/,/-/g') - -# Convert to all lower-case, as per requirement of Docker image names -DOCKER_IMG_NAME=$(echo "${DOCKER_IMG_NAME}" | tr '[:upper:]' '[:lower:]') - -# Bash on Ubuntu on Windows -UBUNTU_ON_WINDOWS=$([ -e /proc/version ] && grep -l Microsoft /proc/version || echo "") -# MSYS, Git Bash, etc. -MSYS=$([ -e /proc/version ] && grep -l MINGW /proc/version || echo "") - -if [[ -z "$UBUNTU_ON_WINDOWS" ]] && [[ -z "$MSYS" ]] && [[ ! "$OSTYPE" == "darwin"* ]]; then - USER_IDS="-e CI_BUILD_UID=$( id -u ) -e CI_BUILD_GID=$( id -g ) -e CI_BUILD_USER=$( id -un ) -e CI_BUILD_GROUP=$( id -gn ) -e CI_BUILD_HOME=${WORKSPACE}" -fi - -# Print arguments. 
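To make the image-naming convention above concrete, here is a minimal standalone Bash sketch; the build-arg string is hypothetical, and only the grep/sed pipelines mirror the deleted script's logic:

#!/bin/bash
# Sketch: derive a Docker image tag from "--build-arg KEY=VALUE" pairs,
# mirroring the grep pipelines in the deleted ci_build.sh. The values
# below are made up for illustration.
CI_DOCKER_BUILD_ARG=" --build-arg CUDA_VERSION_ARG=11.8 --build-arg JDK_VERSION=8"
CUDA_VERSION=$(echo "${CI_DOCKER_BUILD_ARG}" | grep -o -E 'CUDA_VERSION_ARG=[0-9]+\.[0-9]+' | grep -o -E '[0-9]+\.[0-9]+')
JDK_VERSION=$(echo "${CI_DOCKER_BUILD_ARG}" | grep -o -E 'JDK_VERSION=[0-9]+' | grep -o -E '[0-9]+')
DOCKER_IMG_NAME="xgb-ci.gpu${CUDA_VERSION}${JDK_VERSION}"
# Image names must be lower-case and free of '=' and ','
DOCKER_IMG_NAME=$(echo "${DOCKER_IMG_NAME}" | sed -e 's/=/_/g' -e 's/,/-/g' | tr '[:upper:]' '[:lower:]')
echo "${DOCKER_IMG_NAME}"  # prints: xgb-ci.gpu11.88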
-cat < Date: Thu, 31 Oct 2024 22:36:51 -0700 Subject: [PATCH 15/86] Fix dmlc/xgboost#10752 --- .github/workflows/main.yml | 19 +++++++++++++++++++ ops/docker/ci_container.yml | 6 ++++++ ops/docker/dockerfile/Dockerfile.cpu | 3 +-- ops/docker/dockerfile/Dockerfile.gpu | 3 +-- ops/pipeline/build-cuda-with-rmm.sh | 2 +- ops/pipeline/build-cuda.sh | 3 ++- ops/pipeline/build-gpu-rpkg.sh | 4 ++-- ops/pipeline/build-jvm-doc.sh | 2 +- ops/pipeline/build-jvm-macos-m1.sh | 2 +- ops/pipeline/build-jvm-manylinux2014.sh | 2 +- ops/pipeline/build-manylinux2014.sh | 2 +- ops/pipeline/run-clang-tidy.sh | 2 +- ops/pipeline/test-cpp-gpu.sh | 2 +- ops/pipeline/test-python.sh | 18 +++++++++++++----- ops/{tidy.py => run_clang_tidy.py} | 0 .../test_gpu_with_dask/test_gpu_with_dask.py | 2 ++ 16 files changed, 53 insertions(+), 19 deletions(-) rename ops/{tidy.py => run_clang_tidy.py} (100%) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 276fa45ba533..a05661d22c80 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -25,6 +25,7 @@ jobs: matrix: container_id: - xgb-ci.gpu_build_rockylinux8 + - xgb-ci.gpu_build_r_rockylinux8 - xgb-ci.gpu - xgb-ci.gpu_dev_ver - xgb-ci.cpu @@ -220,6 +221,24 @@ jobs: CONTAINER_ID: xgb-ci.manylinux2014_${{ matrix.arch }} - run: bash ops/pipeline/build-manylinux2014.sh ${{ matrix.arch }} + build-gpu-rpkg: + name: Build GPU-enabled R package + needs: build-containers + runs-on: + - runs-on=${{ github.run_id }} + - runner=linux-amd64-gpu + steps: + # Restart Docker daemon so that it recognized the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4 + with: + submodules: "true" + - name: Fetch container from cache + run: bash ops/docker_build.sh + env: + CONTAINER_ID: xgb-ci.gpu_build_r_rockylinux8 + - run: bash ops/pipeline/build-gpu-rpkg.sh + test-cpp-gpu: name: Run Google Tests with GPU(s) needs: [build-cuda, build-cuda-with-rmm] diff --git a/ops/docker/ci_container.yml b/ops/docker/ci_container.yml index f21122231c0b..1b3a60adc014 100644 --- a/ops/docker/ci_container.yml +++ b/ops/docker/ci_container.yml @@ -10,6 +10,12 @@ xgb-ci.gpu_build_rockylinux8: NCCL_VERSION_ARG: "2.22.3-1" RAPIDS_VERSION_ARG: "24.10" +xgb-ci.gpu_build_r_rockylinux8: + container_def: gpu_build_r_rockylinux8 + build_args: + CUDA_VERSION_ARG: "12.5.1" + R_VERSION_ARG: "4.3.2" + xgb-ci.gpu: container_def: gpu build_args: diff --git a/ops/docker/dockerfile/Dockerfile.cpu b/ops/docker/dockerfile/Dockerfile.cpu index 22db93572207..64b28026a89c 100644 --- a/ops/docker/dockerfile/Dockerfile.cpu +++ b/ops/docker/dockerfile/Dockerfile.cpu @@ -41,8 +41,7 @@ RUN git clone -b v1.65.4 https://github.com/grpc/grpc.git \ COPY conda_env/linux_cpu_test.yml /scripts/ RUN mamba create -n linux_cpu_test && \ mamba env update -n linux_cpu_test --file=/scripts/linux_cpu_test.yml && \ - mamba clean --all --yes && \ - conda run --no-capture-output -n linux_cpu_test pip install buildkite-test-collector + mamba clean --all --yes # Install lightweight sudo (not bound to TTY) RUN set -ex; \ diff --git a/ops/docker/dockerfile/Dockerfile.gpu b/ops/docker/dockerfile/Dockerfile.gpu index 461f1d99dd54..eac35c3aaa90 100644 --- a/ops/docker/dockerfile/Dockerfile.gpu +++ b/ops/docker/dockerfile/Dockerfile.gpu @@ -33,8 +33,7 @@ RUN \ numpy pytest pytest-timeout scipy scikit-learn pandas matplotlib wheel \ python-kubernetes urllib3 graphviz hypothesis loky \ "pyspark>=3.4.0" cloudpickle cuda-python && \ - mamba clean --all --yes && \ - conda run 
--no-capture-output -n gpu_test pip install buildkite-test-collector + mamba clean --all --yes ENV GOSU_VERSION=1.10 ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/ diff --git a/ops/pipeline/build-cuda-with-rmm.sh b/ops/pipeline/build-cuda-with-rmm.sh index ab5420002f46..24523bd875c0 100755 --- a/ops/pipeline/build-cuda-with-rmm.sh +++ b/ops/pipeline/build-cuda-with-rmm.sh @@ -1,6 +1,6 @@ #!/bin/bash -set -euo pipefail +set -euox pipefail WHEEL_TAG=manylinux_2_28_x86_64 diff --git a/ops/pipeline/build-cuda.sh b/ops/pipeline/build-cuda.sh index 690c7f25f69e..9dc7dfad0224 100755 --- a/ops/pipeline/build-cuda.sh +++ b/ops/pipeline/build-cuda.sh @@ -1,6 +1,6 @@ #!/bin/bash -set -euo pipefail +set -euox pipefail WHEEL_TAG=manylinux_2_28_x86_64 @@ -16,6 +16,7 @@ echo "--- Build with CUDA" #fi echo "--- Build libxgboost from the source" +set -x git clone https://github.com/NVIDIA/cccl.git -b v2.6.1 --quiet python3 ops/docker_run.py \ --container-id xgb-ci.gpu_build_rockylinux8 \ diff --git a/ops/pipeline/build-gpu-rpkg.sh b/ops/pipeline/build-gpu-rpkg.sh index 4df0c029568c..c7d3f7fa4235 100755 --- a/ops/pipeline/build-gpu-rpkg.sh +++ b/ops/pipeline/build-gpu-rpkg.sh @@ -1,13 +1,13 @@ #!/bin/bash -set -euo pipefail +set -euox pipefail source ops/pipeline/enforce-ci.sh echo "--- Build XGBoost R package with CUDA" python3 ops/docker_run.py \ --container-id xgb-ci.gpu_build_r_rockylinux8 \ - -- tests/ci_build/build_r_pkg_with_cuda.sh \ + -- ops/build_r_pkg_with_cuda.sh \ ${GITHUB_SHA} if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] diff --git a/ops/pipeline/build-jvm-doc.sh b/ops/pipeline/build-jvm-doc.sh index 7f5eb0ac7b8a..7b029a4e7e26 100755 --- a/ops/pipeline/build-jvm-doc.sh +++ b/ops/pipeline/build-jvm-doc.sh @@ -1,6 +1,6 @@ #!/bin/bash -set -euo pipefail +set -euox pipefail source ops/pipeline/enforce-ci.sh diff --git a/ops/pipeline/build-jvm-macos-m1.sh b/ops/pipeline/build-jvm-macos-m1.sh index d50c1a1a1b1d..29a11451428c 100644 --- a/ops/pipeline/build-jvm-macos-m1.sh +++ b/ops/pipeline/build-jvm-macos-m1.sh @@ -1,6 +1,6 @@ #!/bin/bash -set -euo pipefail +set -euox pipefail source ops/pipeline/enforce-ci.sh diff --git a/ops/pipeline/build-jvm-manylinux2014.sh b/ops/pipeline/build-jvm-manylinux2014.sh index c009de93e62c..99216d6f6272 100644 --- a/ops/pipeline/build-jvm-manylinux2014.sh +++ b/ops/pipeline/build-jvm-manylinux2014.sh @@ -1,6 +1,6 @@ #!/bin/bash -set -euo pipefail +set -euox pipefail source ops/pipeline/enforce-ci.sh diff --git a/ops/pipeline/build-manylinux2014.sh b/ops/pipeline/build-manylinux2014.sh index 5b1935097d9d..3f04c0f7e7f4 100755 --- a/ops/pipeline/build-manylinux2014.sh +++ b/ops/pipeline/build-manylinux2014.sh @@ -1,6 +1,6 @@ #!/bin/bash -set -euo pipefail +set -euox pipefail source ops/pipeline/enforce-ci.sh diff --git a/ops/pipeline/run-clang-tidy.sh b/ops/pipeline/run-clang-tidy.sh index 9af3273b0dbe..b669f12ebf9e 100755 --- a/ops/pipeline/run-clang-tidy.sh +++ b/ops/pipeline/run-clang-tidy.sh @@ -8,4 +8,4 @@ source ops/pipeline/enforce-ci.sh python3 ops/docker_run.py \ --container-id xgb-ci.clang_tidy \ - -- python3 tests/ci_build/tidy.py --cuda-archs 75 + -- python3 ops/run_clang_tidy.py --cuda-archs 75 diff --git a/ops/pipeline/test-cpp-gpu.sh b/ops/pipeline/test-cpp-gpu.sh index 51d097fbdbdf..8ff66a554e0c 100755 --- a/ops/pipeline/test-cpp-gpu.sh +++ b/ops/pipeline/test-cpp-gpu.sh @@ -1,6 +1,6 @@ #!/bin/bash -set -euo pipefail +set -euox pipefail source ops/pipeline/enforce-ci.sh diff --git a/ops/pipeline/test-python.sh 
b/ops/pipeline/test-python.sh index f0c9c81cb554..b33b38ac187c 100755 --- a/ops/pipeline/test-python.sh +++ b/ops/pipeline/test-python.sh @@ -13,7 +13,7 @@ fi suite="$1" container_id="$2" -tee test-python-wrapper.sh <<-'EOF' +cat > test-python-wrapper.sh <<-'EOF' #!/bin/bash source activate "$1" @@ -32,7 +32,9 @@ case "$suite" in echo " python -c 'from cupy.cuda import jitify; jitify._init_module()' pytest -v -s -rxXs --fulltrace --durations=0 -m 'not mgpu' tests/python-gpu - " | tee -a test-python-wrapper.sh + " >> test-python-wrapper.sh + set -x + cat test-python-wrapper.sh python3 ops/docker_run.py --container-id "${container_id}" --use-gpus \ --run-args='--privileged' \ -- bash test-python-wrapper.sh gpu_test @@ -46,7 +48,9 @@ case "$suite" in pytest -v -s -rxXs --fulltrace --durations=0 -m 'mgpu' tests/test_distributed/test_gpu_with_dask pytest -v -s -rxXs --fulltrace --durations=0 -m 'mgpu' tests/test_distributed/test_gpu_with_spark pytest -v -s -rxXs --fulltrace --durations=0 -m 'mgpu' tests/test_distributed/test_gpu_federated - " | tee -a test-python-wrapper.sh + " >> test-python-wrapper.sh + set -x + cat test-python-wrapper.sh python3 ops/docker_run.py --container-id "${container_id}" --use-gpus \ --run-args='--privileged --shm-size=4g' \ -- bash test-python-wrapper.sh gpu_test @@ -60,7 +64,9 @@ case "$suite" in pytest -v -s -rxXs --fulltrace --durations=0 tests/test_distributed/test_with_dask pytest -v -s -rxXs --fulltrace --durations=0 tests/test_distributed/test_with_spark pytest -v -s -rxXs --fulltrace --durations=0 tests/test_distributed/test_federated - " | tee -a test-python-wrapper.sh + " >> test-python-wrapper.sh + set -x + cat test-python-wrapper.sh python3 ops/docker_run.py --container-id "${container_id}" \ -- bash test-python-wrapper.sh linux_cpu_test ;; @@ -71,7 +77,9 @@ case "$suite" in pytest -v -s -rxXs --fulltrace --durations=0 \\ tests/python/test_basic.py tests/python/test_basic_models.py \\ tests/python/test_model_compatibility.py - " | tee -a test-python-wrapper.sh + " >> test-python-wrapper.sh + set -x + cat test-python-wrapper.sh python3 ops/docker_run.py --container-id "${container_id}" \ -- bash test-python-wrapper.sh aarch64_test ;; diff --git a/ops/tidy.py b/ops/run_clang_tidy.py similarity index 100% rename from ops/tidy.py rename to ops/run_clang_tidy.py diff --git a/tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py b/tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py index c729761b8dd4..50d6f4e43ffc 100644 --- a/tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py +++ b/tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py @@ -99,6 +99,8 @@ def is_df(part: T) -> T: cp.testing.assert_allclose(predt.values.compute(), single_node) # Make sure the output can be integrated back to original dataframe + X.columns = X.columns.astype("object") + # Work around https://github.com/dmlc/xgboost/issues/10752 X["predict"] = predictions X["inplace_predict"] = series_predictions From 80a883ea037cf087bf9bf7c2db47eac8f2c5e764 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Thu, 31 Oct 2024 23:24:09 -0700 Subject: [PATCH 16/86] Fix clang-tidy + rpkg build --- .github/workflows/main.yml | 2 +- ops/{ => clang-tidy}/run_clang_tidy.py | 8 ++++---- ops/{ => clang-tidy}/test_tidy.cc | 0 ops/docker/ci_container.yml | 2 +- ops/{ => lint}/lint_cmake.sh | 0 ops/{ => lint}/lint_cpp.py | 0 ops/{ => lint}/lint_python.py | 0 ops/{ => lint}/lint_r.R | 0 ops/pipeline/enforce-ci.sh | 4 ---- ops/pipeline/run-clang-tidy.sh | 2 +- 
ops/test_r_package.py | 2 +- ops/test_utils.py | 2 +- 12 files changed, 9 insertions(+), 13 deletions(-) rename ops/{ => clang-tidy}/run_clang_tidy.py (97%) rename ops/{ => clang-tidy}/test_tidy.cc (100%) rename ops/{ => lint}/lint_cmake.sh (100%) rename ops/{ => lint}/lint_cpp.py (100%) rename ops/{ => lint}/lint_python.py (100%) rename ops/{ => lint}/lint_r.R (100%) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index a05661d22c80..c0885eaa2ffc 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -226,7 +226,7 @@ jobs: needs: build-containers runs-on: - runs-on=${{ github.run_id }} - - runner=linux-amd64-gpu + - runner=linux-amd64-cpu steps: # Restart Docker daemon so that it recognized the ephemeral disks - run: sudo systemctl restart docker diff --git a/ops/run_clang_tidy.py b/ops/clang-tidy/run_clang_tidy.py similarity index 97% rename from ops/run_clang_tidy.py rename to ops/clang-tidy/run_clang_tidy.py index 13bbedc0b4b5..24cb270393e8 100755 --- a/ops/run_clang_tidy.py +++ b/ops/clang-tidy/run_clang_tidy.py @@ -19,7 +19,7 @@ def call(args: list[str]) -> tuple[int, int, str, list[str]]: # `workspace` is a name used in the CI container. Normally we should keep the dir # as `xgboost`. matched = re.search( - "(workspace|xgboost)/.*(src|tests|include)/.*warning:", error_msg, re.MULTILINE + "(workspace|xgboost)/.*(ops|src|tests|include)/.*warning:", error_msg, re.MULTILINE ) if matched is None: @@ -265,7 +265,7 @@ def test_tidy(args: argparse.Namespace) -> None: """ root_path = os.path.abspath(os.path.curdir) tidy_file = os.path.join(root_path, ".clang-tidy") - test_file_path = os.path.join(root_path, "tests", "ci_build", "test_tidy.cc") + test_file_path = os.path.join(root_path, "ops", "clang-tidy", "test_tidy.cc") tidy_config = "--config-file=" + tidy_file if not args.tidy_version: @@ -274,8 +274,8 @@ def test_tidy(args: argparse.Namespace) -> None: tidy = "clang-tidy-" + str(args.tidy_version) cmd = [tidy, tidy_config, test_file_path] (proc_code, tidy_status, error_msg, _) = call(cmd) - assert proc_code == 0 - assert tidy_status == 1 + if proc_code != 0 or tidy_status != 1: + raise RuntimeError(error_msg) print("clang-tidy is working.") diff --git a/ops/test_tidy.cc b/ops/clang-tidy/test_tidy.cc similarity index 100% rename from ops/test_tidy.cc rename to ops/clang-tidy/test_tidy.cc diff --git a/ops/docker/ci_container.yml b/ops/docker/ci_container.yml index 1b3a60adc014..3612529607b7 100644 --- a/ops/docker/ci_container.yml +++ b/ops/docker/ci_container.yml @@ -13,7 +13,7 @@ xgb-ci.gpu_build_rockylinux8: xgb-ci.gpu_build_r_rockylinux8: container_def: gpu_build_r_rockylinux8 build_args: - CUDA_VERSION_ARG: "12.5.1" + CUDA_VERSION_ARG: "12.4.1" R_VERSION_ARG: "4.3.2" xgb-ci.gpu: diff --git a/ops/lint_cmake.sh b/ops/lint/lint_cmake.sh similarity index 100% rename from ops/lint_cmake.sh rename to ops/lint/lint_cmake.sh diff --git a/ops/lint_cpp.py b/ops/lint/lint_cpp.py similarity index 100% rename from ops/lint_cpp.py rename to ops/lint/lint_cpp.py diff --git a/ops/lint_python.py b/ops/lint/lint_python.py similarity index 100% rename from ops/lint_python.py rename to ops/lint/lint_python.py diff --git a/ops/lint_r.R b/ops/lint/lint_r.R similarity index 100% rename from ops/lint_r.R rename to ops/lint/lint_r.R diff --git a/ops/pipeline/enforce-ci.sh b/ops/pipeline/enforce-ci.sh index 48a48f2dc730..eefb6450b98d 100755 --- a/ops/pipeline/enforce-ci.sh +++ b/ops/pipeline/enforce-ci.sh @@ -5,8 +5,6 @@ set -euo pipefail -set -x - if [[ -z 
${GITHUB_ACTION:-} ]] then echo "$0 is not meant to run locally; it should run inside GitHub Actions." @@ -40,5 +38,3 @@ if [[ -n ${DISABLE_RELEASE:-} ]] then is_release_branch=0 fi - -set +x diff --git a/ops/pipeline/run-clang-tidy.sh b/ops/pipeline/run-clang-tidy.sh index b669f12ebf9e..496b601bfdfb 100755 --- a/ops/pipeline/run-clang-tidy.sh +++ b/ops/pipeline/run-clang-tidy.sh @@ -8,4 +8,4 @@ source ops/pipeline/enforce-ci.sh python3 ops/docker_run.py \ --container-id xgb-ci.clang_tidy \ - -- python3 ops/run_clang_tidy.py --cuda-archs 75 + -- python3 ops/clang-tidy/run_clang_tidy.py --cuda-archs 75 diff --git a/ops/test_r_package.py b/ops/test_r_package.py index 5ca7fa69b21a..3ce886c1bc41 100644 --- a/ops/test_r_package.py +++ b/ops/test_r_package.py @@ -42,7 +42,7 @@ def pkgroot(path: str) -> None: else: would_remove = output.stdout.decode("utf-8").strip().split("\n") - if would_remove and not all(f.find("tests/ci_build") != -1 for f in would_remove): + if would_remove and not all(f.find("ops") != -1 for f in would_remove): raise ValueError( "\n".join(would_remove) + "\nPlease cleanup the working git repository." ) diff --git a/ops/test_utils.py b/ops/test_utils.py index adcd05d5a124..f05fed4dc7f8 100644 --- a/ops/test_utils.py +++ b/ops/test_utils.py @@ -75,7 +75,7 @@ def print_time() -> None: ROOT = os.path.normpath( os.path.join( - os.path.dirname(os.path.abspath(__file__)), os.path.pardir, os.path.pardir + os.path.dirname(os.path.abspath(__file__)), os.path.pardir ) ) R_PACKAGE = os.path.join(ROOT, "R-package") From d86deda4dc6f38ca04557e7edf18c292819df21f Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Mon, 4 Nov 2024 18:41:08 -0800 Subject: [PATCH 17/86] [WIP] Properly handle shaded JARs --- jvm-packages/pom.xml | 111 ++---------- jvm-packages/xgboost4j-spark-gpu/pom.xml | 3 + jvm-packages/xgboost4j-spark/pom.xml | 2 + jvm-packages/xgboost4j-tester/generate_pom.py | 162 ------------------ jvm-packages/xgboost4j-tester/get_iris.py | 10 -- .../java/ml/dmlc/xgboost4j/tester/App.java | 26 --- .../build_python_wheels_macos.sh | 3 +- 7 files changed, 16 insertions(+), 301 deletions(-) delete mode 100644 jvm-packages/xgboost4j-tester/generate_pom.py delete mode 100644 jvm-packages/xgboost4j-tester/get_iris.py delete mode 100644 jvm-packages/xgboost4j-tester/src/main/java/ml/dmlc/xgboost4j/tester/App.java rename tests/ci_build/build_python_wheels.sh => ops/build_python_wheels_macos.sh (98%) diff --git a/jvm-packages/pom.xml b/jvm-packages/pom.xml index c4dba142b0e1..af7aec0a6982 100644 --- a/jvm-packages/pom.xml +++ b/jvm-packages/pom.xml @@ -19,8 +19,16 @@ - CodingCat - codingcat@apache.org + Bobby Wang + wbo4958@gmail.com + + + Jiaming Yuan + jm.yuan@outlook.com + + + Hyunsu Cho + chohyu01@cs.washington.edu @@ -106,13 +114,6 @@ release - - xgboost4j - xgboost4j-example - xgboost4j-spark - xgboost4j-flink - xgboost4j-spark-gpu - @@ -192,98 +193,6 @@ - - assembly - - - - org.apache.maven.plugins - maven-assembly-plugin - 3.7.1 - - - jar-with-dependencies - - true - - - - make-assembly - package - - single - - - - - - - - - release-to-github - - - github.repo - Temporary Staging Repository - file://${project.build.directory}/mvn-repo - - - - github - - - xgboost4j - xgboost4j-example - xgboost4j-spark - xgboost4j-flink - xgboost4j-spark-gpu - - - - - com.github.github - site-maven-plugin - 0.12 - - Maven artifacts for ${project.version} - true - ${project.build.directory}/mvn-repo - refs/heads/maven-repo - - *-with-dependencies.jar - - xgboost - CodingCat - true - - - - - - site - - 
deploy - - - - - org.apache.maven.plugins - maven-deploy-plugin - 3.1.3 - - internal.repo::default::file://${project.build.directory}/mvn-repo - - - - org.apache.maven.plugins - maven-surefire-plugin - - true - - - - - release-to-s3 diff --git a/jvm-packages/xgboost4j-spark-gpu/pom.xml b/jvm-packages/xgboost4j-spark-gpu/pom.xml index 9722da39f801..4eed2648229a 100644 --- a/jvm-packages/xgboost4j-spark-gpu/pom.xml +++ b/jvm-packages/xgboost4j-spark-gpu/pom.xml @@ -23,9 +23,12 @@ org.apache.maven.plugins maven-shade-plugin + true + true false + ml.dmlc:xgboost4j_${scala.binary.version} ml.dmlc:xgboost4j-spark_${scala.binary.version} diff --git a/jvm-packages/xgboost4j-spark/pom.xml b/jvm-packages/xgboost4j-spark/pom.xml index f1791ab90d1a..d62b5808c0e0 100644 --- a/jvm-packages/xgboost4j-spark/pom.xml +++ b/jvm-packages/xgboost4j-spark/pom.xml @@ -23,6 +23,8 @@ org.apache.maven.plugins maven-shade-plugin + true + true false diff --git a/jvm-packages/xgboost4j-tester/generate_pom.py b/jvm-packages/xgboost4j-tester/generate_pom.py deleted file mode 100644 index ad729b3a64cb..000000000000 --- a/jvm-packages/xgboost4j-tester/generate_pom.py +++ /dev/null @@ -1,162 +0,0 @@ -import sys - -pom_template = """ - - - - 4.0.0 - - ml.dmlc - xgboost4j-tester_{scala_binary_version} - 1.0-SNAPSHOT - - xgboost4j-tester - - - UTF-8 - {maven_compiler_source} - {maven_compiler_target} - 4.13.2 - {spark_version} - {scala_version} - 3.2.15 - {scala_binary_version} - 5.6.0 - - - - - com.esotericsoftware - kryo - ${{kryo.version}} - - - org.scala-lang - scala-compiler - ${{scala.version}} - - - org.scala-lang - scala-reflect - ${{scala.version}} - - - org.scala-lang - scala-library - ${{scala.version}} - - - commons-logging - commons-logging - 1.2 - - - com.fasterxml.jackson.core - jackson-databind - 2.14.2 - - - org.scalatest - scalatest_${{scala.binary.version}} - ${{scalatest.version}} - test - - - org.apache.spark - spark-core_${{scala.binary.version}} - ${{spark.version}} - provided - - - org.apache.spark - spark-sql_${{scala.binary.version}} - ${{spark.version}} - provided - - - org.apache.spark - spark-mllib_${{scala.binary.version}} - ${{spark.version}} - provided - - - junit - junit - ${{junit.version}} - test - - - ml.dmlc - xgboost4j_${{scala.binary.version}} - {xgboost4j_version} - - - ml.dmlc - xgboost4j_${{scala.binary.version}} - {xgboost4j_version} - tests - test-jar - test - - - ml.dmlc - xgboost4j-spark_${{scala.binary.version}} - {xgboost4j_version} - - - ml.dmlc - xgboost4j-example_${{scala.binary.version}} - {xgboost4j_version} - - - - - - - org.apache.maven.plugins - maven-assembly-plugin - - - jar-with-dependencies - - - - ml.dmlc.xgboost4j.tester.App - - - - - - package - - single - - - - - - org.apache.maven.plugins - maven-surefire-plugin - - - ml.dmlc:xgboost4j_${{scala.binary.version}} - - - - - - -""" - -if __name__ == '__main__': - if len(sys.argv) != 7: - print('Usage: {} [xgboost4j version] [maven compiler source level] [maven compiler target level] [spark version] [scala version] [scala binary version]'.format(sys.argv[0])) - sys.exit(1) - with open('pom.xml', 'w') as f: - print(pom_template.format(xgboost4j_version=sys.argv[1], - maven_compiler_source=sys.argv[2], - maven_compiler_target=sys.argv[3], - spark_version=sys.argv[4], - scala_version=sys.argv[5], - scala_binary_version=sys.argv[6]), file=f) diff --git a/jvm-packages/xgboost4j-tester/get_iris.py b/jvm-packages/xgboost4j-tester/get_iris.py deleted file mode 100644 index 728c149b0260..000000000000 --- 
a/jvm-packages/xgboost4j-tester/get_iris.py +++ /dev/null @@ -1,10 +0,0 @@ -import numpy as np -import pandas -from sklearn.datasets import load_iris - -X, y = load_iris(return_X_y=True) -y = y.astype(np.int32) -df = pandas.DataFrame(data=X, columns=['sepal length', 'sepal width', 'petal length', 'petal width']) -class_id_to_name = {0:'Iris-setosa', 1:'Iris-versicolor', 2:'Iris-virginica'} -df['class'] = np.vectorize(class_id_to_name.get)(y) -df.to_csv('./iris.csv', float_format='%.1f', header=False, index=False) diff --git a/jvm-packages/xgboost4j-tester/src/main/java/ml/dmlc/xgboost4j/tester/App.java b/jvm-packages/xgboost4j-tester/src/main/java/ml/dmlc/xgboost4j/tester/App.java deleted file mode 100644 index 917f5062061c..000000000000 --- a/jvm-packages/xgboost4j-tester/src/main/java/ml/dmlc/xgboost4j/tester/App.java +++ /dev/null @@ -1,26 +0,0 @@ -package ml.dmlc.xgboost4j.tester; - -import ml.dmlc.xgboost4j.java.example.*; - -import java.io.IOException; -import ml.dmlc.xgboost4j.java.XGBoostError; - -public class App { - public static void main(String[] args) throws IOException, XGBoostError { - String[] args2 = new String[0]; - System.out.println("BoostFromPrediction"); - BoostFromPrediction.main(args2); - System.out.println("CrossValidation"); - CrossValidation.main(args2); - System.out.println("CustomObjective"); - CustomObjective.main(args2); - System.out.println("ExternalMemory"); - ExternalMemory.main(args2); - System.out.println("GeneralizedLinearModel"); - GeneralizedLinearModel.main(args2); - System.out.println("PredictFirstNtree"); - PredictFirstNtree.main(args2); - System.out.println("PredictLeafIndices"); - PredictLeafIndices.main(args2); - } -} diff --git a/tests/ci_build/build_python_wheels.sh b/ops/build_python_wheels_macos.sh similarity index 98% rename from tests/ci_build/build_python_wheels.sh rename to ops/build_python_wheels_macos.sh index d9927905cf83..f2d1c692c8cb 100644 --- a/tests/ci_build/build_python_wheels.sh +++ b/ops/build_python_wheels_macos.sh @@ -1,7 +1,6 @@ #!/bin/bash -set -e -set -x +set -euox pipefail if [[ $# -ne 2 ]]; then echo "Usage: $0 [platform_id] [commit ID]" From 125b7e94d55f992dc9c6e6bb3e087788baae55ee Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Tue, 12 Nov 2024 18:19:00 -0800 Subject: [PATCH 18/86] [CI] Pin Dask to 2024.10.0 (dmlc/xgboost#10995) --- ops/docker/conda_env/linux_cpu_test.yml | 4 ++-- ops/docker/conda_env/macos_cpu_test.yml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/ops/docker/conda_env/linux_cpu_test.yml b/ops/docker/conda_env/linux_cpu_test.yml index e9d05c2f70d1..1ec2a5447604 100644 --- a/ops/docker/conda_env/linux_cpu_test.yml +++ b/ops/docker/conda_env/linux_cpu_test.yml @@ -17,8 +17,8 @@ dependencies: - scikit-learn>=1.4.1 - pandas - matplotlib -- dask -- distributed +- dask<=2024.10.0 +- distributed<=2024.10.0 - python-graphviz - hypothesis>=6.46 - astroid diff --git a/ops/docker/conda_env/macos_cpu_test.yml b/ops/docker/conda_env/macos_cpu_test.yml index f1fcb6b99993..29ff99e3504f 100644 --- a/ops/docker/conda_env/macos_cpu_test.yml +++ b/ops/docker/conda_env/macos_cpu_test.yml @@ -14,8 +14,8 @@ dependencies: - scikit-learn>=1.4.1 - pandas - matplotlib -- dask -- distributed +- dask<=2024.10.0 +- distributed<=2024.10.0 - graphviz - python-graphviz - hypothesis From f3ccc6f17a8d405538bc28d17e3547ac1a336925 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Tue, 12 Nov 2024 18:21:52 -0800 Subject: [PATCH 19/86] Refactor JVM tests --- .github/workflows/main.yml | 34 ++++++-- 
jvm-packages/xgboost4j-example/pom.xml | 6 ++ .../java/example/BasicWalkThrough.java | 8 +- .../xgboost4j/java/example/EarlyStopping.java | 4 +- .../scala/example/BasicWalkThrough.scala | 10 +-- .../java/example/JavaExamplesTest.java | 44 ++++++++++ .../scala/example/ScalaExamplesTest.scala | 40 +++++++++ ops/change_scala_version.py | 3 + ops/docker/ci_container.yml | 3 + ops/docker/dockerfile/Dockerfile.jvm_cross | 53 ------------ ops/pipeline/build-jvm-macos-m1.sh | 1 + ops/pipeline/build-jvm-manylinux2014.sh | 1 + ops/pipeline/build-test-jvm-packages-impl.sh | 84 +++++++++++++++++++ ops/pipeline/build-test-jvm-packages.sh | 46 ++++++++++ tests/buildkite/build-jvm-packages-gpu.sh | 20 ----- tests/buildkite/build-jvm-packages.sh | 26 ------ tests/buildkite/deploy-jvm-packages.sh | 14 ---- tests/buildkite/enforce_daily_budget.py | 14 ---- tests/buildkite/enforce_daily_budget.sh | 15 ---- tests/buildkite/pipeline-mgpu.yml | 32 ------- tests/buildkite/pipeline.yml | 48 ----------- tests/ci_build/build_jvm_packages.sh | 30 ------- tests/ci_build/deploy_jvm_packages.sh | 37 -------- 23 files changed, 269 insertions(+), 304 deletions(-) create mode 100644 jvm-packages/xgboost4j-example/src/test/java/ml/dmlc/xgboost4j/java/example/JavaExamplesTest.java create mode 100644 jvm-packages/xgboost4j-example/src/test/scala/ml/dmlc/xgboost4j/scala/example/ScalaExamplesTest.scala delete mode 100644 ops/docker/dockerfile/Dockerfile.jvm_cross create mode 100755 ops/pipeline/build-test-jvm-packages-impl.sh create mode 100755 ops/pipeline/build-test-jvm-packages.sh delete mode 100755 tests/buildkite/build-jvm-packages-gpu.sh delete mode 100755 tests/buildkite/build-jvm-packages.sh delete mode 100755 tests/buildkite/deploy-jvm-packages.sh delete mode 100644 tests/buildkite/enforce_daily_budget.py delete mode 100755 tests/buildkite/enforce_daily_budget.sh delete mode 100644 tests/buildkite/pipeline-mgpu.yml delete mode 100644 tests/buildkite/pipeline.yml delete mode 100755 tests/ci_build/build_jvm_packages.sh delete mode 100755 tests/ci_build/deploy_jvm_packages.sh diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index c0885eaa2ffc..74a1cc135908 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -32,6 +32,7 @@ jobs: - xgb-ci.clang_tidy - xgb-ci.manylinux_2_28_x86_64 - xgb-ci.manylinux2014_x86_64 + - xgb-ci.jvm runner: [linux-amd64-cpu] include: - container_id: xgb-ci.manylinux2014_aarch64 @@ -39,7 +40,7 @@ jobs: - container_id: xgb-ci.aarch64 runner: linux-arm64-cpu steps: - # Restart Docker daemon so that it recognized the ephemeral disks + # Restart Docker daemon so that it recognizes the ephemeral disks - run: sudo systemctl restart docker - uses: actions/checkout@v4 with: @@ -197,7 +198,7 @@ jobs: - run: bash ops/pipeline/build-jvm-manylinux2014.sh ${{ matrix.arch }} build-manylinux2014: - name: Build manylinux2024_${{ matrix.arch }} wheel + name: Build manylinux2014_${{ matrix.arch }} wheel needs: build-containers runs-on: - runs-on=${{ github.run_id }} @@ -228,7 +229,7 @@ jobs: - runs-on=${{ github.run_id }} - runner=linux-amd64-cpu steps: - # Restart Docker daemon so that it recognized the ephemeral disks + # Restart Docker daemon so that it recognizes the ephemeral disks - run: sudo systemctl restart docker - uses: actions/checkout@v4 with: @@ -239,6 +240,29 @@ jobs: CONTAINER_ID: xgb-ci.gpu_build_r_rockylinux8 - run: bash ops/pipeline/build-gpu-rpkg.sh + build-test-jvm-packages: + name: Build and test JVM packages + needs: build-containers + runs-on: + - 
runs-on=${{ github.run_id }} + - runner=linux-amd64-cpu + steps: + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4 + with: + submodules: "true" + - name: Fetch container from cache + run: bash ops/docker_build.sh + env: + CONTAINER_ID: xgb-ci.jvm + - run: bash ops/pipeline/build-test-jvm-packages.sh + env: + SCALA_VERSION: 2.12 + - run: bash ops/pipeline/build-test-jvm-packages.sh + env: + SCALA_VERSION: 2.13 + test-cpp-gpu: name: Run Google Tests with GPU(s) needs: [build-cuda, build-cuda-with-rmm] @@ -258,7 +282,7 @@ jobs: runner: linux-amd64-mgpu artifact_from: build-cuda steps: - # Restart Docker daemon so that it recognized the ephemeral disks + # Restart Docker daemon so that it recognizes the ephemeral disks - run: sudo systemctl restart docker - uses: actions/checkout@v4 with: @@ -316,7 +340,7 @@ jobs: runner: linux-arm64-cpu artifact_from: build-cpu-arm64 steps: - # Restart Docker daemon so that it recognized the ephemeral disks + # Restart Docker daemon so that it recognizes the ephemeral disks - run: sudo systemctl restart docker - uses: actions/checkout@v4 with: diff --git a/jvm-packages/xgboost4j-example/pom.xml b/jvm-packages/xgboost4j-example/pom.xml index eda453041fa3..4ea4b691987d 100644 --- a/jvm-packages/xgboost4j-example/pom.xml +++ b/jvm-packages/xgboost4j-example/pom.xml @@ -40,5 +40,11 @@ xgboost4j-flink_2.12 ${project.version} + + junit + junit + ${junit.version} + test + diff --git a/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/BasicWalkThrough.java b/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/BasicWalkThrough.java index 8a74b74dab7e..0daf2c2179b1 100644 --- a/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/BasicWalkThrough.java +++ b/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/BasicWalkThrough.java @@ -62,8 +62,10 @@ public static void saveDumpModel(String modelPath, String[] modelInfos) throws I public static void main(String[] args) throws IOException, XGBoostError { // load file from text file, also binary buffer generated by xgboost4j - DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm"); - DMatrix testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm"); + DMatrix trainMat = new DMatrix( + "../../demo/data/agaricus.txt.train?format=libsvm&indexing_mode=1"); + DMatrix testMat = new DMatrix( + "../../demo/data/agaricus.txt.test?format=libsvm&indexing_mode=1"); HashMap params = new HashMap(); params.put("eta", 1.0); @@ -113,7 +115,7 @@ public static void main(String[] args) throws IOException, XGBoostError { System.out.println("start build dmatrix from csr sparse data ..."); //build dmatrix from CSR Sparse Matrix DataLoader.CSRSparseData spData = - DataLoader.loadSVMFile("../../demo/data/agaricus.txt.train?format=libsvm"); + DataLoader.loadSVMFile("../../demo/data/agaricus.txt.train"); DMatrix trainMat2 = new DMatrix(spData.rowHeaders, spData.colIndex, spData.data, DMatrix.SparseType.CSR, 127); diff --git a/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/EarlyStopping.java b/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/EarlyStopping.java index 9e52c12fdf3c..61e752f85aa9 100644 --- a/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/EarlyStopping.java +++ 
b/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/EarlyStopping.java @@ -29,9 +29,9 @@ public class EarlyStopping { public static void main(String[] args) throws IOException, XGBoostError { DataLoader.CSRSparseData trainCSR = - DataLoader.loadSVMFile("../../demo/data/agaricus.txt.train?format=libsvm"); + DataLoader.loadSVMFile("../../demo/data/agaricus.txt.train"); DataLoader.CSRSparseData testCSR = - DataLoader.loadSVMFile("../../demo/data/agaricus.txt.test?format=libsvm"); + DataLoader.loadSVMFile("../../demo/data/agaricus.txt.test"); Map paramMap = new HashMap() { { diff --git a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/BasicWalkThrough.scala b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/BasicWalkThrough.scala index 4629fa352ec4..975d890a24b7 100644 --- a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/BasicWalkThrough.scala +++ b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/BasicWalkThrough.scala @@ -36,8 +36,8 @@ object BasicWalkThrough { } def main(args: Array[String]): Unit = { - val trainMax = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm") - val testMax = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm") + val trainMax = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm&indexing_mode=1") + val testMax = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm&indexing_mode=1") val params = new mutable.HashMap[String, Any]() params += "eta" -> 1.0 @@ -61,7 +61,7 @@ object BasicWalkThrough { } booster.saveModel(file.getAbsolutePath + "/xgb.model") // dump model with feature map - val modelInfos = booster.getModelDump(file.getAbsolutePath + "/featmap.txt", false) + val modelInfos = booster.getModelDump("../../demo/data/featmap.txt", false) saveDumpModel(file.getAbsolutePath + "/dump.raw.txt", modelInfos) // save dmatrix into binary buffer testMax.saveBinary(file.getAbsolutePath + "/dtest.buffer") @@ -76,9 +76,9 @@ object BasicWalkThrough { // build dmatrix from CSR Sparse Matrix println("start build dmatrix from csr sparse data ...") - val spData = DataLoader.loadSVMFile("../../demo/data/agaricus.txt.train?format=libsvm") + val spData = DataLoader.loadSVMFile("../../demo/data/agaricus.txt.train") val trainMax2 = new DMatrix(spData.rowHeaders, spData.colIndex, spData.data, - JDMatrix.SparseType.CSR) + JDMatrix.SparseType.CSR, 127) trainMax2.setLabel(spData.labels) // specify watchList diff --git a/jvm-packages/xgboost4j-example/src/test/java/ml/dmlc/xgboost4j/java/example/JavaExamplesTest.java b/jvm-packages/xgboost4j-example/src/test/java/ml/dmlc/xgboost4j/java/example/JavaExamplesTest.java new file mode 100644 index 000000000000..da57d1ebb28b --- /dev/null +++ b/jvm-packages/xgboost4j-example/src/test/java/ml/dmlc/xgboost4j/java/example/JavaExamplesTest.java @@ -0,0 +1,44 @@ +/* + Copyright (c) 2024 by Contributors + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+ */ +package ml.dmlc.xgboost4j.java.example; + +import java.io.IOException; +import ml.dmlc.xgboost4j.java.XGBoostError; +import org.junit.Test; + + +public class JavaExamplesTest { + + @Test + public void testExamples() throws XGBoostError, IOException { + String[] args = {""}; + System.out.println("BasicWalkThrough"); + BasicWalkThrough.main(args); + System.out.println("BoostFromPrediction"); + BoostFromPrediction.main(args); + System.out.println("CrossValidation"); + CrossValidation.main(args); + System.out.println("CustomObjective"); + CustomObjective.main(args); + System.out.println("EarlyStopping"); + EarlyStopping.main(args); + System.out.println("ExternalMemory"); + ExternalMemory.main(args); + System.out.println("GeneralizedLinearModel"); + GeneralizedLinearModel.main(args); + System.out.println("PredictFirstNtree"); + PredictFirstNtree.main(args); + System.out.println("PredictLeafIndices"); + PredictLeafIndices.main(args); + } +} diff --git a/jvm-packages/xgboost4j-example/src/test/scala/ml/dmlc/xgboost4j/scala/example/ScalaExamplesTest.scala b/jvm-packages/xgboost4j-example/src/test/scala/ml/dmlc/xgboost4j/scala/example/ScalaExamplesTest.scala new file mode 100644 index 000000000000..d7705f90e5ce --- /dev/null +++ b/jvm-packages/xgboost4j-example/src/test/scala/ml/dmlc/xgboost4j/scala/example/ScalaExamplesTest.scala @@ -0,0 +1,40 @@ +/* + Copyright (c) 2024 by Contributors + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+ */ +package ml.dmlc.xgboost4j.scala.example + +import org.scalatest.funsuite.AnyFunSuite + +class ScalaExamplesTest extends AnyFunSuite { + test("Smoke test for Scala examples") { + val args = Array("") + println("BasicWalkThrough") + BasicWalkThrough.main(args) + println("BoostFromPrediction") + BoostFromPrediction.main(args) + println("CrossValidation") + CrossValidation.main(args) + println("CustomObjective") + CustomObjective.main(args) + println("ExternalMemory") + ExternalMemory.main(args) + println("GeneralizedLinearModel") + GeneralizedLinearModel.main(args) + println("PredictFirstNTree") + PredictFirstNTree.main(args) + println("PredictLeafIndices") + PredictLeafIndices.main(args) + } +} diff --git a/ops/change_scala_version.py b/ops/change_scala_version.py index c8a9b54ccf91..3489479dd464 100644 --- a/ops/change_scala_version.py +++ b/ops/change_scala_version.py @@ -20,6 +20,9 @@ def main(args): if target.is_dir(): print(f"Removing {target}...") shutil.rmtree(target) + for target in pathlib.Path("jvm-packages/").glob("**/*.so"): + print(f"Removing {target}...") + target.unlink() # Update pom.xml for pom in pathlib.Path("jvm-packages/").glob("**/pom.xml"): diff --git a/ops/docker/ci_container.yml b/ops/docker/ci_container.yml index 3612529607b7..d042e35549f9 100644 --- a/ops/docker/ci_container.yml +++ b/ops/docker/ci_container.yml @@ -50,3 +50,6 @@ xgb-ci.manylinux2014_x86_64: xgb-ci.manylinux2014_aarch64: container_def: manylinux2014_aarch64 + +xgb-ci.jvm: + container_def: jvm diff --git a/ops/docker/dockerfile/Dockerfile.jvm_cross b/ops/docker/dockerfile/Dockerfile.jvm_cross deleted file mode 100644 index 3ebdb3c6686d..000000000000 --- a/ops/docker/dockerfile/Dockerfile.jvm_cross +++ /dev/null @@ -1,53 +0,0 @@ -FROM ubuntu:22.04 -ARG JDK_VERSION_ARG=8 -ARG SPARK_VERSION_ARG=3.5.1 - -# Environment -ENV DEBIAN_FRONTEND=noninteractive - -# Install all basic requirements -RUN \ - apt-get update && \ - apt-get install -y software-properties-common && \ - add-apt-repository ppa:openjdk-r/ppa && \ - apt-get update && \ - apt-get install -y tar unzip wget openjdk-$JDK_VERSION_ARG-jdk libgomp1 && \ - # Python - wget -nv -O conda.sh https://github.com/conda-forge/miniforge/releases/download/24.3.0-0/Miniforge3-24.3.0-0-Linux-x86_64.sh && \ - bash conda.sh -b -p /opt/miniforge && \ - /opt/miniforge/bin/pip install awscli && \ - # Maven - wget -nv -nc https://archive.apache.org/dist/maven/maven-3/3.9.7/binaries/apache-maven-3.9.7-bin.tar.gz && \ - tar xvf apache-maven-3.9.7-bin.tar.gz -C /opt && \ - ln -s /opt/apache-maven-3.9.7/ /opt/maven && \ - # Spark with scala 2.12 - mkdir -p /opt/spark-scala-2.12 && \ - wget -nv https://archive.apache.org/dist/spark/spark-$SPARK_VERSION_ARG/spark-$SPARK_VERSION_ARG-bin-hadoop3.tgz && \ - tar xvf spark-$SPARK_VERSION_ARG-bin-hadoop3.tgz --strip-components=1 -C /opt/spark-scala-2.12 && \ - # Spark with scala 2.13 - mkdir -p /opt/spark-scala-2.13 && \ - wget -nv https://archive.apache.org/dist/spark/spark-$SPARK_VERSION_ARG/spark-$SPARK_VERSION_ARG-bin-hadoop3-scala2.13.tgz && \ - tar xvf spark-$SPARK_VERSION_ARG-bin-hadoop3-scala2.13.tgz --strip-components=1 -C /opt/spark-scala-2.13 - -ENV PATH=/opt/miniforge/bin:/opt/spark/bin:/opt/maven/bin:$PATH - -# Install Python packages -RUN pip install numpy scipy pandas scikit-learn - -ENV GOSU_VERSION=1.10 - -# Install lightweight sudo (not bound to TTY) -RUN set -ex; \ - wget -nv -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \ - chmod 
+x /usr/local/bin/gosu && \ - gosu nobody true - -# Set default JDK version -RUN update-java-alternatives -v -s java-1.$JDK_VERSION_ARG.0-openjdk-amd64 - -# Default entry-point to use if running locally -# It will preserve attributes of created files -COPY entrypoint.sh /scripts/ - -WORKDIR /workspace -ENTRYPOINT ["/scripts/entrypoint.sh"] diff --git a/ops/pipeline/build-jvm-macos-m1.sh b/ops/pipeline/build-jvm-macos-m1.sh index 29a11451428c..75785aa03eba 100644 --- a/ops/pipeline/build-jvm-macos-m1.sh +++ b/ops/pipeline/build-jvm-macos-m1.sh @@ -1,4 +1,5 @@ #!/bin/bash +## Build libxgboost4j.dylib targeting MacOS set -euox pipefail diff --git a/ops/pipeline/build-jvm-manylinux2014.sh b/ops/pipeline/build-jvm-manylinux2014.sh index 99216d6f6272..93fa03d2eb0b 100644 --- a/ops/pipeline/build-jvm-manylinux2014.sh +++ b/ops/pipeline/build-jvm-manylinux2014.sh @@ -1,4 +1,5 @@ #!/bin/bash +## Build libxgboost4j.so targeting glibc 2.17 systems set -euox pipefail diff --git a/ops/pipeline/build-test-jvm-packages-impl.sh b/ops/pipeline/build-test-jvm-packages-impl.sh new file mode 100755 index 000000000000..717868521408 --- /dev/null +++ b/ops/pipeline/build-test-jvm-packages-impl.sh @@ -0,0 +1,84 @@ +#!/bin/bash +## Build and test JVM packages. +## +## Note. This script takes in all inputs via environment variables. + +INPUT_DOC=$( +cat <<-EOF +Inputs + - SCALA_VERSION: Scala version, either 2.12 or 2.13 (Required) + - USE_CUDA: Set to 1 to enable CUDA + - CUDA_ARCH: Semicolon separated list of GPU compute capability targets + (e.g. '35;61') Only applicable if USE_CUDA=1 + - SKIP_NATIVE_BUILD: Set to 1 to have the JVM packages use an externally provided + libxgboost4j.so. (Usually Maven will invoke create_jni.py to + build it from scratch.) When using this option, make sure to + place libxgboost4j.so in lib/ directory. +EOF +) + +set -euo pipefail + +for arg in "SCALA_VERSION" +do + if [[ -z "${!arg:-}" ]] + then + echo -e "Error: $arg must be set.\n${INPUT_DOC}" + exit 1 + fi +done + +set -x + +# Set Scala version +if [[ "${SCALA_VERSION}" == "2.12" || "${SCALA_VERSION}" == "2.13" ]] +then + python ops/change_scala_version.py --scala-version ${SCALA_VERSION} --purge-artifacts +else + echo "Error: SCALA_VERSION must be either 2.12 or 2.13" + exit 2 +fi + +# If SKIP_NATIVE_BUILD is set, copy in libxgboost4j.so from lib/ +if [[ "${SKIP_NATIVE_BUILD:-}" == "1" ]] +then + echo "Using externally provided libxgboost4j.so. Locating one from lib/..." 
+ cp -v lib/libxgboost4j.so ./jvm-packages/xgboost4j/src/main/resources/lib/linux/x86_64/ +fi + +cd jvm-packages/ + +# Ensure that XGBoost4J-Spark is compatible with multiple versions of Spark +if [[ "${USE_CUDA:-}" != "1" && "${SCALA_VERSION}" == "2.12" ]] +then + for spark_version in 3.1.3 3.2.4 3.3.4 3.4.3 + do + mvn --no-transfer-progress clean package -Dspark.version=${spark_version} \ + -pl xgboost4j,xgboost4j-spark + done +fi + +set +x +mvn_options="" +if [[ "${USE_CUDA:-}" == "1" ]] +then + mvn_options="${mvn_options} -Pgpu" +fi +if [[ "${SKIP_NATIVE_BUILD:-}" == "1" ]] +then + mvn_options="${mvn_options} -Dskip.native.build=true" +fi +set -x + +if [[ -n "${CUDA_ARCH:-}" ]] +then + export GPU_ARCH_FLAG="-DGPU_COMPUTE_VER='${CUDA_ARCH}'" +fi + +mvn --no-transfer-progress clean install ${mvn_options} + +# Integration tests +if [[ "${USE_CUDA:-}" != "1" ]] +then + mvn --no-transfer-progress test -pl xgboost4j-example +fi diff --git a/ops/pipeline/build-test-jvm-packages.sh b/ops/pipeline/build-test-jvm-packages.sh new file mode 100755 index 000000000000..30a11d444d1b --- /dev/null +++ b/ops/pipeline/build-test-jvm-packages.sh @@ -0,0 +1,46 @@ +#!/bin/bash +## Build and test JVM packages. +## +## Note. This script takes in all inputs via environment variables. + +INPUT_DOC=$( +cat <<-EOF +Inputs + - SCALA_VERSION: Scala version, either 2.12 or 2.13 (Required) + - USE_CUDA: Set to 1 to enable CUDA + - CUDA_ARCH: Semicolon separated list of GPU compute capability targets + (e.g. '35;61') Only applicable if USE_CUDA=1 + - SKIP_NATIVE_BUILD: Set to 1 to have the JVM packages use an externally provided + libxgboost4j.so. (Usually Maven will invoke create_jni.py to + build it from scratch.) When using this option, make sure to + place libxgboost4j.so in lib/ directory. 
+EOF +) + +set -euo pipefail + +source ops/pipeline/enforce-ci.sh + +for arg in "SCALA_VERSION" +do + if [[ -z "${!arg:-}" ]] + then + echo -e "Error: $arg must be set.\n${INPUT_DOC}" + exit 1 + fi +done + +set -x + +run_args="-e SCALA_VERSION=${SCALA_VERSION}" +for arg in "USE_CUDA" "CUDA_ARCH" "SKIP_NATIVE_BUILD" +do + if [[ -n "${!arg:-}" ]] + then + run_args="${run_args} -e ${arg}=${!arg}" + fi +done +echo "${run_args}" + +python3 ops/docker_run.py --container-id xgb-ci.jvm \ + --run-args "${run_args}" -- ops/pipeline/build-test-jvm-packages-impl.sh diff --git a/tests/buildkite/build-jvm-packages-gpu.sh b/tests/buildkite/build-jvm-packages-gpu.sh deleted file mode 100755 index 76ffafbcfdd7..000000000000 --- a/tests/buildkite/build-jvm-packages-gpu.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -source tests/buildkite/conftest.sh - -echo "--- Build and test XGBoost JVM packages with CUDA" - -if [[ ($is_pull_request == 1) || ($is_release_branch == 0) ]] -then - arch_flag="-DGPU_COMPUTE_VER=75" -else - arch_flag="" -fi - -tests/ci_build/ci_build.sh jvm_gpu_build --use-gpus \ - --build-arg CUDA_VERSION_ARG=${CUDA_VERSION} \ - --build-arg NCCL_VERSION_ARG=${NCCL_VERSION} \ - tests/ci_build/build_jvm_packages.sh \ - ${SPARK_VERSION} -Duse.cuda=ON ${arch_flag} diff --git a/tests/buildkite/build-jvm-packages.sh b/tests/buildkite/build-jvm-packages.sh deleted file mode 100755 index 338a599f7e15..000000000000 --- a/tests/buildkite/build-jvm-packages.sh +++ /dev/null @@ -1,26 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -source tests/buildkite/conftest.sh - -echo "--- Build and test XGBoost JVM packages with Scala 2.12" -tests/ci_build/ci_build.sh jvm tests/ci_build/build_jvm_packages.sh \ - ${SPARK_VERSION} - -echo "--- Stash XGBoost4J JARs (Scala 2.12)" -buildkite-agent artifact upload "jvm-packages/xgboost4j/target/*.jar" -buildkite-agent artifact upload "jvm-packages/xgboost4j-spark/target/*.jar" -buildkite-agent artifact upload "jvm-packages/xgboost4j-flink/target/*.jar" -buildkite-agent artifact upload "jvm-packages/xgboost4j-example/target/*.jar" - -echo "--- Build and test XGBoost JVM packages with Scala 2.13" - -tests/ci_build/ci_build.sh jvm tests/ci_build/build_jvm_packages.sh \ - ${SPARK_VERSION} "" "" "true" - -echo "--- Stash XGBoost4J JARs (Scala 2.13)" -buildkite-agent artifact upload "jvm-packages/xgboost4j/target/*.jar" -buildkite-agent artifact upload "jvm-packages/xgboost4j-spark/target/*.jar" -buildkite-agent artifact upload "jvm-packages/xgboost4j-flink/target/*.jar" -buildkite-agent artifact upload "jvm-packages/xgboost4j-example/target/*.jar" diff --git a/tests/buildkite/deploy-jvm-packages.sh b/tests/buildkite/deploy-jvm-packages.sh deleted file mode 100755 index 812a6c5cafec..000000000000 --- a/tests/buildkite/deploy-jvm-packages.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -source tests/buildkite/conftest.sh - -if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] -then - echo "--- Deploy JVM packages to xgboost-maven-repo S3 repo" - tests/ci_build/ci_build.sh jvm_gpu_build \ - --build-arg CUDA_VERSION_ARG=${CUDA_VERSION} \ - --build-arg NCCL_VERSION_ARG=${NCCL_VERSION} \ - tests/ci_build/deploy_jvm_packages.sh ${SPARK_VERSION} -fi diff --git a/tests/buildkite/enforce_daily_budget.py b/tests/buildkite/enforce_daily_budget.py deleted file mode 100644 index af1b1ce484b8..000000000000 --- a/tests/buildkite/enforce_daily_budget.py +++ /dev/null @@ -1,14 +0,0 @@ -import json -import argparse - -if __name__ == "__main__": 
- parser = argparse.ArgumentParser() - parser.add_argument("--response", type=str, required=True) - args = parser.parse_args() - with open(args.response, "r") as f: - payload = f.read() - response = json.loads(payload) - if response["approved"]: - print(f"Testing approved. Reason: {response['reason']}") - else: - raise RuntimeError(f"Testing rejected. Reason: {response['reason']}") diff --git a/tests/buildkite/enforce_daily_budget.sh b/tests/buildkite/enforce_daily_budget.sh deleted file mode 100755 index 8212f07c1b24..000000000000 --- a/tests/buildkite/enforce_daily_budget.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -echo "--- Enforce daily budget" - -source tests/buildkite/conftest.sh - -if [[ $enforce_daily_budget == 0 ]] -then - echo "Automatically approving all test jobs for trunk branches" -else - aws lambda invoke --function-name XGBoostCICostWatcher --invocation-type RequestResponse --region us-west-2 response.json - python3 tests/buildkite/enforce_daily_budget.py --response response.json -fi diff --git a/tests/buildkite/pipeline-mgpu.yml b/tests/buildkite/pipeline-mgpu.yml deleted file mode 100644 index 4246425de0ca..000000000000 --- a/tests/buildkite/pipeline-mgpu.yml +++ /dev/null @@ -1,32 +0,0 @@ -env: - DOCKER_CACHE_ECR_ID: "492475357299" - DOCKER_CACHE_ECR_REGION: "us-west-2" - DISABLE_RELEASE: "1" - # Skip uploading artifacts to S3 bucket - # Also, don't build all CUDA archs; just build sm_75 -steps: - - label: ":moneybag: Enforce daily budget" - command: "tests/buildkite/enforce_daily_budget.sh" - key: enforce-daily-budget - agents: - queue: pipeline-loader - - wait - - block: ":rocket: Run this test job" - if: build.pull_request.id != null || build.branch =~ /^dependabot\// - #### -------- CONTAINER BUILD -------- - - label: ":docker: Build containers" - commands: - - "tests/buildkite/build-containers.sh gpu" - - "tests/buildkite/build-containers.sh gpu_build_rockylinux8" - - "tests/buildkite/build-containers.sh jvm_gpu_build" - key: build-containers - agents: - queue: linux-amd64-cpu - - wait - #### -------- BUILD -------- - - label: ":console: Build and test JVM packages with CUDA" - command: "tests/buildkite/build-jvm-packages-gpu.sh" - key: build-jvm-packages-gpu - agents: - queue: linux-amd64-mgpu - - wait diff --git a/tests/buildkite/pipeline.yml b/tests/buildkite/pipeline.yml deleted file mode 100644 index 65225649a3af..000000000000 --- a/tests/buildkite/pipeline.yml +++ /dev/null @@ -1,48 +0,0 @@ -env: - DOCKER_CACHE_ECR_ID: "492475357299" - DOCKER_CACHE_ECR_REGION: "us-west-2" -steps: - - label: ":moneybag: Enforce daily budget" - command: "tests/buildkite/enforce_daily_budget.sh" - key: enforce-daily-budget - agents: - queue: pipeline-loader - - wait - - block: ":rocket: Run this test job" - if: build.pull_request.id != null || build.branch =~ /^dependabot\// - #### -------- CONTAINER BUILD -------- - - label: ":docker: Build containers" - commands: - - "tests/buildkite/build-containers.sh cpu" - - "tests/buildkite/build-containers.sh gpu" - - "tests/buildkite/build-containers.sh gpu_build_rockylinux8" - key: build-containers - agents: - queue: linux-amd64-cpu - - wait - #### -------- BUILD -------- - - label: ":console: Build JVM packages" - timeout_in_minutes: 30 - command: "tests/buildkite/build-jvm-packages.sh" - key: build-jvm-packages - agents: - queue: linux-amd64-cpu - - label: ":console: Build JVM package doc" - command: "tests/buildkite/build-jvm-doc.sh" - key: build-jvm-doc - agents: - queue: linux-amd64-cpu - - wait - #### 
-------- TEST -------- - - label: ":console: Run integration tests with JVM packages" - command: "tests/buildkite/test-integration-jvm-packages.sh" - key: test-integration-jvm-packages - agents: - queue: linux-amd64-cpu - - wait - #### -------- DEPLOY JVM -------- - - label: ":console: Deploy JVM packages" - command: "tests/buildkite/deploy-jvm-packages.sh" - key: deploy-jvm-packages - agents: - queue: linux-amd64-cpu diff --git a/tests/ci_build/build_jvm_packages.sh b/tests/ci_build/build_jvm_packages.sh deleted file mode 100755 index 23811f817bd7..000000000000 --- a/tests/ci_build/build_jvm_packages.sh +++ /dev/null @@ -1,30 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -spark_version=$1 -use_cuda=$2 -gpu_arch=$3 -use_scala213=$4 - -gpu_options="" -if [ "x$use_cuda" == "x-Duse.cuda=ON" ]; then - gpu_options="$use_cuda -Pgpu" -fi - -rm -rf build/ -cd jvm-packages - -if [ "x$gpu_arch" != "x" ]; then - export GPU_ARCH_FLAG=$gpu_arch -fi - -if [ "x$use_scala213" != "x" ]; then - cd .. - python ops/change_scala_version.py --scala-version 2.13 --purge-artifacts - cd jvm-packages -fi - -mvn --no-transfer-progress package -Dspark.version=${spark_version} $gpu_options - -set +x diff --git a/tests/ci_build/deploy_jvm_packages.sh b/tests/ci_build/deploy_jvm_packages.sh deleted file mode 100755 index d8c50d297686..000000000000 --- a/tests/ci_build/deploy_jvm_packages.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash - -set -e -set -x - -if [ $# -ne 1 ]; then - echo "Usage: $0 [spark version]" - exit 1 -fi - -spark_version=$1 - -# Initialize local Maven repository -./tests/ci_build/initialize_maven.sh - -cd jvm-packages -rm -rf $(find . -name target) -rm -rf ../build/ - -# Re-build package -# Maven profiles: -# `default` includes modules: xgboost4j, xgboost4j-spark, xgboost4j-flink, xgboost4j-example -# `gpu` includes modules: xgboost4j-gpu, xgboost4j-spark-gpu, sets `use.cuda = ON` -# `scala-2.13` sets the scala binary version to the 2.13 -# `release-to-s3` sets maven deployment targets - -# Deploy to S3 bucket xgboost-maven-repo -mvn --no-transfer-progress package deploy -P default,gpu,release-to-s3 -Dspark.version=${spark_version} -DskipTests -# Deploy scala 2.13 to S3 bucket xgboost-maven-repo -cd .. 
-python ops/change_scala_version.py --scala-version 2.13 --purge-artifacts
-cd jvm-packages/
-mvn --no-transfer-progress package deploy -P default,gpu,release-to-s3 -Dspark.version=${spark_version} -DskipTests
-
-
-set +x
-set +e

From 67d0cc6fc028569c41fb8342c05b38cdddb1888d Mon Sep 17 00:00:00 2001
From: Bobby Wang
Date: Wed, 23 Oct 2024 22:38:52 +0800
Subject: [PATCH 20/86] [jvm-packages] resolve spark compatibility issue (#10917)

---------

Co-authored-by: Hyunsu Cho
---
 .../scala/spark/XGBoostEstimator.scala        |  9 ++-
 .../apache/spark/ml/xgboost/SparkUtils.scala  | 55 +++++++++++++++++--
 .../xgboost4j/scala/spark/XGBoostSuite.scala  | 45 ++++++++-------
 3 files changed, 81 insertions(+), 28 deletions(-)

diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostEstimator.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostEstimator.scala
index 6978b82da8fc..98b70a63c4f6 100644
--- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostEstimator.scala
+++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostEstimator.scala
@@ -561,7 +561,11 @@ private[spark] trait XGBoostModel[M <: XGBoostModel[M]] extends Model[M] with ML
     val featureName = getFeaturesCol
     val missing = getMissing
 
-    val output = dataset.toDF().mapPartitions { rowIter =>
+    // Here, we use an RDD instead of a DataFrame to avoid the encoder
+    // incompatibility across Spark versions:
+    //   3.5+: Encoders.row(schema)
+    //   3.5-: RowEncoder(schema)
+    val outRDD = dataset.asInstanceOf[Dataset[Row]].rdd.mapPartitions { rowIter =>
       rowIter.grouped(inferBatchSize).flatMap { batchRow =>
         val features = batchRow.iterator.map(row => row.getAs[Vector](
           row.fieldIndex(featureName)))
@@ -573,8 +577,9 @@ private[spark] trait XGBoostModel[M <: XGBoostModel[M]] extends Model[M] with ML
           dm.delete()
         }
       }
+    }
+    val output = dataset.sparkSession.createDataFrame(outRDD, schema)
 
-    }(Encoders.row(schema))
     bBooster.unpersist(blocking = false)
     postTransform(output, pred).toDF()
   }
diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/org/apache/spark/ml/xgboost/SparkUtils.scala b/jvm-packages/xgboost4j-spark/src/main/scala/org/apache/spark/ml/xgboost/SparkUtils.scala
index 8bc88434a443..4402f8efca19 100644
--- a/jvm-packages/xgboost4j-spark/src/main/scala/org/apache/spark/ml/xgboost/SparkUtils.scala
+++ b/jvm-packages/xgboost4j-spark/src/main/scala/org/apache/spark/ml/xgboost/SparkUtils.scala
@@ -16,14 +16,15 @@
 
 package org.apache.spark.ml.xgboost
 
-import org.apache.spark.SparkContext
+import org.apache.spark.{SparkContext, SparkException}
 import org.apache.spark.ml.classification.ProbabilisticClassifierParams
 import org.apache.spark.ml.linalg.VectorUDT
 import org.apache.spark.ml.param.Params
-import org.apache.spark.ml.util.{DatasetUtils, DefaultParamsReader, DefaultParamsWriter, SchemaUtils}
+import org.apache.spark.ml.util.{DefaultParamsReader, DefaultParamsWriter, MetadataUtils, SchemaUtils}
 import org.apache.spark.ml.util.DefaultParamsReader.Metadata
-import org.apache.spark.sql.Dataset
-import org.apache.spark.sql.types.{DataType, DoubleType, StructType}
+import org.apache.spark.sql.{Column, Dataset, Row}
+import org.apache.spark.sql.functions._
+import org.apache.spark.sql.types.{DataType, DoubleType, IntegerType, StructType}
 import org.json4s.{JObject, JValue}
 
 import ml.dmlc.xgboost4j.scala.spark.params.NonXGBoostParams
@@ -57,8 +58,52 @@ trait XGBProbabilisticClassifierParams[T <: Params]
 
 /** Utils to access the Spark internal functions */
 object SparkUtils {
 
+  private def checkClassificationLabels(
+      labelCol: String,
+      numClasses: Option[Int]): Column = {
+    val casted = col(labelCol).cast(DoubleType)
+    numClasses match {
+      case Some(2) =>
+        when(casted.isNull || casted.isNaN, raise_error(lit("Labels MUST NOT be Null or NaN")))
+          .when(casted =!= 0 && casted =!= 1,
+            raise_error(concat(lit("Labels MUST be in {0, 1}, but got "), casted)))
+          .otherwise(casted)
+
+      case _ =>
+        val n = numClasses.getOrElse(Int.MaxValue)
+        require(0 < n && n <= Int.MaxValue)
+        when(casted.isNull || casted.isNaN, raise_error(lit("Labels MUST NOT be Null or NaN")))
+          .when(casted < 0 || casted >= n,
+            raise_error(concat(lit(s"Labels MUST be in [0, $n), but got "), casted)))
+          .when(casted =!= casted.cast(IntegerType),
+            raise_error(concat(lit("Labels MUST be Integers, but got "), casted)))
+          .otherwise(casted)
+    }
+  }
+
+  // Copied from Spark's DatasetUtils to be compatible with Spark below 3.4
   def getNumClasses(dataset: Dataset[_], labelCol: String, maxNumClasses: Int = 100): Int = {
-    DatasetUtils.getNumClasses(dataset, labelCol, maxNumClasses)
+    MetadataUtils.getNumClasses(dataset.schema(labelCol)) match {
+      case Some(n: Int) => n
+      case None =>
+        // Get number of classes from dataset itself.
+        val maxLabelRow: Array[Row] = dataset
+          .select(max(checkClassificationLabels(labelCol, Some(maxNumClasses))))
+          .take(1)
+        if (maxLabelRow.isEmpty || maxLabelRow(0).get(0) == null) {
+          throw new SparkException("ML algorithm was given empty dataset.")
+        }
+        val maxDoubleLabel: Double = maxLabelRow.head.getDouble(0)
+        require((maxDoubleLabel + 1).isValidInt, s"Classifier found max label value =" +
+          s" $maxDoubleLabel but requires integers in range [0, ... ${Int.MaxValue})")
+        val numClasses = maxDoubleLabel.toInt + 1
+        require(numClasses <= maxNumClasses, s"Classifier inferred $numClasses from label values" +
+          s" in column $labelCol, but this exceeded the max numClasses ($maxNumClasses) allowed" +
+          s" to be inferred from values.
To avoid this error for labels with > $maxNumClasses" + + s" classes, specify numClasses explicitly in the metadata; this can be done by applying" + + s" StringIndexer to the label column.") + numClasses + } } def checkNumericType(schema: StructType, colName: String, msg: String = ""): Unit = { diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostSuite.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostSuite.scala index 3a45cf4448c0..f9a7c0c1060d 100644 --- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostSuite.scala +++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostSuite.scala @@ -100,29 +100,32 @@ class XGBoostSuite extends AnyFunSuite with PerTest { .config("spark.executor.cores", 4) .config("spark.executor.resource.gpu.amount", 1) .config("spark.task.resource.gpu.amount", 0.25) - val ss = builder.getOrCreate() - - try { - val df = ss.range(1, 10) - val rdd = df.rdd - - val runtimeParams = new XGBoostClassifier( - Map("device" -> "cuda")).setNumWorkers(1).setNumRound(1) - .getRuntimeParameters(true) - assert(runtimeParams.runOnGpu) - - val finalRDD = FakedXGBoost.tryStageLevelScheduling(ss.sparkContext, runtimeParams, - rdd.asInstanceOf[RDD[(Booster, Map[String, Array[Float]])]]) - - val taskResources = finalRDD.getResourceProfile().taskResources - assert(taskResources.contains("cpus")) - assert(taskResources.get("cpus").get.amount == 3) - - assert(taskResources.contains("gpu")) - assert(taskResources.get("gpu").get.amount == 1.0) - } finally { + if (ss.version < "3.4.1") { + // Pass ss.stop() + } else { + try { + val df = ss.range(1, 10) + val rdd = df.rdd + + val runtimeParams = new XGBoostClassifier( + Map("device" -> "cuda")).setNumWorkers(1).setNumRound(1) + .getRuntimeParameters(true) + assert(runtimeParams.runOnGpu) + + val finalRDD = FakedXGBoost.tryStageLevelScheduling(ss.sparkContext, runtimeParams, + rdd.asInstanceOf[RDD[(Booster, Map[String, Array[Float]])]]) + + val taskResources = finalRDD.getResourceProfile().taskResources + assert(taskResources.contains("cpus")) + assert(taskResources.get("cpus").get.amount == 3) + + assert(taskResources.contains("gpu")) + assert(taskResources.get("gpu").get.amount == 1.0) + } finally { + ss.stop() + } } } } From b65e2ccf8ae7b9f7cba095fad6f866bac00d6f12 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Thu, 14 Nov 2024 12:49:20 -0800 Subject: [PATCH 21/86] Test GPU JVM packages --- .github/workflows/main.yml | 74 ++++++++++++++++---- jvm-packages/create_jni.py | 16 ++--- ops/docker/ci_container.yml | 20 ++++-- ops/pipeline/build-cuda.sh | 2 + ops/pipeline/build-jvm-gpu.sh | 33 +++++++++ ops/pipeline/build-jvm-macos-m1.sh | 0 ops/pipeline/build-jvm-manylinux2014.sh | 0 ops/pipeline/build-test-jvm-packages-impl.sh | 30 +++++--- ops/pipeline/build-test-jvm-packages.sh | 22 +----- ops/pipeline/build-win64-gpu.ps1 | 2 + ops/pipeline/test-cpp-gpu.sh | 10 ++- ops/pipeline/test-jvm-gpu.sh | 18 +++++ ops/pipeline/test-python.sh | 21 ++++-- 13 files changed, 183 insertions(+), 65 deletions(-) create mode 100755 ops/pipeline/build-jvm-gpu.sh mode change 100644 => 100755 ops/pipeline/build-jvm-macos-m1.sh mode change 100644 => 100755 ops/pipeline/build-jvm-manylinux2014.sh create mode 100755 ops/pipeline/test-jvm-gpu.sh diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 74a1cc135908..1b78bdb69df1 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -33,6 
+33,7 @@ jobs: - xgb-ci.manylinux_2_28_x86_64 - xgb-ci.manylinux2014_x86_64 - xgb-ci.jvm + - xgb-ci.jvm_gpu_build runner: [linux-amd64-cpu] include: - container_id: xgb-ci.manylinux2014_aarch64 @@ -171,8 +172,8 @@ jobs: COMMAND: upload KEY: build-cuda-with-rmm - build-jvm-manylinux2014: - name: Build libxgboost4j.so targeting gblic 2.17 + build-manylinux2014: + name: Build manylinux2014_${{ matrix.arch }} wheel needs: build-containers runs-on: - runs-on=${{ github.run_id }} @@ -184,7 +185,6 @@ jobs: runner: linux-arm64-cpu - arch: x86_64 runner: linux-amd64-cpu - steps: # Restart Docker daemon so that it recognizes the ephemeral disks - run: sudo systemctl restart docker @@ -195,10 +195,28 @@ jobs: run: bash ops/docker_build.sh env: CONTAINER_ID: xgb-ci.manylinux2014_${{ matrix.arch }} - - run: bash ops/pipeline/build-jvm-manylinux2014.sh ${{ matrix.arch }} + - run: bash ops/pipeline/build-manylinux2014.sh ${{ matrix.arch }} - build-manylinux2014: - name: Build manylinux2014_${{ matrix.arch }} wheel + build-gpu-rpkg: + name: Build GPU-enabled R package + needs: build-containers + runs-on: + - runs-on=${{ github.run_id }} + - runner=linux-amd64-cpu + steps: + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4 + with: + submodules: "true" + - name: Fetch container from cache + run: bash ops/docker_build.sh + env: + CONTAINER_ID: xgb-ci.gpu_build_r_rockylinux8 + - run: bash ops/pipeline/build-gpu-rpkg.sh + + build-jvm-manylinux2014: + name: Build libxgboost4j.so targeting glibc 2.17 needs: build-containers runs-on: - runs-on=${{ github.run_id }} @@ -220,10 +238,10 @@ jobs: run: bash ops/docker_build.sh env: CONTAINER_ID: xgb-ci.manylinux2014_${{ matrix.arch }} - - run: bash ops/pipeline/build-manylinux2014.sh ${{ matrix.arch }} + - run: bash ops/pipeline/build-jvm-manylinux2014.sh ${{ matrix.arch }} - build-gpu-rpkg: - name: Build GPU-enabled R package + build-jvm-gpu: + name: Build libxgboost4j.so with CUDA needs: build-containers runs-on: - runs-on=${{ github.run_id }} @@ -237,8 +255,13 @@ jobs: - name: Fetch container from cache run: bash ops/docker_build.sh env: - CONTAINER_ID: xgb-ci.gpu_build_r_rockylinux8 - - run: bash ops/pipeline/build-gpu-rpkg.sh + CONTAINER_ID: xgb-ci.jvm_gpu_build + - run: bash ops/pipeline/build-jvm-gpu.sh + - name: Stash files + run: bash ops/stash_artifacts.sh lib/libxgboost4j.so + env: + COMMAND: upload + KEY: build-jvm-gpu build-test-jvm-packages: name: Build and test JVM packages @@ -256,10 +279,12 @@ jobs: run: bash ops/docker_build.sh env: CONTAINER_ID: xgb-ci.jvm - - run: bash ops/pipeline/build-test-jvm-packages.sh + - name: Build and test JVM packages (Scala 2.12) + run: bash ops/pipeline/build-test-jvm-packages.sh env: SCALA_VERSION: 2.12 - - run: bash ops/pipeline/build-test-jvm-packages.sh + - name: Build and test JVM packages (Scala 2.13) + run: bash ops/pipeline/build-test-jvm-packages.sh env: SCALA_VERSION: 2.13 @@ -358,3 +383,26 @@ jobs: KEY: ${{ matrix.artifact_from }} - name: Run Python tests, ${{ matrix.description }} run: bash ops/pipeline/test-python.sh ${{ matrix.suite }} ${{ matrix.container }} + + test-jvm-packages-gpu: + name: Test JVM packages with CUDA + needs: [build-jvm-gpu] + runs-on: + - runs-on=${{ github.run_id }} + - runner=linux-amd64-mgpu + steps: + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4 + with: + submodules: "true" + - name: Fetch container from 
cache + run: bash ops/docker_build.sh + env: + CONTAINER_ID: xgb-ci.jvm_gpu_build + - name: Unstash files + run: bash ops/stash_artifacts.sh lib/libxgboost4j.so + env: + COMMAND: download + KEY: build-jvm-gpu + - run: bash ops/pipeline/test-jvm-gpu.sh diff --git a/jvm-packages/create_jni.py b/jvm-packages/create_jni.py index 6be7b451ce14..fbd9b4ce5672 100755 --- a/jvm-packages/create_jni.py +++ b/jvm-packages/create_jni.py @@ -32,7 +32,7 @@ def cd(path): path = normpath(path) cwd = os.getcwd() os.chdir(path) - print("cd " + path) + print("cd " + path, flush=True) try: yield path finally: @@ -41,7 +41,7 @@ def cd(path): def maybe_makedirs(path): path = normpath(path) - print("mkdir -p " + path) + print("mkdir -p " + path, flush=True) try: os.makedirs(path) except OSError as e: @@ -50,14 +50,14 @@ def maybe_makedirs(path): def run(command, **kwargs): - print(command) + print(command, flush=True) subprocess.run(command, shell=True, check=True, env=os.environ, **kwargs) def cp(source, target): source = normpath(source) target = normpath(target) - print("cp {0} {1}".format(source, target)) + print("cp {0} {1}".format(source, target), flush=True) shutil.copy(source, target) @@ -78,7 +78,7 @@ def native_build(args): subprocess.check_output("/usr/libexec/java_home").strip().decode() ) - print("building Java wrapper") + print("building Java wrapper", flush=True) with cd(".."): build_dir = "build-gpu" if cli_args.use_cuda == "ON" else "build" maybe_makedirs(build_dir) @@ -123,7 +123,7 @@ def native_build(args): run("cmake .. " + " ".join(args + [generator])) break except subprocess.CalledProcessError as e: - print(f"Failed to build with generator: {generator}", e) + print(f"Failed to build with generator: {generator}", e, flush=True) with cd(os.path.pardir): shutil.rmtree(build_dir) maybe_makedirs(build_dir) @@ -132,7 +132,7 @@ def native_build(args): run("cmake --build . 
--config Release" + maybe_parallel_build) - print("copying native library") + print("copying native library", flush=True) library_name, os_folder = { "Windows": ("xgboost4j.dll", "windows"), "Darwin": ("libxgboost4j.dylib", "macos"), @@ -153,7 +153,7 @@ def native_build(args): maybe_makedirs(output_folder) cp("../lib/" + library_name, output_folder) - print("copying train/test files") + print("copying train/test files", flush=True) # for xgboost4j maybe_makedirs("xgboost4j/src/test/resources") diff --git a/ops/docker/ci_container.yml b/ops/docker/ci_container.yml index d042e35549f9..90c9e6c8c800 100644 --- a/ops/docker/ci_container.yml +++ b/ops/docker/ci_container.yml @@ -6,8 +6,8 @@ xgb-ci.gpu_build_rockylinux8: container_def: gpu_build_rockylinux8 build_args: - CUDA_VERSION_ARG: "12.5.1" - NCCL_VERSION_ARG: "2.22.3-1" + CUDA_VERSION_ARG: "12.4.1" + NCCL_VERSION_ARG: "2.23.4-1" RAPIDS_VERSION_ARG: "24.10" xgb-ci.gpu_build_r_rockylinux8: @@ -19,22 +19,22 @@ xgb-ci.gpu_build_r_rockylinux8: xgb-ci.gpu: container_def: gpu build_args: - CUDA_VERSION_ARG: "12.5.1" - NCCL_VERSION_ARG: "2.22.3-1" + CUDA_VERSION_ARG: "12.4.1" + NCCL_VERSION_ARG: "2.23.4-1" RAPIDS_VERSION_ARG: "24.10" xgb-ci.gpu_dev_ver: container_def: gpu build_args: - CUDA_VERSION_ARG: "12.5.1" - NCCL_VERSION_ARG: "2.22.3-1" + CUDA_VERSION_ARG: "12.4.1" + NCCL_VERSION_ARG: "2.23.4-1" RAPIDS_VERSION_ARG: "24.12" RAPIDSAI_CONDA_CHANNEL_ARG: "rapidsai-nightly" xgb-ci.clang_tidy: container_def: clang_tidy build_args: - CUDA_VERSION_ARG: "12.5.1" + CUDA_VERSION_ARG: "12.4.1" xgb-ci.cpu: container_def: cpu @@ -53,3 +53,9 @@ xgb-ci.manylinux2014_aarch64: xgb-ci.jvm: container_def: jvm + +xgb-ci.jvm_gpu_build: + container_def: jvm_gpu_build + build_args: + CUDA_VERSION_ARG: "12.4.1" + NCCL_VERSION_ARG: "2.23.4-1" diff --git a/ops/pipeline/build-cuda.sh b/ops/pipeline/build-cuda.sh index 9dc7dfad0224..bcda081b338e 100755 --- a/ops/pipeline/build-cuda.sh +++ b/ops/pipeline/build-cuda.sh @@ -17,6 +17,8 @@ echo "--- Build with CUDA" echo "--- Build libxgboost from the source" set -x +# Work around https://github.com/NVIDIA/cccl/issues/1956 +# TODO(hcho3): Remove this once new CUDA version ships with CCCL 2.6.0+ git clone https://github.com/NVIDIA/cccl.git -b v2.6.1 --quiet python3 ops/docker_run.py \ --container-id xgb-ci.gpu_build_rockylinux8 \ diff --git a/ops/pipeline/build-jvm-gpu.sh b/ops/pipeline/build-jvm-gpu.sh new file mode 100755 index 000000000000..ee12fbd78289 --- /dev/null +++ b/ops/pipeline/build-jvm-gpu.sh @@ -0,0 +1,33 @@ +#!/bin/bash +## Build libxgboost4j.so with CUDA + +set -euo pipefail + +source ops/pipeline/enforce-ci.sh + +echo "--- Build libxgboost4j.so with CUDA" + +# if [[ ($is_pull_request == 1) || ($is_release_branch == 0) ]] +#then + arch_flag="-DGPU_COMPUTE_VER=75" +#else +# arch_flag="" +#fi + +COMMAND=$( +cat <<-EOF +cd build-gpu/ && \ +cmake .. 
-DCMAKE_PREFIX_PATH=/workspace/cccl -GNinja -DUSE_CUDA=ON -DUSE_NCCL=ON \ + -DJVM_BINDINGS=ON -DCMAKE_EXPORT_COMPILE_COMMANDS=ON ${arch_flag} && \ + ninja +EOF +) + +set -x +mkdir -p build-gpu/ +# Work around https://github.com/NVIDIA/cccl/issues/1956 +# TODO(hcho3): Remove this once new CUDA version ships with CCCL 2.6.0+ +git clone https://github.com/NVIDIA/cccl.git -b v2.6.1 --quiet --depth 1 +python3 ops/docker_run.py \ + --container-id xgb-ci.jvm_gpu_build \ + -- bash -c "${COMMAND}" diff --git a/ops/pipeline/build-jvm-macos-m1.sh b/ops/pipeline/build-jvm-macos-m1.sh old mode 100644 new mode 100755 diff --git a/ops/pipeline/build-jvm-manylinux2014.sh b/ops/pipeline/build-jvm-manylinux2014.sh old mode 100644 new mode 100755 diff --git a/ops/pipeline/build-test-jvm-packages-impl.sh b/ops/pipeline/build-test-jvm-packages-impl.sh index 717868521408..180788436d9b 100755 --- a/ops/pipeline/build-test-jvm-packages-impl.sh +++ b/ops/pipeline/build-test-jvm-packages-impl.sh @@ -8,8 +8,6 @@ cat <<-EOF Inputs - SCALA_VERSION: Scala version, either 2.12 or 2.13 (Required) - USE_CUDA: Set to 1 to enable CUDA - - CUDA_ARCH: Semicolon separated list of GPU compute capability targets - (e.g. '35;61') Only applicable if USE_CUDA=1 - SKIP_NATIVE_BUILD: Set to 1 to have the JVM packages use an externally provided libxgboost4j.so. (Usually Maven will invoke create_jni.py to build it from scratch.) When using this option, make sure to @@ -40,10 +38,31 @@ else fi # If SKIP_NATIVE_BUILD is set, copy in libxgboost4j.so from lib/ +# Also copy in other files needed for testing. (Usually create_jni.py would perform this +# step, but we need to do it manually here.) if [[ "${SKIP_NATIVE_BUILD:-}" == "1" ]] then echo "Using externally provided libxgboost4j.so. Locating one from lib/..." - cp -v lib/libxgboost4j.so ./jvm-packages/xgboost4j/src/main/resources/lib/linux/x86_64/ + mkdir -p jvm-packages/xgboost4j/src/main/resources/lib/linux/x86_64/ + cp -v lib/libxgboost4j.so jvm-packages/xgboost4j/src/main/resources/lib/linux/x86_64/ + mkdir -p jvm-packages/xgboost4j/src/test/resources + mkdir -p jvm-packages/xgboost4j-spark/src/test/resources + mkdir -p jvm-packages/xgboost4j-spark-gpu/src/test/resources + + # Generate machine.txt.* files from the CLI regression demo + # TODO(hcho3): Remove once CLI is removed + pushd demo/CLI/regression + python3 mapfeat.py + python3 mknfold.py machine.txt 1 + popd + + cp -v demo/data/agaricus.* \ + jvm-packages/xgboost4j/src/test/resources + cp -v demo/CLI/regression/machine.txt.t* demo/data/agaricus.* \ + jvm-packages/xgboost4j-spark/src/test/resources + cp -v demo/data/veterans_lung_cancer.csv \ + jvm-packages/xgboost4j-spark/src/test/resources/rank.train.csv \ + jvm-packages/xgboost4j-spark-gpu/src/test/resources fi cd jvm-packages/ @@ -70,11 +89,6 @@ then fi set -x -if [[ -n "${CUDA_ARCH:-}" ]] -then - export GPU_ARCH_FLAG="-DGPU_COMPUTE_VER='${CUDA_ARCH}'" -fi - mvn --no-transfer-progress clean install ${mvn_options} # Integration tests diff --git a/ops/pipeline/build-test-jvm-packages.sh b/ops/pipeline/build-test-jvm-packages.sh index 30a11d444d1b..1feddf2bff98 100755 --- a/ops/pipeline/build-test-jvm-packages.sh +++ b/ops/pipeline/build-test-jvm-packages.sh @@ -6,14 +6,7 @@ INPUT_DOC=$( cat <<-EOF Inputs - - SCALA_VERSION: Scala version, either 2.12 or 2.13 (Required) - - USE_CUDA: Set to 1 to enable CUDA - - CUDA_ARCH: Semicolon separated list of GPU compute capability targets - (e.g. 
'35;61') Only applicable if USE_CUDA=1
-  - SKIP_NATIVE_BUILD: Set to 1 to have the JVM packages use an externally provided
-                       libxgboost4j.so. (Usually Maven will invoke create_jni.py to
-                       build it from scratch.) When using this option, make sure to
-                       place libxgboost4j.so in lib/ directory.
+  - SCALA_VERSION: Scala version, either 2.12 or 2.13 (Required)
 EOF
 )
 
@@ -32,15 +25,6 @@ done
 
 set -x
 
-run_args="-e SCALA_VERSION=${SCALA_VERSION}"
-for arg in "USE_CUDA" "CUDA_ARCH" "SKIP_NATIVE_BUILD"
-do
-  if [[ -n "${!arg:-}" ]]
-  then
-    run_args="${run_args} -e ${arg}=${!arg}"
-  fi
-done
-echo "${run_args}"
-
 python3 ops/docker_run.py --container-id xgb-ci.jvm \
-  --run-args "${run_args}" -- ops/pipeline/build-test-jvm-packages-impl.sh
+  --run-args "-e SCALA_VERSION=${SCALA_VERSION}" \
+  -- ops/pipeline/build-test-jvm-packages-impl.sh
diff --git a/ops/pipeline/build-win64-gpu.ps1 b/ops/pipeline/build-win64-gpu.ps1
index 48863528684a..c691a55f954c 100644
--- a/ops/pipeline/build-win64-gpu.ps1
+++ b/ops/pipeline/build-win64-gpu.ps1
@@ -11,6 +11,8 @@ nvcc --version
 #  $arch_flag = ""
 #}
 
+# Work around https://github.com/NVIDIA/cccl/issues/1956
+# TODO(hcho3): Remove this once new CUDA version ships with CCCL 2.6.0+
 git clone https://github.com/NVIDIA/cccl.git -b v2.6.1 --quiet
 mkdir build
 cd build
diff --git a/ops/pipeline/test-cpp-gpu.sh b/ops/pipeline/test-cpp-gpu.sh
index 8ff66a554e0c..b66162d66a50 100755
--- a/ops/pipeline/test-cpp-gpu.sh
+++ b/ops/pipeline/test-cpp-gpu.sh
@@ -15,21 +15,25 @@ case "${arg}" in
   gpu)
     echo "--- Run Google Tests, using a single GPU"
     python3 ops/docker_run.py --container-id xgb-ci.gpu --use-gpus \
-      --run-args='--privileged' \
+      -- nvidia-smi
+    python3 ops/docker_run.py --container-id xgb-ci.gpu --use-gpus \
       -- build/testxgboost
     ;;
 
   gpu-rmm)
     echo "--- Run Google Tests, using a single GPU, RMM enabled"
     python3 ops/docker_run.py --container-id xgb-ci.gpu --use-gpus \
-      --run-args='--privileged' \
+      -- nvidia-smi
+    python3 ops/docker_run.py --container-id xgb-ci.gpu --use-gpus \
       -- build/testxgboost --use-rmm-pool
     ;;
 
   mgpu)
     echo "--- Run Google Tests, using multiple GPUs"
     python3 ops/docker_run.py --container-id xgb-ci.gpu --use-gpus \
-      --run-args='--privileged --shm-size=4g' \
+      -- nvidia-smi
+    python3 ops/docker_run.py --container-id xgb-ci.gpu --use-gpus \
+      --run-args='--shm-size=4g' \
       -- build/testxgboost --gtest_filter=*MGPU*
     ;;
diff --git a/ops/pipeline/test-jvm-gpu.sh b/ops/pipeline/test-jvm-gpu.sh
new file mode 100755
index 000000000000..272b55ad0d1a
--- /dev/null
+++ b/ops/pipeline/test-jvm-gpu.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+## Test JVM packages with CUDA. Note: this script assumes that
+## the user has already built libxgboost4j.so with CUDA support
+## and placed it in the lib/ directory.
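+##
+## A rough sketch of the intended call order (hypothetical local usage; assumes
+## a CUDA-capable host and that the checks in ops/pipeline/enforce-ci.sh, where
+## enabled, are satisfied):
+##   bash ops/pipeline/build-jvm-gpu.sh   # produces lib/libxgboost4j.so
+##   bash ops/pipeline/test-jvm-gpu.sh    # consumes lib/libxgboost4j.so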
+ +set -euo pipefail + +# source ops/pipeline/enforce-ci.sh + +SCALA_VERSION=2.12 + +set -x + +python3 ops/docker_run.py --container-id xgb-ci.jvm_gpu_build --use-gpus \ + -- nvidia-smi +python3 ops/docker_run.py --container-id xgb-ci.jvm_gpu_build --use-gpus \ + --run-args "-e SCALA_VERSION=${SCALA_VERSION} -e USE_CUDA=1 -e SKIP_NATIVE_BUILD=1" \ + -- ops/pipeline/build-test-jvm-packages-impl.sh diff --git a/ops/pipeline/test-python.sh b/ops/pipeline/test-python.sh index b33b38ac187c..3002e878cf6e 100755 --- a/ops/pipeline/test-python.sh +++ b/ops/pipeline/test-python.sh @@ -31,12 +31,15 @@ case "$suite" in echo "-- Run Python tests, using a single GPU" echo " python -c 'from cupy.cuda import jitify; jitify._init_module()' - pytest -v -s -rxXs --fulltrace --durations=0 -m 'not mgpu' tests/python-gpu + pytest -v -sx -rxXs --fulltrace --durations=0 -m 'not mgpu' tests/python-gpu " >> test-python-wrapper.sh set -x cat test-python-wrapper.sh python3 ops/docker_run.py --container-id "${container_id}" --use-gpus \ - --run-args='--privileged' \ + -- nvidia-smi + python3 ops/docker_run.py --container-id "${container_id}" --use-gpus \ + -- bash -c "source activate gpu_test && python -c 'from numba import cuda; cuda.detect()'" + python3 ops/docker_run.py --container-id "${container_id}" --use-gpus \ -- bash test-python-wrapper.sh gpu_test ;; @@ -44,15 +47,19 @@ case "$suite" in echo "-- Run Python tests, using multiple GPUs" echo " python -c 'from cupy.cuda import jitify; jitify._init_module()' - pytest -v -s -rxXs --fulltrace --durations=0 -m 'mgpu' tests/python-gpu - pytest -v -s -rxXs --fulltrace --durations=0 -m 'mgpu' tests/test_distributed/test_gpu_with_dask - pytest -v -s -rxXs --fulltrace --durations=0 -m 'mgpu' tests/test_distributed/test_gpu_with_spark - pytest -v -s -rxXs --fulltrace --durations=0 -m 'mgpu' tests/test_distributed/test_gpu_federated + pytest -v -sx -rxXs --fulltrace --durations=0 -m 'mgpu' tests/python-gpu + pytest -v -sx -rxXs --fulltrace --durations=0 -m 'mgpu' tests/test_distributed/test_gpu_with_dask + pytest -v -sx -rxXs --fulltrace --durations=0 -m 'mgpu' tests/test_distributed/test_gpu_with_spark + pytest -v -sx -rxXs --fulltrace --durations=0 -m 'mgpu' tests/test_distributed/test_gpu_federated " >> test-python-wrapper.sh set -x cat test-python-wrapper.sh python3 ops/docker_run.py --container-id "${container_id}" --use-gpus \ - --run-args='--privileged --shm-size=4g' \ + -- nvidia-smi + python3 ops/docker_run.py --container-id "${container_id}" --use-gpus \ + -- bash -c "source activate gpu_test && python -c 'from numba import cuda; cuda.detect()'" + python3 ops/docker_run.py --container-id "${container_id}" --use-gpus \ + --run-args='--shm-size=4g' \ -- bash test-python-wrapper.sh gpu_test ;; From f4d94a19b903d4bfd6458b90f0f8201616f2765d Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Sat, 2 Nov 2024 13:49:28 +0800 Subject: [PATCH 22/86] Disable the host numa virtual memory allocator for now. (#10934) --- src/common/device_helpers.cu | 5 +++++ tests/cpp/common/test_device_vector.cu | 11 +++-------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/common/device_helpers.cu b/src/common/device_helpers.cu index 608a535cd8cb..01e81b16ee0b 100644 --- a/src/common/device_helpers.cu +++ b/src/common/device_helpers.cu @@ -7,6 +7,11 @@ namespace dh { PinnedMemory::PinnedMemory() { + // Use the `GrowOnlyPinnedMemoryImpl` as the only option for now. 
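+  // (Host-NUMA virtual memory may require special system capabilities that
+  // are not available on every machine.)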
+ // See https://github.com/dmlc/xgboost/issues/10933 + this->impl_.emplace(); + return; + #if defined(xgboost_IS_WIN) this->impl_.emplace(); #else diff --git a/tests/cpp/common/test_device_vector.cu b/tests/cpp/common/test_device_vector.cu index d7a03e41a64b..ec1a420bd349 100644 --- a/tests/cpp/common/test_device_vector.cu +++ b/tests/cpp/common/test_device_vector.cu @@ -31,6 +31,9 @@ class TestVirtualMem : public ::testing::TestWithParam { public: void Run() { auto type = this->GetParam(); + if (type == CU_MEM_LOCATION_TYPE_HOST_NUMA) { + GTEST_SKIP_("Host numa might require special system capabilities, skipping for now."); + } detail::GrowOnlyVirtualMemVec vec{type}; auto prop = xgboost::cudr::MakeAllocProp(type); auto gran = xgboost::cudr::GetAllocGranularity(&prop); @@ -110,14 +113,6 @@ TEST(TestVirtualMem, Version) { xgboost::curt::DrVersion(&major, &minor); LOG(INFO) << "Latest supported CUDA version by the driver:" << major << "." << minor; PinnedMemory pinned; -#if defined(xgboost_IS_WIN) ASSERT_FALSE(pinned.IsVm()); -#else // defined(xgboost_IS_WIN) - if (major >= 12 && minor >= 5) { - ASSERT_TRUE(pinned.IsVm()); - } else { - ASSERT_FALSE(pinned.IsVm()); - } -#endif // defined(xgboost_IS_WIN) } } // namespace dh From e1c7e24e1db4cd72759531b90cc8bd4df05847ea Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Thu, 14 Nov 2024 23:39:19 -0800 Subject: [PATCH 23/86] Run GPU tests with privilege escalation --- ops/docker/dockerfile/Dockerfile.gpu | 3 ++- ops/pipeline/test-cpp-gpu.sh | 4 +++- ops/pipeline/test-jvm-gpu.sh | 2 +- ops/pipeline/test-python.sh | 17 +++++++---------- 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/ops/docker/dockerfile/Dockerfile.gpu b/ops/docker/dockerfile/Dockerfile.gpu index eac35c3aaa90..beb1710d2d7a 100644 --- a/ops/docker/dockerfile/Dockerfile.gpu +++ b/ops/docker/dockerfile/Dockerfile.gpu @@ -28,7 +28,8 @@ RUN \ mamba create -y -n gpu_test -c ${RAPIDSAI_CONDA_CHANNEL_ARG} -c conda-forge -c nvidia \ python=3.10 "cudf=$RAPIDS_VERSION_ARG.*" "rmm=$RAPIDS_VERSION_ARG.*" cuda-version=$CUDA_SHORT_VER \ "nccl>=${NCCL_SHORT_VER}" \ - dask \ + "dask<=2024.10.0" \ + "distributed<=2024.10.0" \ "dask-cuda=$RAPIDS_VERSION_ARG.*" "dask-cudf=$RAPIDS_VERSION_ARG.*" cupy \ numpy pytest pytest-timeout scipy scikit-learn pandas matplotlib wheel \ python-kubernetes urllib3 graphviz hypothesis loky \ diff --git a/ops/pipeline/test-cpp-gpu.sh b/ops/pipeline/test-cpp-gpu.sh index b66162d66a50..98f467250dd0 100755 --- a/ops/pipeline/test-cpp-gpu.sh +++ b/ops/pipeline/test-cpp-gpu.sh @@ -17,6 +17,7 @@ case "${arg}" in python3 ops/docker_run.py --container-id xgb-ci.gpu --use-gpus \ -- nvidia-smi python3 ops/docker_run.py --container-id xgb-ci.gpu --use-gpus \ + --run-args='--privileged' \ -- build/testxgboost ;; @@ -25,6 +26,7 @@ case "${arg}" in python3 ops/docker_run.py --container-id xgb-ci.gpu --use-gpus \ -- nvidia-smi python3 ops/docker_run.py --container-id xgb-ci.gpu --use-gpus \ + --run-args='--privileged' \ -- build/testxgboost --use-rmm-pool ;; @@ -33,7 +35,7 @@ case "${arg}" in python3 ops/docker_run.py --container-id xgb-ci.gpu --use-gpus \ -- nvidia-smi python3 ops/docker_run.py --container-id xgb-ci.gpu --use-gpus \ - --run-args='--shm-size=4g' \ + --run-args='--shm-size=4g --privileged' \ -- build/testxgboost --gtest_filter=*MGPU* ;; diff --git a/ops/pipeline/test-jvm-gpu.sh b/ops/pipeline/test-jvm-gpu.sh index 272b55ad0d1a..108ec749674b 100755 --- a/ops/pipeline/test-jvm-gpu.sh +++ b/ops/pipeline/test-jvm-gpu.sh @@ -14,5 +14,5 @@ set -x 
python3 ops/docker_run.py --container-id xgb-ci.jvm_gpu_build --use-gpus \ -- nvidia-smi python3 ops/docker_run.py --container-id xgb-ci.jvm_gpu_build --use-gpus \ - --run-args "-e SCALA_VERSION=${SCALA_VERSION} -e USE_CUDA=1 -e SKIP_NATIVE_BUILD=1" \ + --run-args "-e SCALA_VERSION=${SCALA_VERSION} -e USE_CUDA=1 -e SKIP_NATIVE_BUILD=1 --privileged" \ -- ops/pipeline/build-test-jvm-packages-impl.sh diff --git a/ops/pipeline/test-python.sh b/ops/pipeline/test-python.sh index 3002e878cf6e..3997f416ec46 100755 --- a/ops/pipeline/test-python.sh +++ b/ops/pipeline/test-python.sh @@ -31,15 +31,14 @@ case "$suite" in echo "-- Run Python tests, using a single GPU" echo " python -c 'from cupy.cuda import jitify; jitify._init_module()' - pytest -v -sx -rxXs --fulltrace --durations=0 -m 'not mgpu' tests/python-gpu + pytest -v -s -rxXs --fulltrace --durations=0 -m 'not mgpu' tests/python-gpu " >> test-python-wrapper.sh set -x cat test-python-wrapper.sh python3 ops/docker_run.py --container-id "${container_id}" --use-gpus \ -- nvidia-smi python3 ops/docker_run.py --container-id "${container_id}" --use-gpus \ - -- bash -c "source activate gpu_test && python -c 'from numba import cuda; cuda.detect()'" - python3 ops/docker_run.py --container-id "${container_id}" --use-gpus \ + --run-args='--privileged' \ -- bash test-python-wrapper.sh gpu_test ;; @@ -47,19 +46,17 @@ case "$suite" in echo "-- Run Python tests, using multiple GPUs" echo " python -c 'from cupy.cuda import jitify; jitify._init_module()' - pytest -v -sx -rxXs --fulltrace --durations=0 -m 'mgpu' tests/python-gpu - pytest -v -sx -rxXs --fulltrace --durations=0 -m 'mgpu' tests/test_distributed/test_gpu_with_dask - pytest -v -sx -rxXs --fulltrace --durations=0 -m 'mgpu' tests/test_distributed/test_gpu_with_spark - pytest -v -sx -rxXs --fulltrace --durations=0 -m 'mgpu' tests/test_distributed/test_gpu_federated + pytest -v -s -rxXs --fulltrace --durations=0 -m 'mgpu' tests/python-gpu + pytest -v -s -rxXs --fulltrace --durations=0 -m 'mgpu' tests/test_distributed/test_gpu_with_dask + pytest -v -s -rxXs --fulltrace --durations=0 -m 'mgpu' tests/test_distributed/test_gpu_with_spark + pytest -v -s -rxXs --fulltrace --durations=0 -m 'mgpu' tests/test_distributed/test_gpu_federated " >> test-python-wrapper.sh set -x cat test-python-wrapper.sh python3 ops/docker_run.py --container-id "${container_id}" --use-gpus \ -- nvidia-smi python3 ops/docker_run.py --container-id "${container_id}" --use-gpus \ - -- bash -c "source activate gpu_test && python -c 'from numba import cuda; cuda.detect()'" - python3 ops/docker_run.py --container-id "${container_id}" --use-gpus \ - --run-args='--shm-size=4g' \ + --run-args='--shm-size=4g --privileged' \ -- bash test-python-wrapper.sh gpu_test ;; From 11b0427b277d0577d971feafba2b48b8aaee16d3 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Sat, 16 Nov 2024 22:50:38 -0800 Subject: [PATCH 24/86] Reboot after install driver; use proprietary driver for now --- ops/packer/linux/bootstrap.sh | 32 ++++++++++++++--------------- ops/packer/linux/install_drivers.sh | 14 +++++++++++++ ops/packer/linux/linux.pkr.hcl | 13 +++++++++++- 3 files changed, 42 insertions(+), 17 deletions(-) create mode 100644 ops/packer/linux/install_drivers.sh diff --git a/ops/packer/linux/bootstrap.sh b/ops/packer/linux/bootstrap.sh index 9dbda19c3baa..57be6e14b507 100644 --- a/ops/packer/linux/bootstrap.sh +++ b/ops/packer/linux/bootstrap.sh @@ -1,21 +1,6 @@ #!/bin/bash set -euo pipefail -## Install basic tools -echo 'debconf debconf/frontend select 
Noninteractive' | sudo debconf-set-selections -sudo apt-get update -sudo apt-get install -y cmake git build-essential wget ca-certificates curl unzip - -## Install CUDA 12.5 + driver -echo "Installilng CUDA and driver..." -wget -nv https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/cuda-ubuntu2404.pin -sudo mv cuda-ubuntu2404.pin /etc/apt/preferences.d/cuda-repository-pin-600 -wget -nv https://developer.download.nvidia.com/compute/cuda/12.5.1/local_installers/cuda-repo-ubuntu2404-12-5-local_12.5.1-555.42.06-1_amd64.deb -sudo dpkg -i cuda-repo-ubuntu2404-12-5-local_12.5.1-555.42.06-1_amd64.deb -sudo cp /var/cuda-repo-ubuntu2404-12-5-local/cuda-*-keyring.gpg /usr/share/keyrings/ -sudo apt-get update -sudo apt-get install -y cuda-toolkit-12-5 nvidia-driver-555-open cuda-drivers-555 - ## Install Docker # Add Docker's official GPG key: sudo install -m 0755 -d /etc/apt/keyrings @@ -31,6 +16,12 @@ sudo apt-get install -y docker-ce docker-ce-cli containerd.io docker-buildx-plug # Allow users to use Docker without sudo sudo usermod -aG docker ubuntu +# Start Docker daemon +sudo systemctl is-active --quiet docker.service || sudo systemctl start docker.service +sudo systemctl is-enabled --quiet docker.service || sudo systemctl enable docker.service +sleep 10 # Docker daemon takes time to come up after installing +sudo docker info + ## Install NVIDIA Container Toolkit curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \ && curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \ @@ -41,12 +32,21 @@ sudo apt-get install -y nvidia-container-toolkit sudo nvidia-ctk runtime configure --runtime=docker sudo systemctl restart docker +sleep 10 +sudo docker run --rm --gpus all ubuntu nvidia-smi +sudo systemctl stop docker + ## Install AWS CLI v2 wget -nv https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip -O awscliv2.zip -unzip awscliv2.zip +unzip -q awscliv2.zip sudo ./aws/install +rm -rf ./aws/ ./awscliv2.zip ## Install jq and yq sudo apt update && sudo apt install jq +mkdir yq/ +pushd yq/ wget -nv https://github.com/mikefarah/yq/releases/download/v4.44.3/yq_linux_amd64.tar.gz -O - | \ tar xz && sudo mv ./yq_linux_amd64 /usr/bin/yq +popd +rm -rf yq/ diff --git a/ops/packer/linux/install_drivers.sh b/ops/packer/linux/install_drivers.sh new file mode 100644 index 000000000000..07309be836a8 --- /dev/null +++ b/ops/packer/linux/install_drivers.sh @@ -0,0 +1,14 @@ +#!/bin/bash +set -euo pipefail + +## Install basic tools +echo 'debconf debconf/frontend select Noninteractive' | sudo debconf-set-selections +sudo apt-get update +sudo apt-get install -y cmake git build-essential wget ca-certificates curl unzip + +## Install CUDA Toolkit 12.6 (Driver will be installed later) +wget -nv https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/cuda-keyring_1.1-1_all.deb +sudo dpkg -i cuda-keyring_1.1-1_all.deb +sudo apt-get update +sudo apt-get -y install cuda-toolkit-12-6 cuda-drivers-565 +rm cuda-keyring_1.1-1_all.deb diff --git a/ops/packer/linux/linux.pkr.hcl b/ops/packer/linux/linux.pkr.hcl index 1dc11f9bac03..c6990894764a 100644 --- a/ops/packer/linux/linux.pkr.hcl +++ b/ops/packer/linux/linux.pkr.hcl @@ -63,6 +63,17 @@ build { sources = ["source.amazon-ebs.runs-on-linux"] provisioner "shell" { - script = "bootstrap.sh" + script = "install_drivers.sh" + pause_after = "30s" + } + + provisioner "shell" { + expect_disconnect = 
true + inline = ["echo 'Reboot VM'", "sudo reboot"] + } + + provisioner "shell" { + pause_before = "1m0s" + script = "bootstrap.sh" } } From 49eaec14339c7efcf88088e8478f49422a327a0e Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Sat, 16 Nov 2024 22:54:59 -0800 Subject: [PATCH 25/86] Try removing --privileged flag --- ops/pipeline/test-cpp-gpu.sh | 4 +--- ops/pipeline/test-jvm-gpu.sh | 2 +- ops/pipeline/test-python.sh | 3 +-- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/ops/pipeline/test-cpp-gpu.sh b/ops/pipeline/test-cpp-gpu.sh index 98f467250dd0..b66162d66a50 100755 --- a/ops/pipeline/test-cpp-gpu.sh +++ b/ops/pipeline/test-cpp-gpu.sh @@ -17,7 +17,6 @@ case "${arg}" in python3 ops/docker_run.py --container-id xgb-ci.gpu --use-gpus \ -- nvidia-smi python3 ops/docker_run.py --container-id xgb-ci.gpu --use-gpus \ - --run-args='--privileged' \ -- build/testxgboost ;; @@ -26,7 +25,6 @@ case "${arg}" in python3 ops/docker_run.py --container-id xgb-ci.gpu --use-gpus \ -- nvidia-smi python3 ops/docker_run.py --container-id xgb-ci.gpu --use-gpus \ - --run-args='--privileged' \ -- build/testxgboost --use-rmm-pool ;; @@ -35,7 +33,7 @@ case "${arg}" in python3 ops/docker_run.py --container-id xgb-ci.gpu --use-gpus \ -- nvidia-smi python3 ops/docker_run.py --container-id xgb-ci.gpu --use-gpus \ - --run-args='--shm-size=4g --privileged' \ + --run-args='--shm-size=4g' \ -- build/testxgboost --gtest_filter=*MGPU* ;; diff --git a/ops/pipeline/test-jvm-gpu.sh b/ops/pipeline/test-jvm-gpu.sh index 108ec749674b..272b55ad0d1a 100755 --- a/ops/pipeline/test-jvm-gpu.sh +++ b/ops/pipeline/test-jvm-gpu.sh @@ -14,5 +14,5 @@ set -x python3 ops/docker_run.py --container-id xgb-ci.jvm_gpu_build --use-gpus \ -- nvidia-smi python3 ops/docker_run.py --container-id xgb-ci.jvm_gpu_build --use-gpus \ - --run-args "-e SCALA_VERSION=${SCALA_VERSION} -e USE_CUDA=1 -e SKIP_NATIVE_BUILD=1 --privileged" \ + --run-args "-e SCALA_VERSION=${SCALA_VERSION} -e USE_CUDA=1 -e SKIP_NATIVE_BUILD=1" \ -- ops/pipeline/build-test-jvm-packages-impl.sh diff --git a/ops/pipeline/test-python.sh b/ops/pipeline/test-python.sh index 3997f416ec46..02907253bc0f 100755 --- a/ops/pipeline/test-python.sh +++ b/ops/pipeline/test-python.sh @@ -38,7 +38,6 @@ case "$suite" in python3 ops/docker_run.py --container-id "${container_id}" --use-gpus \ -- nvidia-smi python3 ops/docker_run.py --container-id "${container_id}" --use-gpus \ - --run-args='--privileged' \ -- bash test-python-wrapper.sh gpu_test ;; @@ -56,7 +55,7 @@ case "$suite" in python3 ops/docker_run.py --container-id "${container_id}" --use-gpus \ -- nvidia-smi python3 ops/docker_run.py --container-id "${container_id}" --use-gpus \ - --run-args='--shm-size=4g --privileged' \ + --run-args='--shm-size=4g' \ -- bash test-python-wrapper.sh gpu_test ;; From d3482e1b6d67698de299dc153cb6441f1c798d9b Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Sun, 17 Nov 2024 02:19:16 -0800 Subject: [PATCH 26/86] Revert "Disable the host numa virtual memory allocator for now. (#10934)" This reverts commit f4d94a19b903d4bfd6458b90f0f8201616f2765d. --- src/common/device_helpers.cu | 5 ----- tests/cpp/common/test_device_vector.cu | 11 ++++++++--- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/common/device_helpers.cu b/src/common/device_helpers.cu index 01e81b16ee0b..608a535cd8cb 100644 --- a/src/common/device_helpers.cu +++ b/src/common/device_helpers.cu @@ -7,11 +7,6 @@ namespace dh { PinnedMemory::PinnedMemory() { - // Use the `GrowOnlyPinnedMemoryImpl` as the only option for now. 
- // See https://github.com/dmlc/xgboost/issues/10933 - this->impl_.emplace(); - return; - #if defined(xgboost_IS_WIN) this->impl_.emplace(); #else diff --git a/tests/cpp/common/test_device_vector.cu b/tests/cpp/common/test_device_vector.cu index ec1a420bd349..d7a03e41a64b 100644 --- a/tests/cpp/common/test_device_vector.cu +++ b/tests/cpp/common/test_device_vector.cu @@ -31,9 +31,6 @@ class TestVirtualMem : public ::testing::TestWithParam { public: void Run() { auto type = this->GetParam(); - if (type == CU_MEM_LOCATION_TYPE_HOST_NUMA) { - GTEST_SKIP_("Host numa might require special system capabilities, skipping for now."); - } detail::GrowOnlyVirtualMemVec vec{type}; auto prop = xgboost::cudr::MakeAllocProp(type); auto gran = xgboost::cudr::GetAllocGranularity(&prop); @@ -113,6 +110,14 @@ TEST(TestVirtualMem, Version) { xgboost::curt::DrVersion(&major, &minor); LOG(INFO) << "Latest supported CUDA version by the driver:" << major << "." << minor; PinnedMemory pinned; +#if defined(xgboost_IS_WIN) ASSERT_FALSE(pinned.IsVm()); +#else // defined(xgboost_IS_WIN) + if (major >= 12 && minor >= 5) { + ASSERT_TRUE(pinned.IsVm()); + } else { + ASSERT_FALSE(pinned.IsVm()); + } +#endif // defined(xgboost_IS_WIN) } } // namespace dh From 130d303f40c25fc3fb3b07469bbb59f80118a8cf Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Mon, 18 Nov 2024 09:30:31 -0800 Subject: [PATCH 27/86] Build JVM docs --- .github/workflows/main.yml | 25 +++++++++++++++++-- .github/workflows/windows.yml | 1 - doc/jvm/api.rst | 1 + jvm-packages/pom.xml | 16 ++++++++++++ .../build-jvm-doc-impl.sh} | 18 +++++++------ ops/pipeline/build-jvm-doc.sh | 7 ++++-- 6 files changed, 56 insertions(+), 12 deletions(-) rename ops/{build_jvm_doc.sh => pipeline/build-jvm-doc-impl.sh} (57%) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 1b78bdb69df1..47e195267d49 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -20,7 +20,6 @@ jobs: runs-on: - runs-on=${{ github.run_id }} - runner=${{ matrix.runner }} - - spot=false strategy: matrix: container_id: @@ -121,7 +120,6 @@ jobs: runs-on: - runs-on=${{ github.run_id }} - runner=linux-amd64-cpu - - spot=false steps: # Restart Docker daemon so that it recognizes the ephemeral disks - run: sudo systemctl restart docker @@ -263,6 +261,29 @@ jobs: COMMAND: upload KEY: build-jvm-gpu + build-jvm-docs: + name: Build docs for JVM packages + needs: [build-jvm-gpu] + runs-on: + - runs-on=${{ github.run_id }} + - runner=linux-amd64-cpu + steps: + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4 + with: + submodules: "true" + - name: Fetch container from cache + run: bash ops/docker_build.sh + env: + CONTAINER_ID: xgb-ci.jvm_gpu_build + - name: Unstash files + run: bash ops/stash_artifacts.sh lib/libxgboost4j.so + env: + COMMAND: download + KEY: build-jvm-gpu + - run: bash ops/pipeline/build-jvm-doc.sh + build-test-jvm-packages: name: Build and test JVM packages needs: build-containers diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml index 0fc50815d683..73a258158b12 100644 --- a/.github/workflows/windows.yml +++ b/.github/workflows/windows.yml @@ -26,7 +26,6 @@ jobs: runs-on: - runs-on=${{ github.run_id }} - runner=windows-cpu - - spot=false steps: - uses: actions/checkout@v4 with: diff --git a/doc/jvm/api.rst b/doc/jvm/api.rst index b9e7821aa6fa..3d56cb2c9aa4 100644 --- a/doc/jvm/api.rst +++ b/doc/jvm/api.rst @@ -5,4 +5,5 @@ API Docs for the 
JVM packages
 * `XGBoost4J Java API <../jvm_docs/javadocs/index.html>`_
 * `XGBoost4J Scala API <../jvm_docs/scaladocs/xgboost4j/index.html>`_
 * `XGBoost4J-Spark Scala API <../jvm_docs/scaladocs/xgboost4j-spark/index.html>`_
+* `XGBoost4J-Spark-GPU Scala API <../jvm_docs/scaladocs/xgboost4j-spark-gpu/index.html>`_
 * `XGBoost4J-Flink Scala API <../jvm_docs/scaladocs/xgboost4j-flink/index.html>`_
diff --git a/jvm-packages/pom.xml b/jvm-packages/pom.xml
index af7aec0a6982..815e8b473139 100644
--- a/jvm-packages/pom.xml
+++ b/jvm-packages/pom.xml
@@ -112,6 +112,22 @@
 
+
+      docs
+
+        ON
+        true
+        true
+        true
+
+
+        xgboost4j
+        xgboost4j-spark
+        xgboost4j-spark-gpu
+        xgboost4j-flink
+
+
     release
diff --git a/ops/build_jvm_doc.sh b/ops/pipeline/build-jvm-doc-impl.sh
similarity index 57%
rename from ops/build_jvm_doc.sh
rename to ops/pipeline/build-jvm-doc-impl.sh
index 6f785f488027..c334b8ad91d1 100755
--- a/ops/build_jvm_doc.sh
+++ b/ops/pipeline/build-jvm-doc-impl.sh
@@ -1,6 +1,7 @@
 #!/bin/bash
-
 ## Build docs for the JVM packages and package it in a tarball
+## Note: this script assumes that the user has already built libxgboost4j.so
+## and placed it in the lib/ directory.
 
 if [[ $# -ne 1 ]]
 then
@@ -10,23 +11,26 @@ fi
 
 set -euo pipefail
 
-rm -rf build/
-cd jvm-packages
-
 branch_name=$1
 
+# Copy in libxgboost4j.so
+mkdir -p jvm-packages/xgboost4j/src/main/resources/lib/linux/x86_64/
+cp -v lib/libxgboost4j.so jvm-packages/xgboost4j/src/main/resources/lib/linux/x86_64/
+
+cd jvm-packages/
 # Install JVM packages in local Maven repository
-mvn --no-transfer-progress install -DskipTests
+mvn --no-transfer-progress install -Pdocs
 # Build Scaladocs
-mvn --no-transfer-progress scala:doc -DskipTests
+mvn --no-transfer-progress scala:doc -Pdocs
 # Build Javadocs
-mvn --no-transfer-progress javadoc:javadoc -DskipTests
+mvn --no-transfer-progress javadoc:javadoc -Pdocs
 
 # Package JVM docs in a tarball
 mkdir -p tmp/scaladocs
 cp -rv xgboost4j/target/site/apidocs/ ./tmp/javadocs/
 cp -rv xgboost4j/target/site/scaladocs/ ./tmp/scaladocs/xgboost4j/
 cp -rv xgboost4j-spark/target/site/scaladocs/ ./tmp/scaladocs/xgboost4j-spark/
+cp -rv xgboost4j-spark-gpu/target/site/scaladocs/ ./tmp/scaladocs/xgboost4j-spark-gpu/
 cp -rv xgboost4j-flink/target/site/scaladocs/ ./tmp/scaladocs/xgboost4j-flink/
 
 cd tmp
diff --git a/ops/pipeline/build-jvm-doc.sh b/ops/pipeline/build-jvm-doc.sh
index 7b029a4e7e26..0c1afe46e212 100755
--- a/ops/pipeline/build-jvm-doc.sh
+++ b/ops/pipeline/build-jvm-doc.sh
@@ -1,4 +1,7 @@
 #!/bin/bash
+## Build docs for the JVM packages and package it in a tarball
+## Note: this script assumes that the user has already built libxgboost4j.so
+## and placed it in the lib/ directory.
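+##
+## A minimal usage sketch (hypothetical; assumes lib/libxgboost4j.so is present
+## and the BRANCH_NAME environment variable is set, e.g. BRANCH_NAME=master):
+##   bash ops/pipeline/build-jvm-doc.sh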
set -euox pipefail @@ -6,8 +9,8 @@ source ops/pipeline/enforce-ci.sh echo "--- Build JVM packages doc" python3 ops/docker_run.py \ - --container-id jvm \ - -- ops/build_jvm_doc.sh ${BRANCH_NAME} + --container-id xgb-ci.jvm_gpu_build \ + -- ops/pipeline/build-jvm-doc-impl.sh ${BRANCH_NAME} if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] then echo "--- Upload JVM packages doc" From a45b24fd9608f2bf05dfd4282bad05a03eb10373 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Mon, 18 Nov 2024 10:37:48 -0800 Subject: [PATCH 28/86] Re-org test scripts --- .../build-gpu-rpkg-impl.sh} | 0 ops/pipeline/build-gpu-rpkg.sh | 2 +- ops/pipeline/test-python-impl.sh | 60 ++++++++++++++ ops/pipeline/test-python.sh | 81 +------------------ 4 files changed, 63 insertions(+), 80 deletions(-) rename ops/{build_r_pkg_with_cuda.sh => pipeline/build-gpu-rpkg-impl.sh} (100%) create mode 100755 ops/pipeline/test-python-impl.sh diff --git a/ops/build_r_pkg_with_cuda.sh b/ops/pipeline/build-gpu-rpkg-impl.sh similarity index 100% rename from ops/build_r_pkg_with_cuda.sh rename to ops/pipeline/build-gpu-rpkg-impl.sh diff --git a/ops/pipeline/build-gpu-rpkg.sh b/ops/pipeline/build-gpu-rpkg.sh index c7d3f7fa4235..e85826f36a26 100755 --- a/ops/pipeline/build-gpu-rpkg.sh +++ b/ops/pipeline/build-gpu-rpkg.sh @@ -7,7 +7,7 @@ source ops/pipeline/enforce-ci.sh echo "--- Build XGBoost R package with CUDA" python3 ops/docker_run.py \ --container-id xgb-ci.gpu_build_r_rockylinux8 \ - -- ops/build_r_pkg_with_cuda.sh \ + -- ops/pipeline/build-gpu-rpkg-impl.sh \ ${GITHUB_SHA} if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] diff --git a/ops/pipeline/test-python-impl.sh b/ops/pipeline/test-python-impl.sh new file mode 100755 index 000000000000..bd71cfb06435 --- /dev/null +++ b/ops/pipeline/test-python-impl.sh @@ -0,0 +1,60 @@ +#!/bin/bash + +set -euo pipefail + +if [[ "$#" -lt 1 ]] +then + echo "Usage: $0 {gpu|mgpu|cpu|cpu-arm64}" + exit 1 +fi + +suite="$1" + +set -x + +export PYSPARK_DRIVER_PYTHON=$(which python) +export PYSPARK_PYTHON=$(which python) +export SPARK_TESTING=1 + +pip install -v ./python-package/dist/*.whl + +case "$suite" in + gpu) + echo "-- Run Python tests, using a single GPU" + source activate gpu_test + python -c 'from cupy.cuda import jitify; jitify._init_module()' + pytest -v -s -rxXs --fulltrace --durations=0 -m 'not mgpu' tests/python-gpu + ;; + mgpu) + echo "-- Run Python tests, using multiple GPUs" + source activate gpu_test + python -c 'from cupy.cuda import jitify; jitify._init_module()' + pytest -v -s -rxXs --fulltrace --durations=0 -m 'mgpu' tests/python-gpu + pytest -v -s -rxXs --fulltrace --durations=0 -m 'mgpu' \ + tests/test_distributed/test_gpu_with_dask + pytest -v -s -rxXs --fulltrace --durations=0 -m 'mgpu' \ + tests/test_distributed/test_gpu_with_spark + pytest -v -s -rxXs --fulltrace --durations=0 -m 'mgpu' \ + tests/test_distributed/test_gpu_federated + ;; + cpu) + echo "-- Run Python tests (CPU)" + source activate linux_cpu_test + export RAY_OBJECT_STORE_ALLOW_SLOW_STORAGE=1 + pytest -v -s -rxXs --fulltrace --durations=0 tests/python + pytest -v -s -rxXs --fulltrace --durations=0 tests/test_distributed/test_with_dask + pytest -v -s -rxXs --fulltrace --durations=0 tests/test_distributed/test_with_spark + pytest -v -s -rxXs --fulltrace --durations=0 tests/test_distributed/test_federated + ;; + cpu-arm64) + echo "-- Run Python tests (CPU, ARM64)" + source activate aarch64_test + pytest -v -s -rxXs --fulltrace --durations=0 \ + tests/python/test_basic.py 
tests/python/test_basic_models.py \ + tests/python/test_model_compatibility.py + ;; + *) + echo "Unrecognized argument: $suite" + exit 1 + ;; +esac diff --git a/ops/pipeline/test-python.sh b/ops/pipeline/test-python.sh index 02907253bc0f..047a6f411d6d 100755 --- a/ops/pipeline/test-python.sh +++ b/ops/pipeline/test-python.sh @@ -13,82 +13,5 @@ fi suite="$1" container_id="$2" -cat > test-python-wrapper.sh <<-'EOF' -#!/bin/bash -source activate "$1" - -set -euox pipefail - -export PYSPARK_DRIVER_PYTHON=$(which python) -export PYSPARK_PYTHON=$(which python) -export SPARK_TESTING=1 - -pip install -v ./python-package/dist/*.whl -EOF - -case "$suite" in - gpu) - echo "-- Run Python tests, using a single GPU" - echo " - python -c 'from cupy.cuda import jitify; jitify._init_module()' - pytest -v -s -rxXs --fulltrace --durations=0 -m 'not mgpu' tests/python-gpu - " >> test-python-wrapper.sh - set -x - cat test-python-wrapper.sh - python3 ops/docker_run.py --container-id "${container_id}" --use-gpus \ - -- nvidia-smi - python3 ops/docker_run.py --container-id "${container_id}" --use-gpus \ - -- bash test-python-wrapper.sh gpu_test - ;; - - mgpu) - echo "-- Run Python tests, using multiple GPUs" - echo " - python -c 'from cupy.cuda import jitify; jitify._init_module()' - pytest -v -s -rxXs --fulltrace --durations=0 -m 'mgpu' tests/python-gpu - pytest -v -s -rxXs --fulltrace --durations=0 -m 'mgpu' tests/test_distributed/test_gpu_with_dask - pytest -v -s -rxXs --fulltrace --durations=0 -m 'mgpu' tests/test_distributed/test_gpu_with_spark - pytest -v -s -rxXs --fulltrace --durations=0 -m 'mgpu' tests/test_distributed/test_gpu_federated - " >> test-python-wrapper.sh - set -x - cat test-python-wrapper.sh - python3 ops/docker_run.py --container-id "${container_id}" --use-gpus \ - -- nvidia-smi - python3 ops/docker_run.py --container-id "${container_id}" --use-gpus \ - --run-args='--shm-size=4g' \ - -- bash test-python-wrapper.sh gpu_test - ;; - - cpu) - echo "-- Run Python tests (CPU)" - echo " - export RAY_OBJECT_STORE_ALLOW_SLOW_STORAGE=1 - pytest -v -s -rxXs --fulltrace --durations=0 tests/python - pytest -v -s -rxXs --fulltrace --durations=0 tests/test_distributed/test_with_dask - pytest -v -s -rxXs --fulltrace --durations=0 tests/test_distributed/test_with_spark - pytest -v -s -rxXs --fulltrace --durations=0 tests/test_distributed/test_federated - " >> test-python-wrapper.sh - set -x - cat test-python-wrapper.sh - python3 ops/docker_run.py --container-id "${container_id}" \ - -- bash test-python-wrapper.sh linux_cpu_test - ;; - - cpu-arm64) - echo "-- Run Python tests (CPU, ARM64)" - echo " - pytest -v -s -rxXs --fulltrace --durations=0 \\ - tests/python/test_basic.py tests/python/test_basic_models.py \\ - tests/python/test_model_compatibility.py - " >> test-python-wrapper.sh - set -x - cat test-python-wrapper.sh - python3 ops/docker_run.py --container-id "${container_id}" \ - -- bash test-python-wrapper.sh aarch64_test - ;; - - *) - echo "Unrecognized argument: $suite" - exit 1 - ;; -esac +python3 ops/docker_run.py --container-id "${container_id}" \ + -- bash ops/pipeline/test-python-impl.sh "${suite}" From 6dc0df2c3d2aefb1ff168f3ef5213199682c9ca5 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Mon, 18 Nov 2024 10:52:22 -0800 Subject: [PATCH 29/86] More reorg --- dev/prepare_jvm_release.py | 2 +- ops/pipeline/build-cpu-arm64.sh | 6 ++--- ops/pipeline/build-cpu.sh | 4 +-- ops/pipeline/build-cuda-with-rmm.sh | 6 ++--- ops/pipeline/build-cuda.sh | 8 +++--- ops/pipeline/build-gpu-rpkg-impl.sh | 2 +- 
ops/pipeline/build-manylinux2014.sh | 4 +-- .../build-python-wheels-macos.sh} | 2 +- ops/pipeline/build-test-jvm-packages-impl.sh | 2 +- ops/pipeline/build-win64-gpu.ps1 | 2 +- ops/pipeline/run-clang-tidy.sh | 2 +- ops/pipeline/test-python-impl.sh | 25 ++++++++++++++----- ops/{ => script}/build_via_cmake.sh | 0 ops/{ => script}/change_scala_version.py | 0 ops/{ => script}/change_version.py | 0 ops/{ => script}/format_wheel_meta.py | 0 ops/{lint => script}/lint_cmake.sh | 0 ops/{lint => script}/lint_cpp.py | 0 ops/{lint => script}/lint_python.py | 23 ++++++++--------- ops/{lint => script}/lint_r.R | 0 ops/{ => script}/rename_whl.py | 0 ops/{clang-tidy => script}/run_clang_tidy.py | 2 +- ops/{ => script}/test_r_package.py | 0 ops/{clang-tidy => script}/test_tidy.cc | 0 ops/{ => script}/test_utils.py | 2 +- ops/{ => script}/update_rapids.sh | 0 ops/{ => script}/verify_link.sh | 0 27 files changed, 52 insertions(+), 40 deletions(-) rename ops/{build_python_wheels_macos.sh => pipeline/build-python-wheels-macos.sh} (97%) rename ops/{ => script}/build_via_cmake.sh (100%) rename ops/{ => script}/change_scala_version.py (100%) rename ops/{ => script}/change_version.py (100%) rename ops/{ => script}/format_wheel_meta.py (100%) rename ops/{lint => script}/lint_cmake.sh (100%) rename ops/{lint => script}/lint_cpp.py (100%) rename ops/{lint => script}/lint_python.py (95%) rename ops/{lint => script}/lint_r.R (100%) rename ops/{ => script}/rename_whl.py (100%) rename ops/{clang-tidy => script}/run_clang_tidy.py (99%) rename ops/{ => script}/test_r_package.py (100%) rename ops/{clang-tidy => script}/test_tidy.cc (100%) rename ops/{ => script}/test_utils.py (99%) rename ops/{ => script}/update_rapids.sh (100%) rename ops/{ => script}/verify_link.sh (100%) diff --git a/dev/prepare_jvm_release.py b/dev/prepare_jvm_release.py index 927cb4945950..c5a72724f707 100644 --- a/dev/prepare_jvm_release.py +++ b/dev/prepare_jvm_release.py @@ -203,7 +203,7 @@ def main(): ) print( "5. Remove the Scala 2.12 artifacts and build Scala 2.13 artifacts:\n" - " python ops/change_scala_version.py --scala-version 2.13 --purge-artifacts\n" + " python ops/script/change_scala_version.py --scala-version 2.13 --purge-artifacts\n" " GPG_TTY=$(tty) mvn deploy -Prelease -DskipTests -Dskip.native.build=true" ) print( diff --git a/ops/pipeline/build-cpu-arm64.sh b/ops/pipeline/build-cpu-arm64.sh index 8a5db56d9eeb..4be57557ea36 100755 --- a/ops/pipeline/build-cpu-arm64.sh +++ b/ops/pipeline/build-cpu-arm64.sh @@ -11,7 +11,7 @@ source ops/pipeline/enforce-ci.sh echo "--- Build libxgboost from the source" python3 ops/docker_run.py \ --container-id xgb-ci.aarch64 \ - -- ops/build_via_cmake.sh \ + -- ops/script/build_via_cmake.sh \ --conda-env=aarch64_test \ -DUSE_OPENMP=ON \ -DHIDE_CXX_SYMBOL=ON @@ -26,7 +26,7 @@ python3 ops/docker_run.py \ --container-id xgb-ci.aarch64 \ -- bash -c \ "cd python-package && rm -rf dist/* && pip wheel --no-deps -v . 
--wheel-dir dist/" -python3 ops/rename_whl.py \ +python3 ops/script/rename_whl.py \ --wheel-path python-package/dist/*.whl \ --commit-hash ${GITHUB_SHA} \ --platform-tag ${WHEEL_TAG} @@ -35,7 +35,7 @@ echo "--- Audit binary wheel to ensure it's compliant with ${WHEEL_TAG} standard python3 ops/docker_run.py \ --container-id xgb-ci.aarch64 \ -- auditwheel repair --plat ${WHEEL_TAG} python-package/dist/*.whl -python3 ops/rename_whl.py \ +python3 ops/script/rename_whl.py \ --wheel-path wheelhouse/*.whl \ --commit-hash ${GITHUB_SHA} \ --platform-tag ${WHEEL_TAG} diff --git a/ops/pipeline/build-cpu.sh b/ops/pipeline/build-cpu.sh index 60346203d85f..22384d056f15 100755 --- a/ops/pipeline/build-cpu.sh +++ b/ops/pipeline/build-cpu.sh @@ -18,7 +18,7 @@ echo "--- Run Google Test with sanitizer enabled" sudo sysctl vm.mmap_rnd_bits=28 python3 ops/docker_run.py \ --container-id xgb-ci.cpu \ - -- ops/build_via_cmake.sh \ + -- ops/script/build_via_cmake.sh \ -DUSE_SANITIZER=ON \ -DENABLED_SANITIZERS="address;leak;undefined" \ -DCMAKE_BUILD_TYPE=Debug \ @@ -35,7 +35,7 @@ python3 ops/docker_run.py \ echo "--- Run Google Test" python3 ops/docker_run.py \ --container-id xgb-ci.cpu \ - -- ops/build_via_cmake.sh \ + -- ops/script/build_via_cmake.sh \ -DCMAKE_PREFIX_PATH=/opt/grpc \ -DPLUGIN_FEDERATED=ON python3 ops/docker_run.py \ diff --git a/ops/pipeline/build-cuda-with-rmm.sh b/ops/pipeline/build-cuda-with-rmm.sh index 24523bd875c0..1da0e5e61827 100755 --- a/ops/pipeline/build-cuda-with-rmm.sh +++ b/ops/pipeline/build-cuda-with-rmm.sh @@ -18,7 +18,7 @@ echo "--- Build with CUDA with RMM" echo "--- Build libxgboost from the source" python3 ops/docker_run.py \ --container-id xgb-ci.gpu_build_rockylinux8 \ - -- ops/build_via_cmake.sh \ + -- ops/script/build_via_cmake.sh \ -DCMAKE_PREFIX_PATH="/opt/grpc;/opt/rmm;/opt/rmm/lib64/rapids/cmake" \ -DUSE_CUDA=ON \ -DUSE_OPENMP=ON \ @@ -36,7 +36,7 @@ python3 ops/docker_run.py \ --container-id xgb-ci.gpu_build_rockylinux8 \ -- bash -c \ "cd python-package && rm -rf dist/* && pip wheel --no-deps -v . --wheel-dir dist/" -python3 ops/rename_whl.py \ +python3 ops/script/rename_whl.py \ --wheel-path python-package/dist/*.whl \ --commit-hash ${GITHUB_SHA} \ --platform-tag ${WHEEL_TAG} @@ -46,7 +46,7 @@ python3 ops/docker_run.py \ --container-id xgb-ci.$WHEEL_TAG \ -- auditwheel repair \ --plat ${WHEEL_TAG} python-package/dist/*.whl -python3 ops/rename_whl.py \ +python3 ops/script/rename_whl.py \ --wheel-path wheelhouse/*.whl \ --commit-hash ${GITHUB_SHA} \ --platform-tag ${WHEEL_TAG} diff --git a/ops/pipeline/build-cuda.sh b/ops/pipeline/build-cuda.sh index bcda081b338e..0487fb209dbe 100755 --- a/ops/pipeline/build-cuda.sh +++ b/ops/pipeline/build-cuda.sh @@ -22,7 +22,7 @@ set -x git clone https://github.com/NVIDIA/cccl.git -b v2.6.1 --quiet python3 ops/docker_run.py \ --container-id xgb-ci.gpu_build_rockylinux8 \ - -- ops/build_via_cmake.sh \ + -- ops/script/build_via_cmake.sh \ -DCMAKE_PREFIX_PATH="/opt/grpc;/workspace/cccl" \ -DUSE_CUDA=ON \ -DUSE_OPENMP=ON \ @@ -39,7 +39,7 @@ python3 ops/docker_run.py \ --container-id xgb-ci.gpu_build_rockylinux8 \ -- bash -c \ "cd python-package && rm -rf dist/* && pip wheel --no-deps -v . 
--wheel-dir dist/" -python3 ops/rename_whl.py \ +python3 ops/script/rename_whl.py \ --wheel-path python-package/dist/*.whl \ --commit-hash ${GITHUB_SHA} \ --platform-tag ${WHEEL_TAG} @@ -49,7 +49,7 @@ python3 ops/docker_run.py \ --container-id xgb-ci.manylinux_2_28_x86_64 \ -- auditwheel repair \ --plat ${WHEEL_TAG} python-package/dist/*.whl -python3 ops/rename_whl.py \ +python3 ops/script/rename_whl.py \ --wheel-path wheelhouse/*.whl \ --commit-hash ${GITHUB_SHA} \ --platform-tag ${WHEEL_TAG} @@ -68,7 +68,7 @@ then # Generate the meta info which includes xgboost version and the commit info python3 ops/docker_run.py \ --container-id xgb-ci.gpu_build_rockylinux8 \ - -- python ops/format_wheel_meta.py \ + -- python ops/script/format_wheel_meta.py \ --wheel-path python-package/dist/*.whl \ --commit-hash ${GITHUB_SHA} \ --platform-tag ${WHEEL_TAG} \ diff --git a/ops/pipeline/build-gpu-rpkg-impl.sh b/ops/pipeline/build-gpu-rpkg-impl.sh index d0a7c9295195..2815b8f448f1 100755 --- a/ops/pipeline/build-gpu-rpkg-impl.sh +++ b/ops/pipeline/build-gpu-rpkg-impl.sh @@ -14,7 +14,7 @@ fi commit_hash="$1" -python3 ops/test_r_package.py --task=pack +python3 ops/script/test_r_package.py --task=pack mv xgboost/ xgboost_rpack/ mkdir build diff --git a/ops/pipeline/build-manylinux2014.sh b/ops/pipeline/build-manylinux2014.sh index 3f04c0f7e7f4..7802fa555187 100755 --- a/ops/pipeline/build-manylinux2014.sh +++ b/ops/pipeline/build-manylinux2014.sh @@ -30,7 +30,7 @@ git checkout python-package/pyproject.toml python-package/xgboost/core.py python3 ops/docker_run.py \ --container-id ${image} \ -- auditwheel repair --plat ${WHEEL_TAG} python-package/dist/*.whl -python3 ops/rename_whl.py \ +python3 ops/script/rename_whl.py \ --wheel-path wheelhouse/*.whl \ --commit-hash ${GITHUB_SHA} \ --platform-tag ${WHEEL_TAG} @@ -51,7 +51,7 @@ git checkout python-package/pyproject.toml # discard the patch python3 ops/docker_run.py \ --container-id ${image} \ -- auditwheel repair --plat ${WHEEL_TAG} python-package/dist/xgboost_cpu-*.whl -python3 ops/rename_whl.py \ +python3 ops/script/rename_whl.py \ --wheel-path wheelhouse/xgboost_cpu-*.whl \ --commit-hash ${GITHUB_SHA} \ --platform-tag ${WHEEL_TAG} diff --git a/ops/build_python_wheels_macos.sh b/ops/pipeline/build-python-wheels-macos.sh similarity index 97% rename from ops/build_python_wheels_macos.sh rename to ops/pipeline/build-python-wheels-macos.sh index f2d1c692c8cb..3715ec9e7e0f 100644 --- a/ops/build_python_wheels_macos.sh +++ b/ops/pipeline/build-python-wheels-macos.sh @@ -43,7 +43,7 @@ export CIBW_REPAIR_WHEEL_COMMAND_MACOS="delocate-wheel --require-archs {delocate python -m pip install cibuildwheel python -m cibuildwheel python-package --output-dir wheelhouse -python tests/ci_build/rename_whl.py \ +python ops/script/rename_whl.py \ --wheel-path wheelhouse/*.whl \ --commit-hash ${commit_id} \ --platform-tag ${wheel_tag} diff --git a/ops/pipeline/build-test-jvm-packages-impl.sh b/ops/pipeline/build-test-jvm-packages-impl.sh index 180788436d9b..3290bf0f17c9 100755 --- a/ops/pipeline/build-test-jvm-packages-impl.sh +++ b/ops/pipeline/build-test-jvm-packages-impl.sh @@ -31,7 +31,7 @@ set -x # Set Scala version if [[ "${SCALA_VERSION}" == "2.12" || "${SCALA_VERSION}" == "2.13" ]] then - python ops/change_scala_version.py --scala-version ${SCALA_VERSION} --purge-artifacts + python ops/script/change_scala_version.py --scala-version ${SCALA_VERSION} --purge-artifacts else echo "Error: SCALA_VERSION must be either 2.12 or 2.13" exit 2 diff --git 
a/ops/pipeline/build-win64-gpu.ps1 b/ops/pipeline/build-win64-gpu.ps1 index c691a55f954c..cc5380a7c7c2 100644 --- a/ops/pipeline/build-win64-gpu.ps1 +++ b/ops/pipeline/build-win64-gpu.ps1 @@ -31,7 +31,7 @@ pip install --user -v "pip>=23" pip --version pip wheel --no-deps -v . --wheel-dir dist/ if ($LASTEXITCODE -ne 0) { throw "Last command failed" } -python ../ops/rename_whl.py ` +python ../ops/script/rename_whl.py ` --wheel-path (Get-ChildItem dist/*.whl | Select-Object -Expand FullName) ` --commit-hash $Env:GITHUB_SHA ` --platform-tag win_amd64 diff --git a/ops/pipeline/run-clang-tidy.sh b/ops/pipeline/run-clang-tidy.sh index 496b601bfdfb..a9ff039ee4ca 100755 --- a/ops/pipeline/run-clang-tidy.sh +++ b/ops/pipeline/run-clang-tidy.sh @@ -8,4 +8,4 @@ source ops/pipeline/enforce-ci.sh python3 ops/docker_run.py \ --container-id xgb-ci.clang_tidy \ - -- python3 ops/clang-tidy/run_clang_tidy.py --cuda-archs 75 + -- python3 ops/script/run_clang_tidy.py --cuda-archs 75 diff --git a/ops/pipeline/test-python-impl.sh b/ops/pipeline/test-python-impl.sh index bd71cfb06435..be1cb410c96c 100755 --- a/ops/pipeline/test-python-impl.sh +++ b/ops/pipeline/test-python-impl.sh @@ -1,6 +1,6 @@ #!/bin/bash -set -euo pipefail +set -eo pipefail if [[ "$#" -lt 1 ]] then @@ -10,7 +10,24 @@ fi suite="$1" -set -x +# Cannot set -u before Conda env activation +case "$suite" in + gpu|mgpu) + source activate gpu_test + ;; + cpu) + source activate linux_cpu_test + ;; + cpu-arm64) + source activate aarch64_test + ;; + *) + echo "Unrecognized argument: $suite" + exit 1 + ;; +esac + +set -xu export PYSPARK_DRIVER_PYTHON=$(which python) export PYSPARK_PYTHON=$(which python) @@ -21,13 +38,11 @@ pip install -v ./python-package/dist/*.whl case "$suite" in gpu) echo "-- Run Python tests, using a single GPU" - source activate gpu_test python -c 'from cupy.cuda import jitify; jitify._init_module()' pytest -v -s -rxXs --fulltrace --durations=0 -m 'not mgpu' tests/python-gpu ;; mgpu) echo "-- Run Python tests, using multiple GPUs" - source activate gpu_test python -c 'from cupy.cuda import jitify; jitify._init_module()' pytest -v -s -rxXs --fulltrace --durations=0 -m 'mgpu' tests/python-gpu pytest -v -s -rxXs --fulltrace --durations=0 -m 'mgpu' \ @@ -39,7 +54,6 @@ case "$suite" in ;; cpu) echo "-- Run Python tests (CPU)" - source activate linux_cpu_test export RAY_OBJECT_STORE_ALLOW_SLOW_STORAGE=1 pytest -v -s -rxXs --fulltrace --durations=0 tests/python pytest -v -s -rxXs --fulltrace --durations=0 tests/test_distributed/test_with_dask @@ -48,7 +62,6 @@ case "$suite" in ;; cpu-arm64) echo "-- Run Python tests (CPU, ARM64)" - source activate aarch64_test pytest -v -s -rxXs --fulltrace --durations=0 \ tests/python/test_basic.py tests/python/test_basic_models.py \ tests/python/test_model_compatibility.py diff --git a/ops/build_via_cmake.sh b/ops/script/build_via_cmake.sh similarity index 100% rename from ops/build_via_cmake.sh rename to ops/script/build_via_cmake.sh diff --git a/ops/change_scala_version.py b/ops/script/change_scala_version.py similarity index 100% rename from ops/change_scala_version.py rename to ops/script/change_scala_version.py diff --git a/ops/change_version.py b/ops/script/change_version.py similarity index 100% rename from ops/change_version.py rename to ops/script/change_version.py diff --git a/ops/format_wheel_meta.py b/ops/script/format_wheel_meta.py similarity index 100% rename from ops/format_wheel_meta.py rename to ops/script/format_wheel_meta.py diff --git a/ops/lint/lint_cmake.sh b/ops/script/lint_cmake.sh 
similarity index 100% rename from ops/lint/lint_cmake.sh rename to ops/script/lint_cmake.sh diff --git a/ops/lint/lint_cpp.py b/ops/script/lint_cpp.py similarity index 100% rename from ops/lint/lint_cpp.py rename to ops/script/lint_cpp.py diff --git a/ops/lint/lint_python.py b/ops/script/lint_python.py similarity index 95% rename from ops/lint/lint_python.py rename to ops/script/lint_python.py index c8d0f47709ab..29339d6e04d0 100644 --- a/ops/lint/lint_python.py +++ b/ops/script/lint_python.py @@ -16,8 +16,6 @@ class LintersPaths: BLACK = ( # core "python-package/", - # CI - "tests/ci_build/tidy.py", # tests "tests/python/test_config.py", "tests/python/test_callback.py", @@ -66,10 +64,11 @@ class LintersPaths: "demo/guide-python/update_process.py", "demo/aft_survival/aft_survival_viz_demo.py", # CI - "tests/ci_build/lint_python.py", - "tests/ci_build/test_r_package.py", - "tests/ci_build/test_utils.py", - "tests/ci_build/change_version.py", + "ops/script/run_clang_tidy.py", + "ops/script/lint_python.py", + "ops/script/test_r_package.py", + "ops/script/test_utils.py", + "ops/script/change_version.py", ) ISORT = ( @@ -79,7 +78,7 @@ class LintersPaths: "tests/test_distributed/", "tests/python/", "tests/python-gpu/", - "tests/ci_build/", + "ops/script/", # demo "demo/", # misc @@ -123,11 +122,11 @@ class LintersPaths: "demo/guide-python/learning_to_rank.py", "demo/aft_survival/aft_survival_viz_demo.py", # CI - "tests/ci_build/tidy.py", - "tests/ci_build/lint_python.py", - "tests/ci_build/test_r_package.py", - "tests/ci_build/test_utils.py", - "tests/ci_build/change_version.py", + "ops/script/run_clang_tidy.py", + "ops/script/lint_python.py", + "ops/script/test_r_package.py", + "ops/script/test_utils.py", + "ops/script/change_version.py", ) diff --git a/ops/lint/lint_r.R b/ops/script/lint_r.R similarity index 100% rename from ops/lint/lint_r.R rename to ops/script/lint_r.R diff --git a/ops/rename_whl.py b/ops/script/rename_whl.py similarity index 100% rename from ops/rename_whl.py rename to ops/script/rename_whl.py diff --git a/ops/clang-tidy/run_clang_tidy.py b/ops/script/run_clang_tidy.py similarity index 99% rename from ops/clang-tidy/run_clang_tidy.py rename to ops/script/run_clang_tidy.py index 24cb270393e8..aaeccdaf3718 100755 --- a/ops/clang-tidy/run_clang_tidy.py +++ b/ops/script/run_clang_tidy.py @@ -265,7 +265,7 @@ def test_tidy(args: argparse.Namespace) -> None: """ root_path = os.path.abspath(os.path.curdir) tidy_file = os.path.join(root_path, ".clang-tidy") - test_file_path = os.path.join(root_path, "ops", "clang-tidy", "test_tidy.cc") + test_file_path = os.path.join(root_path, "ops", "script", "test_tidy.cc") tidy_config = "--config-file=" + tidy_file if not args.tidy_version: diff --git a/ops/test_r_package.py b/ops/script/test_r_package.py similarity index 100% rename from ops/test_r_package.py rename to ops/script/test_r_package.py diff --git a/ops/clang-tidy/test_tidy.cc b/ops/script/test_tidy.cc similarity index 100% rename from ops/clang-tidy/test_tidy.cc rename to ops/script/test_tidy.cc diff --git a/ops/test_utils.py b/ops/script/test_utils.py similarity index 99% rename from ops/test_utils.py rename to ops/script/test_utils.py index f05fed4dc7f8..adcd05d5a124 100644 --- a/ops/test_utils.py +++ b/ops/script/test_utils.py @@ -75,7 +75,7 @@ def print_time() -> None: ROOT = os.path.normpath( os.path.join( - os.path.dirname(os.path.abspath(__file__)), os.path.pardir + os.path.dirname(os.path.abspath(__file__)), os.path.pardir, os.path.pardir ) ) R_PACKAGE = os.path.join(ROOT, 
"R-package") diff --git a/ops/update_rapids.sh b/ops/script/update_rapids.sh similarity index 100% rename from ops/update_rapids.sh rename to ops/script/update_rapids.sh diff --git a/ops/verify_link.sh b/ops/script/verify_link.sh similarity index 100% rename from ops/verify_link.sh rename to ops/script/verify_link.sh From d1d82bc827e54530e1d77745ea294717f8c3cc06 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Sat, 2 Nov 2024 13:49:28 +0800 Subject: [PATCH 30/86] Disable the host numa virtual memory allocator for now. (#10934) --- src/common/device_helpers.cu | 5 +++++ tests/cpp/common/test_device_vector.cu | 11 +++-------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/common/device_helpers.cu b/src/common/device_helpers.cu index 608a535cd8cb..01e81b16ee0b 100644 --- a/src/common/device_helpers.cu +++ b/src/common/device_helpers.cu @@ -7,6 +7,11 @@ namespace dh { PinnedMemory::PinnedMemory() { + // Use the `GrowOnlyPinnedMemoryImpl` as the only option for now. + // See https://github.com/dmlc/xgboost/issues/10933 + this->impl_.emplace(); + return; + #if defined(xgboost_IS_WIN) this->impl_.emplace(); #else diff --git a/tests/cpp/common/test_device_vector.cu b/tests/cpp/common/test_device_vector.cu index d7a03e41a64b..ec1a420bd349 100644 --- a/tests/cpp/common/test_device_vector.cu +++ b/tests/cpp/common/test_device_vector.cu @@ -31,6 +31,9 @@ class TestVirtualMem : public ::testing::TestWithParam { public: void Run() { auto type = this->GetParam(); + if (type == CU_MEM_LOCATION_TYPE_HOST_NUMA) { + GTEST_SKIP_("Host numa might require special system capabilities, skipping for now."); + } detail::GrowOnlyVirtualMemVec vec{type}; auto prop = xgboost::cudr::MakeAllocProp(type); auto gran = xgboost::cudr::GetAllocGranularity(&prop); @@ -110,14 +113,6 @@ TEST(TestVirtualMem, Version) { xgboost::curt::DrVersion(&major, &minor); LOG(INFO) << "Latest supported CUDA version by the driver:" << major << "." 
<< minor; PinnedMemory pinned; -#if defined(xgboost_IS_WIN) ASSERT_FALSE(pinned.IsVm()); -#else // defined(xgboost_IS_WIN) - if (major >= 12 && minor >= 5) { - ASSERT_TRUE(pinned.IsVm()); - } else { - ASSERT_FALSE(pinned.IsVm()); - } -#endif // defined(xgboost_IS_WIN) } } // namespace dh From ad75358f735b94c13e99fbe633517ad1bdcfedde Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Mon, 18 Nov 2024 15:15:47 -0800 Subject: [PATCH 31/86] Fix run flags to Python tests --- ops/pipeline/test-python.sh | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/ops/pipeline/test-python.sh b/ops/pipeline/test-python.sh index 047a6f411d6d..507deb37d9c0 100755 --- a/ops/pipeline/test-python.sh +++ b/ops/pipeline/test-python.sh @@ -13,5 +13,13 @@ fi suite="$1" container_id="$2" -python3 ops/docker_run.py --container-id "${container_id}" \ +if [[ "$suite" == "gpu" || "$suite" == "mgpu" ]] +then + gpu_option="--use-gpus" +else + gpu_option="" +fi + +python3 ops/docker_run.py --container-id "${container_id}" ${gpu_option} \ + --run-args='--shm-size=4g' \ -- bash ops/pipeline/test-python-impl.sh "${suite}" From 06d060b88f23973adc0a4d2d363060130491cf05 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Mon, 18 Nov 2024 16:26:08 -0800 Subject: [PATCH 32/86] Separate workflow for JVM packages --- .github/workflows/jvm_tests.yml | 161 ++++++++++++++++++++++++++++++++ .github/workflows/main.yml | 123 +----------------------- 2 files changed, 162 insertions(+), 122 deletions(-) create mode 100644 .github/workflows/jvm_tests.yml diff --git a/.github/workflows/jvm_tests.yml b/.github/workflows/jvm_tests.yml new file mode 100644 index 000000000000..5894a4bead7d --- /dev/null +++ b/.github/workflows/jvm_tests.yml @@ -0,0 +1,161 @@ +name: XGBoost CI (JVM packages) + +on: [push, pull_request] + +permissions: + contents: read # to fetch code (actions/checkout) + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +env: + BRANCH_NAME: >- + ${{ github.event.pull_request.number && 'PR-' }}${{ github.event.pull_request.number || github.ref_name }} + USE_DOCKER_CACHE: 1 + +jobs: + build-containers: + name: Build CI containers + runs-on: + - runs-on=${{ github.run_id }} + - runner=${{ matrix.runner }} + strategy: + matrix: + container_id: + - xgb-ci.manylinux2014_x86_64 + - xgb-ci.jvm + - xgb-ci.jvm_gpu_build + runner: [linux-amd64-cpu] + include: + - container_id: xgb-ci.manylinux2014_aarch64 + runner: linux-arm64-cpu + steps: + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4 + with: + submodules: "true" + - name: Build ${{ matrix.container_id }} + run: bash ops/docker_build.sh + env: + CONTAINER_ID: ${{ matrix.container_id }} + + build-jvm-manylinux2014: + name: Build libxgboost4j.so targeting glibc 2.17 + needs: build-containers + runs-on: + - runs-on=${{ github.run_id }} + - runner=${{ matrix.runner }} + strategy: + matrix: + include: + - arch: aarch64 + runner: linux-arm64-cpu + - arch: x86_64 + runner: linux-amd64-cpu + steps: + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4 + with: + submodules: "true" + - name: Fetch container from cache + run: bash ops/docker_build.sh + env: + CONTAINER_ID: xgb-ci.manylinux2014_${{ matrix.arch }} + - run: bash ops/pipeline/build-jvm-manylinux2014.sh ${{ matrix.arch }} + + build-jvm-gpu: + name: Build libxgboost4j.so with 
CUDA + needs: build-containers + runs-on: + - runs-on=${{ github.run_id }} + - runner=linux-amd64-cpu + steps: + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4 + with: + submodules: "true" + - name: Fetch container from cache + run: bash ops/docker_build.sh + env: + CONTAINER_ID: xgb-ci.jvm_gpu_build + - run: bash ops/pipeline/build-jvm-gpu.sh + - name: Stash files + run: bash ops/stash_artifacts.sh lib/libxgboost4j.so + env: + COMMAND: upload + KEY: build-jvm-gpu + + build-jvm-docs: + name: Build docs for JVM packages + needs: [build-jvm-gpu] + runs-on: + - runs-on=${{ github.run_id }} + - runner=linux-amd64-cpu + steps: + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4 + with: + submodules: "true" + - name: Fetch container from cache + run: bash ops/docker_build.sh + env: + CONTAINER_ID: xgb-ci.jvm_gpu_build + - name: Unstash files + run: bash ops/stash_artifacts.sh lib/libxgboost4j.so + env: + COMMAND: download + KEY: build-jvm-gpu + - run: bash ops/pipeline/build-jvm-doc.sh + + build-test-jvm-packages: + name: Build and test JVM packages + needs: build-containers + runs-on: + - runs-on=${{ github.run_id }} + - runner=linux-amd64-cpu + steps: + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4 + with: + submodules: "true" + - name: Fetch container from cache + run: bash ops/docker_build.sh + env: + CONTAINER_ID: xgb-ci.jvm + - name: Build and test JVM packages (Scala 2.12) + run: bash ops/pipeline/build-test-jvm-packages.sh + env: + SCALA_VERSION: 2.12 + - name: Build and test JVM packages (Scala 2.13) + run: bash ops/pipeline/build-test-jvm-packages.sh + env: + SCALA_VERSION: 2.13 + + test-jvm-packages-gpu: + name: Test JVM packages with CUDA + needs: [build-jvm-gpu] + runs-on: + - runs-on=${{ github.run_id }} + - runner=linux-amd64-mgpu + steps: + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4 + with: + submodules: "true" + - name: Fetch container from cache + run: bash ops/docker_build.sh + env: + CONTAINER_ID: xgb-ci.jvm_gpu_build + - name: Unstash files + run: bash ops/stash_artifacts.sh lib/libxgboost4j.so + env: + COMMAND: download + KEY: build-jvm-gpu + - run: bash ops/pipeline/test-jvm-gpu.sh diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 47e195267d49..80e6db40cfb6 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -31,8 +31,6 @@ jobs: - xgb-ci.clang_tidy - xgb-ci.manylinux_2_28_x86_64 - xgb-ci.manylinux2014_x86_64 - - xgb-ci.jvm - - xgb-ci.jvm_gpu_build runner: [linux-amd64-cpu] include: - container_id: xgb-ci.manylinux2014_aarch64 @@ -213,102 +211,6 @@ jobs: CONTAINER_ID: xgb-ci.gpu_build_r_rockylinux8 - run: bash ops/pipeline/build-gpu-rpkg.sh - build-jvm-manylinux2014: - name: Build libxgboost4j.so targeting glibc 2.17 - needs: build-containers - runs-on: - - runs-on=${{ github.run_id }} - - runner=${{ matrix.runner }} - strategy: - matrix: - include: - - arch: aarch64 - runner: linux-arm64-cpu - - arch: x86_64 - runner: linux-amd64-cpu - steps: - # Restart Docker daemon so that it recognizes the ephemeral disks - - run: sudo systemctl restart docker - - uses: actions/checkout@v4 - with: - submodules: "true" - - name: Fetch container from cache - run: bash ops/docker_build.sh - 
env: - CONTAINER_ID: xgb-ci.manylinux2014_${{ matrix.arch }} - - run: bash ops/pipeline/build-jvm-manylinux2014.sh ${{ matrix.arch }} - - build-jvm-gpu: - name: Build libxgboost4j.so with CUDA - needs: build-containers - runs-on: - - runs-on=${{ github.run_id }} - - runner=linux-amd64-cpu - steps: - # Restart Docker daemon so that it recognizes the ephemeral disks - - run: sudo systemctl restart docker - - uses: actions/checkout@v4 - with: - submodules: "true" - - name: Fetch container from cache - run: bash ops/docker_build.sh - env: - CONTAINER_ID: xgb-ci.jvm_gpu_build - - run: bash ops/pipeline/build-jvm-gpu.sh - - name: Stash files - run: bash ops/stash_artifacts.sh lib/libxgboost4j.so - env: - COMMAND: upload - KEY: build-jvm-gpu - - build-jvm-docs: - name: Build docs for JVM packages - needs: [build-jvm-gpu] - runs-on: - - runs-on=${{ github.run_id }} - - runner=linux-amd64-cpu - steps: - # Restart Docker daemon so that it recognizes the ephemeral disks - - run: sudo systemctl restart docker - - uses: actions/checkout@v4 - with: - submodules: "true" - - name: Fetch container from cache - run: bash ops/docker_build.sh - env: - CONTAINER_ID: xgb-ci.jvm_gpu_build - - name: Unstash files - run: bash ops/stash_artifacts.sh lib/libxgboost4j.so - env: - COMMAND: download - KEY: build-jvm-gpu - - run: bash ops/pipeline/build-jvm-doc.sh - - build-test-jvm-packages: - name: Build and test JVM packages - needs: build-containers - runs-on: - - runs-on=${{ github.run_id }} - - runner=linux-amd64-cpu - steps: - # Restart Docker daemon so that it recognizes the ephemeral disks - - run: sudo systemctl restart docker - - uses: actions/checkout@v4 - with: - submodules: "true" - - name: Fetch container from cache - run: bash ops/docker_build.sh - env: - CONTAINER_ID: xgb-ci.jvm - - name: Build and test JVM packages (Scala 2.12) - run: bash ops/pipeline/build-test-jvm-packages.sh - env: - SCALA_VERSION: 2.12 - - name: Build and test JVM packages (Scala 2.13) - run: bash ops/pipeline/build-test-jvm-packages.sh - env: - SCALA_VERSION: 2.13 - test-cpp-gpu: name: Run Google Tests with GPU(s) needs: [build-cuda, build-cuda-with-rmm] @@ -348,7 +250,7 @@ jobs: test-python: name: Run Python tests - needs: [build-cuda] + needs: [build-cuda, build-cpu-arm64] runs-on: - runs-on=${{ github.run_id }} - runner=${{ matrix.runner }} @@ -404,26 +306,3 @@ jobs: KEY: ${{ matrix.artifact_from }} - name: Run Python tests, ${{ matrix.description }} run: bash ops/pipeline/test-python.sh ${{ matrix.suite }} ${{ matrix.container }} - - test-jvm-packages-gpu: - name: Test JVM packages with CUDA - needs: [build-jvm-gpu] - runs-on: - - runs-on=${{ github.run_id }} - - runner=linux-amd64-mgpu - steps: - # Restart Docker daemon so that it recognizes the ephemeral disks - - run: sudo systemctl restart docker - - uses: actions/checkout@v4 - with: - submodules: "true" - - name: Fetch container from cache - run: bash ops/docker_build.sh - env: - CONTAINER_ID: xgb-ci.jvm_gpu_build - - name: Unstash files - run: bash ops/stash_artifacts.sh lib/libxgboost4j.so - env: - COMMAND: download - KEY: build-jvm-gpu - - run: bash ops/pipeline/test-jvm-gpu.sh From 588dd67f9b1dc9671a69bd63dc9178201be7a448 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Mon, 18 Nov 2024 16:29:44 -0800 Subject: [PATCH 33/86] Rename workflow files --- .github/workflows/{jvm_tests.yml => jvm_tests_runs_on.yml} | 0 .github/workflows/{main.yml => main_runs_on.yml} | 0 .github/workflows/{windows.yml => windows_runs_on.yml} | 0 3 files changed, 0 insertions(+), 0 
deletions(-) rename .github/workflows/{jvm_tests.yml => jvm_tests_runs_on.yml} (100%) rename .github/workflows/{main.yml => main_runs_on.yml} (100%) rename .github/workflows/{windows.yml => windows_runs_on.yml} (100%) diff --git a/.github/workflows/jvm_tests.yml b/.github/workflows/jvm_tests_runs_on.yml similarity index 100% rename from .github/workflows/jvm_tests.yml rename to .github/workflows/jvm_tests_runs_on.yml diff --git a/.github/workflows/main.yml b/.github/workflows/main_runs_on.yml similarity index 100% rename from .github/workflows/main.yml rename to .github/workflows/main_runs_on.yml diff --git a/.github/workflows/windows.yml b/.github/workflows/windows_runs_on.yml similarity index 100% rename from .github/workflows/windows.yml rename to .github/workflows/windows_runs_on.yml From cb8f63f4991c09ee1c13fd1a2c25ee081191352d Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Mon, 18 Nov 2024 17:52:21 -0800 Subject: [PATCH 34/86] Consolidate workflow defs --- .github/workflows/i386.yml | 6 +- .github/workflows/jvm_tests.yml | 238 ++++++++-- .github/workflows/jvm_tests_runs_on.yml | 161 ------- .github/workflows/lint.yml | 144 ++++++ .github/workflows/macos.yml | 24 - .github/workflows/main.yml | 431 +++++++++++------- .github/workflows/main_runs_on.yml | 308 ------------- .github/workflows/misc.yml | 133 ++++++ .github/workflows/python_tests.yml | 204 +-------- ...hon_wheels.yml => python_wheels_macos.yml} | 28 +- .github/workflows/r_tests.yml | 84 ++-- .github/workflows/scorecards.yml | 2 +- .github/workflows/sycl_tests.yml | 94 ++++ .github/workflows/update_rapids.yml | 2 +- .../{windows_runs_on.yml => windows.yml} | 7 +- ops/{docker => }/conda_env/aarch64_test.yml | 0 ops/{docker => }/conda_env/cpp_test.yml | 0 ops/{docker => }/conda_env/jvm_tests.yml | 0 ops/{docker => }/conda_env/linux_cpu_test.yml | 0 .../conda_env/linux_sycl_test.yml | 0 ops/{docker => }/conda_env/macos_cpu_test.yml | 0 ops/{docker => }/conda_env/python_lint.yml | 0 ops/{docker => }/conda_env/sdist_test.yml | 0 ops/{docker => }/conda_env/win64_test.yml | 0 ops/docker/dockerfile/Dockerfile.aarch64 | 2 +- ops/docker/dockerfile/Dockerfile.clang_tidy | 2 +- ops/docker/dockerfile/Dockerfile.cpu | 2 +- ops/docker/dockerfile/Dockerfile.gpu | 2 +- .../Dockerfile.gpu_build_r_rockylinux8 | 2 +- ops/docker/dockerfile/Dockerfile.jvm | 2 +- .../dockerfile/Dockerfile.jvm_gpu_build | 2 +- .../Dockerfile.manylinux2014_aarch64 | 2 +- .../Dockerfile.manylinux2014_x86_64 | 2 +- .../Dockerfile.manylinux_2_28_x86_64 | 2 +- ops/docker_build.py | 6 +- ops/docker_run.py | 4 +- ...m1.sh => build-jvm-macos-apple-silicon.sh} | 8 +- ops/pipeline/build-jvm-macos-intel.sh | 44 ++ ops/pipeline/build-python-wheels-macos.sh | 1 - ops/pipeline/test-win64-gpu.ps1 | 2 +- ops/script/build_via_cmake.sh | 11 +- ops/script/lint_cmake.sh | 2 +- ops/script/run_clang_tidy.py | 4 +- .../test_gpu_with_dask/test_gpu_with_dask.py | 2 +- 44 files changed, 970 insertions(+), 1000 deletions(-) delete mode 100644 .github/workflows/jvm_tests_runs_on.yml create mode 100644 .github/workflows/lint.yml delete mode 100644 .github/workflows/macos.yml delete mode 100644 .github/workflows/main_runs_on.yml create mode 100644 .github/workflows/misc.yml rename .github/workflows/{python_wheels.yml => python_wheels_macos.yml} (55%) create mode 100644 .github/workflows/sycl_tests.yml rename .github/workflows/{windows_runs_on.yml => windows.yml} (93%) rename ops/{docker => }/conda_env/aarch64_test.yml (100%) rename ops/{docker => }/conda_env/cpp_test.yml (100%) rename 
ops/{docker => }/conda_env/jvm_tests.yml (100%) rename ops/{docker => }/conda_env/linux_cpu_test.yml (100%) rename ops/{docker => }/conda_env/linux_sycl_test.yml (100%) rename ops/{docker => }/conda_env/macos_cpu_test.yml (100%) rename ops/{docker => }/conda_env/python_lint.yml (100%) rename ops/{docker => }/conda_env/sdist_test.yml (100%) rename ops/{docker => }/conda_env/win64_test.yml (100%) rename ops/pipeline/{build-jvm-macos-m1.sh => build-jvm-macos-apple-silicon.sh} (85%) create mode 100755 ops/pipeline/build-jvm-macos-intel.sh mode change 100644 => 100755 ops/pipeline/build-python-wheels-macos.sh mode change 100644 => 100755 ops/script/lint_cmake.sh diff --git a/.github/workflows/i386.yml b/.github/workflows/i386.yml index aec7e9d31087..aa71147e2581 100644 --- a/.github/workflows/i386.yml +++ b/.github/workflows/i386.yml @@ -19,7 +19,7 @@ jobs: ports: - 5000:5000 steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 + - uses: actions/checkout@v4.2.2 with: submodules: 'true' - name: Set up Docker Buildx @@ -30,7 +30,7 @@ jobs: uses: docker/build-push-action@v6 with: context: . - file: tests/ci_build/Dockerfile.i386 + file: ops/docker/dockerfile/Dockerfile.i386 push: true tags: localhost:5000/xgboost/build-32bit:latest cache-from: type=gha @@ -40,4 +40,4 @@ jobs: docker run --rm -v $PWD:/workspace -w /workspace \ -e CXXFLAGS='-Wno-error=overloaded-virtual -Wno-error=maybe-uninitialized -Wno-error=redundant-move' \ localhost:5000/xgboost/build-32bit:latest \ - tests/ci_build/build_via_cmake.sh + bash ops/script/build_via_cmake.sh diff --git a/.github/workflows/jvm_tests.yml b/.github/workflows/jvm_tests.yml index dcbd9de55b50..f9385fa4acaf 100644 --- a/.github/workflows/jvm_tests.yml +++ b/.github/workflows/jvm_tests.yml @@ -1,44 +1,193 @@ -name: XGBoost-JVM-Tests +name: XGBoost CI (JVM packages) on: [push, pull_request] permissions: - contents: read # to fetch code (actions/checkout) + contents: read # to fetch code (actions/checkout) concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} cancel-in-progress: true +env: + BRANCH_NAME: >- + ${{ github.event.pull_request.number && 'PR-' }}${{ github.event.pull_request.number || github.ref_name }} + USE_DOCKER_CACHE: 1 + jobs: - test-with-jvm: - name: Test JVM on OS ${{ matrix.os }} + build-containers: + name: Build CI containers (${{ matrix.container_id }}) + runs-on: + - runs-on=${{ github.run_id }} + - runner=${{ matrix.runner }} + strategy: + matrix: + container_id: + - xgb-ci.manylinux2014_x86_64 + - xgb-ci.jvm + - xgb-ci.jvm_gpu_build + runner: [linux-amd64-cpu] + include: + - container_id: xgb-ci.manylinux2014_aarch64 + runner: linux-arm64-cpu + steps: + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4.2.2 + with: + submodules: "true" + - name: Build ${{ matrix.container_id }} + run: bash ops/docker_build.sh + env: + CONTAINER_ID: ${{ matrix.container_id }} + + build-jvm-manylinux2014: + name: >- + Build libxgboost4j.so targeting glibc 2.17 + (arch ${{ matrix.arch }}, runner ${{ matrix.runner }}) + needs: build-containers + runs-on: + - runs-on=${{ github.run_id }} + - runner=${{ matrix.runner }} + strategy: + fail-fast: false + matrix: + include: + - arch: aarch64 + runner: linux-arm64-cpu + - arch: x86_64 + runner: linux-amd64-cpu + steps: + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: 
actions/checkout@v4.2.2 + with: + submodules: "true" + - name: Fetch container from cache + run: bash ops/docker_build.sh + env: + CONTAINER_ID: xgb-ci.manylinux2014_${{ matrix.arch }} + - run: bash ops/pipeline/build-jvm-manylinux2014.sh ${{ matrix.arch }} + + build-jvm-gpu: + name: Build libxgboost4j.so with CUDA + needs: build-containers + runs-on: + - runs-on=${{ github.run_id }} + - runner=linux-amd64-cpu + steps: + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4.2.2 + with: + submodules: "true" + - name: Fetch container from cache + run: bash ops/docker_build.sh + env: + CONTAINER_ID: xgb-ci.jvm_gpu_build + - run: bash ops/pipeline/build-jvm-gpu.sh + - name: Stash files + run: bash ops/stash_artifacts.sh lib/libxgboost4j.so + env: + COMMAND: upload + KEY: build-jvm-gpu + + build-jvm-mac: + name: "Build libxgboost4j.dylib for ${{ matrix.description }}" + runs-on: ${{ matrix.runner }} + strategy: + fail-fast: false + matrix: + include: + - description: "MacOS (Apple Silicon)" + script: ops/pipeline/build-jvm-macos-apple-silicon.sh + runner: macos-14 + - description: "MacOS (Intel)" + script: ops/pipeline/build-jvm-macos-intel.sh + runner: macos-13 + steps: + - uses: actions/checkout@v4.2.2 + with: + submodules: "true" + - run: bash ${{ matrix.script }} + + build-jvm-docs: + name: Build docs for JVM packages + needs: [build-jvm-gpu] + runs-on: + - runs-on=${{ github.run_id }} + - runner=linux-amd64-cpu + steps: + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4.2.2 + with: + submodules: "true" + - name: Fetch container from cache + run: bash ops/docker_build.sh + env: + CONTAINER_ID: xgb-ci.jvm_gpu_build + - name: Unstash files + run: bash ops/stash_artifacts.sh lib/libxgboost4j.so + env: + COMMAND: download + KEY: build-jvm-gpu + - run: bash ops/pipeline/build-jvm-doc.sh + + build-test-jvm-packages: + name: Build and test JVM packages (Linux) + needs: build-containers + runs-on: + - runs-on=${{ github.run_id }} + - runner=linux-amd64-cpu + steps: + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4.2.2 + with: + submodules: "true" + - name: Fetch container from cache + run: bash ops/docker_build.sh + env: + CONTAINER_ID: xgb-ci.jvm + - name: Build and test JVM packages (Scala 2.12) + run: bash ops/pipeline/build-test-jvm-packages.sh + env: + SCALA_VERSION: 2.12 + - name: Build and test JVM packages (Scala 2.13) + run: bash ops/pipeline/build-test-jvm-packages.sh + env: + SCALA_VERSION: 2.13 + + build-test-jvm-packages-other-os: + name: Build and test JVM packages (${{ matrix.os }}) timeout-minutes: 30 runs-on: ${{ matrix.os }} strategy: fail-fast: false matrix: - os: [windows-latest, ubuntu-latest, macos-13] + os: [windows-latest, macos-13] steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 + - uses: actions/checkout@v4.2.2 with: submodules: 'true' - - uses: actions/setup-java@b36c23c0d998641eff861008f374ee103c25ac73 # v4.4.0 + - uses: actions/setup-java@v4.5.0 with: distribution: 'temurin' java-version: '8' - - uses: conda-incubator/setup-miniconda@a4260408e20b96e80095f42ff7f1a15b27dd94ca # v3.0.4 + - uses: conda-incubator/setup-miniconda@v3.1.0 with: miniforge-variant: Miniforge3 miniforge-version: latest activate-environment: jvm_tests - environment-file: tests/ci_build/conda_env/jvm_tests.yml + 
environment-file: ops/conda_env/jvm_tests.yml use-mamba: true - name: Cache Maven packages - uses: actions/cache@6849a6489940f00c2f30c0fb92c6274307ccb58a # v4.1.2 + uses: actions/cache@v4.1.2 with: path: ~/.m2 key: ${{ runner.os }}-m2-${{ hashFiles('./jvm-packages/pom.xml') }} @@ -49,52 +198,41 @@ jobs: cd jvm-packages mvn test -B -pl :xgboost4j_2.12 - - name: Test XGBoost4J (Core, Spark, Examples) - run: | - rm -rfv build/ - cd jvm-packages - mvn -B test - if: matrix.os == 'ubuntu-latest' # Distributed training doesn't work on Windows - - - name: Extract branch name - shell: bash - run: | - echo "branch=${GITHUB_REF#refs/heads/}" >> "$GITHUB_OUTPUT" - id: extract_branch - if: | - (github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')) && - (matrix.os == 'windows-latest' || matrix.os == 'macos-13') - - name: Publish artifact xgboost4j.dll to S3 run: | cd lib/ Rename-Item -Path xgboost4j.dll -NewName xgboost4j_${{ github.sha }}.dll dir - python -m awscli s3 cp xgboost4j_${{ github.sha }}.dll s3://xgboost-nightly-builds/${{ steps.extract_branch.outputs.branch }}/libxgboost4j/ --acl public-read --region us-west-2 - if: | - (github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')) && - matrix.os == 'windows-latest' + python -m awscli s3 cp xgboost4j_${{ github.sha }}.dll ` + s3://xgboost-nightly-builds/${{ env.BRANCH_NAME }}/libxgboost4j/ ` + --acl public-read --region us-west-2 + if: matrix.os == 'windows-latest' + # if: | + # (github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')) && + # matrix.os == 'windows-latest' env: AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID_IAM_S3_UPLOADER }} AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY_IAM_S3_UPLOADER }} - - name: Publish artifact libxgboost4j.dylib to S3 - shell: bash -l {0} - run: | - cd lib/ - mv -v libxgboost4j.dylib libxgboost4j_${{ github.sha }}.dylib - ls - python -m awscli s3 cp libxgboost4j_${{ github.sha }}.dylib s3://xgboost-nightly-builds/${{ steps.extract_branch.outputs.branch }}/libxgboost4j/ --acl public-read --region us-west-2 - if: | - (github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')) && - matrix.os == 'macos-13' - env: - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID_IAM_S3_UPLOADER }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY_IAM_S3_UPLOADER }} - - - name: Build and Test XGBoost4J with scala 2.13 - run: | - rm -rfv build/ - cd jvm-packages - mvn -B clean install test -Pdefault,scala-2.13 - if: matrix.os == 'ubuntu-latest' # Distributed training doesn't work on Windows + test-jvm-packages-gpu: + name: Test JVM packages with CUDA + needs: [build-jvm-gpu] + runs-on: + - runs-on=${{ github.run_id }} + - runner=linux-amd64-mgpu + steps: + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4.2.2 + with: + submodules: "true" + - name: Fetch container from cache + run: bash ops/docker_build.sh + env: + CONTAINER_ID: xgb-ci.jvm_gpu_build + - name: Unstash files + run: bash ops/stash_artifacts.sh lib/libxgboost4j.so + env: + COMMAND: download + KEY: build-jvm-gpu + - run: bash ops/pipeline/test-jvm-gpu.sh diff --git a/.github/workflows/jvm_tests_runs_on.yml b/.github/workflows/jvm_tests_runs_on.yml deleted file mode 100644 index 5894a4bead7d..000000000000 --- a/.github/workflows/jvm_tests_runs_on.yml +++ /dev/null @@ -1,161 +0,0 @@ -name: XGBoost CI (JVM packages) - -on: [push, pull_request] - -permissions: - 
contents: read # to fetch code (actions/checkout) - -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: true - -env: - BRANCH_NAME: >- - ${{ github.event.pull_request.number && 'PR-' }}${{ github.event.pull_request.number || github.ref_name }} - USE_DOCKER_CACHE: 1 - -jobs: - build-containers: - name: Build CI containers - runs-on: - - runs-on=${{ github.run_id }} - - runner=${{ matrix.runner }} - strategy: - matrix: - container_id: - - xgb-ci.manylinux2014_x86_64 - - xgb-ci.jvm - - xgb-ci.jvm_gpu_build - runner: [linux-amd64-cpu] - include: - - container_id: xgb-ci.manylinux2014_aarch64 - runner: linux-arm64-cpu - steps: - # Restart Docker daemon so that it recognizes the ephemeral disks - - run: sudo systemctl restart docker - - uses: actions/checkout@v4 - with: - submodules: "true" - - name: Build ${{ matrix.container_id }} - run: bash ops/docker_build.sh - env: - CONTAINER_ID: ${{ matrix.container_id }} - - build-jvm-manylinux2014: - name: Build libxgboost4j.so targeting glibc 2.17 - needs: build-containers - runs-on: - - runs-on=${{ github.run_id }} - - runner=${{ matrix.runner }} - strategy: - matrix: - include: - - arch: aarch64 - runner: linux-arm64-cpu - - arch: x86_64 - runner: linux-amd64-cpu - steps: - # Restart Docker daemon so that it recognizes the ephemeral disks - - run: sudo systemctl restart docker - - uses: actions/checkout@v4 - with: - submodules: "true" - - name: Fetch container from cache - run: bash ops/docker_build.sh - env: - CONTAINER_ID: xgb-ci.manylinux2014_${{ matrix.arch }} - - run: bash ops/pipeline/build-jvm-manylinux2014.sh ${{ matrix.arch }} - - build-jvm-gpu: - name: Build libxgboost4j.so with CUDA - needs: build-containers - runs-on: - - runs-on=${{ github.run_id }} - - runner=linux-amd64-cpu - steps: - # Restart Docker daemon so that it recognizes the ephemeral disks - - run: sudo systemctl restart docker - - uses: actions/checkout@v4 - with: - submodules: "true" - - name: Fetch container from cache - run: bash ops/docker_build.sh - env: - CONTAINER_ID: xgb-ci.jvm_gpu_build - - run: bash ops/pipeline/build-jvm-gpu.sh - - name: Stash files - run: bash ops/stash_artifacts.sh lib/libxgboost4j.so - env: - COMMAND: upload - KEY: build-jvm-gpu - - build-jvm-docs: - name: Build docs for JVM packages - needs: [build-jvm-gpu] - runs-on: - - runs-on=${{ github.run_id }} - - runner=linux-amd64-cpu - steps: - # Restart Docker daemon so that it recognizes the ephemeral disks - - run: sudo systemctl restart docker - - uses: actions/checkout@v4 - with: - submodules: "true" - - name: Fetch container from cache - run: bash ops/docker_build.sh - env: - CONTAINER_ID: xgb-ci.jvm_gpu_build - - name: Unstash files - run: bash ops/stash_artifacts.sh lib/libxgboost4j.so - env: - COMMAND: download - KEY: build-jvm-gpu - - run: bash ops/pipeline/build-jvm-doc.sh - - build-test-jvm-packages: - name: Build and test JVM packages - needs: build-containers - runs-on: - - runs-on=${{ github.run_id }} - - runner=linux-amd64-cpu - steps: - # Restart Docker daemon so that it recognizes the ephemeral disks - - run: sudo systemctl restart docker - - uses: actions/checkout@v4 - with: - submodules: "true" - - name: Fetch container from cache - run: bash ops/docker_build.sh - env: - CONTAINER_ID: xgb-ci.jvm - - name: Build and test JVM packages (Scala 2.12) - run: bash ops/pipeline/build-test-jvm-packages.sh - env: - SCALA_VERSION: 2.12 - - name: Build and test JVM packages (Scala 2.13) - run: bash 
ops/pipeline/build-test-jvm-packages.sh - env: - SCALA_VERSION: 2.13 - - test-jvm-packages-gpu: - name: Test JVM packages with CUDA - needs: [build-jvm-gpu] - runs-on: - - runs-on=${{ github.run_id }} - - runner=linux-amd64-mgpu - steps: - # Restart Docker daemon so that it recognizes the ephemeral disks - - run: sudo systemctl restart docker - - uses: actions/checkout@v4 - with: - submodules: "true" - - name: Fetch container from cache - run: bash ops/docker_build.sh - env: - CONTAINER_ID: xgb-ci.jvm_gpu_build - - name: Unstash files - run: bash ops/stash_artifacts.sh lib/libxgboost4j.so - env: - COMMAND: download - KEY: build-jvm-gpu - - run: bash ops/pipeline/test-jvm-gpu.sh diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml new file mode 100644 index 000000000000..caceb3e3893b --- /dev/null +++ b/.github/workflows/lint.yml @@ -0,0 +1,144 @@ +name: XGBoost CI (Lint) + +on: [push, pull_request] + +permissions: + contents: read # to fetch code (actions/checkout) + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +env: + BRANCH_NAME: >- + ${{ github.event.pull_request.number && 'PR-' }}${{ github.event.pull_request.number || github.ref_name }} + +jobs: + build-containers: + name: Build CI containers (${{ matrix.container_id }}) + runs-on: + - runs-on=${{ github.run_id }} + - runner=${{ matrix.runner }} + strategy: + fail-fast: false + matrix: + include: + - container_id: xgb-ci.clang_tidy + runner: linux-amd64-cpu + steps: + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4.2.2 + with: + submodules: "true" + - name: Build ${{ matrix.container_id }} + run: bash ops/docker_build.sh + env: + CONTAINER_ID: ${{ matrix.container_id }} + + clang-tidy: + name: Run clang-tidy + needs: build-containers + runs-on: + - runs-on=${{ github.run_id }} + - runner=linux-amd64-cpu + steps: + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4.2.2 + with: + submodules: "true" + - name: Fetch container from cache + run: bash ops/docker_build.sh + env: + CONTAINER_ID: xgb-ci.clang_tidy + - run: bash ops/pipeline/run-clang-tidy.sh + + python-mypy-lint: + runs-on: ubuntu-latest + name: Type and format checks for the Python package + steps: + - uses: actions/checkout@v4.2.2 + with: + submodules: 'true' + - uses: conda-incubator/setup-miniconda@v3.1.0 + with: + miniforge-variant: Miniforge3 + miniforge-version: latest + activate-environment: python_lint + environment-file: ops/conda_env/python_lint.yml + use-mamba: true + - name: Display Conda env + shell: bash -el {0} + run: | + conda info + conda list + - name: Run mypy + shell: bash -el {0} + run: | + python ops/script/lint_python.py --format=0 --type-check=1 --pylint=0 + - name: Run formatter + shell: bash -el {0} + run: | + python ops/script/lint_python.py --format=1 --type-check=0 --pylint=0 + - name: Run pylint + shell: bash -el {0} + run: | + python ops/script/lint_python.py --format=0 --type-check=0 --pylint=1 + + cpp-lint: + runs-on: ubuntu-latest + name: Code linting for C++ + steps: + - uses: actions/checkout@v4.2.2 + with: + submodules: 'true' + - uses: actions/setup-python@v5.3.0 + with: + python-version: "3.10" + architecture: 'x64' + - name: Install Python packages + run: | + python -m pip install wheel setuptools cmakelint cpplint==1.6.1 pylint + - name: Run lint + run: | + python3 
ops/script/lint_cpp.py + bash ops/script/lint_cmake.sh + + lintr: + runs-on: ${{ matrix.os }} + name: Run R linters on OS ${{ matrix.os }}, R ${{ matrix.r }} + strategy: + fail-fast: false + matrix: + include: + - os: ubuntu-latest + r: "release" + env: + R_REMOTES_NO_ERRORS_FROM_WARNINGS: true + + steps: + - uses: actions/checkout@v4.2.2 + with: + submodules: 'true' + + - uses: r-lib/actions/setup-r@v2.11.0 + with: + r-version: ${{ matrix.r }} + + - name: Cache R packages + uses: actions/cache@v4.1.2 + with: + path: ${{ env.R_LIBS_USER }} + key: ${{ runner.os }}-r-${{ matrix.r }}-7-${{ hashFiles('R-package/DESCRIPTION') }} + restore-keys: ${{ runner.os }}-r-${{ matrix.r }}-7-${{ hashFiles('R-package/DESCRIPTION') }} + + - name: Install dependencies + shell: Rscript {0} + run: | + source("./R-package/tests/helper_scripts/install_deps.R") + + - name: Run lintr + run: | + MAKEFLAGS="-j$(nproc)" R CMD INSTALL R-package/ + Rscript ops/script/lint_r.R $(pwd) diff --git a/.github/workflows/macos.yml b/.github/workflows/macos.yml deleted file mode 100644 index 2bb3e1aba46c..000000000000 --- a/.github/workflows/macos.yml +++ /dev/null @@ -1,24 +0,0 @@ -name: Nextgen XGBoost CI, MacOS - -on: [push, pull_request] - -permissions: - contents: read # to fetch code (actions/checkout) - -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: true - -env: - BRANCH_NAME: >- - ${{ github.event.pull_request.number && 'PR-' }}${{ github.event.pull_request.number || github.ref_name }} - -jobs: - mac-m1-jvm: - name: "Build libxgboost4j.dylib for MacOS M1" - runs-on: macos-14 - steps: - - uses: actions/checkout@v4 - with: - submodules: "true" - - run: bash ops/pipeline/build-jvm-macos-m1.sh diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 3c0a67b4f463..77208a146443 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -1,193 +1,294 @@ -# This is a basic workflow to help you get started with Actions +name: XGBoost CI -name: XGBoost-CI - -# Controls when the action will run. 
Triggers the workflow on push or pull request -# events but only for the master branch on: [push, pull_request] permissions: - contents: read # to fetch code (actions/checkout) + contents: read # to fetch code (actions/checkout) concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} cancel-in-progress: true -# A workflow run is made up of one or more jobs that can run sequentially or in parallel +env: + BRANCH_NAME: >- + ${{ github.event.pull_request.number && 'PR-' }}${{ github.event.pull_request.number || github.ref_name }} + USE_DOCKER_CACHE: 1 + jobs: - gtest-cpu: - name: Test Google C++ test (CPU) - runs-on: ${{ matrix.os }} + build-containers: + name: Build CI containers (${{ matrix.container_id }}) + runs-on: + - runs-on=${{ github.run_id }} + - runner=${{ matrix.runner }} strategy: - fail-fast: false matrix: - os: [macos-12] + container_id: + - xgb-ci.gpu_build_rockylinux8 + - xgb-ci.gpu_build_r_rockylinux8 + - xgb-ci.gpu + - xgb-ci.gpu_dev_ver + - xgb-ci.cpu + - xgb-ci.manylinux_2_28_x86_64 + - xgb-ci.manylinux2014_x86_64 + runner: [linux-amd64-cpu] + include: + - container_id: xgb-ci.manylinux2014_aarch64 + runner: linux-arm64-cpu + - container_id: xgb-ci.aarch64 + runner: linux-arm64-cpu + steps: + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4.2.2 + with: + submodules: "true" + - name: Build ${{ matrix.container_id }} + run: bash ops/docker_build.sh + env: + CONTAINER_ID: ${{ matrix.container_id }} + + build-cpu: + name: Build CPU + needs: build-containers + runs-on: + - runs-on=${{ github.run_id }} + - runner=linux-amd64-cpu + steps: + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4.2.2 + with: + submodules: "true" + - name: Fetch container from cache + run: bash ops/docker_build.sh + env: + CONTAINER_ID: xgb-ci.cpu + - run: bash ops/pipeline/build-cpu.sh + - name: Stash CLI executable + run: bash ops/stash_artifacts.sh ./xgboost + env: + COMMAND: upload + KEY: build-cpu + + build-cpu-arm64: + name: Build CPU ARM64 + manylinux_2_28_aarch64 wheel + needs: build-containers + runs-on: + - runs-on=${{ github.run_id }} + - runner=linux-arm64-cpu + steps: + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4.2.2 + with: + submodules: "true" + - name: Fetch container from cache + run: bash ops/docker_build.sh + env: + CONTAINER_ID: xgb-ci.aarch64 + - run: bash ops/pipeline/build-cpu-arm64.sh + - name: Stash files + run: bash ops/stash_artifacts.sh ./xgboost python-package/dist/*.whl + env: + COMMAND: upload + KEY: build-cpu-arm64 + + build-cuda: + name: Build CUDA + manylinux_2_28_x86_64 wheel + needs: build-containers + runs-on: + - runs-on=${{ github.run_id }} + - runner=linux-amd64-cpu steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - name: Install system packages - run: | - brew install ninja libomp - - name: Build gtest binary - run: | - mkdir build - cd build - cmake .. 
-DGOOGLE_TEST=ON -DUSE_OPENMP=ON -DUSE_DMLC_GTEST=ON -GNinja -DBUILD_DEPRECATED_CLI=ON -DUSE_SANITIZER=ON -DENABLED_SANITIZERS=address -DCMAKE_BUILD_TYPE=RelWithDebInfo - ninja -v - - name: Run gtest binary - run: | - cd build - ./testxgboost - ctest -R TestXGBoostCLI --extra-verbose + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4.2.2 + with: + submodules: "true" + - name: Fetch container from cache + run: bash ops/docker_build.sh + env: + CONTAINER_ID: xgb-ci.gpu_build_rockylinux8 + - name: Fetch container from cache + run: bash ops/docker_build.sh + env: + CONTAINER_ID: xgb-ci.manylinux_2_28_x86_64 + - run: bash ops/pipeline/build-cuda.sh + - name: Stash files + run: | + bash ops/stash_artifacts.sh \ + build/testxgboost ./xgboost python-package/dist/*.whl + env: + COMMAND: upload + KEY: build-cuda - gtest-cpu-nonomp: - name: Test Google C++ unittest (CPU Non-OMP) - runs-on: ${{ matrix.os }} + build-cuda-with-rmm: + name: Build CUDA with RMM + needs: build-containers + runs-on: + - runs-on=${{ github.run_id }} + - runner=linux-amd64-cpu + steps: + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4.2.2 + with: + submodules: "true" + - name: Fetch container from cache + run: bash ops/docker_build.sh + env: + CONTAINER_ID: xgb-ci.gpu_build_rockylinux8 + - name: Fetch container from cache + run: bash ops/docker_build.sh + env: + CONTAINER_ID: xgb-ci.manylinux_2_28_x86_64 + - run: bash ops/pipeline/build-cuda-with-rmm.sh + - name: Stash files + run: bash ops/stash_artifacts.sh build/testxgboost + env: + COMMAND: upload + KEY: build-cuda-with-rmm + + build-manylinux2014: + name: Build manylinux2014_${{ matrix.arch }} wheel + needs: build-containers + runs-on: + - runs-on=${{ github.run_id }} + - runner=${{ matrix.runner }} strategy: fail-fast: false matrix: - os: [ubuntu-latest] + include: + - arch: aarch64 + runner: linux-arm64-cpu + - arch: x86_64 + runner: linux-amd64-cpu + steps: + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4.2.2 + with: + submodules: "true" + - name: Fetch container from cache + run: bash ops/docker_build.sh + env: + CONTAINER_ID: xgb-ci.manylinux2014_${{ matrix.arch }} + - run: bash ops/pipeline/build-manylinux2014.sh ${{ matrix.arch }} + + build-gpu-rpkg: + name: Build GPU-enabled R package + needs: build-containers + runs-on: + - runs-on=${{ github.run_id }} + - runner=linux-amd64-cpu steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - name: Install system packages - run: | - sudo apt-get install -y --no-install-recommends ninja-build - - name: Build and install XGBoost - shell: bash -l {0} - run: | - mkdir build - cd build - cmake .. 
-GNinja -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DUSE_OPENMP=OFF -DBUILD_DEPRECATED_CLI=ON - ninja -v - - name: Run gtest binary - run: | - cd build - ctest --extra-verbose + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4.2.2 + with: + submodules: "true" + - name: Fetch container from cache + run: bash ops/docker_build.sh + env: + CONTAINER_ID: xgb-ci.gpu_build_r_rockylinux8 + - run: bash ops/pipeline/build-gpu-rpkg.sh - gtest-cpu-sycl: - name: Test Google C++ unittest (CPU SYCL) - runs-on: ${{ matrix.os }} + test-cpp-gpu: + name: >- + Run Google Tests with GPUs + (Suite ${{ matrix.suite }}, Runner ${{ matrix.runner }}) + needs: [build-cuda, build-cuda-with-rmm] + runs-on: + - runs-on=${{ github.run_id }} + - runner=${{ matrix.runner }} strategy: fail-fast: false matrix: - os: [ubuntu-latest] - python-version: ["3.10"] + include: + - suite: gpu + runner: linux-amd64-gpu + artifact_from: build-cuda + - suite: gpu-rmm + runner: linux-amd64-gpu + artifact_from: build-cuda-with-rmm + - suite: mgpu + runner: linux-amd64-mgpu + artifact_from: build-cuda steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - uses: conda-incubator/setup-miniconda@a4260408e20b96e80095f42ff7f1a15b27dd94ca # v3.0.4 - with: - miniforge-variant: Miniforge3 - miniforge-version: latest - activate-environment: linux_sycl_test - environment-file: tests/ci_build/conda_env/linux_sycl_test.yml - use-mamba: true - - name: Display Conda env - run: | - conda info - conda list - - name: Build and install XGBoost - shell: bash -l {0} - run: | - mkdir build - cd build - cmake .. -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DPLUGIN_SYCL=ON -DCMAKE_CXX_COMPILER=g++ -DCMAKE_C_COMPILER=gcc -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX - make -j$(nproc) - - name: Run gtest binary for SYCL - run: | - cd build - ./testxgboost --gtest_filter=Sycl* - - name: Run gtest binary for non SYCL - run: | - cd build - ./testxgboost --gtest_filter=-Sycl* + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4.2.2 + with: + submodules: "true" + - name: Fetch container from cache + run: bash ops/docker_build.sh + env: + CONTAINER_ID: xgb-ci.gpu + - name: Unstash gtest + run: | + bash ops/stash_artifacts.sh build/testxgboost + chmod +x build/testxgboost + env: + COMMAND: download + KEY: ${{ matrix.artifact_from }} + - run: bash ops/pipeline/test-cpp-gpu.sh ${{ matrix.suite }} - c-api-demo: - name: Test installing XGBoost lib + building the C API demo - runs-on: ${{ matrix.os }} - defaults: - run: - shell: bash -l {0} + test-python: + name: Run Python tests (${{ matrix.description }}) + needs: [build-cuda, build-cpu-arm64] + runs-on: + - runs-on=${{ github.run_id }} + - runner=${{ matrix.runner }} strategy: fail-fast: false matrix: - os: ["ubuntu-latest"] - python-version: ["3.10"] - steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - uses: conda-incubator/setup-miniconda@a4260408e20b96e80095f42ff7f1a15b27dd94ca # v3.0.4 - with: - miniforge-variant: Miniforge3 - miniforge-version: latest - activate-environment: cpp_test - environment-file: tests/ci_build/conda_env/cpp_test.yml - use-mamba: true - - name: Display Conda env - run: | - conda info - conda list - - - name: Build and install XGBoost static library - run: | - mkdir build - cd build - cmake .. 
-DBUILD_STATIC_LIB=ON -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX -GNinja - ninja -v install - cd - - - name: Build and run C API demo with static - run: | - pushd . - cd demo/c-api/ - mkdir build - cd build - cmake .. -GNinja -DCMAKE_PREFIX_PATH=$CONDA_PREFIX - ninja -v - ctest - cd .. - rm -rf ./build - popd - - - name: Build and install XGBoost shared library - run: | - cd build - cmake .. -DBUILD_STATIC_LIB=OFF -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX -GNinja -DPLUGIN_FEDERATED=ON -DGOOGLE_TEST=ON - ninja -v install - ./testxgboost - cd - - - name: Build and run C API demo with shared - run: | - pushd . - cd demo/c-api/ - mkdir build - cd build - cmake .. -GNinja -DCMAKE_PREFIX_PATH=$CONDA_PREFIX - ninja -v - ctest - popd - ./tests/ci_build/verify_link.sh ./demo/c-api/build/basic/api-demo - ./tests/ci_build/verify_link.sh ./demo/c-api/build/external-memory/external-memory-demo - - cpp-lint: - runs-on: ubuntu-latest - name: Code linting for C++ + include: + - description: "single GPU" + container: xgb-ci.gpu + suite: gpu + runner: linux-amd64-gpu + artifact_from: build-cuda + - description: "single GPU, nightly deps" + container: xgb-ci.gpu_dev_ver + suite: gpu + runner: linux-amd64-gpu + artifact_from: build-cuda + - description: "multiple GPUs" + container: xgb-ci.gpu + suite: mgpu + runner: linux-amd64-mgpu + artifact_from: build-cuda + - description: "multiple GPUs, nightly deps" + container: xgb-ci.gpu_dev_ver + suite: mgpu + runner: linux-amd64-mgpu + artifact_from: build-cuda + - description: "CPU" + container: xgb-ci.cpu + suite: cpu + runner: linux-amd64-cpu + artifact_from: build-cuda + - description: "CPU ARM64" + container: xgb-ci.aarch64 + suite: cpu-arm64 + runner: linux-arm64-cpu + artifact_from: build-cpu-arm64 steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 - with: - python-version: "3.10" - architecture: 'x64' - - name: Install Python packages - run: | - python -m pip install wheel setuptools cmakelint cpplint==1.6.1 pylint - - name: Run lint - run: | - python3 tests/ci_build/lint_cpp.py - sh ./tests/ci_build/lint_cmake.sh + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4.2.2 + with: + submodules: "true" + - name: Fetch container from cache + run: bash ops/docker_build.sh + env: + CONTAINER_ID: ${{ matrix.container }} + - name: Unstash Python wheel + run: | + bash ops/stash_artifacts.sh python-package/dist/*.whl ./xgboost + chmod +x ./xgboost + env: + COMMAND: download + KEY: ${{ matrix.artifact_from }} + - name: Run Python tests, ${{ matrix.description }} + run: bash ops/pipeline/test-python.sh ${{ matrix.suite }} ${{ matrix.container }} diff --git a/.github/workflows/main_runs_on.yml b/.github/workflows/main_runs_on.yml deleted file mode 100644 index 80e6db40cfb6..000000000000 --- a/.github/workflows/main_runs_on.yml +++ /dev/null @@ -1,308 +0,0 @@ -name: Nextgen XGBoost CI - -on: [push, pull_request] - -permissions: - contents: read # to fetch code (actions/checkout) - -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: true - -env: - BRANCH_NAME: >- - ${{ github.event.pull_request.number && 'PR-' }}${{ github.event.pull_request.number || github.ref_name }} - USE_DOCKER_CACHE: 1 - -jobs: - build-containers: - name: Build CI containers - runs-on: - - runs-on=${{ github.run_id }} - 
- runner=${{ matrix.runner }} - strategy: - matrix: - container_id: - - xgb-ci.gpu_build_rockylinux8 - - xgb-ci.gpu_build_r_rockylinux8 - - xgb-ci.gpu - - xgb-ci.gpu_dev_ver - - xgb-ci.cpu - - xgb-ci.clang_tidy - - xgb-ci.manylinux_2_28_x86_64 - - xgb-ci.manylinux2014_x86_64 - runner: [linux-amd64-cpu] - include: - - container_id: xgb-ci.manylinux2014_aarch64 - runner: linux-arm64-cpu - - container_id: xgb-ci.aarch64 - runner: linux-arm64-cpu - steps: - # Restart Docker daemon so that it recognizes the ephemeral disks - - run: sudo systemctl restart docker - - uses: actions/checkout@v4 - with: - submodules: "true" - - name: Build ${{ matrix.container_id }} - run: bash ops/docker_build.sh - env: - CONTAINER_ID: ${{ matrix.container_id }} - - clang-tidy: - name: Run clang-tidy - needs: build-containers - runs-on: - - runs-on=${{ github.run_id }} - - runner=linux-amd64-cpu - steps: - # Restart Docker daemon so that it recognizes the ephemeral disks - - run: sudo systemctl restart docker - - uses: actions/checkout@v4 - with: - submodules: "true" - - name: Fetch container from cache - run: bash ops/docker_build.sh - env: - CONTAINER_ID: xgb-ci.clang_tidy - - run: bash ops/pipeline/run-clang-tidy.sh - - build-cpu: - name: Build CPU - needs: build-containers - runs-on: - - runs-on=${{ github.run_id }} - - runner=linux-amd64-cpu - steps: - # Restart Docker daemon so that it recognizes the ephemeral disks - - run: sudo systemctl restart docker - - uses: actions/checkout@v4 - with: - submodules: "true" - - name: Fetch container from cache - run: bash ops/docker_build.sh - env: - CONTAINER_ID: xgb-ci.cpu - - run: bash ops/pipeline/build-cpu.sh - - name: Stash CLI executable - run: bash ops/stash_artifacts.sh ./xgboost - env: - COMMAND: upload - KEY: build-cpu - - build-cpu-arm64: - name: Build CPU ARM64 + manylinux_2_28_aarch64 wheel - needs: build-containers - runs-on: - - runs-on=${{ github.run_id }} - - runner=linux-arm64-cpu - steps: - # Restart Docker daemon so that it recognizes the ephemeral disks - - run: sudo systemctl restart docker - - uses: actions/checkout@v4 - with: - submodules: "true" - - name: Fetch container from cache - run: bash ops/docker_build.sh - env: - CONTAINER_ID: xgb-ci.aarch64 - - run: bash ops/pipeline/build-cpu-arm64.sh - - name: Stash files - run: bash ops/stash_artifacts.sh ./xgboost python-package/dist/*.whl - env: - COMMAND: upload - KEY: build-cpu-arm64 - - build-cuda: - name: Build CUDA + manylinux_2_28_x86_64 wheel - needs: build-containers - runs-on: - - runs-on=${{ github.run_id }} - - runner=linux-amd64-cpu - steps: - # Restart Docker daemon so that it recognizes the ephemeral disks - - run: sudo systemctl restart docker - - uses: actions/checkout@v4 - with: - submodules: "true" - - name: Fetch container from cache - run: bash ops/docker_build.sh - env: - CONTAINER_ID: xgb-ci.gpu_build_rockylinux8 - - name: Fetch container from cache - run: bash ops/docker_build.sh - env: - CONTAINER_ID: xgb-ci.manylinux_2_28_x86_64 - - run: bash ops/pipeline/build-cuda.sh - - name: Stash files - run: | - bash ops/stash_artifacts.sh \ - build/testxgboost ./xgboost python-package/dist/*.whl - env: - COMMAND: upload - KEY: build-cuda - - build-cuda-with-rmm: - name: Build CUDA with RMM - needs: build-containers - runs-on: - - runs-on=${{ github.run_id }} - - runner=linux-amd64-cpu - steps: - # Restart Docker daemon so that it recognizes the ephemeral disks - - run: sudo systemctl restart docker - - uses: actions/checkout@v4 - with: - submodules: "true" - - name: Fetch container 
from cache - run: bash ops/docker_build.sh - env: - CONTAINER_ID: xgb-ci.gpu_build_rockylinux8 - - name: Fetch container from cache - run: bash ops/docker_build.sh - env: - CONTAINER_ID: xgb-ci.manylinux_2_28_x86_64 - - run: bash ops/pipeline/build-cuda-with-rmm.sh - - name: Stash files - run: bash ops/stash_artifacts.sh build/testxgboost - env: - COMMAND: upload - KEY: build-cuda-with-rmm - - build-manylinux2014: - name: Build manylinux2014_${{ matrix.arch }} wheel - needs: build-containers - runs-on: - - runs-on=${{ github.run_id }} - - runner=${{ matrix.runner }} - strategy: - matrix: - include: - - arch: aarch64 - runner: linux-arm64-cpu - - arch: x86_64 - runner: linux-amd64-cpu - steps: - # Restart Docker daemon so that it recognizes the ephemeral disks - - run: sudo systemctl restart docker - - uses: actions/checkout@v4 - with: - submodules: "true" - - name: Fetch container from cache - run: bash ops/docker_build.sh - env: - CONTAINER_ID: xgb-ci.manylinux2014_${{ matrix.arch }} - - run: bash ops/pipeline/build-manylinux2014.sh ${{ matrix.arch }} - - build-gpu-rpkg: - name: Build GPU-enabled R package - needs: build-containers - runs-on: - - runs-on=${{ github.run_id }} - - runner=linux-amd64-cpu - steps: - # Restart Docker daemon so that it recognizes the ephemeral disks - - run: sudo systemctl restart docker - - uses: actions/checkout@v4 - with: - submodules: "true" - - name: Fetch container from cache - run: bash ops/docker_build.sh - env: - CONTAINER_ID: xgb-ci.gpu_build_r_rockylinux8 - - run: bash ops/pipeline/build-gpu-rpkg.sh - - test-cpp-gpu: - name: Run Google Tests with GPU(s) - needs: [build-cuda, build-cuda-with-rmm] - runs-on: - - runs-on=${{ github.run_id }} - - runner=${{ matrix.runner }} - strategy: - matrix: - include: - - suite: gpu - runner: linux-amd64-gpu - artifact_from: build-cuda - - suite: gpu-rmm - runner: linux-amd64-gpu - artifact_from: build-cuda-with-rmm - - suite: mgpu - runner: linux-amd64-mgpu - artifact_from: build-cuda - steps: - # Restart Docker daemon so that it recognizes the ephemeral disks - - run: sudo systemctl restart docker - - uses: actions/checkout@v4 - with: - submodules: "true" - - name: Fetch container from cache - run: bash ops/docker_build.sh - env: - CONTAINER_ID: xgb-ci.gpu - - name: Unstash gtest - run: | - bash ops/stash_artifacts.sh build/testxgboost - chmod +x build/testxgboost - env: - COMMAND: download - KEY: ${{ matrix.artifact_from }} - - run: bash ops/pipeline/test-cpp-gpu.sh ${{ matrix.suite }} - - test-python: - name: Run Python tests - needs: [build-cuda, build-cpu-arm64] - runs-on: - - runs-on=${{ github.run_id }} - - runner=${{ matrix.runner }} - strategy: - matrix: - include: - - description: "single GPU" - container: xgb-ci.gpu - suite: gpu - runner: linux-amd64-gpu - artifact_from: build-cuda - - description: "single GPU, nightly deps" - container: xgb-ci.gpu_dev_ver - suite: gpu - runner: linux-amd64-gpu - artifact_from: build-cuda - - description: "multiple GPUs" - container: xgb-ci.gpu - suite: mgpu - runner: linux-amd64-mgpu - artifact_from: build-cuda - - description: "multiple GPUs, nightly deps" - container: xgb-ci.gpu_dev_ver - suite: mgpu - runner: linux-amd64-mgpu - artifact_from: build-cuda - - description: "CPU" - container: xgb-ci.cpu - suite: cpu - runner: linux-amd64-cpu - artifact_from: build-cuda - - description: "CPU ARM64" - container: xgb-ci.aarch64 - suite: cpu-arm64 - runner: linux-arm64-cpu - artifact_from: build-cpu-arm64 - steps: - # Restart Docker daemon so that it recognizes the ephemeral 
disks - - run: sudo systemctl restart docker - - uses: actions/checkout@v4 - with: - submodules: "true" - - name: Fetch container from cache - run: bash ops/docker_build.sh - env: - CONTAINER_ID: ${{ matrix.container }} - - name: Unstash Python wheel - run: | - bash ops/stash_artifacts.sh python-package/dist/*.whl ./xgboost - chmod +x ./xgboost - env: - COMMAND: download - KEY: ${{ matrix.artifact_from }} - - name: Run Python tests, ${{ matrix.description }} - run: bash ops/pipeline/test-python.sh ${{ matrix.suite }} ${{ matrix.container }} diff --git a/.github/workflows/misc.yml b/.github/workflows/misc.yml new file mode 100644 index 000000000000..7294faa0d93b --- /dev/null +++ b/.github/workflows/misc.yml @@ -0,0 +1,133 @@ +name: XGBoost CI (misc) + +on: [push, pull_request] + +permissions: + contents: read # to fetch code (actions/checkout) + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +env: + BRANCH_NAME: >- + ${{ github.event.pull_request.number && 'PR-' }}${{ github.event.pull_request.number || github.ref_name }} + +jobs: + gtest-cpu: + name: Test Google C++ test (CPU) + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [macos-13] + steps: + - uses: actions/checkout@v4.2.2 + with: + submodules: 'true' + - name: Install system packages + run: | + brew install ninja libomp + - name: Build gtest binary + run: | + mkdir build + cd build + cmake .. -DGOOGLE_TEST=ON -DUSE_OPENMP=ON -DUSE_DMLC_GTEST=ON -GNinja -DBUILD_DEPRECATED_CLI=ON -DUSE_SANITIZER=ON -DENABLED_SANITIZERS=address -DCMAKE_BUILD_TYPE=RelWithDebInfo + ninja -v + - name: Run gtest binary + run: | + cd build + ./testxgboost + ctest -R TestXGBoostCLI --extra-verbose + + gtest-cpu-nonomp: + name: Test Google C++ unittest (CPU Non-OMP) + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest] + steps: + - uses: actions/checkout@v4.2.2 + with: + submodules: 'true' + - name: Install system packages + run: | + sudo apt-get install -y --no-install-recommends ninja-build + - name: Build and install XGBoost + shell: bash -l {0} + run: | + mkdir build + cd build + cmake .. -GNinja -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DUSE_OPENMP=OFF -DBUILD_DEPRECATED_CLI=ON + ninja -v + - name: Run gtest binary + run: | + cd build + ctest --extra-verbose + + c-api-demo: + name: Test installing XGBoost lib + building the C API demo + runs-on: ${{ matrix.os }} + defaults: + run: + shell: bash -l {0} + strategy: + fail-fast: false + matrix: + os: ["ubuntu-latest"] + python-version: ["3.10"] + steps: + - uses: actions/checkout@v4.2.2 + with: + submodules: 'true' + - uses: conda-incubator/setup-miniconda@v3.1.0 + with: + miniforge-variant: Miniforge3 + miniforge-version: latest + activate-environment: cpp_test + environment-file: ops/conda_env/cpp_test.yml + use-mamba: true + - name: Display Conda env + run: | + conda info + conda list + - name: Build and install XGBoost static library + run: | + mkdir build + cd build + cmake .. -DBUILD_STATIC_LIB=ON -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX -GNinja + ninja -v install + cd - + - name: Build and run C API demo with static + run: | + pushd . + cd demo/c-api/ + mkdir build + cd build + cmake .. -GNinja -DCMAKE_PREFIX_PATH=$CONDA_PREFIX + ninja -v + ctest + cd .. + rm -rf ./build + popd + + - name: Build and install XGBoost shared library + run: | + cd build + cmake .. 
-DBUILD_STATIC_LIB=OFF -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX -GNinja -DPLUGIN_FEDERATED=ON -DGOOGLE_TEST=ON + ninja -v install + ./testxgboost + cd - + - name: Build and run C API demo with shared + run: | + pushd . + cd demo/c-api/ + mkdir build + cd build + cmake .. -GNinja -DCMAKE_PREFIX_PATH=$CONDA_PREFIX + ninja -v + ctest + popd + ./ops/script/verify_link.sh ./demo/c-api/build/basic/api-demo + ./ops/script/verify_link.sh ./demo/c-api/build/external-memory/external-memory-demo diff --git a/.github/workflows/python_tests.yml b/.github/workflows/python_tests.yml index 907cf98e1011..c43d8b056c8d 100644 --- a/.github/workflows/python_tests.yml +++ b/.github/workflows/python_tests.yml @@ -1,4 +1,4 @@ -name: XGBoost-Python-Tests +name: XGBoost CI (Python tests) on: [push, pull_request] @@ -14,54 +14,23 @@ concurrency: cancel-in-progress: true jobs: - python-mypy-lint: - runs-on: ubuntu-latest - name: Type and format checks for the Python package - strategy: - matrix: - os: [ubuntu-latest] - steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - uses: conda-incubator/setup-miniconda@a4260408e20b96e80095f42ff7f1a15b27dd94ca # v3.0.4 - with: - miniforge-variant: Miniforge3 - miniforge-version: latest - activate-environment: python_lint - environment-file: tests/ci_build/conda_env/python_lint.yml - use-mamba: true - - name: Display Conda env - run: | - conda info - conda list - - name: Run mypy - run: | - python tests/ci_build/lint_python.py --format=0 --type-check=1 --pylint=0 - - name: Run formatter - run: | - python tests/ci_build/lint_python.py --format=1 --type-check=0 --pylint=0 - - name: Run pylint - run: | - python tests/ci_build/lint_python.py --format=0 --type-check=0 --pylint=1 - python-sdist-test-on-Linux: - # Mismatched glibcxx version between system and conda forge. 
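# The "mismatched glibcxx" comment above refers to a common failure mode: the system
# libstdc++ and the conda-forge one expose different GLIBCXX symbol versions, so a
# binary built against one can fail to load under the other with an error such as
# "version 'GLIBCXX_3.4.29' not found". To list the newest version the system copy
# provides (the path shown is the usual Ubuntu location):
strings /usr/lib/x86_64-linux-gnu/libstdc++.so.6 | grep '^GLIBCXX_' | sort -V | tail -n 1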
runs-on: ${{ matrix.os }} name: Test installing XGBoost Python source package on ${{ matrix.os }} strategy: + fail-fast: false matrix: os: [ubuntu-latest] steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 + - uses: actions/checkout@v4.2.2 with: submodules: 'true' - - uses: conda-incubator/setup-miniconda@a4260408e20b96e80095f42ff7f1a15b27dd94ca # v3.0.4 + - uses: conda-incubator/setup-miniconda@v3.1.0 with: miniforge-variant: Miniforge3 miniforge-version: latest activate-environment: sdist_test - environment-file: tests/ci_build/conda_env/sdist_test.yml + environment-file: ops/conda_env/sdist_test.yml use-mamba: true - name: Display Conda env run: | @@ -82,18 +51,19 @@ jobs: runs-on: ${{ matrix.os }} name: Test installing XGBoost Python source package on ${{ matrix.os }} strategy: + fail-fast: false matrix: os: [macos-13, windows-latest] python-version: ["3.10"] steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 + - uses: actions/checkout@v4.2.2 with: submodules: 'true' - name: Install osx system dependencies if: matrix.os == 'macos-13' run: | brew install ninja libomp - - uses: conda-incubator/setup-miniconda@a4260408e20b96e80095f42ff7f1a15b27dd94ca # v3.0.4 + - uses: conda-incubator/setup-miniconda@v3.1.0 with: auto-update-conda: true python-version: ${{ matrix.python-version }} @@ -115,25 +85,25 @@ jobs: python -c 'import xgboost' python-tests-on-macos: - name: Test XGBoost Python package on ${{ matrix.config.os }} - runs-on: ${{ matrix.config.os }} + name: Test XGBoost Python package on ${{ matrix.os }} + runs-on: ${{ matrix.os }} timeout-minutes: 60 strategy: + fail-fast: false matrix: - config: - - {os: macos-13} + os: [macos-13] steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 + - uses: actions/checkout@v4.2.2 with: submodules: 'true' - - uses: conda-incubator/setup-miniconda@a4260408e20b96e80095f42ff7f1a15b27dd94ca # v3.0.4 + - uses: conda-incubator/setup-miniconda@v3.1.0 with: miniforge-variant: Miniforge3 miniforge-version: latest activate-environment: macos_cpu_test - environment-file: tests/ci_build/conda_env/macos_cpu_test.yml + environment-file: ops/conda_env/macos_cpu_test.yml use-mamba: true - name: Display Conda env @@ -167,159 +137,21 @@ jobs: run: | pytest -s -v -rxXs --durations=0 ./tests/test_distributed/test_with_dask - python-tests-on-win: - name: Test XGBoost Python package on ${{ matrix.config.os }} - runs-on: ${{ matrix.config.os }} - timeout-minutes: 60 - strategy: - matrix: - config: - - {os: windows-latest, python-version: '3.10'} - - steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - - uses: conda-incubator/setup-miniconda@a4260408e20b96e80095f42ff7f1a15b27dd94ca # v3.0.4 - with: - auto-update-conda: true - python-version: ${{ matrix.config.python-version }} - activate-environment: win64_env - environment-file: tests/ci_build/conda_env/win64_cpu_test.yml - - - name: Display Conda env - run: | - conda info - conda list - - - name: Build XGBoost on Windows - run: | - mkdir build_msvc - cd build_msvc - cmake .. -G"Visual Studio 17 2022" -DCMAKE_CONFIGURATION_TYPES="Release" -A x64 -DBUILD_DEPRECATED_CLI=ON - cmake --build . --config Release --parallel $(nproc) - - - name: Install Python package - run: | - cd python-package - python --version - pip wheel -v . 
--wheel-dir dist/ - pip install ./dist/*.whl - - - name: Test Python package - run: | - pytest -s -v -rxXs --durations=0 ./tests/python - - python-tests-on-ubuntu: - name: Test XGBoost Python package on ${{ matrix.config.os }} - runs-on: ${{ matrix.config.os }} - timeout-minutes: 90 - strategy: - matrix: - config: - - {os: ubuntu-latest, python-version: "3.10"} - - steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - - uses: conda-incubator/setup-miniconda@a4260408e20b96e80095f42ff7f1a15b27dd94ca # v3.0.4 - with: - miniforge-variant: Miniforge3 - miniforge-version: latest - activate-environment: linux_cpu_test - environment-file: tests/ci_build/conda_env/linux_cpu_test.yml - use-mamba: true - - - name: Display Conda env - run: | - conda info - conda list - - - name: Build XGBoost on Ubuntu - run: | - mkdir build - cd build - cmake .. -GNinja -DCMAKE_PREFIX_PATH=$CONDA_PREFIX -DBUILD_DEPRECATED_CLI=ON - ninja - - - name: Install Python package - run: | - cd python-package - python --version - pip install -v . - - - name: Test Python package - run: | - pytest -s -v -rxXs --durations=0 ./tests/python - - - name: Test Dask Interface - run: | - pytest -s -v -rxXs --durations=0 ./tests/test_distributed/test_with_dask - - - name: Test PySpark Interface - shell: bash -l {0} - run: | - pytest -s -v -rxXs --durations=0 ./tests/test_distributed/test_with_spark - - python-sycl-tests-on-ubuntu: - name: Test XGBoost Python package with SYCL on ${{ matrix.config.os }} - runs-on: ${{ matrix.config.os }} - timeout-minutes: 90 - strategy: - matrix: - config: - - {os: ubuntu-latest, python-version: "3.10"} - - steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - - uses: conda-incubator/setup-miniconda@a4260408e20b96e80095f42ff7f1a15b27dd94ca # v3.0.4 - with: - miniforge-variant: Miniforge3 - miniforge-version: latest - activate-environment: linux_sycl_test - environment-file: tests/ci_build/conda_env/linux_sycl_test.yml - use-mamba: true - - - name: Display Conda env - run: | - conda info - conda list - - name: Build XGBoost on Ubuntu - run: | - mkdir build - cd build - cmake .. -DPLUGIN_SYCL=ON -DCMAKE_CXX_COMPILER=g++ -DCMAKE_C_COMPILER=gcc -DCMAKE_PREFIX_PATH=$CONDA_PREFIX - make -j$(nproc) - - name: Install Python package - run: | - cd python-package - python --version - pip install -v . 
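# These jobs share one pytest invocation style: -s disables output capture, -v lists
# each test, -rxXs adds a summary line for xfailed/xpassed/skipped tests, and
# --durations=0 reports every test's runtime. To reproduce a single suite locally
# after `pip install -v .` (the test file named here is only an example):
pytest -s -v -rxXs --durations=0 ./tests/python/test_basic.py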
- - name: Test Python package - run: | - pytest -s -v -rxXs --durations=0 ./tests/python-sycl/ - - python-system-installation-on-ubuntu: name: Test XGBoost Python package System Installation on ${{ matrix.os }} runs-on: ${{ matrix.os }} strategy: + fail-fast: false matrix: os: [ubuntu-latest] steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 + - uses: actions/checkout@v4.2.2 with: submodules: 'true' - name: Set up Python 3.10 - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 + uses: actions/setup-python@v5.3.0 with: python-version: "3.10" diff --git a/.github/workflows/python_wheels.yml b/.github/workflows/python_wheels_macos.yml similarity index 55% rename from .github/workflows/python_wheels.yml rename to .github/workflows/python_wheels_macos.yml index 3b7a8072c109..a4cff8eb0e6f 100644 --- a/.github/workflows/python_wheels.yml +++ b/.github/workflows/python_wheels_macos.yml @@ -1,9 +1,9 @@ -name: XGBoost-Python-Wheels +name: Build Python wheels targeting MacOS on: [push, pull_request] permissions: - contents: read # to fetch code (actions/checkout) + contents: read # to fetch code (actions/checkout) defaults: run: @@ -13,11 +13,16 @@ concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} cancel-in-progress: true +env: + BRANCH_NAME: >- + ${{ github.event.pull_request.number && 'PR-' }}${{ github.event.pull_request.number || github.ref_name }} + jobs: - python-wheels: + python-wheels-macos: name: Build wheel for ${{ matrix.platform_id }} runs-on: ${{ matrix.os }} strategy: + fail-fast: false matrix: include: - os: macos-13 @@ -25,31 +30,26 @@ jobs: - os: macos-14 platform_id: macosx_arm64 steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 + - uses: actions/checkout@v4.2.2 with: submodules: 'true' - name: Set up homebrew - uses: Homebrew/actions/setup-homebrew@68fa6aeb1ccb0596d311f2b34ec74ec21ee68e54 + uses: Homebrew/actions/setup-homebrew@13341b4d5e459a98bbe0b122b12c11bf90518cc8 - name: Install libomp run: brew install libomp - - uses: conda-incubator/setup-miniconda@a4260408e20b96e80095f42ff7f1a15b27dd94ca # v3.0.4 + - uses: conda-incubator/setup-miniconda@v3.1.0 with: miniforge-variant: Miniforge3 miniforge-version: latest python-version: "3.10" use-mamba: true - name: Build wheels - run: bash tests/ci_build/build_python_wheels.sh ${{ matrix.platform_id }} ${{ github.sha }} - - name: Extract branch name - run: | - echo "branch=${GITHUB_REF#refs/heads/}" >> "$GITHUB_OUTPUT" - id: extract_branch - if: github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_') + run: bash ops/pipeline/build-python-wheels-macos.sh ${{ matrix.platform_id }} ${{ github.sha }} - name: Upload Python wheel - if: github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_') + # if: github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_') run: | python -m pip install awscli - python -m awscli s3 cp wheelhouse/*.whl s3://xgboost-nightly-builds/${{ steps.extract_branch.outputs.branch }}/ --acl public-read --region us-west-2 + python -m awscli s3 cp wheelhouse/*.whl s3://xgboost-nightly-builds/${{ env.BRANCH_NAME }}/ --acl public-read --region us-west-2 env: AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID_IAM_S3_UPLOADER }} AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY_IAM_S3_UPLOADER }} diff --git a/.github/workflows/r_tests.yml b/.github/workflows/r_tests.yml index c56d1f8ef943..3885c126f11e 100644 --- 
a/.github/workflows/r_tests.yml +++ b/.github/workflows/r_tests.yml @@ -13,78 +13,46 @@ concurrency: cancel-in-progress: true jobs: - lintr: - runs-on: ${{ matrix.config.os }} - name: Run R linters on OS ${{ matrix.config.os }}, R ${{ matrix.config.r }}, Compiler ${{ matrix.config.compiler }}, Build ${{ matrix.config.build }} - strategy: - matrix: - config: - - {os: ubuntu-latest, r: 'release'} - env: - R_REMOTES_NO_ERRORS_FROM_WARNINGS: true - RSPM: ${{ matrix.config.rspm }} - - steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - - uses: r-lib/actions/setup-r@929c772977a3a13c8733b363bf5a2f685c25dd91 # v2.9.0 - with: - r-version: ${{ matrix.config.r }} - - - name: Cache R packages - uses: actions/cache@6849a6489940f00c2f30c0fb92c6274307ccb58a # v4.1.2 - with: - path: ${{ env.R_LIBS_USER }} - key: ${{ runner.os }}-r-${{ matrix.config.r }}-7-${{ hashFiles('R-package/DESCRIPTION') }} - restore-keys: ${{ runner.os }}-r-${{ matrix.config.r }}-7-${{ hashFiles('R-package/DESCRIPTION') }} - - - name: Install dependencies - shell: Rscript {0} - run: | - source("./R-package/tests/helper_scripts/install_deps.R") - - - name: Run lintr - run: | - MAKEFLAGS="-j$(nproc)" R CMD INSTALL R-package/ - Rscript tests/ci_build/lint_r.R $(pwd) - test-Rpkg: - runs-on: ${{ matrix.config.os }} - name: Test R on OS ${{ matrix.config.os }}, R ${{ matrix.config.r }}, Compiler ${{ matrix.config.compiler }}, Build ${{ matrix.config.build }} + runs-on: ${{ matrix.os }} + name: Test R on OS ${{ matrix.os }}, R ${{ matrix.r }}, Compiler ${{ matrix.compiler }}, Build ${{ matrix.build }} strategy: fail-fast: false matrix: - config: - - {os: windows-latest, r: 'release', compiler: 'mingw', build: 'autotools'} - - {os: ubuntu-latest, r: 'release', compiler: 'none', build: 'cmake'} + include: + - os: windows-latest + r: release + compiler: mingw + build: autotools + - os: ubuntu-latest + r: release + compiler: none + build: cmake env: R_REMOTES_NO_ERRORS_FROM_WARNINGS: true - RSPM: ${{ matrix.config.rspm }} steps: - name: Install system dependencies run: | sudo apt update sudo apt install libcurl4-openssl-dev libssl-dev libssh2-1-dev libgit2-dev libglpk-dev libxml2-dev libharfbuzz-dev libfribidi-dev - if: matrix.config.os == 'ubuntu-latest' - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 + if: matrix.os == 'ubuntu-latest' + - uses: actions/checkout@v4.2.2 with: submodules: 'true' - - uses: r-lib/actions/setup-r@929c772977a3a13c8733b363bf5a2f685c25dd91 # v2.9.0 + - uses: r-lib/actions/setup-r@v2.11.0 with: - r-version: ${{ matrix.config.r }} + r-version: ${{ matrix.r }} - name: Cache R packages - uses: actions/cache@6849a6489940f00c2f30c0fb92c6274307ccb58a # v4.1.2 + uses: actions/cache@v4.1.2 with: path: ${{ env.R_LIBS_USER }} - key: ${{ runner.os }}-r-${{ matrix.config.r }}-7-${{ hashFiles('R-package/DESCRIPTION') }} - restore-keys: ${{ runner.os }}-r-${{ matrix.config.r }}-7-${{ hashFiles('R-package/DESCRIPTION') }} + key: ${{ runner.os }}-r-${{ matrix.r }}-7-${{ hashFiles('R-package/DESCRIPTION') }} + restore-keys: ${{ runner.os }}-r-${{ matrix.r }}-7-${{ hashFiles('R-package/DESCRIPTION') }} - - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 + - uses: actions/setup-python@v5.3.0 with: python-version: "3.10" architecture: 'x64' @@ -98,13 +66,13 @@ jobs: - name: Test R run: | - python tests/ci_build/test_r_package.py --compiler='${{ matrix.config.compiler }}' --build-tool="${{ matrix.config.build }}" 
--task=check - if: matrix.config.compiler != 'none' + python ops/script/test_r_package.py --compiler='${{ matrix.compiler }}' --build-tool="${{ matrix.build }}" --task=check + if: matrix.compiler != 'none' - name: Test R run: | - python tests/ci_build/test_r_package.py --build-tool="${{ matrix.config.build }}" --task=check - if: matrix.config.compiler == 'none' + python ops/script/test_r_package.py --build-tool="${{ matrix.build }}" --task=check + if: matrix.compiler == 'none' test-R-on-Debian: name: Test R package on Debian @@ -123,7 +91,7 @@ jobs: run: | git config --global --add safe.directory "${GITHUB_WORKSPACE}" - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 + - uses: actions/checkout@v4.2.2 with: submodules: 'true' @@ -135,7 +103,7 @@ jobs: - name: Test R shell: bash -l {0} run: | - python3 tests/ci_build/test_r_package.py --r=/usr/bin/R --build-tool=autotools --task=check + python3 ops/script/test_r_package.py --r=/usr/bin/R --build-tool=autotools --task=check - uses: dorny/paths-filter@v3 id: changes @@ -147,4 +115,4 @@ jobs: - name: Run document check if: steps.changes.outputs.r_package == 'true' run: | - python3 tests/ci_build/test_r_package.py --r=/usr/bin/R --task=doc + python3 ops/script/test_r_package.py --r=/usr/bin/R --task=doc diff --git a/.github/workflows/scorecards.yml b/.github/workflows/scorecards.yml index 85a9abb57e1b..8ab77ec4c382 100644 --- a/.github/workflows/scorecards.yml +++ b/.github/workflows/scorecards.yml @@ -22,7 +22,7 @@ jobs: steps: - name: "Checkout code" - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 + uses: actions/checkout@v4.2.2 with: persist-credentials: false diff --git a/.github/workflows/sycl_tests.yml b/.github/workflows/sycl_tests.yml new file mode 100644 index 000000000000..54ebcb5f9532 --- /dev/null +++ b/.github/workflows/sycl_tests.yml @@ -0,0 +1,94 @@ +name: XGBoost CI (oneAPI) + +on: [push, pull_request] + +permissions: + contents: read # to fetch code (actions/checkout) + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +env: + BRANCH_NAME: >- + ${{ github.event.pull_request.number && 'PR-' }}${{ github.event.pull_request.number || github.ref_name }} + +jobs: + gtest-cpu-sycl: + name: Test Google C++ unittest (CPU SYCL) + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest] + steps: + - uses: actions/checkout@v4.2.2 + with: + submodules: 'true' + - uses: conda-incubator/setup-miniconda@v3.1.0 + with: + miniforge-variant: Miniforge3 + miniforge-version: latest + activate-environment: linux_sycl_test + environment-file: ops/conda_env/linux_sycl_test.yml + use-mamba: true + - name: Display Conda env + run: | + conda info + conda list + - name: Build and install XGBoost + shell: bash -l {0} + run: | + mkdir build + cd build + cmake .. 
-DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DPLUGIN_SYCL=ON -DCMAKE_CXX_COMPILER=g++ -DCMAKE_C_COMPILER=gcc -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX + make -j$(nproc) + - name: Run gtest binary for SYCL + run: | + cd build + ./testxgboost --gtest_filter=Sycl* + - name: Run gtest binary for non SYCL + run: | + cd build + ./testxgboost --gtest_filter=-Sycl* + + python-sycl-tests-on-ubuntu: + name: Test XGBoost Python package with SYCL on ${{ matrix.os }} + runs-on: ${{ matrix.os }} + timeout-minutes: 90 + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest] + + steps: + - uses: actions/checkout@v4.2.2 + with: + submodules: 'true' + + - uses: conda-incubator/setup-miniconda@v3.1.0 + with: + miniforge-variant: Miniforge3 + miniforge-version: latest + activate-environment: linux_sycl_test + environment-file: ops/conda_env/linux_sycl_test.yml + use-mamba: true + + - name: Display Conda env + run: | + conda info + conda list + - name: Build XGBoost on Ubuntu + run: | + mkdir build + cd build + cmake .. -DPLUGIN_SYCL=ON -DCMAKE_CXX_COMPILER=g++ -DCMAKE_C_COMPILER=gcc -DCMAKE_PREFIX_PATH=$CONDA_PREFIX + make -j$(nproc) + - name: Install Python package + run: | + cd python-package + python --version + pip install -v . + - name: Test Python package + run: | + pytest -s -v -rxXs --durations=0 ./tests/python-sycl/ diff --git a/.github/workflows/update_rapids.yml b/.github/workflows/update_rapids.yml index 5e229db4c050..636661db46b8 100644 --- a/.github/workflows/update_rapids.yml +++ b/.github/workflows/update_rapids.yml @@ -25,7 +25,7 @@ jobs: name: Check latest RAPIDS runs-on: ubuntu-latest steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 + - uses: actions/checkout@v4.2.2 with: submodules: 'true' - name: Check latest RAPIDS and update conftest.sh diff --git a/.github/workflows/windows_runs_on.yml b/.github/workflows/windows.yml similarity index 93% rename from .github/workflows/windows_runs_on.yml rename to .github/workflows/windows.yml index 73a258158b12..3dc9c4962646 100644 --- a/.github/workflows/windows_runs_on.yml +++ b/.github/workflows/windows.yml @@ -1,4 +1,4 @@ -name: Nextgen XGBoost CI Windows +name: XGBoost CI (Windows) on: [push, pull_request] @@ -27,7 +27,7 @@ jobs: - runs-on=${{ github.run_id }} - runner=windows-cpu steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v4.2.2 with: submodules: "true" - run: powershell ops/pipeline/build-win64-gpu.ps1 @@ -39,6 +39,7 @@ jobs: env: COMMAND: upload KEY: build-win64-gpu + test-win64-gpu: name: Test XGBoost on Windows needs: build-win64-gpu @@ -46,7 +47,7 @@ jobs: - runs-on=${{ github.run_id }} - runner=windows-gpu steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v4.2.2 with: submodules: "true" - name: Unstash files diff --git a/ops/docker/conda_env/aarch64_test.yml b/ops/conda_env/aarch64_test.yml similarity index 100% rename from ops/docker/conda_env/aarch64_test.yml rename to ops/conda_env/aarch64_test.yml diff --git a/ops/docker/conda_env/cpp_test.yml b/ops/conda_env/cpp_test.yml similarity index 100% rename from ops/docker/conda_env/cpp_test.yml rename to ops/conda_env/cpp_test.yml diff --git a/ops/docker/conda_env/jvm_tests.yml b/ops/conda_env/jvm_tests.yml similarity index 100% rename from ops/docker/conda_env/jvm_tests.yml rename to ops/conda_env/jvm_tests.yml diff --git a/ops/docker/conda_env/linux_cpu_test.yml b/ops/conda_env/linux_cpu_test.yml similarity index 100% rename from ops/docker/conda_env/linux_cpu_test.yml rename to ops/conda_env/linux_cpu_test.yml diff --git 
a/ops/docker/conda_env/linux_sycl_test.yml b/ops/conda_env/linux_sycl_test.yml similarity index 100% rename from ops/docker/conda_env/linux_sycl_test.yml rename to ops/conda_env/linux_sycl_test.yml diff --git a/ops/docker/conda_env/macos_cpu_test.yml b/ops/conda_env/macos_cpu_test.yml similarity index 100% rename from ops/docker/conda_env/macos_cpu_test.yml rename to ops/conda_env/macos_cpu_test.yml diff --git a/ops/docker/conda_env/python_lint.yml b/ops/conda_env/python_lint.yml similarity index 100% rename from ops/docker/conda_env/python_lint.yml rename to ops/conda_env/python_lint.yml diff --git a/ops/docker/conda_env/sdist_test.yml b/ops/conda_env/sdist_test.yml similarity index 100% rename from ops/docker/conda_env/sdist_test.yml rename to ops/conda_env/sdist_test.yml diff --git a/ops/docker/conda_env/win64_test.yml b/ops/conda_env/win64_test.yml similarity index 100% rename from ops/docker/conda_env/win64_test.yml rename to ops/conda_env/win64_test.yml diff --git a/ops/docker/dockerfile/Dockerfile.aarch64 b/ops/docker/dockerfile/Dockerfile.aarch64 index 8d6cfaca39fa..9dff2a05230b 100644 --- a/ops/docker/dockerfile/Dockerfile.aarch64 +++ b/ops/docker/dockerfile/Dockerfile.aarch64 @@ -32,7 +32,7 @@ RUN set -ex; \ # Default entry-point to use if running locally # It will preserve attributes of created files -COPY entrypoint.sh /scripts/ +COPY docker/entrypoint.sh /scripts/ WORKDIR /workspace ENTRYPOINT ["/scripts/entrypoint.sh"] diff --git a/ops/docker/dockerfile/Dockerfile.clang_tidy b/ops/docker/dockerfile/Dockerfile.clang_tidy index c9528015c17e..de7d9bd3f254 100644 --- a/ops/docker/dockerfile/Dockerfile.clang_tidy +++ b/ops/docker/dockerfile/Dockerfile.clang_tidy @@ -44,7 +44,7 @@ RUN set -ex; \ # Default entry-point to use if running locally # It will preserve attributes of created files -COPY entrypoint.sh /scripts/ +COPY docker/entrypoint.sh /scripts/ WORKDIR /workspace ENTRYPOINT ["/scripts/entrypoint.sh"] diff --git a/ops/docker/dockerfile/Dockerfile.cpu b/ops/docker/dockerfile/Dockerfile.cpu index 64b28026a89c..a426ce5da30c 100644 --- a/ops/docker/dockerfile/Dockerfile.cpu +++ b/ops/docker/dockerfile/Dockerfile.cpu @@ -51,7 +51,7 @@ RUN set -ex; \ # Default entry-point to use if running locally # It will preserve attributes of created files -COPY entrypoint.sh /scripts/ +COPY docker/entrypoint.sh /scripts/ WORKDIR /workspace ENTRYPOINT ["/scripts/entrypoint.sh"] diff --git a/ops/docker/dockerfile/Dockerfile.gpu b/ops/docker/dockerfile/Dockerfile.gpu index d8be4d3b07ef..96a532fc2ff1 100644 --- a/ops/docker/dockerfile/Dockerfile.gpu +++ b/ops/docker/dockerfile/Dockerfile.gpu @@ -48,7 +48,7 @@ RUN set -ex; \ # Default entry-point to use if running locally # It will preserve attributes of created files -COPY entrypoint.sh /scripts/ +COPY docker/entrypoint.sh /scripts/ WORKDIR /workspace ENTRYPOINT ["/scripts/entrypoint.sh"] diff --git a/ops/docker/dockerfile/Dockerfile.gpu_build_r_rockylinux8 b/ops/docker/dockerfile/Dockerfile.gpu_build_r_rockylinux8 index 7c1d4e8ef642..2d18b1eeb315 100644 --- a/ops/docker/dockerfile/Dockerfile.gpu_build_r_rockylinux8 +++ b/ops/docker/dockerfile/Dockerfile.gpu_build_r_rockylinux8 @@ -52,7 +52,7 @@ RUN set -ex; \ # Default entry-point to use if running locally # It will preserve attributes of created files -COPY entrypoint.sh /scripts/ +COPY docker/entrypoint.sh /scripts/ WORKDIR /workspace ENTRYPOINT ["/scripts/entrypoint.sh"] diff --git a/ops/docker/dockerfile/Dockerfile.jvm b/ops/docker/dockerfile/Dockerfile.jvm index c4584747f5db..9fd62e52de93 
100644 --- a/ops/docker/dockerfile/Dockerfile.jvm +++ b/ops/docker/dockerfile/Dockerfile.jvm @@ -37,7 +37,7 @@ RUN set -ex; \ # Default entry-point to use if running locally # It will preserve attributes of created files -COPY entrypoint.sh /scripts/ +COPY docker/entrypoint.sh /scripts/ WORKDIR /workspace ENTRYPOINT ["/scripts/entrypoint.sh"] diff --git a/ops/docker/dockerfile/Dockerfile.jvm_gpu_build b/ops/docker/dockerfile/Dockerfile.jvm_gpu_build index 7f0168df467f..4983493a6878 100644 --- a/ops/docker/dockerfile/Dockerfile.jvm_gpu_build +++ b/ops/docker/dockerfile/Dockerfile.jvm_gpu_build @@ -48,7 +48,7 @@ RUN set -ex; \ # Default entry-point to use if running locally # It will preserve attributes of created files -COPY entrypoint.sh /scripts/ +COPY docker/entrypoint.sh /scripts/ WORKDIR /workspace ENTRYPOINT ["/scripts/entrypoint.sh"] diff --git a/ops/docker/dockerfile/Dockerfile.manylinux2014_aarch64 b/ops/docker/dockerfile/Dockerfile.manylinux2014_aarch64 index 52baff43bb6f..7800033f552d 100644 --- a/ops/docker/dockerfile/Dockerfile.manylinux2014_aarch64 +++ b/ops/docker/dockerfile/Dockerfile.manylinux2014_aarch64 @@ -11,7 +11,7 @@ RUN set -ex; \ # Default entry-point to use if running locally # It will preserve attributes of created files -COPY entrypoint.sh /scripts/ +COPY docker/entrypoint.sh /scripts/ WORKDIR /workspace ENTRYPOINT ["/scripts/entrypoint.sh"] diff --git a/ops/docker/dockerfile/Dockerfile.manylinux2014_x86_64 b/ops/docker/dockerfile/Dockerfile.manylinux2014_x86_64 index fdfcbd277360..8214b598d8d4 100644 --- a/ops/docker/dockerfile/Dockerfile.manylinux2014_x86_64 +++ b/ops/docker/dockerfile/Dockerfile.manylinux2014_x86_64 @@ -11,7 +11,7 @@ RUN set -ex; \ # Default entry-point to use if running locally # It will preserve attributes of created files -COPY entrypoint.sh /scripts/ +COPY docker/entrypoint.sh /scripts/ WORKDIR /workspace ENTRYPOINT ["/scripts/entrypoint.sh"] diff --git a/ops/docker/dockerfile/Dockerfile.manylinux_2_28_x86_64 b/ops/docker/dockerfile/Dockerfile.manylinux_2_28_x86_64 index 5e264e2f16e6..f5dac54b9b8f 100644 --- a/ops/docker/dockerfile/Dockerfile.manylinux_2_28_x86_64 +++ b/ops/docker/dockerfile/Dockerfile.manylinux_2_28_x86_64 @@ -9,7 +9,7 @@ RUN set -ex; \ # Default entry-point to use if running locally # It will preserve attributes of created files -COPY entrypoint.sh /scripts/ +COPY docker/entrypoint.sh /scripts/ WORKDIR /workspace ENTRYPOINT ["/scripts/entrypoint.sh"] diff --git a/ops/docker_build.py b/ops/docker_build.py index 922d528814a4..b096d9201d0f 100644 --- a/ops/docker_build.py +++ b/ops/docker_build.py @@ -9,7 +9,7 @@ import sys from typing import Optional -from docker_run import SCRIPT_DIR, fancy_print_cli_args +from docker_run import OPS_DIR, fancy_print_cli_args def parse_build_args(raw_build_args: list[str]) -> list[dict[str, str]]: @@ -71,9 +71,9 @@ def docker_build( def main(args: argparse.Namespace) -> None: # Dockerfile to be used in docker build dockerfile_path = ( - SCRIPT_DIR / "docker" / "dockerfile" / f"Dockerfile.{args.container_def}" + OPS_DIR / "docker" / "dockerfile" / f"Dockerfile.{args.container_def}" ) - docker_context_path = SCRIPT_DIR / "docker" + docker_context_path = OPS_DIR build_args = parse_build_args(args.build_arg) diff --git a/ops/docker_run.py b/ops/docker_run.py index 161c81b477b0..41ec9acb17c2 100644 --- a/ops/docker_run.py +++ b/ops/docker_run.py @@ -12,8 +12,8 @@ import sys import textwrap -SCRIPT_DIR = pathlib.Path(__file__).expanduser().resolve().parent -PROJECT_ROOT_DIR = SCRIPT_DIR.parent 
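# The SCRIPT_DIR -> OPS_DIR rename pairs with the docker_build.py hunk above: the
# build context widens from ops/docker to ops/, and COPY sources in a Dockerfile are
# resolved relative to the context, hence every Dockerfile switching to
# `COPY docker/entrypoint.sh /scripts/`. Expressed as a plain docker CLI call
# (tag and container definition chosen for illustration only):
docker build \
  --file ops/docker/dockerfile/Dockerfile.cpu \
  --tag xgb-ci.cpu \
  ops/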
+OPS_DIR = pathlib.Path(__file__).expanduser().resolve().parent +PROJECT_ROOT_DIR = OPS_DIR.parent LINEWIDTH = 88 TEXT_WRAPPER = textwrap.TextWrapper( width=LINEWIDTH, diff --git a/ops/pipeline/build-jvm-macos-m1.sh b/ops/pipeline/build-jvm-macos-apple-silicon.sh similarity index 85% rename from ops/pipeline/build-jvm-macos-m1.sh rename to ops/pipeline/build-jvm-macos-apple-silicon.sh index 75785aa03eba..0c0aa6300729 100755 --- a/ops/pipeline/build-jvm-macos-m1.sh +++ b/ops/pipeline/build-jvm-macos-apple-silicon.sh @@ -1,5 +1,5 @@ #!/bin/bash -## Build libxgboost4j.dylib targeting MacOS +## Build libxgboost4j.dylib targeting MacOS (Apple Silicon) set -euox pipefail @@ -34,11 +34,11 @@ pushd lib libname=libxgboost4j_m1_${GITHUB_SHA}.dylib mv -v libxgboost4j.dylib ${libname} -if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] -then +# if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] +# then aws s3 cp ${libname} \ s3://xgboost-nightly-builds/${BRANCH_NAME}/libxgboost4j/ \ --acl public-read --no-progress -fi +# fi popd set +x diff --git a/ops/pipeline/build-jvm-macos-intel.sh b/ops/pipeline/build-jvm-macos-intel.sh new file mode 100755 index 000000000000..ee71a8b13078 --- /dev/null +++ b/ops/pipeline/build-jvm-macos-intel.sh @@ -0,0 +1,44 @@ +#!/bin/bash +## Build libxgboost4j.dylib targeting MacOS (Intel) + +set -euox pipefail + +source ops/pipeline/enforce-ci.sh + +# Display system info +echo "--- Display system information" +set -x +system_profiler SPSoftwareDataType +sysctl -n machdep.cpu.brand_string +uname -m +set +x + +brew install ninja libomp + +# Build XGBoost4J binary +echo "--- Build libxgboost4j.dylib" +set -x +mkdir build +pushd build +export JAVA_HOME=$(/usr/libexec/java_home) +cmake .. -GNinja -DJVM_BINDINGS=ON -DUSE_OPENMP=ON -DCMAKE_OSX_DEPLOYMENT_TARGET=10.15 +ninja -v +popd +rm -rf build +otool -L lib/libxgboost.dylib +set +x + +echo "--- Upload libxgboost4j.dylib" +set -x +pushd lib +libname=libxgboost4j_intel_${GITHUB_SHA}.dylib +mv -v libxgboost4j.dylib ${libname} + +# if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] +# then + aws s3 cp ${libname} \ + s3://xgboost-nightly-builds/${BRANCH_NAME}/libxgboost4j/ \ + --acl public-read --no-progress +# fi +popd +set +x diff --git a/ops/pipeline/build-python-wheels-macos.sh b/ops/pipeline/build-python-wheels-macos.sh old mode 100644 new mode 100755 index 3715ec9e7e0f..697514c0c3ad --- a/ops/pipeline/build-python-wheels-macos.sh +++ b/ops/pipeline/build-python-wheels-macos.sh @@ -30,7 +30,6 @@ if [[ "$platform_id" == macosx_* ]]; then # Set up environment variables to configure cibuildwheel export CIBW_BUILD=cp${cpython_ver}-${platform_id} export CIBW_ARCHS=${cibw_archs} - export CIBW_ENVIRONMENT=${setup_env_var} export CIBW_TEST_SKIP='*-macosx_arm64' export CIBW_BUILD_VERBOSITY=3 else diff --git a/ops/pipeline/test-win64-gpu.ps1 b/ops/pipeline/test-win64-gpu.ps1 index e4a55c77b2bd..2416d53b3f85 100644 --- a/ops/pipeline/test-win64-gpu.ps1 +++ b/ops/pipeline/test-win64-gpu.ps1 @@ -13,7 +13,7 @@ if ($LASTEXITCODE -ne 0) { throw "Last command failed" } Write-Host "--- Set up Python env" conda activate $env_name = -join("win64_", (New-Guid).ToString().replace("-", "")) -mamba env create -n ${env_name} --file=ops/docker/conda_env/win64_test.yml +mamba env create -n ${env_name} --file=ops/conda_env/win64_test.yml conda activate ${env_name} python -m pip install ` (Get-ChildItem python-package/dist/*.whl | Select-Object -Expand FullName) diff --git a/ops/script/build_via_cmake.sh 
b/ops/script/build_via_cmake.sh index 857ebbbec0c2..86e3677f4392 100755 --- a/ops/script/build_via_cmake.sh +++ b/ops/script/build_via_cmake.sh @@ -2,9 +2,16 @@ set -euo pipefail -if [[ "$1" == --conda-env=* ]] +if [[ "$#" -lt 1 ]] then - conda_env=$(echo "$1" | sed 's/^--conda-env=//g' -) + conda_env="" +else + conda_env="$1" +fi + +if [[ "${conda_env}" == --conda-env=* ]] +then + conda_env=$(echo "${conda_env}" | sed 's/^--conda-env=//g' -) echo "Activating Conda environment ${conda_env}" shift 1 cmake_args="$@" diff --git a/ops/script/lint_cmake.sh b/ops/script/lint_cmake.sh old mode 100644 new mode 100755 index d67ecd0844ed..55aeb20e8fb2 --- a/ops/script/lint_cmake.sh +++ b/ops/script/lint_cmake.sh @@ -1,6 +1,6 @@ #!/bin/bash -set -e +set -euo pipefail cmake_files=$( find . -name CMakeLists.txt -o -path "./cmake/*.cmake" \ diff --git a/ops/script/run_clang_tidy.py b/ops/script/run_clang_tidy.py index aaeccdaf3718..dca5d1069598 100755 --- a/ops/script/run_clang_tidy.py +++ b/ops/script/run_clang_tidy.py @@ -19,7 +19,9 @@ def call(args: list[str]) -> tuple[int, int, str, list[str]]: # `workspace` is a name used in the CI container. Normally we should keep the dir # as `xgboost`. matched = re.search( - "(workspace|xgboost)/.*(ops|src|tests|include)/.*warning:", error_msg, re.MULTILINE + "(workspace|xgboost)/.*(ops|src|tests|include)/.*warning:", + error_msg, + re.MULTILINE, ) if matched is None: diff --git a/tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py b/tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py index b7be3c44c1df..5746f33044e9 100644 --- a/tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py +++ b/tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py @@ -101,7 +101,7 @@ def is_df(part: T) -> T: X.columns = X.columns.astype("object") # Make sure the output can be integrated back to original dataframe X.columns = X.columns.astype("object") - # Work around https://github.com/dmlc/xgboost/issues/10752 + # Work around https://github.com/dmlc/xgboost/issues/10752 X["predict"] = predictions X["inplace_predict"] = series_predictions From 9f9db2b54d1ea836a485b9bb70110ff94a922f17 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Mon, 18 Nov 2024 18:51:05 -0800 Subject: [PATCH 35/86] Various fixes --- .github/dependabot.yml | 35 +++++++++++++++ .github/lock.yml | 32 ++++++++++++++ .github/workflows/freebsd.yml | 34 ++++++++++++++ .github/workflows/r_nold.yml | 44 +++++++++++++++++++ .github/workflows/sycl_tests.yml | 23 +++++----- ops/conda_env/linux_sycl_test.yml | 1 + .../Dockerfile.gpu_build_rockylinux8 | 2 +- ops/pipeline/build-jvm-doc-impl.sh | 2 +- 8 files changed, 160 insertions(+), 13 deletions(-) create mode 100644 .github/dependabot.yml create mode 100644 .github/lock.yml create mode 100644 .github/workflows/freebsd.yml create mode 100644 .github/workflows/r_nold.yml diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 000000000000..1a8098071ba3 --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,35 @@ +# To get started with Dependabot version updates, you'll need to specify which +# package ecosystems to update and where the package manifests are located. 
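# A note on the ops/script/build_via_cmake.sh hunk above: with `set -euo pipefail`,
# referencing $1 when the script receives no arguments aborts with an "unbound
# variable" error before the old `[[ "$1" == --conda-env=* ]]` test can even run;
# the patch therefore defaults conda_env first and only then inspects it.
# Minimal illustration of the failure and of the guard pattern:
bash -euc 'echo "$1"'      # aborts: $1 is unbound when no argument is given
bash -euc 'echo "${1:-}"'  # the ${var:-default} guard prints an empty line instead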
+# Please see the documentation for all configuration options: +# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates + +version: 2 +updates: + - package-ecosystem: "maven" + directory: "/jvm-packages" + schedule: + interval: "monthly" + - package-ecosystem: "maven" + directory: "/jvm-packages/xgboost4j" + schedule: + interval: "monthly" + - package-ecosystem: "maven" + directory: "/jvm-packages/xgboost4j-gpu" + schedule: + interval: "monthly" + - package-ecosystem: "maven" + directory: "/jvm-packages/xgboost4j-example" + schedule: + interval: "monthly" + - package-ecosystem: "maven" + directory: "/jvm-packages/xgboost4j-spark" + schedule: + interval: "monthly" + - package-ecosystem: "maven" + directory: "/jvm-packages/xgboost4j-spark-gpu" + schedule: + interval: "monthly" + - package-ecosystem: "github-actions" + directory: / + schedule: + interval: "monthly" diff --git a/.github/lock.yml b/.github/lock.yml new file mode 100644 index 000000000000..f916abe5a367 --- /dev/null +++ b/.github/lock.yml @@ -0,0 +1,32 @@ +# Configuration for lock-threads - https://github.com/dessant/lock-threads + +# Number of days of inactivity before a closed issue or pull request is locked +daysUntilLock: 90 + +# Issues and pull requests with these labels will not be locked. Set to `[]` to disable +exemptLabels: + - feature-request + +# Label to add before locking, such as `outdated`. Set to `false` to disable +lockLabel: false + +# Comment to post before locking. Set to `false` to disable +lockComment: false + +# Assign `resolved` as the reason for locking. Set to `false` to disable +setLockReason: true + +# Limit to only `issues` or `pulls` +# only: issues + +# Optionally, specify configuration settings just for `issues` or `pulls` +# issues: +# exemptLabels: +# - help-wanted +# lockLabel: outdated + +# pulls: +# daysUntilLock: 30 + +# Repository to extend settings from +# _extends: repo diff --git a/.github/workflows/freebsd.yml b/.github/workflows/freebsd.yml new file mode 100644 index 000000000000..d3208a1294d1 --- /dev/null +++ b/.github/workflows/freebsd.yml @@ -0,0 +1,34 @@ +name: FreeBSD + +on: [push, pull_request] + +permissions: + contents: read # to fetch code (actions/checkout) + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +jobs: + test: + runs-on: ubuntu-latest + timeout-minutes: 20 + name: A job to run test in FreeBSD + steps: + - uses: actions/checkout@v4 + with: + submodules: 'true' + - name: Test in FreeBSD + id: test + uses: vmactions/freebsd-vm@v1 + with: + usesh: true + prepare: | + pkg install -y cmake git ninja googletest + + run: | + mkdir build + cd build + cmake .. -GNinja -DGOOGLE_TEST=ON + ninja -v + ./testxgboost diff --git a/.github/workflows/r_nold.yml b/.github/workflows/r_nold.yml new file mode 100644 index 000000000000..4b506927e06c --- /dev/null +++ b/.github/workflows/r_nold.yml @@ -0,0 +1,44 @@ +# Run expensive R tests with the help of rhub. 
Only triggered by a pull request review +# See discussion at https://github.com/dmlc/xgboost/pull/6378 + +name: XGBoost-R-noLD + +on: + pull_request_review_comment: + types: [created] + +permissions: + contents: read # to fetch code (actions/checkout) + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +jobs: + test-R-noLD: + if: github.event.comment.body == '/gha run r-nold-test' && contains('OWNER,MEMBER,COLLABORATOR', github.event.comment.author_association) + timeout-minutes: 120 + runs-on: ubuntu-latest + container: + image: rhub/debian-gcc-devel-nold + steps: + - name: Install git and system packages + shell: bash + run: | + apt update && apt install libcurl4-openssl-dev libssl-dev libssh2-1-dev libgit2-dev libglpk-dev libxml2-dev libharfbuzz-dev libfribidi-dev git -y + + - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 + with: + submodules: 'true' + + - name: Install dependencies + shell: bash -l {0} + run: | + /tmp/R-devel/bin/Rscript -e "source('./R-package/tests/helper_scripts/install_deps.R')" + + - name: Run R tests + shell: bash + run: | + cd R-package && \ + /tmp/R-devel/bin/R CMD INSTALL . && \ + /tmp/R-devel/bin/R -q -e "library(testthat); setwd('tests'); source('testthat.R')" diff --git a/.github/workflows/sycl_tests.yml b/.github/workflows/sycl_tests.yml index 54ebcb5f9532..b317050fc652 100644 --- a/.github/workflows/sycl_tests.yml +++ b/.github/workflows/sycl_tests.yml @@ -5,6 +5,10 @@ on: [push, pull_request] permissions: contents: read # to fetch code (actions/checkout) +defaults: + run: + shell: bash -l {0} + concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} cancel-in-progress: true @@ -37,20 +41,16 @@ jobs: conda info conda list - name: Build and install XGBoost - shell: bash -l {0} run: | mkdir build cd build - cmake .. -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DPLUGIN_SYCL=ON -DCMAKE_CXX_COMPILER=g++ -DCMAKE_C_COMPILER=gcc -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX - make -j$(nproc) - - name: Run gtest binary for SYCL - run: | - cd build - ./testxgboost --gtest_filter=Sycl* - - name: Run gtest binary for non SYCL + cmake .. -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DPLUGIN_SYCL=ON -DCMAKE_CXX_COMPILER=g++ \ + -DCMAKE_C_COMPILER=gcc -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX -GNinja + ninja + - name: Run gtest run: | cd build - ./testxgboost --gtest_filter=-Sycl* + ./testxgboost python-sycl-tests-on-ubuntu: name: Test XGBoost Python package with SYCL on ${{ matrix.os }} @@ -82,8 +82,9 @@ jobs: run: | mkdir build cd build - cmake .. -DPLUGIN_SYCL=ON -DCMAKE_CXX_COMPILER=g++ -DCMAKE_C_COMPILER=gcc -DCMAKE_PREFIX_PATH=$CONDA_PREFIX - make -j$(nproc) + cmake .. 
-DPLUGIN_SYCL=ON -DCMAKE_CXX_COMPILER=g++ -DCMAKE_C_COMPILER=gcc \ + -DCMAKE_PREFIX_PATH=$CONDA_PREFIX -GNinja + ninja - name: Install Python package run: | cd python-package diff --git a/ops/conda_env/linux_sycl_test.yml b/ops/conda_env/linux_sycl_test.yml index 5b3a15f7e3b1..f1ce49492d42 100644 --- a/ops/conda_env/linux_sycl_test.yml +++ b/ops/conda_env/linux_sycl_test.yml @@ -18,6 +18,7 @@ dependencies: - pytest-timeout - pytest-cov - dask +- ninja - dpcpp_linux-64 - onedpl-devel - intel-openmp diff --git a/ops/docker/dockerfile/Dockerfile.gpu_build_rockylinux8 b/ops/docker/dockerfile/Dockerfile.gpu_build_rockylinux8 index d021190b6744..ae79e88b15b3 100644 --- a/ops/docker/dockerfile/Dockerfile.gpu_build_rockylinux8 +++ b/ops/docker/dockerfile/Dockerfile.gpu_build_rockylinux8 @@ -76,7 +76,7 @@ RUN set -ex; \ # Default entry-point to use if running locally # It will preserve attributes of created files -COPY entrypoint.sh /scripts/ +COPY docker/entrypoint.sh /scripts/ WORKDIR /workspace ENTRYPOINT ["/scripts/entrypoint.sh"] diff --git a/ops/pipeline/build-jvm-doc-impl.sh b/ops/pipeline/build-jvm-doc-impl.sh index c334b8ad91d1..4e95f284e25c 100755 --- a/ops/pipeline/build-jvm-doc-impl.sh +++ b/ops/pipeline/build-jvm-doc-impl.sh @@ -27,7 +27,7 @@ mvn --no-transfer-progress javadoc:javadoc -Pdocs # Package JVM docs in a tarball mkdir -p tmp/scaladocs -cp -rv xgboost4j/target/site/apidocs/ ./tmp/javadocs/ +cp -rv xgboost4j/target/reports/apidocs/ ./tmp/javadocs/ cp -rv xgboost4j/target/site/scaladocs/ ./tmp/scaladocs/xgboost4j/ cp -rv xgboost4j-spark/target/site/scaladocs/ ./tmp/scaladocs/xgboost4j-spark/ cp -rv xgboost4j-spark-gpu/target/site/scaladocs/ ./tmp/scaladocs/xgboost4j-spark-gpu/ From dce4f7ba55b78a7754b6c9ee2e08460ea3ed825c Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Mon, 18 Nov 2024 19:40:22 -0800 Subject: [PATCH 36/86] Disable dependabot for now --- .github/dependabot.yml | 35 ----------------------------------- 1 file changed, 35 deletions(-) delete mode 100644 .github/dependabot.yml diff --git a/.github/dependabot.yml b/.github/dependabot.yml deleted file mode 100644 index 1a8098071ba3..000000000000 --- a/.github/dependabot.yml +++ /dev/null @@ -1,35 +0,0 @@ -# To get started with Dependabot version updates, you'll need to specify which -# package ecosystems to update and where the package manifests are located. 
-# Please see the documentation for all configuration options: -# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates - -version: 2 -updates: - - package-ecosystem: "maven" - directory: "/jvm-packages" - schedule: - interval: "monthly" - - package-ecosystem: "maven" - directory: "/jvm-packages/xgboost4j" - schedule: - interval: "monthly" - - package-ecosystem: "maven" - directory: "/jvm-packages/xgboost4j-gpu" - schedule: - interval: "monthly" - - package-ecosystem: "maven" - directory: "/jvm-packages/xgboost4j-example" - schedule: - interval: "monthly" - - package-ecosystem: "maven" - directory: "/jvm-packages/xgboost4j-spark" - schedule: - interval: "monthly" - - package-ecosystem: "maven" - directory: "/jvm-packages/xgboost4j-spark-gpu" - schedule: - interval: "monthly" - - package-ecosystem: "github-actions" - directory: / - schedule: - interval: "monthly" From dfd5624d181ba550d2719eca06652f15702ffa79 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Tue, 19 Nov 2024 08:09:48 -0800 Subject: [PATCH 37/86] Fixes --- .github/workflows/freebsd.yml | 32 ++-- .github/workflows/i386.yml | 44 +++--- .github/workflows/jvm_tests.yml | 91 +++++------ .github/workflows/lint.yml | 122 +++++++-------- .github/workflows/misc.yml | 172 ++++++++++---------- .github/workflows/python_tests.yml | 182 +++++++++++----------- .github/workflows/python_wheels_macos.yml | 46 +++--- .github/workflows/r_nold.yml | 40 ++--- .github/workflows/r_tests.yml | 152 +++++++++--------- .github/workflows/sycl_tests.yml | 58 +++---- .github/workflows/update_rapids.yml | 32 ++-- 11 files changed, 487 insertions(+), 484 deletions(-) diff --git a/.github/workflows/freebsd.yml b/.github/workflows/freebsd.yml index d3208a1294d1..d0eb13c20fb6 100644 --- a/.github/workflows/freebsd.yml +++ b/.github/workflows/freebsd.yml @@ -15,20 +15,20 @@ jobs: timeout-minutes: 20 name: A job to run test in FreeBSD steps: - - uses: actions/checkout@v4 - with: - submodules: 'true' - - name: Test in FreeBSD - id: test - uses: vmactions/freebsd-vm@v1 - with: - usesh: true - prepare: | - pkg install -y cmake git ninja googletest + - uses: actions/checkout@v4 + with: + submodules: 'true' + - name: Test in FreeBSD + id: test + uses: vmactions/freebsd-vm@v1 + with: + usesh: true + prepare: | + pkg install -y cmake git ninja googletest - run: | - mkdir build - cd build - cmake .. -GNinja -DGOOGLE_TEST=ON - ninja -v - ./testxgboost + run: | + mkdir build + cd build + cmake .. -GNinja -DGOOGLE_TEST=ON + ninja -v + ./testxgboost diff --git a/.github/workflows/i386.yml b/.github/workflows/i386.yml index aa71147e2581..455d6ea91033 100644 --- a/.github/workflows/i386.yml +++ b/.github/workflows/i386.yml @@ -19,25 +19,25 @@ jobs: ports: - 5000:5000 steps: - - uses: actions/checkout@v4.2.2 - with: - submodules: 'true' - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3.7.1 - with: - driver-opts: network=host - - name: Build and push container - uses: docker/build-push-action@v6 - with: - context: . 
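# The registry service plus `driver-opts: network=host` (set on the buildx step
# above) is what makes this push work: buildx builds run inside a BuildKit
# container, so the image is pushed to the job-local registry at localhost:5000,
# which host networking makes reachable, and the later `docker run` step pulls the
# tag back from that registry. To inspect what the job-local registry holds, e.g.:
curl -s http://localhost:5000/v2/xgboost/build-32bit/tags/list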
- file: ops/docker/dockerfile/Dockerfile.i386 - push: true - tags: localhost:5000/xgboost/build-32bit:latest - cache-from: type=gha - cache-to: type=gha,mode=max - - name: Build XGBoost - run: | - docker run --rm -v $PWD:/workspace -w /workspace \ - -e CXXFLAGS='-Wno-error=overloaded-virtual -Wno-error=maybe-uninitialized -Wno-error=redundant-move' \ - localhost:5000/xgboost/build-32bit:latest \ - bash ops/script/build_via_cmake.sh + - uses: actions/checkout@v4.2.2 + with: + submodules: 'true' + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3.7.1 + with: + driver-opts: network=host + - name: Build and push container + uses: docker/build-push-action@v6 + with: + context: . + file: ops/docker/dockerfile/Dockerfile.i386 + push: true + tags: localhost:5000/xgboost/build-32bit:latest + cache-from: type=gha + cache-to: type=gha,mode=max + - name: Build XGBoost + run: | + docker run --rm -v $PWD:/workspace -w /workspace \ + -e CXXFLAGS='-Wno-error=overloaded-virtual -Wno-error=maybe-uninitialized -Wno-error=redundant-move' \ + localhost:5000/xgboost/build-32bit:latest \ + bash ops/script/build_via_cmake.sh diff --git a/.github/workflows/jvm_tests.yml b/.github/workflows/jvm_tests.yml index f9385fa4acaf..ab21e2f19466 100644 --- a/.github/workflows/jvm_tests.yml +++ b/.github/workflows/jvm_tests.yml @@ -110,6 +110,9 @@ jobs: with: submodules: "true" - run: bash ${{ matrix.script }} + env: + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID_IAM_S3_UPLOADER }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY_IAM_S3_UPLOADER }} build-jvm-docs: name: Build docs for JVM packages @@ -169,50 +172,50 @@ jobs: os: [windows-latest, macos-13] steps: - - uses: actions/checkout@v4.2.2 - with: - submodules: 'true' - - - uses: actions/setup-java@v4.5.0 - with: - distribution: 'temurin' - java-version: '8' - - - uses: conda-incubator/setup-miniconda@v3.1.0 - with: - miniforge-variant: Miniforge3 - miniforge-version: latest - activate-environment: jvm_tests - environment-file: ops/conda_env/jvm_tests.yml - use-mamba: true - - - name: Cache Maven packages - uses: actions/cache@v4.1.2 - with: - path: ~/.m2 - key: ${{ runner.os }}-m2-${{ hashFiles('./jvm-packages/pom.xml') }} - restore-keys: ${{ runner.os }}-m2-${{ hashFiles('./jvm-packages/pom.xml') }} - - - name: Test XGBoost4J (Core) - run: | - cd jvm-packages - mvn test -B -pl :xgboost4j_2.12 - - - name: Publish artifact xgboost4j.dll to S3 - run: | - cd lib/ - Rename-Item -Path xgboost4j.dll -NewName xgboost4j_${{ github.sha }}.dll - dir - python -m awscli s3 cp xgboost4j_${{ github.sha }}.dll ` - s3://xgboost-nightly-builds/${{ env.BRANCH_NAME }}/libxgboost4j/ ` - --acl public-read --region us-west-2 - if: matrix.os == 'windows-latest' - # if: | - # (github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')) && - # matrix.os == 'windows-latest' - env: - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID_IAM_S3_UPLOADER }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY_IAM_S3_UPLOADER }} + - uses: actions/checkout@v4.2.2 + with: + submodules: 'true' + + - uses: actions/setup-java@v4.5.0 + with: + distribution: 'temurin' + java-version: '8' + + - uses: conda-incubator/setup-miniconda@v3.1.0 + with: + miniforge-variant: Miniforge3 + miniforge-version: latest + activate-environment: jvm_tests + environment-file: ops/conda_env/jvm_tests.yml + use-mamba: true + + - name: Cache Maven packages + uses: actions/cache@v4.1.2 + with: + path: ~/.m2 + key: ${{ runner.os }}-m2-${{ hashFiles('./jvm-packages/pom.xml') 
}} + restore-keys: ${{ runner.os }}-m2-${{ hashFiles('./jvm-packages/pom.xml') }} + + - name: Test XGBoost4J (Core) + run: | + cd jvm-packages + mvn test -B -pl :xgboost4j_2.12 + + - name: Publish artifact xgboost4j.dll to S3 + run: | + cd lib/ + Rename-Item -Path xgboost4j.dll -NewName xgboost4j_${{ github.sha }}.dll + dir + python -m awscli s3 cp xgboost4j_${{ github.sha }}.dll ` + s3://xgboost-nightly-builds/${{ env.BRANCH_NAME }}/libxgboost4j/ ` + --acl public-read --region us-west-2 + if: matrix.os == 'windows-latest' + # if: | + # (github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')) && + # matrix.os == 'windows-latest' + env: + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID_IAM_S3_UPLOADER }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY_IAM_S3_UPLOADER }} test-jvm-packages-gpu: name: Test JVM packages with CUDA diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index caceb3e3893b..59b3cecf57ed 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -58,52 +58,52 @@ jobs: runs-on: ubuntu-latest name: Type and format checks for the Python package steps: - - uses: actions/checkout@v4.2.2 - with: - submodules: 'true' - - uses: conda-incubator/setup-miniconda@v3.1.0 - with: - miniforge-variant: Miniforge3 - miniforge-version: latest - activate-environment: python_lint - environment-file: ops/conda_env/python_lint.yml - use-mamba: true - - name: Display Conda env - shell: bash -el {0} - run: | - conda info - conda list - - name: Run mypy - shell: bash -el {0} - run: | - python ops/script/lint_python.py --format=0 --type-check=1 --pylint=0 - - name: Run formatter - shell: bash -el {0} - run: | - python ops/script/lint_python.py --format=1 --type-check=0 --pylint=0 - - name: Run pylint - shell: bash -el {0} - run: | - python ops/script/lint_python.py --format=0 --type-check=0 --pylint=1 + - uses: actions/checkout@v4.2.2 + with: + submodules: 'true' + - uses: conda-incubator/setup-miniconda@v3.1.0 + with: + miniforge-variant: Miniforge3 + miniforge-version: latest + activate-environment: python_lint + environment-file: ops/conda_env/python_lint.yml + use-mamba: true + - name: Display Conda env + shell: bash -el {0} + run: | + conda info + conda list + - name: Run mypy + shell: bash -el {0} + run: | + python ops/script/lint_python.py --format=0 --type-check=1 --pylint=0 + - name: Run formatter + shell: bash -el {0} + run: | + python ops/script/lint_python.py --format=1 --type-check=0 --pylint=0 + - name: Run pylint + shell: bash -el {0} + run: | + python ops/script/lint_python.py --format=0 --type-check=0 --pylint=1 cpp-lint: runs-on: ubuntu-latest name: Code linting for C++ steps: - - uses: actions/checkout@v4.2.2 - with: - submodules: 'true' - - uses: actions/setup-python@v5.3.0 - with: - python-version: "3.10" - architecture: 'x64' - - name: Install Python packages - run: | - python -m pip install wheel setuptools cmakelint cpplint==1.6.1 pylint - - name: Run lint - run: | - python3 ops/script/lint_cpp.py - bash ops/script/lint_cmake.sh + - uses: actions/checkout@v4.2.2 + with: + submodules: 'true' + - uses: actions/setup-python@v5.3.0 + with: + python-version: "3.10" + architecture: 'x64' + - name: Install Python packages + run: | + python -m pip install wheel setuptools cmakelint cpplint==1.6.1 pylint + - name: Run lint + run: | + python3 ops/script/lint_cpp.py + bash ops/script/lint_cmake.sh lintr: runs-on: ${{ matrix.os }} @@ -118,27 +118,27 @@ jobs: R_REMOTES_NO_ERRORS_FROM_WARNINGS: true steps: - - 
uses: actions/checkout@v4.2.2 - with: - submodules: 'true' + - uses: actions/checkout@v4.2.2 + with: + submodules: 'true' - - uses: r-lib/actions/setup-r@v2.11.0 - with: - r-version: ${{ matrix.r }} + - uses: r-lib/actions/setup-r@v2.11.0 + with: + r-version: ${{ matrix.r }} - - name: Cache R packages - uses: actions/cache@v4.1.2 - with: - path: ${{ env.R_LIBS_USER }} - key: ${{ runner.os }}-r-${{ matrix.r }}-7-${{ hashFiles('R-package/DESCRIPTION') }} - restore-keys: ${{ runner.os }}-r-${{ matrix.r }}-7-${{ hashFiles('R-package/DESCRIPTION') }} + - name: Cache R packages + uses: actions/cache@v4.1.2 + with: + path: ${{ env.R_LIBS_USER }} + key: ${{ runner.os }}-r-${{ matrix.r }}-7-${{ hashFiles('R-package/DESCRIPTION') }} + restore-keys: ${{ runner.os }}-r-${{ matrix.r }}-7-${{ hashFiles('R-package/DESCRIPTION') }} - - name: Install dependencies - shell: Rscript {0} - run: | - source("./R-package/tests/helper_scripts/install_deps.R") + - name: Install dependencies + shell: Rscript {0} + run: | + source("./R-package/tests/helper_scripts/install_deps.R") - - name: Run lintr - run: | - MAKEFLAGS="-j$(nproc)" R CMD INSTALL R-package/ - Rscript ops/script/lint_r.R $(pwd) + - name: Run lintr + run: | + MAKEFLAGS="-j$(nproc)" R CMD INSTALL R-package/ + Rscript ops/script/lint_r.R $(pwd) diff --git a/.github/workflows/misc.yml b/.github/workflows/misc.yml index 7294faa0d93b..b1b92c1528b7 100644 --- a/.github/workflows/misc.yml +++ b/.github/workflows/misc.yml @@ -22,23 +22,23 @@ jobs: matrix: os: [macos-13] steps: - - uses: actions/checkout@v4.2.2 - with: - submodules: 'true' - - name: Install system packages - run: | - brew install ninja libomp - - name: Build gtest binary - run: | - mkdir build - cd build - cmake .. -DGOOGLE_TEST=ON -DUSE_OPENMP=ON -DUSE_DMLC_GTEST=ON -GNinja -DBUILD_DEPRECATED_CLI=ON -DUSE_SANITIZER=ON -DENABLED_SANITIZERS=address -DCMAKE_BUILD_TYPE=RelWithDebInfo - ninja -v - - name: Run gtest binary - run: | - cd build - ./testxgboost - ctest -R TestXGBoostCLI --extra-verbose + - uses: actions/checkout@v4.2.2 + with: + submodules: 'true' + - name: Install system packages + run: | + brew install ninja libomp + - name: Build gtest binary + run: | + mkdir build + cd build + cmake .. -DGOOGLE_TEST=ON -DUSE_OPENMP=ON -DUSE_DMLC_GTEST=ON -GNinja -DBUILD_DEPRECATED_CLI=ON -DUSE_SANITIZER=ON -DENABLED_SANITIZERS=address -DCMAKE_BUILD_TYPE=RelWithDebInfo + ninja -v + - name: Run gtest binary + run: | + cd build + ./testxgboost + ctest -R TestXGBoostCLI --extra-verbose gtest-cpu-nonomp: name: Test Google C++ unittest (CPU Non-OMP) @@ -48,23 +48,23 @@ jobs: matrix: os: [ubuntu-latest] steps: - - uses: actions/checkout@v4.2.2 - with: - submodules: 'true' - - name: Install system packages - run: | - sudo apt-get install -y --no-install-recommends ninja-build - - name: Build and install XGBoost - shell: bash -l {0} - run: | - mkdir build - cd build - cmake .. -GNinja -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DUSE_OPENMP=OFF -DBUILD_DEPRECATED_CLI=ON - ninja -v - - name: Run gtest binary - run: | - cd build - ctest --extra-verbose + - uses: actions/checkout@v4.2.2 + with: + submodules: 'true' + - name: Install system packages + run: | + sudo apt-get install -y --no-install-recommends ninja-build + - name: Build and install XGBoost + shell: bash -l {0} + run: | + mkdir build + cd build + cmake .. 
-GNinja -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DUSE_OPENMP=OFF -DBUILD_DEPRECATED_CLI=ON + ninja -v + - name: Run gtest binary + run: | + cd build + ctest --extra-verbose c-api-demo: name: Test installing XGBoost lib + building the C API demo @@ -78,56 +78,56 @@ jobs: os: ["ubuntu-latest"] python-version: ["3.10"] steps: - - uses: actions/checkout@v4.2.2 - with: - submodules: 'true' - - uses: conda-incubator/setup-miniconda@v3.1.0 - with: - miniforge-variant: Miniforge3 - miniforge-version: latest - activate-environment: cpp_test - environment-file: ops/conda_env/cpp_test.yml - use-mamba: true - - name: Display Conda env - run: | - conda info - conda list - - name: Build and install XGBoost static library - run: | - mkdir build - cd build - cmake .. -DBUILD_STATIC_LIB=ON -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX -GNinja - ninja -v install - cd - - - name: Build and run C API demo with static - run: | - pushd . - cd demo/c-api/ - mkdir build - cd build - cmake .. -GNinja -DCMAKE_PREFIX_PATH=$CONDA_PREFIX - ninja -v - ctest - cd .. - rm -rf ./build - popd + - uses: actions/checkout@v4.2.2 + with: + submodules: 'true' + - uses: conda-incubator/setup-miniconda@v3.1.0 + with: + miniforge-variant: Miniforge3 + miniforge-version: latest + activate-environment: cpp_test + environment-file: ops/conda_env/cpp_test.yml + use-mamba: true + - name: Display Conda env + run: | + conda info + conda list + - name: Build and install XGBoost static library + run: | + mkdir build + cd build + cmake .. -DBUILD_STATIC_LIB=ON -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX -GNinja + ninja -v install + cd - + - name: Build and run C API demo with static + run: | + pushd . + cd demo/c-api/ + mkdir build + cd build + cmake .. -GNinja -DCMAKE_PREFIX_PATH=$CONDA_PREFIX + ninja -v + ctest + cd .. + rm -rf ./build + popd - - name: Build and install XGBoost shared library - run: | - cd build - cmake .. -DBUILD_STATIC_LIB=OFF -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX -GNinja -DPLUGIN_FEDERATED=ON -DGOOGLE_TEST=ON - ninja -v install - ./testxgboost - cd - - - name: Build and run C API demo with shared - run: | - pushd . - cd demo/c-api/ - mkdir build - cd build - cmake .. -GNinja -DCMAKE_PREFIX_PATH=$CONDA_PREFIX - ninja -v - ctest - popd - ./ops/script/verify_link.sh ./demo/c-api/build/basic/api-demo - ./ops/script/verify_link.sh ./demo/c-api/build/external-memory/external-memory-demo + - name: Build and install XGBoost shared library + run: | + cd build + cmake .. -DBUILD_STATIC_LIB=OFF -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX -GNinja -DPLUGIN_FEDERATED=ON -DGOOGLE_TEST=ON + ninja -v install + ./testxgboost + cd - + - name: Build and run C API demo with shared + run: | + pushd . + cd demo/c-api/ + mkdir build + cd build + cmake .. 
-GNinja -DCMAKE_PREFIX_PATH=$CONDA_PREFIX + ninja -v + ctest + popd + ./ops/script/verify_link.sh ./demo/c-api/build/basic/api-demo + ./ops/script/verify_link.sh ./demo/c-api/build/external-memory/external-memory-demo diff --git a/.github/workflows/python_tests.yml b/.github/workflows/python_tests.yml index c43d8b056c8d..344e2f276b22 100644 --- a/.github/workflows/python_tests.yml +++ b/.github/workflows/python_tests.yml @@ -22,28 +22,28 @@ jobs: matrix: os: [ubuntu-latest] steps: - - uses: actions/checkout@v4.2.2 - with: - submodules: 'true' - - uses: conda-incubator/setup-miniconda@v3.1.0 - with: - miniforge-variant: Miniforge3 - miniforge-version: latest - activate-environment: sdist_test - environment-file: ops/conda_env/sdist_test.yml - use-mamba: true - - name: Display Conda env - run: | - conda info - conda list - - name: Build and install XGBoost - run: | - cd python-package - python --version - python -m build --sdist - pip install -v ./dist/xgboost-*.tar.gz --config-settings use_openmp=False - cd .. - python -c 'import xgboost' + - uses: actions/checkout@v4.2.2 + with: + submodules: 'true' + - uses: conda-incubator/setup-miniconda@v3.1.0 + with: + miniforge-variant: Miniforge3 + miniforge-version: latest + activate-environment: sdist_test + environment-file: ops/conda_env/sdist_test.yml + use-mamba: true + - name: Display Conda env + run: | + conda info + conda list + - name: Build and install XGBoost + run: | + cd python-package + python --version + python -m build --sdist + pip install -v ./dist/xgboost-*.tar.gz --config-settings use_openmp=False + cd .. + python -c 'import xgboost' python-sdist-test: # Use system toolchain instead of conda toolchain for macos and windows. @@ -56,33 +56,33 @@ jobs: os: [macos-13, windows-latest] python-version: ["3.10"] steps: - - uses: actions/checkout@v4.2.2 - with: - submodules: 'true' - - name: Install osx system dependencies - if: matrix.os == 'macos-13' - run: | - brew install ninja libomp - - uses: conda-incubator/setup-miniconda@v3.1.0 - with: - auto-update-conda: true - python-version: ${{ matrix.python-version }} - activate-environment: test - - name: Install build - run: | - conda install -c conda-forge python-build - - name: Display Conda env - run: | - conda info - conda list - - name: Build and install XGBoost - run: | - cd python-package - python --version - python -m build --sdist - pip install -v ./dist/xgboost-*.tar.gz - cd .. - python -c 'import xgboost' + - uses: actions/checkout@v4.2.2 + with: + submodules: 'true' + - name: Install osx system dependencies + if: matrix.os == 'macos-13' + run: | + brew install ninja libomp + - uses: conda-incubator/setup-miniconda@v3.1.0 + with: + auto-update-conda: true + python-version: ${{ matrix.python-version }} + activate-environment: test + - name: Install build + run: | + conda install -c conda-forge python-build + - name: Display Conda env + run: | + conda info + conda list + - name: Build and install XGBoost + run: | + cd python-package + python --version + python -m build --sdist + pip install -v ./dist/xgboost-*.tar.gz + cd .. 
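+          # Minimal smoke test below: importing the package only verifies that
+          # the sdist built with the system toolchain produced a loadable
+          # native library. An equally small, illustrative variant (assumed,
+          # not required by the workflow) also prints the resolved version:
+          #   python -c 'import xgboost; print(xgboost.__version__)'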
+ python -c 'import xgboost' python-tests-on-macos: name: Test XGBoost Python package on ${{ matrix.os }} @@ -94,48 +94,48 @@ jobs: os: [macos-13] steps: - - uses: actions/checkout@v4.2.2 - with: - submodules: 'true' - - - uses: conda-incubator/setup-miniconda@v3.1.0 - with: - miniforge-variant: Miniforge3 - miniforge-version: latest - activate-environment: macos_cpu_test - environment-file: ops/conda_env/macos_cpu_test.yml - use-mamba: true - - - name: Display Conda env - run: | - conda info - conda list - - - name: Build XGBoost on macos - run: | - brew install ninja - - mkdir build - cd build - # Set prefix, to use OpenMP library from Conda env - # See https://github.com/dmlc/xgboost/issues/7039#issuecomment-1025038228 - # to learn why we don't use libomp from Homebrew. - cmake .. -GNinja -DCMAKE_PREFIX_PATH=$CONDA_PREFIX -DBUILD_DEPRECATED_CLI=ON - ninja - - - name: Install Python package - run: | - cd python-package - python --version - pip install -v . - - - name: Test Python package - run: | - pytest -s -v -rxXs --durations=0 ./tests/python - - - name: Test Dask Interface - run: | - pytest -s -v -rxXs --durations=0 ./tests/test_distributed/test_with_dask + - uses: actions/checkout@v4.2.2 + with: + submodules: 'true' + + - uses: conda-incubator/setup-miniconda@v3.1.0 + with: + miniforge-variant: Miniforge3 + miniforge-version: latest + activate-environment: macos_cpu_test + environment-file: ops/conda_env/macos_cpu_test.yml + use-mamba: true + + - name: Display Conda env + run: | + conda info + conda list + + - name: Build XGBoost on macos + run: | + brew install ninja + + mkdir build + cd build + # Set prefix, to use OpenMP library from Conda env + # See https://github.com/dmlc/xgboost/issues/7039#issuecomment-1025038228 + # to learn why we don't use libomp from Homebrew. + cmake .. -GNinja -DCMAKE_PREFIX_PATH=$CONDA_PREFIX -DBUILD_DEPRECATED_CLI=ON + ninja + + - name: Install Python package + run: | + cd python-package + python --version + pip install -v . 
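+          # Optional sanity check (illustrative; the library path is an
+          # assumption about the build tree): confirm the native library links
+          # against the Conda env's libomp rather than Homebrew's, per the
+          # note in the build step above:
+          #   otool -L ../lib/libxgboost.dylib | grep -i omp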
+ + - name: Test Python package + run: | + pytest -s -v -rxXs --durations=0 ./tests/python + + - name: Test Dask Interface + run: | + pytest -s -v -rxXs --durations=0 ./tests/test_distributed/test_with_dask python-system-installation-on-ubuntu: name: Test XGBoost Python package System Installation on ${{ matrix.os }} diff --git a/.github/workflows/python_wheels_macos.yml b/.github/workflows/python_wheels_macos.yml index a4cff8eb0e6f..f0f5042660df 100644 --- a/.github/workflows/python_wheels_macos.yml +++ b/.github/workflows/python_wheels_macos.yml @@ -30,26 +30,26 @@ jobs: - os: macos-14 platform_id: macosx_arm64 steps: - - uses: actions/checkout@v4.2.2 - with: - submodules: 'true' - - name: Set up homebrew - uses: Homebrew/actions/setup-homebrew@13341b4d5e459a98bbe0b122b12c11bf90518cc8 - - name: Install libomp - run: brew install libomp - - uses: conda-incubator/setup-miniconda@v3.1.0 - with: - miniforge-variant: Miniforge3 - miniforge-version: latest - python-version: "3.10" - use-mamba: true - - name: Build wheels - run: bash ops/pipeline/build-python-wheels-macos.sh ${{ matrix.platform_id }} ${{ github.sha }} - - name: Upload Python wheel - # if: github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_') - run: | - python -m pip install awscli - python -m awscli s3 cp wheelhouse/*.whl s3://xgboost-nightly-builds/${{ env.BRANCH_NAME }}/ --acl public-read --region us-west-2 - env: - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID_IAM_S3_UPLOADER }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY_IAM_S3_UPLOADER }} + - uses: actions/checkout@v4.2.2 + with: + submodules: 'true' + - name: Set up homebrew + uses: Homebrew/actions/setup-homebrew@13341b4d5e459a98bbe0b122b12c11bf90518cc8 + - name: Install libomp + run: brew install libomp + - uses: conda-incubator/setup-miniconda@v3.1.0 + with: + miniforge-variant: Miniforge3 + miniforge-version: latest + python-version: "3.10" + use-mamba: true + - name: Build wheels + run: bash ops/pipeline/build-python-wheels-macos.sh ${{ matrix.platform_id }} ${{ github.sha }} + - name: Upload Python wheel + # if: github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_') + run: | + python -m pip install awscli + python -m awscli s3 cp wheelhouse/*.whl s3://xgboost-nightly-builds/${{ env.BRANCH_NAME }}/ --acl public-read --region us-west-2 + env: + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID_IAM_S3_UPLOADER }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY_IAM_S3_UPLOADER }} diff --git a/.github/workflows/r_nold.yml b/.github/workflows/r_nold.yml index 4b506927e06c..6ff4aa079e95 100644 --- a/.github/workflows/r_nold.yml +++ b/.github/workflows/r_nold.yml @@ -22,23 +22,23 @@ jobs: container: image: rhub/debian-gcc-devel-nold steps: - - name: Install git and system packages - shell: bash - run: | - apt update && apt install libcurl4-openssl-dev libssl-dev libssh2-1-dev libgit2-dev libglpk-dev libxml2-dev libharfbuzz-dev libfribidi-dev git -y - - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - - name: Install dependencies - shell: bash -l {0} - run: | - /tmp/R-devel/bin/Rscript -e "source('./R-package/tests/helper_scripts/install_deps.R')" - - - name: Run R tests - shell: bash - run: | - cd R-package && \ - /tmp/R-devel/bin/R CMD INSTALL . 
&& \ - /tmp/R-devel/bin/R -q -e "library(testthat); setwd('tests'); source('testthat.R')" + - name: Install git and system packages + shell: bash + run: | + apt update && apt install libcurl4-openssl-dev libssl-dev libssh2-1-dev libgit2-dev libglpk-dev libxml2-dev libharfbuzz-dev libfribidi-dev git -y + + - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 + with: + submodules: 'true' + + - name: Install dependencies + shell: bash -l {0} + run: | + /tmp/R-devel/bin/Rscript -e "source('./R-package/tests/helper_scripts/install_deps.R')" + + - name: Run R tests + shell: bash + run: | + cd R-package && \ + /tmp/R-devel/bin/R CMD INSTALL . && \ + /tmp/R-devel/bin/R -q -e "library(testthat); setwd('tests'); source('testthat.R')" diff --git a/.github/workflows/r_tests.yml b/.github/workflows/r_tests.yml index 3885c126f11e..f5e5152fa29a 100644 --- a/.github/workflows/r_tests.yml +++ b/.github/workflows/r_tests.yml @@ -32,47 +32,47 @@ jobs: R_REMOTES_NO_ERRORS_FROM_WARNINGS: true steps: - - name: Install system dependencies - run: | - sudo apt update - sudo apt install libcurl4-openssl-dev libssl-dev libssh2-1-dev libgit2-dev libglpk-dev libxml2-dev libharfbuzz-dev libfribidi-dev - if: matrix.os == 'ubuntu-latest' - - uses: actions/checkout@v4.2.2 - with: - submodules: 'true' - - - uses: r-lib/actions/setup-r@v2.11.0 - with: - r-version: ${{ matrix.r }} - - - name: Cache R packages - uses: actions/cache@v4.1.2 - with: - path: ${{ env.R_LIBS_USER }} - key: ${{ runner.os }}-r-${{ matrix.r }}-7-${{ hashFiles('R-package/DESCRIPTION') }} - restore-keys: ${{ runner.os }}-r-${{ matrix.r }}-7-${{ hashFiles('R-package/DESCRIPTION') }} - - - uses: actions/setup-python@v5.3.0 - with: - python-version: "3.10" - architecture: 'x64' - - - uses: r-lib/actions/setup-tinytex@v2 - - - name: Install dependencies - shell: Rscript {0} - run: | - source("./R-package/tests/helper_scripts/install_deps.R") - - - name: Test R - run: | - python ops/script/test_r_package.py --compiler='${{ matrix.compiler }}' --build-tool="${{ matrix.build }}" --task=check - if: matrix.compiler != 'none' - - - name: Test R - run: | - python ops/script/test_r_package.py --build-tool="${{ matrix.build }}" --task=check - if: matrix.compiler == 'none' + - name: Install system dependencies + run: | + sudo apt update + sudo apt install libcurl4-openssl-dev libssl-dev libssh2-1-dev libgit2-dev libglpk-dev libxml2-dev libharfbuzz-dev libfribidi-dev + if: matrix.os == 'ubuntu-latest' + - uses: actions/checkout@v4.2.2 + with: + submodules: 'true' + + - uses: r-lib/actions/setup-r@v2.11.0 + with: + r-version: ${{ matrix.r }} + + - name: Cache R packages + uses: actions/cache@v4.1.2 + with: + path: ${{ env.R_LIBS_USER }} + key: ${{ runner.os }}-r-${{ matrix.r }}-7-${{ hashFiles('R-package/DESCRIPTION') }} + restore-keys: ${{ runner.os }}-r-${{ matrix.r }}-7-${{ hashFiles('R-package/DESCRIPTION') }} + + - uses: actions/setup-python@v5.3.0 + with: + python-version: "3.10" + architecture: 'x64' + + - uses: r-lib/actions/setup-tinytex@v2 + + - name: Install dependencies + shell: Rscript {0} + run: | + source("./R-package/tests/helper_scripts/install_deps.R") + + - name: Test R + run: | + python ops/script/test_r_package.py --compiler='${{ matrix.compiler }}' --build-tool="${{ matrix.build }}" --task=check + if: matrix.compiler != 'none' + + - name: Test R + run: | + python ops/script/test_r_package.py --build-tool="${{ matrix.build }}" --task=check + if: matrix.compiler == 'none' test-R-on-Debian: name: Test R package on Debian @@ 
-81,38 +81,38 @@ jobs: image: rhub/debian-gcc-release steps: - - name: Install system dependencies - run: | - # Must run before checkout to have the latest git installed. - # No need to add pandoc, the container has it figured out. - apt update && apt install libcurl4-openssl-dev libssl-dev libssh2-1-dev libgit2-dev libglpk-dev libxml2-dev libharfbuzz-dev libfribidi-dev git -y - - - name: Trust git cloning project sources - run: | - git config --global --add safe.directory "${GITHUB_WORKSPACE}" - - - uses: actions/checkout@v4.2.2 - with: - submodules: 'true' - - - name: Install dependencies - shell: bash -l {0} - run: | - Rscript -e "source('./R-package/tests/helper_scripts/install_deps.R')" - - - name: Test R - shell: bash -l {0} - run: | - python3 ops/script/test_r_package.py --r=/usr/bin/R --build-tool=autotools --task=check - - - uses: dorny/paths-filter@v3 - id: changes - with: - filters: | - r_package: - - 'R-package/**' - - - name: Run document check - if: steps.changes.outputs.r_package == 'true' - run: | - python3 ops/script/test_r_package.py --r=/usr/bin/R --task=doc + - name: Install system dependencies + run: | + # Must run before checkout to have the latest git installed. + # No need to add pandoc, the container has it figured out. + apt update && apt install libcurl4-openssl-dev libssl-dev libssh2-1-dev libgit2-dev libglpk-dev libxml2-dev libharfbuzz-dev libfribidi-dev git -y + + - name: Trust git cloning project sources + run: | + git config --global --add safe.directory "${GITHUB_WORKSPACE}" + + - uses: actions/checkout@v4.2.2 + with: + submodules: 'true' + + - name: Install dependencies + shell: bash -l {0} + run: | + Rscript -e "source('./R-package/tests/helper_scripts/install_deps.R')" + + - name: Test R + shell: bash -l {0} + run: | + python3 ops/script/test_r_package.py --r=/usr/bin/R --build-tool=autotools --task=check + + - uses: dorny/paths-filter@v3 + id: changes + with: + filters: | + r_package: + - 'R-package/**' + + - name: Run document check + if: steps.changes.outputs.r_package == 'true' + run: | + python3 ops/script/test_r_package.py --r=/usr/bin/R --task=doc diff --git a/.github/workflows/sycl_tests.yml b/.github/workflows/sycl_tests.yml index b317050fc652..467734607ea6 100644 --- a/.github/workflows/sycl_tests.yml +++ b/.github/workflows/sycl_tests.yml @@ -62,34 +62,34 @@ jobs: os: [ubuntu-latest] steps: - - uses: actions/checkout@v4.2.2 - with: - submodules: 'true' + - uses: actions/checkout@v4.2.2 + with: + submodules: 'true' - - uses: conda-incubator/setup-miniconda@v3.1.0 - with: - miniforge-variant: Miniforge3 - miniforge-version: latest - activate-environment: linux_sycl_test - environment-file: ops/conda_env/linux_sycl_test.yml - use-mamba: true + - uses: conda-incubator/setup-miniconda@v3.1.0 + with: + miniforge-variant: Miniforge3 + miniforge-version: latest + activate-environment: linux_sycl_test + environment-file: ops/conda_env/linux_sycl_test.yml + use-mamba: true - - name: Display Conda env - run: | - conda info - conda list - - name: Build XGBoost on Ubuntu - run: | - mkdir build - cd build - cmake .. -DPLUGIN_SYCL=ON -DCMAKE_CXX_COMPILER=g++ -DCMAKE_C_COMPILER=gcc \ - -DCMAKE_PREFIX_PATH=$CONDA_PREFIX -GNinja - ninja - - name: Install Python package - run: | - cd python-package - python --version - pip install -v . 
- - name: Test Python package - run: | - pytest -s -v -rxXs --durations=0 ./tests/python-sycl/ + - name: Display Conda env + run: | + conda info + conda list + - name: Build XGBoost on Ubuntu + run: | + mkdir build + cd build + cmake .. -DPLUGIN_SYCL=ON -DCMAKE_CXX_COMPILER=g++ -DCMAKE_C_COMPILER=gcc \ + -DCMAKE_PREFIX_PATH=$CONDA_PREFIX -GNinja + ninja + - name: Install Python package + run: | + cd python-package + python --version + pip install -v . + - name: Test Python package + run: | + pytest -s -v -rxXs --durations=0 ./tests/python-sycl/ diff --git a/.github/workflows/update_rapids.yml b/.github/workflows/update_rapids.yml index 636661db46b8..03a39f72b660 100644 --- a/.github/workflows/update_rapids.yml +++ b/.github/workflows/update_rapids.yml @@ -25,20 +25,20 @@ jobs: name: Check latest RAPIDS runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4.2.2 - with: - submodules: 'true' - - name: Check latest RAPIDS and update conftest.sh - run: | - bash tests/buildkite/update-rapids.sh - - name: Create Pull Request - uses: peter-evans/create-pull-request@v7 - if: github.ref == 'refs/heads/master' - with: - add-paths: | - tests/buildkite - branch: create-pull-request/update-rapids - base: master - title: "[CI] Update RAPIDS to latest stable" - commit-message: "[CI] Update RAPIDS to latest stable" + - uses: actions/checkout@v4.2.2 + with: + submodules: 'true' + - name: Check latest RAPIDS and update conftest.sh + run: | + bash tests/buildkite/update-rapids.sh + - name: Create Pull Request + uses: peter-evans/create-pull-request@v7 + if: github.ref == 'refs/heads/master' + with: + add-paths: | + tests/buildkite + branch: create-pull-request/update-rapids + base: master + title: "[CI] Update RAPIDS to latest stable" + commit-message: "[CI] Update RAPIDS to latest stable" From 0ee55c2786f334c97f679cc09944c5991e24bfcd Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Tue, 19 Nov 2024 09:00:27 -0800 Subject: [PATCH 38/86] Don't upload artifacts from pull requests --- ops/pipeline/build-cuda-with-rmm.sh | 10 +++++----- ops/pipeline/build-cuda.sh | 10 +++++----- ops/pipeline/build-jvm-gpu.sh | 10 +++++----- ops/pipeline/build-jvm-macos-apple-silicon.sh | 6 +++--- ops/pipeline/build-jvm-macos-intel.sh | 6 +++--- 5 files changed, 21 insertions(+), 21 deletions(-) diff --git a/ops/pipeline/build-cuda-with-rmm.sh b/ops/pipeline/build-cuda-with-rmm.sh index 1da0e5e61827..50bbf8b340f3 100755 --- a/ops/pipeline/build-cuda-with-rmm.sh +++ b/ops/pipeline/build-cuda-with-rmm.sh @@ -8,12 +8,12 @@ source ops/pipeline/enforce-ci.sh echo "--- Build with CUDA with RMM" -#if [[ ($is_pull_request == 1) || ($is_release_branch == 0) ]] -#then +if [[ ($is_pull_request == 1) || ($is_release_branch == 0) ]] +then arch_flag="-DGPU_COMPUTE_VER=75" -#else -# arch_flag="" -#fi +else + arch_flag="" +fi echo "--- Build libxgboost from the source" python3 ops/docker_run.py \ diff --git a/ops/pipeline/build-cuda.sh b/ops/pipeline/build-cuda.sh index 0487fb209dbe..4ed82618da23 100755 --- a/ops/pipeline/build-cuda.sh +++ b/ops/pipeline/build-cuda.sh @@ -8,12 +8,12 @@ source ops/pipeline/enforce-ci.sh echo "--- Build with CUDA" -# if [[ ($is_pull_request == 1) || ($is_release_branch == 0) ]] -#then +if [[ ($is_pull_request == 1) || ($is_release_branch == 0) ]] +then arch_flag="-DGPU_COMPUTE_VER=75" -#else -# arch_flag="" -#fi +else + arch_flag="" +fi echo "--- Build libxgboost from the source" set -x diff --git a/ops/pipeline/build-jvm-gpu.sh b/ops/pipeline/build-jvm-gpu.sh index ee12fbd78289..6bcd2a327553 100755 --- 
a/ops/pipeline/build-jvm-gpu.sh +++ b/ops/pipeline/build-jvm-gpu.sh @@ -7,12 +7,12 @@ source ops/pipeline/enforce-ci.sh echo "--- Build libxgboost4j.so with CUDA" -# if [[ ($is_pull_request == 1) || ($is_release_branch == 0) ]] -#then +if [[ ($is_pull_request == 1) || ($is_release_branch == 0) ]] +then arch_flag="-DGPU_COMPUTE_VER=75" -#else -# arch_flag="" -#fi +else + arch_flag="" +fi COMMAND=$( cat <<-EOF diff --git a/ops/pipeline/build-jvm-macos-apple-silicon.sh b/ops/pipeline/build-jvm-macos-apple-silicon.sh index 0c0aa6300729..99ca20d7e1e3 100755 --- a/ops/pipeline/build-jvm-macos-apple-silicon.sh +++ b/ops/pipeline/build-jvm-macos-apple-silicon.sh @@ -34,11 +34,11 @@ pushd lib libname=libxgboost4j_m1_${GITHUB_SHA}.dylib mv -v libxgboost4j.dylib ${libname} -# if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] -# then +if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] +then aws s3 cp ${libname} \ s3://xgboost-nightly-builds/${BRANCH_NAME}/libxgboost4j/ \ --acl public-read --no-progress -# fi +fi popd set +x diff --git a/ops/pipeline/build-jvm-macos-intel.sh b/ops/pipeline/build-jvm-macos-intel.sh index ee71a8b13078..ecf480d3c063 100755 --- a/ops/pipeline/build-jvm-macos-intel.sh +++ b/ops/pipeline/build-jvm-macos-intel.sh @@ -34,11 +34,11 @@ pushd lib libname=libxgboost4j_intel_${GITHUB_SHA}.dylib mv -v libxgboost4j.dylib ${libname} -# if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] -# then +if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] +then aws s3 cp ${libname} \ s3://xgboost-nightly-builds/${BRANCH_NAME}/libxgboost4j/ \ --acl public-read --no-progress -# fi +fi popd set +x From cb55d7a7c53b6f7dbadcf321026dd051bb4caba9 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Tue, 19 Nov 2024 09:39:10 -0800 Subject: [PATCH 39/86] Fix --- .github/workflows/jvm_tests.yml | 7 +++---- .github/workflows/python_wheels_macos.yml | 2 +- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/.github/workflows/jvm_tests.yml b/.github/workflows/jvm_tests.yml index ab21e2f19466..8eecc83c0c19 100644 --- a/.github/workflows/jvm_tests.yml +++ b/.github/workflows/jvm_tests.yml @@ -209,10 +209,9 @@ jobs: python -m awscli s3 cp xgboost4j_${{ github.sha }}.dll ` s3://xgboost-nightly-builds/${{ env.BRANCH_NAME }}/libxgboost4j/ ` --acl public-read --region us-west-2 - if: matrix.os == 'windows-latest' - # if: | - # (github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')) && - # matrix.os == 'windows-latest' + if: | + (github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')) && + matrix.os == 'windows-latest' env: AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID_IAM_S3_UPLOADER }} AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY_IAM_S3_UPLOADER }} diff --git a/.github/workflows/python_wheels_macos.yml b/.github/workflows/python_wheels_macos.yml index f0f5042660df..02f21593c220 100644 --- a/.github/workflows/python_wheels_macos.yml +++ b/.github/workflows/python_wheels_macos.yml @@ -46,7 +46,7 @@ jobs: - name: Build wheels run: bash ops/pipeline/build-python-wheels-macos.sh ${{ matrix.platform_id }} ${{ github.sha }} - name: Upload Python wheel - # if: github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_') + if: github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_') run: | python -m pip install awscli python -m awscli s3 cp wheelhouse/*.whl s3://xgboost-nightly-builds/${{ env.BRANCH_NAME }}/ --acl public-read --region us-west-2 From 
6641f7d1c555387c226943f160433a66f0422ea0 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Tue, 19 Nov 2024 09:45:31 -0800 Subject: [PATCH 40/86] Fix merge conflict --- .github/runs-on.yml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/.github/runs-on.yml b/.github/runs-on.yml index 1911c527481d..e21895ee8c3b 100644 --- a/.github/runs-on.yml +++ b/.github/runs-on.yml @@ -34,7 +34,3 @@ runners: cpu: 32 family: ["c7i-flex", "c7i", "c7a", "c5", "c5a"] image: windows-amd64 -<<<<<<< HEAD -======= - ->>>>>>> upstream/master From 0727cf5f80be4ebdfd951c5b3db9e1d99988cf7f Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Tue, 19 Nov 2024 10:25:27 -0800 Subject: [PATCH 41/86] Fix --- .github/workflows/windows.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml index 3dc9c4962646..afd9e65192ba 100644 --- a/.github/workflows/windows.yml +++ b/.github/workflows/windows.yml @@ -18,7 +18,7 @@ env: ${{ github.event.pull_request.number && 'PR-' }}${{ github.event.pull_request.number || github.ref_name }} ARTIFACT_STASH_PREFIX: cache/${{ github.repository }}/stash/${{ github.run_id }} # TODO(hcho3): Remove - RUNS_ON_S3_BUCKET_CACHE: runs-on-s3bucketcache-m3ikdpczirva + RUNS_ON_S3_BUCKET_CACHE: runs-on-s3bucketcache-dv5n3gmnaius jobs: build-win64-gpu: From 3c2e5c88c4c92fe58c0cb1d24a655e8d220f09e4 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Tue, 19 Nov 2024 11:09:18 -0800 Subject: [PATCH 42/86] Remove unneeded matrix; set max-parallel --- .github/workflows/jvm_tests.yml | 1 + .github/workflows/lint.yml | 33 +++++++++--------------------- .github/workflows/main.yml | 3 +++ .github/workflows/misc.yml | 19 +++-------------- .github/workflows/python_tests.yml | 26 ++++++----------------- .github/workflows/sycl_tests.yml | 15 +++----------- 6 files changed, 26 insertions(+), 71 deletions(-) diff --git a/.github/workflows/jvm_tests.yml b/.github/workflows/jvm_tests.yml index 8eecc83c0c19..659de52c30e0 100644 --- a/.github/workflows/jvm_tests.yml +++ b/.github/workflows/jvm_tests.yml @@ -21,6 +21,7 @@ jobs: - runs-on=${{ github.run_id }} - runner=${{ matrix.runner }} strategy: + max-parallel: 2 matrix: container_id: - xgb-ci.manylinux2014_x86_64 diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 59b3cecf57ed..70d892b1061d 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -15,26 +15,20 @@ env: jobs: build-containers: - name: Build CI containers (${{ matrix.container_id }}) + name: Build CI containers + env: + CONTAINER_ID: xgb-ci.clang_tidy runs-on: - runs-on=${{ github.run_id }} - - runner=${{ matrix.runner }} - strategy: - fail-fast: false - matrix: - include: - - container_id: xgb-ci.clang_tidy - runner: linux-amd64-cpu + - runner=linux-amd64-cpu steps: # Restart Docker daemon so that it recognizes the ephemeral disks - run: sudo systemctl restart docker - uses: actions/checkout@v4.2.2 with: submodules: "true" - - name: Build ${{ matrix.container_id }} + - name: Build ${{ env.CONTAINER_ID }} run: bash ops/docker_build.sh - env: - CONTAINER_ID: ${{ matrix.container_id }} clang-tidy: name: Run clang-tidy @@ -106,17 +100,10 @@ jobs: bash ops/script/lint_cmake.sh lintr: - runs-on: ${{ matrix.os }} - name: Run R linters on OS ${{ matrix.os }}, R ${{ matrix.r }} - strategy: - fail-fast: false - matrix: - include: - - os: ubuntu-latest - r: "release" + runs-on: ubuntu-latest + name: Run R linters on Ubuntu env: R_REMOTES_NO_ERRORS_FROM_WARNINGS: true - steps: - uses: 
actions/checkout@v4.2.2 with: @@ -124,14 +111,14 @@ jobs: - uses: r-lib/actions/setup-r@v2.11.0 with: - r-version: ${{ matrix.r }} + r-version: "release" - name: Cache R packages uses: actions/cache@v4.1.2 with: path: ${{ env.R_LIBS_USER }} - key: ${{ runner.os }}-r-${{ matrix.r }}-7-${{ hashFiles('R-package/DESCRIPTION') }} - restore-keys: ${{ runner.os }}-r-${{ matrix.r }}-7-${{ hashFiles('R-package/DESCRIPTION') }} + key: ${{ runner.os }}-r-release-7-${{ hashFiles('R-package/DESCRIPTION') }} + restore-keys: ${{ runner.os }}-r-release-7-${{ hashFiles('R-package/DESCRIPTION') }} - name: Install dependencies shell: Rscript {0} diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 77208a146443..15822c55f0d5 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -21,6 +21,7 @@ jobs: - runs-on=${{ github.run_id }} - runner=${{ matrix.runner }} strategy: + max-parallel: 2 matrix: container_id: - xgb-ci.gpu_build_rockylinux8 @@ -203,6 +204,7 @@ jobs: - runner=${{ matrix.runner }} strategy: fail-fast: false + max-parallel: 2 matrix: include: - suite: gpu @@ -241,6 +243,7 @@ jobs: - runner=${{ matrix.runner }} strategy: fail-fast: false + max-parallel: 2 matrix: include: - description: "single GPU" diff --git a/.github/workflows/misc.yml b/.github/workflows/misc.yml index b1b92c1528b7..1e6df46615d5 100644 --- a/.github/workflows/misc.yml +++ b/.github/workflows/misc.yml @@ -16,11 +16,7 @@ env: jobs: gtest-cpu: name: Test Google C++ test (CPU) - runs-on: ${{ matrix.os }} - strategy: - fail-fast: false - matrix: - os: [macos-13] + runs-on: macos-13 steps: - uses: actions/checkout@v4.2.2 with: @@ -42,11 +38,7 @@ jobs: gtest-cpu-nonomp: name: Test Google C++ unittest (CPU Non-OMP) - runs-on: ${{ matrix.os }} - strategy: - fail-fast: false - matrix: - os: [ubuntu-latest] + runs-on: ubuntu-latest steps: - uses: actions/checkout@v4.2.2 with: @@ -68,15 +60,10 @@ jobs: c-api-demo: name: Test installing XGBoost lib + building the C API demo - runs-on: ${{ matrix.os }} + runs-on: ubuntu-latest defaults: run: shell: bash -l {0} - strategy: - fail-fast: false - matrix: - os: ["ubuntu-latest"] - python-version: ["3.10"] steps: - uses: actions/checkout@v4.2.2 with: diff --git a/.github/workflows/python_tests.yml b/.github/workflows/python_tests.yml index 344e2f276b22..bcc0f5b8ba81 100644 --- a/.github/workflows/python_tests.yml +++ b/.github/workflows/python_tests.yml @@ -15,12 +15,8 @@ concurrency: jobs: python-sdist-test-on-Linux: - runs-on: ${{ matrix.os }} - name: Test installing XGBoost Python source package on ${{ matrix.os }} - strategy: - fail-fast: false - matrix: - os: [ubuntu-latest] + runs-on: ubuntu-latest + name: Test installing XGBoost Python source package steps: - uses: actions/checkout@v4.2.2 with: @@ -85,14 +81,9 @@ jobs: python -c 'import xgboost' python-tests-on-macos: - name: Test XGBoost Python package on ${{ matrix.os }} - runs-on: ${{ matrix.os }} + name: Test XGBoost Python package on macos-13 + runs-on: macos-13 timeout-minutes: 60 - strategy: - fail-fast: false - matrix: - os: [macos-13] - steps: - uses: actions/checkout@v4.2.2 with: @@ -138,13 +129,8 @@ jobs: pytest -s -v -rxXs --durations=0 ./tests/test_distributed/test_with_dask python-system-installation-on-ubuntu: - name: Test XGBoost Python package System Installation on ${{ matrix.os }} - runs-on: ${{ matrix.os }} - strategy: - fail-fast: false - matrix: - os: [ubuntu-latest] - + name: Test XGBoost Python package System Installation on Ubuntu + runs-on: ubuntu-latest steps: - uses: 
actions/checkout@v4.2.2
      with:

From e148e29b0662f81b1ee3acab75cacf85e77fd0f8 Mon Sep 17 00:00:00 2001
From: Hyunsu Cho
Date: Tue, 19 Nov 2024 16:36:32 -0800
Subject: [PATCH 43/86] Formatting fixes

---
 ops/docker/entrypoint.sh | 7 +++-
 ops/docker_build.py | 4 +--
 ops/docker_run.py | 13 -------
 ops/script/change_scala_version.py | 2 +-
 ops/script/format_wheel_meta.py | 6 ++--
 ops/script/lint_cpp.py | 34 ++++++++++++-------
 ops/script/lint_python.py | 15 +++-----
 ops/script/rename_whl.py | 6 ++--
 ops/stash_artifacts.py | 4 +--
 .../test_gpu_with_dask/test_gpu_with_dask.py | 2 --
 10 files changed, 42 insertions(+), 51 deletions(-)

diff --git a/ops/docker/entrypoint.sh b/ops/docker/entrypoint.sh
index babe4359e8e1..40135c197c73 100755
--- a/ops/docker/entrypoint.sh
+++ b/ops/docker/entrypoint.sh
@@ -1,6 +1,8 @@
 #!/usr/bin/env bash
 
-# This wrapper script
+# This wrapper script propagates the user information from the host
+# to the container. This way, any files generated by processes running
+# in the container will be accessible on the host.
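+#
+# A rough sketch of the host-side contract this assumes (illustrative only;
+# the canonical caller is ops/docker_run.py, which may pass other flags):
+#
+#   docker run \
+#     -e CI_BUILD_UID="$(id -u)"  -e CI_BUILD_GID="$(id -g)" \
+#     -e CI_BUILD_USER="$(id -un)" -e CI_BUILD_GROUP="$(id -gn)" \
+#     <image> <command>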
set -euo pipefail @@ -15,6 +17,9 @@ else rm /this_is_writable_file_system fi +## Assumption: the host passes correct user information via environment variables +## CI_BUILD_UID, CI_BUILD_GID, CI_BUILD_USER, CI_BUILD_GROUP + if [[ -n ${CI_BUILD_UID:-} ]] && [[ -n ${CI_BUILD_GID:-} ]] then groupadd -o -g "${CI_BUILD_GID}" "${CI_BUILD_GROUP}" || true diff --git a/ops/docker_build.py b/ops/docker_build.py index b096d9201d0f..1fed975ce223 100644 --- a/ops/docker_build.py +++ b/ops/docker_build.py @@ -12,7 +12,7 @@ from docker_run import OPS_DIR, fancy_print_cli_args -def parse_build_args(raw_build_args: list[str]) -> list[dict[str, str]]: +def parse_build_args(raw_build_args: list[str]) -> dict[str, str]: parsed_build_args = dict() for arg in raw_build_args: try: @@ -28,7 +28,7 @@ def parse_build_args(raw_build_args: list[str]) -> list[dict[str, str]]: def docker_build( container_id: str, *, - build_args: list[dict[str, str]], + build_args: dict[str, str], dockerfile_path: pathlib.Path, docker_context_path: pathlib.Path, cache_from: Optional[str], diff --git a/ops/docker_run.py b/ops/docker_run.py index 41ec9acb17c2..7e61c5a14f39 100644 --- a/ops/docker_run.py +++ b/ops/docker_run.py @@ -28,19 +28,6 @@ def parse_run_args(raw_run_args: str) -> list[str]: return [x for x in raw_run_args.split() if x] -def compute_container_id(container_name: str, build_args: list[dict[str, str]]) -> str: - container_id = f"xgb-ci.{container_name}" - # For some build arguments, append special suffixies - for arg_name, suffix in [ - ("CUDA_VERSION_ARG", "cuda"), - ("RAPIDS_VERSION_ARG", "rapids"), - ("JDK_VERSION_ARG", "jdk"), - ]: - if arg_name in build_args: - container_id += f"_{suffix}{build_args[arg_name]}" - return container_id - - def get_user_ids() -> dict[str, str]: uid = os.getuid() gid = os.getgid() diff --git a/ops/script/change_scala_version.py b/ops/script/change_scala_version.py index 3489479dd464..ed475a1f9582 100644 --- a/ops/script/change_scala_version.py +++ b/ops/script/change_scala_version.py @@ -4,7 +4,7 @@ import shutil -def main(args): +def main(args: argparse.Namespace) -> None: if args.scala_version == "2.12": scala_ver = "2.12" scala_patchver = "2.12.18" diff --git a/ops/script/format_wheel_meta.py b/ops/script/format_wheel_meta.py index 570f7854cf62..a7def879905e 100644 --- a/ops/script/format_wheel_meta.py +++ b/ops/script/format_wheel_meta.py @@ -3,12 +3,12 @@ XGBoost Python package. 
""" +import argparse import json import pathlib -from argparse import ArgumentParser -def main(args): +def main(args: argparse.Namespace) -> None: wheel_path = pathlib.Path(args.wheel_path).expanduser().resolve() if not wheel_path.exists(): raise ValueError(f"Wheel cannot be found at path {wheel_path}") @@ -37,7 +37,7 @@ def main(args): if __name__ == "__main__": - parser = ArgumentParser( + parser = argparse.ArgumentParser( description="Format meta.json encoding the latest nightly version of the Python wheel" ) parser.add_argument( diff --git a/ops/script/lint_cpp.py b/ops/script/lint_cpp.py index d4775d6b6b3e..2d00b219ceab 100644 --- a/ops/script/lint_cpp.py +++ b/ops/script/lint_cpp.py @@ -2,6 +2,7 @@ import os import re import sys +from typing import TextIO import cpplint from cpplint import _cpplint_state @@ -9,7 +10,7 @@ CXX_SUFFIX = set(["cc", "c", "cpp", "h", "cu", "hpp"]) -def filepath_enumerate(paths): +def filepath_enumerate(paths: list[str]) -> list[str]: """Enumerate the file paths of all subfiles of the list of paths""" out = [] for path in paths: @@ -22,7 +23,7 @@ def filepath_enumerate(paths): return out -def get_header_guard_dmlc(filename): +def get_header_guard_dmlc(filename: str) -> str: """Get Header Guard Convention for DMLC Projects. For headers in include, directly use the path @@ -54,11 +55,10 @@ def get_header_guard_dmlc(filename): class Lint: - def __init__(self): + def __init__(self) -> None: self.project_name = "xgboost" - self.cpp_header_map = {} - self.cpp_src_map = {} - self.python_map = {} + self.cpp_header_map: dict[str, dict[str, int]] = {} + self.cpp_src_map: dict[str, dict[str, int]] = {} self.pylint_cats = set(["error", "warning", "convention", "refactor"]) # setup cpp lint @@ -78,7 +78,7 @@ def __init__(self): cpplint._SetCountingStyle("toplevel") cpplint._line_length = 100 - def process_cpp(self, path, suffix): + def process_cpp(self, path: str, suffix: str) -> None: """Process a cpp file.""" _cpplint_state.ResetErrorCounts() cpplint.ProcessFile(str(path), _cpplint_state.verbose_level) @@ -91,7 +91,9 @@ def process_cpp(self, path, suffix): self.cpp_src_map[str(path)] = errors @staticmethod - def _print_summary_map(strm, result_map, ftype): + def _print_summary_map( + strm: TextIO, result_map: dict[str, dict[str, int]], ftype: str + ) -> int: """Print summary of certain result map.""" if len(result_map) == 0: return 0 @@ -105,7 +107,7 @@ def _print_summary_map(strm, result_map, ftype): ) return len(result_map) - npass - def print_summary(self, strm): + def print_summary(self, strm: TextIO) -> int: """Print summary of lint.""" nerr = 0 nerr += Lint._print_summary_map(strm, self.cpp_header_map, "cpp-header") @@ -122,7 +124,7 @@ def print_summary(self, strm): cpplint.GetHeaderGuardCPPVariable = get_header_guard_dmlc -def process(fname, allow_type): +def process(fname: str, allow_type: list[str]) -> None: """Process a file.""" fname = str(fname) arr = fname.rsplit(".", 1) @@ -132,13 +134,19 @@ def process(fname, allow_type): _HELPER.process_cpp(fname, arr[-1]) -def main(): +def main() -> None: parser = argparse.ArgumentParser(description="run cpp lint") parser.add_argument( "path", nargs="*", help="Path to traverse", - default=["src", "include", os.path.join("R-package", "src"), "python-package", "plugin/sycl"], + default=[ + "src", + "include", + os.path.join("R-package", "src"), + "python-package", + "plugin/sycl", + ], ) parser.add_argument( "--exclude_path", @@ -149,7 +157,7 @@ def main(): args = parser.parse_args() excluded_paths = 
filepath_enumerate(args.exclude_path) - allow_type = [] + allow_type: list[str] = [] allow_type += CXX_SUFFIX for path in args.path: diff --git a/ops/script/lint_python.py b/ops/script/lint_python.py index a589385b2588..67343cc430ac 100644 --- a/ops/script/lint_python.py +++ b/ops/script/lint_python.py @@ -68,11 +68,7 @@ class LintersPaths: "demo/guide-python/update_process.py", "demo/aft_survival/aft_survival_viz_demo.py", # CI - "ops/script/run_clang_tidy.py", - "ops/script/lint_python.py", - "ops/script/test_r_package.py", - "ops/script/test_utils.py", - "ops/script/change_version.py", + "ops/", ) ISORT = ( @@ -82,12 +78,13 @@ class LintersPaths: "tests/test_distributed/", "tests/python/", "tests/python-gpu/", - "ops/script/", # demo "demo/", # misc "dev/", "doc/", + # CI + "ops/", ) MYPY = ( @@ -129,11 +126,7 @@ class LintersPaths: "demo/guide-python/learning_to_rank.py", "demo/aft_survival/aft_survival_viz_demo.py", # CI - "ops/script/run_clang_tidy.py", - "ops/script/lint_python.py", - "ops/script/test_r_package.py", - "ops/script/test_utils.py", - "ops/script/change_version.py", + "ops/", ) diff --git a/ops/script/rename_whl.py b/ops/script/rename_whl.py index 500196190b3d..d4467720c738 100644 --- a/ops/script/rename_whl.py +++ b/ops/script/rename_whl.py @@ -1,8 +1,8 @@ +import argparse import pathlib -from argparse import ArgumentParser -def main(args): +def main(args: argparse.Namespace) -> None: wheel_path = pathlib.Path(args.wheel_path).expanduser().resolve() if not wheel_path.exists(): raise ValueError(f"Wheel cannot be found at path {wheel_path}") @@ -43,7 +43,7 @@ def main(args): if __name__ == "__main__": - parser = ArgumentParser( + parser = argparse.ArgumentParser( description="Format a Python wheel's name using the git commit hash and platform tag" ) parser.add_argument( diff --git a/ops/stash_artifacts.py b/ops/stash_artifacts.py index 405804b499c6..827e448ac49e 100644 --- a/ops/stash_artifacts.py +++ b/ops/stash_artifacts.py @@ -84,7 +84,7 @@ def aws_s3_download_with_wildcard(src: str, dest: Path) -> None: ) -def upload(args): +def upload(args: argparse.Namespace) -> None: print(f"Uploading artifacts with prefix {args.prefix}...") for artifact in args.artifacts: artifact_path = Path(artifact) @@ -92,7 +92,7 @@ def upload(args): aws_s3_upload(artifact_path, s3_url) -def download(args): +def download(args: argparse.Namespace) -> None: print(f"Downloading artifacts with prefix {args.prefix}...") for artifact in args.artifacts: artifact_path = Path(artifact) diff --git a/tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py b/tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py index 5746f33044e9..3bc7d46eb721 100644 --- a/tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py +++ b/tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py @@ -100,8 +100,6 @@ def is_df(part: T) -> T: # Work around https://github.com/dmlc/xgboost/issues/10752 X.columns = X.columns.astype("object") # Make sure the output can be integrated back to original dataframe - X.columns = X.columns.astype("object") - # Work around https://github.com/dmlc/xgboost/issues/10752 X["predict"] = predictions X["inplace_predict"] = series_predictions From e148e29b0662f81b1ee3acab75cacf85e77fd0f8 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Tue, 19 Nov 2024 16:53:02 -0800 Subject: [PATCH 44/86] Fix update_rapids.sh --- .github/workflows/update_rapids.yml | 2 +- ops/docker/ci_container.yml | 10 +++++++--- ops/script/update_rapids.sh | 9 ++++++--- 3 files 
changed, 14 insertions(+), 7 deletions(-) diff --git a/.github/workflows/update_rapids.yml b/.github/workflows/update_rapids.yml index 03a39f72b660..d6be99d00851 100644 --- a/.github/workflows/update_rapids.yml +++ b/.github/workflows/update_rapids.yml @@ -30,7 +30,7 @@ jobs: submodules: 'true' - name: Check latest RAPIDS and update conftest.sh run: | - bash tests/buildkite/update-rapids.sh + bash ops/script/update_rapids.sh - name: Create Pull Request uses: peter-evans/create-pull-request@v7 if: github.ref == 'refs/heads/master' diff --git a/ops/docker/ci_container.yml b/ops/docker/ci_container.yml index 90c9e6c8c800..f5eb7eb982df 100644 --- a/ops/docker/ci_container.yml +++ b/ops/docker/ci_container.yml @@ -3,12 +3,16 @@ # Each container will be built using the definition from # ops/docker/dockerfile/Dockerfile.CONTAINER_DEF +rapids_versions: + stable: &rapids_version "24.10" + dev: &dev_rapids_version "24.12" + xgb-ci.gpu_build_rockylinux8: container_def: gpu_build_rockylinux8 build_args: CUDA_VERSION_ARG: "12.4.1" NCCL_VERSION_ARG: "2.23.4-1" - RAPIDS_VERSION_ARG: "24.10" + RAPIDS_VERSION_ARG: *rapids_version xgb-ci.gpu_build_r_rockylinux8: container_def: gpu_build_r_rockylinux8 @@ -21,14 +25,14 @@ xgb-ci.gpu: build_args: CUDA_VERSION_ARG: "12.4.1" NCCL_VERSION_ARG: "2.23.4-1" - RAPIDS_VERSION_ARG: "24.10" + RAPIDS_VERSION_ARG: *rapids_version xgb-ci.gpu_dev_ver: container_def: gpu build_args: CUDA_VERSION_ARG: "12.4.1" NCCL_VERSION_ARG: "2.23.4-1" - RAPIDS_VERSION_ARG: "24.12" + RAPIDS_VERSION_ARG: *dev_rapids_version RAPIDSAI_CONDA_CHANNEL_ARG: "rapidsai-nightly" xgb-ci.clang_tidy: diff --git a/ops/script/update_rapids.sh b/ops/script/update_rapids.sh index f6a2675bdfa9..d7958ce70d86 100755 --- a/ops/script/update_rapids.sh +++ b/ops/script/update_rapids.sh @@ -7,7 +7,10 @@ echo "LATEST_RAPIDS_VERSION = $LATEST_RAPIDS_VERSION" DEV_RAPIDS_VERSION=$(date +%Y-%m-%d -d "20${LATEST_RAPIDS_VERSION//./-}-01 + 2 month" | cut -c3-7 | tr - .) echo "DEV_RAPIDS_VERSION = $DEV_RAPIDS_VERSION" -PARENT_PATH=$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P ) +OPS_PATH=$( cd "$(dirname "${BASH_SOURCE[0]}")/.." 
; pwd -P )
+CONTAINER_YAML="$OPS_PATH/docker/ci_container.yml"
 
-sed -i "s/^RAPIDS_VERSION=[[:digit:]]\+\.[[:digit:]]\+/RAPIDS_VERSION=${LATEST_RAPIDS_VERSION}/" $PARENT_PATH/conftest.sh
-sed -i "s/^DEV_RAPIDS_VERSION=[[:digit:]]\+\.[[:digit:]]\+/DEV_RAPIDS_VERSION=${DEV_RAPIDS_VERSION}/" $PARENT_PATH/conftest.sh
+sed -i "s/\&rapids_version \"[[:digit:]]\+\.[[:digit:]]\+\"/\&rapids_version \"${LATEST_RAPIDS_VERSION}\"/" \
+  "$CONTAINER_YAML"
+sed -i "s/\&dev_rapids_version \"[[:digit:]]\+\.[[:digit:]]\+\"/\&dev_rapids_version \"${DEV_RAPIDS_VERSION}\"/" \
+  "$CONTAINER_YAML"

From d8abb3c70fc747328333e5ef7bf4b64bc745f961 Mon Sep 17 00:00:00 2001
From: Hyunsu Cho
Date: Tue, 19 Nov 2024 18:14:38 -0800
Subject: [PATCH 45/86] Add JVM deploy

---
 .github/workflows/jvm_tests.yml | 43 ++++++++++++++++++++++
 ops/pipeline/build-win64-gpu.ps1 | 8 ++---
 ops/pipeline/deploy-jvm-packages-impl.sh | 45 ++++++++++++++++++++++++
 ops/pipeline/deploy-jvm-packages.sh | 21 +++++++++++
 4 files changed, 113 insertions(+), 4 deletions(-)
 create mode 100755 ops/pipeline/deploy-jvm-packages-impl.sh
 create mode 100755 ops/pipeline/deploy-jvm-packages.sh

diff --git a/.github/workflows/jvm_tests.yml b/.github/workflows/jvm_tests.yml
index 659de52c30e0..549094d52e37 100644
--- a/.github/workflows/jvm_tests.yml
+++ b/.github/workflows/jvm_tests.yml
@@ -162,6 +162,11 @@ jobs:
       run: bash ops/pipeline/build-test-jvm-packages.sh
       env:
         SCALA_VERSION: 2.13
+      - name: Stash files
+        run: bash ops/stash_artifacts.sh lib/libxgboost4j.so
+        env:
+          COMMAND: upload
+          KEY: build-test-jvm-packages
 
   build-test-jvm-packages-other-os:
     name: Build and test JVM packages (${{ matrix.os }})
@@ -239,3 +244,41 @@ jobs:
           COMMAND: download
           KEY: build-jvm-gpu
       - run: bash ops/pipeline/test-jvm-gpu.sh
+
+  deploy-jvm-packages:
+    name: Deploy JVM packages to S3 (${{ matrix.variant }})
+    needs: [build-jvm-gpu, build-test-jvm-packages, test-jvm-packages-gpu]
+    runs-on:
+      - runs-on=${{ github.run_id }}
+      - runner=linux-amd64-cpu
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - variant: cpu
+            container_id: xgb-ci.jvm
+            artifact_from: build-test-jvm-packages
+          - variant: gpu
+            container_id: xgb-ci.jvm_gpu_build
+            artifact_from: build-jvm-gpu
+    steps:
+      # Restart Docker daemon so that it recognizes the ephemeral disks
+      - run: sudo systemctl restart docker
+      - uses: actions/checkout@v4.2.2
+        with:
+          submodules: "true"
+      - name: Fetch container from cache
+        run: bash ops/docker_build.sh
+        env:
+          CONTAINER_ID: ${{ matrix.container_id }}
+      - name: Unstash files
+        run: |
+          bash ops/stash_artifacts.sh lib/libxgboost4j.so
+          ls -lh lib/libxgboost4j.so
+        env:
+          COMMAND: download
+          KEY: ${{ matrix.artifact_from }}
+      - name: Deploy JVM packages to S3
+        run: |
+          bash ops/pipeline/deploy-jvm-packages.sh ${{ matrix.variant }} \
+            ${{ matrix.container_id }}
diff --git a/ops/pipeline/build-win64-gpu.ps1 b/ops/pipeline/build-win64-gpu.ps1
index cc5380a7c7c2..76cc955059b8 100644
--- a/ops/pipeline/build-win64-gpu.ps1
+++ b/ops/pipeline/build-win64-gpu.ps1
@@ -5,11 +5,11 @@ $ErrorActionPreference = "Stop"
 Write-Host "--- Build libxgboost on Windows with CUDA"
 nvcc --version
 
-#if ( $is_release_branch -eq 0 ) {
+if ( $is_release_branch -eq 0 ) {
   $arch_flag = "-DGPU_COMPUTE_VER=75"
-#} else {
-#  $arch_flag = ""
-#}
+} else {
+  $arch_flag = ""
+}
 
 # Work around https://github.com/NVIDIA/cccl/issues/1956
 # TODO(hcho3): Remove this once new CUDA version ships with CCCL 2.6.0+
diff --git a/ops/pipeline/deploy-jvm-packages-impl.sh b/ops/pipeline/deploy-jvm-packages-impl.sh
new file mode 100755 index 000000000000..36fd23a583d6 --- /dev/null +++ b/ops/pipeline/deploy-jvm-packages-impl.sh @@ -0,0 +1,45 @@ +#!/bin/bash +## Deploy JVM packages to xgboost-maven-repo S3 bucket + +set -euox pipefail + +if [[ "$#" -lt 1 ]] +then + echo "Usage: $0 {cpu,gpu}" + exit 1 +fi + +variant="$1" + +maven_options="-DskipTests -Dmaven.test.skip=true -Dskip.native.build=true" +case "$variant" in + cpu) + # CPU variant + for scala_version in 2.12 2.13 + do + python ops/script/change_scala_version.py --scala-version ${scala_version} --purge-artifacts + pushd jvm-packages + mvn --no-transfer-progress deploy -Pdefault,release-to-s3 ${maven_options} + mvn clean + mvn clean -Pdefault,release-to-s3 + popd + done + ;; + gpu) + # GPU variant + for scala_version in 2.12 2.13 + do + python ops/script/change_scala_version.py --scala-version ${scala_version} --purge-artifacts + pushd jvm-packages + mvn --no-transfer-progress install -Pgpu ${maven_options} + mvn --no-transfer-progress deploy -Pgpu,release-to-s3 -pl xgboost4j-spark-gpu ${maven_options} + mvn clean + mvn clean -Pgpu,release-to-s3 + popd + done + ;; + *) + echo "Unrecognized argument: $variant" + exit 2 + ;; +esac diff --git a/ops/pipeline/deploy-jvm-packages.sh b/ops/pipeline/deploy-jvm-packages.sh new file mode 100755 index 000000000000..866b6dded393 --- /dev/null +++ b/ops/pipeline/deploy-jvm-packages.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +set -euox pipefail + +source ops/pipeline/enforce-ci.sh + +if [[ "$#" -lt 2 ]] +then + echo "Usage: $0 {cpu,gpu} {container_id}" + exit 1 +fi + +variant="$1" +container_id="$2" + +# if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] +# then + echo "--- Deploy JVM packages to xgboost-maven-repo S3 repo" + python3 ops/docker_run.py --container-id "${container_id}" \ + -- ops/pipeline/deploy-jvm-packages-impl.sh "${variant}" +# fi From 9c43544f54c60f2a076b17bd3e5d20153fa77e2e Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Tue, 19 Nov 2024 23:32:49 -0800 Subject: [PATCH 46/86] Move stash-artifacts --- .github/workflows/jvm_tests.yml | 10 +++++----- .github/workflows/main.yml | 12 ++++++------ .github/workflows/windows.yml | 4 ++-- .../stash-artifacts.ps1} | 6 +++--- .../stash-artifacts.py} | 2 +- .../stash-artifacts.sh} | 4 ++-- 6 files changed, 19 insertions(+), 19 deletions(-) rename ops/{stash_artifacts.ps1 => pipeline/stash-artifacts.ps1} (88%) rename ops/{stash_artifacts.py => pipeline/stash-artifacts.py} (98%) rename ops/{stash_artifacts.sh => pipeline/stash-artifacts.sh} (88%) diff --git a/.github/workflows/jvm_tests.yml b/.github/workflows/jvm_tests.yml index 549094d52e37..ebcfe4061518 100644 --- a/.github/workflows/jvm_tests.yml +++ b/.github/workflows/jvm_tests.yml @@ -88,7 +88,7 @@ jobs: CONTAINER_ID: xgb-ci.jvm_gpu_build - run: bash ops/pipeline/build-jvm-gpu.sh - name: Stash files - run: bash ops/stash_artifacts.sh lib/libxgboost4j.so + run: bash ops/pipeline/stash-artifacts.sh lib/libxgboost4j.so env: COMMAND: upload KEY: build-jvm-gpu @@ -132,7 +132,7 @@ jobs: env: CONTAINER_ID: xgb-ci.jvm_gpu_build - name: Unstash files - run: bash ops/stash_artifacts.sh lib/libxgboost4j.so + run: bash ops/pipeline/stash-artifacts.sh lib/libxgboost4j.so env: COMMAND: download KEY: build-jvm-gpu @@ -163,7 +163,7 @@ jobs: env: SCALA_VERSION: 2.13 - name: Stash files - run: bash ops/stash_artifacts.sh lib/libxgboost4j.so + run: bash ops/pipeline/stash-artifacts.sh lib/libxgboost4j.so env: COMMAND: upload KEY: build-test-jvm-packages @@ -239,7 +239,7 @@ jobs: env: CONTAINER_ID: 
xgb-ci.jvm_gpu_build - name: Unstash files - run: bash ops/stash_artifacts.sh lib/libxgboost4j.so + run: bash ops/pipeline/stash-artifacts.sh lib/libxgboost4j.so env: COMMAND: download KEY: build-jvm-gpu @@ -273,7 +273,7 @@ jobs: CONTAINER_ID: ${{ matrix.container_id }} - name: Unstash files run: | - bash ops/stash_artifacts.sh lib/libxgboost4j.so + bash ops/pipeline/stash-artifacts.sh lib/libxgboost4j.so ls -lh lib/libxgboost4j.so env: COMMAND: download diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 15822c55f0d5..51f192c441fc 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -66,7 +66,7 @@ jobs: CONTAINER_ID: xgb-ci.cpu - run: bash ops/pipeline/build-cpu.sh - name: Stash CLI executable - run: bash ops/stash_artifacts.sh ./xgboost + run: bash ops/pipeline/stash-artifacts.sh ./xgboost env: COMMAND: upload KEY: build-cpu @@ -89,7 +89,7 @@ jobs: CONTAINER_ID: xgb-ci.aarch64 - run: bash ops/pipeline/build-cpu-arm64.sh - name: Stash files - run: bash ops/stash_artifacts.sh ./xgboost python-package/dist/*.whl + run: bash ops/pipeline/stash-artifacts.sh ./xgboost python-package/dist/*.whl env: COMMAND: upload KEY: build-cpu-arm64 @@ -117,7 +117,7 @@ jobs: - run: bash ops/pipeline/build-cuda.sh - name: Stash files run: | - bash ops/stash_artifacts.sh \ + bash ops/pipeline/stash-artifacts.sh \ build/testxgboost ./xgboost python-package/dist/*.whl env: COMMAND: upload @@ -145,7 +145,7 @@ jobs: CONTAINER_ID: xgb-ci.manylinux_2_28_x86_64 - run: bash ops/pipeline/build-cuda-with-rmm.sh - name: Stash files - run: bash ops/stash_artifacts.sh build/testxgboost + run: bash ops/pipeline/stash-artifacts.sh build/testxgboost env: COMMAND: upload KEY: build-cuda-with-rmm @@ -228,7 +228,7 @@ jobs: CONTAINER_ID: xgb-ci.gpu - name: Unstash gtest run: | - bash ops/stash_artifacts.sh build/testxgboost + bash ops/pipeline/stash-artifacts.sh build/testxgboost chmod +x build/testxgboost env: COMMAND: download @@ -288,7 +288,7 @@ jobs: CONTAINER_ID: ${{ matrix.container }} - name: Unstash Python wheel run: | - bash ops/stash_artifacts.sh python-package/dist/*.whl ./xgboost + bash ops/pipeline/stash-artifacts.sh python-package/dist/*.whl ./xgboost chmod +x ./xgboost env: COMMAND: download diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml index afd9e65192ba..a2f721544d43 100644 --- a/.github/workflows/windows.yml +++ b/.github/workflows/windows.yml @@ -33,7 +33,7 @@ jobs: - run: powershell ops/pipeline/build-win64-gpu.ps1 - name: Stash files run: | - powershell ops/stash_artifacts.ps1 ` + powershell ops/pipeline/stash-artifacts.ps1 ` build/testxgboost.exe xgboost.exe ` (Get-ChildItem python-package/dist/*.whl | Select-Object -Expand FullName) env: @@ -52,7 +52,7 @@ jobs: submodules: "true" - name: Unstash files run: | - powershell ops/stash_artifacts.ps1 ` + powershell ops/pipeline/stash-artifacts.ps1 ` build/testxgboost.exe xgboost.exe python-package/dist/*.whl env: COMMAND: download diff --git a/ops/stash_artifacts.ps1 b/ops/pipeline/stash-artifacts.ps1 similarity index 88% rename from ops/stash_artifacts.ps1 rename to ops/pipeline/stash-artifacts.ps1 index 57a58d884226..202a6c4521ed 100644 --- a/ops/stash_artifacts.ps1 +++ b/ops/pipeline/stash-artifacts.ps1 @@ -7,7 +7,7 @@ Param( )][string[]]$artifacts ) -## Convenience wrapper for ops/stash_artifacts.py +## Convenience wrapper for ops/pipeline/stash-artifacts.py ## Meant to be used inside GitHub Actions $ENV_VAR_DOC = @' @@ -34,13 +34,13 @@ $artifact_stash_prefix = 
"cache/${Env:GITHUB_REPOSITORY}/stash/${Env:GITHUB_RUN_ conda activate Write-Host @" -python ops/stash_artifacts.py ` +python ops/pipeline/stash-artifacts.py ` --command "${Env:COMMAND}" ` --s3-bucket "${Env:RUNS_ON_S3_BUCKET_CACHE}" ` --prefix "${artifact_stash_prefix}/${Env:KEY}" ` -- $artifacts "@ -python ops/stash_artifacts.py ` +python ops/pipeline/stash-artifacts.py ` --command "${Env:COMMAND}" ` --s3-bucket "${Env:RUNS_ON_S3_BUCKET_CACHE}" ` --prefix "${artifact_stash_prefix}/${Env:KEY}" ` diff --git a/ops/stash_artifacts.py b/ops/pipeline/stash-artifacts.py similarity index 98% rename from ops/stash_artifacts.py rename to ops/pipeline/stash-artifacts.py index 827e448ac49e..3c77eb4d2f31 100644 --- a/ops/stash_artifacts.py +++ b/ops/pipeline/stash-artifacts.py @@ -108,7 +108,7 @@ def download(args: argparse.Namespace) -> None: if __name__ == "__main__": # Ensure that the current working directory is the project root if not (Path.cwd() / "ops").is_dir() or not path_equals( - Path(__file__).parent, Path.cwd() / "ops" + Path(__file__).parent.parent, Path.cwd() / "ops" ): x = Path(__file__).name raise RuntimeError(f"Script {x} must be run at the project's root directory") diff --git a/ops/stash_artifacts.sh b/ops/pipeline/stash-artifacts.sh similarity index 88% rename from ops/stash_artifacts.sh rename to ops/pipeline/stash-artifacts.sh index c2a16f42a26c..3cd0378fc916 100755 --- a/ops/stash_artifacts.sh +++ b/ops/pipeline/stash-artifacts.sh @@ -1,6 +1,6 @@ #!/bin/bash -## Convenience wrapper for ops/stash_artifacts.py +## Convenience wrapper for ops/pipeline/stash-artifacts.py ## Meant to be used inside GitHub Actions ENV_VAR_DOC=$( @@ -32,7 +32,7 @@ done artifact_stash_prefix="cache/${GITHUB_REPOSITORY}/stash/${GITHUB_RUN_ID}" set -x -python3 ops/stash_artifacts.py \ +python3 ops/pipeline/stash-artifacts.py \ --command "${COMMAND}" \ --s3-bucket "${RUNS_ON_S3_BUCKET_CACHE}" \ --prefix "${artifact_stash_prefix}/${KEY}" \ From 2c2d47d8b724177f49ade5481565fb48a2ae2c72 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Wed, 20 Nov 2024 01:02:21 -0800 Subject: [PATCH 47/86] Allow most pipeline scripts to be run locally --- .github/workflows/jvm_tests.yml | 19 ++++++++++++- .github/workflows/main.yml | 27 +++++++++++++++++++ ops/pipeline/build-cpu-arm64.sh | 15 +++++------ ops/pipeline/build-cpu.sh | 2 -- ops/pipeline/build-cuda-with-rmm.sh | 21 +++++++-------- ops/pipeline/build-cuda.sh | 27 +++++++++++-------- ops/pipeline/build-gpu-rpkg.sh | 13 ++++----- ops/pipeline/build-jvm-doc.sh | 23 +++++++++------- ops/pipeline/build-jvm-gpu.sh | 2 +- ops/pipeline/build-jvm-macos-apple-silicon.sh | 17 ------------ ops/pipeline/build-jvm-macos-intel.sh | 18 ------------- ops/pipeline/build-jvm-manylinux2014.sh | 17 ++---------- ops/pipeline/build-manylinux2014.sh | 21 ++++++--------- ops/pipeline/build-test-jvm-packages.sh | 2 -- ops/pipeline/classify-git-branch.sh | 26 ++++++++++++++++++ ops/pipeline/enforce-ci.sh | 21 +-------------- ops/pipeline/publish-artifact.sh | 23 ++++++++++++++++ ops/pipeline/run-clang-tidy.sh | 2 -- ops/pipeline/test-cpp-gpu.sh | 2 -- ops/pipeline/test-jvm-gpu.sh | 6 +---- ops/pipeline/test-python.sh | 2 -- ops/pipeline/test-win64-gpu.ps1 | 2 -- 22 files changed, 158 insertions(+), 150 deletions(-) create mode 100755 ops/pipeline/classify-git-branch.sh create mode 100755 ops/pipeline/publish-artifact.sh diff --git a/.github/workflows/jvm_tests.yml b/.github/workflows/jvm_tests.yml index ebcfe4061518..a0ecaa189e36 100644 --- a/.github/workflows/jvm_tests.yml +++ 
b/.github/workflows/jvm_tests.yml @@ -69,6 +69,12 @@ jobs: env: CONTAINER_ID: xgb-ci.manylinux2014_${{ matrix.arch }} - run: bash ops/pipeline/build-jvm-manylinux2014.sh ${{ matrix.arch }} + - name: Upload libxgboost4j.so + run: | + libname=lib/libxgboost4j_linux_${{ matrix.arch }}_${{ github.sha }}.so + mv -v lib/libxgboost4j.so ${libname} + bash ops/pipeline/publish-artifact.sh ${libname} \ + s3://xgboost-nightly-builds/${{ env.BRANCH_NAME }}/libxgboost4j/ build-jvm-gpu: name: Build libxgboost4j.so with CUDA @@ -102,15 +108,22 @@ jobs: include: - description: "MacOS (Apple Silicon)" script: ops/pipeline/build-jvm-macos-apple-silicon.sh + libname: libxgboost4j_m1_${{ github.sha }}.dylib runner: macos-14 - description: "MacOS (Intel)" script: ops/pipeline/build-jvm-macos-intel.sh + libname: libxgboost4j_intel_${{ github.sha }}.dylib runner: macos-13 steps: - uses: actions/checkout@v4.2.2 with: submodules: "true" - run: bash ${{ matrix.script }} + - name: Upload libxgboost4j.dylib + run: | + mv -v lib/libxgboost4j.dylib ${{ matrix.libname }} + bash ops/pipeline/publish-artifact.sh ${{ matrix.libname }} \ + s3://xgboost-nightly-builds/${{ env.BRANCH_NAME }}/libxgboost4j/ env: AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID_IAM_S3_UPLOADER }} AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY_IAM_S3_UPLOADER }} @@ -137,6 +150,10 @@ jobs: COMMAND: download KEY: build-jvm-gpu - run: bash ops/pipeline/build-jvm-doc.sh + - name: Upload JVM doc + run: | + bash ops/pipeline/publish-artifact.sh jvm-packages/${{ env.BRANCH_NAME }}.tar.bz2 \ + s3://xgboost-docs/ build-test-jvm-packages: name: Build and test JVM packages (Linux) @@ -279,6 +296,6 @@ jobs: COMMAND: download KEY: ${{ matrix.artifact_from }} - name: Deploy JVM packages to S3 - run: >- + run: | bash ops/pipeline/deploy-jvm-packages.sh ${{ matrix.variant }} \ ${{ matrix.container_id }} diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 51f192c441fc..3073c73ae642 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -93,6 +93,10 @@ jobs: env: COMMAND: upload KEY: build-cpu-arm64 + - name: Upload Python wheel + run: | + bash ops/pipeline/publish-artifact.sh python-package/dist/*.whl \ + s3://xgboost-nightly-builds/${{ env.BRANCH_NAME }}/ build-cuda: name: Build CUDA + manylinux_2_28_x86_64 wheel @@ -122,6 +126,13 @@ jobs: env: COMMAND: upload KEY: build-cuda + - name: Upload Python wheel + run: | + for file in python-package/dist/*.whl python-package/dist/meta.json + do + bash ops/pipeline/publish-artifact.sh "${file}" \ + s3://xgboost-nightly-builds/${{ env.BRANCH_NAME }}/ + done build-cuda-with-rmm: name: Build CUDA with RMM @@ -149,6 +160,10 @@ jobs: env: COMMAND: upload KEY: build-cuda-with-rmm + - name: Upload Python wheel + run: | + bash ops/pipeline/publish-artifact.sh python-package/dist/*.whl \ + s3://xgboost-nightly-builds/experimental_build_with_rmm/ build-manylinux2014: name: Build manylinux2014_${{ matrix.arch }} wheel @@ -175,6 +190,13 @@ jobs: env: CONTAINER_ID: xgb-ci.manylinux2014_${{ matrix.arch }} - run: bash ops/pipeline/build-manylinux2014.sh ${{ matrix.arch }} + - name: Upload Python wheel + run: | + for wheel in python-package/dist/*.whl + do + bash ops/pipeline/publish-artifact.sh "${wheel}" \ + s3://xgboost-nightly-builds/${{ env.BRANCH_NAME }}/ + done build-gpu-rpkg: name: Build GPU-enabled R package @@ -193,6 +215,11 @@ jobs: env: CONTAINER_ID: xgb-ci.gpu_build_r_rockylinux8 - run: bash ops/pipeline/build-gpu-rpkg.sh + - name: Upload R tarball + run: | + bash 
ops/pipeline/publish-artifact.sh xgboost_r_gpu_linux_*.tar.gz \ + s3://xgboost-nightly-builds/${{ env.BRANCH_NAME }}/ + test-cpp-gpu: name: >- diff --git a/ops/pipeline/build-cpu-arm64.sh b/ops/pipeline/build-cpu-arm64.sh index 4be57557ea36..ff948ca0c77a 100755 --- a/ops/pipeline/build-cpu-arm64.sh +++ b/ops/pipeline/build-cpu-arm64.sh @@ -2,12 +2,16 @@ set -euox pipefail +if [[ -z "${GITHUB_SHA:-}" ]] +then + echo "Make sure to set environment variable GITHUB_SHA" + exit 1 +fi + WHEEL_TAG=manylinux_2_28_aarch64 echo "--- Build CPU code targeting ARM64" -source ops/pipeline/enforce-ci.sh - echo "--- Build libxgboost from the source" python3 ops/docker_run.py \ --container-id xgb-ci.aarch64 \ @@ -46,10 +50,3 @@ python3 ops/docker_run.py \ --container-id xgb-ci.aarch64 \ -- bash -c \ "unzip -l python-package/dist/*.whl | grep libgomp || exit -1" - -echo "--- Upload Python wheel" -if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] -then - aws s3 cp python-package/dist/*.whl s3://xgboost-nightly-builds/${BRANCH_NAME}/ \ - --acl public-read --no-progress -fi diff --git a/ops/pipeline/build-cpu.sh b/ops/pipeline/build-cpu.sh index 22384d056f15..dc0572f0ca4d 100755 --- a/ops/pipeline/build-cpu.sh +++ b/ops/pipeline/build-cpu.sh @@ -2,8 +2,6 @@ set -euox pipefail -source ops/pipeline/enforce-ci.sh - echo "--- Build CPU code" # This step is not necessary, but here we include it, to ensure that diff --git a/ops/pipeline/build-cuda-with-rmm.sh b/ops/pipeline/build-cuda-with-rmm.sh index 50bbf8b340f3..797051e958ae 100755 --- a/ops/pipeline/build-cuda-with-rmm.sh +++ b/ops/pipeline/build-cuda-with-rmm.sh @@ -2,9 +2,15 @@ set -euox pipefail -WHEEL_TAG=manylinux_2_28_x86_64 +if [[ -z "${GITHUB_SHA:-}" ]] +then + echo "Make sure to set environment variable GITHUB_SHA" + exit 1 +fi + +source ops/pipeline/classify-git-branch.sh -source ops/pipeline/enforce-ci.sh +WHEEL_TAG=manylinux_2_28_x86_64 echo "--- Build with CUDA with RMM" @@ -43,7 +49,7 @@ python3 ops/script/rename_whl.py \ echo "--- Audit binary wheel to ensure it's compliant with ${WHEEL_TAG} standard" python3 ops/docker_run.py \ - --container-id xgb-ci.$WHEEL_TAG \ + --container-id xgb-ci.${WHEEL_TAG} \ -- auditwheel repair \ --plat ${WHEEL_TAG} python-package/dist/*.whl python3 ops/script/rename_whl.py \ @@ -53,13 +59,6 @@ python3 ops/script/rename_whl.py \ mv -v wheelhouse/*.whl python-package/dist/ # Make sure that libgomp.so is vendored in the wheel python3 ops/docker_run.py \ - --container-id xgb-ci.$WHEEL_TAG \ + --container-id xgb-ci.${WHEEL_TAG} \ -- bash -c \ "unzip -l python-package/dist/*.whl | grep libgomp || exit -1" - -echo "--- Upload Python wheel" -if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] -then - aws s3 cp python-package/dist/*.whl s3://xgboost-nightly-builds/experimental_build_with_rmm/ \ - --acl public-read --no-progress -fi diff --git a/ops/pipeline/build-cuda.sh b/ops/pipeline/build-cuda.sh index 4ed82618da23..09d8cad46c30 100755 --- a/ops/pipeline/build-cuda.sh +++ b/ops/pipeline/build-cuda.sh @@ -2,9 +2,15 @@ set -euox pipefail +if [[ -z "${GITHUB_SHA:-}" ]] +then + echo "Make sure to set environment variable GITHUB_SHA" + exit 1 +fi + WHEEL_TAG=manylinux_2_28_x86_64 -source ops/pipeline/enforce-ci.sh +source ops/pipeline/classify-git-branch.sh echo "--- Build with CUDA" @@ -59,21 +65,20 @@ python3 ops/docker_run.py \ --container-id xgb-ci.manylinux_2_28_x86_64 \ -- bash -c "unzip -l python-package/dist/*.whl | grep libgomp || exit -1" +# Generate the meta info which includes xgboost version 
and the commit info +python3 ops/docker_run.py \ +--container-id xgb-ci.gpu_build_rockylinux8 \ +-- python ops/script/format_wheel_meta.py \ + --wheel-path python-package/dist/*.whl \ + --commit-hash ${GITHUB_SHA} \ + --platform-tag ${WHEEL_TAG} \ + --meta-path python-package/dist/ + echo "--- Upload Python wheel" if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] then aws s3 cp python-package/dist/*.whl s3://xgboost-nightly-builds/${BRANCH_NAME}/ \ --acl public-read --no-progress - - # Generate the meta info which includes xgboost version and the commit info - python3 ops/docker_run.py \ - --container-id xgb-ci.gpu_build_rockylinux8 \ - -- python ops/script/format_wheel_meta.py \ - --wheel-path python-package/dist/*.whl \ - --commit-hash ${GITHUB_SHA} \ - --platform-tag ${WHEEL_TAG} \ - --meta-path python-package/dist/ aws s3 cp python-package/dist/meta.json s3://xgboost-nightly-builds/${BRANCH_NAME}/ \ --acl public-read --no-progress fi -echo "-- Stash C++ test executable (testxgboost)" diff --git a/ops/pipeline/build-gpu-rpkg.sh b/ops/pipeline/build-gpu-rpkg.sh index e85826f36a26..d1384ef766a6 100755 --- a/ops/pipeline/build-gpu-rpkg.sh +++ b/ops/pipeline/build-gpu-rpkg.sh @@ -2,17 +2,14 @@ set -euox pipefail -source ops/pipeline/enforce-ci.sh +if [[ -z "${GITHUB_SHA:-}" ]] +then + echo "Make sure to set environment variable GITHUB_SHA" + exit 1 +fi echo "--- Build XGBoost R package with CUDA" python3 ops/docker_run.py \ --container-id xgb-ci.gpu_build_r_rockylinux8 \ -- ops/pipeline/build-gpu-rpkg-impl.sh \ ${GITHUB_SHA} - -if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] -then - echo "--- Upload R tarball" - aws s3 cp xgboost_r_gpu_linux_*.tar.gz s3://xgboost-nightly-builds/${BRANCH_NAME}/ \ - --acl public-read --no-progress -fi diff --git a/ops/pipeline/build-jvm-doc.sh b/ops/pipeline/build-jvm-doc.sh index 0c1afe46e212..00fdac7a1353 100755 --- a/ops/pipeline/build-jvm-doc.sh +++ b/ops/pipeline/build-jvm-doc.sh @@ -1,19 +1,24 @@ #!/bin/bash ## Build docs for the JVM packages and package it in a tarball -## Note: Note: this script assumes that the user has already built libxgboost4j.so +## Note: this script assumes that the user has already built libxgboost4j.so ## and place it in the lib/ directory. set -euox pipefail -source ops/pipeline/enforce-ci.sh - echo "--- Build JVM packages doc" + +if [[ -z ${BRANCH_NAME:-} ]] +then + echo "Make sure to define environment variable BRANCH_NAME." + exit 1 +fi + +if [[ ! 
-f lib/libxgboost4j.so ]] +then + echo "Must place libxgboost4j.so in lib/ first" + exit 2 +fi + python3 ops/docker_run.py \ --container-id xgb-ci.jvm_gpu_build \ -- ops/pipeline/build-jvm-doc-impl.sh ${BRANCH_NAME} -if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] -then - echo "--- Upload JVM packages doc" - aws s3 cp jvm-packages/${BRANCH_NAME}.tar.bz2 \ - s3://xgboost-docs/${BRANCH_NAME}.tar.bz2 --acl public-read --no-progress -fi diff --git a/ops/pipeline/build-jvm-gpu.sh b/ops/pipeline/build-jvm-gpu.sh index 6bcd2a327553..7656a3d2f188 100755 --- a/ops/pipeline/build-jvm-gpu.sh +++ b/ops/pipeline/build-jvm-gpu.sh @@ -3,7 +3,7 @@ set -euo pipefail -source ops/pipeline/enforce-ci.sh +source ops/pipeline/classify-git-branch.sh echo "--- Build libxgboost4j.so with CUDA" diff --git a/ops/pipeline/build-jvm-macos-apple-silicon.sh b/ops/pipeline/build-jvm-macos-apple-silicon.sh index 99ca20d7e1e3..cfba35d0f96a 100755 --- a/ops/pipeline/build-jvm-macos-apple-silicon.sh +++ b/ops/pipeline/build-jvm-macos-apple-silicon.sh @@ -3,8 +3,6 @@ set -euox pipefail -source ops/pipeline/enforce-ci.sh - # Display system info echo "--- Display system information" set -x @@ -27,18 +25,3 @@ popd rm -rf build otool -L lib/libxgboost.dylib set +x - -echo "--- Upload libxgboost4j.dylib" -set -x -pushd lib -libname=libxgboost4j_m1_${GITHUB_SHA}.dylib -mv -v libxgboost4j.dylib ${libname} - -if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] -then - aws s3 cp ${libname} \ - s3://xgboost-nightly-builds/${BRANCH_NAME}/libxgboost4j/ \ - --acl public-read --no-progress -fi -popd -set +x diff --git a/ops/pipeline/build-jvm-macos-intel.sh b/ops/pipeline/build-jvm-macos-intel.sh index ecf480d3c063..5e73b03b7f6e 100755 --- a/ops/pipeline/build-jvm-macos-intel.sh +++ b/ops/pipeline/build-jvm-macos-intel.sh @@ -3,8 +3,6 @@ set -euox pipefail -source ops/pipeline/enforce-ci.sh - # Display system info echo "--- Display system information" set -x @@ -26,19 +24,3 @@ ninja -v popd rm -rf build otool -L lib/libxgboost.dylib -set +x - -echo "--- Upload libxgboost4j.dylib" -set -x -pushd lib -libname=libxgboost4j_intel_${GITHUB_SHA}.dylib -mv -v libxgboost4j.dylib ${libname} - -if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] -then - aws s3 cp ${libname} \ - s3://xgboost-nightly-builds/${BRANCH_NAME}/libxgboost4j/ \ - --acl public-read --no-progress -fi -popd -set +x diff --git a/ops/pipeline/build-jvm-manylinux2014.sh b/ops/pipeline/build-jvm-manylinux2014.sh index 93fa03d2eb0b..e69dd3682b90 100755 --- a/ops/pipeline/build-jvm-manylinux2014.sh +++ b/ops/pipeline/build-jvm-manylinux2014.sh @@ -3,9 +3,8 @@ set -euox pipefail -source ops/pipeline/enforce-ci.sh - -if [ $# -ne 1 ]; then +if [[ $# -ne 1 ]] +then echo "Usage: $0 {x86_64,aarch64}" exit 1 fi @@ -24,15 +23,3 @@ python3 ops/docker_run.py \ "cd build && cmake .. 
-DJVM_BINDINGS=ON -DUSE_OPENMP=ON && make -j$(nproc)" ldd lib/libxgboost4j.so objdump -T lib/libxgboost4j.so | grep GLIBC_ | sed 's/.*GLIBC_\([.0-9]*\).*/\1/g' | sort -Vu - -echo "--- Upload libxgboost4j.so" -pushd lib -libname=libxgboost4j_linux_${arch}_${GITHUB_SHA}.so -mv -v libxgboost4j.so ${libname} -if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] -then - aws s3 cp ${libname} \ - s3://xgboost-nightly-builds/${BRANCH_NAME}/libxgboost4j/ \ - --acl public-read --no-progress -fi -popd diff --git a/ops/pipeline/build-manylinux2014.sh b/ops/pipeline/build-manylinux2014.sh index 7802fa555187..cada47e06b72 100755 --- a/ops/pipeline/build-manylinux2014.sh +++ b/ops/pipeline/build-manylinux2014.sh @@ -2,9 +2,14 @@ set -euox pipefail -source ops/pipeline/enforce-ci.sh +if [[ -z "${GITHUB_SHA:-}" ]] +then + echo "Make sure to set environment variable GITHUB_SHA" + exit 1 +fi -if [ $# -ne 1 ]; then +if [[ $# -ne 1 ]] +then echo "Usage: $0 {x86_64,aarch64}" exit 1 fi @@ -12,7 +17,7 @@ fi arch=$1 WHEEL_TAG="manylinux2014_${arch}" -image="xgb-ci.$WHEEL_TAG" +image="xgb-ci.${WHEEL_TAG}" python_bin="/opt/python/cp310-cp310/bin/python" @@ -57,13 +62,3 @@ python3 ops/script/rename_whl.py \ --platform-tag ${WHEEL_TAG} rm -v python-package/dist/xgboost_cpu-*.whl mv -v wheelhouse/xgboost_cpu-*.whl python-package/dist/ - -echo "--- Upload Python wheel" -if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] -then - for wheel in python-package/dist/*.whl - do - aws s3 cp "${wheel}" s3://xgboost-nightly-builds/${BRANCH_NAME}/ \ - --acl public-read --no-progress - done -fi diff --git a/ops/pipeline/build-test-jvm-packages.sh b/ops/pipeline/build-test-jvm-packages.sh index 1feddf2bff98..d04cc3510de5 100755 --- a/ops/pipeline/build-test-jvm-packages.sh +++ b/ops/pipeline/build-test-jvm-packages.sh @@ -12,8 +12,6 @@ EOF set -euo pipefail -source ops/pipeline/enforce-ci.sh - for arg in "SCALA_VERSION" do if [[ -z "${!arg:-}" ]] diff --git a/ops/pipeline/classify-git-branch.sh b/ops/pipeline/classify-git-branch.sh new file mode 100755 index 000000000000..0c175c92792d --- /dev/null +++ b/ops/pipeline/classify-git-branch.sh @@ -0,0 +1,26 @@ +#!/bin/bash + +## Detect whether the current git branch is a pull request or a release branch. 
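+##
+## Minimal usage sketch (illustrative; the flag values depend on the CI run):
+##
+##   source ops/pipeline/classify-git-branch.sh
+##   if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]]
+##   then
+##     echo "On a release branch; artifacts may be published"
+##   fi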
+ +set -euo pipefail + +if [[ -n ${GITHUB_BASE_REF:-} ]] +then + is_pull_request=1 +else + is_pull_request=0 +fi + +if [[ ${BRANCH_NAME:-} == "master" || ${BRANCH_NAME:-} == "release_"* || ${BRANCH_NAME:-} == "federated-secure" ]] +then + is_release_branch=1 + enforce_daily_budget=0 +else + is_release_branch=0 + enforce_daily_budget=1 +fi + +if [[ -n ${DISABLE_RELEASE:-} ]] +then + is_release_branch=0 +fi diff --git a/ops/pipeline/enforce-ci.sh b/ops/pipeline/enforce-ci.sh index eefb6450b98d..1e853a5ea266 100755 --- a/ops/pipeline/enforce-ci.sh +++ b/ops/pipeline/enforce-ci.sh @@ -18,23 +18,4 @@ then exit 2 fi -if [[ -n ${GITHUB_BASE_REF:-} ]] -then - is_pull_request=1 -else - is_pull_request=0 -fi - -if [[ $BRANCH_NAME == "master" || $BRANCH_NAME == "release_"* || $BRANCH_NAME == "federated-secure" ]] -then - is_release_branch=1 - enforce_daily_budget=0 -else - is_release_branch=0 - enforce_daily_budget=1 -fi - -if [[ -n ${DISABLE_RELEASE:-} ]] -then - is_release_branch=0 -fi +source ops/pipeline/classify-git-branch.sh diff --git a/ops/pipeline/publish-artifact.sh b/ops/pipeline/publish-artifact.sh new file mode 100755 index 000000000000..efdf837e55bf --- /dev/null +++ b/ops/pipeline/publish-artifact.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +## Publish artifacts in an S3 bucket +## Meant to be used inside GitHub Actions + +set -euo pipefail + +source ops/pipeline/enforce-ci.sh + +if [[ $# -ne 2 ]] +then + echo "Usage: $0 [artifact] [s3_url]" + exit 1 +fi + +artifact="$1" +s3_url="$2" + +#if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] +#then + echo "aws s3 cp ${artifact} ${s3_url} --acl public-read --no-progress" + aws s3 cp "${artifact}" "${s3_url}" --acl public-read --no-progress +#fi diff --git a/ops/pipeline/run-clang-tidy.sh b/ops/pipeline/run-clang-tidy.sh index a9ff039ee4ca..676f302009ce 100755 --- a/ops/pipeline/run-clang-tidy.sh +++ b/ops/pipeline/run-clang-tidy.sh @@ -4,8 +4,6 @@ set -euox pipefail echo "--- Run clang-tidy" -source ops/pipeline/enforce-ci.sh - python3 ops/docker_run.py \ --container-id xgb-ci.clang_tidy \ -- python3 ops/script/run_clang_tidy.py --cuda-archs 75 diff --git a/ops/pipeline/test-cpp-gpu.sh b/ops/pipeline/test-cpp-gpu.sh index b66162d66a50..9a0cd4743c18 100755 --- a/ops/pipeline/test-cpp-gpu.sh +++ b/ops/pipeline/test-cpp-gpu.sh @@ -2,8 +2,6 @@ set -euox pipefail -source ops/pipeline/enforce-ci.sh - if [[ "$#" -lt 1 ]] then echo "Usage: $0 {gpu,gpu-rmm,mgpu}" diff --git a/ops/pipeline/test-jvm-gpu.sh b/ops/pipeline/test-jvm-gpu.sh index 272b55ad0d1a..c490e58ea01d 100755 --- a/ops/pipeline/test-jvm-gpu.sh +++ b/ops/pipeline/test-jvm-gpu.sh @@ -3,14 +3,10 @@ ## the user has already built libxgboost4j.so with CUDA support ## and place it in the lib/ directory. 
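 ##
 ## Local usage sketch (assumes a CUDA-capable machine and the CI containers):
 ##
 ##   bash ops/pipeline/build-jvm-gpu.sh   # produces lib/libxgboost4j.so
 ##   bash ops/pipeline/test-jvm-gpu.sh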
-set -euo pipefail - -# source ops/pipeline/enforce-ci.sh +set -euox pipefail SCALA_VERSION=2.12 -set -x - python3 ops/docker_run.py --container-id xgb-ci.jvm_gpu_build --use-gpus \ -- nvidia-smi python3 ops/docker_run.py --container-id xgb-ci.jvm_gpu_build --use-gpus \ diff --git a/ops/pipeline/test-python.sh b/ops/pipeline/test-python.sh index 507deb37d9c0..c513499a4220 100755 --- a/ops/pipeline/test-python.sh +++ b/ops/pipeline/test-python.sh @@ -2,8 +2,6 @@ set -euo pipefail -source ops/pipeline/enforce-ci.sh - if [[ "$#" -lt 2 ]] then echo "Usage: $0 {gpu|mgpu|cpu|cpu-arm64} {container_id}" diff --git a/ops/pipeline/test-win64-gpu.ps1 b/ops/pipeline/test-win64-gpu.ps1 index 2416d53b3f85..4af3bee2cffc 100644 --- a/ops/pipeline/test-win64-gpu.ps1 +++ b/ops/pipeline/test-win64-gpu.ps1 @@ -1,7 +1,5 @@ $ErrorActionPreference = "Stop" -. ops/pipeline/enforce-ci.ps1 - Write-Host "--- Test XGBoost on Windows with CUDA" nvcc --version From f30a0c358e9cb60d86251d4732d345aee9eae2e4 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Wed, 20 Nov 2024 10:03:45 -0800 Subject: [PATCH 48/86] Only upload if branch --- ops/pipeline/publish-artifact.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ops/pipeline/publish-artifact.sh b/ops/pipeline/publish-artifact.sh index efdf837e55bf..adcb3c521d2a 100755 --- a/ops/pipeline/publish-artifact.sh +++ b/ops/pipeline/publish-artifact.sh @@ -16,8 +16,8 @@ fi artifact="$1" s3_url="$2" -#if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] -#then +if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] +then echo "aws s3 cp ${artifact} ${s3_url} --acl public-read --no-progress" aws s3 cp "${artifact}" "${s3_url}" --acl public-read --no-progress -#fi +fi From 416253190ccbd2ad585110cae035fcad4bfd2d6e Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Wed, 20 Nov 2024 10:47:02 -0800 Subject: [PATCH 49/86] Troubleshoot hanging matrix jobs --- .github/workflows/jvm_tests.yml | 12 +++++++++--- .github/workflows/main.yml | 28 ++++++++++++++++++---------- 2 files changed, 27 insertions(+), 13 deletions(-) diff --git a/.github/workflows/jvm_tests.yml b/.github/workflows/jvm_tests.yml index a0ecaa189e36..0a542be0a510 100644 --- a/.github/workflows/jvm_tests.yml +++ b/.github/workflows/jvm_tests.yml @@ -18,8 +18,10 @@ jobs: build-containers: name: Build CI containers (${{ matrix.container_id }}) runs-on: - - runs-on=${{ github.run_id }} + - runs-on - runner=${{ matrix.runner }} + - run-id=${{ github.run_id }} + - tag=jvm-tests-build-containers-${{ matrix.container_id }} strategy: max-parallel: 2 matrix: @@ -48,8 +50,10 @@ jobs: (arch ${{ matrix.arch }}, runner ${{ matrix.runner }}) needs: build-containers runs-on: - - runs-on=${{ github.run_id }} + - runs-on - runner=${{ matrix.runner }} + - run-id=${{ github.run_id }} + - tag=jvm-tests-build-jvm-manylinux2014-${{ matrix.arch }} strategy: fail-fast: false matrix: @@ -266,8 +270,10 @@ jobs: name: Deploy JVM packages to S3 (${{ matrix.variant }}) needs: [build-jvm-gpu, build-test-jvm-packages, test-jvm-packages-gpu] runs-on: - - runs-on=${{ github.run_id }} + - runs-on - runner=linux-amd64-cpu + - run-id=${{ github.run_id }} + - tag=jvm-tests-deploy-jvm-packages-${{ matrix.variant }} strategy: fail-fast: false matrix: diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 3073c73ae642..cb6fbbadb5c4 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -18,8 +18,10 @@ jobs: build-containers: name: Build CI containers (${{ matrix.container_id }}) 
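     # Note (illustrative): each item under runs-on is a key=value label for
     # the RunsOn service; the per-job tag label added here gives every matrix
     # job a unique, searchable identity when diagnosing hung runners.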
runs-on: - - runs-on=${{ github.run_id }} + - runs-on - runner=${{ matrix.runner }} + - run-id=${{ github.run_id }} + - tag=main-build-containers-${{ matrix.container_id }} strategy: max-parallel: 2 matrix: @@ -169,8 +171,10 @@ jobs: name: Build manylinux2014_${{ matrix.arch }} wheel needs: build-containers runs-on: - - runs-on=${{ github.run_id }} + - runs-on - runner=${{ matrix.runner }} + - run-id=${{ github.run_id }} + - tag=main-build-manylinux2014-${{ matrix.arch }} strategy: fail-fast: false matrix: @@ -227,8 +231,10 @@ jobs: (Suite ${{ matrix.suite }}, Runner ${{ matrix.runner }}) needs: [build-cuda, build-cuda-with-rmm] runs-on: - - runs-on=${{ github.run_id }} + - runs-on - runner=${{ matrix.runner }} + - run-id=${{ github.run_id }} + - tag=main-test-cpp-gpu-${{ matrix.suite }} strategy: fail-fast: false max-parallel: 2 @@ -266,39 +272,41 @@ jobs: name: Run Python tests (${{ matrix.description }}) needs: [build-cuda, build-cpu-arm64] runs-on: - - runs-on=${{ github.run_id }} + - runs-on - runner=${{ matrix.runner }} + - run-id=${{ github.run_id }} + - tag=main-test-python-${{ matrix.description }} strategy: fail-fast: false max-parallel: 2 matrix: include: - - description: "single GPU" + - description: single-gpu container: xgb-ci.gpu suite: gpu runner: linux-amd64-gpu artifact_from: build-cuda - - description: "single GPU, nightly deps" + - description: single-gpu-nightly-deps container: xgb-ci.gpu_dev_ver suite: gpu runner: linux-amd64-gpu artifact_from: build-cuda - - description: "multiple GPUs" + - description: multiple-gpu container: xgb-ci.gpu suite: mgpu runner: linux-amd64-mgpu artifact_from: build-cuda - - description: "multiple GPUs, nightly deps" + - description: multiple-gpu-nightly-deps container: xgb-ci.gpu_dev_ver suite: mgpu runner: linux-amd64-mgpu artifact_from: build-cuda - - description: "CPU" + - description: cpu-amd64 container: xgb-ci.cpu suite: cpu runner: linux-amd64-cpu artifact_from: build-cuda - - description: "CPU ARM64" + - description: cpu-arm64 container: xgb-ci.aarch64 suite: cpu-arm64 runner: linux-arm64-cpu From 2d374698e7e9b1cc68e8cab732407c76b3156391 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Wed, 20 Nov 2024 15:25:33 -0800 Subject: [PATCH 50/86] Use major version of actions --- .github/workflows/i386.yml | 4 ++-- .github/workflows/jvm_tests.yml | 24 +++++++++++------------ .github/workflows/lint.yml | 18 ++++++++--------- .github/workflows/main.yml | 18 ++++++++--------- .github/workflows/misc.yml | 8 ++++---- .github/workflows/python_tests.yml | 16 +++++++-------- .github/workflows/python_wheels_macos.yml | 4 ++-- .github/workflows/r_nold.yml | 2 +- .github/workflows/r_tests.yml | 10 +++++----- .github/workflows/scorecards.yml | 2 +- .github/workflows/sycl_tests.yml | 8 ++++---- .github/workflows/update_rapids.yml | 2 +- .github/workflows/windows.yml | 4 ++-- 13 files changed, 60 insertions(+), 60 deletions(-) diff --git a/.github/workflows/i386.yml b/.github/workflows/i386.yml index 455d6ea91033..8b7c71a82bf8 100644 --- a/.github/workflows/i386.yml +++ b/.github/workflows/i386.yml @@ -19,11 +19,11 @@ jobs: ports: - 5000:5000 steps: - - uses: actions/checkout@v4.2.2 + - uses: actions/checkout@v4 with: submodules: 'true' - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3.7.1 + uses: docker/setup-buildx-action@v3 with: driver-opts: network=host - name: Build and push container diff --git a/.github/workflows/jvm_tests.yml b/.github/workflows/jvm_tests.yml index 0a542be0a510..27ecd32ab3a6 100644 --- 
a/.github/workflows/jvm_tests.yml +++ b/.github/workflows/jvm_tests.yml @@ -36,7 +36,7 @@ jobs: steps: # Restart Docker daemon so that it recognizes the ephemeral disks - run: sudo systemctl restart docker - - uses: actions/checkout@v4.2.2 + - uses: actions/checkout@v4 with: submodules: "true" - name: Build ${{ matrix.container_id }} @@ -65,7 +65,7 @@ jobs: steps: # Restart Docker daemon so that it recognizes the ephemeral disks - run: sudo systemctl restart docker - - uses: actions/checkout@v4.2.2 + - uses: actions/checkout@v4 with: submodules: "true" - name: Fetch container from cache @@ -89,7 +89,7 @@ jobs: steps: # Restart Docker daemon so that it recognizes the ephemeral disks - run: sudo systemctl restart docker - - uses: actions/checkout@v4.2.2 + - uses: actions/checkout@v4 with: submodules: "true" - name: Fetch container from cache @@ -119,7 +119,7 @@ jobs: libname: libxgboost4j_intel_${{ github.sha }}.dylib runner: macos-13 steps: - - uses: actions/checkout@v4.2.2 + - uses: actions/checkout@v4 with: submodules: "true" - run: bash ${{ matrix.script }} @@ -141,7 +141,7 @@ jobs: steps: # Restart Docker daemon so that it recognizes the ephemeral disks - run: sudo systemctl restart docker - - uses: actions/checkout@v4.2.2 + - uses: actions/checkout@v4 with: submodules: "true" - name: Fetch container from cache @@ -168,7 +168,7 @@ jobs: steps: # Restart Docker daemon so that it recognizes the ephemeral disks - run: sudo systemctl restart docker - - uses: actions/checkout@v4.2.2 + - uses: actions/checkout@v4 with: submodules: "true" - name: Fetch container from cache @@ -199,16 +199,16 @@ jobs: os: [windows-latest, macos-13] steps: - - uses: actions/checkout@v4.2.2 + - uses: actions/checkout@v4 with: submodules: 'true' - - uses: actions/setup-java@v4.5.0 + - uses: actions/setup-java@v4 with: distribution: 'temurin' java-version: '8' - - uses: conda-incubator/setup-miniconda@v3.1.0 + - uses: conda-incubator/setup-miniconda@v3 with: miniforge-variant: Miniforge3 miniforge-version: latest @@ -217,7 +217,7 @@ jobs: use-mamba: true - name: Cache Maven packages - uses: actions/cache@v4.1.2 + uses: actions/cache@v4 with: path: ~/.m2 key: ${{ runner.os }}-m2-${{ hashFiles('./jvm-packages/pom.xml') }} @@ -252,7 +252,7 @@ jobs: steps: # Restart Docker daemon so that it recognizes the ephemeral disks - run: sudo systemctl restart docker - - uses: actions/checkout@v4.2.2 + - uses: actions/checkout@v4 with: submodules: "true" - name: Fetch container from cache @@ -287,7 +287,7 @@ jobs: steps: # Restart Docker daemon so that it recognizes the ephemeral disks - run: sudo systemctl restart docker - - uses: actions/checkout@v4.2.2 + - uses: actions/checkout@v4 with: submodules: "true" - name: Fetch container from cache diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 70d892b1061d..fcc804776fff 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -24,7 +24,7 @@ jobs: steps: # Restart Docker daemon so that it recognizes the ephemeral disks - run: sudo systemctl restart docker - - uses: actions/checkout@v4.2.2 + - uses: actions/checkout@v4 with: submodules: "true" - name: Build ${{ env.CONTAINER_ID }} @@ -39,7 +39,7 @@ jobs: steps: # Restart Docker daemon so that it recognizes the ephemeral disks - run: sudo systemctl restart docker - - uses: actions/checkout@v4.2.2 + - uses: actions/checkout@v4 with: submodules: "true" - name: Fetch container from cache @@ -52,10 +52,10 @@ jobs: runs-on: ubuntu-latest name: Type and format checks for the Python package 
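     # Note (illustrative): a major-version tag such as actions/checkout@v4
     # tracks the newest v4.x release automatically, whereas an exact pin
     # like @v4.2.2 stays frozen until bumped by hand.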
steps: - - uses: actions/checkout@v4.2.2 + - uses: actions/checkout@v4 with: submodules: 'true' - - uses: conda-incubator/setup-miniconda@v3.1.0 + - uses: conda-incubator/setup-miniconda@v3 with: miniforge-variant: Miniforge3 miniforge-version: latest @@ -84,10 +84,10 @@ jobs: runs-on: ubuntu-latest name: Code linting for C++ steps: - - uses: actions/checkout@v4.2.2 + - uses: actions/checkout@v4 with: submodules: 'true' - - uses: actions/setup-python@v5.3.0 + - uses: actions/setup-python@v5 with: python-version: "3.10" architecture: 'x64' @@ -105,16 +105,16 @@ jobs: env: R_REMOTES_NO_ERRORS_FROM_WARNINGS: true steps: - - uses: actions/checkout@v4.2.2 + - uses: actions/checkout@v4 with: submodules: 'true' - - uses: r-lib/actions/setup-r@v2.11.0 + - uses: r-lib/actions/setup-r@v2 with: r-version: "release" - name: Cache R packages - uses: actions/cache@v4.1.2 + uses: actions/cache@v4 with: path: ${{ env.R_LIBS_USER }} key: ${{ runner.os }}-r-release-7-${{ hashFiles('R-package/DESCRIPTION') }} diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index cb6fbbadb5c4..ceae5cae8523 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -42,7 +42,7 @@ jobs: steps: # Restart Docker daemon so that it recognizes the ephemeral disks - run: sudo systemctl restart docker - - uses: actions/checkout@v4.2.2 + - uses: actions/checkout@v4 with: submodules: "true" - name: Build ${{ matrix.container_id }} @@ -59,7 +59,7 @@ jobs: steps: # Restart Docker daemon so that it recognizes the ephemeral disks - run: sudo systemctl restart docker - - uses: actions/checkout@v4.2.2 + - uses: actions/checkout@v4 with: submodules: "true" - name: Fetch container from cache @@ -82,7 +82,7 @@ jobs: steps: # Restart Docker daemon so that it recognizes the ephemeral disks - run: sudo systemctl restart docker - - uses: actions/checkout@v4.2.2 + - uses: actions/checkout@v4 with: submodules: "true" - name: Fetch container from cache @@ -109,7 +109,7 @@ jobs: steps: # Restart Docker daemon so that it recognizes the ephemeral disks - run: sudo systemctl restart docker - - uses: actions/checkout@v4.2.2 + - uses: actions/checkout@v4 with: submodules: "true" - name: Fetch container from cache @@ -145,7 +145,7 @@ jobs: steps: # Restart Docker daemon so that it recognizes the ephemeral disks - run: sudo systemctl restart docker - - uses: actions/checkout@v4.2.2 + - uses: actions/checkout@v4 with: submodules: "true" - name: Fetch container from cache @@ -186,7 +186,7 @@ jobs: steps: # Restart Docker daemon so that it recognizes the ephemeral disks - run: sudo systemctl restart docker - - uses: actions/checkout@v4.2.2 + - uses: actions/checkout@v4 with: submodules: "true" - name: Fetch container from cache @@ -211,7 +211,7 @@ jobs: steps: # Restart Docker daemon so that it recognizes the ephemeral disks - run: sudo systemctl restart docker - - uses: actions/checkout@v4.2.2 + - uses: actions/checkout@v4 with: submodules: "true" - name: Fetch container from cache @@ -252,7 +252,7 @@ jobs: steps: # Restart Docker daemon so that it recognizes the ephemeral disks - run: sudo systemctl restart docker - - uses: actions/checkout@v4.2.2 + - uses: actions/checkout@v4 with: submodules: "true" - name: Fetch container from cache @@ -314,7 +314,7 @@ jobs: steps: # Restart Docker daemon so that it recognizes the ephemeral disks - run: sudo systemctl restart docker - - uses: actions/checkout@v4.2.2 + - uses: actions/checkout@v4 with: submodules: "true" - name: Fetch container from cache diff --git 
a/.github/workflows/misc.yml b/.github/workflows/misc.yml index 1e6df46615d5..11f1c5ad7d2d 100644 --- a/.github/workflows/misc.yml +++ b/.github/workflows/misc.yml @@ -18,7 +18,7 @@ jobs: name: Test Google C++ test (CPU) runs-on: macos-13 steps: - - uses: actions/checkout@v4.2.2 + - uses: actions/checkout@v4 with: submodules: 'true' - name: Install system packages @@ -40,7 +40,7 @@ jobs: name: Test Google C++ unittest (CPU Non-OMP) runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4.2.2 + - uses: actions/checkout@v4 with: submodules: 'true' - name: Install system packages @@ -65,10 +65,10 @@ jobs: run: shell: bash -l {0} steps: - - uses: actions/checkout@v4.2.2 + - uses: actions/checkout@v4 with: submodules: 'true' - - uses: conda-incubator/setup-miniconda@v3.1.0 + - uses: conda-incubator/setup-miniconda@v3 with: miniforge-variant: Miniforge3 miniforge-version: latest diff --git a/.github/workflows/python_tests.yml b/.github/workflows/python_tests.yml index bcc0f5b8ba81..1e38fbf9e2d2 100644 --- a/.github/workflows/python_tests.yml +++ b/.github/workflows/python_tests.yml @@ -18,10 +18,10 @@ jobs: runs-on: ubuntu-latest name: Test installing XGBoost Python source package steps: - - uses: actions/checkout@v4.2.2 + - uses: actions/checkout@v4 with: submodules: 'true' - - uses: conda-incubator/setup-miniconda@v3.1.0 + - uses: conda-incubator/setup-miniconda@v3 with: miniforge-variant: Miniforge3 miniforge-version: latest @@ -52,14 +52,14 @@ jobs: os: [macos-13, windows-latest] python-version: ["3.10"] steps: - - uses: actions/checkout@v4.2.2 + - uses: actions/checkout@v4 with: submodules: 'true' - name: Install osx system dependencies if: matrix.os == 'macos-13' run: | brew install ninja libomp - - uses: conda-incubator/setup-miniconda@v3.1.0 + - uses: conda-incubator/setup-miniconda@v3 with: auto-update-conda: true python-version: ${{ matrix.python-version }} @@ -85,11 +85,11 @@ jobs: runs-on: macos-13 timeout-minutes: 60 steps: - - uses: actions/checkout@v4.2.2 + - uses: actions/checkout@v4 with: submodules: 'true' - - uses: conda-incubator/setup-miniconda@v3.1.0 + - uses: conda-incubator/setup-miniconda@v3 with: miniforge-variant: Miniforge3 miniforge-version: latest @@ -132,12 +132,12 @@ jobs: name: Test XGBoost Python package System Installation on Ubuntu runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4.2.2 + - uses: actions/checkout@v4 with: submodules: 'true' - name: Set up Python 3.10 - uses: actions/setup-python@v5.3.0 + uses: actions/setup-python@v5 with: python-version: "3.10" diff --git a/.github/workflows/python_wheels_macos.yml b/.github/workflows/python_wheels_macos.yml index 02f21593c220..0b8c62794359 100644 --- a/.github/workflows/python_wheels_macos.yml +++ b/.github/workflows/python_wheels_macos.yml @@ -30,14 +30,14 @@ jobs: - os: macos-14 platform_id: macosx_arm64 steps: - - uses: actions/checkout@v4.2.2 + - uses: actions/checkout@v4 with: submodules: 'true' - name: Set up homebrew uses: Homebrew/actions/setup-homebrew@13341b4d5e459a98bbe0b122b12c11bf90518cc8 - name: Install libomp run: brew install libomp - - uses: conda-incubator/setup-miniconda@v3.1.0 + - uses: conda-incubator/setup-miniconda@v3 with: miniforge-variant: Miniforge3 miniforge-version: latest diff --git a/.github/workflows/r_nold.yml b/.github/workflows/r_nold.yml index 6ff4aa079e95..89f079fc1df0 100644 --- a/.github/workflows/r_nold.yml +++ b/.github/workflows/r_nold.yml @@ -27,7 +27,7 @@ jobs: run: | apt update && apt install libcurl4-openssl-dev libssl-dev libssh2-1-dev libgit2-dev 
libglpk-dev libxml2-dev libharfbuzz-dev libfribidi-dev git -y - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 + - uses: actions/checkout@v4 with: submodules: 'true' diff --git a/.github/workflows/r_tests.yml b/.github/workflows/r_tests.yml index f5e5152fa29a..f88f9bd2d833 100644 --- a/.github/workflows/r_tests.yml +++ b/.github/workflows/r_tests.yml @@ -37,22 +37,22 @@ jobs: sudo apt update sudo apt install libcurl4-openssl-dev libssl-dev libssh2-1-dev libgit2-dev libglpk-dev libxml2-dev libharfbuzz-dev libfribidi-dev if: matrix.os == 'ubuntu-latest' - - uses: actions/checkout@v4.2.2 + - uses: actions/checkout@v4 with: submodules: 'true' - - uses: r-lib/actions/setup-r@v2.11.0 + - uses: r-lib/actions/setup-r@v2 with: r-version: ${{ matrix.r }} - name: Cache R packages - uses: actions/cache@v4.1.2 + uses: actions/cache@v4 with: path: ${{ env.R_LIBS_USER }} key: ${{ runner.os }}-r-${{ matrix.r }}-7-${{ hashFiles('R-package/DESCRIPTION') }} restore-keys: ${{ runner.os }}-r-${{ matrix.r }}-7-${{ hashFiles('R-package/DESCRIPTION') }} - - uses: actions/setup-python@v5.3.0 + - uses: actions/setup-python@v5 with: python-version: "3.10" architecture: 'x64' @@ -91,7 +91,7 @@ jobs: run: | git config --global --add safe.directory "${GITHUB_WORKSPACE}" - - uses: actions/checkout@v4.2.2 + - uses: actions/checkout@v4 with: submodules: 'true' diff --git a/.github/workflows/scorecards.yml b/.github/workflows/scorecards.yml index 8ab77ec4c382..f3837391b4fe 100644 --- a/.github/workflows/scorecards.yml +++ b/.github/workflows/scorecards.yml @@ -22,7 +22,7 @@ jobs: steps: - name: "Checkout code" - uses: actions/checkout@v4.2.2 + uses: actions/checkout@v4 with: persist-credentials: false diff --git a/.github/workflows/sycl_tests.yml b/.github/workflows/sycl_tests.yml index 7f6214016c00..a0d4f9272100 100644 --- a/.github/workflows/sycl_tests.yml +++ b/.github/workflows/sycl_tests.yml @@ -22,10 +22,10 @@ jobs: name: Test Google C++ unittest (CPU SYCL) runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4.2.2 + - uses: actions/checkout@v4 with: submodules: 'true' - - uses: conda-incubator/setup-miniconda@v3.1.0 + - uses: conda-incubator/setup-miniconda@v3 with: miniforge-variant: Miniforge3 miniforge-version: latest @@ -53,11 +53,11 @@ jobs: runs-on: ubuntu-latest timeout-minutes: 90 steps: - - uses: actions/checkout@v4.2.2 + - uses: actions/checkout@v4 with: submodules: 'true' - - uses: conda-incubator/setup-miniconda@v3.1.0 + - uses: conda-incubator/setup-miniconda@v3 with: miniforge-variant: Miniforge3 miniforge-version: latest diff --git a/.github/workflows/update_rapids.yml b/.github/workflows/update_rapids.yml index d6be99d00851..6e525fed837c 100644 --- a/.github/workflows/update_rapids.yml +++ b/.github/workflows/update_rapids.yml @@ -25,7 +25,7 @@ jobs: name: Check latest RAPIDS runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4.2.2 + - uses: actions/checkout@v4 with: submodules: 'true' - name: Check latest RAPIDS and update conftest.sh diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml index a2f721544d43..0c4d027efec5 100644 --- a/.github/workflows/windows.yml +++ b/.github/workflows/windows.yml @@ -27,7 +27,7 @@ jobs: - runs-on=${{ github.run_id }} - runner=windows-cpu steps: - - uses: actions/checkout@v4.2.2 + - uses: actions/checkout@v4 with: submodules: "true" - run: powershell ops/pipeline/build-win64-gpu.ps1 @@ -47,7 +47,7 @@ jobs: - runs-on=${{ github.run_id }} - runner=windows-gpu steps: - - uses: actions/checkout@v4.2.2 + - 
uses: actions/checkout@v4 with: submodules: "true" - name: Unstash files From eef98c75354da1d91e9a6a0a4918a208a5b5259c Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Wed, 20 Nov 2024 16:16:09 -0800 Subject: [PATCH 51/86] Move all commands to separate script files --- .github/workflows/freebsd.yml | 7 +- .github/workflows/jvm_tests.yml | 1 - .github/workflows/main.yml | 6 +- .github/workflows/misc.yml | 68 +---------- .github/workflows/python_tests.yml | 109 ++---------------- .github/workflows/sycl_tests.yml | 25 +--- ops/conda_env/sdist_test.yml | 2 - ops/pipeline/build-test-jvm-packages-impl.sh | 1 + ops/pipeline/classify-git-branch.sh | 3 +- ops/pipeline/deploy-jvm-packages-impl.sh | 3 +- ops/pipeline/deploy-jvm-packages.sh | 1 + ops/pipeline/test-c-api-demo.sh | 39 +++++++ ops/pipeline/test-python-macos.sh | 21 ++++ ops/pipeline/test-python-sdist.sh | 11 ++ ...thon-impl.sh => test-python-wheel-impl.sh} | 1 + .../{test-python.sh => test-python-wheel.sh} | 3 +- ops/pipeline/test-python-with-sysprefix.sh | 22 ++++ 17 files changed, 125 insertions(+), 198 deletions(-) create mode 100755 ops/pipeline/test-c-api-demo.sh create mode 100755 ops/pipeline/test-python-macos.sh create mode 100755 ops/pipeline/test-python-sdist.sh rename ops/pipeline/{test-python-impl.sh => test-python-wheel-impl.sh} (97%) rename ops/pipeline/{test-python.sh => test-python-wheel.sh} (77%) create mode 100755 ops/pipeline/test-python-with-sysprefix.sh diff --git a/.github/workflows/freebsd.yml b/.github/workflows/freebsd.yml index d0eb13c20fb6..bfa7c5a6feef 100644 --- a/.github/workflows/freebsd.yml +++ b/.github/workflows/freebsd.yml @@ -27,8 +27,5 @@ jobs: pkg install -y cmake git ninja googletest run: | - mkdir build - cd build - cmake .. -GNinja -DGOOGLE_TEST=ON - ninja -v - ./testxgboost + bash ops/script/build_via_cmake.sh + ./build/testxgboost diff --git a/.github/workflows/jvm_tests.yml b/.github/workflows/jvm_tests.yml index 27ecd32ab3a6..f5f369015bc8 100644 --- a/.github/workflows/jvm_tests.yml +++ b/.github/workflows/jvm_tests.yml @@ -232,7 +232,6 @@ jobs: run: | cd lib/ Rename-Item -Path xgboost4j.dll -NewName xgboost4j_${{ github.sha }}.dll - dir python -m awscli s3 cp xgboost4j_${{ github.sha }}.dll ` s3://xgboost-nightly-builds/${{ env.BRANCH_NAME }}/libxgboost4j/ ` --acl public-read --region us-west-2 diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index ceae5cae8523..156c813a3446 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -268,14 +268,14 @@ jobs: KEY: ${{ matrix.artifact_from }} - run: bash ops/pipeline/test-cpp-gpu.sh ${{ matrix.suite }} - test-python: + test-python-wheel: name: Run Python tests (${{ matrix.description }}) needs: [build-cuda, build-cpu-arm64] runs-on: - runs-on - runner=${{ matrix.runner }} - run-id=${{ github.run_id }} - - tag=main-test-python-${{ matrix.description }} + - tag=main-test-python-wheel-${{ matrix.description }} strategy: fail-fast: false max-parallel: 2 @@ -329,4 +329,4 @@ jobs: COMMAND: download KEY: ${{ matrix.artifact_from }} - name: Run Python tests, ${{ matrix.description }} - run: bash ops/pipeline/test-python.sh ${{ matrix.suite }} ${{ matrix.container }} + run: bash ops/pipeline/test-python-wheel.sh ${{ matrix.suite }} ${{ matrix.container }} diff --git a/.github/workflows/misc.yml b/.github/workflows/misc.yml index 11f1c5ad7d2d..c031cfea3d2d 100644 --- a/.github/workflows/misc.yml +++ b/.github/workflows/misc.yml @@ -14,28 +14,6 @@ env: ${{ github.event.pull_request.number && 'PR-' }}${{ 
github.event.pull_request.number || github.ref_name }} jobs: - gtest-cpu: - name: Test Google C++ test (CPU) - runs-on: macos-13 - steps: - - uses: actions/checkout@v4 - with: - submodules: 'true' - - name: Install system packages - run: | - brew install ninja libomp - - name: Build gtest binary - run: | - mkdir build - cd build - cmake .. -DGOOGLE_TEST=ON -DUSE_OPENMP=ON -DUSE_DMLC_GTEST=ON -GNinja -DBUILD_DEPRECATED_CLI=ON -DUSE_SANITIZER=ON -DENABLED_SANITIZERS=address -DCMAKE_BUILD_TYPE=RelWithDebInfo - ninja -v - - name: Run gtest binary - run: | - cd build - ./testxgboost - ctest -R TestXGBoostCLI --extra-verbose - gtest-cpu-nonomp: name: Test Google C++ unittest (CPU Non-OMP) runs-on: ubuntu-latest @@ -47,12 +25,7 @@ jobs: run: | sudo apt-get install -y --no-install-recommends ninja-build - name: Build and install XGBoost - shell: bash -l {0} - run: | - mkdir build - cd build - cmake .. -GNinja -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DUSE_OPENMP=OFF -DBUILD_DEPRECATED_CLI=ON - ninja -v + run: bash ops/script/build_via_cmake.sh -DUSE_OPENMP=OFF - name: Run gtest binary run: | cd build @@ -79,42 +52,5 @@ jobs: run: | conda info conda list - - name: Build and install XGBoost static library - run: | - mkdir build - cd build - cmake .. -DBUILD_STATIC_LIB=ON -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX -GNinja - ninja -v install - cd - - - name: Build and run C API demo with static - run: | - pushd . - cd demo/c-api/ - mkdir build - cd build - cmake .. -GNinja -DCMAKE_PREFIX_PATH=$CONDA_PREFIX - ninja -v - ctest - cd .. - rm -rf ./build - popd - - - name: Build and install XGBoost shared library - run: | - cd build - cmake .. -DBUILD_STATIC_LIB=OFF -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX -GNinja -DPLUGIN_FEDERATED=ON -DGOOGLE_TEST=ON - ninja -v install - ./testxgboost - cd - - name: Build and run C API demo with shared - run: | - pushd . - cd demo/c-api/ - mkdir build - cd build - cmake .. -GNinja -DCMAKE_PREFIX_PATH=$CONDA_PREFIX - ninja -v - ctest - popd - ./ops/script/verify_link.sh ./demo/c-api/build/basic/api-demo - ./ops/script/verify_link.sh ./demo/c-api/build/external-memory/external-memory-demo + run: bash ops/pipeline/test-c-api-demo.sh diff --git a/.github/workflows/python_tests.yml b/.github/workflows/python_tests.yml index 1e38fbf9e2d2..e378ab010d6a 100644 --- a/.github/workflows/python_tests.yml +++ b/.github/workflows/python_tests.yml @@ -14,9 +14,13 @@ concurrency: cancel-in-progress: true jobs: - python-sdist-test-on-Linux: - runs-on: ubuntu-latest - name: Test installing XGBoost Python source package + python-sdist-test: + runs-on: ${{ matrix.os }} + name: Test installing Python XGBoost from the source distribution (${{ matrix.os }}) + strategy: + fail-fast: false + matrix: + os: [macos-13, windows-latest, ubuntu-latest] steps: - uses: actions/checkout@v4 with: @@ -32,53 +36,12 @@ jobs: run: | conda info conda list - - name: Build and install XGBoost + - name: Install extra package for MacOS run: | - cd python-package - python --version - python -m build --sdist - pip install -v ./dist/xgboost-*.tar.gz --config-settings use_openmp=False - cd .. - python -c 'import xgboost' - - python-sdist-test: - # Use system toolchain instead of conda toolchain for macos and windows. 
- # MacOS has linker error if clang++ from conda-forge is used - runs-on: ${{ matrix.os }} - name: Test installing XGBoost Python source package on ${{ matrix.os }} - strategy: - fail-fast: false - matrix: - os: [macos-13, windows-latest] - python-version: ["3.10"] - steps: - - uses: actions/checkout@v4 - with: - submodules: 'true' - - name: Install osx system dependencies + mamba install -c conda-forge llvm-openmp if: matrix.os == 'macos-13' - run: | - brew install ninja libomp - - uses: conda-incubator/setup-miniconda@v3 - with: - auto-update-conda: true - python-version: ${{ matrix.python-version }} - activate-environment: test - - name: Install build - run: | - conda install -c conda-forge python-build - - name: Display Conda env - run: | - conda info - conda list - name: Build and install XGBoost - run: | - cd python-package - python --version - python -m build --sdist - pip install -v ./dist/xgboost-*.tar.gz - cd .. - python -c 'import xgboost' + run: bash ops/pipeline/test-python-sdist.sh python-tests-on-macos: name: Test XGBoost Python package on macos-13 @@ -102,31 +65,7 @@ jobs: conda info conda list - - name: Build XGBoost on macos - run: | - brew install ninja - - mkdir build - cd build - # Set prefix, to use OpenMP library from Conda env - # See https://github.com/dmlc/xgboost/issues/7039#issuecomment-1025038228 - # to learn why we don't use libomp from Homebrew. - cmake .. -GNinja -DCMAKE_PREFIX_PATH=$CONDA_PREFIX -DBUILD_DEPRECATED_CLI=ON - ninja - - - name: Install Python package - run: | - cd python-package - python --version - pip install -v . - - - name: Test Python package - run: | - pytest -s -v -rxXs --durations=0 ./tests/python - - - name: Test Dask Interface - run: | - pytest -s -v -rxXs --durations=0 ./tests/test_distributed/test_with_dask + - run: bash ops/pipeline/test-python-macos.sh python-system-installation-on-ubuntu: name: Test XGBoost Python package System Installation on Ubuntu @@ -135,32 +74,8 @@ jobs: - uses: actions/checkout@v4 with: submodules: 'true' - - name: Set up Python 3.10 uses: actions/setup-python@v5 with: python-version: "3.10" - - - name: Install ninja - run: | - sudo apt-get update && sudo apt-get install -y ninja-build - - - name: Build XGBoost on Ubuntu - run: | - mkdir build - cd build - cmake .. -GNinja - ninja - - - name: Copy lib to system lib - run: | - cp lib/* "$(python -c 'import sys; print(sys.base_prefix)')/lib" - - - name: Install XGBoost in Virtual Environment - run: | - cd python-package - pip install virtualenv - virtualenv venv - source venv/bin/activate && \ - pip install -v . --config-settings use_system_libxgboost=True && \ - python -c 'import xgboost' + - run: bash ops/pipeline/test-python-with-sysprefix.sh diff --git a/.github/workflows/sycl_tests.yml b/.github/workflows/sycl_tests.yml index a0d4f9272100..babf9184fe86 100644 --- a/.github/workflows/sycl_tests.yml +++ b/.github/workflows/sycl_tests.yml @@ -38,15 +38,9 @@ jobs: conda list - name: Build and install XGBoost run: | - mkdir build - cd build - cmake .. 
-DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DPLUGIN_SYCL=ON -DCMAKE_CXX_COMPILER=g++ \ - -DCMAKE_C_COMPILER=gcc -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX -GNinja - ninja + bash ops/script/build_via_cmake.sh --conda-env=linux_sycl_test -DPLUGIN_SYCL=ON - name: Run gtest - run: | - cd build - ./testxgboost + run: ./build/testxgboost python-sycl-tests-on-ubuntu: name: Test XGBoost Python package with SYCL @@ -56,7 +50,6 @@ jobs: - uses: actions/checkout@v4 with: submodules: 'true' - - uses: conda-incubator/setup-miniconda@v3 with: miniforge-variant: Miniforge3 @@ -64,23 +57,15 @@ jobs: activate-environment: linux_sycl_test environment-file: ops/conda_env/linux_sycl_test.yml use-mamba: true - - name: Display Conda env run: | conda info conda list - - name: Build XGBoost on Ubuntu - run: | - mkdir build - cd build - cmake .. -DPLUGIN_SYCL=ON -DCMAKE_CXX_COMPILER=g++ -DCMAKE_C_COMPILER=gcc \ - -DCMAKE_PREFIX_PATH=$CONDA_PREFIX -GNinja - ninja - - name: Install Python package + - name: Build and install XGBoost run: | + bash ops/script/build_via_cmake.sh --conda-env=linux_sycl_test -DPLUGIN_SYCL=ON cd python-package python --version pip install -v . - name: Test Python package - run: | - pytest -s -v -rxXs --durations=0 ./tests/python-sycl/ + run: pytest -s -v -rxXs --durations=0 ./tests/python-sycl/ diff --git a/ops/conda_env/sdist_test.yml b/ops/conda_env/sdist_test.yml index 3597b42c6132..c21cd2b701e1 100644 --- a/ops/conda_env/sdist_test.yml +++ b/ops/conda_env/sdist_test.yml @@ -9,5 +9,3 @@ dependencies: - cmake - ninja - python-build -- c-compiler -- cxx-compiler diff --git a/ops/pipeline/build-test-jvm-packages-impl.sh b/ops/pipeline/build-test-jvm-packages-impl.sh index 3290bf0f17c9..bd2f462a3061 100755 --- a/ops/pipeline/build-test-jvm-packages-impl.sh +++ b/ops/pipeline/build-test-jvm-packages-impl.sh @@ -1,5 +1,6 @@ #!/bin/bash ## Build and test JVM packages. +## Companion script for build-test-jvm-packages.sh. ## ## Note. This script takes in all inputs via environment variables. diff --git a/ops/pipeline/classify-git-branch.sh b/ops/pipeline/classify-git-branch.sh index 0c175c92792d..3d9a2348f23e 100755 --- a/ops/pipeline/classify-git-branch.sh +++ b/ops/pipeline/classify-git-branch.sh @@ -1,6 +1,5 @@ #!/bin/bash - -## Detect whether the current git branch is a pull request or a release branch. +## Detect whether the current git branch is a pull request or a release branch set -euo pipefail diff --git a/ops/pipeline/deploy-jvm-packages-impl.sh b/ops/pipeline/deploy-jvm-packages-impl.sh index 36fd23a583d6..2b6d60303d01 100755 --- a/ops/pipeline/deploy-jvm-packages-impl.sh +++ b/ops/pipeline/deploy-jvm-packages-impl.sh @@ -1,5 +1,6 @@ #!/bin/bash -## Deploy JVM packages to xgboost-maven-repo S3 bucket +## Deploy JVM packages to S3 bucket +## Companion script for ops/pipeline/deploy-jvm-packages.sh set -euox pipefail diff --git a/ops/pipeline/deploy-jvm-packages.sh b/ops/pipeline/deploy-jvm-packages.sh index 866b6dded393..1b47029b346a 100755 --- a/ops/pipeline/deploy-jvm-packages.sh +++ b/ops/pipeline/deploy-jvm-packages.sh @@ -1,4 +1,5 @@ #!/bin/bash +## Deploy JVM packages to S3 bucket set -euox pipefail diff --git a/ops/pipeline/test-c-api-demo.sh b/ops/pipeline/test-c-api-demo.sh new file mode 100755 index 000000000000..9a44c8c46fd9 --- /dev/null +++ b/ops/pipeline/test-c-api-demo.sh @@ -0,0 +1,39 @@ +#!/bin/bash +## Test C API demos + +set -euox pipefail + +# Build and install XGBoost static library (libxgboost.a) +mkdir build +pushd build +cmake .. 
-DBUILD_STATIC_LIB=ON -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX -GNinja +ninja -v install +popd + +# Build and run C API demo with static library +pushd demo/c-api/ +mkdir build-c-api-demo +pushd build-c-api-demo +cmake .. -GNinja -DCMAKE_PREFIX_PATH=$CONDA_PREFIX +ninja -v +ctest +popd +rm -rf ./build-c-api-demo +popd + +# Build and install XGBoost shared library (libxgboost.so) +pushd build +cmake .. -DBUILD_STATIC_LIB=OFF -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX -GNinja \ + -DPLUGIN_FEDERATED=ON +ninja -v install +popd + +# Build and run C API demo with shared library +mkdir demo/c-api/build-c-api-demo +pushd demo/c-api/build-c-api-demo +cmake .. -GNinja -DCMAKE_PREFIX_PATH=$CONDA_PREFIX +ninja -v +ctest +popd +./ops/script/verify_link.sh ./demo/c-api/build-c-api-demo/basic/api-demo +./ops/script/verify_link.sh ./demo/c-api/build-c-api-demo/external-memory/external-memory-demo diff --git a/ops/pipeline/test-python-macos.sh b/ops/pipeline/test-python-macos.sh new file mode 100755 index 000000000000..de1521573c71 --- /dev/null +++ b/ops/pipeline/test-python-macos.sh @@ -0,0 +1,21 @@ +#!/bin/bash +## Test XGBoost Python wheel on MacOS + +set -euox pipefail + +brew install ninja + +mkdir build +cd build +# Set prefix, to use OpenMP library from Conda env +# See https://github.com/dmlc/xgboost/issues/7039#issuecomment-1025038228 +# to learn why we don't use libomp from Homebrew. +cmake .. -GNinja -DCMAKE_PREFIX_PATH=$CONDA_PREFIX -DBUILD_DEPRECATED_CLI=ON +ninja + +cd python-package +python --version +pip install -v . + +pytest -s -v -rxXs --durations=0 ./tests/python +pytest -s -v -rxXs --durations=0 ./tests/test_distributed/test_with_dask diff --git a/ops/pipeline/test-python-sdist.sh b/ops/pipeline/test-python-sdist.sh new file mode 100755 index 000000000000..d6b71597380e --- /dev/null +++ b/ops/pipeline/test-python-sdist.sh @@ -0,0 +1,11 @@ +#!/bin/bash +## Test installing Python XGBoost from source distribution + +set -euox pipefail + +cd python-package +python --version +python -m build --sdist +pip install -v ./dist/xgboost-*.tar.gz +cd .. 
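+# Sanity check: the package installed from the sdist should import cleanly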
+python -c 'import xgboost' diff --git a/ops/pipeline/test-python-impl.sh b/ops/pipeline/test-python-wheel-impl.sh similarity index 97% rename from ops/pipeline/test-python-impl.sh rename to ops/pipeline/test-python-wheel-impl.sh index be1cb410c96c..75bfa5fbaffb 100755 --- a/ops/pipeline/test-python-impl.sh +++ b/ops/pipeline/test-python-wheel-impl.sh @@ -1,4 +1,5 @@ #!/bin/bash +## Companion script for ops/pipeline/test-python-wheel.sh set -eo pipefail diff --git a/ops/pipeline/test-python.sh b/ops/pipeline/test-python-wheel.sh similarity index 77% rename from ops/pipeline/test-python.sh rename to ops/pipeline/test-python-wheel.sh index c513499a4220..113695b4a820 100755 --- a/ops/pipeline/test-python.sh +++ b/ops/pipeline/test-python-wheel.sh @@ -1,4 +1,5 @@ #!/bin/bash +## Test XGBoost Python wheel on the Linux platform set -euo pipefail @@ -20,4 +21,4 @@ fi python3 ops/docker_run.py --container-id "${container_id}" ${gpu_option} \ --run-args='--shm-size=4g' \ - -- bash ops/pipeline/test-python-impl.sh "${suite}" + -- bash ops/pipeline/test-python-wheel-impl.sh "${suite}" diff --git a/ops/pipeline/test-python-with-sysprefix.sh b/ops/pipeline/test-python-with-sysprefix.sh new file mode 100755 index 000000000000..b44e28e8b29a --- /dev/null +++ b/ops/pipeline/test-python-with-sysprefix.sh @@ -0,0 +1,22 @@ +#!/bin/bash +## Test if Python XGBoost can be configured to use libxgboost.so from the system prefix + +set -euox pipefail + +sudo apt-get update && sudo apt-get install -y ninja-build + +mkdir build +cd build +cmake .. -GNinja +ninja + +# Copy libxgboost.so to system prefix +cp -v lib/* "$(python -c 'import sys; print(sys.base_prefix)')/lib" + +# Now configure Python XGBoost to use libxgboost.so from the system prefix +cd python-package +pip install virtualenv +virtualenv venv +source venv/bin/activate && \ + pip install -v . 
--config-settings use_system_libxgboost=True && \ + python -c 'import xgboost' From bb1a1523a7243c9ebe3ab5df4ff5d2518f497d21 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Wed, 20 Nov 2024 16:34:59 -0800 Subject: [PATCH 52/86] Fix shell error --- ops/script/build_via_cmake.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ops/script/build_via_cmake.sh b/ops/script/build_via_cmake.sh index 86e3677f4392..3fc39a39b4df 100755 --- a/ops/script/build_via_cmake.sh +++ b/ops/script/build_via_cmake.sh @@ -17,7 +17,7 @@ then cmake_args="$@" # Workaround for file permission error - if [[ -n $CI_BUILD_UID ]] + if [[ -n ${CI_BUILD_UID:-} ]] then gosu root chown -R "${CI_BUILD_UID}:${CI_BUILD_GID}" /opt/miniforge/envs fi From 5ef3454a54ffc4786bd2f554bd97b5193f223cc9 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Wed, 20 Nov 2024 16:44:08 -0800 Subject: [PATCH 53/86] Temporary fix --- ops/script/build_via_cmake.sh | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/ops/script/build_via_cmake.sh b/ops/script/build_via_cmake.sh index 3fc39a39b4df..00a571584ea4 100755 --- a/ops/script/build_via_cmake.sh +++ b/ops/script/build_via_cmake.sh @@ -22,7 +22,11 @@ then gosu root chown -R "${CI_BUILD_UID}:${CI_BUILD_GID}" /opt/miniforge/envs fi - source activate ${conda_env} + # Don't activate Conda env if it's already activated + if [[ -z ${CONDA_PREFIX:-} ]] + then + source activate ${conda_env} + fi cmake_prefix_flag="-DCMAKE_PREFIX_PATH=$CONDA_PREFIX" else cmake_args="$@" From 5788f144ba3e6a15dbe86d4af5b53fa09003cbb6 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Wed, 20 Nov 2024 22:49:58 -0800 Subject: [PATCH 54/86] Fix deploy step --- ops/pipeline/build-test-jvm-packages-impl.sh | 22 +------------- ops/pipeline/deploy-jvm-packages-impl.sh | 4 ++- ops/pipeline/deploy-jvm-packages.sh | 2 +- ops/script/inject_jvm_lib.sh | 32 ++++++++++++++++++++ 4 files changed, 37 insertions(+), 23 deletions(-) create mode 100755 ops/script/inject_jvm_lib.sh diff --git a/ops/pipeline/build-test-jvm-packages-impl.sh b/ops/pipeline/build-test-jvm-packages-impl.sh index bd2f462a3061..ed95ba3368ab 100755 --- a/ops/pipeline/build-test-jvm-packages-impl.sh +++ b/ops/pipeline/build-test-jvm-packages-impl.sh @@ -43,27 +43,7 @@ fi # step, but we need to do it manually here.) if [[ "${SKIP_NATIVE_BUILD:-}" == "1" ]] then - echo "Using externally provided libxgboost4j.so. Locating one from lib/..." 
- mkdir -p jvm-packages/xgboost4j/src/main/resources/lib/linux/x86_64/ - cp -v lib/libxgboost4j.so jvm-packages/xgboost4j/src/main/resources/lib/linux/x86_64/ - mkdir -p jvm-packages/xgboost4j/src/test/resources - mkdir -p jvm-packages/xgboost4j-spark/src/test/resources - mkdir -p jvm-packages/xgboost4j-spark-gpu/src/test/resources - - # Generate machine.txt.* files from the CLI regression demo - # TODO(hcho3): Remove once CLI is removed - pushd demo/CLI/regression - python3 mapfeat.py - python3 mknfold.py machine.txt 1 - popd - - cp -v demo/data/agaricus.* \ - jvm-packages/xgboost4j/src/test/resources - cp -v demo/CLI/regression/machine.txt.t* demo/data/agaricus.* \ - jvm-packages/xgboost4j-spark/src/test/resources - cp -v demo/data/veterans_lung_cancer.csv \ - jvm-packages/xgboost4j-spark/src/test/resources/rank.train.csv \ - jvm-packages/xgboost4j-spark-gpu/src/test/resources + bash ops/script/inject_jvm_lib.sh fi cd jvm-packages/ diff --git a/ops/pipeline/deploy-jvm-packages-impl.sh b/ops/pipeline/deploy-jvm-packages-impl.sh index 2b6d60303d01..f3828de8fe2f 100755 --- a/ops/pipeline/deploy-jvm-packages-impl.sh +++ b/ops/pipeline/deploy-jvm-packages-impl.sh @@ -11,8 +11,10 @@ then fi variant="$1" - maven_options="-DskipTests -Dmaven.test.skip=true -Dskip.native.build=true" + +bash ops/script/inject_jvm_lib.sh + case "$variant" in cpu) # CPU variant diff --git a/ops/pipeline/deploy-jvm-packages.sh b/ops/pipeline/deploy-jvm-packages.sh index 1b47029b346a..c78f8adfabc3 100755 --- a/ops/pipeline/deploy-jvm-packages.sh +++ b/ops/pipeline/deploy-jvm-packages.sh @@ -3,7 +3,7 @@ set -euox pipefail -source ops/pipeline/enforce-ci.sh +# source ops/pipeline/enforce-ci.sh if [[ "$#" -lt 2 ]] then diff --git a/ops/script/inject_jvm_lib.sh b/ops/script/inject_jvm_lib.sh new file mode 100755 index 000000000000..82584aeaca92 --- /dev/null +++ b/ops/script/inject_jvm_lib.sh @@ -0,0 +1,32 @@ +#!/bin/bash +# Inject lib/libxgboost4j.so into JVM packages. +# This script is useful when the user opts to set skip.native.build=true +# option in the JVM package build. When this option is set, the JVM package +# build will not build libxgboost4j.so; instead it will expect to find the +# library in jvm-packages/xgboost4j/src/main/resources/lib/{os}/{arch}/. +# This script will ensure that libxgboost4j.so is copied to the correct +# location. + +set -euox pipefail + +echo "Using externally provided libxgboost4j.so. Locating one from lib/..." 
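+# Copy the prebuilt libxgboost4j.so into the resource tree where the Maven build looks for it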
+mkdir -p jvm-packages/xgboost4j/src/main/resources/lib/linux/x86_64/ +cp -v lib/libxgboost4j.so jvm-packages/xgboost4j/src/main/resources/lib/linux/x86_64/ +mkdir -p jvm-packages/xgboost4j/src/test/resources +mkdir -p jvm-packages/xgboost4j-spark/src/test/resources +mkdir -p jvm-packages/xgboost4j-spark-gpu/src/test/resources + +# Generate machine.txt.* files from the CLI regression demo +# TODO(hcho3): Remove once CLI is removed +pushd demo/CLI/regression +python3 mapfeat.py +python3 mknfold.py machine.txt 1 +popd + +cp -v demo/data/agaricus.* \ + jvm-packages/xgboost4j/src/test/resources +cp -v demo/CLI/regression/machine.txt.t* demo/data/agaricus.* \ + jvm-packages/xgboost4j-spark/src/test/resources +cp -v demo/data/veterans_lung_cancer.csv \ + jvm-packages/xgboost4j-spark/src/test/resources/rank.train.csv \ + jvm-packages/xgboost4j-spark-gpu/src/test/resources From ebea2b1ee72d3b7ed1e5874d8c49b3a6cc704bee Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Wed, 20 Nov 2024 23:00:33 -0800 Subject: [PATCH 55/86] install bash in FreeBSD --- .github/workflows/freebsd.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/freebsd.yml b/.github/workflows/freebsd.yml index bfa7c5a6feef..f20b6fd40157 100644 --- a/.github/workflows/freebsd.yml +++ b/.github/workflows/freebsd.yml @@ -24,7 +24,7 @@ jobs: with: usesh: true prepare: | - pkg install -y cmake git ninja googletest + pkg install -y cmake git ninja googletest bash run: | bash ops/script/build_via_cmake.sh From 904aaeed1daf0bb1d1996315aff17e600150c66c Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Wed, 20 Nov 2024 23:25:44 -0800 Subject: [PATCH 56/86] Fix FreeBSD and SYCL --- .github/workflows/freebsd.yml | 4 +-- .github/workflows/sycl_tests.yml | 13 ++------ .../xgboost4j-example/model/dump.raw.txt | 14 ++++++++ ops/pipeline/build-test-sycl.sh | 32 +++++++++++++++++++ ops/pipeline/test-freebsd.sh | 10 ++++++ 5 files changed, 59 insertions(+), 14 deletions(-) create mode 100644 jvm-packages/xgboost4j-example/model/dump.raw.txt create mode 100755 ops/pipeline/build-test-sycl.sh create mode 100755 ops/pipeline/test-freebsd.sh diff --git a/.github/workflows/freebsd.yml b/.github/workflows/freebsd.yml index f20b6fd40157..26e8fa34c119 100644 --- a/.github/workflows/freebsd.yml +++ b/.github/workflows/freebsd.yml @@ -25,7 +25,5 @@ jobs: usesh: true prepare: | pkg install -y cmake git ninja googletest bash - run: | - bash ops/script/build_via_cmake.sh - ./build/testxgboost + bash ops/pipeline/test-freebsd.sh diff --git a/.github/workflows/sycl_tests.yml b/.github/workflows/sycl_tests.yml index babf9184fe86..180c62310765 100644 --- a/.github/workflows/sycl_tests.yml +++ b/.github/workflows/sycl_tests.yml @@ -36,11 +36,8 @@ jobs: run: | conda info conda list - - name: Build and install XGBoost - run: | - bash ops/script/build_via_cmake.sh --conda-env=linux_sycl_test -DPLUGIN_SYCL=ON - name: Run gtest - run: ./build/testxgboost + run: bash ops/pipeline/build-test-sycl.sh gtest python-sycl-tests-on-ubuntu: name: Test XGBoost Python package with SYCL @@ -61,11 +58,5 @@ jobs: run: | conda info conda list - - name: Build and install XGBoost - run: | - bash ops/script/build_via_cmake.sh --conda-env=linux_sycl_test -DPLUGIN_SYCL=ON - cd python-package - python --version - pip install -v . 
- name: Test Python package - run: pytest -s -v -rxXs --durations=0 ./tests/python-sycl/ + run: bash ops/pipeline/build-test-sycl.sh pytest diff --git a/jvm-packages/xgboost4j-example/model/dump.raw.txt b/jvm-packages/xgboost4j-example/model/dump.raw.txt new file mode 100644 index 000000000000..91c6417f383f --- /dev/null +++ b/jvm-packages/xgboost4j-example/model/dump.raw.txt @@ -0,0 +1,14 @@ +booster[0]: +0:[odor=none] yes=1,no=2 +1:[spore-print-color=green] yes=3,no=4 + 3:leaf=1.92847228 + 4:leaf=-1.87165487 +2:[stalk-root=club] yes=5,no=6 + 5:leaf=-1.63159156 + 6:leaf=1.78592122 +booster[1]: +0:[stalk-root=rooted] yes=1,no=2 + 1:leaf=-6.24725294 +2:[odor=none] yes=3,no=4 + 3:leaf=-0.967758596 + 4:leaf=0.784398556 diff --git a/ops/pipeline/build-test-sycl.sh b/ops/pipeline/build-test-sycl.sh new file mode 100755 index 000000000000..644c0c9276f1 --- /dev/null +++ b/ops/pipeline/build-test-sycl.sh @@ -0,0 +1,32 @@ +#!/bin/bash +## Build and test oneAPI + +set -euox pipefail + +if [[ "$#" -lt 1 ]] +then + echo "Usage: $0 {gtest,pytest}" + exit 1 +fi + +suite="$1" + +mkdir build +pushd build +cmake .. -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DPLUGIN_SYCL=ON -DCMAKE_CXX_COMPILER=g++ \ + -DCMAKE_C_COMPILER=gcc -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX \ + -DCMAKE_PREFIX_PATH=$CONDA_PREFIX -GNinja +ninja +popd + +case "$suite" in + gtest) + ./build/testxgboost + ;; + pytest) + cd python-package + python --version + pip install -v . + pytest -s -v -rxXs --durations=0 ./tests/python-sycl/ + ;; +esac diff --git a/ops/pipeline/test-freebsd.sh b/ops/pipeline/test-freebsd.sh new file mode 100755 index 000000000000..f9ed61e9e2b8 --- /dev/null +++ b/ops/pipeline/test-freebsd.sh @@ -0,0 +1,10 @@ +#!/bin/bash +## Run tests on FreeBSD + +set -euox pipefail + +mkdir build +cd build +cmake .. 
-GNinja -DGOOGLE_TEST=ON +ninja -v +./testxgboost From 0015d67166d624b3fe854d67f15ab00826ec739b Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Wed, 20 Nov 2024 23:59:19 -0800 Subject: [PATCH 57/86] More fix --- ops/pipeline/deploy-jvm-packages-impl.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ops/pipeline/deploy-jvm-packages-impl.sh b/ops/pipeline/deploy-jvm-packages-impl.sh index f3828de8fe2f..3fb7522ac273 100755 --- a/ops/pipeline/deploy-jvm-packages-impl.sh +++ b/ops/pipeline/deploy-jvm-packages-impl.sh @@ -13,18 +13,17 @@ fi variant="$1" maven_options="-DskipTests -Dmaven.test.skip=true -Dskip.native.build=true" -bash ops/script/inject_jvm_lib.sh - case "$variant" in cpu) # CPU variant for scala_version in 2.12 2.13 do python ops/script/change_scala_version.py --scala-version ${scala_version} --purge-artifacts + bash ops/script/inject_jvm_lib.sh pushd jvm-packages mvn --no-transfer-progress deploy -Pdefault,release-to-s3 ${maven_options} mvn clean - mvn clean -Pdefault,release-to-s3 + mvn clean -Pdefault,release-to-s3 popd done ;; @@ -33,6 +32,7 @@ case "$variant" in for scala_version in 2.12 2.13 do python ops/script/change_scala_version.py --scala-version ${scala_version} --purge-artifacts + bash ops/script/inject_jvm_lib.sh pushd jvm-packages mvn --no-transfer-progress install -Pgpu ${maven_options} mvn --no-transfer-progress deploy -Pgpu,release-to-s3 -pl xgboost4j-spark-gpu ${maven_options} From b931b20ba6e33d2359a2d4b61a57d58a419e5c52 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Thu, 21 Nov 2024 00:41:44 -0800 Subject: [PATCH 58/86] Fix --- ops/pipeline/test-python-macos.sh | 3 ++- ops/pipeline/test-python-with-sysprefix.sh | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/ops/pipeline/test-python-macos.sh b/ops/pipeline/test-python-macos.sh index de1521573c71..b6bb463c9304 100755 --- a/ops/pipeline/test-python-macos.sh +++ b/ops/pipeline/test-python-macos.sh @@ -6,12 +6,13 @@ set -euox pipefail brew install ninja mkdir build -cd build +pushd build # Set prefix, to use OpenMP library from Conda env # See https://github.com/dmlc/xgboost/issues/7039#issuecomment-1025038228 # to learn why we don't use libomp from Homebrew. cmake .. -GNinja -DCMAKE_PREFIX_PATH=$CONDA_PREFIX -DBUILD_DEPRECATED_CLI=ON ninja +popd cd python-package python --version diff --git a/ops/pipeline/test-python-with-sysprefix.sh b/ops/pipeline/test-python-with-sysprefix.sh index b44e28e8b29a..9ee918b112f4 100755 --- a/ops/pipeline/test-python-with-sysprefix.sh +++ b/ops/pipeline/test-python-with-sysprefix.sh @@ -6,9 +6,10 @@ set -euox pipefail sudo apt-get update && sudo apt-get install -y ninja-build mkdir build -cd build +pushd build cmake .. -GNinja ninja +popd # Copy libxgboost.so to system prefix cp -v lib/* "$(python -c 'import sys; print(sys.base_prefix)')/lib" From b64ac1bef302e7bc455ab2a927568afc2d123673 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Thu, 21 Nov 2024 00:53:16 -0800 Subject: [PATCH 59/86] Fix sycl --- ops/pipeline/build-test-sycl.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/ops/pipeline/build-test-sycl.sh b/ops/pipeline/build-test-sycl.sh index 644c0c9276f1..f3b651b18cf9 100755 --- a/ops/pipeline/build-test-sycl.sh +++ b/ops/pipeline/build-test-sycl.sh @@ -27,6 +27,7 @@ case "$suite" in cd python-package python --version pip install -v . + cd .. 
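+    # Return to the source root; the SYCL tests are collected from ./tests/python-sycl/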
pytest -s -v -rxXs --durations=0 ./tests/python-sycl/
    ;;
esac

From 7540203d6ddd14554d894e2376896a38c96f66e5 Mon Sep 17 00:00:00 2001
From: Hyunsu Cho
Date: Thu, 21 Nov 2024 01:09:15 -0800
Subject: [PATCH 60/86] Fix

---
 ops/pipeline/test-python-macos.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/ops/pipeline/test-python-macos.sh b/ops/pipeline/test-python-macos.sh
index b6bb463c9304..63b5690d1312 100755
--- a/ops/pipeline/test-python-macos.sh
+++ b/ops/pipeline/test-python-macos.sh
@@ -18,5 +18,6 @@ cd python-package
 python --version
 pip install -v .

+cd ..
 pytest -s -v -rxXs --durations=0 ./tests/python
 pytest -s -v -rxXs --durations=0 ./tests/test_distributed/test_with_dask

From a8a8c6abbcf2b8dbaa93a5b135a5bcc804f06ce4 Mon Sep 17 00:00:00 2001
From: Hyunsu Cho
Date: Thu, 21 Nov 2024 18:06:10 -0800
Subject: [PATCH 61/86] Document CI

---
 .github/workflows/update_rapids.yml |   2 +-
 dev/release-artifacts.py            |   2 +-
 doc/contrib/ci.rst                  | 349 +++++++++++++++++++---------
 doc/contrib/coding_guide.rst        |  16 +-
 doc/contrib/donate.rst              |   8 +-
 doc/contrib/release.rst             |   2 +-
 doc/contrib/unit_tests.rst          |   2 +-
 ops/pipeline/test-python-wheel.sh   |   1 +
 8 files changed, 256 insertions(+), 126 deletions(-)

diff --git a/.github/workflows/update_rapids.yml b/.github/workflows/update_rapids.yml
index 6e525fed837c..4a3e4747c3ff 100644
--- a/.github/workflows/update_rapids.yml
+++ b/.github/workflows/update_rapids.yml
@@ -36,7 +36,7 @@ jobs:
         if: github.ref == 'refs/heads/master'
         with:
           add-paths: |
-            tests/buildkite
+            ops/docker
           branch: create-pull-request/update-rapids
           base: master
           title: "[CI] Update RAPIDS to latest stable"
diff --git a/dev/release-artifacts.py b/dev/release-artifacts.py
index 08bb2cbfaff2..b4ce2bca6ec4 100644
--- a/dev/release-artifacts.py
+++ b/dev/release-artifacts.py
@@ -123,7 +123,7 @@ def make_python_sdist(
     with DirectoryExcursion(ROOT):
         with open("python-package/pyproject.toml", "r") as f:
             orig_pyproj_lines = f.read()
-        with open("tests/buildkite/remove_nccl_dep.patch", "r") as f:
+        with open("ops/patch/remove_nccl_dep.patch", "r") as f:
             patch_lines = f.read()
         subprocess.run(
             ["patch", "-p0"], input=patch_lines, check=True, text=True, encoding="utf-8"
diff --git a/doc/contrib/ci.rst b/doc/contrib/ci.rst
index af9e6556290c..2e0725fddbff 100644
--- a/doc/contrib/ci.rst
+++ b/doc/contrib/ci.rst
@@ -14,11 +14,9 @@ project.
 **************
 GitHub Actions
 **************
-The configuration files are located under the directory
-`.github/workflows `_.
-
-Most of the tests listed in the configuration files run automatically for every incoming pull
-requests and every update to branches. A few tests however require manual activation:
+We make extensive use of `GitHub Actions `_ to host our
+CI pipelines. Most of the tests listed in the configuration files run automatically for every
+incoming pull request and every update to branches. A few tests, however, require manual activation:

 * R tests with ``noLD`` option: Run R tests using a custom-built R with compilation flag
   ``--disable-long-double``. See `this page `_ for more
   To invoke this test suite for a particular pull request, simply add a review comment
   ``/gha run r-nold-test``. (Ordinary comment won't work. It needs to be a review comment.)

-GitHub Actions is also used to build Python wheels targeting MacOS Intel and Apple Silicon. See
-`.github/workflows/python_wheels.yml
-`_. The
-``python_wheels`` pipeline sets up environment variables prefixed ``CIBW_*`` to indicate the target
-OS and processor. The pipeline then invokes the script ``build_python_wheels.sh``, which in turns
-calls ``cibuildwheel`` to build the wheel. The ``cibuildwheel`` is a library that sets up a
-suitable Python environment for each OS and processor target. Since we don't have Apple Silicon
-machine in GitHub Actions, cross-compilation is needed; ``cibuildwheel`` takes care of the complex
-task of cross-compiling a Python wheel. (Note that ``cibuildwheel`` will call
-``pip wheel``. Since XGBoost has a native library component, we created a customized build
-backend that hooks into ``pip``. The customized backend contains the glue code to compile the native
-library on the fly.)
+*******************************
+Self-Hosted Runners with RunsOn
+*******************************
+
+`RunsOn `_ is a SaaS (Software as a Service) app that lets us easily create
+self-hosted runners to use with GitHub Actions pipelines. RunsOn uses
+`Amazon Web Services (AWS) `_ under the hood to provision runners with
+access to various amounts of CPUs, memory, and NVIDIA GPUs. Thanks to this app, we are able to test
+GPU-accelerated and distributed algorithms of XGBoost while using the familiar interface of
+GitHub Actions.

 *********************************************************
 Reproduce CI testing environments using Docker containers
 *********************************************************
@@ -52,113 +48,246 @@
 Prerequisites
 =============
 1. Install Docker: https://docs.docker.com/engine/install/ubuntu/
 2. Install NVIDIA Docker runtime: https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html#installing-on-ubuntu-and-debian
    The runtime lets you access NVIDIA GPUs inside a Docker container.

+.. _build_run_docker_locally:
+
 ==============================================
 Building and Running Docker containers locally
 ==============================================
-For your convenience, we provide the wrapper script ``tests/ci_build/ci_build.sh``. You can use it as follows:
+For your convenience, we provide three wrapper scripts:
+
+* ``ops/docker_build.py``: Build a Docker container
+* ``ops/docker_build.sh``: Wrapper for ``ops/docker_build.py`` with a more concise interface
+* ``ops/docker_run.py``: Run a command inside a Docker container
+
+**To build a Docker container**, invoke ``docker_build.sh`` as follows:
+
+.. code-block:: bash
+
+   export CONTAINER_ID="ID of the container"
+   export BRANCH_NAME="master"  # Relevant for CI, for local testing, use "master"
+   bash ops/docker_build.sh
+
+where ``CONTAINER_ID`` identifies the container. The wrapper script will look up the YAML file
+``ops/docker/ci_container.yml``. For example, when ``CONTAINER_ID`` is set to ``xgb-ci.gpu``,
+the script will use the corresponding entry from ``ci_container.yml``:
+
+.. code-block:: yaml
+
+   xgb-ci.gpu:
+     container_def: gpu
+     build_args:
+       CUDA_VERSION_ARG: "12.4.1"
+       NCCL_VERSION_ARG: "2.23.4-1"
+       RAPIDS_VERSION_ARG: "24.10"
+
+The ``container_def`` entry indicates where the Dockerfile is located. The container
+definition will be fetched from ``ops/docker/dockerfile/Dockerfile.CONTAINER_DEF`` where
+``CONTAINER_DEF`` is the value of the ``container_def`` entry. In this example, the Dockerfile
+is ``ops/docker/dockerfile/Dockerfile.gpu``.
+
+The ``build_args`` entry lists all the build arguments for the Docker build. In this example,
+the build arguments are:
+
+.. code-block::
+
+   --build-arg CUDA_VERSION_ARG=12.4.1 --build-arg NCCL_VERSION_ARG=2.23.4-1 \
+     --build-arg RAPIDS_VERSION_ARG=24.10
+
+The build arguments provide inputs to the ``ARG`` instructions in the Dockerfile.
+
+.. note:: Inspect the logs from the CI pipeline to find what's going on under the hood
+
+   When invoked, ``ops/docker_build.sh`` logs the precise commands that it runs under the hood.
+   Using the example above:
+
+   .. code-block:: bash
+
+      # docker_build.sh calls docker_build.py...
+      python3 ops/docker_build.py --container-def gpu --container-id xgb-ci.gpu \
+        --build-arg CUDA_VERSION_ARG=12.4.1 --build-arg NCCL_VERSION_ARG=2.23.4-1 \
+        --build-arg RAPIDS_VERSION_ARG=24.10
+
+      ...
+
+      # .. and docker_build.py in turn calls "docker build"...
+      docker build --build-arg CUDA_VERSION_ARG=12.4.1 \
+        --build-arg NCCL_VERSION_ARG=2.23.4-1 \
+        --build-arg RAPIDS_VERSION_ARG=24.10 \
+        --load --progress=plain \
+        --ulimit nofile=1024000:1024000 \
+        -t xgb-ci.gpu \
+        -f ops/docker/dockerfile/Dockerfile.gpu \
+        ops/
+
+   The logs come in handy when debugging the container builds. In addition, you can change
+   the build arguments to make changes to the container.
+
+**To run commands within a Docker container**, invoke ``docker_run.py`` as follows:
+
+.. code-block:: bash
+
+   python3 ops/docker_run.py --container-id "ID of the container" [--use-gpus] \
+     -- "command to run inside the container"
+
+where ``--use-gpus`` should be specified to expose NVIDIA GPUs to the Docker container.
+
+For example:
+
+.. code-block:: bash
+
+   # Run without GPU
+   python3 ops/docker_run.py --container-id xgb-ci.cpu \
+     -- bash ops/script/build_via_cmake.sh
+
+   # Run with NVIDIA GPU
+   python3 ops/docker_run.py --container-id xgb-ci.gpu --use-gpus \
+     -- bash ops/pipeline/test-python-wheel-impl.sh gpu
+
+The ``docker_run.py`` script will convert these commands to the following invocations
+of ``docker run``:
+
+.. code-block:: bash
+
+   docker run --rm --pid=host \
+     -w /workspace -v /path/to/xgboost:/workspace \
+     -e CI_BUILD_UID= -e CI_BUILD_USER= \
+     -e CI_BUILD_GID= -e CI_BUILD_GROUP= \
+     xgb-ci.cpu \
+     bash ops/script/build_via_cmake.sh
+
+   docker run --rm --pid=host --gpus all \
+     -w /workspace -v /path/to/xgboost:/workspace \
+     -e CI_BUILD_UID= -e CI_BUILD_USER= \
+     -e CI_BUILD_GID= -e CI_BUILD_GROUP= \
+     xgb-ci.gpu \
+     bash ops/pipeline/test-python-wheel-impl.sh gpu
+
+Optionally, you can specify ``--run-args`` to pass extra arguments to ``docker run``:
+
+.. code-block:: bash
+
+   # Allocate extra space in /dev/shm to enable NCCL
+   # Also run the container with elevated privileges
+   python3 ops/docker_run.py --container-id xgb-ci.gpu --use-gpus \
+     --run-args='--shm-size=4g --privileged' \
+     -- bash ops/pipeline/test-python-wheel-impl.sh gpu
+
+which is translated to
+
+.. code-block:: bash
+
+   docker run --rm --pid=host --gpus all \
+     -w /workspace -v /path/to/xgboost:/workspace \
+     -e CI_BUILD_UID= -e CI_BUILD_USER= \
+     -e CI_BUILD_GID= -e CI_BUILD_GROUP= \
+     --shm-size=4g --privileged \
+     xgb-ci.gpu \
+     bash ops/pipeline/test-python-wheel-impl.sh gpu
+
+********************************************************************
+The Lay of the Land: how CI pipelines are organized in the code base
+********************************************************************
+The XGBoost project stores the configuration for its CI pipelines as part of the codebase.
+The git repository therefore stores not only the change history for its source code but also
+the change history for the CI pipelines.
+
+The CI pipelines are organized into the following directories and files:
+
+* ``.github/workflows/``: Definition of CI pipelines, using the GitHub Actions syntax
+* ``.github/runs-on.yml``: Configuration for the RunsOn service. Specifies the spec for
+  the self-hosted CI runners.
+* ``ops/conda_env/``: Definitions for Conda environments
+* ``ops/packer/``: Packer scripts to build machine images for Amazon EC2
+* ``ops/patch/``: Patch files
+* ``ops/pipeline/``: Shell scripts defining CI/CD pipelines. Most of these scripts can be run
+  locally (to assist with development and debugging); a few must run in the CI.
+* ``ops/script/``: Various utility scripts useful for testing
+* ``ops/docker/dockerfile/``: Dockerfiles to define containers
+* ``ops/docker/ci_container.yml``: Defines the mapping between Dockerfiles and containers.
+  Also specifies the build arguments to be used with each container. See
+  :ref:`build_run_docker_locally` to learn how this YAML file is used in the context of
+  a container build.
+* ``ops/docker_build.*``: Wrapper scripts to build and test CI containers. See
+  :ref:`build_run_docker_locally` for the detailed description.
+
+To inspect a given CI pipeline, open files in the following order:
+
+.. plot::
+   :nofigs:
+
+   from graphviz import Source
+   source = r"""
+     digraph ci_graph {
+       graph [fontname = "helvetica"];
+       node [fontname = "helvetica"];
+       edge [fontname = "helvetica"];
+       0 [label=<.github/workflows/*.yml>, shape=box];
+       1 [label=, shape=box];
+       2 [label=, shape=box];
+       3 [label=, shape=box];
+       0 -> 1 [xlabel="Calls"];
+       1 -> 2 [xlabel="Calls,\nvia docker_run.py"];
+       2 -> 3 [xlabel="Calls"];
+       1 -> 3 [xlabel="Calls"];
+     }
+   """
+   Source(source, format='png').render('../_static/ci_graph', view=False)
+   Source(source, format='svg').render('../_static/ci_graph', view=False)
+
+.. figure:: ../_static/ci_graph.svg
+   :align: center
+   :figwidth: 80 %
+
+===================================
+Primitives used in the CI pipelines
+===================================
+
+------------------------
+Build and run containers
+------------------------
+
+See :ref:`build_run_docker_locally`.
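+
+For local debugging, the two wrappers are typically used together. Here is a minimal
+sketch (assuming the ``xgb-ci.cpu`` container definition in ``ops/docker/ci_container.yml``
+and a checkout of the ``master`` branch) that builds the CPU test container and then
+runs the CMake build inside it:
+
+.. code-block:: bash
+
+   # Build the container image for the CPU test environment
+   export CONTAINER_ID="xgb-ci.cpu"
+   export BRANCH_NAME="master"
+   bash ops/docker_build.sh
+
+   # Run the CMake build inside the container we just built
+   python3 ops/docker_run.py --container-id xgb-ci.cpu \
+     -- bash ops/script/build_via_cmake.sh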
+ +------------------------------------------ +Stash artifacts, to move them between jobs +------------------------------------------ + +This primitive is useful when one pipeline job needs to consume the output +from another job. +We use `Amazon S3 `_ to store the stashed files. + +**To stash a file**: + +.. code-block:: bash + + export COMMAND="upload" + export KEY="unique key to identify a group of files" + bash ops/pipeline/stash-artifacts.sh path/to/file + +You can upload multiple files, possibly with wildcard globbing: + +.. code-block:: bash + + export COMMAND="upload" + export KEY="build-cuda" + bash ops/pipeline/stash-artifacts.sh \ + build/testxgboost python-package/dist/*.whl + +**To unstash a file**: + +.. code-block:: bash + + export COMMAND="download" + export KEY="unique key to identify a group of files" + bash ops/pipeline/stash-artifacts.sh path/to/file -To pass multiple extra arguments: +You can also use the wildcard globbing. The script will search for files in +the S3 bucket whose path matches the pattern. .. code-block:: bash - export CI_DOCKER_EXTRA_PARAMS_INIT='-e VAR1=VAL1 -e VAR2=VAL2 -e VAR3=VAL3' - -******************************************** -Update pipeline definitions for BuildKite CI -******************************************** - -`BuildKite `_ is a SaaS (Software as a Service) platform that orchestrates -cloud machines to host CI pipelines. The BuildKite platform allows us to define CI pipelines as a -declarative YAML file. - -The pipeline definitions are found in ``tests/buildkite/``: - -* ``tests/buildkite/pipeline-win64.yml``: This pipeline builds and tests XGBoost for the Windows platform. -* ``tests/buildkite/pipeline-mgpu.yml``: This pipeline builds and tests XGBoost with access to multiple - NVIDIA GPUs. -* ``tests/buildkite/pipeline.yml``: This pipeline builds and tests XGBoost with access to a single - NVIDIA GPU. Most tests are located here. - -**************************************** -Managing Elastic CI Stack with BuildKite -**************************************** - -BuildKite allows us to define cloud resources in -a declarative fashion. Every configuration step is now documented explicitly as code. - -**Prerequisite**: You should have some knowledge of `CloudFormation `_. -CloudFormation lets us define a stack of cloud resources (EC2 machines, Lambda functions, S3 etc) using -a single YAML file. - -**Prerequisite**: Gain access to the XGBoost project's AWS account (``admin@xgboost-ci.net``), and then -set up a credential pair in order to provision resources on AWS. See -`Creating an IAM user in your AWS account `_. - -* Option 1. Give full admin privileges to your IAM user. This is the simplest option. -* Option 2. Give limited set of permissions to your IAM user, to reduce the possibility of messing up other resources. - For this, use the script ``tests/buildkite/infrastructure/service-user/create_service_user.py``. - -===================== -Worker Image Pipeline -===================== -Building images for worker machines used to be a chore: you'd provision an EC2 machine, SSH into it, and -manually install the necessary packages. This process is not only laborious but also error-prone. You may -forget to install a package or change a system configuration. - -No more. Now we have an automated pipeline for building images for worker machines. 
- -* Run ``tests/buildkite/infrastructure/worker-image-pipeline/create_worker_image_pipelines.py`` in order to provision - CloudFormation stacks named ``buildkite-linux-amd64-gpu-worker`` and ``buildkite-windows-gpu-worker``. They are - pipelines that create AMIs (Amazon Machine Images) for Linux and Windows workers, respectively. -* Navigate to the CloudFormation web console to verify that the image builder pipelines have been provisioned. It may - take some time. -* Once they pipelines have been fully provisioned, run the script - ``tests/buildkite/infrastructure/worker-image-pipeline/run_pipelines.py`` to execute the pipelines. New AMIs will be - uploaded to the EC2 service. You can locate them in the EC2 console. -* Make sure to modify ``tests/buildkite/infrastructure/aws-stack-creator/metadata.py`` to use the correct AMI IDs. - (For ``linux-amd64-cpu`` and ``linux-arm64-cpu``, use the AMIs provided by BuildKite. Consult the ``AWSRegion2AMI`` - section of https://s3.amazonaws.com/buildkite-aws-stack/latest/aws-stack.yml.) - -====================== -EC2 Autoscaling Groups -====================== -In EC2, you can create auto-scaling groups, where you can dynamically adjust the number of worker instances according to -workload. When a pull request is submitted, the following steps take place: - -1. GitHub sends a signal to the registered webhook, which connects to the BuildKite server. -2. BuildKite sends a signal to a `Lambda `_ function named ``Autoscaling``. -3. The Lambda function sends a signal to the auto-scaling group. The group scales up and adds additional worker instances. -4. New worker instances run the test jobs. Test results are reported back to BuildKite. -5. When the test jobs complete, BuildKite sends a signal to ``Autoscaling``, which in turn requests the autoscaling group - to scale down. Idle worker instances are shut down. - -To set up the auto-scaling group, run the script ``tests/buildkite/infrastructure/aws-stack-creator/create_stack.py``. -Check the CloudFormation web console to verify successful provision of auto-scaling groups. + export COMMAND="download" + export KEY="build-cuda" + # Download all files whose path matches pattern + # python-package/dist/*.whl + bash ops/pipeline/stash-artifacts.sh \ + python-package/dist/*.whl diff --git a/doc/contrib/coding_guide.rst b/doc/contrib/coding_guide.rst index bf18ad08cf53..60b3c4a13bd2 100644 --- a/doc/contrib/coding_guide.rst +++ b/doc/contrib/coding_guide.rst @@ -107,7 +107,7 @@ C++ interface of the R package, please make corresponding changes in ``src/init. Generating the Package and Running Tests ======================================== -The source layout of XGBoost is a bit unusual to normal R packages as XGBoost is primarily written in C++ with multiple language bindings in mind. As a result, some special cares need to be taken to generate a standard R tarball. Most of the tests are being run on CI, and as a result, the best way to see how things work is by looking at the CI configuration files (GitHub action, at the time of writing). There are helper scripts in ``tests/ci_build`` and ``R-package/tests/helper_scripts`` for running various checks including linter and making the standard tarball. +The source layout of XGBoost is a bit unusual to normal R packages as XGBoost is primarily written in C++ with multiple language bindings in mind. As a result, some special cares need to be taken to generate a standard R tarball. 
Most of the tests are being run on CI, and as a result, the best way to see how things work is by looking at the CI configuration files (GitHub action, at the time of writing). There are helper scripts in ``ops/script`` and ``R-package/tests/helper_scripts`` for running various checks including linter and making the standard tarball. ********************************* Running Formatting Checks Locally @@ -127,7 +127,7 @@ To run checks for Python locally, install the checkers mentioned previously and .. code-block:: bash cd /path/to/xgboost/ - python ./tests/ci_build/lint_python.py --fix + python ./ops/script/lint_python.py --fix To run checks for R: @@ -135,21 +135,21 @@ To run checks for R: cd /path/to/xgboost/ R CMD INSTALL R-package/ - Rscript tests/ci_build/lint_r.R $(pwd) + Rscript ops/script/lint_r.R $(pwd) To run checks for cpplint locally: .. code-block:: bash cd /path/to/xgboost/ - python ./tests/ci_build/lint_cpp.py + python ./ops/script/lint_cpp.py See next section for clang-tidy. For CMake scripts: .. code-block:: bash - bash ./tests/ci_build/lint_cmake.sh + bash ./ops/script/lint_cmake.sh Lastly, the linter for jvm-packages is integrated into the maven build process. @@ -163,21 +163,21 @@ To run this check locally, run the following command from the top level source t .. code-block:: bash cd /path/to/xgboost/ - python3 tests/ci_build/tidy.py + python3 ops/script/run_clang_tidy.py Also, the script accepts two optional integer arguments, namely ``--cpp`` and ``--cuda``. By default they are both set to 1, meaning that both C++ and CUDA code will be checked. If the CUDA toolkit is not installed on your machine, you'll encounter an error. To exclude CUDA source from linting, use: .. code-block:: bash cd /path/to/xgboost/ - python3 tests/ci_build/tidy.py --cuda=0 + python3 ops/script/run_clang_tidy.py --cuda=0 Similarly, if you want to exclude C++ source from linting: .. code-block:: bash cd /path/to/xgboost/ - python3 tests/ci_build/tidy.py --cpp=0 + python3 ops/script/run_clang_tidy.py --cpp=0 ********************************** Guide for handling user input data diff --git a/doc/contrib/donate.rst b/doc/contrib/donate.rst index b6171c412c74..ba7c75a942f9 100644 --- a/doc/contrib/donate.rst +++ b/doc/contrib/donate.rst @@ -13,9 +13,9 @@ DMLC/XGBoost has grown from a research project incubated in academia to one of t A robust and efficient **continuous integration (CI)** infrastructure is one of the most critical solutions to address the above challenge. A CI service will monitor an open-source repository and run a suite of integration tests for every incoming contribution. This way, the CI ensures that every proposed change in the codebase is compatible with existing functionalities. Furthermore, XGBoost can enable more thorough tests with a powerful CI infrastructure to cover cases which are closer to the production environment. -There are several CI services available free to open source projects, such as Travis CI and AppVeyor. The XGBoost project already utilizes GitHub Actions. However, the XGBoost project has needs that these free services do not adequately address. In particular, the limited usage quota of resources such as CPU and memory leaves XGBoost developers unable to bring "too-intensive" tests. In addition, they do not offer test machines with GPUs for testing XGBoost-GPU code base which has been attracting more and more interest across many organizations. Consequently, the XGBoost project uses a cloud-hosted test farm. We use `BuildKite `_ to organize CI pipelines. 
+There are several CI services available free to open source projects, such as Travis CI and AppVeyor. The XGBoost project already utilizes GitHub Actions. However, the XGBoost project has needs that these free services do not adequately address. In particular, the limited usage quota of resources such as CPU and memory leaves XGBoost developers unable to run "too-intensive" tests. In addition, they do not offer test machines with GPUs for testing XGBoost-GPU code base which has been attracting more and more interest across many organizations. Consequently, the XGBoost project uses a cloud-hosted test farm. We use `Amazon Web Services (AWS) `_ to host the test machines, along with `GitHub Actions `_ and `RunsOn `_ (SaaS app) to organize the CI pipelines.

-The cloud-hosted test farm has recurring operating expenses. It utilizes a leading cloud provider (AWS) to accommodate variable workload. BuildKite launches worker machines on AWS on demand, to run the test suite on incoming contributions. To save cost, the worker machines are terminated when they are no longer needed.
+The cloud-hosted test farm has recurring operating expenses. RunsOn launches worker machines on AWS on demand to run the test suite on incoming contributions. To save cost, the worker machines are terminated when they are no longer needed.

 To help defray the hosting cost, the XGBoost project seeks donations from third parties.
@@ -29,9 +29,9 @@ The Project Management Committee (PMC) of the XGBoost project appointed `Open So

 All expenses incurred for hosting CI will be submitted to the fiscal host with receipts. Only the expenses in the following categories will be approved for reimbursement:

-* Cloud expenses for the cloud test farm (https://buildkite.com/xgboost)
+* Cloud expenses for the cloud test farm
 * Cost of domain https://xgboost-ci.net
-* Monthly cost of using BuildKite
+* Annual subscription for RunsOn
 * Hosting cost of the User Forum (https://discuss.xgboost.ai)

 Administration of cloud CI infrastructure
diff --git a/doc/contrib/release.rst b/doc/contrib/release.rst
index c0370b14ed42..4548b1ffa9a2 100644
--- a/doc/contrib/release.rst
+++ b/doc/contrib/release.rst
@@ -17,7 +17,7 @@ Making a Release
 -----------------

 1. Create an issue for the release, noting the estimated date and expected features or major fixes, pin that issue.
-2. Create a release branch if this is a major release. Bump release version. There's a helper script ``tests/ci_build/change_version.py``.
+2. Create a release branch if this is a major release. Bump release version. There's a helper script ``ops/script/change_version.py``.
 3. Commit the change, create a PR on GitHub on release branch. Port the bumped version to default branch, optionally with the postfix ``SNAPSHOT``.
 4. Create a tag on release branch, either on GitHub or locally.
 5. Make a release on GitHub tag page, which might be done with previous step if the tag is created on GitHub.
diff --git a/doc/contrib/unit_tests.rst b/doc/contrib/unit_tests.rst
index aa58cd337020..857d7a067307 100644
--- a/doc/contrib/unit_tests.rst
+++ b/doc/contrib/unit_tests.rst
@@ -63,7 +63,7 @@ Run

 .. code-block:: bash

-    python ./tests/ci_build/test_r_package.py --task=check
+    python ./ops/script/test_r_package.py --task=check

 at the root of the project directory. The command builds and checks the XGBoost r-package.
Alternatively, if you want to just run the tests, you can use the following diff --git a/ops/pipeline/test-python-wheel.sh b/ops/pipeline/test-python-wheel.sh index 113695b4a820..8d41cd930891 100755 --- a/ops/pipeline/test-python-wheel.sh +++ b/ops/pipeline/test-python-wheel.sh @@ -19,6 +19,7 @@ else gpu_option="" fi +set -x python3 ops/docker_run.py --container-id "${container_id}" ${gpu_option} \ --run-args='--shm-size=4g' \ -- bash ops/pipeline/test-python-wheel-impl.sh "${suite}" From d0e209cf8d956d02c2552151b38f659f99f2899c Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Thu, 21 Nov 2024 21:11:35 -0800 Subject: [PATCH 62/86] Minor formatting fixes to doc --- doc/contrib/ci.rst | 35 ++++++++++++++++++++--------------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/doc/contrib/ci.rst b/doc/contrib/ci.rst index 2e0725fddbff..84f00c22b1f1 100644 --- a/doc/contrib/ci.rst +++ b/doc/contrib/ci.rst @@ -45,7 +45,8 @@ You can reproduce the same testing environment as the CI pipelines by running Do Prerequisites ============= 1. Install Docker: https://docs.docker.com/engine/install/ubuntu/ -2. Install NVIDIA Docker runtime: https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html#installing-on-ubuntu-and-debian +2. Install NVIDIA Docker runtime: + https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html. The runtime lets you access NVIDIA GPUs inside a Docker container. .. _build_run_docker_locally: @@ -73,11 +74,11 @@ the script will use the corresponding entry from ``ci_container.yml``: .. code-block:: yaml xgb-ci.gpu: - container_def: gpu - build_args: - CUDA_VERSION_ARG: "12.4.1" - NCCL_VERSION_ARG: "2.23.4-1" - RAPIDS_VERSION_ARG: "24.10" + container_def: gpu + build_args: + CUDA_VERSION_ARG: "12.4.1" + NCCL_VERSION_ARG: "2.23.4-1" + RAPIDS_VERSION_ARG: "24.10" The ``container_def`` entry indicates where the Dockerfile is located. The container definition will be fetched from ``ops/docker/dockerfile/Dockerfile.CONTAINER_DEF`` where @@ -171,7 +172,7 @@ Optionally, you can specify ``--run-args`` to pass extra arguments to ``docker r --run-args='--shm-size=4g --privileged' \ -- bash ops/pipeline/test-python-wheel-impl.sh gpu -which is translated to +which translates to .. code-block:: bash @@ -183,13 +184,17 @@ which is translated to xgb-ci.gpu \ bash ops/pipeline/test-python-wheel-impl.sh gpu -******************************************************************** -The Lay of the Land: how CI pipelines are organized in the code base -******************************************************************** +******************************************************************* +The Lay of the Land: how CI pipelines are organized in the codebase +******************************************************************* The XGBoost project stores the configuration for its CI pipelines as part of the codebase. The git repository therefore stores not only the change history for its source code but also the change history for the CI pipelines. +================= +File Organization +================= + The CI pipelines are organized into the following directories and files: * ``.github/workflows/``: Definition of CI pipelines, using the GitHub Actions syntax @@ -209,7 +214,7 @@ The CI pipelines are organized into the following directories and files: * ``ops/docker_build.*``: Wrapper scripts to build and test CI containers. See :ref:`build_run_docker_locally` for the detailed description. 
-To inspect a given CI pipeline, open files in the following order: +To inspect a given CI pipeline, inspect files in the following order: .. plot:: :nofigs: @@ -217,9 +222,9 @@ To inspect a given CI pipeline, open files in the following order: from graphviz import Source source = r""" digraph ci_graph { - graph [fontname = "helvetica"]; - node [fontname = "helvetica"]; - edge [fontname = "helvetica"]; + graph [fontname = "monospace"]; + node [fontname = "monospace"]; + edge [fontname = "monospace"]; 0 [label=<.github/workflows/*.yml>, shape=box]; 1 [label=, shape=box]; 2 [label=, shape=box]; @@ -288,6 +293,6 @@ the S3 bucket whose path matches the pattern. export COMMAND="download" export KEY="build-cuda" # Download all files whose path matches pattern - # python-package/dist/*.whl + # python-package/dist/*.whl bash ops/pipeline/stash-artifacts.sh \ python-package/dist/*.whl From cc6dff70de0579f07f849e716529af71f672c3c7 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Fri, 22 Nov 2024 00:17:36 -0800 Subject: [PATCH 63/86] Remove unneed env vars --- .github/workflows/windows.yml | 3 --- jvm-packages/xgboost4j-example/model/dump.raw.txt | 14 -------------- 2 files changed, 17 deletions(-) delete mode 100644 jvm-packages/xgboost4j-example/model/dump.raw.txt diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml index 0c4d027efec5..2793350655d2 100644 --- a/.github/workflows/windows.yml +++ b/.github/workflows/windows.yml @@ -16,9 +16,6 @@ defaults: env: BRANCH_NAME: >- ${{ github.event.pull_request.number && 'PR-' }}${{ github.event.pull_request.number || github.ref_name }} - ARTIFACT_STASH_PREFIX: cache/${{ github.repository }}/stash/${{ github.run_id }} - # TODO(hcho3): Remove - RUNS_ON_S3_BUCKET_CACHE: runs-on-s3bucketcache-dv5n3gmnaius jobs: build-win64-gpu: diff --git a/jvm-packages/xgboost4j-example/model/dump.raw.txt b/jvm-packages/xgboost4j-example/model/dump.raw.txt deleted file mode 100644 index 91c6417f383f..000000000000 --- a/jvm-packages/xgboost4j-example/model/dump.raw.txt +++ /dev/null @@ -1,14 +0,0 @@ -booster[0]: -0:[odor=none] yes=1,no=2 -1:[spore-print-color=green] yes=3,no=4 - 3:leaf=1.92847228 - 4:leaf=-1.87165487 -2:[stalk-root=club] yes=5,no=6 - 5:leaf=-1.63159156 - 6:leaf=1.78592122 -booster[1]: -0:[stalk-root=rooted] yes=1,no=2 - 1:leaf=-6.24725294 -2:[odor=none] yes=3,no=4 - 3:leaf=-0.967758596 - 4:leaf=0.784398556 From 0dac1700d248a6cfb0ecb65b1e29cf3cf9df300e Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Fri, 22 Nov 2024 00:24:25 -0800 Subject: [PATCH 64/86] Add example input and output for the jq script --- ops/docker/extract_build_args.jq | 4 ++++ ops/docker/extract_build_args.sh | 5 +++++ 2 files changed, 9 insertions(+) diff --git a/ops/docker/extract_build_args.jq b/ops/docker/extract_build_args.jq index 682b62cb63cb..b35240edb626 100644 --- a/ops/docker/extract_build_args.jq +++ b/ops/docker/extract_build_args.jq @@ -1,3 +1,7 @@ +## Example input: +## xgb-ci.gpu_build_r_rockylinux8 +## Example output: +## --build-arg CUDA_VERSION_ARG=12.4.1 --build-arg R_VERSION_ARG=4.3.2 def compute_build_args($input; $container_id): $input | .[$container_id] | diff --git a/ops/docker/extract_build_args.sh b/ops/docker/extract_build_args.sh index 0fa7b132b760..42a83047742c 100755 --- a/ops/docker/extract_build_args.sh +++ b/ops/docker/extract_build_args.sh @@ -1,6 +1,11 @@ #!/bin/bash ## Extract container definition and build args from ops/docker/ci_container.yml, ## given the container ID. 
+## +## Example input: +## xgb-ci.clang_tidy +## Example output: +## CONTAINER_DEF='clang_tidy' BUILD_ARGS='--build-arg CUDA_VERSION_ARG=12.4.1' if [ "$#" -ne 1 ]; then echo "Usage: $0 [container_id]" From 6ba85328c43c80cdeba7af11b313257672eccb42 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Sun, 24 Nov 2024 23:51:31 -0800 Subject: [PATCH 65/86] Temporarily build all archs for JVM --- ops/pipeline/build-jvm-gpu.sh | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/ops/pipeline/build-jvm-gpu.sh b/ops/pipeline/build-jvm-gpu.sh index 7656a3d2f188..6c657df3a11c 100755 --- a/ops/pipeline/build-jvm-gpu.sh +++ b/ops/pipeline/build-jvm-gpu.sh @@ -7,12 +7,13 @@ source ops/pipeline/classify-git-branch.sh echo "--- Build libxgboost4j.so with CUDA" -if [[ ($is_pull_request == 1) || ($is_release_branch == 0) ]] -then - arch_flag="-DGPU_COMPUTE_VER=75" -else - arch_flag="" -fi +# if [[ ($is_pull_request == 1) || ($is_release_branch == 0) ]] +# then +# arch_flag="-DGPU_COMPUTE_VER=75" +# else +# arch_flag="" +# fi +arch_flag="" COMMAND=$( cat <<-EOF From 9ab82e5c2e54a5cfc4c7f8a160cc39b06d2c98d9 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Sun, 24 Nov 2024 23:59:04 -0800 Subject: [PATCH 66/86] Fix hanging jobs --- .github/workflows/jvm_tests.yml | 4 ++++ .github/workflows/lint.yml | 2 ++ .github/workflows/main.yml | 5 +++++ .github/workflows/windows.yml | 2 ++ 4 files changed, 13 insertions(+) diff --git a/.github/workflows/jvm_tests.yml b/.github/workflows/jvm_tests.yml index f5f369015bc8..dd196e73c2e0 100644 --- a/.github/workflows/jvm_tests.yml +++ b/.github/workflows/jvm_tests.yml @@ -86,6 +86,7 @@ jobs: runs-on: - runs-on=${{ github.run_id }} - runner=linux-amd64-cpu + - tag=jvm-tests-build-jvm-gpu steps: # Restart Docker daemon so that it recognizes the ephemeral disks - run: sudo systemctl restart docker @@ -138,6 +139,7 @@ jobs: runs-on: - runs-on=${{ github.run_id }} - runner=linux-amd64-cpu + - tag=jvm-tests-build-jvm-docs steps: # Restart Docker daemon so that it recognizes the ephemeral disks - run: sudo systemctl restart docker @@ -165,6 +167,7 @@ jobs: runs-on: - runs-on=${{ github.run_id }} - runner=linux-amd64-cpu + - tag=jvm-tests-build-test-jvm-packages steps: # Restart Docker daemon so that it recognizes the ephemeral disks - run: sudo systemctl restart docker @@ -248,6 +251,7 @@ jobs: runs-on: - runs-on=${{ github.run_id }} - runner=linux-amd64-mgpu + - tag=jvm-tests-test-jvm-packages-gpu steps: # Restart Docker daemon so that it recognizes the ephemeral disks - run: sudo systemctl restart docker diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index fcc804776fff..2e2a4ea209c8 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -21,6 +21,7 @@ jobs: runs-on: - runs-on=${{ github.run_id }} - runner=linux-amd64-cpu + - tag=lint-build-containers steps: # Restart Docker daemon so that it recognizes the ephemeral disks - run: sudo systemctl restart docker @@ -36,6 +37,7 @@ jobs: runs-on: - runs-on=${{ github.run_id }} - runner=linux-amd64-cpu + - tag=lint-clang-tidy steps: # Restart Docker daemon so that it recognizes the ephemeral disks - run: sudo systemctl restart docker diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 156c813a3446..95dafe832970 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -56,6 +56,7 @@ jobs: runs-on: - runs-on=${{ github.run_id }} - runner=linux-amd64-cpu + - tag=main-build-cpu steps: # Restart Docker daemon so that it recognizes the 
ephemeral disks - run: sudo systemctl restart docker @@ -79,6 +80,7 @@ jobs: runs-on: - runs-on=${{ github.run_id }} - runner=linux-arm64-cpu + - tag=build-cpu-arm64 steps: # Restart Docker daemon so that it recognizes the ephemeral disks - run: sudo systemctl restart docker @@ -106,6 +108,7 @@ jobs: runs-on: - runs-on=${{ github.run_id }} - runner=linux-amd64-cpu + - tag=main-build-cuda steps: # Restart Docker daemon so that it recognizes the ephemeral disks - run: sudo systemctl restart docker @@ -142,6 +145,7 @@ jobs: runs-on: - runs-on=${{ github.run_id }} - runner=linux-amd64-cpu + - tag=main-build-cuda-with-rmm steps: # Restart Docker daemon so that it recognizes the ephemeral disks - run: sudo systemctl restart docker @@ -208,6 +212,7 @@ jobs: runs-on: - runs-on=${{ github.run_id }} - runner=linux-amd64-cpu + - tag=main-build-gpu-rpkg steps: # Restart Docker daemon so that it recognizes the ephemeral disks - run: sudo systemctl restart docker diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml index 2793350655d2..2587821e39db 100644 --- a/.github/workflows/windows.yml +++ b/.github/workflows/windows.yml @@ -23,6 +23,7 @@ jobs: runs-on: - runs-on=${{ github.run_id }} - runner=windows-cpu + - tag=windows-build-win64-gpu steps: - uses: actions/checkout@v4 with: @@ -43,6 +44,7 @@ jobs: runs-on: - runs-on=${{ github.run_id }} - runner=windows-gpu + - tag=windows-test-win64-gpu steps: - uses: actions/checkout@v4 with: From 97f3965f45f6564eeb16fdd7a25244e6f7c422e1 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Mon, 25 Nov 2024 01:43:53 -0800 Subject: [PATCH 67/86] Update stash-artifacts.sh script --- .github/workflows/jvm_tests.yml | 35 +++++++++++++------------------- .github/workflows/main.yml | 35 ++++++++++---------------------- .github/workflows/windows.yml | 10 ++------- doc/contrib/ci.rst | 35 +++++++++++++++++--------------- ops/pipeline/stash-artifacts.ps1 | 29 +++++++++++++------------- ops/pipeline/stash-artifacts.py | 12 +++++------ ops/pipeline/stash-artifacts.sh | 28 +++++++++++-------------- 7 files changed, 79 insertions(+), 105 deletions(-) diff --git a/.github/workflows/jvm_tests.yml b/.github/workflows/jvm_tests.yml index dd196e73c2e0..0d5138f095aa 100644 --- a/.github/workflows/jvm_tests.yml +++ b/.github/workflows/jvm_tests.yml @@ -99,10 +99,8 @@ jobs: CONTAINER_ID: xgb-ci.jvm_gpu_build - run: bash ops/pipeline/build-jvm-gpu.sh - name: Stash files - run: bash ops/pipeline/stash-artifacts.sh lib/libxgboost4j.so - env: - COMMAND: upload - KEY: build-jvm-gpu + run: | + bash ops/pipeline/stash-artifacts.sh stash build-jvm-gpu lib/libxgboost4j.so build-jvm-mac: name: "Build libxgboost4j.dylib for ${{ matrix.description }}" @@ -151,14 +149,13 @@ jobs: env: CONTAINER_ID: xgb-ci.jvm_gpu_build - name: Unstash files - run: bash ops/pipeline/stash-artifacts.sh lib/libxgboost4j.so - env: - COMMAND: download - KEY: build-jvm-gpu + run: | + bash ops/pipeline/stash-artifacts.sh unstash build-jvm-gpu lib/libxgboost4j.so - run: bash ops/pipeline/build-jvm-doc.sh - name: Upload JVM doc run: | - bash ops/pipeline/publish-artifact.sh jvm-packages/${{ env.BRANCH_NAME }}.tar.bz2 \ + bash ops/pipeline/publish-artifact.sh \ + jvm-packages/${{ env.BRANCH_NAME }}.tar.bz2 \ s3://xgboost-docs/ build-test-jvm-packages: @@ -187,10 +184,9 @@ jobs: env: SCALA_VERSION: 2.13 - name: Stash files - run: bash ops/pipeline/stash-artifacts.sh lib/libxgboost4j.so - env: - COMMAND: upload - KEY: build-test-jvm-packages + run: | + bash ops/pipeline/stash-artifacts.sh stash \ + 
build-test-jvm-packages lib/libxgboost4j.so build-test-jvm-packages-other-os: name: Build and test JVM packages (${{ matrix.os }}) @@ -263,10 +259,8 @@ jobs: env: CONTAINER_ID: xgb-ci.jvm_gpu_build - name: Unstash files - run: bash ops/pipeline/stash-artifacts.sh lib/libxgboost4j.so - env: - COMMAND: download - KEY: build-jvm-gpu + run: | + bash ops/pipeline/stash-artifacts.sh unstash build-jvm-gpu lib/libxgboost4j.so - run: bash ops/pipeline/test-jvm-gpu.sh deploy-jvm-packages: @@ -299,11 +293,10 @@ jobs: CONTAINER_ID: ${{ matrix.container_id }} - name: Unstash files run: | - bash ops/pipeline/stash-artifacts.sh lib/libxgboost4j.so + bash ops/pipeline/stash-artifacts.sh \ + unstash ${{ matrix.artifact_from }} \ + lib/libxgboost4j.so ls -lh lib/libxgboost4j.so - env: - COMMAND: download - KEY: ${{ matrix.artifact_from }} - name: Deploy JVM packages to S3 run: | bash ops/pipeline/deploy-jvm-packages.sh ${{ matrix.variant }} \ diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 95dafe832970..e2a7d7475912 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -69,10 +69,7 @@ jobs: CONTAINER_ID: xgb-ci.cpu - run: bash ops/pipeline/build-cpu.sh - name: Stash CLI executable - run: bash ops/pipeline/stash-artifacts.sh ./xgboost - env: - COMMAND: upload - KEY: build-cpu + run: bash ops/pipeline/stash-artifacts.sh stash build-cpu ./xgboost build-cpu-arm64: name: Build CPU ARM64 + manylinux_2_28_aarch64 wheel @@ -93,10 +90,9 @@ jobs: CONTAINER_ID: xgb-ci.aarch64 - run: bash ops/pipeline/build-cpu-arm64.sh - name: Stash files - run: bash ops/pipeline/stash-artifacts.sh ./xgboost python-package/dist/*.whl - env: - COMMAND: upload - KEY: build-cpu-arm64 + run: | + bash ops/pipeline/stash-artifacts.sh stash build-cpu-arm64 \ + ./xgboost python-package/dist/*.whl - name: Upload Python wheel run: | bash ops/pipeline/publish-artifact.sh python-package/dist/*.whl \ @@ -126,11 +122,8 @@ jobs: - run: bash ops/pipeline/build-cuda.sh - name: Stash files run: | - bash ops/pipeline/stash-artifacts.sh \ + bash ops/pipeline/stash-artifacts.sh stash build-cuda \ build/testxgboost ./xgboost python-package/dist/*.whl - env: - COMMAND: upload - KEY: build-cuda - name: Upload Python wheel run: | for file in python-package/dist/*.whl python-package/dist/meta.json @@ -162,10 +155,8 @@ jobs: CONTAINER_ID: xgb-ci.manylinux_2_28_x86_64 - run: bash ops/pipeline/build-cuda-with-rmm.sh - name: Stash files - run: bash ops/pipeline/stash-artifacts.sh build/testxgboost - env: - COMMAND: upload - KEY: build-cuda-with-rmm + run: bash ops/pipeline/stash-artifacts.sh \ + stash build-cuda-with-rmm build/testxgboost - name: Upload Python wheel run: | bash ops/pipeline/publish-artifact.sh python-package/dist/*.whl \ @@ -266,11 +257,9 @@ jobs: CONTAINER_ID: xgb-ci.gpu - name: Unstash gtest run: | - bash ops/pipeline/stash-artifacts.sh build/testxgboost + bash ops/pipeline/stash-artifacts.sh unstash ${{ matrix.artifact_from }} \ + build/testxgboost chmod +x build/testxgboost - env: - COMMAND: download - KEY: ${{ matrix.artifact_from }} - run: bash ops/pipeline/test-cpp-gpu.sh ${{ matrix.suite }} test-python-wheel: @@ -328,10 +317,8 @@ jobs: CONTAINER_ID: ${{ matrix.container }} - name: Unstash Python wheel run: | - bash ops/pipeline/stash-artifacts.sh python-package/dist/*.whl ./xgboost + bash ops/pipeline/stash-artifacts.sh unstash ${{ matrix.artifact_from }} \ + python-package/dist/*.whl ./xgboost chmod +x ./xgboost - env: - COMMAND: download - KEY: ${{ matrix.artifact_from }} - name: Run 
Python tests, ${{ matrix.description }} run: bash ops/pipeline/test-python-wheel.sh ${{ matrix.suite }} ${{ matrix.container }} diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml index 2587821e39db..415a33ffbd3f 100644 --- a/.github/workflows/windows.yml +++ b/.github/workflows/windows.yml @@ -31,12 +31,9 @@ jobs: - run: powershell ops/pipeline/build-win64-gpu.ps1 - name: Stash files run: | - powershell ops/pipeline/stash-artifacts.ps1 ` + powershell ops/pipeline/stash-artifacts.ps1 stash build-win64-gpu ` build/testxgboost.exe xgboost.exe ` (Get-ChildItem python-package/dist/*.whl | Select-Object -Expand FullName) - env: - COMMAND: upload - KEY: build-win64-gpu test-win64-gpu: name: Test XGBoost on Windows @@ -51,9 +48,6 @@ jobs: submodules: "true" - name: Unstash files run: | - powershell ops/pipeline/stash-artifacts.ps1 ` + powershell ops/pipeline/stash-artifacts.ps1 unstash build-wind64-gpu ` build/testxgboost.exe xgboost.exe python-package/dist/*.whl - env: - COMMAND: download - KEY: build-win64-gpu - run: powershell ops/pipeline/test-win64-gpu.ps1 diff --git a/doc/contrib/ci.rst b/doc/contrib/ci.rst index 84f00c22b1f1..5040cf6d04bb 100644 --- a/doc/contrib/ci.rst +++ b/doc/contrib/ci.rst @@ -264,35 +264,38 @@ We use `Amazon S3 `_ to store the stashed files. .. code-block:: bash - export COMMAND="upload" - export KEY="unique key to identify a group of files" - bash ops/pipeline/stash-artifacts.sh path/to/file + REMOTE_PREFIX="remote directory to place the artifact(s)" + bash ops/pipeline/stash-artifacts.sh stash "${REMOTE_PREFIX}" path/to/file + +The ``REMOTE_PREFIX`` argument, which is the second command-line argument +for ``stash-artifacts.sh``, specifies the remote directory in which the artifact(s) +should be placed. More precisely, the artifact(s) will be placed in +``s3://{RUNS_ON_S3_BUCKET_CACHE}/cache/{GITHUB_REPOSITORY}/stash/{GITHUB_RUN_ID}/{REMOTE_PREFIX}/`` +where ``RUNS_ON_S3_BUCKET_CACHE``, ``GITHUB_REPOSITORY``, and ``GITHUB_RUN_ID`` are set by +the CI. (RunsOn provisions an S3 bucket to stage cache, and its name is stored in the environment +variable ``RUNS_ON_S3_BUCKET_CACHE``.) You can upload multiple files, possibly with wildcard globbing: .. code-block:: bash - export COMMAND="upload" - export KEY="build-cuda" - bash ops/pipeline/stash-artifacts.sh \ + REMOTE_PREFIX="build-cuda" + bash ops/pipeline/stash-artifacts.sh stash "${REMOTE_PREFIX}" \ build/testxgboost python-package/dist/*.whl **To unstash a file**: .. code-block:: bash - export COMMAND="download" - export KEY="unique key to identify a group of files" - bash ops/pipeline/stash-artifacts.sh path/to/file + REMOTE_PREFIX="unique key to identify a group of files" + bash ops/pipeline/stash-artifacts.sh unstash "${REMOTE_PREFIX}" path/to/file -You can also use the wildcard globbing. The script will search for files in -the S3 bucket whose path matches the pattern. +You can also use the wildcard globbing. The script will download the matching artifacts +from the remote directory. .. 
code-block:: bash - export COMMAND="download" - export KEY="build-cuda" - # Download all files whose path matches pattern - # python-package/dist/*.whl - bash ops/pipeline/stash-artifacts.sh \ + REMOTE_PREFIX="build-cuda" + # Download all files whose path matches the wildcard pattern python-package/dist/*.whl + bash ops/pipeline/stash-artifacts.sh unstash "${REMOTE_PREFIX}" \ python-package/dist/*.whl diff --git a/ops/pipeline/stash-artifacts.ps1 b/ops/pipeline/stash-artifacts.ps1 index 202a6c4521ed..f2fc5c5c3cb2 100644 --- a/ops/pipeline/stash-artifacts.ps1 +++ b/ops/pipeline/stash-artifacts.ps1 @@ -2,7 +2,15 @@ Param( [Parameter( Mandatory=$true, - Position=0, + Position=0 + )][string]$command, + [Parameter( + Mandatory=$true, + Position=1 + )][string]$remote_prefix, + [Parameter( + Mandatory=$true, + Position=2, ValueFromRemainingArguments=$true )][string[]]$artifacts ) @@ -10,21 +18,14 @@ Param( ## Convenience wrapper for ops/pipeline/stash-artifacts.py ## Meant to be used inside GitHub Actions -$ENV_VAR_DOC = @' -Inputs - - COMMAND: Either "upload" or "download" - - KEY: Unique string to identify a group of artifacts -'@ - $ErrorActionPreference = "Stop" . ops/pipeline/enforce-ci.ps1 -foreach ($env in "COMMAND", "KEY", "GITHUB_REPOSITORY", "GITHUB_RUN_ID", - "RUNS_ON_S3_BUCKET_CACHE") { +foreach ($env in "GITHUB_REPOSITORY", "GITHUB_RUN_ID", "RUNS_ON_S3_BUCKET_CACHE") { $val = [Environment]::GetEnvironmentVariable($env) if ($val -eq $null) { - Write-Host "Error: $env must be set.`n${ENV_VAR_DOC}" + Write-Host "Error: $env must be set." exit 1 } } @@ -35,13 +36,13 @@ conda activate Write-Host @" python ops/pipeline/stash-artifacts.py ` - --command "${Env:COMMAND}" ` + --command "${command}" ` --s3-bucket "${Env:RUNS_ON_S3_BUCKET_CACHE}" ` - --prefix "${artifact_stash_prefix}/${Env:KEY}" ` + --prefix "${artifact_stash_prefix}/${remote_prefix}" ` -- $artifacts "@ python ops/pipeline/stash-artifacts.py ` - --command "${Env:COMMAND}" ` + --command "${command}" ` --s3-bucket "${Env:RUNS_ON_S3_BUCKET_CACHE}" ` - --prefix "${artifact_stash_prefix}/${Env:KEY}" ` + --prefix "${artifact_stash_prefix}/${remote_prefix}" ` -- $artifacts diff --git a/ops/pipeline/stash-artifacts.py b/ops/pipeline/stash-artifacts.py index 3c77eb4d2f31..151e187513da 100644 --- a/ops/pipeline/stash-artifacts.py +++ b/ops/pipeline/stash-artifacts.py @@ -85,7 +85,7 @@ def aws_s3_download_with_wildcard(src: str, dest: Path) -> None: def upload(args: argparse.Namespace) -> None: - print(f"Uploading artifacts with prefix {args.prefix}...") + print(f"Stashing artifacts to prefix {args.prefix}...") for artifact in args.artifacts: artifact_path = Path(artifact) s3_url = compute_s3_url(args.s3_bucket, args.prefix, artifact_path) @@ -93,7 +93,7 @@ def upload(args: argparse.Namespace) -> None: def download(args: argparse.Namespace) -> None: - print(f"Downloading artifacts with prefix {args.prefix}...") + print(f"Unstashing artifacts from prefix {args.prefix}...") for artifact in args.artifacts: artifact_path = Path(artifact) print(f"mkdir -p {str(artifact_path.parent)}") @@ -117,9 +117,9 @@ def download(args: argparse.Namespace) -> None: parser.add_argument( "--command", type=str, - choices=["upload", "download"], + choices=["stash", "unstash"], required=True, - help="Whether to upload or download the artifact (upload/download)", + help="Whether to stash or unstash the artifact", ) parser.add_argument( "--s3-bucket", @@ -138,7 +138,7 @@ def download(args: argparse.Namespace) -> None: ) parser.add_argument("artifacts", type=str, 
nargs="+", metavar="artifact") parsed_args = parser.parse_args() - if parsed_args.command == "upload": + if parsed_args.command == "stash": upload(parsed_args) - elif parsed_args.command == "download": + elif parsed_args.command == "unstash": download(parsed_args) diff --git a/ops/pipeline/stash-artifacts.sh b/ops/pipeline/stash-artifacts.sh index 3cd0378fc916..17ad40da8465 100755 --- a/ops/pipeline/stash-artifacts.sh +++ b/ops/pipeline/stash-artifacts.sh @@ -3,29 +3,25 @@ ## Convenience wrapper for ops/pipeline/stash-artifacts.py ## Meant to be used inside GitHub Actions -ENV_VAR_DOC=$( -cat <<-EOF -Inputs - - COMMAND: Either "upload" or "download" - - KEY: Unique string to identify a group of artifacts -EOF -) - set -euo pipefail -source ops/pipeline/enforce-ci.sh +# source ops/pipeline/enforce-ci.sh -if [ "$#" -lt 1 ]; then - echo "Usage: $0 [artifact] [artifact ...]" +if [ "$#" -lt 3 ]; then + echo "Usage: $0 {stash,unstash} [remote_prefix] [artifact] [artifact ...]" exit 1 fi -for arg in "COMMAND" "KEY" "GITHUB_REPOSITORY" "GITHUB_RUN_ID" "RUNS_ON_S3_BUCKET_CACHE" +command="$1" +remote_prefix="$2" +shift 2 + +for arg in "GITHUB_REPOSITORY" "GITHUB_RUN_ID" "RUNS_ON_S3_BUCKET_CACHE" do if [[ -z "${!arg:-}" ]] then - echo -e "Error: $arg must be set.\n${ENV_VAR_DOC}" - exit 1 + echo "Error: $arg must be set." + exit 2 fi done @@ -33,7 +29,7 @@ artifact_stash_prefix="cache/${GITHUB_REPOSITORY}/stash/${GITHUB_RUN_ID}" set -x python3 ops/pipeline/stash-artifacts.py \ - --command "${COMMAND}" \ + --command "${command}" \ --s3-bucket "${RUNS_ON_S3_BUCKET_CACHE}" \ - --prefix "${artifact_stash_prefix}/${KEY}" \ + --prefix "${artifact_stash_prefix}/${remote_prefix}" \ -- "$@" From 331dab474e3aeac1d71915d856db1f3a59cdbd8f Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Mon, 25 Nov 2024 08:46:08 -0800 Subject: [PATCH 68/86] Fix syntax error --- .github/workflows/main.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index e2a7d7475912..acab9592e99a 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -155,8 +155,9 @@ jobs: CONTAINER_ID: xgb-ci.manylinux_2_28_x86_64 - run: bash ops/pipeline/build-cuda-with-rmm.sh - name: Stash files - run: bash ops/pipeline/stash-artifacts.sh \ - stash build-cuda-with-rmm build/testxgboost + run: | + bash ops/pipeline/stash-artifacts.sh \ + stash build-cuda-with-rmm build/testxgboost - name: Upload Python wheel run: | bash ops/pipeline/publish-artifact.sh python-package/dist/*.whl \ From 7eb89f383c1b732788b03caa768caa90c2085bb1 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Mon, 25 Nov 2024 08:48:02 -0800 Subject: [PATCH 69/86] Don't use max-parallel for RunsOn runners --- .github/workflows/jvm_tests.yml | 1 - .github/workflows/main.yml | 3 --- 2 files changed, 4 deletions(-) diff --git a/.github/workflows/jvm_tests.yml b/.github/workflows/jvm_tests.yml index 0d5138f095aa..f425c80e79c1 100644 --- a/.github/workflows/jvm_tests.yml +++ b/.github/workflows/jvm_tests.yml @@ -23,7 +23,6 @@ jobs: - run-id=${{ github.run_id }} - tag=jvm-tests-build-containers-${{ matrix.container_id }} strategy: - max-parallel: 2 matrix: container_id: - xgb-ci.manylinux2014_x86_64 diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index acab9592e99a..e11379d8a84a 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -23,7 +23,6 @@ jobs: - run-id=${{ github.run_id }} - tag=main-build-containers-${{ matrix.container_id }} strategy: - max-parallel: 
2 matrix: container_id: - xgb-ci.gpu_build_rockylinux8 @@ -234,7 +233,6 @@ jobs: - tag=main-test-cpp-gpu-${{ matrix.suite }} strategy: fail-fast: false - max-parallel: 2 matrix: include: - suite: gpu @@ -273,7 +271,6 @@ jobs: - tag=main-test-python-wheel-${{ matrix.description }} strategy: fail-fast: false - max-parallel: 2 matrix: include: - description: single-gpu From ab4070ad02a393e03fa42f2022094afc129bc2ca Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Mon, 25 Nov 2024 09:09:28 -0800 Subject: [PATCH 70/86] stash-artifacts.ps1: Fail if unstashing failed --- .github/workflows/windows.yml | 2 +- ops/pipeline/stash-artifacts.ps1 | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml index 415a33ffbd3f..f97daf761abf 100644 --- a/.github/workflows/windows.yml +++ b/.github/workflows/windows.yml @@ -48,6 +48,6 @@ jobs: submodules: "true" - name: Unstash files run: | - powershell ops/pipeline/stash-artifacts.ps1 unstash build-wind64-gpu ` + powershell ops/pipeline/stash-artifacts.ps1 unstash build-win64-gpu ` build/testxgboost.exe xgboost.exe python-package/dist/*.whl - run: powershell ops/pipeline/test-win64-gpu.ps1 diff --git a/ops/pipeline/stash-artifacts.ps1 b/ops/pipeline/stash-artifacts.ps1 index f2fc5c5c3cb2..9b9989bf376d 100644 --- a/ops/pipeline/stash-artifacts.ps1 +++ b/ops/pipeline/stash-artifacts.ps1 @@ -46,3 +46,4 @@ python ops/pipeline/stash-artifacts.py ` --s3-bucket "${Env:RUNS_ON_S3_BUCKET_CACHE}" ` --prefix "${artifact_stash_prefix}/${remote_prefix}" ` -- $artifacts +if ($LASTEXITCODE -ne 0) { throw "Last command failed" } From 74c6ab91fca1abf0de1cac6b7f9bfc3f8b542c91 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Mon, 25 Nov 2024 09:14:06 -0800 Subject: [PATCH 71/86] Update doc --- doc/contrib/ci.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/contrib/ci.rst b/doc/contrib/ci.rst index 5040cf6d04bb..b61c2a265cb2 100644 --- a/doc/contrib/ci.rst +++ b/doc/contrib/ci.rst @@ -287,7 +287,7 @@ You can upload multiple files, possibly with wildcard globbing: .. code-block:: bash - REMOTE_PREFIX="unique key to identify a group of files" + REMOTE_PREFIX="remote directory to place the artifact(s)" bash ops/pipeline/stash-artifacts.sh unstash "${REMOTE_PREFIX}" path/to/file You can also use the wildcard globbing. 
The script will download the matching artifacts From b87ca581b6044402bd51af1040cee338f4901a87 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Mon, 25 Nov 2024 09:15:49 -0800 Subject: [PATCH 72/86] Uncomment CI guards --- ops/pipeline/deploy-jvm-packages.sh | 2 +- ops/pipeline/stash-artifacts.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ops/pipeline/deploy-jvm-packages.sh b/ops/pipeline/deploy-jvm-packages.sh index c78f8adfabc3..1b47029b346a 100755 --- a/ops/pipeline/deploy-jvm-packages.sh +++ b/ops/pipeline/deploy-jvm-packages.sh @@ -3,7 +3,7 @@ set -euox pipefail -# source ops/pipeline/enforce-ci.sh +source ops/pipeline/enforce-ci.sh if [[ "$#" -lt 2 ]] then diff --git a/ops/pipeline/stash-artifacts.sh b/ops/pipeline/stash-artifacts.sh index 17ad40da8465..4a6a9f6c2622 100755 --- a/ops/pipeline/stash-artifacts.sh +++ b/ops/pipeline/stash-artifacts.sh @@ -5,7 +5,7 @@ set -euo pipefail -# source ops/pipeline/enforce-ci.sh +source ops/pipeline/enforce-ci.sh if [ "$#" -lt 3 ]; then echo "Usage: $0 {stash,unstash} [remote_prefix] [artifact] [artifact ...]" From 54902c2f8b704c07181d5dea2b11a3f8177d0dd3 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Mon, 25 Nov 2024 17:14:44 -0800 Subject: [PATCH 73/86] Uncomment CI guards --- ops/pipeline/build-jvm-gpu.sh | 13 ++++++------- ops/pipeline/deploy-jvm-packages.sh | 6 +++--- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/ops/pipeline/build-jvm-gpu.sh b/ops/pipeline/build-jvm-gpu.sh index 6c657df3a11c..7656a3d2f188 100755 --- a/ops/pipeline/build-jvm-gpu.sh +++ b/ops/pipeline/build-jvm-gpu.sh @@ -7,13 +7,12 @@ source ops/pipeline/classify-git-branch.sh echo "--- Build libxgboost4j.so with CUDA" -# if [[ ($is_pull_request == 1) || ($is_release_branch == 0) ]] -# then -# arch_flag="-DGPU_COMPUTE_VER=75" -# else -# arch_flag="" -# fi -arch_flag="" +if [[ ($is_pull_request == 1) || ($is_release_branch == 0) ]] +then + arch_flag="-DGPU_COMPUTE_VER=75" +else + arch_flag="" +fi COMMAND=$( cat <<-EOF diff --git a/ops/pipeline/deploy-jvm-packages.sh b/ops/pipeline/deploy-jvm-packages.sh index 1b47029b346a..1fa1e3f3ee41 100755 --- a/ops/pipeline/deploy-jvm-packages.sh +++ b/ops/pipeline/deploy-jvm-packages.sh @@ -14,9 +14,9 @@ fi variant="$1" container_id="$2" -# if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] -# then +if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] +then echo "--- Deploy JVM packages to xgboost-maven-repo S3 repo" python3 ops/docker_run.py --container-id "${container_id}" \ -- ops/pipeline/deploy-jvm-packages-impl.sh "${variant}" -# fi +fi From 85795d7602c11e3db4b7904afdec0a3991d1bfa3 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Mon, 25 Nov 2024 17:32:43 -0800 Subject: [PATCH 74/86] Add local composite action miniforge-setup --- .github/workflows/jvm_tests.yml | 15 +++--------- .github/workflows/lint.yml | 16 ++----------- .github/workflows/misc.yml | 11 ++------- .github/workflows/python_tests.yml | 25 ++++---------------- .github/workflows/python_wheels_macos.yml | 8 +++---- .github/workflows/sycl_tests.yml | 22 ++++------------- actions/miniforge-setup/action.yml | 24 +++++++++++++++++++ ops/conda_env/{jvm_tests.yml => minimal.yml} | 2 +- 8 files changed, 43 insertions(+), 80 deletions(-) create mode 100644 actions/miniforge-setup/action.yml rename ops/conda_env/{jvm_tests.yml => minimal.yml} (79%) diff --git a/.github/workflows/jvm_tests.yml b/.github/workflows/jvm_tests.yml index f425c80e79c1..3a2925fe1138 100644 --- a/.github/workflows/jvm_tests.yml +++ 
b/.github/workflows/jvm_tests.yml @@ -195,37 +195,28 @@ jobs: fail-fast: false matrix: os: [windows-latest, macos-13] - steps: - uses: actions/checkout@v4 with: submodules: 'true' - - uses: actions/setup-java@v4 with: distribution: 'temurin' java-version: '8' - - - uses: conda-incubator/setup-miniconda@v3 + - uses: dmlc/xgboost/actions/miniforge-setup@master with: - miniforge-variant: Miniforge3 - miniforge-version: latest - activate-environment: jvm_tests - environment-file: ops/conda_env/jvm_tests.yml - use-mamba: true - + environment-name: minimal + environment-file: ops/conda_env/minimal.yml - name: Cache Maven packages uses: actions/cache@v4 with: path: ~/.m2 key: ${{ runner.os }}-m2-${{ hashFiles('./jvm-packages/pom.xml') }} restore-keys: ${{ runner.os }}-m2-${{ hashFiles('./jvm-packages/pom.xml') }} - - name: Test XGBoost4J (Core) run: | cd jvm-packages mvn test -B -pl :xgboost4j_2.12 - - name: Publish artifact xgboost4j.dll to S3 run: | cd lib/ diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 2e2a4ea209c8..e041efb989ff 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -57,18 +57,10 @@ jobs: - uses: actions/checkout@v4 with: submodules: 'true' - - uses: conda-incubator/setup-miniconda@v3 + - uses: dmlc/xgboost/actions/miniforge-setup@master with: - miniforge-variant: Miniforge3 - miniforge-version: latest - activate-environment: python_lint + environment-name: python_lint environment-file: ops/conda_env/python_lint.yml - use-mamba: true - - name: Display Conda env - shell: bash -el {0} - run: | - conda info - conda list - name: Run mypy shell: bash -el {0} run: | @@ -110,23 +102,19 @@ jobs: - uses: actions/checkout@v4 with: submodules: 'true' - - uses: r-lib/actions/setup-r@v2 with: r-version: "release" - - name: Cache R packages uses: actions/cache@v4 with: path: ${{ env.R_LIBS_USER }} key: ${{ runner.os }}-r-release-7-${{ hashFiles('R-package/DESCRIPTION') }} restore-keys: ${{ runner.os }}-r-release-7-${{ hashFiles('R-package/DESCRIPTION') }} - - name: Install dependencies shell: Rscript {0} run: | source("./R-package/tests/helper_scripts/install_deps.R") - - name: Run lintr run: | MAKEFLAGS="-j$(nproc)" R CMD INSTALL R-package/ diff --git a/.github/workflows/misc.yml b/.github/workflows/misc.yml index c031cfea3d2d..a314a0990382 100644 --- a/.github/workflows/misc.yml +++ b/.github/workflows/misc.yml @@ -41,16 +41,9 @@ jobs: - uses: actions/checkout@v4 with: submodules: 'true' - - uses: conda-incubator/setup-miniconda@v3 + - uses: dmlc/xgboost/actions/miniforge-setup@master with: - miniforge-variant: Miniforge3 - miniforge-version: latest - activate-environment: cpp_test + environment-name: cpp_test environment-file: ops/conda_env/cpp_test.yml - use-mamba: true - - name: Display Conda env - run: | - conda info - conda list - name: Build and run C API demo with shared run: bash ops/pipeline/test-c-api-demo.sh diff --git a/.github/workflows/python_tests.yml b/.github/workflows/python_tests.yml index e378ab010d6a..43301c74ce24 100644 --- a/.github/workflows/python_tests.yml +++ b/.github/workflows/python_tests.yml @@ -25,17 +25,10 @@ jobs: - uses: actions/checkout@v4 with: submodules: 'true' - - uses: conda-incubator/setup-miniconda@v3 + - uses: dmlc/xgboost/actions/miniforge-setup@master with: - miniforge-variant: Miniforge3 - miniforge-version: latest - activate-environment: sdist_test + environment-name: sdist_test environment-file: ops/conda_env/sdist_test.yml - use-mamba: true - - name: Display Conda env - run: | - conda info - 
conda list - name: Install extra package for MacOS run: | mamba install -c conda-forge llvm-openmp @@ -51,20 +44,10 @@ jobs: - uses: actions/checkout@v4 with: submodules: 'true' - - - uses: conda-incubator/setup-miniconda@v3 + - uses: dmlc/xgboost/actions/miniforge-setup@master with: - miniforge-variant: Miniforge3 - miniforge-version: latest - activate-environment: macos_cpu_test + environment-name: macos_cpu_test environment-file: ops/conda_env/macos_cpu_test.yml - use-mamba: true - - - name: Display Conda env - run: | - conda info - conda list - - run: bash ops/pipeline/test-python-macos.sh python-system-installation-on-ubuntu: diff --git a/.github/workflows/python_wheels_macos.yml b/.github/workflows/python_wheels_macos.yml index 0b8c62794359..27698e55ee0c 100644 --- a/.github/workflows/python_wheels_macos.yml +++ b/.github/workflows/python_wheels_macos.yml @@ -37,12 +37,10 @@ jobs: uses: Homebrew/actions/setup-homebrew@13341b4d5e459a98bbe0b122b12c11bf90518cc8 - name: Install libomp run: brew install libomp - - uses: conda-incubator/setup-miniconda@v3 + - uses: dmlc/xgboost/actions/miniforge-setup@master with: - miniforge-variant: Miniforge3 - miniforge-version: latest - python-version: "3.10" - use-mamba: true + environment-name: minimal + environment-file: ops/conda_env/minimal.yml - name: Build wheels run: bash ops/pipeline/build-python-wheels-macos.sh ${{ matrix.platform_id }} ${{ github.sha }} - name: Upload Python wheel diff --git a/.github/workflows/sycl_tests.yml b/.github/workflows/sycl_tests.yml index 180c62310765..655cc62b9cfa 100644 --- a/.github/workflows/sycl_tests.yml +++ b/.github/workflows/sycl_tests.yml @@ -25,17 +25,10 @@ jobs: - uses: actions/checkout@v4 with: submodules: 'true' - - uses: conda-incubator/setup-miniconda@v3 + - uses: dmlc/xgboost/actions/miniforge-setup@master with: - miniforge-variant: Miniforge3 - miniforge-version: latest - activate-environment: linux_sycl_test + environment-name: linux_sycl_test environment-file: ops/conda_env/linux_sycl_test.yml - use-mamba: true - - name: Display Conda env - run: | - conda info - conda list - name: Run gtest run: bash ops/pipeline/build-test-sycl.sh gtest @@ -47,16 +40,9 @@ jobs: - uses: actions/checkout@v4 with: submodules: 'true' - - uses: conda-incubator/setup-miniconda@v3 + - uses: dmlc/xgboost/actions/miniforge-setup@master with: - miniforge-variant: Miniforge3 - miniforge-version: latest - activate-environment: linux_sycl_test + environment-name: linux_sycl_test environment-file: ops/conda_env/linux_sycl_test.yml - use-mamba: true - - name: Display Conda env - run: | - conda info - conda list - name: Test Python package run: bash ops/pipeline/build-test-sycl.sh pytest diff --git a/actions/miniforge-setup/action.yml b/actions/miniforge-setup/action.yml new file mode 100644 index 000000000000..7a1fe2565697 --- /dev/null +++ b/actions/miniforge-setup/action.yml @@ -0,0 +1,24 @@ +name: "Setup miniforge" +inputs: + environment-name: + description: "Name of Conda environment" + required: true + environment-file: + description: "Path to environment file" + required: true +runs: + using: "composite" + steps: + - uses: conda-incubator/setup-miniconda@v3 + with: + miniforge-variant: Miniforge3 + miniforge-version: latest + activate-environment: ${{ inputs.environment-name }} + environment-file: ${{ inputs.environment-file }} + use-mamba: true + auto-update-conda: true + - name: Display Conda env + shell: bash -el {0} + run: | + conda info + conda list diff --git a/ops/conda_env/jvm_tests.yml 
b/ops/conda_env/minimal.yml similarity index 79% rename from ops/conda_env/jvm_tests.yml rename to ops/conda_env/minimal.yml index 56e11dff27bb..efe972bd44d9 100644 --- a/ops/conda_env/jvm_tests.yml +++ b/ops/conda_env/minimal.yml @@ -1,4 +1,4 @@ -name: jvm_tests +name: minimal channels: - conda-forge dependencies: From 8e20d66389d3f67a5d83fa8f0f7dfe001a699650 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Mon, 25 Nov 2024 18:42:50 -0800 Subject: [PATCH 75/86] Move custom action to dmlc/xgboost-devops, with dispatching --- .github/workflows/jvm_tests.yml | 2 +- .github/workflows/lint.yml | 2 +- .github/workflows/misc.yml | 2 +- .github/workflows/python_tests.yml | 4 ++-- .github/workflows/python_wheels_macos.yml | 2 +- .github/workflows/sycl_tests.yml | 4 ++-- actions/miniforge-setup/action.yml | 24 ----------------------- 7 files changed, 8 insertions(+), 32 deletions(-) delete mode 100644 actions/miniforge-setup/action.yml diff --git a/.github/workflows/jvm_tests.yml b/.github/workflows/jvm_tests.yml index 3a2925fe1138..7ea935591f91 100644 --- a/.github/workflows/jvm_tests.yml +++ b/.github/workflows/jvm_tests.yml @@ -203,7 +203,7 @@ jobs: with: distribution: 'temurin' java-version: '8' - - uses: dmlc/xgboost/actions/miniforge-setup@master + - uses: dmlc/xgboost-devops/miniforge-setup@main with: environment-name: minimal environment-file: ops/conda_env/minimal.yml diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index e041efb989ff..756127e69b73 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -57,7 +57,7 @@ jobs: - uses: actions/checkout@v4 with: submodules: 'true' - - uses: dmlc/xgboost/actions/miniforge-setup@master + - uses: dmlc/xgboost-devops/miniforge-setup@main with: environment-name: python_lint environment-file: ops/conda_env/python_lint.yml diff --git a/.github/workflows/misc.yml b/.github/workflows/misc.yml index a314a0990382..67c1bf57d3a2 100644 --- a/.github/workflows/misc.yml +++ b/.github/workflows/misc.yml @@ -41,7 +41,7 @@ jobs: - uses: actions/checkout@v4 with: submodules: 'true' - - uses: dmlc/xgboost/actions/miniforge-setup@master + - uses: dmlc/xgboost-devops/miniforge-setup@main with: environment-name: cpp_test environment-file: ops/conda_env/cpp_test.yml diff --git a/.github/workflows/python_tests.yml b/.github/workflows/python_tests.yml index 43301c74ce24..dc8de819e2bb 100644 --- a/.github/workflows/python_tests.yml +++ b/.github/workflows/python_tests.yml @@ -25,7 +25,7 @@ jobs: - uses: actions/checkout@v4 with: submodules: 'true' - - uses: dmlc/xgboost/actions/miniforge-setup@master + - uses: dmlc/xgboost-devops/miniforge-setup@main with: environment-name: sdist_test environment-file: ops/conda_env/sdist_test.yml @@ -44,7 +44,7 @@ jobs: - uses: actions/checkout@v4 with: submodules: 'true' - - uses: dmlc/xgboost/actions/miniforge-setup@master + - uses: dmlc/xgboost-devops/miniforge-setup@main with: environment-name: macos_cpu_test environment-file: ops/conda_env/macos_cpu_test.yml diff --git a/.github/workflows/python_wheels_macos.yml b/.github/workflows/python_wheels_macos.yml index 27698e55ee0c..ab13dfa395cd 100644 --- a/.github/workflows/python_wheels_macos.yml +++ b/.github/workflows/python_wheels_macos.yml @@ -37,7 +37,7 @@ jobs: uses: Homebrew/actions/setup-homebrew@13341b4d5e459a98bbe0b122b12c11bf90518cc8 - name: Install libomp run: brew install libomp - - uses: dmlc/xgboost/actions/miniforge-setup@master + - uses: dmlc/xgboost-devops/miniforge-setup@main with: environment-name: minimal environment-file: 
ops/conda_env/minimal.yml diff --git a/.github/workflows/sycl_tests.yml b/.github/workflows/sycl_tests.yml index 655cc62b9cfa..22456b1b68e5 100644 --- a/.github/workflows/sycl_tests.yml +++ b/.github/workflows/sycl_tests.yml @@ -25,7 +25,7 @@ jobs: - uses: actions/checkout@v4 with: submodules: 'true' - - uses: dmlc/xgboost/actions/miniforge-setup@master + - uses: dmlc/xgboost-devops/miniforge-setup@main with: environment-name: linux_sycl_test environment-file: ops/conda_env/linux_sycl_test.yml @@ -40,7 +40,7 @@ jobs: - uses: actions/checkout@v4 with: submodules: 'true' - - uses: dmlc/xgboost/actions/miniforge-setup@master + - uses: dmlc/xgboost-devops/miniforge-setup@main with: environment-name: linux_sycl_test environment-file: ops/conda_env/linux_sycl_test.yml diff --git a/actions/miniforge-setup/action.yml b/actions/miniforge-setup/action.yml deleted file mode 100644 index 7a1fe2565697..000000000000 --- a/actions/miniforge-setup/action.yml +++ /dev/null @@ -1,24 +0,0 @@ -name: "Setup miniforge" -inputs: - environment-name: - description: "Name of Conda environment" - required: true - environment-file: - description: "Path to environment file" - required: true -runs: - using: "composite" - steps: - - uses: conda-incubator/setup-miniconda@v3 - with: - miniforge-variant: Miniforge3 - miniforge-version: latest - activate-environment: ${{ inputs.environment-name }} - environment-file: ${{ inputs.environment-file }} - use-mamba: true - auto-update-conda: true - - name: Display Conda env - shell: bash -el {0} - run: | - conda info - conda list From 276e0ef4beada824d94c2b3fe337582dc452090a Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Mon, 25 Nov 2024 19:02:39 -0800 Subject: [PATCH 76/86] Document custom actions --- doc/contrib/ci.rst | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/doc/contrib/ci.rst b/doc/contrib/ci.rst index b61c2a265cb2..171a9bd7ac22 100644 --- a/doc/contrib/ci.rst +++ b/doc/contrib/ci.rst @@ -50,6 +50,7 @@ Prerequisites The runtime lets you access NVIDIA GPUs inside a Docker container. .. _build_run_docker_locally: + ============================================== Building and Running Docker containers locally ============================================== @@ -299,3 +300,33 @@ from the remote directory. # Download all files whose path matches the wildcard pattern python-package/dist/*.whl bash ops/pipeline/stash-artifacts.sh unstash "${REMOTE_PREFIX}" \ python-package/dist/*.whl + +----------------------------------------- +Custom actions in ``dmlc/xgboost-devops`` +----------------------------------------- + +XGBoost implements a few custom +`composite actions `_ +to reduce duplicated code within workflow YAML files. The custom actions are hosted in a separate repository, +`dmlc/xgboost-devops `_, to make it easy to test changes to the custom actions in +a pull request or a fork. + +In a workflow file, we'd refer to ``dmlc/xgboost-devops/{custom-action}@main``. For example: + +.. code-block:: yaml + + - uses: dmlc/xgboost-devops/miniforge-setup@main + with: + environment-name: cpp_test + environment-file: ops/conda_env/cpp_test.yml + +Each custom action consists of two components: + +* Main script (``dmlc/xgboost-devops/{custom-action}/action.yml``): dispatches to a specific version + of the implementation script (see the next item). The main script can clone ``xgboost-devops`` from + a specified fork at a particular ref, allowing us to easily test changes to the custom action. 
+* Implementation script (``dmlc/xgboost-devops/impls/{custom-action}/action.yml``): Implements the + custom script. + +This design was inspired by Mike Sarahan's work in +`rapidsai/shared-actions `_. From bf8844ffe695d2b58d6ab315ef24d0a4822138c0 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Mon, 25 Nov 2024 19:19:10 -0800 Subject: [PATCH 77/86] Test JVM GPU pkg for Scala 2.12 and 2.13 --- .github/workflows/jvm_tests.yml | 27 +++++++++++++++++---------- ops/pipeline/test-jvm-gpu.sh | 22 ++++++++++++++++++++-- 2 files changed, 37 insertions(+), 12 deletions(-) diff --git a/.github/workflows/jvm_tests.yml b/.github/workflows/jvm_tests.yml index 7ea935591f91..74a115ac16cf 100644 --- a/.github/workflows/jvm_tests.yml +++ b/.github/workflows/jvm_tests.yml @@ -158,12 +158,16 @@ jobs: s3://xgboost-docs/ build-test-jvm-packages: - name: Build and test JVM packages (Linux) + name: Build and test JVM packages (Linux, Scala ${{ matrix.scala_version }}) needs: build-containers runs-on: - runs-on=${{ github.run_id }} - runner=linux-amd64-cpu - - tag=jvm-tests-build-test-jvm-packages + - tag=jvm-tests-build-test-jvm-packages-scala${{ matrix.scala_version }} + strategy: + fail-fast: false + matrix: + scala_version: ["2.12", "2.13"] steps: # Restart Docker daemon so that it recognizes the ephemeral disks - run: sudo systemctl restart docker @@ -174,18 +178,15 @@ jobs: run: bash ops/docker_build.sh env: CONTAINER_ID: xgb-ci.jvm - - name: Build and test JVM packages (Scala 2.12) - run: bash ops/pipeline/build-test-jvm-packages.sh - env: - SCALA_VERSION: 2.12 - - name: Build and test JVM packages (Scala 2.13) + - name: Build and test JVM packages (Scala ${{ matrix.scala_version }}) run: bash ops/pipeline/build-test-jvm-packages.sh env: - SCALA_VERSION: 2.13 + SCALA_VERSION: ${{ matrix.scala_version }} - name: Stash files run: | bash ops/pipeline/stash-artifacts.sh stash \ build-test-jvm-packages lib/libxgboost4j.so + if: matrix.scala_version == '2.13' build-test-jvm-packages-other-os: name: Build and test JVM packages (${{ matrix.os }}) @@ -232,12 +233,16 @@ jobs: AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY_IAM_S3_UPLOADER }} test-jvm-packages-gpu: - name: Test JVM packages with CUDA + name: Test JVM packages with CUDA (Scala ${{ matrix.scala_version }}) needs: [build-jvm-gpu] runs-on: - runs-on=${{ github.run_id }} - runner=linux-amd64-mgpu - - tag=jvm-tests-test-jvm-packages-gpu + - tag=jvm-tests-test-jvm-packages-gpu-scala${{ matrix.scala_version }} + strategy: + fail-fast: false + matrix: + scala_version: ["2.12", "2.13"] steps: # Restart Docker daemon so that it recognizes the ephemeral disks - run: sudo systemctl restart docker @@ -252,6 +257,8 @@ jobs: run: | bash ops/pipeline/stash-artifacts.sh unstash build-jvm-gpu lib/libxgboost4j.so - run: bash ops/pipeline/test-jvm-gpu.sh + env: + SCALA_VERSION: ${{ matrix.scala_version }} deploy-jvm-packages: name: Deploy JVM packages to S3 (${{ matrix.variant }}) diff --git a/ops/pipeline/test-jvm-gpu.sh b/ops/pipeline/test-jvm-gpu.sh index c490e58ea01d..b7bf480e5ff3 100755 --- a/ops/pipeline/test-jvm-gpu.sh +++ b/ops/pipeline/test-jvm-gpu.sh @@ -3,9 +3,27 @@ ## the user has already built libxgboost4j.so with CUDA support ## and place it in the lib/ directory. -set -euox pipefail +## Note. This script takes in all inputs via environment variables. 
-SCALA_VERSION=2.12 +INPUT_DOC=$( +cat <<-EOF +Inputs + - SCALA_VERSION: Scala version, either 2.12 or 2.13 (Required) +EOF +) + +set -euo pipefail + +for arg in "SCALA_VERSION" +do + if [[ -z "${!arg:-}" ]] + then + echo -e "Error: $arg must be set.\n${INPUT_DOC}" + exit 1 + fi +done + +set -x python3 ops/docker_run.py --container-id xgb-ci.jvm_gpu_build --use-gpus \ -- nvidia-smi From f5061203074477b8219ea229805109cdf586a92b Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Mon, 25 Nov 2024 19:25:56 -0800 Subject: [PATCH 78/86] Deploy JVM: Make scala version as matrix arg --- .github/workflows/jvm_tests.yml | 19 ++++++------ ops/pipeline/deploy-jvm-packages-impl.sh | 37 +++++++++--------------- ops/pipeline/deploy-jvm-packages.sh | 7 +++-- 3 files changed, 28 insertions(+), 35 deletions(-) diff --git a/.github/workflows/jvm_tests.yml b/.github/workflows/jvm_tests.yml index 74a115ac16cf..d99d4fe28289 100644 --- a/.github/workflows/jvm_tests.yml +++ b/.github/workflows/jvm_tests.yml @@ -261,23 +261,24 @@ jobs: SCALA_VERSION: ${{ matrix.scala_version }} deploy-jvm-packages: - name: Deploy JVM packages to S3 (${{ matrix.variant }}) + name: Deploy JVM packages to S3 (${{ matrix.variant.name }}) needs: [build-jvm-gpu, build-test-jvm-packages, test-jvm-packages-gpu] runs-on: - runs-on - runner=linux-amd64-cpu - run-id=${{ github.run_id }} - - tag=jvm-tests-deploy-jvm-packages-${{ matrix.variant }} + - tag=jvm-tests-deploy-jvm-packages-${{ matrix.variant.name }}-scala${{ matrix.scala_version }} strategy: fail-fast: false matrix: - include: - - variant: cpu + variant: + - name: cpu container_id: xgb-ci.jvm artifact_from: build-test-jvm-packages - - variant: gpu + - name: gpu container_id: xgb-ci.jvm_gpu_build artifact_from: build-jvm-gpu + scala_version: ['2.12', '2.13'] steps: # Restart Docker daemon so that it recognizes the ephemeral disks - run: sudo systemctl restart docker @@ -287,14 +288,14 @@ jobs: - name: Fetch container from cache run: bash ops/docker_build.sh env: - CONTAINER_ID: ${{ matrix.container_id }} + CONTAINER_ID: ${{ matrix.variant.container_id }} - name: Unstash files run: | bash ops/pipeline/stash-artifacts.sh \ - unstash ${{ matrix.artifact_from }} \ + unstash ${{ matrix.variant.artifact_from }} \ lib/libxgboost4j.so ls -lh lib/libxgboost4j.so - name: Deploy JVM packages to S3 run: | - bash ops/pipeline/deploy-jvm-packages.sh ${{ matrix.variant }} \ - ${{ matrix.container_id }} + bash ops/pipeline/deploy-jvm-packages.sh ${{ matrix.variant.name }} \ + ${{ matrix.variant.container_id }} ${{ matrix.scala_version }} diff --git a/ops/pipeline/deploy-jvm-packages-impl.sh b/ops/pipeline/deploy-jvm-packages-impl.sh index 3fb7522ac273..e6cd5d130257 100755 --- a/ops/pipeline/deploy-jvm-packages-impl.sh +++ b/ops/pipeline/deploy-jvm-packages-impl.sh @@ -4,42 +4,33 @@ set -euox pipefail -if [[ "$#" -lt 1 ]] +if [[ "$#" -lt 2 ]] then - echo "Usage: $0 {cpu,gpu}" + echo "Usage: $0 {cpu,gpu} {scala_version}" exit 1 fi variant="$1" +scala_version="$2" maven_options="-DskipTests -Dmaven.test.skip=true -Dskip.native.build=true" case "$variant" in cpu) # CPU variant - for scala_version in 2.12 2.13 - do - python ops/script/change_scala_version.py --scala-version ${scala_version} --purge-artifacts - bash ops/script/inject_jvm_lib.sh - pushd jvm-packages - mvn --no-transfer-progress deploy -Pdefault,release-to-s3 ${maven_options} - mvn clean - mvn clean -Pdefault,release-to-s3 - popd - done + python ops/script/change_scala_version.py --scala-version ${scala_version} --purge-artifacts + bash 
ops/script/inject_jvm_lib.sh + pushd jvm-packages + mvn --no-transfer-progress deploy -Pdefault,release-to-s3 ${maven_options} + popd ;; gpu) # GPU variant - for scala_version in 2.12 2.13 - do - python ops/script/change_scala_version.py --scala-version ${scala_version} --purge-artifacts - bash ops/script/inject_jvm_lib.sh - pushd jvm-packages - mvn --no-transfer-progress install -Pgpu ${maven_options} - mvn --no-transfer-progress deploy -Pgpu,release-to-s3 -pl xgboost4j-spark-gpu ${maven_options} - mvn clean - mvn clean -Pgpu,release-to-s3 - popd - done + python ops/script/change_scala_version.py --scala-version ${scala_version} --purge-artifacts + bash ops/script/inject_jvm_lib.sh + pushd jvm-packages + mvn --no-transfer-progress install -Pgpu ${maven_options} + mvn --no-transfer-progress deploy -Pgpu,release-to-s3 -pl xgboost4j-spark-gpu ${maven_options} + popd ;; *) echo "Unrecognized argument: $variant" diff --git a/ops/pipeline/deploy-jvm-packages.sh b/ops/pipeline/deploy-jvm-packages.sh index 1fa1e3f3ee41..61dc59b52a6b 100755 --- a/ops/pipeline/deploy-jvm-packages.sh +++ b/ops/pipeline/deploy-jvm-packages.sh @@ -5,18 +5,19 @@ set -euox pipefail source ops/pipeline/enforce-ci.sh -if [[ "$#" -lt 2 ]] +if [[ "$#" -lt 3 ]] then - echo "Usage: $0 {cpu,gpu} {container_id}" + echo "Usage: $0 {cpu,gpu} {container_id} {scala_version}" exit 1 fi variant="$1" container_id="$2" +scala_version="$3" if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] then echo "--- Deploy JVM packages to xgboost-maven-repo S3 repo" python3 ops/docker_run.py --container-id "${container_id}" \ - -- ops/pipeline/deploy-jvm-packages-impl.sh "${variant}" + -- ops/pipeline/deploy-jvm-packages-impl.sh "${variant}" "${scala_version}" fi From d1aaa827baa7ef809aa28ba9995053f8fe0f4e15 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Tue, 26 Nov 2024 08:12:47 -0800 Subject: [PATCH 79/86] Typo --- doc/contrib/ci.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/contrib/ci.rst b/doc/contrib/ci.rst index 171a9bd7ac22..df7881ff66a8 100644 --- a/doc/contrib/ci.rst +++ b/doc/contrib/ci.rst @@ -323,7 +323,7 @@ In a workflow file, we'd refer to ``dmlc/xgboost-devops/{custom-action}@main``. Each custom action consists of two components: * Main script (``dmlc/xgboost-devops/{custom-action}/action.yml``): dispatches to a specific version - of the implementation script (see the next item). The main script can clone ``xgboost-devops`` from + of the implementation script (see the next item). The main script clones ``xgboost-devops`` from a specified fork at a particular ref, allowing us to easily test changes to the custom action. * Implementation script (``dmlc/xgboost-devops/impls/{custom-action}/action.yml``): Implements the custom script. 
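A note on the stash interface used throughout the patches above: ``ops/pipeline/stash-artifacts.sh`` derives the S3 destination from CI-provided environment variables plus the caller-supplied remote prefix. The following is a minimal sketch of the resulting layout, assuming the AWS CLI and the variables that GitHub Actions and RunsOn export (``GITHUB_REPOSITORY``, ``GITHUB_RUN_ID``, ``RUNS_ON_S3_BUCKET_CACHE``); the hand-written ``aws s3 cp`` line illustrates the key structure and is not part of the actual script.

.. code-block:: bash

   # Stash two artifacts under the remote prefix "build-cuda"
   bash ops/pipeline/stash-artifacts.sh stash build-cuda \
     build/testxgboost python-package/dist/*.whl

   # Each artifact lands at (roughly) the following S3 key
   aws s3 cp build/testxgboost \
     "s3://${RUNS_ON_S3_BUCKET_CACHE}/cache/${GITHUB_REPOSITORY}/stash/${GITHUB_RUN_ID}/build-cuda/build/testxgboost"

   # Unstash restores the artifacts to the same relative paths,
   # creating parent directories as needed
   bash ops/pipeline/stash-artifacts.sh unstash build-cuda \
     build/testxgboost python-package/dist/*.whl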
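Likewise, the JVM test and deploy entry points now take the Scala version explicitly. A sketch of the calls as the updated workflows issue them, using the container IDs defined in ``ops/docker/ci_container.yml``:

.. code-block:: bash

   # test-jvm-gpu.sh reads SCALA_VERSION from the environment and fails
   # fast with a usage message if it is unset
   export SCALA_VERSION=2.13
   bash ops/pipeline/test-jvm-gpu.sh

   # deploy-jvm-packages.sh takes {cpu,gpu}, the container ID, and the
   # Scala version as positional arguments, one combination per matrix job
   bash ops/pipeline/deploy-jvm-packages.sh cpu xgb-ci.jvm 2.12
   bash ops/pipeline/deploy-jvm-packages.sh gpu xgb-ci.jvm_gpu_build 2.13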
From 2e2ccec5d44828cbc27cdde2af72ca6479ee1d9d Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Tue, 26 Nov 2024 09:19:24 -0800 Subject: [PATCH 80/86] Test with dev RMM --- .github/workflows/main.yml | 19 +++++++++++++++---- ops/pipeline/build-cuda-with-rmm.sh | 12 ++++++++++-- ops/pipeline/build-cuda.sh | 1 + ops/pipeline/build-manylinux2014.sh | 4 ++-- ops/pipeline/deploy-jvm-packages-impl.sh | 2 +- ops/pipeline/deploy-jvm-packages.sh | 2 +- ops/pipeline/stash-artifacts.sh | 3 ++- ops/pipeline/test-python-wheel.sh | 2 +- 8 files changed, 33 insertions(+), 12 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index e11379d8a84a..4855857088ec 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -132,12 +132,20 @@ jobs: done build-cuda-with-rmm: - name: Build CUDA with RMM + name: Build CUDA with RMM (${{ matrix.config.name }}) needs: build-containers runs-on: - runs-on=${{ github.run_id }} - runner=linux-amd64-cpu - - tag=main-build-cuda-with-rmm + - tag=main-build-cuda-with-rmm-${{ matrix.config.name }} + strategy: + fail-fast: false + matrix: + config: + - name: stable + container_id: xgb-ci.gpu_build_rockylinux8 + - name: dev + container_id: xgb-ci.gpu_dev_ver steps: # Restart Docker daemon so that it recognizes the ephemeral disks - run: sudo systemctl restart docker @@ -147,20 +155,23 @@ jobs: - name: Fetch container from cache run: bash ops/docker_build.sh env: - CONTAINER_ID: xgb-ci.gpu_build_rockylinux8 + CONTAINER_ID: ${{ matrix.config.container_id }} - name: Fetch container from cache run: bash ops/docker_build.sh env: CONTAINER_ID: xgb-ci.manylinux_2_28_x86_64 - - run: bash ops/pipeline/build-cuda-with-rmm.sh + - run: | + bash ops/pipeline/build-cuda-with-rmm.sh ${{ matrix.config.container_id }} - name: Stash files run: | bash ops/pipeline/stash-artifacts.sh \ stash build-cuda-with-rmm build/testxgboost + if: matrix.config.name == 'stable' - name: Upload Python wheel run: | bash ops/pipeline/publish-artifact.sh python-package/dist/*.whl \ s3://xgboost-nightly-builds/experimental_build_with_rmm/ + if: matrix.config.name == 'stable' build-manylinux2014: name: Build manylinux2014_${{ matrix.arch }} wheel diff --git a/ops/pipeline/build-cuda-with-rmm.sh b/ops/pipeline/build-cuda-with-rmm.sh index 797051e958ae..c6b33b0588d5 100755 --- a/ops/pipeline/build-cuda-with-rmm.sh +++ b/ops/pipeline/build-cuda-with-rmm.sh @@ -1,4 +1,5 @@ #!/bin/bash +## Build XGBoost with CUDA + RMM support set -euox pipefail @@ -8,6 +9,13 @@ then exit 1 fi +if [[ "$#" -lt 1 ]] +then + echo "Usage: $0 [container_id]" + exit 1 +fi +container_id="$1" + source ops/pipeline/classify-git-branch.sh WHEEL_TAG=manylinux_2_28_x86_64 @@ -23,7 +31,7 @@ fi echo "--- Build libxgboost from the source" python3 ops/docker_run.py \ - --container-id xgb-ci.gpu_build_rockylinux8 \ + --container-id "${container_id}" \ -- ops/script/build_via_cmake.sh \ -DCMAKE_PREFIX_PATH="/opt/grpc;/opt/rmm;/opt/rmm/lib64/rapids/cmake" \ -DUSE_CUDA=ON \ @@ -39,7 +47,7 @@ python3 ops/docker_run.py \ echo "--- Build binary wheel" python3 ops/docker_run.py \ - --container-id xgb-ci.gpu_build_rockylinux8 \ + --container-id "${container_id}" \ -- bash -c \ "cd python-package && rm -rf dist/* && pip wheel --no-deps -v . 
--wheel-dir dist/" python3 ops/script/rename_whl.py \ diff --git a/ops/pipeline/build-cuda.sh b/ops/pipeline/build-cuda.sh index 09d8cad46c30..49475c01c69e 100755 --- a/ops/pipeline/build-cuda.sh +++ b/ops/pipeline/build-cuda.sh @@ -1,4 +1,5 @@ #!/bin/bash +## Build XGBoost with CUDA set -euox pipefail diff --git a/ops/pipeline/build-manylinux2014.sh b/ops/pipeline/build-manylinux2014.sh index cada47e06b72..a8f5af8bc3cd 100755 --- a/ops/pipeline/build-manylinux2014.sh +++ b/ops/pipeline/build-manylinux2014.sh @@ -8,13 +8,13 @@ then exit 1 fi -if [[ $# -ne 1 ]] +if [[ "$#" -lt 1 ]] then echo "Usage: $0 {x86_64,aarch64}" exit 1 fi -arch=$1 +arch="$1" WHEEL_TAG="manylinux2014_${arch}" image="xgb-ci.${WHEEL_TAG}" diff --git a/ops/pipeline/deploy-jvm-packages-impl.sh b/ops/pipeline/deploy-jvm-packages-impl.sh index e6cd5d130257..e9c09112a4bd 100755 --- a/ops/pipeline/deploy-jvm-packages-impl.sh +++ b/ops/pipeline/deploy-jvm-packages-impl.sh @@ -6,7 +6,7 @@ set -euox pipefail if [[ "$#" -lt 2 ]] then - echo "Usage: $0 {cpu,gpu} {scala_version}" + echo "Usage: $0 {cpu,gpu} [scala_version]" exit 1 fi diff --git a/ops/pipeline/deploy-jvm-packages.sh b/ops/pipeline/deploy-jvm-packages.sh index 61dc59b52a6b..e821f334b9d2 100755 --- a/ops/pipeline/deploy-jvm-packages.sh +++ b/ops/pipeline/deploy-jvm-packages.sh @@ -7,7 +7,7 @@ source ops/pipeline/enforce-ci.sh if [[ "$#" -lt 3 ]] then - echo "Usage: $0 {cpu,gpu} {container_id} {scala_version}" + echo "Usage: $0 {cpu,gpu} [container_id] [scala_version]" exit 1 fi diff --git a/ops/pipeline/stash-artifacts.sh b/ops/pipeline/stash-artifacts.sh index 4a6a9f6c2622..98c9695c4227 100755 --- a/ops/pipeline/stash-artifacts.sh +++ b/ops/pipeline/stash-artifacts.sh @@ -7,7 +7,8 @@ set -euo pipefail source ops/pipeline/enforce-ci.sh -if [ "$#" -lt 3 ]; then +if [[ "$#" -lt 3 ]] +then echo "Usage: $0 {stash,unstash} [remote_prefix] [artifact] [artifact ...]" exit 1 fi diff --git a/ops/pipeline/test-python-wheel.sh b/ops/pipeline/test-python-wheel.sh index 8d41cd930891..b49c918c4da5 100755 --- a/ops/pipeline/test-python-wheel.sh +++ b/ops/pipeline/test-python-wheel.sh @@ -5,7 +5,7 @@ set -euo pipefail if [[ "$#" -lt 2 ]] then - echo "Usage: $0 {gpu|mgpu|cpu|cpu-arm64} {container_id}" + echo "Usage: $0 {gpu|mgpu|cpu|cpu-arm64} [container_id]" exit 1 fi From d450733dd1c2a8388d7fc8ad4cf799afa6e1aa18 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Tue, 26 Nov 2024 09:24:02 -0800 Subject: [PATCH 81/86] gtest shouldn't depend on RMM dev --- .github/workflows/main.yml | 42 +++++++++++++++++++++++++------------- 1 file changed, 28 insertions(+), 14 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 4855857088ec..31d94bd51a30 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -132,20 +132,12 @@ jobs: done build-cuda-with-rmm: - name: Build CUDA with RMM (${{ matrix.config.name }}) + name: Build CUDA with RMM needs: build-containers runs-on: - runs-on=${{ github.run_id }} - runner=linux-amd64-cpu - - tag=main-build-cuda-with-rmm-${{ matrix.config.name }} - strategy: - fail-fast: false - matrix: - config: - - name: stable - container_id: xgb-ci.gpu_build_rockylinux8 - - name: dev - container_id: xgb-ci.gpu_dev_ver + - tag=main-build-cuda-with-rmm steps: # Restart Docker daemon so that it recognizes the ephemeral disks - run: sudo systemctl restart docker @@ -155,23 +147,45 @@ jobs: - name: Fetch container from cache run: bash ops/docker_build.sh env: - CONTAINER_ID: ${{ matrix.config.container_id }} + CONTAINER_ID: 
xgb-ci.gpu_build_rockylinux8 - name: Fetch container from cache run: bash ops/docker_build.sh env: CONTAINER_ID: xgb-ci.manylinux_2_28_x86_64 - run: | - bash ops/pipeline/build-cuda-with-rmm.sh ${{ matrix.config.container_id }} + bash ops/pipeline/build-cuda-with-rmm.sh xgb-ci.gpu_build_rockylinux8 - name: Stash files run: | bash ops/pipeline/stash-artifacts.sh \ stash build-cuda-with-rmm build/testxgboost - if: matrix.config.name == 'stable' - name: Upload Python wheel run: | bash ops/pipeline/publish-artifact.sh python-package/dist/*.whl \ s3://xgboost-nightly-builds/experimental_build_with_rmm/ - if: matrix.config.name == 'stable' + + build-cuda-with-rmm-dev: + name: Build CUDA with RMM (dev) + needs: build-containers + runs-on: + - runs-on=${{ github.run_id }} + - runner=linux-amd64-cpu + - tag=main-build-cuda-with-rmm-dev + steps: + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4 + with: + submodules: "true" + - name: Fetch container from cache + run: bash ops/docker_build.sh + env: + CONTAINER_ID: xgb-ci.gpu_dev_ver + - name: Fetch container from cache + run: bash ops/docker_build.sh + env: + CONTAINER_ID: xgb-ci.manylinux_2_28_x86_64 + - run: | + bash ops/pipeline/build-cuda-with-rmm.sh xgb-ci.gpu_dev_ver build-manylinux2014: name: Build manylinux2014_${{ matrix.arch }} wheel From 0783f3898ef5d10e78b5d1430d0135f91026dc33 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Tue, 26 Nov 2024 10:38:03 -0800 Subject: [PATCH 82/86] Change interface of docker_build.sh --- .github/workflows/jvm_tests.yml | 28 +++------- .github/workflows/lint.yml | 6 +- .github/workflows/main.yml | 55 +++++-------------- doc/contrib/ci.rst | 3 +- ops/docker/ci_container.yml | 7 +++ .../Dockerfile.gpu_build_rockylinux8 | 2 +- ops/docker_build.sh | 20 +++++-- ops/pipeline/build-cuda-with-rmm.sh | 4 +- 8 files changed, 50 insertions(+), 75 deletions(-) diff --git a/.github/workflows/jvm_tests.yml b/.github/workflows/jvm_tests.yml index d99d4fe28289..53e695721887 100644 --- a/.github/workflows/jvm_tests.yml +++ b/.github/workflows/jvm_tests.yml @@ -39,9 +39,7 @@ jobs: with: submodules: "true" - name: Build ${{ matrix.container_id }} - run: bash ops/docker_build.sh - env: - CONTAINER_ID: ${{ matrix.container_id }} + run: bash ops/docker_build.sh ${{ matrix.container_id }} build-jvm-manylinux2014: name: >- @@ -68,9 +66,7 @@ jobs: with: submodules: "true" - name: Fetch container from cache - run: bash ops/docker_build.sh - env: - CONTAINER_ID: xgb-ci.manylinux2014_${{ matrix.arch }} + run: bash ops/docker_build.sh xgb-ci.manylinux2014_${{ matrix.arch }} - run: bash ops/pipeline/build-jvm-manylinux2014.sh ${{ matrix.arch }} - name: Upload libxgboost4j.so run: | @@ -93,9 +89,7 @@ jobs: with: submodules: "true" - name: Fetch container from cache - run: bash ops/docker_build.sh - env: - CONTAINER_ID: xgb-ci.jvm_gpu_build + run: bash ops/docker_build.sh xgb-ci.jvm_gpu_build - run: bash ops/pipeline/build-jvm-gpu.sh - name: Stash files run: | @@ -144,9 +138,7 @@ jobs: with: submodules: "true" - name: Fetch container from cache - run: bash ops/docker_build.sh - env: - CONTAINER_ID: xgb-ci.jvm_gpu_build + run: bash ops/docker_build.sh xgb-ci.jvm_gpu_build - name: Unstash files run: | bash ops/pipeline/stash-artifacts.sh unstash build-jvm-gpu lib/libxgboost4j.so @@ -175,9 +167,7 @@ jobs: with: submodules: "true" - name: Fetch container from cache - run: bash ops/docker_build.sh - env: - CONTAINER_ID: xgb-ci.jvm + run: bash 
ops/docker_build.sh xgb-ci.jvm - name: Build and test JVM packages (Scala ${{ matrix.scala_version }}) run: bash ops/pipeline/build-test-jvm-packages.sh env: @@ -250,9 +240,7 @@ jobs: with: submodules: "true" - name: Fetch container from cache - run: bash ops/docker_build.sh - env: - CONTAINER_ID: xgb-ci.jvm_gpu_build + run: bash ops/docker_build.sh xgb-ci.jvm_gpu_build - name: Unstash files run: | bash ops/pipeline/stash-artifacts.sh unstash build-jvm-gpu lib/libxgboost4j.so @@ -286,9 +274,7 @@ jobs: with: submodules: "true" - name: Fetch container from cache - run: bash ops/docker_build.sh - env: - CONTAINER_ID: ${{ matrix.variant.container_id }} + run: bash ops/docker_build.sh ${{ matrix.variant.container_id }} - name: Unstash files run: | bash ops/pipeline/stash-artifacts.sh \ diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 756127e69b73..2c400b073988 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -29,7 +29,7 @@ jobs: with: submodules: "true" - name: Build ${{ env.CONTAINER_ID }} - run: bash ops/docker_build.sh + run: bash ops/docker_build.sh ${{ env.CONTAINER_ID }} clang-tidy: name: Run clang-tidy @@ -45,9 +45,7 @@ jobs: with: submodules: "true" - name: Fetch container from cache - run: bash ops/docker_build.sh - env: - CONTAINER_ID: xgb-ci.clang_tidy + run: bash ops/docker_build.sh xgb-ci.clang_tidy - run: bash ops/pipeline/run-clang-tidy.sh python-mypy-lint: diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 31d94bd51a30..2f579d3e9611 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -26,6 +26,7 @@ jobs: matrix: container_id: - xgb-ci.gpu_build_rockylinux8 + - xgb-ci.gpu_build_rockylinux8_dev_ver - xgb-ci.gpu_build_r_rockylinux8 - xgb-ci.gpu - xgb-ci.gpu_dev_ver @@ -45,9 +46,7 @@ jobs: with: submodules: "true" - name: Build ${{ matrix.container_id }} - run: bash ops/docker_build.sh - env: - CONTAINER_ID: ${{ matrix.container_id }} + run: bash ops/docker_build.sh ${{ matrix.container_id }} build-cpu: name: Build CPU @@ -63,9 +62,7 @@ jobs: with: submodules: "true" - name: Fetch container from cache - run: bash ops/docker_build.sh - env: - CONTAINER_ID: xgb-ci.cpu + run: bash ops/docker_build.sh xgb-ci.cpu - run: bash ops/pipeline/build-cpu.sh - name: Stash CLI executable run: bash ops/pipeline/stash-artifacts.sh stash build-cpu ./xgboost @@ -84,9 +81,7 @@ jobs: with: submodules: "true" - name: Fetch container from cache - run: bash ops/docker_build.sh - env: - CONTAINER_ID: xgb-ci.aarch64 + run: bash ops/docker_build.sh xgb-ci.aarch64 - run: bash ops/pipeline/build-cpu-arm64.sh - name: Stash files run: | @@ -111,13 +106,9 @@ jobs: with: submodules: "true" - name: Fetch container from cache - run: bash ops/docker_build.sh - env: - CONTAINER_ID: xgb-ci.gpu_build_rockylinux8 + run: bash ops/docker_build.sh xgb-ci.gpu_build_rockylinux8 - name: Fetch container from cache - run: bash ops/docker_build.sh - env: - CONTAINER_ID: xgb-ci.manylinux_2_28_x86_64 + run: bash ops/docker_build.sh xgb-ci.manylinux_2_28_x86_64 - run: bash ops/pipeline/build-cuda.sh - name: Stash files run: | @@ -145,13 +136,9 @@ jobs: with: submodules: "true" - name: Fetch container from cache - run: bash ops/docker_build.sh - env: - CONTAINER_ID: xgb-ci.gpu_build_rockylinux8 + run: bash ops/docker_build.sh xgb-ci.gpu_build_rockylinux8 - name: Fetch container from cache - run: bash ops/docker_build.sh - env: - CONTAINER_ID: xgb-ci.manylinux_2_28_x86_64 + run: bash ops/docker_build.sh xgb-ci.manylinux_2_28_x86_64 - 
run: |
         bash ops/pipeline/build-cuda-with-rmm.sh xgb-ci.gpu_build_rockylinux8
       - name: Stash files
@@ -177,15 +164,11 @@
       with:
         submodules: "true"
       - name: Fetch container from cache
-        run: bash ops/docker_build.sh
-        env:
-          CONTAINER_ID: xgb-ci.gpu_dev_ver
+        run: bash ops/docker_build.sh xgb-ci.gpu_build_rockylinux8_dev_ver
       - name: Fetch container from cache
-        run: bash ops/docker_build.sh
-        env:
-          CONTAINER_ID: xgb-ci.manylinux_2_28_x86_64
+        run: bash ops/docker_build.sh xgb-ci.manylinux_2_28_x86_64
       - run: |
-          bash ops/pipeline/build-cuda-with-rmm.sh xgb-ci.gpu_dev_ver
+          bash ops/pipeline/build-cuda-with-rmm.sh xgb-ci.gpu_build_rockylinux8_dev_ver

   build-manylinux2014:
     name: Build manylinux2014_${{ matrix.arch }} wheel
@@ -210,9 +193,7 @@
       with:
         submodules: "true"
       - name: Fetch container from cache
-        run: bash ops/docker_build.sh
-        env:
-          CONTAINER_ID: xgb-ci.manylinux2014_${{ matrix.arch }}
+        run: bash ops/docker_build.sh xgb-ci.manylinux2014_${{ matrix.arch }}
       - run: bash ops/pipeline/build-manylinux2014.sh ${{ matrix.arch }}
       - name: Upload Python wheel
         run: |
@@ -236,9 +217,7 @@
       with:
         submodules: "true"
       - name: Fetch container from cache
-        run: bash ops/docker_build.sh
-        env:
-          CONTAINER_ID: xgb-ci.gpu_build_r_rockylinux8
+        run: bash ops/docker_build.sh xgb-ci.gpu_build_r_rockylinux8
       - run: bash ops/pipeline/build-gpu-rpkg.sh
       - name: Upload R tarball
         run: |
@@ -276,9 +255,7 @@
       with:
         submodules: "true"
       - name: Fetch container from cache
-        run: bash ops/docker_build.sh
-        env:
-          CONTAINER_ID: xgb-ci.gpu
+        run: bash ops/docker_build.sh xgb-ci.gpu
       - name: Unstash gtest
         run: |
           bash ops/pipeline/stash-artifacts.sh unstash ${{ matrix.artifact_from }} \
@@ -335,9 +312,7 @@
       with:
         submodules: "true"
       - name: Fetch container from cache
-        run: bash ops/docker_build.sh
-        env:
-          CONTAINER_ID: ${{ matrix.container }}
+        run: bash ops/docker_build.sh ${{ matrix.container }}
       - name: Unstash Python wheel
         run: |
           bash ops/pipeline/stash-artifacts.sh unstash ${{ matrix.artifact_from }} \
diff --git a/doc/contrib/ci.rst b/doc/contrib/ci.rst
index df7881ff66a8..506f62e147c5 100644
--- a/doc/contrib/ci.rst
+++ b/doc/contrib/ci.rst
@@ -64,9 +64,8 @@ For your convenience, we provide three wrapper scripts:

 .. code-block:: bash

-  export CONTAINER_ID="ID of the container"
   export BRANCH_NAME="master"  # Relevant for CI, for local testing, use "master"
-  bash ops/docker_build.sh
+  bash ops/docker_build.sh CONTAINER_ID

 where ``CONTAINER_ID`` identifies the container. The wrapper script will look
 up the YAML file ``ops/docker/ci_container.yml``. For example, when ``CONTAINER_ID`` is set to ``xgb-ci.gpu``,
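The hunk above documents the new one-argument interface. As a minimal local sketch
(assuming ``xgb-ci.gpu`` remains defined in ``ops/docker/ci_container.yml``, with
``BRANCH_NAME`` still supplied via the environment as the script requires):

.. code-block:: bash

   # The container ID is now a positional argument; BRANCH_NAME stays in the environment.
   export BRANCH_NAME="master"   # use "master" for local testing
   bash ops/docker_build.sh xgb-ci.gpu

The same invocation pattern appears throughout the reworked workflows, e.g.
``bash ops/docker_build.sh xgb-ci.gpu_build_rockylinux8``.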
diff --git a/ops/docker/ci_container.yml b/ops/docker/ci_container.yml
index f5eb7eb982df..348bf90f8a1f 100644
--- a/ops/docker/ci_container.yml
+++ b/ops/docker/ci_container.yml
@@ -14,6 +14,13 @@ xgb-ci.gpu_build_rockylinux8:
     NCCL_VERSION_ARG: "2.23.4-1"
     RAPIDS_VERSION_ARG: *rapids_version

+xgb-ci.gpu_build_rockylinux8_dev_ver:
+  container_def: gpu_build_rockylinux8
+  build_args:
+    CUDA_VERSION_ARG: "12.4.1"
+    NCCL_VERSION_ARG: "2.23.4-1"
+    RAPIDS_VERSION_ARG: *dev_rapids_version
+
 xgb-ci.gpu_build_r_rockylinux8:
   container_def: gpu_build_r_rockylinux8
   build_args:
diff --git a/ops/docker/dockerfile/Dockerfile.gpu_build_rockylinux8 b/ops/docker/dockerfile/Dockerfile.gpu_build_rockylinux8
index ae79e88b15b3..b686bfbb2b0d 100644
--- a/ops/docker/dockerfile/Dockerfile.gpu_build_rockylinux8
+++ b/ops/docker/dockerfile/Dockerfile.gpu_build_rockylinux8
@@ -53,7 +53,7 @@ RUN git clone -b v1.65.4 https://github.com/grpc/grpc.git \
 # Install RMM
 # Patch out -Werror
 # Patch CCCL 2.5.0 to apply https://github.com/NVIDIA/cccl/pull/1957
-RUN git clone -b v${RAPIDS_VERSION_ARG}.00 https://github.com/rapidsai/rmm.git --recurse-submodules --depth 1 && \
+RUN git clone -b branch-${RAPIDS_VERSION_ARG} https://github.com/rapidsai/rmm.git --recurse-submodules --depth 1 && \
     pushd rmm && \
     find . -name CMakeLists.txt -print0 | xargs -0 sed -i 's/-Werror//g' && \
     mkdir build && \
diff --git a/ops/docker_build.sh b/ops/docker_build.sh
index 0539f817ba8e..7d83daec9574 100755
--- a/ops/docker_build.sh
+++ b/ops/docker_build.sh
@@ -4,12 +4,13 @@
 ## Build-time variables (--build-arg) and container definition are fetched from
 ## ops/docker/ci_container.yml.
 ##
-## Note. This script takes in all inputs via environment variables.
+## Note. This script takes in some inputs via environment variables.

-INPUT_DOC=$(
+USAGE_DOC=$(
 cat <<-EOF
-Inputs
-  - CONTAINER_ID: String ID uniquely identifying the container (Required)
+Usage: ops/docker_build.sh [container_id]
+
+In addition, the following environment variables should be set.
- BRANCH_NAME: Name of the current git branch or pull request (Required) - USE_DOCKER_CACHE: If set to 1, enable caching EOF @@ -38,15 +39,22 @@ EOF set -euo pipefail -for arg in "CONTAINER_ID" "BRANCH_NAME" +for arg in "BRANCH_NAME" do if [[ -z "${!arg:-}" ]] then - echo -e "Error: $arg must be set.\n${INPUT_DOC}" + echo -e "Error: $arg must be set.\n\n${USAGE_DOC}" exit 1 fi done +if [[ "$#" -lt 1 ]] +then + echo "${USAGE_DOC}" + exit 2 +fi +CONTAINER_ID="$1" + # Fetch CONTAINER_DEF and BUILD_ARGS source <(ops/docker/extract_build_args.sh ${CONTAINER_ID} | tee /dev/stderr) 2>&1 diff --git a/ops/pipeline/build-cuda-with-rmm.sh b/ops/pipeline/build-cuda-with-rmm.sh index c6b33b0588d5..479c9a1b1a28 100755 --- a/ops/pipeline/build-cuda-with-rmm.sh +++ b/ops/pipeline/build-cuda-with-rmm.sh @@ -1,7 +1,7 @@ #!/bin/bash ## Build XGBoost with CUDA + RMM support -set -euox pipefail +set -euo pipefail if [[ -z "${GITHUB_SHA:-}" ]] then @@ -18,6 +18,8 @@ container_id="$1" source ops/pipeline/classify-git-branch.sh +set -x + WHEEL_TAG=manylinux_2_28_x86_64 echo "--- Build with CUDA with RMM" From 63e02bd1663d0c9ff0ffd0511d5c962e4d59b239 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Fri, 29 Nov 2024 00:47:54 -0800 Subject: [PATCH 83/86] minor formatting --- .github/workflows/r_nold.yml | 3 --- .github/workflows/r_tests.yml | 15 --------------- 2 files changed, 18 deletions(-) diff --git a/.github/workflows/r_nold.yml b/.github/workflows/r_nold.yml index 89f079fc1df0..da01f39f650b 100644 --- a/.github/workflows/r_nold.yml +++ b/.github/workflows/r_nold.yml @@ -26,16 +26,13 @@ jobs: shell: bash run: | apt update && apt install libcurl4-openssl-dev libssl-dev libssh2-1-dev libgit2-dev libglpk-dev libxml2-dev libharfbuzz-dev libfribidi-dev git -y - - uses: actions/checkout@v4 with: submodules: 'true' - - name: Install dependencies shell: bash -l {0} run: | /tmp/R-devel/bin/Rscript -e "source('./R-package/tests/helper_scripts/install_deps.R')" - - name: Run R tests shell: bash run: | diff --git a/.github/workflows/r_tests.yml b/.github/workflows/r_tests.yml index f88f9bd2d833..fc0245f5752e 100644 --- a/.github/workflows/r_tests.yml +++ b/.github/workflows/r_tests.yml @@ -30,7 +30,6 @@ jobs: build: cmake env: R_REMOTES_NO_ERRORS_FROM_WARNINGS: true - steps: - name: Install system dependencies run: | @@ -40,35 +39,28 @@ jobs: - uses: actions/checkout@v4 with: submodules: 'true' - - uses: r-lib/actions/setup-r@v2 with: r-version: ${{ matrix.r }} - - name: Cache R packages uses: actions/cache@v4 with: path: ${{ env.R_LIBS_USER }} key: ${{ runner.os }}-r-${{ matrix.r }}-7-${{ hashFiles('R-package/DESCRIPTION') }} restore-keys: ${{ runner.os }}-r-${{ matrix.r }}-7-${{ hashFiles('R-package/DESCRIPTION') }} - - uses: actions/setup-python@v5 with: python-version: "3.10" architecture: 'x64' - - uses: r-lib/actions/setup-tinytex@v2 - - name: Install dependencies shell: Rscript {0} run: | source("./R-package/tests/helper_scripts/install_deps.R") - - name: Test R run: | python ops/script/test_r_package.py --compiler='${{ matrix.compiler }}' --build-tool="${{ matrix.build }}" --task=check if: matrix.compiler != 'none' - - name: Test R run: | python ops/script/test_r_package.py --build-tool="${{ matrix.build }}" --task=check @@ -79,39 +71,32 @@ jobs: runs-on: ubuntu-latest container: image: rhub/debian-gcc-release - steps: - name: Install system dependencies run: | # Must run before checkout to have the latest git installed. # No need to add pandoc, the container has it figured out. 
apt update && apt install libcurl4-openssl-dev libssl-dev libssh2-1-dev libgit2-dev libglpk-dev libxml2-dev libharfbuzz-dev libfribidi-dev git -y - - name: Trust git cloning project sources run: | git config --global --add safe.directory "${GITHUB_WORKSPACE}" - - uses: actions/checkout@v4 with: submodules: 'true' - - name: Install dependencies shell: bash -l {0} run: | Rscript -e "source('./R-package/tests/helper_scripts/install_deps.R')" - - name: Test R shell: bash -l {0} run: | python3 ops/script/test_r_package.py --r=/usr/bin/R --build-tool=autotools --task=check - - uses: dorny/paths-filter@v3 id: changes with: filters: | r_package: - 'R-package/**' - - name: Run document check if: steps.changes.outputs.r_package == 'true' run: | From e293a51a91ef8bbfa3164a56e45fc7f53d2c1d55 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Sat, 30 Nov 2024 18:39:45 -0800 Subject: [PATCH 84/86] Run with --privileged --- ops/pipeline/test-jvm-gpu.sh | 2 +- ops/pipeline/test-python-wheel.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ops/pipeline/test-jvm-gpu.sh b/ops/pipeline/test-jvm-gpu.sh index b7bf480e5ff3..380db97c787c 100755 --- a/ops/pipeline/test-jvm-gpu.sh +++ b/ops/pipeline/test-jvm-gpu.sh @@ -28,5 +28,5 @@ set -x python3 ops/docker_run.py --container-id xgb-ci.jvm_gpu_build --use-gpus \ -- nvidia-smi python3 ops/docker_run.py --container-id xgb-ci.jvm_gpu_build --use-gpus \ - --run-args "-e SCALA_VERSION=${SCALA_VERSION} -e USE_CUDA=1 -e SKIP_NATIVE_BUILD=1" \ + --run-args "-e SCALA_VERSION=${SCALA_VERSION} -e USE_CUDA=1 -e SKIP_NATIVE_BUILD=1 --shm-size=4g --privileged" \ -- ops/pipeline/build-test-jvm-packages-impl.sh diff --git a/ops/pipeline/test-python-wheel.sh b/ops/pipeline/test-python-wheel.sh index b49c918c4da5..b4dd59b7cb0e 100755 --- a/ops/pipeline/test-python-wheel.sh +++ b/ops/pipeline/test-python-wheel.sh @@ -21,5 +21,5 @@ fi set -x python3 ops/docker_run.py --container-id "${container_id}" ${gpu_option} \ - --run-args='--shm-size=4g' \ + --run-args='--shm-size=4g --privileged' \ -- bash ops/pipeline/test-python-wheel-impl.sh "${suite}" From 480dde407a9223d8196c17eb3aa0b7fef650c09d Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Sat, 30 Nov 2024 18:40:36 -0800 Subject: [PATCH 85/86] Revert "Disable the host numa virtual memory allocator for now. (#10934)" This reverts commit f4d94a19b903d4bfd6458b90f0f8201616f2765d. --- src/common/device_helpers.cu | 5 ----- tests/cpp/common/test_device_vector.cu | 11 ++++++++--- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/common/device_helpers.cu b/src/common/device_helpers.cu index 01e81b16ee0b..608a535cd8cb 100644 --- a/src/common/device_helpers.cu +++ b/src/common/device_helpers.cu @@ -7,11 +7,6 @@ namespace dh { PinnedMemory::PinnedMemory() { - // Use the `GrowOnlyPinnedMemoryImpl` as the only option for now. 
- // See https://github.com/dmlc/xgboost/issues/10933
- this->impl_.emplace();
- return;
-
 #if defined(xgboost_IS_WIN)
   this->impl_.emplace();
 #else
diff --git a/tests/cpp/common/test_device_vector.cu b/tests/cpp/common/test_device_vector.cu
index 97ee39b31a1e..6f4c34edfa9f 100644
--- a/tests/cpp/common/test_device_vector.cu
+++ b/tests/cpp/common/test_device_vector.cu
@@ -32,9 +32,6 @@ class TestVirtualMem : public ::testing::TestWithParam {
  public:
   void Run() {
     auto type = this->GetParam();
-    if (type == CU_MEM_LOCATION_TYPE_HOST_NUMA) {
-      GTEST_SKIP_("Host numa might require special system capabilities, skipping for now.");
-    }
     detail::GrowOnlyVirtualMemVec vec{type};
     auto prop = xgboost::cudr::MakeAllocProp(type);
     auto gran = xgboost::cudr::GetAllocGranularity(&prop);
@@ -114,7 +111,15 @@ TEST(TestVirtualMem, Version) {
   xgboost::curt::DrVersion(&major, &minor);
   LOG(INFO) << "Latest supported CUDA version by the driver:" << major << "." << minor;
   PinnedMemory pinned;
+#if defined(xgboost_IS_WIN)
   ASSERT_FALSE(pinned.IsVm());
+#else  // defined(xgboost_IS_WIN)
+  if (major >= 12 && minor >= 5) {
+    ASSERT_TRUE(pinned.IsVm());
+  } else {
+    ASSERT_FALSE(pinned.IsVm());
+  }
+#endif  // defined(xgboost_IS_WIN)
 }

 TEST(AtomitFetch, Max) {

From e293a51a91ef8bbfa3164a56e45fc7f53d2c1d55 Mon Sep 17 00:00:00 2001
From: Hyunsu Cho
Date: Mon, 2 Dec 2024 19:48:23 -0800
Subject: [PATCH 86/86] Update doc

---
 doc/contrib/ci.rst | 28 ++++++++++++++++++++++++++--
 1 file changed, 26 insertions(+), 2 deletions(-)

diff --git a/doc/contrib/ci.rst b/doc/contrib/ci.rst
index 506f62e147c5..d6effa0b09d4 100644
--- a/doc/contrib/ci.rst
+++ b/doc/contrib/ci.rst
@@ -35,6 +35,19 @@ access to various amount of CPUs, memory, and NVIDIA GPUs. Thanks to this app, w
 GPU-accelerated and distributed algorithms of XGBoost while using the familiar
 interface of GitHub Actions.

+In GitHub Actions, jobs run on Microsoft-hosted runners by default.
+To opt into self-hosted runners (enabled by RunsOn), we use the following special syntax:
+
+.. code-block:: yaml
+
+    runs-on:
+      - runs-on
+      - runner=runner-name
+      - run-id=${{ github.run_id }}
+      - tag=[tag that uniquely identifies the job in the GH Action workflow]
+
+where the runner is defined in ``.github/runs-on.yml``.
+
 *********************************************************
 Reproduce CI testing environments using Docker containers
 *********************************************************
@@ -201,7 +214,7 @@ The CI pipelines are organized into the following directories and files:
 * ``.github/runs-on.yml``: Configuration for the RunsOn service. Specifies the spec
   for the self-hosted CI runners.
 * ``ops/conda_env/``: Definitions for Conda environments
-* ``ops/packer/``: Packer scripts to build machine images for Amazon EC2
+* ``ops/packer/``: Packer scripts to build VM images for Amazon EC2
 * ``ops/patch/``: Patch files
 * ``ops/pipeline/``: Shell scripts defining CI/CD pipelines. Most of these scripts
   can be run locally (to assist with development and debugging); a few must run in
   the CI.
@@ -250,7 +263,18 @@ Primitives used in the CI pipelines
 Build and run containers
 ------------------------

-See :ref:`build_run_docker_locally`.
+See :ref:`build_run_docker_locally` to learn about the utility scripts for building and
+using containers.
+
+**What's the relationship between the VM image (for Amazon EC2) and the container image?**
+In ``ops/packer/`` directory, we define Packer scripts to build VM images for Amazon EC2.
+The VM image contains the minimal set of drivers and system software that are needed to
+run the containers.
+
+We update container images much more often than VM images. Whereas VM images are
+updated sparingly (once every few months), container images are updated each time a
+branch or a pull request is updated. This way, developers can make changes to
+containers and see the results immediately in the CI run.

------------------------------------------
Stash artifacts, to move them between jobs
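For reference, the stash/unstash interface that this section covers follows
``Usage: $0 {stash,unstash} [remote_prefix] [artifact] [artifact ...]``. A sketch of a
round trip, using a prefix and artifact that appear in the workflows reworked above:

.. code-block:: bash

   # In the producing job: stash the gtest binary under the prefix "build-cuda-with-rmm".
   bash ops/pipeline/stash-artifacts.sh stash build-cuda-with-rmm build/testxgboost
   # In a downstream job: restore it using the same prefix and artifact path.
   bash ops/pipeline/stash-artifacts.sh unstash build-cuda-with-rmm build/testxgboost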