From 6d206b15b4c5d3659b759fd9c40eeee1bd7e0396 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Mon, 2 Dec 2024 19:09:52 -0800 Subject: [PATCH 01/12] [Don't merge] A small workflow example with RunsOn --- .github/workflows/freebsd.yml | 34 -- .github/workflows/i386.yml | 43 --- .github/workflows/jvm_tests.yml | 100 ----- .github/workflows/main.yml | 278 ++++++-------- .github/workflows/python_tests.yml | 348 ------------------ .github/workflows/python_wheels.yml | 55 --- .github/workflows/r_nold.yml | 44 --- .github/workflows/r_tests.yml | 150 -------- .github/workflows/scorecards.yml | 54 --- .github/workflows/update_rapids.yml | 44 --- ops/docker/ci_container.yml | 25 ++ ops/docker/docker_cache_ecr.yml | 4 + ops/docker/dockerfile/Dockerfile.gpu | 54 +++ .../Dockerfile.gpu_build_rockylinux8 | 82 +++++ .../Dockerfile.manylinux_2_28_x86_64 | 15 + ops/docker/entrypoint.sh | 45 +++ ops/docker/extract_build_args.jq | 12 + ops/docker/extract_build_args.sh | 26 ++ ops/docker_build.py | 137 +++++++ ops/docker_build.sh | 149 ++++++++ ops/docker_run.py | 168 +++++++++ ops/pipeline/build-cuda.sh | 85 +++++ ops/pipeline/stash-artifacts.py | 144 ++++++++ ops/pipeline/stash-artifacts.sh | 36 ++ ops/pipeline/test-cpp-gpu.sh | 42 +++ ops/pipeline/test-python-wheel-impl.sh | 74 ++++ ops/pipeline/test-python-wheel.sh | 25 ++ tests/buildkite/pipeline-mac-m1.yml | 13 - tests/buildkite/pipeline-mgpu.yml | 48 --- tests/buildkite/pipeline-nightly.yml | 43 --- tests/buildkite/pipeline-win64.yml | 24 -- tests/buildkite/pipeline.yml | 113 ------ 32 files changed, 1233 insertions(+), 1281 deletions(-) delete mode 100644 .github/workflows/freebsd.yml delete mode 100644 .github/workflows/i386.yml delete mode 100644 .github/workflows/jvm_tests.yml delete mode 100644 .github/workflows/python_tests.yml delete mode 100644 .github/workflows/python_wheels.yml delete mode 100644 .github/workflows/r_nold.yml delete mode 100644 .github/workflows/r_tests.yml delete mode 100644 .github/workflows/scorecards.yml delete mode 100644 .github/workflows/update_rapids.yml create mode 100644 ops/docker/ci_container.yml create mode 100644 ops/docker/docker_cache_ecr.yml create mode 100644 ops/docker/dockerfile/Dockerfile.gpu create mode 100644 ops/docker/dockerfile/Dockerfile.gpu_build_rockylinux8 create mode 100644 ops/docker/dockerfile/Dockerfile.manylinux_2_28_x86_64 create mode 100644 ops/docker/entrypoint.sh create mode 100644 ops/docker/extract_build_args.jq create mode 100755 ops/docker/extract_build_args.sh create mode 100644 ops/docker_build.py create mode 100755 ops/docker_build.sh create mode 100644 ops/docker_run.py create mode 100755 ops/pipeline/build-cuda.sh create mode 100644 ops/pipeline/stash-artifacts.py create mode 100755 ops/pipeline/stash-artifacts.sh create mode 100755 ops/pipeline/test-cpp-gpu.sh create mode 100755 ops/pipeline/test-python-wheel-impl.sh create mode 100755 ops/pipeline/test-python-wheel.sh delete mode 100644 tests/buildkite/pipeline-mac-m1.yml delete mode 100644 tests/buildkite/pipeline-mgpu.yml delete mode 100644 tests/buildkite/pipeline-nightly.yml delete mode 100644 tests/buildkite/pipeline-win64.yml delete mode 100644 tests/buildkite/pipeline.yml diff --git a/.github/workflows/freebsd.yml b/.github/workflows/freebsd.yml deleted file mode 100644 index d3208a1294d1..000000000000 --- a/.github/workflows/freebsd.yml +++ /dev/null @@ -1,34 +0,0 @@ -name: FreeBSD - -on: [push, pull_request] - -permissions: - contents: read # to fetch code (actions/checkout) - -concurrency: - group: ${{ github.workflow 
}}-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: true - -jobs: - test: - runs-on: ubuntu-latest - timeout-minutes: 20 - name: A job to run test in FreeBSD - steps: - - uses: actions/checkout@v4 - with: - submodules: 'true' - - name: Test in FreeBSD - id: test - uses: vmactions/freebsd-vm@v1 - with: - usesh: true - prepare: | - pkg install -y cmake git ninja googletest - - run: | - mkdir build - cd build - cmake .. -GNinja -DGOOGLE_TEST=ON - ninja -v - ./testxgboost diff --git a/.github/workflows/i386.yml b/.github/workflows/i386.yml deleted file mode 100644 index aec7e9d31087..000000000000 --- a/.github/workflows/i386.yml +++ /dev/null @@ -1,43 +0,0 @@ -name: XGBoost-i386-test - -on: [push, pull_request] - -permissions: - contents: read # to fetch code (actions/checkout) - -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: true - -jobs: - build-32bit: - name: Build 32-bit - runs-on: ubuntu-latest - services: - registry: - image: registry:2 - ports: - - 5000:5000 - steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3.7.1 - with: - driver-opts: network=host - - name: Build and push container - uses: docker/build-push-action@v6 - with: - context: . - file: tests/ci_build/Dockerfile.i386 - push: true - tags: localhost:5000/xgboost/build-32bit:latest - cache-from: type=gha - cache-to: type=gha,mode=max - - name: Build XGBoost - run: | - docker run --rm -v $PWD:/workspace -w /workspace \ - -e CXXFLAGS='-Wno-error=overloaded-virtual -Wno-error=maybe-uninitialized -Wno-error=redundant-move' \ - localhost:5000/xgboost/build-32bit:latest \ - tests/ci_build/build_via_cmake.sh diff --git a/.github/workflows/jvm_tests.yml b/.github/workflows/jvm_tests.yml deleted file mode 100644 index 945f362685a4..000000000000 --- a/.github/workflows/jvm_tests.yml +++ /dev/null @@ -1,100 +0,0 @@ -name: XGBoost-JVM-Tests - -on: [push, pull_request] - -permissions: - contents: read # to fetch code (actions/checkout) - -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: true - -jobs: - test-with-jvm: - name: Test JVM on OS ${{ matrix.os }} - timeout-minutes: 30 - runs-on: ${{ matrix.os }} - strategy: - fail-fast: false - matrix: - os: [windows-latest, ubuntu-latest, macos-13] - - steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - - uses: actions/setup-java@8df1039502a15bceb9433410b1a100fbe190c53b # v4.5.0 - with: - distribution: 'temurin' - java-version: '8' - - - uses: conda-incubator/setup-miniconda@d2e6a045a86077fb6cad6f5adf368e9076ddaa8d # v3.1.0 - with: - miniforge-variant: Miniforge3 - miniforge-version: latest - activate-environment: jvm_tests - environment-file: tests/ci_build/conda_env/jvm_tests.yml - use-mamba: true - - - name: Cache Maven packages - uses: actions/cache@6849a6489940f00c2f30c0fb92c6274307ccb58a # v4.1.2 - with: - path: ~/.m2 - key: ${{ runner.os }}-m2-${{ hashFiles('./jvm-packages/pom.xml') }} - restore-keys: ${{ runner.os }}-m2-${{ hashFiles('./jvm-packages/pom.xml') }} - - - name: Test XGBoost4J (Core) - run: | - cd jvm-packages - mvn test -B -pl :xgboost4j_2.12 - - - name: Test XGBoost4J (Core, Spark, Examples) - run: | - rm -rfv build/ - cd jvm-packages - mvn -B test - if: matrix.os == 'ubuntu-latest' # Distributed training doesn't 
work on Windows - - - name: Extract branch name - shell: bash - run: | - echo "branch=${GITHUB_REF#refs/heads/}" >> "$GITHUB_OUTPUT" - id: extract_branch - if: | - (github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')) && - (matrix.os == 'windows-latest' || matrix.os == 'macos-13') - - - name: Publish artifact xgboost4j.dll to S3 - run: | - cd lib/ - Rename-Item -Path xgboost4j.dll -NewName xgboost4j_${{ github.sha }}.dll - dir - python -m awscli s3 cp xgboost4j_${{ github.sha }}.dll s3://xgboost-nightly-builds/${{ steps.extract_branch.outputs.branch }}/libxgboost4j/ --acl public-read --region us-west-2 - if: | - (github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')) && - matrix.os == 'windows-latest' - env: - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID_IAM_S3_UPLOADER }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY_IAM_S3_UPLOADER }} - - - name: Publish artifact libxgboost4j.dylib to S3 - shell: bash -l {0} - run: | - cd lib/ - mv -v libxgboost4j.dylib libxgboost4j_${{ github.sha }}.dylib - ls - python -m awscli s3 cp libxgboost4j_${{ github.sha }}.dylib s3://xgboost-nightly-builds/${{ steps.extract_branch.outputs.branch }}/libxgboost4j/ --acl public-read --region us-west-2 - if: | - (github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')) && - matrix.os == 'macos-13' - env: - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID_IAM_S3_UPLOADER }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY_IAM_S3_UPLOADER }} - - - name: Build and Test XGBoost4J with scala 2.13 - run: | - rm -rfv build/ - cd jvm-packages - mvn -B clean install test -Pdefault,scala-2.13 - if: matrix.os == 'ubuntu-latest' # Distributed training doesn't work on Windows diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index d1395c15f77e..e2b50019c54d 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -1,193 +1,135 @@ -# This is a basic workflow to help you get started with Actions +name: XGBoost CI -name: XGBoost-CI - -# Controls when the action will run. Triggers the workflow on push or pull request -# events but only for the master branch on: [push, pull_request] permissions: - contents: read # to fetch code (actions/checkout) + contents: read # to fetch code (actions/checkout) concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} cancel-in-progress: true -# A workflow run is made up of one or more jobs that can run sequentially or in parallel +env: + BRANCH_NAME: >- + ${{ github.event.pull_request.number && 'PR-' }}${{ github.event.pull_request.number || github.ref_name }} + USE_DOCKER_CACHE: 1 + jobs: - gtest-cpu: - name: Test Google C++ test (CPU) - runs-on: ${{ matrix.os }} + build-containers: + name: Build CI containers (${{ matrix.container_id }}) + runs-on: + - runs-on + - runner=${{ matrix.runner }} + - run-id=${{ github.run_id }} + - tag=main-build-containers-${{ matrix.container_id }} strategy: - fail-fast: false matrix: - os: [macos-12] + container_id: + - xgb-ci.gpu_build_rockylinux8 + - xgb-ci.gpu + - xgb-ci.manylinux_2_28_x86_64 + runner: [linux-amd64-cpu] steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - name: Install system packages - run: | - brew install ninja libomp - - name: Build gtest binary - run: | - mkdir build - cd build - cmake .. 
-DGOOGLE_TEST=ON -DUSE_OPENMP=ON -DUSE_DMLC_GTEST=ON -GNinja -DBUILD_DEPRECATED_CLI=ON -DUSE_SANITIZER=ON -DENABLED_SANITIZERS=address -DCMAKE_BUILD_TYPE=RelWithDebInfo - ninja -v - - name: Run gtest binary - run: | - cd build - ./testxgboost - ctest -R TestXGBoostCLI --extra-verbose + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4 + with: + submodules: "true" + - name: Build ${{ matrix.container_id }} + run: bash ops/docker_build.sh ${{ matrix.container_id }} - gtest-cpu-nonomp: - name: Test Google C++ unittest (CPU Non-OMP) - runs-on: ${{ matrix.os }} - strategy: - fail-fast: false - matrix: - os: [ubuntu-latest] + build-cuda: + name: Build CUDA + manylinux_2_28_x86_64 wheel + needs: build-containers + runs-on: + - runs-on=${{ github.run_id }} + - runner=linux-amd64-cpu + - tag=main-build-cuda steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - name: Install system packages - run: | - sudo apt-get install -y --no-install-recommends ninja-build - - name: Build and install XGBoost - shell: bash -l {0} - run: | - mkdir build - cd build - cmake .. -GNinja -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DUSE_OPENMP=OFF -DBUILD_DEPRECATED_CLI=ON - ninja -v - - name: Run gtest binary - run: | - cd build - ctest --extra-verbose + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4 + with: + submodules: "true" + - name: Fetch container from cache + run: bash ops/docker_build.sh xgb-ci.gpu_build_rockylinux8 + - name: Fetch container from cache + run: bash ops/docker_build.sh xgb-ci.manylinux_2_28_x86_64 + - run: bash ops/pipeline/build-cuda.sh + - name: Stash files + run: | + bash ops/pipeline/stash-artifacts.sh stash build-cuda \ + build/testxgboost ./xgboost python-package/dist/*.whl - gtest-cpu-sycl: - name: Test Google C++ unittest (CPU SYCL) - runs-on: ${{ matrix.os }} + test-cpp-gpu: + name: >- + Run Google Tests with GPUs + (Suite ${{ matrix.suite }}, Runner ${{ matrix.runner }}) + needs: [build-cuda] + runs-on: + - runs-on + - runner=${{ matrix.runner }} + - run-id=${{ github.run_id }} + - tag=main-test-cpp-gpu-${{ matrix.suite }} strategy: fail-fast: false matrix: - os: [ubuntu-latest] - python-version: ["3.10"] + include: + - suite: gpu + runner: linux-amd64-gpu + artifact_from: build-cuda + - suite: mgpu + runner: linux-amd64-mgpu + artifact_from: build-cuda steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - uses: conda-incubator/setup-miniconda@d2e6a045a86077fb6cad6f5adf368e9076ddaa8d # v3.1.0 - with: - miniforge-variant: Miniforge3 - miniforge-version: latest - activate-environment: linux_sycl_test - environment-file: tests/ci_build/conda_env/linux_sycl_test.yml - use-mamba: true - - name: Display Conda env - run: | - conda info - conda list - - name: Build and install XGBoost - shell: bash -l {0} - run: | - mkdir build - cd build - cmake .. 
-DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DPLUGIN_SYCL=ON -DCMAKE_CXX_COMPILER=g++ -DCMAKE_C_COMPILER=gcc -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX - make -j$(nproc) - - name: Run gtest binary for SYCL - run: | - cd build - ./testxgboost --gtest_filter=Sycl* - - name: Run gtest binary for non SYCL - run: | - cd build - ./testxgboost --gtest_filter=-Sycl* + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4 + with: + submodules: "true" + - name: Fetch container from cache + run: bash ops/docker_build.sh xgb-ci.gpu + - name: Unstash gtest + run: | + bash ops/pipeline/stash-artifacts.sh unstash ${{ matrix.artifact_from }} \ + build/testxgboost + chmod +x build/testxgboost + - run: bash ops/pipeline/test-cpp-gpu.sh ${{ matrix.suite }} - c-api-demo: - name: Test installing XGBoost lib + building the C API demo - runs-on: ${{ matrix.os }} - defaults: - run: - shell: bash -l {0} + test-python-wheel: + name: Run Python tests (${{ matrix.description }}) + needs: [build-cuda] + runs-on: + - runs-on + - runner=${{ matrix.runner }} + - run-id=${{ github.run_id }} + - tag=main-test-python-wheel-${{ matrix.description }} strategy: fail-fast: false matrix: - os: ["ubuntu-latest"] - python-version: ["3.10"] - steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - uses: conda-incubator/setup-miniconda@d2e6a045a86077fb6cad6f5adf368e9076ddaa8d # v3.1.0 - with: - miniforge-variant: Miniforge3 - miniforge-version: latest - activate-environment: cpp_test - environment-file: tests/ci_build/conda_env/cpp_test.yml - use-mamba: true - - name: Display Conda env - run: | - conda info - conda list - - - name: Build and install XGBoost static library - run: | - mkdir build - cd build - cmake .. -DBUILD_STATIC_LIB=ON -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX -GNinja - ninja -v install - cd - - - name: Build and run C API demo with static - run: | - pushd . - cd demo/c-api/ - mkdir build - cd build - cmake .. -GNinja -DCMAKE_PREFIX_PATH=$CONDA_PREFIX - ninja -v - ctest - cd .. - rm -rf ./build - popd - - - name: Build and install XGBoost shared library - run: | - cd build - cmake .. -DBUILD_STATIC_LIB=OFF -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX -GNinja -DPLUGIN_FEDERATED=ON -DGOOGLE_TEST=ON - ninja -v install - ./testxgboost - cd - - - name: Build and run C API demo with shared - run: | - pushd . - cd demo/c-api/ - mkdir build - cd build - cmake .. 
-GNinja -DCMAKE_PREFIX_PATH=$CONDA_PREFIX - ninja -v - ctest - popd - ./tests/ci_build/verify_link.sh ./demo/c-api/build/basic/api-demo - ./tests/ci_build/verify_link.sh ./demo/c-api/build/external-memory/external-memory-demo - - cpp-lint: - runs-on: ubuntu-latest - name: Code linting for C++ + include: + - description: single-gpu + container: xgb-ci.gpu + suite: gpu + runner: linux-amd64-gpu + artifact_from: build-cuda + - description: multiple-gpu + container: xgb-ci.gpu + suite: mgpu + runner: linux-amd64-mgpu + artifact_from: build-cuda steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 - with: - python-version: "3.10" - architecture: 'x64' - - name: Install Python packages - run: | - python -m pip install wheel setuptools cmakelint cpplint==1.6.1 pylint - - name: Run lint - run: | - python3 tests/ci_build/lint_cpp.py - sh ./tests/ci_build/lint_cmake.sh + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4 + with: + submodules: "true" + - name: Fetch container from cache + run: bash ops/docker_build.sh ${{ matrix.container }} + - name: Unstash Python wheel + run: | + bash ops/pipeline/stash-artifacts.sh unstash ${{ matrix.artifact_from }} \ + python-package/dist/*.whl ./xgboost + chmod +x ./xgboost + - name: Run Python tests, ${{ matrix.description }} + run: bash ops/pipeline/test-python-wheel.sh ${{ matrix.suite }} ${{ matrix.container }} diff --git a/.github/workflows/python_tests.yml b/.github/workflows/python_tests.yml deleted file mode 100644 index 8f0ab1c68262..000000000000 --- a/.github/workflows/python_tests.yml +++ /dev/null @@ -1,348 +0,0 @@ -name: XGBoost-Python-Tests - -on: [push, pull_request] - -permissions: - contents: read # to fetch code (actions/checkout) - -defaults: - run: - shell: bash -l {0} - -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: true - -jobs: - python-mypy-lint: - runs-on: ubuntu-latest - name: Type and format checks for the Python package - strategy: - matrix: - os: [ubuntu-latest] - steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - uses: conda-incubator/setup-miniconda@d2e6a045a86077fb6cad6f5adf368e9076ddaa8d # v3.1.0 - with: - miniforge-variant: Miniforge3 - miniforge-version: latest - activate-environment: python_lint - environment-file: tests/ci_build/conda_env/python_lint.yml - use-mamba: true - - name: Display Conda env - run: | - conda info - conda list - - name: Run mypy - run: | - python tests/ci_build/lint_python.py --format=0 --type-check=1 --pylint=0 - - name: Run formatter - run: | - python tests/ci_build/lint_python.py --format=1 --type-check=0 --pylint=0 - - name: Run pylint - run: | - python tests/ci_build/lint_python.py --format=0 --type-check=0 --pylint=1 - - python-sdist-test-on-Linux: - # Mismatched glibcxx version between system and conda forge. 
- runs-on: ${{ matrix.os }} - name: Test installing XGBoost Python source package on ${{ matrix.os }} - strategy: - matrix: - os: [ubuntu-latest] - steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - uses: conda-incubator/setup-miniconda@d2e6a045a86077fb6cad6f5adf368e9076ddaa8d # v3.1.0 - with: - miniforge-variant: Miniforge3 - miniforge-version: latest - activate-environment: sdist_test - environment-file: tests/ci_build/conda_env/sdist_test.yml - use-mamba: true - - name: Display Conda env - run: | - conda info - conda list - - name: Build and install XGBoost - run: | - cd python-package - python --version - python -m build --sdist - pip install -v ./dist/xgboost-*.tar.gz --config-settings use_openmp=False - cd .. - python -c 'import xgboost' - - python-sdist-test: - # Use system toolchain instead of conda toolchain for macos and windows. - # MacOS has linker error if clang++ from conda-forge is used - runs-on: ${{ matrix.os }} - name: Test installing XGBoost Python source package on ${{ matrix.os }} - strategy: - matrix: - os: [macos-13, windows-latest] - python-version: ["3.10"] - steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - name: Install osx system dependencies - if: matrix.os == 'macos-13' - run: | - brew install ninja libomp - - uses: conda-incubator/setup-miniconda@d2e6a045a86077fb6cad6f5adf368e9076ddaa8d # v3.1.0 - with: - auto-update-conda: true - python-version: ${{ matrix.python-version }} - activate-environment: test - - name: Install build - run: | - conda install -c conda-forge python-build - - name: Display Conda env - run: | - conda info - conda list - - name: Build and install XGBoost - run: | - cd python-package - python --version - python -m build --sdist - pip install -v ./dist/xgboost-*.tar.gz - cd .. - python -c 'import xgboost' - - python-tests-on-macos: - name: Test XGBoost Python package on ${{ matrix.config.os }} - runs-on: ${{ matrix.config.os }} - timeout-minutes: 60 - strategy: - matrix: - config: - - {os: macos-13} - - steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - - uses: conda-incubator/setup-miniconda@d2e6a045a86077fb6cad6f5adf368e9076ddaa8d # v3.1.0 - with: - miniforge-variant: Miniforge3 - miniforge-version: latest - activate-environment: macos_cpu_test - environment-file: tests/ci_build/conda_env/macos_cpu_test.yml - use-mamba: true - - - name: Display Conda env - run: | - conda info - conda list - - - name: Build XGBoost on macos - run: | - brew install ninja - - mkdir build - cd build - # Set prefix, to use OpenMP library from Conda env - # See https://github.com/dmlc/xgboost/issues/7039#issuecomment-1025038228 - # to learn why we don't use libomp from Homebrew. - cmake .. -GNinja -DCMAKE_PREFIX_PATH=$CONDA_PREFIX -DBUILD_DEPRECATED_CLI=ON - ninja - - - name: Install Python package - run: | - cd python-package - python --version - pip install -v . 
- - - name: Test Python package - run: | - pytest -s -v -rxXs --durations=0 ./tests/python - - - name: Test Dask Interface - run: | - pytest -s -v -rxXs --durations=0 ./tests/test_distributed/test_with_dask - - python-tests-on-win: - name: Test XGBoost Python package on ${{ matrix.config.os }} - runs-on: ${{ matrix.config.os }} - timeout-minutes: 60 - strategy: - matrix: - config: - - {os: windows-latest, python-version: '3.10'} - - steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - - uses: conda-incubator/setup-miniconda@d2e6a045a86077fb6cad6f5adf368e9076ddaa8d # v3.1.0 - with: - auto-update-conda: true - python-version: ${{ matrix.config.python-version }} - activate-environment: win64_env - environment-file: tests/ci_build/conda_env/win64_cpu_test.yml - - - name: Display Conda env - run: | - conda info - conda list - - - name: Build XGBoost on Windows - run: | - mkdir build_msvc - cd build_msvc - cmake .. -G"Visual Studio 17 2022" -DCMAKE_CONFIGURATION_TYPES="Release" -A x64 -DBUILD_DEPRECATED_CLI=ON - cmake --build . --config Release --parallel $(nproc) - - - name: Install Python package - run: | - cd python-package - python --version - pip wheel -v . --wheel-dir dist/ - pip install ./dist/*.whl - - - name: Test Python package - run: | - pytest -s -v -rxXs --durations=0 ./tests/python - - python-tests-on-ubuntu: - name: Test XGBoost Python package on ${{ matrix.config.os }} - runs-on: ${{ matrix.config.os }} - timeout-minutes: 90 - strategy: - matrix: - config: - - {os: ubuntu-latest, python-version: "3.10"} - - steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - - uses: conda-incubator/setup-miniconda@d2e6a045a86077fb6cad6f5adf368e9076ddaa8d # v3.1.0 - with: - miniforge-variant: Miniforge3 - miniforge-version: latest - activate-environment: linux_cpu_test - environment-file: tests/ci_build/conda_env/linux_cpu_test.yml - use-mamba: true - - - name: Display Conda env - run: | - conda info - conda list - - - name: Build XGBoost on Ubuntu - run: | - mkdir build - cd build - cmake .. -GNinja -DCMAKE_PREFIX_PATH=$CONDA_PREFIX -DBUILD_DEPRECATED_CLI=ON - ninja - - - name: Install Python package - run: | - cd python-package - python --version - pip install -v . - - - name: Test Python package - run: | - pytest -s -v -rxXs --durations=0 ./tests/python - - - name: Test Dask Interface - run: | - pytest -s -v -rxXs --durations=0 ./tests/test_distributed/test_with_dask - - - name: Test PySpark Interface - shell: bash -l {0} - run: | - pytest -s -v -rxXs --durations=0 ./tests/test_distributed/test_with_spark - - python-sycl-tests-on-ubuntu: - name: Test XGBoost Python package with SYCL on ${{ matrix.config.os }} - runs-on: ${{ matrix.config.os }} - timeout-minutes: 90 - strategy: - matrix: - config: - - {os: ubuntu-latest, python-version: "3.10"} - - steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - - uses: conda-incubator/setup-miniconda@d2e6a045a86077fb6cad6f5adf368e9076ddaa8d # v3.1.0 - with: - miniforge-variant: Miniforge3 - miniforge-version: latest - activate-environment: linux_sycl_test - environment-file: tests/ci_build/conda_env/linux_sycl_test.yml - use-mamba: true - - - name: Display Conda env - run: | - conda info - conda list - - name: Build XGBoost on Ubuntu - run: | - mkdir build - cd build - cmake .. 
-DPLUGIN_SYCL=ON -DCMAKE_CXX_COMPILER=g++ -DCMAKE_C_COMPILER=gcc -DCMAKE_PREFIX_PATH=$CONDA_PREFIX - make -j$(nproc) - - name: Install Python package - run: | - cd python-package - python --version - pip install -v . - - name: Test Python package - run: | - pytest -s -v -rxXs --durations=0 ./tests/python-sycl/ - - - python-system-installation-on-ubuntu: - name: Test XGBoost Python package System Installation on ${{ matrix.os }} - runs-on: ${{ matrix.os }} - strategy: - matrix: - os: [ubuntu-latest] - - steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - - name: Set up Python 3.10 - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 - with: - python-version: "3.10" - - - name: Install ninja - run: | - sudo apt-get update && sudo apt-get install -y ninja-build - - - name: Build XGBoost on Ubuntu - run: | - mkdir build - cd build - cmake .. -GNinja - ninja - - - name: Copy lib to system lib - run: | - cp lib/* "$(python -c 'import sys; print(sys.base_prefix)')/lib" - - - name: Install XGBoost in Virtual Environment - run: | - cd python-package - pip install virtualenv - virtualenv venv - source venv/bin/activate && \ - pip install -v . --config-settings use_system_libxgboost=True && \ - python -c 'import xgboost' diff --git a/.github/workflows/python_wheels.yml b/.github/workflows/python_wheels.yml deleted file mode 100644 index 1bbdedc3f9c6..000000000000 --- a/.github/workflows/python_wheels.yml +++ /dev/null @@ -1,55 +0,0 @@ -name: XGBoost-Python-Wheels - -on: [push, pull_request] - -permissions: - contents: read # to fetch code (actions/checkout) - -defaults: - run: - shell: bash -l {0} - -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: true - -jobs: - python-wheels: - name: Build wheel for ${{ matrix.platform_id }} - runs-on: ${{ matrix.os }} - strategy: - matrix: - include: - - os: macos-13 - platform_id: macosx_x86_64 - - os: macos-14 - platform_id: macosx_arm64 - steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - name: Set up homebrew - uses: Homebrew/actions/setup-homebrew@68fa6aeb1ccb0596d311f2b34ec74ec21ee68e54 - - name: Install libomp - run: brew install libomp - - uses: conda-incubator/setup-miniconda@d2e6a045a86077fb6cad6f5adf368e9076ddaa8d # v3.1.0 - with: - miniforge-variant: Miniforge3 - miniforge-version: latest - python-version: "3.10" - use-mamba: true - - name: Build wheels - run: bash tests/ci_build/build_python_wheels.sh ${{ matrix.platform_id }} ${{ github.sha }} - - name: Extract branch name - run: | - echo "branch=${GITHUB_REF#refs/heads/}" >> "$GITHUB_OUTPUT" - id: extract_branch - if: github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_') - - name: Upload Python wheel - if: github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_') - run: | - python -m pip install awscli - python -m awscli s3 cp wheelhouse/*.whl s3://xgboost-nightly-builds/${{ steps.extract_branch.outputs.branch }}/ --acl public-read --region us-west-2 - env: - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID_IAM_S3_UPLOADER }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY_IAM_S3_UPLOADER }} diff --git a/.github/workflows/r_nold.yml b/.github/workflows/r_nold.yml deleted file mode 100644 index 4b506927e06c..000000000000 --- a/.github/workflows/r_nold.yml +++ /dev/null @@ -1,44 +0,0 @@ -# Run expensive R tests with 
the help of rhub. Only triggered by a pull request review -# See discussion at https://github.com/dmlc/xgboost/pull/6378 - -name: XGBoost-R-noLD - -on: - pull_request_review_comment: - types: [created] - -permissions: - contents: read # to fetch code (actions/checkout) - -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: true - -jobs: - test-R-noLD: - if: github.event.comment.body == '/gha run r-nold-test' && contains('OWNER,MEMBER,COLLABORATOR', github.event.comment.author_association) - timeout-minutes: 120 - runs-on: ubuntu-latest - container: - image: rhub/debian-gcc-devel-nold - steps: - - name: Install git and system packages - shell: bash - run: | - apt update && apt install libcurl4-openssl-dev libssl-dev libssh2-1-dev libgit2-dev libglpk-dev libxml2-dev libharfbuzz-dev libfribidi-dev git -y - - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - - name: Install dependencies - shell: bash -l {0} - run: | - /tmp/R-devel/bin/Rscript -e "source('./R-package/tests/helper_scripts/install_deps.R')" - - - name: Run R tests - shell: bash - run: | - cd R-package && \ - /tmp/R-devel/bin/R CMD INSTALL . && \ - /tmp/R-devel/bin/R -q -e "library(testthat); setwd('tests'); source('testthat.R')" diff --git a/.github/workflows/r_tests.yml b/.github/workflows/r_tests.yml deleted file mode 100644 index c56d1f8ef943..000000000000 --- a/.github/workflows/r_tests.yml +++ /dev/null @@ -1,150 +0,0 @@ -name: XGBoost-R-Tests - -on: [push, pull_request] - -env: - GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} - -permissions: - contents: read # to fetch code (actions/checkout) - -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: true - -jobs: - lintr: - runs-on: ${{ matrix.config.os }} - name: Run R linters on OS ${{ matrix.config.os }}, R ${{ matrix.config.r }}, Compiler ${{ matrix.config.compiler }}, Build ${{ matrix.config.build }} - strategy: - matrix: - config: - - {os: ubuntu-latest, r: 'release'} - env: - R_REMOTES_NO_ERRORS_FROM_WARNINGS: true - RSPM: ${{ matrix.config.rspm }} - - steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - - uses: r-lib/actions/setup-r@929c772977a3a13c8733b363bf5a2f685c25dd91 # v2.9.0 - with: - r-version: ${{ matrix.config.r }} - - - name: Cache R packages - uses: actions/cache@6849a6489940f00c2f30c0fb92c6274307ccb58a # v4.1.2 - with: - path: ${{ env.R_LIBS_USER }} - key: ${{ runner.os }}-r-${{ matrix.config.r }}-7-${{ hashFiles('R-package/DESCRIPTION') }} - restore-keys: ${{ runner.os }}-r-${{ matrix.config.r }}-7-${{ hashFiles('R-package/DESCRIPTION') }} - - - name: Install dependencies - shell: Rscript {0} - run: | - source("./R-package/tests/helper_scripts/install_deps.R") - - - name: Run lintr - run: | - MAKEFLAGS="-j$(nproc)" R CMD INSTALL R-package/ - Rscript tests/ci_build/lint_r.R $(pwd) - - test-Rpkg: - runs-on: ${{ matrix.config.os }} - name: Test R on OS ${{ matrix.config.os }}, R ${{ matrix.config.r }}, Compiler ${{ matrix.config.compiler }}, Build ${{ matrix.config.build }} - strategy: - fail-fast: false - matrix: - config: - - {os: windows-latest, r: 'release', compiler: 'mingw', build: 'autotools'} - - {os: ubuntu-latest, r: 'release', compiler: 'none', build: 'cmake'} - env: - R_REMOTES_NO_ERRORS_FROM_WARNINGS: true - RSPM: ${{ matrix.config.rspm }} - - steps: - - name: Install system dependencies - 
run: | - sudo apt update - sudo apt install libcurl4-openssl-dev libssl-dev libssh2-1-dev libgit2-dev libglpk-dev libxml2-dev libharfbuzz-dev libfribidi-dev - if: matrix.config.os == 'ubuntu-latest' - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - - uses: r-lib/actions/setup-r@929c772977a3a13c8733b363bf5a2f685c25dd91 # v2.9.0 - with: - r-version: ${{ matrix.config.r }} - - - name: Cache R packages - uses: actions/cache@6849a6489940f00c2f30c0fb92c6274307ccb58a # v4.1.2 - with: - path: ${{ env.R_LIBS_USER }} - key: ${{ runner.os }}-r-${{ matrix.config.r }}-7-${{ hashFiles('R-package/DESCRIPTION') }} - restore-keys: ${{ runner.os }}-r-${{ matrix.config.r }}-7-${{ hashFiles('R-package/DESCRIPTION') }} - - - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 - with: - python-version: "3.10" - architecture: 'x64' - - - uses: r-lib/actions/setup-tinytex@v2 - - - name: Install dependencies - shell: Rscript {0} - run: | - source("./R-package/tests/helper_scripts/install_deps.R") - - - name: Test R - run: | - python tests/ci_build/test_r_package.py --compiler='${{ matrix.config.compiler }}' --build-tool="${{ matrix.config.build }}" --task=check - if: matrix.config.compiler != 'none' - - - name: Test R - run: | - python tests/ci_build/test_r_package.py --build-tool="${{ matrix.config.build }}" --task=check - if: matrix.config.compiler == 'none' - - test-R-on-Debian: - name: Test R package on Debian - runs-on: ubuntu-latest - container: - image: rhub/debian-gcc-release - - steps: - - name: Install system dependencies - run: | - # Must run before checkout to have the latest git installed. - # No need to add pandoc, the container has it figured out. - apt update && apt install libcurl4-openssl-dev libssl-dev libssh2-1-dev libgit2-dev libglpk-dev libxml2-dev libharfbuzz-dev libfribidi-dev git -y - - - name: Trust git cloning project sources - run: | - git config --global --add safe.directory "${GITHUB_WORKSPACE}" - - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - - name: Install dependencies - shell: bash -l {0} - run: | - Rscript -e "source('./R-package/tests/helper_scripts/install_deps.R')" - - - name: Test R - shell: bash -l {0} - run: | - python3 tests/ci_build/test_r_package.py --r=/usr/bin/R --build-tool=autotools --task=check - - - uses: dorny/paths-filter@v3 - id: changes - with: - filters: | - r_package: - - 'R-package/**' - - - name: Run document check - if: steps.changes.outputs.r_package == 'true' - run: | - python3 tests/ci_build/test_r_package.py --r=/usr/bin/R --task=doc diff --git a/.github/workflows/scorecards.yml b/.github/workflows/scorecards.yml deleted file mode 100644 index 85a9abb57e1b..000000000000 --- a/.github/workflows/scorecards.yml +++ /dev/null @@ -1,54 +0,0 @@ -name: Scorecards supply-chain security -on: - # Only the default branch is supported. - branch_protection_rule: - schedule: - - cron: '17 2 * * 6' - push: - branches: [ "master" ] - -# Declare default permissions as read only. -permissions: read-all - -jobs: - analysis: - name: Scorecards analysis - runs-on: ubuntu-latest - permissions: - # Needed to upload the results to code-scanning dashboard. - security-events: write - # Used to receive a badge. 
- id-token: write - - steps: - - name: "Checkout code" - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - persist-credentials: false - - - name: "Run analysis" - uses: ossf/scorecard-action@62b2cac7ed8198b15735ed49ab1e5cf35480ba46 # v2.4.0 - with: - results_file: results.sarif - results_format: sarif - - # Publish the results for public repositories to enable scorecard badges. For more details, see - # https://github.com/ossf/scorecard-action#publishing-results. - # For private repositories, `publish_results` will automatically be set to `false`, regardless - # of the value entered here. - publish_results: true - - # Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF - # format to the repository Actions tab. - - name: "Upload artifact" - uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 - with: - name: SARIF file - path: results.sarif - retention-days: 5 - - # Upload the results to GitHub's code scanning dashboard. - - name: "Upload to code-scanning" - uses: github/codeql-action/upload-sarif@83a02f7883b12e0e4e1a146174f5e2292a01e601 # v2.16.4 - with: - sarif_file: results.sarif diff --git a/.github/workflows/update_rapids.yml b/.github/workflows/update_rapids.yml deleted file mode 100644 index 5e229db4c050..000000000000 --- a/.github/workflows/update_rapids.yml +++ /dev/null @@ -1,44 +0,0 @@ -name: update-rapids - -on: - workflow_dispatch: - schedule: - - cron: "0 20 * * 1" # Run once weekly - -permissions: - pull-requests: write - contents: write - -defaults: - run: - shell: bash -l {0} - -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: true - -env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # To use GitHub CLI - -jobs: - update-rapids: - name: Check latest RAPIDS - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - name: Check latest RAPIDS and update conftest.sh - run: | - bash tests/buildkite/update-rapids.sh - - name: Create Pull Request - uses: peter-evans/create-pull-request@v7 - if: github.ref == 'refs/heads/master' - with: - add-paths: | - tests/buildkite - branch: create-pull-request/update-rapids - base: master - title: "[CI] Update RAPIDS to latest stable" - commit-message: "[CI] Update RAPIDS to latest stable" - diff --git a/ops/docker/ci_container.yml b/ops/docker/ci_container.yml new file mode 100644 index 000000000000..df0402293ea2 --- /dev/null +++ b/ops/docker/ci_container.yml @@ -0,0 +1,25 @@ +## List of CI containers with definitions and build arguments + +# Each container will be built using the definition from +# ops/docker/dockerfile/Dockerfile.CONTAINER_DEF + +rapids_versions: + stable: &rapids_version "24.10" + dev: &dev_rapids_version "24.12" + +xgb-ci.gpu_build_rockylinux8: + container_def: gpu_build_rockylinux8 + build_args: + CUDA_VERSION_ARG: "12.4.1" + NCCL_VERSION_ARG: "2.23.4-1" + RAPIDS_VERSION_ARG: *rapids_version + +xgb-ci.gpu: + container_def: gpu + build_args: + CUDA_VERSION_ARG: "12.4.1" + NCCL_VERSION_ARG: "2.23.4-1" + RAPIDS_VERSION_ARG: *rapids_version + +xgb-ci.manylinux_2_28_x86_64: + container_def: manylinux_2_28_x86_64 diff --git a/ops/docker/docker_cache_ecr.yml b/ops/docker/docker_cache_ecr.yml new file mode 100644 index 000000000000..e20f35fc8020 --- /dev/null +++ b/ops/docker/docker_cache_ecr.yml @@ -0,0 +1,4 @@ +## Constants for AWS ECR (Elastic Container 
Registry), used for the Docker cache + +DOCKER_CACHE_ECR_ID: "492475357299" +DOCKER_CACHE_ECR_REGION: "us-west-2" diff --git a/ops/docker/dockerfile/Dockerfile.gpu b/ops/docker/dockerfile/Dockerfile.gpu new file mode 100644 index 000000000000..96a532fc2ff1 --- /dev/null +++ b/ops/docker/dockerfile/Dockerfile.gpu @@ -0,0 +1,54 @@ +ARG CUDA_VERSION_ARG=notset +FROM nvidia/cuda:$CUDA_VERSION_ARG-runtime-ubuntu22.04 +ARG CUDA_VERSION_ARG +ARG RAPIDS_VERSION_ARG + # Should be first 4 digits (e.g. 24.06) +ARG NCCL_VERSION_ARG +ARG RAPIDSAI_CONDA_CHANNEL_ARG="rapidsai" + +# Environment +ENV DEBIAN_FRONTEND=noninteractive +SHELL ["/bin/bash", "-c"] + +# Install all basic requirements +RUN \ + apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/3bf863cc.pub && \ + apt-get update && \ + apt-get install -y wget unzip bzip2 libgomp1 build-essential openjdk-8-jdk-headless && \ + apt-get install libnccl2 libnccl-dev -y --allow-change-held-packages && \ + # Python + wget -nv -O conda.sh https://github.com/conda-forge/miniforge/releases/download/24.3.0-0/Miniforge3-24.3.0-0-Linux-x86_64.sh && \ + bash conda.sh -b -p /opt/miniforge + +ENV PATH=/opt/miniforge/bin:$PATH + +# Create new Conda environment with cuDF, Dask, and cuPy +RUN \ + export NCCL_SHORT_VER=$(echo "$NCCL_VERSION_ARG" | cut -d "-" -f 1) && \ + export CUDA_SHORT_VER=$(echo "$CUDA_VERSION_ARG" | grep -o -E '[0-9]+\.[0-9]') && \ + mamba create -y -n gpu_test -c ${RAPIDSAI_CONDA_CHANNEL_ARG} -c conda-forge -c nvidia \ + python=3.10 "cudf=$RAPIDS_VERSION_ARG.*" "rmm=$RAPIDS_VERSION_ARG.*" cuda-version=$CUDA_SHORT_VER \ + "nccl>=${NCCL_SHORT_VER}" \ + "dask<=2024.10.0" \ + "distributed<=2024.10.0" \ + "dask-cuda=$RAPIDS_VERSION_ARG.*" "dask-cudf=$RAPIDS_VERSION_ARG.*" cupy \ + numpy pytest pytest-timeout scipy scikit-learn pandas matplotlib wheel \ + python-kubernetes urllib3 graphviz hypothesis loky \ + "pyspark>=3.4.0" cloudpickle cuda-python && \ + mamba clean --all --yes + +ENV GOSU_VERSION=1.10 +ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/ + +# Install lightweight sudo (not bound to TTY) +RUN set -ex; \ + wget -nv -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \ + chmod +x /usr/local/bin/gosu && \ + gosu nobody true + +# Default entry-point to use if running locally +# It will preserve attributes of created files +COPY docker/entrypoint.sh /scripts/ + +WORKDIR /workspace +ENTRYPOINT ["/scripts/entrypoint.sh"] diff --git a/ops/docker/dockerfile/Dockerfile.gpu_build_rockylinux8 b/ops/docker/dockerfile/Dockerfile.gpu_build_rockylinux8 new file mode 100644 index 000000000000..b686bfbb2b0d --- /dev/null +++ b/ops/docker/dockerfile/Dockerfile.gpu_build_rockylinux8 @@ -0,0 +1,82 @@ +ARG CUDA_VERSION_ARG=notset +FROM nvcr.io/nvidia/cuda:$CUDA_VERSION_ARG-devel-rockylinux8 +ARG CUDA_VERSION_ARG +ARG NCCL_VERSION_ARG +ARG RAPIDS_VERSION_ARG + +# Install all basic requirements +RUN \ + curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/D42D0685.pub | sed '/^Version/d' \ + > /etc/pki/rpm-gpg/RPM-GPG-KEY-NVIDIA && \ + dnf -y update && \ + dnf -y install dnf-plugins-core && \ + dnf config-manager --set-enabled powertools && \ + dnf install -y tar unzip wget xz git which ninja-build gcc-toolset-10-gcc gcc-toolset-10-binutils gcc-toolset-10-gcc-c++ && \ + # Python + wget -nv -O conda.sh https://github.com/conda-forge/miniforge/releases/download/24.3.0-0/Miniforge3-24.3.0-0-Linux-x86_64.sh && \ + 
bash conda.sh -b -p /opt/miniforge && \ + /opt/miniforge/bin/python -m pip install awscli && \ + # CMake + wget -nv -nc https://cmake.org/files/v3.29/cmake-3.29.5-linux-x86_64.sh --no-check-certificate && \ + bash cmake-3.29.5-linux-x86_64.sh --skip-license --prefix=/usr + +# NCCL2 (License: https://docs.nvidia.com/deeplearning/sdk/nccl-sla/index.html) +RUN \ + export CUDA_SHORT=`echo $CUDA_VERSION_ARG | grep -o -E '[0-9]+\.[0-9]'` && \ + export NCCL_VERSION=$NCCL_VERSION_ARG && \ + dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo && \ + dnf -y update && \ + dnf install -y libnccl-${NCCL_VERSION}+cuda${CUDA_SHORT} libnccl-devel-${NCCL_VERSION}+cuda${CUDA_SHORT} + +ENV PATH=/opt/miniforge/bin:/usr/local/ninja:$PATH +ENV CC=/opt/rh/gcc-toolset-10/root/usr/bin/gcc +ENV CXX=/opt/rh/gcc-toolset-10/root/usr/bin/c++ +ENV CPP=/opt/rh/gcc-toolset-10/root/usr/bin/cpp +ENV CUDAHOSTCXX=/opt/rh/gcc-toolset-10/root/usr/bin/c++ + +ENV GOSU_VERSION=1.10 + +# Install gRPC +# Patch Abseil to apply https://github.com/abseil/abseil-cpp/issues/1629 +RUN git clone -b v1.65.4 https://github.com/grpc/grpc.git \ + --recurse-submodules --depth 1 && \ + pushd grpc && \ + pushd third_party/abseil-cpp && \ + git fetch origin master && \ + git cherry-pick -n cfde5f74e276049727f9556f13473a59fe77d9eb && \ + popd && \ + cmake -S . -B build -GNinja -DCMAKE_INSTALL_PREFIX=/opt/grpc -DCMAKE_CXX_VISIBILITY_PRESET=hidden && \ + cmake --build build --target install && \ + popd && \ + rm -rf grpc + +# Install RMM +# Patch out -Werror +# Patch CCCL 2.5.0 to apply https://github.com/NVIDIA/cccl/pull/1957 +RUN git clone -b branch-${RAPIDS_VERSION_ARG} https://github.com/rapidsai/rmm.git --recurse-submodules --depth 1 && \ + pushd rmm && \ + find . -name CMakeLists.txt -print0 | xargs -0 sed -i 's/-Werror//g' && \ + mkdir build && \ + pushd build && \ + cmake .. -GNinja -DCMAKE_INSTALL_PREFIX=/opt/rmm -DCUDA_STATIC_RUNTIME=ON && \ + pushd _deps/cccl-src/ && \ + git fetch origin main && \ + git cherry-pick -n 9fcb32c228865f21f2b002b29d38a06b4c6fbd73 && \ + popd && \ + cmake --build . 
--target install && \
+    popd && \
+    popd && \
+    rm -rf rmm

+# Install lightweight sudo (not bound to TTY)
+RUN set -ex; \
+    wget -nv -nc -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \
+    chmod +x /usr/local/bin/gosu && \
+    gosu nobody true
+
+# Default entry-point to use if running locally
+# It will preserve attributes of created files
+COPY docker/entrypoint.sh /scripts/
+
+WORKDIR /workspace
+ENTRYPOINT ["/scripts/entrypoint.sh"]
diff --git a/ops/docker/dockerfile/Dockerfile.manylinux_2_28_x86_64 b/ops/docker/dockerfile/Dockerfile.manylinux_2_28_x86_64
new file mode 100644
index 000000000000..f5dac54b9b8f
--- /dev/null
+++ b/ops/docker/dockerfile/Dockerfile.manylinux_2_28_x86_64
@@ -0,0 +1,15 @@
+FROM quay.io/pypa/manylinux_2_28_x86_64
+
+# Install lightweight sudo (not bound to TTY)
+ENV GOSU_VERSION=1.10
+RUN set -ex; \
+    curl -o /usr/local/bin/gosu -L "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \
+    chmod +x /usr/local/bin/gosu && \
+    gosu nobody true
+
+# Default entry-point to use if running locally
+# It will preserve attributes of created files
+COPY docker/entrypoint.sh /scripts/
+
+WORKDIR /workspace
+ENTRYPOINT ["/scripts/entrypoint.sh"]
diff --git a/ops/docker/entrypoint.sh b/ops/docker/entrypoint.sh
new file mode 100644
index 000000000000..40135c197c73
--- /dev/null
+++ b/ops/docker/entrypoint.sh
@@ -0,0 +1,45 @@
+#!/usr/bin/env bash
+
+# This wrapper script propagates the user information from the host
+# to the container. This way, any files generated by processes running
+# in the container will be accessible on the host.
+
+set -euo pipefail
+
+COMMAND=("$@")
+
+if ! touch /this_is_writable_file_system; then
+  echo "You can't write to your filesystem!"
+  echo "If you are in Docker, check that you do not have too many images" \
+    "with too many files in them; Docker has known issues with this."
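+  # Nothing useful can run without a writable filesystem, so abort early.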
+ exit 1 +else + rm /this_is_writable_file_system +fi + +## Assumption: the host passes correct user information via environment variables +## CI_BUILD_UID, CI_BUILD_GID, CI_BUILD_USER, CI_BUILD_GROUP + +if [[ -n ${CI_BUILD_UID:-} ]] && [[ -n ${CI_BUILD_GID:-} ]] +then + groupadd -o -g "${CI_BUILD_GID}" "${CI_BUILD_GROUP}" || true + useradd -o -m -g "${CI_BUILD_GID}" -u "${CI_BUILD_UID}" \ + "${CI_BUILD_USER}" || true + export HOME="/home/${CI_BUILD_USER}" + shopt -s dotglob + cp -r /root/* "$HOME/" + chown -R "${CI_BUILD_UID}:${CI_BUILD_GID}" "$HOME" + + # Allows project-specific customization + if [[ -e "/workspace/.pre_entry.sh" ]]; then + gosu "${CI_BUILD_UID}:${CI_BUILD_GID}" /workspace/.pre_entry.sh + fi + + # Enable passwordless sudo capabilities for the user + chown root:"${CI_BUILD_GID}" "$(which gosu)" + chmod +s "$(which gosu)"; sync + + exec gosu "${CI_BUILD_UID}:${CI_BUILD_GID}" "${COMMAND[@]}" +else + exec "${COMMAND[@]}" +fi diff --git a/ops/docker/extract_build_args.jq b/ops/docker/extract_build_args.jq new file mode 100644 index 000000000000..b35240edb626 --- /dev/null +++ b/ops/docker/extract_build_args.jq @@ -0,0 +1,12 @@ +## Example input: +## xgb-ci.gpu_build_r_rockylinux8 +## Example output: +## --build-arg CUDA_VERSION_ARG=12.4.1 --build-arg R_VERSION_ARG=4.3.2 +def compute_build_args($input; $container_id): + $input | + .[$container_id] | + select(.build_args != null) | + .build_args | + to_entries | + map("--build-arg " + .key + "=" + .value) | + join(" "); diff --git a/ops/docker/extract_build_args.sh b/ops/docker/extract_build_args.sh new file mode 100755 index 000000000000..42a83047742c --- /dev/null +++ b/ops/docker/extract_build_args.sh @@ -0,0 +1,26 @@ +#!/bin/bash +## Extract container definition and build args from ops/docker/ci_container.yml, +## given the container ID. +## +## Example input: +## xgb-ci.clang_tidy +## Example output: +## CONTAINER_DEF='clang_tidy' BUILD_ARGS='--build-arg CUDA_VERSION_ARG=12.4.1' + +if [ "$#" -ne 1 ]; then + echo "Usage: $0 [container_id]" + exit 1 +fi + +CONTAINER_ID="$1" +CONTAINER_DEF=$( + yq -o json ops/docker/ci_container.yml | + jq -r --arg container_id "${CONTAINER_ID}" '.[$container_id].container_def' +) +BUILD_ARGS=$( + yq -o json ops/docker/ci_container.yml | + jq -r --arg container_id "${CONTAINER_ID}" \ + 'include "ops/docker/extract_build_args"; + compute_build_args(.; $container_id)' +) +echo "CONTAINER_DEF='${CONTAINER_DEF}' BUILD_ARGS='${BUILD_ARGS}'" diff --git a/ops/docker_build.py b/ops/docker_build.py new file mode 100644 index 000000000000..1fed975ce223 --- /dev/null +++ b/ops/docker_build.py @@ -0,0 +1,137 @@ +""" +Wrapper script to build a Docker container with layer caching +""" + +import argparse +import itertools +import pathlib +import subprocess +import sys +from typing import Optional + +from docker_run import OPS_DIR, fancy_print_cli_args + + +def parse_build_args(raw_build_args: list[str]) -> dict[str, str]: + parsed_build_args = dict() + for arg in raw_build_args: + try: + key, value = arg.split("=", maxsplit=1) + except ValueError as e: + raise ValueError( + f"Build argument must be of form KEY=VALUE. 
Got: {arg}"
+            ) from e
+        parsed_build_args[key] = value
+    return parsed_build_args
+
+
+def docker_build(
+    container_id: str,
+    *,
+    build_args: dict[str, str],
+    dockerfile_path: pathlib.Path,
+    docker_context_path: pathlib.Path,
+    cache_from: Optional[str],
+    cache_to: Optional[str],
+) -> None:
+    ## Set up command-line arguments to be passed to `docker build`
+    # Build args
+    docker_build_cli_args = list(
+        itertools.chain.from_iterable(
+            [["--build-arg", f"{k}={v}"] for k, v in build_args.items()]
+        )
+    )
+    # When building an image using a non-default driver, we need to specify
+    # `--load` to load it to the image store.
+    # See https://docs.docker.com/build/builders/drivers/
+    docker_build_cli_args.append("--load")
+    # Layer caching
+    if cache_from:
+        docker_build_cli_args.extend(["--cache-from", cache_from])
+    if cache_to:
+        docker_build_cli_args.extend(["--cache-to", cache_to])
+    # Remaining CLI args
+    docker_build_cli_args.extend(
+        [
+            "--progress=plain",
+            "--ulimit",
+            "nofile=1024000:1024000",
+            "-t",
+            container_id,
+            "-f",
+            str(dockerfile_path),
+            str(docker_context_path),
+        ]
+    )
+    cli_args = ["docker", "build"] + docker_build_cli_args
+    fancy_print_cli_args(cli_args)
+    subprocess.run(cli_args, check=True, encoding="utf-8")
+
+
+def main(args: argparse.Namespace) -> None:
+    # Dockerfile to be used in docker build
+    dockerfile_path = (
+        OPS_DIR / "docker" / "dockerfile" / f"Dockerfile.{args.container_def}"
+    )
+    docker_context_path = OPS_DIR
+
+    build_args = parse_build_args(args.build_arg)
+
+    docker_build(
+        args.container_id,
+        build_args=build_args,
+        dockerfile_path=dockerfile_path,
+        docker_context_path=docker_context_path,
+        cache_from=args.cache_from,
+        cache_to=args.cache_to,
+    )
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Build a Docker container")
+    parser.add_argument(
+        "--container-def",
+        type=str,
+        required=True,
+        help=(
+            "String uniquely identifying the container definition. The container "
+            "definition will be fetched from "
+            "docker/dockerfile/Dockerfile.CONTAINER_DEF."
+        ),
+    )
+    parser.add_argument(
+        "--container-id",
+        type=str,
+        required=True,
+        help="String ID to assign to the newly built container",
+    )
+    parser.add_argument(
+        "--build-arg",
+        type=str,
+        default=[],
+        action="append",
+        help=(
+            "Build-time variable(s) to be passed to `docker build`. Each variable "
+            "should be specified as a key-value pair in the form KEY=VALUE. "
+            "The variables should match the ARG instructions in the Dockerfile. "
+            "When passing multiple variables, specify --build-arg multiple times. "
+            "Example: --build-arg CUDA_VERSION_ARG=12.5 --build-arg RAPIDS_VERSION_ARG=24.10"
+        ),
+    )
+    parser.add_argument(
+        "--cache-from",
+        type=str,
+        help="Use an external cache source for the Docker build",
+    )
+    parser.add_argument(
+        "--cache-to",
+        type=str,
+        help="Export layers from the container to an external cache destination",
+    )
+
+    if len(sys.argv) == 1:
+        parser.print_help()
+        sys.exit(1)
+
+    parsed_args = parser.parse_args()
+    main(parsed_args)
diff --git a/ops/docker_build.sh b/ops/docker_build.sh
new file mode 100755
index 000000000000..7d83daec9574
--- /dev/null
+++ b/ops/docker_build.sh
@@ -0,0 +1,149 @@
+#!/bin/bash
+## Build a CI container and cache the layers in AWS ECR (Elastic Container Registry).
+## This script provides a convenient wrapper for ops/docker_build.py.
+## Build-time variables (--build-arg) and container definition are fetched from
+## ops/docker/ci_container.yml.
+##
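+## Example (illustrative; the branch name is a placeholder, the container ID is
+## one of the entries in ops/docker/ci_container.yml):
+##   BRANCH_NAME=my-feature-branch USE_DOCKER_CACHE=1 \
+##     bash ops/docker_build.sh xgb-ci.gpu
+##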
+## Note. This script takes in some inputs via environment variables.
+
+USAGE_DOC=$(
+cat <<-EOF
+Usage: ops/docker_build.sh [container_id]
+
+In addition, the following environment variables should be set.
+  - BRANCH_NAME: Name of the current git branch or pull request (Required)
+  - USE_DOCKER_CACHE: If set to 1, enable caching
+EOF
+)
+
+ECR_LIFECYCLE_RULE=$(
+cat <<-EOF
+{
+   "rules": [
+       {
+         "rulePriority": 1,
+         "selection": {
+           "tagStatus": "any",
+           "countType": "sinceImagePushed",
+           "countUnit": "days",
+           "countNumber": 30
+         },
+         "action": {
+           "type": "expire"
+         }
+       }
+   ]
+}
+EOF
+)
+
+set -euo pipefail
+
+for arg in "BRANCH_NAME"
+do
+  if [[ -z "${!arg:-}" ]]
+  then
+    echo -e "Error: $arg must be set.\n\n${USAGE_DOC}"
+    exit 1
+  fi
+done
+
+if [[ "$#" -lt 1 ]]
+then
+  echo "${USAGE_DOC}"
+  exit 2
+fi
+CONTAINER_ID="$1"
+
+# Fetch CONTAINER_DEF and BUILD_ARGS
+source <(ops/docker/extract_build_args.sh ${CONTAINER_ID} | tee /dev/stderr) 2>&1
+
+if [[ "${USE_DOCKER_CACHE:-}" != "1" ]]  # Any value other than 1 is considered false
+then
+  USE_DOCKER_CACHE=0
+fi
+
+if [[ ${USE_DOCKER_CACHE} -eq 0 ]]
+then
+  echo "USE_DOCKER_CACHE not set to 1; caching disabled"
+else
+  DOCKER_CACHE_ECR_ID=$(yq ".DOCKER_CACHE_ECR_ID" ops/docker/docker_cache_ecr.yml)
+  DOCKER_CACHE_ECR_REGION=$(yq ".DOCKER_CACHE_ECR_REGION" ops/docker/docker_cache_ecr.yml)
+  DOCKER_CACHE_REPO="${DOCKER_CACHE_ECR_ID}.dkr.ecr.${DOCKER_CACHE_ECR_REGION}.amazonaws.com"
+  echo "Using AWS ECR; repo URL = ${DOCKER_CACHE_REPO}"
+  # Login for Docker registry
+  echo "aws ecr get-login-password --region ${DOCKER_CACHE_ECR_REGION} |" \
+    "docker login --username AWS --password-stdin ${DOCKER_CACHE_REPO}"
+  aws ecr get-login-password --region ${DOCKER_CACHE_ECR_REGION} \
+    | docker login --username AWS --password-stdin ${DOCKER_CACHE_REPO}
+fi
+
+# Pull pre-built container from the cache
+# First try locating one for the particular branch or pull request
+CACHE_FROM_CMD=""
+IS_CACHED=0
+if [[ ${USE_DOCKER_CACHE} -eq 1 ]]
+then
+  DOCKER_TAG="${BRANCH_NAME//\//-}"  # Slashes are not allowed in Docker tag
+  DOCKER_URL="${DOCKER_CACHE_REPO}/${CONTAINER_ID}:${DOCKER_TAG}"
+  echo "docker pull --quiet ${DOCKER_URL}"
+  if time docker pull --quiet "${DOCKER_URL}"
+  then
+    echo "Found a cached container for the branch ${BRANCH_NAME}: ${DOCKER_URL}"
+    IS_CACHED=1
+  else
+    # If there's no pre-built container from the cache,
+    # use the pre-built container from the master branch.
+    DOCKER_URL="${DOCKER_CACHE_REPO}/${CONTAINER_ID}:master"
+    echo "Could not find a cached container for the branch ${BRANCH_NAME}." \
+      "Using a cached container from the master branch: ${DOCKER_URL}"
+    echo "docker pull --quiet ${DOCKER_URL}"
+    if time docker pull --quiet "${DOCKER_URL}"
+    then
+      IS_CACHED=1
+    else
+      echo "Could not find a cached container for the master branch either."
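+      # No cache hit anywhere; the build below will start from scratch.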
+ IS_CACHED=0 + fi + fi + if [[ $IS_CACHED -eq 1 ]] + then + CACHE_FROM_CMD="--cache-from type=registry,ref=${DOCKER_URL}" + fi +fi + +# Run Docker build +set -x +python3 ops/docker_build.py \ + --container-def ${CONTAINER_DEF} \ + --container-id ${CONTAINER_ID} \ + ${BUILD_ARGS} \ + --cache-to type=inline \ + ${CACHE_FROM_CMD} +set +x + +# Now cache the new container +if [[ ${USE_DOCKER_CACHE} -eq 1 ]] +then + DOCKER_URL="${DOCKER_CACHE_REPO}/${CONTAINER_ID}:${DOCKER_TAG}" + echo "docker tag ${CONTAINER_ID} ${DOCKER_URL}" + docker tag "${CONTAINER_ID}" "${DOCKER_URL}" + + # Attempt to create Docker repository; it will fail if the repository already exists + echo "aws ecr create-repository --repository-name ${CONTAINER_ID} --region ${DOCKER_CACHE_ECR_REGION}" + if aws ecr create-repository --repository-name ${CONTAINER_ID} --region ${DOCKER_CACHE_ECR_REGION} + then + # Repository was created. Now set expiration policy + echo "aws ecr put-lifecycle-policy --repository-name ${CONTAINER_ID}" \ + "--region ${DOCKER_CACHE_ECR_REGION} --lifecycle-policy-text file:///dev/stdin" + echo "${ECR_LIFECYCLE_RULE}" | aws ecr put-lifecycle-policy --repository-name ${CONTAINER_ID} \ + --region ${DOCKER_CACHE_ECR_REGION} --lifecycle-policy-text file:///dev/stdin + fi + + echo "docker push --quiet ${DOCKER_URL}" + if ! time docker push --quiet "${DOCKER_URL}" + then + echo "ERROR: could not update Docker cache ${DOCKER_URL}" + exit 1 + fi +fi diff --git a/ops/docker_run.py b/ops/docker_run.py new file mode 100644 index 000000000000..7e61c5a14f39 --- /dev/null +++ b/ops/docker_run.py @@ -0,0 +1,168 @@ +""" +Wrapper script to run a command inside a Docker container +""" + +import argparse +import grp +import itertools +import os +import pathlib +import pwd +import subprocess +import sys +import textwrap + +OPS_DIR = pathlib.Path(__file__).expanduser().resolve().parent +PROJECT_ROOT_DIR = OPS_DIR.parent +LINEWIDTH = 88 +TEXT_WRAPPER = textwrap.TextWrapper( + width=LINEWIDTH, + initial_indent="", + subsequent_indent=" ", + break_long_words=False, + break_on_hyphens=False, +) + + +def parse_run_args(raw_run_args: str) -> list[str]: + return [x for x in raw_run_args.split() if x] + + +def get_user_ids() -> dict[str, str]: + uid = os.getuid() + gid = os.getgid() + return { + "CI_BUILD_UID": str(uid), + "CI_BUILD_USER": pwd.getpwuid(uid).pw_name, + "CI_BUILD_GID": str(gid), + "CI_BUILD_GROUP": grp.getgrgid(gid).gr_name, + } + + +def fancy_print_cli_args(cli_args: list[str]) -> None: + print( + "=" * LINEWIDTH + + "\n" + + " \\\n".join(TEXT_WRAPPER.wrap(" ".join(cli_args))) + + "\n" + + "=" * LINEWIDTH + + "\n", + flush=True, + ) + + +def docker_run( + container_id: str, + command_args: list[str], + *, + use_gpus: bool, + workdir: pathlib.Path, + user_ids: dict[str, str], + extra_args: list[str], +) -> None: + # Command-line arguments to be passed to `docker run` + docker_run_cli_args = ["--rm", "--pid=host"] + + if use_gpus: + docker_run_cli_args.extend(["--gpus", "all"]) + + docker_run_cli_args.extend(["-v", f"{workdir}:/workspace", "-w", "/workspace"]) + docker_run_cli_args.extend( + itertools.chain.from_iterable([["-e", f"{k}={v}"] for k, v in user_ids.items()]) + ) + docker_run_cli_args.extend(extra_args) + docker_run_cli_args.append(container_id) + docker_run_cli_args.extend(command_args) + + cli_args = ["docker", "run"] + docker_run_cli_args + fancy_print_cli_args(cli_args) + subprocess.run(cli_args, check=True, encoding="utf-8") + + +def main(args: argparse.Namespace) -> None: + run_args = 
parse_run_args(args.run_args)
+    user_ids = get_user_ids()
+
+    if args.use_gpus:
+        print("Using NVIDIA GPUs for `docker run`")
+    if args.interactive:
+        print("Using interactive mode for `docker run`")
+        run_args.append("-it")
+
+    docker_run(
+        args.container_id,
+        args.command_args,
+        use_gpus=args.use_gpus,
+        workdir=args.workdir,
+        user_ids=user_ids,
+        extra_args=run_args,
+    )
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        usage=(
+            f"{sys.argv[0]} --container-id CONTAINER_ID [--use-gpus] [--interactive] "
+            "[--workdir WORKDIR] [--run-args RUN_ARGS] -- COMMAND_ARG "
+            "[COMMAND_ARG ...]"
+        ),
+        description="Run tasks inside a Docker container",
+    )
+    parser.add_argument(
+        "--container-id",
+        type=str,
+        required=True,
+        help="String ID of the container to run.",
+    )
+    parser.add_argument(
+        "--use-gpus",
+        action="store_true",
+        help=(
+            "Grant the container access to NVIDIA GPUs; requires the NVIDIA "
+            "Container Toolkit."
+        ),
+    )
+    parser.add_argument(
+        "--interactive",
+        action="store_true",
+        help=(
+            "Run the container in interactive mode; requires an interactive shell "
+            "(TTY). With this flag, you can use Ctrl-C to interrupt a long-running "
+            "command."
+        ),
+    )
+    parser.add_argument(
+        "--workdir",
+        type=lambda p: pathlib.Path(p).expanduser().resolve(),
+        default=PROJECT_ROOT_DIR,
+        help="Path to working directory; if unset, use the project's root",
+    )
+    parser.add_argument(
+        "--run-args",
+        type=str,
+        default="",
+        help=(
+            "Argument(s) to be passed to `docker run`. When passing multiple "
+            "arguments, use single quotes to wrap them. Example: "
+            "--run-args '--cap-add SYS_PTRACE --shm-size=4g'"
+        ),
+    )
+    parser.add_argument(
+        "command_args",
+        metavar="COMMAND_ARG",
+        type=str,
+        nargs="+",
+        help=(
+            "Argument(s) for the command to execute. NOTE. Make sure to specify "
+            "double-dash (--) to clearly distinguish between the command and the "
+            "preceding parameters. 
Example: --run-args '--cap-add SYS_PTRACE " + "--shm-size=4g' -- ./myprog" + ), + ) + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(1) + + parsed_args = parser.parse_args() + main(parsed_args) diff --git a/ops/pipeline/build-cuda.sh b/ops/pipeline/build-cuda.sh new file mode 100755 index 000000000000..49475c01c69e --- /dev/null +++ b/ops/pipeline/build-cuda.sh @@ -0,0 +1,85 @@ +#!/bin/bash +## Build XGBoost with CUDA + +set -euox pipefail + +if [[ -z "${GITHUB_SHA:-}" ]] +then + echo "Make sure to set environment variable GITHUB_SHA" + exit 1 +fi + +WHEEL_TAG=manylinux_2_28_x86_64 + +source ops/pipeline/classify-git-branch.sh + +echo "--- Build with CUDA" + +if [[ ($is_pull_request == 1) || ($is_release_branch == 0) ]] +then + arch_flag="-DGPU_COMPUTE_VER=75" +else + arch_flag="" +fi + +echo "--- Build libxgboost from the source" +set -x +# Work around https://github.com/NVIDIA/cccl/issues/1956 +# TODO(hcho3): Remove this once new CUDA version ships with CCCL 2.6.0+ +git clone https://github.com/NVIDIA/cccl.git -b v2.6.1 --quiet +python3 ops/docker_run.py \ + --container-id xgb-ci.gpu_build_rockylinux8 \ + -- ops/script/build_via_cmake.sh \ + -DCMAKE_PREFIX_PATH="/opt/grpc;/workspace/cccl" \ + -DUSE_CUDA=ON \ + -DUSE_OPENMP=ON \ + -DHIDE_CXX_SYMBOLS=ON \ + -DPLUGIN_FEDERATED=ON \ + -DUSE_NCCL=ON \ + -DUSE_NCCL_LIB_PATH=ON \ + -DNCCL_INCLUDE_DIR=/usr/include \ + -DUSE_DLOPEN_NCCL=ON \ + ${arch_flag} + +echo "--- Build binary wheel" +python3 ops/docker_run.py \ + --container-id xgb-ci.gpu_build_rockylinux8 \ + -- bash -c \ + "cd python-package && rm -rf dist/* && pip wheel --no-deps -v . --wheel-dir dist/" +python3 ops/script/rename_whl.py \ + --wheel-path python-package/dist/*.whl \ + --commit-hash ${GITHUB_SHA} \ + --platform-tag ${WHEEL_TAG} + +echo "--- Audit binary wheel to ensure it's compliant with ${WHEEL_TAG} standard" +python3 ops/docker_run.py \ + --container-id xgb-ci.manylinux_2_28_x86_64 \ + -- auditwheel repair \ + --plat ${WHEEL_TAG} python-package/dist/*.whl +python3 ops/script/rename_whl.py \ + --wheel-path wheelhouse/*.whl \ + --commit-hash ${GITHUB_SHA} \ + --platform-tag ${WHEEL_TAG} +mv -v wheelhouse/*.whl python-package/dist/ +# Make sure that libgomp.so is vendored in the wheel +python3 ops/docker_run.py \ + --container-id xgb-ci.manylinux_2_28_x86_64 \ + -- bash -c "unzip -l python-package/dist/*.whl | grep libgomp || exit -1" + +# Generate the meta info which includes xgboost version and the commit info +python3 ops/docker_run.py \ +--container-id xgb-ci.gpu_build_rockylinux8 \ +-- python ops/script/format_wheel_meta.py \ + --wheel-path python-package/dist/*.whl \ + --commit-hash ${GITHUB_SHA} \ + --platform-tag ${WHEEL_TAG} \ + --meta-path python-package/dist/ + +echo "--- Upload Python wheel" +if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] +then + aws s3 cp python-package/dist/*.whl s3://xgboost-nightly-builds/${BRANCH_NAME}/ \ + --acl public-read --no-progress + aws s3 cp python-package/dist/meta.json s3://xgboost-nightly-builds/${BRANCH_NAME}/ \ + --acl public-read --no-progress +fi diff --git a/ops/pipeline/stash-artifacts.py b/ops/pipeline/stash-artifacts.py new file mode 100644 index 000000000000..151e187513da --- /dev/null +++ b/ops/pipeline/stash-artifacts.py @@ -0,0 +1,144 @@ +""" +Stash an artifact in an S3 bucket for later use + +Note. This script takes in all inputs via environment variables + except the path to the artifact(s). 
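+
+Note. Inside GitHub Actions, prefer the companion wrapper
+      ops/pipeline/stash-artifacts.sh, which derives the S3 prefix from
+      GITHUB_REPOSITORY and GITHUB_RUN_ID and reads the bucket name from
+      RUNS_ON_S3_BUCKET_CACHE.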
+""" + +import argparse +import os +import subprocess +from pathlib import Path +from urllib.parse import SplitResult, urlsplit, urlunsplit + + +def resolve(x: Path) -> Path: + return x.expanduser().resolve() + + +def path_equals(a: Path, b: Path) -> bool: + return resolve(a) == resolve(b) + + +def compute_s3_url(s3_bucket: str, prefix: str, artifact: Path) -> str: + filename = artifact.name + relative_path = resolve(artifact).relative_to(Path.cwd()) + if resolve(artifact.parent) == resolve(Path.cwd()): + full_prefix = prefix + else: + full_prefix = f"{prefix}/{str(relative_path.parent)}" + return f"s3://{s3_bucket}/{full_prefix}/{filename}" + + +def aws_s3_upload(src: Path, dest: str) -> None: + cli_args = ["aws", "s3", "cp", "--no-progress", str(src), dest] + print(" ".join(cli_args)) + subprocess.run( + cli_args, + check=True, + encoding="utf-8", + ) + + +def aws_s3_download(src: str, dest: Path) -> None: + cli_args = ["aws", "s3", "cp", "--no-progress", src, str(dest)] + print(" ".join(cli_args)) + subprocess.run( + cli_args, + check=True, + encoding="utf-8", + ) + + +def aws_s3_download_with_wildcard(src: str, dest: Path) -> None: + parsed_src = urlsplit(src) + src_dir = urlunsplit( + SplitResult( + scheme="s3", + netloc=parsed_src.netloc, + path=os.path.dirname(parsed_src.path), + query="", + fragment="", + ) + ) + dest_dir = dest.parent + src_glob = os.path.basename(parsed_src.path) + cli_args = [ + "aws", + "s3", + "cp", + "--recursive", + "--no-progress", + "--exclude", + "'*'", + "--include", + src_glob, + src_dir, + str(dest_dir), + ] + print(" ".join(cli_args)) + subprocess.run( + cli_args, + check=True, + encoding="utf-8", + ) + + +def upload(args: argparse.Namespace) -> None: + print(f"Stashing artifacts to prefix {args.prefix}...") + for artifact in args.artifacts: + artifact_path = Path(artifact) + s3_url = compute_s3_url(args.s3_bucket, args.prefix, artifact_path) + aws_s3_upload(artifact_path, s3_url) + + +def download(args: argparse.Namespace) -> None: + print(f"Unstashing artifacts from prefix {args.prefix}...") + for artifact in args.artifacts: + artifact_path = Path(artifact) + print(f"mkdir -p {str(artifact_path.parent)}") + artifact_path.parent.mkdir(parents=True, exist_ok=True) + s3_url = compute_s3_url(args.s3_bucket, args.prefix, artifact_path) + if "*" in artifact: + aws_s3_download_with_wildcard(s3_url, artifact_path) + else: + aws_s3_download(s3_url, artifact_path) + + +if __name__ == "__main__": + # Ensure that the current working directory is the project root + if not (Path.cwd() / "ops").is_dir() or not path_equals( + Path(__file__).parent.parent, Path.cwd() / "ops" + ): + x = Path(__file__).name + raise RuntimeError(f"Script {x} must be run at the project's root directory") + + parser = argparse.ArgumentParser() + parser.add_argument( + "--command", + type=str, + choices=["stash", "unstash"], + required=True, + help="Whether to stash or unstash the artifact", + ) + parser.add_argument( + "--s3-bucket", + type=str, + required=True, + help="Name of the S3 bucket to store the artifact", + ) + parser.add_argument( + "--prefix", + type=str, + required=True, + help=( + "Where the artifact would be stored. The artifact will be stored in " + "s3://[s3-bucket]/[prefix]." 
+ ), + ) + parser.add_argument("artifacts", type=str, nargs="+", metavar="artifact") + parsed_args = parser.parse_args() + if parsed_args.command == "stash": + upload(parsed_args) + elif parsed_args.command == "unstash": + download(parsed_args) diff --git a/ops/pipeline/stash-artifacts.sh b/ops/pipeline/stash-artifacts.sh new file mode 100755 index 000000000000..98c9695c4227 --- /dev/null +++ b/ops/pipeline/stash-artifacts.sh @@ -0,0 +1,36 @@ +#!/bin/bash + +## Convenience wrapper for ops/pipeline/stash-artifacts.py +## Meant to be used inside GitHub Actions + +set -euo pipefail + +source ops/pipeline/enforce-ci.sh + +if [[ "$#" -lt 3 ]] +then + echo "Usage: $0 {stash,unstash} [remote_prefix] [artifact] [artifact ...]" + exit 1 +fi + +command="$1" +remote_prefix="$2" +shift 2 + +for arg in "GITHUB_REPOSITORY" "GITHUB_RUN_ID" "RUNS_ON_S3_BUCKET_CACHE" +do + if [[ -z "${!arg:-}" ]] + then + echo "Error: $arg must be set." + exit 2 + fi +done + +artifact_stash_prefix="cache/${GITHUB_REPOSITORY}/stash/${GITHUB_RUN_ID}" + +set -x +python3 ops/pipeline/stash-artifacts.py \ + --command "${command}" \ + --s3-bucket "${RUNS_ON_S3_BUCKET_CACHE}" \ + --prefix "${artifact_stash_prefix}/${remote_prefix}" \ + -- "$@" diff --git a/ops/pipeline/test-cpp-gpu.sh b/ops/pipeline/test-cpp-gpu.sh new file mode 100755 index 000000000000..9a0cd4743c18 --- /dev/null +++ b/ops/pipeline/test-cpp-gpu.sh @@ -0,0 +1,42 @@ +#!/bin/bash + +set -euox pipefail + +if [[ "$#" -lt 1 ]] +then + echo "Usage: $0 {gpu,gpu-rmm,mgpu}" + exit 1 +fi +arg=$1 + +case "${arg}" in + gpu) + echo "--- Run Google Tests, using a single GPU" + python3 ops/docker_run.py --container-id xgb-ci.gpu --use-gpus \ + -- nvidia-smi + python3 ops/docker_run.py --container-id xgb-ci.gpu --use-gpus \ + -- build/testxgboost + ;; + + gpu-rmm) + echo "--- Run Google Tests, using a single GPU, RMM enabled" + python3 ops/docker_run.py --container-id xgb-ci.gpu --use-gpus \ + -- nvidia-smi + python3 ops/docker_run.py --container-id xgb-ci.gpu --use-gpus \ + -- build/testxgboost --use-rmm-pool + ;; + + mgpu) + echo "--- Run Google Tests, using multiple GPUs" + python3 ops/docker_run.py --container-id xgb-ci.gpu --use-gpus \ + -- nvidia-smi + python3 ops/docker_run.py --container-id xgb-ci.gpu --use-gpus \ + --run-args='--shm-size=4g' \ + -- build/testxgboost --gtest_filter=*MGPU* + ;; + + *) + echo "Unrecognized arg: ${arg}" + exit 2 + ;; +esac diff --git a/ops/pipeline/test-python-wheel-impl.sh b/ops/pipeline/test-python-wheel-impl.sh new file mode 100755 index 000000000000..75bfa5fbaffb --- /dev/null +++ b/ops/pipeline/test-python-wheel-impl.sh @@ -0,0 +1,74 @@ +#!/bin/bash +## Companion script for ops/pipeline/test-python-wheel.sh + +set -eo pipefail + +if [[ "$#" -lt 1 ]] +then + echo "Usage: $0 {gpu|mgpu|cpu|cpu-arm64}" + exit 1 +fi + +suite="$1" + +# Cannot set -u before Conda env activation +case "$suite" in + gpu|mgpu) + source activate gpu_test + ;; + cpu) + source activate linux_cpu_test + ;; + cpu-arm64) + source activate aarch64_test + ;; + *) + echo "Unrecognized argument: $suite" + exit 1 + ;; +esac + +set -xu + +export PYSPARK_DRIVER_PYTHON=$(which python) +export PYSPARK_PYTHON=$(which python) +export SPARK_TESTING=1 + +pip install -v ./python-package/dist/*.whl + +case "$suite" in + gpu) + echo "-- Run Python tests, using a single GPU" + python -c 'from cupy.cuda import jitify; jitify._init_module()' + pytest -v -s -rxXs --fulltrace --durations=0 -m 'not mgpu' tests/python-gpu + ;; + mgpu) + echo "-- Run Python tests, using multiple GPUs" + 
python -c 'from cupy.cuda import jitify; jitify._init_module()' + pytest -v -s -rxXs --fulltrace --durations=0 -m 'mgpu' tests/python-gpu + pytest -v -s -rxXs --fulltrace --durations=0 -m 'mgpu' \ + tests/test_distributed/test_gpu_with_dask + pytest -v -s -rxXs --fulltrace --durations=0 -m 'mgpu' \ + tests/test_distributed/test_gpu_with_spark + pytest -v -s -rxXs --fulltrace --durations=0 -m 'mgpu' \ + tests/test_distributed/test_gpu_federated + ;; + cpu) + echo "-- Run Python tests (CPU)" + export RAY_OBJECT_STORE_ALLOW_SLOW_STORAGE=1 + pytest -v -s -rxXs --fulltrace --durations=0 tests/python + pytest -v -s -rxXs --fulltrace --durations=0 tests/test_distributed/test_with_dask + pytest -v -s -rxXs --fulltrace --durations=0 tests/test_distributed/test_with_spark + pytest -v -s -rxXs --fulltrace --durations=0 tests/test_distributed/test_federated + ;; + cpu-arm64) + echo "-- Run Python tests (CPU, ARM64)" + pytest -v -s -rxXs --fulltrace --durations=0 \ + tests/python/test_basic.py tests/python/test_basic_models.py \ + tests/python/test_model_compatibility.py + ;; + *) + echo "Unrecognized argument: $suite" + exit 1 + ;; +esac diff --git a/ops/pipeline/test-python-wheel.sh b/ops/pipeline/test-python-wheel.sh new file mode 100755 index 000000000000..b4dd59b7cb0e --- /dev/null +++ b/ops/pipeline/test-python-wheel.sh @@ -0,0 +1,25 @@ +#!/bin/bash +## Test XGBoost Python wheel on the Linux platform + +set -euo pipefail + +if [[ "$#" -lt 2 ]] +then + echo "Usage: $0 {gpu|mgpu|cpu|cpu-arm64} [container_id]" + exit 1 +fi + +suite="$1" +container_id="$2" + +if [[ "$suite" == "gpu" || "$suite" == "mgpu" ]] +then + gpu_option="--use-gpus" +else + gpu_option="" +fi + +set -x +python3 ops/docker_run.py --container-id "${container_id}" ${gpu_option} \ + --run-args='--shm-size=4g --privileged' \ + -- bash ops/pipeline/test-python-wheel-impl.sh "${suite}" diff --git a/tests/buildkite/pipeline-mac-m1.yml b/tests/buildkite/pipeline-mac-m1.yml deleted file mode 100644 index 57b1b1d12010..000000000000 --- a/tests/buildkite/pipeline-mac-m1.yml +++ /dev/null @@ -1,13 +0,0 @@ -steps: - - block: ":rocket: Run this test job" - if: build.pull_request.id != null || build.branch =~ /^dependabot\// - - label: ":macos: Build libxgboost4j.dylib for MacOS M1" - command: "tests/buildkite/build-jvm-macos-m1.sh" - key: mac-m1-jvm - agents: - queue: mac-mini-m1 - - label: ":macos: Build and Test XGBoost for MacOS M1 with Clang 11" - command: "tests/buildkite/test-macos-m1-clang11.sh" - key: mac-m1-appleclang11 - agents: - queue: mac-mini-m1 diff --git a/tests/buildkite/pipeline-mgpu.yml b/tests/buildkite/pipeline-mgpu.yml deleted file mode 100644 index cbb573c3682c..000000000000 --- a/tests/buildkite/pipeline-mgpu.yml +++ /dev/null @@ -1,48 +0,0 @@ -env: - DOCKER_CACHE_ECR_ID: "492475357299" - DOCKER_CACHE_ECR_REGION: "us-west-2" - DISABLE_RELEASE: "1" - # Skip uploading artifacts to S3 bucket - # Also, don't build all CUDA archs; just build sm_75 -steps: - - label: ":moneybag: Enforce daily budget" - command: "tests/buildkite/enforce_daily_budget.sh" - key: enforce-daily-budget - agents: - queue: pipeline-loader - - wait - - block: ":rocket: Run this test job" - if: build.pull_request.id != null || build.branch =~ /^dependabot\// - #### -------- CONTAINER BUILD -------- - - label: ":docker: Build containers" - commands: - - "tests/buildkite/build-containers.sh gpu" - - "tests/buildkite/build-containers.sh gpu_build_rockylinux8" - - "tests/buildkite/build-containers.sh jvm_gpu_build" - key: build-containers - agents: - 
queue: linux-amd64-cpu - - wait - #### -------- BUILD -------- - - label: ":console: Build CUDA" - command: "tests/buildkite/build-cuda.sh" - key: build-cuda - agents: - queue: linux-amd64-cpu - - label: ":console: Build and test JVM packages with CUDA" - command: "tests/buildkite/build-jvm-packages-gpu.sh" - key: build-jvm-packages-gpu - agents: - queue: linux-amd64-mgpu - - wait - #### -------- TEST -------- - - label: ":console: Run Google Tests" - command: "tests/buildkite/test-cpp-mgpu.sh" - key: test-cpp-mgpu - agents: - queue: linux-amd64-mgpu - - label: ":console: Test Python package, 4 GPUs" - command: "tests/buildkite/test-python-gpu.sh mgpu" - key: test-python-mgpu - agents: - queue: linux-amd64-mgpu diff --git a/tests/buildkite/pipeline-nightly.yml b/tests/buildkite/pipeline-nightly.yml deleted file mode 100644 index 4d84f93a54d4..000000000000 --- a/tests/buildkite/pipeline-nightly.yml +++ /dev/null @@ -1,43 +0,0 @@ -# Nightly CI pipeline, to test against dev versions of dependencies - -env: - DOCKER_CACHE_ECR_ID: "492475357299" - DOCKER_CACHE_ECR_REGION: "us-west-2" - DISABLE_RELEASE: "1" - # Skip uploading artifacts to S3 bucket - # Also, don't build all CUDA archs; just build sm_75 - USE_DEPS_DEV_VER: "1" - # Use dev versions of RAPIDS and other dependencies -steps: - #### -------- CONTAINER BUILD -------- - - label: ":docker: Build containers" - commands: - - "tests/buildkite/build-containers.sh gpu_build_rockylinux8" - - "tests/buildkite/build-containers.sh gpu_dev_ver" - key: build-containers - agents: - queue: linux-amd64-cpu - - wait - - - label: ":console: Build CUDA" - command: "tests/buildkite/build-cuda.sh" - key: build-cuda - agents: - queue: linux-amd64-cpu - - wait - - label: ":console: Build CUDA + RMM Nightly" - command: "tests/buildkite/build-cuda-with-rmm.sh dev" - key: build-cuda-rmm-nightly - agents: - queue: linux-amd64-cpu - - wait - - label: ":console: Test Python package, single GPU" - command: "tests/buildkite/test-python-gpu.sh gpu" - key: test-python-gpu - agents: - queue: linux-amd64-gpu - - label: ":console: Test Python package, 4 GPUs" - command: "tests/buildkite/test-python-gpu.sh mgpu" - key: test-python-mgpu - agents: - queue: linux-amd64-mgpu diff --git a/tests/buildkite/pipeline-win64.yml b/tests/buildkite/pipeline-win64.yml deleted file mode 100644 index 83a61981e716..000000000000 --- a/tests/buildkite/pipeline-win64.yml +++ /dev/null @@ -1,24 +0,0 @@ -steps: - - label: ":moneybag: Enforce daily budget" - command: "tests/buildkite/enforce_daily_budget.sh" - key: enforce-daily-budget - agents: - queue: pipeline-loader - - wait - - block: ":rocket: Run this test job" - if: build.pull_request.id != null || build.branch =~ /^dependabot\// - #### -------- BUILD -------- - - label: ":windows: Build XGBoost for Windows with CUDA" - command: "tests/buildkite/build-win64-gpu.ps1" - key: build-win64-gpu - agents: - queue: windows-cpu - - - wait - - #### -------- TEST -------- - - label: ":windows: Test XGBoost on Windows" - command: "tests/buildkite/test-win64-gpu.ps1" - key: test-win64-gpu - agents: - queue: windows-gpu diff --git a/tests/buildkite/pipeline.yml b/tests/buildkite/pipeline.yml deleted file mode 100644 index 6c1df33b84dd..000000000000 --- a/tests/buildkite/pipeline.yml +++ /dev/null @@ -1,113 +0,0 @@ -env: - DOCKER_CACHE_ECR_ID: "492475357299" - DOCKER_CACHE_ECR_REGION: "us-west-2" -steps: - - label: ":moneybag: Enforce daily budget" - command: "tests/buildkite/enforce_daily_budget.sh" - key: enforce-daily-budget - agents: - queue: 
pipeline-loader - - wait - - block: ":rocket: Run this test job" - if: build.pull_request.id != null || build.branch =~ /^dependabot\// - #### -------- CONTAINER BUILD -------- - - label: ":docker: Build containers" - commands: - - "tests/buildkite/build-containers.sh cpu" - - "tests/buildkite/build-containers.sh gpu" - - "tests/buildkite/build-containers.sh gpu_build_rockylinux8" - key: build-containers - agents: - queue: linux-amd64-cpu - - wait - #### -------- BUILD -------- - - label: ":console: Run clang-tidy" - command: "tests/buildkite/run-clang-tidy.sh" - key: run-clang-tidy - agents: - queue: linux-amd64-cpu - - label: ":console: Build CPU" - command: "tests/buildkite/build-cpu.sh" - key: build-cpu - agents: - queue: linux-amd64-cpu - - label: ":console: Build CPU ARM64 + manylinux_2_28_aarch64 wheel" - command: "tests/buildkite/build-cpu-arm64.sh" - key: build-cpu-arm64 - agents: - queue: linux-arm64-cpu - - label: ":console: Build CUDA + manylinux_2_28_x86_64 wheel" - command: "tests/buildkite/build-cuda.sh" - key: build-cuda - agents: - queue: linux-amd64-cpu - - label: ":console: Build CUDA with RMM" - command: "tests/buildkite/build-cuda-with-rmm.sh stable" - key: build-cuda-with-rmm - agents: - queue: linux-amd64-cpu - - label: ":console: Build R package with CUDA" - command: "tests/buildkite/build-gpu-rpkg.sh" - key: build-gpu-rpkg - agents: - queue: linux-amd64-cpu - - label: ":console: Build JVM packages" - timeout_in_minutes: 30 - command: "tests/buildkite/build-jvm-packages.sh" - key: build-jvm-packages - agents: - queue: linux-amd64-cpu - - label: ":console: Build libxgboost4j.so for Linux ARM64 (targeting glibc 2.17)" - command: "tests/buildkite/build-jvm-linux-arm64-manylinux2014.sh" - key: build-jvm-linux-arm64-manylinux2014 - agents: - queue: linux-arm64-cpu - - label: ":console: Build libxgboost4j.so for Linux x86_64 (targeting glibc 2.17)" - command: "tests/buildkite/build-jvm-linux-x86_64-manylinux2014.sh" - key: build-jvm-linux-x86_64-manylinux2014 - agents: - queue: linux-amd64-cpu - - label: ":console: Build JVM package doc" - command: "tests/buildkite/build-jvm-doc.sh" - key: build-jvm-doc - agents: - queue: linux-amd64-cpu - - label: ":console: Build manylinux2014_x86_64 wheel" - command: "tests/buildkite/build-manylinux2014.sh x86_64" - key: build-manylinux2014-x86_64 - agents: - queue: linux-amd64-cpu - - label: ":console: Build manylinux2014_aarch64 wheel" - command: "tests/buildkite/build-manylinux2014.sh aarch64" - key: build-manylinux2014-aarch64 - agents: - queue: linux-arm64-cpu - - wait - #### -------- TEST -------- - - label: ":console: Test Python package, CPU" - command: "tests/buildkite/test-python-cpu.sh" - key: test-python-cpu - agents: - queue: linux-amd64-cpu - - label: ":console: Test Python package, CPU ARM64" - command: "tests/buildkite/test-python-cpu-arm64.sh" - key: test-python-cpu-arm64 - agents: - queue: linux-arm64-cpu - - label: ":console: Test Python package, single GPU" - command: "tests/buildkite/test-python-gpu.sh gpu" - key: test-python-gpu - agents: - queue: linux-amd64-gpu - - label: ":console: Run Google Tests" - command: "tests/buildkite/test-cpp-gpu.sh" - key: test-cpp-gpu - agents: - queue: linux-amd64-gpu - - wait - #### -------- DEPLOY JVM -------- - - label: ":console: Deploy JVM packages" - command: "tests/buildkite/deploy-jvm-packages.sh" - key: deploy-jvm-packages - agents: - queue: linux-amd64-cpu From 07fa2a5dc447904d16297b0664e5db436373a351 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Mon, 2 Dec 2024 
19:36:58 -0800 Subject: [PATCH 02/12] Add missing files --- ops/pipeline/classify-git-branch.sh | 25 +++++++++++++++++++++++++ ops/pipeline/enforce-ci.sh | 21 +++++++++++++++++++++ 2 files changed, 46 insertions(+) create mode 100755 ops/pipeline/classify-git-branch.sh create mode 100755 ops/pipeline/enforce-ci.sh diff --git a/ops/pipeline/classify-git-branch.sh b/ops/pipeline/classify-git-branch.sh new file mode 100755 index 000000000000..3d9a2348f23e --- /dev/null +++ b/ops/pipeline/classify-git-branch.sh @@ -0,0 +1,25 @@ +#!/bin/bash +## Detect whether the current git branch is a pull request or a release branch + +set -euo pipefail + +if [[ -n ${GITHUB_BASE_REF:-} ]] +then + is_pull_request=1 +else + is_pull_request=0 +fi + +if [[ ${BRANCH_NAME:-} == "master" || ${BRANCH_NAME:-} == "release_"* || ${BRANCH_NAME:-} == "federated-secure" ]] +then + is_release_branch=1 + enforce_daily_budget=0 +else + is_release_branch=0 + enforce_daily_budget=1 +fi + +if [[ -n ${DISABLE_RELEASE:-} ]] +then + is_release_branch=0 +fi diff --git a/ops/pipeline/enforce-ci.sh b/ops/pipeline/enforce-ci.sh new file mode 100755 index 000000000000..1e853a5ea266 --- /dev/null +++ b/ops/pipeline/enforce-ci.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +## Ensure that a script is running inside the CI. +## Usage: source ops/pipeline/enforce-ci.sh + +set -euo pipefail + +if [[ -z ${GITHUB_ACTION:-} ]] +then + echo "$0 is not meant to run locally; it should run inside GitHub Actions." + echo "Please inspect the content of $0 and locate the desired command manually." + exit 1 +fi + +if [[ -z ${BRANCH_NAME:-} ]] +then + echo "Make sure to define environment variable BRANCH_NAME." + exit 2 +fi + +source ops/pipeline/classify-git-branch.sh From 4df47bb3bfa863c0400286efac2c490ff08c19be Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Mon, 2 Dec 2024 19:49:41 -0800 Subject: [PATCH 03/12] Fix permission --- ops/docker/entrypoint.sh | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 ops/docker/entrypoint.sh diff --git a/ops/docker/entrypoint.sh b/ops/docker/entrypoint.sh old mode 100644 new mode 100755 From 9b0b399128a83f5d4cfcb8afec25452578eaa02f Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Mon, 2 Dec 2024 20:12:36 -0800 Subject: [PATCH 04/12] Add missing files --- ops/script/build_via_cmake.sh | 54 ++++++++++++++++++++++++++++ ops/script/format_wheel_meta.py | 59 +++++++++++++++++++++++++++++++ ops/script/rename_whl.py | 62 +++++++++++++++++++++++++++++++++ 3 files changed, 175 insertions(+) create mode 100755 ops/script/build_via_cmake.sh create mode 100644 ops/script/format_wheel_meta.py create mode 100644 ops/script/rename_whl.py diff --git a/ops/script/build_via_cmake.sh b/ops/script/build_via_cmake.sh new file mode 100755 index 000000000000..00a571584ea4 --- /dev/null +++ b/ops/script/build_via_cmake.sh @@ -0,0 +1,54 @@ +#!/bin/bash + +set -euo pipefail + +if [[ "$#" -lt 1 ]] +then + conda_env="" +else + conda_env="$1" +fi + +if [[ "${conda_env}" == --conda-env=* ]] +then + conda_env=$(echo "${conda_env}" | sed 's/^--conda-env=//g' -) + echo "Activating Conda environment ${conda_env}" + shift 1 + cmake_args="$@" + + # Workaround for file permission error + if [[ -n ${CI_BUILD_UID:-} ]] + then + gosu root chown -R "${CI_BUILD_UID}:${CI_BUILD_GID}" /opt/miniforge/envs + fi + + # Don't activate Conda env if it's already activated + if [[ -z ${CONDA_PREFIX:-} ]] + then + source activate ${conda_env} + fi + cmake_prefix_flag="-DCMAKE_PREFIX_PATH=$CONDA_PREFIX" +else + cmake_args="$@" + 
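+  # No Conda environment requested: build with the default toolchain and
+  # leave the CMake prefix flag empty.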
cmake_prefix_flag='' +fi + +rm -rf build +mkdir build +cd build +# Disable CMAKE_COMPILE_WARNING_AS_ERROR option temporarily until +# https://github.com/dmlc/xgboost/issues/10400 is fixed +set -x +cmake .. ${cmake_args} \ + -DGOOGLE_TEST=ON \ + -DUSE_DMLC_GTEST=ON \ + -DENABLE_ALL_WARNINGS=ON \ + -DCMAKE_COMPILE_WARNING_AS_ERROR=OFF \ + -GNinja \ + ${cmake_prefix_flag} \ + -DHIDE_CXX_SYMBOLS=ON \ + -DBUILD_DEPRECATED_CLI=ON +ninja clean +time ninja -v +cd .. +set +x diff --git a/ops/script/format_wheel_meta.py b/ops/script/format_wheel_meta.py new file mode 100644 index 000000000000..a7def879905e --- /dev/null +++ b/ops/script/format_wheel_meta.py @@ -0,0 +1,59 @@ +""" +Script to generate meta.json to store metadata for a nightly build of +XGBoost Python package. +""" + +import argparse +import json +import pathlib + + +def main(args: argparse.Namespace) -> None: + wheel_path = pathlib.Path(args.wheel_path).expanduser().resolve() + if not wheel_path.exists(): + raise ValueError(f"Wheel cannot be found at path {wheel_path}") + if not wheel_path.is_file(): + raise ValueError(f"Path {wheel_path} is not a valid file") + wheel_name = wheel_path.name + + meta_path = pathlib.Path(args.meta_path) + if not meta_path.exists(): + raise ValueError(f"Path {meta_path} does not exist") + if not meta_path.is_dir(): + raise ValueError(f"Path {meta_path} is not a valid directory") + + tokens = wheel_name.split("-") + assert len(tokens) == 5 + version = tokens[1].split("+")[0] + + meta_info = { + "wheel_name": wheel_name, + "platform_tag": args.platform_tag, + "version": version, + "commit_id": args.commit_hash, + } + with open(meta_path / "meta.json", "w") as f: + json.dump(meta_info, f, indent=4) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Format meta.json encoding the latest nightly version of the Python wheel" + ) + parser.add_argument( + "--wheel-path", type=str, required=True, help="Path to the wheel" + ) + parser.add_argument( + "--commit-hash", type=str, required=True, help="Git commit hash" + ) + parser.add_argument( + "--platform-tag", + type=str, + required=True, + help="Platform tag (e.g. 
manylinux_2_28_x86_64)", + ) + parser.add_argument( + "--meta-path", type=str, required=True, help="Directory to place meta.json" + ) + parsed_args = parser.parse_args() + main(parsed_args) diff --git a/ops/script/rename_whl.py b/ops/script/rename_whl.py new file mode 100644 index 000000000000..d4467720c738 --- /dev/null +++ b/ops/script/rename_whl.py @@ -0,0 +1,62 @@ +import argparse +import pathlib + + +def main(args: argparse.Namespace) -> None: + wheel_path = pathlib.Path(args.wheel_path).expanduser().resolve() + if not wheel_path.exists(): + raise ValueError(f"Wheel cannot be found at path {wheel_path}") + if not wheel_path.is_file(): + raise ValueError(f"Path {wheel_path} is not a valid file") + wheel_dir, wheel_name = wheel_path.parent, wheel_path.name + + tokens = wheel_name.split("-") + assert len(tokens) == 5 + version = tokens[1].split("+")[0] + keywords = { + "pkg_name": tokens[0], + "version": version, + "commit_id": args.commit_hash, + "platform_tag": args.platform_tag, + } + new_wheel_name = ( + "{pkg_name}-{version}+{commit_id}-py3-none-{platform_tag}.whl".format( + **keywords + ) + ) + new_wheel_path = wheel_dir / new_wheel_name + print(f"Renaming {wheel_name} to {new_wheel_name}...") + if new_wheel_name == wheel_name: + print("Skipping, as the old name is identical to the new name.") + else: + if new_wheel_path.is_file(): + new_wheel_path.unlink() + wheel_path.rename(new_wheel_path) + + filesize = new_wheel_path.stat().st_size / 1024 / 1024 # MiB + print(f"Wheel size: {filesize:.2f} MiB") + + if filesize > 300: + raise RuntimeError( + f"Limit of wheel size set by PyPI is exceeded. {new_wheel_name}: {filesize:.2f} MiB" + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Format a Python wheel's name using the git commit hash and platform tag" + ) + parser.add_argument( + "--wheel-path", type=str, required=True, help="Path to the wheel" + ) + parser.add_argument( + "--commit-hash", type=str, required=True, help="Git commit hash" + ) + parser.add_argument( + "--platform-tag", + type=str, + required=True, + help="Platform tag (e.g. 
manylinux_2_28_x86_64)", + ) + parsed_args = parser.parse_args() + main(parsed_args) From f26c0b92542f67dba452d75b5319c376f1b863c9 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Sat, 7 Dec 2024 20:26:27 -0800 Subject: [PATCH 05/12] Move container build to xgboost-devops --- .github/workflows/main.yml | 38 +---- ops/docker/ci_container.yml | 25 --- ops/docker/docker_cache_ecr.yml | 4 - ops/docker/dockerfile/Dockerfile.gpu | 54 ------- .../Dockerfile.gpu_build_rockylinux8 | 82 ---------- .../Dockerfile.manylinux_2_28_x86_64 | 15 -- ops/docker/entrypoint.sh | 45 ------ ops/docker/extract_build_args.jq | 12 -- ops/docker/extract_build_args.sh | 26 --- ops/docker_build.py | 137 ---------------- ops/docker_build.sh | 149 ------------------ ops/docker_run.py | 31 ++-- ops/pipeline/build-cuda.sh | 27 ++-- ops/pipeline/get-docker-registry-details.sh | 5 + ops/pipeline/login-docker-registry.sh | 11 ++ ops/pipeline/test-cpp-gpu.sh | 22 ++- ops/pipeline/test-python-wheel.sh | 5 +- 17 files changed, 67 insertions(+), 621 deletions(-) delete mode 100644 ops/docker/ci_container.yml delete mode 100644 ops/docker/docker_cache_ecr.yml delete mode 100644 ops/docker/dockerfile/Dockerfile.gpu delete mode 100644 ops/docker/dockerfile/Dockerfile.gpu_build_rockylinux8 delete mode 100644 ops/docker/dockerfile/Dockerfile.manylinux_2_28_x86_64 delete mode 100755 ops/docker/entrypoint.sh delete mode 100644 ops/docker/extract_build_args.jq delete mode 100755 ops/docker/extract_build_args.sh delete mode 100644 ops/docker_build.py delete mode 100755 ops/docker_build.sh create mode 100755 ops/pipeline/get-docker-registry-details.sh create mode 100755 ops/pipeline/login-docker-registry.sh diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index e2b50019c54d..81602602b517 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -15,32 +15,8 @@ env: USE_DOCKER_CACHE: 1 jobs: - build-containers: - name: Build CI containers (${{ matrix.container_id }}) - runs-on: - - runs-on - - runner=${{ matrix.runner }} - - run-id=${{ github.run_id }} - - tag=main-build-containers-${{ matrix.container_id }} - strategy: - matrix: - container_id: - - xgb-ci.gpu_build_rockylinux8 - - xgb-ci.gpu - - xgb-ci.manylinux_2_28_x86_64 - runner: [linux-amd64-cpu] - steps: - # Restart Docker daemon so that it recognizes the ephemeral disks - - run: sudo systemctl restart docker - - uses: actions/checkout@v4 - with: - submodules: "true" - - name: Build ${{ matrix.container_id }} - run: bash ops/docker_build.sh ${{ matrix.container_id }} - build-cuda: name: Build CUDA + manylinux_2_28_x86_64 wheel - needs: build-containers runs-on: - runs-on=${{ github.run_id }} - runner=linux-amd64-cpu @@ -51,10 +27,8 @@ jobs: - uses: actions/checkout@v4 with: submodules: "true" - - name: Fetch container from cache - run: bash ops/docker_build.sh xgb-ci.gpu_build_rockylinux8 - - name: Fetch container from cache - run: bash ops/docker_build.sh xgb-ci.manylinux_2_28_x86_64 + - name: Log into Docker registry (AWS ECR) + run: bash ops/pipeline/login-docker-registry.sh - run: bash ops/pipeline/build-cuda.sh - name: Stash files run: | @@ -87,8 +61,8 @@ jobs: - uses: actions/checkout@v4 with: submodules: "true" - - name: Fetch container from cache - run: bash ops/docker_build.sh xgb-ci.gpu + - name: Log into Docker registry (AWS ECR) + run: bash ops/pipeline/login-docker-registry.sh - name: Unstash gtest run: | bash ops/pipeline/stash-artifacts.sh unstash ${{ matrix.artifact_from }} \ @@ -124,8 +98,8 @@ jobs: - uses: actions/checkout@v4 
with: submodules: "true" - - name: Fetch container from cache - run: bash ops/docker_build.sh ${{ matrix.container }} + - name: Log into Docker registry (AWS ECR) + run: bash ops/pipeline/login-docker-registry.sh - name: Unstash Python wheel run: | bash ops/pipeline/stash-artifacts.sh unstash ${{ matrix.artifact_from }} \ diff --git a/ops/docker/ci_container.yml b/ops/docker/ci_container.yml deleted file mode 100644 index df0402293ea2..000000000000 --- a/ops/docker/ci_container.yml +++ /dev/null @@ -1,25 +0,0 @@ -## List of CI containers with definitions and build arguments - -# Each container will be built using the definition from -# ops/docker/dockerfile/Dockerfile.CONTAINER_DEF - -rapids_versions: - stable: &rapids_version "24.10" - dev: &dev_rapids_version "24.12" - -xgb-ci.gpu_build_rockylinux8: - container_def: gpu_build_rockylinux8 - build_args: - CUDA_VERSION_ARG: "12.4.1" - NCCL_VERSION_ARG: "2.23.4-1" - RAPIDS_VERSION_ARG: *rapids_version - -xgb-ci.gpu: - container_def: gpu - build_args: - CUDA_VERSION_ARG: "12.4.1" - NCCL_VERSION_ARG: "2.23.4-1" - RAPIDS_VERSION_ARG: *rapids_version - -xgb-ci.manylinux_2_28_x86_64: - container_def: manylinux_2_28_x86_64 diff --git a/ops/docker/docker_cache_ecr.yml b/ops/docker/docker_cache_ecr.yml deleted file mode 100644 index e20f35fc8020..000000000000 --- a/ops/docker/docker_cache_ecr.yml +++ /dev/null @@ -1,4 +0,0 @@ -## Constants for AWS ECR (Elastic Container Registry), used for the Docker cache - -DOCKER_CACHE_ECR_ID: "492475357299" -DOCKER_CACHE_ECR_REGION: "us-west-2" diff --git a/ops/docker/dockerfile/Dockerfile.gpu b/ops/docker/dockerfile/Dockerfile.gpu deleted file mode 100644 index 96a532fc2ff1..000000000000 --- a/ops/docker/dockerfile/Dockerfile.gpu +++ /dev/null @@ -1,54 +0,0 @@ -ARG CUDA_VERSION_ARG=notset -FROM nvidia/cuda:$CUDA_VERSION_ARG-runtime-ubuntu22.04 -ARG CUDA_VERSION_ARG -ARG RAPIDS_VERSION_ARG - # Should be first 4 digits (e.g. 
24.06) -ARG NCCL_VERSION_ARG -ARG RAPIDSAI_CONDA_CHANNEL_ARG="rapidsai" - -# Environment -ENV DEBIAN_FRONTEND=noninteractive -SHELL ["/bin/bash", "-c"] - -# Install all basic requirements -RUN \ - apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/3bf863cc.pub && \ - apt-get update && \ - apt-get install -y wget unzip bzip2 libgomp1 build-essential openjdk-8-jdk-headless && \ - apt-get install libnccl2 libnccl-dev -y --allow-change-held-packages && \ - # Python - wget -nv -O conda.sh https://github.com/conda-forge/miniforge/releases/download/24.3.0-0/Miniforge3-24.3.0-0-Linux-x86_64.sh && \ - bash conda.sh -b -p /opt/miniforge - -ENV PATH=/opt/miniforge/bin:$PATH - -# Create new Conda environment with cuDF, Dask, and cuPy -RUN \ - export NCCL_SHORT_VER=$(echo "$NCCL_VERSION_ARG" | cut -d "-" -f 1) && \ - export CUDA_SHORT_VER=$(echo "$CUDA_VERSION_ARG" | grep -o -E '[0-9]+\.[0-9]') && \ - mamba create -y -n gpu_test -c ${RAPIDSAI_CONDA_CHANNEL_ARG} -c conda-forge -c nvidia \ - python=3.10 "cudf=$RAPIDS_VERSION_ARG.*" "rmm=$RAPIDS_VERSION_ARG.*" cuda-version=$CUDA_SHORT_VER \ - "nccl>=${NCCL_SHORT_VER}" \ - "dask<=2024.10.0" \ - "distributed<=2024.10.0" \ - "dask-cuda=$RAPIDS_VERSION_ARG.*" "dask-cudf=$RAPIDS_VERSION_ARG.*" cupy \ - numpy pytest pytest-timeout scipy scikit-learn pandas matplotlib wheel \ - python-kubernetes urllib3 graphviz hypothesis loky \ - "pyspark>=3.4.0" cloudpickle cuda-python && \ - mamba clean --all --yes - -ENV GOSU_VERSION=1.10 -ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/ - -# Install lightweight sudo (not bound to TTY) -RUN set -ex; \ - wget -nv -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \ - chmod +x /usr/local/bin/gosu && \ - gosu nobody true - -# Default entry-point to use if running locally -# It will preserve attributes of created files -COPY docker/entrypoint.sh /scripts/ - -WORKDIR /workspace -ENTRYPOINT ["/scripts/entrypoint.sh"] diff --git a/ops/docker/dockerfile/Dockerfile.gpu_build_rockylinux8 b/ops/docker/dockerfile/Dockerfile.gpu_build_rockylinux8 deleted file mode 100644 index b686bfbb2b0d..000000000000 --- a/ops/docker/dockerfile/Dockerfile.gpu_build_rockylinux8 +++ /dev/null @@ -1,82 +0,0 @@ -ARG CUDA_VERSION_ARG=notset -FROM nvcr.io/nvidia/cuda:$CUDA_VERSION_ARG-devel-rockylinux8 -ARG CUDA_VERSION_ARG -ARG NCCL_VERSION_ARG -ARG RAPIDS_VERSION_ARG - -# Install all basic requirements -RUN \ - curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/D42D0685.pub | sed '/^Version/d' \ - > /etc/pki/rpm-gpg/RPM-GPG-KEY-NVIDIA && \ - dnf -y update && \ - dnf -y install dnf-plugins-core && \ - dnf config-manager --set-enabled powertools && \ - dnf install -y tar unzip wget xz git which ninja-build gcc-toolset-10-gcc gcc-toolset-10-binutils gcc-toolset-10-gcc-c++ && \ - # Python - wget -nv -O conda.sh https://github.com/conda-forge/miniforge/releases/download/24.3.0-0/Miniforge3-24.3.0-0-Linux-x86_64.sh && \ - bash conda.sh -b -p /opt/miniforge && \ - /opt/miniforge/bin/python -m pip install awscli && \ - # CMake - wget -nv -nc https://cmake.org/files/v3.29/cmake-3.29.5-linux-x86_64.sh --no-check-certificate && \ - bash cmake-3.29.5-linux-x86_64.sh --skip-license --prefix=/usr - -# NCCL2 (License: https://docs.nvidia.com/deeplearning/sdk/nccl-sla/index.html) -RUN \ - export CUDA_SHORT=`echo $CUDA_VERSION_ARG | grep -o -E '[0-9]+\.[0-9]'` && \ - export NCCL_VERSION=$NCCL_VERSION_ARG && \ - dnf 
config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo && \ - dnf -y update && \ - dnf install -y libnccl-${NCCL_VERSION}+cuda${CUDA_SHORT} libnccl-devel-${NCCL_VERSION}+cuda${CUDA_SHORT} - -ENV PATH=/opt/miniforge/bin:/usr/local/ninja:$PATH -ENV CC=/opt/rh/gcc-toolset-10/root/usr/bin/gcc -ENV CXX=/opt/rh/gcc-toolset-10/root/usr/bin/c++ -ENV CPP=/opt/rh/gcc-toolset-10/root/usr/bin/cpp -ENV CUDAHOSTCXX=/opt/rh/gcc-toolset-10/root/usr/bin/c++ - -ENV GOSU_VERSION=1.10 - -# Install gRPC -# Patch Abseil to apply https://github.com/abseil/abseil-cpp/issues/1629 -RUN git clone -b v1.65.4 https://github.com/grpc/grpc.git \ - --recurse-submodules --depth 1 && \ - pushd grpc && \ - pushd third_party/abseil-cpp && \ - git fetch origin master && \ - git cherry-pick -n cfde5f74e276049727f9556f13473a59fe77d9eb && \ - popd && \ - cmake -S . -B build -GNinja -DCMAKE_INSTALL_PREFIX=/opt/grpc -DCMAKE_CXX_VISIBILITY_PRESET=hidden && \ - cmake --build build --target install && \ - popd && \ - rm -rf grpc - -# Install RMM -# Patch out -Werror -# Patch CCCL 2.5.0 to apply https://github.com/NVIDIA/cccl/pull/1957 -RUN git clone -b branch-${RAPIDS_VERSION_ARG} https://github.com/rapidsai/rmm.git --recurse-submodules --depth 1 && \ - pushd rmm && \ - find . -name CMakeLists.txt -print0 | xargs -0 sed -i 's/-Werror//g' && \ - mkdir build && \ - pushd build && \ - cmake .. -GNinja -DCMAKE_INSTALL_PREFIX=/opt/rmm -DCUDA_STATIC_RUNTIME=ON && \ - pushd _deps/cccl-src/ && \ - git fetch origin main && \ - git cherry-pick -n 9fcb32c228865f21f2b002b29d38a06b4c6fbd73 && \ - popd && \ - cmake --build . --target install && \ - popd && \ - popd && \ - rm -rf rmm - -# Install lightweight sudo (not bound to TTY) -RUN set -ex; \ - wget -nv -nc -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \ - chmod +x /usr/local/bin/gosu && \ - gosu nobody true - -# Default entry-point to use if running locally -# It will preserve attributes of created files -COPY docker/entrypoint.sh /scripts/ - -WORKDIR /workspace -ENTRYPOINT ["/scripts/entrypoint.sh"] diff --git a/ops/docker/dockerfile/Dockerfile.manylinux_2_28_x86_64 b/ops/docker/dockerfile/Dockerfile.manylinux_2_28_x86_64 deleted file mode 100644 index f5dac54b9b8f..000000000000 --- a/ops/docker/dockerfile/Dockerfile.manylinux_2_28_x86_64 +++ /dev/null @@ -1,15 +0,0 @@ -FROM quay.io/pypa/manylinux_2_28_x86_64 - -# Install lightweight sudo (not bound to TTY) -ENV GOSU_VERSION=1.10 -RUN set -ex; \ - curl -o /usr/local/bin/gosu -L "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \ - chmod +x /usr/local/bin/gosu && \ - gosu nobody true - -# Default entry-point to use if running locally -# It will preserve attributes of created files -COPY docker/entrypoint.sh /scripts/ - -WORKDIR /workspace -ENTRYPOINT ["/scripts/entrypoint.sh"] diff --git a/ops/docker/entrypoint.sh b/ops/docker/entrypoint.sh deleted file mode 100755 index 40135c197c73..000000000000 --- a/ops/docker/entrypoint.sh +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/env bash - -# This wrapper script propagates the user information from the host -# to the container. This way, any files generated by processes running -# in the container will be accessible in the host. - -set -euo pipefail - -COMMAND=("$@") - -if ! touch /this_is_writable_file_system; then - echo "You can't write to your filesystem!" 
- echo "If you are in Docker you should check you do not have too many images" \ - "with too many files in them. Docker has some issue with it." - exit 1 -else - rm /this_is_writable_file_system -fi - -## Assumption: the host passes correct user information via environment variables -## CI_BUILD_UID, CI_BUILD_GID, CI_BUILD_USER, CI_BUILD_GROUP - -if [[ -n ${CI_BUILD_UID:-} ]] && [[ -n ${CI_BUILD_GID:-} ]] -then - groupadd -o -g "${CI_BUILD_GID}" "${CI_BUILD_GROUP}" || true - useradd -o -m -g "${CI_BUILD_GID}" -u "${CI_BUILD_UID}" \ - "${CI_BUILD_USER}" || true - export HOME="/home/${CI_BUILD_USER}" - shopt -s dotglob - cp -r /root/* "$HOME/" - chown -R "${CI_BUILD_UID}:${CI_BUILD_GID}" "$HOME" - - # Allows project-specific customization - if [[ -e "/workspace/.pre_entry.sh" ]]; then - gosu "${CI_BUILD_UID}:${CI_BUILD_GID}" /workspace/.pre_entry.sh - fi - - # Enable passwordless sudo capabilities for the user - chown root:"${CI_BUILD_GID}" "$(which gosu)" - chmod +s "$(which gosu)"; sync - - exec gosu "${CI_BUILD_UID}:${CI_BUILD_GID}" "${COMMAND[@]}" -else - exec "${COMMAND[@]}" -fi diff --git a/ops/docker/extract_build_args.jq b/ops/docker/extract_build_args.jq deleted file mode 100644 index b35240edb626..000000000000 --- a/ops/docker/extract_build_args.jq +++ /dev/null @@ -1,12 +0,0 @@ -## Example input: -## xgb-ci.gpu_build_r_rockylinux8 -## Example output: -## --build-arg CUDA_VERSION_ARG=12.4.1 --build-arg R_VERSION_ARG=4.3.2 -def compute_build_args($input; $container_id): - $input | - .[$container_id] | - select(.build_args != null) | - .build_args | - to_entries | - map("--build-arg " + .key + "=" + .value) | - join(" "); diff --git a/ops/docker/extract_build_args.sh b/ops/docker/extract_build_args.sh deleted file mode 100755 index 42a83047742c..000000000000 --- a/ops/docker/extract_build_args.sh +++ /dev/null @@ -1,26 +0,0 @@ -#!/bin/bash -## Extract container definition and build args from ops/docker/ci_container.yml, -## given the container ID. -## -## Example input: -## xgb-ci.clang_tidy -## Example output: -## CONTAINER_DEF='clang_tidy' BUILD_ARGS='--build-arg CUDA_VERSION_ARG=12.4.1' - -if [ "$#" -ne 1 ]; then - echo "Usage: $0 [container_id]" - exit 1 -fi - -CONTAINER_ID="$1" -CONTAINER_DEF=$( - yq -o json ops/docker/ci_container.yml | - jq -r --arg container_id "${CONTAINER_ID}" '.[$container_id].container_def' -) -BUILD_ARGS=$( - yq -o json ops/docker/ci_container.yml | - jq -r --arg container_id "${CONTAINER_ID}" \ - 'include "ops/docker/extract_build_args"; - compute_build_args(.; $container_id)' -) -echo "CONTAINER_DEF='${CONTAINER_DEF}' BUILD_ARGS='${BUILD_ARGS}'" diff --git a/ops/docker_build.py b/ops/docker_build.py deleted file mode 100644 index 1fed975ce223..000000000000 --- a/ops/docker_build.py +++ /dev/null @@ -1,137 +0,0 @@ -""" -Wrapper script to build a Docker container with layer caching -""" - -import argparse -import itertools -import pathlib -import subprocess -import sys -from typing import Optional - -from docker_run import OPS_DIR, fancy_print_cli_args - - -def parse_build_args(raw_build_args: list[str]) -> dict[str, str]: - parsed_build_args = dict() - for arg in raw_build_args: - try: - key, value = arg.split("=", maxsplit=1) - except ValueError as e: - raise ValueError( - f"Build argument must be of form KEY=VALUE. 
Got: {arg}" - ) from e - parsed_build_args[key] = value - return parsed_build_args - - -def docker_build( - container_id: str, - *, - build_args: dict[str, str], - dockerfile_path: pathlib.Path, - docker_context_path: pathlib.Path, - cache_from: Optional[str], - cache_to: Optional[str], -) -> None: - ## Set up command-line arguments to be passed to `docker build` - # Build args - docker_build_cli_args = list( - itertools.chain.from_iterable( - [["--build-arg", f"{k}={v}"] for k, v in build_args.items()] - ) - ) - # When building an image using a non-default driver, we need to specify - # `--load` to load it to the image store. - # See https://docs.docker.com/build/builders/drivers/ - docker_build_cli_args.append("--load") - # Layer caching - if cache_from: - docker_build_cli_args.extend(["--cache-from", cache_from]) - if cache_to: - docker_build_cli_args.extend(["--cache-to", cache_to]) - # Remaining CLI args - docker_build_cli_args.extend( - [ - "--progress=plain", - "--ulimit", - "nofile=1024000:1024000", - "-t", - container_id, - "-f", - str(dockerfile_path), - str(docker_context_path), - ] - ) - cli_args = ["docker", "build"] + docker_build_cli_args - fancy_print_cli_args(cli_args) - subprocess.run(cli_args, check=True, encoding="utf-8") - - -def main(args: argparse.Namespace) -> None: - # Dockerfile to be used in docker build - dockerfile_path = ( - OPS_DIR / "docker" / "dockerfile" / f"Dockerfile.{args.container_def}" - ) - docker_context_path = OPS_DIR - - build_args = parse_build_args(args.build_arg) - - docker_build( - args.container_id, - build_args=build_args, - dockerfile_path=dockerfile_path, - docker_context_path=docker_context_path, - cache_from=args.cache_from, - cache_to=args.cache_to, - ) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Build a Docker container") - parser.add_argument( - "--container-def", - type=str, - required=True, - help=( - "String uniquely identifying the container definition. The container " - "definition will be fetched from " - "docker/dockerfile/Dockerfile.CONTAINER_DEF." - ), - ) - parser.add_argument( - "--container-id", - type=str, - required=True, - help="String ID to assign to the newly built container", - ) - parser.add_argument( - "--build-arg", - type=str, - default=[], - action="append", - help=( - "Build-time variable(s) to be passed to `docker build`. Each variable " - "should be specified as a key-value pair in the form KEY=VALUE. " - "The variables should match the ARG instructions in the Dockerfile. " - "When passing multiple variables, specify --build-arg multiple times. " - "Example: --build-arg CUDA_VERSION_ARG=12.5 --build-arg RAPIDS_VERSION_ARG=24.10'" - ), - ) - parser.add_argument( - "--cache-from", - type=str, - help="Use an external cache source for the Docker build", - ) - parser.add_argument( - "--cache-to", - type=str, - help="Export layers from the container to an external cache destination", - ) - - if len(sys.argv) == 1: - parser.print_help() - sys.exit(1) - - parsed_args = parser.parse_args() - main(parsed_args) diff --git a/ops/docker_build.sh b/ops/docker_build.sh deleted file mode 100755 index 7d83daec9574..000000000000 --- a/ops/docker_build.sh +++ /dev/null @@ -1,149 +0,0 @@ -#!/bin/bash -## Build a CI container and cache the layers in AWS ECR (Elastic Container Registry). -## This script provides a convenient wrapper for ops/docker_build.py. -## Build-time variables (--build-arg) and container defintion are fetched from -## ops/docker/ci_container.yml. -## -## Note. 
This script takes in some inputs via environment variables. - -USAGE_DOC=$( -cat <<-EOF -Usage: ops/docker_build.sh [container_id] - -In addition, the following environment variables should be set. - - BRANCH_NAME: Name of the current git branch or pull request (Required) - - USE_DOCKER_CACHE: If set to 1, enable caching -EOF -) - -ECR_LIFECYCLE_RULE=$( -cat <<-EOF -{ - "rules": [ - { - "rulePriority": 1, - "selection": { - "tagStatus": "any", - "countType": "sinceImagePushed", - "countUnit": "days", - "countNumber": 30 - }, - "action": { - "type": "expire" - } - } - ] -} -EOF -) - -set -euo pipefail - -for arg in "BRANCH_NAME" -do - if [[ -z "${!arg:-}" ]] - then - echo -e "Error: $arg must be set.\n\n${USAGE_DOC}" - exit 1 - fi -done - -if [[ "$#" -lt 1 ]] -then - echo "${USAGE_DOC}" - exit 2 -fi -CONTAINER_ID="$1" - -# Fetch CONTAINER_DEF and BUILD_ARGS -source <(ops/docker/extract_build_args.sh ${CONTAINER_ID} | tee /dev/stderr) 2>&1 - -if [[ "${USE_DOCKER_CACHE:-}" != "1" ]] # Any value other than 1 is considered false -then - USE_DOCKER_CACHE=0 -fi - -if [[ ${USE_DOCKER_CACHE} -eq 0 ]] -then - echo "USE_DOCKER_CACHE not set; caching disabled" -else - DOCKER_CACHE_ECR_ID=$(yq ".DOCKER_CACHE_ECR_ID" ops/docker/docker_cache_ecr.yml) - DOCKER_CACHE_ECR_REGION=$(yq ".DOCKER_CACHE_ECR_REGION" ops/docker/docker_cache_ecr.yml) - DOCKER_CACHE_REPO="${DOCKER_CACHE_ECR_ID}.dkr.ecr.${DOCKER_CACHE_ECR_REGION}.amazonaws.com" - echo "Using AWS ECR; repo URL = ${DOCKER_CACHE_REPO}" - # Login for Docker registry - echo "aws ecr get-login-password --region ${DOCKER_CACHE_ECR_REGION} |" \ - "docker login --username AWS --password-stdin ${DOCKER_CACHE_REPO}" - aws ecr get-login-password --region ${DOCKER_CACHE_ECR_REGION} \ - | docker login --username AWS --password-stdin ${DOCKER_CACHE_REPO} -fi - -# Pull pre-built container from the cache -# First try locating one for the particular branch or pull request -CACHE_FROM_CMD="" -IS_CACHED=0 -if [[ ${USE_DOCKER_CACHE} -eq 1 ]] -then - DOCKER_TAG="${BRANCH_NAME//\//-}" # Slashes are not allowed in Docker tag - DOCKER_URL="${DOCKER_CACHE_REPO}/${CONTAINER_ID}:${DOCKER_TAG}" - echo "docker pull --quiet ${DOCKER_URL}" - if time docker pull --quiet "${DOCKER_URL}" - then - echo "Found a cached container for the branch ${BRANCH_NAME}: ${DOCKER_URL}" - IS_CACHED=1 - else - # If there's no pre-built container from the cache, - # use the pre-built container from the master branch. - DOCKER_URL="${DOCKER_CACHE_REPO}/${CONTAINER_ID}:master" - echo "Could not find a cached container for the branch ${BRANCH_NAME}." \ - "Using a cached container from the master branch: ${DOCKER_URL}" - echo "docker pull --quiet ${DOCKER_URL}" - if time docker pull --quiet "${DOCKER_URL}" - then - IS_CACHED=1 - else - echo "Could not find a cached container for the master branch either." 
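-      # Build from scratch: CACHE_FROM_CMD stays empty, so `docker build` will
-      # run without --cache-from. The freshly built layers are still pushed to
-      # ECR at the end of this script, seeding the cache for later builds.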
- IS_CACHED=0 - fi - fi - if [[ $IS_CACHED -eq 1 ]] - then - CACHE_FROM_CMD="--cache-from type=registry,ref=${DOCKER_URL}" - fi -fi - -# Run Docker build -set -x -python3 ops/docker_build.py \ - --container-def ${CONTAINER_DEF} \ - --container-id ${CONTAINER_ID} \ - ${BUILD_ARGS} \ - --cache-to type=inline \ - ${CACHE_FROM_CMD} -set +x - -# Now cache the new container -if [[ ${USE_DOCKER_CACHE} -eq 1 ]] -then - DOCKER_URL="${DOCKER_CACHE_REPO}/${CONTAINER_ID}:${DOCKER_TAG}" - echo "docker tag ${CONTAINER_ID} ${DOCKER_URL}" - docker tag "${CONTAINER_ID}" "${DOCKER_URL}" - - # Attempt to create Docker repository; it will fail if the repository already exists - echo "aws ecr create-repository --repository-name ${CONTAINER_ID} --region ${DOCKER_CACHE_ECR_REGION}" - if aws ecr create-repository --repository-name ${CONTAINER_ID} --region ${DOCKER_CACHE_ECR_REGION} - then - # Repository was created. Now set expiration policy - echo "aws ecr put-lifecycle-policy --repository-name ${CONTAINER_ID}" \ - "--region ${DOCKER_CACHE_ECR_REGION} --lifecycle-policy-text file:///dev/stdin" - echo "${ECR_LIFECYCLE_RULE}" | aws ecr put-lifecycle-policy --repository-name ${CONTAINER_ID} \ - --region ${DOCKER_CACHE_ECR_REGION} --lifecycle-policy-text file:///dev/stdin - fi - - echo "docker push --quiet ${DOCKER_URL}" - if ! time docker push --quiet "${DOCKER_URL}" - then - echo "ERROR: could not update Docker cache ${DOCKER_URL}" - exit 1 - fi -fi diff --git a/ops/docker_run.py b/ops/docker_run.py index 7e61c5a14f39..06f9d6cc8dc8 100644 --- a/ops/docker_run.py +++ b/ops/docker_run.py @@ -24,7 +24,7 @@ ) -def parse_run_args(raw_run_args: str) -> list[str]: +def parse_run_args(*, raw_run_args: str) -> list[str]: return [x for x in raw_run_args.split() if x] @@ -39,7 +39,7 @@ def get_user_ids() -> dict[str, str]: } -def fancy_print_cli_args(cli_args: list[str]) -> None: +def fancy_print_cli_args(*, cli_args: list[str]) -> None: print( "=" * LINEWIDTH + "\n" @@ -52,9 +52,9 @@ def fancy_print_cli_args(cli_args: list[str]) -> None: def docker_run( - container_id: str, - command_args: list[str], *, + container_tag: str, + command_args: list[str], use_gpus: bool, workdir: pathlib.Path, user_ids: dict[str, str], @@ -71,16 +71,16 @@ def docker_run( itertools.chain.from_iterable([["-e", f"{k}={v}"] for k, v in user_ids.items()]) ) docker_run_cli_args.extend(extra_args) - docker_run_cli_args.append(container_id) + docker_run_cli_args.append(container_tag) docker_run_cli_args.extend(command_args) cli_args = ["docker", "run"] + docker_run_cli_args - fancy_print_cli_args(cli_args) + fancy_print_cli_args(cli_args=cli_args) subprocess.run(cli_args, check=True, encoding="utf-8") -def main(args: argparse.Namespace) -> None: - run_args = parse_run_args(args.run_args) +def main(*, args: argparse.Namespace) -> None: + run_args = parse_run_args(raw_run_args=args.run_args) user_ids = get_user_ids() if args.use_gpus: @@ -90,8 +90,8 @@ def main(args: argparse.Namespace) -> None: run_args.append("-it") docker_run( - args.container_id, - args.command_args, + container_tag=args.container_tag, + command_args=args.command_args, use_gpus=args.use_gpus, workdir=args.workdir, user_ids=user_ids, @@ -102,17 +102,20 @@ def main(args: argparse.Namespace) -> None: if __name__ == "__main__": parser = argparse.ArgumentParser( usage=( - f"{sys.argv[0]} --container-id CONTAINER_ID [--use-gpus] [--interactive] " + f"{sys.argv[0]} --container-tag CONTAINER_TAG [--use-gpus] [--interactive] " "[--workdir WORKDIR] [--run-args RUN_ARGS] -- COMMAND_ARG " 
"[COMMAND_ARG ...]" ), description="Run tasks inside a Docker container", ) parser.add_argument( - "--container-id", + "--container-tag", type=str, required=True, - help="String ID of the container to run.", + help=( + "Container tag to identify the container, e.g. " + "492475357299.dkr.ecr.us-west-2.amazonaws.com/xgb-ci.gpu:main" + ), ) parser.add_argument( "--use-gpus", @@ -165,4 +168,4 @@ def main(args: argparse.Namespace) -> None: sys.exit(1) parsed_args = parser.parse_args() - main(parsed_args) + main(args=parsed_args) diff --git a/ops/pipeline/build-cuda.sh b/ops/pipeline/build-cuda.sh index 49475c01c69e..2170b8a681ac 100755 --- a/ops/pipeline/build-cuda.sh +++ b/ops/pipeline/build-cuda.sh @@ -1,7 +1,7 @@ #!/bin/bash ## Build XGBoost with CUDA -set -euox pipefail +set -euo pipefail if [[ -z "${GITHUB_SHA:-}" ]] then @@ -9,9 +9,12 @@ then exit 1 fi -WHEEL_TAG=manylinux_2_28_x86_64 - source ops/pipeline/classify-git-branch.sh +source ops/pipeline/get-docker-registry-details.sh + +WHEEL_TAG=manylinux_2_28_x86_64 +BUILD_CONTAINER_TAG=${DOCKER_REGISTRY_URL}/xgb-ci.gpu_build_rockylinux8:main +MANYLINUX_CONTAINER_TAG=${DOCKER_REGISTRY_URL}/xgb-ci.${WHEEL_TAG}:main echo "--- Build with CUDA" @@ -28,7 +31,7 @@ set -x # TODO(hcho3): Remove this once new CUDA version ships with CCCL 2.6.0+ git clone https://github.com/NVIDIA/cccl.git -b v2.6.1 --quiet python3 ops/docker_run.py \ - --container-id xgb-ci.gpu_build_rockylinux8 \ + --container-tag ${BUILD_CONTAINER_TAG} \ -- ops/script/build_via_cmake.sh \ -DCMAKE_PREFIX_PATH="/opt/grpc;/workspace/cccl" \ -DUSE_CUDA=ON \ @@ -43,7 +46,7 @@ python3 ops/docker_run.py \ echo "--- Build binary wheel" python3 ops/docker_run.py \ - --container-id xgb-ci.gpu_build_rockylinux8 \ + --container-tag ${BUILD_CONTAINER_TAG} \ -- bash -c \ "cd python-package && rm -rf dist/* && pip wheel --no-deps -v . --wheel-dir dist/" python3 ops/script/rename_whl.py \ @@ -53,7 +56,7 @@ python3 ops/script/rename_whl.py \ echo "--- Audit binary wheel to ensure it's compliant with ${WHEEL_TAG} standard" python3 ops/docker_run.py \ - --container-id xgb-ci.manylinux_2_28_x86_64 \ + --container-tag ${MANYLINUX_CONTAINER_TAG} \ -- auditwheel repair \ --plat ${WHEEL_TAG} python-package/dist/*.whl python3 ops/script/rename_whl.py \ @@ -61,15 +64,13 @@ python3 ops/script/rename_whl.py \ --commit-hash ${GITHUB_SHA} \ --platform-tag ${WHEEL_TAG} mv -v wheelhouse/*.whl python-package/dist/ -# Make sure that libgomp.so is vendored in the wheel -python3 ops/docker_run.py \ - --container-id xgb-ci.manylinux_2_28_x86_64 \ - -- bash -c "unzip -l python-package/dist/*.whl | grep libgomp || exit -1" +if ! 
unzip -l ./python-package/dist/*.whl | grep libgomp > /dev/null; then + echo "error: libgomp.so was not vendored in the wheel" + exit -1 +fi # Generate the meta info which includes xgboost version and the commit info -python3 ops/docker_run.py \ ---container-id xgb-ci.gpu_build_rockylinux8 \ --- python ops/script/format_wheel_meta.py \ +python3 ops/script/format_wheel_meta.py \ --wheel-path python-package/dist/*.whl \ --commit-hash ${GITHUB_SHA} \ --platform-tag ${WHEEL_TAG} \ diff --git a/ops/pipeline/get-docker-registry-details.sh b/ops/pipeline/get-docker-registry-details.sh new file mode 100755 index 000000000000..000db9a2655a --- /dev/null +++ b/ops/pipeline/get-docker-registry-details.sh @@ -0,0 +1,5 @@ +## Get details for AWS ECR (Elastic Container Registry) in environment variables + +ECR_AWS_ACCOUNT_ID="492475357299" +ECR_AWS_REGION="us-west-2" +DOCKER_REGISTRY_URL="${ECR_AWS_ACCOUNT_ID}.dkr.ecr.${ECR_AWS_REGION}.amazonaws.com" diff --git a/ops/pipeline/login-docker-registry.sh b/ops/pipeline/login-docker-registry.sh new file mode 100755 index 000000000000..a03987f484b8 --- /dev/null +++ b/ops/pipeline/login-docker-registry.sh @@ -0,0 +1,11 @@ +## Log into AWS ECR (Elastic Container Registry) to be able to pull containers from it +## Note. Requires valid AWS credentials + +set -euo pipefail + +source ops/pipeline/get-docker-registry-details.sh + +echo "aws ecr get-login-password --region ${ECR_AWS_REGION} |" \ + "docker login --username AWS --password-stdin ${DOCKER_REGISTRY_URL}" +aws ecr get-login-password --region ${ECR_AWS_REGION} \ + | docker login --username AWS --password-stdin ${DOCKER_REGISTRY_URL} diff --git a/ops/pipeline/test-cpp-gpu.sh b/ops/pipeline/test-cpp-gpu.sh index 9a0cd4743c18..9fdcd314264d 100755 --- a/ops/pipeline/test-cpp-gpu.sh +++ b/ops/pipeline/test-cpp-gpu.sh @@ -7,36 +7,34 @@ then echo "Usage: $0 {gpu,gpu-rmm,mgpu}" exit 1 fi -arg=$1 +suite=$1 -case "${arg}" in +source ops/pipeline/get-docker-registry-details.sh + +CONTAINER_TAG=${DOCKER_REGISTRY_URL}/xgb-ci.gpu:main + +case "${suite}" in gpu) echo "--- Run Google Tests, using a single GPU" - python3 ops/docker_run.py --container-id xgb-ci.gpu --use-gpus \ - -- nvidia-smi - python3 ops/docker_run.py --container-id xgb-ci.gpu --use-gpus \ + python3 ops/docker_run.py --container-tag ${CONTAINER_TAG} --use-gpus \ -- build/testxgboost ;; gpu-rmm) echo "--- Run Google Tests, using a single GPU, RMM enabled" - python3 ops/docker_run.py --container-id xgb-ci.gpu --use-gpus \ - -- nvidia-smi - python3 ops/docker_run.py --container-id xgb-ci.gpu --use-gpus \ + python3 ops/docker_run.py --container-tag ${CONTAINER_TAG} --use-gpus \ -- build/testxgboost --use-rmm-pool ;; mgpu) echo "--- Run Google Tests, using multiple GPUs" - python3 ops/docker_run.py --container-id xgb-ci.gpu --use-gpus \ - -- nvidia-smi - python3 ops/docker_run.py --container-id xgb-ci.gpu --use-gpus \ + python3 ops/docker_run.py --container-tag ${CONTAINER_TAG} --use-gpus \ --run-args='--shm-size=4g' \ -- build/testxgboost --gtest_filter=*MGPU* ;; *) - echo "Unrecognized arg: ${arg}" + echo "Unrecognized suite: ${suite}" exit 2 ;; esac diff --git a/ops/pipeline/test-python-wheel.sh b/ops/pipeline/test-python-wheel.sh index b4dd59b7cb0e..56d54fd65d02 100755 --- a/ops/pipeline/test-python-wheel.sh +++ b/ops/pipeline/test-python-wheel.sh @@ -19,7 +19,10 @@ else gpu_option="" fi +source ops/pipeline/get-docker-registry-details.sh +CONTAINER_TAG="${DOCKER_REGISTRY_URL}/${container_id}:main" + set -x -python3 ops/docker_run.py --container-id 
"${container_id}" ${gpu_option} \ +python3 ops/docker_run.py --container-tag "${CONTAINER_TAG}" ${gpu_option} \ --run-args='--shm-size=4g --privileged' \ -- bash ops/pipeline/test-python-wheel-impl.sh "${suite}" From 645a5e70956a051b61bf16ef7ab3864da38b1532 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Sat, 7 Dec 2024 22:32:58 -0800 Subject: [PATCH 06/12] GITHUB_ACTION -> GITHUB_ACTIONS --- ops/pipeline/enforce-ci.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ops/pipeline/enforce-ci.sh b/ops/pipeline/enforce-ci.sh index 1e853a5ea266..292d6baec079 100755 --- a/ops/pipeline/enforce-ci.sh +++ b/ops/pipeline/enforce-ci.sh @@ -5,7 +5,7 @@ set -euo pipefail -if [[ -z ${GITHUB_ACTION:-} ]] +if [[ -z ${GITHUB_ACTIONS:-} ]] then echo "$0 is not meant to run locally; it should run inside GitHub Actions." echo "Please inspect the content of $0 and locate the desired command manually." From cc1e9e8e79330134452a00db5ebbdab17831598f Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Sun, 8 Dec 2024 00:00:26 -0800 Subject: [PATCH 07/12] Remove build_via_cmake.sh --- ops/pipeline/build-cuda-impl.sh | 36 ++++++++++++++++++++++ ops/pipeline/build-cuda.sh | 18 +---------- ops/script/build_via_cmake.sh | 54 --------------------------------- 3 files changed, 37 insertions(+), 71 deletions(-) create mode 100755 ops/pipeline/build-cuda-impl.sh delete mode 100755 ops/script/build_via_cmake.sh diff --git a/ops/pipeline/build-cuda-impl.sh b/ops/pipeline/build-cuda-impl.sh new file mode 100755 index 000000000000..13230a4f0fed --- /dev/null +++ b/ops/pipeline/build-cuda-impl.sh @@ -0,0 +1,36 @@ +#!/bin/bash +## Build XGBoost with CUDA +## Companion script fro build-cuda.sh + +set -euox pipefail + +mkdir -p build +pushd build + +# Disable CMAKE_COMPILE_WARNING_AS_ERROR option temporarily until +# https://github.com/dmlc/xgboost/issues/10400 is fixed +echo "--- Build libxgboost from the source" +cmake .. \ + -DCMAKE_PREFIX_PATH="/opt/grpc;/workspace/cccl" \ + -DUSE_CUDA=ON \ + -DUSE_OPENMP=ON \ + -DHIDE_CXX_SYMBOLS=ON \ + -DPLUGIN_FEDERATED=ON \ + -DUSE_NCCL=ON \ + -DUSE_NCCL_LIB_PATH=ON \ + -DNCCL_INCLUDE_DIR=/usr/include \ + -DUSE_DLOPEN_NCCL=ON \ + -DGOOGLE_TEST=ON \ + -DUSE_DMLC_GTEST=ON \ + -DENABLE_ALL_WARNINGS=ON \ + -DCMAKE_COMPILE_WARNING_AS_ERROR=OFF \ + -DBUILD_DEPRECATED_CLI=ON \ + -GNinja +time ninja -v +popd + +echo "--- Build binary wheel" +pushd python-package +rm -rfv dist/* +pip wheel --no-deps -v . --wheel-dir dist/ +popd diff --git a/ops/pipeline/build-cuda.sh b/ops/pipeline/build-cuda.sh index 2170b8a681ac..0d66c6eff381 100755 --- a/ops/pipeline/build-cuda.sh +++ b/ops/pipeline/build-cuda.sh @@ -32,23 +32,7 @@ set -x git clone https://github.com/NVIDIA/cccl.git -b v2.6.1 --quiet python3 ops/docker_run.py \ --container-tag ${BUILD_CONTAINER_TAG} \ - -- ops/script/build_via_cmake.sh \ - -DCMAKE_PREFIX_PATH="/opt/grpc;/workspace/cccl" \ - -DUSE_CUDA=ON \ - -DUSE_OPENMP=ON \ - -DHIDE_CXX_SYMBOLS=ON \ - -DPLUGIN_FEDERATED=ON \ - -DUSE_NCCL=ON \ - -DUSE_NCCL_LIB_PATH=ON \ - -DNCCL_INCLUDE_DIR=/usr/include \ - -DUSE_DLOPEN_NCCL=ON \ - ${arch_flag} - -echo "--- Build binary wheel" -python3 ops/docker_run.py \ - --container-tag ${BUILD_CONTAINER_TAG} \ - -- bash -c \ - "cd python-package && rm -rf dist/* && pip wheel --no-deps -v . 
--wheel-dir dist/" + -- ops/pipeline/build-cuda-impl.sh python3 ops/script/rename_whl.py \ --wheel-path python-package/dist/*.whl \ --commit-hash ${GITHUB_SHA} \ diff --git a/ops/script/build_via_cmake.sh b/ops/script/build_via_cmake.sh deleted file mode 100755 index 00a571584ea4..000000000000 --- a/ops/script/build_via_cmake.sh +++ /dev/null @@ -1,54 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -if [[ "$#" -lt 1 ]] -then - conda_env="" -else - conda_env="$1" -fi - -if [[ "${conda_env}" == --conda-env=* ]] -then - conda_env=$(echo "${conda_env}" | sed 's/^--conda-env=//g' -) - echo "Activating Conda environment ${conda_env}" - shift 1 - cmake_args="$@" - - # Workaround for file permission error - if [[ -n ${CI_BUILD_UID:-} ]] - then - gosu root chown -R "${CI_BUILD_UID}:${CI_BUILD_GID}" /opt/miniforge/envs - fi - - # Don't activate Conda env if it's already activated - if [[ -z ${CONDA_PREFIX:-} ]] - then - source activate ${conda_env} - fi - cmake_prefix_flag="-DCMAKE_PREFIX_PATH=$CONDA_PREFIX" -else - cmake_args="$@" - cmake_prefix_flag='' -fi - -rm -rf build -mkdir build -cd build -# Disable CMAKE_COMPILE_WARNING_AS_ERROR option temporarily until -# https://github.com/dmlc/xgboost/issues/10400 is fixed -set -x -cmake .. ${cmake_args} \ - -DGOOGLE_TEST=ON \ - -DUSE_DMLC_GTEST=ON \ - -DENABLE_ALL_WARNINGS=ON \ - -DCMAKE_COMPILE_WARNING_AS_ERROR=OFF \ - -GNinja \ - ${cmake_prefix_flag} \ - -DHIDE_CXX_SYMBOLS=ON \ - -DBUILD_DEPRECATED_CLI=ON -ninja clean -time ninja -v -cd .. -set +x From 57f61655ee7bcc9fd24417ceae3777945a05e90c Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Sun, 8 Dec 2024 01:32:24 -0800 Subject: [PATCH 08/12] Replace stash-artifacts.{sh,py} -> manage-artifacts.py --- .github/workflows/main.yml | 21 +++- ops/pipeline/manage-artifacts.py | 158 +++++++++++++++++++++++++ ops/pipeline/stash-artifacts.py | 144 ---------------------- ops/pipeline/stash-artifacts.sh | 36 ------ ops/pipeline/test-python-wheel-impl.sh | 2 +- 5 files changed, 176 insertions(+), 185 deletions(-) create mode 100644 ops/pipeline/manage-artifacts.py delete mode 100644 ops/pipeline/stash-artifacts.py delete mode 100755 ops/pipeline/stash-artifacts.sh diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 81602602b517..19b4d8b24d55 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -32,7 +32,9 @@ jobs: - run: bash ops/pipeline/build-cuda.sh - name: Stash files run: | - bash ops/pipeline/stash-artifacts.sh stash build-cuda \ + python3 ops/pipeline/manage-artifacts.py upload \ + --s3-bucket ${{ env.RUNS_ON_S3_BUCKET_CACHE }} \ + --prefix cache/${{ github.run_id }}/build-cuda \ build/testxgboost ./xgboost python-package/dist/*.whl test-cpp-gpu: @@ -65,8 +67,11 @@ jobs: run: bash ops/pipeline/login-docker-registry.sh - name: Unstash gtest run: | - bash ops/pipeline/stash-artifacts.sh unstash ${{ matrix.artifact_from }} \ - build/testxgboost + python3 ops/pipeline/manage-artifacts.py download \ + --s3-bucket ${{ env.RUNS_ON_S3_BUCKET_CACHE }} \ + --prefix cache/${{ github.run_id }}/${{ matrix.artifact_from }} \ + --dest-dir build \ + testxgboost chmod +x build/testxgboost - run: bash ops/pipeline/test-cpp-gpu.sh ${{ matrix.suite }} @@ -102,8 +107,16 @@ jobs: run: bash ops/pipeline/login-docker-registry.sh - name: Unstash Python wheel run: | - bash ops/pipeline/stash-artifacts.sh unstash ${{ matrix.artifact_from }} \ + python3 ops/pipeline/manage-artifacts.py download \ + --s3-bucket ${{ env.RUNS_ON_S3_BUCKET_CACHE }} \ + --prefix cache/${{ github.run_id 
}}/${{ matrix.artifact_from }} \
+            --dest-dir . \
             python-package/dist/*.whl ./xgboost
+          python3 ops/pipeline/manage-artifacts.py download \
+            --s3-bucket ${{ env.RUNS_ON_S3_BUCKET_CACHE }} \
+            --prefix cache/${{ github.run_id }}/${{ matrix.artifact_from }} \
+            --dest-dir wheelhouse \
+            *.whl
           chmod +x ./xgboost
       - name: Run Python tests, ${{ matrix.description }}
         run: bash ops/pipeline/test-python-wheel.sh ${{ matrix.suite }} ${{ matrix.container }}
diff --git a/ops/pipeline/manage-artifacts.py b/ops/pipeline/manage-artifacts.py
new file mode 100644
index 000000000000..0b0d237cc7fb
--- /dev/null
+++ b/ops/pipeline/manage-artifacts.py
@@ -0,0 +1,158 @@
+"""
+Upload artifacts to an S3 bucket, or download previously uploaded artifacts
+
+Note. This script takes in all inputs via command-line arguments,
+    including the S3 bucket name, the key prefix, and the artifact path(s).
+"""
+
+import argparse
+import os
+import subprocess
+import sys
+from pathlib import Path
+from urllib.parse import SplitResult, urlsplit, urlunsplit
+
+
+def resolve(x: Path) -> Path:
+    return x.expanduser().resolve()
+
+
+def path_equals(a: Path, b: Path) -> bool:
+    return resolve(a) == resolve(b)
+
+
+def compute_s3_url(*, s3_bucket: str, prefix: str, artifact: str) -> str:
+    if prefix == "":
+        return f"s3://{s3_bucket}/{artifact}"
+    return f"s3://{s3_bucket}/{prefix}/{artifact}"
+
+
+def aws_s3_upload(*, src: Path, dest: str) -> None:
+    cli_args = ["aws", "s3", "cp", "--no-progress", str(src), dest]
+    print(" ".join(cli_args))
+    subprocess.run(
+        cli_args,
+        check=True,
+        encoding="utf-8",
+    )
+
+
+def aws_s3_download(*, src: str, dest_dir: Path) -> None:
+    cli_args = ["aws", "s3", "cp", "--no-progress", src, str(dest_dir)]
+    print(" ".join(cli_args))
+    subprocess.run(
+        cli_args,
+        check=True,
+        encoding="utf-8",
+    )
+
+
+def aws_s3_download_with_wildcard(*, src: str, dest_dir: Path) -> None:
+    parsed_src = urlsplit(src)
+    src_dir = urlunsplit(
+        SplitResult(
+            scheme="s3",
+            netloc=parsed_src.netloc,
+            path=os.path.dirname(parsed_src.path),
+            query="",
+            fragment="",
+        )
+    )
+    src_glob = os.path.basename(parsed_src.path)
+    cli_args = [
+        "aws",
+        "s3",
+        "cp",
+        "--recursive",
+        "--no-progress",
+        "--exclude",
+        "'*'",
+        "--include",
+        src_glob,
+        src_dir,
+        str(dest_dir),
+    ]
+    print(" ".join(cli_args))
+    subprocess.run(
+        cli_args,
+        check=True,
+        encoding="utf-8",
+    )
+
+
+def upload(*, args: argparse.Namespace) -> None:
+    print(f"Uploading artifacts to prefix {args.prefix}...")
+    for artifact in args.artifacts:
+        s3_url = compute_s3_url(
+            s3_bucket=args.s3_bucket, prefix=args.prefix, artifact=artifact
+        )
+        aws_s3_upload(src=Path(artifact), dest=s3_url)
+
+
+def download(*, args: argparse.Namespace) -> None:
+    print(f"Downloading artifacts from prefix {args.prefix}...")
+    dest_dir = Path(args.dest_dir)
+    print(f"mkdir -p {str(dest_dir)}")
+    dest_dir.mkdir(parents=True, exist_ok=True)
+    for artifact in args.artifacts:
+        s3_url = compute_s3_url(
+            s3_bucket=args.s3_bucket, prefix=args.prefix, artifact=artifact
+        )
+        if "*" in artifact:
+            aws_s3_download_with_wildcard(src=s3_url, dest_dir=dest_dir)
+        else:
+            aws_s3_download(src=s3_url, dest_dir=dest_dir)
+
+
+if __name__ == "__main__":
+    # Ensure that the current working directory is the project root
+    if not (Path.cwd() / "ops").is_dir() or not path_equals(
+        Path(__file__).parent.parent, Path.cwd() / "ops"
+    ):
+        x = Path(__file__).name
+        raise RuntimeError(f"Script {x} must be run at the project's root directory")
+
+    root_parser = argparse.ArgumentParser()
+    subparser_factory = root_parser.add_subparsers(required=True, 
dest="command") + parsers = {} + for command in ["upload", "download"]: + parsers[command] = subparser_factory.add_parser(command) + parsers[command].add_argument( + "--s3-bucket", + type=str, + required=True, + help="Name of the S3 bucket to store the artifact", + ) + parsers[command].add_argument( + "--prefix", + type=str, + required=True, + help=( + "Where the artifact(s) would be stored. The artifact(s) will be stored at " + "s3://[s3-bucket]/[prefix]/[filename]." + ), + ) + parsers[command].add_argument( + "artifacts", + type=str, + nargs="+", + metavar="artifact", + help=f"Artifact(s) to {command}", + ) + + parsers["download"].add_argument( + "--dest-dir", type=str, required=True, help="Where to download artifact(s)" + ) + + if len(sys.argv) == 1: + print("1. Upload artifact(s)") + parsers["upload"].print_help() + print("\n2. Download artifact(s)") + parsers["download"].print_help() + sys.exit(1) + + parsed_args = root_parser.parse_args() + if parsed_args.command == "upload": + upload(args=parsed_args) + elif parsed_args.command == "download": + download(args=parsed_args) diff --git a/ops/pipeline/stash-artifacts.py b/ops/pipeline/stash-artifacts.py deleted file mode 100644 index 151e187513da..000000000000 --- a/ops/pipeline/stash-artifacts.py +++ /dev/null @@ -1,144 +0,0 @@ -""" -Stash an artifact in an S3 bucket for later use - -Note. This script takes in all inputs via environment variables - except the path to the artifact(s). -""" - -import argparse -import os -import subprocess -from pathlib import Path -from urllib.parse import SplitResult, urlsplit, urlunsplit - - -def resolve(x: Path) -> Path: - return x.expanduser().resolve() - - -def path_equals(a: Path, b: Path) -> bool: - return resolve(a) == resolve(b) - - -def compute_s3_url(s3_bucket: str, prefix: str, artifact: Path) -> str: - filename = artifact.name - relative_path = resolve(artifact).relative_to(Path.cwd()) - if resolve(artifact.parent) == resolve(Path.cwd()): - full_prefix = prefix - else: - full_prefix = f"{prefix}/{str(relative_path.parent)}" - return f"s3://{s3_bucket}/{full_prefix}/{filename}" - - -def aws_s3_upload(src: Path, dest: str) -> None: - cli_args = ["aws", "s3", "cp", "--no-progress", str(src), dest] - print(" ".join(cli_args)) - subprocess.run( - cli_args, - check=True, - encoding="utf-8", - ) - - -def aws_s3_download(src: str, dest: Path) -> None: - cli_args = ["aws", "s3", "cp", "--no-progress", src, str(dest)] - print(" ".join(cli_args)) - subprocess.run( - cli_args, - check=True, - encoding="utf-8", - ) - - -def aws_s3_download_with_wildcard(src: str, dest: Path) -> None: - parsed_src = urlsplit(src) - src_dir = urlunsplit( - SplitResult( - scheme="s3", - netloc=parsed_src.netloc, - path=os.path.dirname(parsed_src.path), - query="", - fragment="", - ) - ) - dest_dir = dest.parent - src_glob = os.path.basename(parsed_src.path) - cli_args = [ - "aws", - "s3", - "cp", - "--recursive", - "--no-progress", - "--exclude", - "'*'", - "--include", - src_glob, - src_dir, - str(dest_dir), - ] - print(" ".join(cli_args)) - subprocess.run( - cli_args, - check=True, - encoding="utf-8", - ) - - -def upload(args: argparse.Namespace) -> None: - print(f"Stashing artifacts to prefix {args.prefix}...") - for artifact in args.artifacts: - artifact_path = Path(artifact) - s3_url = compute_s3_url(args.s3_bucket, args.prefix, artifact_path) - aws_s3_upload(artifact_path, s3_url) - - -def download(args: argparse.Namespace) -> None: - print(f"Unstashing artifacts from prefix {args.prefix}...") - for artifact in 
args.artifacts: - artifact_path = Path(artifact) - print(f"mkdir -p {str(artifact_path.parent)}") - artifact_path.parent.mkdir(parents=True, exist_ok=True) - s3_url = compute_s3_url(args.s3_bucket, args.prefix, artifact_path) - if "*" in artifact: - aws_s3_download_with_wildcard(s3_url, artifact_path) - else: - aws_s3_download(s3_url, artifact_path) - - -if __name__ == "__main__": - # Ensure that the current working directory is the project root - if not (Path.cwd() / "ops").is_dir() or not path_equals( - Path(__file__).parent.parent, Path.cwd() / "ops" - ): - x = Path(__file__).name - raise RuntimeError(f"Script {x} must be run at the project's root directory") - - parser = argparse.ArgumentParser() - parser.add_argument( - "--command", - type=str, - choices=["stash", "unstash"], - required=True, - help="Whether to stash or unstash the artifact", - ) - parser.add_argument( - "--s3-bucket", - type=str, - required=True, - help="Name of the S3 bucket to store the artifact", - ) - parser.add_argument( - "--prefix", - type=str, - required=True, - help=( - "Where the artifact would be stored. The artifact will be stored in " - "s3://[s3-bucket]/[prefix]." - ), - ) - parser.add_argument("artifacts", type=str, nargs="+", metavar="artifact") - parsed_args = parser.parse_args() - if parsed_args.command == "stash": - upload(parsed_args) - elif parsed_args.command == "unstash": - download(parsed_args) diff --git a/ops/pipeline/stash-artifacts.sh b/ops/pipeline/stash-artifacts.sh deleted file mode 100755 index 98c9695c4227..000000000000 --- a/ops/pipeline/stash-artifacts.sh +++ /dev/null @@ -1,36 +0,0 @@ -#!/bin/bash - -## Convenience wrapper for ops/pipeline/stash-artifacts.py -## Meant to be used inside GitHub Actions - -set -euo pipefail - -source ops/pipeline/enforce-ci.sh - -if [[ "$#" -lt 3 ]] -then - echo "Usage: $0 {stash,unstash} [remote_prefix] [artifact] [artifact ...]" - exit 1 -fi - -command="$1" -remote_prefix="$2" -shift 2 - -for arg in "GITHUB_REPOSITORY" "GITHUB_RUN_ID" "RUNS_ON_S3_BUCKET_CACHE" -do - if [[ -z "${!arg:-}" ]] - then - echo "Error: $arg must be set." 
-    exit 2
-  fi
-done
-
-artifact_stash_prefix="cache/${GITHUB_REPOSITORY}/stash/${GITHUB_RUN_ID}"
-
-set -x
-python3 ops/pipeline/stash-artifacts.py \
-  --command "${command}" \
-  --s3-bucket "${RUNS_ON_S3_BUCKET_CACHE}" \
-  --prefix "${artifact_stash_prefix}/${remote_prefix}" \
-  -- "$@"
diff --git a/ops/pipeline/test-python-wheel-impl.sh b/ops/pipeline/test-python-wheel-impl.sh
index 75bfa5fbaffb..837ff03b24d7 100755
--- a/ops/pipeline/test-python-wheel-impl.sh
+++ b/ops/pipeline/test-python-wheel-impl.sh
@@ -34,7 +34,7 @@ export PYSPARK_DRIVER_PYTHON=$(which python)
 export PYSPARK_PYTHON=$(which python)
 export SPARK_TESTING=1
 
-pip install -v ./python-package/dist/*.whl
+pip install -v ./wheelhouse/*.whl
 
 case "$suite" in
   gpu)

From e14d393a675cd043d4ceae81bde071cf46d502a6 Mon Sep 17 00:00:00 2001
From: Hyunsu Cho
Date: Sun, 8 Dec 2024 01:47:26 -0800
Subject: [PATCH 09/12] Use manage-artifacts.py for uploading nightly builds

---
 ops/pipeline/build-cuda.sh       |  8 ++++----
 ops/pipeline/manage-artifacts.py | 12 +++++++++---
 2 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/ops/pipeline/build-cuda.sh b/ops/pipeline/build-cuda.sh
index 0d66c6eff381..bbe89aaa502d 100755
--- a/ops/pipeline/build-cuda.sh
+++ b/ops/pipeline/build-cuda.sh
@@ -63,8 +63,8 @@ python3 ops/script/format_wheel_meta.py \
 echo "--- Upload Python wheel"
 if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]]
 then
-  aws s3 cp python-package/dist/*.whl s3://xgboost-nightly-builds/${BRANCH_NAME}/ \
-    --acl public-read --no-progress
-  aws s3 cp python-package/dist/meta.json s3://xgboost-nightly-builds/${BRANCH_NAME}/ \
-    --acl public-read --no-progress
+  python3 ops/pipeline/manage-artifacts.py upload \
+    --s3-bucket xgboost-nightly-builds \
+    --prefix ${BRANCH_NAME} --make-public \
+    python-package/dist/*.whl python-package/dist/meta.json
 fi
diff --git a/ops/pipeline/manage-artifacts.py b/ops/pipeline/manage-artifacts.py
index 0b0d237cc7fb..6b12e22bf50c 100644
--- a/ops/pipeline/manage-artifacts.py
+++ b/ops/pipeline/manage-artifacts.py
@@ -27,8 +27,10 @@ def compute_s3_url(*, s3_bucket: str, prefix: str, artifact: str) -> str:
     return f"s3://{s3_bucket}/{prefix}/{artifact}"
 
 
-def aws_s3_upload(*, src: Path, dest: str) -> None:
+def aws_s3_upload(*, src: Path, dest: str, make_public: bool) -> None:
     cli_args = ["aws", "s3", "cp", "--no-progress", str(src), dest]
+    if make_public:
+        cli_args.extend(["--acl", "public-read"])
     print(" ".join(cli_args))
     subprocess.run(
         cli_args,
@@ -83,10 +85,11 @@ def aws_s3_download_with_wildcard(*, src: str, dest_dir: Path) -> None:
 def upload(*, args: argparse.Namespace) -> None:
     print(f"Uploading artifacts to prefix {args.prefix}...")
     for artifact in args.artifacts:
+        artifact_path = Path(artifact)
         s3_url = compute_s3_url(
-            s3_bucket=args.s3_bucket, prefix=args.prefix, artifact=artifact
+            s3_bucket=args.s3_bucket, prefix=args.prefix, artifact=artifact_path.name
        )
-        aws_s3_upload(src=Path(artifact), dest=s3_url)
+        aws_s3_upload(src=artifact_path, dest=s3_url, make_public=args.make_public)
 
 
 def download(*, args: argparse.Namespace) -> None:
@@ -140,6 +143,9 @@ def download(*, args: argparse.Namespace) -> None:
             help=f"Artifact(s) to {command}",
         )
 
+    parsers["upload"].add_argument(
+        "--make-public", action="store_true", help="Make artifact publicly accessible"
+    )
     parsers["download"].add_argument(
         "--dest-dir", type=str, required=True, help="Where to download artifact(s)"
     )

From ac97aecbdfbf4286bac230f3e0e2aae18390ee05 Mon Sep 17 00:00:00 2001
From: Hyunsu Cho
Date: Sun, 8 Dec 2024 
23:18:43 -0800 Subject: [PATCH 10/12] Fix test-python-wheel --- .github/workflows/main.yml | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 19b4d8b24d55..9042e1b7b47c 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -107,16 +107,12 @@ jobs: run: bash ops/pipeline/login-docker-registry.sh - name: Unstash Python wheel run: | - python3 ops/pipeline/manage-artifacts.py download \ - --s3-bucket ${{ env.RUNS_ON_S3_BUCKET_CACHE }} \ - --prefix cache/${{ github.run_id }}/${{ matrix.artifact_from }} \ - --dest-dir . \ - python-package/dist/*.whl ./xgboost python3 ops/pipeline/manage-artifacts.py download \ --s3-bucket ${{ env.RUNS_ON_S3_BUCKET_CACHE }} \ --prefix cache/${{ github.run_id }}/${{ matrix.artifact_from }} \ --dest-dir wheelhouse \ - *.whl + *.whl xgboost + mv -v wheelhouse/xgboost . chmod +x ./xgboost - name: Run Python tests, ${{ matrix.description }} run: bash ops/pipeline/test-python-wheel.sh ${{ matrix.suite }} ${{ matrix.container }} From 89270de2a5aa37bdb9cf857fb514fc3f81acd7b8 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Sun, 8 Dec 2024 23:46:11 -0800 Subject: [PATCH 11/12] Build sm_75 only if pull request --- ops/pipeline/build-cuda-impl.sh | 10 +++++++++- ops/pipeline/build-cuda.sh | 5 +++-- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/ops/pipeline/build-cuda-impl.sh b/ops/pipeline/build-cuda-impl.sh index 13230a4f0fed..5ee81cb24dad 100755 --- a/ops/pipeline/build-cuda-impl.sh +++ b/ops/pipeline/build-cuda-impl.sh @@ -7,10 +7,18 @@ set -euox pipefail mkdir -p build pushd build +if [[ "${BUILD_ONLY_SM75:-}" == 1 ]] +then + cmake_args='-DGPU_COMPUTE_VER=75' +else + cmake_args='' +fi + # Disable CMAKE_COMPILE_WARNING_AS_ERROR option temporarily until # https://github.com/dmlc/xgboost/issues/10400 is fixed echo "--- Build libxgboost from the source" cmake .. \ + -GNinja \ -DCMAKE_PREFIX_PATH="/opt/grpc;/workspace/cccl" \ -DUSE_CUDA=ON \ -DUSE_OPENMP=ON \ @@ -25,7 +33,7 @@ cmake .. 
\ -DENABLE_ALL_WARNINGS=ON \ -DCMAKE_COMPILE_WARNING_AS_ERROR=OFF \ -DBUILD_DEPRECATED_CLI=ON \ - -GNinja + ${cmake_args} time ninja -v popd diff --git a/ops/pipeline/build-cuda.sh b/ops/pipeline/build-cuda.sh index bbe89aaa502d..ad80ce028b5f 100755 --- a/ops/pipeline/build-cuda.sh +++ b/ops/pipeline/build-cuda.sh @@ -20,9 +20,9 @@ echo "--- Build with CUDA" if [[ ($is_pull_request == 1) || ($is_release_branch == 0) ]] then - arch_flag="-DGPU_COMPUTE_VER=75" + export BUILD_ONLY_SM75=1 else - arch_flag="" + export BUILD_ONLY_SM75=0 fi echo "--- Build libxgboost from the source" @@ -32,6 +32,7 @@ set -x git clone https://github.com/NVIDIA/cccl.git -b v2.6.1 --quiet python3 ops/docker_run.py \ --container-tag ${BUILD_CONTAINER_TAG} \ + --run-args='-e BUILD_ONLY_SM75' \ -- ops/pipeline/build-cuda-impl.sh python3 ops/script/rename_whl.py \ --wheel-path python-package/dist/*.whl \ From ea312e955da77d3cbddedc592d3bee5befd19441 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Sun, 8 Dec 2024 23:26:50 -0800 Subject: [PATCH 12/12] Remove rename_whl.py; change of dir structure in xgboost-nightly-builds --- ops/pipeline/build-cuda.sh | 21 +++++------ ops/script/format_wheel_meta.py | 1 + ops/script/rename_whl.py | 62 --------------------------------- python-package/pyproject.toml | 6 ++++ 4 files changed, 18 insertions(+), 72 deletions(-) delete mode 100644 ops/script/rename_whl.py diff --git a/ops/pipeline/build-cuda.sh b/ops/pipeline/build-cuda.sh index ad80ce028b5f..247738e2c80d 100755 --- a/ops/pipeline/build-cuda.sh +++ b/ops/pipeline/build-cuda.sh @@ -34,26 +34,23 @@ python3 ops/docker_run.py \ --container-tag ${BUILD_CONTAINER_TAG} \ --run-args='-e BUILD_ONLY_SM75' \ -- ops/pipeline/build-cuda-impl.sh -python3 ops/script/rename_whl.py \ - --wheel-path python-package/dist/*.whl \ - --commit-hash ${GITHUB_SHA} \ - --platform-tag ${WHEEL_TAG} echo "--- Audit binary wheel to ensure it's compliant with ${WHEEL_TAG} standard" python3 ops/docker_run.py \ --container-tag ${MANYLINUX_CONTAINER_TAG} \ - -- auditwheel repair \ + -- auditwheel repair --only-plat \ --plat ${WHEEL_TAG} python-package/dist/*.whl -python3 ops/script/rename_whl.py \ - --wheel-path wheelhouse/*.whl \ - --commit-hash ${GITHUB_SHA} \ - --platform-tag ${WHEEL_TAG} +python3 -m wheel tags --python-tag py3 --abi-tag none --platform ${WHEEL_TAG} \ + wheelhouse/*.whl mv -v wheelhouse/*.whl python-package/dist/ if ! 
unzip -l ./python-package/dist/*.whl | grep libgomp > /dev/null; then echo "error: libgomp.so was not vendored in the wheel" exit -1 fi +# Check size of wheel +pydistcheck --config python-package/pyproject.toml python-package/dist/*.whl + # Generate the meta info which includes xgboost version and the commit info python3 ops/script/format_wheel_meta.py \ --wheel-path python-package/dist/*.whl \ @@ -64,8 +61,12 @@ python3 ops/script/format_wheel_meta.py \ echo "--- Upload Python wheel" if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] then + python3 ops/pipeline/manage-artifacts.py upload \ + --s3-bucket xgboost-nightly-builds \ + --prefix ${BRANCH_NAME}/${GITHUB_SHA} --make-public \ + python-package/dist/*.whl python3 ops/pipeline/manage-artifacts.py upload \ --s3-bucket xgboost-nightly-builds \ --prefix ${BRANCH_NAME} --make-public \ - python-package/dist/*.whl python-package/dist/meta.json + python-package/dist/meta.json fi diff --git a/ops/script/format_wheel_meta.py b/ops/script/format_wheel_meta.py index a7def879905e..8b37e81bc9a7 100644 --- a/ops/script/format_wheel_meta.py +++ b/ops/script/format_wheel_meta.py @@ -27,6 +27,7 @@ def main(args: argparse.Namespace) -> None: version = tokens[1].split("+")[0] meta_info = { + "wheel_path": f"{args.commit_hash}/{wheel_name}", "wheel_name": wheel_name, "platform_tag": args.platform_tag, "version": version, diff --git a/ops/script/rename_whl.py b/ops/script/rename_whl.py deleted file mode 100644 index d4467720c738..000000000000 --- a/ops/script/rename_whl.py +++ /dev/null @@ -1,62 +0,0 @@ -import argparse -import pathlib - - -def main(args: argparse.Namespace) -> None: - wheel_path = pathlib.Path(args.wheel_path).expanduser().resolve() - if not wheel_path.exists(): - raise ValueError(f"Wheel cannot be found at path {wheel_path}") - if not wheel_path.is_file(): - raise ValueError(f"Path {wheel_path} is not a valid file") - wheel_dir, wheel_name = wheel_path.parent, wheel_path.name - - tokens = wheel_name.split("-") - assert len(tokens) == 5 - version = tokens[1].split("+")[0] - keywords = { - "pkg_name": tokens[0], - "version": version, - "commit_id": args.commit_hash, - "platform_tag": args.platform_tag, - } - new_wheel_name = ( - "{pkg_name}-{version}+{commit_id}-py3-none-{platform_tag}.whl".format( - **keywords - ) - ) - new_wheel_path = wheel_dir / new_wheel_name - print(f"Renaming {wheel_name} to {new_wheel_name}...") - if new_wheel_name == wheel_name: - print("Skipping, as the old name is identical to the new name.") - else: - if new_wheel_path.is_file(): - new_wheel_path.unlink() - wheel_path.rename(new_wheel_path) - - filesize = new_wheel_path.stat().st_size / 1024 / 1024 # MiB - print(f"Wheel size: {filesize:.2f} MiB") - - if filesize > 300: - raise RuntimeError( - f"Limit of wheel size set by PyPI is exceeded. {new_wheel_name}: {filesize:.2f} MiB" - ) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Format a Python wheel's name using the git commit hash and platform tag" - ) - parser.add_argument( - "--wheel-path", type=str, required=True, help="Path to the wheel" - ) - parser.add_argument( - "--commit-hash", type=str, required=True, help="Git commit hash" - ) - parser.add_argument( - "--platform-tag", - type=str, - required=True, - help="Platform tag (e.g. 
manylinux_2_28_x86_64)", - ) - parsed_args = parser.parse_args() - main(parsed_args) diff --git a/python-package/pyproject.toml b/python-package/pyproject.toml index 565b61eb0669..cc5042997a6c 100644 --- a/python-package/pyproject.toml +++ b/python-package/pyproject.toml @@ -86,3 +86,9 @@ class-attribute-naming-style = "snake_case" # Allow single-letter variables variable-rgx = "[a-zA-Z_][a-z0-9_]{0,30}$" + +[tool.pydistcheck] +inspect = true +ignore = ["compiled-objects-have-debug-symbols"] +max_allowed_size_compressed = '300M' +max_allowed_size_uncompressed = '500M'
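
Reviewer note, appended after the series for convenience: a minimal sketch of
how the wheel-handling and artifact-stashing steps introduced above compose
when run locally. The commands themselves are taken from the patches; the
bucket name and prefix are hypothetical placeholders, not values used by CI.

    # Retag the wheel, as build-cuda.sh now does in place of rename_whl.py
    python3 -m wheel tags --python-tag py3 --abi-tag none \
      --platform manylinux_2_28_x86_64 wheelhouse/*.whl

    # Enforce the size limits declared under [tool.pydistcheck]
    pydistcheck --config python-package/pyproject.toml python-package/dist/*.whl

    # Stash an artifact in S3, then fetch it back into wheelhouse/
    # (bucket and prefix below are placeholders)
    python3 ops/pipeline/manage-artifacts.py upload \
      --s3-bucket my-scratch-bucket --prefix cache/local-test \
      python-package/dist/*.whl
    python3 ops/pipeline/manage-artifacts.py download \
      --s3-bucket my-scratch-bucket --prefix cache/local-test \
      --dest-dir wheelhouse '*.whl'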