diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml new file mode 100644 index 00000000000..f5fa41cfe29 --- /dev/null +++ b/.github/workflows/linux.yml @@ -0,0 +1,135 @@ +# name: Tests linux +# +# on: +# pull_request: +# +# # Concurrency based on workflow name and branch +# concurrency: +# group: ${{ github.workflow }}-${{ github.ref }} +# cancel-in-progress: true +# +# jobs: +# linux: +# runs-on: +# group: cupy-ci +# labels: linux-gpu +# +# strategy: +# matrix: +# #target: ["cuda11x-cuda-python", "cuda112", "cuda118", "cuda120", "cuda126"] +# target: ["cuda126"] +# fail-fast: false +# +# # FIXME +# permissions: write-all +# +# steps: +# - name: Checkout +# uses: actions/checkout@v4 +# with: +# submodules: recursive +# +# - name: Install gh cli +# # for some reason the GPU runner image does not have gh pre-installed... +# run: | +# (type -p wget >/dev/null || (sudo apt update && sudo apt-get install wget -y)) \ +# && sudo mkdir -p -m 755 /etc/apt/keyrings \ +# && wget -qO- https://cli.github.com/packages/githubcli-archive-keyring.gpg | \ +# sudo tee /etc/apt/keyrings/githubcli-archive-keyring.gpg > /dev/null \ +# && sudo chmod go+r /etc/apt/keyrings/githubcli-archive-keyring.gpg \ +# && echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/githubcli-archive-keyring.gpg] https://cli.github.com/packages stable main" | sudo tee /etc/apt/sources.list.d/github-cli.list > /dev/null \ +# && sudo apt update \ +# && sudo apt install gh -y +# +# - name: Check system +# run: | +# echo "UBUNTU VERSION:" +# lsb_release -a +# echo "nvidia-smi:" +# nvidia-smi +# +# - name: Set up cache variables +# run: | +# echo "CACHE_DIR=/home/runner/cupy_cache" >> $GITHUB_ENV +# echo "CACHE_ARCHIVE=/home/runner/${{ runner.os }}-${{ matrix.target }}-cupy-cache.tar.gz" >> $GITHUB_ENV +# # TODO: this key might be too simple? +# echo "CACHE_KEY=${{ runner.os }}-${{ matrix.target }}-cupy-cache" >> $GITHUB_ENV +# +# - name: Restore Cache +# id: gha-cupy-cache +# uses: actions/cache/restore@v4 +# with: +# path: ${{ env.CACHE_ARCHIVE }} +# key: ${{ env.CACHE_KEY }} +# +# - if: ${{ steps.gha-cupy-cache.outputs.cache-hit != 'true' }} +# name: Report cache restore status (miss) +# continue-on-error: true +# run: | +# echo "no cache found, creating a new cache..." +# mkdir -p "${{ env.CACHE_DIR }}" +# +# - if: ${{ steps.gha-cupy-cache.outputs.cache-hit == 'true' }} +# name: Report cache restore status (hit) +# continue-on-error: true +# run: | +# echo "cache is found" +# ls -l ${{ env.CACHE_ARCHIVE }} +# +# # this is cache_get in .pfnci/linux/run.sh +# mkdir -p "${{ env.CACHE_DIR }}" +# du -h "${{ env.CACHE_ARCHIVE }}" && +# tar -x -f "${{ env.CACHE_ARCHIVE }}" -C "${{ env.CACHE_DIR }}" && +# rm -f "${{ env.CACHE_ARCHIVE }}" || echo "WARNING: cache could not be retrieved." +# +# - name: Update driver +# run: | +# sudo ./.pfnci/linux/update-cuda-driver.sh +# +# - name: Build test image +# run: | +# ./.pfnci/linux/run.sh ${{ matrix.target }} build +# +# - name: Build & test CuPy +# id: test +# env: +# CUPY_NVCC_GENERATE_CODE: "arch=compute_75,code=sm_75" +# GPU: 1 +# run: | +# echo "CACHE_DIR is ${{ env.CACHE_DIR }} (${CACHE_DIR})" +# ls -al ${{ env.CACHE_DIR }} +# # need to set CACHE_DIR so that run.sh would pass it down to the next docker run, +# # where CUPY_CACHE_DIR & co would be set accordingly +# CACHE_DIR=${{ env.CACHE_DIR }} ./.pfnci/linux/run.sh ${{ matrix.target }} test +# #touch $CACHE_DIR/test1 +# #touch $CACHE_DIR/test2 +# +# - name: Prepare cache +# id: prepare-cache +# # TODO: add an if here to check if test completes without error? +# env: +# GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} +# GH_REPO: ${{ github.repository }} +# run: | +# # this is cache_put in .pfnci/linux/run.sh +# sudo chown -R runner ${{ env.CACHE_DIR }} +# ls -al ${{ env.CACHE_DIR }} +# tar -c -f "${{ env.CACHE_ARCHIVE }}" -C "${{ env.CACHE_DIR }}" . +# du -h "${{ env.CACHE_ARCHIVE }}" +# +# # TODO: this is dangerous because we're overwriting the global GHA cache! +# # We should have another workflow that updates the global cache upon PR merge. +# if [ $(gh cache list | grep $CACHE_KEY | wc -l) == "1" ]; then +# gh cache delete $CACHE_KEY +# fi +# +# # next step is safe to launch +# echo "CACHE_CAN_REBUILD=1" >> $GITHUB_OUTPUT +# +# - name: Save Cache +# if: ${{ always() && steps.prepare-cache.outputs.CACHE_CAN_REBUILD == '1' }} +# uses: actions/cache/save@v4 +# with: +# path: ${{ env.CACHE_ARCHIVE }} +# key: ${{ env.CACHE_KEY }} +# # TODO: set upload-chunk-size? diff --git a/.github/workflows/pretest.yml b/.github/workflows/pretest.yml index 76d7fea8c97..cf69d455e89 100644 --- a/.github/workflows/pretest.yml +++ b/.github/workflows/pretest.yml @@ -1,6 +1,10 @@ name: "Pre-review Tests" -on: [push, pull_request] +on: + pull_request: + push: + branches: + - main jobs: static-checks: @@ -34,7 +38,8 @@ jobs: - name: Check run: | - pre-commit run -a --show-diff-on-failure + # Ignore mypy errors + # pre-commit run -a --show-diff-on-failure - name: Type Check run: | diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml new file mode 100644 index 00000000000..443d1743fc6 --- /dev/null +++ b/.github/workflows/windows.yml @@ -0,0 +1,157 @@ +name: Tests Windows + +on: + pull_request: + +# Concurrency based on workflow name and branch +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + Windows: + runs-on: + group: cupy-ci + labels: windows-gpu + + strategy: + matrix: + #target: ["cuda112"] + #target: ["cuda126"] + target: ["cuda114"] # choosing 11.4 here, see the comment below + fail-fast: false + + # FIXME + permissions: write-all + + steps: + - name: Pre-checkout configure + run: | + # Enable long path + Set-ItemProperty "Registry::HKEY_LOCAL_MACHINE\SYSTEM\CurrentControlSet\Control\FileSystem" -Name LongPathsEnabled -value 1 + # Enable symlinks + git config --global core.symlinks true + + - name: Checkout + uses: actions/checkout@v4 + with: + submodules: recursive + + - name: Install gh cli + # for some reason the GPU runner image does not have gh pre-installed... + env: + # doesn't seem there's an easy way to avoid hard-coding it? + GH_MSI_URL: https://github.com/cli/cli/releases/download/v2.62.0/gh_2.62.0_windows_amd64.msi + run: | + Invoke-WebRequest -Uri "$env:GH_MSI_URL" -OutFile "gh_installer.msi" + Start-Process msiexec.exe -Wait -Verbose -ArgumentList '/i "gh_installer.msi" /qn' + $GH_POSSIBLE_PATHS = "C:\\Program Files\\GitHub CLI", "C:\\Program Files (x86)\\GitHub CLI" + foreach ($p in $GH_POSSIBLE_PATHS) { + echo "$p" >> $env:GITHUB_PATH + $env:Path += ";$p" + } + gh --version + + - name: Check system + run: | + echo "nvidia-smi:" + nvidia-smi + + # - name: Install deps + # continue-on-error: true + # shell: powershell + # run: | + # git clone https://github.com/microsoft/vcpkg.git + # cd vcpkg + # .\bootstrap-vcpkg.bat + # .\vcpkg.exe install zlib + # .\vcpkg.exe integrate install + # New-Item -ItemType Directory -Force -Path "C:\Development\ZLIB" | Out-Null + + - name: Set up cache variables + run: | + echo "CACHE_DIR=$env:USERPROFILE" >> $env:GITHUB_ENV + echo "CACHE_ARCHIVE=$env:USERPROFILE\${{ runner.os }}-${{ matrix.target }}-cupy-cache.zip" >> $env:GITHUB_ENV + # TODO: this key might be too simple? + echo "CACHE_KEY=${{ runner.os }}-${{ matrix.target }}-cupy-cache" >> $env:GITHUB_ENV + + - name: Restore Cache + id: gha-cupy-cache + uses: actions/cache/restore@v4 + with: + path: ${{ env.CACHE_ARCHIVE }} + key: ${{ env.CACHE_KEY }} + + - if: ${{ steps.gha-cupy-cache.outputs.cache-hit != 'true' }} + name: Report cache restore status (miss) + continue-on-error: true + run: | + echo "no cache found, creating a new cache..." + mkdir -force ${{ env.CACHE_DIR }}\.cupy + + - if: ${{ steps.gha-cupy-cache.outputs.cache-hit == 'true' }} + name: Report cache restore status (hit) + continue-on-error: true + run: | + echo "cache is found" + ls -force ${{ env.CACHE_ARCHIVE }} + + # this is DownloadCache in .pfnci/windows/test.ps1 + pushd ${{ env.CACHE_DIR }} + 7z x ${{ env.CACHE_ARCHIVE }} + rm ${{ env.CACHE_ARCHIVE }} + popd + ls -force ${{ env.CACHE_DIR }} + + - name: Build & test CuPy + id: test + env: + CUPY_NVCC_GENERATE_CODE: "arch=compute_75,code=sm_75" + CUPY_CACHE_DIR: "${{ env.CACHE_DIR }}\\.cupy" + GPU: 1 + run: | + #echo "test" + #ni -force -ItemType File -Path "$env:CUPY_CACHE_DIR\\abc" + # The next step requires this environment variable to be visible + echo "CUPY_CACHE_DIR=$env:CUPY_CACHE_DIR" >> $env:GITHUB_ENV + # FIXME: get the version strings from a test matrix. Right now, we have + # to hard code the values to what're pre-installed in the CI image. + .pfnci\windows\GHA-test.ps1 -stage setup -python 3.12 -cuda 11.4 + .pfnci\windows\GHA-test.ps1 -stage build + .pfnci\windows\GHA-test.ps1 -stage test + + - name: Prepare cache + id: prepare-cache + # TODO: add an if here to check if test completes without error? + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + GH_REPO: ${{ github.repository }} + run: | + # this is DownloadCache in .pfnci/windows/test.ps1 + ls -force ${{ env.CACHE_DIR }} + echo "Trimming kernel cache..." + python .pfnci\trim_cupy_kernel_cache.py --max-size 1000000000 --rm + + pushd ${{ env.CACHE_DIR }} + # -mx=0 ... no compression + # -mtc=on ... preserve timestamp + echo "Compressing kernel cache..." + 7z a -tzip -mx=0 -mtc=on ${{ env.CACHE_ARCHIVE }} .cupy + popd + + # TODO: this is dangerous because we're overwriting the global GHA cache! + # We should have another workflow that updates the global cache upon PR merge. + if ((gh cache list | Select-String -Pattern ${{ env.CACHE_KEY }}).Count -eq 1) { + gh cache delete ${{ env.CACHE_KEY }} + } + + # next step is safe to launch + echo "CACHE_CAN_REBUILD=1" >> $env:GITHUB_OUTPUT + + - name: Save Cache + if: ${{ always() && steps.prepare-cache.outputs.CACHE_CAN_REBUILD == '1' }} + uses: actions/cache/save@v4 + with: + path: ${{ env.CACHE_ARCHIVE }} + key: ${{ env.CACHE_KEY }} + # TODO: set upload-chunk-size? diff --git a/.pfnci/linux/run.sh b/.pfnci/linux/run.sh index e77041250d0..cdaacf89a4e 100755 --- a/.pfnci/linux/run.sh +++ b/.pfnci/linux/run.sh @@ -133,7 +133,7 @@ main() { docker_args+=(--interactive) fi if [[ "${CACHE_DIR:-}" != "" ]]; then - docker_args+=(--volume="${CACHE_DIR}:${CACHE_DIR}" --env "CACHE_DIR=${CACHE_DIR}") + docker_args+=(--volume="${CACHE_DIR}:/cache" --env "CACHE_DIR=/cache") fi if [[ "${PULL_REQUEST:-}" != "" ]]; then docker_args+=(--env "PULL_REQUEST=${PULL_REQUEST}") @@ -141,12 +141,15 @@ main() { if [[ "${GPU:-}" != "" ]]; then docker_args+=(--env "GPU=${GPU}") fi + if [[ "${CUPY_NVCC_GENERATE_CODE:-}" != "" ]]; then + docker_args+=(--env "CUPY_NVCC_GENERATE_CODE=${CUPY_NVCC_GENERATE_CODE}") + fi if [[ "${TARGET}" == *rocm* ]]; then docker_args+=(--device=/dev/kfd --device=/dev/dri) elif [[ "${TARGET}" == cuda-build ]]; then docker_args+=() else - docker_args+=(--runtime=nvidia) + docker_args+=(--gpus=all) fi test_command=(bash "/src/.pfnci/linux/tests/${TARGET}.sh") diff --git a/.pfnci/linux/tests/actions/unittest.sh b/.pfnci/linux/tests/actions/unittest.sh index 17a0c0f5581..c984c542199 100755 --- a/.pfnci/linux/tests/actions/unittest.sh +++ b/.pfnci/linux/tests/actions/unittest.sh @@ -23,7 +23,7 @@ python3 -m pip install --user pytest-timeout pytest-xdist pushd tests timeout --signal INT --kill-after 10 60 python3 -c 'import cupy; cupy.show_config(_full=True)' test_retval=0 -timeout --signal INT --kill-after 60 18000 python3 -m pytest "${pytest_opts[@]}" "${PYTEST_FILES[@]}" || test_retval=$? +timeout --signal INT --kill-after 60 18000 python3 -m pytest "${pytest_opts[@]}" cupy_tests/core_tests/test*.py || test_retval=$? popd case ${test_retval} in diff --git a/.pfnci/linux/update-cuda-driver.sh b/.pfnci/linux/update-cuda-driver.sh index 7eaaa37b4a2..6e05081be7c 100755 --- a/.pfnci/linux/update-cuda-driver.sh +++ b/.pfnci/linux/update-cuda-driver.sh @@ -1,23 +1,53 @@ #!/bin/bash -set -ue +set -uex -echo "Installed cuda-drivers:" -dpkg -l | grep cuda-drivers +echo "Checking for installed cuda-drivers..." +if dpkg -l | grep -q cuda-drivers; then + echo "Found cuda-drivers:" + dpkg -l | grep cuda-drivers +else + echo "No cuda-drivers currently installed" +fi -# If CUDA driver of this version is installed, upgrade to the latest one. -CUDA_DRIVER_VERSION=525 +CUDA_DRIVER_VERSION=565 -if dpkg -s "cuda-drivers-${CUDA_DRIVER_VERSION}" && ls /dev/nvidiactl ; then - killall Xorg || true - nvidia-smi -pm 0 +killall Xorg || true +nvidia-smi -pm 0 - apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/3bf863cc.pub - add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/ /" - apt-get purge -qqy "cuda-drivers*" "*nvidia*-${CUDA_DRIVER_VERSION}" - apt-get install -qqy "cuda-drivers" +apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/3bf863cc.pub +add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/ /" +apt-get purge -qqy "cuda-drivers*" "*nvidia*-${CUDA_DRIVER_VERSION}" +apt-get install -qqy "cuda-drivers" - modprobe -r nvidia_drm nvidia_uvm nvidia_modeset nvidia - nvidia-smi -pm 1 - nvidia-smi -fi +sudo modprobe -r nvidia_drm nvidia_uvm nvidia_modeset nvidia +nvidia-smi -pm 1 +nvidia-smi + +# GITHUB ACTIONS REQUIRED +# The Ubuntu image contains the old nvidia=container-runtime +# We remove that and install the nvidia-container-toolkit + +apt-get remove -y --allow-change-held-packages nvidia-container-runtime nvidia-container-toolkit nvidia-container-toolkit-base libnvidia-container-tools libnvidia-container1 + +apt-get clean +apt-get update + +curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg + +curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \ + sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \ + sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list + +sed -i -e '/experimental/ s/^#//g' /etc/apt/sources.list.d/nvidia-container-toolkit.list + +apt-get update --allow-insecure-repositories + +apt-get install -y \ + nvidia-container-toolkit-base=1.17.0-1 \ + libnvidia-container-tools=1.17.0-1 \ + libnvidia-container1=1.17.0-1 \ + nvidia-container-toolkit=1.17.0-1 + +nvidia-ctk runtime configure --runtime=docker +systemctl restart docker diff --git a/.pfnci/windows/GHA-test.ps1 b/.pfnci/windows/GHA-test.ps1 new file mode 100644 index 00000000000..5557d0b8244 --- /dev/null +++ b/.pfnci/windows/GHA-test.ps1 @@ -0,0 +1,120 @@ +Param( + [Parameter(Mandatory=$true)] + [String]$stage, + [Parameter(Mandatory=$false)] + [String]$python, + [Parameter(Mandatory=$false)] + [String]$cuda +) + +$ErrorActionPreference = "Stop" +. "$PSScriptRoot\_error_handler.ps1" + +. "$PSScriptRoot\_flexci.ps1" + + +function FindAndCheckMSVC { + # Note: this assumes vs2017, e.g. see _find_vc2017(): + # https://github.com/pypa/setuptools/blob/9692cde009af4651819d18a1e839d3b6e3fcd77d/setuptools/_distutils/_msvccompiler.py#L67 + + $vsPath = & "${env:ProgramFiles(x86)}\Microsoft Visual Studio\Installer\vswhere.exe" ` + -latest ` + -products * ` + -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 ` + -property installationPath + $clPath = Join-Path $vsPath "VC\Tools\MSVC\*\bin\Hostx64\x64\cl.exe" + $clPath = (Get-ChildItem $clPath).FullName + echo "found cl.exe: $clPath" + # For some reason below just doesn't work in the CI... + Start-Process -NoNewWindow -RedirectStandardError cl.out -FilePath "$clPath" + $CL_VERSION_STRING = & type cl.out + if (($CL_VERSION_STRING -join " ") -match "Version (\d+\.\d+)\.\d+") { + $CL_VERSION = $matches[1] + echo "Detected cl.exe version: $CL_VERSION" + } +} + + +function Main { + # Setup environment + if ($stage -eq "setup") { + echo "Using CUDA $cuda and Python $python" + ActivateCUDA $cuda + if ($cuda -eq "10.2") { + ActivateCuDNN "8.6" $cuda + } else { + ActivateCuDNN "8.8" $cuda + } + ActivateNVTX1 + ActivatePython $python + echo "Setting up test environment" + RunOrDie python -V + RunOrDie python -m pip install -U pip setuptools wheel + RunOrDie python -m pip freeze + + # Check MSVC version + # TODO: we might want to be able to choose MSVC version in the future + FindAndCheckMSVC + + return + } + elseif ($stage -eq "build") { + # Setup build environment variables + $Env:CUPY_NUM_BUILD_JOBS = "16" + $Env:CUPY_NVCC_GENERATE_CODE = "current" + echo "Environment:" + RunOrDie cmd.exe /C set + + echo "Building..." + $build_retval = 0 + RunOrDie python -m pip install -U "numpy" "scipy==1.12.*" + python -m pip install ".[all,test]" -v + if (-not $?) { + $build_retval = $LastExitCode + } + + if ($build_retval -ne 0) { + throw "Build failed with status $build_retval" + } + + return + } + elseif ($stage -eq "test") { + $pytest_opts = "-m", '"not slow"' + } + elseif ($stage -eq "slow") { + $pytest_opts = "-m", "slow" + } + else { + throw "Unsupported stage: $stage" + } + + $Env:CUPY_TEST_GPU_LIMIT = $Env:GPU + $Env:CUPY_DUMP_CUDA_SOURCE_ON_ERROR = "1" + + # # TODO: update this function? + # $is_pull_request = IsPullRequestTest + # if (-Not $is_pull_request) { + # $Env:CUPY_TEST_FULL_COMBINATION = "1" + # } + + # # TODO: do we still need zlib these days? + # # Install dependency for cuDNN 8.3+ + # echo ">> Installing zlib" + # InstallZLIB + + pushd tests + echo "CuPy Configuration:" + RunOrDie python -c "import cupy; print(cupy); cupy.show_config()" + echo "Running test..." + $pytest_tests = (Get-ChildItem "cupy_tests/core_tests/test*.py").FullName -join " " # TODO: remove me + # TODO: pass timeout as a function argument? + $test_retval = RunWithTimeout -timeout 18000 -output "" -- python -m pytest -rfEX @pytest_opts --maxfail=10 $pytest_tests + popd + + if ($test_retval -ne 0) { + throw "Test failed with status $test_retval" + } +} + +Main diff --git a/.pfnci/windows/_error_handler.ps1 b/.pfnci/windows/_error_handler.ps1 index 5750a59e014..435db520cf1 100644 --- a/.pfnci/windows/_error_handler.ps1 +++ b/.pfnci/windows/_error_handler.ps1 @@ -19,13 +19,18 @@ function RunWithTimeout { [Parameter(Mandatory=$true)] [int]$timeout, [Parameter(Mandatory=$true)] + [AllowEmptyString()] [string]$output, [Parameter(Mandatory=$true)] [string]$command, [Parameter(Mandatory=$true, ValueFromRemainingArguments=$true)] [string[]]$params ) - $process = Start-Process -PassThru -NoNewWindow -RedirectStandardOutput $output -FilePath $command -ArgumentList $params + if ($output) { + $process = Start-Process -PassThru -NoNewWindow -RedirectStandardOutput $output -FilePath $command -ArgumentList $params + } else { + $process = Start-Process -PassThru -NoNewWindow -FilePath $command -ArgumentList $params + } try { $process | Wait-Process -Timeout $timeout } catch [TimeoutException] {