From f1e974e97e6b5afc6bcfe16092b0b4ef0fbe1548 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Thu, 17 Oct 2024 16:56:47 -0400 Subject: [PATCH 1/7] ci(cuda): bump CUDA to 12.6, TF to 2.18, PT to 2.5 Signed-off-by: Jinzhe Zeng --- .github/workflows/test_cuda.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test_cuda.yml b/.github/workflows/test_cuda.yml index d60a9c909a..0b123f3af4 100644 --- a/.github/workflows/test_cuda.yml +++ b/.github/workflows/test_cuda.yml @@ -19,7 +19,7 @@ jobs: runs-on: nvidia # https://github.com/deepmodeling/deepmd-kit/pull/2884#issuecomment-1744216845 container: - image: nvidia/cuda:12.3.1-devel-ubuntu22.04 + image: nvidia/cuda:12.6-devel-ubuntu24.04 options: --gpus all if: github.repository_owner == 'deepmodeling' && (github.event_name == 'pull_request' && github.event.label && github.event.label.name == 'Test CUDA' || github.event_name == 'workflow_dispatch' || github.event_name == 'merge_group') steps: @@ -47,7 +47,7 @@ jobs: && sudo apt-get -y install cuda-12-3 libcudnn8=8.9.5.*-1+cuda12.3 if: false # skip as we use nvidia image - run: python -m pip install -U uv - - run: source/install/uv_with_retry.sh pip install --system "tensorflow>=2.15.0rc0" "torch==2.3.1.*" + - run: source/install/uv_with_retry.sh pip install --system "tensorflow~=2.18.0rc2" "torch~=2.5.0" - run: | export PYTORCH_ROOT=$(python -c 'import torch;print(torch.__path__[0])') export TENSORFLOW_ROOT=$(python -c 'import importlib,pathlib;print(pathlib.Path(importlib.util.find_spec("tensorflow").origin).parent)') @@ -63,7 +63,7 @@ jobs: CUDA_VISIBLE_DEVICES: 0 - name: Download libtorch run: | - wget https://download.pytorch.org/libtorch/cu121/libtorch-cxx11-abi-shared-with-deps-2.2.1%2Bcu121.zip -O libtorch.zip + wget https://download.pytorch.org/libtorch/cu124/libtorch-cxx11-abi-shared-with-deps-2.5.0%2Bcu124.zip -O libtorch.zip unzip libtorch.zip - run: | export CMAKE_PREFIX_PATH=$GITHUB_WORKSPACE/libtorch From 6aca89ab4fe620a928393cdfa1b332c0abac4693 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Thu, 17 Oct 2024 17:00:51 -0400 Subject: [PATCH 2/7] bump more versions Signed-off-by: Jinzhe Zeng --- .github/workflows/test_python.yml | 2 +- pyproject.toml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test_python.yml b/.github/workflows/test_python.yml index 87d7266e03..e46bddd98a 100644 --- a/.github/workflows/test_python.yml +++ b/.github/workflows/test_python.yml @@ -26,7 +26,7 @@ jobs: - run: python -m pip install -U uv - run: | source/install/uv_with_retry.sh pip install --system mpich - source/install/uv_with_retry.sh pip install --system "torch==2.3.0+cpu.cxx11.abi" -i https://download.pytorch.org/whl/ + source/install/uv_with_retry.sh pip install --system torch -i https://download.pytorch.org/whl/cpu export PYTORCH_ROOT=$(python -c 'import torch;print(torch.__path__[0])') source/install/uv_with_retry.sh pip install --system --only-binary=horovod -e .[cpu,test,jax] horovod[tensorflow-cpu] mpi4py env: diff --git a/pyproject.toml b/pyproject.toml index b13dceeb07..e8b7d82558 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -276,8 +276,8 @@ PATH = "/usr/lib64/mpich/bin:$PATH" UV_EXTRA_INDEX_URL = "https://download.pytorch.org/whl/cpu" # trick to find the correction version of mpich CMAKE_PREFIX_PATH="/opt/python/cp311-cp311/" -# PT 2.4.0 requires cudnn 9, incompatible with TF with cudnn 8 -PYTORCH_VERSION = "2.3.1" +TENSORFLOW_VERSION = "2.18.0rc2" +PYTORCH_VERSION = "2.5.0" [tool.cibuildwheel.windows] test-extras = ["cpu", "torch"] From d3fe74ab3aae625dab9e05b186b71a6d45de9e0a Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Thu, 17 Oct 2024 18:09:04 -0400 Subject: [PATCH 3/7] 12.6.0 Signed-off-by: Jinzhe Zeng --- .github/workflows/test_cuda.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test_cuda.yml b/.github/workflows/test_cuda.yml index 0b123f3af4..01c9b327fb 100644 --- a/.github/workflows/test_cuda.yml +++ b/.github/workflows/test_cuda.yml @@ -19,7 +19,7 @@ jobs: runs-on: nvidia # https://github.com/deepmodeling/deepmd-kit/pull/2884#issuecomment-1744216845 container: - image: nvidia/cuda:12.6-devel-ubuntu24.04 + image: nvidia/cuda:12.6.0-devel-ubuntu24.04 options: --gpus all if: github.repository_owner == 'deepmodeling' && (github.event_name == 'pull_request' && github.event.label && github.event.label.name == 'Test CUDA' || github.event_name == 'workflow_dispatch' || github.event_name == 'merge_group') steps: From 8b895eabd48d729f012204934a6f160b14c2d70b Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Thu, 17 Oct 2024 21:25:22 -0400 Subject: [PATCH 4/7] cudnn Signed-off-by: Jinzhe Zeng --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index e8b7d82558..09155a421e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -132,7 +132,7 @@ cu12 = [ "nvidia-curand-cu12", "nvidia-cusolver-cu12", "nvidia-cusparse-cu12", - "nvidia-cudnn-cu12<9", + "nvidia-cudnn-cu12", "nvidia-cuda-nvcc-cu12", ] jax = [ From f116ebbc31da64c0231857794b367ff9729aeba5 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Fri, 18 Oct 2024 14:57:44 -0400 Subject: [PATCH 5/7] revert changes to the CUDA version Signed-off-by: Jinzhe Zeng --- .github/workflows/test_cuda.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test_cuda.yml b/.github/workflows/test_cuda.yml index 01c9b327fb..6bf4c8552f 100644 --- a/.github/workflows/test_cuda.yml +++ b/.github/workflows/test_cuda.yml @@ -19,7 +19,7 @@ jobs: runs-on: nvidia # https://github.com/deepmodeling/deepmd-kit/pull/2884#issuecomment-1744216845 container: - image: nvidia/cuda:12.6.0-devel-ubuntu24.04 + image: nvidia/cuda:12.3.1-devel-ubuntu22.04 options: --gpus all if: github.repository_owner == 'deepmodeling' && (github.event_name == 'pull_request' && github.event.label && github.event.label.name == 'Test CUDA' || github.event_name == 'workflow_dispatch' || github.event_name == 'merge_group') steps: From 363ace61319827cbf7fd603d450f9b779c51b2e2 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Thu, 24 Oct 2024 03:11:55 -0400 Subject: [PATCH 6/7] we need to have different PT versions for cuda 11 and 12 Signed-off-by: Jinzhe Zeng --- backend/find_pytorch.py | 13 +++++++++++++ backend/find_tensorflow.py | 6 +++--- pyproject.toml | 2 -- 3 files changed, 16 insertions(+), 5 deletions(-) diff --git a/backend/find_pytorch.py b/backend/find_pytorch.py index e01f4e84fe..ff645de458 100644 --- a/backend/find_pytorch.py +++ b/backend/find_pytorch.py @@ -22,6 +22,9 @@ Union, ) +from packaging.specifiers import ( + SpecifierSet, +) from packaging.version import ( Version, ) @@ -104,6 +107,16 @@ def get_pt_requirement(pt_version: str = "") -> dict: """ if pt_version is None: return {"torch": []} + if os.environ.get("CIBUILDWHEEL", "0") == "1": + cuda_version = os.environ.get("CUDA_VERSION", "12.2") + if cuda_version == "" or cuda_version in SpecifierSet(">=12,<13"): + # CUDA 12.2, cudnn 9 + pt_version = "2.5.0" + elif cuda_version in SpecifierSet(">=11,<12"): + # CUDA 11.8, cudnn 8 + pt_version = "2.3.1" + else: + raise RuntimeError("Unsupported CUDA version") from None if pt_version == "": pt_version = os.environ.get("PYTORCH_VERSION", "") diff --git a/backend/find_tensorflow.py b/backend/find_tensorflow.py index 5b0de0b2dd..1fc3a8a6d9 100644 --- a/backend/find_tensorflow.py +++ b/backend/find_tensorflow.py @@ -85,14 +85,14 @@ def find_tensorflow() -> tuple[Optional[str], list[str]]: if os.environ.get("CIBUILDWHEEL", "0") == "1": cuda_version = os.environ.get("CUDA_VERSION", "12.2") if cuda_version == "" or cuda_version in SpecifierSet(">=12,<13"): - # CUDA 12.2 + # CUDA 12.2, cudnn 9 requires.extend( [ - "tensorflow-cpu>=2.15.0rc0; platform_machine=='x86_64' and platform_system == 'Linux'", + "tensorflow-cpu>=2.18.0rc0; platform_machine=='x86_64' and platform_system == 'Linux'", ] ) elif cuda_version in SpecifierSet(">=11,<12"): - # CUDA 11.8 + # CUDA 11.8, cudnn 8 requires.extend( [ "tensorflow-cpu>=2.5.0rc0,<2.15; platform_machine=='x86_64' and platform_system == 'Linux'", diff --git a/pyproject.toml b/pyproject.toml index c0c6b13719..06d39fe2f6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -276,8 +276,6 @@ PATH = "/usr/lib64/mpich/bin:$PATH" UV_EXTRA_INDEX_URL = "https://download.pytorch.org/whl/cpu" # trick to find the correction version of mpich CMAKE_PREFIX_PATH="/opt/python/cp311-cp311/" -TENSORFLOW_VERSION = "2.18.0rc2" -PYTORCH_VERSION = "2.5.0" [tool.cibuildwheel.windows] test-extras = ["cpu", "torch"] From 8a8969a04c6f33c4d9326828647956a7659927b7 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Thu, 24 Oct 2024 03:23:44 -0400 Subject: [PATCH 7/7] only apply linux_x86_64 Signed-off-by: Jinzhe Zeng --- backend/find_pytorch.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/backend/find_pytorch.py b/backend/find_pytorch.py index ff645de458..6ca4ddb0ab 100644 --- a/backend/find_pytorch.py +++ b/backend/find_pytorch.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: LGPL-3.0-or-later import importlib import os +import platform import site from functools import ( lru_cache, @@ -107,7 +108,11 @@ def get_pt_requirement(pt_version: str = "") -> dict: """ if pt_version is None: return {"torch": []} - if os.environ.get("CIBUILDWHEEL", "0") == "1": + if ( + os.environ.get("CIBUILDWHEEL", "0") == "1" + and platform.system() == "Linux" + and platform.machine() == "x86_64" + ): cuda_version = os.environ.get("CUDA_VERSION", "12.2") if cuda_version == "" or cuda_version in SpecifierSet(">=12,<13"): # CUDA 12.2, cudnn 9