From 3e58282c91b9d3a6cbd09eeab579844b55298dde Mon Sep 17 00:00:00 2001
From: Yifan Li <109183385+yf711@users.noreply.github.com>
Date: Thu, 25 Jul 2024 10:17:16 -0700
Subject: [PATCH] Set CUDA12 as default in GPU packages (#21438)

### Description
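* Switch the default CUDA version in the GPU CI pipelines from 11.8 to 12.2
  (11.8 remains selectable)
* Make CUDA 12 the default in the YAML pipelines that publish the
  NuGet/Python/Java GPU packages
* Suppress MSVC warnings C4267/C4244 (treated as errors) in flash_api.cc
  for the ORT Windows build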
---
 .../cuda/bert/flash_attention/flash_api.cc    |  8 ++++++
 .../azure-pipelines/linux-gpu-ci-pipeline.yml |  2 +-
 .../linux-gpu-tensorrt-ci-pipeline.yml        |  2 +-
 ...linux-gpu-tensorrt-daily-perf-pipeline.yml |  8 +++---
 .../nuget-cuda-publishing-pipeline.yml        | 15 ++++++-----
 .../github/azure-pipelines/publish-nuget.yml  | 26 +++++++++++++++----
 .../py-cuda-publishing-pipeline.yml           |  2 +-
 .../stages/java-cuda-publishing-stage.yml     |  2 +-
 .../jobs/download_win_gpu_library.yml         |  2 +-
 .../templates/jobs/set-winenv.yml             |  4 +--
 .../azure-pipelines/win-gpu-ci-pipeline.yml   | 13 +++++++---
 .../win-gpu-tensorrt-ci-pipeline.yml          | 19 ++++++++++++--
 .../docker/Dockerfile.manylinux2_28_cuda      |  2 +-
 .../Dockerfile.package_ubi8_cuda_tensorrt10_0 |  6 ++---
 .../github/windows/setup_env_cuda.bat         | 14 +++++-----
 .../ci_build/github/windows/setup_env_gpu.bat | 16 ++++++------
 .../ci_build/github/windows/setup_env_trt.bat |  8 +++---
 17 files changed, 97 insertions(+), 52 deletions(-)

diff --git a/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_api.cc b/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_api.cc
index 90f0b94cafce8..967c04c52b182 100644
--- a/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_api.cc
+++ b/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_api.cc
@@ -92,6 +92,11 @@ void set_params_fprop(Flash_fwd_params& params,
   params.softmax_lse_ptr = softmax_lse_d;
 
   // Set the dimensions.
+#if defined(_MSC_VER)
+#pragma warning(push)
+#pragma warning(disable : 4267)  // Ignore conversion from 'size_t' to 'int', possible loss of data
+#pragma warning(disable : 4244)  // Ignore conversion from 'double' to 'float', possible loss of data
+#endif
   params.b = batch_size;
   params.h = num_heads;
   params.h_k = num_heads_k;
@@ -119,6 +124,9 @@ void set_params_fprop(Flash_fwd_params& params,
   if (window_size_left >= 0 && window_size_right < 0) {
     window_size_right = seqlen_k;
   }
+#if defined(_MSC_VER)
+#pragma warning(pop)
+#endif
   params.window_size_left = window_size_left;
   params.window_size_right = window_size_right;
 
diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml
index 8890a9c4bf56b..c31fff510bda0 100644
--- a/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml
@@ -30,7 +30,7 @@ parameters:
   - name: CudaVersion
     displayName: CUDA version
     type: string
-    default: '11.8'
+    default: '12.2'
     values:
       - 11.8
       - 12.2
diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-ci-pipeline.yml
index 3f9707ff50519..d93c49fe3ab37 100644
--- a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-ci-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-ci-pipeline.yml
@@ -30,7 +30,7 @@ parameters:
   - name: CudaVersion
     displayName: CUDA version
     type: string
-    default: '11.8'
+    default: '12.2'
     values:
       - 11.8
       - 12.2
diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-daily-perf-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-daily-perf-pipeline.yml
index 7cfff805c3b3c..4ab1b4996a1db 100644
--- a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-daily-perf-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-daily-perf-pipeline.yml
@@ -8,14 +8,12 @@ parameters:
 - name: TrtVersion
   displayName: TensorRT Version
   type: string
-  default: 10.0.cuda_11_8_cudnn_8
+  default: 10.2.cuda_12_5_cudnn_9
   values:
-  - 8.4.cuda_11_6_cudnn_8
-  - 8.5.cuda_11_8_cudnn_8
   - 8.6.cuda_11_8_cudnn_8
   - 8.6.cuda_12_3_cudnn_9
-  - 10.0.cuda_11_8_cudnn_8
-  - 10.0.cuda_12_4_cudnn_9
+  - 10.2.cuda_11_8_cudnn_8
+  - 10.2.cuda_12_5_cudnn_9
   - BIN
 
 - name: UseTensorrtOssParser
diff --git a/tools/ci_build/github/azure-pipelines/nuget-cuda-publishing-pipeline.yml b/tools/ci_build/github/azure-pipelines/nuget-cuda-publishing-pipeline.yml
index 4bfd726f5c58c..aeb250e1e0cbc 100644
--- a/tools/ci_build/github/azure-pipelines/nuget-cuda-publishing-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/nuget-cuda-publishing-pipeline.yml
@@ -6,6 +6,7 @@ resources:
       branches:
         include:
         - main
+        - rel-*
     branch: main
 
 parameters:
@@ -16,15 +17,15 @@ parameters:
 variables:
   - name: ArtifactFeed
     ${{ if eq(parameters.isReleaseBuild, false) }}:
-      value: ort-cuda-12-nightly
+      value: ORT-Nightly
     ${{ else }}:
       value: onnxruntime-cuda-12
 
 stages:
-- template: stages/nuget-cuda-publishing-stage.yml
-  parameters:
-    artifact_feed: $(ArtifactFeed)
+  - template: stages/nuget-cuda-publishing-stage.yml
+    parameters:
+      artifact_feed: $(ArtifactFeed)
 
-- template: stages/java-cuda-publishing-stage.yml
-  parameters:
-    artifact_feed: $(ArtifactFeed)
+  - template: stages/java-cuda-publishing-stage.yml
+    parameters:
+      artifact_feed: $(ArtifactFeed)
\ No newline at end of file
diff --git a/tools/ci_build/github/azure-pipelines/publish-nuget.yml b/tools/ci_build/github/azure-pipelines/publish-nuget.yml
index e0c588413415b..206a9464de6ef 100644
--- a/tools/ci_build/github/azure-pipelines/publish-nuget.yml
+++ b/tools/ci_build/github/azure-pipelines/publish-nuget.yml
@@ -9,10 +9,22 @@ resources:
         - rel-*
     branch: main
 
+parameters:
+  - name: isReleaseBuild
+    type: boolean
+    default: false
+
+variables:
+  - name: ArtifactFeed
+    ${{ if eq(parameters.isReleaseBuild, false) }}:
+      value: ort-cuda-11-nightly
+    ${{ else }}:
+      value: onnxruntime-cuda-11
+
 stages:
   - template: templates/publish-nuget-steps.yml
     parameters:
-      stage_name: 'Publish_NuGet_Packag_And_Report'
+      stage_name: 'Publish_NuGet_Package_And_Report'
       include_cpu_ep: true
       download_artifacts_steps:
         - download: build
@@ -25,7 +37,11 @@ stages:
           artifact: 'drop-signed-nuget-Training-CPU'
         - script: move "$(Pipeline.Workspace)\build\drop-signed-nuget-Training-CPU\*" $(Build.BinariesDirectory)\nuget-artifact\final-package
 
-        - download: build
-          displayName: 'Download Pipeline Artifact - Signed NuGet Package'
-          artifact: 'drop-signed-nuget-GPU'
-        - script: move "$(Pipeline.Workspace)\build\drop-signed-nuget-GPU\*" $(Build.BinariesDirectory)\nuget-artifact\final-package
+  # Publish CUDA 11 Nuget/Java pkgs to ADO feed
+  - template: stages/nuget-cuda-publishing-stage.yml
+    parameters:
+      artifact_feed: $(ArtifactFeed)
+
+  - template: stages/java-cuda-publishing-stage.yml
+    parameters:
+      artifact_feed: $(ArtifactFeed)
\ No newline at end of file
diff --git a/tools/ci_build/github/azure-pipelines/py-cuda-publishing-pipeline.yml b/tools/ci_build/github/azure-pipelines/py-cuda-publishing-pipeline.yml
index 50e0ca3708d2d..1217163c07132 100644
--- a/tools/ci_build/github/azure-pipelines/py-cuda-publishing-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/py-cuda-publishing-pipeline.yml
@@ -16,7 +16,7 @@ parameters:
 variables:
   - name: ArtifactFeed
     ${{ if eq(parameters.isReleaseBuild, false) }}:
-      value: ort-cuda-12-nightly
+      value: ORT-Nightly
     ${{ else }}:
       value: onnxruntime-cuda-12
 
diff --git a/tools/ci_build/github/azure-pipelines/stages/java-cuda-publishing-stage.yml b/tools/ci_build/github/azure-pipelines/stages/java-cuda-publishing-stage.yml
index 70d92286b3964..946d651b795d4 100644
--- a/tools/ci_build/github/azure-pipelines/stages/java-cuda-publishing-stage.yml
+++ b/tools/ci_build/github/azure-pipelines/stages/java-cuda-publishing-stage.yml
@@ -8,7 +8,7 @@ stages:
   jobs:
   - job: JAR_Publishing_GPU
     #TD-DO: figure out a way to package nightly jar. Currently Java version are set from VERSION_NUMBER file
-    condition: ${{ eq(parameters.artifact_feed, 'onnxruntime-cuda-12') }}
+    condition: ${{ or(eq(parameters.artifact_feed, 'onnxruntime-cuda-11'), eq(parameters.artifact_feed, 'onnxruntime-cuda-12')) }}
     workspace:
       clean: all
     pool: 'onnxruntime-Win-CPU-2022'
diff --git a/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_gpu_library.yml b/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_gpu_library.yml
index de29a3de9fded..6459888a40aea 100644
--- a/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_gpu_library.yml
+++ b/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_gpu_library.yml
@@ -7,7 +7,7 @@ parameters:
     default: false
   - name: CudaVersion
     type: string
-    default: '11.8'
+    default: '12.2'
     values:
       - 11.8
       - 12.2
diff --git a/tools/ci_build/github/azure-pipelines/templates/jobs/set-winenv.yml b/tools/ci_build/github/azure-pipelines/templates/jobs/set-winenv.yml
index 63d521f1e7d9a..fba463b49016a 100644
--- a/tools/ci_build/github/azure-pipelines/templates/jobs/set-winenv.yml
+++ b/tools/ci_build/github/azure-pipelines/templates/jobs/set-winenv.yml
@@ -9,10 +9,10 @@ parameters:
     default: false
   - name: PrimaryCUDAVersion
     type: string
-    default: '11.8'
+    default: '12.2'
   - name: SecondaryCUDAVersion
     type: string
-    default: '12.2'
+    default: '11.8'
 
 steps:
   - ${{ if eq(parameters.DownloadCUDA, 'true') }}:
diff --git a/tools/ci_build/github/azure-pipelines/win-gpu-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-gpu-ci-pipeline.yml
index 438e51175c5b4..c5262880c4c55 100644
--- a/tools/ci_build/github/azure-pipelines/win-gpu-ci-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/win-gpu-ci-pipeline.yml
@@ -28,6 +28,13 @@ pr:
 #### end trigger ####
 
 parameters:
+- name: CudaVersion
+  displayName: CUDA version
+  type: string
+  default: '12.2'
+  values:
+    - 11.8
+    - 12.2
 - name: RunOnnxRuntimeTests
   displayName: Run Tests?
   type: boolean
@@ -43,7 +50,7 @@ stages:
         EnvSetupScript: setup_env_cuda.bat
         buildArch: x64
         additionalBuildFlags: >-
-          --enable_pybind --build_java --build_nodejs --use_cuda --cuda_home="$(Agent.TempDirectory)\v11.8"
+          --enable_pybind --build_java --build_nodejs --use_cuda --cuda_home="$(Agent.TempDirectory)\v${{ parameters.CudaVersion }}"
           --enable_cuda_profiling --enable_transformers_tool_test
           --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86
           --cmake_extra_defines onnxruntime_BUILD_UNIT_TESTS=ON
@@ -65,7 +72,7 @@ stages:
         EnvSetupScript: setup_env_cuda.bat
         buildArch: x64
         additionalBuildFlags: >-
-          --enable_pybind --enable_training --use_cuda --cuda_home="$(Agent.TempDirectory)\v11.8"
+          --enable_pybind --enable_training --use_cuda --cuda_home="$(Agent.TempDirectory)\v${{ parameters.CudaVersion }}"
           --skip_onnx_tests
           --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86
         msbuildPlatform: x64
@@ -105,7 +112,7 @@ stages:
         # note: need to specify `--gen_doc` when creating the build config so it has to be in additionalBuildFlags
         additionalBuildFlags: >-
           --gen_doc validate --skip_tests --enable_pybind --use_dml --use_cuda
-          --cuda_home="$(Agent.TempDirectory)\v11.8"
+          --cuda_home="$(Agent.TempDirectory)\v${{ parameters.CudaVersion }}"
           --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86
           --cmake_extra_defines onnxruntime_BUILD_UNIT_TESTS=OFF
         msbuildPlatform: x64
diff --git a/tools/ci_build/github/azure-pipelines/win-gpu-tensorrt-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-gpu-tensorrt-ci-pipeline.yml
index 70c0c7d4a04e7..8c9ecdfb90191 100644
--- a/tools/ci_build/github/azure-pipelines/win-gpu-tensorrt-ci-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/win-gpu-tensorrt-ci-pipeline.yml
@@ -26,6 +26,21 @@ pr:
     - 'js/web'
     - 'onnxruntime/core/providers/js'
 #### end trigger ####
+parameters:
+- name: CudaVersion
+  displayName: CUDA version
+  type: string
+  default: '12.2'
+  values:
+    - 11.8
+    - 12.2
+
+variables:
+  - name: win_trt_folder
+    ${{ if eq(parameters.CudaVersion, '11.8') }}:
+      value: TensorRT-10.2.0.19.Windows10.x86_64.cuda-11.8
+    ${{ if eq(parameters.CudaVersion, '12.2') }}:
+      value: TensorRT-10.2.0.19.Windows10.x86_64.cuda-12.5
 
 jobs:
 - job: 'build'
@@ -55,7 +70,7 @@ jobs:
       WithCache: True
       Today: $(TODAY)
       AdditionalKey: "gpu-tensorrt | RelWithDebInfo"
-      BuildPyArguments: '--config RelWithDebInfo --parallel --use_binskim_compliant_compile_flags --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --update --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.2.0.19.Windows10.x86_64.cuda-11.8" --cuda_home="$(Agent.TempDirectory)\v11.8" --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86'
+      BuildPyArguments: '--config RelWithDebInfo --parallel --use_binskim_compliant_compile_flags --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --update --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\${{ variables.win_trt_folder }}" --cuda_home="$(Agent.TempDirectory)\v${{ parameters.CudaVersion }}" --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86'
       MsbuildArguments: $(MsbuildArguments)
       BuildArch: 'x64'
       Platform: 'x64'
@@ -75,7 +90,7 @@ jobs:
      del wheel_filename_file
      python.exe -m pip install -q --upgrade %WHEEL_FILENAME%
      set PATH=$(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo;%PATH%
-     python $(Build.SourcesDirectory)\tools\ci_build\build.py --config RelWithDebInfo --use_binskim_compliant_compile_flags --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.2.0.19.Windows10.x86_64.cuda-11.8"  --cuda_home="$(Agent.TempDirectory)\v11.8" --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=75
+     python $(Build.SourcesDirectory)\tools\ci_build\build.py --config RelWithDebInfo --use_binskim_compliant_compile_flags --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\${{ variables.win_trt_folder }}"  --cuda_home="$(Agent.TempDirectory)\v${{ parameters.CudaVersion }}" --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86
 
     workingDirectory: '$(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo'
     displayName: 'Run tests'
diff --git a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda
index d96b342974273..07885ba65af8a 100644
--- a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda
+++ b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda
@@ -2,7 +2,7 @@
 # Please overwrite BASEIMAGE, TRT_VERSION and other arguments with
 # --docker-build-args ' --build-arg BASEIMAGE=other_base_image --build-arg TRT_VERSION=other_trt_version etc...'
 # for other cuda version and TRT version
-ARG BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubi8
+ARG BASEIMAGE=nvidia/cuda:12.5.1-cudnn-devel-ubi8
 
 FROM $BASEIMAGE
 ARG TRT_VERSION
diff --git a/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0 b/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0
index 2d3dc05285e3c..b587a7df554bd 100644
--- a/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0
+++ b/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0
@@ -2,11 +2,11 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # Licensed under the MIT License.
 # --------------------------------------------------------------
-# Dockerfile to Test ONNX Runtime on UBI8 with TensorRT 10.0 and CUDA 11.8 by default
+# Dockerfile to Test ONNX Runtime on UBI8 with TensorRT 10 and CUDA 12 by default
 
 # Build base image with required system packages
-ARG BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubi8
-ARG TRT_VERSION=10.2.0.19-1.cuda11.8
+ARG BASEIMAGE=nvidia/cuda:12.5.1-cudnn-devel-ubi8
+ARG TRT_VERSION=10.2.0.19-1.cuda12.5
 FROM $BASEIMAGE AS base
 ARG TRT_VERSION
 ENV PATH /opt/python/cp38-cp38/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/src/tensorrt/bin:${PATH}
diff --git a/tools/ci_build/github/windows/setup_env_cuda.bat b/tools/ci_build/github/windows/setup_env_cuda.bat
index 2233f7611ab6a..f93938e2a9009 100644
--- a/tools/ci_build/github/windows/setup_env_cuda.bat
+++ b/tools/ci_build/github/windows/setup_env_cuda.bat
@@ -1,17 +1,17 @@
 REM Copyright (c) Microsoft Corporation. All rights reserved.
 REM Licensed under the MIT License.
 
-if exist PATH=%AGENT_TEMPDIRECTORY%\v11.8\ (
-set PATH=%AGENT_TEMPDIRECTORY%\v11.8\bin;%AGENT_TEMPDIRECTORY%\v11.8\extras\CUPTI\lib64;%PATH%
+if exist PATH=%AGENT_TEMPDIRECTORY%\v12.2\ (
+set PATH=%AGENT_TEMPDIRECTORY%\v12.2\bin;%AGENT_TEMPDIRECTORY%\v12.2\extras\CUPTI\lib64;%PATH%
 ) else (
-    set PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8\extras\CUPTI\lib64;%PATH%
+    set PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\extras\CUPTI\lib64;%PATH%
 )
 
-@REM The default version is still cuda v11.8, because set cuda v12.2 after it
-if exist PATH=%AGENT_TEMPDIRECTORY%\v12.2\ (
-    set PATH=%PATH%;%AGENT_TEMPDIRECTORY%\v12.2\bin;%AGENT_TEMPDIRECTORY%\v12.2\extras\CUPTI\lib64
+@REM CUDA v12.2 stays the default because the v11.8 paths are appended to PATH after it
+if exist PATH=%AGENT_TEMPDIRECTORY%\v11.8\ (
+    set PATH=%PATH%;%AGENT_TEMPDIRECTORY%\v11.8\bin;%AGENT_TEMPDIRECTORY%\v11.8\extras\CUPTI\lib64
 ) else (
-    set PATH=%PATH%;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\extras\CUPTI\lib64
+    set PATH=%PATH%;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8\extras\CUPTI\lib64
 )
 
 set GRADLE_OPTS=-Dorg.gradle.daemon=false
diff --git a/tools/ci_build/github/windows/setup_env_gpu.bat b/tools/ci_build/github/windows/setup_env_gpu.bat
index 6c59866ea925a..35e4f7e302430 100644
--- a/tools/ci_build/github/windows/setup_env_gpu.bat
+++ b/tools/ci_build/github/windows/setup_env_gpu.bat
@@ -1,17 +1,17 @@
 REM Copyright (c) Microsoft Corporation. All rights reserved.
 REM Licensed under the MIT License.
 
-if exist PATH=%AGENT_TEMPDIRECTORY%\v11.8\ (
-    set PATH=%AGENT_TEMPDIRECTORY%\v11.8\bin;%AGENT_TEMPDIRECTORY%\v11.8\extras\CUPTI\lib64;%PATH%
+if exist PATH=%AGENT_TEMPDIRECTORY%\v12.2\ (
+    set PATH=%AGENT_TEMPDIRECTORY%\v12.2\bin;%AGENT_TEMPDIRECTORY%\v12.2\extras\CUPTI\lib64;%PATH%
 ) else (
-    set PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8\extras\CUPTI\lib64;%PATH%
+    set PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\extras\CUPTI\lib64;%PATH%
 )
-set PATH=%AGENT_TEMPDIRECTORY%\TensorRT-10.2.0.19.Windows10.x86_64.cuda-11.8\lib;%PATH%
+set PATH=%AGENT_TEMPDIRECTORY%\TensorRT-10.2.0.19.Windows10.x86_64.cuda-12.5\lib;%PATH%
 
-@REM The default version is still cuda v11.8, because set cuda v12.2 after it
-set PATH=%PATH%;%AGENT_TEMPDIRECTORY%\TensorRT-10.2.0.19.Windows10.x86_64.cuda-12.5\lib
-if exist PATH=%AGENT_TEMPDIRECTORY%\v12.2\ (
-    set PATH=%PATH%;%AGENT_TEMPDIRECTORY%\v12.2\bin;%AGENT_TEMPDIRECTORY%\v12.2\extras\CUPTI\lib64
+@REM CUDA v12.2 stays the default because the v11.8 paths are appended to PATH after it
+set PATH=%PATH%;%AGENT_TEMPDIRECTORY%\TensorRT-10.2.0.19.Windows10.x86_64.cuda-11.8\lib
+if exist PATH=%AGENT_TEMPDIRECTORY%\v11.8\ (
+    set PATH=%PATH%;%AGENT_TEMPDIRECTORY%\v11.8\bin;%AGENT_TEMPDIRECTORY%\v11.8\extras\CUPTI\lib64
 ) else (
     set PATH=%PATH%;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\\extras\CUPTI\lib64
 )
diff --git a/tools/ci_build/github/windows/setup_env_trt.bat b/tools/ci_build/github/windows/setup_env_trt.bat
index 249bb98815897..7ec7558edab39 100644
--- a/tools/ci_build/github/windows/setup_env_trt.bat
+++ b/tools/ci_build/github/windows/setup_env_trt.bat
@@ -1,11 +1,11 @@
 REM Copyright (c) Microsoft Corporation. All rights reserved.
 REM Licensed under the MIT License.
 
-if exist PATH=%AGENT_TEMPDIRECTORY%\v11.8\ (
-    set PATH=%PATH%;%AGENT_TEMPDIRECTORY%\v11.8\bin;%AGENT_TEMPDIRECTORY%\v11.8\extras\CUPTI\lib64
+if exist PATH=%AGENT_TEMPDIRECTORY%\v12.2\ (
+    set PATH=%PATH%;%AGENT_TEMPDIRECTORY%\v12.2\bin;%AGENT_TEMPDIRECTORY%\v12.2\extras\CUPTI\lib64
 ) else (
-    set PATH=%PATH%;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8\extras\CUPTI\lib64
+    set PATH=%PATH%;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\extras\CUPTI\lib64
 )
-set PATH=%AGENT_TEMPDIRECTORY%\TensorRT-10.2.0.19.Windows10.x86_64.cuda-11.8\lib;%PATH%
+set PATH=%AGENT_TEMPDIRECTORY%\TensorRT-10.2.0.19.Windows10.x86_64.cuda-12.5\lib;%PATH%
 set GRADLE_OPTS=-Dorg.gradle.daemon=false
 set CUDA_MODULE_LOADING=LAZY
\ No newline at end of file