diff --git a/onnxruntime/__init__.py b/onnxruntime/__init__.py
index 9d533af616288..13f48d1c38543 100644
--- a/onnxruntime/__init__.py
+++ b/onnxruntime/__init__.py
@@ -76,3 +76,97 @@
     __version__ = version
 
 onnxruntime_validation.check_distro_info()
+
+
+def check_and_load_cuda_libs(root_directory, cuda_libs_):
+    """Preload CUDA shared libraries found under ``root_directory`` via ctypes.
+
+    File names are matched case-insensitively against ``cuda_libs_`` while walking
+    the tree. An OSError from a failed load is logged and the search continues.
+    """
+    if not cuda_libs_:
+        logging.info("No CUDA libraries provided for loading.")
+        return
+    # Lowercase the targets so the comparison below is case-insensitive.
+    cuda_libs_ = {lib.lower() for lib in cuda_libs_}
+    loaded_libs = {}
+    for dirpath, _, filenames in os.walk(root_directory):
+        files_in_dir = {file.lower(): file for file in filenames}  # lowercase -> on-disk name
+        # Skip libraries that were already loaded from an earlier directory.
+        for lib in cuda_libs_.difference(loaded_libs).intersection(files_in_dir):
+            full_path = os.path.join(dirpath, files_in_dir[lib])
+            try:
+                ctypes.CDLL(full_path)
+            except OSError as e:
+                logging.info(f"Failed to load {full_path}: {e}")
+            else:
+                # Record successes only, so the summary messages stay truthful.
+                loaded_libs[lib] = full_path
+                logging.info(f"Successfully loaded: {full_path}")
+        # Stop walking once every requested library has been loaded.
+        if loaded_libs.keys() == cuda_libs_:
+            logging.info("All required CUDA libraries found and loaded.")
+            return
+    logging.info(
+        f"Failed to load CUDA libraries from site-packages/nvidia directory: {cuda_libs_.difference(loaded_libs)}. They might be loaded later from standard search paths for shared libraries."
+    )
+
+
+# When the build reports a CUDA version, try to preload NVIDIA libraries from site-packages/nvidia
+if cuda_version is not None and cuda_version != "":
+    import ctypes
+    import logging
+    import os
+    import platform
+    import site
+
+    cuda_version_ = tuple(map(int, cuda_version.split(".")))  # e.g. "12.2" -> (12, 2)
+    # Only the last directory reported by site.getsitepackages() is searched
+    site_packages_path = site.getsitepackages()[-1]
+    nvidia_path = os.path.join(site_packages_path, "nvidia")
+    # Library file names to look for; stays empty when no platform/version table below matches
+    cuda_libs = ()
+    if platform.system() == "Windows":
+        # DLL names to preload; nvrtc, curand and nvJitLink are not included for Windows
+        if (11, 0) <= cuda_version_ < (12, 0):
+            cuda_libs = (
+                "cublaslt64_11.dll",
+                "cublas64_11.dll",
+                "cufft64_10.dll",
+                "cudart64_11.dll",
+                "cudnn64_8.dll",
+            )
+        elif (12, 0) <= cuda_version_ < (13, 0):
+            cuda_libs = (
+                "cublaslt64_12.dll",
+                "cublas64_12.dll",
+                "cufft64_11.dll",
+                "cudart64_12.dll",
+                "cudnn64_9.dll",
+            )
+    elif platform.system() == "Linux":
+        if (11, 0) <= cuda_version_ < (12, 0):
+            # Shared-object names to preload for CUDA 11.x
+            cuda_libs = (
+                "libcublaslt.so.11",
+                "libcublas.so.11",
+                "libcurand.so.10",
+                "libcufft.so.10",
+                "libcudart.so.11",
+                "libcudnn.so.8",
+                "libnvrtc.so.11.2",
+                # "libnvrtc.so.11.2" is intentional (original author's note): it links to a more specific version like libnvrtc.so.11.8.89
+            )
+        elif (12, 0) <= cuda_version_ < (13, 0):
+            cuda_libs = (
+                "libcublaslt.so.12",
+                "libcublas.so.12",
+                "libcurand.so.10",
+                "libcufft.so.11",
+                "libcudart.so.12",
+                "libcudnn.so.9",
+                "libnvrtc.so.12",
+            )
+    else:
+        logging.info(f"Unsupported platform: {platform.system()}")
+    check_and_load_cuda_libs(nvidia_path, cuda_libs)
diff --git a/onnxruntime/python/onnxruntime_validation.py b/onnxruntime/python/onnxruntime_validation.py
index 4f29c7f424845..167f7976aecca 100644
--- a/onnxruntime/python/onnxruntime_validation.py
+++ b/onnxruntime/python/onnxruntime_validation.py
@@ -99,32 +99,33 @@ def validate_build_package_info():
     version = ""
     cuda_version = ""
 
-    if has_ortmodule:
-        try:
-            # collect onnxruntime package name, version, and cuda version
-            from .build_and_package_info import __version__ as version
-            from .build_and_package_info import package_name
+    try:
+        # collect onnxruntime package name, version, and cuda version
+        from .build_and_package_info import __version__ as version
+        from .build_and_package_info import package_name
+
+        try:  # noqa: SIM105
+            from .build_and_package_info import cuda_version
+        except Exception:
+            pass
 
-            try:  # noqa: SIM105
-                from .build_and_package_info import cuda_version
+        if cuda_version:
+            # collect cuda library build info. the library info may not be available
+            # when the build environment has none or multiple libraries installed
+            try:
+                from .build_and_package_info import cudart_version
             except Exception:
-                pass
-
-            if cuda_version:
-                # collect cuda library build info. the library info may not be available
-                # when the build environment has none or multiple libraries installed
-                try:
-                    from .build_and_package_info import cudart_version
-                except Exception:
-                    warnings.warn("WARNING: failed to get cudart_version from onnxruntime build info.")
-                    cudart_version = None
-
-                def print_build_package_info():
-                    warnings.warn(f"onnxruntime training package info: package_name: {package_name}")
-                    warnings.warn(f"onnxruntime training package info: __version__: {version}")
-                    warnings.warn(f"onnxruntime training package info: cuda_version: {cuda_version}")
-                    warnings.warn(f"onnxruntime build info: cudart_version: {cudart_version}")
+                warnings.warn("WARNING: failed to get cudart_version from onnxruntime build info.")
+                cudart_version = None
+
+            def print_build_package_info():
+                warnings.warn(f"onnxruntime training package info: package_name: {package_name}")
+                warnings.warn(f"onnxruntime training package info: __version__: {version}")
+                warnings.warn(f"onnxruntime training package info: cuda_version: {cuda_version}")
+                warnings.warn(f"onnxruntime build info: cudart_version: {cudart_version}")
 
+            # Cudart only available on Linux
+            if platform.system().lower() == "linux":
                 # collection cuda library info from current environment.
                 from onnxruntime.capi.onnxruntime_collect_build_info import find_cudart_versions
 
@@ -133,13 +134,13 @@ def print_build_package_info():
                     print_build_package_info()
                     warnings.warn("WARNING: failed to find cudart version that matches onnxruntime build info")
                     warnings.warn(f"WARNING: found cudart versions: {local_cudart_versions}")
-            else:
-                # TODO: rcom
-                pass
+        else:
+            # TODO: rcom
+            pass
 
-        except Exception as e:
-            warnings.warn("WARNING: failed to collect onnxruntime version and build info")
-            print(e)
+    except Exception as e:
+        warnings.warn("WARNING: failed to collect onnxruntime version and build info")
+        print(e)
 
     if import_ortmodule_exception:
         raise import_ortmodule_exception
diff --git a/setup.py b/setup.py
index 1ca31cb0019f0..2c9a8a5600401 100644
--- a/setup.py
+++ b/setup.py
@@ -7,6 +7,7 @@
 import datetime
 import logging
 import platform
+import re
 import shlex
 import subprocess
 import sys
@@ -54,6 +55,7 @@ def parse_arg_remove_string(argv, arg_name_equal):
 wheel_name_suffix = parse_arg_remove_string(sys.argv, "--wheel_name_suffix=")
 
 cuda_version = None
+cuda_version_major = None
 rocm_version = None
 is_migraphx = False
 is_rocm = False
@@ -63,6 +65,11 @@ def parse_arg_remove_string(argv, arg_name_equal):
 if wheel_name_suffix == "gpu":
     # TODO: how to support multiple CUDA versions?
     cuda_version = parse_arg_remove_string(sys.argv, "--cuda_version=")
+    if cuda_version is not None:
+        if not bool(re.match(r"^\d+\.\d+(\.\d+)?$", cuda_version)):
+            logger.error("CUDA version must be in format 'x.y' or  'x.y.z'")
+            sys.exit(1)
+        cuda_version_major = cuda_version.split(".")[0]
 elif parse_arg_remove_boolean(sys.argv, "--use_rocm"):
     is_rocm = True
     rocm_version = parse_arg_remove_string(sys.argv, "--rocm_version=")
@@ -705,11 +712,22 @@ def reformat_run_count(count_str):
     version_number = version_number + local_version
     if is_rocm and enable_rocm_profiling:
         version_number = version_number + ".profiling"
-
+extras_require = {}
 if wheel_name_suffix:
     if not (enable_training and wheel_name_suffix == "gpu"):
         # for training packages, local version is used to indicate device types
         package_name = f"{package_name}-{wheel_name_suffix}"
+    if (wheel_name_suffix == "gpu" or wheel_name_suffix == "cuda") and cuda_version_major is not None:
+        extras_require = {
+            # Optional 'cuda_dlls' dependencies
+            "cuda_dlls": [
+                f"nvidia-cuda-nvrtc-cu{cuda_version_major}",
+                f"nvidia-cuda-runtime-cu{cuda_version_major}",
+                f"nvidia-cudnn-cu{cuda_version_major}",
+                f"nvidia-cufft-cu{cuda_version_major}",
+                f"nvidia-curand-cu{cuda_version_major}",
+            ]
+        }
 
 cmd_classes = {}
 if bdist_wheel is not None:
@@ -727,21 +745,20 @@ def reformat_run_count(count_str):
     install_requires = f.read().splitlines()
 
 
-if enable_training:
-
-    def save_build_and_package_info(package_name, version_number, cuda_version, rocm_version):
-        sys.path.append(path.join(path.dirname(__file__), "onnxruntime", "python"))
-        from onnxruntime_collect_build_info import find_cudart_versions
+def save_build_and_package_info(package_name, version_number, cuda_version, rocm_version):
+    sys.path.append(path.join(path.dirname(__file__), "onnxruntime", "python"))
+    from onnxruntime_collect_build_info import find_cudart_versions
 
-        version_path = path.join("onnxruntime", "capi", "build_and_package_info.py")
-        with open(version_path, "w") as f:
-            f.write(f"package_name = '{package_name}'\n")
-            f.write(f"__version__ = '{version_number}'\n")
+    version_path = path.join("onnxruntime", "capi", "build_and_package_info.py")
+    with open(version_path, "w") as f:
+        f.write(f"package_name = '{package_name}'\n")
+        f.write(f"__version__ = '{version_number}'\n")
 
-            if cuda_version:
-                f.write(f"cuda_version = '{cuda_version}'\n")
+        if cuda_version:
+            f.write(f"cuda_version = '{cuda_version}'\n")
 
-                # cudart_versions are integers
+            # cudart_versions are integers
+            if platform.system().lower() == "linux":
                 cudart_versions = find_cudart_versions(build_env=True)
                 if cudart_versions and len(cudart_versions) == 1:
                     f.write(f"cudart_version = {cudart_versions[0]}\n")
@@ -754,12 +771,15 @@ def save_build_and_package_info(package_name, version_number, cuda_version, rocm
                             else "found multiple cudart libraries"
                         ),
                     )
-            elif rocm_version:
-                f.write(f"rocm_version = '{rocm_version}'\n")
+        elif rocm_version:
+            f.write(f"rocm_version = '{rocm_version}'\n")
 
+
+if enable_training:
     save_build_and_package_info(package_name, version_number, cuda_version, rocm_version)
+else:
+    save_build_and_package_info(package_name, version_number, cuda_version, None)
 
-# Setup
 setup(
     name=package_name,
     version=version_number,
@@ -783,4 +803,5 @@ def save_build_and_package_info(package_name, version_number, cuda_version, rocm
         ]
     },
     classifiers=classifiers,
+    extras_require=extras_require,
 )