diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml
index 2ab3d0658..1baed09b8 100644
--- a/.github/workflows/build-and-test.yml
+++ b/.github/workflows/build-and-test.yml
@@ -42,7 +42,6 @@ jobs:
           mkdir -p /core/build
           cd /core/build
           cmake -DCMAKE_INSTALL_PREFIX:PATH=`pwd`/install -DTRITON_CORE_HEADERS_ONLY=OFF ..
-          export TRITON_PYBIND="_c/triton_bindings.cpython-310-x86_64-linux-gnu.so"
           make -j8
 
       - name: Run tests with pytest
diff --git a/pyproject.toml b/pyproject.toml
index 5e8749f81..3ce1a8a91 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -24,6 +24,54 @@
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+[project]
+name = "tritonserver"
+authors = [{ name = "NVIDIA Inc.", email = "sw-dl-triton@nvidia.com" }]
+description = "Triton Inference Server In-Process Python API"
+license = { file = "LICENSE.txt" }
+dynamic = ["version"]
+dependencies = ["numpy<2"]
+classifiers = [
+    "Development Status :: 5 - Production/Stable",
+    "Intended Audience :: Developers",
+    "Intended Audience :: Science/Research",
+    "Intended Audience :: Information Technology",
+    "Topic :: Scientific/Engineering",
+    "Topic :: Scientific/Engineering :: Image Recognition",
+    "Topic :: Scientific/Engineering :: Artificial Intelligence",
+    "Topic :: Software Development :: Libraries",
+    "Topic :: Utilities",
+    "License :: OSI Approved :: BSD License",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.12",
+    "Environment :: Console",
+    "Natural Language :: English",
+    "Operating System :: OS Independent",
+]
+
+[tool.setuptools]
+include-package-data = true
+
+[tool.setuptools.package-data]
+tritonserver = ["_c/triton_bindings.*.so"]
+
+[build-system]
+requires = [
+    "setuptools==75.3.0",
+    "wheel==0.44.0",
+    # For stubgen:
+    "mypy==1.11.0",
+    "numpy<2",
+]
+build-backend = "setuptools.build_meta"
+
+[project.optional-dependencies]
+GPU = ["cupy-cuda12x"]
+test = ["pytest"]
+all = ["tritonserver[GPU]", "tritonserver[test]"]
+
+
 [tool.codespell]
 # note: pre-commit passes explicit lists of files here, which this skip file list doesn't override -
 # this is only to allow you to run codespell interactively
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index 871f682f9..df7f1bde0 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -30,6 +30,7 @@ add_subdirectory(tritonserver)
 file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/TRITON_VERSION ${TRITON_VERSION})
 configure_file(../LICENSE LICENSE.txt COPYONLY)
 configure_file(setup.py setup.py @ONLY)
+configure_file(../pyproject.toml pyproject.toml COPYONLY)
 file(COPY test/  DESTINATION ./test/.)
 
 set(WHEEL_DEPENDS
diff --git a/python/build_wheel.py b/python/build_wheel.py
index 150a3e346..2888cfe01 100755
--- a/python/build_wheel.py
+++ b/python/build_wheel.py
@@ -108,17 +108,18 @@ def sed(pattern, replace, source, dest=None):
 
     shutil.copyfile("LICENSE.txt", os.path.join(FLAGS.whl_dir, "LICENSE.txt"))
     shutil.copyfile("setup.py", os.path.join(FLAGS.whl_dir, "setup.py"))
+    shutil.copyfile("pyproject.toml", os.path.join(FLAGS.whl_dir, "pyproject.toml"))
 
     os.chdir(FLAGS.whl_dir)
     print("=== Building wheel")
-    args = ["python3", "setup.py", "bdist_wheel"]
+    args = ["python3", "-m", "build"]
 
     wenv = os.environ.copy()
     wenv["VERSION"] = FLAGS.triton_version
     wenv["TRITON_PYBIND"] = PYBIND_LIB
     p = subprocess.Popen(args, env=wenv)
     p.wait()
-    fail_if(p.returncode != 0, "setup.py failed")
+    fail_if(p.returncode != 0, "Building wheel failed failed")
 
     cpdir("dist", FLAGS.dest_dir)
 
diff --git a/python/setup.py b/python/setup.py
index 3d371eaac..9b9b29104 100755
--- a/python/setup.py
+++ b/python/setup.py
@@ -25,90 +25,23 @@
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-import os
-import sys
-from itertools import chain
 
-from setuptools import find_packages, setup
+import subprocess
 
-if "--plat-name" in sys.argv:
-    PLATFORM_FLAG = sys.argv[sys.argv.index("--plat-name") + 1]
-else:
-    PLATFORM_FLAG = "any"
+from setuptools import setup
+from setuptools.command.build_py import build_py
 
-if "VERSION" not in os.environ:
-    raise Exception("envvar VERSION must be specified")
 
-VERSION = os.environ["VERSION"]
+class BuildPyCommand(build_py):
+    def run(self):
+        build_py.run(self)
+        # Generate stub files:
+        package_name = self.distribution.metadata.name
+        subprocess.run(
+            ["stubgen", "-p", f"{package_name}._c", "-o", f"{self.build_lib}"],
+            check=True,
+        )
 
-try:
-    from wheel.bdist_wheel import bdist_wheel as _bdist_wheel
 
-    class bdist_wheel(_bdist_wheel):
-        def finalize_options(self):
-            _bdist_wheel.finalize_options(self)
-            self.root_is_pure = False
-
-        def get_tag(self):
-            pyver, abi, plat = "py3", "none", PLATFORM_FLAG
-            return pyver, abi, plat
-
-except ImportError:
-    bdist_wheel = None
-
-this_directory = os.path.abspath(os.path.dirname(__file__))
-
-data_files = [
-    ("", ["LICENSE.txt"]),
-]
-
-# Type checking marker file indicating support for type checkers.
-# https://peps.python.org/pep-0561/
-# Type hints for c extension generated by mypy
-platform_package_data = [
-    os.environ["TRITON_PYBIND"],
-    "py.typed",
-    "_c/__init__.pyi",
-    "_c/triton_bindings.pyi",
-]
-
-gpu_extras = ["cupy-cuda12x"]
-test_extras = ["pytest"]
-all_extras = gpu_extras + test_extras
-
-setup(
-    name="tritonserver",
-    version=VERSION,
-    author="NVIDIA Inc.",
-    author_email="sw-dl-triton@nvidia.com",
-    description="Triton Inference Server In-Process Python API",
-    license="BSD",
-    url="https://developer.nvidia.com/nvidia-triton-inference-server",
-    classifiers=[
-        "Development Status :: 5 - Production/Stable",
-        "Intended Audience :: Developers",
-        "Intended Audience :: Science/Research",
-        "Intended Audience :: Information Technology",
-        "Topic :: Scientific/Engineering",
-        "Topic :: Scientific/Engineering :: Image Recognition",
-        "Topic :: Scientific/Engineering :: Artificial Intelligence",
-        "Topic :: Software Development :: Libraries",
-        "Topic :: Utilities",
-        "License :: OSI Approved :: BSD License",
-        "Programming Language :: Python :: 3",
-        "Programming Language :: Python :: 3.10",
-        "Programming Language :: Python :: 3.12",
-        "Environment :: Console",
-        "Natural Language :: English",
-        "Operating System :: OS Independent",
-    ],
-    packages=find_packages(),
-    package_data={
-        "": platform_package_data,
-    },
-    zip_safe=False,
-    cmdclass={"bdist_wheel": bdist_wheel},
-    data_files=data_files,
-    install_requires=["numpy<2"],
-    extras_require={"GPU": gpu_extras, "test": test_extras, "all": all_extras},
-)
+if __name__ == "__main__":
+    setup(cmdclass={"build_py": BuildPyCommand})
diff --git a/python/tritonserver/CMakeLists.txt b/python/tritonserver/CMakeLists.txt
index c06292451..d2480bc94 100644
--- a/python/tritonserver/CMakeLists.txt
+++ b/python/tritonserver/CMakeLists.txt
@@ -33,8 +33,6 @@ file(COPY __init__.py DESTINATION .)
 file(COPY py.typed DESTINATION .)
 # Copy the '__init__.py' for the '_c' module
 file(COPY _c/__init__.py DESTINATION ./_c/.)
-file(COPY _c/__init__.pyi DESTINATION ./_c/.)
-file(COPY _c/triton_bindings.pyi DESTINATION ./_c/.)
 # Find and copy _api modules
 file(GLOB PYTHON_MODULE_FILES ./_api/*.py)
 file(COPY ${PYTHON_MODULE_FILES} DESTINATION ./_api/.)
@@ -65,3 +63,4 @@ target_compile_features(python-bindings PRIVATE cxx_std_17)
 set_property(TARGET python-bindings PROPERTY OUTPUT_NAME triton_bindings)
 # Add Triton library default path in 'rpath' for runtime library lookup
 set_target_properties(python-bindings PROPERTIES BUILD_RPATH "$ORIGIN:/opt/tritonserver/lib")
+set_target_properties(python-bindings PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/python/tritonserver/_c/)
diff --git a/python/tritonserver/_c/__init__.pyi b/python/tritonserver/_c/__init__.pyi
deleted file mode 100644
index aa7d4a57a..000000000
--- a/python/tritonserver/_c/__init__.pyi
+++ /dev/null
@@ -1,39 +0,0 @@
-# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions
-# are met:
-#  * Redistributions of source code must retain the above copyright
-#    notice, this list of conditions and the following disclaimer.
-#  * Redistributions in binary form must reproduce the above copyright
-#    notice, this list of conditions and the following disclaimer in the
-#    documentation and/or other materials provided with the distribution.
-#  * Neither the name of NVIDIA CORPORATION nor the names of its
-#    contributors may be used to endorse or promote products derived
-#    from this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
-# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
-# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
-# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-"""Type information for Triton _c bindings."""
-
-# Note: this file was generated using mypy with an empty __init__.py
-# file in the tritonserver package directory to avoid any renaming /
-# aliasing done by the wrapper
-#
-# mypy 1.8.0 (compiled: yes)
-#
-# stubgen -p tritonserver._c
-#
-# Todo: add stub generation to build process
-
-from .triton_bindings import *
diff --git a/python/tritonserver/_c/triton_bindings.pyi b/python/tritonserver/_c/triton_bindings.pyi
deleted file mode 100644
index 71deaba6b..000000000
--- a/python/tritonserver/_c/triton_bindings.pyi
+++ /dev/null
@@ -1,696 +0,0 @@
-# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions
-# are met:
-#  * Redistributions of source code must retain the above copyright
-#    notice, this list of conditions and the following disclaimer.
-#  * Redistributions in binary form must reproduce the above copyright
-#    notice, this list of conditions and the following disclaimer in the
-#    documentation and/or other materials provided with the distribution.
-#  * Neither the name of NVIDIA CORPORATION nor the names of its
-#    contributors may be used to endorse or promote products derived
-#    from this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
-# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
-# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
-# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-"""Type information for Triton _c bindings."""
-
-# Note: this file was generated using mypy with an empty __init__.py
-# file in the tritonserver package directory to avoid any renaming /
-# aliasing done by the wrapper
-#
-# mypy 1.8.0 (compiled: yes)
-#
-# stubgen -p tritonserver._c
-#
-# Todo: add stub generation to build process
-
-from typing import Callable, ClassVar, List, Optional, Tuple, overload
-
-import numpy
-
-ALL: TRITONSERVER_RequestReleaseFlag
-COMPUTE_END: TRITONSERVER_InferenceTraceActivity
-COMPUTE_INPUT_END: TRITONSERVER_InferenceTraceActivity
-COMPUTE_OUTPUT_START: TRITONSERVER_InferenceTraceActivity
-COMPUTE_START: TRITONSERVER_InferenceTraceActivity
-DECOUPLED: TRITONSERVER_ModelTxnPropertyFlag
-DISABLED: TRITONSERVER_InferenceTraceLevel
-FINAL: TRITONSERVER_ResponseCompleteFlag
-FIRST_DIM: TRITONSERVER_ModelBatchFlag
-MAX: TRITONSERVER_InferenceTraceLevel
-MIN: TRITONSERVER_InferenceTraceLevel
-ONE_TO_ONE: TRITONSERVER_ModelTxnPropertyFlag
-QUEUE_START: TRITONSERVER_InferenceTraceActivity
-READY: TRITONSERVER_ModelIndexFlag
-REQUEST_END: TRITONSERVER_InferenceTraceActivity
-REQUEST_START: TRITONSERVER_InferenceTraceActivity
-SEQUENCE_END: TRITONSERVER_RequestFlag
-SEQUENCE_START: TRITONSERVER_RequestFlag
-TENSORS: TRITONSERVER_InferenceTraceLevel
-TENSOR_BACKEND_INPUT: TRITONSERVER_InferenceTraceActivity
-TENSOR_BACKEND_OUTPUT: TRITONSERVER_InferenceTraceActivity
-TENSOR_QUEUE_INPUT: TRITONSERVER_InferenceTraceActivity
-TIMESTAMPS: TRITONSERVER_InferenceTraceLevel
-UNKNOWN: TRITONSERVER_ModelBatchFlag
-
-class AlreadyExistsError(TritonError): ...
-class InternalError(TritonError): ...
-class InvalidArgumentError(TritonError): ...
-class NotFoundError(TritonError): ...
-
-class TRITONSERVER_BufferAttributes:
-    byte_size: int
-    cuda_ipc_handle: int
-    memory_type: TRITONSERVER_MemoryType
-    memory_type_id: int
-    def __init__(self) -> None: ...
-
-class TRITONSERVER_DataType:
-    __members__: ClassVar[dict] = ...  # read-only
-    BF16: ClassVar[TRITONSERVER_DataType] = ...
-    BOOL: ClassVar[TRITONSERVER_DataType] = ...
-    BYTES: ClassVar[TRITONSERVER_DataType] = ...
-    FP16: ClassVar[TRITONSERVER_DataType] = ...
-    FP32: ClassVar[TRITONSERVER_DataType] = ...
-    FP64: ClassVar[TRITONSERVER_DataType] = ...
-    INT16: ClassVar[TRITONSERVER_DataType] = ...
-    INT32: ClassVar[TRITONSERVER_DataType] = ...
-    INT64: ClassVar[TRITONSERVER_DataType] = ...
-    INT8: ClassVar[TRITONSERVER_DataType] = ...
-    INVALID: ClassVar[TRITONSERVER_DataType] = ...
-    UINT16: ClassVar[TRITONSERVER_DataType] = ...
-    UINT32: ClassVar[TRITONSERVER_DataType] = ...
-    UINT64: ClassVar[TRITONSERVER_DataType] = ...
-    UINT8: ClassVar[TRITONSERVER_DataType] = ...
-    __entries: ClassVar[dict] = ...
-    def __init__(self, value: int) -> None: ...
-    def __eq__(self, other: object) -> bool: ...
-    def __hash__(self) -> int: ...
-    def __index__(self) -> int: ...
-    def __int__(self) -> int: ...
-    def __ne__(self, other: object) -> bool: ...
-    @property
-    def name(self) -> str: ...
-    @property
-    def value(self) -> int: ...
-
-class TRITONSERVER_InferenceRequest:
-    correlation_id: int
-    correlation_id_string: str
-    flags: int
-    id: str
-    priority: int
-    priority_uint64: int
-    timeout_microseconds: int
-    def __init__(self, arg0, arg1: str, arg2: int) -> None: ...
-    def add_input(
-        self, arg0: str, arg1: TRITONSERVER_DataType, arg2: List[int]
-    ) -> None: ...
-    def add_raw_input(self, arg0: str) -> None: ...
-    def add_requested_output(self, arg0: str) -> None: ...
-    def append_input_data(
-        self, arg0: str, arg1: int, arg2: int, arg3: TRITONSERVER_MemoryType, arg4: int
-    ) -> None: ...
-    def append_input_data_with_buffer_attributes(
-        self, arg0: str, arg1: int, arg2: TRITONSERVER_BufferAttributes
-    ) -> None: ...
-    def append_input_data_with_host_policy(
-        self,
-        arg0: str,
-        arg1: int,
-        arg2: int,
-        arg3: TRITONSERVER_MemoryType,
-        arg4: int,
-        arg5: str,
-    ) -> None: ...
-    def cancel(self) -> None: ...
-    def remove_all_input_data(self, arg0: str) -> None: ...
-    def remove_all_inputs(self) -> None: ...
-    def remove_all_requested_outputs(self) -> None: ...
-    def remove_input(self, arg0: str) -> None: ...
-    def remove_requested_output(self, arg0: str) -> None: ...
-    def set_bool_parameter(self, arg0: str, arg1: bool) -> None: ...
-    def set_int_parameter(self, arg0: str, arg1: int) -> None: ...
-    def set_release_callback(
-        self,
-        arg0: Callable[[TRITONSERVER_InferenceRequest, int, object], None],
-        arg1: object,
-    ) -> None: ...
-    def set_response_callback(
-        self,
-        arg0: object,
-        arg1: object,
-        arg2: Callable[[object, int, object], None],
-        arg3: object,
-    ) -> None: ...
-    def set_string_parameter(self, arg0: str, arg1: str) -> None: ...
-    def set_double_parameter(self, arg0: str, arg1: float) -> None: ...
-
-class TRITONSERVER_InferenceResponse:
-    def __init__(self, *args, **kwargs) -> None: ...
-    def output(
-        self, arg0: int
-    ) -> Tuple[
-        str,
-        TRITONSERVER_DataType,
-        numpy.ndarray[numpy.int64],
-        int,
-        int,
-        TRITONSERVER_MemoryType,
-        int,
-        object,
-    ]: ...
-    def output_classification_label(self, arg0: int, arg1: int) -> str: ...
-    def parameter(
-        self, arg0: int
-    ) -> Tuple[str, TRITONSERVER_ParameterType, object]: ...
-    def throw_if_response_error(self) -> None: ...
-    @property
-    def id(self) -> str: ...
-    @property
-    def model(self) -> Tuple[str, int]: ...
-    @property
-    def output_count(self) -> int: ...
-    @property
-    def parameter_count(self) -> int: ...
-
-class TRITONSERVER_InferenceTrace:
-    @overload
-    def __init__(
-        self,
-        level: int,
-        parent_id: int,
-        activity_function: Callable[
-            [object, TRITONSERVER_InferenceTraceActivity, int, object], None
-        ],
-        tensor_activity_function: Callable[
-            [
-                object,
-                TRITONSERVER_InferenceTraceActivity,
-                str,
-                TRITONSERVER_DataType,
-                int,
-                int,
-                numpy.ndarray[numpy.int64],
-                TRITONSERVER_MemoryType,
-                int,
-                object,
-            ],
-            None,
-        ],
-        release_function: Callable[[TRITONSERVER_InferenceTrace, object], None],
-        trace_userp: object,
-    ) -> None: ...
-    @overload
-    def __init__(
-        self,
-        level: int,
-        parent_id: int,
-        activity_function: Callable[
-            [object, TRITONSERVER_InferenceTraceActivity, int, object], None
-        ],
-        release_function: Callable[[TRITONSERVER_InferenceTrace, object], None],
-        trace_userp: object,
-    ) -> None: ...
-    @property
-    def id(self) -> int: ...
-    @property
-    def model_name(self) -> str: ...
-    @property
-    def model_version(self) -> int: ...
-    @property
-    def parent_id(self) -> int: ...
-    @property
-    def request_id(self) -> str: ...
-
-class TRITONSERVER_InferenceTraceActivity:
-    __members__: ClassVar[dict] = ...  # read-only
-    COMPUTE_END: ClassVar[TRITONSERVER_InferenceTraceActivity] = ...
-    COMPUTE_INPUT_END: ClassVar[TRITONSERVER_InferenceTraceActivity] = ...
-    COMPUTE_OUTPUT_START: ClassVar[TRITONSERVER_InferenceTraceActivity] = ...
-    COMPUTE_START: ClassVar[TRITONSERVER_InferenceTraceActivity] = ...
-    QUEUE_START: ClassVar[TRITONSERVER_InferenceTraceActivity] = ...
-    REQUEST_END: ClassVar[TRITONSERVER_InferenceTraceActivity] = ...
-    REQUEST_START: ClassVar[TRITONSERVER_InferenceTraceActivity] = ...
-    TENSOR_BACKEND_INPUT: ClassVar[TRITONSERVER_InferenceTraceActivity] = ...
-    TENSOR_BACKEND_OUTPUT: ClassVar[TRITONSERVER_InferenceTraceActivity] = ...
-    TENSOR_QUEUE_INPUT: ClassVar[TRITONSERVER_InferenceTraceActivity] = ...
-    __entries: ClassVar[dict] = ...
-    def __init__(self, value: int) -> None: ...
-    def __eq__(self, other: object) -> bool: ...
-    def __hash__(self) -> int: ...
-    def __index__(self) -> int: ...
-    def __int__(self) -> int: ...
-    def __ne__(self, other: object) -> bool: ...
-    @property
-    def name(self) -> str: ...
-    @property
-    def value(self) -> int: ...
-
-class TRITONSERVER_InferenceTraceLevel:
-    __members__: ClassVar[dict] = ...  # read-only
-    DISABLED: ClassVar[TRITONSERVER_InferenceTraceLevel] = ...
-    MAX: ClassVar[TRITONSERVER_InferenceTraceLevel] = ...
-    MIN: ClassVar[TRITONSERVER_InferenceTraceLevel] = ...
-    TENSORS: ClassVar[TRITONSERVER_InferenceTraceLevel] = ...
-    TIMESTAMPS: ClassVar[TRITONSERVER_InferenceTraceLevel] = ...
-    __entries: ClassVar[dict] = ...
-    def __init__(self, value: int) -> None: ...
-    def __eq__(self, other: object) -> bool: ...
-    def __hash__(self) -> int: ...
-    def __index__(self) -> int: ...
-    def __int__(self) -> int: ...
-    def __ne__(self, other: object) -> bool: ...
-    @property
-    def name(self) -> str: ...
-    @property
-    def value(self) -> int: ...
-
-class TRITONSERVER_InstanceGroupKind:
-    __members__: ClassVar[dict] = ...  # read-only
-    AUTO: ClassVar[TRITONSERVER_InstanceGroupKind] = ...
-    CPU: ClassVar[TRITONSERVER_InstanceGroupKind] = ...
-    GPU: ClassVar[TRITONSERVER_InstanceGroupKind] = ...
-    MODEL: ClassVar[TRITONSERVER_InstanceGroupKind] = ...
-    __entries: ClassVar[dict] = ...
-    def __init__(self, value: int) -> None: ...
-    def __eq__(self, other: object) -> bool: ...
-    def __hash__(self) -> int: ...
-    def __index__(self) -> int: ...
-    def __int__(self) -> int: ...
-    def __ne__(self, other: object) -> bool: ...
-    @property
-    def name(self) -> str: ...
-    @property
-    def value(self) -> int: ...
-
-class TRITONSERVER_LogFormat:
-    __members__: ClassVar[dict] = ...  # read-only
-    DEFAULT: ClassVar[TRITONSERVER_LogFormat] = ...
-    ISO8601: ClassVar[TRITONSERVER_LogFormat] = ...
-    __entries: ClassVar[dict] = ...
-    def __init__(self, value: int) -> None: ...
-    def __eq__(self, other: object) -> bool: ...
-    def __hash__(self) -> int: ...
-    def __index__(self) -> int: ...
-    def __int__(self) -> int: ...
-    def __ne__(self, other: object) -> bool: ...
-    @property
-    def name(self) -> str: ...
-    @property
-    def value(self) -> int: ...
-
-class TRITONSERVER_LogLevel:
-    __members__: ClassVar[dict] = ...  # read-only
-    ERROR: ClassVar[TRITONSERVER_LogLevel] = ...
-    INFO: ClassVar[TRITONSERVER_LogLevel] = ...
-    VERBOSE: ClassVar[TRITONSERVER_LogLevel] = ...
-    WARN: ClassVar[TRITONSERVER_LogLevel] = ...
-    __entries: ClassVar[dict] = ...
-    def __init__(self, value: int) -> None: ...
-    def __eq__(self, other: object) -> bool: ...
-    def __hash__(self) -> int: ...
-    def __index__(self) -> int: ...
-    def __int__(self) -> int: ...
-    def __ne__(self, other: object) -> bool: ...
-    @property
-    def name(self) -> str: ...
-    @property
-    def value(self) -> int: ...
-
-class TRITONSERVER_MemoryType:
-    __members__: ClassVar[dict] = ...  # read-only
-    CPU: ClassVar[TRITONSERVER_MemoryType] = ...
-    CPU_PINNED: ClassVar[TRITONSERVER_MemoryType] = ...
-    GPU: ClassVar[TRITONSERVER_MemoryType] = ...
-    __entries: ClassVar[dict] = ...
-    def __init__(self, value: int) -> None: ...
-    def __eq__(self, other: object) -> bool: ...
-    def __hash__(self) -> int: ...
-    def __index__(self) -> int: ...
-    def __int__(self) -> int: ...
-    def __ne__(self, other: object) -> bool: ...
-    @property
-    def name(self) -> str: ...
-    @property
-    def value(self) -> int: ...
-
-class TRITONSERVER_Message:
-    def __init__(self, arg0: str) -> None: ...
-    def serialize_to_json(self) -> str: ...
-
-class TRITONSERVER_Metric:
-    def __init__(
-        self, arg0: TRITONSERVER_MetricFamily, arg1: List[TRITONSERVER_Parameter]
-    ) -> None: ...
-    def increment(self, arg0: float) -> None: ...
-    def set_value(self, arg0: float) -> None: ...
-    @property
-    def kind(self) -> TRITONSERVER_MetricKind: ...
-    @property
-    def value(self) -> float: ...
-
-class TRITONSERVER_MetricFamily:
-    def __init__(self, arg0: TRITONSERVER_MetricKind, arg1: str, arg2: str) -> None: ...
-
-class TRITONSERVER_MetricFormat:
-    __members__: ClassVar[dict] = ...  # read-only
-    PROMETHEUS: ClassVar[TRITONSERVER_MetricFormat] = ...
-    __entries: ClassVar[dict] = ...
-    def __init__(self, value: int) -> None: ...
-    def __eq__(self, other: object) -> bool: ...
-    def __hash__(self) -> int: ...
-    def __index__(self) -> int: ...
-    def __int__(self) -> int: ...
-    def __ne__(self, other: object) -> bool: ...
-    @property
-    def name(self) -> str: ...
-    @property
-    def value(self) -> int: ...
-
-class TRITONSERVER_MetricKind:
-    __members__: ClassVar[dict] = ...  # read-only
-    COUNTER: ClassVar[TRITONSERVER_MetricKind] = ...
-    GAUGE: ClassVar[TRITONSERVER_MetricKind] = ...
-    __entries: ClassVar[dict] = ...
-    def __init__(self, value: int) -> None: ...
-    def __eq__(self, other: object) -> bool: ...
-    def __hash__(self) -> int: ...
-    def __index__(self) -> int: ...
-    def __int__(self) -> int: ...
-    def __ne__(self, other: object) -> bool: ...
-    @property
-    def name(self) -> str: ...
-    @property
-    def value(self) -> int: ...
-
-class TRITONSERVER_Metrics:
-    def __init__(self, *args, **kwargs) -> None: ...
-    def formatted(self, arg0: TRITONSERVER_MetricFormat) -> str: ...
-
-class TRITONSERVER_ModelBatchFlag:
-    __members__: ClassVar[dict] = ...  # read-only
-    FIRST_DIM: ClassVar[TRITONSERVER_ModelBatchFlag] = ...
-    UNKNOWN: ClassVar[TRITONSERVER_ModelBatchFlag] = ...
-    __entries: ClassVar[dict] = ...
-    def __init__(self, value: int) -> None: ...
-    def __eq__(self, other: object) -> bool: ...
-    def __hash__(self) -> int: ...
-    def __index__(self) -> int: ...
-    def __int__(self) -> int: ...
-    def __ne__(self, other: object) -> bool: ...
-    @property
-    def name(self) -> str: ...
-    @property
-    def value(self) -> int: ...
-
-class TRITONSERVER_ModelControlMode:
-    __members__: ClassVar[dict] = ...  # read-only
-    EXPLICIT: ClassVar[TRITONSERVER_ModelControlMode] = ...
-    NONE: ClassVar[TRITONSERVER_ModelControlMode] = ...
-    POLL: ClassVar[TRITONSERVER_ModelControlMode] = ...
-    __entries: ClassVar[dict] = ...
-    def __init__(self, value: int) -> None: ...
-    def __eq__(self, other: object) -> bool: ...
-    def __hash__(self) -> int: ...
-    def __index__(self) -> int: ...
-    def __int__(self) -> int: ...
-    def __ne__(self, other: object) -> bool: ...
-    @property
-    def name(self) -> str: ...
-    @property
-    def value(self) -> int: ...
-
-class TRITONSERVER_ModelIndexFlag:
-    __members__: ClassVar[dict] = ...  # read-only
-    READY: ClassVar[TRITONSERVER_ModelIndexFlag] = ...
-    __entries: ClassVar[dict] = ...
-    def __init__(self, value: int) -> None: ...
-    def __eq__(self, other: object) -> bool: ...
-    def __hash__(self) -> int: ...
-    def __index__(self) -> int: ...
-    def __int__(self) -> int: ...
-    def __ne__(self, other: object) -> bool: ...
-    @property
-    def name(self) -> str: ...
-    @property
-    def value(self) -> int: ...
-
-class TRITONSERVER_ModelTxnPropertyFlag:
-    __members__: ClassVar[dict] = ...  # read-only
-    DECOUPLED: ClassVar[TRITONSERVER_ModelTxnPropertyFlag] = ...
-    ONE_TO_ONE: ClassVar[TRITONSERVER_ModelTxnPropertyFlag] = ...
-    __entries: ClassVar[dict] = ...
-    def __init__(self, value: int) -> None: ...
-    def __eq__(self, other: object) -> bool: ...
-    def __hash__(self) -> int: ...
-    def __index__(self) -> int: ...
-    def __int__(self) -> int: ...
-    def __ne__(self, other: object) -> bool: ...
-    @property
-    def name(self) -> str: ...
-    @property
-    def value(self) -> int: ...
-
-class TRITONSERVER_Parameter:
-    @overload
-    def __init__(self, arg0: str, arg1: bytes) -> None: ...
-    @overload
-    def __init__(self, arg0: str, arg1: str) -> None: ...
-    @overload
-    def __init__(self, arg0: str, arg1: int) -> None: ...
-    @overload
-    def __init__(self, arg0: str, arg1: bool) -> None: ...
-
-class TRITONSERVER_ParameterType:
-    __members__: ClassVar[dict] = ...  # read-only
-    BOOL: ClassVar[TRITONSERVER_ParameterType] = ...
-    BYTES: ClassVar[TRITONSERVER_ParameterType] = ...
-    INT: ClassVar[TRITONSERVER_ParameterType] = ...
-    STRING: ClassVar[TRITONSERVER_ParameterType] = ...
-    DOUBLE: ClassVar[TRITONSERVER_ParameterType] = ...
-    __entries: ClassVar[dict] = ...
-    def __init__(self, value: int) -> None: ...
-    def __eq__(self, other: object) -> bool: ...
-    def __hash__(self) -> int: ...
-    def __index__(self) -> int: ...
-    def __int__(self) -> int: ...
-    def __ne__(self, other: object) -> bool: ...
-    @property
-    def name(self) -> str: ...
-    @property
-    def value(self) -> int: ...
-
-class TRITONSERVER_RateLimitMode:
-    __members__: ClassVar[dict] = ...  # read-only
-    EXEC_COUNT: ClassVar[TRITONSERVER_RateLimitMode] = ...
-    OFF: ClassVar[TRITONSERVER_RateLimitMode] = ...
-    __entries: ClassVar[dict] = ...
-    def __init__(self, value: int) -> None: ...
-    def __eq__(self, other: object) -> bool: ...
-    def __hash__(self) -> int: ...
-    def __index__(self) -> int: ...
-    def __int__(self) -> int: ...
-    def __ne__(self, other: object) -> bool: ...
-    @property
-    def name(self) -> str: ...
-    @property
-    def value(self) -> int: ...
-
-class TRITONSERVER_RequestFlag:
-    __members__: ClassVar[dict] = ...  # read-only
-    SEQUENCE_END: ClassVar[TRITONSERVER_RequestFlag] = ...
-    SEQUENCE_START: ClassVar[TRITONSERVER_RequestFlag] = ...
-    __entries: ClassVar[dict] = ...
-    def __init__(self, value: int) -> None: ...
-    def __eq__(self, other: object) -> bool: ...
-    def __hash__(self) -> int: ...
-    def __index__(self) -> int: ...
-    def __int__(self) -> int: ...
-    def __ne__(self, other: object) -> bool: ...
-    @property
-    def name(self) -> str: ...
-    @property
-    def value(self) -> int: ...
-
-class TRITONSERVER_RequestReleaseFlag:
-    __members__: ClassVar[dict] = ...  # read-only
-    ALL: ClassVar[TRITONSERVER_RequestReleaseFlag] = ...
-    __entries: ClassVar[dict] = ...
-    def __init__(self, value: int) -> None: ...
-    def __eq__(self, other: object) -> bool: ...
-    def __hash__(self) -> int: ...
-    def __index__(self) -> int: ...
-    def __int__(self) -> int: ...
-    def __ne__(self, other: object) -> bool: ...
-    @property
-    def name(self) -> str: ...
-    @property
-    def value(self) -> int: ...
-
-class TRITONSERVER_ResponseAllocator:
-    @overload
-    def __init__(
-        self,
-        alloc_function: Callable[
-            [object, str, int, TRITONSERVER_MemoryType, int, object],
-            Tuple[int, object, TRITONSERVER_MemoryType, int],
-        ],
-        release_function: Callable[
-            [object, int, object, int, TRITONSERVER_MemoryType, int], None
-        ],
-        start_function: Callable[[object, object], None],
-    ) -> None: ...
-    @overload
-    def __init__(
-        self,
-        alloc_function: Callable[
-            [object, str, int, TRITONSERVER_MemoryType, int, object],
-            Tuple[int, object, TRITONSERVER_MemoryType, int],
-        ],
-        release_function: Callable[
-            [object, int, object, int, TRITONSERVER_MemoryType, int], None
-        ],
-    ) -> None: ...
-    def set_buffer_attributes_function(
-        self,
-        buffer_attributes_function: Callable[
-            [object, str, object, object, object], object
-        ],
-    ) -> None: ...
-    def set_query_function(
-        self,
-        query_function: Callable[
-            [object, object, str, Optional[int], TRITONSERVER_MemoryType, int],
-            Tuple[TRITONSERVER_MemoryType, int],
-        ],
-    ) -> None: ...
-
-class TRITONSERVER_ResponseCompleteFlag:
-    __members__: ClassVar[dict] = ...  # read-only
-    FINAL: ClassVar[TRITONSERVER_ResponseCompleteFlag] = ...
-    __entries: ClassVar[dict] = ...
-    def __init__(self, value: int) -> None: ...
-    def __eq__(self, other: object) -> bool: ...
-    def __hash__(self) -> int: ...
-    def __index__(self) -> int: ...
-    def __int__(self) -> int: ...
-    def __ne__(self, other: object) -> bool: ...
-    @property
-    def name(self) -> str: ...
-    @property
-    def value(self) -> int: ...
-
-class TRITONSERVER_Server:
-    def __init__(self, arg0: TRITONSERVER_ServerOptions) -> None: ...
-    @overload
-    def infer_async(
-        self, arg0: TRITONSERVER_InferenceRequest, arg1: TRITONSERVER_InferenceTrace
-    ) -> None: ...
-    @overload
-    def infer_async(self, arg0: TRITONSERVER_InferenceRequest) -> None: ...
-    def is_live(self) -> bool: ...
-    def is_ready(self) -> bool: ...
-    def load_model(self, arg0: str) -> None: ...
-    def load_model_with_parameters(
-        self, arg0: str, arg1: List[TRITONSERVER_Parameter]
-    ) -> None: ...
-    def metadata(self) -> TRITONSERVER_Message: ...
-    def metrics(self) -> TRITONSERVER_Metrics: ...
-    def model_batch_properties(self, arg0: str, arg1: int) -> Tuple[int, int]: ...
-    def model_config(self, arg0: str, arg1: int, arg2: int) -> TRITONSERVER_Message: ...
-    def model_index(self, arg0: int) -> TRITONSERVER_Message: ...
-    def model_is_ready(self, arg0: str, arg1: int) -> bool: ...
-    def model_metadata(self, arg0: str, arg1: int) -> TRITONSERVER_Message: ...
-    def model_statistics(self, arg0: str, arg1: int) -> TRITONSERVER_Message: ...
-    def model_transaction_properties(self, arg0: str, arg1: int) -> Tuple[int, int]: ...
-    @overload
-    def poll_model_repository(self) -> None: ...
-    @overload
-    def poll_model_repository(self) -> None: ...
-    def register_model_repository(
-        self, arg0: str, arg1: List[TRITONSERVER_Parameter]
-    ) -> None: ...
-    def stop(self) -> None: ...
-    def unload_model(self, arg0: str) -> None: ...
-    def unload_model_and_dependents(self, arg0: str) -> None: ...
-    def unregister_model_repository(self, arg0: str) -> None: ...
-
-class TRITONSERVER_ServerOptions:
-    def __init__(self) -> None: ...
-    def add_rate_limiter_resource(self, arg0: str, arg1: int, arg2: int) -> None: ...
-    def set_backend_config(self, arg0: str, arg1: str, arg2: str) -> None: ...
-    def set_backend_directory(self, arg0: str) -> None: ...
-    def set_buffer_manager_thread_count(self, arg0: int) -> None: ...
-    def set_cache_config(self, arg0: str, arg1: str) -> None: ...
-    def set_cache_directory(self, arg0: str) -> None: ...
-    def set_cpu_metrics(self, arg0: bool) -> None: ...
-    def set_cuda_memory_pool_byte_size(self, arg0: int, arg1: int) -> None: ...
-    def set_exit_on_error(self, arg0: bool) -> None: ...
-    def set_exit_timeout(self, arg0: int) -> None: ...
-    def set_gpu_metrics(self, arg0: bool) -> None: ...
-    def set_host_policy(self, arg0: str, arg1: str, arg2: str) -> None: ...
-    def set_log_error(self, arg0: bool) -> None: ...
-    def set_log_file(self, arg0: str) -> None: ...
-    def set_log_format(self, arg0: TRITONSERVER_LogFormat) -> None: ...
-    def set_log_info(self, arg0: bool) -> None: ...
-    def set_log_verbose(self, arg0: int) -> None: ...
-    def set_log_warn(self, arg0: bool) -> None: ...
-    def set_metrics(self, arg0: bool) -> None: ...
-    def set_metrics_config(self, arg0: str, arg1: str, arg2: str) -> None: ...
-    def set_metrics_interval(self, arg0: int) -> None: ...
-    def set_min_supported_compute_capability(self, arg0: float) -> None: ...
-    def set_model_control_mode(self, arg0: TRITONSERVER_ModelControlMode) -> None: ...
-    def set_model_load_device_limit(
-        self, arg0: TRITONSERVER_InstanceGroupKind, arg1: int, arg2: float
-    ) -> None: ...
-    def set_model_load_thread_count(self, arg0: int) -> None: ...
-    def set_model_load_retry_count(self, arg0: int) -> None: ...
-    def set_model_namespacing(self, arg0: bool) -> None: ...
-    def set_enable_peer_access(self, arg0: bool) -> None: ...
-    def set_model_repository_path(self, arg0: str) -> None: ...
-    def set_pinned_memory_pool_byte_size(self, arg0: int) -> None: ...
-    def set_rate_limiter_mode(self, arg0: TRITONSERVER_RateLimitMode) -> None: ...
-    def set_repo_agent_directory(self, arg0: str) -> None: ...
-    def set_response_cache_byte_size(self, arg0: int) -> None: ...
-    def set_server_id(self, arg0: str) -> None: ...
-    def set_startup_model(self, arg0: str) -> None: ...
-    def set_strict_model_config(self, arg0: bool) -> None: ...
-    def set_strict_readiness(self, arg0: bool) -> None: ...
-
-class TritonError(Exception): ...
-class UnavailableError(TritonError): ...
-class UnknownError(TritonError): ...
-class UnsupportedError(TritonError): ...
-
-def TRITONSERVER_DataTypeByteSize(arg0: TRITONSERVER_DataType) -> int: ...
-def TRITONSERVER_DataTypeString(arg0: TRITONSERVER_DataType) -> str: ...
-def TRITONSERVER_InferenceTraceActivityString(
-    arg0: TRITONSERVER_InferenceTraceActivity,
-) -> str: ...
-def TRITONSERVER_InferenceTraceLevelString(
-    arg0: TRITONSERVER_InferenceTraceLevel,
-) -> str: ...
-def TRITONSERVER_InstanceGroupKindString(
-    arg0: TRITONSERVER_InstanceGroupKind,
-) -> str: ...
-def TRITONSERVER_LogIsEnabled(arg0: TRITONSERVER_LogLevel) -> bool: ...
-def TRITONSERVER_LogMessage(
-    arg0: TRITONSERVER_LogLevel, arg1: str, arg2: int, arg3: str
-) -> None: ...
-def TRITONSERVER_MemoryTypeString(arg0: TRITONSERVER_MemoryType) -> str: ...
-def TRITONSERVER_ParameterTypeString(arg0: TRITONSERVER_ParameterType) -> str: ...
-def TRITONSERVER_StringToDataType(arg0: str) -> TRITONSERVER_DataType: ...
-def api_version() -> tuple: ...