diff --git a/.gitattributes b/.gitattributes
index 00407cdc..aeb32006 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -5,3 +5,5 @@ cuda/_version.py export-subst
 # we do not own any headers checked in, don't touch them
 *.h binary
 *.hpp binary
+# git should not convert line endings in PNG files
+*.png binary
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 431bb7c5..c2d246aa 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,12 +1,12 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-
-repos:
-  - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.6.4
-    hooks:
-      - id: ruff
-        args: [--fix, --show-fixes]
-      - id: ruff-format
-
-default_language_version:
-      python: python3
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+repos:
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.6.4
+    hooks:
+      - id: ruff
+        args: [--fix, --show-fixes]
+      - id: ruff-format
+
+default_language_version:
+      python: python3
diff --git a/cuda_bindings/benchmarks/test_launch_latency.py b/cuda_bindings/benchmarks/test_launch_latency.py
index 8d70bfe2..f16e971a 100755
--- a/cuda_bindings/benchmarks/test_launch_latency.py
+++ b/cuda_bindings/benchmarks/test_launch_latency.py
@@ -1,341 +1,341 @@
-# Copyright 2021-2024 NVIDIA Corporation.  All rights reserved.
-#
-# Please refer to the NVIDIA end user license agreement (EULA) associated
-# with this source code for terms and conditions that govern your use of
-# this software. Any use, reproduction, disclosure, or distribution of
-# this software and related documentation outside the terms of the EULA
-# is strictly prohibited.
-import ctypes
-
-import pytest
-
-from cuda import cuda
-
-from .kernels import kernel_string
-from .perf_test_utils import ASSERT_DRV
-
-
-def launch(kernel, stream, args=(), arg_types=()):
-    cuda.cuLaunchKernel(
-        kernel,
-        1,
-        1,
-        1,  # grid dim
-        1,
-        1,
-        1,  # block dim
-        0,
-        stream,  # shared mem and stream
-        (args, arg_types),
-        0,
-    )  # arguments
-
-
-def launch_packed(kernel, stream, params):
-    cuda.cuLaunchKernel(
-        kernel,
-        1,
-        1,
-        1,  # grid dim
-        1,
-        1,
-        1,  # block dim
-        0,
-        stream,  # shared mem and stream
-        params,
-        0,
-    )  # arguments
-
-
-# Measure launch latency with no parmaeters
-@pytest.mark.benchmark(group="launch-latency")
-def test_launch_latency_empty_kernel(benchmark, init_cuda, load_module):
-    device, ctx, stream = init_cuda
-    module = load_module(kernel_string, device)
-
-    err, func = cuda.cuModuleGetFunction(module, b"empty_kernel")
-    ASSERT_DRV(err)
-
-    benchmark(launch, func, stream)
-
-    cuda.cuCtxSynchronize()
-
-
-# Measure launch latency with a single parameter
-@pytest.mark.benchmark(group="launch-latency")
-def test_launch_latency_small_kernel(benchmark, init_cuda, load_module):
-    device, ctx, stream = init_cuda
-    module = load_module(kernel_string, device)
-
-    err, func = cuda.cuModuleGetFunction(module, b"small_kernel")
-    ASSERT_DRV(err)
-
-    err, f = cuda.cuMemAlloc(ctypes.sizeof(ctypes.c_float))
-    ASSERT_DRV(err)
-
-    benchmark(launch, func, stream, args=(f,), arg_types=(None,))
-
-    cuda.cuCtxSynchronize()
-
-    (err,) = cuda.cuMemFree(f)
-    ASSERT_DRV(err)
-
-
-# Measure launch latency with many parameters using builtin parameter packing
-@pytest.mark.benchmark(group="launch-latency")
-def test_launch_latency_small_kernel_512_args(benchmark, init_cuda, load_module):
-    device, ctx, stream = init_cuda
-    module = load_module(kernel_string, device)
-
-    err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_args")
-    ASSERT_DRV(err)
-
-    args = []
-    arg_types = [None] * 512
-    for _ in arg_types:
-        err, p = cuda.cuMemAlloc(ctypes.sizeof(ctypes.c_int))
-        ASSERT_DRV(err)
-        args.append(p)
-
-    args = tuple(args)
-    arg_types = tuple(arg_types)
-
-    benchmark(launch, func, stream, args=args, arg_types=arg_types)
-
-    cuda.cuCtxSynchronize()
-
-    for p in args:
-        (err,) = cuda.cuMemFree(p)
-        ASSERT_DRV(err)
-
-
-@pytest.mark.benchmark(group="launch-latency")
-def test_launch_latency_small_kernel_512_bools(benchmark, init_cuda, load_module):
-    device, ctx, stream = init_cuda
-    module = load_module(kernel_string, device)
-
-    err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_bools")
-    ASSERT_DRV(err)
-
-    args = [True] * 512
-    arg_types = [ctypes.c_bool] * 512
-
-    args = tuple(args)
-    arg_types = tuple(arg_types)
-
-    benchmark(launch, func, stream, args=args, arg_types=arg_types)
-
-    cuda.cuCtxSynchronize()
-
-
-@pytest.mark.benchmark(group="launch-latency")
-def test_launch_latency_small_kernel_512_doubles(benchmark, init_cuda, load_module):
-    device, ctx, stream = init_cuda
-    module = load_module(kernel_string, device)
-
-    err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_doubles")
-    ASSERT_DRV(err)
-
-    args = [1.2345] * 512
-    arg_types = [ctypes.c_double] * 512
-
-    args = tuple(args)
-    arg_types = tuple(arg_types)
-
-    benchmark(launch, func, stream, args=args, arg_types=arg_types)
-
-    cuda.cuCtxSynchronize()
-
-
-@pytest.mark.benchmark(group="launch-latency")
-def test_launch_latency_small_kernel_512_ints(benchmark, init_cuda, load_module):
-    device, ctx, stream = init_cuda
-    module = load_module(kernel_string, device)
-
-    err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_ints")
-    ASSERT_DRV(err)
-
-    args = [123] * 512
-    arg_types = [ctypes.c_int] * 512
-
-    args = tuple(args)
-    arg_types = tuple(arg_types)
-
-    benchmark(launch, func, stream, args=args, arg_types=arg_types)
-
-    cuda.cuCtxSynchronize()
-
-
-@pytest.mark.benchmark(group="launch-latency")
-def test_launch_latency_small_kernel_512_bytes(benchmark, init_cuda, load_module):
-    device, ctx, stream = init_cuda
-    module = load_module(kernel_string, device)
-
-    err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_chars")
-    ASSERT_DRV(err)
-
-    args = [127] * 512
-    arg_types = [ctypes.c_byte] * 512
-
-    args = tuple(args)
-    arg_types = tuple(arg_types)
-
-    benchmark(launch, func, stream, args=args, arg_types=arg_types)
-
-    cuda.cuCtxSynchronize()
-
-
-@pytest.mark.benchmark(group="launch-latency")
-def test_launch_latency_small_kernel_512_longlongs(benchmark, init_cuda, load_module):
-    device, ctx, stream = init_cuda
-    module = load_module(kernel_string, device)
-
-    err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_longlongs")
-    ASSERT_DRV(err)
-
-    args = [9223372036854775806] * 512
-    arg_types = [ctypes.c_longlong] * 512
-
-    args = tuple(args)
-    arg_types = tuple(arg_types)
-
-    benchmark(launch, func, stream, args=args, arg_types=arg_types)
-
-    cuda.cuCtxSynchronize()
-
-
-# Measure launch latency with many parameters using builtin parameter packing
-@pytest.mark.benchmark(group="launch-latency")
-def test_launch_latency_small_kernel_256_args(benchmark, init_cuda, load_module):
-    device, ctx, stream = init_cuda
-    module = load_module(kernel_string, device)
-
-    err, func = cuda.cuModuleGetFunction(module, b"small_kernel_256_args")
-    ASSERT_DRV(err)
-
-    args = []
-    arg_types = [None] * 256
-    for _ in arg_types:
-        err, p = cuda.cuMemAlloc(ctypes.sizeof(ctypes.c_int))
-        ASSERT_DRV(err)
-        args.append(p)
-
-    args = tuple(args)
-    arg_types = tuple(arg_types)
-
-    benchmark(launch, func, stream, args=args, arg_types=arg_types)
-
-    cuda.cuCtxSynchronize()
-
-    for p in args:
-        (err,) = cuda.cuMemFree(p)
-        ASSERT_DRV(err)
-
-
-# Measure launch latency with many parameters using builtin parameter packing
-@pytest.mark.benchmark(group="launch-latency")
-def test_launch_latency_small_kernel_16_args(benchmark, init_cuda, load_module):
-    device, ctx, stream = init_cuda
-    module = load_module(kernel_string, device)
-
-    err, func = cuda.cuModuleGetFunction(module, b"small_kernel_16_args")
-    ASSERT_DRV(err)
-
-    args = []
-    arg_types = [None] * 16
-    for _ in arg_types:
-        err, p = cuda.cuMemAlloc(ctypes.sizeof(ctypes.c_int))
-        ASSERT_DRV(err)
-        args.append(p)
-
-    args = tuple(args)
-    arg_types = tuple(arg_types)
-
-    benchmark(launch, func, stream, args=args, arg_types=arg_types)
-
-    cuda.cuCtxSynchronize()
-
-    for p in args:
-        (err,) = cuda.cuMemFree(p)
-        ASSERT_DRV(err)
-
-
-# Measure launch latency with many parameters, excluding parameter packing
-@pytest.mark.benchmark(group="launch-latency")
-def test_launch_latency_small_kernel_512_args_ctypes(benchmark, init_cuda, load_module):
-    device, ctx, stream = init_cuda
-    module = load_module(kernel_string, device)
-
-    err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_args")
-    ASSERT_DRV(err)
-
-    vals = []
-    val_ps = []
-    for i in range(512):
-        err, p = cuda.cuMemAlloc(ctypes.sizeof(ctypes.c_int))
-        ASSERT_DRV(err)
-        vals.append(p)
-        val_ps.append(ctypes.c_void_p(int(vals[i])))
-
-    packagedParams = (ctypes.c_void_p * 512)()
-    for i in range(512):
-        packagedParams[i] = ctypes.addressof(val_ps[i])
-
-    benchmark(launch_packed, func, stream, packagedParams)
-
-    cuda.cuCtxSynchronize()
-
-    for p in vals:
-        (err,) = cuda.cuMemFree(p)
-        ASSERT_DRV(err)
-
-
-def pack_and_launch(kernel, stream, params):
-    packed_params = (ctypes.c_void_p * len(params))()
-    ptrs = [0] * len(params)
-    for i in range(len(params)):
-        ptrs[i] = ctypes.c_void_p(int(params[i]))
-        packed_params[i] = ctypes.addressof(ptrs[i])
-
-    cuda.cuLaunchKernel(kernel, 1, 1, 1, 1, 1, 1, 0, stream, packed_params, 0)
-
-
-# Measure launch latency plus parameter packing using ctypes
-@pytest.mark.benchmark(group="launch-latency")
-def test_launch_latency_small_kernel_512_args_ctypes_with_packing(benchmark, init_cuda, load_module):
-    device, ctx, stream = init_cuda
-    module = load_module(kernel_string, device)
-
-    err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_args")
-    ASSERT_DRV(err)
-
-    vals = []
-    for i in range(512):
-        err, p = cuda.cuMemAlloc(ctypes.sizeof(ctypes.c_int))
-        ASSERT_DRV(err)
-        vals.append(p)
-
-    benchmark(pack_and_launch, func, stream, vals)
-
-    cuda.cuCtxSynchronize()
-
-    for p in vals:
-        (err,) = cuda.cuMemFree(p)
-        ASSERT_DRV(err)
-
-
-# Measure launch latency with a single large struct parameter
-@pytest.mark.benchmark(group="launch-latency")
-def test_launch_latency_small_kernel_2048B(benchmark, init_cuda, load_module):
-    device, ctx, stream = init_cuda
-    module = load_module(kernel_string, device)
-
-    err, func = cuda.cuModuleGetFunction(module, b"small_kernel_2048B")
-    ASSERT_DRV(err)
-
-    class struct_2048B(ctypes.Structure):
-        _fields_ = [("values", ctypes.c_uint8 * 2048)]
-
-    benchmark(launch, func, stream, args=(struct_2048B(),), arg_types=(None,))
-
-    cuda.cuCtxSynchronize()
+# Copyright 2021-2024 NVIDIA Corporation.  All rights reserved.
+#
+# Please refer to the NVIDIA end user license agreement (EULA) associated
+# with this source code for terms and conditions that govern your use of
+# this software. Any use, reproduction, disclosure, or distribution of
+# this software and related documentation outside the terms of the EULA
+# is strictly prohibited.
+import ctypes
+
+import pytest
+
+from cuda import cuda
+
+from .kernels import kernel_string
+from .perf_test_utils import ASSERT_DRV
+
+
+def launch(kernel, stream, args=(), arg_types=()):
+    cuda.cuLaunchKernel(
+        kernel,
+        1,
+        1,
+        1,  # grid dim
+        1,
+        1,
+        1,  # block dim
+        0,
+        stream,  # shared mem and stream
+        (args, arg_types),
+        0,
+    )  # arguments
+
+
+def launch_packed(kernel, stream, params):
+    cuda.cuLaunchKernel(
+        kernel,
+        1,
+        1,
+        1,  # grid dim
+        1,
+        1,
+        1,  # block dim
+        0,
+        stream,  # shared mem and stream
+        params,
+        0,
+    )  # arguments
+
+
+# Measure launch latency with no parmaeters
+@pytest.mark.benchmark(group="launch-latency")
+def test_launch_latency_empty_kernel(benchmark, init_cuda, load_module):
+    device, ctx, stream = init_cuda
+    module = load_module(kernel_string, device)
+
+    err, func = cuda.cuModuleGetFunction(module, b"empty_kernel")
+    ASSERT_DRV(err)
+
+    benchmark(launch, func, stream)
+
+    cuda.cuCtxSynchronize()
+
+
+# Measure launch latency with a single parameter
+@pytest.mark.benchmark(group="launch-latency")
+def test_launch_latency_small_kernel(benchmark, init_cuda, load_module):
+    device, ctx, stream = init_cuda
+    module = load_module(kernel_string, device)
+
+    err, func = cuda.cuModuleGetFunction(module, b"small_kernel")
+    ASSERT_DRV(err)
+
+    err, f = cuda.cuMemAlloc(ctypes.sizeof(ctypes.c_float))
+    ASSERT_DRV(err)
+
+    benchmark(launch, func, stream, args=(f,), arg_types=(None,))
+
+    cuda.cuCtxSynchronize()
+
+    (err,) = cuda.cuMemFree(f)
+    ASSERT_DRV(err)
+
+
+# Measure launch latency with many parameters using builtin parameter packing
+@pytest.mark.benchmark(group="launch-latency")
+def test_launch_latency_small_kernel_512_args(benchmark, init_cuda, load_module):
+    device, ctx, stream = init_cuda
+    module = load_module(kernel_string, device)
+
+    err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_args")
+    ASSERT_DRV(err)
+
+    args = []
+    arg_types = [None] * 512
+    for _ in arg_types:
+        err, p = cuda.cuMemAlloc(ctypes.sizeof(ctypes.c_int))
+        ASSERT_DRV(err)
+        args.append(p)
+
+    args = tuple(args)
+    arg_types = tuple(arg_types)
+
+    benchmark(launch, func, stream, args=args, arg_types=arg_types)
+
+    cuda.cuCtxSynchronize()
+
+    for p in args:
+        (err,) = cuda.cuMemFree(p)
+        ASSERT_DRV(err)
+
+
+@pytest.mark.benchmark(group="launch-latency")
+def test_launch_latency_small_kernel_512_bools(benchmark, init_cuda, load_module):
+    device, ctx, stream = init_cuda
+    module = load_module(kernel_string, device)
+
+    err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_bools")
+    ASSERT_DRV(err)
+
+    args = [True] * 512
+    arg_types = [ctypes.c_bool] * 512
+
+    args = tuple(args)
+    arg_types = tuple(arg_types)
+
+    benchmark(launch, func, stream, args=args, arg_types=arg_types)
+
+    cuda.cuCtxSynchronize()
+
+
+@pytest.mark.benchmark(group="launch-latency")
+def test_launch_latency_small_kernel_512_doubles(benchmark, init_cuda, load_module):
+    device, ctx, stream = init_cuda
+    module = load_module(kernel_string, device)
+
+    err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_doubles")
+    ASSERT_DRV(err)
+
+    args = [1.2345] * 512
+    arg_types = [ctypes.c_double] * 512
+
+    args = tuple(args)
+    arg_types = tuple(arg_types)
+
+    benchmark(launch, func, stream, args=args, arg_types=arg_types)
+
+    cuda.cuCtxSynchronize()
+
+
+@pytest.mark.benchmark(group="launch-latency")
+def test_launch_latency_small_kernel_512_ints(benchmark, init_cuda, load_module):
+    device, ctx, stream = init_cuda
+    module = load_module(kernel_string, device)
+
+    err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_ints")
+    ASSERT_DRV(err)
+
+    args = [123] * 512
+    arg_types = [ctypes.c_int] * 512
+
+    args = tuple(args)
+    arg_types = tuple(arg_types)
+
+    benchmark(launch, func, stream, args=args, arg_types=arg_types)
+
+    cuda.cuCtxSynchronize()
+
+
+@pytest.mark.benchmark(group="launch-latency")
+def test_launch_latency_small_kernel_512_bytes(benchmark, init_cuda, load_module):
+    device, ctx, stream = init_cuda
+    module = load_module(kernel_string, device)
+
+    err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_chars")
+    ASSERT_DRV(err)
+
+    args = [127] * 512
+    arg_types = [ctypes.c_byte] * 512
+
+    args = tuple(args)
+    arg_types = tuple(arg_types)
+
+    benchmark(launch, func, stream, args=args, arg_types=arg_types)
+
+    cuda.cuCtxSynchronize()
+
+
+@pytest.mark.benchmark(group="launch-latency")
+def test_launch_latency_small_kernel_512_longlongs(benchmark, init_cuda, load_module):
+    device, ctx, stream = init_cuda
+    module = load_module(kernel_string, device)
+
+    err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_longlongs")
+    ASSERT_DRV(err)
+
+    args = [9223372036854775806] * 512
+    arg_types = [ctypes.c_longlong] * 512
+
+    args = tuple(args)
+    arg_types = tuple(arg_types)
+
+    benchmark(launch, func, stream, args=args, arg_types=arg_types)
+
+    cuda.cuCtxSynchronize()
+
+
+# Measure launch latency with many parameters using builtin parameter packing
+@pytest.mark.benchmark(group="launch-latency")
+def test_launch_latency_small_kernel_256_args(benchmark, init_cuda, load_module):
+    device, ctx, stream = init_cuda
+    module = load_module(kernel_string, device)
+
+    err, func = cuda.cuModuleGetFunction(module, b"small_kernel_256_args")
+    ASSERT_DRV(err)
+
+    args = []
+    arg_types = [None] * 256
+    for _ in arg_types:
+        err, p = cuda.cuMemAlloc(ctypes.sizeof(ctypes.c_int))
+        ASSERT_DRV(err)
+        args.append(p)
+
+    args = tuple(args)
+    arg_types = tuple(arg_types)
+
+    benchmark(launch, func, stream, args=args, arg_types=arg_types)
+
+    cuda.cuCtxSynchronize()
+
+    for p in args:
+        (err,) = cuda.cuMemFree(p)
+        ASSERT_DRV(err)
+
+
+# Measure launch latency with many parameters using builtin parameter packing
+@pytest.mark.benchmark(group="launch-latency")
+def test_launch_latency_small_kernel_16_args(benchmark, init_cuda, load_module):
+    device, ctx, stream = init_cuda
+    module = load_module(kernel_string, device)
+
+    err, func = cuda.cuModuleGetFunction(module, b"small_kernel_16_args")
+    ASSERT_DRV(err)
+
+    args = []
+    arg_types = [None] * 16
+    for _ in arg_types:
+        err, p = cuda.cuMemAlloc(ctypes.sizeof(ctypes.c_int))
+        ASSERT_DRV(err)
+        args.append(p)
+
+    args = tuple(args)
+    arg_types = tuple(arg_types)
+
+    benchmark(launch, func, stream, args=args, arg_types=arg_types)
+
+    cuda.cuCtxSynchronize()
+
+    for p in args:
+        (err,) = cuda.cuMemFree(p)
+        ASSERT_DRV(err)
+
+
+# Measure launch latency with many parameters, excluding parameter packing
+@pytest.mark.benchmark(group="launch-latency")
+def test_launch_latency_small_kernel_512_args_ctypes(benchmark, init_cuda, load_module):
+    device, ctx, stream = init_cuda
+    module = load_module(kernel_string, device)
+
+    err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_args")
+    ASSERT_DRV(err)
+
+    vals = []
+    val_ps = []
+    for i in range(512):
+        err, p = cuda.cuMemAlloc(ctypes.sizeof(ctypes.c_int))
+        ASSERT_DRV(err)
+        vals.append(p)
+        val_ps.append(ctypes.c_void_p(int(vals[i])))
+
+    packagedParams = (ctypes.c_void_p * 512)()
+    for i in range(512):
+        packagedParams[i] = ctypes.addressof(val_ps[i])
+
+    benchmark(launch_packed, func, stream, packagedParams)
+
+    cuda.cuCtxSynchronize()
+
+    for p in vals:
+        (err,) = cuda.cuMemFree(p)
+        ASSERT_DRV(err)
+
+
+def pack_and_launch(kernel, stream, params):
+    packed_params = (ctypes.c_void_p * len(params))()
+    ptrs = [0] * len(params)
+    for i in range(len(params)):
+        ptrs[i] = ctypes.c_void_p(int(params[i]))
+        packed_params[i] = ctypes.addressof(ptrs[i])
+
+    cuda.cuLaunchKernel(kernel, 1, 1, 1, 1, 1, 1, 0, stream, packed_params, 0)
+
+
+# Measure launch latency plus parameter packing using ctypes
+@pytest.mark.benchmark(group="launch-latency")
+def test_launch_latency_small_kernel_512_args_ctypes_with_packing(benchmark, init_cuda, load_module):
+    device, ctx, stream = init_cuda
+    module = load_module(kernel_string, device)
+
+    err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_args")
+    ASSERT_DRV(err)
+
+    vals = []
+    for i in range(512):
+        err, p = cuda.cuMemAlloc(ctypes.sizeof(ctypes.c_int))
+        ASSERT_DRV(err)
+        vals.append(p)
+
+    benchmark(pack_and_launch, func, stream, vals)
+
+    cuda.cuCtxSynchronize()
+
+    for p in vals:
+        (err,) = cuda.cuMemFree(p)
+        ASSERT_DRV(err)
+
+
+# Measure launch latency with a single large struct parameter
+@pytest.mark.benchmark(group="launch-latency")
+def test_launch_latency_small_kernel_2048B(benchmark, init_cuda, load_module):
+    device, ctx, stream = init_cuda
+    module = load_module(kernel_string, device)
+
+    err, func = cuda.cuModuleGetFunction(module, b"small_kernel_2048B")
+    ASSERT_DRV(err)
+
+    class struct_2048B(ctypes.Structure):
+        _fields_ = [("values", ctypes.c_uint8 * 2048)]
+
+    benchmark(launch, func, stream, args=(struct_2048B(),), arg_types=(None,))
+
+    cuda.cuCtxSynchronize()
diff --git a/cuda_bindings/tests/test_nvjitlink.py b/cuda_bindings/tests/test_nvjitlink.py
index d92a3ca7..839c7be1 100644
--- a/cuda_bindings/tests/test_nvjitlink.py
+++ b/cuda_bindings/tests/test_nvjitlink.py
@@ -1,168 +1,168 @@
-# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
-#
-# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
-
-import pytest
-
-from cuda.bindings import nvjitlink, nvrtc
-
-# Establish a handful of compatible architectures and PTX versions to test with
-ARCHITECTURES = ["sm_60", "sm_75", "sm_80", "sm_90"]
-PTX_VERSIONS = ["5.0", "6.4", "7.0", "8.5"]
-
-
-def ptx_header(version, arch):
-    return f"""
-.version {version}
-.target {arch}
-.address_size 64
-"""
-
-
-ptx_kernel = """
-.visible .entry _Z6kernelPi(
-    .param .u64 _Z6kernelPi_param_0
-)
-{
-    .reg .pred  %p<2>;
-    .reg .b32   %r<3>;
-    .reg .b64   %rd<3>;
-
-    ld.param.u64    %rd1, [_Z6kernelPi_param_0];
-    cvta.to.global.u64  %rd2, %rd1;
-    mov.u32     %r1, %tid.x;
-    st.global.u32   [%rd2+0], %r1;
-    ret;
-}
-"""
-
-minimal_ptx_kernel = """
-.func _MinimalKernel()
-{
-    ret;
-}
-"""
-
-ptx_kernel_bytes = [
-    (ptx_header(version, arch) + ptx_kernel).encode("utf-8") for version, arch in zip(PTX_VERSIONS, ARCHITECTURES)
-]
-minimal_ptx_kernel_bytes = [
-    (ptx_header(version, arch) + minimal_ptx_kernel).encode("utf-8")
-    for version, arch in zip(PTX_VERSIONS, ARCHITECTURES)
-]
-
-
-# create a valid LTOIR input for testing
-@pytest.fixture
-def get_dummy_ltoir():
-    def CHECK_NVRTC(err):
-        if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
-            raise RuntimeError(f"Nvrtc Error: {err}")
-
-    empty_cplusplus_kernel = "__global__ void A() {}"
-    err, program_handle = nvrtc.nvrtcCreateProgram(empty_cplusplus_kernel.encode(), b"", 0, [], [])
-    CHECK_NVRTC(err)
-    nvrtc.nvrtcCompileProgram(program_handle, 1, [b"-dlto"])
-    err, size = nvrtc.nvrtcGetLTOIRSize(program_handle)
-    CHECK_NVRTC(err)
-    empty_kernel_ltoir = b" " * size
-    (err,) = nvrtc.nvrtcGetLTOIR(program_handle, empty_kernel_ltoir)
-    CHECK_NVRTC(err)
-    (err,) = nvrtc.nvrtcDestroyProgram(program_handle)
-    CHECK_NVRTC(err)
-    return empty_kernel_ltoir
-
-
-def test_unrecognized_option_error():
-    with pytest.raises(nvjitlink.nvJitLinkError, match="ERROR_UNRECOGNIZED_OPTION"):
-        nvjitlink.create(1, ["-fictitious_option"])
-
-
-def test_invalid_arch_error():
-    with pytest.raises(nvjitlink.nvJitLinkError, match="ERROR_UNRECOGNIZED_OPTION"):
-        nvjitlink.create(1, ["-arch=sm_XX"])
-
-
-@pytest.mark.parametrize("option", ARCHITECTURES)
-def test_create_and_destroy(option):
-    handle = nvjitlink.create(1, [f"-arch={option}"])
-    assert handle != 0
-    nvjitlink.destroy(handle)
-
-
-@pytest.mark.parametrize("option", ARCHITECTURES)
-def test_complete_empty(option):
-    handle = nvjitlink.create(1, [f"-arch={option}"])
-    nvjitlink.complete(handle)
-    nvjitlink.destroy(handle)
-
-
-@pytest.mark.parametrize("option, ptx_bytes", zip(ARCHITECTURES, ptx_kernel_bytes))
-def test_add_data(option, ptx_bytes):
-    handle = nvjitlink.create(1, [f"-arch={option}"])
-    nvjitlink.add_data(handle, nvjitlink.InputType.ANY, ptx_bytes, len(ptx_bytes), "test_data")
-    nvjitlink.complete(handle)
-    nvjitlink.destroy(handle)
-
-
-@pytest.mark.parametrize("option, ptx_bytes", zip(ARCHITECTURES, ptx_kernel_bytes))
-def test_add_file(option, ptx_bytes, tmp_path):
-    handle = nvjitlink.create(1, [f"-arch={option}"])
-    file_path = tmp_path / "test_file.cubin"
-    file_path.write_bytes(ptx_bytes)
-    nvjitlink.add_file(handle, nvjitlink.InputType.ANY, str(file_path))
-    nvjitlink.complete(handle)
-    nvjitlink.destroy(handle)
-
-
-@pytest.mark.parametrize("option", ARCHITECTURES)
-def test_get_error_log(option):
-    handle = nvjitlink.create(1, [f"-arch={option}"])
-    nvjitlink.complete(handle)
-    log_size = nvjitlink.get_error_log_size(handle)
-    log = bytearray(log_size)
-    nvjitlink.get_error_log(handle, log)
-    assert len(log) == log_size
-    nvjitlink.destroy(handle)
-
-
-@pytest.mark.parametrize("option, ptx_bytes", zip(ARCHITECTURES, ptx_kernel_bytes))
-def test_get_info_log(option, ptx_bytes):
-    handle = nvjitlink.create(1, [f"-arch={option}"])
-    nvjitlink.add_data(handle, nvjitlink.InputType.ANY, ptx_bytes, len(ptx_bytes), "test_data")
-    nvjitlink.complete(handle)
-    log_size = nvjitlink.get_info_log_size(handle)
-    log = bytearray(log_size)
-    nvjitlink.get_info_log(handle, log)
-    assert len(log) == log_size
-    nvjitlink.destroy(handle)
-
-
-@pytest.mark.parametrize("option, ptx_bytes", zip(ARCHITECTURES, ptx_kernel_bytes))
-def test_get_linked_cubin(option, ptx_bytes):
-    handle = nvjitlink.create(1, [f"-arch={option}"])
-    nvjitlink.add_data(handle, nvjitlink.InputType.ANY, ptx_bytes, len(ptx_bytes), "test_data")
-    nvjitlink.complete(handle)
-    cubin_size = nvjitlink.get_linked_cubin_size(handle)
-    cubin = bytearray(cubin_size)
-    nvjitlink.get_linked_cubin(handle, cubin)
-    assert len(cubin) == cubin_size
-    nvjitlink.destroy(handle)
-
-
-@pytest.mark.parametrize("option", ARCHITECTURES)
-def test_get_linked_ptx(option, get_dummy_ltoir):
-    handle = nvjitlink.create(3, [f"-arch={option}", "-lto", "-ptx"])
-    nvjitlink.add_data(handle, nvjitlink.InputType.LTOIR, get_dummy_ltoir, len(get_dummy_ltoir), "test_data")
-    nvjitlink.complete(handle)
-    ptx_size = nvjitlink.get_linked_ptx_size(handle)
-    ptx = bytearray(ptx_size)
-    nvjitlink.get_linked_ptx(handle, ptx)
-    assert len(ptx) == ptx_size
-    nvjitlink.destroy(handle)
-
-
-def test_package_version():
-    ver = nvjitlink.version()
-    assert len(ver) == 2
-    assert ver >= (12, 0)
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+
+import pytest
+
+from cuda.bindings import nvjitlink, nvrtc
+
+# Establish a handful of compatible architectures and PTX versions to test with
+ARCHITECTURES = ["sm_60", "sm_75", "sm_80", "sm_90"]
+PTX_VERSIONS = ["5.0", "6.4", "7.0", "8.5"]
+
+
+def ptx_header(version, arch):
+    return f"""
+.version {version}
+.target {arch}
+.address_size 64
+"""
+
+
+ptx_kernel = """
+.visible .entry _Z6kernelPi(
+    .param .u64 _Z6kernelPi_param_0
+)
+{
+    .reg .pred  %p<2>;
+    .reg .b32   %r<3>;
+    .reg .b64   %rd<3>;
+
+    ld.param.u64    %rd1, [_Z6kernelPi_param_0];
+    cvta.to.global.u64  %rd2, %rd1;
+    mov.u32     %r1, %tid.x;
+    st.global.u32   [%rd2+0], %r1;
+    ret;
+}
+"""
+
+minimal_ptx_kernel = """
+.func _MinimalKernel()
+{
+    ret;
+}
+"""
+
+ptx_kernel_bytes = [
+    (ptx_header(version, arch) + ptx_kernel).encode("utf-8") for version, arch in zip(PTX_VERSIONS, ARCHITECTURES)
+]
+minimal_ptx_kernel_bytes = [
+    (ptx_header(version, arch) + minimal_ptx_kernel).encode("utf-8")
+    for version, arch in zip(PTX_VERSIONS, ARCHITECTURES)
+]
+
+
+# create a valid LTOIR input for testing
+@pytest.fixture
+def get_dummy_ltoir():
+    def CHECK_NVRTC(err):
+        if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
+            raise RuntimeError(f"Nvrtc Error: {err}")
+
+    empty_cplusplus_kernel = "__global__ void A() {}"
+    err, program_handle = nvrtc.nvrtcCreateProgram(empty_cplusplus_kernel.encode(), b"", 0, [], [])
+    CHECK_NVRTC(err)
+    nvrtc.nvrtcCompileProgram(program_handle, 1, [b"-dlto"])
+    err, size = nvrtc.nvrtcGetLTOIRSize(program_handle)
+    CHECK_NVRTC(err)
+    empty_kernel_ltoir = b" " * size
+    (err,) = nvrtc.nvrtcGetLTOIR(program_handle, empty_kernel_ltoir)
+    CHECK_NVRTC(err)
+    (err,) = nvrtc.nvrtcDestroyProgram(program_handle)
+    CHECK_NVRTC(err)
+    return empty_kernel_ltoir
+
+
+def test_unrecognized_option_error():
+    with pytest.raises(nvjitlink.nvJitLinkError, match="ERROR_UNRECOGNIZED_OPTION"):
+        nvjitlink.create(1, ["-fictitious_option"])
+
+
+def test_invalid_arch_error():
+    with pytest.raises(nvjitlink.nvJitLinkError, match="ERROR_UNRECOGNIZED_OPTION"):
+        nvjitlink.create(1, ["-arch=sm_XX"])
+
+
+@pytest.mark.parametrize("option", ARCHITECTURES)
+def test_create_and_destroy(option):
+    handle = nvjitlink.create(1, [f"-arch={option}"])
+    assert handle != 0
+    nvjitlink.destroy(handle)
+
+
+@pytest.mark.parametrize("option", ARCHITECTURES)
+def test_complete_empty(option):
+    handle = nvjitlink.create(1, [f"-arch={option}"])
+    nvjitlink.complete(handle)
+    nvjitlink.destroy(handle)
+
+
+@pytest.mark.parametrize("option, ptx_bytes", zip(ARCHITECTURES, ptx_kernel_bytes))
+def test_add_data(option, ptx_bytes):
+    handle = nvjitlink.create(1, [f"-arch={option}"])
+    nvjitlink.add_data(handle, nvjitlink.InputType.ANY, ptx_bytes, len(ptx_bytes), "test_data")
+    nvjitlink.complete(handle)
+    nvjitlink.destroy(handle)
+
+
+@pytest.mark.parametrize("option, ptx_bytes", zip(ARCHITECTURES, ptx_kernel_bytes))
+def test_add_file(option, ptx_bytes, tmp_path):
+    handle = nvjitlink.create(1, [f"-arch={option}"])
+    file_path = tmp_path / "test_file.cubin"
+    file_path.write_bytes(ptx_bytes)
+    nvjitlink.add_file(handle, nvjitlink.InputType.ANY, str(file_path))
+    nvjitlink.complete(handle)
+    nvjitlink.destroy(handle)
+
+
+@pytest.mark.parametrize("option", ARCHITECTURES)
+def test_get_error_log(option):
+    handle = nvjitlink.create(1, [f"-arch={option}"])
+    nvjitlink.complete(handle)
+    log_size = nvjitlink.get_error_log_size(handle)
+    log = bytearray(log_size)
+    nvjitlink.get_error_log(handle, log)
+    assert len(log) == log_size
+    nvjitlink.destroy(handle)
+
+
+@pytest.mark.parametrize("option, ptx_bytes", zip(ARCHITECTURES, ptx_kernel_bytes))
+def test_get_info_log(option, ptx_bytes):
+    handle = nvjitlink.create(1, [f"-arch={option}"])
+    nvjitlink.add_data(handle, nvjitlink.InputType.ANY, ptx_bytes, len(ptx_bytes), "test_data")
+    nvjitlink.complete(handle)
+    log_size = nvjitlink.get_info_log_size(handle)
+    log = bytearray(log_size)
+    nvjitlink.get_info_log(handle, log)
+    assert len(log) == log_size
+    nvjitlink.destroy(handle)
+
+
+@pytest.mark.parametrize("option, ptx_bytes", zip(ARCHITECTURES, ptx_kernel_bytes))
+def test_get_linked_cubin(option, ptx_bytes):
+    handle = nvjitlink.create(1, [f"-arch={option}"])
+    nvjitlink.add_data(handle, nvjitlink.InputType.ANY, ptx_bytes, len(ptx_bytes), "test_data")
+    nvjitlink.complete(handle)
+    cubin_size = nvjitlink.get_linked_cubin_size(handle)
+    cubin = bytearray(cubin_size)
+    nvjitlink.get_linked_cubin(handle, cubin)
+    assert len(cubin) == cubin_size
+    nvjitlink.destroy(handle)
+
+
+@pytest.mark.parametrize("option", ARCHITECTURES)
+def test_get_linked_ptx(option, get_dummy_ltoir):
+    handle = nvjitlink.create(3, [f"-arch={option}", "-lto", "-ptx"])
+    nvjitlink.add_data(handle, nvjitlink.InputType.LTOIR, get_dummy_ltoir, len(get_dummy_ltoir), "test_data")
+    nvjitlink.complete(handle)
+    ptx_size = nvjitlink.get_linked_ptx_size(handle)
+    ptx = bytearray(ptx_size)
+    nvjitlink.get_linked_ptx(handle, ptx)
+    assert len(ptx) == ptx_size
+    nvjitlink.destroy(handle)
+
+
+def test_package_version():
+    ver = nvjitlink.version()
+    assert len(ver) == 2
+    assert ver >= (12, 0)
diff --git a/cuda_core/tests/example_tests/test_basic_examples.py b/cuda_core/tests/example_tests/test_basic_examples.py
index 9b94ecd3..9a9432cb 100644
--- a/cuda_core/tests/example_tests/test_basic_examples.py
+++ b/cuda_core/tests/example_tests/test_basic_examples.py
@@ -1,25 +1,25 @@
-# Copyright 2024 NVIDIA Corporation.  All rights reserved.
-#
-# Please refer to the NVIDIA end user license agreement (EULA) associated
-# with this source code for terms and conditions that govern your use of
-# this software. Any use, reproduction, disclosure, or distribution of
-# this software and related documentation outside the terms of the EULA
-# is strictly prohibited.
-
-# If we have subcategories of examples in the future, this file can be split along those lines
-
-import glob
-import os
-
-import pytest
-
-from .utils import run_example
-
-samples_path = os.path.join(os.path.dirname(__file__), "..", "..", "examples")
-sample_files = glob.glob(samples_path + "**/*.py", recursive=True)
-
-
-@pytest.mark.parametrize("example", sample_files)
-class TestExamples:
-    def test_example(self, example, deinit_cuda):
-        run_example(samples_path, example)
+# Copyright 2024 NVIDIA Corporation.  All rights reserved.
+#
+# Please refer to the NVIDIA end user license agreement (EULA) associated
+# with this source code for terms and conditions that govern your use of
+# this software. Any use, reproduction, disclosure, or distribution of
+# this software and related documentation outside the terms of the EULA
+# is strictly prohibited.
+
+# If we have subcategories of examples in the future, this file can be split along those lines
+
+import glob
+import os
+
+import pytest
+
+from .utils import run_example
+
+samples_path = os.path.join(os.path.dirname(__file__), "..", "..", "examples")
+sample_files = glob.glob(samples_path + "**/*.py", recursive=True)
+
+
+@pytest.mark.parametrize("example", sample_files)
+class TestExamples:
+    def test_example(self, example, deinit_cuda):
+        run_example(samples_path, example)
diff --git a/cuda_core/tests/example_tests/utils.py b/cuda_core/tests/example_tests/utils.py
index f6ac3e15..3d218a91 100644
--- a/cuda_core/tests/example_tests/utils.py
+++ b/cuda_core/tests/example_tests/utils.py
@@ -1,56 +1,56 @@
-# Copyright 2024 NVIDIA Corporation.  All rights reserved.
-#
-# Please refer to the NVIDIA end user license agreement (EULA) associated
-# with this source code for terms and conditions that govern your use of
-# this software. Any use, reproduction, disclosure, or distribution of
-# this software and related documentation outside the terms of the EULA
-# is strictly prohibited.
-
-import gc
-import os
-import sys
-
-import cupy as cp
-import pytest
-
-
-class SampleTestError(Exception):
-    pass
-
-
-def parse_python_script(filepath):
-    if not filepath.endswith(".py"):
-        raise ValueError(f"{filepath} not supported")
-    with open(filepath, encoding="utf-8") as f:
-        script = f.read()
-    return script
-
-
-def run_example(samples_path, filename, env=None):
-    fullpath = os.path.join(samples_path, filename)
-    script = parse_python_script(fullpath)
-    try:
-        old_argv = sys.argv
-        sys.argv = [fullpath]
-        old_sys_path = sys.path.copy()
-        sys.path.append(samples_path)
-        exec(script, env if env else {})
-    except ImportError as e:
-        # for samples requiring any of optional dependencies
-        for m in ("cupy",):
-            if f"No module named '{m}'" in str(e):
-                pytest.skip(f"{m} not installed, skipping related tests")
-                break
-        else:
-            raise
-    except Exception as e:
-        msg = "\n"
-        msg += f"Got error ({filename}):\n"
-        msg += str(e)
-        raise SampleTestError(msg) from e
-    finally:
-        sys.path = old_sys_path
-        sys.argv = old_argv
-        # further reduce the memory watermark
-        gc.collect()
-        cp.get_default_memory_pool().free_all_blocks()
+# Copyright 2024 NVIDIA Corporation.  All rights reserved.
+#
+# Please refer to the NVIDIA end user license agreement (EULA) associated
+# with this source code for terms and conditions that govern your use of
+# this software. Any use, reproduction, disclosure, or distribution of
+# this software and related documentation outside the terms of the EULA
+# is strictly prohibited.
+
+import gc
+import os
+import sys
+
+import cupy as cp
+import pytest
+
+
+class SampleTestError(Exception):
+    pass
+
+
+def parse_python_script(filepath):
+    if not filepath.endswith(".py"):
+        raise ValueError(f"{filepath} not supported")
+    with open(filepath, encoding="utf-8") as f:
+        script = f.read()
+    return script
+
+
+def run_example(samples_path, filename, env=None):
+    fullpath = os.path.join(samples_path, filename)
+    script = parse_python_script(fullpath)
+    try:
+        old_argv = sys.argv
+        sys.argv = [fullpath]
+        old_sys_path = sys.path.copy()
+        sys.path.append(samples_path)
+        exec(script, env if env else {})
+    except ImportError as e:
+        # for samples requiring any of optional dependencies
+        for m in ("cupy",):
+            if f"No module named '{m}'" in str(e):
+                pytest.skip(f"{m} not installed, skipping related tests")
+                break
+        else:
+            raise
+    except Exception as e:
+        msg = "\n"
+        msg += f"Got error ({filename}):\n"
+        msg += str(e)
+        raise SampleTestError(msg) from e
+    finally:
+        sys.path = old_sys_path
+        sys.argv = old_argv
+        # further reduce the memory watermark
+        gc.collect()
+        cp.get_default_memory_pool().free_all_blocks()
diff --git a/cuda_core/tests/test_device.py b/cuda_core/tests/test_device.py
index afc3ed5b..876299f3 100644
--- a/cuda_core/tests/test_device.py
+++ b/cuda_core/tests/test_device.py
@@ -1,80 +1,80 @@
-# Copyright 2024 NVIDIA Corporation.  All rights reserved.
-#
-# Please refer to the NVIDIA end user license agreement (EULA) associated
-# with this source code for terms and conditions that govern your use of
-# this software. Any use, reproduction, disclosure, or distribution of
-# this software and related documentation outside the terms of the EULA
-# is strictly prohibited.
-
-try:
-    from cuda.bindings import driver, runtime
-except ImportError:
-    from cuda import cuda as driver
-    from cuda import cudart as runtime
-
-from cuda.core.experimental import Device
-from cuda.core.experimental._utils import ComputeCapability, handle_return
-
-
-def test_device_set_current(deinit_cuda):
-    device = Device()
-    device.set_current()
-    assert handle_return(driver.cuCtxGetCurrent()) is not None
-
-
-def test_device_repr():
-    device = Device(0)
-    assert str(device).startswith("<Device 0")
-
-
-def test_device_alloc(init_cuda):
-    device = Device()
-    buffer = device.allocate(1024)
-    device.sync()
-    assert buffer.handle != 0
-    assert buffer.size == 1024
-    assert buffer.device_id == 0
-
-
-def test_device_create_stream(init_cuda):
-    device = Device()
-    stream = device.create_stream()
-    assert stream is not None
-    assert stream.handle
-
-
-def test_pci_bus_id():
-    device = Device()
-    bus_id = handle_return(runtime.cudaDeviceGetPCIBusId(13, device.device_id))
-    assert device.pci_bus_id == bus_id[:12].decode()
-
-
-def test_uuid():
-    device = Device()
-    driver_ver = handle_return(driver.cuDriverGetVersion())
-    if driver_ver >= 11040:
-        uuid = handle_return(driver.cuDeviceGetUuid_v2(device.device_id))
-    else:
-        uuid = handle_return(driver.cuDeviceGetUuid(device.device_id))
-    uuid = uuid.bytes.hex()
-    expected_uuid = f"{uuid[:8]}-{uuid[8:12]}-{uuid[12:16]}-{uuid[16:20]}-{uuid[20:]}"
-    assert device.uuid == expected_uuid
-
-
-def test_name():
-    device = Device()
-    name = handle_return(driver.cuDeviceGetName(128, device.device_id))
-    name = name.split(b"\0")[0]
-    assert device.name == name.decode()
-
-
-def test_compute_capability():
-    device = Device()
-    major = handle_return(
-        runtime.cudaDeviceGetAttribute(runtime.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, device.device_id)
-    )
-    minor = handle_return(
-        runtime.cudaDeviceGetAttribute(runtime.cudaDeviceAttr.cudaDevAttrComputeCapabilityMinor, device.device_id)
-    )
-    expected_cc = ComputeCapability(major, minor)
-    assert device.compute_capability == expected_cc
+# Copyright 2024 NVIDIA Corporation.  All rights reserved.
+#
+# Please refer to the NVIDIA end user license agreement (EULA) associated
+# with this source code for terms and conditions that govern your use of
+# this software. Any use, reproduction, disclosure, or distribution of
+# this software and related documentation outside the terms of the EULA
+# is strictly prohibited.
+
+try:
+    from cuda.bindings import driver, runtime
+except ImportError:
+    from cuda import cuda as driver
+    from cuda import cudart as runtime
+
+from cuda.core.experimental import Device
+from cuda.core.experimental._utils import ComputeCapability, handle_return
+
+
+def test_device_set_current(deinit_cuda):
+    device = Device()
+    device.set_current()
+    assert handle_return(driver.cuCtxGetCurrent()) is not None
+
+
+def test_device_repr():
+    device = Device(0)
+    assert str(device).startswith("<Device 0")
+
+
+def test_device_alloc(init_cuda):
+    device = Device()
+    buffer = device.allocate(1024)
+    device.sync()
+    assert buffer.handle != 0
+    assert buffer.size == 1024
+    assert buffer.device_id == 0
+
+
+def test_device_create_stream(init_cuda):
+    device = Device()
+    stream = device.create_stream()
+    assert stream is not None
+    assert stream.handle
+
+
+def test_pci_bus_id():
+    device = Device()
+    bus_id = handle_return(runtime.cudaDeviceGetPCIBusId(13, device.device_id))
+    assert device.pci_bus_id == bus_id[:12].decode()
+
+
+def test_uuid():
+    device = Device()
+    driver_ver = handle_return(driver.cuDriverGetVersion())
+    if driver_ver >= 11040:
+        uuid = handle_return(driver.cuDeviceGetUuid_v2(device.device_id))
+    else:
+        uuid = handle_return(driver.cuDeviceGetUuid(device.device_id))
+    uuid = uuid.bytes.hex()
+    expected_uuid = f"{uuid[:8]}-{uuid[8:12]}-{uuid[12:16]}-{uuid[16:20]}-{uuid[20:]}"
+    assert device.uuid == expected_uuid
+
+
+def test_name():
+    device = Device()
+    name = handle_return(driver.cuDeviceGetName(128, device.device_id))
+    name = name.split(b"\0")[0]
+    assert device.name == name.decode()
+
+
+def test_compute_capability():
+    device = Device()
+    major = handle_return(
+        runtime.cudaDeviceGetAttribute(runtime.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, device.device_id)
+    )
+    minor = handle_return(
+        runtime.cudaDeviceGetAttribute(runtime.cudaDeviceAttr.cudaDevAttrComputeCapabilityMinor, device.device_id)
+    )
+    expected_cc = ComputeCapability(major, minor)
+    assert device.compute_capability == expected_cc
diff --git a/cuda_core/tests/test_event.py b/cuda_core/tests/test_event.py
index 21548078..0d650b4f 100644
--- a/cuda_core/tests/test_event.py
+++ b/cuda_core/tests/test_event.py
@@ -1,46 +1,46 @@
-# Copyright 2024 NVIDIA Corporation.  All rights reserved.
-#
-# Please refer to the NVIDIA end user license agreement (EULA) associated
-# with this source code for terms and conditions that govern your use of
-# this software. Any use, reproduction, disclosure, or distribution of
-# this software and related documentation outside the terms of the EULA
-# is strictly prohibited.
-
-import pytest
-
-from cuda.core.experimental import Device, EventOptions
-
-
-@pytest.mark.parametrize("enable_timing", [True, False, None])
-def test_timing(init_cuda, enable_timing):
-    options = EventOptions(enable_timing=enable_timing)
-    stream = Device().create_stream()
-    event = stream.record(options=options)
-    assert event.is_timing_disabled == (not enable_timing if enable_timing is not None else True)
-
-
-def test_is_sync_busy_waited(init_cuda):
-    options = EventOptions(enable_timing=False, busy_waited_sync=True)
-    stream = Device().create_stream()
-    event = stream.record(options=options)
-    assert event.is_sync_busy_waited is True
-
-    options = EventOptions(enable_timing=False)
-    stream = Device().create_stream()
-    event = stream.record(options=options)
-    assert event.is_sync_busy_waited is False
-
-
-def test_sync(init_cuda):
-    options = EventOptions(enable_timing=False)
-    stream = Device().create_stream()
-    event = stream.record(options=options)
-    event.sync()
-    assert event.is_done is True
-
-
-def test_is_done(init_cuda):
-    options = EventOptions(enable_timing=False)
-    stream = Device().create_stream()
-    event = stream.record(options=options)
-    assert event.is_done is True
+# Copyright 2024 NVIDIA Corporation.  All rights reserved.
+#
+# Please refer to the NVIDIA end user license agreement (EULA) associated
+# with this source code for terms and conditions that govern your use of
+# this software. Any use, reproduction, disclosure, or distribution of
+# this software and related documentation outside the terms of the EULA
+# is strictly prohibited.
+
+import pytest
+
+from cuda.core.experimental import Device, EventOptions
+
+
+@pytest.mark.parametrize("enable_timing", [True, False, None])
+def test_timing(init_cuda, enable_timing):
+    options = EventOptions(enable_timing=enable_timing)
+    stream = Device().create_stream()
+    event = stream.record(options=options)
+    assert event.is_timing_disabled == (not enable_timing if enable_timing is not None else True)
+
+
+def test_is_sync_busy_waited(init_cuda):
+    options = EventOptions(enable_timing=False, busy_waited_sync=True)
+    stream = Device().create_stream()
+    event = stream.record(options=options)
+    assert event.is_sync_busy_waited is True
+
+    options = EventOptions(enable_timing=False)
+    stream = Device().create_stream()
+    event = stream.record(options=options)
+    assert event.is_sync_busy_waited is False
+
+
+def test_sync(init_cuda):
+    options = EventOptions(enable_timing=False)
+    stream = Device().create_stream()
+    event = stream.record(options=options)
+    event.sync()
+    assert event.is_done is True
+
+
+def test_is_done(init_cuda):
+    options = EventOptions(enable_timing=False)
+    stream = Device().create_stream()
+    event = stream.record(options=options)
+    assert event.is_done is True
diff --git a/cuda_core/tests/test_launcher.py b/cuda_core/tests/test_launcher.py
index 874d7f07..08f7e6d3 100644
--- a/cuda_core/tests/test_launcher.py
+++ b/cuda_core/tests/test_launcher.py
@@ -1,68 +1,68 @@
-# Copyright 2024 NVIDIA Corporation.  All rights reserved.
-#
-# Please refer to the NVIDIA end user license agreement (EULA) associated
-# with this source code for terms and conditions that govern your use of
-# this software. Any use, reproduction, disclosure, or distribution of
-# this software and related documentation outside the terms of the EULA
-# is strictly prohibited.
-
-import pytest
-
-from cuda.core.experimental import Device, LaunchConfig, Stream
-
-
-def test_launch_config_init(init_cuda):
-    config = LaunchConfig(grid=(1, 1, 1), block=(1, 1, 1), stream=None, shmem_size=0)
-    assert config.grid == (1, 1, 1)
-    assert config.block == (1, 1, 1)
-    assert config.stream is None
-    assert config.shmem_size == 0
-
-    config = LaunchConfig(grid=(2, 2, 2), block=(2, 2, 2), stream=Device().create_stream(), shmem_size=1024)
-    assert config.grid == (2, 2, 2)
-    assert config.block == (2, 2, 2)
-    assert isinstance(config.stream, Stream)
-    assert config.shmem_size == 1024
-
-
-def test_launch_config_cast_to_3_tuple():
-    config = LaunchConfig(grid=1, block=1)
-    assert config._cast_to_3_tuple(1) == (1, 1, 1)
-    assert config._cast_to_3_tuple((1, 2)) == (1, 2, 1)
-    assert config._cast_to_3_tuple((1, 2, 3)) == (1, 2, 3)
-
-    # Edge cases
-    assert config._cast_to_3_tuple(999) == (999, 1, 1)
-    assert config._cast_to_3_tuple((999, 888)) == (999, 888, 1)
-    assert config._cast_to_3_tuple((999, 888, 777)) == (999, 888, 777)
-
-
-def test_launch_config_invalid_values():
-    with pytest.raises(ValueError):
-        LaunchConfig(grid=0, block=1)
-
-    with pytest.raises(ValueError):
-        LaunchConfig(grid=(0, 1), block=1)
-
-    with pytest.raises(ValueError):
-        LaunchConfig(grid=(1, 1, 1), block=0)
-
-    with pytest.raises(ValueError):
-        LaunchConfig(grid=(1, 1, 1), block=(0, 1))
-
-
-def test_launch_config_stream(init_cuda):
-    stream = Device().create_stream()
-    config = LaunchConfig(grid=(1, 1, 1), block=(1, 1, 1), stream=stream, shmem_size=0)
-    assert config.stream == stream
-
-    with pytest.raises(ValueError):
-        LaunchConfig(grid=(1, 1, 1), block=(1, 1, 1), stream="invalid_stream", shmem_size=0)
-
-
-def test_launch_config_shmem_size():
-    config = LaunchConfig(grid=(1, 1, 1), block=(1, 1, 1), stream=None, shmem_size=2048)
-    assert config.shmem_size == 2048
-
-    config = LaunchConfig(grid=(1, 1, 1), block=(1, 1, 1), stream=None)
-    assert config.shmem_size == 0
+# Copyright 2024 NVIDIA Corporation.  All rights reserved.
+#
+# Please refer to the NVIDIA end user license agreement (EULA) associated
+# with this source code for terms and conditions that govern your use of
+# this software. Any use, reproduction, disclosure, or distribution of
+# this software and related documentation outside the terms of the EULA
+# is strictly prohibited.
+
+import pytest
+
+from cuda.core.experimental import Device, LaunchConfig, Stream
+
+
+def test_launch_config_init(init_cuda):
+    config = LaunchConfig(grid=(1, 1, 1), block=(1, 1, 1), stream=None, shmem_size=0)
+    assert config.grid == (1, 1, 1)
+    assert config.block == (1, 1, 1)
+    assert config.stream is None
+    assert config.shmem_size == 0
+
+    config = LaunchConfig(grid=(2, 2, 2), block=(2, 2, 2), stream=Device().create_stream(), shmem_size=1024)
+    assert config.grid == (2, 2, 2)
+    assert config.block == (2, 2, 2)
+    assert isinstance(config.stream, Stream)
+    assert config.shmem_size == 1024
+
+
+def test_launch_config_cast_to_3_tuple():
+    config = LaunchConfig(grid=1, block=1)
+    assert config._cast_to_3_tuple(1) == (1, 1, 1)
+    assert config._cast_to_3_tuple((1, 2)) == (1, 2, 1)
+    assert config._cast_to_3_tuple((1, 2, 3)) == (1, 2, 3)
+
+    # Edge cases
+    assert config._cast_to_3_tuple(999) == (999, 1, 1)
+    assert config._cast_to_3_tuple((999, 888)) == (999, 888, 1)
+    assert config._cast_to_3_tuple((999, 888, 777)) == (999, 888, 777)
+
+
+def test_launch_config_invalid_values():
+    with pytest.raises(ValueError):
+        LaunchConfig(grid=0, block=1)
+
+    with pytest.raises(ValueError):
+        LaunchConfig(grid=(0, 1), block=1)
+
+    with pytest.raises(ValueError):
+        LaunchConfig(grid=(1, 1, 1), block=0)
+
+    with pytest.raises(ValueError):
+        LaunchConfig(grid=(1, 1, 1), block=(0, 1))
+
+
+def test_launch_config_stream(init_cuda):
+    stream = Device().create_stream()
+    config = LaunchConfig(grid=(1, 1, 1), block=(1, 1, 1), stream=stream, shmem_size=0)
+    assert config.stream == stream
+
+    with pytest.raises(ValueError):
+        LaunchConfig(grid=(1, 1, 1), block=(1, 1, 1), stream="invalid_stream", shmem_size=0)
+
+
+def test_launch_config_shmem_size():
+    config = LaunchConfig(grid=(1, 1, 1), block=(1, 1, 1), stream=None, shmem_size=2048)
+    assert config.shmem_size == 2048
+
+    config = LaunchConfig(grid=(1, 1, 1), block=(1, 1, 1), stream=None)
+    assert config.shmem_size == 0
diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py
index c78b5673..a48db69b 100644
--- a/cuda_core/tests/test_memory.py
+++ b/cuda_core/tests/test_memory.py
@@ -1,213 +1,213 @@
-# Copyright 2024 NVIDIA Corporation.  All rights reserved.
-#
-# Please refer to the NVIDIA end user license agreement (EULA) associated
-# with this source code for terms and conditions that govern your use of
-# this software. Any use, reproduction, disclosure, or distribution of
-# this software and related documentation outside the terms of the EULA
-# is strictly prohibited.
-
-try:
-    from cuda.bindings import driver
-except ImportError:
-    from cuda import cuda as driver
-
-import ctypes
-
-from cuda.core.experimental import Device
-from cuda.core.experimental._memory import Buffer, MemoryResource
-from cuda.core.experimental._utils import handle_return
-
-
-class DummyDeviceMemoryResource(MemoryResource):
-    def __init__(self, device):
-        self.device = device
-
-    def allocate(self, size, stream=None) -> Buffer:
-        ptr = handle_return(driver.cuMemAlloc(size))
-        return Buffer(ptr=ptr, size=size, mr=self)
-
-    def deallocate(self, ptr, size, stream=None):
-        handle_return(driver.cuMemFree(ptr))
-
-    @property
-    def is_device_accessible(self) -> bool:
-        return True
-
-    @property
-    def is_host_accessible(self) -> bool:
-        return False
-
-    @property
-    def device_id(self) -> int:
-        return 0
-
-
-class DummyHostMemoryResource(MemoryResource):
-    def __init__(self):
-        pass
-
-    def allocate(self, size, stream=None) -> Buffer:
-        # Allocate a ctypes buffer of size `size`
-        ptr = (ctypes.c_byte * size)()
-        return Buffer(ptr=ptr, size=size, mr=self)
-
-    def deallocate(self, ptr, size, stream=None):
-        # the memory is deallocated per the ctypes deallocation at garbage collection time
-        pass
-
-    @property
-    def is_device_accessible(self) -> bool:
-        return False
-
-    @property
-    def is_host_accessible(self) -> bool:
-        return True
-
-    @property
-    def device_id(self) -> int:
-        raise RuntimeError("the pinned memory resource is not bound to any GPU")
-
-
-class DummyUnifiedMemoryResource(MemoryResource):
-    def __init__(self, device):
-        self.device = device
-
-    def allocate(self, size, stream=None) -> Buffer:
-        ptr = handle_return(driver.cuMemAllocManaged(size, driver.CUmemAttach_flags.CU_MEM_ATTACH_GLOBAL.value))
-        return Buffer(ptr=ptr, size=size, mr=self)
-
-    def deallocate(self, ptr, size, stream=None):
-        handle_return(driver.cuMemFree(ptr))
-
-    @property
-    def is_device_accessible(self) -> bool:
-        return True
-
-    @property
-    def is_host_accessible(self) -> bool:
-        return True
-
-    @property
-    def device_id(self) -> int:
-        return 0
-
-
-class DummyPinnedMemoryResource(MemoryResource):
-    def __init__(self, device):
-        self.device = device
-
-    def allocate(self, size, stream=None) -> Buffer:
-        ptr = handle_return(driver.cuMemAllocHost(size))
-        return Buffer(ptr=ptr, size=size, mr=self)
-
-    def deallocate(self, ptr, size, stream=None):
-        handle_return(driver.cuMemFreeHost(ptr))
-
-    @property
-    def is_device_accessible(self) -> bool:
-        return True
-
-    @property
-    def is_host_accessible(self) -> bool:
-        return True
-
-    @property
-    def device_id(self) -> int:
-        raise RuntimeError("the pinned memory resource is not bound to any GPU")
-
-
-def buffer_initialization(dummy_mr: MemoryResource):
-    buffer = dummy_mr.allocate(size=1024)
-    assert buffer.handle != 0
-    assert buffer.size == 1024
-    assert buffer.memory_resource == dummy_mr
-    assert buffer.is_device_accessible == dummy_mr.is_device_accessible
-    assert buffer.is_host_accessible == dummy_mr.is_host_accessible
-    buffer.close()
-
-
-def test_buffer_initialization():
-    device = Device()
-    device.set_current()
-    buffer_initialization(DummyDeviceMemoryResource(device))
-    buffer_initialization(DummyHostMemoryResource())
-    buffer_initialization(DummyUnifiedMemoryResource(device))
-    buffer_initialization(DummyPinnedMemoryResource(device))
-
-
-def buffer_copy_to(dummy_mr: MemoryResource, device: Device, check=False):
-    src_buffer = dummy_mr.allocate(size=1024)
-    dst_buffer = dummy_mr.allocate(size=1024)
-    stream = device.create_stream()
-
-    if check:
-        src_ptr = ctypes.cast(src_buffer.handle, ctypes.POINTER(ctypes.c_byte))
-        for i in range(1024):
-            src_ptr[i] = ctypes.c_byte(i)
-
-    src_buffer.copy_to(dst_buffer, stream=stream)
-    device.sync()
-
-    if check:
-        dst_ptr = ctypes.cast(dst_buffer.handle, ctypes.POINTER(ctypes.c_byte))
-
-        for i in range(10):
-            assert dst_ptr[i] == src_ptr[i]
-
-    dst_buffer.close()
-    src_buffer.close()
-
-
-def test_buffer_copy_to():
-    device = Device()
-    device.set_current()
-    buffer_copy_to(DummyDeviceMemoryResource(device), device)
-    buffer_copy_to(DummyUnifiedMemoryResource(device), device)
-    buffer_copy_to(DummyPinnedMemoryResource(device), device, check=True)
-
-
-def buffer_copy_from(dummy_mr: MemoryResource, device, check=False):
-    src_buffer = dummy_mr.allocate(size=1024)
-    dst_buffer = dummy_mr.allocate(size=1024)
-    stream = device.create_stream()
-
-    if check:
-        src_ptr = ctypes.cast(src_buffer.handle, ctypes.POINTER(ctypes.c_byte))
-        for i in range(1024):
-            src_ptr[i] = ctypes.c_byte(i)
-
-    dst_buffer.copy_from(src_buffer, stream=stream)
-    device.sync()
-
-    if check:
-        dst_ptr = ctypes.cast(dst_buffer.handle, ctypes.POINTER(ctypes.c_byte))
-
-        for i in range(10):
-            assert dst_ptr[i] == src_ptr[i]
-
-    dst_buffer.close()
-    src_buffer.close()
-
-
-def test_buffer_copy_from():
-    device = Device()
-    device.set_current()
-    buffer_copy_from(DummyDeviceMemoryResource(device), device)
-    buffer_copy_from(DummyUnifiedMemoryResource(device), device)
-    buffer_copy_from(DummyPinnedMemoryResource(device), device, check=True)
-
-
-def buffer_close(dummy_mr: MemoryResource):
-    buffer = dummy_mr.allocate(size=1024)
-    buffer.close()
-    assert buffer.handle == 0
-    assert buffer.memory_resource is None
-
-
-def test_buffer_close():
-    device = Device()
-    device.set_current()
-    buffer_close(DummyDeviceMemoryResource(device))
-    buffer_close(DummyHostMemoryResource())
-    buffer_close(DummyUnifiedMemoryResource(device))
-    buffer_close(DummyPinnedMemoryResource(device))
+# Copyright 2024 NVIDIA Corporation.  All rights reserved.
+#
+# Please refer to the NVIDIA end user license agreement (EULA) associated
+# with this source code for terms and conditions that govern your use of
+# this software. Any use, reproduction, disclosure, or distribution of
+# this software and related documentation outside the terms of the EULA
+# is strictly prohibited.
+
+try:
+    from cuda.bindings import driver
+except ImportError:
+    from cuda import cuda as driver
+
+import ctypes
+
+from cuda.core.experimental import Device
+from cuda.core.experimental._memory import Buffer, MemoryResource
+from cuda.core.experimental._utils import handle_return
+
+
+class DummyDeviceMemoryResource(MemoryResource):
+    def __init__(self, device):
+        self.device = device
+
+    def allocate(self, size, stream=None) -> Buffer:
+        ptr = handle_return(driver.cuMemAlloc(size))
+        return Buffer(ptr=ptr, size=size, mr=self)
+
+    def deallocate(self, ptr, size, stream=None):
+        handle_return(driver.cuMemFree(ptr))
+
+    @property
+    def is_device_accessible(self) -> bool:
+        return True
+
+    @property
+    def is_host_accessible(self) -> bool:
+        return False
+
+    @property
+    def device_id(self) -> int:
+        return 0
+
+
+class DummyHostMemoryResource(MemoryResource):
+    def __init__(self):
+        pass
+
+    def allocate(self, size, stream=None) -> Buffer:
+        # Allocate a ctypes buffer of size `size`
+        ptr = (ctypes.c_byte * size)()
+        return Buffer(ptr=ptr, size=size, mr=self)
+
+    def deallocate(self, ptr, size, stream=None):
+        # the memory is deallocated per the ctypes deallocation at garbage collection time
+        pass
+
+    @property
+    def is_device_accessible(self) -> bool:
+        return False
+
+    @property
+    def is_host_accessible(self) -> bool:
+        return True
+
+    @property
+    def device_id(self) -> int:
+        raise RuntimeError("the pinned memory resource is not bound to any GPU")
+
+
+class DummyUnifiedMemoryResource(MemoryResource):
+    def __init__(self, device):
+        self.device = device
+
+    def allocate(self, size, stream=None) -> Buffer:
+        ptr = handle_return(driver.cuMemAllocManaged(size, driver.CUmemAttach_flags.CU_MEM_ATTACH_GLOBAL.value))
+        return Buffer(ptr=ptr, size=size, mr=self)
+
+    def deallocate(self, ptr, size, stream=None):
+        handle_return(driver.cuMemFree(ptr))
+
+    @property
+    def is_device_accessible(self) -> bool:
+        return True
+
+    @property
+    def is_host_accessible(self) -> bool:
+        return True
+
+    @property
+    def device_id(self) -> int:
+        return 0
+
+
+class DummyPinnedMemoryResource(MemoryResource):
+    def __init__(self, device):
+        self.device = device
+
+    def allocate(self, size, stream=None) -> Buffer:
+        ptr = handle_return(driver.cuMemAllocHost(size))
+        return Buffer(ptr=ptr, size=size, mr=self)
+
+    def deallocate(self, ptr, size, stream=None):
+        handle_return(driver.cuMemFreeHost(ptr))
+
+    @property
+    def is_device_accessible(self) -> bool:
+        return True
+
+    @property
+    def is_host_accessible(self) -> bool:
+        return True
+
+    @property
+    def device_id(self) -> int:
+        raise RuntimeError("the pinned memory resource is not bound to any GPU")
+
+
+def buffer_initialization(dummy_mr: MemoryResource):
+    buffer = dummy_mr.allocate(size=1024)
+    assert buffer.handle != 0
+    assert buffer.size == 1024
+    assert buffer.memory_resource == dummy_mr
+    assert buffer.is_device_accessible == dummy_mr.is_device_accessible
+    assert buffer.is_host_accessible == dummy_mr.is_host_accessible
+    buffer.close()
+
+
+def test_buffer_initialization():
+    device = Device()
+    device.set_current()
+    buffer_initialization(DummyDeviceMemoryResource(device))
+    buffer_initialization(DummyHostMemoryResource())
+    buffer_initialization(DummyUnifiedMemoryResource(device))
+    buffer_initialization(DummyPinnedMemoryResource(device))
+
+
+def buffer_copy_to(dummy_mr: MemoryResource, device: Device, check=False):
+    src_buffer = dummy_mr.allocate(size=1024)
+    dst_buffer = dummy_mr.allocate(size=1024)
+    stream = device.create_stream()
+
+    if check:
+        src_ptr = ctypes.cast(src_buffer.handle, ctypes.POINTER(ctypes.c_byte))
+        for i in range(1024):
+            src_ptr[i] = ctypes.c_byte(i)
+
+    src_buffer.copy_to(dst_buffer, stream=stream)
+    device.sync()
+
+    if check:
+        dst_ptr = ctypes.cast(dst_buffer.handle, ctypes.POINTER(ctypes.c_byte))
+
+        for i in range(10):
+            assert dst_ptr[i] == src_ptr[i]
+
+    dst_buffer.close()
+    src_buffer.close()
+
+
+def test_buffer_copy_to():
+    device = Device()
+    device.set_current()
+    buffer_copy_to(DummyDeviceMemoryResource(device), device)
+    buffer_copy_to(DummyUnifiedMemoryResource(device), device)
+    buffer_copy_to(DummyPinnedMemoryResource(device), device, check=True)
+
+
+def buffer_copy_from(dummy_mr: MemoryResource, device, check=False):
+    src_buffer = dummy_mr.allocate(size=1024)
+    dst_buffer = dummy_mr.allocate(size=1024)
+    stream = device.create_stream()
+
+    if check:
+        src_ptr = ctypes.cast(src_buffer.handle, ctypes.POINTER(ctypes.c_byte))
+        for i in range(1024):
+            src_ptr[i] = ctypes.c_byte(i)
+
+    dst_buffer.copy_from(src_buffer, stream=stream)
+    device.sync()
+
+    if check:
+        dst_ptr = ctypes.cast(dst_buffer.handle, ctypes.POINTER(ctypes.c_byte))
+
+        for i in range(10):
+            assert dst_ptr[i] == src_ptr[i]
+
+    dst_buffer.close()
+    src_buffer.close()
+
+
+def test_buffer_copy_from():
+    device = Device()
+    device.set_current()
+    buffer_copy_from(DummyDeviceMemoryResource(device), device)
+    buffer_copy_from(DummyUnifiedMemoryResource(device), device)
+    buffer_copy_from(DummyPinnedMemoryResource(device), device, check=True)
+
+
+def buffer_close(dummy_mr: MemoryResource):
+    buffer = dummy_mr.allocate(size=1024)
+    buffer.close()
+    assert buffer.handle == 0
+    assert buffer.memory_resource is None
+
+
+def test_buffer_close():
+    device = Device()
+    device.set_current()
+    buffer_close(DummyDeviceMemoryResource(device))
+    buffer_close(DummyHostMemoryResource())
+    buffer_close(DummyUnifiedMemoryResource(device))
+    buffer_close(DummyPinnedMemoryResource(device))
diff --git a/cuda_core/tests/test_module.py b/cuda_core/tests/test_module.py
index 5f0b6056..a976726f 100644
--- a/cuda_core/tests/test_module.py
+++ b/cuda_core/tests/test_module.py
@@ -1,48 +1,48 @@
-# Copyright 2024 NVIDIA Corporation.  All rights reserved.
-#
-# Please refer to the NVIDIA end user license agreement (EULA) associated
-# with this source code for terms and conditions that govern your use of
-# this software. Any use, reproduction, disclosure, or distribution of
-# this software and related documentation outside the terms of the EULA
-# is strictly prohibited.
-
-import importlib
-
-import pytest
-
-from cuda.core.experimental._module import ObjectCode
-
-
-@pytest.mark.skipif(
-    int(importlib.metadata.version("cuda-python").split(".")[0]) < 12,
-    reason="Module loading for older drivers validate require valid module code.",
-)
-def test_object_code_initialization():
-    # Test with supported code types
-    for code_type in ["cubin", "ptx", "fatbin"]:
-        module_data = b"dummy_data"
-        obj_code = ObjectCode(module_data, code_type)
-        assert obj_code._code_type == code_type
-        assert obj_code._module == module_data
-        assert obj_code._handle is not None
-
-    # Test with unsupported code type
-    with pytest.raises(ValueError):
-        ObjectCode(b"dummy_data", "unsupported_code_type")
-
-
-# TODO add ObjectCode tests which provide the appropriate data for cuLibraryLoadFromFile
-def test_object_code_initialization_with_str():
-    assert True
-
-
-def test_object_code_initialization_with_jit_options():
-    assert True
-
-
-def test_object_code_get_kernel():
-    assert True
-
-
-def test_kernel_from_obj():
-    assert True
+# Copyright 2024 NVIDIA Corporation.  All rights reserved.
+#
+# Please refer to the NVIDIA end user license agreement (EULA) associated
+# with this source code for terms and conditions that govern your use of
+# this software. Any use, reproduction, disclosure, or distribution of
+# this software and related documentation outside the terms of the EULA
+# is strictly prohibited.
+
+import importlib
+
+import pytest
+
+from cuda.core.experimental._module import ObjectCode
+
+
+@pytest.mark.skipif(
+    int(importlib.metadata.version("cuda-python").split(".")[0]) < 12,
+    reason="Module loading for older drivers validate require valid module code.",
+)
+def test_object_code_initialization():
+    # Test with supported code types
+    for code_type in ["cubin", "ptx", "fatbin"]:
+        module_data = b"dummy_data"
+        obj_code = ObjectCode(module_data, code_type)
+        assert obj_code._code_type == code_type
+        assert obj_code._module == module_data
+        assert obj_code._handle is not None
+
+    # Test with unsupported code type
+    with pytest.raises(ValueError):
+        ObjectCode(b"dummy_data", "unsupported_code_type")
+
+
+# TODO add ObjectCode tests which provide the appropriate data for cuLibraryLoadFromFile
+def test_object_code_initialization_with_str():
+    assert True
+
+
+def test_object_code_initialization_with_jit_options():
+    assert True
+
+
+def test_object_code_get_kernel():
+    assert True
+
+
+def test_kernel_from_obj():
+    assert True
diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py
index af94a7ba..95c4d377 100644
--- a/cuda_core/tests/test_program.py
+++ b/cuda_core/tests/test_program.py
@@ -1,66 +1,66 @@
-# Copyright 2024 NVIDIA Corporation.  All rights reserved.
-#
-# Please refer to the NVIDIA end user license agreement (EULA) associated
-# with this source code for terms and conditions that govern your use of
-# this software. Any use, reproduction, disclosure, or distribution of
-# this software and related documentation outside the terms of the EULA
-# is strictly prohibited.
-
-import pytest
-
-from cuda.core.experimental import Program
-from cuda.core.experimental._module import Kernel, ObjectCode
-
-
-def test_program_init_valid_code_type():
-    code = 'extern "C" __global__ void my_kernel() {}'
-    program = Program(code, "c++")
-    assert program.backend == "nvrtc"
-    assert program.handle is not None
-
-
-def test_program_init_invalid_code_type():
-    code = 'extern "C" __global__ void my_kernel() {}'
-    with pytest.raises(NotImplementedError):
-        Program(code, "python")
-
-
-def test_program_init_invalid_code_format():
-    code = 12345
-    with pytest.raises(TypeError):
-        Program(code, "c++")
-
-
-def test_program_compile_valid_target_type():
-    code = 'extern "C" __global__ void my_kernel() {}'
-    program = Program(code, "c++")
-    object_code = program.compile("ptx")
-    kernel = object_code.get_kernel("my_kernel")
-    assert isinstance(object_code, ObjectCode)
-    assert isinstance(kernel, Kernel)
-
-
-def test_program_compile_invalid_target_type():
-    code = 'extern "C" __global__ void my_kernel() {}'
-    program = Program(code, "c++")
-    with pytest.raises(NotImplementedError):
-        program.compile("invalid_target")
-
-
-def test_program_backend_property():
-    code = 'extern "C" __global__ void my_kernel() {}'
-    program = Program(code, "c++")
-    assert program.backend == "nvrtc"
-
-
-def test_program_handle_property():
-    code = 'extern "C" __global__ void my_kernel() {}'
-    program = Program(code, "c++")
-    assert program.handle is not None
-
-
-def test_program_close():
-    code = 'extern "C" __global__ void my_kernel() {}'
-    program = Program(code, "c++")
-    program.close()
-    assert program.handle is None
+# Copyright 2024 NVIDIA Corporation.  All rights reserved.
+#
+# Please refer to the NVIDIA end user license agreement (EULA) associated
+# with this source code for terms and conditions that govern your use of
+# this software. Any use, reproduction, disclosure, or distribution of
+# this software and related documentation outside the terms of the EULA
+# is strictly prohibited.
+
+import pytest
+
+from cuda.core.experimental import Program
+from cuda.core.experimental._module import Kernel, ObjectCode
+
+
+def test_program_init_valid_code_type():
+    code = 'extern "C" __global__ void my_kernel() {}'
+    program = Program(code, "c++")
+    assert program.backend == "nvrtc"
+    assert program.handle is not None
+
+
+def test_program_init_invalid_code_type():
+    code = 'extern "C" __global__ void my_kernel() {}'
+    with pytest.raises(NotImplementedError):
+        Program(code, "python")
+
+
+def test_program_init_invalid_code_format():
+    code = 12345
+    with pytest.raises(TypeError):
+        Program(code, "c++")
+
+
+def test_program_compile_valid_target_type():
+    code = 'extern "C" __global__ void my_kernel() {}'
+    program = Program(code, "c++")
+    object_code = program.compile("ptx")
+    kernel = object_code.get_kernel("my_kernel")
+    assert isinstance(object_code, ObjectCode)
+    assert isinstance(kernel, Kernel)
+
+
+def test_program_compile_invalid_target_type():
+    code = 'extern "C" __global__ void my_kernel() {}'
+    program = Program(code, "c++")
+    with pytest.raises(NotImplementedError):
+        program.compile("invalid_target")
+
+
+def test_program_backend_property():
+    code = 'extern "C" __global__ void my_kernel() {}'
+    program = Program(code, "c++")
+    assert program.backend == "nvrtc"
+
+
+def test_program_handle_property():
+    code = 'extern "C" __global__ void my_kernel() {}'
+    program = Program(code, "c++")
+    assert program.handle is not None
+
+
+def test_program_close():
+    code = 'extern "C" __global__ void my_kernel() {}'
+    program = Program(code, "c++")
+    program.close()
+    assert program.handle is None
diff --git a/cuda_core/tests/test_stream.py b/cuda_core/tests/test_stream.py
index 03cdd852..9c661192 100644
--- a/cuda_core/tests/test_stream.py
+++ b/cuda_core/tests/test_stream.py
@@ -1,115 +1,115 @@
-# Copyright 2024 NVIDIA Corporation.  All rights reserved.
-#
-# Please refer to the NVIDIA end user license agreement (EULA) associated
-# with this source code for terms and conditions that govern your use of
-# this software. Any use, reproduction, disclosure, or distribution of
-# this software and related documentation outside the terms of the EULA
-# is strictly prohibited.
-
-import pytest
-
-from cuda.core.experimental import Device, Stream, StreamOptions
-from cuda.core.experimental._event import Event
-from cuda.core.experimental._stream import LEGACY_DEFAULT_STREAM, PER_THREAD_DEFAULT_STREAM, default_stream
-
-
-def test_stream_init():
-    with pytest.raises(NotImplementedError):
-        Stream()
-
-
-def test_stream_init_with_options(init_cuda):
-    stream = Device().create_stream(options=StreamOptions(nonblocking=True, priority=0))
-    assert stream.is_nonblocking is True
-    assert stream.priority == 0
-
-
-def test_stream_handle(init_cuda):
-    stream = Device().create_stream(options=StreamOptions())
-    assert isinstance(stream.handle, int)
-
-
-def test_stream_is_nonblocking(init_cuda):
-    stream = Device().create_stream(options=StreamOptions(nonblocking=True))
-    assert stream.is_nonblocking is True
-
-
-def test_stream_priority(init_cuda):
-    stream = Device().create_stream(options=StreamOptions(priority=0))
-    assert stream.priority == 0
-    stream = Device().create_stream(options=StreamOptions(priority=-1))
-    assert stream.priority == -1
-    with pytest.raises(ValueError):
-        stream = Device().create_stream(options=StreamOptions(priority=1))
-
-
-def test_stream_sync(init_cuda):
-    stream = Device().create_stream(options=StreamOptions())
-    stream.sync()  # Should not raise any exceptions
-
-
-def test_stream_record(init_cuda):
-    stream = Device().create_stream(options=StreamOptions())
-    event = stream.record()
-    assert isinstance(event, Event)
-
-
-def test_stream_record_invalid_event(init_cuda):
-    stream = Device().create_stream(options=StreamOptions())
-    with pytest.raises(TypeError):
-        stream.record(event="invalid_event")
-
-
-def test_stream_wait_event(init_cuda):
-    s1 = Device().create_stream()
-    s2 = Device().create_stream()
-    e1 = s1.record()
-    s2.wait(e1)  # Should not raise any exceptions
-    s2.sync()
-
-
-def test_stream_wait_invalid_event(init_cuda):
-    stream = Device().create_stream(options=StreamOptions())
-    with pytest.raises(ValueError):
-        stream.wait(event_or_stream="invalid_event")
-
-
-def test_stream_device(init_cuda):
-    stream = Device().create_stream(options=StreamOptions())
-    device = stream.device
-    assert isinstance(device, Device)
-
-
-def test_stream_context(init_cuda):
-    stream = Device().create_stream(options=StreamOptions())
-    context = stream.context
-    assert context is not None
-
-
-def test_stream_from_foreign_stream(init_cuda):
-    device = Device()
-    other_stream = device.create_stream(options=StreamOptions())
-    stream = device.create_stream(obj=other_stream)
-    assert other_stream.handle == stream.handle
-    device = stream.device
-    assert isinstance(device, Device)
-    context = stream.context
-    assert context is not None
-
-
-def test_stream_from_handle():
-    stream = Stream.from_handle(0)
-    assert isinstance(stream, Stream)
-
-
-def test_legacy_default_stream():
-    assert isinstance(LEGACY_DEFAULT_STREAM, Stream)
-
-
-def test_per_thread_default_stream():
-    assert isinstance(PER_THREAD_DEFAULT_STREAM, Stream)
-
-
-def test_default_stream():
-    stream = default_stream()
-    assert isinstance(stream, Stream)
+# Copyright 2024 NVIDIA Corporation.  All rights reserved.
+#
+# Please refer to the NVIDIA end user license agreement (EULA) associated
+# with this source code for terms and conditions that govern your use of
+# this software. Any use, reproduction, disclosure, or distribution of
+# this software and related documentation outside the terms of the EULA
+# is strictly prohibited.
+
+import pytest
+
+from cuda.core.experimental import Device, Stream, StreamOptions
+from cuda.core.experimental._event import Event
+from cuda.core.experimental._stream import LEGACY_DEFAULT_STREAM, PER_THREAD_DEFAULT_STREAM, default_stream
+
+
+def test_stream_init():
+    with pytest.raises(NotImplementedError):
+        Stream()
+
+
+def test_stream_init_with_options(init_cuda):
+    stream = Device().create_stream(options=StreamOptions(nonblocking=True, priority=0))
+    assert stream.is_nonblocking is True
+    assert stream.priority == 0
+
+
+def test_stream_handle(init_cuda):
+    stream = Device().create_stream(options=StreamOptions())
+    assert isinstance(stream.handle, int)
+
+
+def test_stream_is_nonblocking(init_cuda):
+    stream = Device().create_stream(options=StreamOptions(nonblocking=True))
+    assert stream.is_nonblocking is True
+
+
+def test_stream_priority(init_cuda):
+    stream = Device().create_stream(options=StreamOptions(priority=0))
+    assert stream.priority == 0
+    stream = Device().create_stream(options=StreamOptions(priority=-1))
+    assert stream.priority == -1
+    with pytest.raises(ValueError):
+        stream = Device().create_stream(options=StreamOptions(priority=1))
+
+
+def test_stream_sync(init_cuda):
+    stream = Device().create_stream(options=StreamOptions())
+    stream.sync()  # Should not raise any exceptions
+
+
+def test_stream_record(init_cuda):
+    stream = Device().create_stream(options=StreamOptions())
+    event = stream.record()
+    assert isinstance(event, Event)
+
+
+def test_stream_record_invalid_event(init_cuda):
+    stream = Device().create_stream(options=StreamOptions())
+    with pytest.raises(TypeError):
+        stream.record(event="invalid_event")
+
+
+def test_stream_wait_event(init_cuda):
+    s1 = Device().create_stream()
+    s2 = Device().create_stream()
+    e1 = s1.record()
+    s2.wait(e1)  # Should not raise any exceptions
+    s2.sync()
+
+
+def test_stream_wait_invalid_event(init_cuda):
+    stream = Device().create_stream(options=StreamOptions())
+    with pytest.raises(ValueError):
+        stream.wait(event_or_stream="invalid_event")
+
+
+def test_stream_device(init_cuda):
+    stream = Device().create_stream(options=StreamOptions())
+    device = stream.device
+    assert isinstance(device, Device)
+
+
+def test_stream_context(init_cuda):
+    stream = Device().create_stream(options=StreamOptions())
+    context = stream.context
+    assert context is not None
+
+
+def test_stream_from_foreign_stream(init_cuda):
+    device = Device()
+    other_stream = device.create_stream(options=StreamOptions())
+    stream = device.create_stream(obj=other_stream)
+    assert other_stream.handle == stream.handle
+    device = stream.device
+    assert isinstance(device, Device)
+    context = stream.context
+    assert context is not None
+
+
+def test_stream_from_handle():
+    stream = Stream.from_handle(0)
+    assert isinstance(stream, Stream)
+
+
+def test_legacy_default_stream():
+    assert isinstance(LEGACY_DEFAULT_STREAM, Stream)
+
+
+def test_per_thread_default_stream():
+    assert isinstance(PER_THREAD_DEFAULT_STREAM, Stream)
+
+
+def test_default_stream():
+    stream = default_stream()
+    assert isinstance(stream, Stream)