diff --git a/python/taichi/_kernels.py b/python/taichi/_kernels.py
index 4b1d86302f84a..71c407bc61073 100644
--- a/python/taichi/_kernels.py
+++ b/python/taichi/_kernels.py
@@ -39,8 +39,13 @@ def fill_ndarray_matrix(ndarray: ndarray_type.ndarray(), val: template()):
 
 @kernel
 def tensor_to_ext_arr(tensor: template(), arr: ndarray_type.ndarray()):
+    offset = static(tensor.snode.ptr.offset)
+    shape = static(tensor.shape)
+    # default value of offset is [], replace it with [0] * len
+    offset_new = static([0] * len(shape) if len(offset) == 0 else offset)
+
     for I in grouped(tensor):
-        arr[I] = tensor[I]
+        arr[I - offset_new] = tensor[I]
 
 
 @kernel
@@ -73,10 +78,12 @@ def ndarray_matrix_to_ext_arr(
 
 @kernel
 def vector_to_fast_image(img: template(), out: ndarray_type.ndarray()):
+    i_offset = static(img.snode.ptr.offset[0] if len(img.snode.ptr.offset) != 0 else 0)
+    j_offset = static(img.snode.ptr.offset[1] if len(img.snode.ptr.offset) != 0 else 0)
     # FIXME: Why is ``for i, j in img:`` slower than:
     for i, j in ndrange(*img.shape):
         r, g, b = 0, 0, 0
-        color = img[i, img.shape[1] - 1 - j]
+        color = img[i + i_offset, (img.shape[1] + j_offset) - 1 - j]
         if static(img.dtype in [f16, f32, f64]):
             r, g, b = ops.min(255, ops.max(0, int(color * 255)))[:3]
         else:
@@ -100,32 +107,51 @@ def vector_to_fast_image(img: template(), out: ndarray_type.ndarray()):
 
 @kernel
 def tensor_to_image(tensor: template(), arr: ndarray_type.ndarray()):
+    offset = static(tensor.snode.ptr.offset)
+    shape = static(tensor.shape)
+    # default value of offset is [], replace it with [0] * len
+    offset_new = static([0] * len(shape) if len(offset) == 0 else offset)
     for I in grouped(tensor):
         t = ops.cast(tensor[I], f32)
-        arr[I, 0] = t
-        arr[I, 1] = t
-        arr[I, 2] = t
+        arr[I - offset_new, 0] = t
+        arr[I - offset_new, 1] = t
+        arr[I - offset_new, 2] = t
 
 
 @kernel
 def vector_to_image(mat: template(), arr: ndarray_type.ndarray()):
+    offset = static(mat.snode.ptr.offset)
+    shape = static(mat.shape)
+    # default value of offset is [], replace it with [0] * len
+    offset_new = static([0] * len(shape) if len(offset) == 0 else offset)
     for I in grouped(mat):
         for p in static(range(mat.n)):
-            arr[I, p] = ops.cast(mat[I][p], f32)
+            arr[I - offset_new, p] = ops.cast(mat[I][p], f32)
             if static(mat.n <= 2):
-                arr[I, 2] = 0
+                arr[I - offset_new, 2] = 0
 
 
 @kernel
 def tensor_to_tensor(tensor: template(), other: template()):
-    for I in grouped(tensor):
-        tensor[I] = other[I]
+    # assumes that tensor and other have the same shape
+    shape = static(tensor.shape)
+    tensor_offset = static(tensor.snode.ptr.offset)
+    tensor_offset_new = static([0] * len(shape) if len(tensor_offset) == 0 else tensor_offset)
+    other_offset = static(other.snode.ptr.offset)
+    other_offset_new = static([0] * len(shape) if len(other_offset) == 0 else other_offset)
+
+    for I in grouped(ndrange(*shape)):
+        tensor[I + tensor_offset_new] = other[I + other_offset_new]
 
 
 @kernel
 def ext_arr_to_tensor(arr: ndarray_type.ndarray(), tensor: template()):
+    offset = static(tensor.snode.ptr.offset)
+    shape = static(tensor.shape)
+    # default value of offset is [], replace it with [0] * len
+    offset_new = static([0] * len(shape) if len(offset) == 0 else offset)
     for I in grouped(tensor):
-        tensor[I] = arr[I]
+        tensor[I] = arr[I - offset_new]
 
 
 @kernel
@@ -164,36 +190,46 @@ def ext_arr_to_ndarray_matrix(
 
 @kernel
 def matrix_to_ext_arr(mat: template(), arr: ndarray_type.ndarray(), as_vector: template()):
+    offset = static(mat.snode.ptr.offset)
+    shape = static(mat.shape)
+    # default value of offset is [], replace it with [0] * len
+    offset_new = static([0] * len(shape) if len(offset) == 0 else offset)
+
     for I in grouped(mat):
         for p in static(range(mat.n)):
             for q in static(range(mat.m)):
                 if static(as_vector):
                     if static(getattr(mat, "ndim", 2) == 1):
-                        arr[I, p] = mat[I][p]
+                        arr[I - offset_new, p] = mat[I][p]
                     else:
-                        arr[I, p] = mat[I][p, q]
+                        arr[I - offset_new, p] = mat[I][p, q]
                 else:
                     if static(getattr(mat, "ndim", 2) == 1):
-                        arr[I, p, q] = mat[I][p]
+                        arr[I - offset_new, p, q] = mat[I][p]
                     else:
-                        arr[I, p, q] = mat[I][p, q]
+                        arr[I - offset_new, p, q] = mat[I][p, q]
 
 
 @kernel
 def ext_arr_to_matrix(arr: ndarray_type.ndarray(), mat: template(), as_vector: template()):
+    offset = static(mat.snode.ptr.offset)
+    shape = static(mat.shape)
+    # default value of offset is [], replace it with [0] * len
+    offset_new = static([0] * len(shape) if len(offset) == 0 else offset)
+
     for I in grouped(mat):
         for p in static(range(mat.n)):
             for q in static(range(mat.m)):
                 if static(getattr(mat, "ndim", 2) == 1):
                     if static(as_vector):
-                        mat[I][p] = arr[I, p]
+                        mat[I][p] = arr[I - offset_new, p]
                     else:
-                        mat[I][p] = arr[I, p, q]
+                        mat[I][p] = arr[I - offset_new, p, q]
                 else:
                     if static(as_vector):
-                        mat[I][p, q] = arr[I, p]
+                        mat[I][p, q] = arr[I - offset_new, p]
                     else:
-                        mat[I][p, q] = arr[I, p, q]
+                        mat[I][p, q] = arr[I - offset_new, p, q]
 
 
 # extract ndarray of raw vulkan memory layout to normal memory layout.
@@ -215,10 +251,13 @@ def arr_vulkan_layout_to_arr_normal_layout(vk_arr: ndarray_type.ndarray(), norma
 @kernel
 def arr_vulkan_layout_to_field_normal_layout(vk_arr: ndarray_type.ndarray(), normal_field: template()):
     static_assert(len(normal_field.shape) == 2)
-    w = normal_field.shape[0]
-    h = normal_field.shape[1]
+    w = static(normal_field.shape[0])
+    h = static(normal_field.shape[1])
+    i_offset = static(normal_field.snode.ptr.offset[0] if len(normal_field.snode.ptr.offset) != 0 else 0)
+    j_offset = static(normal_field.snode.ptr.offset[1] if len(normal_field.snode.ptr.offset) != 0 else 0)
+
     for i, j in ndrange(w, h):
-        normal_field[i, j] = vk_arr[(h - 1 - j) * w + i]
+        normal_field[i + i_offset, j + j_offset] = vk_arr[(h - 1 - j) * w + i]
 
 
 @kernel
@@ -277,21 +316,23 @@ def sort_stage(
     k: int,
     invocations: int,
 ):
+    keys_offset = static(keys.snode.ptr.offset if len(keys.snode.ptr.offset) != 0 else 0)
+    values_offset = static(values.snode.ptr.offset if len(values.snode.ptr.offset) != 0 else 0)
     for inv in range(invocations):
         j = k % p + inv * 2 * k
         for i in range(0, ops.min(k, N - j - k)):
             a = i + j
             b = i + j + k
             if int(a / (p * 2)) == int(b / (p * 2)):
-                key_a = keys[a]
-                key_b = keys[b]
+                key_a = keys[a + keys_offset]
+                key_b = keys[b + keys_offset]
                 if key_a > key_b:
-                    keys[a] = key_b
-                    keys[b] = key_a
+                    keys[a + keys_offset] = key_b
+                    keys[b + keys_offset] = key_a
                     if use_values != 0:
-                        temp = values[a]
-                        values[a] = values[b]
-                        values[b] = temp
+                        temp = values[a + values_offset]
+                        values[a + values_offset] = values[b + values_offset]
+                        values[b + values_offset] = temp
 
 
 # Parallel Prefix Sum (Scan)
@@ -383,5 +424,7 @@ def uniform_add(arr_in: template(), in_beg: i32, in_end: i32):
 
 @kernel
 def blit_from_field_to_field(dst: template(), src: template(), offset: i32, size: i32):
+    dst_offset = static(dst.snode.ptr.offset if len(dst.snode.ptr.offset) != 0 else 0)
+    src_offset = static(src.snode.ptr.offset if len(src.snode.ptr.offset) != 0 else 0)
     for i in range(size):
-        dst[i + offset] = src[i]
+        dst[i + dst_offset + offset] = src[i + src_offset]
diff --git a/taichi/python/export_lang.cpp b/taichi/python/export_lang.cpp
index fe3e26dd213b4..79a3833736609 100644
--- a/taichi/python/export_lang.cpp
+++ b/taichi/python/export_lang.cpp
@@ -466,6 +466,7 @@ void export_lang(py::module &m) {
       .def_readwrite("parent", &SNode::parent)
       .def_readonly("type", &SNode::type)
       .def_readonly("id", &SNode::id)
+      .def_readonly("offset", &SNode::index_offsets)
       .def("dense",
           (SNode & (SNode::*)(const std::vector<Axis> &,
                               const std::vector<int> &,
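
Reviewer note on the pattern used throughout _kernels.py above: each kernel reads the field's SNode::index_offsets (exposed to Python as snode.ptr.offset by the export_lang.cpp change), falls back to all zeros when the field was declared without an offset, and subtracts that offset so the external array is always indexed from zero. A minimal pure-Python sketch of the same index mapping, for illustration only; the helper name below is hypothetical and not part of this patch:

def field_index_to_array_index(field_index, offset, shape):
    # snode.ptr.offset is [] for fields declared without an offset.
    offset = offset if len(offset) != 0 else [0] * len(shape)
    # Subtracting the offset maps the field's index range onto 0..shape-1.
    return tuple(i - o for i, o in zip(field_index, offset))


# A field with shape=(6, 12), offset=(-4, 4) owns indices (-4..1, 4..15);
# its lowest valid index maps to array index (0, 0), its highest to (5, 11).
assert field_index_to_array_index((-4, 4), (-4, 4), (6, 12)) == (0, 0)
assert field_index_to_array_index((1, 15), (-4, 4), (6, 12)) == (5, 11)
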
diff --git a/tests/python/test_field.py b/tests/python/test_field.py
index 33bf5000a8dc5..9077027f2aa29 100644
--- a/tests/python/test_field.py
+++ b/tests/python/test_field.py
@@ -84,6 +84,33 @@ def test_scalr_field_from_numpy(dtype, shape):
     x.from_numpy(arr)
 
 
+@pytest.mark.parametrize("dtype", data_types)
+@pytest.mark.parametrize(
+    "shape, offset",
+    [((), ()), (8, 0), (8, 8), (8, -4), ((6, 12), (-4, -4)), ((6, 12), (-4, 4)), ((6, 12), (4, -4)), ((6, 12), (8, 8))],
+)
+@test_utils.test(arch=get_host_arch_list())
+def test_scalr_field_from_numpy_with_offset(dtype, shape, offset):
+    import numpy as np
+
+    x = ti.field(dtype=dtype, shape=shape, offset=offset)
+    # use the corresponding dtype for the numpy array.
+    numpy_dtypes = {
+        ti.i32: np.int32,
+        ti.f32: np.float32,
+        ti.f64: np.float64,
+        ti.i64: np.int64,
+    }
+    arr = np.ones(shape, dtype=numpy_dtypes[dtype])
+    x.from_numpy(arr)
+
+    def mat_equal(A, B, tol=1e-6):
+        return np.max(np.abs(A - B)) < tol
+
+    tol = 1e-5 if dtype == ti.f32 else 1e-12
+    assert mat_equal(x.to_numpy(), arr, tol=tol)
+
+
 @pytest.mark.parametrize("dtype", data_types)
 @pytest.mark.parametrize("shape", field_shapes)
 @test_utils.test(arch=get_host_arch_list())
@@ -248,6 +275,32 @@ def test_field_copy_from_with_mismatch_shape():
         x.copy_from(other)
 
 
+@test_utils.test()
+@pytest.mark.parametrize(
+    "shape, x_offset, other_offset",
+    [
+        ((), (), ()),
+        (8, 4, 0),
+        (8, 0, -4),
+        (8, -4, -4),
+        (8, 8, -4),
+        ((6, 12), (0, 0), (-6, -6)),
+        ((6, 12), (-6, -6), (0, 0)),
+        ((6, 12), (-6, -6), (-6, -6)),
+    ],
+)
+@pytest.mark.parametrize("dtype", [ti.i32, ti.f32])
+def test_field_copy_from_with_offset(shape, dtype, x_offset, other_offset):
+    x = ti.field(dtype=ti.f32, shape=shape, offset=x_offset)
+    other = ti.field(dtype=dtype, shape=shape, offset=other_offset)
+    other.fill(1)
+    x.copy_from(other)
+    convert = lambda arr: arr[0] if len(arr) == 1 else arr
+    assert convert(x.shape) == shape
+    assert x.dtype == ti.f32
+    assert (x.to_numpy() == 1).all()
+
+
 @test_utils.test()
 def test_field_copy_from_with_non_filed_object():
     import numpy as np
diff --git a/tests/python/test_ggui.py b/tests/python/test_ggui.py
index 6f3def7d51c3c..aa9ccb3efcec9 100644
--- a/tests/python/test_ggui.py
+++ b/tests/python/test_ggui.py
@@ -492,6 +492,39 @@ def render():
     window.destroy()
 
 
+@pytest.mark.parametrize("offset", [(0, 0), (-256, -256), (256, -256), (-256, 256), (256, 256), (23333, 233333)])
+@pytest.mark.skipif(not _ti_core.GGUI_AVAILABLE, reason="GGUI Not Available")
+@test_utils.test(arch=supported_archs)
+def test_get_depth_buffer_with_offset(offset):
+    window = ti.ui.Window("test", (512, 512), vsync=True, show_window=False)
+    canvas = window.get_canvas()
+    scene = ti.ui.Scene()
+    camera = ti.ui.Camera()
+
+    ball_center = ti.Vector.field(3, dtype=float, shape=(1,))
+    ball_center[0] = ti.math.vec3(0, 0, 0.5)
+
+    def render():
+        camera.position(0.0, 0.0, 1)
+        camera.lookat(0.0, 0.0, 0)
+        scene.set_camera(camera)
+        scene.point_light(pos=(0, 1, 2), color=(1, 1, 1))
+        scene.ambient_light((0.5, 0.5, 0.5))
+        scene.particles(ball_center, radius=0.05, color=(0.5, 0.42, 0.8))
+        canvas.scene(scene)
+
+    for _ in range(RENDER_REPEAT):
+        render()
+        window.get_image_buffer_as_numpy()
+
+    render()
+
+    depth_buffer_field = ti.field(dtype=ti.f32, shape=(512, 512), offset=offset)
+    window.get_depth_buffer(depth_buffer_field)
+    verify_image(depth_buffer_field, "test_depth")
+    window.destroy()
+
+
 @pytest.mark.skipif(not _ti_core.GGUI_AVAILABLE, reason="GGUI Not Available")
 @test_utils.test(arch=supported_archs)
 def test_draw_lines():
diff --git a/tests/python/test_gui.py b/tests/python/test_gui.py
index 308c0209dc883..db05451322e49 100644
--- a/tests/python/test_gui.py
+++ b/tests/python/test_gui.py
@@ -29,3 +29,58 @@ def paint(c: dtype):
     image = ti.tools.imread(image_path)
     delta = (image - i).sum()
     assert delta == 0, "Expected image difference to be 0 but got {} instead.".format(delta)
+
+
+@pytest.mark.parametrize("vector_field", [True, False])
+@pytest.mark.parametrize("dtype", [ti.u8, ti.f32, ti.f64])
+@pytest.mark.parametrize("color", [0, 32, 64, 128, 255])
+@pytest.mark.parametrize("offset", [(-150, -150), (0, 0), (150, 150)])
+@test_utils.test(arch=get_host_arch_list())
+def test_set_image_with_offset(vector_field, dtype, color, offset):
+    n = 300
+    shape = (n, n)
+
+    img = (
+        ti.Vector.field(dtype=dtype, n=3, shape=shape, offset=offset)
+        if vector_field
+        else ti.field(dtype=dtype, shape=shape, offset=offset)
+    )
+    img.fill(color if dtype is ti.u8 else color * 1.0 / 255)
+
+    gui = ti.GUI(name="test", res=shape, show_gui=False, fast_gui=False)
+    gui.set_image(img)
+
+    image_path = test_utils.make_temp_file(suffix=".png")
+    gui.show(image_path)
+    image = ti.tools.imread(image_path)
+    delta = (image - color).sum()
+    assert delta == 0, "Expected image difference to be 0 but got {} instead.".format(delta)
+
+
+@pytest.mark.parametrize("channel", [3, 4])
+@pytest.mark.parametrize("dtype", [ti.u8, ti.f32, ti.f64])
+@pytest.mark.parametrize("color", [0, 32, 64, 128, 255])
+@pytest.mark.parametrize("offset", [(-150, -150), (0, 0), (150, 150)])
+@test_utils.test(arch=get_host_arch_list())
+def test_set_image_fast_gui_with_offset(channel, dtype, color, offset):
+    n = 300
+    shape = (n, n)
+
+    img = ti.Vector.field(dtype=dtype, n=channel, shape=shape, offset=offset)
+    img.fill(color if dtype is ti.u8 else color * 1.0 / 255)
+
+    gui = ti.GUI(name="test", res=shape, show_gui=False, fast_gui=True)
+    gui.set_image(img)
+    fast_image = gui.img
+
+    alpha = 0xFF << 24
+    from taichi._lib.utils import get_os_name  # pylint: disable=C0415
+
+    rgb_color = (
+        (color << 16) + (color << 8) + color
+        if ti.static(get_os_name() != "osx")
+        else (color << 16) + (color << 8) + color + alpha
+    )
+    ground_truth = np.full(n * n, rgb_color, dtype=np.uint32)
+
+    assert np.allclose(fast_image, ground_truth)
diff --git a/tests/python/test_matrix.py b/tests/python/test_matrix.py
index 278859b4ed6d1..ea666a12a8f26 100644
--- a/tests/python/test_matrix.py
+++ b/tests/python/test_matrix.py
@@ -1305,6 +1305,57 @@ def access_mat(i: ti.i32, j: ti.i32):
     # access_mat(3, -1)
 
 
+@pytest.mark.parametrize("dtype", [ti.i32, ti.f32, ti.i64, ti.f64])
+@pytest.mark.parametrize("shape", [(8,), (6, 12)])
+@pytest.mark.parametrize("offset", [0, -4, 4])
+@pytest.mark.parametrize("m, n", [(3, 4)])
+@test_utils.test(arch=get_host_arch_list())
+def test_matrix_from_numpy_with_offset(dtype, shape, offset, m, n):
+    import numpy as np
+
+    x = ti.Matrix.field(dtype=dtype, m=m, n=n, shape=shape, offset=[offset] * len(shape))
+    # use the corresponding dtype for the numpy array.
+    numpy_dtypes = {
+        ti.i32: np.int32,
+        ti.f32: np.float32,
+        ti.f64: np.float64,
+        ti.i64: np.int64,
+    }
+    numpy_shape = ((shape,) if isinstance(shape, int) else shape) + (n, m)
+    arr = np.ones(numpy_shape, dtype=numpy_dtypes[dtype])
+    x.from_numpy(arr)
+
+    @ti.kernel
+    def func():
+        for I in ti.grouped(x):
+            assert all(abs(x[I] - 1.0) < 1e-6)
+
+    func()
+
+
+@pytest.mark.parametrize("dtype", [ti.i32, ti.f32, ti.i64, ti.f64])
+@pytest.mark.parametrize("shape", [(8,), (6, 12)])
+@pytest.mark.parametrize("offset", [0, -4, 4])
+@pytest.mark.parametrize("m, n", [(3, 4)])
+@test_utils.test(arch=get_host_arch_list())
+def test_matrix_to_numpy_with_offset(dtype, shape, offset, m, n):
+    import numpy as np
+
+    x = ti.Matrix.field(dtype=dtype, m=m, n=n, shape=shape, offset=[offset] * len(shape))
+    x.fill(1.0)
+    # use the corresponding dtype for the numpy array.
+    numpy_dtypes = {
+        ti.i32: np.int32,
+        ti.f32: np.float32,
+        ti.f64: np.float64,
+        ti.i64: np.int64,
+    }
+    numpy_shape = ((shape,) if isinstance(shape, int) else shape) + (n, m)
+    arr = x.to_numpy()
+
+    assert np.allclose(arr, np.ones(numpy_shape, dtype=numpy_dtypes[dtype]))
+
+
 @test_utils.test()
 def test_matrix_dtype():
     a = ti.types.vector(3, dtype=ti.f32)([0, 1, 2])
diff --git a/tests/python/test_offset.py b/tests/python/test_offset.py
index 58334b2fc85c7..a6e99a2e6848a 100644
--- a/tests/python/test_offset.py
+++ b/tests/python/test_offset.py
@@ -3,6 +3,7 @@
 
 import taichi as ti
 from tests import test_utils
+import numpy as np
 
 
 @test_utils.test()
@@ -144,3 +145,20 @@ def test_offset_must_throw_matrix():
         a = ti.Matrix.field(3, 3, dtype=ti.i32, shape=(32, 16, 8), offset=(32, 16))
     with pytest.raises(ti.TaichiCompilationError, match="shape cannot be None when offset is set"):
         b = ti.Matrix.field(3, 3, dtype=ti.i32, shape=None, offset=(32, 16))
+
+
+@pytest.mark.parametrize("offset", [(0, 0), (-1, -1), (2, 2), (-23333, -23333), (23333, 23333)])
+@test_utils.test(arch=get_host_arch_list())
+def test_field_with_offset_print(offset):
+    val = ti.field(dtype=ti.f32, shape=(3, 3), offset=offset)
+    val.fill(1.0)
+    print(val)
+
+
+@pytest.mark.parametrize("offset", [(0, 0), (-1, -1), (2, 2), (-23333, -23333), (23333, 23333)])
+@test_utils.test(arch=get_host_arch_list())
+def test_field_with_offset_to_numpy(offset):
+    shape = (3, 3)
+    val = ti.field(dtype=ti.f32, shape=shape, offset=offset)
+    val.fill(1.0)
+    assert np.allclose(val.to_numpy(), np.ones(shape, dtype=np.float32))
diff --git a/tests/python/test_scan.py b/tests/python/test_scan.py
index 55ff669aee5ee..48b4281ee3017 100644
--- a/tests/python/test_scan.py
+++ b/tests/python/test_scan.py
@@ -1,3 +1,4 @@
+import pytest
 import taichi as ti
 from tests import test_utils
 
@@ -30,3 +31,31 @@ def fill():
     test_scan_for_dtype(ti.i32, 512)
     test_scan_for_dtype(ti.i32, 1024)
     test_scan_for_dtype(ti.i32, 4096)
+
+
+@pytest.mark.parametrize("dtype", [ti.i32])
+@pytest.mark.parametrize("N", [512, 1024, 4096])
+@pytest.mark.parametrize("offset", [0, -1, 1, 256, -256, -23333, 23333])
+@test_utils.test(arch=[ti.cuda, ti.vulkan], exclude=[(ti.vulkan, "Darwin")])
+def test_scan_with_offset(dtype, N, offset):
+    arr = ti.field(dtype, N, offset=offset)
+    arr_aux = ti.field(dtype, N, offset=offset)
+
+    @ti.kernel
+    def fill():
+        for i in arr:
+            arr[i] = ti.random() * N
+            arr_aux[i] = arr[i]
+
+    fill()
+
+    # Performing an inclusive in-place parallel prefix sum;
+    # only one executor is needed for a given length.
+    executor = ti.algorithms.PrefixSumExecutor(N)
+
+    executor.run(arr)
+
+    cur_sum = 0
+    for i in range(N):
+        cur_sum += arr_aux[i + offset]
+        assert arr[i + offset] == cur_sum
diff --git a/tests/python/test_sort.py b/tests/python/test_sort.py
index ab8477a4c8018..cd93588cacd49 100644
--- a/tests/python/test_sort.py
+++ b/tests/python/test_sort.py
@@ -1,3 +1,4 @@
+import pytest
 import taichi as ti
 from tests import test_utils
 
@@ -31,3 +32,29 @@ def fill():
     test_sort_for_dtype(ti.f32, 1)
     test_sort_for_dtype(ti.f32, 256)
     test_sort_for_dtype(ti.f32, 100001)
+
+
+@pytest.mark.parametrize("dtype", [ti.i32, ti.f32])
+@pytest.mark.parametrize("N", [1, 256, 100001])
+@pytest.mark.parametrize("offset", [0, -1, 1, 128, -128, -23333, 23333])
+@test_utils.test()
+def test_sort_with_offset(dtype, N, offset):
+    keys = ti.field(dtype, N, offset=offset)
+    values = ti.field(dtype, N, offset=offset)
+
+    @ti.kernel
+    def fill():
+        for i in keys:
+            keys[i] = ti.random() * N
+            values[i] = keys[i]
+
+    fill()
+    ti.algorithms.parallel_sort(keys, values)
+
+    keys_host = keys.to_numpy()
+    values_host = values.to_numpy()
+
+    for i in range(N):
+        if i < N - 1:
+            assert keys_host[i] <= keys_host[i + 1]
+        assert keys_host[i] == values_host[i]
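
For reference, a small usage sketch of the behavior the new tests pin down (this assumes the patch above is applied; the field and variable names are just examples): fields declared with an offset now go through to_numpy()/from_numpy(), copy_from(), the GUI image routines, and the sort/scan utilities like zero-based fields, with the exported NumPy array always indexed from zero.

import numpy as np
import taichi as ti

ti.init()

# Valid indices are (-1..1, -1..1) instead of (0..2, 0..2).
val = ti.field(dtype=ti.f32, shape=(3, 3), offset=(-1, -1))
val.fill(1.0)

# The offset affects only how the field is indexed, not the exported layout:
# to_numpy() returns a plain zero-based (3, 3) array.
arr = val.to_numpy()
assert arr.shape == (3, 3)
assert np.allclose(arr, np.ones((3, 3), dtype=np.float32))

# from_numpy() likewise expects a zero-based array of the same shape;
# array element (0, 0) lands at the field's lowest index (-1, -1).
val.from_numpy(2.0 * np.ones((3, 3), dtype=np.float32))
assert val[-1, -1] == 2.0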