diff --git a/.github/workflows/scripts/unix_test.sh b/.github/workflows/scripts/unix_test.sh
index 38c93b3558ba4e..756fac583ae405 100755
--- a/.github/workflows/scripts/unix_test.sh
+++ b/.github/workflows/scripts/unix_test.sh
@@ -9,6 +9,7 @@ export TAICHI_AOT_FOLDER_PATH="taichi/tests"
 export TI_SKIP_VERSION_CHECK=ON
 export LD_LIBRARY_PATH=$PWD/build/:$LD_LIBRARY_PATH
 export TI_OFFLINE_CACHE_FILE_PATH=$PWD/.cache/taichi
+export TI_SKIP_CPP_TESTS=1  # NOTE(review): presumably makes the test driver skip the C++ test suite; confirm this is meant for CI
 
 
 # Disable compat tests to save time.
diff --git a/python/taichi/lang/_ndarray.py b/python/taichi/lang/_ndarray.py
index cb1e37c726b2ba..ae46c26c55aa6c 100644
--- a/python/taichi/lang/_ndarray.py
+++ b/python/taichi/lang/_ndarray.py
@@ -243,6 +243,7 @@ def __init__(self, dtype, arr_shape):
 
     def __del__(self):
         if impl is not None and impl.get_runtime() is not None and impl.get_runtime().prog is not None:
+            # Hand the array back to the program only while impl, its runtime and the program all still exist.
             impl.get_runtime().prog.delete_ndarray(self.arr)
 
     @property
diff --git a/python/taichi/lang/ast/ast_transformer.py b/python/taichi/lang/ast/ast_transformer.py
index d44f7671e41a51..0186a6b0babcf1 100644
--- a/python/taichi/lang/ast/ast_transformer.py
+++ b/python/taichi/lang/ast/ast_transformer.py
@@ -12,6 +12,7 @@
 from taichi.lang import _ndarray, any_array, expr, impl, kernel_arguments, matrix, mesh
 from taichi.lang import ops as ti_ops
 from taichi.lang._ndrange import _Ndrange, ndrange
+from taichi.lang.argpack import ArgPackType
 from taichi.lang.ast.ast_transformer_utils import Builder, LoopStatus, ReturnStatus
 from taichi.lang.ast.symbol_resolver import ASTResolver
 from taichi.lang.exception import (
@@ -601,6 +602,45 @@ def build_FunctionDef(ctx, node):
         assert args.kw_defaults == []
         assert args.kwarg is None
 
+        def decl_and_create_variable(annotation, name, arg_features):
+            """Declare one kernel parameter and return the value to bind to it.
+
+            An ArgPackType is expanded recursively, one declaration per member;
+            template parameters are read back from ``ctx.global_vars``; every other
+            annotation is forwarded to a single ``kernel_arguments.decl_*`` helper.
+            """
+            # Every non-reference parameter is recorded in ctx.kernel_args, pack members included.
+            if not isinstance(annotation, primitive_types.RefType):
+                ctx.kernel_args.append(name)
+            if isinstance(annotation, ArgPackType):
+                # Members are visited in ``annotation.members`` order; features are matched by position.
+                packed = {
+                    member: decl_and_create_variable(member_anno, member, arg_features[pos])
+                    for pos, (member, member_anno) in enumerate(annotation.members.items())
+                }
+                return kernel_arguments.decl_argpack_arg(annotation, packed)
+            if isinstance(annotation, annotations.template):
+                # Nothing is declared for a template parameter.
+                return ctx.global_vars[name]
+            if isinstance(annotation, annotations.sparse_matrix_builder):
+                return kernel_arguments.decl_sparse_matrix(to_taichi_type(arg_features), name)
+            if isinstance(annotation, ndarray_type.NdarrayType):
+                feats = arg_features
+                return kernel_arguments.decl_ndarray_arg(
+                    to_taichi_type(feats[0]), feats[1], feats[2], feats[3], name, feats[4]
+                )
+            if isinstance(annotation, texture_type.TextureType):
+                return kernel_arguments.decl_texture_arg(arg_features[0], name)
+            if isinstance(annotation, texture_type.RWTextureType):
+                feats = arg_features
+                return kernel_arguments.decl_rw_texture_arg(feats[0], feats[1], feats[2], name)
+            if isinstance(annotation, MatrixType):
+                return kernel_arguments.decl_matrix_arg(annotation, name)
+            if isinstance(annotation, StructType):
+                return kernel_arguments.decl_struct_arg(annotation, name)
+            # Remaining annotations are declared as scalar arguments.
+            return kernel_arguments.decl_scalar_arg(annotation, name)
+
         def transform_as_kernel():
             # Treat return type
             if node.returns is not None:
@@ -608,60 +648,21 @@ def transform_as_kernel():
             impl.get_runtime().compiling_callable.finalize_rets()
 
             for i, arg in enumerate(args.args):
-                if not isinstance(ctx.func.arguments[i].annotation, primitive_types.RefType):
-                    ctx.kernel_args.append(arg.arg)
-                if isinstance(ctx.func.arguments[i].annotation, annotations.template):
-                    ctx.create_variable(arg.arg, ctx.global_vars[arg.arg])
-                elif isinstance(ctx.func.arguments[i].annotation, annotations.sparse_matrix_builder):
-                    ctx.create_variable(
-                        arg.arg,
-                        kernel_arguments.decl_sparse_matrix(
-                            to_taichi_type(ctx.arg_features[i]),
-                            ctx.func.arguments[i].name,
-                        ),
-                    )
-                elif isinstance(ctx.func.arguments[i].annotation, ndarray_type.NdarrayType):
-                    ctx.create_variable(
-                        arg.arg,
-                        kernel_arguments.decl_ndarray_arg(
-                            to_taichi_type(ctx.arg_features[i][0]),
-                            ctx.arg_features[i][1],
-                            ctx.arg_features[i][2],
-                            ctx.arg_features[i][3],
-                            ctx.func.arguments[i].name,
-                            ctx.arg_features[i][4],
-                        ),
-                    )
-                elif isinstance(ctx.func.arguments[i].annotation, texture_type.TextureType):
-                    ctx.create_variable(
-                        arg.arg,
-                        kernel_arguments.decl_texture_arg(ctx.arg_features[i][0], ctx.func.arguments[i].name),
-                    )
-                elif isinstance(ctx.func.arguments[i].annotation, texture_type.RWTextureType):
+                if isinstance(ctx.func.arguments[i].annotation, ArgPackType):
+                    d = {}
+                    for j, (name, anno) in enumerate(ctx.func.arguments[i].annotation.members.items()):
+                        d[name] = decl_and_create_variable(anno, name, ctx.arg_features[i][j])
+                    ctx.create_variable(arg.arg, kernel_arguments.decl_argpack_arg(ctx.func.arguments[i].annotation, d))
+                else:
                     ctx.create_variable(
                         arg.arg,
-                        kernel_arguments.decl_rw_texture_arg(
-                            ctx.arg_features[i][0],
-                            ctx.arg_features[i][1],
-                            ctx.arg_features[i][2],
+                        decl_and_create_variable(
+                            ctx.func.arguments[i].annotation,
                             ctx.func.arguments[i].name,
+                            ctx.arg_features[i] if ctx.arg_features is not None else None,
                         ),
                     )
-                elif isinstance(ctx.func.arguments[i].annotation, MatrixType):
-                    ctx.create_variable(
-                        arg.arg,
-                        kernel_arguments.decl_matrix_arg(ctx.func.arguments[i].annotation, ctx.func.arguments[i].name),
-                    )
-                elif isinstance(ctx.func.arguments[i].annotation, StructType):
-                    ctx.create_variable(
-                        arg.arg,
-                        kernel_arguments.decl_struct_arg(ctx.func.arguments[i].annotation, ctx.func.arguments[i].name),
-                    )
-                else:
-                    ctx.create_variable(
-                        arg.arg,
-                        kernel_arguments.decl_scalar_arg(ctx.func.arguments[i].annotation, ctx.func.arguments[i].name),
-                    )
+
             impl.get_runtime().compiling_callable.finalize_params()
             # remove original args
             node.args.args = []
diff --git a/python/taichi/lang/impl.py b/python/taichi/lang/impl.py
index 5c67e51d85f494..699a68922623ee 100644
--- a/python/taichi/lang/impl.py
+++ b/python/taichi/lang/impl.py
@@ -377,6 +377,7 @@ def set_default_ip(self, ip):
     def create_program(self):
+        # Create the Program on first use; later calls leave the existing instance untouched.
         if self.prog is None:
             self.prog = _ti_core.Program()
 
     @staticmethod
     def materialize_root_fb(is_first_call):
@@ -480,6 +481,7 @@ def _register_signal_handlers(self):
 
     def clear(self):
         if self.prog:
+            # Finalize the program before dropping the reference to it.
             self.prog.finalize()
             self.prog = None
         self._signal_handler_registry = None
diff --git a/python/taichi/lang/kernel_arguments.py b/python/taichi/lang/kernel_arguments.py
index c15b8555a8f340..6daedd27acfc80 100644
--- a/python/taichi/lang/kernel_arguments.py
+++ b/python/taichi/lang/kernel_arguments.py
@@ -95,6 +95,10 @@ def decl_struct_arg(structtype, name):
     return structtype.from_taichi_object(arg_load)
 
 
+def decl_argpack_arg(argpacktype, member_dict):
+    return argpacktype.from_taichi_object(member_dict)  # member_dict: member name -> already declared value
+
+
 def decl_sparse_matrix(dtype, name):
     value_type = cook_dtype(dtype)
     ptr_type = cook_dtype(u64)
diff --git a/python/taichi/lang/kernel_impl.py b/python/taichi/lang/kernel_impl.py
index 010ffe50a94d72..cd17b3953a9d3a 100644
--- a/python/taichi/lang/kernel_impl.py
+++ b/python/taichi/lang/kernel_impl.py
@@ -13,6 +13,7 @@
 from taichi._lib import core as _ti_core
 from taichi.lang import impl, ops, runtime_ops
 from taichi.lang._wrap_inspect import getsourcefile, getsourcelines
+from taichi.lang.argpack import ArgPackType, ArgPack
 from taichi.lang.ast import (
     ASTTransformerContext,
     KernelSimplicityASTChecker,
@@ -368,6 +369,13 @@ def extract_arg(arg, anno):
 
             # [Primitive arguments] Return the value
             return arg
+        if isinstance(anno, ArgPackType):
+            if not isinstance(arg, ArgPack):
+                raise TaichiRuntimeTypeError(f"Argument must be a argument pack, got {type(arg)}")
+            return tuple(  # one extracted feature per member, in ``anno.members`` order
+                TaichiCallableTemplateMapper.extract_arg(arg[name], dtype)
+                for name, dtype in anno.members.items()
+            )
         if isinstance(anno, texture_type.TextureType):
             if not isinstance(arg, taichi.lang._texture.Texture):
                 raise TaichiRuntimeTypeError(f"Argument must be a texture, got {type(arg)}")
@@ -545,6 +553,8 @@ def extract_arguments(self):
                     pass
                 elif isinstance(annotation, StructType):
                     pass
+                elif isinstance(annotation, ArgPackType):
+                    pass
                 else:
                     raise TaichiSyntaxError(f"Invalid type annotation (argument {i}) of Taichi kernel: {annotation}")
             self.arguments.append(KernelArgument(annotation, param.name, param.default))
@@ -603,189 +613,210 @@ def taichi_ast_generator(kernel_cxx):
     def launch_kernel(self, t_kernel, *args):
         assert len(args) == len(self.arguments), f"{len(self.arguments)} arguments needed but {len(args)} provided"
 
-        tmps = []
+        self.tmps = []
         callbacks = []
 
         actual_argument_slot = 0
         launch_ctx = t_kernel.make_launch_context()
         max_arg_num = 64
         exceed_max_arg_num = False
-        for i, v in enumerate(args):
-            needed = self.arguments[i].annotation
-            if isinstance(needed, template):
+        for i, val in enumerate(args):
+            _needed = self.arguments[i].annotation
+            if isinstance(_needed, template):
                 continue
-            if actual_argument_slot >= max_arg_num:
-                exceed_max_arg_num = True
-                break
-            provided = type(v)
-            # Note: do not use sth like "needed == f32". That would be slow.
-            if id(needed) in primitive_types.real_type_ids:
-                if not isinstance(v, (float, int, np.floating, np.integer)):
-                    raise TaichiRuntimeTypeError.get(i, needed.to_string(), provided)
-                launch_ctx.set_arg_float(actual_argument_slot, float(v))
-            elif id(needed) in primitive_types.integer_type_ids:
-                if not isinstance(v, (int, np.integer)):
-                    raise TaichiRuntimeTypeError.get(i, needed.to_string(), provided)
-                if is_signed(cook_dtype(needed)):
-                    launch_ctx.set_arg_int(actual_argument_slot, int(v))
-                else:
-                    launch_ctx.set_arg_uint(actual_argument_slot, int(v))
-            elif isinstance(needed, sparse_matrix_builder):
-                # Pass only the base pointer of the ti.types.sparse_matrix_builder() argument
-                launch_ctx.set_arg_uint(actual_argument_slot, v._get_ndarray_addr())
-            elif isinstance(needed, ndarray_type.NdarrayType) and isinstance(v, taichi.lang._ndarray.Ndarray):
-                v_primal = v.arr
-                v_grad = v.grad.arr if v.grad else None
-                if v_grad is None:
-                    launch_ctx.set_arg_ndarray(actual_argument_slot, v_primal)
-                else:
-                    launch_ctx.set_arg_ndarray_with_grad(actual_argument_slot, v_primal, v_grad)
-            elif isinstance(needed, texture_type.TextureType) and isinstance(v, taichi.lang._texture.Texture):
-                launch_ctx.set_arg_texture(actual_argument_slot, v.tex)
-            elif isinstance(needed, texture_type.RWTextureType) and isinstance(v, taichi.lang._texture.Texture):
-                launch_ctx.set_arg_rw_texture(actual_argument_slot, v.tex)
-            elif isinstance(needed, ndarray_type.NdarrayType):
-                # Element shapes are already specialized in Taichi codegen.
-                # The shape information for element dims are no longer needed.
-                # Therefore we strip the element shapes from the shape vector,
-                # so that it only holds "real" array shapes.
-                is_soa = needed.layout == Layout.SOA
-                array_shape = v.shape
-                if functools.reduce(operator.mul, array_shape, 1) > np.iinfo(np.int32).max:
-                    warnings.warn(
-                        "Ndarray index might be out of int32 boundary but int64 indexing is not supported yet."
-                    )
-                if needed.dtype is None or id(needed.dtype) in primitive_types.type_ids:
-                    element_dim = 0
-                else:
-                    element_dim = needed.dtype.ndim
-                    array_shape = v.shape[element_dim:] if is_soa else v.shape[:-element_dim]
-                if isinstance(v, np.ndarray):
-                    if v.flags.c_contiguous:
-                        launch_ctx.set_arg_external_array_with_shape(
-                            actual_argument_slot, int(v.ctypes.data), v.nbytes, array_shape, 0
-                        )
-                    elif v.flags.f_contiguous:
-                        # TODO: A better way that avoids copying is saving strides info.
-                        tmp = np.ascontiguousarray(v)
-                        # Purpose: DO NOT GC |tmp|!
-                        tmps.append(tmp)
-
-                        def callback(original, updated):
-                            np.copyto(original, np.asfortranarray(updated))
-
-                        callbacks.append(functools.partial(callback, v, tmp))
-                        launch_ctx.set_arg_external_array_with_shape(
-                            actual_argument_slot, int(tmp.ctypes.data), tmp.nbytes, array_shape, 0
-                        )
+            needed_list, provided_list = [], []
+
+            def flatten_argpack(argpack, argpack_type):  # appends leaf (annotation, value) pairs, recursing into nested packs
+                for name, anno in argpack_type.members.items():
+                    if isinstance(anno, ArgPackType):
+                        flatten_argpack(argpack[name], anno)
                     else:
-                        raise ValueError(
-                            "Non contiguous numpy arrays are not supported, please call np.ascontiguousarray(arr) before passing it into taichi kernel."
-                        )
-                elif has_pytorch():
-                    import torch  # pylint: disable=C0415
+                        needed_list.append(anno)
+                        provided_list.append(argpack[name])
 
-                    if isinstance(v, torch.Tensor):
-                        if not v.is_contiguous():
-                            raise ValueError(
-                                "Non contiguous tensors are not supported, please call tensor.contiguous() before passing it into taichi kernel."
-                            )
-                        taichi_arch = self.runtime.prog.config().arch
-
-                        def get_call_back(u, v):
-                            def call_back():
-                                u.copy_(v)
-
-                            return call_back
-
-                        # FIXME: only allocate when launching grad kernel
-                        if v.requires_grad and v.grad is None:
-                            v.grad = torch.zeros_like(v)
-
-                        tmp = v
-                        if str(v.device).startswith("cuda") and taichi_arch != _ti_core.Arch.cuda:
-                            # Getting a torch CUDA tensor on Taichi non-cuda arch:
-                            # We just replace it with a CPU tensor and by the end of kernel execution we'll use the callback to copy the values back to the original CUDA tensor.
-                            host_v = v.to(device="cpu", copy=True)
-                            tmp = host_v
-                            callbacks.append(get_call_back(v, host_v))
-
-                        launch_ctx.set_arg_external_array_with_shape(
-                            actual_argument_slot,
-                            int(tmp.data_ptr()),
-                            tmp.element_size() * tmp.nelement(),
-                            array_shape,
-                            int(v.grad.data_ptr()) if v.grad is not None else 0,
+            if isinstance(_needed, ArgPackType) and isinstance(val, ArgPack):
+                flatten_argpack(val, _needed)
+            else:
+                needed_list, provided_list = [_needed], [val]
+
+            for j, _v in enumerate(needed_list):
+                needed, provided, v = _v, type(provided_list[j]), provided_list[j]
+                if actual_argument_slot >= max_arg_num:
+                    exceed_max_arg_num = True
+                    break
+                # Note: do not use sth like "needed == f32". That would be slow.
+                if id(needed) in primitive_types.real_type_ids:
+                    if not isinstance(v, (float, int, np.floating, np.integer)):
+                        raise TaichiRuntimeTypeError.get(i, needed.to_string(), provided)
+                    launch_ctx.set_arg_float(actual_argument_slot, float(v))
+                elif id(needed) in primitive_types.integer_type_ids:
+                    if not isinstance(v, (int, np.integer)):
+                        raise TaichiRuntimeTypeError.get(i, needed.to_string(), provided)
+                    if is_signed(cook_dtype(needed)):
+                        launch_ctx.set_arg_int(actual_argument_slot, int(v))
+                    else:
+                        launch_ctx.set_arg_uint(actual_argument_slot, int(v))
+                elif isinstance(needed, sparse_matrix_builder):
+                    # Pass only the base pointer of the ti.types.sparse_matrix_builder() argument
+                    launch_ctx.set_arg_uint(actual_argument_slot, v._get_ndarray_addr())
+                elif isinstance(needed, ndarray_type.NdarrayType) and isinstance(v, taichi.lang._ndarray.Ndarray):
+                    v_primal = v.arr
+                    v_grad = v.grad.arr if v.grad else None
+                    if v_grad is None:
+                        launch_ctx.set_arg_ndarray(actual_argument_slot, v_primal)
+                    else:
+                        launch_ctx.set_arg_ndarray_with_grad(actual_argument_slot, v_primal, v_grad)
+                elif isinstance(needed, texture_type.TextureType) and isinstance(v, taichi.lang._texture.Texture):
+                    launch_ctx.set_arg_texture(actual_argument_slot, v.tex)
+                elif isinstance(needed, texture_type.RWTextureType) and isinstance(v, taichi.lang._texture.Texture):
+                    launch_ctx.set_arg_rw_texture(actual_argument_slot, v.tex)
+                elif isinstance(needed, ndarray_type.NdarrayType):
+                    # Element shapes are already specialized in Taichi codegen.
+                    # The shape information for element dims are no longer needed.
+                    # Therefore we strip the element shapes from the shape vector,
+                    # so that it only holds "real" array shapes.
+                    is_soa = needed.layout == Layout.SOA
+                    array_shape = v.shape
+                    if functools.reduce(operator.mul, array_shape, 1) > np.iinfo(np.int32).max:
+                        warnings.warn(
+                            "Ndarray index might be out of int32 boundary but int64 indexing is not supported yet."
                         )
+                    if needed.dtype is None or id(needed.dtype) in primitive_types.type_ids:
+                        element_dim = 0
                     else:
-                        raise TaichiRuntimeTypeError.get(i, needed.to_string(), v)
-                elif has_paddle():
-                    import paddle  # pylint: disable=C0415
-
-                    if isinstance(v, paddle.Tensor):
-                        # For now, paddle.fluid.core.Tensor._ptr() is only available on develop branch
-                        def get_call_back(u, v):
-                            def call_back():
-                                u.copy_(v, False)
-
-                            return call_back
-
-                        tmp = v.value().get_tensor()
-                        taichi_arch = self.runtime.prog.config().arch
-                        if v.place.is_gpu_place():
-                            if taichi_arch != _ti_core.Arch.cuda:
-                                # Paddle cuda tensor on Taichi non-cuda arch
-                                host_v = v.cpu()
-                                tmp = host_v.value().get_tensor()
+                        element_dim = needed.dtype.ndim
+                        array_shape = v.shape[element_dim:] if is_soa else v.shape[:-element_dim]
+                    if isinstance(v, np.ndarray):
+                        if v.flags.c_contiguous:
+                            launch_ctx.set_arg_external_array_with_shape(
+                                actual_argument_slot, int(v.ctypes.data), v.nbytes, array_shape, 0
+                            )
+                        elif v.flags.f_contiguous:
+                            # TODO: A better way that avoids copying is saving strides info.
+                            tmp = np.ascontiguousarray(v)
+                            # Purpose: DO NOT GC |tmp|!
+                            self.tmps.append(tmp)
+
+                            def callback(original, updated):
+                                np.copyto(original, np.asfortranarray(updated))
+
+                            callbacks.append(functools.partial(callback, v, tmp))
+                            launch_ctx.set_arg_external_array_with_shape(
+                                actual_argument_slot, int(tmp.ctypes.data), tmp.nbytes, array_shape, 0
+                            )
+                        else:
+                            raise ValueError(
+                                "Non contiguous numpy arrays are not supported, please call np.ascontiguousarray(arr) "
+                                "before passing it into taichi kernel."
+                            )
+                    elif has_pytorch():
+                        import torch  # pylint: disable=C0415
+
+                        if isinstance(v, torch.Tensor):
+                            if not v.is_contiguous():
+                                raise ValueError(
+                                    "Non contiguous tensors are not supported, please call tensor.contiguous() before "
+                                    "passing it into taichi kernel."
+                                )
+                            taichi_arch = self.runtime.prog.config().arch
+
+                            def get_call_back(u, v):
+                                def call_back():
+                                    u.copy_(v)
+
+                                return call_back
+
+                            # FIXME: only allocate when launching grad kernel
+                            if v.requires_grad and v.grad is None:
+                                v.grad = torch.zeros_like(v)
+
+                            tmp = v
+                            if str(v.device).startswith("cuda") and taichi_arch != _ti_core.Arch.cuda:
+                                # Getting a torch CUDA tensor on Taichi non-cuda arch:
+                                # We just replace it with a CPU tensor and by the end of kernel execution we'll use the
+                                # callback to copy the values back to the original CUDA tensor.
+                                host_v = v.to(device="cpu", copy=True)
+                                tmp = host_v
                                 callbacks.append(get_call_back(v, host_v))
-                        elif v.place.is_cpu_place():
-                            if taichi_arch == _ti_core.Arch.cuda:
-                                # Paddle cpu tensor on Taichi cuda arch
-                                gpu_v = v.cuda()
-                                tmp = gpu_v.value().get_tensor()
-                                callbacks.append(get_call_back(v, gpu_v))
+
+                            launch_ctx.set_arg_external_array_with_shape(
+                                actual_argument_slot,
+                                int(tmp.data_ptr()),
+                                tmp.element_size() * tmp.nelement(),
+                                array_shape,
+                                int(v.grad.data_ptr()) if v.grad is not None else 0,
+                            )
                         else:
-                            # Paddle do support many other backends like XPU, NPU, MLU, IPU
-                            raise TaichiRuntimeTypeError(f"Taichi do not support backend {v.place} that Paddle support")
-                        launch_ctx.set_arg_external_array_with_shape(
-                            actual_argument_slot, int(tmp._ptr()), v.element_size() * v.size, array_shape, 0
-                        )
+                            raise TaichiRuntimeTypeError.get(i, needed.to_string(), v)
+                    elif has_paddle():
+                        import paddle  # pylint: disable=C0415
+
+                        if isinstance(v, paddle.Tensor):
+                            # For now, paddle.fluid.core.Tensor._ptr() is only available on develop branch
+                            def get_call_back(u, v):
+                                def call_back():
+                                    u.copy_(v, False)
+
+                                return call_back
+
+                            tmp = v.value().get_tensor()
+                            taichi_arch = self.runtime.prog.config().arch
+                            if v.place.is_gpu_place():
+                                if taichi_arch != _ti_core.Arch.cuda:
+                                    # Paddle cuda tensor on Taichi non-cuda arch
+                                    host_v = v.cpu()
+                                    tmp = host_v.value().get_tensor()
+                                    callbacks.append(get_call_back(v, host_v))
+                            elif v.place.is_cpu_place():
+                                if taichi_arch == _ti_core.Arch.cuda:
+                                    # Paddle cpu tensor on Taichi cuda arch
+                                    gpu_v = v.cuda()
+                                    tmp = gpu_v.value().get_tensor()
+                                    callbacks.append(get_call_back(v, gpu_v))
+                            else:
+                                # Paddle do support many other backends like XPU, NPU, MLU, IPU
+                                raise TaichiRuntimeTypeError(
+                                    f"Taichi do not support backend {v.place} that Paddle support"
+                                )
+                            launch_ctx.set_arg_external_array_with_shape(
+                                actual_argument_slot, int(tmp._ptr()), v.element_size() * v.size, array_shape, 0
+                            )
+                        else:
+                            raise TaichiRuntimeTypeError.get(i, needed.to_string(), v)
                     else:
                         raise TaichiRuntimeTypeError.get(i, needed.to_string(), v)
-                else:
-                    raise TaichiRuntimeTypeError.get(i, needed.to_string(), v)
 
-            elif isinstance(needed, MatrixType):
-                if needed.dtype in primitive_types.real_types:
+                elif isinstance(needed, MatrixType):
+                    if needed.dtype in primitive_types.real_types:
 
-                    def cast_func(x):
-                        if not isinstance(x, (int, float, np.integer, np.floating)):
-                            raise TaichiRuntimeTypeError.get(i, needed.dtype.to_string(), type(x))
-                        return float(x)
+                        def cast_func(x):
+                            if not isinstance(x, (int, float, np.integer, np.floating)):
+                                raise TaichiRuntimeTypeError.get(i, needed.dtype.to_string(), type(x))
+                            return float(x)
 
-                elif needed.dtype in primitive_types.integer_types:
+                    elif needed.dtype in primitive_types.integer_types:
 
-                    def cast_func(x):
-                        if not isinstance(x, (int, np.integer)):
-                            raise TaichiRuntimeTypeError.get(i, needed.dtype.to_string(), type(x))
-                        return int(x)
+                        def cast_func(x):
+                            if not isinstance(x, (int, np.integer)):
+                                raise TaichiRuntimeTypeError.get(i, needed.dtype.to_string(), type(x))
+                            return int(x)
 
-                else:
-                    raise ValueError(f"Matrix dtype {needed.dtype} is not integer type or real type.")
+                    else:
+                        raise ValueError(f"Matrix dtype {needed.dtype} is not integer type or real type.")
 
-                if needed.ndim == 2:
-                    v = [cast_func(v[i, j]) for i in range(needed.n) for j in range(needed.m)]
+                    if needed.ndim == 2:
+                        v = [cast_func(v[i, j]) for i in range(needed.n) for j in range(needed.m)]
+                    else:
+                        v = [cast_func(v[i]) for i in range(needed.n)]
+                    v = needed(*v)
+                    needed.set_kernel_struct_args(v, launch_ctx, (actual_argument_slot,))
+                elif isinstance(needed, StructType):
+                    if not isinstance(v, needed):
+                        raise TaichiRuntimeTypeError.get(i, str(needed), provided)
+                    needed.set_kernel_struct_args(v, launch_ctx, (actual_argument_slot,))
                 else:
-                    v = [cast_func(v[i]) for i in range(needed.n)]
-                v = needed(*v)
-                needed.set_kernel_struct_args(v, launch_ctx, (actual_argument_slot,))
-            elif isinstance(needed, StructType):
-                if not isinstance(v, needed):
-                    raise TaichiRuntimeTypeError.get(i, str(needed), provided)
-                needed.set_kernel_struct_args(v, launch_ctx, (actual_argument_slot,))
-            else:
-                raise ValueError(f"Argument type mismatch. Expecting {needed}, got {type(v)}.")
-            actual_argument_slot += 1
+                    raise ValueError(f"Argument type mismatch. Expecting {needed}, got {type(v)}.")
+                actual_argument_slot += 1
 
         if exceed_max_arg_num:
             raise TaichiRuntimeError(
diff --git a/taichi/program/ndarray.cpp b/taichi/program/ndarray.cpp
index 6aa326fbb25848..d4eb9951bbed45 100644
--- a/taichi/program/ndarray.cpp
+++ b/taichi/program/ndarray.cpp
@@ -35,6 +35,8 @@ Ndarray::Ndarray(Program *prog,
                                 std::multiplies<>())),
       element_size_(data_type_size(dtype)),
       prog_(prog) {
+  TI_INFO("Ndarray {} is created.", (long long)this);
+  TI_FLUSH_LOGGER
   // Now that we have two shapes which may be concatenated differently
   // depending on layout, total_shape_ comes handy.
   total_shape_ = shape;
@@ -71,6 +73,8 @@ Ndarray::Ndarray(DeviceAllocation &devalloc,
                                 1,
                                 std::multiplies<>())),
       element_size_(data_type_size(dtype)) {
+  TI_INFO("Ndarray {} is created.", (long long)this);
+  TI_FLUSH_LOGGER
   // When element_shape is specified but layout is not, default layout is AOS.
   auto element_shape = data_type_shape(dtype);
   if (!element_shape.empty() && layout == ExternalArrayLayout::kNull) {
@@ -109,6 +113,8 @@ Ndarray::Ndarray(DeviceAllocation &devalloc,
 }
 
 Ndarray::~Ndarray() {
+  TI_INFO("Ndarray {} is released.", (long long)this);
+  TI_FLUSH_LOGGER
   if (prog_) {
     // prog_->flush();
     ndarray_alloc_.device->dealloc_memory(ndarray_alloc_);
diff --git a/taichi/program/program.cpp b/taichi/program/program.cpp
index f21579baf20b70..8932e2d149de86 100644
--- a/taichi/program/program.cpp
+++ b/taichi/program/program.cpp
@@ -52,6 +52,7 @@ std::atomic<int> Program::num_instances_;
 
 Program::Program(Arch desired_arch) : snode_rw_accessors_bank_(this) {
   TI_TRACE("Program initializing...");
+  TI_INFO("Program initializing, this = {}", (long long)this);
 
   // For performance considerations and correctness of QuantFloatType
   // operations, we force floating-point operations to flush to zero on all
@@ -328,6 +329,7 @@ uint64 Program::fetch_result_uint64(int i) {
 }
 
 void Program::finalize() {
+  TI_INFO("Program finalizing, this = {}", (long long)this);
   if (finalized_) {
     return;
   }
@@ -368,12 +370,12 @@ std::size_t Program::get_snode_num_dynamically_allocated(SNode *snode) {
   return program_impl_->get_snode_num_dynamically_allocated(snode,
                                                             result_buffer);
 }
-
+static std::mutex mutex;  // Serializes access to Program::ndarrays_.
 Ndarray *Program::create_ndarray(const DataType type,
                                  const std::vector<int> &shape,
                                  ExternalArrayLayout layout,
                                  bool zero_fill) {
-  auto arr = std::make_unique<Ndarray>(this, type, shape, layout);
+  auto arr = std::make_shared<Ndarray>(this, type, shape, layout);
   if (zero_fill) {
     Arch arch = compile_config().arch;
     if (arch_is_cpu(arch) || arch == Arch::cuda || arch == Arch::amdgpu) {
@@ -391,7 +393,9 @@ Ndarray *Program::create_ndarray(const DataType type,
     }
   }
   auto arr_ptr = arr.get();
-  ndarrays_.insert({arr_ptr, std::move(arr)});
+  // Scoped guard: the lock is released even if insert() throws.
+  std::lock_guard<std::mutex> guard(mutex);
+  ndarrays_.insert({arr_ptr, std::move(arr)});
   return arr_ptr;
 }
 
@@ -407,10 +411,27 @@ void Program::delete_ndarray(Ndarray *ndarray) {
   // runtime instead of this giant program and it should be freed when:
   // - Python GC signals taichi that it's no longer useful
   // - All kernels using it are executed.
+  // Hold the lock via RAII so it is released on every exit path.
+  std::lock_guard<std::mutex> guard(mutex);
+  TI_INFO("this = {}", (long long)this);
+  TI_INFO("bucket count = {}", ndarrays_.bucket_count());
+  TI_FLUSH_LOGGER
+  TI_ASSERT(program_impl_.get() != nullptr);
+  TI_ASSERT(ndarray != nullptr);
+  TI_INFO("ndarray ptr = {}", (long long)ndarray);
+  TI_INFO("ndarray count = {}", ndarrays_.count(ndarray));
+  // Only dereference ndarray once it is known to be tracked: a pointer that
+  // is absent from ndarrays_ is not owned here and may be dangling.
+  if (ndarrays_.count(ndarray)) {
+    TI_INFO("alloc_id = {}", ndarray->ndarray_alloc_.alloc_id);
+    TI_INFO("used in kernel = {}",
+            program_impl_->used_in_kernel(ndarray->ndarray_alloc_.alloc_id));
+  }
+  TI_FLUSH_LOGGER
   if (ndarrays_.count(ndarray) &&
       !program_impl_->used_in_kernel(ndarray->ndarray_alloc_.alloc_id)) {
     ndarrays_.erase(ndarray);
   }
 }
 
 Texture *Program::create_texture(BufferFormat buffer_format,
diff --git a/taichi/program/program.h b/taichi/program/program.h
index f0a53c40cf94ba..cbff4571b241f2 100644
--- a/taichi/program/program.h
+++ b/taichi/program/program.h
@@ -336,7 +336,7 @@ class TI_DLL_EXPORT Program {
   bool finalized_{false};
 
   // TODO: Move ndarrays_ and textures_ to be managed by runtime
-  std::unordered_map<void *, std::unique_ptr<Ndarray>> ndarrays_;
+  std::unordered_map<void *, std::shared_ptr<Ndarray>> ndarrays_;
   std::vector<std::unique_ptr<Texture>> textures_;
 };
 
diff --git a/taichi/program/sparse_matrix.cpp b/taichi/program/sparse_matrix.cpp
index 499202b9d26db2..dd0a051f2191a7 100644
--- a/taichi/program/sparse_matrix.cpp
+++ b/taichi/program/sparse_matrix.cpp
@@ -100,6 +100,9 @@ SparseMatrixBuilder::SparseMatrixBuilder(int rows,
 }
 
 SparseMatrixBuilder::~SparseMatrixBuilder() {
+  // Log the Program pointer before asking it to delete ndarray_data_base_ptr_.
+  TI_INFO("SparseMatrixBuilder::~SparseMatrixBuilder, prog = {}",
+          (long long)prog_);
   prog_->delete_ndarray(ndarray_data_base_ptr_);
 }
 
diff --git a/taichi/python/export_lang.cpp b/taichi/python/export_lang.cpp
index 8ca9ca2b8c5c7f..bd259d484eed60 100644
--- a/taichi/python/export_lang.cpp
+++ b/taichi/python/export_lang.cpp
@@ -421,7 +421,9 @@ void export_lang(py::module &m) {
           py::arg("dt"), py::arg("shape"),
           py::arg("layout") = ExternalArrayLayout::kNull,
           py::arg("zero_fill") = false, py::return_value_policy::reference)
-      .def("delete_ndarray", &Program::delete_ndarray)
+      .def("delete_ndarray", [](Program *program, Ndarray *ndarray) {
+        program->delete_ndarray(ndarray);
+      })
       .def(
           "create_texture",
           [&](Program *program, BufferFormat fmt, const std::vector<int> &shape)