diff --git a/.github/workflows/scripts/unix_test.sh b/.github/workflows/scripts/unix_test.sh index 38c93b3558ba4e..756fac583ae405 100755 --- a/.github/workflows/scripts/unix_test.sh +++ b/.github/workflows/scripts/unix_test.sh @@ -9,6 +9,7 @@ export TAICHI_AOT_FOLDER_PATH="taichi/tests" export TI_SKIP_VERSION_CHECK=ON export LD_LIBRARY_PATH=$PWD/build/:$LD_LIBRARY_PATH export TI_OFFLINE_CACHE_FILE_PATH=$PWD/.cache/taichi +export TI_SKIP_CPP_TESTS=1 # Disable compat tests to save time. diff --git a/python/taichi/lang/_ndarray.py b/python/taichi/lang/_ndarray.py index cb1e37c726b2ba..ae46c26c55aa6c 100644 --- a/python/taichi/lang/_ndarray.py +++ b/python/taichi/lang/_ndarray.py @@ -243,6 +243,7 @@ def __init__(self, dtype, arr_shape): def __del__(self): if impl is not None and impl.get_runtime() is not None and impl.get_runtime().prog is not None: + print(impl.get_runtime().prog) impl.get_runtime().prog.delete_ndarray(self.arr) @property diff --git a/python/taichi/lang/ast/ast_transformer.py b/python/taichi/lang/ast/ast_transformer.py index d44f7671e41a51..0186a6b0babcf1 100644 --- a/python/taichi/lang/ast/ast_transformer.py +++ b/python/taichi/lang/ast/ast_transformer.py @@ -12,6 +12,7 @@ from taichi.lang import _ndarray, any_array, expr, impl, kernel_arguments, matrix, mesh from taichi.lang import ops as ti_ops from taichi.lang._ndrange import _Ndrange, ndrange +from taichi.lang.argpack import ArgPackType from taichi.lang.ast.ast_transformer_utils import Builder, LoopStatus, ReturnStatus from taichi.lang.ast.symbol_resolver import ASTResolver from taichi.lang.exception import ( @@ -601,6 +602,45 @@ def build_FunctionDef(ctx, node): assert args.kw_defaults == [] assert args.kwarg is None + def decl_and_create_variable(annotation, name, arg_features): + if not isinstance(annotation, primitive_types.RefType): + ctx.kernel_args.append(name) + if isinstance(annotation, ArgPackType): + d = {} + for j, (_name, anno) in enumerate(annotation.members.items()): + d[_name] = decl_and_create_variable(anno, _name, arg_features[j]) + return kernel_arguments.decl_argpack_arg(annotation, d) + if isinstance(annotation, annotations.template): + return ctx.global_vars[name] + if isinstance(annotation, annotations.sparse_matrix_builder): + return kernel_arguments.decl_sparse_matrix( + to_taichi_type(arg_features), + name, + ) + if isinstance(annotation, ndarray_type.NdarrayType): + return kernel_arguments.decl_ndarray_arg( + to_taichi_type(arg_features[0]), + arg_features[1], + arg_features[2], + arg_features[3], + name, + arg_features[4], + ) + if isinstance(annotation, texture_type.TextureType): + return kernel_arguments.decl_texture_arg(arg_features[0], name) + if isinstance(annotation, texture_type.RWTextureType): + return kernel_arguments.decl_rw_texture_arg( + arg_features[0], + arg_features[1], + arg_features[2], + name, + ) + if isinstance(annotation, MatrixType): + return kernel_arguments.decl_matrix_arg(annotation, name) + if isinstance(annotation, StructType): + return kernel_arguments.decl_struct_arg(annotation, name) + return kernel_arguments.decl_scalar_arg(annotation, name) + def transform_as_kernel(): # Treat return type if node.returns is not None: @@ -608,60 +648,21 @@ def transform_as_kernel(): impl.get_runtime().compiling_callable.finalize_rets() for i, arg in enumerate(args.args): - if not isinstance(ctx.func.arguments[i].annotation, primitive_types.RefType): - ctx.kernel_args.append(arg.arg) - if isinstance(ctx.func.arguments[i].annotation, annotations.template): - ctx.create_variable(arg.arg, ctx.global_vars[arg.arg]) - elif isinstance(ctx.func.arguments[i].annotation, annotations.sparse_matrix_builder): - ctx.create_variable( - arg.arg, - kernel_arguments.decl_sparse_matrix( - to_taichi_type(ctx.arg_features[i]), - ctx.func.arguments[i].name, - ), - ) - elif isinstance(ctx.func.arguments[i].annotation, ndarray_type.NdarrayType): - ctx.create_variable( - arg.arg, - kernel_arguments.decl_ndarray_arg( - to_taichi_type(ctx.arg_features[i][0]), - ctx.arg_features[i][1], - ctx.arg_features[i][2], - ctx.arg_features[i][3], - ctx.func.arguments[i].name, - ctx.arg_features[i][4], - ), - ) - elif isinstance(ctx.func.arguments[i].annotation, texture_type.TextureType): - ctx.create_variable( - arg.arg, - kernel_arguments.decl_texture_arg(ctx.arg_features[i][0], ctx.func.arguments[i].name), - ) - elif isinstance(ctx.func.arguments[i].annotation, texture_type.RWTextureType): + if isinstance(ctx.func.arguments[i].annotation, ArgPackType): + d = {} + for j, (name, anno) in enumerate(ctx.func.arguments[i].annotation.members.items()): + d[name] = decl_and_create_variable(anno, name, ctx.arg_features[i][j]) + ctx.create_variable(arg.arg, kernel_arguments.decl_argpack_arg(ctx.func.arguments[i].annotation, d)) + else: ctx.create_variable( arg.arg, - kernel_arguments.decl_rw_texture_arg( - ctx.arg_features[i][0], - ctx.arg_features[i][1], - ctx.arg_features[i][2], + decl_and_create_variable( + ctx.func.arguments[i].annotation, ctx.func.arguments[i].name, + ctx.arg_features[i] if ctx.arg_features is not None else None, ), ) - elif isinstance(ctx.func.arguments[i].annotation, MatrixType): - ctx.create_variable( - arg.arg, - kernel_arguments.decl_matrix_arg(ctx.func.arguments[i].annotation, ctx.func.arguments[i].name), - ) - elif isinstance(ctx.func.arguments[i].annotation, StructType): - ctx.create_variable( - arg.arg, - kernel_arguments.decl_struct_arg(ctx.func.arguments[i].annotation, ctx.func.arguments[i].name), - ) - else: - ctx.create_variable( - arg.arg, - kernel_arguments.decl_scalar_arg(ctx.func.arguments[i].annotation, ctx.func.arguments[i].name), - ) + impl.get_runtime().compiling_callable.finalize_params() # remove original args node.args.args = [] diff --git a/python/taichi/lang/impl.py b/python/taichi/lang/impl.py index 5c67e51d85f494..699a68922623ee 100644 --- a/python/taichi/lang/impl.py +++ b/python/taichi/lang/impl.py @@ -377,6 +377,7 @@ def set_default_ip(self, ip): def create_program(self): if self.prog is None: self.prog = _ti_core.Program() + print('create prog =', self.prog) @staticmethod def materialize_root_fb(is_first_call): @@ -480,6 +481,7 @@ def _register_signal_handlers(self): def clear(self): if self.prog: + print('clear prog =', self.prog) self.prog.finalize() self.prog = None self._signal_handler_registry = None diff --git a/python/taichi/lang/kernel_arguments.py b/python/taichi/lang/kernel_arguments.py index c15b8555a8f340..6daedd27acfc80 100644 --- a/python/taichi/lang/kernel_arguments.py +++ b/python/taichi/lang/kernel_arguments.py @@ -95,6 +95,10 @@ def decl_struct_arg(structtype, name): return structtype.from_taichi_object(arg_load) +def decl_argpack_arg(argpacktype, member_dict): + return argpacktype.from_taichi_object(member_dict) + + def decl_sparse_matrix(dtype, name): value_type = cook_dtype(dtype) ptr_type = cook_dtype(u64) diff --git a/python/taichi/lang/kernel_impl.py b/python/taichi/lang/kernel_impl.py index 010ffe50a94d72..cd17b3953a9d3a 100644 --- a/python/taichi/lang/kernel_impl.py +++ b/python/taichi/lang/kernel_impl.py @@ -13,6 +13,7 @@ from taichi._lib import core as _ti_core from taichi.lang import impl, ops, runtime_ops from taichi.lang._wrap_inspect import getsourcefile, getsourcelines +from taichi.lang.argpack import ArgPackType, ArgPack from taichi.lang.ast import ( ASTTransformerContext, KernelSimplicityASTChecker, @@ -368,6 +369,13 @@ def extract_arg(arg, anno): # [Primitive arguments] Return the value return arg + if isinstance(anno, ArgPackType): + if not isinstance(arg, ArgPack): + raise TaichiRuntimeTypeError(f"Argument must be a argument pack, got {type(arg)}") + return tuple( + TaichiCallableTemplateMapper.extract_arg(arg[name], dtype) + for index, (name, dtype) in enumerate(anno.members.items()) + ) if isinstance(anno, texture_type.TextureType): if not isinstance(arg, taichi.lang._texture.Texture): raise TaichiRuntimeTypeError(f"Argument must be a texture, got {type(arg)}") @@ -545,6 +553,8 @@ def extract_arguments(self): pass elif isinstance(annotation, StructType): pass + elif isinstance(annotation, ArgPackType): + pass else: raise TaichiSyntaxError(f"Invalid type annotation (argument {i}) of Taichi kernel: {annotation}") self.arguments.append(KernelArgument(annotation, param.name, param.default)) @@ -603,189 +613,210 @@ def taichi_ast_generator(kernel_cxx): def launch_kernel(self, t_kernel, *args): assert len(args) == len(self.arguments), f"{len(self.arguments)} arguments needed but {len(args)} provided" - tmps = [] + self.tmps = [] callbacks = [] actual_argument_slot = 0 launch_ctx = t_kernel.make_launch_context() max_arg_num = 64 exceed_max_arg_num = False - for i, v in enumerate(args): - needed = self.arguments[i].annotation - if isinstance(needed, template): + for i, val in enumerate(args): + _needed = self.arguments[i].annotation + if isinstance(_needed, template): continue - if actual_argument_slot >= max_arg_num: - exceed_max_arg_num = True - break - provided = type(v) - # Note: do not use sth like "needed == f32". That would be slow. - if id(needed) in primitive_types.real_type_ids: - if not isinstance(v, (float, int, np.floating, np.integer)): - raise TaichiRuntimeTypeError.get(i, needed.to_string(), provided) - launch_ctx.set_arg_float(actual_argument_slot, float(v)) - elif id(needed) in primitive_types.integer_type_ids: - if not isinstance(v, (int, np.integer)): - raise TaichiRuntimeTypeError.get(i, needed.to_string(), provided) - if is_signed(cook_dtype(needed)): - launch_ctx.set_arg_int(actual_argument_slot, int(v)) - else: - launch_ctx.set_arg_uint(actual_argument_slot, int(v)) - elif isinstance(needed, sparse_matrix_builder): - # Pass only the base pointer of the ti.types.sparse_matrix_builder() argument - launch_ctx.set_arg_uint(actual_argument_slot, v._get_ndarray_addr()) - elif isinstance(needed, ndarray_type.NdarrayType) and isinstance(v, taichi.lang._ndarray.Ndarray): - v_primal = v.arr - v_grad = v.grad.arr if v.grad else None - if v_grad is None: - launch_ctx.set_arg_ndarray(actual_argument_slot, v_primal) - else: - launch_ctx.set_arg_ndarray_with_grad(actual_argument_slot, v_primal, v_grad) - elif isinstance(needed, texture_type.TextureType) and isinstance(v, taichi.lang._texture.Texture): - launch_ctx.set_arg_texture(actual_argument_slot, v.tex) - elif isinstance(needed, texture_type.RWTextureType) and isinstance(v, taichi.lang._texture.Texture): - launch_ctx.set_arg_rw_texture(actual_argument_slot, v.tex) - elif isinstance(needed, ndarray_type.NdarrayType): - # Element shapes are already specialized in Taichi codegen. - # The shape information for element dims are no longer needed. - # Therefore we strip the element shapes from the shape vector, - # so that it only holds "real" array shapes. - is_soa = needed.layout == Layout.SOA - array_shape = v.shape - if functools.reduce(operator.mul, array_shape, 1) > np.iinfo(np.int32).max: - warnings.warn( - "Ndarray index might be out of int32 boundary but int64 indexing is not supported yet." - ) - if needed.dtype is None or id(needed.dtype) in primitive_types.type_ids: - element_dim = 0 - else: - element_dim = needed.dtype.ndim - array_shape = v.shape[element_dim:] if is_soa else v.shape[:-element_dim] - if isinstance(v, np.ndarray): - if v.flags.c_contiguous: - launch_ctx.set_arg_external_array_with_shape( - actual_argument_slot, int(v.ctypes.data), v.nbytes, array_shape, 0 - ) - elif v.flags.f_contiguous: - # TODO: A better way that avoids copying is saving strides info. - tmp = np.ascontiguousarray(v) - # Purpose: DO NOT GC |tmp|! - tmps.append(tmp) - - def callback(original, updated): - np.copyto(original, np.asfortranarray(updated)) - - callbacks.append(functools.partial(callback, v, tmp)) - launch_ctx.set_arg_external_array_with_shape( - actual_argument_slot, int(tmp.ctypes.data), tmp.nbytes, array_shape, 0 - ) + needed_list, provided_list = [], [] + + def flatten_argpack(argpack, argpack_type): + for j, (name, anno) in enumerate(argpack_type.members.items()): + if isinstance(anno, ArgPackType): + flatten_argpack(argpack[name], anno) else: - raise ValueError( - "Non contiguous numpy arrays are not supported, please call np.ascontiguousarray(arr) before passing it into taichi kernel." - ) - elif has_pytorch(): - import torch # pylint: disable=C0415 + needed_list.append(anno) + provided_list.append(argpack[name]) - if isinstance(v, torch.Tensor): - if not v.is_contiguous(): - raise ValueError( - "Non contiguous tensors are not supported, please call tensor.contiguous() before passing it into taichi kernel." - ) - taichi_arch = self.runtime.prog.config().arch - - def get_call_back(u, v): - def call_back(): - u.copy_(v) - - return call_back - - # FIXME: only allocate when launching grad kernel - if v.requires_grad and v.grad is None: - v.grad = torch.zeros_like(v) - - tmp = v - if str(v.device).startswith("cuda") and taichi_arch != _ti_core.Arch.cuda: - # Getting a torch CUDA tensor on Taichi non-cuda arch: - # We just replace it with a CPU tensor and by the end of kernel execution we'll use the callback to copy the values back to the original CUDA tensor. - host_v = v.to(device="cpu", copy=True) - tmp = host_v - callbacks.append(get_call_back(v, host_v)) - - launch_ctx.set_arg_external_array_with_shape( - actual_argument_slot, - int(tmp.data_ptr()), - tmp.element_size() * tmp.nelement(), - array_shape, - int(v.grad.data_ptr()) if v.grad is not None else 0, + if isinstance(_needed, ArgPackType) and isinstance(val, ArgPack): + flatten_argpack(val, _needed) + else: + needed_list, provided_list = [_needed], [val] + + for j, _v in enumerate(needed_list): + needed, provided, v = _v, type(provided_list[j]), provided_list[j] + if actual_argument_slot >= max_arg_num: + exceed_max_arg_num = True + break + # Note: do not use sth like "needed == f32". That would be slow. + if id(needed) in primitive_types.real_type_ids: + if not isinstance(v, (float, int, np.floating, np.integer)): + raise TaichiRuntimeTypeError.get(i, needed.to_string(), provided) + launch_ctx.set_arg_float(actual_argument_slot, float(v)) + elif id(needed) in primitive_types.integer_type_ids: + if not isinstance(v, (int, np.integer)): + raise TaichiRuntimeTypeError.get(i, needed.to_string(), provided) + if is_signed(cook_dtype(needed)): + launch_ctx.set_arg_int(actual_argument_slot, int(v)) + else: + launch_ctx.set_arg_uint(actual_argument_slot, int(v)) + elif isinstance(needed, sparse_matrix_builder): + # Pass only the base pointer of the ti.types.sparse_matrix_builder() argument + launch_ctx.set_arg_uint(actual_argument_slot, v._get_ndarray_addr()) + elif isinstance(needed, ndarray_type.NdarrayType) and isinstance(v, taichi.lang._ndarray.Ndarray): + v_primal = v.arr + v_grad = v.grad.arr if v.grad else None + if v_grad is None: + launch_ctx.set_arg_ndarray(actual_argument_slot, v_primal) + else: + launch_ctx.set_arg_ndarray_with_grad(actual_argument_slot, v_primal, v_grad) + elif isinstance(needed, texture_type.TextureType) and isinstance(v, taichi.lang._texture.Texture): + launch_ctx.set_arg_texture(actual_argument_slot, v.tex) + elif isinstance(needed, texture_type.RWTextureType) and isinstance(v, taichi.lang._texture.Texture): + launch_ctx.set_arg_rw_texture(actual_argument_slot, v.tex) + elif isinstance(needed, ndarray_type.NdarrayType): + # Element shapes are already specialized in Taichi codegen. + # The shape information for element dims are no longer needed. + # Therefore we strip the element shapes from the shape vector, + # so that it only holds "real" array shapes. + is_soa = needed.layout == Layout.SOA + array_shape = v.shape + if functools.reduce(operator.mul, array_shape, 1) > np.iinfo(np.int32).max: + warnings.warn( + "Ndarray index might be out of int32 boundary but int64 indexing is not supported yet." ) + if needed.dtype is None or id(needed.dtype) in primitive_types.type_ids: + element_dim = 0 else: - raise TaichiRuntimeTypeError.get(i, needed.to_string(), v) - elif has_paddle(): - import paddle # pylint: disable=C0415 - - if isinstance(v, paddle.Tensor): - # For now, paddle.fluid.core.Tensor._ptr() is only available on develop branch - def get_call_back(u, v): - def call_back(): - u.copy_(v, False) - - return call_back - - tmp = v.value().get_tensor() - taichi_arch = self.runtime.prog.config().arch - if v.place.is_gpu_place(): - if taichi_arch != _ti_core.Arch.cuda: - # Paddle cuda tensor on Taichi non-cuda arch - host_v = v.cpu() - tmp = host_v.value().get_tensor() + element_dim = needed.dtype.ndim + array_shape = v.shape[element_dim:] if is_soa else v.shape[:-element_dim] + if isinstance(v, np.ndarray): + if v.flags.c_contiguous: + launch_ctx.set_arg_external_array_with_shape( + actual_argument_slot, int(v.ctypes.data), v.nbytes, array_shape, 0 + ) + elif v.flags.f_contiguous: + # TODO: A better way that avoids copying is saving strides info. + tmp = np.ascontiguousarray(v) + # Purpose: DO NOT GC |tmp|! + self.tmps.append(tmp) + + def callback(original, updated): + np.copyto(original, np.asfortranarray(updated)) + + callbacks.append(functools.partial(callback, v, tmp)) + launch_ctx.set_arg_external_array_with_shape( + actual_argument_slot, int(tmp.ctypes.data), tmp.nbytes, array_shape, 0 + ) + else: + raise ValueError( + "Non contiguous numpy arrays are not supported, please call np.ascontiguousarray(arr) " + "before passing it into taichi kernel." + ) + elif has_pytorch(): + import torch # pylint: disable=C0415 + + if isinstance(v, torch.Tensor): + if not v.is_contiguous(): + raise ValueError( + "Non contiguous tensors are not supported, please call tensor.contiguous() before " + "passing it into taichi kernel." + ) + taichi_arch = self.runtime.prog.config().arch + + def get_call_back(u, v): + def call_back(): + u.copy_(v) + + return call_back + + # FIXME: only allocate when launching grad kernel + if v.requires_grad and v.grad is None: + v.grad = torch.zeros_like(v) + + tmp = v + if str(v.device).startswith("cuda") and taichi_arch != _ti_core.Arch.cuda: + # Getting a torch CUDA tensor on Taichi non-cuda arch: + # We just replace it with a CPU tensor and by the end of kernel execution we'll use the + # callback to copy the values back to the original CUDA tensor. + host_v = v.to(device="cpu", copy=True) + tmp = host_v callbacks.append(get_call_back(v, host_v)) - elif v.place.is_cpu_place(): - if taichi_arch == _ti_core.Arch.cuda: - # Paddle cpu tensor on Taichi cuda arch - gpu_v = v.cuda() - tmp = gpu_v.value().get_tensor() - callbacks.append(get_call_back(v, gpu_v)) + + launch_ctx.set_arg_external_array_with_shape( + actual_argument_slot, + int(tmp.data_ptr()), + tmp.element_size() * tmp.nelement(), + array_shape, + int(v.grad.data_ptr()) if v.grad is not None else 0, + ) else: - # Paddle do support many other backends like XPU, NPU, MLU, IPU - raise TaichiRuntimeTypeError(f"Taichi do not support backend {v.place} that Paddle support") - launch_ctx.set_arg_external_array_with_shape( - actual_argument_slot, int(tmp._ptr()), v.element_size() * v.size, array_shape, 0 - ) + raise TaichiRuntimeTypeError.get(i, needed.to_string(), v) + elif has_paddle(): + import paddle # pylint: disable=C0415 + + if isinstance(v, paddle.Tensor): + # For now, paddle.fluid.core.Tensor._ptr() is only available on develop branch + def get_call_back(u, v): + def call_back(): + u.copy_(v, False) + + return call_back + + tmp = v.value().get_tensor() + taichi_arch = self.runtime.prog.config().arch + if v.place.is_gpu_place(): + if taichi_arch != _ti_core.Arch.cuda: + # Paddle cuda tensor on Taichi non-cuda arch + host_v = v.cpu() + tmp = host_v.value().get_tensor() + callbacks.append(get_call_back(v, host_v)) + elif v.place.is_cpu_place(): + if taichi_arch == _ti_core.Arch.cuda: + # Paddle cpu tensor on Taichi cuda arch + gpu_v = v.cuda() + tmp = gpu_v.value().get_tensor() + callbacks.append(get_call_back(v, gpu_v)) + else: + # Paddle do support many other backends like XPU, NPU, MLU, IPU + raise TaichiRuntimeTypeError( + f"Taichi do not support backend {v.place} that Paddle support" + ) + launch_ctx.set_arg_external_array_with_shape( + actual_argument_slot, int(tmp._ptr()), v.element_size() * v.size, array_shape, 0 + ) + else: + raise TaichiRuntimeTypeError.get(i, needed.to_string(), v) else: raise TaichiRuntimeTypeError.get(i, needed.to_string(), v) - else: - raise TaichiRuntimeTypeError.get(i, needed.to_string(), v) - elif isinstance(needed, MatrixType): - if needed.dtype in primitive_types.real_types: + elif isinstance(needed, MatrixType): + if needed.dtype in primitive_types.real_types: - def cast_func(x): - if not isinstance(x, (int, float, np.integer, np.floating)): - raise TaichiRuntimeTypeError.get(i, needed.dtype.to_string(), type(x)) - return float(x) + def cast_func(x): + if not isinstance(x, (int, float, np.integer, np.floating)): + raise TaichiRuntimeTypeError.get(i, needed.dtype.to_string(), type(x)) + return float(x) - elif needed.dtype in primitive_types.integer_types: + elif needed.dtype in primitive_types.integer_types: - def cast_func(x): - if not isinstance(x, (int, np.integer)): - raise TaichiRuntimeTypeError.get(i, needed.dtype.to_string(), type(x)) - return int(x) + def cast_func(x): + if not isinstance(x, (int, np.integer)): + raise TaichiRuntimeTypeError.get(i, needed.dtype.to_string(), type(x)) + return int(x) - else: - raise ValueError(f"Matrix dtype {needed.dtype} is not integer type or real type.") + else: + raise ValueError(f"Matrix dtype {needed.dtype} is not integer type or real type.") - if needed.ndim == 2: - v = [cast_func(v[i, j]) for i in range(needed.n) for j in range(needed.m)] + if needed.ndim == 2: + v = [cast_func(v[i, j]) for i in range(needed.n) for j in range(needed.m)] + else: + v = [cast_func(v[i]) for i in range(needed.n)] + v = needed(*v) + needed.set_kernel_struct_args(v, launch_ctx, (actual_argument_slot,)) + elif isinstance(needed, StructType): + if not isinstance(v, needed): + raise TaichiRuntimeTypeError.get(i, str(needed), provided) + needed.set_kernel_struct_args(v, launch_ctx, (actual_argument_slot,)) else: - v = [cast_func(v[i]) for i in range(needed.n)] - v = needed(*v) - needed.set_kernel_struct_args(v, launch_ctx, (actual_argument_slot,)) - elif isinstance(needed, StructType): - if not isinstance(v, needed): - raise TaichiRuntimeTypeError.get(i, str(needed), provided) - needed.set_kernel_struct_args(v, launch_ctx, (actual_argument_slot,)) - else: - raise ValueError(f"Argument type mismatch. Expecting {needed}, got {type(v)}.") - actual_argument_slot += 1 + raise ValueError(f"Argument type mismatch. Expecting {needed}, got {type(v)}.") + actual_argument_slot += 1 if exceed_max_arg_num: raise TaichiRuntimeError( diff --git a/taichi/program/ndarray.cpp b/taichi/program/ndarray.cpp index 6aa326fbb25848..d4eb9951bbed45 100644 --- a/taichi/program/ndarray.cpp +++ b/taichi/program/ndarray.cpp @@ -35,6 +35,8 @@ Ndarray::Ndarray(Program *prog, std::multiplies<>())), element_size_(data_type_size(dtype)), prog_(prog) { + TI_INFO("Ndarray {} is created.", (long long)this); + TI_FLUSH_LOGGER // Now that we have two shapes which may be concatenated differently // depending on layout, total_shape_ comes handy. total_shape_ = shape; @@ -71,6 +73,8 @@ Ndarray::Ndarray(DeviceAllocation &devalloc, 1, std::multiplies<>())), element_size_(data_type_size(dtype)) { + TI_INFO("Ndarray {} is created.", (long long)this); + TI_FLUSH_LOGGER // When element_shape is specified but layout is not, default layout is AOS. auto element_shape = data_type_shape(dtype); if (!element_shape.empty() && layout == ExternalArrayLayout::kNull) { @@ -109,6 +113,8 @@ Ndarray::Ndarray(DeviceAllocation &devalloc, } Ndarray::~Ndarray() { + TI_INFO("Ndarray {} is released.", (long long)this); + TI_FLUSH_LOGGER if (prog_) { // prog_->flush(); ndarray_alloc_.device->dealloc_memory(ndarray_alloc_); diff --git a/taichi/program/program.cpp b/taichi/program/program.cpp index f21579baf20b70..8932e2d149de86 100644 --- a/taichi/program/program.cpp +++ b/taichi/program/program.cpp @@ -52,6 +52,7 @@ std::atomic Program::num_instances_; Program::Program(Arch desired_arch) : snode_rw_accessors_bank_(this) { TI_TRACE("Program initializing..."); + TI_INFO("Program initializing, this = {}", (long long)this); // For performance considerations and correctness of QuantFloatType // operations, we force floating-point operations to flush to zero on all @@ -328,6 +329,7 @@ uint64 Program::fetch_result_uint64(int i) { } void Program::finalize() { + TI_INFO("Program finalizing, this = {}", (long long)this); if (finalized_) { return; } @@ -368,12 +370,12 @@ std::size_t Program::get_snode_num_dynamically_allocated(SNode *snode) { return program_impl_->get_snode_num_dynamically_allocated(snode, result_buffer); } - +std::mutex mutex; Ndarray *Program::create_ndarray(const DataType type, const std::vector &shape, ExternalArrayLayout layout, bool zero_fill) { - auto arr = std::make_unique(this, type, shape, layout); + auto arr = std::make_shared(this, type, shape, layout); if (zero_fill) { Arch arch = compile_config().arch; if (arch_is_cpu(arch) || arch == Arch::cuda || arch == Arch::amdgpu) { @@ -391,7 +393,9 @@ Ndarray *Program::create_ndarray(const DataType type, } } auto arr_ptr = arr.get(); - ndarrays_.insert({arr_ptr, std::move(arr)}); + mutex.lock(); + ndarrays_.insert({arr_ptr, (arr)}); + mutex.unlock(); return arr_ptr; } @@ -407,10 +411,23 @@ void Program::delete_ndarray(Ndarray *ndarray) { // runtime instead of this giant program and it should be freed when: // - Python GC signals taichi that it's no longer useful // - All kernels using it are executed. + mutex.lock(); + TI_INFO("this = {}", (long long)this); + TI_INFO("bucket count = {}", ndarrays_.bucket_count()); + TI_FLUSH_LOGGER + TI_INFO("ndarray count = {}", ndarrays_.count(ndarray)); + if (ndarrays_.count(ndarray)) + TI_INFO("alloc_id = {}", ndarray->ndarray_alloc_.alloc_id); + TI_ASSERT(program_impl_.get() != nullptr); + TI_ASSERT(ndarray != nullptr); + TI_INFO("ndarray ptr = {}", (long long)ndarray); + TI_INFO("used in kernel = {}", program_impl_->used_in_kernel(ndarray->ndarray_alloc_.alloc_id)); + TI_FLUSH_LOGGER; if (ndarrays_.count(ndarray) && !program_impl_->used_in_kernel(ndarray->ndarray_alloc_.alloc_id)) { ndarrays_.erase(ndarray); } + mutex.unlock(); } Texture *Program::create_texture(BufferFormat buffer_format, diff --git a/taichi/program/program.h b/taichi/program/program.h index f0a53c40cf94ba..cbff4571b241f2 100644 --- a/taichi/program/program.h +++ b/taichi/program/program.h @@ -336,7 +336,7 @@ class TI_DLL_EXPORT Program { bool finalized_{false}; // TODO: Move ndarrays_ and textures_ to be managed by runtime - std::unordered_map> ndarrays_; + std::unordered_map> ndarrays_; std::vector> textures_; }; diff --git a/taichi/program/sparse_matrix.cpp b/taichi/program/sparse_matrix.cpp index 499202b9d26db2..dd0a051f2191a7 100644 --- a/taichi/program/sparse_matrix.cpp +++ b/taichi/program/sparse_matrix.cpp @@ -100,6 +100,7 @@ SparseMatrixBuilder::SparseMatrixBuilder(int rows, } SparseMatrixBuilder::~SparseMatrixBuilder() { + TI_INFO("SparseMatrixBuilder::~SparseMatrixBuilder, prog = {}", (long long) prog_); prog_->delete_ndarray(ndarray_data_base_ptr_); } diff --git a/taichi/python/export_lang.cpp b/taichi/python/export_lang.cpp index 8ca9ca2b8c5c7f..bd259d484eed60 100644 --- a/taichi/python/export_lang.cpp +++ b/taichi/python/export_lang.cpp @@ -421,7 +421,9 @@ void export_lang(py::module &m) { py::arg("dt"), py::arg("shape"), py::arg("layout") = ExternalArrayLayout::kNull, py::arg("zero_fill") = false, py::return_value_policy::reference) - .def("delete_ndarray", &Program::delete_ndarray) + .def("delete_ndarray", [&](Program* p, Ndarray* arr) -> void { + p->delete_ndarray(arr); + }) .def( "create_texture", [&](Program *program, BufferFormat fmt, const std::vector &shape)