taichi-dev · lin-hitonami · Nov 2, 2022 · Nov 1, 2022 · Nov 1, 2022 · Nov 1, 2022
diff --git a/taichi/analysis/data_source_analysis.cpp b/taichi/analysis/data_source_analysis.cpp
@@ -63,6 +63,12 @@ std::vector<Stmt *> get_store_destination(Stmt *store_stmt) {
     return std::vector<Stmt *>(1, global_store->dest);
   } else if (auto atomic = store_stmt->cast<AtomicOpStmt>()) {
     return std::vector<Stmt *>(1, atomic->dest);
+  } else if (auto snode_op = store_stmt->cast<SNodeOpStmt>()) {
+    if (snode_op->op_type == SNodeOpType::allocate) {
+      return {snode_op->val, snode_op->ptr};
+    } else {
+      return {};
+    }
   } else if (auto external_func = store_stmt->cast<ExternalFuncCallStmt>()) {
     if (store_stmt->cast<ExternalFuncCallStmt>()->type ==
         ExternalFuncCallStmt::BITCODE) {

diff --git a/taichi/codegen/llvm/codegen_llvm.cpp b/taichi/codegen/llvm/codegen_llvm.cpp
@@ -1367,11 +1367,13 @@ void TaskCodeGenLLVM::visit(AssertStmt *stmt) {
 
 void TaskCodeGenLLVM::visit(SNodeOpStmt *stmt) {
   auto snode = stmt->snode;
-  if (stmt->op_type == SNodeOpType::append) {
+  if (stmt->op_type == SNodeOpType::allocate) {
     TI_ASSERT(snode->type == SNodeType::dynamic);
-    TI_ASSERT(stmt->ret_type->is_primitive(PrimitiveTypeID::i32));
-    llvm_val[stmt] =
-        call(snode, llvm_val[stmt->ptr], "append", {llvm_val[stmt->val]});
+    TI_ASSERT(stmt->ret_type.is_pointer() &&
+              stmt->ret_type.ptr_removed()->is_primitive(PrimitiveTypeID::gen));
+    auto ptr =
+        call(snode, llvm_val[stmt->ptr], "allocate", {llvm_val[stmt->val]});
+    llvm_val[stmt] = ptr;
   } else if (stmt->op_type == SNodeOpType::length) {
     TI_ASSERT(snode->type == SNodeType::dynamic);
     llvm_val[stmt] = call(snode, llvm_val[stmt->ptr], "get_num_elements", {});

diff --git a/taichi/ir/control_flow_graph.cpp b/taichi/ir/control_flow_graph.cpp
@@ -604,7 +604,8 @@ void ControlFlowGraph::reaching_definition_analysis(bool after_lower_access) {
           (!after_lower_access &&
            (stmt->is<GlobalPtrStmt>() || stmt->is<ExternalPtrStmt>() ||
             stmt->is<BlockLocalPtrStmt>() || stmt->is<ThreadLocalPtrStmt>() ||
-            stmt->is<GlobalTemporaryStmt>() || stmt->is<MatrixPtrStmt>()))) {
+            stmt->is<GlobalTemporaryStmt>() || stmt->is<MatrixPtrStmt>() ||
+            stmt->is<GetChStmt>()))) {
         // TODO: unify them
         // A global pointer that may contain some data before this kernel.
         nodes[start_node]->reach_gen.insert(stmt);

diff --git a/taichi/ir/frontend_ir.cpp b/taichi/ir/frontend_ir.cpp
@@ -926,13 +926,17 @@ void SNodeOpExpression::flatten(FlattenContext *ctx) {
     ctx->push_back<SNodeOpStmt>(SNodeOpType::get_addr, snode, ptr, nullptr);
   } else if (op_type == SNodeOpType::append) {
     flatten_rvalue(value, ctx);
-    ctx->push_back<SNodeOpStmt>(SNodeOpType::append, snode, ptr, value->stmt);
+
+    auto alloca = ctx->push_back<AllocaStmt>(PrimitiveType::i32);
+    auto addr =
+        ctx->push_back<SNodeOpStmt>(SNodeOpType::allocate, snode, ptr, alloca);
+    auto ch_addr = ctx->push_back<GetChStmt>(addr, snode, 0);
+    ctx->push_back<GlobalStoreStmt>(ch_addr, value->stmt);
+    ctx->push_back<LocalLoadStmt>(alloca);
     TI_ERROR_IF(snode->type != SNodeType::dynamic,
                 "ti.append only works on dynamic nodes.");
     TI_ERROR_IF(snode->ch.size() != 1,
                 "ti.append only works on single-child dynamic nodes.");
-    TI_ERROR_IF(data_type_size(snode->ch[0]->dt) != 4,
-                "ti.append only works on i32/f32 nodes.");
   }
   stmt = ctx->back_stmt();
 }

diff --git a/taichi/ir/statements.cpp b/taichi/ir/statements.cpp
@@ -121,7 +121,8 @@ bool SNodeOpStmt::activation_related(SNodeOpType op) {
 }
 
 bool SNodeOpStmt::need_activation(SNodeOpType op) {
-  return op == SNodeOpType::activate || op == SNodeOpType::append;
+  return op == SNodeOpType::activate || op == SNodeOpType::append ||
+         op == SNodeOpType::allocate;
 }
 
 ExternalTensorShapeAlongAxisStmt::ExternalTensorShapeAlongAxisStmt(int axis,
@@ -273,6 +274,16 @@ GetChStmt::GetChStmt(Stmt *input_ptr, int chid, bool is_bit_vectorized)
   TI_STMT_REG_FIELDS;
 }
 
+GetChStmt::GetChStmt(Stmt *input_ptr,
+                     SNode *snode,
+                     int chid,
+                     bool is_bit_vectorized)
+    : input_ptr(input_ptr), chid(chid), is_bit_vectorized(is_bit_vectorized) {
+  input_snode = snode;
+  output_snode = input_snode->ch[chid].get();
+  TI_STMT_REG_FIELDS;
+}
+
 OffloadedStmt::OffloadedStmt(TaskType task_type, Arch arch)
     : task_type(task_type), device(arch) {
   if (has_body()) {

diff --git a/taichi/ir/statements.h b/taichi/ir/statements.h
@@ -1182,6 +1182,10 @@ class GetChStmt : public Stmt {
   bool is_bit_vectorized;
 
   GetChStmt(Stmt *input_ptr, int chid, bool is_bit_vectorized = false);
+  GetChStmt(Stmt *input_ptr,
+            SNode *snode,
+            int chid,
+            bool is_bit_vectorized = false);
 
   bool has_global_side_effect() const override {
     return false;

diff --git a/taichi/ir/stmt_op_types.cpp b/taichi/ir/stmt_op_types.cpp
@@ -136,6 +136,7 @@ std::string snode_op_type_name(SNodeOpType type) {
     REGISTER_TYPE(activate);
     REGISTER_TYPE(deactivate);
     REGISTER_TYPE(append);
+    REGISTER_TYPE(allocate);
     REGISTER_TYPE(clear);
     REGISTER_TYPE(undefined);
 

diff --git a/taichi/ir/stmt_op_types.h b/taichi/ir/stmt_op_types.h
@@ -77,6 +77,7 @@ enum class SNodeOpType : int {
   activate,
   deactivate,
   append,
+  allocate,
   clear,
   undefined
 };

diff --git a/taichi/runtime/llvm/runtime_module/node_dynamic.h b/taichi/runtime/llvm/runtime_module/node_dynamic.h
@@ -58,11 +58,12 @@ void Dynamic_deactivate(Ptr meta_, Ptr node_) {
   }
 }
 
-i32 Dynamic_append(Ptr meta_, Ptr node_, i32 data) {
+Ptr Dynamic_allocate(Ptr meta_, Ptr node_, i32 *len) {
   auto meta = (DynamicMeta *)(meta_);
   auto node = (DynamicNode *)(node_);
   auto chunk_size = meta->chunk_size;
   auto i = atomic_add_i32(&node->n, 1);
+  *len = i;
   int chunk_start = 0;
   auto p_chunk_ptr = &node->ptr;
   while (true) {
@@ -76,14 +77,12 @@ i32 Dynamic_append(Ptr meta_, Ptr node_, i32 data) {
       });
     }
     if (i < chunk_start + chunk_size) {
-      *(i32 *)(*p_chunk_ptr + sizeof(Ptr) +
-               (i - chunk_start) * meta->element_size) = data;
-      break;
+      return *p_chunk_ptr + sizeof(Ptr) +
+             (i - chunk_start) * meta->element_size;
     }
     p_chunk_ptr = (Ptr *)(*p_chunk_ptr);
     chunk_start += chunk_size;
   }
-  return i;
 }
 
 i32 Dynamic_is_active(Ptr meta_, Ptr node_, int i) {

diff --git a/taichi/transforms/type_check.cpp b/taichi/transforms/type_check.cpp
@@ -125,6 +125,9 @@ class TypeCheck : public IRVisitor {
   void visit(SNodeOpStmt *stmt) override {
     if (stmt->op_type == SNodeOpType::get_addr) {
       stmt->ret_type = PrimitiveType::u64;
+    } else if (stmt->op_type == SNodeOpType::allocate) {
+      stmt->ret_type = PrimitiveType::gen;
+      stmt->ret_type.set_is_pointer(true);
     } else {
       stmt->ret_type = PrimitiveType::i32;
     }

diff --git a/tests/python/test_compare.py b/tests/python/test_compare.py
@@ -83,7 +83,7 @@ def func():
     assert not a[11]
 
 
-@test_utils.test(require=ti.extension.sparse)
+@test_utils.test(require=ti.extension.sparse, exclude=[ti.metal])
 def test_no_duplicate_eval():
     a = ti.field(ti.i32)
     ti.root.dynamic(ti.i, 256).place(a)

diff --git a/tests/python/test_dynamic.py b/tests/python/test_dynamic.py
@@ -5,7 +5,7 @@
 from tests import test_utils
 
 
-@test_utils.test(require=ti.extension.sparse)
+@test_utils.test(require=ti.extension.sparse, exclude=[ti.metal])
 def test_dynamic():
     x = ti.field(ti.f32)
     n = 128
@@ -23,7 +23,7 @@ def func():
         assert x[i] == i
 
 
-@test_utils.test(require=ti.extension.sparse)
+@test_utils.test(require=ti.extension.sparse, exclude=[ti.metal])
 def test_dynamic2():
     x = ti.field(ti.f32)
     n = 128
@@ -41,7 +41,7 @@ def func():
         assert x[i] == i
 
 
-@test_utils.test(require=ti.extension.sparse)
+@test_utils.test(require=ti.extension.sparse, exclude=[ti.metal])
 def test_dynamic_matrix():
     x = ti.Matrix.field(2, 1, dtype=ti.i32)
     n = 8192
@@ -64,7 +64,7 @@ def func():
             assert b == 0
 
 
-@test_utils.test(require=ti.extension.sparse)
+@test_utils.test(require=ti.extension.sparse, exclude=[ti.metal])
 def test_append():
     x = ti.field(ti.i32)
     n = 128
@@ -86,7 +86,7 @@ def func():
         assert elements[i] == i
 
 
-@test_utils.test(require=ti.extension.sparse)
+@test_utils.test(require=ti.extension.sparse, exclude=[ti.metal])
 def test_length():
     x = ti.field(ti.i32)
     y = ti.field(ti.f32, shape=())
@@ -110,7 +110,7 @@ def get_len():
     assert y[None] == n
 
 
-@test_utils.test(require=ti.extension.sparse)
+@test_utils.test(require=ti.extension.sparse, exclude=[ti.metal])
 def test_append_ret_value():
     x = ti.field(ti.i32)
     y = ti.field(ti.i32)
@@ -135,14 +135,14 @@ def func():
         assert x[i] + 3 == z[i]
 
 
-@test_utils.test(require=ti.extension.sparse)
+@test_utils.test(require=ti.extension.sparse, exclude=[ti.metal])
 def test_dense_dynamic():
     # The spin lock implementation has triggered a bug in CUDA, the end result
     # being that appending to Taichi's dynamic node messes up its length. See
     # https://stackoverflow.com/questions/65995357/cuda-spinlock-implementation-with-independent-thread-scheduling-supported
     # CUDA 11.2 didn't fix this bug, unfortunately.
-    if ti.lang.impl.current_cfg().arch == ti.cuda:
-        pytest.skip('CUDA spinlock bug')
+    # if ti.lang.impl.current_cfg().arch == ti.cuda:
+    #     pytest.skip('CUDA spinlock bug')
 
     n = 128
     x = ti.field(ti.i32)
@@ -166,7 +166,7 @@ def func():
         assert l[i] == n
 
 
-@test_utils.test(require=ti.extension.sparse)
+@test_utils.test(require=ti.extension.sparse, exclude=[ti.metal])
 def test_dense_dynamic_len():
     n = 128
     x = ti.field(ti.i32)
@@ -185,7 +185,7 @@ def func():
         assert l[i] == 0
 
 
-@test_utils.test(require=ti.extension.sparse)
+@test_utils.test(require=ti.extension.sparse, exclude=[ti.metal])
 def test_dynamic_activate():
     # record the lengths
     l = ti.field(ti.i32, 3)
@@ -212,7 +212,7 @@ def func():
     assert l[2] == 21
 
 
-@test_utils.test(require=ti.extension.sparse)
+@test_utils.test(require=ti.extension.sparse, exclude=[ti.metal])
 def test_append_vec():
     x = ti.Vector.field(3, ti.f32)
     block = ti.root.dense(ti.i, 16)
@@ -229,3 +229,39 @@ def make_lists():
     with pytest.raises(TaichiCompilationError,
                        match=r'append only supports appending a scalar value'):
         make_lists()
+
+
+@test_utils.test(require=ti.extension.sparse, exclude=[ti.metal])
+def test_append_u8():
+    x = ti.field(ti.u8)
+    pixel = ti.root.dynamic(ti.j, 20)
+    pixel.place(x)
+
+    @ti.kernel
+    def make_list():
+        ti.loop_config(serialize=True)
+        for i in range(20):
+            x[()].append(i * i * i)
+
+    make_list()
+
+    for i in range(20):
+        assert x[i] == i * i * i % 256
+
+
+@test_utils.test(require=ti.extension.sparse, exclude=[ti.metal])
+def test_append_u64():
+    x = ti.field(ti.u64)
+    pixel = ti.root.dynamic(ti.i, 20)
+    pixel.place(x)
+
+    @ti.kernel
+    def make_list():
+        ti.loop_config(serialize=True)
+        for i in range(20):
+            x[()].append(i * i * i * ti.u64(10000000000))
+
+    make_list()
+
+    for i in range(20):
+        assert x[i] == i * i * i * 10000000000
diff --git a/tests/python/test_dynamic_append_length.py b/tests/python/test_dynamic_append_length.py
@@ -21,7 +21,7 @@ def test():
     test()
 
 
-@test_utils.test(exclude=[ti.cc, ti.opengl, ti.vulkan],
+@test_utils.test(exclude=[ti.cc, ti.opengl, ti.vulkan, ti.metal],
                  default_fp=ti.f32,
                  debug=True)
 def test_dynamic_append_length_f32():

diff --git a/tests/python/test_sparse_deactivate.py b/tests/python/test_sparse_deactivate.py
@@ -170,7 +170,7 @@ def clear_temp():
                     assert xn[i, j] == i + j
 
 
-@test_utils.test(require=ti.extension.sparse)
+@test_utils.test(require=ti.extension.sparse, exclude=[ti.metal])
 def test_dynamic():
     x = ti.field(ti.i32)
     s = ti.field(ti.i32)

diff --git a/tests/python/test_struct_for_dynamic.py b/tests/python/test_struct_for_dynamic.py
@@ -2,7 +2,7 @@
 from tests import test_utils
 
 
-@test_utils.test(exclude=[ti.opengl, ti.cc, ti.vulkan])
+@test_utils.test(exclude=[ti.opengl, ti.cc, ti.vulkan, ti.metal])
 def test_dynamic():
     x = ti.field(ti.i32)
     y = ti.field(ti.i32, shape=())
@@ -23,7 +23,7 @@ def count():
     assert y[None] == n // 3 + 1
 
 
-@test_utils.test(exclude=[ti.opengl, ti.cc, ti.vulkan])
+@test_utils.test(exclude=[ti.opengl, ti.cc, ti.vulkan, ti.metal])
 def test_dense_dynamic():
     n = 128