[amdgpu] Enable struct for on amdgpu backend #7247

Merged: 21 commits, Feb 2, 2023
Changes from 8 commits
2 changes: 1 addition & 1 deletion external/assets
2 changes: 1 addition & 1 deletion taichi/codegen/amdgpu/codegen_amdgpu.cpp
@@ -294,7 +294,7 @@ class TaskCodeGenAMDGPU : public TaskCodeGenLLVM {
} else if (stmt->task_type == Type::range_for) {
create_offload_range_for(stmt);
} else if (stmt->task_type == Type::struct_for) {
create_offload_struct_for(stmt, true);
create_offload_struct_for(stmt, "amdgpu");
} else if (stmt->task_type == Type::mesh_for) {
create_offload_mesh_for(stmt);
} else if (stmt->task_type == Type::listgen) {
2 changes: 1 addition & 1 deletion taichi/codegen/cpu/codegen_cpu.cpp
@@ -179,7 +179,7 @@ class TaskCodeGenCPU : public TaskCodeGenLLVM {
} else if (stmt->task_type == Type::struct_for) {
stmt->block_dim = std::min(stmt->snode->parent->max_num_elements(),
(int64)stmt->block_dim);
create_offload_struct_for(stmt);
create_offload_struct_for(stmt, "cpu");
} else if (stmt->task_type == Type::listgen) {
emit_list_gen(stmt);
} else if (stmt->task_type == Type::gc) {
2 changes: 1 addition & 1 deletion taichi/codegen/cuda/codegen_cuda.cpp
@@ -474,7 +474,7 @@ class TaskCodeGenCUDA : public TaskCodeGenLLVM {
} else if (stmt->task_type == Type::range_for) {
create_offload_range_for(stmt);
} else if (stmt->task_type == Type::struct_for) {
create_offload_struct_for(stmt, true);
create_offload_struct_for(stmt, "nvgpu");
} else if (stmt->task_type == Type::mesh_for) {
create_offload_mesh_for(stmt);
} else if (stmt->task_type == Type::listgen) {
2 changes: 1 addition & 1 deletion taichi/codegen/dx12/codegen_dx12.cpp
@@ -163,7 +163,7 @@ class TaskCodeGenLLVMDX12 : public TaskCodeGenLLVM {
} else if (stmt->task_type == Type::struct_for) {
stmt->block_dim = std::min(stmt->snode->parent->max_num_elements(),
(int64)stmt->block_dim);
create_offload_struct_for(stmt);
create_offload_struct_for(stmt, "dx12");
} else if (stmt->task_type == Type::listgen) {
emit_list_gen(stmt);
} else if (stmt->task_type == Type::gc) {
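Note on the four dispatch changes above (amdgpu, cpu, cuda, dx12): the old bool spmd argument of create_offload_struct_for becomes a backend tag string, so the shared lowering in codegen_llvm.cpp can select per-backend thread intrinsics instead of a single CUDA-only path. A minimal sketch of that selection, assuming an llvm::IRBuilder<> named builder; the helper name read_thread_idx is illustrative and not part of this PR:

#include <string>
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicsNVPTX.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"

// Illustrative helper: map the spmd tag onto the backend's thread-index intrinsic.
llvm::Value *read_thread_idx(llvm::IRBuilder<> &builder, const std::string &spmd) {
  if (spmd == "nvgpu")   // CUDA: PTX special register tid.x
    return builder.CreateIntrinsic(llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x, {}, {});
  if (spmd == "amdgpu")  // AMDGPU: work-item id within the workgroup
    return builder.CreateIntrinsic(llvm::Intrinsic::amdgcn_workitem_id_x, {}, {});
  return nullptr;        // "cpu" / "dx12": serial loop, no per-thread index
}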
20 changes: 17 additions & 3 deletions taichi/codegen/llvm/codegen_llvm.cpp
@@ -2024,7 +2024,7 @@ std::tuple<llvm::Value *, llvm::Value *> TaskCodeGenLLVM::get_range_for_bounds(
}

void TaskCodeGenLLVM::create_offload_struct_for(OffloadedStmt *stmt,
bool spmd) {
std::string spmd) {
using namespace llvm;
// TODO: instead of constructing tons of LLVM IR, writing the logic in
// runtime.cpp may be a cleaner solution. See
@@ -2126,13 +2126,27 @@ void TaskCodeGenLLVM::create_offload_struct_for(OffloadedStmt *stmt,

llvm::Value *thread_idx = nullptr, *block_dim = nullptr;

if (spmd) {
if (spmd == "nvgpu") {
thread_idx =
builder->CreateIntrinsic(Intrinsic::nvvm_read_ptx_sreg_tid_x, {}, {});
block_dim = builder->CreateIntrinsic(Intrinsic::nvvm_read_ptx_sreg_ntid_x,
{}, {});
builder->CreateStore(builder->CreateAdd(thread_idx, lower_bound),
loop_index);
} else if (spmd == "amdgpu") {
#ifdef TI_WITH_AMDGPU
thread_idx =
builder->CreateIntrinsic(Intrinsic::amdgcn_workitem_id_x, {}, {});
auto workgroup_dim_ = call(
"__ockl_get_local_size",
llvm::ConstantInt::get(llvm::Type::getInt32Ty(*llvm_context), 0));
block_dim = builder->CreateTrunc(workgroup_dim_,
llvm::Type::getInt32Ty(*llvm_context));
builder->CreateStore(builder->CreateAdd(thread_idx, lower_bound),
loop_index);
#else
TI_NOT_IMPLEMENTED
#endif
} else {
builder->CreateStore(lower_bound, loop_index);
}
@@ -2218,7 +2232,7 @@ void TaskCodeGenLLVM::create_offload_struct_for(OffloadedStmt *stmt,
// body tail: increment loop_index and jump to loop_test
builder->SetInsertPoint(body_tail_bb);

if (spmd) {
if (spmd == "nvgpu" || spmd == "amdgpu") {
create_increment(loop_index, block_dim);
} else {
create_increment(loop_index, tlctx->get_constant(1));
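For reference, the control flow this hunk emits for one leaf block, written as a plain C++ sketch rather than LLVM IR (illustrative only, not code from this PR): on "nvgpu" and "amdgpu" the thread index and block size come from the intrinsics above, while on serial backends they collapse to 0 and 1.

// Illustrative sketch of the generated loop shape.
void struct_for_leaf_sketch(int lower_bound, int upper_bound,
                            int thread_idx, int block_dim) {
  // GPU backends: each thread starts at its own offset and strides by the
  // block size; "cpu" / "dx12": thread_idx == 0 and the stride is 1.
  for (int loop_index = lower_bound + thread_idx; loop_index < upper_bound;
       loop_index += block_dim) {
    // ... visit the SNode element at loop_index (body elided) ...
  }
}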
2 changes: 1 addition & 1 deletion taichi/codegen/llvm/codegen_llvm.h
@@ -332,7 +332,7 @@ class TaskCodeGenLLVM : public IRVisitor, public LLVMModuleBuilder {
TI_NOT_IMPLEMENTED;
}

void create_offload_struct_for(OffloadedStmt *stmt, bool spmd = false);
void create_offload_struct_for(OffloadedStmt *stmt, std::string spmd);

void visit(LoopIndexStmt *stmt) override;

5 changes: 5 additions & 0 deletions taichi/codegen/llvm/llvm_codegen_utils.h
@@ -10,6 +10,11 @@
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsNVPTX.h"

#if defined(TI_WITH_AMDGPU)
#include "llvm/IR/IntrinsicsAMDGPU.h"
#endif

#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
60 changes: 14 additions & 46 deletions taichi/runtime/llvm/llvm_context.cpp
@@ -46,9 +46,8 @@
#include "llvm_context.h"
#include "taichi/runtime/program_impls/llvm/llvm_program.h"
#include "taichi/codegen/codegen_utils.h"
#ifdef TI_WITH_AMDGPU

#include "taichi/runtime/llvm/llvm_context_pass.h"
#endif

#ifdef _WIN32
// Travis CI seems doesn't support <filesystem>...
@@ -1033,52 +1032,21 @@ void TaichiLLVMContext::add_struct_for_func(llvm::Module *module,
if (module->getFunction(func_name)) {
return;
}
auto struct_for_func = module->getFunction("parallel_struct_for");
auto &llvm_context = module->getContext();
auto value_map = llvm::ValueToValueMapTy();
auto patched_struct_for_func =
llvm::CloneFunction(struct_for_func, value_map);
patched_struct_for_func->setName(func_name);

int num_found_alloca = 0;
llvm::AllocaInst *alloca = nullptr;

auto char_type = llvm::Type::getInt8Ty(llvm_context);

// Find the "1" in "char tls_buffer[1]" and replace it with
// "tls_buffer_size"
for (auto &bb : *patched_struct_for_func) {
for (llvm::Instruction &inst : bb) {
auto now_alloca = llvm::dyn_cast<AllocaInst>(&inst);
if (!now_alloca || now_alloca->getAlign().value() != 8)
continue;
auto alloca_type = now_alloca->getAllocatedType();
// Allocated type should be array [1 x i8]
if (alloca_type->isArrayTy() && alloca_type->getArrayNumElements() == 1 &&
alloca_type->getArrayElementType() == char_type) {
alloca = now_alloca;
num_found_alloca++;
}
}
}
// There should be **exactly** one replacement.
TI_ASSERT(num_found_alloca == 1 && alloca);
auto new_type = llvm::ArrayType::get(char_type, tls_size);
{
llvm::IRBuilder<> builder(alloca);
auto *new_alloca = builder.CreateAlloca(new_type);
new_alloca->setAlignment(Align(8));
TI_ASSERT(alloca->hasOneUse());
auto *gep = llvm::cast<llvm::GetElementPtrInst>(alloca->user_back());
TI_ASSERT(gep->getPointerOperand() == alloca);
std::vector<Value *> indices(gep->idx_begin(), gep->idx_end());
builder.SetInsertPoint(gep);
auto *new_gep = builder.CreateInBoundsGEP(new_type, new_alloca, indices);
gep->replaceAllUsesWith(new_gep);
gep->eraseFromParent();
alloca->eraseFromParent();
llvm::legacy::PassManager module_pass_manager;
if (config_->arch == Arch::amdgpu) {
#ifdef TI_WITH_AMDGPU
module_pass_manager.add(
new AMDGPUAddStructForFuncPass(func_name, tls_size));
module_pass_manager.run(*module);
#else
TI_NOT_IMPLEMENTED
#endif
} else {
module_pass_manager.add(new AddStructForFuncPass(func_name, tls_size));
module_pass_manager.run(*module);
}
}

std::string TaichiLLVMContext::get_struct_for_func_name(int tls_size) {
return "parallel_struct_for_" + std::to_string(tls_size);
}
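A hedged usage sketch of the two functions touched in this hunk (the call site and helper name below are illustrative and not part of this diff):

// Illustrative only: how codegen would request a parallel_struct_for clone
// whose TLS buffer matches the kernel's thread-local-storage size.
std::string request_struct_for_clone(TaichiLLVMContext *tlctx,
                                     llvm::Module *module, int tls_size) {
  tlctx->add_struct_for_func(module, tls_size);      // no-op if the clone already exists
  return tlctx->get_struct_for_func_name(tls_size);  // e.g. "parallel_struct_for_64" for tls_size == 64
}

On Arch::amdgpu the cloning is done by AMDGPUAddStructForFuncPass, otherwise by AddStructForFuncPass; both are defined in llvm_context_pass.h below.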
122 changes: 122 additions & 0 deletions taichi/runtime/llvm/llvm_context_pass.h
@@ -10,6 +10,8 @@
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include "llvm/Transforms/Utils/Cloning.h"

#if defined(TI_WITH_AMDGPU)
#include "taichi/rhi/amdgpu/amdgpu_context.h"
@@ -18,6 +20,63 @@
namespace taichi {
namespace lang {
using namespace llvm;

struct AddStructForFuncPass : public ModulePass {
static inline char ID{0};
std::string func_name_;
int tls_size_;
AddStructForFuncPass(std::string func_name, int tls_size) : ModulePass(ID) {
func_name_ = func_name;
tls_size_ = tls_size;
}
bool runOnModule(llvm::Module &M) override {
auto struct_for_func = M.getFunction("parallel_struct_for");
auto &llvm_context = M.getContext();
auto value_map = llvm::ValueToValueMapTy();
auto patched_struct_for_func =
llvm::CloneFunction(struct_for_func, value_map);
patched_struct_for_func->setName(func_name_);

int num_found_alloca = 0;
llvm::AllocaInst *alloca = nullptr;

auto char_type = llvm::Type::getInt8Ty(llvm_context);

// Find the "1" in "char tls_buffer[1]" and replace it with
// "tls_buffer_size"
for (auto &bb : *patched_struct_for_func) {
for (llvm::Instruction &inst : bb) {
auto now_alloca = llvm::dyn_cast<AllocaInst>(&inst);
if (!now_alloca || now_alloca->getAlign().value() != 8)
continue;
auto alloca_type = now_alloca->getAllocatedType();
// Allocated type should be array [1 x i8]
if (alloca_type->isArrayTy() &&
alloca_type->getArrayNumElements() == 1 &&
alloca_type->getArrayElementType() == char_type) {
alloca = now_alloca;
num_found_alloca++;
}
}
}
TI_ASSERT(num_found_alloca == 1 && alloca);
auto new_type = llvm::ArrayType::get(char_type, tls_size_);
llvm::IRBuilder<> builder(alloca);
auto *new_alloca = builder.CreateAlloca(new_type);
new_alloca->setAlignment(Align(8));
TI_ASSERT(alloca->hasOneUse());
auto *gep = llvm::cast<llvm::GetElementPtrInst>(alloca->user_back());
TI_ASSERT(gep->getPointerOperand() == alloca);
std::vector<Value *> indices(gep->idx_begin(), gep->idx_end());
builder.SetInsertPoint(gep);
auto *new_gep = builder.CreateInBoundsGEP(new_type, new_alloca, indices);
gep->replaceAllUsesWith(new_gep);
gep->eraseFromParent();
alloca->eraseFromParent();
return false;
}
};

#if defined(TI_WITH_AMDGPU)
struct AMDGPUConvertAllocaInstAddressSpacePass : public FunctionPass {
static inline char ID{0};
@@ -52,6 +111,69 @@ struct AMDGPUConvertAllocaInstAddressSpacePass : public FunctionPass {
}
};

struct AMDGPUAddStructForFuncPass : public ModulePass {
static inline char ID{0};
std::string func_name_;
int tls_size_;
AMDGPUAddStructForFuncPass(std::string func_name, int tls_size)
: ModulePass(ID) {
func_name_ = func_name;
tls_size_ = tls_size;
}
bool runOnModule(llvm::Module &M) override {
auto struct_for_func = M.getFunction("parallel_struct_for");
auto &llvm_context = M.getContext();
auto value_map = llvm::ValueToValueMapTy();
auto patched_struct_for_func =
llvm::CloneFunction(struct_for_func, value_map);
patched_struct_for_func->setName(func_name_);

int num_found_alloca = 0;
llvm::AllocaInst *alloca = nullptr;

auto char_type = llvm::Type::getInt8Ty(llvm_context);

// Find the "1" in "char tls_buffer[1]" and replace it with
// "tls_buffer_size"
for (auto &bb : *patched_struct_for_func) {
for (llvm::Instruction &inst : bb) {
auto now_alloca = llvm::dyn_cast<AllocaInst>(&inst);
if (!now_alloca || now_alloca->getAlign().value() != 8)
continue;
auto alloca_type = now_alloca->getAllocatedType();
// Allocated type should be array [1 x i8]
if (alloca_type->isArrayTy() &&
alloca_type->getArrayNumElements() == 1 &&
alloca_type->getArrayElementType() == char_type) {
alloca = now_alloca;
num_found_alloca++;
}
}
}
TI_ASSERT(num_found_alloca == 1 && alloca);
auto new_type = llvm::ArrayType::get(char_type, tls_size_);
llvm::IRBuilder<> builder(alloca);
auto *new_alloca = builder.CreateAlloca(new_type, (unsigned)5);
new_alloca->setAlignment(Align(8));
auto new_ty = llvm::PointerType::get(new_type, unsigned(0));
auto *new_cast = builder.CreateAddrSpaceCast(new_alloca, new_ty);
new_alloca->setAlignment(Align(8));
TI_ASSERT(alloca->hasOneUse());
auto *cast = llvm::cast<llvm::AddrSpaceCastInst>(alloca->user_back());
TI_ASSERT(cast->hasOneUse());
auto *gep = llvm::cast<llvm::GetElementPtrInst>(cast->user_back());
TI_ASSERT(gep->getPointerOperand() == cast);
std::vector<Value *> indices(gep->idx_begin(), gep->idx_end());
builder.SetInsertPoint(gep);
auto *new_gep = builder.CreateInBoundsGEP(new_type, new_cast, indices);
gep->replaceAllUsesWith(new_gep);
gep->eraseFromParent();
cast->eraseFromParent();
alloca->eraseFromParent();
return false;
}
};
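The amdgpu-specific wrinkle in the pass above: on amdgcn targets, allocas live in address space 5 (private memory), so the resized TLS buffer is created there and then addrspacecast back to the default address space before the GEPs are rebuilt against it. A minimal standalone sketch of just that alloca/cast pair, assuming a positioned IRBuilder and a byte count tls_size; the helper name is illustrative, not part of this PR:

#include "llvm/IR/IRBuilder.h"

// Illustrative helper mirroring the alloca + addrspacecast created above.
llvm::Value *create_amdgpu_tls_buffer(llvm::IRBuilder<> &builder, int tls_size) {
  auto *char_ty = llvm::Type::getInt8Ty(builder.getContext());
  auto *buf_ty = llvm::ArrayType::get(char_ty, tls_size);
  auto *buf = builder.CreateAlloca(buf_ty, /*AddrSpace=*/5);  // amdgcn private memory
  buf->setAlignment(llvm::Align(8));
  // Cast to the default address space expected by the cloned function's GEPs.
  return builder.CreateAddrSpaceCast(
      buf, llvm::PointerType::get(buf_ty, /*AddressSpace=*/0));
}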

struct AMDGPUConvertFuncParamAddressSpacePass : public ModulePass {
static inline char ID{0};
AMDGPUConvertFuncParamAddressSpacePass() : ModulePass(ID) {