Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[LANG/PASS] Virtual thread support #38

Merged
merged 1 commit into from
Feb 11, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions include/tvm/ir.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,30 @@ struct Reduce : public ExprNode<Reduce> {
static constexpr const char* Min = "Min";
};

/*! \brief namespace of possible attributes in AttrStmt.type_key */
namespace attr {
/*!
 * \brief Mark scope of iteration variable, used by Schedule.
 */
constexpr const char* scope = "scope";
/*!
 * \brief Mark launching extent of thread, used by device API.
 */
constexpr const char* thread_extent = "thread_extent";
/*!
 * \brief Mark launching of a virtual thread.
 */
constexpr const char* virtual_thread = "virtual_thread";
/*!
 * \brief Mark storage scope of buffers.
 */
constexpr const char* storage_scope = "storage_scope";
/*!
 * \brief Mark storage scope of realizations.
 */
constexpr const char* realize_scope = "realize_scope";
}  // namespace attr

/*! \brief namespace of TVM Intrinsic functions */
namespace intrinsic {
// Most of the intrinsics is to enab
Expand Down
1 change: 1 addition & 0 deletions include/tvm/ir_mutator.h
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ class IRMutator {
virtual Stmt Mutate_(const Store* op, const Stmt& s);
virtual Stmt Mutate_(const Free* op, const Stmt& s);
virtual Stmt Mutate_(const IfThenElse* op, const Stmt& s);
virtual Stmt Mutate_(const Block* op, const Stmt& s);
virtual Expr Mutate_(const Call* op, const Expr& e);
virtual Expr Mutate_(const Load* op, const Expr& s);
virtual Expr Mutate_(const Variable* op, const Expr& e);
Expand Down
20 changes: 20 additions & 0 deletions include/tvm/ir_pass.h
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,7 @@ Stmt Inline(Stmt stmt,
 * \param stmt The stmt to be transformed.
* \param extern_buffer Map specifies external
* buffer assignment of input and outputs.
* \return Transformed stmt.
*/
Stmt StorageFlatten(Stmt stmt,
Map<Tensor, Buffer> extern_buffer);
Expand All @@ -108,15 +109,34 @@ Stmt StorageFlatten(Stmt stmt,
* \brief unroll the constant loops
 * \param stmt The statement to be unrolled.
* \param max_auto_step The maximum step to stop performing automatic unrolling.
* \return Transformed stmt.
*/
Stmt UnrollLoop(Stmt stmt, int max_auto_step);

/*!
* \brief vectorize the constant loops
 * \param stmt The statement to be vectorized.
* \return Transformed stmt.
*/
Stmt VectorizeLoop(Stmt stmt);

/*!
* \brief Inject virtual thread loops into stmt.
 * \param stmt The statement to be transformed.
* \return Transformed stmt.
*/
Stmt InjectVirtualThread(Stmt stmt);

/*!
 * \brief Lift storage allocation to the relevant outermost location
*
* Only do this after vectorization and virtual thread injection completes.
*
 * \param stmt The stmt to be transformed
* \return Transformed stmt.
*/
Stmt LiftAllocate(Stmt stmt);

/*!
* \brief Make an user callable API LoweredFunc.
*
Expand Down
2 changes: 2 additions & 0 deletions python/tvm/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,8 @@ def build(sch,
stmt = ir_pass.StorageFlatten(stmt, binds)
stmt = ir_pass.CanonicalSimplify(stmt)
stmt = ir_pass.VectorizeLoop(stmt)
stmt = ir_pass.InjectVirtualThread(stmt)
stmt = ir_pass.LiftAllocate(stmt)
stmt = ir_pass.UnrollLoop(stmt, max_auto_unroll_step)
stmt = ir_pass.Simplify(stmt)
fapi = ir_pass.MakeAPI(stmt, name, arg_list, len(arg_list))
Expand Down
2 changes: 2 additions & 0 deletions src/api/api_pass.cc
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,8 @@ REGISTER_PASS2(UnrollLoop);
REGISTER_PASS2(StorageSync);
REGISTER_PASS4(MakeAPI);
REGISTER_PASS1(SplitHostDevice);
REGISTER_PASS1(LiftAllocate);
REGISTER_PASS1(InjectVirtualThread);

} // namespace ir
} // namespace tvm
3 changes: 2 additions & 1 deletion src/arithmetic/canonical.cc
Original file line number Diff line number Diff line change
Expand Up @@ -288,7 +288,8 @@ class Canonical::Internal : public IRMutator {
}
// AttrStmt
Stmt Mutate_(const AttrStmt* op, const Stmt& s) {
if (op->type_key == "thread_extent") {
if (op->type_key == attr::thread_extent ||
op->type_key == attr::virtual_thread) {
++level_counter_;
IterVar iv(op->node.node_);
CHECK_NE(iv->thread_tag.length(), 0U);
Expand Down
4 changes: 2 additions & 2 deletions src/codegen/codegen_c.cc
Original file line number Diff line number Diff line change
Expand Up @@ -743,7 +743,7 @@ void CodeGenC::PrintStmt(const Allocate* op) {
}

void CodeGenC::PrintStmt(const AttrStmt* op) {
if (op->type_key == "scope") {
if (op->type_key == ir::attr::thread_extent) {
IterVar iv(op->node.node_);
if (iv->thread_tag.length() != 0) {
if (!var_idmap_.count(iv->var.get())) {
Expand All @@ -756,7 +756,7 @@ void CodeGenC::PrintStmt(const AttrStmt* op) {
stream << ";\n";
}
}
} else if (op->type_key == "storage_scope") {
} else if (op->type_key == ir::attr::storage_scope) {
const Variable* v = op->node.as<Variable>();
CHECK(v);
alloc_storage_scope_[v] = op->value.as<StringImm>()->value;
Expand Down
12 changes: 12 additions & 0 deletions src/codegen/codegen_cuda.cc
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#include <string>
#include "./codegen_cuda.h"
#include "./codegen_stack_vm.h"
#include "../arithmetic/compute_expr.h"
#include "../runtime/cuda/cuda_common.h"
#include "../runtime/cuda/cuda_module.h"

Expand All @@ -22,6 +23,17 @@ std::string CodeGenCUDA::Compile(
return CodeGenC::Compile(f, output_ssa);
}

void CodeGenCUDA::PrintStmt(const ir::For* op) {
  // Loops are expected to be normalized so that they start at zero.
  CHECK(is_zero(op->min));
  // Emit "#pragma unroll" ahead of small constant-extent loops so the
  // CUDA compiler fully unrolls them while the generated source stays compact.
  int const_extent = 0;
  const bool emit_unroll_hint =
      arith::GetConstInt(op->extent, &const_extent) &&
      const_extent <= max_auto_unroll_;
  if (emit_unroll_hint) {
    PrintIndent();
    stream << "#pragma unroll\n";
  }
  // Delegate the actual loop printing to the C code generator.
  CodeGenC::PrintStmt(op);
}

void CodeGenCUDA::PrintType(Type t, std::ostream& os) const { // NOLINT(*)
int lanes = t.lanes();
if (t.is_handle()) {
Expand Down
6 changes: 6 additions & 0 deletions src/codegen/codegen_cuda.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ class CodeGenCUDA : public CodeGenC {
bool output_ssa);

// override behavior
void PrintStmt(const ir::For* op) final;
void PrintStorageSync(const std::string& sync) final;
void PrintStorageScope(const std::string& scope, std::ostream& os) final; // NOLINT(*)
void PrintVecBinaryOp(
Expand All @@ -37,6 +38,11 @@ class CodeGenCUDA : public CodeGenC {
const std::string& vec, Type t, int i, std::ostream& os) final; // NOLINT(*)
void PrintVecElemStore(
const std::string& vec, Type t, int i, const std::string& value) final;

private:
// magic number to add pragma unroll to it.
// used to generate code that is compact but still unrolls.
int max_auto_unroll_{8};
};

} // namespace codegen
Expand Down
Loading