taichi-dev · yuanming-hu · Jul 6, 2020 · Jul 6, 2020 · Jul 6, 2020
diff --git a/misc/benchmark_reduction_tmps.py b/misc/benchmark_reduction_tmps.py
@@ -0,0 +1,57 @@
+import taichi as ti
+import time
+from pytest import approx
+
+# Micro-benchmark: sum a large 1-D f32 field into a kernel-local accumulator
+# that is returned from the kernel, then check the result against the known
+# total.
+# TODO: make this a real benchmark and set up regression
+# TODO: merge this file into benchmark_reduction.py
+ti.init(arch=ti.gpu,
+        print_ir=True,
+        print_kernel_llvm_ir=True,
+        kernel_profiler=True,
+        print_kernel_llvm_ir_optimized=True)
+
+# Number of f32 elements to reduce (128 Mi).
+N = 1024 * 1024 * 128
+
+a = ti.var(ti.f32, shape=N)
+
+
+# Set every element of `a` to 1.0, so the expected sum of `a` is exactly N.
+@ti.kernel
+def fill():
+    ti.block_dim(128)
+    for i in a:
+        a[i] = 1.0
+
+
+# Return the sum of all elements of `a`, accumulated into the local `s`.
+@ti.kernel
+def reduce() -> ti.f32:
+    s = 0.0
+    ti.block_dim(1024)
+    for i in a:
+        s += a[i]
+    return s
+
+
+fill()
+
+num_runs = 10
+# Invoke it here to get the kernel compiled
+reduce()
+
+# perf_counter() is monotonic and has the highest available resolution, so
+# the measurement cannot be skewed by system clock adjustments.
+start = time.perf_counter()
+got = 0.0
+for _ in range(num_runs):
+    got += reduce()
+duration = time.perf_counter() - start
+print(f'duration={duration:.2e}s average={(duration / num_runs):.2e}s')
+
+# Every run should return N, so the accumulated total is N * num_runs.
+ground_truth = float(N * num_runs)
+assert got == approx(ground_truth, 1e-4)
diff --git a/taichi/transforms/make_thread_local.cpp b/taichi/transforms/make_thread_local.cpp
@@ -1,6 +1,11 @@
+#include <algorithm>
+#include <functional>
+#include <iterator>
+#include <type_traits>
+
+#include "taichi/ir/analysis.h"
 #include "taichi/ir/ir.h"
 #include "taichi/ir/transforms.h"
-#include "taichi/ir/analysis.h"
 #include "taichi/ir/visitors.h"
 
 TLANG_NAMESPACE_BEGIN
@@ -11,22 +16,25 @@ bool is_atomic_op_linear(AtomicOpType op_type) {
   return op_type == AtomicOpType::add || op_type == AtomicOpType::sub;
 }
 
-void make_thread_local_offload(OffloadedStmt *offload) {
-  // TODO: deal with struct for
-  if (offload->task_type != offload->range_for)
-    return;
-
+// Find the destinations of linear (add/sub) global atomic reductions that can
+// be demoted into a thread-local storage (TLS) buffer.
+template <typename T>
+std::vector<T *> find_global_reduction_destinations(
+    OffloadedStmt *offload,
+    const std::function<bool(T *)> &dest_checker) {
+  static_assert(std::is_same_v<T, GlobalPtrStmt> ||
+                std::is_same_v<T, GlobalTemporaryStmt>);
   // Gather all atomic adds/subs destinations
   // We use std::vector instead of std::set to keep an deterministic order here.
-  std::vector<GlobalPtrStmt *> atomic_destinations;
+  std::vector<T *> atomic_destinations;
   // TODO: this is again an abuse since it gathers nothing. Need to design a IR
   // map/reduce system
   auto linear_atomics =
       irpass::analysis::gather_statements(offload, [&](Stmt *stmt) {
         if (auto atomic_op = stmt->cast<AtomicOpStmt>()) {
           if (is_atomic_op_linear(atomic_op->op_type)) {
             // Local or global tmp atomics does not count
-            if (auto dest = atomic_op->dest->cast<GlobalPtrStmt>()) {
+            if (auto dest = atomic_op->dest->cast<T>()) {
               if (std::find(atomic_destinations.begin(),
                             atomic_destinations.end(),
                             dest) == atomic_destinations.end()) {
@@ -38,8 +46,7 @@ void make_thread_local_offload(OffloadedStmt *offload) {
         return false;
       });
 
-  std::vector<GlobalPtrStmt *> valid_reduction_values;
-
+  std::vector<T *> valid_reduction_values;
   for (auto dest : atomic_destinations) {
     // check if there is any other global load/store/atomic operations
     auto related_global_mem_ops =
@@ -70,13 +77,35 @@ void make_thread_local_offload(OffloadedStmt *offload) {
                          // destination
         });
     TI_ASSERT(dest->width() == 1);
-    // We can only optimized reductions to global ptrs with form like loss[None]
-    // (0-D tensors) for now
-    if (related_global_mem_ops.empty() &&
-        dest->snodes[0]->type == SNodeType::place && dest->indices.empty()) {
+    if (related_global_mem_ops.empty() && dest_checker(dest)) {
       valid_reduction_values.push_back(dest);
     }
   }
+  return valid_reduction_values;
+}
+
+void make_thread_local_offload(OffloadedStmt *offload) {
+  // TODO: deal with struct for
+  if (offload->task_type != offload->range_for)
+    return;
+
+  std::vector<Stmt *> valid_reduction_values;
+  {
+    auto valid_global_ptrs = find_global_reduction_destinations<GlobalPtrStmt>(
+        offload, [](auto *dest) {
+          // We can only optimize reductions to global ptrs of the form
+          // loss[None] (0-D tensors) for now
+          return (dest->snodes[0]->type == SNodeType::place) &&
+                 dest->indices.empty();
+        });
+    auto valid_global_tmps =
+        find_global_reduction_destinations<GlobalTemporaryStmt>(
+            offload, [](auto *) { return true; });
+    std::copy(valid_global_ptrs.begin(), valid_global_ptrs.end(),
+              std::back_inserter(valid_reduction_values));
+    std::copy(valid_global_tmps.begin(), valid_global_tmps.end(),
+              std::back_inserter(valid_reduction_values));
+  }
 
   std::size_t tls_offset = 0;
 

diff --git a/tests/python/test_reduction.py b/tests/python/test_reduction.py
@@ -18,11 +18,20 @@ def reduce():
         for i in a:
             tot[None] += a[i]
 
+    @ti.kernel
+    def reduce_tmp() -> dtype:
+        s = tot[None] * 0  # Hack: a zero derived from tot[None], so |s| gets the correct type
+        for i in a:
+            s += a[i]  # accumulate into the kernel-local |s| instead of a field
+        return s
+
     fill()
     reduce()
+    tot2 = reduce_tmp()
 
     ground_truth = N * (N - 1) / 2
     assert criterion(tot[None], ground_truth)
+    assert criterion(tot2, ground_truth)
 
 
 @ti.all_archs