Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Perf] Support TLS for GlobalTemporaryStmt #1423

Merged
merged 2 commits into from
Jul 6, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 48 additions & 0 deletions misc/benchmark_reduction_tmps.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
import taichi as ti
import time
from pytest import approx

# TODO: make this a real benchmark and set up regression
# TODO: merge this file into benchmark_reduction.py
# Initialize Taichi on a GPU backend. All of the print_* flags and the
# kernel profiler are switched on -- presumably so the IR / LLVM IR of the
# reduction kernel can be inspected and per-kernel timings collected while
# investigating performance (TODO confirm exact flag semantics in Taichi docs).
ti.init(arch=ti.gpu,
print_ir=True,
print_kernel_llvm_ir=True,
kernel_profiler=True,
print_kernel_llvm_ir_optimized=True)

# Number of elements in the benchmarked field (128 * 2**20).
N = 1024 * 1024 * 128

# 1-D f32 field with N elements; it is filled and then summed by the kernels
# defined in this script.
a = ti.var(ti.f32, shape=N)


# Kernel that writes 1.0 into every element of `a`, so that the expected sum
# of the field is known exactly.
@ti.kernel
def fill():
# Requests a block dimension of 128 -- presumably for the loop that follows
# (TODO confirm against ti.block_dim documentation).
ti.block_dim(128)
for i in a:
a[i] = 1.0


# Kernel that adds up every element of `a` into the kernel-local scalar `s`
# and returns the total as a ti.f32.
@ti.kernel
def reduce() -> ti.f32:
s = 0.0
# Requests a block dimension of 1024 -- presumably for the loop that follows
# (TODO confirm against ti.block_dim documentation).
ti.block_dim(1024)
# The accumulation into a kernel-local variable (rather than into a field
# element) is the pattern this benchmark measures.
for i in a:
s += a[i]
return s


# Populate `a` with ones before anything is timed.
fill()

num_runs = 10
# Invoke it here to get the kernel compiled, so that compilation time is not
# included in the measurement below.
reduce()

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short intervals; time() is wall-clock time and may jump if the system clock
# is adjusted while the benchmark is running.
start = time.perf_counter()
got = 0.0
for _ in range(num_runs):
    got += reduce()
duration = time.perf_counter() - start
print(f'duration={duration:.2e}s average={(duration / num_runs):.2e}s')

# fill() set every element to 1.0, so each reduce() call should return N and
# the accumulated total should be N * num_runs (within relative tolerance).
ground_truth = float(N * num_runs)
assert got == approx(ground_truth, 1e-4)
57 changes: 43 additions & 14 deletions taichi/transforms/make_thread_local.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
#include <algorithm>
#include <functional>
#include <iterator>
#include <type_traits>

#include "taichi/ir/analysis.h"
#include "taichi/ir/ir.h"
#include "taichi/ir/transforms.h"
#include "taichi/ir/analysis.h"
#include "taichi/ir/visitors.h"

TLANG_NAMESPACE_BEGIN
Expand All @@ -11,22 +16,25 @@ bool is_atomic_op_linear(AtomicOpType op_type) {
return op_type == AtomicOpType::add || op_type == AtomicOpType::sub;
}

void make_thread_local_offload(OffloadedStmt *offload) {
// TODO: deal with struct for
if (offload->task_type != offload->range_for)
return;

// Find the destinations of global atomic reductions that can be demoted into
// a TLS buffer.
template <typename T>
std::vector<T *> find_global_reduction_destinations(
OffloadedStmt *offload,
const std::function<bool(T *)> &dest_checker) {
static_assert(std::is_same_v<T, GlobalPtrStmt> ||
std::is_same_v<T, GlobalTemporaryStmt>);
// Gather all atomic adds/subs destinations
// We use std::vector instead of std::set to keep a deterministic order here.
std::vector<GlobalPtrStmt *> atomic_destinations;
std::vector<T *> atomic_destinations;
// TODO: this is again an abuse since it gathers nothing. Need to design an IR
// map/reduce system
auto linear_atomics =
irpass::analysis::gather_statements(offload, [&](Stmt *stmt) {
if (auto atomic_op = stmt->cast<AtomicOpStmt>()) {
if (is_atomic_op_linear(atomic_op->op_type)) {
// Local or global tmp atomics do not count
if (auto dest = atomic_op->dest->cast<GlobalPtrStmt>()) {
if (auto dest = atomic_op->dest->cast<T>()) {
if (std::find(atomic_destinations.begin(),
atomic_destinations.end(),
dest) == atomic_destinations.end()) {
Expand All @@ -38,8 +46,7 @@ void make_thread_local_offload(OffloadedStmt *offload) {
return false;
});

std::vector<GlobalPtrStmt *> valid_reduction_values;

std::vector<T *> valid_reduction_values;
for (auto dest : atomic_destinations) {
// check if there is any other global load/store/atomic operations
auto related_global_mem_ops =
Expand Down Expand Up @@ -70,13 +77,35 @@ void make_thread_local_offload(OffloadedStmt *offload) {
// destination
});
TI_ASSERT(dest->width() == 1);
// We can only optimize reductions to global ptrs with form like loss[None]
// (0-D tensors) for now
if (related_global_mem_ops.empty() &&
dest->snodes[0]->type == SNodeType::place && dest->indices.empty()) {
if (related_global_mem_ops.empty() && dest_checker(dest)) {
valid_reduction_values.push_back(dest);
}
}
return valid_reduction_values;
}

void make_thread_local_offload(OffloadedStmt *offload) {
// TODO: deal with struct for
if (offload->task_type != offload->range_for)
return;

std::vector<Stmt *> valid_reduction_values;
{
auto valid_global_ptrs = find_global_reduction_destinations<GlobalPtrStmt>(
offload, [](auto *dest) {
// We can only optimize reductions to global ptrs with form like
// loss[None] (0-D tensors) for now
return (dest->snodes[0]->type == SNodeType::place) &&
dest->indices.empty();
});
auto valid_global_tmps =
find_global_reduction_destinations<GlobalTemporaryStmt>(
offload, [](auto *) { return true; });
std::copy(valid_global_ptrs.begin(), valid_global_ptrs.end(),
std::back_inserter(valid_reduction_values));
std::copy(valid_global_tmps.begin(), valid_global_tmps.end(),
std::back_inserter(valid_reduction_values));
}

std::size_t tls_offset = 0;

Expand Down
9 changes: 9 additions & 0 deletions tests/python/test_reduction.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,20 @@ def reduce():
for i in a:
tot[None] += a[i]

# Kernel that sums `a` into the kernel-local scalar `s` and returns it,
# instead of accumulating into the field element `tot[None]`.
@ti.kernel
def reduce_tmp() -> dtype:
s = tot[None] * 0 # Hack to get |s| to the correct type...
for i in a:
s += a[i]
return s

fill()
reduce()
tot2 = reduce_tmp()

ground_truth = N * (N - 1) / 2
assert criterion(tot[None], ground_truth)
assert criterion(tot2, ground_truth)


@ti.all_archs
Expand Down