From e60192fffa767bc35f65fd1bbdbf875d1707fc12 Mon Sep 17 00:00:00 2001 From: Ye Kuang Date: Mon, 30 Mar 2020 22:06:25 +0900 Subject: [PATCH 1/4] [metal] Use grid-stride loop to implement listgen kernels Also added KernelAttributes::debug_string() method --- taichi/backends/metal/kernel_manager.cpp | 2 ++ taichi/backends/metal/kernel_util.cpp | 34 +++++++++++++++++++ taichi/backends/metal/kernel_util.h | 3 ++ .../metal/shaders/runtime_kernels.metal.h | 32 +++++++++++------ taichi/codegen/codegen_metal.cpp | 11 +++--- taichi/ir/ir.cpp | 22 ++++++++++-- taichi/ir/statements.h | 5 ++- tests/python/test_bitmasked.py | 30 ++++++++++++++++ 8 files changed, 119 insertions(+), 20 deletions(-) diff --git a/taichi/backends/metal/kernel_manager.cpp b/taichi/backends/metal/kernel_manager.cpp index f8b7320cd4322..c307e82c153cc 100644 --- a/taichi/backends/metal/kernel_manager.cpp +++ b/taichi/backends/metal/kernel_manager.cpp @@ -237,6 +237,8 @@ class CompiledTaichiKernel { TI_ASSERT(kernel != nullptr); compiled_mtl_kernels.push_back(std::move(kernel)); + TI_DEBUG("Added {} for Taichi kernel {}", ka.debug_string(), + params.taichi_kernel_name); } if (args_attribs.has_args()) { args_mem = std::make_unique(args_attribs.total_bytes(), diff --git a/taichi/backends/metal/kernel_util.cpp b/taichi/backends/metal/kernel_util.cpp index 431f7555ce227..02d0429d8a9a1 100644 --- a/taichi/backends/metal/kernel_util.cpp +++ b/taichi/backends/metal/kernel_util.cpp @@ -1,5 +1,7 @@ #include "taichi/backends/metal/kernel_util.h" +#include + #define TI_RUNTIME_HOST #include "taichi/runtime/llvm/context.h" #undef TI_RUNTIME_HOST @@ -8,6 +10,38 @@ TLANG_NAMESPACE_BEGIN namespace metal { +// static +std::string KernelAttributes::buffers_name(Buffers b) { +#define REGISTER_NAME(x) \ + { Buffers::x, #x } + const static std::unordered_map m = { + REGISTER_NAME(Root), + REGISTER_NAME(GlobalTmps), + REGISTER_NAME(Args), + REGISTER_NAME(Runtime), + }; +#undef REGISTER_NAME + return m.find(b)->second; +} + +std::string KernelAttributes::debug_string() const { + std::string result; + result += fmt::format( + "id); + } + result += ">"; + return result; +} + KernelArgsAttributes::KernelArgsAttributes(const std::vector &args) : args_bytes_(0), extra_args_bytes_(Context::extra_args_size) { arg_attribs_vec_.reserve(args.size()); diff --git a/taichi/backends/metal/kernel_util.h b/taichi/backends/metal/kernel_util.h index 6eae96138ca15..d048305323bca 100644 --- a/taichi/backends/metal/kernel_util.h +++ b/taichi/backends/metal/kernel_util.h @@ -58,6 +58,9 @@ struct KernelAttributes { RangeForAttributes range_for_attribs; // clear_list + listgen RuntimeListOpAttributes runtime_list_op_attribs; + + static std::string buffers_name(Buffers b); + std::string debug_string() const; }; // Note that all Metal kernels belonging to the same Taichi kernel will share diff --git a/taichi/backends/metal/shaders/runtime_kernels.metal.h b/taichi/backends/metal/shaders/runtime_kernels.metal.h index 6d9a647d35ced..0343036b34758 100644 --- a/taichi/backends/metal/shaders/runtime_kernels.metal.h +++ b/taichi/backends/metal/shaders/runtime_kernels.metal.h @@ -58,7 +58,8 @@ STR( kernel void element_listgen(device byte *runtime_addr [[buffer(0)]], device byte *root_addr [[buffer(1)]], device int *args [[buffer(2)]], - const uint utid_ [[thread_position_in_grid]]) { + const uint utid_ [[thread_position_in_grid]], + const uint grid_size [[threads_per_grid]]) { device Runtime *runtime = reinterpret_cast(runtime_addr); device byte *list_data_addr = @@ -72,20 +73,29 @@ STR( const SNodeMeta child_meta = runtime->snode_metas[child_snode_id]; const int child_stride = child_meta.element_stride; const int num_slots = child_meta.num_slots; - if ((int)utid_ >= num_active(parent_list)) { - return; - } - const auto parent_elem = - get(parent_list, utid_, list_data_addr); - for (int i = 0; i < num_slots; ++i) { + const int range = max( + (int)((child_list->max_num_elems + grid_size - 1) / grid_size), 1); + const int begin = range * (int)utid_; + + for (int ii = begin; ii < (begin + range); ++ii) { + const int parent_idx = (ii / num_slots); + if (parent_idx >= num_active(parent_list)) { + // Since |parent_idx| increases monotonically, we can return directly + // once it goes beyond the number of active parent elements. + return; + } + const int child_idx = (ii % num_slots); + const auto parent_elem = + get(parent_list, parent_idx, list_data_addr); ListgenElement child_elem; child_elem.root_mem_offset = parent_elem.root_mem_offset + - i * child_stride + + child_idx * child_stride + child_meta.mem_offset_in_parent; - if (is_active(root_addr + child_elem.root_mem_offset, child_meta, i)) { + if (is_active(root_addr + child_elem.root_mem_offset, child_meta, + child_idx)) { refine_coordinates(parent_elem, - runtime->snode_extractors[child_snode_id], i, - &child_elem); + runtime->snode_extractors[child_snode_id], + child_idx, &child_elem); append(child_list, child_elem, list_data_addr); } } diff --git a/taichi/codegen/codegen_metal.cpp b/taichi/codegen/codegen_metal.cpp index 9688c10a450fb..49a04b64f0633 100644 --- a/taichi/codegen/codegen_metal.cpp +++ b/taichi/codegen/codegen_metal.cpp @@ -680,11 +680,12 @@ class KernelCodegen : public IRVisitor { ka.num_threads = 1; ka.buffers = {BuffersEnum::Runtime, BuffersEnum::Args}; } else if (type == Type::listgen) { - // This launches |total_num_elems_from_root| number of threads, which - // could be a huge waste of GPU resources. - // TODO(k-ye): use grid-stride loop to reduce #threads. - ka.num_threads = compiled_structs_->snode_descriptors.find(sn->id) - ->second.total_num_elems_from_root; + // listgen kernels use grid-stride loops, so that we can cap its maximum + // number of threads at 1M. + ka.num_threads = + std::min(compiled_structs_->snode_descriptors.find(sn->id) + ->second.total_num_elems_from_root, + 1024 * 1024); ka.buffers = {BuffersEnum::Runtime, BuffersEnum::Root, BuffersEnum::Args}; } else { TI_ERROR("Unsupported offload task type {}", stmt->task_name()); diff --git a/taichi/ir/ir.cpp b/taichi/ir/ir.cpp index ccc25c16a0057..011aef142d4cf 100644 --- a/taichi/ir/ir.cpp +++ b/taichi/ir/ir.cpp @@ -1,9 +1,12 @@ // Intermediate representations -#include "ir.h" -#include +#include "taichi/ir/ir.h" + #include -#include "frontend.h" +#include +#include + +#include "taichi/ir/frontend.h" TLANG_NAMESPACE_BEGIN @@ -531,4 +534,17 @@ std::string OffloadedStmt::task_name() const { } } +// static +std::string OffloadedStmt::task_type_name(TaskType tt) { +#define REGISTER_NAME(x) \ + { TaskType::x, #x } + const static std::unordered_map m = { + REGISTER_NAME(serial), REGISTER_NAME(range_for), + REGISTER_NAME(struct_for), REGISTER_NAME(clear_list), + REGISTER_NAME(listgen), REGISTER_NAME(gc), + }; +#undef REGISTER_NAME + return m.find(tt)->second; +} + TLANG_NAMESPACE_END diff --git a/taichi/ir/statements.h b/taichi/ir/statements.h index 2052ec370f7ae..a45778cf23d26 100644 --- a/taichi/ir/statements.h +++ b/taichi/ir/statements.h @@ -1,5 +1,6 @@ #pragma once -#include "ir.h" + +#include "taichi/ir/ir.h" TLANG_NAMESPACE_BEGIN @@ -190,6 +191,8 @@ class OffloadedStmt : public Stmt { std::string task_name() const; + static std::string task_type_name(TaskType tt); + bool has_body() const { return task_type != clear_list && task_type != listgen && task_type != gc; } diff --git a/tests/python/test_bitmasked.py b/tests/python/test_bitmasked.py index 2f757534aa0ea..85f12a6215a10 100644 --- a/tests/python/test_bitmasked.py +++ b/tests/python/test_bitmasked.py @@ -82,3 +82,33 @@ def func(): func() assert s[None] == 4 + + +@archs_support_bitmasked +def test_huge_bitmasked(): + # Mainly for testing Metal listgen's grid-stride loop implementation. + x = ti.var(ti.f32) + s = ti.var(ti.i32) + + n = 1024 + + ti.root.bitmasked(ti.i, n).bitmasked(ti.i, 2 * n).place(x) + ti.root.place(s) + + @ti.kernel + def func(): + for i in range(n * n * 2): + if i % 32 == 0: + x[i] = 1.0 + + @ti.kernel + def count(): + for i in x: + s[None] += 1 + + func() + count() + assert s[None] == (n * n * 2) // 32 + + +test_huge_bitmasked() From 12cda6e994a306dc637c8d3a776f15e06fc8398a Mon Sep 17 00:00:00 2001 From: Yuanming Hu Date: Mon, 30 Mar 2020 18:57:41 -0400 Subject: [PATCH 2/4] Update test_bitmasked.py --- tests/python/test_bitmasked.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/python/test_bitmasked.py b/tests/python/test_bitmasked.py index 85f12a6215a10..57c640f3057db 100644 --- a/tests/python/test_bitmasked.py +++ b/tests/python/test_bitmasked.py @@ -109,6 +109,3 @@ def count(): func() count() assert s[None] == (n * n * 2) // 32 - - -test_huge_bitmasked() From 0d5a1800854783c35b4978e9df9445dd597ddb6b Mon Sep 17 00:00:00 2001 From: Taichi Gardener Date: Mon, 30 Mar 2020 18:58:04 -0400 Subject: [PATCH 3/4] [skip ci] enforce code format --- .../metal/shaders/runtime_kernels.metal.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/taichi/backends/metal/shaders/runtime_kernels.metal.h b/taichi/backends/metal/shaders/runtime_kernels.metal.h index 0343036b34758..5bd15cacf8db1 100644 --- a/taichi/backends/metal/shaders/runtime_kernels.metal.h +++ b/taichi/backends/metal/shaders/runtime_kernels.metal.h @@ -43,9 +43,9 @@ struct Runtime { METAL_BEGIN_RUNTIME_KERNELS_DEF STR( // clang-format on - kernel void clear_list(device byte *runtime_addr [[buffer(0)]], - device int *args [[buffer(1)]], - const uint utid_ [[thread_position_in_grid]]) { + kernel void clear_list(device byte *runtime_addr[[buffer(0)]], + device int *args[[buffer(1)]], + const uint utid_[[thread_position_in_grid]]) { if (utid_ > 0) return; int child_snode_id = args[1]; @@ -55,11 +55,11 @@ STR( clear(child_list); } - kernel void element_listgen(device byte *runtime_addr [[buffer(0)]], - device byte *root_addr [[buffer(1)]], - device int *args [[buffer(2)]], - const uint utid_ [[thread_position_in_grid]], - const uint grid_size [[threads_per_grid]]) { + kernel void element_listgen(device byte *runtime_addr[[buffer(0)]], + device byte *root_addr[[buffer(1)]], + device int *args[[buffer(2)]], + const uint utid_[[thread_position_in_grid]], + const uint grid_size[[threads_per_grid]]) { device Runtime *runtime = reinterpret_cast(runtime_addr); device byte *list_data_addr = From b5a3256fb59560f28ee6c8a3dd56ff8afc6e9cfc Mon Sep 17 00:00:00 2001 From: Ye Kuang Date: Tue, 31 Mar 2020 20:12:52 +0900 Subject: [PATCH 4/4] reduce size --- taichi/codegen/codegen_metal.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/taichi/codegen/codegen_metal.cpp b/taichi/codegen/codegen_metal.cpp index 49a04b64f0633..cc1f14162c369 100644 --- a/taichi/codegen/codegen_metal.cpp +++ b/taichi/codegen/codegen_metal.cpp @@ -685,7 +685,7 @@ class KernelCodegen : public IRVisitor { ka.num_threads = std::min(compiled_structs_->snode_descriptors.find(sn->id) ->second.total_num_elems_from_root, - 1024 * 1024); + 64 * 1024); ka.buffers = {BuffersEnum::Runtime, BuffersEnum::Root, BuffersEnum::Args}; } else { TI_ERROR("Unsupported offload task type {}", stmt->task_name());