Skip to content

Commit

Permalink
graph: backend: compiler: fusion: skip barrier removal for some cases…
Browse files Browse the repository at this point in the history
… during parallel merge
  • Loading branch information
Yun-Fly authored and vpirogov committed Oct 17, 2023
1 parent bf12207 commit 57e14b5
Show file tree
Hide file tree
Showing 2 changed files with 170 additions and 76 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
#include <compiler/ir/transform/scope_flatten.hpp>
#include <compiler/ir/transform/tensor2var.hpp>
#include <compiler/ir/transform/tensor_inplace_info.hpp>
#include <compiler/ir/viewer.hpp>
#include <compiler/ir/visitor.hpp>
#include <ops/convolution.hpp>
#include <ops/fusible/memory_movement.hpp>
Expand Down Expand Up @@ -1752,6 +1753,39 @@ static size_t get_great_common_loop_size(const std::vector<for_loop> &loop_A,
return merged_loop_size;
}

/**
 * Viewer that looks for nested parallel for-loops. E.g.
 *  pfor(){
 *    tensor a; // optional
 *    pfor(){
 *    }
 *  }
 *
 * It counts every visited for-loop whose kind is `for_type::PARALLEL` and
 * whose `num_threads_` is positive. The counter is never reset, so each
 * query should use its own instance.
 * */
class nested_pfor_finder_t : public ir_viewer_t {
public:
    using ir_viewer_t::dispatch;
    using ir_viewer_t::view;

    // Expressions are handed back as-is instead of being visited.
    expr_c dispatch(expr_c v) override { return v; }

    // Records the loop when it qualifies as a pfor, then keeps walking into
    // it only while fewer than two pfors have been recorded.
    void view(for_loop_c f) override {
        const bool is_pfor
                = f->kind_ == for_type::PARALLEL && f->num_threads_ > 0;
        if (is_pfor) { ++seen_pfor_num_; }
        // Two pfors already answer the query, no need to go deeper.
        if (seen_pfor_num_ <= 1) { ir_viewer_t::view(f); }
    }

    // Walks the given loop and reports whether more than one pfor was
    // recorded on the way.
    bool operator()(for_loop_c v) {
        ir_viewer_t::dispatch(std::move(v));
        return seen_pfor_num_ > 1;
    }

private:
    // Number of qualifying parallel loops recorded so far.
    int seen_pfor_num_ = 0;
};

static bool try_merge_mixed_parti_parallel_inners(
mixed_parti_t *pa_to_merge, mixed_parti_t *parti_be_merged) {
pa_to_merge = pa_to_merge->get_root(),
Expand Down Expand Up @@ -1789,10 +1823,42 @@ static bool try_merge_mixed_parti_parallel_inners(
SC_MODULE_INFO << parti_be_merged->func_;

if (check_parti_dep(pa_to_merge, parti_be_merged) == parti_dep::no_dep) {
auto last_for = get_last_loop_in_body(
outer_loops_to_merge[merged_loop_size - 1]->body_);
if (last_for.defined()) {
last_for->attr()[stmt_attr_key::no_post_barrier] = true;
auto &to_merge_body = outer_loops_to_merge[merged_loop_size - 1]->body_;
/**
* The thread-shared buffer of the previous pfor and the thread-shared
* buffer of the next pfor may share the same memory location after
* buffer scheduling and hoist. After removal of barrier, some threads
* may work on the previous pfor and others may work on the next pfor,
* and they share the same memory location. This will cause a race
* condition. To avoid that, we need to check that:
* 1. there are no tensors defined in the immediate body of the merged
* pfor.
* 2. the buffers inside of the child pfor of the merged pfor must be
* the most inner loop (i.e. merged pfor must be second most inner
* loop). This is to avoid hoisting of the tensors. */
bool barrier_can_remove = true;
if (to_merge_body.isa<stmts>()) {
for (auto &s : to_merge_body.static_as<stmts>()->seq_) {
if (s.isa<define>()) {
// Case 1: if tensor node is defined
if (s.static_as<define>()->var_.isa<tensor>()) {
barrier_can_remove = false;
break;
}
} else if (s.isa<for_loop>()) {
// Case 2: nested pfor and potential hoist buffer
if (nested_pfor_finder_t()(s.static_as<for_loop>())) {
barrier_can_remove = false;
break;
}
}
}
}
if (barrier_can_remove) {
auto last_for = get_last_loop_in_body(to_merge_body);
if (last_for.defined()) {
last_for->attr()[stmt_attr_key::no_post_barrier] = true;
}
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1881,57 +1881,108 @@ TEST(GCCore_CPU_graph_mixed_partition_cpp, TestPrefetchSelected) {
EXPECT_TRUE(found);
}

TEST(GCCore_CPU_graph_mixed_partition_cpp, ParallelMergeAndNoBarrier) {
SET_THREADS_OR_SKIP(16);
int M = 256, K1 = 2048, N = 1024;
// loop finder
class loop_finder_t : public ir_viewer_t {
public:
using ir_viewer_t::dispatch;
using ir_viewer_t::view;
void operator()(stmt_c v) { ir_viewer_t::dispatch(std::move(v)); }
bool has_illegal_var() const { return illegal_loop_var_; }
bool has_dummy_range() const { return dummy_loop_range_; }
bool has_no_barrier_attr() const { return no_barrier_attr_; }
void view(for_loop_c f) override {
// check `var_` if var type
if (!f->var_.isa<var>()) { illegal_loop_var_ = true; }
// check loop range if dummy
if (f->iter_begin_.isa<constant>() && f->iter_end_.isa<constant>()
&& get_expr_as_int(f->iter_begin_) == 0
&& get_expr_as_int(f->iter_end_) == 1) {
dummy_loop_range_ = true;
}
if (f->attr_
&& f->attr_->get_or_else(
stmt_attr_key::no_post_barrier, false)) {
no_barrier_attr_ = true;
}
ir_viewer_t::view(f);
}

sc_graph_t graph;
auto input0 = graph.make_input(
{graph_tensor::make({M, K1}, sc_data_format_t(format_kinds::MK))});
auto weight0 = graph.make_input(
{graph_tensor::make({K1, N}, sc_data_format_t(format_kinds::KN))});
auto weight1 = graph.make_input(
{graph_tensor::make({K1, N}, sc_data_format_t(format_kinds::KN))});
private:
bool illegal_loop_var_ = false;
bool dummy_loop_range_ = false;
bool no_barrier_attr_ = false;
};

// mmm0
auto mmm0 = graph.make("managed_matmul_core",
{input0->get_outputs()[0], weight0->get_outputs()[0]}, {}, {});
{
ops::managed_matmul_core_config_t cfg = {2, 8, 1, 1, 1, 0};
TEST(GCCore_CPU_graph_mixed_partition_cpp, ParallelMergeAndBarrier) {
SET_THREADS_OR_SKIP(16);

auto make_parallel_mmm_graph = [](bool barrier) {
int M = 256, K1 = 2048, N = 1024;
sc_graph_t graph;
auto input0 = graph.make_input({graph_tensor::make(
{M, K1}, sc_data_format_t(format_kinds::MK))});
auto weight0 = graph.make_input({graph_tensor::make(
{K1, N}, sc_data_format_t(format_kinds::KN))});
auto weight1 = graph.make_input({graph_tensor::make(
{K1, N}, sc_data_format_t(format_kinds::KN))});
ops::managed_matmul_core_config_t cfg0 = {2, 8, 1, 1, 1, 0},
cfg1 = {2, 4, 1, 1, 1, 0};
// mmm0
auto mmm0 = graph.make("managed_matmul_core",
{input0->get_outputs()[0], weight0->get_outputs()[0]}, {}, {});
mmm0->dyn_cast<op_traits::configurable_t>()->set_config(
reflection::general_object_t::make(cfg));
}
// mmm1
auto mmm1 = graph.make("managed_matmul_core",
{input0->get_outputs()[0], weight1->get_outputs()[0]}, {}, {});
{
ops::managed_matmul_core_config_t cfg = {2, 4, 1, 1, 1, 0};
reflection::general_object_t::make(barrier ? cfg1 : cfg0));
// mmm1
auto mmm1 = graph.make("managed_matmul_core",
{input0->get_outputs()[0], weight1->get_outputs()[0]}, {}, {});
mmm1->dyn_cast<op_traits::configurable_t>()->set_config(
reflection::general_object_t::make(cfg));
}
auto out0 = graph.make_output({mmm0->get_outputs()[0]});
graph.make_output({mmm1->get_outputs()[0]});
reflection::general_object_t::make(cfg1));
graph.make_output({mmm0->get_outputs()[0]});
graph.make_output({mmm1->get_outputs()[0]});
return graph;
};

auto ctx = std::make_shared<context_t>(*get_test_ctx());
ctx->flags_.mixed_fusion_ = true;
// split outmost and merge inners
mixed_partition(graph, ctx);
auto mixed_op = get_mixed_op_from_graph(graph);
ASSERT_TRUE(mixed_op && mixed_op->parti_list_.size() == 1);
auto &body = mixed_op->parti_list_[0]->func_->body_;
auto inner_loop = body.cast<stmts>()
.map([](const stmts &v) {
return v->seq_.at(0).as<for_loop>();
})
.map([](const for_loop &v) {
return v->body_.as<stmts>()
->seq_.at(0)
.as<for_loop>();
})
.get_or_else(for_loop());
ASSERT_TRUE(inner_loop.defined());
ASSERT_TRUE(inner_loop->attr().get_or_else(
stmt_attr_key::no_post_barrier, false));
bool barrier;
{
/* Case 0: need barrier */
barrier = true;
auto graph = make_parallel_mmm_graph(barrier);
mixed_partition(graph, ctx);
auto mixed_op = get_mixed_op_from_graph(graph);
ASSERT_TRUE(mixed_op && mixed_op->parti_list_.size() == 1);
auto &inner_body
= mixed_op->parti_list_[0]->get_outer_loops().back()->body_;
// loop attr finder
loop_finder_t la_finder;
la_finder(inner_body);
// `no_post_barrier attr` is not expected
ASSERT_FALSE(la_finder.has_no_barrier_attr());
}
{
/* Case 1: remove barrier */
barrier = false;
auto graph = make_parallel_mmm_graph(barrier);
mixed_partition(graph, ctx);
auto mixed_op = get_mixed_op_from_graph(graph);
ASSERT_TRUE(mixed_op && mixed_op->parti_list_.size() == 1);
auto &body = mixed_op->parti_list_[0]->func_->body_;
auto inner_loop = body.cast<stmts>()
.map([](const stmts &v) {
return v->seq_.at(0).as<for_loop>();
})
.map([](const for_loop &v) {
return v->body_.as<stmts>()
->seq_.at(0)
.as<for_loop>();
})
.get_or_else(for_loop());
ASSERT_TRUE(inner_loop.defined());
// `no_post_barrier` attr is expected
ASSERT_TRUE(inner_loop->attr().get_or_else(
stmt_attr_key::no_post_barrier, false));
}
}

TEST(GCCore_CPU_graph_mixed_partition_cpp, ParallelMergeNotAppendInputAnchor) {
Expand Down Expand Up @@ -2149,31 +2200,6 @@ TEST(GCCore_CPU_graph_mixed_partition_cpp, CleanFusibleInnerLoop1) {
stmt_attr_key::merge_loop, false));
}

// Test helper: visits for-loops and remembers whether any of them has a loop
// variable that is not a `var` node, or constant bounds equal to begin 0 and
// end 1.
class loop_finder_t : public ir_viewer_t {
public:
    using ir_viewer_t::dispatch;
    using ir_viewer_t::view;

    // Runs the viewer over the given statement.
    void operator()(stmt_c v) { ir_viewer_t::dispatch(std::move(v)); }

    void view(for_loop_c f) override {
        // the loop variable is expected to be a `var` node
        illegal_var_seen_ |= !f->var_.isa<var>();
        // a dummy range needs constant bounds that are exactly 0 and 1
        const bool const_bounds = f->iter_begin_.isa<constant>()
                && f->iter_end_.isa<constant>();
        if (const_bounds && get_expr_as_int(f->iter_begin_) == 0
                && get_expr_as_int(f->iter_end_) == 1) {
            dummy_range_seen_ = true;
        }
        ir_viewer_t::view(f);
    }

    // True once a visited loop had a non-`var` loop variable.
    bool has_illegal_var() const { return illegal_var_seen_; }
    // True once a visited loop had constant bounds begin == 0 and end == 1.
    bool has_dummy_range() const { return dummy_range_seen_; }

private:
    bool illegal_var_seen_ = false;
    bool dummy_range_seen_ = false;
};

TEST(GCCore_CPU_graph_mixed_partition_cpp, CleanFusibleInnerLoop2) {
SET_THREADS_OR_SKIP(1);
int BS = 1, H = 8, W = 8, C = 64;
Expand All @@ -2196,6 +2222,7 @@ TEST(GCCore_CPU_graph_mixed_partition_cpp, CleanFusibleInnerLoop2) {
auto mixed_op = get_mixed_op_from_graph(graph);
ASSERT_TRUE(mixed_op && mixed_op->parti_list_.size() == 1);
auto &func = mixed_op->parti_list_[0]->func_;
// loop var finder
loop_finder_t lv_finder;
lv_finder(func->body_);
// All loop var should be `var` type. `for 0 in (0, 1, 1)` is not expected.
Expand Down Expand Up @@ -2230,10 +2257,11 @@ TEST(GCCore_CPU_graph_mixed_partition_cpp, CleanFusibleInnerLoop3) {
auto mixed_op = get_mixed_op_from_graph(graph);
ASSERT_TRUE(mixed_op && mixed_op->parti_list_.size() == 1);
auto &func = mixed_op->parti_list_[0]->func_;
loop_finder_t lv_finder;
lv_finder(func->body_);
// loop range finder
loop_finder_t lr_finder;
lr_finder(func->body_);
// All loop ranges should not be dummy like (0, 1, 1)
EXPECT_FALSE(lv_finder.has_dummy_range());
EXPECT_FALSE(lr_finder.has_dummy_range());
}

TEST(GCCore_CPU_graph_mixed_partition_cpp, PoolingLoopReSchedule) {
Expand Down

0 comments on commit 57e14b5

Please sign in to comment.