Skip to content

Commit

Permalink
[LLVMCPU] Enable tileDispatchUsingForall as default (#18777)
Browse files Browse the repository at this point in the history
  • Loading branch information
pashu123 authored Oct 25, 2024
1 parent e96e3c0 commit 3b751a4
Show file tree
Hide file tree
Showing 7 changed files with 38 additions and 40 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -202,13 +202,16 @@ static LogicalResult dropUnitDistributedDims(RewriterBase &rewriter,
llvm::SmallDenseSet<int> droppedLoops;
for (auto [index, lb, ub, step] :
llvm::enumerate(mixedLbs, mixedUbs, mixedSteps)) {
if (!isa<Attribute>(lb) || !isa<Attribute>(ub) || !isa<Attribute>(step)) {

std::optional<int64_t> lbVal = getConstantIntValue(lb);
std::optional<int64_t> ubVal = getConstantIntValue(ub);
std::optional<int64_t> stepVal = getConstantIntValue(step);

if (!(lbVal && ubVal && stepVal)) {
continue;
}
int64_t lbVal = getConstantIntValue(lb).value();
int64_t ubVal = getConstantIntValue(ub).value();
int64_t stepVal = getConstantIntValue(step).value();
if (CEILDIV(ubVal - lbVal, stepVal) == 1) {

if (CEILDIV(ubVal.value() - lbVal.value(), stepVal.value()) == 1) {
droppedLoops.insert(index);
}
}
Expand Down
28 changes: 10 additions & 18 deletions compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ static llvm::cl::opt<bool> clEnableVectorContractCustomKernels(
static llvm::cl::opt<bool> clTileDispatchUsingForall(
"iree-llvmcpu-tile-dispatch-using-forall",
llvm::cl::desc("Enable tile and distribute to workgroups using scf.forall"),
llvm::cl::init(false));
llvm::cl::init(true));

// By default, IREE does not enable the Armv9-A streaming SVE mode in the
// presence of scalable vectors (even when using `+sme`), as currently there's
Expand All @@ -111,9 +111,8 @@ static llvm::cl::opt<bool> clForceArmStreaming(
llvm::cl::init(false));

// TODO: Enable `TileDispatchUsingForall` for every pipeline.
static void addTileAndDistributePasses(OpPassManager &funcPassManager,
bool enableTileDispatchUsingForall) {
if (enableTileDispatchUsingForall || clTileDispatchUsingForall) {
static void addTileAndDistributePasses(OpPassManager &funcPassManager) {
if (clTileDispatchUsingForall) {
funcPassManager.addPass(
createTileAndDistributeToWorkgroupsUsingForallOpPass());
} else {
Expand Down Expand Up @@ -346,8 +345,7 @@ void buildLLVMCPUVectorLoweringPipeline(
void addCPUBufferOpsTileAndVectorizePipeline(
OpPassManager &funcPassManager, TilingConfig &tilingConfig,
LLVMCPUPipelineOptions &pipelineOpt) {
addTileAndDistributePasses(funcPassManager,
/*enableTileDispatchUsingForall=*/true);
addTileAndDistributePasses(funcPassManager);

// Skip tiling reduction loops because this is expected to apply on copy ops
// only.
Expand Down Expand Up @@ -384,8 +382,7 @@ void addCPUBufferOpsTileAndVectorizePipeline(
void addMultiTilingExpertPassPipeline(OpPassManager &funcPassManager,
TilingConfig &tilingConfig,
LLVMCPUPipelineOptions &pipelineOpt) {
addTileAndDistributePasses(funcPassManager,
/*enableTileDispatchUsingForall=*/true);
addTileAndDistributePasses(funcPassManager);

SmallVector<int64_t> allFusableLevels(tilingConfig.getFusableLevels());
// Apply tile and fuse to all the non-distribution fusable levels. Skip
Expand Down Expand Up @@ -464,8 +461,7 @@ void addMultiTilingExpertPassPipeline(OpPassManager &funcPassManager,
void addConvTileAndDecomposeExpertPassPipeline(
OpPassManager &funcPassManager, TilingConfig &tilingConfig,
LLVMCPUPipelineOptions &pipelineOpt) {
addTileAndDistributePasses(funcPassManager,
/*enableTileDispatchUsingForall=*/true);
addTileAndDistributePasses(funcPassManager);

// Run LLVMTileAndFuse firstly in case that we have fill + conv + generic
// ops. At this stage, we do not apply vectorization. The reduction dim won't
Expand Down Expand Up @@ -528,8 +524,7 @@ void addConvTileAndDecomposeExpertPassPipeline(
void addMmt4dTilingExpertPassPipeline(OpPassManager &funcPassManager,
TilingConfig &tilingConfig,
LLVMCPUPipelineOptions &pipelineOpt) {
addTileAndDistributePasses(funcPassManager,
/*enableTileDispatchUsingForall=*/true);
addTileAndDistributePasses(funcPassManager);

funcPassManager.addPass(createLLVMCPUTileAndFusePass(
static_cast<int64_t>(tilingConfig.getVectorCommonParallelLevel())));
Expand Down Expand Up @@ -577,8 +572,7 @@ void addMmt4dTilingExpertPassPipeline(OpPassManager &funcPassManager,
void addCPUDataTilingPipeline(OpPassManager &funcPassManager,
TilingConfig &tilingConfig,
LLVMCPUPipelineOptions &pipelineOpt) {
addTileAndDistributePasses(funcPassManager,
/*enableTileDispatchUsingForall=*/true);
addTileAndDistributePasses(funcPassManager);

// The below two passes are nop if pack/unpack is not specified in ukernels
// attribute. By default, they are disabled.
Expand Down Expand Up @@ -621,8 +615,7 @@ void addCPUDataTilingPipeline(OpPassManager &funcPassManager,
void addCPULinalgExtTileAndVectorizePipeline(
OpPassManager &funcPassManager, TilingConfig &tilingConfig,
LLVMCPUPipelineOptions &pipelineOpt) {
addTileAndDistributePasses(funcPassManager,
/*enableTileDispatchUsingForall=*/false);
addTileAndDistributePasses(funcPassManager);
funcPassManager.addPass(
createLLVMCPUTilePass(tilingConfig.getVectorCommonParallelLevel()));
// TODO: Remove the pass once we have PartialReductionOpInterface implemented
Expand Down Expand Up @@ -661,8 +654,7 @@ void addCPULinalgExtTileAndVectorizePipeline(
}

void addCPUDefaultPassPipeline(OpPassManager &funcPassManager) {
addTileAndDistributePasses(funcPassManager,
/*enableTileDispatchUsingForall=*/false);
addTileAndDistributePasses(funcPassManager);
addCPUBufferizePasses(funcPassManager);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -290,7 +290,7 @@ hal.executable private @main {
// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
// CHECK-DAG: %[[C720:.+]] = arith.constant 720 : index
// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index
// CHECK: scf.forall ({{.*}}) in (2, 4, 1, 5) {
// CHECK: scf.forall ({{.*}}) in (2, 4, 5) {
// CHECK: %[[LOOP:.+]] = scf.for %[[IV:.+]] = %[[C0]] to %[[C720]] step %[[C2]] {{.*}} -> (vector<1x4x1x4x4x1xf32>)
// CHECK: gpu.barrier
// CHECK-DAG: %[[LHS_RD:.+]] = vector.transfer_read %[[B0]]{{.*}} vector<8xf16>
Expand All @@ -307,7 +307,7 @@ hal.executable private @main {
// CHECK: %[[LOOP_T:.+]] = vector.transpose %[[LOOP]], [0, 1, 2, 4, 3, 5] : vector<1x4x1x4x4x1xf32> to vector<1x4x1x4x4x1xf32>
// CHECK: %[[EXTRACT:.+]] = vector.extract %[[LOOP_T]][0] : vector<4x1x4x4x1xf32> from vector<1x4x1x4x4x1xf32>
// CHECK: vector.transfer_write %[[EXTRACT]], %[[B2]]
// CHECK: } {mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
// CHECK: } {mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}

// -----

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,16 +9,17 @@

stream.executable private @__builtin_fill_i64 {
stream.executable.export public @__builtin_fill_i64 workgroups(%arg0: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @__builtin_fill_i64(%value: i64, %count: index, %out_binding: !stream.binding) {
%c0 = arith.constant 0 : index
%out = stream.binding.subspan %out_binding[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?xi64>>{%count}
%0 = tensor.empty(%count) : tensor<?xi64>
%count0 = flow.dispatch.workload.ordinal %count, 0 : index
%out = stream.binding.subspan %out_binding[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?xi64>>{%count0}
%0 = tensor.empty(%count0) : tensor<?xi64>
%1 = linalg.fill ins(%value : i64) outs(%0 : tensor<?xi64>) -> tensor<?xi64>
flow.dispatch.tensor.store %1, %out, offsets = [0], sizes = [%count], strides = [1] : tensor<?xi64> -> !flow.dispatch.tensor<writeonly:tensor<?xi64>>{%count}
flow.dispatch.tensor.store %1, %out, offsets = [0], sizes = [%count0], strides = [1] : tensor<?xi64> -> !flow.dispatch.tensor<writeonly:tensor<?xi64>>{%count0}
return
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,16 +9,17 @@

stream.executable private @__builtin_splat_i64 {
stream.executable.export public @__builtin_splat_i64 workgroups(%arg0: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @__builtin_splat_i64(%value: i64, %count: index, %out_binding: !stream.binding) {
%c0 = arith.constant 0 : index
%out = stream.binding.subspan %out_binding[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?xi64>>{%count}
%0 = tensor.empty(%count) : tensor<?xi64>
%count0 = flow.dispatch.workload.ordinal %count, 0 : index
%out = stream.binding.subspan %out_binding[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?xi64>>{%count0}
%0 = tensor.empty(%count0) : tensor<?xi64>
%1 = linalg.fill ins(%value : i64) outs(%0 : tensor<?xi64>) -> tensor<?xi64>
flow.dispatch.tensor.store %1, %out, offsets = [0], sizes = [%count], strides = [1] : tensor<?xi64> -> !flow.dispatch.tensor<writeonly:tensor<?xi64>>{%count}
flow.dispatch.tensor.store %1, %out, offsets = [0], sizes = [%count0], strides = [1] : tensor<?xi64> -> !flow.dispatch.tensor<writeonly:tensor<?xi64>>{%count0}
return
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -547,6 +547,14 @@ isFusableWithConsumer(OpOperand &fusedOperand,
return false;
}

// TODO: Enable grouped convolution and depthwise pooling fusion.
// Right now, this is going through the default CPU pipeline and not through
// CONVTilingExpert.
if (isa<linalg::Conv2DNgchwFgchwOp, linalg::Conv2DNgchwGfchwOp,
linalg::PoolingNdhwcSumOp>(producer)) {
return false;
}

auto producerFusionOp =
dyn_cast<IREE::LinalgExt::LinalgFusionOpInterface>(producer);
auto consumerFusionOp =
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -392,13 +392,6 @@
"onnx/node/generated/test_softsign_example",
"onnx/node/generated/test_stft",
"onnx/node/generated/test_stft_with_window",
"onnx/node/generated/test_tfidfvectorizer_tf_batch_onlybigrams_skip0",
"onnx/node/generated/test_tfidfvectorizer_tf_batch_onlybigrams_skip5",
"onnx/node/generated/test_tfidfvectorizer_tf_batch_uniandbigrams_skip5",
"onnx/node/generated/test_tfidfvectorizer_tf_only_bigrams_skip0",
"onnx/node/generated/test_tfidfvectorizer_tf_onlybigrams_levelempty",
"onnx/node/generated/test_tfidfvectorizer_tf_onlybigrams_skip5",
"onnx/node/generated/test_tfidfvectorizer_tf_uniandbigrams_skip5",
"onnx/node/generated/test_training_dropout",
"onnx/node/generated/test_training_dropout_default",
"onnx/node/generated/test_training_dropout_default_mask",
Expand Down

0 comments on commit 3b751a4

Please sign in to comment.