From 3b751a4d2797d29422e08327b1a53933448a26fd Mon Sep 17 00:00:00 2001
From: Prashant Kumar <pk5561@gmail.com>
Date: Fri, 25 Oct 2024 08:00:21 +0530
Subject: [PATCH] [LLVMCPU] Enable tileDispatchUsingForall as default (#18777)

---
 .../Common/TileDispatchUsingForall.cpp        | 13 +++++----
 .../iree/compiler/Codegen/LLVMCPU/Passes.cpp  | 28 +++++++------------
 .../test/ROCDL/pipeline_tile_and_fuse.mlir    |  4 +--
 .../Dialect/Stream/Builtins/fill_i64.mlir     |  9 +++---
 .../Dialect/Stream/Builtins/splat_i64.mlir    |  9 +++---
 .../DispatchCreation/FormDispatchRegions.cpp  |  8 ++++++
 .../onnx_ops/onnx_ops_cpu_llvm_sync.json      |  7 -----
 7 files changed, 38 insertions(+), 40 deletions(-)
diff --git a/compiler/src/iree/compiler/Codegen/Common/TileDispatchUsingForall.cpp b/compiler/src/iree/compiler/Codegen/Common/TileDispatchUsingForall.cpp
index ebbe585bf53e..218b7f5217f1 100644
--- a/compiler/src/iree/compiler/Codegen/Common/TileDispatchUsingForall.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/TileDispatchUsingForall.cpp
@@ -202,13 +202,16 @@ static LogicalResult dropUnitDistributedDims(RewriterBase &rewriter,
   llvm::SmallDenseSet<int> droppedLoops;
   for (auto [index, lb, ub, step] :
        llvm::enumerate(mixedLbs, mixedUbs, mixedSteps)) {
-    if (!isa<Attribute>(lb) || !isa<Attribute>(ub) || !isa<Attribute>(step)) {
+
+    std::optional<int64_t> lbVal = getConstantIntValue(lb);
+    std::optional<int64_t> ubVal = getConstantIntValue(ub);
+    std::optional<int64_t> stepVal = getConstantIntValue(step);
+
+    if (!(lbVal && ubVal && stepVal)) {
       continue;
     }
-    int64_t lbVal = getConstantIntValue(lb).value();
-    int64_t ubVal = getConstantIntValue(ub).value();
-    int64_t stepVal = getConstantIntValue(step).value();
-    if (CEILDIV(ubVal - lbVal, stepVal) == 1) {
+
+    if (CEILDIV(ubVal.value() - lbVal.value(), stepVal.value()) == 1) {
       droppedLoops.insert(index);
     }
   }
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp
index 0951fbba4273..71b3aec7389f 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp
@@ -95,7 +95,7 @@ static llvm::cl::opt<bool> clEnableVectorContractCustomKernels(
 static llvm::cl::opt<bool> clTileDispatchUsingForall(
     "iree-llvmcpu-tile-dispatch-using-forall",
     llvm::cl::desc("Enable tile and distribute to workgroups using scf.forall"),
-    llvm::cl::init(false));
+    llvm::cl::init(true));
 
 // By default, IREE does not enable the Armv9-A streaming SVE mode in the
 // presence of scalable vectors (even when using `+sme`), as currently there's
@@ -111,9 +111,8 @@ static llvm::cl::opt<bool> clForceArmStreaming(
     llvm::cl::init(false));
 
 // TODO: Enable `TileDispatchUsingForall` for every pipeline.
-static void addTileAndDistributePasses(OpPassManager &funcPassManager,
-                                       bool enableTileDispatchUsingForall) {
-  if (enableTileDispatchUsingForall || clTileDispatchUsingForall) {
+static void addTileAndDistributePasses(OpPassManager &funcPassManager) {
+  if (clTileDispatchUsingForall) {
     funcPassManager.addPass(
         createTileAndDistributeToWorkgroupsUsingForallOpPass());
   } else {
@@ -346,8 +345,7 @@ void buildLLVMCPUVectorLoweringPipeline(
 void addCPUBufferOpsTileAndVectorizePipeline(
     OpPassManager &funcPassManager, TilingConfig &tilingConfig,
     LLVMCPUPipelineOptions &pipelineOpt) {
-  addTileAndDistributePasses(funcPassManager,
-                             /*enableTileDispatchUsingForall=*/true);
+  addTileAndDistributePasses(funcPassManager);
 
   // Skip tiling reduction loops because this is expected to apply on copy ops
   // only.
@@ -384,8 +382,7 @@ void addCPUBufferOpsTileAndVectorizePipeline(
 void addMultiTilingExpertPassPipeline(OpPassManager &funcPassManager,
                                       TilingConfig &tilingConfig,
                                       LLVMCPUPipelineOptions &pipelineOpt) {
-  addTileAndDistributePasses(funcPassManager,
-                             /*enableTileDispatchUsingForall=*/true);
+  addTileAndDistributePasses(funcPassManager);
 
   SmallVector<int64_t> allFusableLevels(tilingConfig.getFusableLevels());
   // Apply tile and fuse to all the non-distribution fusable levels. Skip
@@ -464,8 +461,7 @@ void addMultiTilingExpertPassPipeline(OpPassManager &funcPassManager,
 void addConvTileAndDecomposeExpertPassPipeline(
     OpPassManager &funcPassManager, TilingConfig &tilingConfig,
     LLVMCPUPipelineOptions &pipelineOpt) {
-  addTileAndDistributePasses(funcPassManager,
-                             /*enableTileDispatchUsingForall=*/true);
+  addTileAndDistributePasses(funcPassManager);
 
   // Run LLVMTileAndFuse firstly in case that we have fill + conv + generic
   // ops. At this stage, we do not apply vectorization. The reduction dim won't
@@ -528,8 +524,7 @@ void addConvTileAndDecomposeExpertPassPipeline(
 void addMmt4dTilingExpertPassPipeline(OpPassManager &funcPassManager,
                                       TilingConfig &tilingConfig,
                                       LLVMCPUPipelineOptions &pipelineOpt) {
-  addTileAndDistributePasses(funcPassManager,
-                             /*enableTileDispatchUsingForall=*/true);
+  addTileAndDistributePasses(funcPassManager);
 
   funcPassManager.addPass(createLLVMCPUTileAndFusePass(
       static_cast<int64_t>(tilingConfig.getVectorCommonParallelLevel())));
@@ -577,8 +572,7 @@ void addMmt4dTilingExpertPassPipeline(OpPassManager &funcPassManager,
 void addCPUDataTilingPipeline(OpPassManager &funcPassManager,
                               TilingConfig &tilingConfig,
                               LLVMCPUPipelineOptions &pipelineOpt) {
-  addTileAndDistributePasses(funcPassManager,
-                             /*enableTileDispatchUsingForall=*/true);
+  addTileAndDistributePasses(funcPassManager);
 
   // The below two passes are nop if pack/unpack is not specified in ukernels
   // attribute. By default, they are disabled.
@@ -621,8 +615,7 @@ void addCPUDataTilingPipeline(OpPassManager &funcPassManager,
 void addCPULinalgExtTileAndVectorizePipeline(
     OpPassManager &funcPassManager, TilingConfig &tilingConfig,
     LLVMCPUPipelineOptions &pipelineOpt) {
-  addTileAndDistributePasses(funcPassManager,
-                             /*enableTileDispatchUsingForall=*/false);
+  addTileAndDistributePasses(funcPassManager);
   funcPassManager.addPass(
       createLLVMCPUTilePass(tilingConfig.getVectorCommonParallelLevel()));
   // TODO: Remove the pass once we have PartialReductionOpInterface implemented
@@ -661,8 +654,7 @@ void addCPULinalgExtTileAndVectorizePipeline(
 }
 
 void addCPUDefaultPassPipeline(OpPassManager &funcPassManager) {
-  addTileAndDistributePasses(funcPassManager,
-                             /*enableTileDispatchUsingForall=*/false);
+  addTileAndDistributePasses(funcPassManager);
   addCPUBufferizePasses(funcPassManager);
 }
 
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir
index 912acf310b26..2ebc85496759 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir
@@ -290,7 +290,7 @@ hal.executable private @main {
 //      CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
 //      CHECK-DAG:   %[[C720:.+]] = arith.constant 720 : index
 //      CHECK-DAG:   %[[C2:.+]] = arith.constant 2 : index
-//         CHECK:   scf.forall ({{.*}}) in (2, 4, 1, 5) {
+//         CHECK:   scf.forall ({{.*}}) in (2, 4, 5) {
 //          CHECK:   %[[LOOP:.+]] = scf.for %[[IV:.+]] = %[[C0]] to %[[C720]] step %[[C2]] {{.*}} -> (vector<1x4x1x4x4x1xf32>)
 //          CHECK:     gpu.barrier
 //      CHECK-DAG:     %[[LHS_RD:.+]] = vector.transfer_read %[[B0]]{{.*}} vector<8xf16>
@@ -307,7 +307,7 @@ hal.executable private @main {
 //          CHECK:   %[[LOOP_T:.+]] = vector.transpose %[[LOOP]], [0, 1, 2, 4, 3, 5] : vector<1x4x1x4x4x1xf32> to vector<1x4x1x4x4x1xf32>
 //          CHECK:   %[[EXTRACT:.+]] = vector.extract %[[LOOP_T]][0] : vector<4x1x4x4x1xf32> from vector<1x4x1x4x4x1xf32>
 //          CHECK:   vector.transfer_write %[[EXTRACT]], %[[B2]]
-//         CHECK:   } {mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
+//         CHECK:   } {mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
 
 // -----
 
diff --git a/compiler/src/iree/compiler/Dialect/Stream/Builtins/fill_i64.mlir b/compiler/src/iree/compiler/Dialect/Stream/Builtins/fill_i64.mlir
index 96e527a20f0f..5d7d686b5bd1 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Builtins/fill_i64.mlir
+++ b/compiler/src/iree/compiler/Dialect/Stream/Builtins/fill_i64.mlir
@@ -9,16 +9,17 @@
 
 stream.executable private @__builtin_fill_i64 {
   stream.executable.export public @__builtin_fill_i64 workgroups(%arg0: index) -> (index, index, index) {
-    %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0
+    %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0
     stream.return %x, %y, %z : index, index, index
   }
   builtin.module {
     func.func @__builtin_fill_i64(%value: i64, %count: index, %out_binding: !stream.binding) {
       %c0 = arith.constant 0 : index
-      %out = stream.binding.subspan %out_binding[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?xi64>>{%count}
-      %0 = tensor.empty(%count) : tensor<?xi64>
+      %count0 = flow.dispatch.workload.ordinal %count, 0 : index
+      %out = stream.binding.subspan %out_binding[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?xi64>>{%count0}
+      %0 = tensor.empty(%count0) : tensor<?xi64>
       %1 = linalg.fill ins(%value : i64) outs(%0 : tensor<?xi64>) -> tensor<?xi64>
-      flow.dispatch.tensor.store %1, %out, offsets = [0], sizes = [%count], strides = [1] : tensor<?xi64> -> !flow.dispatch.tensor<writeonly:tensor<?xi64>>{%count}
+      flow.dispatch.tensor.store %1, %out, offsets = [0], sizes = [%count0], strides = [1] : tensor<?xi64> -> !flow.dispatch.tensor<writeonly:tensor<?xi64>>{%count0}
       return
     }
   }
diff --git a/compiler/src/iree/compiler/Dialect/Stream/Builtins/splat_i64.mlir b/compiler/src/iree/compiler/Dialect/Stream/Builtins/splat_i64.mlir
index 7d94e51a26d7..4d25d358c7b4 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Builtins/splat_i64.mlir
+++ b/compiler/src/iree/compiler/Dialect/Stream/Builtins/splat_i64.mlir
@@ -9,16 +9,17 @@
 
 stream.executable private @__builtin_splat_i64 {
   stream.executable.export public @__builtin_splat_i64 workgroups(%arg0: index) -> (index, index, index) {
-    %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0
+    %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0
     stream.return %x, %y, %z : index, index, index
   }
   builtin.module {
     func.func @__builtin_splat_i64(%value: i64, %count: index, %out_binding: !stream.binding) {
       %c0 = arith.constant 0 : index
-      %out = stream.binding.subspan %out_binding[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?xi64>>{%count}
-      %0 = tensor.empty(%count) : tensor<?xi64>
+      %count0 = flow.dispatch.workload.ordinal %count, 0 : index
+      %out = stream.binding.subspan %out_binding[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?xi64>>{%count0}
+      %0 = tensor.empty(%count0) : tensor<?xi64>
       %1 = linalg.fill ins(%value : i64) outs(%0 : tensor<?xi64>) -> tensor<?xi64>
-      flow.dispatch.tensor.store %1, %out, offsets = [0], sizes = [%count], strides = [1] : tensor<?xi64> -> !flow.dispatch.tensor<writeonly:tensor<?xi64>>{%count}
+      flow.dispatch.tensor.store %1, %out, offsets = [0], sizes = [%count0], strides = [1] : tensor<?xi64> -> !flow.dispatch.tensor<writeonly:tensor<?xi64>>{%count0}
       return
     }
   }
diff --git a/compiler/src/iree/compiler/DispatchCreation/FormDispatchRegions.cpp b/compiler/src/iree/compiler/DispatchCreation/FormDispatchRegions.cpp
index e866022eb9a9..b38b1a593001 100644
--- a/compiler/src/iree/compiler/DispatchCreation/FormDispatchRegions.cpp
+++ b/compiler/src/iree/compiler/DispatchCreation/FormDispatchRegions.cpp
@@ -547,6 +547,14 @@ isFusableWithConsumer(OpOperand &fusedOperand,
     return false;
   }
 
+  // TODO: Enable grouped convolution and depth wise pooling fusion.
+  // Rightnow, this is going through the default CPU pipeline and not through
+  // CONVTilingExpert.
+  if (isa<linalg::Conv2DNgchwFgchwOp, linalg::Conv2DNgchwGfchwOp,
+          linalg::PoolingNdhwcSumOp>(producer)) {
+    return false;
+  }
+
   auto producerFusionOp =
       dyn_cast<IREE::LinalgExt::LinalgFusionOpInterface>(producer);
   auto consumerFusionOp =
diff --git a/tests/external/iree-test-suites/onnx_ops/onnx_ops_cpu_llvm_sync.json b/tests/external/iree-test-suites/onnx_ops/onnx_ops_cpu_llvm_sync.json
index a025431d7af4..f8ca790fe5b1 100644
--- a/tests/external/iree-test-suites/onnx_ops/onnx_ops_cpu_llvm_sync.json
+++ b/tests/external/iree-test-suites/onnx_ops/onnx_ops_cpu_llvm_sync.json
@@ -392,13 +392,6 @@
     "onnx/node/generated/test_softsign_example",
     "onnx/node/generated/test_stft",
     "onnx/node/generated/test_stft_with_window",
-    "onnx/node/generated/test_tfidfvectorizer_tf_batch_onlybigrams_skip0",
-    "onnx/node/generated/test_tfidfvectorizer_tf_batch_onlybigrams_skip5",
-    "onnx/node/generated/test_tfidfvectorizer_tf_batch_uniandbigrams_skip5",
-    "onnx/node/generated/test_tfidfvectorizer_tf_only_bigrams_skip0",
-    "onnx/node/generated/test_tfidfvectorizer_tf_onlybigrams_levelempty",
-    "onnx/node/generated/test_tfidfvectorizer_tf_onlybigrams_skip5",
-    "onnx/node/generated/test_tfidfvectorizer_tf_uniandbigrams_skip5",
     "onnx/node/generated/test_training_dropout",
     "onnx/node/generated/test_training_dropout_default",
     "onnx/node/generated/test_training_dropout_default_mask",