Use LLVMGPUTileAndFuse instead of LLVMGPUVectorize for convolutions (#19469)
With this PR, convolutions that are not picked up by VectorDistribute or by TileAndFuse via IGEMM are by default lowered with TileAndFuse instead of the Vectorize pipeline. There does not appear to be a major performance impact in testing done with iree-kernel-benchmark, as shown
[here](https://docs.google.com/spreadsheets/d/1WaJ1ELhwdo1wFvNiKbdoddSncSt2_UsbvrTdSObNaAo/edit?gid=0#gid=0),
and we can always look into improving the heuristics if performance becomes a
problem.
Fixes #19478
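For anyone who needs the previous behavior, the new flag can be switched off. A minimal `iree-opt` invocation mirroring the updated test RUN line below (`input.mlir` is a placeholder file name, not from the commit):

```shell
iree-opt --split-input-file --iree-gpu-test-target=sm_60 \
  --iree-codegen-llvmgpu-use-tile-and-fuse-convolution=false \
  --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(builtin.module(iree-llvmgpu-select-lowering-strategy, func.func(iree-llvmgpu-lower-executable-target,canonicalize)))))' \
  input.mlir
```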

---------

Signed-off-by: Nirvedh <[email protected]>
nirvedhmeshram authored Dec 13, 2024
1 parent 99b600f commit 442956c
Showing 3 changed files with 23 additions and 12 deletions.
26 changes: 19 additions & 7 deletions compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
@@ -77,6 +77,12 @@ llvm::cl::opt<bool> clGPUUnalignedGEMMVectorDistribution(
                    "unaligned GEMMs when supported"),
     llvm::cl::init(false));
 
+llvm::cl::opt<bool> clGPUUseTileAndFuseConvolution(
+    "iree-codegen-llvmgpu-use-tile-and-fuse-convolution",
+    llvm::cl::desc(
+        "enable the tile and fuse pipeline for supported convolutions"),
+    llvm::cl::init(true));
+
 /// Flag to force using WMMA tensorcore operations.
 llvm::cl::opt<bool>
     clGPUUseWMMA("iree-codegen-llvmgpu-use-wmma",
@@ -2196,12 +2202,19 @@ static bool distributeToSquare(const int64_t oh, const int64_t ow,
 // Convolution Pipeline Configuration
 //====---------------------------------------------------------------------===//
 
-static LogicalResult setConvolutionConfig(IREE::GPU::TargetAttr target,
-                                          linalg::LinalgOp linalgOp,
-                                          const int64_t bestTilingFactor) {
+static LogicalResult setConvolutionConfig(
+    IREE::GPU::TargetAttr target, mlir::FunctionOpInterface entryPointFn,
+    linalg::LinalgOp linalgOp, const int64_t bestTilingFactor) {
   if (!isa<linalg::Conv2DNhwcHwcfOp, linalg::Conv2DNchwFchwOp>(linalgOp)) {
     return failure();
   }
+  if (clGPUUseTileAndFuseConvolution) {
+    if (succeeded(IREE::GPU::setTileAndFuseLoweringConfig(target, entryPointFn,
+                                                          linalgOp))) {
+      LDBG("Tile and fuse convolution config");
+      return success();
+    }
+  }
   const bool isNCHW = isa<linalg::Conv2DNchwFchwOp>(*linalgOp);
   const bool isNHWC = isa<linalg::Conv2DNhwcHwcfOp>(*linalgOp);
 
@@ -2284,9 +2297,8 @@ static LogicalResult setConvolutionConfig(IREE::GPU::TargetAttr target,
   SmallVector<int64_t> windowTileSizes(4, 0);
   windowTileSizes[ohIndex] = 1;
   tileSizes.push_back(windowTileSizes);
-  auto funcOp = linalgOp->getParentOfType<mlir::FunctionOpInterface>();
-  return setOpConfigAndEntryPointFnTranslation(funcOp, linalgOp, tileSizes,
-                                               pipeline, workgroupSize);
+  return setOpConfigAndEntryPointFnTranslation(
+      entryPointFn, linalgOp, tileSizes, pipeline, workgroupSize);
 }
 
 //====---------------------------------------------------------------------===//
@@ -2340,7 +2352,7 @@ static LogicalResult setRootConfig(IREE::GPU::TargetAttr target,
     LDBG("Warp Reduction Config");
     return success();
   }
-  if (succeeded(setConvolutionConfig(target, linalgOp, 16))) {
+  if (succeeded(setConvolutionConfig(target, entryPointFn, linalgOp, 16))) {
     LDBG("Convolution Config");
     return success();
   }
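Condensed, the updated control flow reads as below. This is a sketch assembled from the hunks above (the nested ifs are folded into one condition and the legacy heuristics are elided), not the verbatim file:

```cpp
// Sketch of the updated setConvolutionConfig flow, assembled from the
// hunks above; not the verbatim source.
static LogicalResult setConvolutionConfig(
    IREE::GPU::TargetAttr target, mlir::FunctionOpInterface entryPointFn,
    linalg::LinalgOp linalgOp, const int64_t bestTilingFactor) {
  // Only these two named conv ops are configured here.
  if (!isa<linalg::Conv2DNhwcHwcfOp, linalg::Conv2DNchwFchwOp>(linalgOp))
    return failure();
  // New default: try a TileAndFuse lowering config first (the flag
  // defaults to true); on success the op is routed to LLVMGPUTileAndFuse.
  if (clGPUUseTileAndFuseConvolution &&
      succeeded(IREE::GPU::setTileAndFuseLoweringConfig(target, entryPointFn,
                                                        linalgOp))) {
    LDBG("Tile and fuse convolution config");
    return success();
  }
  // Fallback: the pre-existing LLVMGPUVectorize tiling heuristics.
  // ...
}
```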
@@ -1,4 +1,4 @@
-// RUN: iree-opt --split-input-file --iree-gpu-test-target=sm_60 \
+// RUN: iree-opt --split-input-file --iree-gpu-test-target=sm_60 --iree-codegen-llvmgpu-use-tile-and-fuse-convolution=false \
 // RUN: --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(builtin.module(iree-llvmgpu-select-lowering-strategy, func.func(iree-llvmgpu-lower-executable-target,canonicalize)))))' \
 // RUN: %s | FileCheck %s
 
@@ -612,12 +612,11 @@ func.func @forward_dispatch_1_conv_2d_nhwc_hwcf_256x112x112x64x7x7x3_f32() {
   return
 }
 
-// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[1, 1, 8, 64, 1, 1, 4], [0, 1, 0, 0]{{\]}}
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<pipeline = LLVMGPUVectorize workgroup_size = [16, 2, 1]>
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [32, 1, 1]
 // CHECK: func.func @forward_dispatch_1_conv_2d_nhwc_hwcf_256x112x112x64x7x7x3_f32
 // CHECK-SAME: translation_info = #[[TRANSLATION]]
-// CHECK: linalg.generic
-// CHECK-SAME: lowering_config = #[[CONFIG]]
+// CHECK: linalg.conv_2d
+// CHECK-SAME: lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1], reduction = [0, 0, 0, 0, 1, 7, 3], thread = [1, 1, 1, 1, 0, 0, 0], workgroup = [1, 1, 1, 32, 0, 0, 0]}>
 
 // -----
 

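One reading of the new config (my interpretation, not stated in the commit): `linalg.conv_2d_nhwc_hwcf` has seven loops (n, oh, ow, oc, fh, fw, ic), so `workgroup = [1, 1, 1, 32, 0, 0, 0]` gives each workgroup a 1x1x1x32 output tile; `thread = [1, 1, 1, 1, 0, 0, 0]` assigns one output element to each of the 32 threads, matching `workgroup_size = [32, 1, 1]`; `reduction = [0, 0, 0, 0, 1, 7, 3]` tiles the filter loops to match the 7x7x3 kernel; and `promote_operands = [0, 1]` promotes both the input and filter operands.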