Use LLVMGPUTileAndFuse instead of LLVMGPUVectorize for convolutions (#19469)
With this PR, convolutions that are not picked up by VectorDistribute or by TileAndFuse via IGEMM are by default lowered with TileAndFuse instead of the Vectorize pipeline. There does not appear to be a major performance impact in testing done with iree-kernel-benchmark, as shown
[here](https://docs.google.com/spreadsheets/d/1WaJ1ELhwdo1wFvNiKbdoddSncSt2_UsbvrTdSObNaAo/edit?gid=0#gid=0),
and we can always look into improving the heuristics if performance becomes a
problem.
Fixes #19478
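For anyone who needs the previous behavior, the new flag can be switched off. A minimal `iree-opt` invocation mirroring the updated test RUN line below (`input.mlir` is a placeholder file name, not from the commit):

```shell
iree-opt --split-input-file --iree-gpu-test-target=sm_60 \
  --iree-codegen-llvmgpu-use-tile-and-fuse-convolution=false \
  --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(builtin.module(iree-llvmgpu-select-lowering-strategy, func.func(iree-llvmgpu-lower-executable-target,canonicalize)))))' \
  input.mlir
```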

---------

Signed-off-by: Nirvedh <[email protected]>
nirvedhmeshram authored Dec 13, 2024
1 parent 99b600f commit 442956c
Showing 3 changed files with 23 additions and 12 deletions.
26 changes: 19 additions & 7 deletions compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
@@ -77,6 +77,12 @@ llvm::cl::opt<bool> clGPUUnalignedGEMMVectorDistribution(
                    "unaligned GEMMs when supported"),
     llvm::cl::init(false));
 
+llvm::cl::opt<bool> clGPUUseTileAndFuseConvolution(
+    "iree-codegen-llvmgpu-use-tile-and-fuse-convolution",
+    llvm::cl::desc(
+        "enable the tile and fuse pipeline for supported convolutions"),
+    llvm::cl::init(true));
+
 /// Flag to force using WMMA tensorcore operations.
 llvm::cl::opt<bool>
     clGPUUseWMMA("iree-codegen-llvmgpu-use-wmma",
@@ -2196,12 +2202,19 @@ static bool distributeToSquare(const int64_t oh, const int64_t ow,
 // Convolution Pipeline Configuration
 //====---------------------------------------------------------------------===//
 
-static LogicalResult setConvolutionConfig(IREE::GPU::TargetAttr target,
-                                          linalg::LinalgOp linalgOp,
-                                          const int64_t bestTilingFactor) {
+static LogicalResult setConvolutionConfig(
+    IREE::GPU::TargetAttr target, mlir::FunctionOpInterface entryPointFn,
+    linalg::LinalgOp linalgOp, const int64_t bestTilingFactor) {
   if (!isa<linalg::Conv2DNhwcHwcfOp, linalg::Conv2DNchwFchwOp>(linalgOp)) {
     return failure();
   }
+  if (clGPUUseTileAndFuseConvolution) {
+    if (succeeded(IREE::GPU::setTileAndFuseLoweringConfig(target, entryPointFn,
+                                                          linalgOp))) {
+      LDBG("Tile and fuse convolution config");
+      return success();
+    }
+  }
   const bool isNCHW = isa<linalg::Conv2DNchwFchwOp>(*linalgOp);
   const bool isNHWC = isa<linalg::Conv2DNhwcHwcfOp>(*linalgOp);
 
@@ -2284,9 +2297,8 @@ static LogicalResult setConvolutionConfig(IREE::GPU::TargetAttr target,
   SmallVector<int64_t> windowTileSizes(4, 0);
   windowTileSizes[ohIndex] = 1;
   tileSizes.push_back(windowTileSizes);
-  auto funcOp = linalgOp->getParentOfType<mlir::FunctionOpInterface>();
-  return setOpConfigAndEntryPointFnTranslation(funcOp, linalgOp, tileSizes,
-                                               pipeline, workgroupSize);
+  return setOpConfigAndEntryPointFnTranslation(
+      entryPointFn, linalgOp, tileSizes, pipeline, workgroupSize);
 }
 
 //====---------------------------------------------------------------------===//
@@ -2340,7 +2352,7 @@ static LogicalResult setRootConfig(IREE::GPU::TargetAttr target,
     LDBG("Warp Reduction Config");
     return success();
   }
-  if (succeeded(setConvolutionConfig(target, linalgOp, 16))) {
+  if (succeeded(setConvolutionConfig(target, entryPointFn, linalgOp, 16))) {
     LDBG("Convolution Config");
     return success();
   }
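Condensed, the updated control flow reads as below. This is a sketch assembled from the hunks above (the nested ifs are folded into one condition and the legacy heuristics are elided), not the verbatim file:

```cpp
// Sketch of the updated setConvolutionConfig flow, assembled from the
// hunks above; not the verbatim source.
static LogicalResult setConvolutionConfig(
    IREE::GPU::TargetAttr target, mlir::FunctionOpInterface entryPointFn,
    linalg::LinalgOp linalgOp, const int64_t bestTilingFactor) {
  // Only these two named conv ops are configured here.
  if (!isa<linalg::Conv2DNhwcHwcfOp, linalg::Conv2DNchwFchwOp>(linalgOp))
    return failure();
  // New default: try a TileAndFuse lowering config first (the flag
  // defaults to true); on success the op is routed to LLVMGPUTileAndFuse.
  if (clGPUUseTileAndFuseConvolution &&
      succeeded(IREE::GPU::setTileAndFuseLoweringConfig(target, entryPointFn,
                                                        linalgOp))) {
    LDBG("Tile and fuse convolution config");
    return success();
  }
  // Fallback: the pre-existing LLVMGPUVectorize tiling heuristics.
  // ...
}
```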
@@ -1,4 +1,4 @@
-// RUN: iree-opt --split-input-file --iree-gpu-test-target=sm_60 \
+// RUN: iree-opt --split-input-file --iree-gpu-test-target=sm_60 --iree-codegen-llvmgpu-use-tile-and-fuse-convolution=false \
 // RUN: --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(builtin.module(iree-llvmgpu-select-lowering-strategy, func.func(iree-llvmgpu-lower-executable-target,canonicalize)))))' \
 // RUN: %s | FileCheck %s
 
@@ -612,12 +612,11 @@ func.func @forward_dispatch_1_conv_2d_nhwc_hwcf_256x112x112x64x7x7x3_f32() {
   return
 }
 
-// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[1, 1, 8, 64, 1, 1, 4], [0, 1, 0, 0]{{\]}}
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<pipeline = LLVMGPUVectorize workgroup_size = [16, 2, 1]>
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [32, 1, 1]
 // CHECK: func.func @forward_dispatch_1_conv_2d_nhwc_hwcf_256x112x112x64x7x7x3_f32
 // CHECK-SAME: translation_info = #[[TRANSLATION]]
-// CHECK: linalg.generic
-// CHECK-SAME: lowering_config = #[[CONFIG]]
+// CHECK: linalg.conv_2d
+// CHECK-SAME: lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1], reduction = [0, 0, 0, 0, 1, 7, 3], thread = [1, 1, 1, 1, 0, 0, 0], workgroup = [1, 1, 1, 32, 0, 0, 0]}>
 
 // -----
 

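One reading of the new config (my interpretation, not stated in the commit): `linalg.conv_2d_nhwc_hwcf` has seven loops (n, oh, ow, oc, fh, fw, ic), so `workgroup = [1, 1, 1, 32, 0, 0, 0]` gives each workgroup a 1x1x1x32 output tile; `thread = [1, 1, 1, 1, 0, 0, 0]` assigns one output element to each of the 32 threads, matching `workgroup_size = [32, 1, 1]`; `reduction = [0, 0, 0, 0, 1, 7, 3]` tiles the filter loops to match the 7x7x3 kernel; and `promote_operands = [0, 1]` promotes both the input and filter operands.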