From 9da0309b0491df57629a2177ab1dbec4aa73ae6e Mon Sep 17 00:00:00 2001
From: Kunwar Grover <groverkss@gmail.com>
Date: Wed, 26 Jun 2024 16:56:44 +0100
Subject: [PATCH] [LLVMGPU] Set prefetching on translation info (#17744)

This patch makes prefetch_shared_memory part of the translation_info
config dictionary, allowing us to control prefetching at the dispatch
level instead of globally turning it on/off. Prefetching is still off by
default; the flag makes KernelConfig add the prefetch_shared_memory unit
attribute to the config dictionary.
---
 .../compiler/Codegen/LLVMGPU/KernelConfig.cpp | 22 +++++++++++++++++++
 .../LLVMGPU/LLVMGPULowerExecutableTarget.cpp  |  2 ++
 .../iree/compiler/Codegen/LLVMGPU/Passes.cpp  | 10 +++------
 .../iree/compiler/Codegen/LLVMGPU/Passes.h    |  2 ++
 .../ROCDL/pipeline_vector_distribute.mlir     |  9 +++++++-
 5 files changed, 37 insertions(+), 8 deletions(-)

diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
index a6415cd9d700..5858759c740c 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
@@ -15,6 +15,7 @@
 #include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h"
 #include "iree/compiler/Codegen/Interfaces/PartitionableLoopsInterface.h"
 #include "iree/compiler/Codegen/Interfaces/UKernelOpInterface.h"
+#include "iree/compiler/Codegen/LLVMGPU/Passes.h"
 #include "iree/compiler/Codegen/TransformStrategies/GPU/Strategies.h"
 #include "iree/compiler/Codegen/Utils/GPUUtils.h"
 #include "iree/compiler/Codegen/Utils/LinalgOpInfo.h"
@@ -70,6 +71,11 @@ llvm::cl::opt<int> clGPUMatmulCThreshold(
     // TODO: We should get this value from the target's parallelism.
     llvm::cl::init(512 * 512));
 
+static llvm::cl::opt<bool> clLLVMGPUEnablePrefetch(
+    "iree-llvmgpu-enable-prefetch",
+    llvm::cl::desc("Enable prefetch in the vector distribute pipeline"),
+    llvm::cl::init(false));
+
 namespace {
 
 using CodeGenPipeline = IREE::Codegen::DispatchLoweringPassPipeline;
@@ -356,6 +362,14 @@ setConvolutionVectorDistributionConfig(IREE::GPU::TargetAttr target,
       schedule->nWarpCount);
   SmallVector<NamedAttribute, 1> attrs;
   attrs.emplace_back(StringAttr::get(context, "mma_schedule"), scheduleAttr);
+
+  // Prefetch shared memory if requested.
+  if (clLLVMGPUEnablePrefetch) {
+    attrs.emplace_back(
+        StringAttr::get(context, LLVMGPUAttrNames::kPrefetchSharedMemory),
+        UnitAttr::get(context));
+  }
+
   auto configDict = DictionaryAttr::get(context, attrs);
 
   return setOpConfigAndEntryPointFnTranslation(
@@ -566,6 +580,14 @@ setMatmulVectorDistributionConfig(IREE::GPU::TargetAttr target,
       schedule->nWarpCount);
   SmallVector<NamedAttribute, 1> attrs;
   attrs.emplace_back(StringAttr::get(context, "mma_schedule"), scheduleAttr);
+
+  // Prefetch shared memory if requested.
+  if (clLLVMGPUEnablePrefetch) {
+    attrs.emplace_back(
+        StringAttr::get(context, LLVMGPUAttrNames::kPrefetchSharedMemory),
+        UnitAttr::get(context));
+  }
+
   auto configDict = DictionaryAttr::get(context, attrs);
 
   return setOpConfigAndEntryPointFnTranslation(entryPoint, op, tileSizes,
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPULowerExecutableTarget.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPULowerExecutableTarget.cpp
index 2fb329579986..6ee91d5f7085 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPULowerExecutableTarget.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPULowerExecutableTarget.cpp
@@ -88,6 +88,8 @@ getPipelineOptions(FunctionOpInterface funcOp,
   if (DictionaryAttr config = translationInfo.getConfiguration()) {
     if (config.contains(LLVMGPUAttrNames::kNoReduceSharedMemoryBankConflicts))
       pipelineOptions.enableReduceSharedMemoryBankConflicts = false;
+    if (config.contains(LLVMGPUAttrNames::kPrefetchSharedMemory))
+      pipelineOptions.prefetchSharedMemory = true;
     if (config.contains(LLVMGPUAttrNames::kReorderWorkgroups)) {
       // Get the workgroups reorder config and enable the workgroup reordering.
       Attribute reorderWorkgroupOption =
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
index a417e310a909..16982072c71f 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
@@ -72,11 +72,6 @@ static llvm::cl::opt<int64_t> clLLVMGPUSharedMemoryLimit(
                    "allocated for the given target"),
     llvm::cl::init(163 * 1024));
 
-static llvm::cl::opt<bool> clLLVMGPUEnablePrefetch(
-    "iree-llvmgpu-enable-prefetch",
-    llvm::cl::desc("Enable prefetch in the vector distribute pipeline"),
-    llvm::cl::init(false));
-
 llvm::raw_ostream &operator<<(llvm::raw_ostream &os,
                               const LLVMGPUPipelineOptions &options) {
   StringRef reorderStr = "<not set>";
@@ -93,7 +88,8 @@ llvm::raw_ostream &operator<<(llvm::raw_ostream &os,
   }
 
   return os << "{" << "enableReduceSharedMemoryBankConflicts = "
-            << options.enableReduceSharedMemoryBankConflicts
+            << options.enableReduceSharedMemoryBankConflicts
+            << ", prefetchSharedMemory = " << options.prefetchSharedMemory
             << ", reorderWorkgroupsStrategy = " << reorderStr
             << ", enableUkernels = " << options.enableUkernels << "}";
 }
@@ -790,7 +786,7 @@ void addGPUVectorDistributePassPipeline(OpPassManager &funcPassManager,
     funcPassManager.addPass(createGPUReduceBankConflictsPass(options));
   }
 
-  if (clLLVMGPUEnablePrefetch) {
+  if (options.prefetchSharedMemory) {
     funcPassManager.addPass(createLLVMGPUPrefetchSharedMemoryPass());
   }
   funcPassManager.addPass(memref::createFoldMemRefAliasOpsPass());
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.h b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.h
index 0492d49f6dee..488705fc38e1 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.h
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.h
@@ -29,10 +29,12 @@ namespace LLVMGPUAttrNames {
 inline constexpr StringLiteral kReorderWorkgroups = "reorder_workgroups";
 inline constexpr StringLiteral kNoReduceSharedMemoryBankConflicts =
     "no_reduce_shared_memory_bank_conflicts";
+inline constexpr StringLiteral kPrefetchSharedMemory = "prefetch_shared_memory";
 } //  namespace LLVMGPUAttrNames
 
 struct LLVMGPUPipelineOptions {
   bool enableReduceSharedMemoryBankConflicts = true;
+  bool prefetchSharedMemory = false;
   bool enableUkernels = false;
   std::optional<ReorderWorkgroupsStrategy> reorderStrategy;
 };
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_vector_distribute.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_vector_distribute.mlir
index a8bfa7172d0d..d0254d03bf1a 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_vector_distribute.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_vector_distribute.mlir
@@ -44,6 +44,7 @@ hal.executable.variant @rocm target(<"rocm", "rocm-hsaco-fb">) {
 //       CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [128, 2, 1] subgroup_size = 64
 //  CHECK-SAME:   mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<MFMA_F16_16x16x16_F32>,
 //  CHECK-SAME:     subgroup_m_count = 2, subgroup_n_count = 2>
+//  CHECK-SAME:   prefetch_shared_memory
 
 //    CHECK-LABEL: func.func @matmul_256x256x256_f16_f32()
 //     CHECK-SAME:    translation_info = #[[$TRANSLATION]]
@@ -91,6 +92,7 @@ hal.executable.variant @rocm target(<"rocm", "rocm-hsaco-fb">) {
 //       CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [128, 2, 1] subgroup_size = 64
 //  CHECK-SAME:   mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<MFMA_F16_16x16x16_F32>,
 //  CHECK-SAME:     subgroup_m_count = 2, subgroup_n_count = 2>
+//  CHECK-SAME:   prefetch_shared_memory
 
 //    CHECK-LABEL: func.func @matmul_256x256x256_f16_f16()
 //     CHECK-SAME:     translation_info = #[[$TRANSLATION]]
@@ -156,6 +158,8 @@ hal.executable.variant @rocm target(<"rocm", "rocm-hsaco-fb">) {
 }
 
 //          CHECK: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64
+//     CHECK-SAME: prefetch_shared_memory
+
 //          CHECK: func @expanded_matmul_transpose_b
 //     CHECK-SAME:     translation_info = #[[TRANSLATION]]
 // This has more than 2 iteartions. So we have prefetching enabled for this case. Due to
@@ -272,6 +276,7 @@ hal.executable public @main_dispatch_expanded_matmul {
 //       CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [128, 2, 1] subgroup_size = 64
 //  CHECK-SAME:   mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<MFMA_F16_16x16x16_F32>,
 //  CHECK-SAME:     subgroup_m_count = 2, subgroup_n_count = 2>
+//  CHECK-SAME: prefetch_shared_memory
 
 //    CHECK-LABEL: func.func @generic_2x1024x20x64x1280_f16
 // This has more than 2 iteartions. So we have prefetching enabled for this case. Due to
@@ -321,7 +326,7 @@ hal.executable.variant @rocm target(<"rocm", "rocm-hsaco-fb">) {
 //       RDNA3: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [64, 2, 1] subgroup_size = 32
 //  RDNA3-SAME:   mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<WMMA_F16_16x16x16_F32>,
 //  RDNA3-SAME:     subgroup_m_count = 2, subgroup_n_count = 2>
-
+//  RDNA3-SAME:   prefetch_shared_memory
 
 //    RDNA3-LABEL: func.func @matmul_256x256x256_f16_f32
 //     RDNA3-SAME:    translation_info = #[[$TRANSLATION]]
@@ -373,6 +378,7 @@ hal.executable.variant @rocm target(<"rocm", "rocm-hsaco-fb">) {
 // CHECK:       #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUPadAndVectorDistribute workgroup_size = [64, 1, 1] subgroup_size = 64
 // CHECK-SAME:    mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<MFMA_F16_16x16x16_F32>,
 // CHECK-SAME:    subgroup_m_count = 1, subgroup_n_count = 1>
+// CHECK-SAME:    prefetch_shared_memory
 
 // CHECK-LABEL: func.func @unaligned_nk_batch_matmul()
 // CHECK-SAME:    translation_info = #[[$TRANSLATION]]
@@ -462,6 +468,7 @@ hal.executable public @contract_schedule_considering_read_layout {
 // CHECK:       #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64
 // CHECK-SAME:    mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<MFMA_F16_16x16x16_F32>,
 // CHECK-SAME:    subgroup_m_count = 1, subgroup_n_count = 4>
+// CHECK-SAME:    prefetch_shared_memory
 
 // CHECK-LABEL: func.func @contract_schedule_considering_read_layout()
 // CHECK-SAME:    translation_info = #[[$TRANSLATION]]