From 9da0309b0491df57629a2177ab1dbec4aa73ae6e Mon Sep 17 00:00:00 2001 From: Kunwar Grover <groverkss@gmail.com> Date: Wed, 26 Jun 2024 16:56:44 +0100 Subject: [PATCH] [LLVMGPU] Set prefetching on translation info (#17744) This patch makes prefetch_shared_memory part of translation_info config dictionary, allowing us to control prefetching at dispatch level, instead of globally turning it on/off. Prefetching is still off by default, the flag makes KernelConfig add prefetch_shared_memory unit attribute to config dictionary. --- .../compiler/Codegen/LLVMGPU/KernelConfig.cpp | 22 +++++++++++++++++++ .../LLVMGPU/LLVMGPULowerExecutableTarget.cpp | 2 ++ .../iree/compiler/Codegen/LLVMGPU/Passes.cpp | 10 +++------ .../iree/compiler/Codegen/LLVMGPU/Passes.h | 2 ++ .../ROCDL/pipeline_vector_distribute.mlir | 9 +++++++- 5 files changed, 37 insertions(+), 8 deletions(-) diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp index a6415cd9d700..5858759c740c 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp @@ -15,6 +15,7 @@ #include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h" #include "iree/compiler/Codegen/Interfaces/PartitionableLoopsInterface.h" #include "iree/compiler/Codegen/Interfaces/UKernelOpInterface.h" +#include "iree/compiler/Codegen/LLVMGPU/Passes.h" #include "iree/compiler/Codegen/TransformStrategies/GPU/Strategies.h" #include "iree/compiler/Codegen/Utils/GPUUtils.h" #include "iree/compiler/Codegen/Utils/LinalgOpInfo.h" @@ -70,6 +71,11 @@ llvm::cl::opt<int> clGPUMatmulCThreshold( // TODO: We should get this value from the target's parallelism. llvm::cl::init(512 * 512)); +static llvm::cl::opt<bool> clLLVMGPUEnablePrefetch( + "iree-llvmgpu-enable-prefetch", + llvm::cl::desc("Enable prefetch in the vector distribute pipeline"), + llvm::cl::init(false)); + namespace { using CodeGenPipeline = IREE::Codegen::DispatchLoweringPassPipeline; @@ -356,6 +362,14 @@ setConvolutionVectorDistributionConfig(IREE::GPU::TargetAttr target, schedule->nWarpCount); SmallVector<NamedAttribute, 1> attrs; attrs.emplace_back(StringAttr::get(context, "mma_schedule"), scheduleAttr); + + // Prefetch shared memory if requested. + if (clLLVMGPUEnablePrefetch) { + attrs.emplace_back( + StringAttr::get(context, LLVMGPUAttrNames::kPrefetchSharedMemory), + UnitAttr::get(context)); + } + auto configDict = DictionaryAttr::get(context, attrs); return setOpConfigAndEntryPointFnTranslation( @@ -566,6 +580,14 @@ setMatmulVectorDistributionConfig(IREE::GPU::TargetAttr target, schedule->nWarpCount); SmallVector<NamedAttribute, 1> attrs; attrs.emplace_back(StringAttr::get(context, "mma_schedule"), scheduleAttr); + + // Prefetch shared memory if requested. + if (clLLVMGPUEnablePrefetch) { + attrs.emplace_back( + StringAttr::get(context, LLVMGPUAttrNames::kPrefetchSharedMemory), + UnitAttr::get(context)); + } + auto configDict = DictionaryAttr::get(context, attrs); return setOpConfigAndEntryPointFnTranslation(entryPoint, op, tileSizes, diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPULowerExecutableTarget.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPULowerExecutableTarget.cpp index 2fb329579986..6ee91d5f7085 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPULowerExecutableTarget.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPULowerExecutableTarget.cpp @@ -88,6 +88,8 @@ getPipelineOptions(FunctionOpInterface funcOp, if (DictionaryAttr config = translationInfo.getConfiguration()) { if (config.contains(LLVMGPUAttrNames::kNoReduceSharedMemoryBankConflicts)) pipelineOptions.enableReduceSharedMemoryBankConflicts = false; + if (config.contains(LLVMGPUAttrNames::kPrefetchSharedMemory)) + pipelineOptions.prefetchSharedMemory = true; if (config.contains(LLVMGPUAttrNames::kReorderWorkgroups)) { // Get the workgroups reorder config and enable the workgroup reordering. Attribute reorderWorkgroupOption = diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp index a417e310a909..16982072c71f 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp @@ -72,11 +72,6 @@ static llvm::cl::opt<int64_t> clLLVMGPUSharedMemoryLimit( "allocated for the given target"), llvm::cl::init(163 * 1024)); -static llvm::cl::opt<bool> clLLVMGPUEnablePrefetch( - "iree-llvmgpu-enable-prefetch", - llvm::cl::desc("Enable prefetch in the vector distribute pipeline"), - llvm::cl::init(false)); - llvm::raw_ostream &operator<<(llvm::raw_ostream &os, const LLVMGPUPipelineOptions &options) { StringRef reorderStr = "<not set>"; @@ -93,7 +88,8 @@ llvm::raw_ostream &operator<<(llvm::raw_ostream &os, } return os << "{" << "enableReduceSharedMemoryBankConflicts = " - << options.enableReduceSharedMemoryBankConflicts + << options.enableReduceSharedMemoryBankConflicts << ", " + << ", prefetchSharedMemory = " << options.prefetchSharedMemory << ", reorderWorkgroupsStrategy = " << reorderStr << ", enableUkernels = " << options.enableUkernels << "}"; } @@ -790,7 +786,7 @@ void addGPUVectorDistributePassPipeline(OpPassManager &funcPassManager, funcPassManager.addPass(createGPUReduceBankConflictsPass(options)); } - if (clLLVMGPUEnablePrefetch) { + if (options.prefetchSharedMemory) { funcPassManager.addPass(createLLVMGPUPrefetchSharedMemoryPass()); } funcPassManager.addPass(memref::createFoldMemRefAliasOpsPass()); diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.h b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.h index 0492d49f6dee..488705fc38e1 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.h +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.h @@ -29,10 +29,12 @@ namespace LLVMGPUAttrNames { inline constexpr StringLiteral kReorderWorkgroups = "reorder_workgroups"; inline constexpr StringLiteral kNoReduceSharedMemoryBankConflicts = "no_reduce_shared_memory_bank_conflicts"; +inline constexpr StringLiteral kPrefetchSharedMemory = "prefetch_shared_memory"; } // namespace LLVMGPUAttrNames struct LLVMGPUPipelineOptions { bool enableReduceSharedMemoryBankConflicts = true; + bool prefetchSharedMemory = false; bool enableUkernels = false; std::optional<ReorderWorkgroupsStrategy> reorderStrategy; }; diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_vector_distribute.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_vector_distribute.mlir index a8bfa7172d0d..d0254d03bf1a 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_vector_distribute.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_vector_distribute.mlir @@ -44,6 +44,7 @@ hal.executable.variant @rocm target(<"rocm", "rocm-hsaco-fb">) { // CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [128, 2, 1] subgroup_size = 64 // CHECK-SAME: mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<MFMA_F16_16x16x16_F32>, // CHECK-SAME: subgroup_m_count = 2, subgroup_n_count = 2> +// CHECK-SAME: prefetch_shared_memory // CHECK-LABEL: func.func @matmul_256x256x256_f16_f32() // CHECK-SAME: translation_info = #[[$TRANSLATION]] @@ -91,6 +92,7 @@ hal.executable.variant @rocm target(<"rocm", "rocm-hsaco-fb">) { // CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [128, 2, 1] subgroup_size = 64 // CHECK-SAME: mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<MFMA_F16_16x16x16_F32>, // CHECK-SAME: subgroup_m_count = 2, subgroup_n_count = 2> +// CHECK-SAME: prefetch_shared_memory // CHECK-LABEL: func.func @matmul_256x256x256_f16_f16() // CHECK-SAME: translation_info = #[[$TRANSLATION]] @@ -156,6 +158,8 @@ hal.executable.variant @rocm target(<"rocm", "rocm-hsaco-fb">) { } // CHECK: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64 +// CHECK-SAME: prefetch_shared_memory + // CHECK: func @expanded_matmul_transpose_b // CHECK-SAME: translation_info = #[[TRANSLATION]] // This has more than 2 iteartions. So we have prefetching enabled for this case. Due to @@ -272,6 +276,7 @@ hal.executable public @main_dispatch_expanded_matmul { // CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [128, 2, 1] subgroup_size = 64 // CHECK-SAME: mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<MFMA_F16_16x16x16_F32>, // CHECK-SAME: subgroup_m_count = 2, subgroup_n_count = 2> +// CHECK-SAME: prefetch_shared_memory // CHECK-LABEL: func.func @generic_2x1024x20x64x1280_f16 // This has more than 2 iteartions. So we have prefetching enabled for this case. Due to @@ -321,7 +326,7 @@ hal.executable.variant @rocm target(<"rocm", "rocm-hsaco-fb">) { // RDNA3: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [64, 2, 1] subgroup_size = 32 // RDNA3-SAME: mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<WMMA_F16_16x16x16_F32>, // RDNA3-SAME: subgroup_m_count = 2, subgroup_n_count = 2> - +// RDNA3-SAME: prefetch_shared_memory // RDNA3-LABEL: func.func @matmul_256x256x256_f16_f32 // RDNA3-SAME: translation_info = #[[$TRANSLATION]] @@ -373,6 +378,7 @@ hal.executable.variant @rocm target(<"rocm", "rocm-hsaco-fb">) { // CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUPadAndVectorDistribute workgroup_size = [64, 1, 1] subgroup_size = 64 // CHECK-SAME: mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<MFMA_F16_16x16x16_F32>, // CHECK-SAME: subgroup_m_count = 1, subgroup_n_count = 1> +// CHECK-SAME: prefetch_shared_memory // CHECK-LABEL: func.func @unaligned_nk_batch_matmul() // CHECK-SAME: translation_info = #[[$TRANSLATION]] @@ -462,6 +468,7 @@ hal.executable public @contract_schedule_considering_read_layout { // CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64 // CHECK-SAME: mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<MFMA_F16_16x16x16_F32>, // CHECK-SAME: subgroup_m_count = 1, subgroup_n_count = 4> +// CHECK-SAME: prefetch_shared_memory // CHECK-LABEL: func.func @contract_schedule_considering_read_layout() // CHECK-SAME: translation_info = #[[$TRANSLATION]]