[LLVMGPU] Set prefetching on translation info (#17744)
This patch makes prefetch_shared_memory part of the translation_info config
dictionary, allowing prefetching to be controlled at the dispatch level
instead of being globally turned on/off. Prefetching is still off by
default; the flag makes KernelConfig add the prefetch_shared_memory unit
attribute to the config dictionary.
Groverkss authored Jun 26, 2024
1 parent c62fc9d commit 9da0309
Showing 5 changed files with 37 additions and 8 deletions.
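For illustration, the translation info attached to a dispatch ends up looking roughly like this once the flag is set (a sketch assembled from the CHECK lines in the test diff below; exact printer syntax may differ):

#iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [128, 2, 1] subgroup_size = 64,
  {mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<MFMA_F16_16x16x16_F32>,
                                         subgroup_m_count = 2, subgroup_n_count = 2>,
   prefetch_shared_memory}>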
22 changes: 22 additions & 0 deletions compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
@@ -15,6 +15,7 @@
#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h"
#include "iree/compiler/Codegen/Interfaces/PartitionableLoopsInterface.h"
#include "iree/compiler/Codegen/Interfaces/UKernelOpInterface.h"
#include "iree/compiler/Codegen/LLVMGPU/Passes.h"
#include "iree/compiler/Codegen/TransformStrategies/GPU/Strategies.h"
#include "iree/compiler/Codegen/Utils/GPUUtils.h"
#include "iree/compiler/Codegen/Utils/LinalgOpInfo.h"
@@ -70,6 +71,11 @@ llvm::cl::opt<int> clGPUMatmulCThreshold(
// TODO: We should get this value from the target's parallelism.
llvm::cl::init(512 * 512));

static llvm::cl::opt<bool> clLLVMGPUEnablePrefetch(
"iree-llvmgpu-enable-prefetch",
llvm::cl::desc("Enable prefetch in the vector distribute pipeline"),
llvm::cl::init(false));

namespace {

using CodeGenPipeline = IREE::Codegen::DispatchLoweringPassPipeline;
@@ -356,6 +362,14 @@ setConvolutionVectorDistributionConfig(IREE::GPU::TargetAttr target,
schedule->nWarpCount);
SmallVector<NamedAttribute, 1> attrs;
attrs.emplace_back(StringAttr::get(context, "mma_schedule"), scheduleAttr);

// Prefetch shared memory if requested.
if (clLLVMGPUEnablePrefetch) {
attrs.emplace_back(
StringAttr::get(context, LLVMGPUAttrNames::kPrefetchSharedMemory),
UnitAttr::get(context));
}

auto configDict = DictionaryAttr::get(context, attrs);

return setOpConfigAndEntryPointFnTranslation(
@@ -566,6 +580,14 @@ setMatmulVectorDistributionConfig(IREE::GPU::TargetAttr target,
schedule->nWarpCount);
SmallVector<NamedAttribute, 1> attrs;
attrs.emplace_back(StringAttr::get(context, "mma_schedule"), scheduleAttr);

// Prefetch shared memory if requested.
if (clLLVMGPUEnablePrefetch) {
attrs.emplace_back(
StringAttr::get(context, LLVMGPUAttrNames::kPrefetchSharedMemory),
UnitAttr::get(context));
}

auto configDict = DictionaryAttr::get(context, attrs);

return setOpConfigAndEntryPointFnTranslation(entryPoint, op, tileSizes,
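Since the flag now lives in KernelConfig and only seeds the config dictionary, enabling dispatch-level prefetching from the command line stays the same as before. A hedged usage sketch (the flag name comes from this patch; the target-backend flag and file names are illustrative):

iree-compile --iree-hal-target-backends=rocm \
    --iree-llvmgpu-enable-prefetch=true \
    matmul.mlir -o matmul.vmfb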
@@ -88,6 +88,8 @@ getPipelineOptions(FunctionOpInterface funcOp,
if (DictionaryAttr config = translationInfo.getConfiguration()) {
if (config.contains(LLVMGPUAttrNames::kNoReduceSharedMemoryBankConflicts))
pipelineOptions.enableReduceSharedMemoryBankConflicts = false;
if (config.contains(LLVMGPUAttrNames::kPrefetchSharedMemory))
pipelineOptions.prefetchSharedMemory = true;
if (config.contains(LLVMGPUAttrNames::kReorderWorkgroups)) {
// Get the workgroups reorder config and enable the workgroup reordering.
Attribute reorderWorkgroupOption =
10 changes: 3 additions & 7 deletions compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
@@ -72,11 +72,6 @@ static llvm::cl::opt<int64_t> clLLVMGPUSharedMemoryLimit(
"allocated for the given target"),
llvm::cl::init(163 * 1024));

static llvm::cl::opt<bool> clLLVMGPUEnablePrefetch(
"iree-llvmgpu-enable-prefetch",
llvm::cl::desc("Enable prefetch in the vector distribute pipeline"),
llvm::cl::init(false));

llvm::raw_ostream &operator<<(llvm::raw_ostream &os,
const LLVMGPUPipelineOptions &options) {
StringRef reorderStr = "<not set>";
@@ -93,7 +88,8 @@ llvm::raw_ostream &operator<<(llvm::raw_ostream &os,
}

return os << "{" << "enableReduceSharedMemoryBankConflicts = "
<< options.enableReduceSharedMemoryBankConflicts
<< ", prefetchSharedMemory = " << options.prefetchSharedMemory
<< ", reorderWorkgroupsStrategy = " << reorderStr
<< ", enableUkernels = " << options.enableUkernels << "}";
}
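With the new field, the printed pipeline options (for example in pass debug output) would read along these lines, assuming the defaults from Passes.h below (booleans print as 1/0 on a raw_ostream):

{enableReduceSharedMemoryBankConflicts = 1, prefetchSharedMemory = 0, reorderWorkgroupsStrategy = <not set>, enableUkernels = 0}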
@@ -790,7 +786,7 @@ void addGPUVectorDistributePassPipeline(OpPassManager &funcPassManager,
funcPassManager.addPass(createGPUReduceBankConflictsPass(options));
}

if (options.prefetchSharedMemory) {
funcPassManager.addPass(createLLVMGPUPrefetchSharedMemoryPass());
}
funcPassManager.addPass(memref::createFoldMemRefAliasOpsPass());
2 changes: 2 additions & 0 deletions compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.h
@@ -29,10 +29,12 @@ namespace LLVMGPUAttrNames {
inline constexpr StringLiteral kReorderWorkgroups = "reorder_workgroups";
inline constexpr StringLiteral kNoReduceSharedMemoryBankConflicts =
"no_reduce_shared_memory_bank_conflicts";
inline constexpr StringLiteral kPrefetchSharedMemory = "prefetch_shared_memory";
} // namespace LLVMGPUAttrNames

struct LLVMGPUPipelineOptions {
bool enableReduceSharedMemoryBankConflicts = true;
bool prefetchSharedMemory = false;
bool enableUkernels = false;
std::optional<ReorderWorkgroupsStrategy> reorderStrategy;
};
@@ -44,6 +44,7 @@ hal.executable.variant @rocm target(<"rocm", "rocm-hsaco-fb">) {
// CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [128, 2, 1] subgroup_size = 64
// CHECK-SAME: mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<MFMA_F16_16x16x16_F32>,
// CHECK-SAME: subgroup_m_count = 2, subgroup_n_count = 2>
// CHECK-SAME: prefetch_shared_memory

// CHECK-LABEL: func.func @matmul_256x256x256_f16_f32()
// CHECK-SAME: translation_info = #[[$TRANSLATION]]
@@ -91,6 +92,7 @@ hal.executable.variant @rocm target(<"rocm", "rocm-hsaco-fb">) {
// CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [128, 2, 1] subgroup_size = 64
// CHECK-SAME: mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<MFMA_F16_16x16x16_F32>,
// CHECK-SAME: subgroup_m_count = 2, subgroup_n_count = 2>
// CHECK-SAME: prefetch_shared_memory

// CHECK-LABEL: func.func @matmul_256x256x256_f16_f16()
// CHECK-SAME: translation_info = #[[$TRANSLATION]]
@@ -156,6 +158,8 @@ hal.executable.variant @rocm target(<"rocm", "rocm-hsaco-fb">) {
}

// CHECK: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64
// CHECK-SAME: prefetch_shared_memory

// CHECK: func @expanded_matmul_transpose_b
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// This has more than 2 iterations. So we have prefetching enabled for this case. Due to
@@ -272,6 +276,7 @@ hal.executable public @main_dispatch_expanded_matmul {
// CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [128, 2, 1] subgroup_size = 64
// CHECK-SAME: mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<MFMA_F16_16x16x16_F32>,
// CHECK-SAME: subgroup_m_count = 2, subgroup_n_count = 2>
// CHECK-SAME: prefetch_shared_memory

// CHECK-LABEL: func.func @generic_2x1024x20x64x1280_f16
// This has more than 2 iterations. So we have prefetching enabled for this case. Due to
@@ -321,7 +326,7 @@ hal.executable.variant @rocm target(<"rocm", "rocm-hsaco-fb">) {
// RDNA3: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [64, 2, 1] subgroup_size = 32
// RDNA3-SAME: mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<WMMA_F16_16x16x16_F32>,
// RDNA3-SAME: subgroup_m_count = 2, subgroup_n_count = 2>

// RDNA3-SAME: prefetch_shared_memory

// RDNA3-LABEL: func.func @matmul_256x256x256_f16_f32
// RDNA3-SAME: translation_info = #[[$TRANSLATION]]
@@ -373,6 +378,7 @@ hal.executable.variant @rocm target(<"rocm", "rocm-hsaco-fb">) {
// CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUPadAndVectorDistribute workgroup_size = [64, 1, 1] subgroup_size = 64
// CHECK-SAME: mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<MFMA_F16_16x16x16_F32>,
// CHECK-SAME: subgroup_m_count = 1, subgroup_n_count = 1>
// CHECK-SAME: prefetch_shared_memory

// CHECK-LABEL: func.func @unaligned_nk_batch_matmul()
// CHECK-SAME: translation_info = #[[$TRANSLATION]]
@@ -462,6 +468,7 @@ hal.executable public @contract_schedule_considering_read_layout {
// CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64
// CHECK-SAME: mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<MFMA_F16_16x16x16_F32>,
// CHECK-SAME: subgroup_m_count = 1, subgroup_n_count = 4>
// CHECK-SAME: prefetch_shared_memory

// CHECK-LABEL: func.func @contract_schedule_considering_read_layout()
// CHECK-SAME: translation_info = #[[$TRANSLATION]]
