[LLVMGPU] Enable scf.forall distr. on vectorDistribute Pipeline
pashu123 committed Dec 13, 2024
1 parent 0cafee9 commit f6c10c5
Showing 2 changed files with 7 additions and 6 deletions.
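For context, the sketch below is not taken from this commit; it only illustrates the rough shape of IR produced when workgroup tiling uses scf.forall, as the /*useForall=*/true flag flipped in this commit suggests: the loop's induction variables stand in for the workgroup IDs, and a mapping attribute records which loop dimension maps to which workgroup dimension. The function name, the shapes, and the upstream #gpu.block mapping form are illustrative assumptions; the IREE pass attaches its own workgroup-mapping attribute.

// Hypothetical sketch only: workgroup distribution expressed with scf.forall.
func.func @forall_distribution_sketch(%init: tensor<256x256xf32>) -> tensor<256x256xf32> {
  %result = scf.forall (%i, %j) = (0, 0) to (256, 256) step (64, 64)
      shared_outs(%out = %init) -> (tensor<256x256xf32>) {
    // Each (%i, %j) iteration corresponds to one workgroup's 64x64 tile.
    %tile = tensor.extract_slice %out[%i, %j] [64, 64] [1, 1]
        : tensor<256x256xf32> to tensor<64x64xf32>
    // ... tiled computation for this workgroup's slice goes here ...
    scf.forall.in_parallel {
      tensor.parallel_insert_slice %tile into %out[%i, %j] [64, 64] [1, 1]
          : tensor<64x64xf32> into tensor<256x256xf32>
    }
  } {mapping = [#gpu.block<y>, #gpu.block<x>]}
  return %result : tensor<256x256xf32>
}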
compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp (3 changes: 2 additions & 1 deletion)
@@ -837,7 +837,7 @@ static void addVectorBufferizePasses(OpPassManager &funcPassManager) {
void addGPUVectorDistributePassPipeline(OpPassManager &funcPassManager,
const GPUPipelineOptions &options,
bool usePadToModelSharedMemcpy) {
- tileAndDistributeToWorkgroup(funcPassManager, /*useForall=*/false);
+ tileAndDistributeToWorkgroup(funcPassManager, /*useForall=*/true);

ReorderWorkgroupsStrategy reorderStrategy =
getReorderWorkgroupsStrategy(options.reorderStrategy);
@@ -1267,6 +1267,7 @@ void buildROCDLCodegenPassPipeline(OpPassManager &variantPassManager) {
.addPass(createVerifyWorkgroupDistributionPass);
}
variantPassManager.addPass(createReconcileTranslationInfoPass());
+ variantPassManager.addPass(createLowerAffinePass());
variantPassManager.addPass(IREE::Util::createDropCompilerHintsPass());

addLowerToLLVMGPUPasses(variantPassManager.nest<ModuleOp>(),
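The added createLowerAffinePass() run is presumably needed because resolving the scf.forall-based workgroup distribution leaves affine.apply ops in the variant that must become arith ops before the LLVM conversion; that motivation is an assumption, not stated in the commit. The snippet below is a generic illustration of what the pass does, with %wg_id as a placeholder index value.

// Before lowering: an affine.apply computing a 64-element tile offset
// from a workgroup id (%wg_id is a hypothetical index value).
%off = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%wg_id]

// After createLowerAffinePass(): the same computation in arith ops.
%c64 = arith.constant 64 : index
%off = arith.muli %wg_id, %c64 : index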
Second changed file (LLVMGPU attention pipeline test):
@@ -1023,7 +1023,7 @@ hal.executable private @attention_20x4096x64x4096x64 {
// Check that we only use alloc for Q, K, and V. No shared memory for S is
// needed because the intrinsic layout matches.
// MEMORY-LABEL: func.func @attention_20x4096x64x4096x64()
- // MEMORY-COUNT-3: memref.alloc
+ // MEMORY-COUNT-4: memref.alloc
// MEMORY-NOT: memref.alloc

// -----
@@ -1084,14 +1084,14 @@ hal.executable private @attention_multiple_m_transpose {

// CHECK-LABEL: func.func @attention_multiple_m_transpose()
// CHECK: scf.for %{{.*}} = %c0 to %c4608 step %c64
- // CHECK-SAME: -> (vector<2x1x1xf32>, vector<2x1x1xf32>, vector<2x8x1x1x1x4xf32>)
+ // CHECK-SAME: -> (vector<2x1x1xf32>, vector<2x1x1xf32>, vector<2x4x1x1x1x4xf32>)
// CHECK-COUNT-96: amdgpu.mfma {{.*}} {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
// CHECK: scf.yield

// Check that we only use alloc for Q, K, and V. No shared memory for S is
// needed because the intrinsic layout matches.
// MEMORY-LABEL: func.func @attention_multiple_m_transpose()
- // MEMORY-COUNT-3: memref.alloc
+ // MEMORY-COUNT-4: memref.alloc
// MEMORY-NOT: memref.alloc

// -----
@@ -1152,8 +1152,8 @@ hal.executable private @attention_mfma_32x32x8 {

// CHECK-LABEL: func.func @attention_mfma_32x32x8()
// CHECK: scf.for %{{.*}} = %c0 to %c4608 step %c32
- // CHECK-SAME: -> (vector<1x1x1xf32>, vector<1x1x1xf32>, vector<1x4x1x4x1x4xf32>)
- // CHECK-COUNT-32: amdgpu.mfma {{.*}} {blocks = 1 : i32, k = 8 : i32, m = 32 : i32, n = 32 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<16xf32>
+ // CHECK-SAME: -> (vector<1x1x1xf32>, vector<1x1x1xf32>, vector<1x2x1x4x1x4xf32>)
+ // CHECK-COUNT-24: amdgpu.mfma {{.*}} {blocks = 1 : i32, k = 8 : i32, m = 32 : i32, n = 32 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<16xf32>
// CHECK: scf.yield

// Check that we only use alloc for Q, K, and V. No shared memory for S is
