diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp index 53e49efbf66a..2ab98ce03367 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp @@ -837,7 +837,7 @@ static void addVectorBufferizePasses(OpPassManager &funcPassManager) { void addGPUVectorDistributePassPipeline(OpPassManager &funcPassManager, const GPUPipelineOptions &options, bool usePadToModelSharedMemcpy) { - tileAndDistributeToWorkgroup(funcPassManager, /*useForall=*/false); + tileAndDistributeToWorkgroup(funcPassManager, /*useForall=*/true); ReorderWorkgroupsStrategy reorderStrategy = getReorderWorkgroupsStrategy(options.reorderStrategy); @@ -1267,6 +1267,7 @@ void buildROCDLCodegenPassPipeline(OpPassManager &variantPassManager) { .addPass(createVerifyWorkgroupDistributionPass); } variantPassManager.addPass(createReconcileTranslationInfoPass()); + variantPassManager.addPass(createLowerAffinePass()); variantPassManager.addPass(IREE::Util::createDropCompilerHintsPass()); addLowerToLLVMGPUPasses(variantPassManager.nest(), diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_vector_distribute_gfx942.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_vector_distribute_gfx942.mlir index 4396888ad90b..3958e3499ee1 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_vector_distribute_gfx942.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_vector_distribute_gfx942.mlir @@ -1023,7 +1023,7 @@ hal.executable private @attention_20x4096x64x4096x64 { // Check that we only use alloc for Q, K, and V. No shared memory for S is // needed because the intrinsic layout mathes. // MEMORY-LABEL: func.func @attention_20x4096x64x4096x64() -// MEMORY-COUNT-3: memref.alloc +// MEMORY-COUNT-4: memref.alloc // MEMORY-NOT: memref.alloc // ----- @@ -1084,14 +1084,14 @@ hal.executable private @attention_multiple_m_transpose { // CHECK-LABEL: func.func @attention_multiple_m_transpose() // CHECK: scf.for %{{.*}} = %c0 to %c4608 step %c64 -// CHECK-SAME: -> (vector<2x1x1xf32>, vector<2x1x1xf32>, vector<2x8x1x1x1x4xf32>) +// CHECK-SAME: -> (vector<2x1x1xf32>, vector<2x1x1xf32>, vector<2x4x1x1x1x4xf32>) // CHECK-COUNT-96: amdgpu.mfma {{.*}} {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> // CHECK: scf.yield // Check that we only use alloc for Q, K, and V. No shared memory for S is // needed because the intrinsic layout mathes. // MEMORY-LABEL: func.func @attention_multiple_m_transpose() -// MEMORY-COUNT-3: memref.alloc +// MEMORY-COUNT-4: memref.alloc // MEMORY-NOT: memref.alloc // ----- @@ -1152,8 +1152,8 @@ hal.executable private @attention_mfma_32x32x8 { // CHECK-LABEL: func.func @attention_mfma_32x32x8() // CHECK: scf.for %{{.*}} = %c0 to %c4608 step %c32 -// CHECK-SAME: -> (vector<1x1x1xf32>, vector<1x1x1xf32>, vector<1x4x1x4x1x4xf32>) -// CHECK-COUNT-32: amdgpu.mfma {{.*}} {blocks = 1 : i32, k = 8 : i32, m = 32 : i32, n = 32 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<16xf32> +// CHECK-SAME: -> (vector<1x1x1xf32>, vector<1x1x1xf32>, vector<1x2x1x4x1x4xf32>) +// CHECK-COUNT-24: amdgpu.mfma {{.*}} {blocks = 1 : i32, k = 8 : i32, m = 32 : i32, n = 32 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<16xf32> // CHECK: scf.yield // Check that we only use alloc for Q, K, and V. No shared memory for S is