From 6020cbf3d1377ff9ce7a63548fcb2e4d59eb82eb Mon Sep 17 00:00:00 2001 From: hanhanW Date: Tue, 6 Aug 2024 14:44:52 -0700 Subject: [PATCH] [GPU][NFC] Follow the official convention to define mfma/wmma attributes The LLVM intrinsics and official docs all use the `[output_type]_MxNxK_[input_type]` format. This revision updates IREE's definitions to follow that convention. Some examples from official docs: - https://gpuopen.com/learn/wmma_on_rdna3/ - https://gpuopen.com/learn/amd-lab-notes/amd-lab-notes-matrix-cores-readme/ - https://github.com/ROCm/amd_matrix_instruction_calculator Signed-off-by: hanhanW --- .../gpu_nested_layout_contract_amdgpu.mlir | 12 +-- .../Common/GPU/test/gpu_tensor_alloc.mlir | 2 +- .../Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp | 94 +++++++++---------- .../Codegen/Dialect/GPU/IR/IREEGPUEnums.td | 20 ++-- .../Codegen/Dialect/GPU/IR/IREEGPUOps.td | 6 +- .../Dialect/GPU/IR/test/iree_gpu_attrs.mlir | 8 +- .../Dialect/GPU/IR/test/iree_gpu_ops.mlir | 24 ++--- .../Dialect/GPU/IR/test/target_attrs.mlir | 6 +- .../Dialect/GPU/TargetUtils/ConfigUtils.cpp | 4 +- .../Dialect/GPU/TargetUtils/KnownTargets.cpp | 18 ++-- .../test/convert_to_multi_mma.mlir | 12 +-- .../test/distribute_multi_mma.mlir | 4 +- .../test/drop_multi_mma_unit_dims.mlir | 8 +- .../test/lower_multi_mma.mlir | 6 +- .../test/unroll_multi_mma.mlir | 4 +- .../test/vectorize_iree_gpu_ops.mlir | 4 +- .../test/distribute_mma_to_lanes.mlir | 4 +- .../Transforms/test/pack_to_intrinsics.mlir | 8 +- .../test/ROCDL/config_tile_and_fuse.mlir | 4 +- .../ROCDL/config_user_vector_distribute.mlir | 16 ++-- .../test/ROCDL/config_vector_distribute.mlir | 12 +-- .../test/ROCDL/pipeline_tile_and_fuse.mlir | 4 +- .../ROCDL/pipeline_vector_distribute.mlir | 18 ++-- .../test/amdgpu_contraction_distribution.mlir | 8 +- .../test/amdgpu_set_anchor_layouts.mlir | 4 +- .../test/attention_mfma_transform_spec.mlir | 2 +- .../LLVMGPU/test/cast_type_to_fit_mma.mlir | 8 +- .../LLVMGPU/test/configure_vector_layout.mlir | 22 ++--- 28 files changed, 171 insertions(+), 171 deletions(-) diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_nested_layout_contract_amdgpu.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_nested_layout_contract_amdgpu.mlir index b8992c68d20e..e2299523900f 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_nested_layout_contract_amdgpu.mlir +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_nested_layout_contract_amdgpu.mlir @@ -51,7 +51,7 @@ func.func @contract_to_mfma_32x32x8_mm(%a : vector<32x8xf16>, %b : vector<8x32xf indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>, - iree.amdgpu.mma = #iree_gpu.mma_layout<MFMA_F16_32x32x8_F32> + iree.amdgpu.mma = #iree_gpu.mma_layout<MFMA_F32_32x32x8_F16> } %A, %B, %C : vector<32x8xf16>, vector<8x32xf16> into vector<32x32xf32> %O = iree_vector_ext.to_layout %output to #layout_c : vector<32x32xf32> @@ -128,7 +128,7 @@ func.func @contract_to_mfma_16x16x16_mm(%a : vector<16x16xf16>, %b : vector<16x1 indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>, - iree.amdgpu.mma = #iree_gpu.mma_layout<MFMA_F16_16x16x16_F32> + iree.amdgpu.mma = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16> } %A, %B, %C : vector<16x16xf16>, vector<16x16xf16> into vector<16x16xf32> %O = iree_vector_ext.to_layout %output to #layout_b : vector<16x16xf32> @@ -216,7 +216,7 @@ func.func @contract_to_mfma_32x32x8_mm_mnbatch(%a : vector<64x8xf16>, %b : vecto indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel",
"parallel", "reduction"], kind = #vector.kind, - iree.amdgpu.mma = #iree_gpu.mma_layout + iree.amdgpu.mma = #iree_gpu.mma_layout } %A, %B, %C : vector<64x8xf16>, vector<8x32xf16> into vector<64x32xf32> %O = iree_vector_ext.to_layout %output to #layout_c : vector<64x32xf32> @@ -305,7 +305,7 @@ func.func @contract_to_mfma_32x32x8_mm_kbatch(%a : vector<32x16xf16>, %b : vecto indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind, - iree.amdgpu.mma = #iree_gpu.mma_layout + iree.amdgpu.mma = #iree_gpu.mma_layout } %A, %B, %C : vector<32x16xf16>, vector<16x32xf16> into vector<32x32xf32> %O = iree_vector_ext.to_layout %output to #layout_c : vector<32x32xf32> @@ -388,7 +388,7 @@ func.func @contract_to_mfma_32x32x8_mm_mnbatch_order(%a : vector<64x8xf16>, %b : indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind, - iree.amdgpu.mma = #iree_gpu.mma_layout + iree.amdgpu.mma = #iree_gpu.mma_layout } %A, %B, %C : vector<64x8xf16>, vector<8x96xf16> into vector<64x96xf32> %O = iree_vector_ext.to_layout %output to #layout_c : vector<64x96xf32> @@ -479,7 +479,7 @@ func.func @contract_to_mfma_32x32x8_mmt(%a : vector<32x8xf16>, %b : vector<64x8x indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind, - iree.amdgpu.mma = #iree_gpu.mma_layout + iree.amdgpu.mma = #iree_gpu.mma_layout } %A, %B, %C : vector<32x8xf16>, vector<64x8xf16> into vector<32x64xf32> %O = iree_vector_ext.to_layout %output to #layout_c : vector<32x64xf32> diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_tensor_alloc.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_tensor_alloc.mlir index 4a681e3610c8..527c083955c0 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_tensor_alloc.mlir +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_tensor_alloc.mlir @@ -242,7 +242,7 @@ func.func @weight_dequant_matmul() { #hal.descriptor_set.binding<2, storage_buffer> ]> ]> -func.func @conv() attributes {translation_info = #iree_codegen.translation_info, subgroup_m_count = 1, subgroup_n_count = 4>}>} { +func.func @conv() attributes {translation_info = #iree_codegen.translation_info, subgroup_m_count = 1, subgroup_n_count = 4>}>} { %cst = arith.constant 0.000000e+00 : f32 %c0 = arith.constant 0 : index %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp index 00ca348a2edf..ab0d9e185360 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp @@ -215,19 +215,19 @@ static OpaqueMmaLayout getOpaqueMFMALayout(MLIRContext *context, case MMAIntrinsic::MFMA_F32_16x16x4_F32: { return OpaqueMmaLayout{16, 16, 4, f32, f32, f32}; } - case MMAIntrinsic::MFMA_F16_16x16x16_F32: { + case MMAIntrinsic::MFMA_F32_16x16x16_F16: { return OpaqueMmaLayout{16, 16, 16, f16, f16, f32}; } - case MMAIntrinsic::MFMA_F16_32x32x8_F32: { + case MMAIntrinsic::MFMA_F32_32x32x8_F16: { return OpaqueMmaLayout{32, 32, 8, f16, f16, f32}; } - case MMAIntrinsic::MFMA_F8E4M3FNUZ_16x16x32_F32: { + case MMAIntrinsic::MFMA_F32_16x16x32_F8E4M3FNUZ: { return OpaqueMmaLayout{16, 16, 32, f8E4M3FNUZ, f8E4M3FNUZ, f32}; } - case 
MMAIntrinsic::MFMA_I8_16x16x32_I32: { + case MMAIntrinsic::MFMA_I32_16x16x32_I8: { return OpaqueMmaLayout{16, 16, 32, i8, i8, i32}; } - case MMAIntrinsic::MFMA_I8_32x32x16_I32: { + case MMAIntrinsic::MFMA_I32_32x32x16_I8: { return OpaqueMmaLayout{32, 32, 16, i8, i8, i32}; } case MMAIntrinsic::WMMA_F16_16x16x16_F32: { @@ -277,7 +277,7 @@ static ConcreteMmaLayout getConcreteMFMALayout(MLIRContext *context, return ConcreteMmaLayout{opaqueLayout, aMLayout, aKLayout, bKLayout, bNLayout, cMLayout, cNLayout}; } - case MMAIntrinsic::MFMA_F16_16x16x16_F32: { + case MMAIntrinsic::MFMA_F32_16x16x16_F16: { // #outer = #iree_vector_ext.per_dim_layout<[LANEX], [16]> // #inner = #iree_vector_ext.per_dim_layout<[LANEY, VECTORX], [4, 4]> // #layout_a = #iree_vector_ext.layout<#outer, #inner> @@ -295,7 +295,7 @@ static ConcreteMmaLayout getConcreteMFMALayout(MLIRContext *context, return ConcreteMmaLayout{opaqueLayout, aMLayout, aKLayout, bKLayout, bNLayout, cMLayout, cNLayout}; } - case MMAIntrinsic::MFMA_F16_32x32x8_F32: { + case MMAIntrinsic::MFMA_F32_32x32x8_F16: { // #outer = #iree_vector_ext.per_dim_layout<[LANEX], [32]> // #inner1 = #iree_vector_ext.per_dim_layout<[LANEY, VECTORX], [2, 4]> // #inner2 = #iree_vector_ext.per_dim_layout<[VECTORY, LANEY, VECTORX], @@ -316,8 +316,8 @@ static ConcreteMmaLayout getConcreteMFMALayout(MLIRContext *context, return ConcreteMmaLayout{opaqueLayout, aMLayout, aKLayout, bKLayout, bNLayout, cMLayout, cNLayout}; } - case MMAIntrinsic::MFMA_F8E4M3FNUZ_16x16x32_F32: - case MMAIntrinsic::MFMA_I8_16x16x32_I32: { + case MMAIntrinsic::MFMA_F32_16x16x32_F8E4M3FNUZ: + case MMAIntrinsic::MFMA_I32_16x16x32_I8: { // #outer = #iree_vector_ext.per_dim_layout<[LANEX], [16]> // #inner = #iree_vector_ext.per_dim_layout<[LANEY, VECTORX], [4, 8]> // #layout_a = #iree_vector_ext.layout<#outer, #inner> @@ -334,7 +334,7 @@ static ConcreteMmaLayout getConcreteMFMALayout(MLIRContext *context, return ConcreteMmaLayout{opaqueLayout, aMLayout, aKLayout, bKLayout, bNLayout, cMLayout, cNLayout}; } - case MMAIntrinsic::MFMA_I8_32x32x16_I32: { + case MMAIntrinsic::MFMA_I32_32x32x16_I8: { // #outer = #iree_vector_ext.per_dim_layout<[LANEX], [16]> // #inner = #iree_vector_ext.per_dim_layout<[LANEY, VECTORX], [2, 8]> // #layout_a = #iree_vector_ext.layout<#outer, #inner> @@ -437,26 +437,26 @@ MMAAttr::getABCVectorTypes() const { auto cType = VectorType::get({4}, getCType()); return std::make_tuple(aType, bType, cType); } - case MMAIntrinsic::MFMA_F16_16x16x16_F32: { + case MMAIntrinsic::MFMA_F32_16x16x16_F16: { auto aType = VectorType::get({4}, getAType()); auto bType = VectorType::get({4}, getBType()); auto cType = VectorType::get({4}, getCType()); return std::make_tuple(aType, bType, cType); } - case MMAIntrinsic::MFMA_F16_32x32x8_F32: { + case MMAIntrinsic::MFMA_F32_32x32x8_F16: { auto aType = VectorType::get({4}, getAType()); auto bType = VectorType::get({4}, getBType()); auto cType = VectorType::get({16}, getCType()); return std::make_tuple(aType, bType, cType); } - case MMAIntrinsic::MFMA_F8E4M3FNUZ_16x16x32_F32: - case MMAIntrinsic::MFMA_I8_16x16x32_I32: { + case MMAIntrinsic::MFMA_F32_16x16x32_F8E4M3FNUZ: + case MMAIntrinsic::MFMA_I32_16x16x32_I8: { auto aType = VectorType::get({8}, getAType()); auto bType = VectorType::get({8}, getBType()); auto cType = VectorType::get({4}, getCType()); return std::make_tuple(aType, bType, cType); } - case MMAIntrinsic::MFMA_I8_32x32x16_I32: { + case MMAIntrinsic::MFMA_I32_32x32x16_I8: { auto aType = VectorType::get({8}, getAType()); auto bType = 
VectorType::get({8}, getBType()); auto cType = VectorType::get({16}, getCType()); @@ -485,11 +485,11 @@ MMAAttr::getContractionLayout(vector::ContractionOp contract) const { int64_t MMAAttr::getBlockSize() const { switch (getIntrinsic().getValue()) { case MMAIntrinsic::MFMA_F32_16x16x4_F32: - case MMAIntrinsic::MFMA_F16_16x16x16_F32: - case MMAIntrinsic::MFMA_F16_32x32x8_F32: - case MMAIntrinsic::MFMA_F8E4M3FNUZ_16x16x32_F32: - case MMAIntrinsic::MFMA_I8_16x16x32_I32: - case MMAIntrinsic::MFMA_I8_32x32x16_I32: + case MMAIntrinsic::MFMA_F32_16x16x16_F16: + case MMAIntrinsic::MFMA_F32_32x32x8_F16: + case MMAIntrinsic::MFMA_F32_16x16x32_F8E4M3FNUZ: + case MMAIntrinsic::MFMA_I32_16x16x32_I8: + case MMAIntrinsic::MFMA_I32_32x32x16_I8: case MMAIntrinsic::WMMA_F16_16x16x16_F16: case MMAIntrinsic::WMMA_F16_16x16x16_F32: { return 1; @@ -502,11 +502,11 @@ int64_t MMAAttr::getBlockSize() const { int64_t MMAAttr::getSubgroupSize() const { switch (getIntrinsic().getValue()) { case MMAIntrinsic::MFMA_F32_16x16x4_F32: - case MMAIntrinsic::MFMA_F16_16x16x16_F32: - case MMAIntrinsic::MFMA_F16_32x32x8_F32: - case MMAIntrinsic::MFMA_F8E4M3FNUZ_16x16x32_F32: - case MMAIntrinsic::MFMA_I8_16x16x32_I32: - case MMAIntrinsic::MFMA_I8_32x32x16_I32: { + case MMAIntrinsic::MFMA_F32_16x16x16_F16: + case MMAIntrinsic::MFMA_F32_32x32x8_F16: + case MMAIntrinsic::MFMA_F32_16x16x32_F8E4M3FNUZ: + case MMAIntrinsic::MFMA_I32_16x16x32_I8: + case MMAIntrinsic::MFMA_I32_32x32x16_I8: { return 64; } case MMAIntrinsic::WMMA_F16_16x16x16_F32: @@ -524,20 +524,20 @@ MMAAttr::SingleSubgroupLayout MMAAttr::getASingleSubgroupLayout() const { return {/*outer=*/{1, 1}, /*thread=*/{16, 4}, /*strides=*/{1, 16}, /*element=*/{1, 1}}; } - case MMAIntrinsic::MFMA_F16_16x16x16_F32: { + case MMAIntrinsic::MFMA_F32_16x16x16_F16: { return {/*outer=*/{1, 1}, /*thread=*/{16, 4}, /*strides=*/{1, 16}, /*element=*/{1, 4}}; } - case MMAIntrinsic::MFMA_F16_32x32x8_F32: { + case MMAIntrinsic::MFMA_F32_32x32x8_F16: { return {/*outer=*/{1, 1}, /*thread=*/{32, 2}, /*strides=*/{1, 32}, /*element=*/{1, 4}}; } - case MMAIntrinsic::MFMA_F8E4M3FNUZ_16x16x32_F32: - case MMAIntrinsic::MFMA_I8_16x16x32_I32: { + case MMAIntrinsic::MFMA_F32_16x16x32_F8E4M3FNUZ: + case MMAIntrinsic::MFMA_I32_16x16x32_I8: { return {/*outer=*/{1, 1}, /*thread=*/{16, 4}, /*strides=*/{1, 16}, /*element=*/{1, 8}}; } - case MMAIntrinsic::MFMA_I8_32x32x16_I32: { + case MMAIntrinsic::MFMA_I32_32x32x16_I8: { return {/*outer=*/{1, 1}, /*thread=*/{32, 2}, /*strides=*/{1, 32}, /*element=*/{1, 8}}; } @@ -556,20 +556,20 @@ MMAAttr::SingleSubgroupLayout MMAAttr::getBSingleSubgroupLayout() const { return {/*outer=*/{1, 1}, /*thread=*/{4, 16}, /*strides=*/{16, 1}, /*element=*/{1, 1}}; } - case MMAIntrinsic::MFMA_F16_16x16x16_F32: { + case MMAIntrinsic::MFMA_F32_16x16x16_F16: { return {/*outer=*/{1, 1}, /*thread=*/{4, 16}, /*strides=*/{16, 1}, /*element=*/{4, 1}}; } - case MMAIntrinsic::MFMA_F16_32x32x8_F32: { + case MMAIntrinsic::MFMA_F32_32x32x8_F16: { return {/*outer=*/{1, 1}, /*thread=*/{2, 32}, /*strides=*/{32, 1}, /*element=*/{4, 1}}; } - case MMAIntrinsic::MFMA_F8E4M3FNUZ_16x16x32_F32: - case MMAIntrinsic::MFMA_I8_16x16x32_I32: { + case MMAIntrinsic::MFMA_F32_16x16x32_F8E4M3FNUZ: + case MMAIntrinsic::MFMA_I32_16x16x32_I8: { return {/*outer=*/{1, 1}, /*thread=*/{4, 16}, /*strides=*/{16, 1}, /*element=*/{8, 1}}; } - case MMAIntrinsic::MFMA_I8_32x32x16_I32: { + case MMAIntrinsic::MFMA_I32_32x32x16_I8: { return {/*outer=*/{1, 1}, /*thread=*/{2, 32}, /*strides=*/{32, 1}, /*element=*/{8, 1}}; } @@ 
-585,14 +585,14 @@ MMAAttr::SingleSubgroupLayout MMAAttr::getBSingleSubgroupLayout() const { MMAAttr::SingleSubgroupLayout MMAAttr::getCSingleSubgroupLayout() const { switch (getIntrinsic().getValue()) { case MMAIntrinsic::MFMA_F32_16x16x4_F32: - case MMAIntrinsic::MFMA_F16_16x16x16_F32: - case MMAIntrinsic::MFMA_F8E4M3FNUZ_16x16x32_F32: - case MMAIntrinsic::MFMA_I8_16x16x32_I32: { + case MMAIntrinsic::MFMA_F32_16x16x16_F16: + case MMAIntrinsic::MFMA_F32_16x16x32_F8E4M3FNUZ: + case MMAIntrinsic::MFMA_I32_16x16x32_I8: { return {/*outer=*/{1, 1}, /*thread=*/{4, 16}, /*strides=*/{16, 1}, /*element=*/{4, 1}}; } - case MMAIntrinsic::MFMA_F16_32x32x8_F32: - case MMAIntrinsic::MFMA_I8_32x32x16_I32: { + case MMAIntrinsic::MFMA_F32_32x32x8_F16: + case MMAIntrinsic::MFMA_I32_32x32x16_I8: { return {/*outer=*/{4, 1}, /*thread=*/{2, 32}, /*strides=*/{32, 1}, /*element=*/{4, 1}}; } @@ -632,11 +632,11 @@ FailureOr<Value> MMAAttr::buildMmaOperation(OpBuilder &builder, Location loc, rhs, acc) .getResult(); } - case MMAIntrinsic::MFMA_F16_16x16x16_F32: - case MMAIntrinsic::MFMA_F16_32x32x8_F32: - case MMAIntrinsic::MFMA_F8E4M3FNUZ_16x16x32_F32: - case MMAIntrinsic::MFMA_I8_16x16x32_I32: - case MMAIntrinsic::MFMA_I8_32x32x16_I32: { + case MMAIntrinsic::MFMA_F32_16x16x16_F16: + case MMAIntrinsic::MFMA_F32_32x32x8_F16: + case MMAIntrinsic::MFMA_F32_16x16x32_F8E4M3FNUZ: + case MMAIntrinsic::MFMA_I32_16x16x32_I8: + case MMAIntrinsic::MFMA_I32_32x32x16_I8: { auto [m, n, k] = getMNKShape(); return builder .create<amdgpu::MFMAOp>(loc, resultType, m, n, k, getBlockSize(), lhs, @@ -716,8 +716,8 @@ LogicalResult MMAAttr::populateOperandOffsetsSizesStrides( SmallVector<OpFoldResult> &offsets, SmallVector<OpFoldResult> &sizes, SmallVector<OpFoldResult> &strides) const { switch (getIntrinsic().getValue()) { - case MMAIntrinsic::MFMA_F16_16x16x16_F32: - case MMAIntrinsic::MFMA_I8_16x16x32_I32: + case MMAIntrinsic::MFMA_F32_16x16x16_F16: + case MMAIntrinsic::MFMA_I32_16x16x32_I8: break; default: return failure(); diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUEnums.td b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUEnums.td index 55a83b3fc131..a68a8a3fe1fb 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUEnums.td +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUEnums.td @@ -100,11 +100,11 @@ class IREEGPU_I32MmaEnumAttr // Format: <kind>_<output-type>_<M>x<N>x<K>_<input-type> def MFMA_F32_16x16x4_F32 : I32EnumAttrCase<"MFMA_F32_16x16x4_F32", 0>; -def MFMA_F16_16x16x16_F32 : I32EnumAttrCase<"MFMA_F16_16x16x16_F32", 1>; -def MFMA_F16_32x32x8_F32 : I32EnumAttrCase<"MFMA_F16_32x32x8_F32", 2>; -def MFMA_F8E4M3FNUZ_16x16x32_F32 : I32EnumAttrCase<"MFMA_F8E4M3FNUZ_16x16x32_F32", 3>; -def MFMA_I8_16x16x32_I32 : I32EnumAttrCase<"MFMA_I8_16x16x32_I32", 4>; -def MFMA_I8_32x32x16_I32 : I32EnumAttrCase<"MFMA_I8_32x32x16_I32", 5>; +def MFMA_F32_16x16x16_F16 : I32EnumAttrCase<"MFMA_F32_16x16x16_F16", 1>; +def MFMA_F32_32x32x8_F16 : I32EnumAttrCase<"MFMA_F32_32x32x8_F16", 2>; +def MFMA_F32_16x16x32_F8E4M3FNUZ : I32EnumAttrCase<"MFMA_F32_16x16x32_F8E4M3FNUZ", 3>; +def MFMA_I32_16x16x32_I8 : I32EnumAttrCase<"MFMA_I32_16x16x32_I8", 4>; +def MFMA_I32_32x32x16_I8 : I32EnumAttrCase<"MFMA_I32_32x32x16_I8", 5>; // TODO: Create separate WMMA ops for AMD and NVIDIA GPUs def WMMA_F16_16x16x16_F32 : I32EnumAttrCase<"WMMA_F16_16x16x16_F32", 6>; def WMMA_F16_16x16x16_F16 : I32EnumAttrCase<"WMMA_F16_16x16x16_F16", 7>; @@ -112,11 +112,11 @@ def WMMA_F16_16x16x16_F16 : I32EnumAttrCase<"WMMA_F16_16x16x16_F16", 7>; def IREEGPU_MMAIntrinsic : IREEGPU_I32MmaEnumAttr<"MMAIntrinsic", "Descriptor
for different MMA intrinsics", [ MFMA_F32_16x16x4_F32, - MFMA_F16_16x16x16_F32, - MFMA_F16_32x32x8_F32, - MFMA_F8E4M3FNUZ_16x16x32_F32, - MFMA_I8_16x16x32_I32, - MFMA_I8_32x32x16_I32, + MFMA_F32_16x16x16_F16, + MFMA_F32_32x32x8_F16, + MFMA_F32_16x16x32_F8E4M3FNUZ, + MFMA_I32_16x16x32_I8, + MFMA_I32_32x32x16_I8, WMMA_F16_16x16x16_F32, WMMA_F16_16x16x16_F16 ]>; diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUOps.td b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUOps.td index 0549c090e715..cfc4ffecd3df 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUOps.td +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUOps.td @@ -56,7 +56,7 @@ def IREEGPU_MultiMmaOp : Op + kind = #iree_gpu.mma_layout } %3 = iree_gpu.multi_mma %0, %1, %2 #contraction_trait : vector<2x3x4xf16>, vector<3x5x4xf16> into vector<2x5x4xf32> @@ -99,7 +99,7 @@ def IREEGPU_MultiMmaOp : Op + kind = #iree_gpu.mma_layout } %3 = iree_gpu.multi_mma %0, %1, %2 #contraction_trait : vector<4xf16>, vector<4xf16> into vector<4xf32> @@ -127,7 +127,7 @@ def IREEGPU_MultiMmaOp : Op, + kind = #iree_gpu.mma_layout, rhs_permutation = [1, 0] } %7 = iree_gpu.multi_mma %4, %5, %6 #contraction_trait diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/test/iree_gpu_attrs.mlir b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/test/iree_gpu_attrs.mlir index 8ee674d1b371..07663501c79f 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/test/iree_gpu_attrs.mlir +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/test/iree_gpu_attrs.mlir @@ -2,21 +2,21 @@ module { func.func @test_mfma_f16_16x16x16_f32() attributes { - mma_types = #iree_gpu.mma_layout} { + mma_types = #iree_gpu.mma_layout} { return } } // CHECK-LABEL: func @test_mfma_f16_16x16x16_f32 -// CHECK-SAME: mma_types = #iree_gpu.mma_layout +// CHECK-SAME: mma_types = #iree_gpu.mma_layout module { func.func @test_mfma_f16_32x32x8_f32() attributes { - mma_types = #iree_gpu.mma_layout} { + mma_types = #iree_gpu.mma_layout} { return } } // CHECK-LABEL: func @test_mfma_f16_32x32x8_f32 -// CHECK-SAME: mma_types = #iree_gpu.mma_layout +// CHECK-SAME: mma_types = #iree_gpu.mma_layout module { func.func @test_wmma_f16_16x16x16_f32() attributes { diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/test/iree_gpu_ops.mlir b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/test/iree_gpu_ops.mlir index a18506849d88..690acf66f278 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/test/iree_gpu_ops.mlir +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/test/iree_gpu_ops.mlir @@ -63,7 +63,7 @@ func.func @vector_multi_mma(%lhs: vector<2x3x4xf16>, %rhs: vector<3x5x4xf16>, %a %0 = iree_gpu.multi_mma %lhs, %rhs, %acc { indexing_maps = #contraction_accesses, iterator_types = [#iree_gpu.iterator_type, #iree_gpu.iterator_type, #iree_gpu.iterator_type], - kind = #iree_gpu.mma_layout + kind = #iree_gpu.mma_layout } : vector<2x3x4xf16>, vector<3x5x4xf16> into vector<2x5x4xf32> return %0 : vector<2x5x4xf32> } @@ -76,7 +76,7 @@ func.func @vector_multi_mma(%lhs: vector<2x3x4xf16>, %rhs: vector<3x5x4xf16>, %a // CHECK: iree_gpu.multi_mma %arg0, %arg1, %arg2 // CHECK-SAME: indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]] // CHECK-SAME: iterator_types = [#iree_gpu.iterator_type, #iree_gpu.iterator_type, #iree_gpu.iterator_type] -// CHECK-SAME: kind = #iree_gpu.mma_layout +// CHECK-SAME: kind = #iree_gpu.mma_layout // CHECK-SAME: : vector<2x3x4xf16>, vector<3x5x4xf16> into vector<2x5x4xf32> // ----- @@ -90,7 
+90,7 @@ func.func @tensor_multi_mma(%lhs: tensor, %rhs: tensor, %a %0 = iree_gpu.multi_mma %lhs, %rhs, %acc { indexing_maps = #contraction_accesses, iterator_types = [#iree_gpu.iterator_type, #iree_gpu.iterator_type, #iree_gpu.iterator_type], - kind = #iree_gpu.mma_layout + kind = #iree_gpu.mma_layout } : tensor, tensor into tensor return %0 : tensor } @@ -103,7 +103,7 @@ func.func @tensor_multi_mma(%lhs: tensor, %rhs: tensor, %a // CHECK: iree_gpu.multi_mma %arg0, %arg1, %arg2 // CHECK-SAME: indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]] // CHECK-SAME: iterator_types = [#iree_gpu.iterator_type, #iree_gpu.iterator_type, #iree_gpu.iterator_type] -// CHECK-SAME: kind = #iree_gpu.mma_layout +// CHECK-SAME: kind = #iree_gpu.mma_layout // CHECK-SAME: : tensor, tensor into tensor // ----- @@ -117,7 +117,7 @@ func.func @single_multi_mma(%lhs: vector<4xf16>, %rhs: vector<4xf16>, %acc: vect %0 = iree_gpu.multi_mma %lhs, %rhs, %acc { indexing_maps = #contraction_accesses, iterator_types = [], - kind = #iree_gpu.mma_layout + kind = #iree_gpu.mma_layout } : vector<4xf16>, vector<4xf16> into vector<4xf32> return %0 : vector<4xf32> } @@ -128,7 +128,7 @@ func.func @single_multi_mma(%lhs: vector<4xf16>, %rhs: vector<4xf16>, %acc: vect // CHECK: iree_gpu.multi_mma %arg0, %arg1, %arg2 // CHECK-SAME: indexing_maps = [#[[$MAP]], #[[$MAP]], #[[$MAP]]] // CHECK-SAME: iterator_types = [] -// CHECK-SAME: kind = #iree_gpu.mma_layout +// CHECK-SAME: kind = #iree_gpu.mma_layout // CHECK-SAME: : vector<4xf16>, vector<4xf16> into vector<4xf32> // ----- @@ -142,7 +142,7 @@ func.func @tensor_subgroup_multi_mma(%lhs: tensor, %rhs: tensor, #iree_gpu.iterator_type, #iree_gpu.iterator_type], - kind = #iree_gpu.mma_layout + kind = #iree_gpu.mma_layout } : tensor, tensor into tensor return %0 : tensor } @@ -155,7 +155,7 @@ func.func @tensor_subgroup_multi_mma(%lhs: tensor, %rhs: tensor, #iree_gpu.iterator_type, #iree_gpu.iterator_type], -// CHECK-SAME: kind = #iree_gpu.mma_layout} +// CHECK-SAME: kind = #iree_gpu.mma_layout} // CHECK-SAME: : tensor, tensor into tensor // ----- @@ -169,7 +169,7 @@ func.func @tensor_subgroup_matmul_transpose_b_multi_mma(%lhs: tensor, #iree_gpu.iterator_type, #iree_gpu.iterator_type], - kind = #iree_gpu.mma_layout, + kind = #iree_gpu.mma_layout, rhs_permutation = array } : tensor, tensor into tensor return %0 : tensor @@ -183,7 +183,7 @@ func.func @tensor_subgroup_matmul_transpose_b_multi_mma(%lhs: tensor, #iree_gpu.iterator_type, #iree_gpu.iterator_type], -// CHECK-SAME: kind = #iree_gpu.mma_layout, +// CHECK-SAME: kind = #iree_gpu.mma_layout, // CHECK-SAME: rhs_permutation = array} // CHECK-SAME: : tensor, tensor into tensor @@ -201,7 +201,7 @@ func.func @tensor_subgroup_matmul_transpose_b_32x32x8_multi_mma( %0 = iree_gpu.multi_mma %lhs, %rhs, %acc { indexing_maps = #contraction_accesses, iterator_types = [#iree_gpu.iterator_type, #iree_gpu.iterator_type, #iree_gpu.iterator_type], - kind = #iree_gpu.mma_layout, + kind = #iree_gpu.mma_layout, rhs_permutation = array } : tensor, tensor into tensor return %0 : tensor @@ -215,7 +215,7 @@ func.func @tensor_subgroup_matmul_transpose_b_32x32x8_multi_mma( // CHECK: iree_gpu.multi_mma %arg0, %arg1, %arg2 // CHECK-SAME: indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]], // CHECK-SAME: iterator_types = [#iree_gpu.iterator_type, #iree_gpu.iterator_type, #iree_gpu.iterator_type], -// CHECK-SAME: kind = #iree_gpu.mma_layout, +// CHECK-SAME: kind = #iree_gpu.mma_layout, // CHECK-SAME: rhs_permutation = array} // CHECK-SAME: : tensor, tensor into 
tensor diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/test/target_attrs.mlir b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/test/target_attrs.mlir index baa47b2be12e..47b44b0bdcf4 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/test/target_attrs.mlir +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/test/target_attrs.mlir @@ -7,7 +7,7 @@ func.func @test_target_wgp() attributes { // CHECK-SAME: storage = b32|b16, // CHECK-SAME: subgroup = shuffle|arithmetic, // CHECK-SAME: dot = dp4xi8toi32, - // CHECK-SAME: mma = [, ], + // CHECK-SAME: mma = [, ], // CHECK-SAME: subgroup_size_choices = [32, 64], // CHECK-SAME: max_workgroup_sizes = [1024, 1024, 1024], // CHECK-SAME: max_thread_count_per_workgroup = 1024, @@ -15,7 +15,7 @@ func.func @test_target_wgp() attributes { wgp = #iree_gpu.target_wgp< compute = fp16|fp32|int8, storage = b16|b32, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, - mma = [, ], + mma = [, ], subgroup_size_choices = [32, 64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, @@ -63,7 +63,7 @@ func.func @test_target() attributes { wgp = < compute = fp16|fp32|int8, storage = b16|b32, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, - mma = [, ], + mma = [, ], subgroup_size_choices = [32, 64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp index 656eb39d050d..b10e605a2335 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp @@ -80,8 +80,8 @@ LogicalResult setMatmulLoweringConfig(IREE::GPU::TargetAttr target, for (IREE::GPU::MMAAttr mma : target.getWgp().getMma()) { IREE::GPU::MMAIntrinsic type = mma.getIntrinsic().getValue(); // TODO: Drop this once all intrinsics are supported. 
- if (type != IREE::GPU::MMAIntrinsic::MFMA_F16_16x16x16_F32 && - type != IREE::GPU::MMAIntrinsic::MFMA_I8_16x16x32_I32) { + if (type != IREE::GPU::MMAIntrinsic::MFMA_F32_16x16x16_F16 && + type != IREE::GPU::MMAIntrinsic::MFMA_I32_16x16x32_I8) { continue; } supportedMmas.push_back(mma); diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/KnownTargets.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/KnownTargets.cpp index 2f3d254b6587..6c50a8a96b6e 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/KnownTargets.cpp +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/KnownTargets.cpp @@ -123,11 +123,11 @@ TargetAttr createTargetAttr(const TargetDetails &details, StringRef arch, const WgpDetails *getCDNA3WgpDetails() { static const MMAIntrinsic cdna3MMAOps[] = { MMAIntrinsic::MFMA_F32_16x16x4_F32, - MMAIntrinsic::MFMA_F16_16x16x16_F32, - MMAIntrinsic::MFMA_F16_32x32x8_F32, - MMAIntrinsic::MFMA_F8E4M3FNUZ_16x16x32_F32, - MMAIntrinsic::MFMA_I8_16x16x32_I32, - MMAIntrinsic::MFMA_I8_32x32x16_I32, + MMAIntrinsic::MFMA_F32_16x16x16_F16, + MMAIntrinsic::MFMA_F32_32x32x8_F16, + MMAIntrinsic::MFMA_F32_16x16x32_F8E4M3FNUZ, + MMAIntrinsic::MFMA_I32_16x16x32_I8, + MMAIntrinsic::MFMA_I32_32x32x16_I8, }; static const WgpDetails cdna3Wgp = { allComputeBits, allStorageBits, allSubgroupOps, @@ -139,8 +139,8 @@ const WgpDetails *getCDNA3WgpDetails() { const WgpDetails *getCDNA2WgpDetails() { static const MMAIntrinsic cdna2MMAOps[] = { - MMAIntrinsic::MFMA_F16_16x16x16_F32, - MMAIntrinsic::MFMA_F16_32x32x8_F32, + MMAIntrinsic::MFMA_F32_16x16x16_F16, + MMAIntrinsic::MFMA_F32_32x32x8_F16, }; static const WgpDetails cdna2Wgp = { allComputeBits, allStorageBits, allSubgroupOps, @@ -152,8 +152,8 @@ const WgpDetails *getCDNA2WgpDetails() { const WgpDetails *getCDNA1WgpDetails() { static const MMAIntrinsic cdna1MMAOps[] = { - MMAIntrinsic::MFMA_F16_16x16x16_F32, - MMAIntrinsic::MFMA_F16_32x32x8_F32, + MMAIntrinsic::MFMA_F32_16x16x16_F16, + MMAIntrinsic::MFMA_F32_32x32x8_F16, }; static const WgpDetails cdna1Wgp = { allComputeBits, allStorageBits, allSubgroupOps, diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/test/convert_to_multi_mma.mlir b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/test/convert_to_multi_mma.mlir index de2d1c041f04..168c4ad943f2 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/test/convert_to_multi_mma.mlir +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/test/convert_to_multi_mma.mlir @@ -19,7 +19,7 @@ func.func @convert_to_mfma_16x16x16(%lhs: tensor<2x2x16x16xf16>, %rhs: tensor<2x module attributes { transform.with_named_sequence } { transform.named_sequence @__transform_main(%root: !transform.any_op {transform.readonly}) { %0 = transform.structured.match ops{["linalg.generic"]} in %root : (!transform.any_op) -> !transform.any_op - %1 = transform.iree.convert_to_multi_mma %0, kind(#iree_gpu.mma_layout) : (!transform.any_op) -> !transform.any_op + %1 = transform.iree.convert_to_multi_mma %0, kind(#iree_gpu.mma_layout) : (!transform.any_op) -> !transform.any_op transform.yield } } @@ -35,7 +35,7 @@ module attributes { transform.with_named_sequence } { // CHECK: iree_gpu.multi_mma %[[LHS]], %[[RHS]], %[[ACC]] // CHECK-SAME: indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]], // CHECK-SAME: iterator_types = [#iree_gpu.iterator_type, #iree_gpu.iterator_type, #iree_gpu.iterator_type], -// CHECK-SAME: kind = #iree_gpu.mma_layout 
+// CHECK-SAME: kind = #iree_gpu.mma_layout // CHECK-SAME: : tensor<2x2x16x16xf16>, tensor<2x2x16x16xf16> into tensor<2x2x16x16xf32> // ----- @@ -59,7 +59,7 @@ func.func @convert_to_single_mfma_16x16x16(%lhs: tensor<16x16xf16>, %rhs: tensor module attributes { transform.with_named_sequence } { transform.named_sequence @__transform_main(%root: !transform.any_op {transform.readonly}) { %0 = transform.structured.match ops{["linalg.generic"]} in %root : (!transform.any_op) -> !transform.any_op - %1 = transform.iree.convert_to_multi_mma %0, kind(#iree_gpu.mma_layout) : (!transform.any_op) -> !transform.any_op + %1 = transform.iree.convert_to_multi_mma %0, kind(#iree_gpu.mma_layout) : (!transform.any_op) -> !transform.any_op transform.yield } } @@ -73,7 +73,7 @@ module attributes { transform.with_named_sequence } { // CHECK: iree_gpu.multi_mma %[[LHS]], %[[RHS]], %[[ACC]] // CHECK-SAME: indexing_maps = [#[[$MAP]], #[[$MAP]], #[[$MAP]]], // CHECK-SAME: iterator_types = [], -// CHECK-SAME: kind = #iree_gpu.mma_layout +// CHECK-SAME: kind = #iree_gpu.mma_layout // CHECK-SAME: : tensor<16x16xf16>, tensor<16x16xf16> into tensor<16x16xf32> // ----- @@ -97,7 +97,7 @@ func.func @convert_to_mfma_16x16x16_transpose_b(%lhs: tensor<2x16x16xf16>, %rhs: module attributes { transform.with_named_sequence } { transform.named_sequence @__transform_main(%root: !transform.any_op {transform.readonly}) { %0 = transform.structured.match ops{["linalg.generic"]} in %root : (!transform.any_op) -> !transform.any_op - %1 = transform.iree.convert_to_multi_mma %0, kind(#iree_gpu.mma_layout) : (!transform.any_op) -> !transform.any_op + %1 = transform.iree.convert_to_multi_mma %0, kind(#iree_gpu.mma_layout) : (!transform.any_op) -> !transform.any_op transform.yield } } @@ -112,6 +112,6 @@ module attributes { transform.with_named_sequence } { // CHECK: iree_gpu.multi_mma %[[LHS]], %[[RHS]], %[[ACC]] // CHECK-SAME: indexing_maps = [#[[$MAP]], #[[$MAP]], #[[$MAP1]]], // CHECK-SAME: iterator_types = [#iree_gpu.iterator_type], -// CHECK-SAME: kind = #iree_gpu.mma_layout, +// CHECK-SAME: kind = #iree_gpu.mma_layout, // CHECK-SAME: rhs_permutation = array // CHECK-SAME: : tensor<2x16x16xf16>, tensor<2x16x16xf16> into tensor<16x16xf32> diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/test/distribute_multi_mma.mlir b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/test/distribute_multi_mma.mlir index ef8ca4b58e1d..8d00f6f764db 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/test/distribute_multi_mma.mlir +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/test/distribute_multi_mma.mlir @@ -9,7 +9,7 @@ func.func @distribute_multi_mma_F16_16x16x16_F32(%lhs: tensor<2x2x16x16xf16>, %r %0 = iree_gpu.multi_mma %lhs, %rhs, %acc { indexing_maps = #contraction_accesses, iterator_types = [#iree_gpu.iterator_type, #iree_gpu.iterator_type, #iree_gpu.iterator_type], - kind = #iree_gpu.mma_layout + kind = #iree_gpu.mma_layout } : tensor<2x2x16x16xf16>, tensor<2x2x16x16xf16> into tensor<2x2x16x16xf32> return %0 : tensor<2x2x16x16xf32> } @@ -61,7 +61,7 @@ func.func @distribute_multi_mma_I8_16x16x32_I32(%lhs: tensor<2x2x16x32xi8>, %rhs %0 = iree_gpu.multi_mma %lhs, %rhs, %acc { indexing_maps = #contraction_accesses, iterator_types = [#iree_gpu.iterator_type, #iree_gpu.iterator_type, #iree_gpu.iterator_type], - kind = #iree_gpu.mma_layout, + kind = #iree_gpu.mma_layout, rhs_permutation = array } : tensor<2x2x16x32xi8>, tensor<2x2x16x32xi8> into 
tensor<2x2x16x16xi32> return %0 : tensor<2x2x16x16xi32> diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/test/drop_multi_mma_unit_dims.mlir b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/test/drop_multi_mma_unit_dims.mlir index 9adbf3b0f25e..a050a47ad9f0 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/test/drop_multi_mma_unit_dims.mlir +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/test/drop_multi_mma_unit_dims.mlir @@ -9,7 +9,7 @@ func.func @drop_multi_mma_unit_dims(%lhs: vector<1x1x4xf16>, %rhs: vector<1x1x4x %0 = iree_gpu.multi_mma %lhs, %rhs, %acc { indexing_maps = #contraction_accesses, iterator_types = [#iree_gpu.iterator_type, #iree_gpu.iterator_type, #iree_gpu.iterator_type], - kind = #iree_gpu.mma_layout + kind = #iree_gpu.mma_layout } : vector<1x1x4xf16>, vector<1x1x4xf16> into vector<1x1x4xf32> return %0 : vector<1x1x4xf32> } @@ -35,7 +35,7 @@ module attributes { transform.with_named_sequence } { // CHECK: %[[ACC_EXT:.+]] = vector.extract %[[ACC]][0, 0] : vector<4xf32> from vector<1x1x4xf32> // CHECK: %[[MMA:.+]] = iree_gpu.multi_mma %[[LHS_EXT]], %[[RHS_EXT]], %[[ACC_EXT]] // CHECK-SAME: indexing_maps = [#[[$MAP]], #[[$MAP]], #[[$MAP]]], iterator_types = [] -// CHECK-SAME: kind = #iree_gpu.mma_layout} : vector<4xf16>, vector<4xf16> into vector<4xf32> +// CHECK-SAME: kind = #iree_gpu.mma_layout} : vector<4xf16>, vector<4xf16> into vector<4xf32> // CHECK: vector.broadcast %[[MMA]] : vector<4xf32> to vector<1x1x4xf32> // ----- @@ -49,7 +49,7 @@ func.func @drop_multi_mma_unit_dims_no_kn(%lhs: vector<1x4xf16>, %rhs: vector<4x %0 = iree_gpu.multi_mma %lhs, %rhs, %acc { indexing_maps = #contraction_accesses, iterator_types = [#iree_gpu.iterator_type], - kind = #iree_gpu.mma_layout + kind = #iree_gpu.mma_layout } : vector<1x4xf16>, vector<4xf16> into vector<1x4xf32> return %0 : vector<1x4xf32> } @@ -74,5 +74,5 @@ module attributes { transform.with_named_sequence } { // CHECK: %[[ACC_EXT:.+]] = vector.extract %[[ACC]][0] : vector<4xf32> from vector<1x4xf32> // CHECK: %[[MMA:.+]] = iree_gpu.multi_mma %[[LHS_EXT]], %[[RHS]], %[[ACC_EXT]] // CHECK-SAME: indexing_maps = [#[[$MAP]], #[[$MAP]], #[[$MAP]]], iterator_types = [] -// CHECK-SAME: kind = #iree_gpu.mma_layout} : vector<4xf16>, vector<4xf16> into vector<4xf32> +// CHECK-SAME: kind = #iree_gpu.mma_layout} : vector<4xf16>, vector<4xf16> into vector<4xf32> // CHECK: vector.broadcast %[[MMA]] : vector<4xf32> to vector<1x4xf32> diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/test/lower_multi_mma.mlir b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/test/lower_multi_mma.mlir index 60255d47bd70..c8700153b482 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/test/lower_multi_mma.mlir +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/test/lower_multi_mma.mlir @@ -9,7 +9,7 @@ func.func @lower_multi_mma_mfma_16x16x16(%lhs: vector<4xf16>, %rhs: vector<4xf16 %0 = iree_gpu.multi_mma %lhs, %rhs, %acc { indexing_maps = #contraction_accesses, iterator_types = [], - kind = #iree_gpu.mma_layout + kind = #iree_gpu.mma_layout } : vector<4xf16>, vector<4xf16> into vector<4xf32> return %0 : vector<4xf32> } @@ -43,7 +43,7 @@ func.func @lower_multi_mma_mfma_32x32x8(%lhs: vector<4xf16>, %rhs: vector<4xf16> %0 = iree_gpu.multi_mma %lhs, %rhs, %acc { indexing_maps = #contraction_accesses, iterator_types = [], - kind = #iree_gpu.mma_layout + kind = 
#iree_gpu.mma_layout } : vector<4xf16>, vector<4xf16> into vector<16xf32> return %0 : vector<16xf32> } @@ -110,7 +110,7 @@ func.func @lower_multi_mma_mfma_shape_cast_16x16x16(%lhs: vector<1x4xf16>, %rhs: %0 = iree_gpu.multi_mma %lhs, %rhs, %acc { indexing_maps = #contraction_accesses, iterator_types = [], - kind = #iree_gpu.mma_layout + kind = #iree_gpu.mma_layout } : vector<1x4xf16>, vector<4x1xf16> into vector<4x1xf32> return %0 : vector<4x1xf32> } diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/test/unroll_multi_mma.mlir b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/test/unroll_multi_mma.mlir index 0a962dfbd9b0..5b6bf42eac71 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/test/unroll_multi_mma.mlir +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/test/unroll_multi_mma.mlir @@ -9,7 +9,7 @@ func.func @unroll_multi_mma_order(%lhs: vector<2x2x4xf16>, %rhs: vector<2x2x4xf1 %0 = iree_gpu.multi_mma %lhs, %rhs, %acc { indexing_maps = #contraction_accesses, iterator_types = [#iree_gpu.iterator_type, #iree_gpu.iterator_type, #iree_gpu.iterator_type], - kind = #iree_gpu.mma_layout + kind = #iree_gpu.mma_layout } : vector<2x2x4xf16>, vector<2x2x4xf16> into vector<2x2x4xf32> return %0 : vector<2x2x4xf32> } @@ -75,7 +75,7 @@ func.func @unroll_multi_mma_count(%lhs: vector<2x3x4xf16>, %rhs: vector<3x5x4xf1 %0 = iree_gpu.multi_mma %lhs, %rhs, %acc { indexing_maps = #contraction_accesses, iterator_types = [#iree_gpu.iterator_type, #iree_gpu.iterator_type, #iree_gpu.iterator_type], - kind = #iree_gpu.mma_layout + kind = #iree_gpu.mma_layout } : vector<2x3x4xf16>, vector<3x5x4xf16> into vector<2x5x4xf32> return %0 : vector<2x5x4xf32> } diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/test/vectorize_iree_gpu_ops.mlir b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/test/vectorize_iree_gpu_ops.mlir index e0deb5434fbc..baa44c99ce2b 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/test/vectorize_iree_gpu_ops.mlir +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/test/vectorize_iree_gpu_ops.mlir @@ -9,7 +9,7 @@ func.func @tensor_multi_mma(%lhs: tensor<2x3x4xf16>, %rhs: tensor<3x5x4xf16>, %a %0 = iree_gpu.multi_mma %lhs, %rhs, %acc { indexing_maps = #contraction_accesses, iterator_types = [#iree_gpu.iterator_type, #iree_gpu.iterator_type, #iree_gpu.iterator_type], - kind = #iree_gpu.mma_layout + kind = #iree_gpu.mma_layout } : tensor<2x3x4xf16>, tensor<3x5x4xf16> into tensor<2x5x4xf32> return %0 : tensor<2x5x4xf32> } @@ -46,7 +46,7 @@ func.func @tensor_single_multi_mma(%lhs: tensor<4xf16>, %rhs: tensor<4xf16>, %ac %0 = iree_gpu.multi_mma %lhs, %rhs, %acc { indexing_maps = #contraction_accesses, iterator_types = [], - kind = #iree_gpu.mma_layout + kind = #iree_gpu.mma_layout } : tensor<4xf16>, tensor<4xf16> into tensor<4xf32> return %0 : tensor<4xf32> } diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/test/distribute_mma_to_lanes.mlir b/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/test/distribute_mma_to_lanes.mlir index dc94e64fc26b..b98ff4223ae6 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/test/distribute_mma_to_lanes.mlir +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/test/distribute_mma_to_lanes.mlir @@ -12,7 +12,7 @@ module { iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} 
ins(%lhs_transpose, %arg1 : tensor<2x8x16x16xf16>, tensor<8x2x16x16xf16>) outs(%arg2 : tensor<2x2x16x16xf32>) - attrs = {lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout}>} { + attrs = {lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout}>} { ^bb0(%in: f16, %in_2: f16, %out: f32): %4 = arith.extf %in : f16 to f32 %5 = arith.extf %in_2 : f16 to f32 @@ -33,6 +33,6 @@ module { // CHECK: %[[LHS_T:.+]] = linalg.transpose ins({{.*}}: tensor<2x8x1x4xf16>) // CHECK: iree_gpu.multi_mma %[[LHS_T]] // CHECK-SAME: indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]] -// CHECK-SAME: kind = #iree_gpu.mma_layout +// CHECK-SAME: kind = #iree_gpu.mma_layout // CHECK-SAME: : tensor<2x8x1x4xf16>, tensor<8x2x1x4xf16> into tensor<2x2x4x1xf32> // CHECK: mapping = [#iree_gpu.lane_id<0>] diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/test/pack_to_intrinsics.mlir b/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/test/pack_to_intrinsics.mlir index 1aaae4eac18c..7da25abd2c3a 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/test/pack_to_intrinsics.mlir +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/test/pack_to_intrinsics.mlir @@ -1,6 +1,6 @@ // RUN: iree-opt %s --pass-pipeline='builtin.module(func.func(iree-gpu-pack-to-intrinsics, canonicalize, cse))' --split-input-file | FileCheck %s -#config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout}> +#config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout}> module { func.func @matmul_32x32x8(%a: tensor<64x64xf16>, %b: tensor<64x64xf16>, %c: tensor<64x64xf32>) -> tensor<64x64xf32> { %mm = linalg.matmul {lowering_config = #config} ins(%a, %b : tensor<64x64xf16>, tensor<64x64xf16>) outs(%c : tensor<64x64xf32>) -> tensor<64x64xf32> @@ -18,7 +18,7 @@ module { // CHECK: %[[PACKED_MM:.+]] = linalg.generic // CHECK-SAME: ins(%[[A_PACK]], %[[B_PACK]] : tensor<2x8x32x8xf16>, tensor<8x2x32x8xf16>) // CHECK-SAME: outs(%[[C_PACK]] : tensor<2x2x32x32xf32>) -// CHECK-SAME: lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout}> +// CHECK-SAME: lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout}> // ----- @@ -32,7 +32,7 @@ module { iterator_types = ["parallel", "parallel", "parallel", "reduction", "reduction"] } ins(%a, %b : tensor, tensor) outs(%c : tensor) attrs = { - lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout}> + lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout}> } { ^bb0(%in: f16, %in_2: f16, %out: f32): %4 = arith.extf %in : f16 to f32 @@ -54,4 +54,4 @@ module { // CHECK-SAME: indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]] // CHECK-SAME: ins({{.*}} : tensor, tensor) // CHECK-SAME: outs({{.*}} : tensor) -// CHECK-SAME: lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout}> +// CHECK-SAME: lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout}> diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir index 2ac9f34f5209..2e36ccfc9b2a 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir @@ -29,7 +29,7 @@ func.func @expanded_matmul_transpose_b(%lhs: tensor<2x64x2048xf16>, %rhs: tensor // CHECK: linalg.fill ins // CHECK: linalg.generic 
{{.*}}lowering_config = #iree_gpu.lowering_config -// CHECK-SAME: mma_kind = #iree_gpu.mma_layout +// CHECK-SAME: mma_kind = #iree_gpu.mma_layout // CHECK-SAME: reduction = [0 : index, 0 : index, 0 : index, 0 : index, 8 : index] // CHECK-SAME: subgroup = [0 : index, 0 : index, 4 : index, 1 : index, 0 : index] // CHECK-SAME: workgroup = [1 : index, 1 : index, 64 : index, 64 : index, 0 : index] @@ -52,7 +52,7 @@ func.func @mfma_matmul_1024x1024x1024(%lhs: tensor<1024x1024xf16>, %rhs: tensor< // CHECK: linalg.fill ins // CHECK: linalg.matmul {{.*}}lowering_config = #iree_gpu.lowering_config -// CHECK-SAME: mma_kind = #iree_gpu.mma_layout +// CHECK-SAME: mma_kind = #iree_gpu.mma_layout // CHECK-SAME: reduction = [0 : index, 0 : index, 4 : index] // CHECK-SAME: subgroup = [2 : index, 4 : index, 0 : index] // CHECK-SAME: workgroup = [64 : index, 128 : index, 0 : index] diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_user_vector_distribute.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_user_vector_distribute.mlir index 6c736077d060..b7ca495bb962 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_user_vector_distribute.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_user_vector_distribute.mlir @@ -10,11 +10,11 @@ // Check that applying the `no_reduce_shared_memory_bank_conflicts` unit attribute disables shared memory padding. // OPT-OUT: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info, +// OPT-OUT-SAME: mma_schedule = #iree_gpu.mma_schedule, // OPT-OUT-SAME: no_reduce_shared_memory_bank_conflicts // OPT-IN: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info, +// OPT-IN-SAME: mma_schedule = #iree_gpu.mma_schedule, // OPT-IN-SAME: no_reduce_shared_memory_bank_conflicts #pipeline_layout = #hal.pipeline.layout, subgroup_m_count = 2, subgroup_n_count = 2>, + mma_schedule = #iree_gpu.mma_schedule, subgroup_m_count = 2, subgroup_n_count = 2>, no_reduce_shared_memory_bank_conflicts // Disable the 'reduceSharedMemoryBankConflicts' pass. }>} { %cst = arith.constant 0.000000e+00 : f16 @@ -86,11 +86,11 @@ hal.executable public @main_0_dispatch_0 { // Check that applying the `reorder_workgroups = transpose` unit attribute enables workgroup reordering. // OPT-OUT: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info, +// OPT-OUT-SAME: mma_schedule = #iree_gpu.mma_schedule, // OPT-OUT-SAME: reorder_workgroups = "transpose" // OPT-IN: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info, +// OPT-IN-SAME: mma_schedule = #iree_gpu.mma_schedule, // OPT-IN-SAME: reorder_workgroups = "transpose" #pipeline_layout = #hal.pipeline.layout, subgroup_m_count = 2, subgroup_n_count = 2>, + mma_schedule = #iree_gpu.mma_schedule, subgroup_m_count = 2, subgroup_n_count = 2>, reorder_workgroups = "transpose" // enable the 'reorderWorkgroups' pass. }>} { %cst = arith.constant 0.000000e+00 : f16 @@ -162,7 +162,7 @@ hal.executable public @main_0_dispatch_0 { // Check that applying the `reorder_workgroups = none` unit attribute disables workgroup reordering. // OPT-OUT: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info, +// OPT-OUT-SAME: mma_schedule = #iree_gpu.mma_schedule, // OPT-OUT-SAME: reorder_workgroups = "none" #pipeline_layout = #hal.pipeline.layout, subgroup_m_count = 2, subgroup_n_count = 2>, + mma_schedule = #iree_gpu.mma_schedule, subgroup_m_count = 2, subgroup_n_count = 2>, reorder_workgroups = "none" // Disable the 'reorderWorkgroups' pass. 
}>} { %cst = arith.constant 0.000000e+00 : f16 diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_vector_distribute.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_vector_distribute.mlir index 08ef0e89d2cc..d6bea0cbabe4 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_vector_distribute.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_vector_distribute.mlir @@ -10,7 +10,7 @@ // CHECK: #[[$TILE_SIZES:.+]] = #iree_codegen.lowering_config +// CHECK-SAME: intrinsic = #iree_gpu.mma_layout // CHECK-SAME: subgroup_m_count = 1, subgroup_n_count = 4 #pipeline_layout = #hal.pipeline.layout +// CHECK-SAME: intrinsic = #iree_gpu.mma_layout // CHECK-SAME: subgroup_m_count = 2, subgroup_n_count = 2 #pipeline_layout = #hal.pipeline.layout +// CHECK-SAME: intrinsic = #iree_gpu.mma_layout // CHECK-SAME: subgroup_m_count = 2, subgroup_n_count = 2 #pipeline_layout = #hal.pipeline.layout +// CHECK-SAME: intrinsic = #iree_gpu.mma_layout // CHECK-SAME: subgroup_m_count = 2, subgroup_n_count = 2 #pipeline_layout = #hal.pipeline.layout +// CHECK-SAME: intrinsic = #iree_gpu.mma_layout // CHECK-SAME: subgroup_m_count = 1, subgroup_n_count = 1 #pipeline_layout = #hal.pipeline.layout +// CHECK-SAME: intrinsic = #iree_gpu.mma_layout // CHECK-SAME: subgroup_m_count = 1, subgroup_n_count = 4 #pipeline_layout = #hal.pipeline.layout ]> ]> -#config = #iree_gpu.lowering_config<{workgroup = [64, 64, 0], reduction = [0, 0, 2], subgroup = [2, 2], mma_kind = #iree_gpu.mma_layout}> +#config = #iree_gpu.lowering_config<{workgroup = [64, 64, 0], reduction = [0, 0, 2], subgroup = [2, 2], mma_kind = #iree_gpu.mma_layout}> hal.executable public @main { hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) { hal.executable.export public @matmul_transpose_b_mfma ordinal(0) layout(#pipeline_layout) { @@ -136,7 +136,7 @@ hal.executable public @main { #hal.descriptor_set.binding<2, storage_buffer> ]> ]> -#config = #iree_gpu.lowering_config<{workgroup = [1, 64, 64, 0], reduction = [0, 0, 0, 2], subgroup = [1, 2, 2], mma_kind = #iree_gpu.mma_layout}> +#config = #iree_gpu.lowering_config<{workgroup = [1, 64, 64, 0], reduction = [0, 0, 0, 2], subgroup = [1, 2, 2], mma_kind = #iree_gpu.mma_layout}> hal.executable private @main { hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) { hal.executable.export public @conv_igemm_im2col ordinal(0) layout(#pipeline_layout) { diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_vector_distribute.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_vector_distribute.mlir index 6cec811ef8ef..d79d76d67877 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_vector_distribute.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_vector_distribute.mlir @@ -48,7 +48,7 @@ hal.executable.variant @rocm target(<"rocm", "rocm-hsaco-fb">) { // Basic pipeline test to make sure it generates the instructions we expect. 
// CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info, +// CHECK-SAME: mma_schedule = #iree_gpu.mma_schedule, // CHECK-SAME: subgroup_m_count = 2, subgroup_n_count = 2> // CHECK-SAME: prefetch_shared_memory @@ -97,7 +97,7 @@ hal.executable.variant @rocm target(<"rocm", "rocm-hsaco-fb">) { } // CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info, +// CHECK-SAME: mma_schedule = #iree_gpu.mma_schedule, // CHECK-SAME: subgroup_m_count = 2, subgroup_n_count = 2> // CHECK-SAME: prefetch_shared_memory @@ -220,7 +220,7 @@ hal.executable.variant @rocm target(<"rocm", "rocm-hsaco-fb">) { // Make sure it generates the mfma instructions we expect for f8 inputs. // CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info, +// CHECK-SAME: mma_schedule = #iree_gpu.mma_schedule, // CHECK-SAME: subgroup_m_count = 2, subgroup_n_count = 2> // CHECK-SAME: prefetch_shared_memory @@ -271,7 +271,7 @@ hal.executable.variant @rocm target(<"rocm", "rocm-hsaco-fb">) { // Make sure it generates the mfma instructions we expect for integer inputs. // CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info, +// CHECK-SAME: mma_schedule = #iree_gpu.mma_schedule, // CHECK-SAME: subgroup_m_count = 2, subgroup_n_count = 2> // CHECK-SAME: prefetch_shared_memory @@ -322,7 +322,7 @@ hal.executable.variant @rocm target(<"rocm", "rocm-hsaco-fb">) { // Make sure it generates the mfma instructions we expect for integer inputs. // CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info, +// CHECK-SAME: mma_schedule = #iree_gpu.mma_schedule, // CHECK-SAME: subgroup_m_count = 2, subgroup_n_count = 2> // CHECK-SAME: prefetch_shared_memory @@ -429,7 +429,7 @@ hal.executable public @main_dispatch_expanded_matmul { // CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info, +// CHECK-SAME: mma_schedule = #iree_gpu.mma_schedule, // CHECK-SAME: subgroup_m_count = 2, subgroup_n_count = 2> // CHECK-SAME: prefetch_shared_memory @@ -533,7 +533,7 @@ hal.executable.variant @rocm target(<"rocm", "rocm-hsaco-fb">) { // Basic pipeline test to make sure it generates the instructions we expect. // CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info, +// CHECK-SAME: mma_schedule = #iree_gpu.mma_schedule, // CHECK-SAME: subgroup_m_count = 1, subgroup_n_count = 1> // CHECK-SAME: prefetch_shared_memory @@ -621,7 +621,7 @@ hal.executable public @contract_schedule_considering_read_layout { // Basic pipeline test to make sure it generates the instructions we expect. 
// CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info, +// CHECK-SAME: mma_schedule = #iree_gpu.mma_schedule, // CHECK-SAME: subgroup_m_count = 1, subgroup_n_count = 4> // CHECK-SAME: prefetch_shared_memory @@ -680,7 +680,7 @@ hal.executable private @attention_20x4096x64x4096x64 { // Basic test to make sure we can handle attention // CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info, +// CHECK-SAME: mma_schedule = #iree_gpu.mma_schedule, // CHECK-SAME: subgroup_m_count = 1, subgroup_n_count = 1> // Prefetching is disabled for attention for now // CHECK-NOT: prefetch_shared_memory diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/amdgpu_contraction_distribution.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/amdgpu_contraction_distribution.mlir index fd1241393bb5..76e9f9bcb800 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/amdgpu_contraction_distribution.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/amdgpu_contraction_distribution.mlir @@ -4,7 +4,7 @@ // layoutC means and how these layouts are assigned based on the instruction // type. -#layout = #iree_gpu.mma_layout +#layout = #iree_gpu.mma_layout #map1 = affine_map<(d0, d1, d2) -> (d1, d2)> #map2 = affine_map<(d0, d1, d2) -> (d0, d2)> @@ -58,7 +58,7 @@ builtin.module attributes { transform.with_named_sequence } { // ----- -#layout = #iree_gpu.mma_layout +#layout = #iree_gpu.mma_layout #map1 = affine_map<(d0, d1, d2) -> (d1, d2)> #map2 = affine_map<(d0, d1, d2) -> (d0, d2)> @@ -107,7 +107,7 @@ builtin.module attributes { transform.with_named_sequence } { // ----- -#layout = #iree_gpu.mma_layout +#layout = #iree_gpu.mma_layout #map1 = affine_map<(d0, d1, d2) -> (d0, d2)> #map2 = affine_map<(d0, d1, d2) -> (d2, d1)> @@ -163,7 +163,7 @@ builtin.module attributes { transform.with_named_sequence } { // ----- -#layout = #iree_gpu.mma_layout +#layout = #iree_gpu.mma_layout #map1 = affine_map<(d0, d1, d2) -> (d2, d0)> #map2 = affine_map<(d0, d1, d2) -> (d2, d1)> diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/amdgpu_set_anchor_layouts.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/amdgpu_set_anchor_layouts.mlir index 2269dc68d15a..54fd9b1fd433 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/amdgpu_set_anchor_layouts.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/amdgpu_set_anchor_layouts.mlir @@ -3,7 +3,7 @@ // This tests that the compiler is setting the correct layout anchors for various vectorOps and shapes. // Currently only testing on contraction layoutV1, but can be expanded to others. 
-#layout = #iree_gpu.mma_layout
+#layout = #iree_gpu.mma_layout
 #map1 = affine_map<(d0, d1, d2) -> (d1, d2)>
 #map2 = affine_map<(d0, d1, d2) -> (d0, d2)>
 #map3 = affine_map<(d0, d1, d2) -> (d1, d0)>
@@ -34,7 +34,7 @@ builtin.module attributes { transform.with_named_sequence } {
 // -----
-#layout = #iree_gpu.mma_layout
+#layout = #iree_gpu.mma_layout
 #map1 = affine_map<(d0, d1, d2) -> (d1, d2)>
 #map2 = affine_map<(d0, d1, d2) -> (d0, d2)>
 #map3 = affine_map<(d0, d1, d2) -> (d1, d0)>
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/attention_mfma_transform_spec.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/attention_mfma_transform_spec.mlir
index c9514cdc0daf..9261f2e552fc 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/attention_mfma_transform_spec.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/attention_mfma_transform_spec.mlir
@@ -1,4 +1,4 @@
-#layout = #iree_gpu.mma_layout
+#layout = #iree_gpu.mma_layout
 module attributes { transform.with_named_sequence } {
   transform.named_sequence @__transform_main(%variant_op: !transform.any_op) {
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/cast_type_to_fit_mma.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/cast_type_to_fit_mma.mlir
index 21eb880a4721..d69b8e503492 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/cast_type_to_fit_mma.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/cast_type_to_fit_mma.mlir
@@ -2,7 +2,7 @@
 func.func @mfma_matmul_96x64x16_mm(%lhs: vector<96x16xf16>, %rhs: vector<16x64xf16>, %init: vector<96x64xf16>) -> vector<96x64xf16> attributes {
   mma_schedule = #iree_gpu.mma_schedule<
-      intrinsic = #iree_gpu.mma_layout,
+      intrinsic = #iree_gpu.mma_layout,
       subgroup_m_count = 1, subgroup_n_count = 1>,
   workgroup_size = [64, 1, 1]} {
   %0 = vector.contract {
@@ -26,7 +26,7 @@ func.func @mfma_matmul_96x64x16_mm(%lhs: vector<96x16xf16>, %rhs: vector<16x64xf
 func.func @mfma_matmul_96x64x16_mmt(%lhs: vector<96x16xf16>, %rhs: vector<64x16xf16>, %init: vector<96x64xf16>) -> vector<96x64xf16> attributes {
   mma_schedule = #iree_gpu.mma_schedule<
-      intrinsic = #iree_gpu.mma_layout,
+      intrinsic = #iree_gpu.mma_layout,
       subgroup_m_count = 1, subgroup_n_count = 1>,
   workgroup_size = [64, 1, 1]} {
   %0 = vector.contract {
@@ -47,7 +47,7 @@ func.func @mfma_matmul_96x64x16_mmt(%lhs: vector<96x16xf16>, %rhs: vector<64x16x
 func.func @mfma_matmul_96x64x16_mm_cannot_downcast(%lhs: vector<96x16xf16>, %rhs: vector<16x64xf16>, %init: vector<96x64xf64>) -> vector<96x64xf64> attributes {
   mma_schedule = #iree_gpu.mma_schedule<
-      intrinsic = #iree_gpu.mma_layout,
+      intrinsic = #iree_gpu.mma_layout,
       subgroup_m_count = 1, subgroup_n_count = 1>,
   workgroup_size = [64, 1, 1]} {
   %0 = vector.contract {
@@ -100,7 +100,7 @@ func.func @transform_dialect_mfma_matmul_96x64x16(%lhs: vector<96x16xf16>, %rhs:
     indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>],
     iterator_types = ["parallel", "parallel", "reduction"],
     kind = #vector.kind} %lhs, %rhs, %init
-    {iree.amdgpu.mma = #iree_gpu.mma_layout}
+    {iree.amdgpu.mma = #iree_gpu.mma_layout}
     : vector<96x16xf16>, vector<16x64xf16> into vector<96x64xf16>
   return %0 : vector<96x64xf16>
 }
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/configure_vector_layout.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/configure_vector_layout.mlir
index e67c011cdba3..98855831f160 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/configure_vector_layout.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/configure_vector_layout.mlir
@@ -3,7 +3,7 @@
 #translation = #iree_codegen.translation_info, subgroup_m_count = 1, subgroup_n_count = 1>}>
+ {mma_schedule = #iree_gpu.mma_schedule, subgroup_m_count = 1, subgroup_n_count = 1>}>
 // Since CHECK-SAME doesnt work with CHECK-DAG, we cannot have prettier tests.
@@ -30,7 +30,7 @@ func.func @mfma_matmul_96x64x16_mm(%lhs: vector<96x16xf16>, %rhs: vector<16x64xf
 #translation = #iree_codegen.translation_info, subgroup_m_count = 1, subgroup_n_count = 1>}>
+ {mma_schedule = #iree_gpu.mma_schedule< intrinsic = #iree_gpu.mma_layout, subgroup_m_count = 1, subgroup_n_count = 1>}>
 // CHECK-DAG: #[[$NESTED:.+]] = #iree_vector_ext.nested_layout
 // CHECK-DAG: #[[$NESTED1:.+]] = #iree_vector_ext.nested_layout
@@ -55,7 +55,7 @@ func.func @mfma_matmul_96x64x16_mmt(%lhs: vector<96x16xf16>, %rhs: vector<64x16x
 #translation = #iree_codegen.translation_info, subgroup_m_count = 1, subgroup_n_count = 1>}>
+ {mma_schedule = #iree_gpu.mma_schedule< intrinsic = #iree_gpu.mma_layout, subgroup_m_count = 1, subgroup_n_count = 1>}>
 // CHECK-DAG: #[[$NESTED:.+]] = #iree_vector_ext.nested_layout
 // CHECK-DAG: #[[$NESTED1:.+]] = #iree_vector_ext.nested_layout
@@ -80,7 +80,7 @@ func.func @mfma_matmul_96x64x16_mmtt(%lhs: vector<96x16xf16>, %rhs: vector<64x16
 #translation = #iree_codegen.translation_info, subgroup_m_count = 2, subgroup_n_count = 1>}>
+ {mma_schedule = #iree_gpu.mma_schedule< intrinsic = #iree_gpu.mma_layout, subgroup_m_count = 2, subgroup_n_count = 1>}>
 // CHECK-DAG: #[[$NESTED:.+]] = #iree_vector_ext.nested_layout, %rhs: ve
 #translation = #iree_codegen.translation_info, subgroup_m_count = 1, subgroup_n_count = 1>}>
+ {mma_schedule = #iree_gpu.mma_schedule, subgroup_m_count = 1, subgroup_n_count = 1>}>
 // CHECK-DAG: #[[$NESTED:.+]] = #iree_vector_ext.nested_layout
 // CHECK-DAG: #[[$NESTED1:.+]] = #iree_vector_ext.nested_layout
@@ -157,7 +157,7 @@ func.func @matmul_16x16x256_read(%lhs: memref<16x256xf16, strided<[256, 1], offs
 #translation = #iree_codegen.translation_info, subgroup_m_count = 1, subgroup_n_count = 1>}>
+ {mma_schedule = #iree_gpu.mma_schedule, subgroup_m_count = 1, subgroup_n_count = 1>}>
 // CHECK-DAG: #[[$NESTED:.+]] = #iree_vector_ext.nested_layout
 // CHECK-DAG: #[[$NESTED1:.+]] = #iree_vector_ext.nested_layout
@@ -212,7 +212,7 @@ func.func @matmul_16x16x256_read_permute(%lhs: memref<16x256xf16, strided<[256,
 #translation = #iree_codegen.translation_info, subgroup_m_count = 1, subgroup_n_count = 1>}>
+ {mma_schedule = #iree_gpu.mma_schedule, subgroup_m_count = 1, subgroup_n_count = 1>}>
 // We don't really care what layout we assign here, just that the only anchor
 // we set is on the contraction.
@@ -308,7 +308,7 @@ func.func @wmma_matmul_48x32x32_mmt(%lhs: vector<48x32xf16>, %rhs: vector<32x32x
 #translation = #iree_codegen.translation_info, subgroup_m_count = 2, subgroup_n_count = 1>}>
+ {mma_schedule = #iree_gpu.mma_schedule< intrinsic = #iree_gpu.mma_layout, subgroup_m_count = 2, subgroup_n_count = 1>}>
 // CHECK-DAG: #[[$NESTED:.+]] = #iree_vector_ext.nested_layout
 // CHECK-DAG: #[[$NESTED1:.+]] = #iree_vector_ext.nested_layout
@@ -333,7 +333,7 @@ func.func @matmul_192x64x16_mmt_multi_m(%lhs: vector<2x64x16xf16>, %rhs: vector<
 #translation = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1>}>
+ {mma_schedule = #iree_gpu.mma_schedule< intrinsic = #iree_gpu.mma_layout, subgroup_m_count = 4, subgroup_n_count = 1>}>
 // CHECK-DAG: #[[$NESTED:.+]] = #iree_vector_ext.nested_layout
 // CHECK-DAG: #[[$NESTED1:.+]] = #iree_vector_ext.nested_layout
@@ -358,7 +358,7 @@ func.func @matmul_192x64x16_mmt_multi_split_m(%lhs: vector<2x64x16xf16>, %rhs: v
 #translation = #iree_codegen.translation_info, subgroup_m_count = 2, subgroup_n_count = 2>, workgroup_size = [128, 2, 1]}>
+ {mma_schedule = #iree_gpu.mma_schedule< intrinsic = #iree_gpu.mma_layout, subgroup_m_count = 2, subgroup_n_count = 2>, workgroup_size = [128, 2, 1]}>
 // CHECK-DAG: #[[$NESTED:.+]] = #iree_vector_ext.nested_layout, subgroup_m_count = 2, subgroup_n_count = 2>}>
+ {mma_schedule = #iree_gpu.mma_schedule, subgroup_m_count = 2, subgroup_n_count = 2>}>
 // CHECK-LABEL: func.func @batch_matmul_unit_batch
 func.func @batch_matmul_unit_batch(%arg0: vector<1x64x64xf16>, %arg1: vector<1x64x128xf16>, %arg2: vector<1x64x128xf32>) -> vector<1x64x128xf32> attributes {translation_info = #translation} {
 // CHECK-DAG: %[[LHS:.+]] = iree_vector_ext.to_layout %{{.*}} to #[[$NESTED]]