diff --git a/compiler/plugins/target/ROCM/test/target_device_features.mlir b/compiler/plugins/target/ROCM/test/target_device_features.mlir index 44c72c6e0962..c801f9bb513e 100644 --- a/compiler/plugins/target/ROCM/test/target_device_features.mlir +++ b/compiler/plugins/target/ROCM/test/target_device_features.mlir @@ -6,13 +6,13 @@ // GFX942: target = #iree_gpu.target, , , , ], +// GFX942-SAME: mma = [, , , ], // GFX942-SAME: subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], // GFX942-SAME: max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536>, // GFX942-SAME: chip = > // GFX940: target = #iree_gpu.target, , , , ], +// GFX940-SAME: mma = [, , , ], // GFX1100: target = #iree_gpu.target, ] diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp index 5ab6fdf9aa6b..cbca1009b147 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp @@ -210,9 +210,6 @@ static OpaqueMmaLayout getOpaqueMFMALayout(MLIRContext *context, Type i32 = IntegerType::get(context, 32); switch (type) { - case MMAIntrinsic::MFMA_F32_16x16x4_F32: { - return OpaqueMmaLayout{16, 16, 4, f32, f32, f32}; - } case MMAIntrinsic::MFMA_F16_16x16x16_F32: { return OpaqueMmaLayout{16, 16, 16, f16, f16, f32}; } @@ -254,24 +251,6 @@ static ConcreteMmaLayout getConcreteMFMALayout(MLIRContext *context, LayoutDimensionAttr::get(context, LayoutDimension::VECTORZ); (void)laneZ, (void)vectorZ; switch (type) { - case MMAIntrinsic::MFMA_F32_16x16x4_F32: { - // #outer = #iree_vector_ext.per_dim_layout<[LANEX], [16]> - // #inner = #iree_vector_ext.per_dim_layout<[LANEY, VECTORX], [4, 1]> - // #layout_a = #iree_vector_ext.layout<#outer, #inner> - // #layout_b = #iree_vector_ext.layout<#inner, #outer> - // #layout_c = #iree_vector_ext.layout<#inner, #outer> - - auto outer = PerDimLayoutAttr::get(context, {laneX}, {16}); - auto inner = PerDimLayoutAttr::get(context, {laneY, vectorX}, {4, 1}); - auto aMLayout = outer; - auto aKLayout = inner; - auto bKLayout = inner; - auto bNLayout = outer; - auto cMLayout = PerDimLayoutAttr::get(context, {laneY, vectorX}, {4, 4}); - auto cNLayout = outer; - return ConcreteMmaLayout{opaqueLayout, aMLayout, aKLayout, bKLayout, - bNLayout, cMLayout, cNLayout}; - } case MMAIntrinsic::MFMA_F16_16x16x16_F32: { // #outer = #iree_vector_ext.per_dim_layout<[LANEX], [16]> // #inner = #iree_vector_ext.per_dim_layout<[LANEY, VECTORX], [4, 4]> @@ -425,12 +404,6 @@ MMAAttr::getABCVectorTypes() const { // amd_matrix_instruction_calculator tells us about the number of 32-bit // registers. So need to adjust accordingly. All vectors should be 1-D. switch (getIntrinsic().getValue()) { - case MMAIntrinsic::MFMA_F32_16x16x4_F32: { - auto aType = VectorType::get({1}, getAType()); - auto bType = VectorType::get({1}, getBType()); - auto cType = VectorType::get({4}, getCType()); - return std::make_tuple(aType, bType, cType); - } case MMAIntrinsic::MFMA_F16_16x16x16_F32: { auto aType = VectorType::get({4}, getAType()); auto bType = VectorType::get({4}, getBType()); @@ -477,7 +450,6 @@ MMAAttr::getContractionLayout(vector::ContractionOp contract) const { int64_t MMAAttr::getBlockSize() const { switch (getIntrinsic().getValue()) { - case MMAIntrinsic::MFMA_F32_16x16x4_F32: case MMAIntrinsic::MFMA_F16_16x16x16_F32: case MMAIntrinsic::MFMA_F16_32x32x8_F32: case MMAIntrinsic::MFMA_I8_16x16x32_I32: @@ -493,7 +465,6 @@ int64_t MMAAttr::getBlockSize() const { int64_t MMAAttr::getSubgroupSize() const { switch (getIntrinsic().getValue()) { - case MMAIntrinsic::MFMA_F32_16x16x4_F32: case MMAIntrinsic::MFMA_F16_16x16x16_F32: case MMAIntrinsic::MFMA_F16_32x32x8_F32: case MMAIntrinsic::MFMA_I8_16x16x32_I32: @@ -511,10 +482,6 @@ int64_t MMAAttr::getSubgroupSize() const { MMAAttr::SingleSubgroupLayout MMAAttr::getASingleSubgroupLayout() const { switch (getIntrinsic().getValue()) { - case MMAIntrinsic::MFMA_F32_16x16x4_F32: { - return {/*outer=*/{1, 1}, /*thread=*/{16, 4}, /*strides=*/{1, 16}, - /*element=*/{1, 1}}; - } case MMAIntrinsic::MFMA_F16_16x16x16_F32: { return {/*outer=*/{1, 1}, /*thread=*/{16, 4}, /*strides=*/{1, 16}, /*element=*/{1, 4}}; @@ -542,10 +509,6 @@ MMAAttr::SingleSubgroupLayout MMAAttr::getASingleSubgroupLayout() const { MMAAttr::SingleSubgroupLayout MMAAttr::getBSingleSubgroupLayout() const { switch (getIntrinsic().getValue()) { - case MMAIntrinsic::MFMA_F32_16x16x4_F32: { - return {/*outer=*/{1, 1}, /*thread=*/{4, 16}, /*strides=*/{16, 1}, - /*element=*/{1, 1}}; - } case MMAIntrinsic::MFMA_F16_16x16x16_F32: { return {/*outer=*/{1, 1}, /*thread=*/{4, 16}, /*strides=*/{16, 1}, /*element=*/{4, 1}}; @@ -573,7 +536,6 @@ MMAAttr::SingleSubgroupLayout MMAAttr::getBSingleSubgroupLayout() const { MMAAttr::SingleSubgroupLayout MMAAttr::getCSingleSubgroupLayout() const { switch (getIntrinsic().getValue()) { - case MMAIntrinsic::MFMA_F32_16x16x4_F32: case MMAIntrinsic::MFMA_F16_16x16x16_F32: case MMAIntrinsic::MFMA_I8_16x16x32_I32: { return {/*outer=*/{1, 1}, /*thread=*/{4, 16}, /*strides=*/{16, 1}, @@ -609,17 +571,6 @@ FailureOr MMAAttr::buildMmaOperation(OpBuilder &builder, Location loc, return failure(); } switch (getIntrinsic().getValue()) { - case MMAIntrinsic::MFMA_F32_16x16x4_F32: { - // Update the lhs and rhs to extract the first element since vector<1xT> is - // not supoorted by amgpu.mfma op. - lhs = builder.create(loc, lhs, ArrayRef{int64_t{0}}); - rhs = builder.create(loc, rhs, ArrayRef{int64_t{0}}); - auto [m, n, k] = getMNKShape(); - return builder - .create(loc, resultType, m, n, k, getBlockSize(), lhs, - rhs, acc) - .getResult(); - } case MMAIntrinsic::MFMA_F16_16x16x16_F32: case MMAIntrinsic::MFMA_F16_32x32x8_F32: case MMAIntrinsic::MFMA_I8_16x16x32_I32: diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUEnums.td b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUEnums.td index 6633ead21c75..0423c2f6e9fa 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUEnums.td +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUEnums.td @@ -99,18 +99,16 @@ class IREEGPU_I32MmaEnumAttr } // Format: __xx_ -def MFMA_F32_16x16x4_F32 : I32EnumAttrCase<"MFMA_F32_16x16x4_F32", 0>; -def MFMA_F16_16x16x16_F32 : I32EnumAttrCase<"MFMA_F16_16x16x16_F32", 1>; -def MFMA_F16_32x32x8_F32 : I32EnumAttrCase<"MFMA_F16_32x32x8_F32", 2>; -def MFMA_I8_16x16x32_I32 : I32EnumAttrCase<"MFMA_I8_16x16x32_I32", 3>; -def MFMA_I8_32x32x16_I32 : I32EnumAttrCase<"MFMA_I8_32x32x16_I32", 4>; +def MFMA_F16_16x16x16_F32 : I32EnumAttrCase<"MFMA_F16_16x16x16_F32", 0>; +def MFMA_F16_32x32x8_F32 : I32EnumAttrCase<"MFMA_F16_32x32x8_F32", 1>; +def MFMA_I8_16x16x32_I32 : I32EnumAttrCase<"MFMA_I8_16x16x32_I32", 2>; +def MFMA_I8_32x32x16_I32 : I32EnumAttrCase<"MFMA_I8_32x32x16_I32", 3>; // TODO: Create separate WMMA ops for AMD and NVIDIA GPUs -def WMMA_F16_16x16x16_F32 : I32EnumAttrCase<"WMMA_F16_16x16x16_F32", 5>; -def WMMA_F16_16x16x16_F16 : I32EnumAttrCase<"WMMA_F16_16x16x16_F16", 6>; +def WMMA_F16_16x16x16_F32 : I32EnumAttrCase<"WMMA_F16_16x16x16_F32", 4>; +def WMMA_F16_16x16x16_F16 : I32EnumAttrCase<"WMMA_F16_16x16x16_F16", 5>; def IREEGPU_MMAIntrinsic : IREEGPU_I32MmaEnumAttr<"MMAIntrinsic", "Descriptor for different MMA intrinsics", [ - MFMA_F32_16x16x4_F32, MFMA_F16_16x16x16_F32, MFMA_F16_32x32x8_F32, MFMA_I8_16x16x32_I32, diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/KnownTargets.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/KnownTargets.cpp index ac88e2014b07..f4584fe66b4e 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/KnownTargets.cpp +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/KnownTargets.cpp @@ -122,8 +122,9 @@ TargetAttr createTargetAttr(const TargetDetails &details, StringRef arch, const WgpDetails *getCDNA3WgpDetails() { static const MMAIntrinsic cdna3MMAOps[] = { - MMAIntrinsic::MFMA_F32_16x16x4_F32, MMAIntrinsic::MFMA_F16_16x16x16_F32, - MMAIntrinsic::MFMA_F16_32x32x8_F32, MMAIntrinsic::MFMA_I8_16x16x32_I32, + MMAIntrinsic::MFMA_F16_16x16x16_F32, + MMAIntrinsic::MFMA_F16_32x32x8_F32, + MMAIntrinsic::MFMA_I8_16x16x32_I32, MMAIntrinsic::MFMA_I8_32x32x16_I32, }; static const WgpDetails cdna3Wgp = { diff --git a/tests/e2e/matmul/CMakeLists.txt b/tests/e2e/matmul/CMakeLists.txt index 09269186ce26..1c6dc9f22551 100644 --- a/tests/e2e/matmul/CMakeLists.txt +++ b/tests/e2e/matmul/CMakeLists.txt @@ -2289,34 +2289,6 @@ iree_generated_e2e_runner_test( "requires-gpu-cdna3" ) -iree_generated_e2e_runner_test( - NAME - e2e_matmul_rocm_f32_large_cdna3_mfma - TEST_TYPE - matmul - GENERATOR - "generate_e2e_matmul_tests.py" - GENERATOR_ARGS - "--lhs_rhs_type=f32" - "--acc_type=f32" - "--shapes=gpu_large_aligned" - "--compilation_info=LLVMGPUVectorDistributeMFMA" - TEST_RUNNER - iree_tools_testing_e2e_iree-e2e-matmul-test - TARGET_BACKENDS - "rocm" - DRIVERS - "hip" - COMPILER_FLAGS - ${IREE_HIP_TEST_COMPILER_FLAGS} - LABELS - "noasan" - "nomsan" - "notsan" - "noubsan" - "requires-gpu-cdna3" -) - iree_generated_e2e_runner_test( NAME e2e_matmul_rocm_f16_large_cdna3_mfma_tb diff --git a/tests/e2e/matmul/generate_e2e_matmul_tests.py b/tests/e2e/matmul/generate_e2e_matmul_tests.py index 4ef454aeab68..003f3de84e22 100644 --- a/tests/e2e/matmul/generate_e2e_matmul_tests.py +++ b/tests/e2e/matmul/generate_e2e_matmul_tests.py @@ -260,11 +260,6 @@ def get_rocm_test_compilation_infos( schedules = [] if intrinsic == "MFMA": schedules = [ - MMASchedule("MFMA_F32_16x16x4_F32", 1, 1, 1, 1, 1), - MMASchedule("MFMA_F32_16x16x4_F32", 1, 1, 1, 1, 2), - MMASchedule("MFMA_F32_16x16x4_F32", 1, 1, 1, 2, 1), - MMASchedule("MFMA_F32_16x16x4_F32", 1, 1, 2, 1, 1), - MMASchedule("MFMA_F32_16x16x4_F32", 2, 2, 1, 1, 2), MMASchedule("MFMA_F16_16x16x16_F32", 1, 1, 1, 1, 1), MMASchedule("MFMA_F16_16x16x16_F32", 1, 1, 1, 1, 2), MMASchedule("MFMA_F16_16x16x16_F32", 1, 1, 1, 2, 1), @@ -304,16 +299,10 @@ def get_rocm_test_compilation_infos( for schedule in schedules: # Skip schedules with an intrinsic which element type does not # match the requested one. - # Search for the lhs_rhs type in the first part of intrinsic - # e.g., MFMA_F32_16x16x4_F32 -> MFMA_F32 - if lhs_rhs_type.value.upper() not in schedule.intrinsic[:8]: + if lhs_rhs_type.value.upper() not in schedule.intrinsic: continue - if schedule.intrinsic == "MFMA_F32_16x16x4_F32": - wg_tile_m = schedule.m_count * schedule.m_tile_count * 16 - wg_tile_n = schedule.n_count * schedule.n_tile_count * 16 - wg_tile_k = schedule.k_tile_count * 4 - elif schedule.intrinsic == "MFMA_F16_16x16x16_F32": + if schedule.intrinsic == "MFMA_F16_16x16x16_F32": wg_tile_m = schedule.m_count * schedule.m_tile_count * 16 wg_tile_n = schedule.n_count * schedule.n_tile_count * 16 wg_tile_k = schedule.k_tile_count * 16