Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

AMDGPU/GFX12: Add new dot4 fp8/bf8 instructions #77892

Merged

Conversation

mariusz-sikora-at-amd
Copy link
Contributor

Encoding is VOP3P. Tagged as deep/machine learning instructions. i32 type (v4fp8 or v4bf8 packed in i32) is used for src0 and src1. src0 and src1 have no src_modifiers. src2 is f32 and has src_modifiers: f32 fneg(neg_lo[2]) and f32 fabs(neg_hi[2]).

Encoding is VOP3P. Tagged as deep/machine learning instructions.
i32 type (v4fp8 or v4bf8 packed in i32) is used for src0 and src1.
src0 and src1 have no src_modifiers. src2 is f32 and has src_modifiers:
f32 fneg(neg_lo[2]) and f32 fabs(neg_hi[2]).
@llvmbot llvmbot added clang Clang issues not falling into any other category backend:AMDGPU clang:frontend Language frontend issues, e.g. anything involving "Sema" mc Machine (object) code llvm:ir labels Jan 12, 2024
@llvmbot
Copy link
Member

llvmbot commented Jan 12, 2024

@llvm/pr-subscribers-llvm-ir
@llvm/pr-subscribers-mc
@llvm/pr-subscribers-clang

@llvm/pr-subscribers-backend-amdgpu

Author: Mariusz Sikora (mariusz-sikora-at-amd)

Changes

Encoding is VOP3P. Tagged as deep/machine learning instructions. i32 type (v4fp8 or v4bf8 packed in i32) is used for src0 and src1. src0 and src1 have no src_modifiers. src2 is f32 and has src_modifiers: f32 fneg(neg_lo[2]) and f32 fabs(neg_hi[2]).


Patch is 55.99 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/77892.diff

19 Files Affected:

  • (modified) clang/include/clang/Basic/BuiltinsAMDGPU.def (+4)
  • (modified) clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-err.cl (+5)
  • (added) clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx12.cl (+20)
  • (modified) llvm/include/llvm/IR/IntrinsicsAMDGPU.td (+19)
  • (modified) llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp (+4)
  • (modified) llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp (+46)
  • (modified) llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp (+11-6)
  • (modified) llvm/lib/Target/AMDGPU/VOP3PInstructions.td (+47)
  • (modified) llvm/lib/Target/AMDGPU/VOPInstructions.td (+7-6)
  • (added) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot4.f32.ll (+255)
  • (modified) llvm/test/MC/AMDGPU/gfx12_asm_vop3p.s (+120)
  • (modified) llvm/test/MC/AMDGPU/gfx12_asm_vop3p_dpp16.s (+24)
  • (added) llvm/test/MC/AMDGPU/gfx12_asm_vop3p_dpp16_err.s (+24)
  • (modified) llvm/test/MC/AMDGPU/gfx12_asm_vop3p_dpp8.s (+24)
  • (added) llvm/test/MC/AMDGPU/gfx12_asm_vop3p_dpp8_err.s (+27)
  • (added) llvm/test/MC/AMDGPU/gfx12_asm_vop3p_err.s (+133)
  • (modified) llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3p.txt (+120)
  • (modified) llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3p_dpp16.txt (+24)
  • (modified) llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3p_dpp8.txt (+24)
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index e562ef04a30194..1c1b9b2c9e9e8c 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -255,6 +255,10 @@ TARGET_BUILTIN(__builtin_amdgcn_sudot4, "iIbiIbiiIb", "nc", "dot8-insts")
 TARGET_BUILTIN(__builtin_amdgcn_sdot8, "SiSiSiSiIb", "nc", "dot1-insts")
 TARGET_BUILTIN(__builtin_amdgcn_udot8, "UiUiUiUiIb", "nc", "dot7-insts")
 TARGET_BUILTIN(__builtin_amdgcn_sudot8, "iIbiIbiiIb", "nc", "dot8-insts")
+TARGET_BUILTIN(__builtin_amdgcn_fdot4_f32_fp8_bf8, "fUiUif", "nc", "gfx12-insts")
+TARGET_BUILTIN(__builtin_amdgcn_fdot4_f32_bf8_fp8, "fUiUif", "nc", "gfx12-insts")
+TARGET_BUILTIN(__builtin_amdgcn_fdot4_f32_fp8_fp8, "fUiUif", "nc", "gfx12-insts")
+TARGET_BUILTIN(__builtin_amdgcn_fdot4_f32_bf8_bf8, "fUiUif", "nc", "gfx12-insts")
 
 //===----------------------------------------------------------------------===//
 // GFX10+ only builtins.
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-err.cl
index 6573325150d958..1be47f71276208 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-err.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-err.cl
@@ -49,4 +49,9 @@ kernel void builtins_amdgcn_dl_insts_err(
 
   iOut[3] = __builtin_amdgcn_sudot8(false, A, true, B, C, false);    // expected-error {{'__builtin_amdgcn_sudot8' needs target feature dot8-insts}}
   iOut[4] = __builtin_amdgcn_sudot8(true, A, false, B, C, true);     // expected-error {{'__builtin_amdgcn_sudot8' needs target feature dot8-insts}}
+
+  fOut[5] = __builtin_amdgcn_fdot4_f32_fp8_bf8(uiA, uiB, fC);        // expected-error {{'__builtin_amdgcn_fdot4_f32_fp8_bf8' needs target feature gfx12-insts}}
+  fOut[6] = __builtin_amdgcn_fdot4_f32_bf8_fp8(uiA, uiB, fC);        // expected-error {{'__builtin_amdgcn_fdot4_f32_bf8_fp8' needs target feature gfx12-insts}}
+  fOut[7] = __builtin_amdgcn_fdot4_f32_fp8_fp8(uiA, uiB, fC);        // expected-error {{'__builtin_amdgcn_fdot4_f32_fp8_fp8' needs target feature gfx12-insts}}
+  fOut[8] = __builtin_amdgcn_fdot4_f32_bf8_bf8(uiA, uiB, fC);        // expected-error {{'__builtin_amdgcn_fdot4_f32_bf8_bf8' needs target feature gfx12-insts}}
 }
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx12.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx12.cl
new file mode 100644
index 00000000000000..31e10c0a5dc18c
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx12.cl
@@ -0,0 +1,20 @@
+// REQUIRES: amdgpu-registered-target
+
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -S -emit-llvm -o - %s | FileCheck %s
+
+typedef unsigned int uint;
+
+// CHECK-LABEL: @builtins_amdgcn_dl_insts
+// CHECK: call float @llvm.amdgcn.fdot4.f32.fp8.bf8(i32 %uiA, i32 %uiB, float %fC)
+// CHECK: call float @llvm.amdgcn.fdot4.f32.bf8.fp8(i32 %uiA, i32 %uiB, float %fC)
+// CHECK: call float @llvm.amdgcn.fdot4.f32.fp8.fp8(i32 %uiA, i32 %uiB, float %fC)
+// CHECK: call float @llvm.amdgcn.fdot4.f32.bf8.bf8(i32 %uiA, i32 %uiB, float %fC)
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+kernel void builtins_amdgcn_dl_insts_err(global float *fOut,
+                                         uint uiA, uint uiB, float fC) {
+  fOut[0] = __builtin_amdgcn_fdot4_f32_fp8_bf8(uiA, uiB, fC);
+  fOut[1] = __builtin_amdgcn_fdot4_f32_bf8_fp8(uiA, uiB, fC);
+  fOut[2] = __builtin_amdgcn_fdot4_f32_fp8_fp8(uiA, uiB, fC);
+  fOut[3] = __builtin_amdgcn_fdot4_f32_bf8_bf8(uiA, uiB, fC);
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 255f5106e543fb..1ec695a8164d8b 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2696,6 +2696,25 @@ def int_amdgcn_udot8 :
      ImmArg<ArgIndex<0>>,  ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>]
   >;
 
+// f32 %r = llvm.amdgcn.dot4.f32.type_a.type_b (v4type_a (as i32) %a, v4type_b (as i32) %b, f32 %c)
+//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + %c
+class AMDGPU8bitFloatDot4Intrinsic :
+  ClangBuiltin<!subst("int", "__builtin", NAME)>,
+  DefaultAttrsIntrinsic<
+    [llvm_float_ty], // %r
+    [
+      llvm_i32_ty,   // %a
+      llvm_i32_ty,   // %b
+      llvm_float_ty, // %c
+    ],
+    [IntrNoMem, IntrSpeculatable]
+  >;
+
+def int_amdgcn_fdot4_f32_fp8_bf8 : AMDGPU8bitFloatDot4Intrinsic;
+def int_amdgcn_fdot4_f32_bf8_fp8 : AMDGPU8bitFloatDot4Intrinsic;
+def int_amdgcn_fdot4_f32_fp8_fp8 : AMDGPU8bitFloatDot4Intrinsic;
+def int_amdgcn_fdot4_f32_bf8_bf8 : AMDGPU8bitFloatDot4Intrinsic;
+
 //===----------------------------------------------------------------------===//
 // gfx908 intrinsics
 // ===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index d42496ef09ee8f..0fbe9a2dff2aba 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4471,6 +4471,10 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
     case Intrinsic::amdgcn_fdot2_f32_bf16:
     case Intrinsic::amdgcn_sudot4:
     case Intrinsic::amdgcn_sudot8:
+    case Intrinsic::amdgcn_fdot4_f32_fp8_bf8:
+    case Intrinsic::amdgcn_fdot4_f32_bf8_fp8:
+    case Intrinsic::amdgcn_fdot4_f32_fp8_fp8:
+    case Intrinsic::amdgcn_fdot4_f32_bf8_bf8:
     case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16:
     case Intrinsic::amdgcn_wmma_f16_16x16x16_f16:
     case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16_tied:
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index ba79affe683d6f..bd68054589b112 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -1688,6 +1688,7 @@ class AMDGPUAsmParser : public MCTargetAsmParser {
   bool validateMIMGD16(const MCInst &Inst);
   bool validateMIMGMSAA(const MCInst &Inst);
   bool validateOpSel(const MCInst &Inst);
+  bool validateNeg(const MCInst &Inst, int OpName);
   bool validateDPP(const MCInst &Inst, const OperandVector &Operands);
   bool validateVccOperand(unsigned Reg) const;
   bool validateVOPLiteral(const MCInst &Inst, const OperandVector &Operands);
@@ -4357,6 +4358,41 @@ bool AMDGPUAsmParser::validateOpSel(const MCInst &Inst) {
   return true;
 }
 
+bool AMDGPUAsmParser::validateNeg(const MCInst &Inst, int OpName) {
+  assert(OpName == AMDGPU::OpName::neg_lo || OpName == AMDGPU::OpName::neg_hi);
+
+  const unsigned Opc = Inst.getOpcode();
+  uint64_t TSFlags = MII.get(Opc).TSFlags;
+
+  // v_dot4 fp8/bf8 neg_lo/neg_hi not allowed on src0 and src1 (allowed on src2)
+  if (!(TSFlags & SIInstrFlags::IsDOT))
+    return true;
+
+  int NegIdx = AMDGPU::getNamedOperandIdx(Opc, OpName);
+  if (NegIdx == -1)
+    return true;
+
+  unsigned Neg = Inst.getOperand(NegIdx).getImm();
+
+  // Instructions that have neg_lo or neg_hi operand but neg modifier is allowed
+  // on some src operands but not allowed on other.
+  // It is convenient that such instructions don't have src_modifiers operand
+  // for src operands that don't allow neg because they also don't allow opsel.
+
+  int SrcMods[3] = {AMDGPU::OpName::src0_modifiers,
+                    AMDGPU::OpName::src1_modifiers,
+                    AMDGPU::OpName::src2_modifiers};
+
+  for (unsigned i = 0; i < 3; ++i) {
+    if (!AMDGPU::hasNamedOperand(Opc, SrcMods[i])) {
+      if (Neg & (1 << i))
+        return false;
+    }
+  }
+
+  return true;
+}
+
 bool AMDGPUAsmParser::validateDPP(const MCInst &Inst,
                                   const OperandVector &Operands) {
   const unsigned Opc = Inst.getOpcode();
@@ -4834,6 +4870,16 @@ bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst,
       "invalid op_sel operand");
     return false;
   }
+  if (!validateNeg(Inst, AMDGPU::OpName::neg_lo)) {
+    Error(getImmLoc(AMDGPUOperand::ImmTyNegLo, Operands),
+          "invalid neg_lo operand");
+    return false;
+  }
+  if (!validateNeg(Inst, AMDGPU::OpName::neg_hi)) {
+    Error(getImmLoc(AMDGPUOperand::ImmTyNegHi, Operands),
+          "invalid neg_hi operand");
+    return false;
+  }
   if (!validateDPP(Inst, Operands)) {
     return false;
   }
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
index 6c7977e22599c6..e73e53aa270f91 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -1260,14 +1260,19 @@ void AMDGPUInstPrinter::printPackedModifier(const MCInst *MI,
   int NumOps = 0;
   int Ops[3];
 
-  for (int OpName : { AMDGPU::OpName::src0_modifiers,
-                      AMDGPU::OpName::src1_modifiers,
-                      AMDGPU::OpName::src2_modifiers }) {
-    int Idx = AMDGPU::getNamedOperandIdx(Opc, OpName);
-    if (Idx == -1)
+  std::pair<int, int> MOps[] = {
+      {AMDGPU::OpName::src0_modifiers, AMDGPU::OpName::src0},
+      {AMDGPU::OpName::src1_modifiers, AMDGPU::OpName::src1},
+      {AMDGPU::OpName::src2_modifiers, AMDGPU::OpName::src2}};
+  int DefaultValue = (Mod == SISrcMods::OP_SEL_1);
+
+  for (auto [SrcMod, Src] : MOps) {
+    if (!AMDGPU::hasNamedOperand(Opc, Src))
       break;
 
-    Ops[NumOps++] = MI->getOperand(Idx).getImm();
+    int ModIdx = AMDGPU::getNamedOperandIdx(Opc, SrcMod);
+    Ops[NumOps++] =
+        (ModIdx != -1) ? MI->getOperand(ModIdx).getImm() : DefaultValue;
   }
 
   const bool HasDstSel =
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index 17921dcddc0e2a..faa00dff5ff00c 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -443,6 +443,48 @@ def : GCNPat < (int_amdgcn_sdot4 i32:$src0,
 >;
 } // End SubtargetPredicate = HasDot8Insts
 
+// Does not use opsel, no src_modifiers on src0 and src1.
+// src_modifiers on src2(f32) are f32 fneg(neg_lo[2]) and f32 fabs(neg_hi[2]).
+def VOP3P_DOTF8_Profile : VOP3P_Profile<VOPProfile <[f32, i32, i32, f32]>,
+                                        VOP3_PACKED, 1> {
+  let HasClamp = 0;
+  let HasOpSel = 0;
+  let HasOMod = 0;
+  let IsDOT = 1;
+  let HasSrc0Mods = 0;
+  let HasSrc1Mods = 0;
+  let HasSrc2Mods = 1;
+
+  let InsVOP3P = (ins VSrc_b32:$src0, VSrc_b32:$src1,
+                      PackedF16InputMods:$src2_modifiers, VSrc_f32:$src2,
+                      neg_lo0:$neg_lo, neg_hi0:$neg_hi);
+
+  let InsVOP3DPP8 = (ins DstRC:$old, VGPR_32:$src0, VRegSrc_32:$src1,
+                         PackedF16InputMods:$src2_modifiers, VRegSrc_32:$src2,
+                         neg_lo0:$neg_lo, neg_hi0:$neg_hi, dpp8:$dpp8, FI:$fi);
+
+  let InsVOP3DPP16 = (ins DstRC:$old, VGPR_32:$src0, VRegSrc_32:$src1,
+                          PackedF16InputMods:$src2_modifiers, VRegSrc_32:$src2,
+                          neg_lo0:$neg_lo, neg_hi0:$neg_hi, dpp_ctrl:$dpp_ctrl,
+                          row_mask:$row_mask, bank_mask:$bank_mask,
+                          bound_ctrl:$bound_ctrl, FI:$fi);
+}
+
+multiclass VOP3PDOTF8Inst <string OpName, SDPatternOperator intrinsic_node> {
+  defm NAME : VOP3PInst<OpName, VOP3P_DOTF8_Profile, null_frag, 1>;
+
+  let SubtargetPredicate = isGFX12Plus in
+  def : GCNPat <(intrinsic_node i32:$src0, i32:$src1,
+                                (VOP3Mods f32:$src2, i32:$src2_modifiers)),
+                (!cast<Instruction>(NAME) i32:$src0, i32:$src1,
+                                          i32:$src2_modifiers, f32:$src2)>;
+}
+
+defm V_DOT4_F32_FP8_BF8 : VOP3PDOTF8Inst<"v_dot4_f32_fp8_bf8", int_amdgcn_fdot4_f32_fp8_bf8>;
+defm V_DOT4_F32_BF8_FP8 : VOP3PDOTF8Inst<"v_dot4_f32_bf8_fp8", int_amdgcn_fdot4_f32_bf8_fp8>;
+defm V_DOT4_F32_FP8_FP8 : VOP3PDOTF8Inst<"v_dot4_f32_fp8_fp8", int_amdgcn_fdot4_f32_fp8_fp8>;
+defm V_DOT4_F32_BF8_BF8 : VOP3PDOTF8Inst<"v_dot4_f32_bf8_bf8", int_amdgcn_fdot4_f32_bf8_bf8>;
+
 def : UDot2Pat<V_DOT2_U32_U16>;
 def : SDot2Pat<V_DOT2_I32_I16>;
 
@@ -1019,6 +1061,11 @@ defm V_PK_MAX_NUM_F16 : VOP3P_Real_with_name_gfx12<0x1c, "V_PK_MAX_F16", "v_pk_m
 defm V_PK_MINIMUM_F16 : VOP3P_Real_gfx12<0x1d>;
 defm V_PK_MAXIMUM_F16 : VOP3P_Real_gfx12<0x1e>;
 
+defm V_DOT4_F32_FP8_BF8 : VOP3P_Realtriple<GFX12Gen, 0x24>;
+defm V_DOT4_F32_BF8_FP8 : VOP3P_Realtriple<GFX12Gen, 0x25>;
+defm V_DOT4_F32_FP8_FP8 : VOP3P_Realtriple<GFX12Gen, 0x26>;
+defm V_DOT4_F32_BF8_BF8 : VOP3P_Realtriple<GFX12Gen, 0x27>;
+
 //===----------------------------------------------------------------------===//
 // GFX11
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index c4b9e706309374..a10a2cfae018c3 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -169,6 +169,7 @@ class VOP3_Pseudo <string opName, VOPProfile P, list<dag> pattern = [],
 class VOP3P_Pseudo <string opName, VOPProfile P, list<dag> pattern = []> :
   VOP3_Pseudo<opName, P, pattern, 1> {
   let VOP3P = 1;
+  let IsDOT = P.IsDOT;
 }
 
 class VOP_Real<VOP_Pseudo ps> {
@@ -387,7 +388,7 @@ class VOP3Pe <bits<7> op, VOPProfile P> : Enc64 {
   let Inst{12} = !if(!and(P.HasSrc1, P.HasOpSel), src1_modifiers{2}, 0); // op_sel(1)
   let Inst{13} = !if(!and(P.HasSrc2, P.HasOpSel), src2_modifiers{2}, 0); // op_sel(2)
 
-  let Inst{14} = !if(!and(P.HasSrc2, P.HasOpSel), src2_modifiers{3}, ?); // op_sel_hi(2)
+  let Inst{14} = !if(!and(P.HasSrc2, P.HasOpSel), src2_modifiers{3}, !if(P.IsDOT, 1, ?)); // op_sel_hi(2)
 
   let Inst{15} = !if(P.HasClamp, clamp{0}, 0);
 
@@ -396,8 +397,8 @@ class VOP3Pe <bits<7> op, VOPProfile P> : Enc64 {
   let Inst{40-32} = !if(P.HasSrc0, src0, 0);
   let Inst{49-41} = !if(P.HasSrc1, src1, 0);
   let Inst{58-50} = !if(P.HasSrc2, src2, 0);
-  let Inst{59}    = !if(!and(P.HasSrc0, P.HasOpSel), src0_modifiers{3}, ?); // op_sel_hi(0)
-  let Inst{60}    = !if(!and(P.HasSrc1, P.HasOpSel), src1_modifiers{3}, ?); // op_sel_hi(1)
+  let Inst{59}    = !if(!and(P.HasSrc0, P.HasOpSel), src0_modifiers{3}, !if(P.IsDOT, 1, ?)); // op_sel_hi(0)
+  let Inst{60}    = !if(!and(P.HasSrc1, P.HasOpSel), src1_modifiers{3}, !if(P.IsDOT, 1, ?)); // op_sel_hi(1)
   let Inst{61}    = !if(P.HasSrc0Mods, src0_modifiers{0}, 0); // neg (lo)
   let Inst{62}    = !if(P.HasSrc1Mods, src1_modifiers{0}, 0); // neg (lo)
   let Inst{63}    = !if(P.HasSrc2Mods, src2_modifiers{0}, 0); // neg (lo)
@@ -772,12 +773,12 @@ class VOP3P_DPPe_Common_Base<bits<7> op, VOPProfile P> : Enc96 {
   let Inst{11} = !if(!and(P.HasSrc0, P.HasOpSel), src0_modifiers{2}, 0); // op_sel(0)
   let Inst{12} = !if(!and(P.HasSrc1, P.HasOpSel), src1_modifiers{2}, 0); // op_sel(1)
   let Inst{13} = !if(!and(P.HasSrc2, P.HasOpSel), src2_modifiers{2}, 0); // op_sel(2)
-  let Inst{14} = !if(!and(P.HasSrc2, P.HasOpSel), src2_modifiers{3}, ?); // op_sel_hi(2)
+  let Inst{14} = !if(!and(P.HasSrc2, P.HasOpSel), src2_modifiers{3}, !if(P.IsDOT, 1, ?)); // op_sel_hi(2)
   let Inst{15} = !if(P.HasClamp, clamp{0}, 0);
   let Inst{22-16} = op;
   let Inst{31-23} = 0x198; // encoding
-  let Inst{59}    = !if(!and(P.HasSrc0, P.HasOpSel), src0_modifiers{3}, ?); // op_sel_hi(0)
-  let Inst{60}    = !if(!and(P.HasSrc1, P.HasOpSel), src1_modifiers{3}, ?); // op_sel_hi(1)
+  let Inst{59}    = !if(!and(P.HasSrc0, P.HasOpSel), src0_modifiers{3}, !if(P.IsDOT, 1, ?)); // op_sel_hi(0)
+  let Inst{60}    = !if(!and(P.HasSrc1, P.HasOpSel), src1_modifiers{3}, !if(P.IsDOT, 1, ?)); // op_sel_hi(1)
   let Inst{61}    = !if(P.HasSrc0Mods, src0_modifiers{0}, 0); // neg (lo)
   let Inst{62}    = !if(P.HasSrc1Mods, src1_modifiers{0}, 0); // neg (lo)
   let Inst{63}    = !if(P.HasSrc2Mods, src2_modifiers{0}, 0); // neg (lo)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot4.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot4.f32.ll
new file mode 100644
index 00000000000000..97371f169c07ff
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot4.f32.ll
@@ -0,0 +1,255 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12 %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12 %s
+
+define float @test_amdgcn_fdot4_f32_fp8_bf8(i32 %a, i32 %b, float %c) {
+; GFX12-LABEL: test_amdgcn_fdot4_f32_fp8_bf8:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    v_dot4_f32_fp8_bf8 v0, v0, v1, v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %ret = call float @llvm.amdgcn.fdot4.f32.fp8.bf8(i32 %a, i32 %b, float %c)
+  ret float %ret
+}
+
+define float @test_amdgcn_fdot4_f32_fp8_bf8_fabs(i32 %a, i32 %b, float %c) {
+; GFX12-LABEL: test_amdgcn_fdot4_f32_fp8_bf8_fabs:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    v_dot4_f32_fp8_bf8 v0, v0, v1, v2 neg_hi:[0,0,1]
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %fabs.c = call float @llvm.fabs.f32(float %c)
+  %ret = call float @llvm.amdgcn.fdot4.f32.fp8.bf8(i32 %a, i32 %b, float %fabs.c)
+  ret float %ret
+}
+
+define float @test_amdgcn_fdot4_f32_fp8_bf8_fneg(i32 %a, i32 %b, float %c) {
+; GFX12-LABEL: test_amdgcn_fdot4_f32_fp8_bf8_fneg:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    v_dot4_f32_fp8_bf8 v0, v0, v1, v2 neg_lo:[0,0,1]
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %fneg.c = fneg float %c
+  %ret = call float @llvm.amdgcn.fdot4.f32.fp8.bf8(i32 %a, i32 %b, float %fneg.c)
+  ret float %ret
+}
+
+define float @test_amdgcn_fdot4_f32_fp8_bf8_fabs_fneg(i32 %a, i32 %b, float %c) {
+; GFX12-LABEL: test_amdgcn_fdot4_f32_fp8_bf8_fabs_fneg:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    v_dot4_f32_fp8_bf8 v0, v0, v1, v2 neg_hi:[0,0,1]
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %fneg.c = fneg float %c
+  %fabs.fneg.c = call float @llvm.fabs.f32(float %fneg.c)
+  %ret = call float @llvm.amdgcn.fdot4.f32.fp8.bf8(i32 %a, i32 %b, float %fabs.fneg.c)
+  ret float %ret
+}
+
+define float @test_amdgcn_fdot4_f32_fp8_bf8_fneg_fabs(i32 %a, i32 %b, float %c) {
+; GFX12-LABEL: test_amdgcn_fdot4_f32_fp8_bf8_fneg_fabs:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    v_dot4_f32_fp8_bf8 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %fabs.c = call float @llvm.fabs.f32(float %c)
+  %fneg.fabs.c = fneg float %fabs.c
+  %ret = call float @llvm.amdgcn.fdot4.f32.fp8.bf8(i32 %a, i32 %b, float %fneg.fabs.c)
+  ret float %ret
+}
+
+define float @test_amdgcn_fdot4_f32_bf8_fp8(i32 %a, i32 %b, float %c) {
+; GFX12-LABEL: test_amdgcn_fdot4_f32_bf8_fp8:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    v_dot4_f32_bf8_fp8 v0, v0, v1, v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %ret = call float @llvm.amdgcn.fdot4.f32.bf8.fp8(i32 %a, i32 %b, float %c)
+  ret float %ret
+}
+
+define float @test_amdgcn_fdot4_f32_bf8_fp8_fabs(i32 %a, i32 %b, float %c) {
+; GFX12-LABEL: test_amdgcn_fdot4_f32_bf8_fp8_fabs:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    v_dot4_f32_bf8_fp8 v0, v0, v1, v2 neg_hi:[0,0,1]
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %fabs.c = call float @llvm.fabs.f32(float %c)
+  %ret = call float @llvm.amdgcn.fdot4.f32.bf8.fp8(i32 %a, i32 %b, float %fabs.c)
+  ret float %ret
+}
+
+define float @test_amdgcn_fdot4_f32_bf8_fp8_fneg(i32 %a, i32 %b, float %c) {
+; GFX12-LABEL: test_amdgcn_fdot4_f32_bf8_fp8_fneg:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    v_dot4_f32_bf8_fp8 v0, v0, v1, v2 neg_lo:[0,0,1]
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %fneg.c = fneg float %c
+  %ret = call float @llvm.amdgcn.fdot4.f32.bf8.fp8(i32 %a, i32 %b, float %fneg.c)
+  ret float %ret
+}
+
+define float @test_amdgcn_fdot4_f32_bf8_fp8_fabs_fneg(i...
[truncated]

@mariusz-sikora-at-amd
Copy link
Contributor Author

Rebase to run tests

@mariusz-sikora-at-amd
Copy link
Contributor Author

Rebase to run tests

@mariusz-sikora-at-amd mariusz-sikora-at-amd merged commit 28b7e49 into llvm:main Jan 18, 2024
3 of 4 checks passed
jayfoad added a commit that referenced this pull request Jan 18, 2024
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
backend:AMDGPU clang:frontend Language frontend issues, e.g. anything involving "Sema" clang Clang issues not falling into any other category llvm:ir mc Machine (object) code
Projects
None yet
Development

Successfully merging this pull request may close these issues.

4 participants