Skip to content

Commit

Permalink
Atomic loads/stores, spill/reload, tests for __fp16 and half vectors.
Browse files Browse the repository at this point in the history
  • Loading branch information
JonPsson1 committed Nov 6, 2024
1 parent d58853c commit 26660a6
Show file tree
Hide file tree
Showing 14 changed files with 1,297 additions and 19 deletions.
4 changes: 2 additions & 2 deletions clang/lib/Basic/Targets/SystemZ.h
Original file line number Diff line number Diff line change
Expand Up @@ -94,12 +94,12 @@ class LLVM_LIBRARY_VISIBILITY SystemZTargetInfo : public TargetInfo {

// True if the backend supports operations on the half LLVM IR type.
// By setting this to false, conversions will happen for _Float16 around
// a statement by default with operations done in float. However, if
// a statement by default, with operations done in float. However, if
// -ffloat16-excess-precision=none is given, no conversions will be made
// and instead the backend will promote each half operation to float
// individually.
HasLegalHalfType = false;
// Allow half arguments and return values.
// Allow half arguments and return values (__fp16).
HalfArgsAndReturns = true;
// Support _Float16.
HasFloat16 = true;
Expand Down
2 changes: 1 addition & 1 deletion clang/lib/CodeGen/Targets/SystemZ.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,7 @@ bool SystemZABIInfo::isFPArgumentType(QualType Ty) const {

if (const BuiltinType *BT = Ty->getAs<BuiltinType>())
switch (BT->getKind()) {
// case BuiltinType::Half: // __fp16 Support __fp16??
case BuiltinType::Half: // __fp16
case BuiltinType::Float16: // _Float16
case BuiltinType::Float:
case BuiltinType::Double:
Expand Down
File renamed without changes.
32 changes: 32 additions & 0 deletions clang/test/CodeGen/SystemZ/fp16.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
// RUN: %clang_cc1 -triple s390x-linux-gnu -emit-llvm -o - %s \
// RUN: | FileCheck %s

// Both products and the sum are computed in float (each half operand is
// fpext'ed, per the CHECK lines below), and only the final result is
// truncated back to half — the default excess-precision behavior.
__fp16 f(__fp16 a, __fp16 b, __fp16 c, __fp16 d) {
return a * b + c * d;
}

// CHECK-LABEL: define dso_local half @f(half noundef %a, half noundef %b, half noundef %c, half noundef %d) #0 {
// CHECK-NEXT: entry:
// CHECK-NEXT: %a.addr = alloca half, align 2
// CHECK-NEXT: %b.addr = alloca half, align 2
// CHECK-NEXT: %c.addr = alloca half, align 2
// CHECK-NEXT: %d.addr = alloca half, align 2
// CHECK-NEXT: store half %a, ptr %a.addr, align 2
// CHECK-NEXT: store half %b, ptr %b.addr, align 2
// CHECK-NEXT: store half %c, ptr %c.addr, align 2
// CHECK-NEXT: store half %d, ptr %d.addr, align 2
// CHECK-NEXT: %0 = load half, ptr %a.addr, align 2
// CHECK-NEXT: %conv = fpext half %0 to float
// CHECK-NEXT: %1 = load half, ptr %b.addr, align 2
// CHECK-NEXT: %conv1 = fpext half %1 to float
// CHECK-NEXT: %mul = fmul float %conv, %conv1
// CHECK-NEXT: %2 = load half, ptr %c.addr, align 2
// CHECK-NEXT: %conv2 = fpext half %2 to float
// CHECK-NEXT: %3 = load half, ptr %d.addr, align 2
// CHECK-NEXT: %conv3 = fpext half %3 to float
// CHECK-NEXT: %mul4 = fmul float %conv2, %conv3
// CHECK-NEXT: %add = fadd float %mul, %mul4
// CHECK-NEXT: %4 = fptrunc float %add to half
// CHECK-NEXT: ret half %4
// CHECK-NEXT: }

43 changes: 43 additions & 0 deletions clang/test/CodeGen/SystemZ/systemz-abi.c
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,9 @@ long long pass_longlong(long long arg) { return arg; }
__int128 pass_int128(__int128 arg) { return arg; }
// CHECK-LABEL: define{{.*}} void @pass_int128(ptr dead_on_unwind noalias writable sret(i128) align 8 %{{.*}}, ptr %0)

__fp16 pass___fp16(__fp16 arg) { return arg; }
// CHECK-LABEL: define{{.*}} half @pass___fp16(half %{{.*}})

_Float16 pass__Float16(_Float16 arg) { return arg; }
// CHECK-LABEL: define{{.*}} half @pass__Float16(half %{{.*}})

Expand Down Expand Up @@ -75,6 +78,8 @@ _Complex long pass_complex_long(_Complex long arg) { return arg; }
_Complex long long pass_complex_longlong(_Complex long long arg) { return arg; }
// CHECK-LABEL: define{{.*}} void @pass_complex_longlong(ptr dead_on_unwind noalias writable sret({ i64, i64 }) align 8 %{{.*}}, ptr %{{.*}}arg)

// _Complex __fp16 is (currently?) not allowed.

_Complex _Float16 pass_complex__Float16(_Complex _Float16 arg) { return arg; }
// CHECK-LABEL: define{{.*}} void @pass_complex__Float16(ptr dead_on_unwind noalias writable sret({ half, half }) align 2 %{{.*}}, ptr %{{.*}}arg)

Expand Down Expand Up @@ -129,6 +134,11 @@ struct agg_16byte pass_agg_16byte(struct agg_16byte arg) { return arg; }

// Float-like aggregate types

struct agg___fp16 { __fp16 a; };
struct agg___fp16 pass_agg___fp16(struct agg___fp16 arg) { return arg; }
// HARD-FLOAT-LABEL: define{{.*}} void @pass_agg___fp16(ptr dead_on_unwind noalias writable sret(%struct.agg___fp16) align 2 %{{.*}}, half %{{.*}})
// SOFT-FLOAT-LABEL: define{{.*}} void @pass_agg___fp16(ptr dead_on_unwind noalias writable sret(%struct.agg___fp16) align 2 %{{.*}}, i16 noext %{{.*}})

struct agg__Float16 { _Float16 a; };
struct agg__Float16 pass_agg__Float16(struct agg__Float16 arg) { return arg; }
// HARD-FLOAT-LABEL: define{{.*}} void @pass_agg__Float16(ptr dead_on_unwind noalias writable sret(%struct.agg__Float16) align 2 %{{.*}}, half %{{.*}})
Expand All @@ -148,6 +158,11 @@ struct agg_longdouble { long double a; };
struct agg_longdouble pass_agg_longdouble(struct agg_longdouble arg) { return arg; }
// CHECK-LABEL: define{{.*}} void @pass_agg_longdouble(ptr dead_on_unwind noalias writable sret(%struct.agg_longdouble) align 8 %{{.*}}, ptr %{{.*}})

struct agg___fp16_a8 { __fp16 a __attribute__((aligned (8))); };
struct agg___fp16_a8 pass_agg___fp16_a8(struct agg___fp16_a8 arg) { return arg; }
// HARD-FLOAT-LABEL: define{{.*}} void @pass_agg___fp16_a8(ptr dead_on_unwind noalias writable sret(%struct.agg___fp16_a8) align 8 %{{.*}}, double %{{.*}})
// SOFT-FLOAT-LABEL: define{{.*}} void @pass_agg___fp16_a8(ptr dead_on_unwind noalias writable sret(%struct.agg___fp16_a8) align 8 %{{.*}}, i64 %{{.*}})

struct agg__Float16_a8 { _Float16 a __attribute__((aligned (8))); };
struct agg__Float16_a8 pass_agg__Float16_a8(struct agg__Float16_a8 arg) { return arg; }
// HARD-FLOAT-LABEL: define{{.*}} void @pass_agg__Float16_a8(ptr dead_on_unwind noalias writable sret(%struct.agg__Float16_a8) align 8 %{{.*}}, double %{{.*}})
Expand Down Expand Up @@ -180,6 +195,10 @@ struct agg_nofloat3 pass_agg_nofloat3(struct agg_nofloat3 arg) { return arg; }

// Union types likewise are *not* float-like aggregate types

union union___fp16 { __fp16 a; };
union union___fp16 pass_union___fp16(union union___fp16 arg) { return arg; }
// CHECK-LABEL: define{{.*}} void @pass_union___fp16(ptr dead_on_unwind noalias writable sret(%union.union___fp16) align 2 %{{.*}}, i16 noext %{{.*}})

union union__Float16 { _Float16 a; };
union union__Float16 pass_union__Float16(union union__Float16 arg) { return arg; }
// CHECK-LABEL: define{{.*}} void @pass_union__Float16(ptr dead_on_unwind noalias writable sret(%union.union__Float16) align 2 %{{.*}}, i16 noext %{{.*}})
Expand Down Expand Up @@ -461,6 +480,30 @@ struct agg_8byte va_agg_8byte(__builtin_va_list l) { return __builtin_va_arg(l,
// CHECK: [[VA_ARG_ADDR:%[^ ]+]] = phi ptr [ [[RAW_REG_ADDR]], %{{.*}} ], [ [[RAW_MEM_ADDR]], %{{.*}} ]
// CHECK: ret void

struct agg___fp16 va_agg___fp16(__builtin_va_list l) { return __builtin_va_arg(l, struct agg___fp16); }
// CHECK-LABEL: define{{.*}} void @va_agg___fp16(ptr dead_on_unwind noalias writable sret(%struct.agg___fp16) align 2 %{{.*}}, ptr %{{.*}}
// HARD-FLOAT: [[REG_COUNT_PTR:%[^ ]+]] = getelementptr inbounds nuw %struct.__va_list_tag, ptr %{{.*}}, i32 0, i32 1
// SOFT-FLOAT: [[REG_COUNT_PTR:%[^ ]+]] = getelementptr inbounds nuw %struct.__va_list_tag, ptr %{{.*}}, i32 0, i32 0
// CHECK: [[REG_COUNT:%[^ ]+]] = load i64, ptr [[REG_COUNT_PTR]]
// HARD-FLOAT: [[FITS_IN_REGS:%[^ ]+]] = icmp ult i64 [[REG_COUNT]], 4
// SOFT-FLOAT: [[FITS_IN_REGS:%[^ ]+]] = icmp ult i64 [[REG_COUNT]], 5
// CHECK: br i1 [[FITS_IN_REGS]],
// CHECK: [[SCALED_REG_COUNT:%[^ ]+]] = mul i64 [[REG_COUNT]], 8
// HARD-FLOAT: [[REG_OFFSET:%[^ ]+]] = add i64 [[SCALED_REG_COUNT]], 128
// SOFT-FLOAT: [[REG_OFFSET:%[^ ]+]] = add i64 [[SCALED_REG_COUNT]], 22
// CHECK: [[REG_SAVE_AREA_PTR:%[^ ]+]] = getelementptr inbounds nuw %struct.__va_list_tag, ptr %{{.*}}, i32 0, i32 3
// CHECK: [[REG_SAVE_AREA:%[^ ]+]] = load ptr, ptr [[REG_SAVE_AREA_PTR:[^ ]+]]
// CHECK: [[RAW_REG_ADDR:%[^ ]+]] = getelementptr i8, ptr [[REG_SAVE_AREA]], i64 [[REG_OFFSET]]
// CHECK: [[REG_COUNT1:%[^ ]+]] = add i64 [[REG_COUNT]], 1
// CHECK: store i64 [[REG_COUNT1]], ptr [[REG_COUNT_PTR]]
// CHECK: [[OVERFLOW_ARG_AREA_PTR:%[^ ]+]] = getelementptr inbounds nuw %struct.__va_list_tag, ptr %{{.*}}, i32 0, i32 2
// CHECK: [[OVERFLOW_ARG_AREA:%[^ ]+]] = load ptr, ptr [[OVERFLOW_ARG_AREA_PTR]]
// CHECK: [[RAW_MEM_ADDR:%[^ ]+]] = getelementptr i8, ptr [[OVERFLOW_ARG_AREA]], i64 6
// CHECK: [[OVERFLOW_ARG_AREA2:%[^ ]+]] = getelementptr i8, ptr [[OVERFLOW_ARG_AREA]], i64 8
// CHECK: store ptr [[OVERFLOW_ARG_AREA2]], ptr [[OVERFLOW_ARG_AREA_PTR]]
// CHECK: [[VA_ARG_ADDR:%[^ ]+]] = phi ptr [ [[RAW_REG_ADDR]], %{{.*}} ], [ [[RAW_MEM_ADDR]], %{{.*}} ]
// CHECK: ret void

struct agg__Float16 va_agg__Float16(__builtin_va_list l) { return __builtin_va_arg(l, struct agg__Float16); }
// CHECK-LABEL: define{{.*}} void @va_agg__Float16(ptr dead_on_unwind noalias writable sret(%struct.agg__Float16) align 2 %{{.*}}, ptr %{{.*}}
// HARD-FLOAT: [[REG_COUNT_PTR:%[^ ]+]] = getelementptr inbounds nuw %struct.__va_list_tag, ptr %{{.*}}, i32 0, i32 1
Expand Down
60 changes: 47 additions & 13 deletions llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -523,7 +523,9 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
setTruncStoreAction(VT, MVT::f16, Expand);
}
setOperationAction(ISD::LOAD, MVT::f16, Custom);
setOperationAction(ISD::ATOMIC_LOAD, MVT::f16, Custom);
setOperationAction(ISD::STORE, MVT::f16, Custom);
setOperationAction(ISD::ATOMIC_STORE, MVT::f16, Custom);
setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
setOperationAction(ISD::FP_EXTEND, MVT::f32, Custom);
setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom);
Expand Down Expand Up @@ -4596,6 +4598,22 @@ SDValue SystemZTargetLowering::lowerATOMIC_FENCE(SDValue Op,
return DAG.getNode(ISD::MEMBARRIER, DL, MVT::Other, Op.getOperand(0));
}

// Custom lowering for ISD::ATOMIC_LOAD: dispatch 128-bit results to the
// i128 atomic load/store lowering; everything else that reaches this hook
// is handled by the f16 load lowering.
SDValue SystemZTargetLowering::lowerATOMIC_LOAD(SDValue Op,
                                                SelectionDAG &DAG) const {
  const bool IsI128 = Op.getSimpleValueType().getSizeInBits() == 128;
  return IsI128 ? lowerATOMIC_LDST_I128(Op, DAG) : lowerLoadF16(Op, DAG);
}

// Custom lowering for ISD::ATOMIC_STORE: the *memory* VT (not the operand
// VT) selects the path — 128-bit stores go to the i128 lowering, the rest
// to the f16 store lowering.
SDValue SystemZTargetLowering::lowerATOMIC_STORE(SDValue Op,
                                                 SelectionDAG &DAG) const {
  auto *StoreNode = cast<AtomicSDNode>(Op.getNode());
  const bool IsI128 = StoreNode->getMemoryVT().getSizeInBits() == 128;
  return IsI128 ? lowerATOMIC_LDST_I128(Op, DAG) : lowerStoreF16(Op, DAG);
}

SDValue SystemZTargetLowering::lowerATOMIC_LDST_I128(SDValue Op,
SelectionDAG &DAG) const {
auto *Node = cast<AtomicSDNode>(Op.getNode());
Expand Down Expand Up @@ -6217,15 +6235,25 @@ SDValue SystemZTargetLowering::lowerLoadF16(SDValue Op,
MVT RegVT = Op.getSimpleValueType();
if (RegVT != MVT::f16)
return SDValue();
LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
SDLoc DL(Ld);
assert(EVT(RegVT) == Ld->getMemoryVT() && "Expected non-extending f16 load");

SDLoc DL(Op);
SDValue NewLd;
if (auto *AtomicLd = dyn_cast<AtomicSDNode>(Op.getNode())) {
assert(EVT(RegVT) == AtomicLd->getMemoryVT() && "Unhandled f16 load");
NewLd = DAG.getAtomic(ISD::ATOMIC_LOAD, DL, MVT::i16, MVT::i32,
AtomicLd->getChain(), AtomicLd->getBasePtr(),
AtomicLd->getMemOperand());
cast<AtomicSDNode>(NewLd)->setExtensionType(ISD::EXTLOAD);
} else {
LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
assert(EVT(RegVT) == Ld->getMemoryVT() && "Unhandled f16 load");
NewLd = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Ld->getChain(),
Ld->getBasePtr(), Ld->getPointerInfo(),
MVT::i16, Ld->getOriginalAlign(),
Ld->getMemOperand()->getFlags());
}
// Load as integer, shift and insert into upper 2 bytes of the FP register.
// TODO: Use VLEH if available.
SDValue NewLd = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Ld->getChain(),
Ld->getBasePtr(), Ld->getPointerInfo(),
MVT::i16, Ld->getOriginalAlign(),
Ld->getMemOperand()->getFlags());
SDValue Shft = DAG.getNode(ISD::SHL, DL, MVT::i32, NewLd,
DAG.getConstant(16, DL, MVT::i32));
SDValue BCast = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Shft);
Expand All @@ -6236,20 +6264,25 @@ SDValue SystemZTargetLowering::lowerLoadF16(SDValue Op,

SDValue SystemZTargetLowering::lowerStoreF16(SDValue Op,
SelectionDAG &DAG) const {
StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
SDLoc DL(St);
SDValue StoredVal = St->getValue();
SDValue StoredVal = Op->getOperand(1);
MVT StoreVT = StoredVal.getSimpleValueType();
if (StoreVT != MVT::f16)
return SDValue();
// Move into a GPR, shift and store the 2 bytes.
// TODO: Use VSTEH if available.

// Move into a GPR, shift and store the 2 bytes. TODO: Use VSTEH if available.
SDLoc DL(Op);
SDNode *U32 = DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::f32);
SDValue In32 = DAG.getTargetInsertSubreg(SystemZ::subreg_h16, DL,
MVT::f32, SDValue(U32, 0), StoredVal);
SDValue BCast = DAG.getNode(ISD::BITCAST, DL, MVT::i32, In32);
SDValue Shft = DAG.getNode(ISD::SRL, DL, MVT::i32, BCast,
DAG.getConstant(16, DL, MVT::i32));

if (auto *AtomicSt = dyn_cast<AtomicSDNode>(Op.getNode()))
return DAG.getAtomic(ISD::ATOMIC_STORE, DL, MVT::i16, AtomicSt->getChain(),
Shft, AtomicSt->getBasePtr(), AtomicSt->getMemOperand());

StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
return DAG.getTruncStore(St->getChain(), DL, Shft, St->getBasePtr(),
MVT::i16, St->getMemOperand());
}
Expand Down Expand Up @@ -6373,8 +6406,9 @@ SDValue SystemZTargetLowering::LowerOperation(SDValue Op,
case ISD::ATOMIC_SWAP:
return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_SWAPW);
case ISD::ATOMIC_STORE:
return lowerATOMIC_STORE(Op, DAG);
case ISD::ATOMIC_LOAD:
return lowerATOMIC_LDST_I128(Op, DAG);
return lowerATOMIC_LOAD(Op, DAG);
case ISD::ATOMIC_LOAD_ADD:
return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_ADD);
case ISD::ATOMIC_LOAD_SUB:
Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Target/SystemZ/SystemZISelLowering.h
Original file line number Diff line number Diff line change
Expand Up @@ -698,6 +698,8 @@ class SystemZTargetLowering : public TargetLowering {
SDValue lowerCTPOP(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerVECREDUCE_ADD(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerATOMIC_LOAD(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerATOMIC_LDST_I128(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerATOMIC_LOAD_OP(SDValue Op, SelectionDAG &DAG,
unsigned Opcode) const;
Expand Down
46 changes: 46 additions & 0 deletions llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -995,8 +995,31 @@ void SystemZInstrInfo::storeRegToStackSlot(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg,
bool isKill, int FrameIdx, const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI, Register VReg) const {
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();

// There are no fp16 load/store instructions, so need to save/restore via
// GPR (TODO: Use VSTEH in case of vector support).
if (RC == &SystemZ::FP16BitRegClass) {
assert(!MRI.isSSA() && MRI.getNumVirtRegs() &&
"Expected non-SSA form with virtual registers.");
Register GR64Reg = MRI.createVirtualRegister(&SystemZ::GR64BitRegClass);
Register FP64Reg = MRI.createVirtualRegister(&SystemZ::FP64BitRegClass);
BuildMI(MBB, MBBI, DL, get(SystemZ::COPY))
.addReg(FP64Reg, RegState::DefineNoRead, SystemZ::subreg_h16)
.addReg(SrcReg, getKillRegState(isKill));
BuildMI(MBB, MBBI, DL, get(SystemZ::LGDR), GR64Reg)
.addReg(FP64Reg, RegState::Kill);
BuildMI(MBB, MBBI, DL, get(SystemZ::SRLG), GR64Reg)
.addReg(GR64Reg)
.addReg(0)
.addImm(48);
addFrameReference(BuildMI(MBB, MBBI, DL, get(SystemZ::STH))
.addReg(GR64Reg, RegState::Kill, SystemZ::subreg_l32),
FrameIdx);
return;
}

// Callers may expect a single instruction, so keep 128-bit moves
// together for now and lower them after register allocation.
unsigned LoadOpcode, StoreOpcode;
Expand All @@ -1012,8 +1035,31 @@ void SystemZInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI,
Register VReg) const {
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();

// There are no fp16 load/store instructions, so need to save/restore via
// GPR (TODO: Use VLEH in case of vector support).
if (RC == &SystemZ::FP16BitRegClass) {
assert(!MRI.isSSA() && MRI.getNumVirtRegs() &&
"Expected non-SSA form with virtual registers.");
Register GR64Reg = MRI.createVirtualRegister(&SystemZ::GR64BitRegClass);
Register FP64Reg = MRI.createVirtualRegister(&SystemZ::FP64BitRegClass);
addFrameReference(BuildMI(MBB, MBBI, DL, get(SystemZ::LH))
.addReg(GR64Reg, RegState::DefineNoRead,
SystemZ::subreg_l32),
FrameIdx);
BuildMI(MBB, MBBI, DL, get(SystemZ::SLLG), GR64Reg)
.addReg(GR64Reg)
.addReg(0)
.addImm(48);
BuildMI(MBB, MBBI, DL, get(SystemZ::LDGR), FP64Reg)
.addReg(GR64Reg, RegState::Kill);
BuildMI(MBB, MBBI, DL, get(SystemZ::COPY), DestReg)
.addReg(FP64Reg, RegState::Kill, SystemZ::subreg_h16);
return;
}

// Callers may expect a single instruction, so keep 128-bit moves
// together for now and lower them after register allocation.
unsigned LoadOpcode, StoreOpcode;
Expand Down
16 changes: 16 additions & 0 deletions llvm/test/CodeGen/SystemZ/atomic-load-10.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; Test fp16 atomic loads.
;
; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s

; A seq_cst atomic f16 load is emitted as an integer halfword load (lh),
; shifted to the top 16 bits of a GPR (sllg 48) and moved into the high
; half of an FPR via ldgr, per the checks below.
define half @f1(ptr %src) {
; CHECK-LABEL: f1:
; CHECK: # %bb.0:
; CHECK-NEXT: lh %r0, 0(%r2)
; CHECK-NEXT: sllg %r0, %r0, 48
; CHECK-NEXT: ldgr %f0, %r0
; CHECK-NEXT: # kill: def $f0h killed $f0h killed $f0d
; CHECK-NEXT: br %r14
%val = load atomic half, ptr %src seq_cst, align 2
ret half %val
}
17 changes: 17 additions & 0 deletions llvm/test/CodeGen/SystemZ/atomic-store-10.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; Test half atomic stores.
;
; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s

; A seq_cst atomic f16 store moves the value FPR->GPR (lgdr), shifts the
; halfword down from the top (srlg 48), stores it with sth, and is followed
; by a bcr 15, %r0 serialization, per the checks below.
define void @f1(ptr %src, half %val) {
; CHECK-LABEL: f1:
; CHECK: # %bb.0:
; CHECK-NEXT: # kill: def $f0h killed $f0h def $f0d
; CHECK-NEXT: lgdr %r0, %f0
; CHECK-NEXT: srlg %r0, %r0, 48
; CHECK-NEXT: sth %r0, 0(%r2)
; CHECK-NEXT: bcr 15, %r0
; CHECK-NEXT: br %r14
store atomic half %val, ptr %src seq_cst, align 2
ret void
}
}
Loading

0 comments on commit 26660a6

Please sign in to comment.