diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst
index 3c9078bcdf8118..976d08f0fafbba 100644
--- a/clang/docs/LanguageExtensions.rst
+++ b/clang/docs/LanguageExtensions.rst
@@ -869,6 +869,7 @@ to ``float``; see below for more information on this emulation.
  * SPIR (natively)
  * X86 (if SSE2 is available; natively if AVX512-FP16 is also available)
  * RISC-V (natively if Zfh or Zhinx is available)
+ * SystemZ (emulated)

* ``__bf16`` is supported on the following targets (currently never natively):
diff --git a/clang/lib/Basic/Targets/SystemZ.h b/clang/lib/Basic/Targets/SystemZ.h
index ef9a07033a6e4f..b4da2c9ce64754 100644
--- a/clang/lib/Basic/Targets/SystemZ.h
+++ b/clang/lib/Basic/Targets/SystemZ.h
@@ -91,11 +91,26 @@ class LLVM_LIBRARY_VISIBILITY SystemZTargetInfo : public TargetInfo {
                      "-v128:64-a:8:16-n32:64");
    }
    MaxAtomicPromoteWidth = MaxAtomicInlineWidth = 128;
+
+    // True if the backend supports operations on the half LLVM IR type.
+    // By setting this to false, conversions will happen for _Float16 around
+    // a statement by default, with operations done in float. However, if
+    // -ffloat16-excess-precision=none is given, no conversions will be made
+    // and instead the backend will promote each half operation to float
+    // individually.
+    HasLegalHalfType = false;
+    // Support _Float16.
+    HasFloat16 = true;
+    HasStrictFP = true;
  }

  unsigned getMinGlobalAlign(uint64_t Size, bool HasNonWeakDef) const override;

+  bool useFP16ConversionIntrinsics() const override {
+    return false;
+  }
+
  void getTargetDefines(const LangOptions &Opts,
                        MacroBuilder &Builder) const override;
diff --git a/clang/lib/CodeGen/Targets/SystemZ.cpp b/clang/lib/CodeGen/Targets/SystemZ.cpp
index 23c96fa5cf98cb..021d764dbfd063 100644
--- a/clang/lib/CodeGen/Targets/SystemZ.cpp
+++ b/clang/lib/CodeGen/Targets/SystemZ.cpp
@@ -185,6 +185,7 @@ bool SystemZABIInfo::isFPArgumentType(QualType Ty) const {
  if (const BuiltinType *BT = Ty->getAs<BuiltinType>())
    switch (BT->getKind()) {
+    case BuiltinType::Float16: // _Float16
    case BuiltinType::Float:
    case BuiltinType::Double:
      return true;
@@ -277,7 +278,8 @@ RValue SystemZABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr,
  } else {
    if (AI.getCoerceToType())
      ArgTy = AI.getCoerceToType();
-    InFPRs = (!IsSoftFloatABI && (ArgTy->isFloatTy() || ArgTy->isDoubleTy()));
+    InFPRs = (!IsSoftFloatABI &&
+              (ArgTy->isHalfTy() || ArgTy->isFloatTy() || ArgTy->isDoubleTy()));
    IsVector = ArgTy->isVectorTy();
    UnpaddedSize = TyInfo.Width;
    DirectAlign = TyInfo.Align;
@@ -446,10 +448,11 @@ ABIArgInfo SystemZABIInfo::classifyArgumentType(QualType Ty) const {
    // The structure is passed as an unextended integer, a float, or a double.
    if (isFPArgumentType(SingleElementTy)) {
-      assert(Size == 32 || Size == 64);
+      assert(Size == 16 || Size == 32 || Size == 64);
      return ABIArgInfo::getDirect(
-          Size == 32 ? llvm::Type::getFloatTy(getVMContext())
-                     : llvm::Type::getDoubleTy(getVMContext()));
+          Size == 16   ? llvm::Type::getHalfTy(getVMContext())
+          : Size == 32 ? llvm::Type::getFloatTy(getVMContext())
+                       : llvm::Type::getDoubleTy(getVMContext()));
    } else {
      llvm::IntegerType *PassTy = llvm::IntegerType::get(getVMContext(), Size);
      return Size <= 32 ?
ABIArgInfo::getNoExtend(PassTy) diff --git a/clang/test/CodeGen/SystemZ/Float16.c b/clang/test/CodeGen/SystemZ/Float16.c new file mode 100644 index 00000000000000..4444dbdcc23ca0 --- /dev/null +++ b/clang/test/CodeGen/SystemZ/Float16.c @@ -0,0 +1,85 @@ +// RUN: %clang_cc1 -triple s390x-linux-gnu \ +// RUN: -ffloat16-excess-precision=standard -emit-llvm -o - %s \ +// RUN: | FileCheck %s -check-prefix=STANDARD + +// RUN: %clang_cc1 -triple s390x-linux-gnu \ +// RUN: -ffloat16-excess-precision=none -emit-llvm -o - %s \ +// RUN: | FileCheck %s -check-prefix=NONE + +// RUN: %clang_cc1 -triple s390x-linux-gnu \ +// RUN: -ffloat16-excess-precision=fast -emit-llvm -o - %s \ +// RUN: | FileCheck %s -check-prefix=FAST + +_Float16 f(_Float16 a, _Float16 b, _Float16 c, _Float16 d) { + return a * b + c * d; +} + +// STANDARD-LABEL: define dso_local half @f(half noundef %a, half noundef %b, half noundef %c, half noundef %d) #0 { +// STANDARD-NEXT: entry: +// STANDARD-NEXT: %a.addr = alloca half, align 2 +// STANDARD-NEXT: %b.addr = alloca half, align 2 +// STANDARD-NEXT: %c.addr = alloca half, align 2 +// STANDARD-NEXT: %d.addr = alloca half, align 2 +// STANDARD-NEXT: store half %a, ptr %a.addr, align 2 +// STANDARD-NEXT: store half %b, ptr %b.addr, align 2 +// STANDARD-NEXT: store half %c, ptr %c.addr, align 2 +// STANDARD-NEXT: store half %d, ptr %d.addr, align 2 +// STANDARD-NEXT: %0 = load half, ptr %a.addr, align 2 +// STANDARD-NEXT: %ext = fpext half %0 to float +// STANDARD-NEXT: %1 = load half, ptr %b.addr, align 2 +// STANDARD-NEXT: %ext1 = fpext half %1 to float +// STANDARD-NEXT: %mul = fmul float %ext, %ext1 +// STANDARD-NEXT: %2 = load half, ptr %c.addr, align 2 +// STANDARD-NEXT: %ext2 = fpext half %2 to float +// STANDARD-NEXT: %3 = load half, ptr %d.addr, align 2 +// STANDARD-NEXT: %ext3 = fpext half %3 to float +// STANDARD-NEXT: %mul4 = fmul float %ext2, %ext3 +// STANDARD-NEXT: %add = fadd float %mul, %mul4 +// STANDARD-NEXT: %unpromotion = fptrunc float %add to half +// STANDARD-NEXT: ret half %unpromotion +// STANDARD-NEXT: } + +// NONE-LABEL: define dso_local half @f(half noundef %a, half noundef %b, half noundef %c, half noundef %d) #0 { +// NONE-NEXT: entry: +// NONE-NEXT: %a.addr = alloca half, align 2 +// NONE-NEXT: %b.addr = alloca half, align 2 +// NONE-NEXT: %c.addr = alloca half, align 2 +// NONE-NEXT: %d.addr = alloca half, align 2 +// NONE-NEXT: store half %a, ptr %a.addr, align 2 +// NONE-NEXT: store half %b, ptr %b.addr, align 2 +// NONE-NEXT: store half %c, ptr %c.addr, align 2 +// NONE-NEXT: store half %d, ptr %d.addr, align 2 +// NONE-NEXT: %0 = load half, ptr %a.addr, align 2 +// NONE-NEXT: %1 = load half, ptr %b.addr, align 2 +// NONE-NEXT: %mul = fmul half %0, %1 +// NONE-NEXT: %2 = load half, ptr %c.addr, align 2 +// NONE-NEXT: %3 = load half, ptr %d.addr, align 2 +// NONE-NEXT: %mul1 = fmul half %2, %3 +// NONE-NEXT: %add = fadd half %mul, %mul1 +// NONE-NEXT: ret half %add +// NONE-NEXT: } + +// FAST-LABEL: define dso_local half @f(half noundef %a, half noundef %b, half noundef %c, half noundef %d) #0 { +// FAST-NEXT: entry: +// FAST-NEXT: %a.addr = alloca half, align 2 +// FAST-NEXT: %b.addr = alloca half, align 2 +// FAST-NEXT: %c.addr = alloca half, align 2 +// FAST-NEXT: %d.addr = alloca half, align 2 +// FAST-NEXT: store half %a, ptr %a.addr, align 2 +// FAST-NEXT: store half %b, ptr %b.addr, align 2 +// FAST-NEXT: store half %c, ptr %c.addr, align 2 +// FAST-NEXT: store half %d, ptr %d.addr, align 2 +// FAST-NEXT: %0 = load half, ptr %a.addr, align 2 +// 
FAST-NEXT: %ext = fpext half %0 to float +// FAST-NEXT: %1 = load half, ptr %b.addr, align 2 +// FAST-NEXT: %ext1 = fpext half %1 to float +// FAST-NEXT: %mul = fmul float %ext, %ext1 +// FAST-NEXT: %2 = load half, ptr %c.addr, align 2 +// FAST-NEXT: %ext2 = fpext half %2 to float +// FAST-NEXT: %3 = load half, ptr %d.addr, align 2 +// FAST-NEXT: %ext3 = fpext half %3 to float +// FAST-NEXT: %mul4 = fmul float %ext2, %ext3 +// FAST-NEXT: %add = fadd float %mul, %mul4 +// FAST-NEXT: %unpromotion = fptrunc float %add to half +// FAST-NEXT: ret half %unpromotion +// FAST-NEXT: } diff --git a/clang/test/CodeGen/SystemZ/fp16.c b/clang/test/CodeGen/SystemZ/fp16.c new file mode 100644 index 00000000000000..430958b69a177b --- /dev/null +++ b/clang/test/CodeGen/SystemZ/fp16.c @@ -0,0 +1,39 @@ +// RUN: %clang_cc1 -triple s390x-linux-gnu -emit-llvm -o - %s \ +// RUN: | FileCheck %s + +void f(__fp16 *a, __fp16 *b, __fp16 *c, __fp16 *d, __fp16 *e) { + *e = (*a) * (*b) + (*c) * (*d); +} + +// CHECK-LABEL: define dso_local void @f(ptr noundef %a, ptr noundef %b, ptr noundef %c, ptr noundef %d, ptr noundef %e) #0 { +// CHECK-NEXT: entry: +// CHECK-NEXT: %a.addr = alloca ptr, align 8 +// CHECK-NEXT: %b.addr = alloca ptr, align 8 +// CHECK-NEXT: %c.addr = alloca ptr, align 8 +// CHECK-NEXT: %d.addr = alloca ptr, align 8 +// CHECK-NEXT: %e.addr = alloca ptr, align 8 +// CHECK-NEXT: store ptr %a, ptr %a.addr, align 8 +// CHECK-NEXT: store ptr %b, ptr %b.addr, align 8 +// CHECK-NEXT: store ptr %c, ptr %c.addr, align 8 +// CHECK-NEXT: store ptr %d, ptr %d.addr, align 8 +// CHECK-NEXT: store ptr %e, ptr %e.addr, align 8 +// CHECK-NEXT: %0 = load ptr, ptr %a.addr, align 8 +// CHECK-NEXT: %1 = load half, ptr %0, align 2 +// CHECK-NEXT: %conv = fpext half %1 to float +// CHECK-NEXT: %2 = load ptr, ptr %b.addr, align 8 +// CHECK-NEXT: %3 = load half, ptr %2, align 2 +// CHECK-NEXT: %conv1 = fpext half %3 to float +// CHECK-NEXT: %mul = fmul float %conv, %conv1 +// CHECK-NEXT: %4 = load ptr, ptr %c.addr, align 8 +// CHECK-NEXT: %5 = load half, ptr %4, align 2 +// CHECK-NEXT: %conv2 = fpext half %5 to float +// CHECK-NEXT: %6 = load ptr, ptr %d.addr, align 8 +// CHECK-NEXT: %7 = load half, ptr %6, align 2 +// CHECK-NEXT: %conv3 = fpext half %7 to float +// CHECK-NEXT: %mul4 = fmul float %conv2, %conv3 +// CHECK-NEXT: %add = fadd float %mul, %mul4 +// CHECK-NEXT: %8 = fptrunc float %add to half +// CHECK-NEXT: %9 = load ptr, ptr %e.addr, align 8 +// CHECK-NEXT: store half %8, ptr %9, align 2 +// CHECK-NEXT: ret void +// CHECK-NEXT: } diff --git a/clang/test/CodeGen/SystemZ/systemz-abi.c b/clang/test/CodeGen/SystemZ/systemz-abi.c index fd2b5d450cc643..2287126bdeabec 100644 --- a/clang/test/CodeGen/SystemZ/systemz-abi.c +++ b/clang/test/CodeGen/SystemZ/systemz-abi.c @@ -45,6 +45,9 @@ long long pass_longlong(long long arg) { return arg; } __int128 pass_int128(__int128 arg) { return arg; } // CHECK-LABEL: define{{.*}} void @pass_int128(ptr dead_on_unwind noalias writable sret(i128) align 8 %{{.*}}, ptr %0) +_Float16 pass__Float16(_Float16 arg) { return arg; } +// CHECK-LABEL: define{{.*}} half @pass__Float16(half %{{.*}}) + float pass_float(float arg) { return arg; } // CHECK-LABEL: define{{.*}} float @pass_float(float %{{.*}}) @@ -72,6 +75,9 @@ _Complex long pass_complex_long(_Complex long arg) { return arg; } _Complex long long pass_complex_longlong(_Complex long long arg) { return arg; } // CHECK-LABEL: define{{.*}} void @pass_complex_longlong(ptr dead_on_unwind noalias writable sret({ i64, i64 }) align 8 %{{.*}}, ptr 
%{{.*}}arg) +_Complex _Float16 pass_complex__Float16(_Complex _Float16 arg) { return arg; } +// CHECK-LABEL: define{{.*}} void @pass_complex__Float16(ptr dead_on_unwind noalias writable sret({ half, half }) align 2 %{{.*}}, ptr %{{.*}}arg) + _Complex float pass_complex_float(_Complex float arg) { return arg; } // CHECK-LABEL: define{{.*}} void @pass_complex_float(ptr dead_on_unwind noalias writable sret({ float, float }) align 4 %{{.*}}, ptr %{{.*}}arg) @@ -123,6 +129,11 @@ struct agg_16byte pass_agg_16byte(struct agg_16byte arg) { return arg; } // Float-like aggregate types +struct agg__Float16 { _Float16 a; }; +struct agg__Float16 pass_agg__Float16(struct agg__Float16 arg) { return arg; } +// HARD-FLOAT-LABEL: define{{.*}} void @pass_agg__Float16(ptr dead_on_unwind noalias writable sret(%struct.agg__Float16) align 2 %{{.*}}, half %{{.*}}) +// SOFT-FLOAT-LABEL: define{{.*}} void @pass_agg__Float16(ptr dead_on_unwind noalias writable sret(%struct.agg__Float16) align 2 %{{.*}}, i16 noext %{{.*}}) + struct agg_float { float a; }; struct agg_float pass_agg_float(struct agg_float arg) { return arg; } // HARD-FLOAT-LABEL: define{{.*}} void @pass_agg_float(ptr dead_on_unwind noalias writable sret(%struct.agg_float) align 4 %{{.*}}, float %{{.*}}) @@ -137,6 +148,11 @@ struct agg_longdouble { long double a; }; struct agg_longdouble pass_agg_longdouble(struct agg_longdouble arg) { return arg; } // CHECK-LABEL: define{{.*}} void @pass_agg_longdouble(ptr dead_on_unwind noalias writable sret(%struct.agg_longdouble) align 8 %{{.*}}, ptr %{{.*}}) +struct agg__Float16_a8 { _Float16 a __attribute__((aligned (8))); }; +struct agg__Float16_a8 pass_agg__Float16_a8(struct agg__Float16_a8 arg) { return arg; } +// HARD-FLOAT-LABEL: define{{.*}} void @pass_agg__Float16_a8(ptr dead_on_unwind noalias writable sret(%struct.agg__Float16_a8) align 8 %{{.*}}, double %{{.*}}) +// SOFT-FLOAT-LABEL: define{{.*}} void @pass_agg__Float16_a8(ptr dead_on_unwind noalias writable sret(%struct.agg__Float16_a8) align 8 %{{.*}}, i64 %{{.*}}) + struct agg_float_a8 { float a __attribute__((aligned (8))); }; struct agg_float_a8 pass_agg_float_a8(struct agg_float_a8 arg) { return arg; } // HARD-FLOAT-LABEL: define{{.*}} void @pass_agg_float_a8(ptr dead_on_unwind noalias writable sret(%struct.agg_float_a8) align 8 %{{.*}}, double %{{.*}}) @@ -164,6 +180,10 @@ struct agg_nofloat3 pass_agg_nofloat3(struct agg_nofloat3 arg) { return arg; } // Union types likewise are *not* float-like aggregate types +union union__Float16 { _Float16 a; }; +union union__Float16 pass_union__Float16(union union__Float16 arg) { return arg; } +// CHECK-LABEL: define{{.*}} void @pass_union__Float16(ptr dead_on_unwind noalias writable sret(%union.union__Float16) align 2 %{{.*}}, i16 noext %{{.*}}) + union union_float { float a; }; union union_float pass_union_float(union union_float arg) { return arg; } // CHECK-LABEL: define{{.*}} void @pass_union_float(ptr dead_on_unwind noalias writable sret(%union.union_float) align 4 %{{.*}}, i32 noext %{{.*}}) @@ -441,6 +461,30 @@ struct agg_8byte va_agg_8byte(__builtin_va_list l) { return __builtin_va_arg(l, // CHECK: [[VA_ARG_ADDR:%[^ ]+]] = phi ptr [ [[RAW_REG_ADDR]], %{{.*}} ], [ [[RAW_MEM_ADDR]], %{{.*}} ] // CHECK: ret void +struct agg__Float16 va_agg__Float16(__builtin_va_list l) { return __builtin_va_arg(l, struct agg__Float16); } +// CHECK-LABEL: define{{.*}} void @va_agg__Float16(ptr dead_on_unwind noalias writable sret(%struct.agg__Float16) align 2 %{{.*}}, ptr %{{.*}} +// HARD-FLOAT: [[REG_COUNT_PTR:%[^ ]+]] = 
getelementptr inbounds nuw %struct.__va_list_tag, ptr %{{.*}}, i32 0, i32 1 +// SOFT-FLOAT: [[REG_COUNT_PTR:%[^ ]+]] = getelementptr inbounds nuw %struct.__va_list_tag, ptr %{{.*}}, i32 0, i32 0 +// CHECK: [[REG_COUNT:%[^ ]+]] = load i64, ptr [[REG_COUNT_PTR]] +// HARD-FLOAT: [[FITS_IN_REGS:%[^ ]+]] = icmp ult i64 [[REG_COUNT]], 4 +// SOFT-FLOAT: [[FITS_IN_REGS:%[^ ]+]] = icmp ult i64 [[REG_COUNT]], 5 +// CHECK: br i1 [[FITS_IN_REGS]], +// CHECK: [[SCALED_REG_COUNT:%[^ ]+]] = mul i64 [[REG_COUNT]], 8 +// HARD-FLOAT: [[REG_OFFSET:%[^ ]+]] = add i64 [[SCALED_REG_COUNT]], 128 +// SOFT-FLOAT: [[REG_OFFSET:%[^ ]+]] = add i64 [[SCALED_REG_COUNT]], 22 +// CHECK: [[REG_SAVE_AREA_PTR:%[^ ]+]] = getelementptr inbounds nuw %struct.__va_list_tag, ptr %{{.*}}, i32 0, i32 3 +// CHECK: [[REG_SAVE_AREA:%[^ ]+]] = load ptr, ptr [[REG_SAVE_AREA_PTR:[^ ]+]] +// CHECK: [[RAW_REG_ADDR:%[^ ]+]] = getelementptr i8, ptr [[REG_SAVE_AREA]], i64 [[REG_OFFSET]] +// CHECK: [[REG_COUNT1:%[^ ]+]] = add i64 [[REG_COUNT]], 1 +// CHECK: store i64 [[REG_COUNT1]], ptr [[REG_COUNT_PTR]] +// CHECK: [[OVERFLOW_ARG_AREA_PTR:%[^ ]+]] = getelementptr inbounds nuw %struct.__va_list_tag, ptr %{{.*}}, i32 0, i32 2 +// CHECK: [[OVERFLOW_ARG_AREA:%[^ ]+]] = load ptr, ptr [[OVERFLOW_ARG_AREA_PTR]] +// CHECK: [[RAW_MEM_ADDR:%[^ ]+]] = getelementptr i8, ptr [[OVERFLOW_ARG_AREA]], i64 6 +// CHECK: [[OVERFLOW_ARG_AREA2:%[^ ]+]] = getelementptr i8, ptr [[OVERFLOW_ARG_AREA]], i64 8 +// CHECK: store ptr [[OVERFLOW_ARG_AREA2]], ptr [[OVERFLOW_ARG_AREA_PTR]] +// CHECK: [[VA_ARG_ADDR:%[^ ]+]] = phi ptr [ [[RAW_REG_ADDR]], %{{.*}} ], [ [[RAW_MEM_ADDR]], %{{.*}} ] +// CHECK: ret void + struct agg_float va_agg_float(__builtin_va_list l) { return __builtin_va_arg(l, struct agg_float); } // CHECK-LABEL: define{{.*}} void @va_agg_float(ptr dead_on_unwind noalias writable sret(%struct.agg_float) align 4 %{{.*}}, ptr %{{.*}} // HARD-FLOAT: [[REG_COUNT_PTR:%[^ ]+]] = getelementptr inbounds nuw %struct.__va_list_tag, ptr %{{.*}}, i32 0, i32 1 diff --git a/compiler-rt/test/builtins/CMakeLists.txt b/compiler-rt/test/builtins/CMakeLists.txt index 8fdcec6029a2a1..63f4c94605c907 100644 --- a/compiler-rt/test/builtins/CMakeLists.txt +++ b/compiler-rt/test/builtins/CMakeLists.txt @@ -56,7 +56,7 @@ foreach(arch ${BUILTIN_TEST_ARCH}) string(REPLACE ";" " " BUILTINS_TEST_TARGET_CFLAGS "${BUILTINS_TEST_TARGET_CFLAGS}") endif() else() - if (${arch} MATCHES "arm|armhf|aarch64|arm64|i?86|x86_64|AMD64|riscv32|riscv64" AND COMPILER_RT_HAS_${arch}_FLOAT16) + if (${arch} MATCHES "arm|armhf|aarch64|arm64|i?86|x86_64|AMD64|riscv32|riscv64|s390x" AND COMPILER_RT_HAS_${arch}_FLOAT16) list(APPEND BUILTINS_TEST_TARGET_CFLAGS -DCOMPILER_RT_HAS_FLOAT16) string(REPLACE ";" " " BUILTINS_TEST_TARGET_CFLAGS "${BUILTINS_TEST_TARGET_CFLAGS}") endif() diff --git a/llvm/lib/IR/RuntimeLibcalls.cpp b/llvm/lib/IR/RuntimeLibcalls.cpp index e38fce764b6403..7004da809d9499 100644 --- a/llvm/lib/IR/RuntimeLibcalls.cpp +++ b/llvm/lib/IR/RuntimeLibcalls.cpp @@ -255,4 +255,9 @@ void RuntimeLibcallsInfo::initLibcalls(const Triple &TT) { } setLibcallName(RTLIB::MULO_I128, nullptr); } + + if (TT.isSystemZ()) { + setLibcallName(RTLIB::FPROUND_F32_F16, "__truncsfhf2"); + setLibcallName(RTLIB::FPEXT_F16_F32, "__extendhfsf2"); + } } diff --git a/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp b/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp index e4aefc42d860f2..7f528918850261 100644 --- a/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp +++ 
b/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp @@ -60,9 +60,11 @@ enum RegisterKind { GRH32Reg, GR64Reg, GR128Reg, + FP16Reg, FP32Reg, FP64Reg, FP128Reg, + VR16Reg, VR32Reg, VR64Reg, VR128Reg, @@ -356,9 +358,11 @@ class SystemZOperand : public MCParsedAsmOperand { bool isADDR32() const { return isReg(GR32Reg); } bool isADDR64() const { return isReg(GR64Reg); } bool isADDR128() const { return false; } + bool isFP16() const { return isReg(FP16Reg); } bool isFP32() const { return isReg(FP32Reg); } bool isFP64() const { return isReg(FP64Reg); } bool isFP128() const { return isReg(FP128Reg); } + bool isVR16() const { return isReg(VR16Reg); } bool isVR32() const { return isReg(VR32Reg); } bool isVR64() const { return isReg(VR64Reg); } bool isVF128() const { return false; } @@ -534,6 +538,9 @@ class SystemZAsmParser : public MCTargetAsmParser { ParseStatus parseADDR128(OperandVector &Operands) { llvm_unreachable("Shouldn't be used as an operand"); } + ParseStatus parseFP16(OperandVector &Operands) { + return parseRegister(Operands, FP16Reg); + } ParseStatus parseFP32(OperandVector &Operands) { return parseRegister(Operands, FP32Reg); } @@ -543,6 +550,9 @@ class SystemZAsmParser : public MCTargetAsmParser { ParseStatus parseFP128(OperandVector &Operands) { return parseRegister(Operands, FP128Reg); } + ParseStatus parseVR16(OperandVector &Operands) { + return parseRegister(Operands, VR16Reg); + } ParseStatus parseVR32(OperandVector &Operands) { return parseRegister(Operands, VR32Reg); } @@ -829,11 +839,13 @@ ParseStatus SystemZAsmParser::parseRegister(OperandVector &Operands, case GR128Reg: Group = RegGR; break; + case FP16Reg: case FP32Reg: case FP64Reg: case FP128Reg: Group = RegFP; break; + case VR16Reg: case VR32Reg: case VR64Reg: case VR128Reg: @@ -882,9 +894,11 @@ ParseStatus SystemZAsmParser::parseRegister(OperandVector &Operands, case GRH32Reg: Regs = SystemZMC::GRH32Regs; break; case GR64Reg: Regs = SystemZMC::GR64Regs; break; case GR128Reg: Regs = SystemZMC::GR128Regs; break; + case FP16Reg: Regs = SystemZMC::FP16Regs; break; case FP32Reg: Regs = SystemZMC::FP32Regs; break; case FP64Reg: Regs = SystemZMC::FP64Regs; break; case FP128Reg: Regs = SystemZMC::FP128Regs; break; + case VR16Reg: Regs = SystemZMC::VR16Regs; break; case VR32Reg: Regs = SystemZMC::VR32Regs; break; case VR64Reg: Regs = SystemZMC::VR64Regs; break; case VR128Reg: Regs = SystemZMC::VR128Regs; break; diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp index 333221c46ebb8b..291b6789c78f69 100644 --- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp +++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp @@ -61,6 +61,13 @@ const unsigned SystemZMC::GR128Regs[16] = { SystemZ::R12Q, 0, SystemZ::R14Q, 0 }; +const unsigned SystemZMC::FP16Regs[16] = { + SystemZ::F0H, SystemZ::F1H, SystemZ::F2H, SystemZ::F3H, + SystemZ::F4H, SystemZ::F5H, SystemZ::F6H, SystemZ::F7H, + SystemZ::F8H, SystemZ::F9H, SystemZ::F10H, SystemZ::F11H, + SystemZ::F12H, SystemZ::F13H, SystemZ::F14H, SystemZ::F15H +}; + const unsigned SystemZMC::FP32Regs[16] = { SystemZ::F0S, SystemZ::F1S, SystemZ::F2S, SystemZ::F3S, SystemZ::F4S, SystemZ::F5S, SystemZ::F6S, SystemZ::F7S, @@ -82,6 +89,17 @@ const unsigned SystemZMC::FP128Regs[16] = { SystemZ::F12Q, SystemZ::F13Q, 0, 0 }; +const unsigned SystemZMC::VR16Regs[32] = { + SystemZ::F0H, SystemZ::F1H, SystemZ::F2H, SystemZ::F3H, + SystemZ::F4H, SystemZ::F5H, SystemZ::F6H, SystemZ::F7H, + 
SystemZ::F8H, SystemZ::F9H, SystemZ::F10H, SystemZ::F11H, + SystemZ::F12H, SystemZ::F13H, SystemZ::F14H, SystemZ::F15H, + SystemZ::F16H, SystemZ::F17H, SystemZ::F18H, SystemZ::F19H, + SystemZ::F20H, SystemZ::F21H, SystemZ::F22H, SystemZ::F23H, + SystemZ::F24H, SystemZ::F25H, SystemZ::F26H, SystemZ::F27H, + SystemZ::F28H, SystemZ::F29H, SystemZ::F30H, SystemZ::F31H +}; + const unsigned SystemZMC::VR32Regs[32] = { SystemZ::F0S, SystemZ::F1S, SystemZ::F2S, SystemZ::F3S, SystemZ::F4S, SystemZ::F5S, SystemZ::F6S, SystemZ::F7S, @@ -142,6 +160,7 @@ unsigned SystemZMC::getFirstReg(unsigned Reg) { Map[AR32Regs[I]] = I; } for (unsigned I = 0; I < 32; ++I) { + Map[VR16Regs[I]] = I; Map[VR32Regs[I]] = I; Map[VR64Regs[I]] = I; Map[VR128Regs[I]] = I; diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h index 39c1836a137005..1db1b4b9da0022 100644 --- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h +++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h @@ -43,9 +43,11 @@ extern const unsigned GR32Regs[16]; extern const unsigned GRH32Regs[16]; extern const unsigned GR64Regs[16]; extern const unsigned GR128Regs[16]; +extern const unsigned FP16Regs[16]; extern const unsigned FP32Regs[16]; extern const unsigned FP64Regs[16]; extern const unsigned FP128Regs[16]; +extern const unsigned VR16Regs[32]; extern const unsigned VR32Regs[32]; extern const unsigned VR64Regs[32]; extern const unsigned VR128Regs[32]; diff --git a/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp b/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp index 59154431877a88..8d4dc97f516824 100644 --- a/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp +++ b/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp @@ -546,6 +546,10 @@ void SystemZAsmPrinter::emitInstruction(const MachineInstr *MI) { lowerAlignmentHint(MI, LoweredMI, SystemZ::VSTMAlign); break; + case SystemZ::VL16: + LoweredMI = lowerSubvectorLoad(MI, SystemZ::VLREPH); + break; + case SystemZ::VL32: LoweredMI = lowerSubvectorLoad(MI, SystemZ::VLREPF); break; @@ -554,6 +558,10 @@ void SystemZAsmPrinter::emitInstruction(const MachineInstr *MI) { LoweredMI = lowerSubvectorLoad(MI, SystemZ::VLREPG); break; + case SystemZ::VST16: + LoweredMI = lowerSubvectorStore(MI, SystemZ::VSTEH); + break; + case SystemZ::VST32: LoweredMI = lowerSubvectorStore(MI, SystemZ::VSTEF); break; diff --git a/llvm/lib/Target/SystemZ/SystemZCallingConv.td b/llvm/lib/Target/SystemZ/SystemZCallingConv.td index 99bb697ce20142..0ad872bcb63a74 100644 --- a/llvm/lib/Target/SystemZ/SystemZCallingConv.td +++ b/llvm/lib/Target/SystemZ/SystemZCallingConv.td @@ -50,6 +50,7 @@ def RetCC_SystemZ_ELF : CallingConv<[ // other floating-point argument registers available for code that // doesn't care about the ABI. All floating-point argument registers // are call-clobbered, so we can use all of them here. + CCIfType<[f16], CCAssignToReg<[F0H, F2H, F4H, F6H]>>, CCIfType<[f32], CCAssignToReg<[F0S, F2S, F4S, F6S]>>, CCIfType<[f64], CCAssignToReg<[F0D, F2D, F4D, F6D]>>, @@ -115,6 +116,7 @@ def CC_SystemZ_ELF : CallingConv<[ CCIfType<[i64], CCAssignToReg<[R2D, R3D, R4D, R5D, R6D]>>, // The first 4 float and double arguments are passed in even registers F0-F6. 
+ CCIfType<[f16], CCAssignToReg<[F0H, F2H, F4H, F6H]>>, CCIfType<[f32], CCAssignToReg<[F0S, F2S, F4S, F6S]>>, CCIfType<[f64], CCAssignToReg<[F0D, F2D, F4D, F6D]>>, @@ -138,7 +140,7 @@ def CC_SystemZ_ELF : CallingConv<[ CCAssignToStack<16, 8>>>, // Other arguments are passed in 8-byte-aligned 8-byte stack slots. - CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>> + CCIfType<[i32, i64, f16, f32, f64], CCAssignToStack<8, 8>> ]>; //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp b/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp index 403d238aa5b528..e1ba46b08d0db9 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp +++ b/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp @@ -1194,9 +1194,10 @@ void SystemZDAGToDAGISel::loadVectorConstant( SDValue BitCast = CurDAG->getNode(ISD::BITCAST, DL, VT, Op); ReplaceNode(Node, BitCast.getNode()); SelectCode(BitCast.getNode()); - } else { // float or double - unsigned SubRegIdx = - (VT.getSizeInBits() == 32 ? SystemZ::subreg_h32 : SystemZ::subreg_h64); + } else { // half, float or double + unsigned SubRegIdx = (VT.getSizeInBits() == 16 ? SystemZ::subreg_h16 + : VT.getSizeInBits() == 32 ? SystemZ::subreg_h32 + : SystemZ::subreg_h64); ReplaceNode( Node, CurDAG->getTargetExtractSubreg(SubRegIdx, DL, VT, Op).getNode()); } diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp index 8f505b7e198cfa..fb159236ec5c2b 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -103,9 +103,11 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM, addRegisterClass(MVT::i64, &SystemZ::GR64BitRegClass); if (!useSoftFloat()) { if (Subtarget.hasVector()) { + addRegisterClass(MVT::f16, &SystemZ::VR16BitRegClass); addRegisterClass(MVT::f32, &SystemZ::VR32BitRegClass); addRegisterClass(MVT::f64, &SystemZ::VR64BitRegClass); } else { + addRegisterClass(MVT::f16, &SystemZ::FP16BitRegClass); addRegisterClass(MVT::f32, &SystemZ::FP32BitRegClass); addRegisterClass(MVT::f64, &SystemZ::FP64BitRegClass); } @@ -513,11 +515,24 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM, } // Handle floating-point types. + // Promote all f16 operations to float, with some exceptions below. + for (unsigned Opc = 0; Opc < ISD::BUILTIN_OP_END; ++Opc) + setOperationAction(Opc, MVT::f16, Promote); + setOperationAction(ISD::ConstantFP, MVT::f16, Expand); + for (MVT VT : {MVT::f32, MVT::f64, MVT::f128}) { + setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand); + setTruncStoreAction(VT, MVT::f16, Expand); + } + for (auto Op : {ISD::LOAD, ISD::ATOMIC_LOAD, ISD::STORE, ISD::ATOMIC_STORE}) + setOperationAction(Op, MVT::f16, Subtarget.hasVector() ? Legal : Custom); + setOperationAction(ISD::FP_ROUND, MVT::f16, LibCall); + setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, LibCall); + for (unsigned I = MVT::FIRST_FP_VALUETYPE; I <= MVT::LAST_FP_VALUETYPE; ++I) { MVT VT = MVT::SimpleValueType(I); - if (isTypeLegal(VT)) { + if (isTypeLegal(VT) && VT != MVT::f16) { // We can use FI for FRINT. 
      setOperationAction(ISD::FRINT, VT, Legal);
@@ -549,7 +564,6 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
      setOperationAction(ISD::STRICT_FSQRT, VT, Legal);
      setOperationAction(ISD::STRICT_FRINT, VT, Legal);
      setOperationAction(ISD::STRICT_FP_ROUND, VT, Legal);
-      setOperationAction(ISD::STRICT_FP_EXTEND, VT, Legal);
      if (Subtarget.hasFPExtension()) {
        setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
        setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
@@ -557,6 +571,10 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
        setOperationAction(ISD::STRICT_FROUND, VT, Legal);
        setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
      }
+
+      // Extension from f16 needs libcall.
+      setOperationAction(ISD::FP_EXTEND, VT, Custom);
+      setOperationAction(ISD::STRICT_FP_EXTEND, VT, Custom);
    }
  }
@@ -766,6 +784,9 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
  // Default to having -disable-strictnode-mutation on
  IsStrictFPEnabled = true;
+  setLibcallName(RTLIB::FPROUND_F32_F16, "__truncsfhf2");
+  setLibcallName(RTLIB::FPEXT_F16_F32, "__extendhfsf2");
+
  if (Subtarget.isTargetzOS()) {
    struct RTLibCallMapping {
      RTLIB::Libcall Code;
@@ -1656,6 +1677,10 @@ SDValue SystemZTargetLowering::LowerFormalArguments(
        NumFixedGPRs += 1;
        RC = &SystemZ::GR64BitRegClass;
        break;
+      case MVT::f16:
+        NumFixedFPRs += 1;
+        RC = &SystemZ::FP16BitRegClass;
+        break;
      case MVT::f32:
        NumFixedFPRs += 1;
        RC = &SystemZ::FP32BitRegClass;
        break;
@@ -1700,9 +1725,12 @@ SDValue SystemZTargetLowering::LowerFormalArguments(
      // from this parameter. Unpromoted ints and floats are
      // passed as right-justified 8-byte values.
      SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
-      if (VA.getLocVT() == MVT::i32 || VA.getLocVT() == MVT::f32)
+      if (VA.getLocVT() == MVT::i32 || VA.getLocVT() == MVT::f32 ||
+          VA.getLocVT() == MVT::f16) {
+        unsigned SlotOffs = VA.getLocVT() == MVT::f16 ? 6 : 4;
        FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN,
-                          DAG.getIntPtrConstant(4, DL));
+                          DAG.getIntPtrConstant(SlotOffs, DL));
+      }
      ArgValue = DAG.getLoad(LocVT, DL, Chain, FIN,
                             MachinePointerInfo::getFixedStack(MF, FI));
    }
@@ -2015,6 +2043,8 @@ SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI,
                         VA.getLocMemOffset();
      if (VA.getLocVT() == MVT::i32 || VA.getLocVT() == MVT::f32)
        Offset += 4;
+      else if (VA.getLocVT() == MVT::f16)
+        Offset += 6;
      SDValue Address = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr,
                                    DAG.getIntPtrConstant(Offset, DL));
@@ -4562,6 +4592,22 @@ SDValue SystemZTargetLowering::lowerATOMIC_FENCE(SDValue Op,
  return DAG.getNode(ISD::MEMBARRIER, DL, MVT::Other, Op.getOperand(0));
}

+SDValue SystemZTargetLowering::lowerATOMIC_LOAD(SDValue Op,
+                                                SelectionDAG &DAG) const {
+  MVT RegVT = Op.getSimpleValueType();
+  if (RegVT.getSizeInBits() == 128)
+    return lowerATOMIC_LDST_I128(Op, DAG);
+  return lowerLoadF16(Op, DAG);
+}
+
+SDValue SystemZTargetLowering::lowerATOMIC_STORE(SDValue Op,
+                                                 SelectionDAG &DAG) const {
+  auto *Node = cast<AtomicSDNode>(Op.getNode());
+  if (Node->getMemoryVT().getSizeInBits() == 128)
+    return lowerATOMIC_LDST_I128(Op, DAG);
+  return lowerStoreF16(Op, DAG);
+}
+
SDValue SystemZTargetLowering::lowerATOMIC_LDST_I128(SDValue Op,
                                                     SelectionDAG &DAG) const {
  auto *Node = cast<AtomicSDNode>(Op.getNode());
@@ -6109,6 +6155,69 @@ static SDValue lowerAddrSpaceCast(SDValue Op, SelectionDAG &DAG) {
  return Op;
}

+SDValue SystemZTargetLowering::lowerFP_EXTEND(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  SDValue In = Op.getOperand(Op->isStrictFPOpcode() ? 1 : 0);
+  if (In.getSimpleValueType() != MVT::f16)
+    return Op; // Legal
+  return SDValue(); // Let legalizer emit the libcall.
+}
+
+SDValue SystemZTargetLowering::lowerLoadF16(SDValue Op,
+                                            SelectionDAG &DAG) const {
+  MVT RegVT = Op.getSimpleValueType();
+  assert(RegVT == MVT::f16 && "Expected to lower an f16 load.");
+
+  SDLoc DL(Op);
+  SDValue NewLd;
+  if (auto *AtomicLd = dyn_cast<AtomicSDNode>(Op.getNode())) {
+    assert(EVT(RegVT) == AtomicLd->getMemoryVT() && "Unhandled f16 load");
+    NewLd = DAG.getAtomic(ISD::ATOMIC_LOAD, DL, MVT::i16, MVT::i32,
+                          AtomicLd->getChain(), AtomicLd->getBasePtr(),
+                          AtomicLd->getMemOperand());
+    cast<AtomicSDNode>(NewLd)->setExtensionType(ISD::EXTLOAD);
+  } else {
+    LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
+    assert(EVT(RegVT) == Ld->getMemoryVT() && "Unhandled f16 load");
+    NewLd = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Ld->getChain(),
+                           Ld->getBasePtr(), Ld->getPointerInfo(),
+                           MVT::i16, Ld->getOriginalAlign(),
+                           Ld->getMemOperand()->getFlags());
+  }
+  // Load as integer, shift and then insert into upper 2 bytes of the FP
+  // register.
+  SDValue Shft = DAG.getNode(ISD::SHL, DL, MVT::i32, NewLd,
+                             DAG.getConstant(16, DL, MVT::i32));
+  SDValue BCast = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Shft);
+  SDValue F16Val = DAG.getTargetExtractSubreg(SystemZ::subreg_h16,
+                                              DL, MVT::f16, BCast);
+  return DAG.getMergeValues({F16Val, NewLd.getValue(1)}, DL);
+}
+
+SDValue SystemZTargetLowering::lowerStoreF16(SDValue Op,
+                                             SelectionDAG &DAG) const {
+  SDValue StoredVal = Op->getOperand(1);
+  MVT StoreVT = StoredVal.getSimpleValueType();
+  assert(StoreVT == MVT::f16 && "Expected to lower an f16 store.");
+
+  // Move into a GPR, shift and store the 2 bytes.
+  SDLoc DL(Op);
+  SDNode *U32 = DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::f32);
+  SDValue In32 = DAG.getTargetInsertSubreg(SystemZ::subreg_h16, DL,
+                                           MVT::f32, SDValue(U32, 0), StoredVal);
+  SDValue BCast = DAG.getNode(ISD::BITCAST, DL, MVT::i32, In32);
+  SDValue Shft = DAG.getNode(ISD::SRL, DL, MVT::i32, BCast,
+                             DAG.getConstant(16, DL, MVT::i32));
+
+  if (auto *AtomicSt = dyn_cast<AtomicSDNode>(Op.getNode()))
+    return DAG.getAtomic(ISD::ATOMIC_STORE, DL, MVT::i16, AtomicSt->getChain(),
+                         Shft, AtomicSt->getBasePtr(), AtomicSt->getMemOperand());
+
+  StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
+  return DAG.getTruncStore(St->getChain(), DL, Shft, St->getBasePtr(),
+                           MVT::i16, St->getMemOperand());
+}
+
SDValue SystemZTargetLowering::lowerIS_FPCLASS(SDValue Op,
                                               SelectionDAG &DAG) const {
  SDLoc DL(Op);
@@ -6228,8 +6337,9 @@ SDValue SystemZTargetLowering::LowerOperation(SDValue Op,
  case ISD::ATOMIC_SWAP:
    return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_SWAPW);
  case ISD::ATOMIC_STORE:
+    return lowerATOMIC_STORE(Op, DAG);
  case ISD::ATOMIC_LOAD:
-    return lowerATOMIC_LDST_I128(Op, DAG);
+    return lowerATOMIC_LOAD(Op, DAG);
  case ISD::ATOMIC_LOAD_ADD:
    return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_ADD);
  case ISD::ATOMIC_LOAD_SUB:
@@ -6286,6 +6396,13 @@ SDValue SystemZTargetLowering::LowerOperation(SDValue Op,
    return lowerAddrSpaceCast(Op, DAG);
  case ISD::ROTL:
    return lowerShift(Op, DAG, SystemZISD::VROTL_BY_SCALAR);
+  case ISD::FP_EXTEND:
+  case ISD::STRICT_FP_EXTEND:
+    return lowerFP_EXTEND(Op, DAG);
+  case ISD::LOAD:
+    return lowerLoadF16(Op, DAG);
+  case ISD::STORE:
+    return lowerStoreF16(Op, DAG);
  case ISD::IS_FPCLASS:
    return lowerIS_FPCLASS(Op, DAG);
  case ISD::GET_ROUNDING:
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
index 3c06c1fdf2b1bc..3f54563039a9ae 100644
---
a/llvm/lib/Target/SystemZ/SystemZISelLowering.h +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.h @@ -698,6 +698,8 @@ class SystemZTargetLowering : public TargetLowering { SDValue lowerCTPOP(SDValue Op, SelectionDAG &DAG) const; SDValue lowerVECREDUCE_ADD(SDValue Op, SelectionDAG &DAG) const; SDValue lowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerATOMIC_LOAD(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) const; SDValue lowerATOMIC_LDST_I128(SDValue Op, SelectionDAG &DAG) const; SDValue lowerATOMIC_LOAD_OP(SDValue Op, SelectionDAG &DAG, unsigned Opcode) const; @@ -719,6 +721,10 @@ class SystemZTargetLowering : public TargetLowering { SDValue lowerSIGN_EXTEND_VECTOR_INREG(SDValue Op, SelectionDAG &DAG) const; SDValue lowerZERO_EXTEND_VECTOR_INREG(SDValue Op, SelectionDAG &DAG) const; SDValue lowerShift(SDValue Op, SelectionDAG &DAG, unsigned ByScalar) const; + SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerLoadF16(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerStoreF16(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerIS_FPCLASS(SDValue Op, SelectionDAG &DAG) const; SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const; SDValue lowerREADCYCLECOUNTER(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/SystemZ/SystemZInstrFP.td b/llvm/lib/Target/SystemZ/SystemZInstrFP.td index aad04a2b4159cb..5b4b73d586a796 100644 --- a/llvm/lib/Target/SystemZ/SystemZInstrFP.td +++ b/llvm/lib/Target/SystemZ/SystemZInstrFP.td @@ -36,6 +36,8 @@ defm CondStoreF64 : CondStores; def LZER : InherentRRE<"lzer", 0xB374, FP32, fpimm0>; def LZDR : InherentRRE<"lzdr", 0xB375, FP64, fpimm0>; def LZXR : InherentRRE<"lzxr", 0xB376, FP128, fpimm0>; @@ -47,8 +49,11 @@ def LDR : UnaryRR <"ldr", 0x28, null_frag, FP64, FP64>; def LXR : UnaryRRE<"lxr", 0xB365, null_frag, FP128, FP128>; // For z13 we prefer LDR over LER to avoid partial register dependencies. -let isCodeGenOnly = 1 in - def LDR32 : UnaryRR<"ldr", 0x28, null_frag, FP32, FP32>; +let isCodeGenOnly = 1 in { + def LER16 : UnaryRR <"ler", 0x38, null_frag, FP16, FP16>; + def LDR16 : UnaryRR<"ldr", 0x28, null_frag, FP16, FP16>; + def LDR32 : UnaryRR<"ldr", 0x28, null_frag, FP32, FP32>; +} // Moves between two floating-point registers that also set the condition // codes. Note that these instructions will turn SNaNs into QNaNs and should @@ -331,8 +336,10 @@ let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0xF in { } // Generic form, which does not set CC. def LCDFR : UnaryRRE<"lcdfr", 0xB373, fneg, FP64, FP64>; -let isCodeGenOnly = 1 in +let isCodeGenOnly = 1 in { + def LCDFR_16 : UnaryRRE<"lcdfr", 0xB373, fneg, FP16, FP16>; def LCDFR_32 : UnaryRRE<"lcdfr", 0xB373, fneg, FP32, FP32>; +} // Absolute value (Load Positive). 
let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0xF in { @@ -592,6 +599,7 @@ let hasSideEffects = 1 in { // Peepholes //===----------------------------------------------------------------------===// +def : Pat<(f16 fpimmneg0), (LCDFR_16 (LZER_16))>; def : Pat<(f32 fpimmneg0), (LCDFR_32 (LZER))>; def : Pat<(f64 fpimmneg0), (LCDFR (LZDR))>; def : Pat<(f128 fpimmneg0), (LCXBR (LZXR))>; diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp index d553c72589f599..470543824dc5d0 100644 --- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp +++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp @@ -968,6 +968,8 @@ void SystemZInstrInfo::copyPhysReg(MachineBasicBlock &MBB, unsigned Opcode; if (SystemZ::GR64BitRegClass.contains(DestReg, SrcReg)) Opcode = SystemZ::LGR; + else if (SystemZ::FP16BitRegClass.contains(DestReg, SrcReg)) + Opcode = STI.hasVector() ? SystemZ::LDR16 : SystemZ::LER16; else if (SystemZ::FP32BitRegClass.contains(DestReg, SrcReg)) // For z13 we prefer LDR over LER to avoid partial register dependencies. Opcode = STI.hasVector() ? SystemZ::LDR32 : SystemZ::LER; @@ -994,8 +996,31 @@ void SystemZInstrInfo::storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg, bool isKill, int FrameIdx, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, Register VReg) const { + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc(); + // Without vector support, there are no fp16 load/store instructions, so + // need to save/restore via GPR. + if (RC == &SystemZ::FP16BitRegClass && !STI.hasVector()) { + assert(!MRI.isSSA() && MRI.getNumVirtRegs() && + "Expected non-SSA form with virtual registers."); + Register GR64Reg = MRI.createVirtualRegister(&SystemZ::GR64BitRegClass); + Register FP64Reg = MRI.createVirtualRegister(&SystemZ::FP64BitRegClass); + BuildMI(MBB, MBBI, DL, get(SystemZ::COPY)) + .addReg(FP64Reg, RegState::DefineNoRead, SystemZ::subreg_h16) + .addReg(SrcReg, getKillRegState(isKill)); + BuildMI(MBB, MBBI, DL, get(SystemZ::LGDR), GR64Reg) + .addReg(FP64Reg, RegState::Kill); + BuildMI(MBB, MBBI, DL, get(SystemZ::SRLG), GR64Reg) + .addReg(GR64Reg) + .addReg(0) + .addImm(48); + addFrameReference(BuildMI(MBB, MBBI, DL, get(SystemZ::STH)) + .addReg(GR64Reg, RegState::Kill, SystemZ::subreg_l32), + FrameIdx); + return; + } + // Callers may expect a single instruction, so keep 128-bit moves // together for now and lower them after register allocation. unsigned LoadOpcode, StoreOpcode; @@ -1011,8 +1036,31 @@ void SystemZInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, Register VReg) const { + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc(); + // Without vector support, there are no fp16 load/store instructions, so + // need to save/restore via GPR. 
+  if (RC == &SystemZ::FP16BitRegClass && !STI.hasVector()) {
+    assert(!MRI.isSSA() && MRI.getNumVirtRegs() &&
+           "Expected non-SSA form with virtual registers.");
+    Register GR64Reg = MRI.createVirtualRegister(&SystemZ::GR64BitRegClass);
+    Register FP64Reg = MRI.createVirtualRegister(&SystemZ::FP64BitRegClass);
+    addFrameReference(BuildMI(MBB, MBBI, DL, get(SystemZ::LH))
+                        .addReg(GR64Reg, RegState::DefineNoRead,
+                                SystemZ::subreg_l32),
+                      FrameIdx);
+    BuildMI(MBB, MBBI, DL, get(SystemZ::SLLG), GR64Reg)
+      .addReg(GR64Reg)
+      .addReg(0)
+      .addImm(48);
+    BuildMI(MBB, MBBI, DL, get(SystemZ::LDGR), FP64Reg)
+      .addReg(GR64Reg, RegState::Kill);
+    BuildMI(MBB, MBBI, DL, get(SystemZ::COPY), DestReg)
+      .addReg(FP64Reg, RegState::Kill, SystemZ::subreg_h16);
+    return;
+  }
+
  // Callers may expect a single instruction, so keep 128-bit moves
  // together for now and lower them after register allocation.
  unsigned LoadOpcode, StoreOpcode;
@@ -1883,6 +1931,10 @@ void SystemZInstrInfo::getLoadStoreOpcodes(const TargetRegisterClass *RC,
  } else if (RC == &SystemZ::FP128BitRegClass) {
    LoadOpcode = SystemZ::LX;
    StoreOpcode = SystemZ::STX;
+  } else if (RC == &SystemZ::FP16BitRegClass ||
+             RC == &SystemZ::VR16BitRegClass) {
+    LoadOpcode = SystemZ::VL16;
+    StoreOpcode = SystemZ::VST16;
  } else if (RC == &SystemZ::VR32BitRegClass) {
    LoadOpcode = SystemZ::VL32;
    StoreOpcode = SystemZ::VST32;
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrVector.td b/llvm/lib/Target/SystemZ/SystemZInstrVector.td
index c09f48891c1391..7b6e4deed18ef6 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrVector.td
+++ b/llvm/lib/Target/SystemZ/SystemZInstrVector.td
@@ -140,6 +140,7 @@ let Predicates = [FeatureVector] in {
  // to use those instructions rather than force a 20-bit displacement
  // into a GPR temporary.
  let mayLoad = 1, canFoldAsLoad = 1 in {
+    def VL16 : UnaryAliasVRX<load, v16hb, bdxaddr12only>;
    def VL32 : UnaryAliasVRX<load, v32sb, bdxaddr12only>;
    def VL64 : UnaryAliasVRX<load, v64db, bdxaddr12only>;
  }
@@ -236,6 +237,7 @@ let Predicates = [FeatureVector] in {
  // to use those instructions rather than force a 20-bit displacement
  // into a GPR temporary.
  let mayStore = 1 in {
+    def VST16 : StoreAliasVRX<store, v16hb, bdxaddr12only>;
    def VST32 : StoreAliasVRX<store, v32sb, bdxaddr12only>;
    def VST64 : StoreAliasVRX<store, v64db, bdxaddr12only>;
  }
diff --git a/llvm/lib/Target/SystemZ/SystemZRegisterInfo.td b/llvm/lib/Target/SystemZ/SystemZRegisterInfo.td
index 8f9bb56f2eb3bb..1dfe264b501b1c 100644
--- a/llvm/lib/Target/SystemZ/SystemZRegisterInfo.td
+++ b/llvm/lib/Target/SystemZ/SystemZRegisterInfo.td
@@ -20,6 +20,7 @@ class SystemZRegWithSubregs<string n, list<Register> subregs>
}

let Namespace = "SystemZ" in {
+def subreg_h16 : SubRegIndex<16, 16>;
def subreg_l32 : SubRegIndex<32, 0>;  // Also acts as subreg_hl32.
def subreg_h32 : SubRegIndex<32, 32>; // Also acts as subreg_hh32.
def subreg_l64 : SubRegIndex<64, 0>;
@@ -201,9 +202,16 @@ def F27Dwarf : DwarfMapping<81>;
def F29Dwarf : DwarfMapping<82>;
def F31Dwarf : DwarfMapping<83>;

+// Upper 16 bits of one of the floating-point registers
+class FPR16<bits<16> num, string n> : SystemZReg<n> {
+  let HWEncoding = num;
+}
+
// Upper 32 bits of one of the floating-point registers
-class FPR32<bits<16> num, string n> : SystemZReg<n> {
+class FPR32<bits<16> num, string n, FPR16 high>
+  : SystemZRegWithSubregs<n, [high]> {
  let HWEncoding = num;
+  let SubRegIndices = [subreg_h16];
}

// One of the floating-point registers.
class FPR128<bits<16> num, string n, FPR64 low, FPR64 high>

// Floating-point registers. Registers 16-31 require the vector facility.
foreach I = 0-15 in {
-  def F#I#S : FPR32<I, "f"#I>;
+  def F#I#H : FPR16<I, "f"#I>;
+  def F#I#S : FPR32<I, "f"#I, !cast<FPR16>("F"#I#"H")>;
  def F#I#D : FPR64<I, "f"#I, !cast<FPR32>("F"#I#"S")>,
              DwarfRegNum<[!cast<DwarfMapping>("F"#I#"Dwarf").Id]>;
}
foreach I = 16-31 in {
-  def F#I#S : FPR32<I, "v"#I>;
+  def F#I#H : FPR16<I, "v"#I>;
+  def F#I#S : FPR32<I, "v"#I, !cast<FPR16>("F"#I#"H")>;
  def F#I#D : FPR64<I, "v"#I, !cast<FPR32>("F"#I#"S")>,
              DwarfRegNum<[!cast<DwarfMapping>("F"#I#"Dwarf").Id]>;
}
@@ -240,6 +250,7 @@ foreach I = [0, 1, 4, 5, 8, 9, 12, 13] in {

// There's no store-multiple instruction for FPRs, so we're not fussy
// about the order in which call-saved registers are allocated.
+defm FP16 : SystemZRegClass<"FP16", [f16], 16, (sequence "F%uH", 0, 15)>;
defm FP32 : SystemZRegClass<"FP32", [f32], 32, (sequence "F%uS", 0, 15)>;
defm FP64 : SystemZRegClass<"FP64", [f64], 64, (sequence "F%uD", 0, 15)>;
defm FP128 : SystemZRegClass<"FP128", [f128], 128,
@@ -262,6 +273,13 @@ foreach I = 0-31 in {
              DwarfRegNum<[!cast<DwarfMapping>("F"#I#"Dwarf").Id]>;
}

+// Class used to store 16-bit fp values in the first element of a vector
+// register.
+defm VR16 : SystemZRegClass<"VR16", [f16], 16,
+                            (add (sequence "F%uH", 0, 7),
+                                 (sequence "F%uH", 16, 31),
+                                 (sequence "F%uH", 8, 15))>;
+
// Class used to store 32-bit values in the first element of a vector
// register. f32 scalars are used for the WLEDB and WLDEB instructions.
defm VR32 : SystemZRegClass<"VR32", [f32, v4i8, v2i16], 32,
@@ -298,6 +316,7 @@ class TypedReg<ValueType vtin, RegisterOperand opin> {
  RegisterOperand op = opin;
}

+def v16hb : TypedReg<f16, VR16>;
def v32f  : TypedReg<i32, VR32>;
def v32sb : TypedReg<f32, VR32>;
def v64g  : TypedReg<i64, VR64>;
diff --git a/llvm/lib/Target/SystemZ/SystemZScheduleZ13.td b/llvm/lib/Target/SystemZ/SystemZScheduleZ13.td
index d0fec02777875a..6c1d1df83fafa3 100644
--- a/llvm/lib/Target/SystemZ/SystemZScheduleZ13.td
+++ b/llvm/lib/Target/SystemZ/SystemZScheduleZ13.td
@@ -773,12 +773,12 @@ def : InstRW<[], (instregex "Insn.*")>;
//===----------------------------------------------------------------------===//

// Load zero
-def : InstRW<[WLat1, FXb, NormalGr], (instregex "LZ(DR|ER)$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "LZ(DR|ER|ER_16)$")>;
def : InstRW<[WLat2, FXb2, Cracked], (instregex "LZXR$")>;

// Load
-def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "LER$")>;
-def : InstRW<[WLat1, FXb, NormalGr], (instregex "LD(R|R32|GR)$")>;
+def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "LER(16)?$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "LD(R|R16|R32|GR)$")>;
def : InstRW<[WLat3, FXb, NormalGr], (instregex "LGDR$")>;
def : InstRW<[WLat2, FXb2, GroupAlone], (instregex "LXR$")>;
@@ -840,7 +840,7 @@ def : InstRW<[WLat12, WLat12, FXb, VecDF2, Cracked], (instregex "CL(F|G)XBR$")>;

// Load Complement / Negative / Positive
def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "L(C|N|P)(E|D)BR$")>;
-def : InstRW<[WLat1, FXb, NormalGr], (instregex "L(C|N|P)DFR(_32)?$")>;
+def : InstRW<[WLat1, FXb, NormalGr], (instregex "L(C|N|P)DFR(_32|_16)?$")>;
def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "L(C|N|P)XBR$")>;

// Square root
@@ -1191,7 +1191,7 @@ def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VLEI(B|F|G|H)$")>;
def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VL(Align)?$")>;
def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VL(L|BB)$")>;
-def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VL(32|64)$")>;
+def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VL(16|32|64)$")>;
def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLLEZ(B|F|G|H)?$")>;
def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLREP(B|F|G|H)?$")>;
def : InstRW<[WLat2LSU, RegReadAdv, VecXsPm, LSU, NormalGr],
@@ -1205,7 
+1205,7 @@ def : InstRW<[WLat4LSU, WLat4LSU, LSU5, GroupAlone], // Vector: Stores //===----------------------------------------------------------------------===// -def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VST(Align|L|32|64)?$")>; +def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VST(Align|L|16|32|64)?$")>; def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VSTE(F|G)$")>; def : InstRW<[WLat1, FXb, LSU, VecXsPm, Cracked], (instregex "VSTE(B|H)$")>; def : InstRW<[WLat1, LSU2, FXb3, GroupAlone2], (instregex "VSTM(Align)?$")>; diff --git a/llvm/lib/Target/SystemZ/SystemZScheduleZ14.td b/llvm/lib/Target/SystemZ/SystemZScheduleZ14.td index a6d89ce9443c5a..c47fcb7cb0a11b 100644 --- a/llvm/lib/Target/SystemZ/SystemZScheduleZ14.td +++ b/llvm/lib/Target/SystemZ/SystemZScheduleZ14.td @@ -793,12 +793,12 @@ def : InstRW<[], (instregex "Insn.*")>; //===----------------------------------------------------------------------===// // Load zero -def : InstRW<[WLat1, FXb, NormalGr], (instregex "LZ(DR|ER)$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "LZ(DR|ER|ER_16)$")>; def : InstRW<[WLat2, FXb2, Cracked], (instregex "LZXR$")>; // Load -def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "LER$")>; -def : InstRW<[WLat1, FXb, NormalGr], (instregex "LD(R|R32|GR)$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "LER(16)?$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "LD(R|R16|R32|GR)$")>; def : InstRW<[WLat3, FXb, NormalGr], (instregex "LGDR$")>; def : InstRW<[WLat2, FXb2, GroupAlone], (instregex "LXR$")>; @@ -860,7 +860,7 @@ def : InstRW<[WLat12, WLat12, FXb, VecDF2, Cracked], (instregex "CL(F|G)XBR$")>; // Load Complement / Negative / Positive def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "L(C|N|P)(E|D)BR$")>; -def : InstRW<[WLat1, FXb, NormalGr], (instregex "L(C|N|P)DFR(_32)?$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "L(C|N|P)DFR(_32|_16)?$")>; def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "L(C|N|P)XBR$")>; // Square root @@ -1209,7 +1209,7 @@ def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VLEI(B|F|G|H)$")>; def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VL(Align)?$")>; def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VL(L|BB)$")>; -def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VL(32|64)$")>; +def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VL(16|32|64)$")>; def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLLEZ(B|F|G|H|LF)?$")>; def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLREP(B|F|G|H)?$")>; def : InstRW<[WLat2LSU, RegReadAdv, VecXsPm, LSU, NormalGr], @@ -1224,7 +1224,7 @@ def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLRL(R)?$")>; // Vector: Stores //===----------------------------------------------------------------------===// -def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VST(Align|L|32|64)?$")>; +def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VST(Align|L|16|32|64)?$")>; def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VSTE(F|G)$")>; def : InstRW<[WLat1, FXb, LSU, VecXsPm, Cracked], (instregex "VSTE(B|H)$")>; def : InstRW<[WLat1, LSU2, FXb3, GroupAlone2], (instregex "VSTM(Align)?$")>; diff --git a/llvm/lib/Target/SystemZ/SystemZScheduleZ15.td b/llvm/lib/Target/SystemZ/SystemZScheduleZ15.td index 455354e283ad8e..28d34d80adb812 100644 --- a/llvm/lib/Target/SystemZ/SystemZScheduleZ15.td +++ b/llvm/lib/Target/SystemZ/SystemZScheduleZ15.td @@ -811,12 +811,12 @@ def : InstRW<[], (instregex "Insn.*")>; 
//===----------------------------------------------------------------------===// // Load zero -def : InstRW<[WLat1, FXb, NormalGr], (instregex "LZ(DR|ER)$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "LZ(DR|ER|ER_16)$")>; def : InstRW<[WLat2, FXb2, Cracked], (instregex "LZXR$")>; // Load -def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "LER$")>; -def : InstRW<[WLat1, FXb, NormalGr], (instregex "LD(R|R32|GR)$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "LER(16)?$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "LD(R|R16|R32|GR)$")>; def : InstRW<[WLat3, FXb, NormalGr], (instregex "LGDR$")>; def : InstRW<[WLat2, FXb2, GroupAlone], (instregex "LXR$")>; @@ -878,7 +878,7 @@ def : InstRW<[WLat12, WLat12, FXb, VecDF2, Cracked], (instregex "CL(F|G)XBR$")>; // Load Complement / Negative / Positive def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "L(C|N|P)(E|D)BR$")>; -def : InstRW<[WLat1, FXb, NormalGr], (instregex "L(C|N|P)DFR(_32)?$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "L(C|N|P)DFR(_32|_16)?$")>; def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "L(C|N|P)XBR$")>; // Square root @@ -1231,7 +1231,7 @@ def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VLEI(B|F|G|H)$")>; def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VL(Align)?$")>; def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VL(L|BB)$")>; -def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VL(32|64)$")>; +def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VL(16|32|64)$")>; def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLLEZ(B|F|G|H|LF)?$")>; def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLREP(B|F|G|H)?$")>; def : InstRW<[WLat2LSU, RegReadAdv, VecXsPm, LSU, NormalGr], @@ -1246,7 +1246,7 @@ def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLRL(R)?$")>; // Vector: Stores //===----------------------------------------------------------------------===// -def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VST(Align|L|32|64)?$")>; +def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VST(Align|L|16|32|64)?$")>; def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VSTE(F|G)$")>; def : InstRW<[WLat1, FXb, LSU, VecXsPm, Cracked], (instregex "VSTE(B|H)$")>; def : InstRW<[WLat1, LSU2, FXb3, GroupAlone2], (instregex "VSTM(Align)?$")>; diff --git a/llvm/lib/Target/SystemZ/SystemZScheduleZ16.td b/llvm/lib/Target/SystemZ/SystemZScheduleZ16.td index 92abf0ba4022cc..24713b8fc93b56 100644 --- a/llvm/lib/Target/SystemZ/SystemZScheduleZ16.td +++ b/llvm/lib/Target/SystemZ/SystemZScheduleZ16.td @@ -812,12 +812,12 @@ def : InstRW<[], (instregex "Insn.*")>; //===----------------------------------------------------------------------===// // Load zero -def : InstRW<[WLat1, FXb, NormalGr], (instregex "LZ(DR|ER)$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "LZ(DR|ER|ER_16)$")>; def : InstRW<[WLat2, FXb2, Cracked], (instregex "LZXR$")>; // Load -def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "LER$")>; -def : InstRW<[WLat1, FXb, NormalGr], (instregex "LD(R|R32|GR)$")>; +def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "LER(16)?$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "LD(R|R16|R32|GR)$")>; def : InstRW<[WLat3, FXb, NormalGr], (instregex "LGDR$")>; def : InstRW<[WLat2, FXb2, GroupAlone], (instregex "LXR$")>; @@ -879,7 +879,7 @@ def : InstRW<[WLat12, WLat12, FXb, VecDF2, Cracked], (instregex "CL(F|G)XBR$")>; // Load Complement / Negative / Positive def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "L(C|N|P)(E|D)BR$")>; -def : 
InstRW<[WLat1, FXb, NormalGr], (instregex "L(C|N|P)DFR(_32)?$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "L(C|N|P)DFR(_32|_16)?$")>; def : InstRW<[WLat10, WLat10, VecDF4, GroupAlone], (instregex "L(C|N|P)XBR$")>; // Square root @@ -1237,7 +1237,7 @@ def : InstRW<[WLat2, VecXsPm, NormalGr], (instregex "VLEI(B|F|G|H)$")>; def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VL(Align)?$")>; def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VL(L|BB)$")>; -def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VL(32|64)$")>; +def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VL(16|32|64)$")>; def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLLEZ(B|F|G|H|LF)?$")>; def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLREP(B|F|G|H)?$")>; def : InstRW<[WLat2LSU, RegReadAdv, VecXsPm, LSU, NormalGr], @@ -1252,7 +1252,7 @@ def : InstRW<[LSULatency, LSU, NormalGr], (instregex "VLRL(R)?$")>; // Vector: Stores //===----------------------------------------------------------------------===// -def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VST(Align|L|32|64)?$")>; +def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VST(Align|L|16|32|64)?$")>; def : InstRW<[WLat1, FXb, LSU, NormalGr], (instregex "VSTE(F|G)$")>; def : InstRW<[WLat1, FXb, LSU, VecXsPm, Cracked], (instregex "VSTE(B|H)$")>; def : InstRW<[WLat1, LSU2, FXb3, GroupAlone2], (instregex "VSTM(Align)?$")>; diff --git a/llvm/lib/Target/SystemZ/SystemZScheduleZ196.td b/llvm/lib/Target/SystemZ/SystemZScheduleZ196.td index 99d0d674bbbb2f..e93f329fb286f4 100644 --- a/llvm/lib/Target/SystemZ/SystemZScheduleZ196.td +++ b/llvm/lib/Target/SystemZ/SystemZScheduleZ196.td @@ -705,12 +705,12 @@ def : InstRW<[], (instregex "Insn.*")>; //===----------------------------------------------------------------------===// // Load zero -def : InstRW<[WLat1, FXU, NormalGr], (instregex "LZ(DR|ER)$")>; +def : InstRW<[WLat1, FXU, NormalGr], (instregex "LZ(DR|ER|ER_16)$")>; def : InstRW<[WLat2, FXU2, GroupAlone2], (instregex "LZXR$")>; // Load -def : InstRW<[WLat1, FXU, NormalGr], (instregex "LER$")>; -def : InstRW<[WLat1, FXU, NormalGr], (instregex "LD(R|R32|GR)$")>; +def : InstRW<[WLat1, FXU, NormalGr], (instregex "LER(16)?$")>; +def : InstRW<[WLat1, FXU, NormalGr], (instregex "LD(R|R16|R32|GR)$")>; def : InstRW<[WLat3, FXU, NormalGr], (instregex "LGDR$")>; def : InstRW<[WLat2, FXU2, GroupAlone2], (instregex "LXR$")>; @@ -771,7 +771,7 @@ def : InstRW<[WLat12, WLat12, FXU, FPU2, GroupAlone], (instregex "CL(F|G)XBR$")> // Load Complement / Negative / Positive def : InstRW<[WLat7, WLat7, FPU, NormalGr], (instregex "L(C|N|P)(E|D)BR$")>; -def : InstRW<[WLat1, FXU, NormalGr], (instregex "L(C|N|P)DFR(_32)?$")>; +def : InstRW<[WLat1, FXU, NormalGr], (instregex "L(C|N|P)DFR(_32|_16)?$")>; def : InstRW<[WLat10, WLat10, FPU4, GroupAlone], (instregex "L(C|N|P)XBR$")>; // Square root diff --git a/llvm/lib/Target/SystemZ/SystemZScheduleZEC12.td b/llvm/lib/Target/SystemZ/SystemZScheduleZEC12.td index 5b334da2bac342..95dfab6c476bf3 100644 --- a/llvm/lib/Target/SystemZ/SystemZScheduleZEC12.td +++ b/llvm/lib/Target/SystemZ/SystemZScheduleZEC12.td @@ -743,12 +743,12 @@ def : InstRW<[], (instregex "Insn.*")>; //===----------------------------------------------------------------------===// // Load zero -def : InstRW<[WLat1, FXU, NormalGr], (instregex "LZ(DR|ER)$")>; +def : InstRW<[WLat1, FXU, NormalGr], (instregex "LZ(DR|ER|ER_16)$")>; def : InstRW<[WLat2, FXU2, GroupAlone], (instregex "LZXR$")>; // Load -def : InstRW<[WLat1, FXU, NormalGr], (instregex "LER$")>; 
-def : InstRW<[WLat1, FXU, NormalGr], (instregex "LD(R|R32|GR)$")>; +def : InstRW<[WLat1, FXU, NormalGr], (instregex "LER(16)?$")>; +def : InstRW<[WLat1, FXU, NormalGr], (instregex "LD(R|R16|R32|GR)$")>; def : InstRW<[WLat3, FXU, NormalGr], (instregex "LGDR$")>; def : InstRW<[WLat2, FXU2, GroupAlone], (instregex "LXR$")>; @@ -809,7 +809,7 @@ def : InstRW<[WLat12, WLat12, FXU, FPU2, GroupAlone], (instregex "CL(F|G)XBR$")> // Load Complement / Negative / Positive def : InstRW<[WLat7, WLat7, FPU, NormalGr], (instregex "L(C|N|P)(E|D)BR$")>; -def : InstRW<[WLat1, FXU, NormalGr], (instregex "L(C|N|P)DFR(_32)?$")>; +def : InstRW<[WLat1, FXU, NormalGr], (instregex "L(C|N|P)DFR(_32|_16)?$")>; def : InstRW<[WLat10, WLat10, FPU4, GroupAlone], (instregex "L(C|N|P)XBR$")>; // Square root diff --git a/llvm/test/CodeGen/SystemZ/atomic-load-10.ll b/llvm/test/CodeGen/SystemZ/atomic-load-10.ll new file mode 100644 index 00000000000000..e30f9791b51e02 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/atomic-load-10.ll @@ -0,0 +1,22 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; Test fp16 atomic loads. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -verify-machineinstrs | FileCheck %s -check-prefix=CHECK +; RUN: llc < %s -mtriple=s390x-linux-gnu -verify-machineinstrs -mcpu=z16 | FileCheck %s -check-prefix=VECTOR + +define half @f1(ptr %src) { +; CHECK-LABEL: f1: +; CHECK: # %bb.0: +; CHECK-NEXT: lh %r0, 0(%r2) +; CHECK-NEXT: sllg %r0, %r0, 48 +; CHECK-NEXT: ldgr %f0, %r0 +; CHECK-NEXT: # kill: def $f0h killed $f0h killed $f0d +; CHECK-NEXT: br %r14 +; +; VECTOR-LABEL: f1: +; VECTOR: # %bb.0: +; VECTOR-NEXT: vlreph %v0, 0(%r2) +; VECTOR-NEXT: br %r14 + %val = load atomic half, ptr %src seq_cst, align 2 + ret half %val +} diff --git a/llvm/test/CodeGen/SystemZ/atomic-store-10.ll b/llvm/test/CodeGen/SystemZ/atomic-store-10.ll new file mode 100644 index 00000000000000..3f228d58dcd8ce --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/atomic-store-10.ll @@ -0,0 +1,24 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; Test half atomic stores. 
+; +; RUN: llc < %s -mtriple=s390x-linux-gnu -verify-machineinstrs | FileCheck %s -check-prefix=CHECK +; RUN: llc < %s -mtriple=s390x-linux-gnu -verify-machineinstrs -mcpu=z16 | FileCheck %s -check-prefix=VECTOR + +define void @f1(ptr %src, half %val) { +; CHECK-LABEL: f1: +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $f0h killed $f0h def $f0d +; CHECK-NEXT: lgdr %r0, %f0 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 0(%r2) +; CHECK-NEXT: bcr 15, %r0 +; CHECK-NEXT: br %r14 +; +; VECTOR-LABEL: f1: +; VECTOR: # %bb.0: +; VECTOR-NEXT: vsteh %v0, 0(%r2), 0 +; VECTOR-NEXT: bcr 14, %r0 +; VECTOR-NEXT: br %r14 + store atomic half %val, ptr %src seq_cst, align 2 + ret void +} diff --git a/llvm/test/CodeGen/SystemZ/fp-half-libcall.ll b/llvm/test/CodeGen/SystemZ/fp-half-libcall.ll new file mode 100644 index 00000000000000..6e813a4a5094d7 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/fp-half-libcall.ll @@ -0,0 +1,312 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; Test that library calls are emitted for LLVM IR intrinsics +; +; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s + +define half @f1(half %x, i16 %y) { +; CHECK-LABEL: f1: +; CHECK: # %bb.0: +; CHECK-NEXT: stmg %r13, %r15, 104(%r15) +; CHECK-NEXT: .cfi_offset %r13, -56 +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -160 +; CHECK-NEXT: .cfi_def_cfa_offset 320 +; CHECK-NEXT: lhr %r13, %r2 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: llgfr %r2, %r13 +; CHECK-NEXT: brasl %r14, __powisf2@PLT +; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT +; CHECK-NEXT: lmg %r13, %r15, 264(%r15) +; CHECK-NEXT: br %r14 + %tmp = call half @llvm.powi.f16.i16(half %x, i16 %y) + ret half %tmp +} + +define half @f2(half %x, half %y) { +; CHECK-LABEL: f2: +; CHECK: # %bb.0: +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -176 +; CHECK-NEXT: .cfi_def_cfa_offset 336 +; CHECK-NEXT: std %f8, 168(%r15) # 8-byte Folded Spill +; CHECK-NEXT: std %f9, 160(%r15) # 8-byte Folded Spill +; CHECK-NEXT: .cfi_offset %f8, -168 +; CHECK-NEXT: .cfi_offset %f9, -176 +; CHECK-NEXT: ler %f8, %f2 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: ler %f9, %f0 +; CHECK-NEXT: ler %f0, %f8 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: ler %f2, %f0 +; CHECK-NEXT: ler %f0, %f9 +; CHECK-NEXT: brasl %r14, powf@PLT +; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT +; CHECK-NEXT: ld %f8, 168(%r15) # 8-byte Folded Reload +; CHECK-NEXT: ld %f9, 160(%r15) # 8-byte Folded Reload +; CHECK-NEXT: lmg %r14, %r15, 288(%r15) +; CHECK-NEXT: br %r14 + %tmp = call half @llvm.pow.f16(half %x, half %y) + ret half %tmp +} + +define half @f3(half %x) { +; CHECK-LABEL: f3: +; CHECK: # %bb.0: +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -160 +; CHECK-NEXT: .cfi_def_cfa_offset 320 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: brasl %r14, sinf@PLT +; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT +; CHECK-NEXT: lmg %r14, %r15, 272(%r15) +; CHECK-NEXT: br %r14 + %tmp = call half @llvm.sin.f16(half %x) + ret half %tmp +} + +define half @f4(half %x) { +; CHECK-LABEL: f4: +; CHECK: # %bb.0: +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -160 +; CHECK-NEXT: .cfi_def_cfa_offset 320 +; 
CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: brasl %r14, cosf@PLT +; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT +; CHECK-NEXT: lmg %r14, %r15, 272(%r15) +; CHECK-NEXT: br %r14 + %tmp = call half @llvm.cos.f16(half %x) + ret half %tmp +} + +define half @f5(half %x) { +; CHECK-LABEL: f5: +; CHECK: # %bb.0: +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -160 +; CHECK-NEXT: .cfi_def_cfa_offset 320 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: brasl %r14, expf@PLT +; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT +; CHECK-NEXT: lmg %r14, %r15, 272(%r15) +; CHECK-NEXT: br %r14 + %tmp = call half @llvm.exp.f16(half %x) + ret half %tmp +} + +define half @f6(half %x) { +; CHECK-LABEL: f6: +; CHECK: # %bb.0: +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -160 +; CHECK-NEXT: .cfi_def_cfa_offset 320 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: brasl %r14, exp2f@PLT +; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT +; CHECK-NEXT: lmg %r14, %r15, 272(%r15) +; CHECK-NEXT: br %r14 + %tmp = call half @llvm.exp2.f16(half %x) + ret half %tmp +} + +define half @f7(half %x) { +; CHECK-LABEL: f7: +; CHECK: # %bb.0: +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -160 +; CHECK-NEXT: .cfi_def_cfa_offset 320 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: brasl %r14, logf@PLT +; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT +; CHECK-NEXT: lmg %r14, %r15, 272(%r15) +; CHECK-NEXT: br %r14 + %tmp = call half @llvm.log.f16(half %x) + ret half %tmp +} + +define half @f8(half %x) { +; CHECK-LABEL: f8: +; CHECK: # %bb.0: +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -160 +; CHECK-NEXT: .cfi_def_cfa_offset 320 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: brasl %r14, log2f@PLT +; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT +; CHECK-NEXT: lmg %r14, %r15, 272(%r15) +; CHECK-NEXT: br %r14 + %tmp = call half @llvm.log2.f16(half %x) + ret half %tmp +} + +define half @f9(half %x) { +; CHECK-LABEL: f9: +; CHECK: # %bb.0: +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -160 +; CHECK-NEXT: .cfi_def_cfa_offset 320 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: brasl %r14, log10f@PLT +; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT +; CHECK-NEXT: lmg %r14, %r15, 272(%r15) +; CHECK-NEXT: br %r14 + %tmp = call half @llvm.log10.f16(half %x) + ret half %tmp +} + +define half @f10(half %x, half %y) { +; CHECK-LABEL: f10: +; CHECK: # %bb.0: +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -176 +; CHECK-NEXT: .cfi_def_cfa_offset 336 +; CHECK-NEXT: std %f8, 168(%r15) # 8-byte Folded Spill +; CHECK-NEXT: std %f9, 160(%r15) # 8-byte Folded Spill +; CHECK-NEXT: .cfi_offset %f8, -168 +; CHECK-NEXT: .cfi_offset %f9, -176 +; CHECK-NEXT: ler %f8, %f2 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: ler %f9, %f0 +; CHECK-NEXT: ler %f0, %f8 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: ler %f2, %f0 +; CHECK-NEXT: ler %f0, %f9 +; CHECK-NEXT: brasl %r14, fminf@PLT +; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT +; 
CHECK-NEXT: ld %f8, 168(%r15) # 8-byte Folded Reload +; CHECK-NEXT: ld %f9, 160(%r15) # 8-byte Folded Reload +; CHECK-NEXT: lmg %r14, %r15, 288(%r15) +; CHECK-NEXT: br %r14 + %tmp = call half @llvm.minnum.f16(half %x, half %y) + ret half %tmp +} + +define half @f11(half %x, half %y) { +; CHECK-LABEL: f11: +; CHECK: # %bb.0: +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -176 +; CHECK-NEXT: .cfi_def_cfa_offset 336 +; CHECK-NEXT: std %f8, 168(%r15) # 8-byte Folded Spill +; CHECK-NEXT: std %f9, 160(%r15) # 8-byte Folded Spill +; CHECK-NEXT: .cfi_offset %f8, -168 +; CHECK-NEXT: .cfi_offset %f9, -176 +; CHECK-NEXT: ler %f8, %f2 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: ler %f9, %f0 +; CHECK-NEXT: ler %f0, %f8 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: ler %f2, %f0 +; CHECK-NEXT: ler %f0, %f9 +; CHECK-NEXT: brasl %r14, fmaxf@PLT +; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT +; CHECK-NEXT: ld %f8, 168(%r15) # 8-byte Folded Reload +; CHECK-NEXT: ld %f9, 160(%r15) # 8-byte Folded Reload +; CHECK-NEXT: lmg %r14, %r15, 288(%r15) +; CHECK-NEXT: br %r14 + %tmp = call half @llvm.maxnum.f16(half %x, half %y) + ret half %tmp +} + +; Verify that "nnan" minnum/maxnum calls are transformed to +; compare+select sequences instead of libcalls. +define half @f12(half %x, half %y) { +; CHECK-LABEL: f12: +; CHECK: # %bb.0: +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -176 +; CHECK-NEXT: .cfi_def_cfa_offset 336 +; CHECK-NEXT: std %f8, 168(%r15) # 8-byte Folded Spill +; CHECK-NEXT: std %f9, 160(%r15) # 8-byte Folded Spill +; CHECK-NEXT: .cfi_offset %f8, -168 +; CHECK-NEXT: .cfi_offset %f9, -176 +; CHECK-NEXT: ler %f9, %f0 +; CHECK-NEXT: ler %f0, %f2 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: ler %f8, %f0 +; CHECK-NEXT: ler %f0, %f9 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: cebr %f0, %f8 +; CHECK-NEXT: jl .LBB11_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: ler %f0, %f8 +; CHECK-NEXT: .LBB11_2: +; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT +; CHECK-NEXT: ld %f8, 168(%r15) # 8-byte Folded Reload +; CHECK-NEXT: ld %f9, 160(%r15) # 8-byte Folded Reload +; CHECK-NEXT: lmg %r14, %r15, 288(%r15) +; CHECK-NEXT: br %r14 + %tmp = call nnan half @llvm.minnum.f16(half %x, half %y) + ret half %tmp +} + +define half @f13(half %x, half %y) { +; CHECK-LABEL: f13: +; CHECK: # %bb.0: +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -176 +; CHECK-NEXT: .cfi_def_cfa_offset 336 +; CHECK-NEXT: std %f8, 168(%r15) # 8-byte Folded Spill +; CHECK-NEXT: std %f9, 160(%r15) # 8-byte Folded Spill +; CHECK-NEXT: .cfi_offset %f8, -168 +; CHECK-NEXT: .cfi_offset %f9, -176 +; CHECK-NEXT: ler %f9, %f0 +; CHECK-NEXT: ler %f0, %f2 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: ler %f8, %f0 +; CHECK-NEXT: ler %f0, %f9 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: cebr %f0, %f8 +; CHECK-NEXT: jh .LBB12_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: ler %f0, %f8 +; CHECK-NEXT: .LBB12_2: +; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT +; CHECK-NEXT: ld %f8, 168(%r15) # 8-byte Folded Reload +; CHECK-NEXT: ld %f9, 160(%r15) # 8-byte Folded Reload +; CHECK-NEXT: lmg %r14, %r15, 288(%r15) +; CHECK-NEXT: br %r14 + %tmp = call nnan half @llvm.maxnum.f16(half %x, half %y) + ret half %tmp +} + +declare half 
@llvm.powi.f16.i16(half, i16) +declare half @llvm.pow.f16(half, half) + +declare half @llvm.sin.f16(half) +declare half @llvm.cos.f16(half) + +declare half @llvm.exp.f16(half) +declare half @llvm.exp2.f16(half) + +declare half @llvm.log.f16(half) +declare half @llvm.log2.f16(half) +declare half @llvm.log10.f16(half) + +declare half @llvm.minnum.f16(half, half) +declare half @llvm.maxnum.f16(half, half) diff --git a/llvm/test/CodeGen/SystemZ/fp-half-strict.ll b/llvm/test/CodeGen/SystemZ/fp-half-strict.ll new file mode 100644 index 00000000000000..42663b109d7a9f --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/fp-half-strict.ll @@ -0,0 +1,209 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=zEC12 -verify-machineinstrs \ +; RUN: | FileCheck %s --check-prefix=NOVEC +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z16 -verify-machineinstrs \ +; RUN: | FileCheck %s --check-prefix=VECTOR +; +; Tests for strict 16-bit floating point (half). + +declare half @llvm.experimental.constrained.fadd.f16(half, half, metadata, metadata) +declare double @llvm.experimental.constrained.fadd.f64(double, double, metadata, metadata) +declare half @llvm.experimental.constrained.fmul.f16(half, half, metadata, metadata) + +; Test register addition. +define half @fun0(half %f1, half %f2) #0 { +; NOVEC-LABEL: fun0: +; NOVEC: # %bb.0: +; NOVEC-NEXT: stmg %r14, %r15, 112(%r15) +; NOVEC-NEXT: .cfi_offset %r14, -48 +; NOVEC-NEXT: .cfi_offset %r15, -40 +; NOVEC-NEXT: aghi %r15, -176 +; NOVEC-NEXT: .cfi_def_cfa_offset 336 +; NOVEC-NEXT: std %f8, 168(%r15) # 8-byte Folded Spill +; NOVEC-NEXT: std %f9, 160(%r15) # 8-byte Folded Spill +; NOVEC-NEXT: .cfi_offset %f8, -168 +; NOVEC-NEXT: .cfi_offset %f9, -176 +; NOVEC-NEXT: ler %f8, %f0 +; NOVEC-NEXT: ler %f0, %f2 +; NOVEC-NEXT: brasl %r14, __extendhfsf2@PLT +; NOVEC-NEXT: ler %f9, %f0 +; NOVEC-NEXT: ler %f0, %f8 +; NOVEC-NEXT: brasl %r14, __extendhfsf2@PLT +; NOVEC-NEXT: aebr %f0, %f9 +; NOVEC-NEXT: brasl %r14, __truncsfhf2@PLT +; NOVEC-NEXT: ld %f8, 168(%r15) # 8-byte Folded Reload +; NOVEC-NEXT: ld %f9, 160(%r15) # 8-byte Folded Reload +; NOVEC-NEXT: lmg %r14, %r15, 288(%r15) +; NOVEC-NEXT: br %r14 +; +; VECTOR-LABEL: fun0: +; VECTOR: # %bb.0: +; VECTOR-NEXT: stmg %r14, %r15, 112(%r15) +; VECTOR-NEXT: .cfi_offset %r14, -48 +; VECTOR-NEXT: .cfi_offset %r15, -40 +; VECTOR-NEXT: aghi %r15, -176 +; VECTOR-NEXT: .cfi_def_cfa_offset 336 +; VECTOR-NEXT: std %f8, 168(%r15) # 8-byte Folded Spill +; VECTOR-NEXT: std %f9, 160(%r15) # 8-byte Folded Spill +; VECTOR-NEXT: .cfi_offset %f8, -168 +; VECTOR-NEXT: .cfi_offset %f9, -176 +; VECTOR-NEXT: ldr %f8, %f0 +; VECTOR-NEXT: ldr %f0, %f2 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: ldr %f9, %f0 +; VECTOR-NEXT: ldr %f0, %f8 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: aebr %f0, %f9 +; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT +; VECTOR-NEXT: ld %f8, 168(%r15) # 8-byte Folded Reload +; VECTOR-NEXT: ld %f9, 160(%r15) # 8-byte Folded Reload +; VECTOR-NEXT: lmg %r14, %r15, 288(%r15) +; VECTOR-NEXT: br %r14 + %res = call half @llvm.experimental.constrained.fadd.f16( + half %f1, half %f2, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret half %res +} + +; Test atomic memory accesses and extension/truncation inside a strictfp +; function. 
+define void @fun1(ptr %Src, ptr %Dst) #0 { +; NOVEC-LABEL: fun1: +; NOVEC: # %bb.0: # %entry +; NOVEC-NEXT: stmg %r13, %r15, 104(%r15) +; NOVEC-NEXT: .cfi_offset %r13, -56 +; NOVEC-NEXT: .cfi_offset %r14, -48 +; NOVEC-NEXT: .cfi_offset %r15, -40 +; NOVEC-NEXT: aghi %r15, -160 +; NOVEC-NEXT: .cfi_def_cfa_offset 320 +; NOVEC-NEXT: lh %r0, 0(%r2) +; NOVEC-NEXT: sll %r0, 16 +; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32 +; NOVEC-NEXT: lgr %r13, %r3 +; NOVEC-NEXT: ldgr %f0, %r0 +; NOVEC-NEXT: # kill: def $f0h killed $f0h killed $f0d +; NOVEC-NEXT: brasl %r14, __extendhfdf2@PLT +; NOVEC-NEXT: adbr %f0, %f0 +; NOVEC-NEXT: brasl %r14, __truncdfhf2@PLT +; NOVEC-NEXT: # kill: def $f0h killed $f0h def $f0d +; NOVEC-NEXT: lgdr %r0, %f0 +; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32 +; NOVEC-NEXT: srl %r0, 16 +; NOVEC-NEXT: sth %r0, 0(%r13) +; NOVEC-NEXT: bcr 14, %r0 +; NOVEC-NEXT: lmg %r13, %r15, 264(%r15) +; NOVEC-NEXT: br %r14 +; +; VECTOR-LABEL: fun1: +; VECTOR: # %bb.0: # %entry +; VECTOR-NEXT: stmg %r13, %r15, 104(%r15) +; VECTOR-NEXT: .cfi_offset %r13, -56 +; VECTOR-NEXT: .cfi_offset %r14, -48 +; VECTOR-NEXT: .cfi_offset %r15, -40 +; VECTOR-NEXT: aghi %r15, -160 +; VECTOR-NEXT: .cfi_def_cfa_offset 320 +; VECTOR-NEXT: vlreph %v0, 0(%r2) +; VECTOR-NEXT: lgr %r13, %r3 +; VECTOR-NEXT: brasl %r14, __extendhfdf2@PLT +; VECTOR-NEXT: adbr %f0, %f0 +; VECTOR-NEXT: brasl %r14, __truncdfhf2@PLT +; VECTOR-NEXT: vsteh %v0, 0(%r13), 0 +; VECTOR-NEXT: bcr 14, %r0 +; VECTOR-NEXT: lmg %r13, %r15, 264(%r15) +; VECTOR-NEXT: br %r14 +entry: + %Op0 = load atomic half, ptr %Src seq_cst, align 2 + %E0 = fpext half %Op0 to double + %Add = call double @llvm.experimental.constrained.fadd.f64( + double %E0, double %E0, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + %Res = fptrunc double %Add to half + store atomic half %Res, ptr %Dst seq_cst, align 2 + ret void +} + +; Test a chain of half operations which should have each operation surrounded +; by conversions to/from fp32 for proper emulation. 
+define half @fun2(half %Op0, half %Op1, half %Op2) #0 { +; NOVEC-LABEL: fun2: +; NOVEC: # %bb.0: # %entry +; NOVEC-NEXT: stmg %r14, %r15, 112(%r15) +; NOVEC-NEXT: .cfi_offset %r14, -48 +; NOVEC-NEXT: .cfi_offset %r15, -40 +; NOVEC-NEXT: aghi %r15, -184 +; NOVEC-NEXT: .cfi_def_cfa_offset 344 +; NOVEC-NEXT: std %f8, 176(%r15) # 8-byte Folded Spill +; NOVEC-NEXT: std %f9, 168(%r15) # 8-byte Folded Spill +; NOVEC-NEXT: std %f10, 160(%r15) # 8-byte Folded Spill +; NOVEC-NEXT: .cfi_offset %f8, -168 +; NOVEC-NEXT: .cfi_offset %f9, -176 +; NOVEC-NEXT: .cfi_offset %f10, -184 +; NOVEC-NEXT: ler %f9, %f0 +; NOVEC-NEXT: ler %f0, %f2 +; NOVEC-NEXT: ler %f8, %f4 +; NOVEC-NEXT: brasl %r14, __extendhfsf2@PLT +; NOVEC-NEXT: ler %f10, %f0 +; NOVEC-NEXT: ler %f0, %f9 +; NOVEC-NEXT: brasl %r14, __extendhfsf2@PLT +; NOVEC-NEXT: meebr %f0, %f10 +; NOVEC-NEXT: brasl %r14, __truncsfhf2@PLT +; NOVEC-NEXT: brasl %r14, __extendhfsf2@PLT +; NOVEC-NEXT: ler %f9, %f0 +; NOVEC-NEXT: ler %f0, %f8 +; NOVEC-NEXT: brasl %r14, __extendhfsf2@PLT +; NOVEC-NEXT: meebr %f0, %f9 +; NOVEC-NEXT: brasl %r14, __truncsfhf2@PLT +; NOVEC-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload +; NOVEC-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload +; NOVEC-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload +; NOVEC-NEXT: lmg %r14, %r15, 296(%r15) +; NOVEC-NEXT: br %r14 +; +; VECTOR-LABEL: fun2: +; VECTOR: # %bb.0: # %entry +; VECTOR-NEXT: stmg %r14, %r15, 112(%r15) +; VECTOR-NEXT: .cfi_offset %r14, -48 +; VECTOR-NEXT: .cfi_offset %r15, -40 +; VECTOR-NEXT: aghi %r15, -184 +; VECTOR-NEXT: .cfi_def_cfa_offset 344 +; VECTOR-NEXT: std %f8, 176(%r15) # 8-byte Folded Spill +; VECTOR-NEXT: std %f9, 168(%r15) # 8-byte Folded Spill +; VECTOR-NEXT: std %f10, 160(%r15) # 8-byte Folded Spill +; VECTOR-NEXT: .cfi_offset %f8, -168 +; VECTOR-NEXT: .cfi_offset %f9, -176 +; VECTOR-NEXT: .cfi_offset %f10, -184 +; VECTOR-NEXT: ldr %f9, %f0 +; VECTOR-NEXT: ldr %f0, %f2 +; VECTOR-NEXT: ldr %f8, %f4 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: ldr %f10, %f0 +; VECTOR-NEXT: ldr %f0, %f9 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: meebr %f0, %f10 +; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: ldr %f9, %f0 +; VECTOR-NEXT: ldr %f0, %f8 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: wfmsb %f0, %f9, %f0 +; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT +; VECTOR-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload +; VECTOR-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload +; VECTOR-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload +; VECTOR-NEXT: lmg %r14, %r15, 296(%r15) +; VECTOR-NEXT: br %r14 +entry: + %A0 = call half @llvm.experimental.constrained.fmul.f16( + half %Op0, half %Op1, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + %Res = call half @llvm.experimental.constrained.fmul.f16( + half %A0, half %Op2, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret half %Res +} + +attributes #0 = { strictfp } diff --git a/llvm/test/CodeGen/SystemZ/fp-half-vector.ll b/llvm/test/CodeGen/SystemZ/fp-half-vector.ll new file mode 100644 index 00000000000000..cc3f61f9986494 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/fp-half-vector.ll @@ -0,0 +1,797 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=zEC12 -verify-machineinstrs \ +; RUN: | FileCheck %s --check-prefix=NOVEC +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z16 -verify-machineinstrs \ +; RUN: | 
FileCheck %s --check-prefix=VECTOR + +; Add the <8 x half> argument with itself and return it. +define <8 x half> @fun0(<8 x half> %Op) { +; NOVEC-LABEL: fun0: +; NOVEC: # %bb.0: # %entry +; NOVEC-NEXT: stmg %r13, %r15, 104(%r15) +; NOVEC-NEXT: .cfi_offset %r13, -56 +; NOVEC-NEXT: .cfi_offset %r14, -48 +; NOVEC-NEXT: .cfi_offset %r15, -40 +; NOVEC-NEXT: aghi %r15, -224 +; NOVEC-NEXT: .cfi_def_cfa_offset 384 +; NOVEC-NEXT: std %f8, 216(%r15) # 8-byte Folded Spill +; NOVEC-NEXT: std %f9, 208(%r15) # 8-byte Folded Spill +; NOVEC-NEXT: std %f10, 200(%r15) # 8-byte Folded Spill +; NOVEC-NEXT: std %f11, 192(%r15) # 8-byte Folded Spill +; NOVEC-NEXT: std %f12, 184(%r15) # 8-byte Folded Spill +; NOVEC-NEXT: std %f13, 176(%r15) # 8-byte Folded Spill +; NOVEC-NEXT: std %f14, 168(%r15) # 8-byte Folded Spill +; NOVEC-NEXT: std %f15, 160(%r15) # 8-byte Folded Spill +; NOVEC-NEXT: .cfi_offset %f8, -168 +; NOVEC-NEXT: .cfi_offset %f9, -176 +; NOVEC-NEXT: .cfi_offset %f10, -184 +; NOVEC-NEXT: .cfi_offset %f11, -192 +; NOVEC-NEXT: .cfi_offset %f12, -200 +; NOVEC-NEXT: .cfi_offset %f13, -208 +; NOVEC-NEXT: .cfi_offset %f14, -216 +; NOVEC-NEXT: .cfi_offset %f15, -224 +; NOVEC-NEXT: lh %r0, 414(%r15) +; NOVEC-NEXT: sll %r0, 16 +; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32 +; NOVEC-NEXT: ldgr %f15, %r0 +; NOVEC-NEXT: lh %r0, 406(%r15) +; NOVEC-NEXT: sll %r0, 16 +; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32 +; NOVEC-NEXT: ldgr %f12, %r0 +; NOVEC-NEXT: lh %r0, 398(%r15) +; NOVEC-NEXT: sll %r0, 16 +; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32 +; NOVEC-NEXT: ldgr %f9, %r0 +; NOVEC-NEXT: lh %r0, 390(%r15) +; NOVEC-NEXT: sll %r0, 16 +; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32 +; NOVEC-NEXT: ler %f10, %f6 +; NOVEC-NEXT: ler %f11, %f4 +; NOVEC-NEXT: ler %f13, %f2 +; NOVEC-NEXT: ler %f14, %f0 +; NOVEC-NEXT: lgr %r13, %r2 +; NOVEC-NEXT: ldgr %f0, %r0 +; NOVEC-NEXT: # kill: def $f0h killed $f0h killed $f0d +; NOVEC-NEXT: brasl %r14, __extendhfsf2@PLT +; NOVEC-NEXT: aebr %f0, %f0 +; NOVEC-NEXT: brasl %r14, __truncsfhf2@PLT +; NOVEC-NEXT: ler %f8, %f0 +; NOVEC-NEXT: ler %f0, %f9 +; NOVEC-NEXT: brasl %r14, __extendhfsf2@PLT +; NOVEC-NEXT: aebr %f0, %f0 +; NOVEC-NEXT: brasl %r14, __truncsfhf2@PLT +; NOVEC-NEXT: ler %f9, %f0 +; NOVEC-NEXT: ler %f0, %f12 +; NOVEC-NEXT: brasl %r14, __extendhfsf2@PLT +; NOVEC-NEXT: aebr %f0, %f0 +; NOVEC-NEXT: brasl %r14, __truncsfhf2@PLT +; NOVEC-NEXT: ler %f12, %f0 +; NOVEC-NEXT: ler %f0, %f15 +; NOVEC-NEXT: brasl %r14, __extendhfsf2@PLT +; NOVEC-NEXT: aebr %f0, %f0 +; NOVEC-NEXT: brasl %r14, __truncsfhf2@PLT +; NOVEC-NEXT: ler %f15, %f0 +; NOVEC-NEXT: ler %f0, %f14 +; NOVEC-NEXT: brasl %r14, __extendhfsf2@PLT +; NOVEC-NEXT: aebr %f0, %f0 +; NOVEC-NEXT: brasl %r14, __truncsfhf2@PLT +; NOVEC-NEXT: ler %f14, %f0 +; NOVEC-NEXT: ler %f0, %f13 +; NOVEC-NEXT: brasl %r14, __extendhfsf2@PLT +; NOVEC-NEXT: aebr %f0, %f0 +; NOVEC-NEXT: brasl %r14, __truncsfhf2@PLT +; NOVEC-NEXT: ler %f13, %f0 +; NOVEC-NEXT: ler %f0, %f11 +; NOVEC-NEXT: brasl %r14, __extendhfsf2@PLT +; NOVEC-NEXT: aebr %f0, %f0 +; NOVEC-NEXT: brasl %r14, __truncsfhf2@PLT +; NOVEC-NEXT: ler %f11, %f0 +; NOVEC-NEXT: ler %f0, %f10 +; NOVEC-NEXT: brasl %r14, __extendhfsf2@PLT +; NOVEC-NEXT: aebr %f0, %f0 +; NOVEC-NEXT: brasl %r14, __truncsfhf2@PLT +; NOVEC-NEXT: # kill: def $f0h killed $f0h def $f0d +; NOVEC-NEXT: lgdr %r0, %f0 +; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32 +; NOVEC-NEXT: srl %r0, 16 +; NOVEC-NEXT: sth %r0, 6(%r13) +; NOVEC-NEXT: lgdr %r0, %f11 +; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32 +; NOVEC-NEXT: srl %r0, 16 +; NOVEC-NEXT: sth %r0, 
4(%r13) +; NOVEC-NEXT: lgdr %r0, %f13 +; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32 +; NOVEC-NEXT: srl %r0, 16 +; NOVEC-NEXT: sth %r0, 2(%r13) +; NOVEC-NEXT: lgdr %r0, %f14 +; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32 +; NOVEC-NEXT: srl %r0, 16 +; NOVEC-NEXT: sth %r0, 0(%r13) +; NOVEC-NEXT: lgdr %r0, %f15 +; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32 +; NOVEC-NEXT: srl %r0, 16 +; NOVEC-NEXT: sth %r0, 14(%r13) +; NOVEC-NEXT: lgdr %r0, %f12 +; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32 +; NOVEC-NEXT: srl %r0, 16 +; NOVEC-NEXT: sth %r0, 12(%r13) +; NOVEC-NEXT: lgdr %r0, %f9 +; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32 +; NOVEC-NEXT: srl %r0, 16 +; NOVEC-NEXT: sth %r0, 10(%r13) +; NOVEC-NEXT: lgdr %r0, %f8 +; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32 +; NOVEC-NEXT: srl %r0, 16 +; NOVEC-NEXT: sth %r0, 8(%r13) +; NOVEC-NEXT: ld %f8, 216(%r15) # 8-byte Folded Reload +; NOVEC-NEXT: ld %f9, 208(%r15) # 8-byte Folded Reload +; NOVEC-NEXT: ld %f10, 200(%r15) # 8-byte Folded Reload +; NOVEC-NEXT: ld %f11, 192(%r15) # 8-byte Folded Reload +; NOVEC-NEXT: ld %f12, 184(%r15) # 8-byte Folded Reload +; NOVEC-NEXT: ld %f13, 176(%r15) # 8-byte Folded Reload +; NOVEC-NEXT: ld %f14, 168(%r15) # 8-byte Folded Reload +; NOVEC-NEXT: ld %f15, 160(%r15) # 8-byte Folded Reload +; NOVEC-NEXT: lmg %r13, %r15, 328(%r15) +; NOVEC-NEXT: br %r14 +; +; VECTOR-LABEL: fun0: +; VECTOR: # %bb.0: # %entry +; VECTOR-NEXT: stmg %r13, %r15, 104(%r15) +; VECTOR-NEXT: .cfi_offset %r13, -56 +; VECTOR-NEXT: .cfi_offset %r14, -48 +; VECTOR-NEXT: .cfi_offset %r15, -40 +; VECTOR-NEXT: aghi %r15, -224 +; VECTOR-NEXT: .cfi_def_cfa_offset 384 +; VECTOR-NEXT: std %f8, 216(%r15) # 8-byte Folded Spill +; VECTOR-NEXT: std %f9, 208(%r15) # 8-byte Folded Spill +; VECTOR-NEXT: std %f10, 200(%r15) # 8-byte Folded Spill +; VECTOR-NEXT: std %f11, 192(%r15) # 8-byte Folded Spill +; VECTOR-NEXT: std %f12, 184(%r15) # 8-byte Folded Spill +; VECTOR-NEXT: std %f13, 176(%r15) # 8-byte Folded Spill +; VECTOR-NEXT: std %f14, 168(%r15) # 8-byte Folded Spill +; VECTOR-NEXT: std %f15, 160(%r15) # 8-byte Folded Spill +; VECTOR-NEXT: .cfi_offset %f8, -168 +; VECTOR-NEXT: .cfi_offset %f9, -176 +; VECTOR-NEXT: .cfi_offset %f10, -184 +; VECTOR-NEXT: .cfi_offset %f11, -192 +; VECTOR-NEXT: .cfi_offset %f12, -200 +; VECTOR-NEXT: .cfi_offset %f13, -208 +; VECTOR-NEXT: .cfi_offset %f14, -216 +; VECTOR-NEXT: .cfi_offset %f15, -224 +; VECTOR-NEXT: vlreph %v11, 414(%r15) +; VECTOR-NEXT: vlreph %v12, 406(%r15) +; VECTOR-NEXT: vlreph %v13, 398(%r15) +; VECTOR-NEXT: vlreph %v14, 390(%r15) +; VECTOR-NEXT: ldr %f8, %f6 +; VECTOR-NEXT: ldr %f9, %f4 +; VECTOR-NEXT: ldr %f10, %f2 +; VECTOR-NEXT: lgr %r13, %r2 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: aebr %f0, %f0 +; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT +; VECTOR-NEXT: ldr %f15, %f0 +; VECTOR-NEXT: ldr %f0, %f10 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: aebr %f0, %f0 +; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT +; VECTOR-NEXT: ldr %f10, %f0 +; VECTOR-NEXT: ldr %f0, %f9 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: aebr %f0, %f0 +; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT +; VECTOR-NEXT: ldr %f9, %f0 +; VECTOR-NEXT: ldr %f0, %f8 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: aebr %f0, %f0 +; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT +; VECTOR-NEXT: ldr %f8, %f0 +; VECTOR-NEXT: ldr %f0, %f14 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: aebr %f0, %f0 +; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT +; VECTOR-NEXT: ldr %f14, %f0 +; VECTOR-NEXT: ldr %f0, %f13 +; 
VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: aebr %f0, %f0 +; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT +; VECTOR-NEXT: ldr %f13, %f0 +; VECTOR-NEXT: ldr %f0, %f12 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: aebr %f0, %f0 +; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT +; VECTOR-NEXT: ldr %f12, %f0 +; VECTOR-NEXT: ldr %f0, %f11 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: aebr %f0, %f0 +; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT +; VECTOR-NEXT: vsteh %v0, 14(%r13), 0 +; VECTOR-NEXT: vsteh %v12, 12(%r13), 0 +; VECTOR-NEXT: vsteh %v13, 10(%r13), 0 +; VECTOR-NEXT: vsteh %v14, 8(%r13), 0 +; VECTOR-NEXT: vsteh %v8, 6(%r13), 0 +; VECTOR-NEXT: vsteh %v9, 4(%r13), 0 +; VECTOR-NEXT: vsteh %v10, 2(%r13), 0 +; VECTOR-NEXT: vsteh %v15, 0(%r13), 0 +; VECTOR-NEXT: ld %f8, 216(%r15) # 8-byte Folded Reload +; VECTOR-NEXT: ld %f9, 208(%r15) # 8-byte Folded Reload +; VECTOR-NEXT: ld %f10, 200(%r15) # 8-byte Folded Reload +; VECTOR-NEXT: ld %f11, 192(%r15) # 8-byte Folded Reload +; VECTOR-NEXT: ld %f12, 184(%r15) # 8-byte Folded Reload +; VECTOR-NEXT: ld %f13, 176(%r15) # 8-byte Folded Reload +; VECTOR-NEXT: ld %f14, 168(%r15) # 8-byte Folded Reload +; VECTOR-NEXT: ld %f15, 160(%r15) # 8-byte Folded Reload +; VECTOR-NEXT: lmg %r13, %r15, 328(%r15) +; VECTOR-NEXT: br %r14 +entry: + %Res = fadd <8 x half> %Op, %Op + ret <8 x half> %Res +} + +; Same, but with partial vector values. +define <4 x half> @fun1(<4 x half> %Op) { +; NOVEC-LABEL: fun1: +; NOVEC: # %bb.0: # %entry +; NOVEC-NEXT: stmg %r14, %r15, 112(%r15) +; NOVEC-NEXT: .cfi_offset %r14, -48 +; NOVEC-NEXT: .cfi_offset %r15, -40 +; NOVEC-NEXT: aghi %r15, -192 +; NOVEC-NEXT: .cfi_def_cfa_offset 352 +; NOVEC-NEXT: std %f8, 184(%r15) # 8-byte Folded Spill +; NOVEC-NEXT: std %f9, 176(%r15) # 8-byte Folded Spill +; NOVEC-NEXT: std %f10, 168(%r15) # 8-byte Folded Spill +; NOVEC-NEXT: std %f11, 160(%r15) # 8-byte Folded Spill +; NOVEC-NEXT: .cfi_offset %f8, -168 +; NOVEC-NEXT: .cfi_offset %f9, -176 +; NOVEC-NEXT: .cfi_offset %f10, -184 +; NOVEC-NEXT: .cfi_offset %f11, -192 +; NOVEC-NEXT: ler %f8, %f6 +; NOVEC-NEXT: ler %f9, %f4 +; NOVEC-NEXT: ler %f10, %f2 +; NOVEC-NEXT: brasl %r14, __extendhfsf2@PLT +; NOVEC-NEXT: aebr %f0, %f0 +; NOVEC-NEXT: brasl %r14, __truncsfhf2@PLT +; NOVEC-NEXT: ler %f11, %f0 +; NOVEC-NEXT: ler %f0, %f10 +; NOVEC-NEXT: brasl %r14, __extendhfsf2@PLT +; NOVEC-NEXT: aebr %f0, %f0 +; NOVEC-NEXT: brasl %r14, __truncsfhf2@PLT +; NOVEC-NEXT: ler %f10, %f0 +; NOVEC-NEXT: ler %f0, %f9 +; NOVEC-NEXT: brasl %r14, __extendhfsf2@PLT +; NOVEC-NEXT: aebr %f0, %f0 +; NOVEC-NEXT: brasl %r14, __truncsfhf2@PLT +; NOVEC-NEXT: ler %f9, %f0 +; NOVEC-NEXT: ler %f0, %f8 +; NOVEC-NEXT: brasl %r14, __extendhfsf2@PLT +; NOVEC-NEXT: aebr %f0, %f0 +; NOVEC-NEXT: brasl %r14, __truncsfhf2@PLT +; NOVEC-NEXT: ler %f6, %f0 +; NOVEC-NEXT: ler %f0, %f11 +; NOVEC-NEXT: ler %f2, %f10 +; NOVEC-NEXT: ler %f4, %f9 +; NOVEC-NEXT: ld %f8, 184(%r15) # 8-byte Folded Reload +; NOVEC-NEXT: ld %f9, 176(%r15) # 8-byte Folded Reload +; NOVEC-NEXT: ld %f10, 168(%r15) # 8-byte Folded Reload +; NOVEC-NEXT: ld %f11, 160(%r15) # 8-byte Folded Reload +; NOVEC-NEXT: lmg %r14, %r15, 304(%r15) +; NOVEC-NEXT: br %r14 +; +; VECTOR-LABEL: fun1: +; VECTOR: # %bb.0: # %entry +; VECTOR-NEXT: stmg %r14, %r15, 112(%r15) +; VECTOR-NEXT: .cfi_offset %r14, -48 +; VECTOR-NEXT: .cfi_offset %r15, -40 +; VECTOR-NEXT: aghi %r15, -192 +; VECTOR-NEXT: .cfi_def_cfa_offset 352 +; VECTOR-NEXT: std %f8, 184(%r15) # 8-byte Folded Spill +; VECTOR-NEXT: std %f9, 176(%r15) 
# 8-byte Folded Spill +; VECTOR-NEXT: std %f10, 168(%r15) # 8-byte Folded Spill +; VECTOR-NEXT: std %f11, 160(%r15) # 8-byte Folded Spill +; VECTOR-NEXT: .cfi_offset %f8, -168 +; VECTOR-NEXT: .cfi_offset %f9, -176 +; VECTOR-NEXT: .cfi_offset %f10, -184 +; VECTOR-NEXT: .cfi_offset %f11, -192 +; VECTOR-NEXT: ldr %f8, %f6 +; VECTOR-NEXT: ldr %f9, %f4 +; VECTOR-NEXT: ldr %f10, %f2 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: aebr %f0, %f0 +; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT +; VECTOR-NEXT: ldr %f11, %f0 +; VECTOR-NEXT: ldr %f0, %f10 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: aebr %f0, %f0 +; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT +; VECTOR-NEXT: ldr %f10, %f0 +; VECTOR-NEXT: ldr %f0, %f9 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: aebr %f0, %f0 +; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT +; VECTOR-NEXT: ldr %f9, %f0 +; VECTOR-NEXT: ldr %f0, %f8 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: aebr %f0, %f0 +; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT +; VECTOR-NEXT: ldr %f6, %f0 +; VECTOR-NEXT: ldr %f0, %f11 +; VECTOR-NEXT: ldr %f2, %f10 +; VECTOR-NEXT: ldr %f4, %f9 +; VECTOR-NEXT: ld %f8, 184(%r15) # 8-byte Folded Reload +; VECTOR-NEXT: ld %f9, 176(%r15) # 8-byte Folded Reload +; VECTOR-NEXT: ld %f10, 168(%r15) # 8-byte Folded Reload +; VECTOR-NEXT: ld %f11, 160(%r15) # 8-byte Folded Reload +; VECTOR-NEXT: lmg %r14, %r15, 304(%r15) +; VECTOR-NEXT: br %r14 +entry: + %Res = fadd <4 x half> %Op, %Op + ret <4 x half> %Res +} + +; Test a vector extension. +define <2 x half> @fun2(<2 x half> %Op) { +; NOVEC-LABEL: fun2: +; NOVEC: # %bb.0: # %entry +; NOVEC-NEXT: stmg %r14, %r15, 112(%r15) +; NOVEC-NEXT: .cfi_offset %r14, -48 +; NOVEC-NEXT: .cfi_offset %r15, -40 +; NOVEC-NEXT: aghi %r15, -176 +; NOVEC-NEXT: .cfi_def_cfa_offset 336 +; NOVEC-NEXT: std %f8, 168(%r15) # 8-byte Folded Spill +; NOVEC-NEXT: std %f9, 160(%r15) # 8-byte Folded Spill +; NOVEC-NEXT: .cfi_offset %f8, -168 +; NOVEC-NEXT: .cfi_offset %f9, -176 +; NOVEC-NEXT: ler %f8, %f2 +; NOVEC-NEXT: brasl %r14, __extendhfdf2@PLT +; NOVEC-NEXT: ldr %f9, %f0 +; NOVEC-NEXT: ler %f0, %f8 +; NOVEC-NEXT: brasl %r14, __extendhfdf2@PLT +; NOVEC-NEXT: adbr %f9, %f9 +; NOVEC-NEXT: ldr %f8, %f0 +; NOVEC-NEXT: adbr %f8, %f0 +; NOVEC-NEXT: ldr %f0, %f9 +; NOVEC-NEXT: brasl %r14, __truncdfhf2@PLT +; NOVEC-NEXT: ler %f9, %f0 +; NOVEC-NEXT: ldr %f0, %f8 +; NOVEC-NEXT: brasl %r14, __truncdfhf2@PLT +; NOVEC-NEXT: ler %f2, %f0 +; NOVEC-NEXT: ler %f0, %f9 +; NOVEC-NEXT: ld %f8, 168(%r15) # 8-byte Folded Reload +; NOVEC-NEXT: ld %f9, 160(%r15) # 8-byte Folded Reload +; NOVEC-NEXT: lmg %r14, %r15, 288(%r15) +; NOVEC-NEXT: br %r14 +; +; VECTOR-LABEL: fun2: +; VECTOR: # %bb.0: # %entry +; VECTOR-NEXT: stmg %r14, %r15, 112(%r15) +; VECTOR-NEXT: .cfi_offset %r14, -48 +; VECTOR-NEXT: .cfi_offset %r15, -40 +; VECTOR-NEXT: aghi %r15, -184 +; VECTOR-NEXT: .cfi_def_cfa_offset 344 +; VECTOR-NEXT: std %f8, 176(%r15) # 8-byte Folded Spill +; VECTOR-NEXT: .cfi_offset %f8, -168 +; VECTOR-NEXT: ldr %f8, %f0 +; VECTOR-NEXT: ldr %f0, %f2 +; VECTOR-NEXT: brasl %r14, __extendhfdf2@PLT +; VECTOR-NEXT: # kill: def $f0d killed $f0d def $v0 +; VECTOR-NEXT: vst %v0, 160(%r15), 3 # 16-byte Folded Spill +; VECTOR-NEXT: ldr %f0, %f8 +; VECTOR-NEXT: brasl %r14, __extendhfdf2@PLT +; VECTOR-NEXT: vl %v1, 160(%r15), 3 # 16-byte Folded Reload +; VECTOR-NEXT: # kill: def $f0d killed $f0d def $v0 +; VECTOR-NEXT: vmrhg %v0, %v0, %v1 +; VECTOR-NEXT: vfadb %v0, %v0, %v0 +; VECTOR-NEXT: vst %v0, 160(%r15), 3 # 16-byte 
Folded Spill +; VECTOR-NEXT: # kill: def $f0d killed $f0d killed $v0 +; VECTOR-NEXT: brasl %r14, __truncdfhf2@PLT +; VECTOR-NEXT: ldr %f8, %f0 +; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Folded Reload +; VECTOR-NEXT: vrepg %v0, %v0, 1 +; VECTOR-NEXT: # kill: def $f0d killed $f0d killed $v0 +; VECTOR-NEXT: brasl %r14, __truncdfhf2@PLT +; VECTOR-NEXT: ldr %f2, %f0 +; VECTOR-NEXT: ldr %f0, %f8 +; VECTOR-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload +; VECTOR-NEXT: lmg %r14, %r15, 296(%r15) +; VECTOR-NEXT: br %r14 +entry: + %E = fpext <2 x half> %Op to <2 x double> + %Add = fadd <2 x double> %E, %E + %Res = fptrunc <2 x double> %Add to <2 x half> + ret <2 x half> %Res +} + +; Load and store an <8 x half> vector. +define void @fun3(ptr %Src, ptr %Dst) { +; NOVEC-LABEL: fun3: +; NOVEC: # %bb.0: # %entry +; NOVEC-NEXT: lh %r0, 0(%r2) +; NOVEC-NEXT: sll %r0, 16 +; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32 +; NOVEC-NEXT: ldgr %f0, %r0 +; NOVEC-NEXT: lh %r0, 2(%r2) +; NOVEC-NEXT: sll %r0, 16 +; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32 +; NOVEC-NEXT: ldgr %f1, %r0 +; NOVEC-NEXT: lh %r0, 4(%r2) +; NOVEC-NEXT: sll %r0, 16 +; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32 +; NOVEC-NEXT: ldgr %f2, %r0 +; NOVEC-NEXT: lh %r0, 6(%r2) +; NOVEC-NEXT: sll %r0, 16 +; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32 +; NOVEC-NEXT: ldgr %f3, %r0 +; NOVEC-NEXT: lh %r0, 8(%r2) +; NOVEC-NEXT: sll %r0, 16 +; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32 +; NOVEC-NEXT: ldgr %f4, %r0 +; NOVEC-NEXT: lh %r0, 10(%r2) +; NOVEC-NEXT: sll %r0, 16 +; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32 +; NOVEC-NEXT: ldgr %f5, %r0 +; NOVEC-NEXT: lh %r0, 12(%r2) +; NOVEC-NEXT: sll %r0, 16 +; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32 +; NOVEC-NEXT: ldgr %f6, %r0 +; NOVEC-NEXT: lh %r0, 14(%r2) +; NOVEC-NEXT: sll %r0, 16 +; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32 +; NOVEC-NEXT: ldgr %f7, %r0 +; NOVEC-NEXT: lgdr %r0, %f7 +; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32 +; NOVEC-NEXT: srl %r0, 16 +; NOVEC-NEXT: sth %r0, 14(%r3) +; NOVEC-NEXT: lgdr %r0, %f6 +; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32 +; NOVEC-NEXT: srl %r0, 16 +; NOVEC-NEXT: sth %r0, 12(%r3) +; NOVEC-NEXT: lgdr %r0, %f5 +; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32 +; NOVEC-NEXT: srl %r0, 16 +; NOVEC-NEXT: sth %r0, 10(%r3) +; NOVEC-NEXT: lgdr %r0, %f4 +; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32 +; NOVEC-NEXT: srl %r0, 16 +; NOVEC-NEXT: sth %r0, 8(%r3) +; NOVEC-NEXT: lgdr %r0, %f3 +; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32 +; NOVEC-NEXT: srl %r0, 16 +; NOVEC-NEXT: sth %r0, 6(%r3) +; NOVEC-NEXT: lgdr %r0, %f2 +; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32 +; NOVEC-NEXT: srl %r0, 16 +; NOVEC-NEXT: sth %r0, 4(%r3) +; NOVEC-NEXT: lgdr %r0, %f1 +; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32 +; NOVEC-NEXT: srl %r0, 16 +; NOVEC-NEXT: sth %r0, 2(%r3) +; NOVEC-NEXT: lgdr %r0, %f0 +; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32 +; NOVEC-NEXT: srl %r0, 16 +; NOVEC-NEXT: sth %r0, 0(%r3) +; NOVEC-NEXT: br %r14 +; +; VECTOR-LABEL: fun3: +; VECTOR: # %bb.0: # %entry +; VECTOR-NEXT: vlreph %v0, 0(%r2) +; VECTOR-NEXT: vlreph %v1, 2(%r2) +; VECTOR-NEXT: vlreph %v2, 4(%r2) +; VECTOR-NEXT: vlreph %v3, 6(%r2) +; VECTOR-NEXT: vlreph %v4, 8(%r2) +; VECTOR-NEXT: vlreph %v5, 10(%r2) +; VECTOR-NEXT: vlreph %v6, 12(%r2) +; VECTOR-NEXT: vlreph %v7, 14(%r2) +; VECTOR-NEXT: vsteh %v7, 14(%r3), 0 +; VECTOR-NEXT: vsteh %v6, 12(%r3), 0 +; VECTOR-NEXT: vsteh %v5, 10(%r3), 0 +; VECTOR-NEXT: vsteh %v4, 8(%r3), 0 +; VECTOR-NEXT: vsteh %v3, 6(%r3), 0 +; VECTOR-NEXT: vsteh %v2, 4(%r3), 0 +; VECTOR-NEXT: vsteh %v1, 2(%r3), 0 +; VECTOR-NEXT: vsteh %v0, 0(%r3), 0 +; 
VECTOR-NEXT: br %r14 +entry: + %L = load <8 x half>, ptr %Src + store <8 x half> %L, ptr %Dst + ret void +} + +; Call a function with <8 x half> argument and return values. +declare <8 x half> @foo(<8 x half>) +define void @fun4(ptr %Src, ptr %Dst) { +; NOVEC-LABEL: fun4: +; NOVEC: # %bb.0: # %entry +; NOVEC-NEXT: stmg %r13, %r15, 104(%r15) +; NOVEC-NEXT: .cfi_offset %r13, -56 +; NOVEC-NEXT: .cfi_offset %r14, -48 +; NOVEC-NEXT: .cfi_offset %r15, -40 +; NOVEC-NEXT: aghi %r15, -208 +; NOVEC-NEXT: .cfi_def_cfa_offset 368 +; NOVEC-NEXT: lh %r0, 0(%r2) +; NOVEC-NEXT: sll %r0, 16 +; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32 +; NOVEC-NEXT: ldgr %f0, %r0 +; NOVEC-NEXT: lh %r0, 2(%r2) +; NOVEC-NEXT: sll %r0, 16 +; NOVEC-NEXT: # kill: def $f0h killed $f0h killed $f0d +; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32 +; NOVEC-NEXT: ldgr %f2, %r0 +; NOVEC-NEXT: # kill: def $f2h killed $f2h killed $f2d +; NOVEC-NEXT: lh %r0, 4(%r2) +; NOVEC-NEXT: sll %r0, 16 +; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32 +; NOVEC-NEXT: ldgr %f4, %r0 +; NOVEC-NEXT: # kill: def $f4h killed $f4h killed $f4d +; NOVEC-NEXT: lh %r0, 6(%r2) +; NOVEC-NEXT: sll %r0, 16 +; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32 +; NOVEC-NEXT: ldgr %f6, %r0 +; NOVEC-NEXT: # kill: def $f6h killed $f6h killed $f6d +; NOVEC-NEXT: lh %r0, 8(%r2) +; NOVEC-NEXT: sll %r0, 16 +; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32 +; NOVEC-NEXT: ldgr %f1, %r0 +; NOVEC-NEXT: lh %r0, 10(%r2) +; NOVEC-NEXT: sll %r0, 16 +; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32 +; NOVEC-NEXT: ldgr %f3, %r0 +; NOVEC-NEXT: lh %r0, 12(%r2) +; NOVEC-NEXT: sll %r0, 16 +; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32 +; NOVEC-NEXT: ldgr %f5, %r0 +; NOVEC-NEXT: lh %r0, 14(%r2) +; NOVEC-NEXT: sll %r0, 16 +; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32 +; NOVEC-NEXT: ldgr %f7, %r0 +; NOVEC-NEXT: lgdr %r0, %f7 +; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32 +; NOVEC-NEXT: srl %r0, 16 +; NOVEC-NEXT: sth %r0, 190(%r15) +; NOVEC-NEXT: lgdr %r0, %f5 +; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32 +; NOVEC-NEXT: srl %r0, 16 +; NOVEC-NEXT: sth %r0, 182(%r15) +; NOVEC-NEXT: lgdr %r0, %f3 +; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32 +; NOVEC-NEXT: srl %r0, 16 +; NOVEC-NEXT: sth %r0, 174(%r15) +; NOVEC-NEXT: lgdr %r0, %f1 +; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32 +; NOVEC-NEXT: srl %r0, 16 +; NOVEC-NEXT: la %r2, 192(%r15) +; NOVEC-NEXT: lgr %r13, %r3 +; NOVEC-NEXT: sth %r0, 166(%r15) +; NOVEC-NEXT: brasl %r14, foo@PLT +; NOVEC-NEXT: lh %r0, 192(%r15) +; NOVEC-NEXT: sll %r0, 16 +; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32 +; NOVEC-NEXT: ldgr %f0, %r0 +; NOVEC-NEXT: lh %r0, 194(%r15) +; NOVEC-NEXT: sll %r0, 16 +; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32 +; NOVEC-NEXT: ldgr %f1, %r0 +; NOVEC-NEXT: lh %r0, 196(%r15) +; NOVEC-NEXT: sll %r0, 16 +; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32 +; NOVEC-NEXT: ldgr %f2, %r0 +; NOVEC-NEXT: lh %r0, 198(%r15) +; NOVEC-NEXT: sll %r0, 16 +; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32 +; NOVEC-NEXT: ldgr %f3, %r0 +; NOVEC-NEXT: lh %r0, 200(%r15) +; NOVEC-NEXT: sll %r0, 16 +; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32 +; NOVEC-NEXT: ldgr %f4, %r0 +; NOVEC-NEXT: lh %r0, 202(%r15) +; NOVEC-NEXT: sll %r0, 16 +; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32 +; NOVEC-NEXT: ldgr %f5, %r0 +; NOVEC-NEXT: lh %r0, 204(%r15) +; NOVEC-NEXT: sll %r0, 16 +; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32 +; NOVEC-NEXT: ldgr %f6, %r0 +; NOVEC-NEXT: lh %r0, 206(%r15) +; NOVEC-NEXT: sll %r0, 16 +; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32 +; NOVEC-NEXT: ldgr %f7, %r0 +; NOVEC-NEXT: lgdr %r0, %f7 +; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32 +; NOVEC-NEXT: 
srl %r0, 16 +; NOVEC-NEXT: sth %r0, 14(%r13) +; NOVEC-NEXT: lgdr %r0, %f6 +; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32 +; NOVEC-NEXT: srl %r0, 16 +; NOVEC-NEXT: sth %r0, 12(%r13) +; NOVEC-NEXT: lgdr %r0, %f5 +; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32 +; NOVEC-NEXT: srl %r0, 16 +; NOVEC-NEXT: sth %r0, 10(%r13) +; NOVEC-NEXT: lgdr %r0, %f4 +; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32 +; NOVEC-NEXT: srl %r0, 16 +; NOVEC-NEXT: sth %r0, 8(%r13) +; NOVEC-NEXT: lgdr %r0, %f3 +; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32 +; NOVEC-NEXT: srl %r0, 16 +; NOVEC-NEXT: sth %r0, 6(%r13) +; NOVEC-NEXT: lgdr %r0, %f2 +; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32 +; NOVEC-NEXT: srl %r0, 16 +; NOVEC-NEXT: sth %r0, 4(%r13) +; NOVEC-NEXT: lgdr %r0, %f1 +; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32 +; NOVEC-NEXT: srl %r0, 16 +; NOVEC-NEXT: sth %r0, 2(%r13) +; NOVEC-NEXT: lgdr %r0, %f0 +; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32 +; NOVEC-NEXT: srl %r0, 16 +; NOVEC-NEXT: sth %r0, 0(%r13) +; NOVEC-NEXT: lmg %r13, %r15, 312(%r15) +; NOVEC-NEXT: br %r14 +; +; VECTOR-LABEL: fun4: +; VECTOR: # %bb.0: # %entry +; VECTOR-NEXT: stmg %r13, %r15, 104(%r15) +; VECTOR-NEXT: .cfi_offset %r13, -56 +; VECTOR-NEXT: .cfi_offset %r14, -48 +; VECTOR-NEXT: .cfi_offset %r15, -40 +; VECTOR-NEXT: aghi %r15, -208 +; VECTOR-NEXT: .cfi_def_cfa_offset 368 +; VECTOR-NEXT: vlreph %v6, 6(%r2) +; VECTOR-NEXT: vlreph %v4, 4(%r2) +; VECTOR-NEXT: vlreph %v2, 2(%r2) +; VECTOR-NEXT: vlreph %v0, 0(%r2) +; VECTOR-NEXT: vlreph %v1, 8(%r2) +; VECTOR-NEXT: vlreph %v3, 10(%r2) +; VECTOR-NEXT: vlreph %v5, 12(%r2) +; VECTOR-NEXT: vlreph %v7, 14(%r2) +; VECTOR-NEXT: la %r2, 192(%r15) +; VECTOR-NEXT: lgr %r13, %r3 +; VECTOR-NEXT: vsteh %v7, 190(%r15), 0 +; VECTOR-NEXT: vsteh %v5, 182(%r15), 0 +; VECTOR-NEXT: vsteh %v3, 174(%r15), 0 +; VECTOR-NEXT: vsteh %v1, 166(%r15), 0 +; VECTOR-NEXT: brasl %r14, foo@PLT +; VECTOR-NEXT: vlreph %v0, 192(%r15) +; VECTOR-NEXT: vlreph %v1, 194(%r15) +; VECTOR-NEXT: vlreph %v2, 196(%r15) +; VECTOR-NEXT: vlreph %v3, 198(%r15) +; VECTOR-NEXT: vlreph %v4, 200(%r15) +; VECTOR-NEXT: vlreph %v5, 202(%r15) +; VECTOR-NEXT: vlreph %v6, 204(%r15) +; VECTOR-NEXT: vlreph %v7, 206(%r15) +; VECTOR-NEXT: vsteh %v7, 14(%r13), 0 +; VECTOR-NEXT: vsteh %v6, 12(%r13), 0 +; VECTOR-NEXT: vsteh %v5, 10(%r13), 0 +; VECTOR-NEXT: vsteh %v4, 8(%r13), 0 +; VECTOR-NEXT: vsteh %v3, 6(%r13), 0 +; VECTOR-NEXT: vsteh %v2, 4(%r13), 0 +; VECTOR-NEXT: vsteh %v1, 2(%r13), 0 +; VECTOR-NEXT: vsteh %v0, 0(%r13), 0 +; VECTOR-NEXT: lmg %r13, %r15, 312(%r15) +; VECTOR-NEXT: br %r14 +entry: + %arg = load <8 x half>, ptr %Src + %Res = call <8 x half> @foo(<8 x half> %arg) + store <8 x half> %Res, ptr %Dst + ret void +} + +; Receive and pass argument fully on stack. 
+declare void @foo2(<4 x half> %dummy, <8 x half> %Arg5) +define void @fun5(<4 x half> %dummy, <8 x half> %Arg5) { +; NOVEC-LABEL: fun5: +; NOVEC: # %bb.0: +; NOVEC-NEXT: stmg %r14, %r15, 112(%r15) +; NOVEC-NEXT: .cfi_offset %r14, -48 +; NOVEC-NEXT: .cfi_offset %r15, -40 +; NOVEC-NEXT: aghi %r15, -256 +; NOVEC-NEXT: .cfi_def_cfa_offset 416 +; NOVEC-NEXT: std %f8, 248(%r15) # 8-byte Folded Spill +; NOVEC-NEXT: std %f9, 240(%r15) # 8-byte Folded Spill +; NOVEC-NEXT: std %f10, 232(%r15) # 8-byte Folded Spill +; NOVEC-NEXT: std %f11, 224(%r15) # 8-byte Folded Spill +; NOVEC-NEXT: .cfi_offset %f8, -168 +; NOVEC-NEXT: .cfi_offset %f9, -176 +; NOVEC-NEXT: .cfi_offset %f10, -184 +; NOVEC-NEXT: .cfi_offset %f11, -192 +; NOVEC-NEXT: lh %r0, 422(%r15) +; NOVEC-NEXT: sll %r0, 16 +; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32 +; NOVEC-NEXT: ldgr %f1, %r0 +; NOVEC-NEXT: lh %r0, 430(%r15) +; NOVEC-NEXT: sll %r0, 16 +; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32 +; NOVEC-NEXT: ldgr %f3, %r0 +; NOVEC-NEXT: lh %r0, 438(%r15) +; NOVEC-NEXT: sll %r0, 16 +; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32 +; NOVEC-NEXT: ldgr %f5, %r0 +; NOVEC-NEXT: lh %r0, 446(%r15) +; NOVEC-NEXT: sll %r0, 16 +; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32 +; NOVEC-NEXT: ldgr %f7, %r0 +; NOVEC-NEXT: lh %r0, 454(%r15) +; NOVEC-NEXT: sll %r0, 16 +; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32 +; NOVEC-NEXT: ldgr %f8, %r0 +; NOVEC-NEXT: lh %r0, 462(%r15) +; NOVEC-NEXT: sll %r0, 16 +; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32 +; NOVEC-NEXT: ldgr %f9, %r0 +; NOVEC-NEXT: lh %r0, 470(%r15) +; NOVEC-NEXT: sll %r0, 16 +; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32 +; NOVEC-NEXT: ldgr %f10, %r0 +; NOVEC-NEXT: lh %r0, 478(%r15) +; NOVEC-NEXT: sll %r0, 16 +; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32 +; NOVEC-NEXT: ldgr %f11, %r0 +; NOVEC-NEXT: lgdr %r0, %f11 +; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32 +; NOVEC-NEXT: srl %r0, 16 +; NOVEC-NEXT: sth %r0, 222(%r15) +; NOVEC-NEXT: lgdr %r0, %f10 +; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32 +; NOVEC-NEXT: srl %r0, 16 +; NOVEC-NEXT: sth %r0, 214(%r15) +; NOVEC-NEXT: lgdr %r0, %f9 +; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32 +; NOVEC-NEXT: srl %r0, 16 +; NOVEC-NEXT: sth %r0, 206(%r15) +; NOVEC-NEXT: lgdr %r0, %f8 +; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32 +; NOVEC-NEXT: srl %r0, 16 +; NOVEC-NEXT: sth %r0, 198(%r15) +; NOVEC-NEXT: lgdr %r0, %f7 +; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32 +; NOVEC-NEXT: srl %r0, 16 +; NOVEC-NEXT: sth %r0, 190(%r15) +; NOVEC-NEXT: lgdr %r0, %f5 +; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32 +; NOVEC-NEXT: srl %r0, 16 +; NOVEC-NEXT: sth %r0, 182(%r15) +; NOVEC-NEXT: lgdr %r0, %f3 +; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32 +; NOVEC-NEXT: srl %r0, 16 +; NOVEC-NEXT: sth %r0, 174(%r15) +; NOVEC-NEXT: lgdr %r0, %f1 +; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32 +; NOVEC-NEXT: srl %r0, 16 +; NOVEC-NEXT: sth %r0, 166(%r15) +; NOVEC-NEXT: brasl %r14, foo2@PLT +; NOVEC-NEXT: ld %f8, 248(%r15) # 8-byte Folded Reload +; NOVEC-NEXT: ld %f9, 240(%r15) # 8-byte Folded Reload +; NOVEC-NEXT: ld %f10, 232(%r15) # 8-byte Folded Reload +; NOVEC-NEXT: ld %f11, 224(%r15) # 8-byte Folded Reload +; NOVEC-NEXT: lmg %r14, %r15, 368(%r15) +; NOVEC-NEXT: br %r14 +; +; VECTOR-LABEL: fun5: +; VECTOR: # %bb.0: +; VECTOR-NEXT: stmg %r14, %r15, 112(%r15) +; VECTOR-NEXT: .cfi_offset %r14, -48 +; VECTOR-NEXT: .cfi_offset %r15, -40 +; VECTOR-NEXT: aghi %r15, -224 +; VECTOR-NEXT: .cfi_def_cfa_offset 384 +; VECTOR-NEXT: vlreph %v1, 390(%r15) +; VECTOR-NEXT: vlreph %v3, 398(%r15) +; VECTOR-NEXT: vlreph %v5, 406(%r15) +; VECTOR-NEXT: vlreph %v7, 
414(%r15) +; VECTOR-NEXT: vlreph %v16, 422(%r15) +; VECTOR-NEXT: vlreph %v17, 430(%r15) +; VECTOR-NEXT: vlreph %v18, 438(%r15) +; VECTOR-NEXT: vlreph %v19, 446(%r15) +; VECTOR-NEXT: vsteh %v19, 222(%r15), 0 +; VECTOR-NEXT: vsteh %v18, 214(%r15), 0 +; VECTOR-NEXT: vsteh %v17, 206(%r15), 0 +; VECTOR-NEXT: vsteh %v16, 198(%r15), 0 +; VECTOR-NEXT: vsteh %v7, 190(%r15), 0 +; VECTOR-NEXT: vsteh %v5, 182(%r15), 0 +; VECTOR-NEXT: vsteh %v3, 174(%r15), 0 +; VECTOR-NEXT: vsteh %v1, 166(%r15), 0 +; VECTOR-NEXT: brasl %r14, foo2@PLT +; VECTOR-NEXT: lmg %r14, %r15, 336(%r15) +; VECTOR-NEXT: br %r14 + call void @foo2(<4 x half> %dummy, <8 x half> %Arg5) + ret void +} diff --git a/llvm/test/CodeGen/SystemZ/fp-half.ll b/llvm/test/CodeGen/SystemZ/fp-half.ll new file mode 100644 index 00000000000000..cd4aa12c2b4ef0 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/fp-half.ll @@ -0,0 +1,627 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=zEC12 -verify-machineinstrs \ +; RUN: | FileCheck %s --check-prefix=NOVEC +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z16 -verify-machineinstrs \ +; RUN: | FileCheck %s --check-prefix=VECTOR +; +; Tests for 16-bit floating point (half). + +; Incoming half arguments added together and returned. +define half @fun0(half %Op0, half %Op1) { +; NOVEC-LABEL: fun0: +; NOVEC: # %bb.0: # %entry +; NOVEC-NEXT: stmg %r14, %r15, 112(%r15) +; NOVEC-NEXT: .cfi_offset %r14, -48 +; NOVEC-NEXT: .cfi_offset %r15, -40 +; NOVEC-NEXT: aghi %r15, -176 +; NOVEC-NEXT: .cfi_def_cfa_offset 336 +; NOVEC-NEXT: std %f8, 168(%r15) # 8-byte Folded Spill +; NOVEC-NEXT: std %f9, 160(%r15) # 8-byte Folded Spill +; NOVEC-NEXT: .cfi_offset %f8, -168 +; NOVEC-NEXT: .cfi_offset %f9, -176 +; NOVEC-NEXT: ler %f8, %f0 +; NOVEC-NEXT: ler %f0, %f2 +; NOVEC-NEXT: brasl %r14, __extendhfsf2@PLT +; NOVEC-NEXT: ler %f9, %f0 +; NOVEC-NEXT: ler %f0, %f8 +; NOVEC-NEXT: brasl %r14, __extendhfsf2@PLT +; NOVEC-NEXT: aebr %f0, %f9 +; NOVEC-NEXT: brasl %r14, __truncsfhf2@PLT +; NOVEC-NEXT: ld %f8, 168(%r15) # 8-byte Folded Reload +; NOVEC-NEXT: ld %f9, 160(%r15) # 8-byte Folded Reload +; NOVEC-NEXT: lmg %r14, %r15, 288(%r15) +; NOVEC-NEXT: br %r14 +; +; VECTOR-LABEL: fun0: +; VECTOR: # %bb.0: # %entry +; VECTOR-NEXT: stmg %r14, %r15, 112(%r15) +; VECTOR-NEXT: .cfi_offset %r14, -48 +; VECTOR-NEXT: .cfi_offset %r15, -40 +; VECTOR-NEXT: aghi %r15, -176 +; VECTOR-NEXT: .cfi_def_cfa_offset 336 +; VECTOR-NEXT: std %f8, 168(%r15) # 8-byte Folded Spill +; VECTOR-NEXT: std %f9, 160(%r15) # 8-byte Folded Spill +; VECTOR-NEXT: .cfi_offset %f8, -168 +; VECTOR-NEXT: .cfi_offset %f9, -176 +; VECTOR-NEXT: ldr %f8, %f0 +; VECTOR-NEXT: ldr %f0, %f2 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: ldr %f9, %f0 +; VECTOR-NEXT: ldr %f0, %f8 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: aebr %f0, %f9 +; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT +; VECTOR-NEXT: ld %f8, 168(%r15) # 8-byte Folded Reload +; VECTOR-NEXT: ld %f9, 160(%r15) # 8-byte Folded Reload +; VECTOR-NEXT: lmg %r14, %r15, 288(%r15) +; VECTOR-NEXT: br %r14 +entry: + %Res = fadd half %Op0, %Op1 + ret half %Res +} + +define half @fun1(half %Op0, half %Op1) { +; NOVEC-LABEL: fun1: +; NOVEC: # %bb.0: # %entry +; NOVEC-NEXT: stmg %r14, %r15, 112(%r15) +; NOVEC-NEXT: .cfi_offset %r14, -48 +; NOVEC-NEXT: .cfi_offset %r15, -40 +; NOVEC-NEXT: aghi %r15, -176 +; NOVEC-NEXT: .cfi_def_cfa_offset 336 +; NOVEC-NEXT: std %f8, 168(%r15) # 8-byte Folded Spill +; 
NOVEC-NEXT: std %f9, 160(%r15) # 8-byte Folded Spill +; NOVEC-NEXT: .cfi_offset %f8, -168 +; NOVEC-NEXT: .cfi_offset %f9, -176 +; NOVEC-NEXT: ler %f8, %f2 +; NOVEC-NEXT: brasl %r14, __extendhfdf2@PLT +; NOVEC-NEXT: ldr %f9, %f0 +; NOVEC-NEXT: ler %f0, %f8 +; NOVEC-NEXT: brasl %r14, __extendhfdf2@PLT +; NOVEC-NEXT: adbr %f0, %f9 +; NOVEC-NEXT: brasl %r14, __truncdfhf2@PLT +; NOVEC-NEXT: ld %f8, 168(%r15) # 8-byte Folded Reload +; NOVEC-NEXT: ld %f9, 160(%r15) # 8-byte Folded Reload +; NOVEC-NEXT: lmg %r14, %r15, 288(%r15) +; NOVEC-NEXT: br %r14 +; +; VECTOR-LABEL: fun1: +; VECTOR: # %bb.0: # %entry +; VECTOR-NEXT: stmg %r14, %r15, 112(%r15) +; VECTOR-NEXT: .cfi_offset %r14, -48 +; VECTOR-NEXT: .cfi_offset %r15, -40 +; VECTOR-NEXT: aghi %r15, -176 +; VECTOR-NEXT: .cfi_def_cfa_offset 336 +; VECTOR-NEXT: std %f8, 168(%r15) # 8-byte Folded Spill +; VECTOR-NEXT: std %f9, 160(%r15) # 8-byte Folded Spill +; VECTOR-NEXT: .cfi_offset %f8, -168 +; VECTOR-NEXT: .cfi_offset %f9, -176 +; VECTOR-NEXT: ldr %f8, %f2 +; VECTOR-NEXT: brasl %r14, __extendhfdf2@PLT +; VECTOR-NEXT: ldr %f9, %f0 +; VECTOR-NEXT: ldr %f0, %f8 +; VECTOR-NEXT: brasl %r14, __extendhfdf2@PLT +; VECTOR-NEXT: wfadb %f0, %f9, %f0 +; VECTOR-NEXT: brasl %r14, __truncdfhf2@PLT +; VECTOR-NEXT: ld %f8, 168(%r15) # 8-byte Folded Reload +; VECTOR-NEXT: ld %f9, 160(%r15) # 8-byte Folded Reload +; VECTOR-NEXT: lmg %r14, %r15, 288(%r15) +; VECTOR-NEXT: br %r14 +entry: + %E0 = fpext half %Op0 to double + %E1 = fpext half %Op1 to double + %Add = fadd double %E0, %E1 + %Res = fptrunc double %Add to half + ret half %Res +} + +define half @fun2(half %Op0, half %Op1) { +; NOVEC-LABEL: fun2: +; NOVEC: # %bb.0: # %entry +; NOVEC-NEXT: stmg %r14, %r15, 112(%r15) +; NOVEC-NEXT: .cfi_offset %r14, -48 +; NOVEC-NEXT: .cfi_offset %r15, -40 +; NOVEC-NEXT: aghi %r15, -232 +; NOVEC-NEXT: .cfi_def_cfa_offset 392 +; NOVEC-NEXT: std %f8, 224(%r15) # 8-byte Folded Spill +; NOVEC-NEXT: std %f9, 216(%r15) # 8-byte Folded Spill +; NOVEC-NEXT: std %f11, 208(%r15) # 8-byte Folded Spill +; NOVEC-NEXT: .cfi_offset %f8, -168 +; NOVEC-NEXT: .cfi_offset %f9, -176 +; NOVEC-NEXT: .cfi_offset %f11, -184 +; NOVEC-NEXT: la %r2, 160(%r15) +; NOVEC-NEXT: ler %f8, %f2 +; NOVEC-NEXT: brasl %r14, __extendhftf2@PLT +; NOVEC-NEXT: ld %f9, 160(%r15) +; NOVEC-NEXT: ld %f11, 168(%r15) +; NOVEC-NEXT: la %r2, 176(%r15) +; NOVEC-NEXT: ler %f0, %f8 +; NOVEC-NEXT: brasl %r14, __extendhftf2@PLT +; NOVEC-NEXT: ld %f0, 176(%r15) +; NOVEC-NEXT: ld %f2, 184(%r15) +; NOVEC-NEXT: la %r2, 192(%r15) +; NOVEC-NEXT: axbr %f0, %f9 +; NOVEC-NEXT: std %f0, 192(%r15) +; NOVEC-NEXT: std %f2, 200(%r15) +; NOVEC-NEXT: brasl %r14, __trunctfhf2@PLT +; NOVEC-NEXT: ld %f8, 224(%r15) # 8-byte Folded Reload +; NOVEC-NEXT: ld %f9, 216(%r15) # 8-byte Folded Reload +; NOVEC-NEXT: ld %f11, 208(%r15) # 8-byte Folded Reload +; NOVEC-NEXT: lmg %r14, %r15, 344(%r15) +; NOVEC-NEXT: br %r14 +; +; VECTOR-LABEL: fun2: +; VECTOR: # %bb.0: # %entry +; VECTOR-NEXT: stmg %r14, %r15, 112(%r15) +; VECTOR-NEXT: .cfi_offset %r14, -48 +; VECTOR-NEXT: .cfi_offset %r15, -40 +; VECTOR-NEXT: aghi %r15, -232 +; VECTOR-NEXT: .cfi_def_cfa_offset 392 +; VECTOR-NEXT: std %f8, 224(%r15) # 8-byte Folded Spill +; VECTOR-NEXT: .cfi_offset %f8, -168 +; VECTOR-NEXT: la %r2, 176(%r15) +; VECTOR-NEXT: ldr %f8, %f2 +; VECTOR-NEXT: brasl %r14, __extendhftf2@PLT +; VECTOR-NEXT: vl %v0, 176(%r15), 3 +; VECTOR-NEXT: vst %v0, 160(%r15), 3 # 16-byte Folded Spill +; VECTOR-NEXT: la %r2, 192(%r15) +; VECTOR-NEXT: ldr %f0, %f8 +; VECTOR-NEXT: brasl %r14, 
__extendhftf2@PLT +; VECTOR-NEXT: vl %v0, 192(%r15), 3 +; VECTOR-NEXT: vl %v1, 160(%r15), 3 # 16-byte Folded Reload +; VECTOR-NEXT: wfaxb %v0, %v1, %v0 +; VECTOR-NEXT: la %r2, 208(%r15) +; VECTOR-NEXT: vst %v0, 208(%r15), 3 +; VECTOR-NEXT: brasl %r14, __trunctfhf2@PLT +; VECTOR-NEXT: ld %f8, 224(%r15) # 8-byte Folded Reload +; VECTOR-NEXT: lmg %r14, %r15, 344(%r15) +; VECTOR-NEXT: br %r14 +entry: + %E0 = fpext half %Op0 to fp128 + %E1 = fpext half %Op1 to fp128 + %Add = fadd fp128 %E0, %E1 + %Res = fptrunc fp128 %Add to half + ret half %Res +} + +; Test loading and storing a half value. +define void @fun3(ptr %Src, ptr %Dst) { +; NOVEC-LABEL: fun3: +; NOVEC: # %bb.0: # %entry +; NOVEC-NEXT: lh %r0, 0(%r2) +; NOVEC-NEXT: sll %r0, 16 +; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32 +; NOVEC-NEXT: ldgr %f0, %r0 +; NOVEC-NEXT: lgdr %r0, %f0 +; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32 +; NOVEC-NEXT: srl %r0, 16 +; NOVEC-NEXT: sth %r0, 0(%r3) +; NOVEC-NEXT: br %r14 +; +; VECTOR-LABEL: fun3: +; VECTOR: # %bb.0: # %entry +; VECTOR-NEXT: vlreph %v0, 0(%r2) +; VECTOR-NEXT: vsteh %v0, 0(%r3), 0 +; VECTOR-NEXT: br %r14 +entry: + %L = load half, ptr %Src, align 2 + store half %L, ptr %Dst, align 2 + ret void +} + +define void @fun4(ptr %Src, ptr %Dst) { +; NOVEC-LABEL: fun4: +; NOVEC: # %bb.0: # %entry +; NOVEC-NEXT: stmg %r13, %r15, 104(%r15) +; NOVEC-NEXT: .cfi_offset %r13, -56 +; NOVEC-NEXT: .cfi_offset %r14, -48 +; NOVEC-NEXT: .cfi_offset %r15, -40 +; NOVEC-NEXT: aghi %r15, -160 +; NOVEC-NEXT: .cfi_def_cfa_offset 320 +; NOVEC-NEXT: lh %r0, 0(%r2) +; NOVEC-NEXT: sll %r0, 16 +; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32 +; NOVEC-NEXT: lgr %r13, %r3 +; NOVEC-NEXT: ldgr %f0, %r0 +; NOVEC-NEXT: # kill: def $f0h killed $f0h killed $f0d +; NOVEC-NEXT: brasl %r14, __extendhfdf2@PLT +; NOVEC-NEXT: adbr %f0, %f0 +; NOVEC-NEXT: brasl %r14, __truncdfhf2@PLT +; NOVEC-NEXT: # kill: def $f0h killed $f0h def $f0d +; NOVEC-NEXT: lgdr %r0, %f0 +; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32 +; NOVEC-NEXT: srl %r0, 16 +; NOVEC-NEXT: sth %r0, 0(%r13) +; NOVEC-NEXT: lmg %r13, %r15, 264(%r15) +; NOVEC-NEXT: br %r14 +; +; VECTOR-LABEL: fun4: +; VECTOR: # %bb.0: # %entry +; VECTOR-NEXT: stmg %r13, %r15, 104(%r15) +; VECTOR-NEXT: .cfi_offset %r13, -56 +; VECTOR-NEXT: .cfi_offset %r14, -48 +; VECTOR-NEXT: .cfi_offset %r15, -40 +; VECTOR-NEXT: aghi %r15, -160 +; VECTOR-NEXT: .cfi_def_cfa_offset 320 +; VECTOR-NEXT: vlreph %v0, 0(%r2) +; VECTOR-NEXT: lgr %r13, %r3 +; VECTOR-NEXT: brasl %r14, __extendhfdf2@PLT +; VECTOR-NEXT: adbr %f0, %f0 +; VECTOR-NEXT: brasl %r14, __truncdfhf2@PLT +; VECTOR-NEXT: vsteh %v0, 0(%r13), 0 +; VECTOR-NEXT: lmg %r13, %r15, 264(%r15) +; VECTOR-NEXT: br %r14 +entry: + %Op0 = load half, ptr %Src, align 2 + %E0 = fpext half %Op0 to double + %Add = fadd double %E0, %E0 + %Res = fptrunc double %Add to half + store half %Res, ptr %Dst, align 2 + ret void +} + +define void @fun5(ptr %Src, ptr %Dst) { +; NOVEC-LABEL: fun5: +; NOVEC: # %bb.0: # %entry +; NOVEC-NEXT: stmg %r13, %r15, 104(%r15) +; NOVEC-NEXT: .cfi_offset %r13, -56 +; NOVEC-NEXT: .cfi_offset %r14, -48 +; NOVEC-NEXT: .cfi_offset %r15, -40 +; NOVEC-NEXT: aghi %r15, -192 +; NOVEC-NEXT: .cfi_def_cfa_offset 352 +; NOVEC-NEXT: lh %r0, 0(%r2) +; NOVEC-NEXT: sll %r0, 16 +; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32 +; NOVEC-NEXT: la %r2, 160(%r15) +; NOVEC-NEXT: lgr %r13, %r3 +; NOVEC-NEXT: ldgr %f0, %r0 +; NOVEC-NEXT: # kill: def $f0h killed $f0h killed $f0d +; NOVEC-NEXT: brasl %r14, __extendhftf2@PLT +; NOVEC-NEXT: ld %f0, 160(%r15) +; NOVEC-NEXT: ld %f2, 168(%r15) 
+; NOVEC-NEXT: la %r2, 176(%r15) +; NOVEC-NEXT: axbr %f0, %f0 +; NOVEC-NEXT: std %f0, 176(%r15) +; NOVEC-NEXT: std %f2, 184(%r15) +; NOVEC-NEXT: brasl %r14, __trunctfhf2@PLT +; NOVEC-NEXT: # kill: def $f0h killed $f0h def $f0d +; NOVEC-NEXT: lgdr %r0, %f0 +; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32 +; NOVEC-NEXT: srl %r0, 16 +; NOVEC-NEXT: sth %r0, 0(%r13) +; NOVEC-NEXT: lmg %r13, %r15, 296(%r15) +; NOVEC-NEXT: br %r14 +; +; VECTOR-LABEL: fun5: +; VECTOR: # %bb.0: # %entry +; VECTOR-NEXT: stmg %r13, %r15, 104(%r15) +; VECTOR-NEXT: .cfi_offset %r13, -56 +; VECTOR-NEXT: .cfi_offset %r14, -48 +; VECTOR-NEXT: .cfi_offset %r15, -40 +; VECTOR-NEXT: aghi %r15, -192 +; VECTOR-NEXT: .cfi_def_cfa_offset 352 +; VECTOR-NEXT: vlreph %v0, 0(%r2) +; VECTOR-NEXT: la %r2, 160(%r15) +; VECTOR-NEXT: lgr %r13, %r3 +; VECTOR-NEXT: brasl %r14, __extendhftf2@PLT +; VECTOR-NEXT: vl %v0, 160(%r15), 3 +; VECTOR-NEXT: wfaxb %v0, %v0, %v0 +; VECTOR-NEXT: la %r2, 176(%r15) +; VECTOR-NEXT: vst %v0, 176(%r15), 3 +; VECTOR-NEXT: brasl %r14, __trunctfhf2@PLT +; VECTOR-NEXT: vsteh %v0, 0(%r13), 0 +; VECTOR-NEXT: lmg %r13, %r15, 296(%r15) +; VECTOR-NEXT: br %r14 +entry: + %Op0 = load half, ptr %Src, align 2 + %E0 = fpext half %Op0 to fp128 + %Add = fadd fp128 %E0, %E0 + %Res = fptrunc fp128 %Add to half + store half %Res, ptr %Dst, align 2 + ret void +} + +; Test a chain of half operations which should have each operation surrounded +; by conversions to/from fp32 for proper emulation. +define half @fun6(half %Op0, half %Op1, half %Op2) { +; NOVEC-LABEL: fun6: +; NOVEC: # %bb.0: # %entry +; NOVEC-NEXT: stmg %r14, %r15, 112(%r15) +; NOVEC-NEXT: .cfi_offset %r14, -48 +; NOVEC-NEXT: .cfi_offset %r15, -40 +; NOVEC-NEXT: aghi %r15, -184 +; NOVEC-NEXT: .cfi_def_cfa_offset 344 +; NOVEC-NEXT: std %f8, 176(%r15) # 8-byte Folded Spill +; NOVEC-NEXT: std %f9, 168(%r15) # 8-byte Folded Spill +; NOVEC-NEXT: std %f10, 160(%r15) # 8-byte Folded Spill +; NOVEC-NEXT: .cfi_offset %f8, -168 +; NOVEC-NEXT: .cfi_offset %f9, -176 +; NOVEC-NEXT: .cfi_offset %f10, -184 +; NOVEC-NEXT: ler %f9, %f0 +; NOVEC-NEXT: ler %f0, %f2 +; NOVEC-NEXT: ler %f8, %f4 +; NOVEC-NEXT: brasl %r14, __extendhfsf2@PLT +; NOVEC-NEXT: ler %f10, %f0 +; NOVEC-NEXT: ler %f0, %f9 +; NOVEC-NEXT: brasl %r14, __extendhfsf2@PLT +; NOVEC-NEXT: aebr %f0, %f10 +; NOVEC-NEXT: brasl %r14, __truncsfhf2@PLT +; NOVEC-NEXT: brasl %r14, __extendhfsf2@PLT +; NOVEC-NEXT: ler %f9, %f0 +; NOVEC-NEXT: ler %f0, %f8 +; NOVEC-NEXT: brasl %r14, __extendhfsf2@PLT +; NOVEC-NEXT: aebr %f0, %f9 +; NOVEC-NEXT: brasl %r14, __truncsfhf2@PLT +; NOVEC-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload +; NOVEC-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload +; NOVEC-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload +; NOVEC-NEXT: lmg %r14, %r15, 296(%r15) +; NOVEC-NEXT: br %r14 +; +; VECTOR-LABEL: fun6: +; VECTOR: # %bb.0: # %entry +; VECTOR-NEXT: stmg %r14, %r15, 112(%r15) +; VECTOR-NEXT: .cfi_offset %r14, -48 +; VECTOR-NEXT: .cfi_offset %r15, -40 +; VECTOR-NEXT: aghi %r15, -184 +; VECTOR-NEXT: .cfi_def_cfa_offset 344 +; VECTOR-NEXT: std %f8, 176(%r15) # 8-byte Folded Spill +; VECTOR-NEXT: std %f9, 168(%r15) # 8-byte Folded Spill +; VECTOR-NEXT: std %f10, 160(%r15) # 8-byte Folded Spill +; VECTOR-NEXT: .cfi_offset %f8, -168 +; VECTOR-NEXT: .cfi_offset %f9, -176 +; VECTOR-NEXT: .cfi_offset %f10, -184 +; VECTOR-NEXT: ldr %f9, %f0 +; VECTOR-NEXT: ldr %f0, %f2 +; VECTOR-NEXT: ldr %f8, %f4 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: ldr %f10, %f0 +; VECTOR-NEXT: ldr %f0, %f9 +; VECTOR-NEXT: brasl %r14, 
__extendhfsf2@PLT +; VECTOR-NEXT: aebr %f0, %f10 +; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: ldr %f9, %f0 +; VECTOR-NEXT: ldr %f0, %f8 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: wfasb %f0, %f9, %f0 +; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT +; VECTOR-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload +; VECTOR-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload +; VECTOR-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload +; VECTOR-NEXT: lmg %r14, %r15, 296(%r15) +; VECTOR-NEXT: br %r14 +entry: + %A0 = fadd half %Op0, %Op1 + %Res = fadd half %A0, %Op2 + ret half %Res +} + +; Store an incoming half argument and return a loaded one. +define half @fun7(half %Op0, ptr %Dst, ptr %Src) { +; NOVEC-LABEL: fun7: +; NOVEC: # %bb.0: # %entry +; NOVEC-NEXT: # kill: def $f0h killed $f0h def $f0d +; NOVEC-NEXT: lgdr %r0, %f0 +; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32 +; NOVEC-NEXT: srl %r0, 16 +; NOVEC-NEXT: sth %r0, 0(%r2) +; NOVEC-NEXT: lh %r0, 0(%r3) +; NOVEC-NEXT: sll %r0, 16 +; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32 +; NOVEC-NEXT: ldgr %f0, %r0 +; NOVEC-NEXT: # kill: def $f0h killed $f0h killed $f0d +; NOVEC-NEXT: br %r14 +; +; VECTOR-LABEL: fun7: +; VECTOR: # %bb.0: # %entry +; VECTOR-NEXT: vsteh %v0, 0(%r2), 0 +; VECTOR-NEXT: vlreph %v0, 0(%r3) +; VECTOR-NEXT: br %r14 +entry: + store half %Op0, ptr %Dst + %Res = load half, ptr %Src + ret half %Res +} + +; Call a function with half argument and return values. +declare half @foo(half) +define void @fun8(ptr %Src, ptr %Dst) { +; NOVEC-LABEL: fun8: +; NOVEC: # %bb.0: # %entry +; NOVEC-NEXT: stmg %r13, %r15, 104(%r15) +; NOVEC-NEXT: .cfi_offset %r13, -56 +; NOVEC-NEXT: .cfi_offset %r14, -48 +; NOVEC-NEXT: .cfi_offset %r15, -40 +; NOVEC-NEXT: aghi %r15, -160 +; NOVEC-NEXT: .cfi_def_cfa_offset 320 +; NOVEC-NEXT: lh %r0, 0(%r2) +; NOVEC-NEXT: sll %r0, 16 +; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32 +; NOVEC-NEXT: lgr %r13, %r3 +; NOVEC-NEXT: ldgr %f0, %r0 +; NOVEC-NEXT: # kill: def $f0h killed $f0h killed $f0d +; NOVEC-NEXT: brasl %r14, foo@PLT +; NOVEC-NEXT: # kill: def $f0h killed $f0h def $f0d +; NOVEC-NEXT: lgdr %r0, %f0 +; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32 +; NOVEC-NEXT: srl %r0, 16 +; NOVEC-NEXT: sth %r0, 0(%r13) +; NOVEC-NEXT: lmg %r13, %r15, 264(%r15) +; NOVEC-NEXT: br %r14 +; +; VECTOR-LABEL: fun8: +; VECTOR: # %bb.0: # %entry +; VECTOR-NEXT: stmg %r13, %r15, 104(%r15) +; VECTOR-NEXT: .cfi_offset %r13, -56 +; VECTOR-NEXT: .cfi_offset %r14, -48 +; VECTOR-NEXT: .cfi_offset %r15, -40 +; VECTOR-NEXT: aghi %r15, -160 +; VECTOR-NEXT: .cfi_def_cfa_offset 320 +; VECTOR-NEXT: vlreph %v0, 0(%r2) +; VECTOR-NEXT: lgr %r13, %r3 +; VECTOR-NEXT: brasl %r14, foo@PLT +; VECTOR-NEXT: vsteh %v0, 0(%r13), 0 +; VECTOR-NEXT: lmg %r13, %r15, 264(%r15) +; VECTOR-NEXT: br %r14 +entry: + %arg = load half, ptr %Src + %Res = call half @foo(half %arg) + store half %Res, ptr %Dst + ret void +} + +; Receive stack argument. 
+define half @fun9(half %Arg0, half %Arg1, half %Arg2, half %Arg3, half %Arg4) { +; NOVEC-LABEL: fun9: +; NOVEC: # %bb.0: +; NOVEC-NEXT: stmg %r14, %r15, 112(%r15) +; NOVEC-NEXT: .cfi_offset %r14, -48 +; NOVEC-NEXT: .cfi_offset %r15, -40 +; NOVEC-NEXT: aghi %r15, -176 +; NOVEC-NEXT: .cfi_def_cfa_offset 336 +; NOVEC-NEXT: std %f8, 168(%r15) # 8-byte Folded Spill +; NOVEC-NEXT: std %f9, 160(%r15) # 8-byte Folded Spill +; NOVEC-NEXT: .cfi_offset %f8, -168 +; NOVEC-NEXT: .cfi_offset %f9, -176 +; NOVEC-NEXT: lh %r0, 342(%r15) +; NOVEC-NEXT: sll %r0, 16 +; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32 +; NOVEC-NEXT: ler %f8, %f6 +; NOVEC-NEXT: ldgr %f0, %r0 +; NOVEC-NEXT: # kill: def $f0h killed $f0h killed $f0d +; NOVEC-NEXT: brasl %r14, __extendhfsf2@PLT +; NOVEC-NEXT: ler %f9, %f0 +; NOVEC-NEXT: ler %f0, %f8 +; NOVEC-NEXT: brasl %r14, __extendhfsf2@PLT +; NOVEC-NEXT: aebr %f0, %f9 +; NOVEC-NEXT: brasl %r14, __truncsfhf2@PLT +; NOVEC-NEXT: ld %f8, 168(%r15) # 8-byte Folded Reload +; NOVEC-NEXT: ld %f9, 160(%r15) # 8-byte Folded Reload +; NOVEC-NEXT: lmg %r14, %r15, 288(%r15) +; NOVEC-NEXT: br %r14 +; +; VECTOR-LABEL: fun9: +; VECTOR: # %bb.0: +; VECTOR-NEXT: stmg %r14, %r15, 112(%r15) +; VECTOR-NEXT: .cfi_offset %r14, -48 +; VECTOR-NEXT: .cfi_offset %r15, -40 +; VECTOR-NEXT: aghi %r15, -176 +; VECTOR-NEXT: .cfi_def_cfa_offset 336 +; VECTOR-NEXT: std %f8, 168(%r15) # 8-byte Folded Spill +; VECTOR-NEXT: std %f9, 160(%r15) # 8-byte Folded Spill +; VECTOR-NEXT: .cfi_offset %f8, -168 +; VECTOR-NEXT: .cfi_offset %f9, -176 +; VECTOR-NEXT: vlreph %v0, 342(%r15) +; VECTOR-NEXT: ldr %f8, %f6 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: ldr %f9, %f0 +; VECTOR-NEXT: ldr %f0, %f8 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: aebr %f0, %f9 +; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT +; VECTOR-NEXT: ld %f8, 168(%r15) # 8-byte Folded Reload +; VECTOR-NEXT: ld %f9, 160(%r15) # 8-byte Folded Reload +; VECTOR-NEXT: lmg %r14, %r15, 288(%r15) +; VECTOR-NEXT: br %r14 + %A0 = fadd half %Arg3, %Arg4 + ret half %A0 +} + +; Pass stack argument. +define void @fun10(half %Arg0) { +; NOVEC-LABEL: fun10: +; NOVEC: # %bb.0: +; NOVEC-NEXT: stmg %r14, %r15, 112(%r15) +; NOVEC-NEXT: .cfi_offset %r14, -48 +; NOVEC-NEXT: .cfi_offset %r15, -40 +; NOVEC-NEXT: aghi %r15, -168 +; NOVEC-NEXT: .cfi_def_cfa_offset 328 +; NOVEC-NEXT: # kill: def $f0h killed $f0h def $f0d +; NOVEC-NEXT: lgdr %r0, %f0 +; NOVEC-NEXT: risblg %r0, %r0, 0, 159, 32 +; NOVEC-NEXT: srl %r0, 16 +; NOVEC-NEXT: ler %f2, %f0 +; NOVEC-NEXT: ler %f4, %f0 +; NOVEC-NEXT: ler %f6, %f0 +; NOVEC-NEXT: sth %r0, 166(%r15) +; NOVEC-NEXT: brasl %r14, fun9@PLT +; NOVEC-NEXT: lmg %r14, %r15, 280(%r15) +; NOVEC-NEXT: br %r14 +; +; VECTOR-LABEL: fun10: +; VECTOR: # %bb.0: +; VECTOR-NEXT: stmg %r14, %r15, 112(%r15) +; VECTOR-NEXT: .cfi_offset %r14, -48 +; VECTOR-NEXT: .cfi_offset %r15, -40 +; VECTOR-NEXT: aghi %r15, -168 +; VECTOR-NEXT: .cfi_def_cfa_offset 328 +; VECTOR-NEXT: ldr %f2, %f0 +; VECTOR-NEXT: ldr %f4, %f0 +; VECTOR-NEXT: ldr %f6, %f0 +; VECTOR-NEXT: vsteh %v0, 166(%r15), 0 +; VECTOR-NEXT: brasl %r14, fun9@PLT +; VECTOR-NEXT: lmg %r14, %r15, 280(%r15) +; VECTOR-NEXT: br %r14 + call void @fun9(half %Arg0, half %Arg0, half %Arg0, half %Arg0, half %Arg0) + ret void +} + +; Test loading some immediates from the Constant Pool. 
+declare void @foo2(half, half, half, half) +define void @fun11() { +; NOVEC-LABEL: fun11: +; NOVEC: # %bb.0: # %entry +; NOVEC-NEXT: stmg %r14, %r15, 112(%r15) +; NOVEC-NEXT: .cfi_offset %r14, -48 +; NOVEC-NEXT: .cfi_offset %r15, -40 +; NOVEC-NEXT: aghi %r15, -160 +; NOVEC-NEXT: .cfi_def_cfa_offset 320 +; NOVEC-NEXT: lhrl %r0, .LCPI11_0 +; NOVEC-NEXT: sll %r0, 16 +; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32 +; NOVEC-NEXT: ldgr %f4, %r0 +; NOVEC-NEXT: lhrl %r0, .LCPI11_1 +; NOVEC-NEXT: sll %r0, 16 +; NOVEC-NEXT: # kill: def $f4h killed $f4h killed $f4d +; NOVEC-NEXT: lzer %f2 +; NOVEC-NEXT: risbhg %r0, %r0, 0, 159, 32 +; NOVEC-NEXT: lcdfr %f0, %f2 +; NOVEC-NEXT: ldgr %f6, %r0 +; NOVEC-NEXT: # kill: def $f6h killed $f6h killed $f6d +; NOVEC-NEXT: brasl %r14, foo2@PLT +; NOVEC-NEXT: lmg %r14, %r15, 272(%r15) +; NOVEC-NEXT: br %r14 +; +; VECTOR-LABEL: fun11: +; VECTOR: # %bb.0: # %entry +; VECTOR-NEXT: stmg %r14, %r15, 112(%r15) +; VECTOR-NEXT: .cfi_offset %r14, -48 +; VECTOR-NEXT: .cfi_offset %r15, -40 +; VECTOR-NEXT: aghi %r15, -160 +; VECTOR-NEXT: .cfi_def_cfa_offset 320 +; VECTOR-NEXT: lzer %f2 +; VECTOR-NEXT: vrepih %v4, 13824 +; VECTOR-NEXT: vrepih %v6, 15360 +; VECTOR-NEXT: lcdfr %f0, %f2 +; VECTOR-NEXT: brasl %r14, foo2@PLT +; VECTOR-NEXT: lmg %r14, %r15, 272(%r15) +; VECTOR-NEXT: br %r14 +entry: + call void @foo2(half -0.0, half 0.0, half 0.375, half 1.0) + ret void +} + +; Test a tail call. +declare void @foo3(half) +define void @fun12(half %Arg0) { +; NOVEC-LABEL: fun12: +; NOVEC: # %bb.0: # %entry +; NOVEC-NEXT: jg foo3@PLT +; +; VECTOR-LABEL: fun12: +; VECTOR: # %bb.0: # %entry +; VECTOR-NEXT: jg foo3@PLT +entry: + tail call void @foo3(half %Arg0) + ret void +} diff --git a/llvm/test/CodeGen/SystemZ/fp-round-03.ll b/llvm/test/CodeGen/SystemZ/fp-round-03.ll index d35cafc406ad77..e7a9c0fa6e87aa 100644 --- a/llvm/test/CodeGen/SystemZ/fp-round-03.ll +++ b/llvm/test/CodeGen/SystemZ/fp-round-03.ll @@ -1,6 +1,19 @@ ; Test rounding functions for z14 and above. ; -; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 -verify-machineinstrs \ +; RUN: | FileCheck %s + +; Test that an f16 intrinsic can be lowered with promotion to float. +declare half @llvm.rint.f16(half %f) +define half @f0(half %f) { +; CHECK-LABEL: f0: +; CHECK: brasl %r14, __extendhfsf2@PLT +; CHECK: fiebra %f0, 0, %f0, 0 +; CHECK: brasl %r14, __truncsfhf2@PLT +; CHECK: br %r14 + %res = call half @llvm.rint.f16(half %f) + ret half %res +} ; Test rint for f32. declare float @llvm.rint.f32(float %f) diff --git a/llvm/test/CodeGen/SystemZ/spill-half-01.mir b/llvm/test/CodeGen/SystemZ/spill-half-01.mir new file mode 100644 index 00000000000000..56f4ecbffd2c63 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/spill-half-01.mir @@ -0,0 +1,47 @@ +# RUN: llc -o - %s -mtriple=s390x-linux-gnu -mcpu=zEC12 -verify-machineinstrs \ +# RUN: -start-before=greedy | FileCheck %s -check-prefix=CHECK +# RUN: llc -o - %s -mtriple=s390x-linux-gnu -mcpu=z16 -verify-machineinstrs \ +# RUN: -start-before=greedy | FileCheck %s -check-prefix=VECTOR + +# Test spilling / reloading of an fp16bit virtual register. 
+ +--- +name: fun0 +alignment: 16 +tracksRegLiveness: true +registers: + - { id: 0, class: fp16bit } +liveins: + - { reg: '$f0h', virtual-reg: '%0' } +frameInfo: + maxAlignment: 1 +machineFunctionInfo: {} +body: | + bb.0: + liveins: $f0h + + ; CHECK-LABEL: fun0: + ; CHECK-NOT: $f0 + ; CHECK: # kill: def $f0h killed $f0h killed $f0d def $f0d + ; CHECK-NEXT: lgdr %r0, %f0 + ; CHECK-NEXT: srlg %r0, %r0, 48 + ; CHECK-NEXT: sth %r0, 166(%r15) # 2-byte Folded Spill + ; CHECK-NEXT: #APP + ; CHECK-NEXT: #NO_APP + ; CHECK: lh %r0, 166(%r15) # 2-byte Folded Reload + ; CHECK-NEXT: sllg %r0, %r0, 48 + ; CHECK-NEXT: ldgr %f0, %r0 + ; CHECK: # kill: def $f0h killed $f0h killed $f0d + ; CHECK-NOT: $f0 + + ; VECTOR-LABEL: fun0: + ; VECTOR: vsteh %v0, 166(%r15), 0 # 2-byte Folded Spill + ; VECTOR-NEXT: #APP + ; VECTOR-NEXT: #NO_APP + ; VECTOR-NEXT: vlreph %v0, 166(%r15) # 2-byte Folded Reload + + %0:fp16bit = COPY $f0h + INLINEASM &"", 1, 12, implicit-def dead early-clobber $r0d, 12, implicit-def dead early-clobber $r1d, 12, implicit-def dead early-clobber $r2d, 12, implicit-def dead early-clobber $r3d, 12, implicit-def dead early-clobber $r4d, 12, implicit-def dead early-clobber $r5d, 12, implicit-def dead early-clobber $r6d, 12, implicit-def dead early-clobber $r7d, 12, implicit-def dead early-clobber $r8d, 12, implicit-def dead early-clobber $r9d, 12, implicit-def dead early-clobber $r10d, 12, implicit-def dead early-clobber $r11d, 12, implicit-def dead early-clobber $r12d, 12, implicit-def dead early-clobber $r13d, 12, implicit-def dead early-clobber $r14d, 12, implicit-def dead early-clobber $f0d, 12, implicit-def dead early-clobber $f1d, 12, implicit-def dead early-clobber $f2d, 12, implicit-def dead early-clobber $f3d, 12, implicit-def dead early-clobber $f4d, 12, implicit-def dead early-clobber $f5d, 12, implicit-def dead early-clobber $f6d, 12, implicit-def dead early-clobber $f7d, 12, implicit-def dead early-clobber $f8d, 12, implicit-def dead early-clobber $f9d, 12, implicit-def dead early-clobber $f10d, 12, implicit-def dead early-clobber $f11d, 12, implicit-def dead early-clobber $f12d, 12, implicit-def dead early-clobber $f13d, 12, implicit-def dead early-clobber $f14d, 12, implicit-def dead early-clobber $f14d, 12, implicit-def dead early-clobber $f15d + $f0h = COPY %0 + Return implicit $f0h +... diff --git a/llvm/test/CodeGen/SystemZ/spill-half-02.mir b/llvm/test/CodeGen/SystemZ/spill-half-02.mir new file mode 100644 index 00000000000000..4934d0b7281157 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/spill-half-02.mir @@ -0,0 +1,40 @@ +# RUN: llc -o - %s -mtriple=s390x-linux-gnu -mcpu=z16 -verify-machineinstrs \ +# RUN: -start-before=greedy | FileCheck %s + +# Test spilling / reloading of an vr16bit virtual register. 
+ +--- +name: fun0 +alignment: 16 +tracksRegLiveness: true +registers: + - { id: 0, class: addr64bit } + - { id: 1, class: addr64bit } + - { id: 2, class: vr16bit } +liveins: + - { reg: '$r2d', virtual-reg: '%0' } + - { reg: '$r3d', virtual-reg: '%1' } +frameInfo: + maxAlignment: 1 +machineFunctionInfo: {} +body: | + bb.0: + liveins: $r2d, $r3d + + ; CHECK-LABEL: fun0: + ; CHECK: stg %r3, 168(%r15) # 8-byte Folded Spill + ; CHECK-NEXT: vlreph %v0, 0(%r2) + ; CHECK-NEXT: vsteh %v0, 166(%r15), 0 # 2-byte Folded Spill + ; CHECK-NEXT: #APP + ; CHECK-NEXT: #NO_APP + ; CHECK-NEXT: lg %r1, 168(%r15) # 8-byte Folded Reload + ; CHECK-NEXT: vlreph %v0, 166(%r15) # 2-byte Folded Reload + ; CHECK-NEXT: vsteh %v0, 0(%r1), 0 + + %1:addr64bit = COPY $r3d + %0:addr64bit = COPY $r2d + %2:vr16bit = VL16 %0, 0, $noreg + INLINEASM &"", 1, 12, implicit-def dead early-clobber $r0d, 12, implicit-def dead early-clobber $r1d, 12, implicit-def dead early-clobber $r2d, 12, implicit-def dead early-clobber $r3d, 12, implicit-def dead early-clobber $r4d, 12, implicit-def dead early-clobber $r5d, 12, implicit-def dead early-clobber $r6d, 12, implicit-def dead early-clobber $r7d, 12, implicit-def dead early-clobber $r8d, 12, implicit-def dead early-clobber $r9d, 12, implicit-def dead early-clobber $r10d, 12, implicit-def dead early-clobber $r11d, 12, implicit-def dead early-clobber $r12d, 12, implicit-def dead early-clobber $r13d, 12, implicit-def dead early-clobber $r14d, 12, implicit-def dead early-clobber $f0d, 12, implicit-def dead early-clobber $f1d, 12, implicit-def dead early-clobber $f2d, 12, implicit-def dead early-clobber $f3d, 12, implicit-def dead early-clobber $f4d, 12, implicit-def dead early-clobber $f5d, 12, implicit-def dead early-clobber $f6d, 12, implicit-def dead early-clobber $f7d, 12, implicit-def dead early-clobber $f8d, 12, implicit-def dead early-clobber $f9d, 12, implicit-def dead early-clobber $f10d, 12, implicit-def dead early-clobber $f11d, 12, implicit-def dead early-clobber $f12d, 12, implicit-def dead early-clobber $f13d, 12, implicit-def dead early-clobber $f14d, 12, implicit-def dead early-clobber $f14d, 12, implicit-def dead early-clobber $f15d, 12, implicit-def dead early-clobber $f16d, 12, implicit-def dead early-clobber $f17d, 12, implicit-def dead early-clobber $f18d, 12, implicit-def dead early-clobber $f19d, 12, implicit-def dead early-clobber $f20d, 12, implicit-def dead early-clobber $f21d, 12, implicit-def dead early-clobber $f22d, 12, implicit-def dead early-clobber $f23d, 12, implicit-def dead early-clobber $f24d, 12, implicit-def dead early-clobber $f25d, 12, implicit-def dead early-clobber $f26d, 12, implicit-def dead early-clobber $f27d, 12, implicit-def dead early-clobber $f28d, 12, implicit-def dead early-clobber $f29d, 12, implicit-def dead early-clobber $f30d, 12, implicit-def dead early-clobber $f31d + VST16 %2, %1, 0, $noreg + Return +... 
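The two MIR tests above pin the spill and reload sequences down at the machine level. As a rough source-level illustration (not part of the patch; the function name and the exact clobber list below are invented, and whether this particular C function really forces a spill depends on register allocation), keeping a _Float16 value live across an asm statement that clobbers the FPRs should hit the same 2-byte spill slot: sth/lh together with lgdr/ldgr on zEC12, or vsteh/vlreph with the vector facility. On a vector-enabled CPU the allocator may instead keep the value in an unclobbered vector register, so treat this purely as a sketch of the intent, not a reproduction of the tests.

/* Sketch only, not part of the patch: keep a _Float16 value live across an
 * asm statement that clobbers all FPRs.  Without vector support the value
 * can only survive in memory, so a 2-byte folded spill/reload is expected
 * around the asm statement.
 * Build e.g. with: clang --target=s390x-linux-gnu -O2 -S spill16.c */
_Float16 spill_across_asm(_Float16 x) {
  asm volatile("" : : : "f0", "f1", "f2", "f3", "f4", "f5", "f6", "f7",
                        "f8", "f9", "f10", "f11", "f12", "f13", "f14", "f15");
  return x;
}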
diff --git a/llvm/test/CodeGen/SystemZ/twoaddr-kill.mir b/llvm/test/CodeGen/SystemZ/twoaddr-kill.mir
index 7fc7bd3e347bb5..95ba0b4bf34663 100644
--- a/llvm/test/CodeGen/SystemZ/twoaddr-kill.mir
+++ b/llvm/test/CodeGen/SystemZ/twoaddr-kill.mir
@@ -18,19 +18,19 @@ body: |
     ; CHECK-NEXT: $r2l = COPY [[COPY]]
     ; CHECK-NEXT: $r3l = COPY killed [[COPY]]
     ; CHECK-NEXT: [[COPY2:%[0-9]+]]:grh32bit = COPY killed [[COPY1]]
-    ; CHECK-NEXT: INLINEASM &"stepa $1, $2, $3", 0 /* attdialect */, 393226 /* regdef:GRH32Bit */, def [[COPY2]], 2147483657 /* reguse tiedto:$0 */, [[COPY2]](tied-def 3), 9 /* reguse */, killed $r2l, 9 /* reguse */, killed $r3l
+    ; CHECK-NEXT: INLINEASM &"stepa $1, $2, $3", 0 /* attdialect */, 524298 /* regdef:GRH32Bit */, def [[COPY2]], 2147483657 /* reguse tiedto:$0 */, [[COPY2]](tied-def 3), 9 /* reguse */, killed $r2l, 9 /* reguse */, killed $r3l
     ; CHECK-NEXT: [[COPY3:%[0-9]+]]:grh32bit = COPY killed [[COPY2]]
     ; CHECK-NEXT: [[COPY4:%[0-9]+]]:grh32bit = COPY [[COPY3]]
-    ; CHECK-NEXT: INLINEASM &"stepb $1, $2", 0 /* attdialect */, 393227 /* regdef-ec:GRH32Bit */, def early-clobber [[COPY4]], 2147483657 /* reguse tiedto:$0 */, [[COPY4]](tied-def 3), 393225 /* reguse:GRH32Bit */, [[COPY3]]
+    ; CHECK-NEXT: INLINEASM &"stepb $1, $2", 0 /* attdialect */, 524299 /* regdef-ec:GRH32Bit */, def early-clobber [[COPY4]], 2147483657 /* reguse tiedto:$0 */, [[COPY4]](tied-def 3), 524297 /* reguse:GRH32Bit */, [[COPY3]]
     ; CHECK-NEXT: $r2l = COPY killed [[COPY4]]
     ; CHECK-NEXT: Return implicit killed $r2l
     %0:gr32bit = COPY killed $r2l
     %2:grh32bit = COPY %0
     $r2l = COPY %0
     $r3l = COPY killed %0
-    INLINEASM &"stepa $1, $2, $3", 0 /* attdialect */, 393226 /* regdef:GRH32Bit */, def %1:grh32bit, 2147483657 /* reguse tiedto:$0 */, killed %2(tied-def 3), 9 /* reguse */, killed $r2l, 9 /* reguse */, killed $r3l
+    INLINEASM &"stepa $1, $2, $3", 0 /* attdialect */, 524298 /* regdef:GRH32Bit */, def %1:grh32bit, 2147483657 /* reguse tiedto:$0 */, killed %2(tied-def 3), 9 /* reguse */, killed $r2l, 9 /* reguse */, killed $r3l
     %4:grh32bit = COPY killed %1
-    INLINEASM &"stepb $1, $2", 0 /* attdialect */, 393227 /* regdef-ec:GRH32Bit */, def early-clobber %3:grh32bit, 2147483657 /* reguse tiedto:$0 */, killed %4(tied-def 3), 393225 /* reguse:GRH32Bit */, %4
+    INLINEASM &"stepb $1, $2", 0 /* attdialect */, 524299 /* regdef-ec:GRH32Bit */, def early-clobber %3:grh32bit, 2147483657 /* reguse tiedto:$0 */, killed %4(tied-def 3), 524297 /* reguse:GRH32Bit */, %4
     $r2l = COPY killed %3
     Return implicit killed $r2l
 ...
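A note on the twoaddr-kill.mir update: the test's behavior is untouched; the inline-asm flag constants (393226 -> 524298 and so on) appear to shift only because the new 16-bit register classes renumber the register-class IDs that are encoded in the INLINEASM flag words.

To round the picture off, here is a hedged source-level sketch of the argument passing exercised by fun9/fun10 above: _Float16 arguments are treated like float and double, so the first four travel in %f0/%f2/%f4/%f6 and the fifth is written to the caller's outgoing argument area as a 2-byte slot. The function names below are invented for the example.

/* Sketch only: five _Float16 arguments; the first four are passed in FPRs
 * and the fifth on the stack (compare fun9/fun10 above).  The half result
 * comes back in %f0. */
_Float16 callee5(_Float16 a, _Float16 b, _Float16 c, _Float16 d, _Float16 e);

_Float16 caller5(_Float16 x) {
  /* The fifth copy of x is stored to the outgoing argument area with
   * sth (or vsteh when the vector facility is available). */
  return callee5(x, x, x, x, x);
}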