diff --git a/clang/lib/DPCT/Diagnostics/Diagnostics.inc b/clang/lib/DPCT/Diagnostics/Diagnostics.inc index bc7766dd9f58..ffede05fdcc8 100644 --- a/clang/lib/DPCT/Diagnostics/Diagnostics.inc +++ b/clang/lib/DPCT/Diagnostics/Diagnostics.inc @@ -298,6 +298,9 @@ DEF_WARNING(JOINT_MATRIX_SHAPE, 1135, HIGH_LEVEL, "Please check if joint_matrix DEF_COMMENT(JOINT_MATRIX_SHAPE, 1135, HIGH_LEVEL, "Please check if joint_matrix implementations support the combination of data type and matrix shape type in the target hardware.") DEF_WARNING(UNSUPPORTED_EXTMEM_WIN_HANDLE, 1136, HIGH_LEVEL, "SYCL Bindless Images extension only supports importing external resource memory using NT handle on Windows. If assert(%0.get_win32_handle()) fails, you may need to adjust the code to use (%0.get_win32_handle()).") DEF_COMMENT(UNSUPPORTED_EXTMEM_WIN_HANDLE, 1136, HIGH_LEVEL, "SYCL Bindless Images extension only supports importing external resource memory using NT handle on Windows. If assert({0}.get_win32_handle()) fails, you may need to adjust the code to use ({0}.get_win32_handle()).") +DEF_WARNING(ASYNC_COPY_DEVICE_WARN, 1137, LOW_LEVEL, "Synchronous copy operations are used on the SYCL side for migration of PTX \"cp.async\" and may cause performance issues. You may need to adjust.") +DEF_COMMENT(ASYNC_COPY_DEVICE_WARN, 1137, LOW_LEVEL, "Synchronous copy operations are used on the SYCL side for migration of PTX \"cp.async\" and may cause performance issues. You may need to adjust.") + // clang-format on #undef DEF_COMMENT diff --git a/clang/lib/DPCT/RulesAsm/AsmMigration.cpp b/clang/lib/DPCT/RulesAsm/AsmMigration.cpp index c3d4bbc57b8a..3d2a2176d2c6 100644 --- a/clang/lib/DPCT/RulesAsm/AsmMigration.cpp +++ b/clang/lib/DPCT/RulesAsm/AsmMigration.cpp @@ -37,7 +37,7 @@ using namespace clang::dpct; namespace { inline bool SYCLGenError() { return true; } -inline bool SYCLGenSuccess() { return false; } +inline bool SYCLGenSuccess() {return false; } /// This is used to handle all the AST nodes (except specific instructions, Eg. /// mov/setp), and generate functionally equivalent SYCL code. @@ -589,9 +589,11 @@ bool SYCLGenBase::emitVariableDeclaration(const InlineAsmVarDecl *D) { bool SYCLGenBase::emitAddressExpr(const InlineAsmAddressExpr *Dst) { // Address expression only support ld/st/red & atom instructions. - if (!CurrInst || !CurrInst->is(asmtok::op_st, asmtok::op_ld, asmtok::op_atom, - asmtok::op_prefetch, asmtok::op_red)) + if (!CurrInst || + !CurrInst->is(asmtok::op_st, asmtok::op_ld, asmtok::op_atom, + asmtok::op_prefetch, asmtok::op_red, asmtok::op_cp)) { return SYCLGenError(); + } std::string Type; if (tryEmitType(Type, CurrInst->getType(0))) return SYCLGenError(); @@ -618,6 +620,7 @@ bool SYCLGenBase::emitAddressExpr(const InlineAsmAddressExpr *Dst) { std::string Reg; if (tryEmitStmt(Reg, Dst->getSymbol())) return SYCLGenSuccess(); + if (CurrInst->is(asmtok::op_prefetch, asmtok::op_red) || CanSuppressCast(Dst->getSymbol())) OS() << llvm::formatv("{0}", Reg); @@ -2769,6 +2772,46 @@ class SYCLGen : public SYCLGenBase { endstmt(); return SYCLGenSuccess(); } + + bool handle_cp(const InlineAsmInstruction *Inst) override { + if (Inst->getNumInputOperands() != 3 || Inst->getNumTypes() != 1) + return SYCLGenError(); + + llvm::SaveAndRestore Store(CurrInst); + CurrInst = Inst; + + std::string Op[3]; + for (int i = 0; i < 3; ++i) + if (tryEmitStmt(Op[i], Inst->getInputOperand(i))) + return SYCLGenError(); + + auto CommonIfStat = [&](std::string Val) { + indent(); + return "if (" + Op[1] + " > " + Val + ")\n"; + }; + + auto CommonBody = [&](std::string Val) { + incIndent(); + indent(); + decIndent(); + return "*(" + Op[2] + " + " + Val + ") = *(" + Op[0] + " + " + Val + ")"; + }; + + OS() << "*(" << Op[2] << ") = *(" << Op[0] << ");\n"; + + OS() << CommonIfStat("4"); + OS() << CommonBody("1") << ";\n"; + + OS() << CommonIfStat("8"); + OS() << CommonBody("2") << ";\n"; + + OS() << CommonIfStat("12"); + OS() << CommonBody("3"); + endstmt(); + + report(Diagnostics::ASYNC_COPY_DEVICE_WARN, true); + return SYCLGenSuccess(); + } }; /// Clean the special character in identifier. @@ -2985,7 +3028,6 @@ void AsmRule::doMigrateInternel(const GCCAsmStmt *GAS) { Parser.addInlineAsmOperands(GAS->getInputExpr(I), getReplaceString(GAS->getInputExpr(I)), GAS->getInputConstraint(I)); - do { auto Inst = Parser.ParseStatement(); if (Inst.isInvalid()) { diff --git a/clang/lib/DPCT/RulesAsm/Parser/AsmNodes.h b/clang/lib/DPCT/RulesAsm/Parser/AsmNodes.h index bbd5510030d6..1922185e50df 100644 --- a/clang/lib/DPCT/RulesAsm/Parser/AsmNodes.h +++ b/clang/lib/DPCT/RulesAsm/Parser/AsmNodes.h @@ -318,7 +318,7 @@ class InlineAsmInstruction : public InlineAsmStmt { /// e.g. asmtok::op_mov, asmtok::op_setp, etc. InlineAsmIdentifierInfo *Opcode = nullptr; - std::optional StateSpace; + SmallVector StateSpaces; /// This represents arrtibutes like: comparsion operator, rounding modifiers, /// ... e.g. instruction setp.eq.s32 has a comparsion operator 'eq'. @@ -342,12 +342,14 @@ class InlineAsmInstruction : public InlineAsmStmt { public: InlineAsmInstruction(InlineAsmIdentifierInfo *Op, - std::optional SS, + SmallVector AsmStateSpaces, ArrayRef Attrs, ArrayRef Types, InlineAsmExpr *Out, InlineAsmExpr *Pred, ArrayRef InOps) - : InlineAsmStmt(InstructionClass), Opcode(Op), StateSpace(SS), - Types(Types), OutputOp(Out), PredOutputOp(Pred), InputOps(InOps) { + : InlineAsmStmt(InstructionClass), Opcode(Op), Types(Types), + OutputOp(Out), PredOutputOp(Pred), InputOps(InOps) { + StateSpaces.insert(StateSpaces.begin(), AsmStateSpaces.begin(), + AsmStateSpaces.end()); Attributes.insert(Attrs.begin(), Attrs.end()); } @@ -390,6 +392,10 @@ class InlineAsmInstruction : public InlineAsmStmt { return InstructionClass <= S->getStmtClass(); } AsmStateSpace getStateSpace() const { + + std::optional StateSpace = + StateSpaces.size() > 0 ? StateSpaces[StateSpaces.size() - 1] + : AsmStateSpace::none; return StateSpace.value_or(AsmStateSpace::none); } }; diff --git a/clang/lib/DPCT/RulesAsm/Parser/AsmParser.cpp b/clang/lib/DPCT/RulesAsm/Parser/AsmParser.cpp index f683c0004210..0ca2c149bb65 100644 --- a/clang/lib/DPCT/RulesAsm/Parser/AsmParser.cpp +++ b/clang/lib/DPCT/RulesAsm/Parser/AsmParser.cpp @@ -333,7 +333,7 @@ InlineAsmStmtResult InlineAsmParser::ParseInstruction() { SmallVector Attrs; SmallVector Types; SmallVector Ops; - std::optional StateSpace; + SmallVector StateSpaces; while (Tok.startOfDot()) { switch (Tok.getIdentifier()->getFlags()) { case InlineAsmIdentifierInfo::BuiltinType: @@ -343,11 +343,7 @@ InlineAsmStmtResult InlineAsmParser::ParseInstruction() { Attrs.push_back(ConvertToInstAttr(Tok.getKind())); break; case InlineAsmIdentifierInfo::StateSpace: - // Duplicated state space in an single instruction statement. - if (StateSpace.has_value()) - return AsmStmtError(); - else - StateSpace = ConvertToStateSpace(Tok.getKind()); + StateSpaces.push_back(ConvertToStateSpace(Tok.getKind())); break; default: return AsmStmtError(); @@ -383,7 +379,13 @@ InlineAsmStmtResult InlineAsmParser::ParseInstruction() { Types.push_back(Context.getBuiltinType(InlineAsmBuiltinType::byte)); } - return ::new (Context) InlineAsmInstruction(Opcode, StateSpace, Attrs, Types, + if (Opcode->getTokenID() == asmtok::op_cp) { + Ops.push_back(Out.get()); + Out = nullptr; + Types.push_back(Context.getBuiltinType(InlineAsmBuiltinType::u32)); + } + + return ::new (Context) InlineAsmInstruction(Opcode, StateSpaces, Attrs, Types, Out.get(), Pred.get(), Ops); } diff --git a/clang/lib/DPCT/RulesAsm/Parser/AsmTokenKinds.def b/clang/lib/DPCT/RulesAsm/Parser/AsmTokenKinds.def index 3efa709cb287..64b4604cc5ae 100644 --- a/clang/lib/DPCT/RulesAsm/Parser/AsmTokenKinds.def +++ b/clang/lib/DPCT/RulesAsm/Parser/AsmTokenKinds.def @@ -398,6 +398,8 @@ MODIFIER(clamp, ".clamp") MODIFIER(wrap, ".wrap") MODIFIER(wide, ".wide") MODIFIER(sync, ".sync") +MODIFIER(async, ".async") +MODIFIER(cg, ".cg") MODIFIER(warp, ".warp") MODIFIER(up, ".up") MODIFIER(down, ".down") diff --git a/clang/lib/DPCT/SrcAPI/APINames_ASM.inc b/clang/lib/DPCT/SrcAPI/APINames_ASM.inc index d708bad5f8e5..0c27c5ecb508 100644 --- a/clang/lib/DPCT/SrcAPI/APINames_ASM.inc +++ b/clang/lib/DPCT/SrcAPI/APINames_ASM.inc @@ -56,7 +56,7 @@ ENTRY("clz", "clz", true, NO_FLAG, P1, "Successful") ENTRY("cnot", "cnot", true, NO_FLAG, P1, "Successful") ENTRY("copysign", "copysign", true, NO_FLAG, P1, "Successful") ENTRY("cos", "cos", true, NO_FLAG, P1, "Successful") -ENTRY("cp", "cp", false, NO_FLAG, P1, "Comment") +ENTRY("cp", "cp", true, NO_FLAG, P1, "Partial") ENTRY("createpolicy", "createpolicy", false, NO_FLAG, P1, "Comment") ENTRY("cvt", "cvt", true, NO_FLAG, P1, "Partial") ENTRY("cvta", "cvta", false, NO_FLAG, P1, "Comment") diff --git a/clang/test/dpct/asm/cp.cu b/clang/test/dpct/asm/cp.cu new file mode 100644 index 000000000000..3606a18172ad --- /dev/null +++ b/clang/test/dpct/asm/cp.cu @@ -0,0 +1,70 @@ +// UNSUPPORTED: cuda-8.0, cuda-9.0, cuda-9.1, cuda-9.2, cuda-10.0, cuda-10.1, cuda-10.2 +// UNSUPPORTED: v8.0, v9.0, v9.1, v9.2, v10.0, v10.1, v10.2 +// RUN: dpct --format-range=none -out-root %T/cp %s --cuda-include-path="%cuda-path/include" -- -std=c++14 -x cuda --cuda-host-only +// RUN: FileCheck %s --match-full-lines --input-file %T/cp/cp.dp.cpp +// RUN: %if build_lit %{icpx -c -fsycl %T/cp/cp.dp.cpp -o %T/cp/cp.dp.o %} + +// clang-format off +#include +#include +#include + +// CHECK: inline void cp_async4(void *smem_ptr, const void *glob_ptr) { +// CHECK-NEXT: const int BYTES = 16; +// CHECK-NEXT: auto smem = smem_ptr; +// CHECK-NEXT: /* +// CHECK-NEXT: DPCT1137:{{[0-9]+}}: Synchronous copy operations are used on the SYCL side for migration of PTX "cp.async" and may cause performance issues. You may need to adjust. +// CHECK-NEXT: */ +// CHECK-NEXT: { +// CHECK-NEXT: *(((uint32_t *)(uintptr_t)smem)) = *(((uint32_t *)(uintptr_t)glob_ptr)); +// CHECK-NEXT: if (BYTES > 4) +// CHECK-NEXT: *(((uint32_t *)(uintptr_t)smem) + 1) = *(((uint32_t *)(uintptr_t)glob_ptr) + 1); +// CHECK-NEXT: if (BYTES > 8) +// CHECK-NEXT: *(((uint32_t *)(uintptr_t)smem) + 2) = *(((uint32_t *)(uintptr_t)glob_ptr) + 2); +// CHECK-NEXT: if (BYTES > 12) +// CHECK-NEXT: *(((uint32_t *)(uintptr_t)smem) + 3) = *(((uint32_t *)(uintptr_t)glob_ptr) + 3); +// CHECK-NEXT: } +// CHECK-NEXT:} +__device__ inline void cp_async4(void *smem_ptr, const void *glob_ptr) { + const int BYTES = 16; + uint32_t smem = static_cast(__cvta_generic_to_shared(smem_ptr)); + asm volatile("{\n" + " cp.async.cg.shared.global [%0], [%1], %2;\n" + "}\n" :: "r"(smem), "l"(glob_ptr), "n"(BYTES)); +} + + +// CHECK: inline void cp_async4_pred(void *smem_ptr, const void *glob_ptr, +// CHECK-NEXT: bool pred = true) { +// CHECK-NEXT: const int BYTES = 16; +// CHECK-NEXT: auto smem = smem_ptr; +// CHECK-NEXT: /* +// CHECK-NEXT: DPCT1137:{{[0-9]+}}: Synchronous copy operations are used on the SYCL side for migration of PTX "cp.async" and may cause performance issues. You may need to adjust. +// CHECK-NEXT: */ +// CHECK-NEXT: { +// CHECK-NEXT: bool p; +// CHECK-NEXT: p = (int)pred != 0; +// CHECK-NEXT: if (p) { +// CHECK-NEXT: *(((uint32_t *)(uintptr_t)smem)) = *(((uint32_t *)(uintptr_t)glob_ptr)); +// CHECK-NEXT: if (BYTES > 4) +// CHECK-NEXT: *(((uint32_t *)(uintptr_t)smem) + 1) = *(((uint32_t *)(uintptr_t)glob_ptr) + 1); +// CHECK-NEXT: if (BYTES > 8) +// CHECK-NEXT: *(((uint32_t *)(uintptr_t)smem) + 2) = *(((uint32_t *)(uintptr_t)glob_ptr) + 2); +// CHECK-NEXT: if (BYTES > 12) +// CHECK-NEXT: *(((uint32_t *)(uintptr_t)smem) + 3) = *(((uint32_t *)(uintptr_t)glob_ptr) + 3); +// CHECK-NEXT: } +// CHECK-NEXT: } +// CHECK-NEXT:} +__device__ inline void cp_async4_pred(void *smem_ptr, const void *glob_ptr, + bool pred = true) { + const int BYTES = 16; + uint32_t smem = static_cast(__cvta_generic_to_shared(smem_ptr)); + asm volatile("{\n" + " .reg .pred p;\n" + " setp.ne.b32 p, %0, 0;\n" + " @p cp.async.cg.shared.global [%1], [%2], %3;\n" + "}\n" ::"r"((int)pred), + "r"(smem), "l"(glob_ptr), "n"(BYTES)); +} + +// clang-format on