Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SYCLomatic] Support migration of PTX instruction cp.async #2672

Open
wants to merge 4 commits into
base: SYCLomatic
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions clang/lib/DPCT/Diagnostics/Diagnostics.inc
Original file line number Diff line number Diff line change
Expand Up @@ -298,6 +298,9 @@ DEF_WARNING(JOINT_MATRIX_SHAPE, 1135, HIGH_LEVEL, "Please check if joint_matrix
DEF_COMMENT(JOINT_MATRIX_SHAPE, 1135, HIGH_LEVEL, "Please check if joint_matrix implementations support the combination of data type and matrix shape type in the target hardware.")
DEF_WARNING(UNSUPPORTED_EXTMEM_WIN_HANDLE, 1136, HIGH_LEVEL, "SYCL Bindless Images extension only supports importing external resource memory using NT handle on Windows. If assert(%0.get_win32_handle()) fails, you may need to adjust the code to use (%0.get_win32_handle()).")
DEF_COMMENT(UNSUPPORTED_EXTMEM_WIN_HANDLE, 1136, HIGH_LEVEL, "SYCL Bindless Images extension only supports importing external resource memory using NT handle on Windows. If assert({0}.get_win32_handle()) fails, you may need to adjust the code to use ({0}.get_win32_handle()).")
DEF_WARNING(ASYNC_COPY_DEVICE_WARN, 1137, LOW_LEVEL, "Synchronous copy operations are used on the SYCL side for migration of PTX \"cp.async\" and may cause performance issues. You may need to adjust.")
DEF_COMMENT(ASYNC_COPY_DEVICE_WARN, 1137, LOW_LEVEL, "Synchronous copy operations are used on the SYCL side for migration of PTX \"cp.async\" and may cause performance issues. You may need to adjust.")

// clang-format on

#undef DEF_COMMENT
Expand Down
50 changes: 46 additions & 4 deletions clang/lib/DPCT/RulesAsm/AsmMigration.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ using namespace clang::dpct;
namespace {

inline bool SYCLGenError() { return true; }
inline bool SYCLGenSuccess() { return false; }
inline bool SYCLGenSuccess() {return false; }

/// This is used to handle all the AST nodes (except specific instructions, Eg.
/// mov/setp), and generate functionally equivalent SYCL code.
Expand Down Expand Up @@ -589,9 +589,11 @@ bool SYCLGenBase::emitVariableDeclaration(const InlineAsmVarDecl *D) {

bool SYCLGenBase::emitAddressExpr(const InlineAsmAddressExpr *Dst) {
// Address expression only support ld/st/red & atom instructions.
if (!CurrInst || !CurrInst->is(asmtok::op_st, asmtok::op_ld, asmtok::op_atom,
asmtok::op_prefetch, asmtok::op_red))
if (!CurrInst ||
!CurrInst->is(asmtok::op_st, asmtok::op_ld, asmtok::op_atom,
asmtok::op_prefetch, asmtok::op_red, asmtok::op_cp)) {
return SYCLGenError();
}
std::string Type;
if (tryEmitType(Type, CurrInst->getType(0)))
return SYCLGenError();
Expand All @@ -618,6 +620,7 @@ bool SYCLGenBase::emitAddressExpr(const InlineAsmAddressExpr *Dst) {
std::string Reg;
if (tryEmitStmt(Reg, Dst->getSymbol()))
return SYCLGenSuccess();

if (CurrInst->is(asmtok::op_prefetch, asmtok::op_red) ||
CanSuppressCast(Dst->getSymbol()))
OS() << llvm::formatv("{0}", Reg);
Expand Down Expand Up @@ -2769,6 +2772,46 @@ class SYCLGen : public SYCLGenBase {
endstmt();
return SYCLGenSuccess();
}

bool handle_cp(const InlineAsmInstruction *Inst) override {
if (Inst->getNumInputOperands() != 3 || Inst->getNumTypes() != 1)
return SYCLGenError();

llvm::SaveAndRestore<const InlineAsmInstruction *> Store(CurrInst);
CurrInst = Inst;

std::string Op[3];
for (int i = 0; i < 3; ++i)
if (tryEmitStmt(Op[i], Inst->getInputOperand(i)))
return SYCLGenError();

auto CommonIfStat = [&](std::string Val) {
indent();
return "if (" + Op[1] + " > " + Val + ")\n";
};

auto CommonBody = [&](std::string Val) {
incIndent();
indent();
decIndent();
return "*(" + Op[2] + " + " + Val + ") = *(" + Op[0] + " + " + Val + ")";
};

OS() << "*(" << Op[2] << ") = *(" << Op[0] << ");\n";

OS() << CommonIfStat("4");
OS() << CommonBody("1") << ";\n";

OS() << CommonIfStat("8");
OS() << CommonBody("2") << ";\n";

OS() << CommonIfStat("12");
OS() << CommonBody("3");
endstmt();

report(Diagnostics::ASYNC_COPY_DEVICE_WARN, true);
return SYCLGenSuccess();
}
};

/// Clean the special character in identifier.
Expand Down Expand Up @@ -2985,7 +3028,6 @@ void AsmRule::doMigrateInternel(const GCCAsmStmt *GAS) {
Parser.addInlineAsmOperands(GAS->getInputExpr(I),
getReplaceString(GAS->getInputExpr(I)),
GAS->getInputConstraint(I));

do {
auto Inst = Parser.ParseStatement();
if (Inst.isInvalid()) {
Expand Down
14 changes: 10 additions & 4 deletions clang/lib/DPCT/RulesAsm/Parser/AsmNodes.h
Original file line number Diff line number Diff line change
Expand Up @@ -318,7 +318,7 @@ class InlineAsmInstruction : public InlineAsmStmt {
/// e.g. asmtok::op_mov, asmtok::op_setp, etc.
InlineAsmIdentifierInfo *Opcode = nullptr;

std::optional<AsmStateSpace> StateSpace;
SmallVector<AsmStateSpace, 4> StateSpaces;

/// This represents arrtibutes like: comparsion operator, rounding modifiers,
/// ... e.g. instruction setp.eq.s32 has a comparsion operator 'eq'.
Expand All @@ -342,12 +342,14 @@ class InlineAsmInstruction : public InlineAsmStmt {

public:
InlineAsmInstruction(InlineAsmIdentifierInfo *Op,
std::optional<AsmStateSpace> SS,
SmallVector<AsmStateSpace, 4> AsmStateSpaces,
ArrayRef<InstAttr> Attrs,
ArrayRef<InlineAsmType *> Types, InlineAsmExpr *Out,
InlineAsmExpr *Pred, ArrayRef<InlineAsmExpr *> InOps)
: InlineAsmStmt(InstructionClass), Opcode(Op), StateSpace(SS),
Types(Types), OutputOp(Out), PredOutputOp(Pred), InputOps(InOps) {
: InlineAsmStmt(InstructionClass), Opcode(Op), Types(Types),
OutputOp(Out), PredOutputOp(Pred), InputOps(InOps) {
StateSpaces.insert(StateSpaces.begin(), AsmStateSpaces.begin(),
AsmStateSpaces.end());
Attributes.insert(Attrs.begin(), Attrs.end());
}

Expand Down Expand Up @@ -390,6 +392,10 @@ class InlineAsmInstruction : public InlineAsmStmt {
return InstructionClass <= S->getStmtClass();
}
AsmStateSpace getStateSpace() const {

std::optional<AsmStateSpace> StateSpace =
StateSpaces.size() > 0 ? StateSpaces[StateSpaces.size() - 1]
: AsmStateSpace::none;
return StateSpace.value_or(AsmStateSpace::none);
}
};
Expand Down
16 changes: 9 additions & 7 deletions clang/lib/DPCT/RulesAsm/Parser/AsmParser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -333,7 +333,7 @@ InlineAsmStmtResult InlineAsmParser::ParseInstruction() {
SmallVector<InstAttr, 4> Attrs;
SmallVector<InlineAsmType *, 4> Types;
SmallVector<InlineAsmExpr *, 4> Ops;
std::optional<AsmStateSpace> StateSpace;
SmallVector<AsmStateSpace, 4> StateSpaces;
while (Tok.startOfDot()) {
switch (Tok.getIdentifier()->getFlags()) {
case InlineAsmIdentifierInfo::BuiltinType:
Expand All @@ -343,11 +343,7 @@ InlineAsmStmtResult InlineAsmParser::ParseInstruction() {
Attrs.push_back(ConvertToInstAttr(Tok.getKind()));
break;
case InlineAsmIdentifierInfo::StateSpace:
// Duplicated state space in an single instruction statement.
if (StateSpace.has_value())
return AsmStmtError();
else
StateSpace = ConvertToStateSpace(Tok.getKind());
StateSpaces.push_back(ConvertToStateSpace(Tok.getKind()));
break;
default:
return AsmStmtError();
Expand Down Expand Up @@ -383,7 +379,13 @@ InlineAsmStmtResult InlineAsmParser::ParseInstruction() {
Types.push_back(Context.getBuiltinType(InlineAsmBuiltinType::byte));
}

return ::new (Context) InlineAsmInstruction(Opcode, StateSpace, Attrs, Types,
if (Opcode->getTokenID() == asmtok::op_cp) {
Ops.push_back(Out.get());
Out = nullptr;
Types.push_back(Context.getBuiltinType(InlineAsmBuiltinType::u32));
}

return ::new (Context) InlineAsmInstruction(Opcode, StateSpaces, Attrs, Types,
Out.get(), Pred.get(), Ops);
}

Expand Down
2 changes: 2 additions & 0 deletions clang/lib/DPCT/RulesAsm/Parser/AsmTokenKinds.def
Original file line number Diff line number Diff line change
Expand Up @@ -398,6 +398,8 @@ MODIFIER(clamp, ".clamp")
MODIFIER(wrap, ".wrap")
MODIFIER(wide, ".wide")
MODIFIER(sync, ".sync")
MODIFIER(async, ".async")
MODIFIER(cg, ".cg")
MODIFIER(warp, ".warp")
MODIFIER(up, ".up")
MODIFIER(down, ".down")
Expand Down
2 changes: 1 addition & 1 deletion clang/lib/DPCT/SrcAPI/APINames_ASM.inc
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ ENTRY("clz", "clz", true, NO_FLAG, P1, "Successful")
ENTRY("cnot", "cnot", true, NO_FLAG, P1, "Successful")
ENTRY("copysign", "copysign", true, NO_FLAG, P1, "Successful")
ENTRY("cos", "cos", true, NO_FLAG, P1, "Successful")
ENTRY("cp", "cp", false, NO_FLAG, P1, "Comment")
ENTRY("cp", "cp", true, NO_FLAG, P1, "Partial")
ENTRY("createpolicy", "createpolicy", false, NO_FLAG, P1, "Comment")
ENTRY("cvt", "cvt", true, NO_FLAG, P1, "Partial")
ENTRY("cvta", "cvta", false, NO_FLAG, P1, "Comment")
Expand Down
70 changes: 70 additions & 0 deletions clang/test/dpct/asm/cp.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
// UNSUPPORTED: cuda-8.0, cuda-9.0, cuda-9.1, cuda-9.2, cuda-10.0, cuda-10.1, cuda-10.2
// UNSUPPORTED: v8.0, v9.0, v9.1, v9.2, v10.0, v10.1, v10.2
// RUN: dpct --format-range=none -out-root %T/cp %s --cuda-include-path="%cuda-path/include" -- -std=c++14 -x cuda --cuda-host-only
// RUN: FileCheck %s --match-full-lines --input-file %T/cp/cp.dp.cpp
// RUN: %if build_lit %{icpx -c -fsycl %T/cp/cp.dp.cpp -o %T/cp/cp.dp.o %}

// clang-format off
#include <cstdint>
#include <cstdint>
#include <cuda_runtime.h>

// CHECK: inline void cp_async4(void *smem_ptr, const void *glob_ptr) {
// CHECK-NEXT: const int BYTES = 16;
// CHECK-NEXT: auto smem = smem_ptr;
// CHECK-NEXT: /*
// CHECK-NEXT: DPCT1137:{{[0-9]+}}: Synchronous copy operations are used on the SYCL side for migration of PTX "cp.async" and may cause performance issues. You may need to adjust.
// CHECK-NEXT: */
// CHECK-NEXT: {
// CHECK-NEXT: *(((uint32_t *)(uintptr_t)smem)) = *(((uint32_t *)(uintptr_t)glob_ptr));
// CHECK-NEXT: if (BYTES > 4)
// CHECK-NEXT: *(((uint32_t *)(uintptr_t)smem) + 1) = *(((uint32_t *)(uintptr_t)glob_ptr) + 1);
// CHECK-NEXT: if (BYTES > 8)
// CHECK-NEXT: *(((uint32_t *)(uintptr_t)smem) + 2) = *(((uint32_t *)(uintptr_t)glob_ptr) + 2);
// CHECK-NEXT: if (BYTES > 12)
// CHECK-NEXT: *(((uint32_t *)(uintptr_t)smem) + 3) = *(((uint32_t *)(uintptr_t)glob_ptr) + 3);
// CHECK-NEXT: }
// CHECK-NEXT:}
__device__ inline void cp_async4(void *smem_ptr, const void *glob_ptr) {
const int BYTES = 16;
uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
asm volatile("{\n"
" cp.async.cg.shared.global [%0], [%1], %2;\n"
"}\n" :: "r"(smem), "l"(glob_ptr), "n"(BYTES));
}


// CHECK: inline void cp_async4_pred(void *smem_ptr, const void *glob_ptr,
// CHECK-NEXT: bool pred = true) {
// CHECK-NEXT: const int BYTES = 16;
// CHECK-NEXT: auto smem = smem_ptr;
// CHECK-NEXT: /*
// CHECK-NEXT: DPCT1137:{{[0-9]+}}: Synchronous copy operations are used on the SYCL side for migration of PTX "cp.async" and may cause performance issues. You may need to adjust.
// CHECK-NEXT: */
// CHECK-NEXT: {
// CHECK-NEXT: bool p;
// CHECK-NEXT: p = (int)pred != 0;
// CHECK-NEXT: if (p) {
// CHECK-NEXT: *(((uint32_t *)(uintptr_t)smem)) = *(((uint32_t *)(uintptr_t)glob_ptr));
// CHECK-NEXT: if (BYTES > 4)
// CHECK-NEXT: *(((uint32_t *)(uintptr_t)smem) + 1) = *(((uint32_t *)(uintptr_t)glob_ptr) + 1);
// CHECK-NEXT: if (BYTES > 8)
// CHECK-NEXT: *(((uint32_t *)(uintptr_t)smem) + 2) = *(((uint32_t *)(uintptr_t)glob_ptr) + 2);
// CHECK-NEXT: if (BYTES > 12)
// CHECK-NEXT: *(((uint32_t *)(uintptr_t)smem) + 3) = *(((uint32_t *)(uintptr_t)glob_ptr) + 3);
// CHECK-NEXT: }
// CHECK-NEXT: }
// CHECK-NEXT:}
__device__ inline void cp_async4_pred(void *smem_ptr, const void *glob_ptr,
bool pred = true) {
const int BYTES = 16;
uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
asm volatile("{\n"
" .reg .pred p;\n"
" setp.ne.b32 p, %0, 0;\n"
" @p cp.async.cg.shared.global [%1], [%2], %3;\n"
"}\n" ::"r"((int)pred),
"r"(smem), "l"(glob_ptr), "n"(BYTES));
}

// clang-format on
Loading