Merge pull request llvm#79 from AMD-Lightning-Internal/upstream_merge_202501132308

merge main into amd-staging
ronlieb authored Jan 14, 2025
2 parents 5b362a0 + d5512f7 commit fa99c8c
Showing 201 changed files with 15,432 additions and 11,516 deletions.
@@ -513,7 +513,9 @@ void NarrowingConversionsCheck::handleFloatingCast(const ASTContext &Context,
       return;
     }
     const BuiltinType *FromType = getBuiltinType(Rhs);
-    if (ToType->getKind() < FromType->getKind())
+    if (!llvm::APFloatBase::isRepresentableBy(
+            Context.getFloatTypeSemantics(FromType->desugar()),
+            Context.getFloatTypeSemantics(ToType->desugar())))
       diagNarrowType(SourceLoc, Lhs, Rhs);
   }
 }
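For context, a standalone sketch (not part of this commit) of the representability test the check now relies on. Comparing BuiltinType kinds assumes the enum order tracks precision, which breaks for types such as _Float16; llvm::APFloatBase::isRepresentableBy() instead asks whether every value of the source float semantics fits in the destination semantics. The sketch assumes only llvm/ADT/APFloat.h; the printed values are illustrative.

// narrowing_probe.cpp - probe which float-to-float conversions preserve values.
#include "llvm/ADT/APFloat.h"
#include <cstdio>

int main() {
  using FB = llvm::APFloatBase;
  // Every 'float' value is exactly representable as 'double': not narrowing.
  bool SingleToDouble =
      FB::isRepresentableBy(FB::IEEEsingle(), FB::IEEEdouble());
  // Not every 'float' fits in IEEE half (_Float16): narrowing, so diagnose.
  bool SingleToHalf =
      FB::isRepresentableBy(FB::IEEEsingle(), FB::IEEEhalf());
  std::printf("float->double ok: %d, float->_Float16 ok: %d\n",
              (int)SingleToDouble, (int)SingleToHalf);
  return 0;
}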
5 changes: 5 additions & 0 deletions clang-tools-extra/docs/ReleaseNotes.rst
@@ -210,6 +210,11 @@ Changes in existing checks
   <clang-tidy/checks/bugprone/forwarding-reference-overload>` check by fixing
   a crash when determining if an ``enable_if[_t]`` was found.
 
+- Improved :doc:`bugprone-narrowing-conversions
+  <clang-tidy/checks/bugprone/narrowing-conversions>` check to avoid incorrect
+  results when the floating-point type is not ``float``, ``double``, or
+  ``long double``.
+
 - Improved :doc:`bugprone-optional-value-conversion
   <clang-tidy/checks/bugprone/optional-value-conversion>` to support detecting
   conversion directly by ``std::make_unique`` and ``std::make_shared``.
@@ -36,6 +36,15 @@ void narrow_double_to_float_not_ok(double d) {
   f = narrow_double_to_float_return();
 }
 
+float narrow_float16_to_float_return(_Float16 f) {
+  return f;
+}
+
+_Float16 narrow_float_to_float16_return(float f) {
+  return f;
+  // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: narrowing conversion from 'float' to '_Float16' [bugprone-narrowing-conversions]
+}
+
 void narrow_fp_constants() {
   float f;
   f = 0.5; // [dcl.init.list] 7.2 : in-range fp constant to narrower float is not a narrowing.
2 changes: 1 addition & 1 deletion clang/lib/Frontend/CompilerInvocation.cpp
@@ -3288,7 +3288,7 @@ static void GenerateHeaderSearchArgs(const HeaderSearchOptions &Opts,
     }();
 
     GenerateArg(Consumer, Opt, It->Path);
-  };
+  }
 
   // Note: some paths that came from "[-iprefix=xx] -iwithprefixbefore=yy" may
   // have already been generated as "-I[xx]yy". If that's the case, their
5 changes: 2 additions & 3 deletions clang/lib/Serialization/ASTReaderDecl.cpp
@@ -451,9 +451,8 @@ class ASTDeclReader : public DeclVisitor<ASTDeclReader, void> {
   void VisitOMPDeclareMapperDecl(OMPDeclareMapperDecl *D);
   void VisitOMPRequiresDecl(OMPRequiresDecl *D);
   void VisitOMPCapturedExprDecl(OMPCapturedExprDecl *D);
-};
-
-} // namespace clang
+};
+} // namespace clang
 
 namespace {
 
28 changes: 16 additions & 12 deletions compiler-rt/lib/scudo/standalone/primary64.h
@@ -1141,18 +1141,18 @@ template <typename Config> class SizeClassAllocator64 {
           BytesInFreeList - Region->ReleaseInfo.BytesInFreeListAtLastCheckpoint;
     }
     const uptr TotalChunks = Region->MemMapInfo.AllocatedUser / BlockSize;
-    Str->append(
-        "%s %02zu (%6zu): mapped: %6zuK popped: %7zu pushed: %7zu "
-        "inuse: %6zu total: %6zu releases: %6zu last "
-        "releases attempted: %6zuK latest pushed bytes: %6zuK region: 0x%zx "
-        "(0x%zx)\n",
-        Region->Exhausted ? "E" : " ", ClassId, getSizeByClassId(ClassId),
-        Region->MemMapInfo.MappedUser >> 10, Region->FreeListInfo.PoppedBlocks,
-        Region->FreeListInfo.PushedBlocks, InUseBlocks, TotalChunks,
-        Region->ReleaseInfo.NumReleasesAttempted,
-        Region->ReleaseInfo.LastReleasedBytes >> 10,
-        RegionPushedBytesDelta >> 10, Region->RegionBeg,
-        getRegionBaseByClassId(ClassId));
+    Str->append("%s %02zu (%6zu): mapped: %6zuK popped: %7zu pushed: %7zu "
+                "inuse: %6zu total: %6zu releases attempted: %6zu last "
+                "released: %6zuK latest pushed bytes: %6zuK region: 0x%zx "
+                "(0x%zx)\n",
+                Region->Exhausted ? "E" : " ", ClassId,
+                getSizeByClassId(ClassId), Region->MemMapInfo.MappedUser >> 10,
+                Region->FreeListInfo.PoppedBlocks,
+                Region->FreeListInfo.PushedBlocks, InUseBlocks, TotalChunks,
+                Region->ReleaseInfo.NumReleasesAttempted,
+                Region->ReleaseInfo.LastReleasedBytes >> 10,
+                RegionPushedBytesDelta >> 10, Region->RegionBeg,
+                getRegionBaseByClassId(ClassId));
   }
 
   void getRegionFragmentationInfo(RegionInfo *Region, uptr ClassId,
@@ -1297,6 +1297,10 @@ template <typename Config> class SizeClassAllocator64 {
       return 0;
     }
 
+    // The following steps contribute to the majority time spent in page
+    // releasing thus we increment the counter here.
+    ++Region->ReleaseInfo.NumReleasesAttempted;
+
     // Note that we have extracted the `GroupsToRelease` from region freelist.
     // It's safe to let pushBlocks()/popBlocks() access the remaining region
     // freelist. In the steps 3 and 4, we will temporarily release the FLLock
65 changes: 44 additions & 21 deletions flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
@@ -366,6 +366,23 @@ struct CUFAllocOpConversion : public mlir::OpRewritePattern<cuf::AllocOp> {
   const fir::LLVMTypeConverter *typeConverter;
 };
 
+static mlir::Value genGetDeviceAddress(mlir::PatternRewriter &rewriter,
+                                       mlir::ModuleOp mod, mlir::Location loc,
+                                       mlir::Value inputArg) {
+  fir::FirOpBuilder builder(rewriter, mod);
+  mlir::func::FuncOp callee =
+      fir::runtime::getRuntimeFunc<mkRTKey(CUFGetDeviceAddress)>(loc, builder);
+  auto fTy = callee.getFunctionType();
+  mlir::Value conv = createConvertOp(rewriter, loc, fTy.getInput(0), inputArg);
+  mlir::Value sourceFile = fir::factory::locationToFilename(builder, loc);
+  mlir::Value sourceLine =
+      fir::factory::locationToLineNo(builder, loc, fTy.getInput(2));
+  llvm::SmallVector<mlir::Value> args{fir::runtime::createArguments(
+      builder, loc, fTy, conv, sourceFile, sourceLine)};
+  auto call = rewriter.create<fir::CallOp>(loc, callee, args);
+  return createConvertOp(rewriter, loc, inputArg.getType(), call->getResult(0));
+}
+
 struct DeclareOpConversion : public mlir::OpRewritePattern<fir::DeclareOp> {
   using OpRewritePattern::OpRewritePattern;
 
@@ -382,26 +399,10 @@ struct DeclareOpConversion : public mlir::OpRewritePattern<fir::DeclareOp> {
       if (cuf::isRegisteredDeviceGlobal(global)) {
         rewriter.setInsertionPointAfter(addrOfOp);
         auto mod = op->getParentOfType<mlir::ModuleOp>();
-        fir::FirOpBuilder builder(rewriter, mod);
-        mlir::Location loc = op.getLoc();
-        mlir::func::FuncOp callee =
-            fir::runtime::getRuntimeFunc<mkRTKey(CUFGetDeviceAddress)>(
-                loc, builder);
-        auto fTy = callee.getFunctionType();
-        mlir::Type toTy = fTy.getInput(0);
-        mlir::Value inputArg =
-            createConvertOp(rewriter, loc, toTy, addrOfOp.getResult());
-        mlir::Value sourceFile =
-            fir::factory::locationToFilename(builder, loc);
-        mlir::Value sourceLine =
-            fir::factory::locationToLineNo(builder, loc, fTy.getInput(2));
-        llvm::SmallVector<mlir::Value> args{fir::runtime::createArguments(
-            builder, loc, fTy, inputArg, sourceFile, sourceLine)};
-        auto call = rewriter.create<fir::CallOp>(loc, callee, args);
-        mlir::Value cast = createConvertOp(
-            rewriter, loc, op.getMemref().getType(), call->getResult(0));
+        mlir::Value devAddr = genGetDeviceAddress(rewriter, mod, op.getLoc(),
+                                                  addrOfOp.getResult());
         rewriter.startOpModification(op);
-        op.getMemrefMutable().assign(cast);
+        op.getMemrefMutable().assign(devAddr);
         rewriter.finalizeOpModification(op);
         return success();
       }
@@ -771,10 +772,32 @@ struct CUFLaunchOpConversion
           loc, clusterDimsAttr.getZ().getInt());
     }
   }
+  llvm::SmallVector<mlir::Value> args;
+  auto mod = op->getParentOfType<mlir::ModuleOp>();
+  for (mlir::Value arg : op.getArgs()) {
+    // If the argument is a global descriptor, make sure we pass the device
+    // copy of this descriptor and not the host one.
+    if (mlir::isa<fir::BaseBoxType>(fir::unwrapRefType(arg.getType()))) {
+      if (auto declareOp =
+              mlir::dyn_cast_or_null<fir::DeclareOp>(arg.getDefiningOp())) {
+        if (auto addrOfOp = mlir::dyn_cast_or_null<fir::AddrOfOp>(
+                declareOp.getMemref().getDefiningOp())) {
+          if (auto global = symTab.lookup<fir::GlobalOp>(
+                  addrOfOp.getSymbol().getRootReference().getValue())) {
+            if (cuf::isRegisteredDeviceGlobal(global)) {
+              arg = genGetDeviceAddress(rewriter, mod, op.getLoc(),
+                                        declareOp.getResult());
+            }
+          }
+        }
+      }
+    }
+    args.push_back(arg);
+  }
+
   auto gpuLaunchOp = rewriter.create<mlir::gpu::LaunchFuncOp>(
       loc, kernelName, mlir::gpu::KernelDim3{gridSizeX, gridSizeY, gridSizeZ},
-      mlir::gpu::KernelDim3{blockSizeX, blockSizeY, blockSizeZ}, zero,
-      op.getArgs());
+      mlir::gpu::KernelDim3{blockSizeX, blockSizeY, blockSizeZ}, zero, args);
   if (clusterDimX && clusterDimY && clusterDimZ) {
     gpuLaunchOp.getClusterSizeXMutable().assign(clusterDimX);
     gpuLaunchOp.getClusterSizeYMutable().assign(clusterDimY);
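For reference, a minimal CUDA Fortran sketch of the case the new argument loop handles, reconstructed from the cuda-launch.fir test added below (it is not itself part of the patch, and declaration details are approximate): a module-level device pointer whose descriptor is a registered device global, passed straight to a kernel. The launch must forward the device copy of the descriptor obtained via CUFGetDeviceAddress, not the host-side global.

! dev_ptr's descriptor lives in a registered device global; launching
! test() with it should pass the device copy of the descriptor.
module devptr
  real, device, pointer, dimension(:) :: dev_ptr
contains
  attributes(global) subroutine test(dp)
    real, device, pointer, dimension(:) :: dp
  end subroutine
end module

program main
  use devptr
  real, device, target :: a_dev(4)
  dev_ptr => a_dev
  call test<<<1, 1>>>(dev_ptr)
end program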
4 changes: 3 additions & 1 deletion flang/lib/Semantics/resolve-names.cpp
@@ -4015,7 +4015,9 @@ bool SubprogramVisitor::Pre(const parser::PrefixSpec::Attributes &attrs) {
       *attrs == common::CUDASubprogramAttrs::Device) {
     const Scope &scope{currScope()};
     const Scope *mod{FindModuleContaining(scope)};
-    if (mod && mod->GetName().value() == "cudadevice") {
+    if (mod &&
+        (mod->GetName().value() == "cudadevice" ||
+         mod->GetName().value() == "__cuda_device")) {
       return false;
     }
     // Implicitly USE the cudadevice module by copying its symbols in the
32 changes: 32 additions & 0 deletions flang/module/__cuda_device.f90
@@ -0,0 +1,32 @@
+!===-- module/__cuda_device.f90 --------------------------------------------===!
+!
+! Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+! See https://llvm.org/LICENSE.txt for license information.
+! SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+!
+!===------------------------------------------------------------------------===!
+
+! This module contains CUDA Fortran interfaces used in cudadevice.f90.
+
+module __cuda_device
+implicit none
+
+! Set PRIVATE by default to explicitly only export what is meant
+! to be exported by this MODULE.
+
+  interface
+    attributes(device) function __fadd_rd(x, y) bind(c, name='__nv_fadd_rd')
+      real, intent(in), value :: x, y
+      real :: __fadd_rd
+    end function
+  end interface
+  public :: __fadd_rd
+
+  interface
+    attributes(device) function __fadd_ru(x, y) bind(c, name='__nv_fadd_ru')
+      real, intent(in), value :: x, y
+      real :: __fadd_ru
+    end function
+  end interface
+  public :: __fadd_ru
+end module
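Because cudadevice re-exports these interfaces (see the cudadevice.f90 hunk below) and device subprograms implicitly USE cudadevice, existing device code compiles unchanged; the split just lets the interfaces be declared outside the module that is implicitly used. A hypothetical usage sketch (subroutine and variable names invented):

! Directed-rounding add remains reachable through cudadevice, which now
! re-exports __fadd_rd from __cuda_device.
attributes(global) subroutine add_rd(a, b, c)
  use cudadevice, only: __fadd_rd
  real, device :: a, b, c
  c = __fadd_rd(a, b)   ! round-downward add, binds to __nv_fadd_rd
end subroutine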
17 changes: 1 addition & 16 deletions flang/module/cudadevice.f90
@@ -9,6 +9,7 @@
 ! CUDA Fortran procedures available in device subprogram
 
 module cudadevice
+use __cuda_device, only: __fadd_rd, __fadd_ru
 implicit none
 
 ! Set PRIVATE by default to explicitly only export what is meant
@@ -71,20 +72,4 @@ attributes(device) subroutine threadfence_system()
   end interface
   public :: threadfence_system
 
-  interface
-    attributes(device) function __fadd_rd(x, y) bind(c, name='__nv_fadd_rd')
-      real, intent(in) :: x, y
-      real :: __fadd_rd
-    end function
-  end interface
-  public :: __fadd_rd
-
-  interface
-    attributes(device) function __fadd_ru(x, y) bind(c, name='__nv_fadd_ru')
-      real, intent(in) :: x, y
-      real :: __fadd_ru
-    end function
-  end interface
-  public :: __fadd_ru
-
 end module
42 changes: 42 additions & 0 deletions flang/test/Fir/CUDA/cuda-launch.fir
@@ -62,3 +62,45 @@ module attributes {gpu.container_module, dlti.dl_spec = #dlti.dl_spec<#dlti.dl_e
 // CHECK-LABEL: func.func @_QMmod1Phost_sub()
 // CHECK: gpu.launch_func @cuda_device_mod::@_QMmod1Psub1 clusters in (%c2{{.*}}, %c2{{.*}}, %c1{{.*}})
 
+// -----
+
+module attributes {gpu.container_module, dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>} {
+  gpu.module @cuda_device_mod {
+    gpu.func @_QMdevptrPtest(%arg0: !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) kernel {
+      gpu.return
+    }
+  }
+  fir.global @_QMdevptrEdev_ptr {data_attr = #cuf.cuda<device>} : !fir.box<!fir.ptr<!fir.array<?xf32>>> {
+    %c0 = arith.constant 0 : index
+    %0 = fir.zero_bits !fir.ptr<!fir.array<?xf32>>
+    %1 = fir.shape %c0 : (index) -> !fir.shape<1>
+    %2 = fir.embox %0(%1) {allocator_idx = 2 : i32} : (!fir.ptr<!fir.array<?xf32>>, !fir.shape<1>) -> !fir.box<!fir.ptr<!fir.array<?xf32>>>
+    fir.has_value %2 : !fir.box<!fir.ptr<!fir.array<?xf32>>>
+  }
+  func.func @_QMdevptrPtest(%arg0: !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>> {cuf.data_attr = #cuf.cuda<device>, fir.bindc_name = "dp"}) attributes {cuf.proc_attr = #cuf.cuda_proc<global>} {
+    return
+  }
+  func.func @_QQmain() {
+    %c1_i32 = arith.constant 1 : i32
+    %c4 = arith.constant 4 : index
+    %0 = cuf.alloc !fir.array<4xf32> {bindc_name = "a_dev", data_attr = #cuf.cuda<device>, uniq_name = "_QFEa_dev"} -> !fir.ref<!fir.array<4xf32>>
+    %1 = fir.shape %c4 : (index) -> !fir.shape<1>
+    %2 = fir.declare %0(%1) {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFEa_dev"} : (!fir.ref<!fir.array<4xf32>>, !fir.shape<1>) -> !fir.ref<!fir.array<4xf32>>
+    %3 = fir.address_of(@_QMdevptrEdev_ptr) : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>
+    %4 = fir.declare %3 {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QMdevptrEdev_ptr"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>
+    %5 = fir.embox %2(%1) : (!fir.ref<!fir.array<4xf32>>, !fir.shape<1>) -> !fir.box<!fir.ptr<!fir.array<?xf32>>>
+    fir.store %5 to %4 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>
+    cuf.sync_descriptor @_QMdevptrEdev_ptr
+    cuf.kernel_launch @_QMdevptrPtest<<<%c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32>>>(%4) : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>)
+    cuf.free %2 : !fir.ref<!fir.array<4xf32>> {data_attr = #cuf.cuda<device>}
+    return
+  }
+}
+
+// CHECK-LABEL: func.func @_QQmain()
+// CHECK: %[[ADDROF:.*]] = fir.address_of(@_QMdevptrEdev_ptr) : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>
+// CHECK: %[[DECL:.*]] = fir.declare %[[ADDROF]] {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QMdevptrEdev_ptr"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>
+// CHECK: %[[CONV_DECL:.*]] = fir.convert %[[DECL]] : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) -> !fir.llvm_ptr<i8>
+// CHECK: %[[DEVADDR:.*]] = fir.call @_FortranACUFGetDeviceAddress(%[[CONV_DECL]], %{{.*}}, %{{.*}}) : (!fir.llvm_ptr<i8>, !fir.ref<i8>, i32) -> !fir.llvm_ptr<i8>
+// CHECK: %[[CONV_DEVADDR:.*]] = fir.convert %[[DEVADDR]] : (!fir.llvm_ptr<i8>) -> !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>
+// CHECK: gpu.launch_func @cuda_device_mod::@_QMdevptrPtest blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) dynamic_shared_memory_size %{{.*}} args(%[[CONV_DEVADDR]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>)
6 changes: 5 additions & 1 deletion flang/tools/f18/CMakeLists.txt
@@ -21,6 +21,7 @@ set(MODULES_WITHOUT_IMPLEMENTATION
     "__ppc_intrinsics"
     "mma"
     "__cuda_builtins"
+    "__cuda_device"
    "cudadevice"
     "ieee_arithmetic"
     "ieee_exceptions"
@@ -67,9 +68,12 @@ if (NOT CMAKE_CROSSCOMPILING)
     elseif(${filename} STREQUAL "__ppc_intrinsics" OR
            ${filename} STREQUAL "mma")
       set(depends ${FLANG_INTRINSIC_MODULES_DIR}/__ppc_types.mod)
-    elseif(${filename} STREQUAL "cudadevice")
+    elseif(${filename} STREQUAL "__cuda_device")
       set(opts -fc1 -xcuda)
       set(depends ${FLANG_INTRINSIC_MODULES_DIR}/__cuda_builtins.mod)
+    elseif(${filename} STREQUAL "cudadevice")
+      set(opts -fc1 -xcuda)
+      set(depends ${FLANG_INTRINSIC_MODULES_DIR}/__cuda_device.mod)
     else()
       set(depends ${FLANG_INTRINSIC_MODULES_DIR}/__fortran_builtins.mod)
       if(${filename} STREQUAL "iso_fortran_env")