Merge pull request llvm#79 from AMD-Lightning-Internal/upstream_merge_202501132308

merge main into amd-staging
ronlieb authored Jan 14, 2025
2 parents 5b362a0 + d5512f7 commit fa99c8c
Showing 201 changed files with 15,432 additions and 11,516 deletions.
@@ -513,7 +513,9 @@ void NarrowingConversionsCheck::handleFloatingCast(const ASTContext &Context,
       return;
     }
     const BuiltinType *FromType = getBuiltinType(Rhs);
-    if (ToType->getKind() < FromType->getKind())
+    if (!llvm::APFloatBase::isRepresentableBy(
+            Context.getFloatTypeSemantics(FromType->desugar()),
+            Context.getFloatTypeSemantics(ToType->desugar())))
       diagNarrowType(SourceLoc, Lhs, Rhs);
   }
 }
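For context, a standalone sketch (not part of this commit) of the representability test the check now relies on. Comparing BuiltinType kinds assumes the enum order tracks precision, which breaks for types such as _Float16; llvm::APFloatBase::isRepresentableBy() instead asks whether every value of the source float semantics fits in the destination semantics. The sketch assumes only llvm/ADT/APFloat.h; the printed values are illustrative.

// narrowing_probe.cpp - probe which float-to-float conversions preserve values.
#include "llvm/ADT/APFloat.h"
#include <cstdio>

int main() {
  using FB = llvm::APFloatBase;
  // Every 'float' value is exactly representable as 'double': not narrowing.
  bool SingleToDouble =
      FB::isRepresentableBy(FB::IEEEsingle(), FB::IEEEdouble());
  // Not every 'float' fits in IEEE half (_Float16): narrowing, so diagnose.
  bool SingleToHalf =
      FB::isRepresentableBy(FB::IEEEsingle(), FB::IEEEhalf());
  std::printf("float->double ok: %d, float->_Float16 ok: %d\n",
              (int)SingleToDouble, (int)SingleToHalf);
  return 0;
}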
5 changes: 5 additions & 0 deletions clang-tools-extra/docs/ReleaseNotes.rst
@@ -210,6 +210,11 @@ Changes in existing checks
   <clang-tidy/checks/bugprone/forwarding-reference-overload>` check by fixing
   a crash when determining if an ``enable_if[_t]`` was found.
 
+- Improved :doc:`bugprone-narrowing-conversions
+  <clang-tidy/checks/bugprone/narrowing-conversions>` check to avoid incorrect
+  results when the floating-point type is not ``float``, ``double``, or
+  ``long double``.
+
 - Improved :doc:`bugprone-optional-value-conversion
   <clang-tidy/checks/bugprone/optional-value-conversion>` to support detecting
   conversion directly by ``std::make_unique`` and ``std::make_shared``.
@@ -36,6 +36,15 @@ void narrow_double_to_float_not_ok(double d) {
   f = narrow_double_to_float_return();
 }
 
+float narrow_float16_to_float_return(_Float16 f) {
+  return f;
+}
+
+_Float16 narrow_float_to_float16_return(float f) {
+  return f;
+  // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: narrowing conversion from 'float' to '_Float16' [bugprone-narrowing-conversions]
+}
+
 void narrow_fp_constants() {
   float f;
   f = 0.5; // [dcl.init.list] 7.2 : in-range fp constant to narrower float is not a narrowing.
2 changes: 1 addition & 1 deletion clang/lib/Frontend/CompilerInvocation.cpp
@@ -3288,7 +3288,7 @@ static void GenerateHeaderSearchArgs(const HeaderSearchOptions &Opts,
     }();
 
     GenerateArg(Consumer, Opt, It->Path);
-  };
+  }
 
   // Note: some paths that came from "[-iprefix=xx] -iwithprefixbefore=yy" may
   // have already been generated as "-I[xx]yy". If that's the case, their
5 changes: 2 additions & 3 deletions clang/lib/Serialization/ASTReaderDecl.cpp
@@ -451,9 +451,8 @@ class ASTDeclReader : public DeclVisitor<ASTDeclReader, void> {
   void VisitOMPDeclareMapperDecl(OMPDeclareMapperDecl *D);
   void VisitOMPRequiresDecl(OMPRequiresDecl *D);
   void VisitOMPCapturedExprDecl(OMPCapturedExprDecl *D);
-};
-
-} // namespace clang
+};
+} // namespace clang
 
 namespace {
 
28 changes: 16 additions & 12 deletions compiler-rt/lib/scudo/standalone/primary64.h
@@ -1141,18 +1141,18 @@ template <typename Config> class SizeClassAllocator64 {
           BytesInFreeList - Region->ReleaseInfo.BytesInFreeListAtLastCheckpoint;
     }
     const uptr TotalChunks = Region->MemMapInfo.AllocatedUser / BlockSize;
-    Str->append(
-        "%s %02zu (%6zu): mapped: %6zuK popped: %7zu pushed: %7zu "
-        "inuse: %6zu total: %6zu releases: %6zu last "
-        "releases attempted: %6zuK latest pushed bytes: %6zuK region: 0x%zx "
-        "(0x%zx)\n",
-        Region->Exhausted ? "E" : " ", ClassId, getSizeByClassId(ClassId),
-        Region->MemMapInfo.MappedUser >> 10, Region->FreeListInfo.PoppedBlocks,
-        Region->FreeListInfo.PushedBlocks, InUseBlocks, TotalChunks,
-        Region->ReleaseInfo.NumReleasesAttempted,
-        Region->ReleaseInfo.LastReleasedBytes >> 10,
-        RegionPushedBytesDelta >> 10, Region->RegionBeg,
-        getRegionBaseByClassId(ClassId));
+    Str->append("%s %02zu (%6zu): mapped: %6zuK popped: %7zu pushed: %7zu "
+                "inuse: %6zu total: %6zu releases attempted: %6zu last "
+                "released: %6zuK latest pushed bytes: %6zuK region: 0x%zx "
+                "(0x%zx)\n",
+                Region->Exhausted ? "E" : " ", ClassId,
+                getSizeByClassId(ClassId), Region->MemMapInfo.MappedUser >> 10,
+                Region->FreeListInfo.PoppedBlocks,
+                Region->FreeListInfo.PushedBlocks, InUseBlocks, TotalChunks,
+                Region->ReleaseInfo.NumReleasesAttempted,
+                Region->ReleaseInfo.LastReleasedBytes >> 10,
+                RegionPushedBytesDelta >> 10, Region->RegionBeg,
+                getRegionBaseByClassId(ClassId));
   }
 
   void getRegionFragmentationInfo(RegionInfo *Region, uptr ClassId,
@@ -1297,6 +1297,10 @@ template <typename Config> class SizeClassAllocator64 {
       return 0;
     }
 
+    // The following steps contribute to the majority time spent in page
+    // releasing thus we increment the counter here.
+    ++Region->ReleaseInfo.NumReleasesAttempted;
+
     // Note that we have extracted the `GroupsToRelease` from region freelist.
     // It's safe to let pushBlocks()/popBlocks() access the remaining region
     // freelist. In the steps 3 and 4, we will temporarily release the FLLock
65 changes: 44 additions & 21 deletions flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
@@ -366,6 +366,23 @@ struct CUFAllocOpConversion : public mlir::OpRewritePattern<cuf::AllocOp> {
   const fir::LLVMTypeConverter *typeConverter;
 };
 
+static mlir::Value genGetDeviceAddress(mlir::PatternRewriter &rewriter,
+                                       mlir::ModuleOp mod, mlir::Location loc,
+                                       mlir::Value inputArg) {
+  fir::FirOpBuilder builder(rewriter, mod);
+  mlir::func::FuncOp callee =
+      fir::runtime::getRuntimeFunc<mkRTKey(CUFGetDeviceAddress)>(loc, builder);
+  auto fTy = callee.getFunctionType();
+  mlir::Value conv = createConvertOp(rewriter, loc, fTy.getInput(0), inputArg);
+  mlir::Value sourceFile = fir::factory::locationToFilename(builder, loc);
+  mlir::Value sourceLine =
+      fir::factory::locationToLineNo(builder, loc, fTy.getInput(2));
+  llvm::SmallVector<mlir::Value> args{fir::runtime::createArguments(
+      builder, loc, fTy, conv, sourceFile, sourceLine)};
+  auto call = rewriter.create<fir::CallOp>(loc, callee, args);
+  return createConvertOp(rewriter, loc, inputArg.getType(), call->getResult(0));
+}
+
 struct DeclareOpConversion : public mlir::OpRewritePattern<fir::DeclareOp> {
   using OpRewritePattern::OpRewritePattern;
 
@@ -382,26 +399,10 @@ struct DeclareOpConversion : public mlir::OpRewritePattern<fir::DeclareOp> {
       if (cuf::isRegisteredDeviceGlobal(global)) {
         rewriter.setInsertionPointAfter(addrOfOp);
         auto mod = op->getParentOfType<mlir::ModuleOp>();
-        fir::FirOpBuilder builder(rewriter, mod);
-        mlir::Location loc = op.getLoc();
-        mlir::func::FuncOp callee =
-            fir::runtime::getRuntimeFunc<mkRTKey(CUFGetDeviceAddress)>(
-                loc, builder);
-        auto fTy = callee.getFunctionType();
-        mlir::Type toTy = fTy.getInput(0);
-        mlir::Value inputArg =
-            createConvertOp(rewriter, loc, toTy, addrOfOp.getResult());
-        mlir::Value sourceFile =
-            fir::factory::locationToFilename(builder, loc);
-        mlir::Value sourceLine =
-            fir::factory::locationToLineNo(builder, loc, fTy.getInput(2));
-        llvm::SmallVector<mlir::Value> args{fir::runtime::createArguments(
-            builder, loc, fTy, inputArg, sourceFile, sourceLine)};
-        auto call = rewriter.create<fir::CallOp>(loc, callee, args);
-        mlir::Value cast = createConvertOp(
-            rewriter, loc, op.getMemref().getType(), call->getResult(0));
+        mlir::Value devAddr = genGetDeviceAddress(rewriter, mod, op.getLoc(),
+                                                  addrOfOp.getResult());
         rewriter.startOpModification(op);
-        op.getMemrefMutable().assign(cast);
+        op.getMemrefMutable().assign(devAddr);
         rewriter.finalizeOpModification(op);
         return success();
       }
@@ -771,10 +772,32 @@ struct CUFLaunchOpConversion
           loc, clusterDimsAttr.getZ().getInt());
     }
   }
+  llvm::SmallVector<mlir::Value> args;
+  auto mod = op->getParentOfType<mlir::ModuleOp>();
+  for (mlir::Value arg : op.getArgs()) {
+    // If the argument is a global descriptor, make sure we pass the device
+    // copy of this descriptor and not the host one.
+    if (mlir::isa<fir::BaseBoxType>(fir::unwrapRefType(arg.getType()))) {
+      if (auto declareOp =
+              mlir::dyn_cast_or_null<fir::DeclareOp>(arg.getDefiningOp())) {
+        if (auto addrOfOp = mlir::dyn_cast_or_null<fir::AddrOfOp>(
+                declareOp.getMemref().getDefiningOp())) {
+          if (auto global = symTab.lookup<fir::GlobalOp>(
+                  addrOfOp.getSymbol().getRootReference().getValue())) {
+            if (cuf::isRegisteredDeviceGlobal(global)) {
+              arg = genGetDeviceAddress(rewriter, mod, op.getLoc(),
+                                        declareOp.getResult());
+            }
+          }
+        }
+      }
+    }
+    args.push_back(arg);
+  }
+
   auto gpuLaunchOp = rewriter.create<mlir::gpu::LaunchFuncOp>(
       loc, kernelName, mlir::gpu::KernelDim3{gridSizeX, gridSizeY, gridSizeZ},
-      mlir::gpu::KernelDim3{blockSizeX, blockSizeY, blockSizeZ}, zero,
-      op.getArgs());
+      mlir::gpu::KernelDim3{blockSizeX, blockSizeY, blockSizeZ}, zero, args);
   if (clusterDimX && clusterDimY && clusterDimZ) {
     gpuLaunchOp.getClusterSizeXMutable().assign(clusterDimX);
     gpuLaunchOp.getClusterSizeYMutable().assign(clusterDimY);
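For reference, a minimal CUDA Fortran sketch of the case the new argument loop handles, reconstructed from the cuda-launch.fir test added below (it is not itself part of the patch, and declaration details are approximate): a module-level device pointer whose descriptor is a registered device global, passed straight to a kernel. The launch must forward the device copy of the descriptor obtained via CUFGetDeviceAddress, not the host-side global.

! dev_ptr's descriptor lives in a registered device global; launching
! test() with it should pass the device copy of the descriptor.
module devptr
  real, device, pointer, dimension(:) :: dev_ptr
contains
  attributes(global) subroutine test(dp)
    real, device, pointer, dimension(:) :: dp
  end subroutine
end module

program main
  use devptr
  real, device, target :: a_dev(4)
  dev_ptr => a_dev
  call test<<<1, 1>>>(dev_ptr)
end program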
4 changes: 3 additions & 1 deletion flang/lib/Semantics/resolve-names.cpp
@@ -4015,7 +4015,9 @@ bool SubprogramVisitor::Pre(const parser::PrefixSpec::Attributes &attrs) {
       *attrs == common::CUDASubprogramAttrs::Device) {
     const Scope &scope{currScope()};
     const Scope *mod{FindModuleContaining(scope)};
-    if (mod && mod->GetName().value() == "cudadevice") {
+    if (mod &&
+        (mod->GetName().value() == "cudadevice" ||
+         mod->GetName().value() == "__cuda_device")) {
       return false;
     }
     // Implicitly USE the cudadevice module by copying its symbols in the
32 changes: 32 additions & 0 deletions flang/module/__cuda_device.f90
@@ -0,0 +1,32 @@
+!===-- module/__cuda_device.f90 --------------------------------------------===!
+!
+! Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+! See https://llvm.org/LICENSE.txt for license information.
+! SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+!
+!===------------------------------------------------------------------------===!
+
+! This module contains CUDA Fortran interfaces used in cudadevice.f90.
+
+module __cuda_device
+implicit none
+
+! Set PRIVATE by default to explicitly only export what is meant
+! to be exported by this MODULE.
+
+  interface
+    attributes(device) function __fadd_rd(x, y) bind(c, name='__nv_fadd_rd')
+      real, intent(in), value :: x, y
+      real :: __fadd_rd
+    end function
+  end interface
+  public :: __fadd_rd
+
+  interface
+    attributes(device) function __fadd_ru(x, y) bind(c, name='__nv_fadd_ru')
+      real, intent(in), value :: x, y
+      real :: __fadd_ru
+    end function
+  end interface
+  public :: __fadd_ru
+end module
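Because cudadevice re-exports these interfaces (see the cudadevice.f90 hunk below) and device subprograms implicitly USE cudadevice, existing device code compiles unchanged; the split just lets the interfaces be declared outside the module that is implicitly used. A hypothetical usage sketch (subroutine and variable names invented):

! Directed-rounding add remains reachable through cudadevice, which now
! re-exports __fadd_rd from __cuda_device.
attributes(global) subroutine add_rd(a, b, c)
  use cudadevice, only: __fadd_rd
  real, device :: a, b, c
  c = __fadd_rd(a, b)   ! round-downward add, binds to __nv_fadd_rd
end subroutine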
17 changes: 1 addition & 16 deletions flang/module/cudadevice.f90
@@ -9,6 +9,7 @@
 ! CUDA Fortran procedures available in device subprogram
 
 module cudadevice
+use __cuda_device, only: __fadd_rd, __fadd_ru
 implicit none
 
 ! Set PRIVATE by default to explicitly only export what is meant
@@ -71,20 +72,4 @@ attributes(device) subroutine threadfence_system()
   end interface
   public :: threadfence_system
 
-  interface
-    attributes(device) function __fadd_rd(x, y) bind(c, name='__nv_fadd_rd')
-      real, intent(in) :: x, y
-      real :: __fadd_rd
-    end function
-  end interface
-  public :: __fadd_rd
-
-  interface
-    attributes(device) function __fadd_ru(x, y) bind(c, name='__nv_fadd_ru')
-      real, intent(in) :: x, y
-      real :: __fadd_ru
-    end function
-  end interface
-  public :: __fadd_ru
-
 end module
42 changes: 42 additions & 0 deletions flang/test/Fir/CUDA/cuda-launch.fir
@@ -62,3 +62,45 @@ module attributes {gpu.container_module, dlti.dl_spec = #dlti.dl_spec<#dlti.dl_e
 // CHECK-LABEL: func.func @_QMmod1Phost_sub()
 // CHECK: gpu.launch_func @cuda_device_mod::@_QMmod1Psub1 clusters in (%c2{{.*}}, %c2{{.*}}, %c1{{.*}})
 
+// -----
+
+module attributes {gpu.container_module, dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>} {
+  gpu.module @cuda_device_mod {
+    gpu.func @_QMdevptrPtest(%arg0: !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) kernel {
+      gpu.return
+    }
+  }
+  fir.global @_QMdevptrEdev_ptr {data_attr = #cuf.cuda<device>} : !fir.box<!fir.ptr<!fir.array<?xf32>>> {
+    %c0 = arith.constant 0 : index
+    %0 = fir.zero_bits !fir.ptr<!fir.array<?xf32>>
+    %1 = fir.shape %c0 : (index) -> !fir.shape<1>
+    %2 = fir.embox %0(%1) {allocator_idx = 2 : i32} : (!fir.ptr<!fir.array<?xf32>>, !fir.shape<1>) -> !fir.box<!fir.ptr<!fir.array<?xf32>>>
+    fir.has_value %2 : !fir.box<!fir.ptr<!fir.array<?xf32>>>
+  }
+  func.func @_QMdevptrPtest(%arg0: !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>> {cuf.data_attr = #cuf.cuda<device>, fir.bindc_name = "dp"}) attributes {cuf.proc_attr = #cuf.cuda_proc<global>} {
+    return
+  }
+  func.func @_QQmain() {
+    %c1_i32 = arith.constant 1 : i32
+    %c4 = arith.constant 4 : index
+    %0 = cuf.alloc !fir.array<4xf32> {bindc_name = "a_dev", data_attr = #cuf.cuda<device>, uniq_name = "_QFEa_dev"} -> !fir.ref<!fir.array<4xf32>>
+    %1 = fir.shape %c4 : (index) -> !fir.shape<1>
+    %2 = fir.declare %0(%1) {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFEa_dev"} : (!fir.ref<!fir.array<4xf32>>, !fir.shape<1>) -> !fir.ref<!fir.array<4xf32>>
+    %3 = fir.address_of(@_QMdevptrEdev_ptr) : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>
+    %4 = fir.declare %3 {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QMdevptrEdev_ptr"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>
+    %5 = fir.embox %2(%1) : (!fir.ref<!fir.array<4xf32>>, !fir.shape<1>) -> !fir.box<!fir.ptr<!fir.array<?xf32>>>
+    fir.store %5 to %4 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>
+    cuf.sync_descriptor @_QMdevptrEdev_ptr
+    cuf.kernel_launch @_QMdevptrPtest<<<%c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32>>>(%4) : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>)
+    cuf.free %2 : !fir.ref<!fir.array<4xf32>> {data_attr = #cuf.cuda<device>}
+    return
+  }
+}
+
+// CHECK-LABEL: func.func @_QQmain()
+// CHECK: %[[ADDROF:.*]] = fir.address_of(@_QMdevptrEdev_ptr) : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>
+// CHECK: %[[DECL:.*]] = fir.declare %[[ADDROF]] {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QMdevptrEdev_ptr"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>
+// CHECK: %[[CONV_DECL:.*]] = fir.convert %[[DECL]] : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) -> !fir.llvm_ptr<i8>
+// CHECK: %[[DEVADDR:.*]] = fir.call @_FortranACUFGetDeviceAddress(%[[CONV_DECL]], %{{.*}}, %{{.*}}) : (!fir.llvm_ptr<i8>, !fir.ref<i8>, i32) -> !fir.llvm_ptr<i8>
+// CHECK: %[[CONV_DEVADDR:.*]] = fir.convert %[[DEVADDR]] : (!fir.llvm_ptr<i8>) -> !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>
+// CHECK: gpu.launch_func @cuda_device_mod::@_QMdevptrPtest blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) dynamic_shared_memory_size %{{.*}} args(%[[CONV_DEVADDR]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>)
6 changes: 5 additions & 1 deletion flang/tools/f18/CMakeLists.txt
@@ -21,6 +21,7 @@ set(MODULES_WITHOUT_IMPLEMENTATION
     "__ppc_intrinsics"
     "mma"
     "__cuda_builtins"
+    "__cuda_device"
    "cudadevice"
     "ieee_arithmetic"
     "ieee_exceptions"
@@ -67,9 +68,12 @@ if (NOT CMAKE_CROSSCOMPILING)
     elseif(${filename} STREQUAL "__ppc_intrinsics" OR
            ${filename} STREQUAL "mma")
       set(depends ${FLANG_INTRINSIC_MODULES_DIR}/__ppc_types.mod)
-    elseif(${filename} STREQUAL "cudadevice")
+    elseif(${filename} STREQUAL "__cuda_device")
       set(opts -fc1 -xcuda)
       set(depends ${FLANG_INTRINSIC_MODULES_DIR}/__cuda_builtins.mod)
+    elseif(${filename} STREQUAL "cudadevice")
+      set(opts -fc1 -xcuda)
+      set(depends ${FLANG_INTRINSIC_MODULES_DIR}/__cuda_device.mod)
     else()
       set(depends ${FLANG_INTRINSIC_MODULES_DIR}/__fortran_builtins.mod)
       if(${filename} STREQUAL "iso_fortran_env")