Skip to content

Commit

Permalink
Add support for gpu*Reduce functions on AMD GPUs (#23950)
Browse files Browse the repository at this point in the history
Resolves Cray/chapel-private#5609

Continuation from where I left off in
#23689. In that PR, I
struggled with segfaults with AMD GPUs, so I had to back out of AMD
support. Turns out AMD GPUs tend to segfault at execution time if you
don't use the right `--offload-arch`, and that the default on the system
that I tested this was not right. This PR adds that while compiling the
reduction support code in the runtime to remove the blockage.

### Details
- The runtime now has `chpl_gpu_can_reduce`/`chpl_gpu_impl_can_reduce`
interface that returns true/false depending on whether we can use this
cub-based reduction support
- Today, it returns false for cpu-as-device mode or ROCm 4.x, which
doesn't have hipcub
- For cases where there's no cub-based reduction, we fall back to regular
CPU-based reductions. On ROCm 4.x this means we copy the array to the
host and reduce on host. Clearly, this is less than ideal and just a
portability stopgap. I hope to drop ROCm 4 support as soon as we can
- Adds a new `rocm-utils` header to be able to use `ROCM_VERSION_MAJOR`
portably, and to be able to use `ROCM_CALL` in multiple files
- Moves `test/gpu/native/noAmd/reduction` directory to `test/gpu/native`
and removes `noAmd.skipif`

[Reviewed by @stonea]

### Test
- [x] nvidia
- [x] amd with ROCm 4.2
- [x] amd with ROCm 4.4
- [x] amd with ROCm 5.2 `gpu/native/reduction` only  
- [x] amd with ROCm 5.4 `gpu/native/reduction` only
- [x] cpu `gpu/native/reduction` only
  • Loading branch information
e-kayrakli authored Nov 30, 2023
2 parents 2b26934 + 7cbd647 commit e547028
Show file tree
Hide file tree
Showing 31 changed files with 181 additions and 67 deletions.
30 changes: 24 additions & 6 deletions modules/standard/GPU.chpl
Original file line number Diff line number Diff line change
Expand Up @@ -421,10 +421,14 @@ module GPU
" elements cannot be reduced with gpu*Reduce functions");
}

if CHPL_GPU == "amd" {
compilerError("gpu*Reduce functions are not supported on AMD GPUs");
proc retType(param op: string, const ref A: [] ?t) type {
if isValReduce(op) then return A.eltType;
if isValIdxReduce(op) then return (A.eltType, int);
compilerError("Unknown reduction operation: ", op);
}
else if CHPL_GPU == "cpu" {


proc doCpuReduceHelp(param op: string, const ref A: [] ?t) {
select op {
when "sum" do return + reduce A;
when "min" do return min reduce A;
Expand All @@ -434,10 +438,20 @@ module GPU
otherwise do compilerError("Unknown reduction operation: ", op);
}
}
else {
compilerAssert(CHPL_GPU=="nvidia");
}

proc doCpuReduce(param op: string, const ref A: [] ?t) {
if CHPL_GPU=="cpu" {
return doCpuReduceHelp(op, A);
}
else {
var res: retType(op, A);
on here.parent {
var HostArr = A;
res = doCpuReduceHelp(op, HostArr);
}
return res;
}
}

proc getExternFuncName(param op: string, type t) param: string {
return "chpl_gpu_"+op+"_reduce_"+cTypeName;
Expand Down Expand Up @@ -504,6 +518,10 @@ module GPU
}

use CTypes;
extern proc chpl_gpu_can_reduce(): bool;
if !chpl_gpu_can_reduce() {
return doCpuReduce(op, A);
}

// find the extern function we'll use
param externFunc = getExternFuncName(op, t);
Expand Down
2 changes: 2 additions & 0 deletions runtime/include/chpl-gpu-impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,8 @@ void chpl_gpu_impl_stream_destroy(void* stream);
bool chpl_gpu_impl_stream_ready(void* stream);
void chpl_gpu_impl_stream_synchronize(void* stream);

bool chpl_gpu_impl_can_reduce(void);

#define DECL_ONE_REDUCE_IMPL(chpl_kind, data_type) \
void chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\
data_type* val, int* idx,\
Expand Down
2 changes: 2 additions & 0 deletions runtime/include/chpl-gpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,8 @@ size_t chpl_gpu_get_alloc_size(void* ptr);
bool chpl_gpu_can_access_peer(int dev1, int dev2);
void chpl_gpu_set_peer_access(int dev1, int dev2, bool enable);

bool chpl_gpu_can_reduce(void);

#define DECL_ONE_REDUCE(chpl_kind, data_type) \
void chpl_gpu_##chpl_kind##_reduce_##data_type(data_type* data, int n,\
data_type* val, int* idx);
Expand Down
4 changes: 4 additions & 0 deletions runtime/src/chpl-gpu.c
Original file line number Diff line number Diff line change
Expand Up @@ -973,6 +973,10 @@ void chpl_gpu_set_peer_access(int dev1, int dev2, bool enable) {
chpl_gpu_impl_set_peer_access(dev1, dev2, enable);
}

bool chpl_gpu_can_reduce(void) {
return chpl_gpu_impl_can_reduce();
}

#define DEF_ONE_REDUCE(kind, data_type)\
void chpl_gpu_##kind##_reduce_##data_type(data_type *data, int n, \
data_type* val, int* idx) { \
Expand Down
2 changes: 1 addition & 1 deletion runtime/src/gpu/amd/Makefile.share
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ SRCS = $(GPU_SRCS)

GPU_OBJS = $(addprefix $(GPU_OBJDIR)/,$(addsuffix .o,$(basename $(GPU_SRCS))))

RUNTIME_CXXFLAGS += -x hip
RUNTIME_CXXFLAGS += -x hip --offload-arch=$(CHPL_MAKE_GPU_ARCH)

$(RUNTIME_OBJ_DIR)/gpu-amd-reduce.o: gpu-amd-reduce.cc \
$(RUNTIME_OBJ_DIR_STAMP)
Expand Down
66 changes: 34 additions & 32 deletions runtime/src/gpu/amd/gpu-amd-reduce.cc
Original file line number Diff line number Diff line change
Expand Up @@ -19,29 +19,20 @@

#ifdef HAS_GPU_LOCALE

/* TODO uncomment these when the implementations are in
#include "../common/rocm-utils.h"

#include <hip/hip_common.h>

#if ROCM_VERSION_MAJOR >= 5
#include <hipcub/hipcub.hpp>
#include <rocm_version.h>
*/
#endif


#include "chpl-gpu.h"
#include "chpl-gpu-impl.h"
#include "gpu/chpl-gpu-reduce-util.h"

// Engin: I can't get neither hipCUB nor rocprim to work. (hipCUB is a light
// wrapper around rocprim anyways). I filed
// https://github.com/ROCmSoftwarePlatform/hipCUB/issues/304, but I don't know
// if/when I'll hear back something. For now, I am merging the code that's
// supposed to work but doesn't instead of removing them from my branch.
#if 1
#define DEF_ONE_REDUCE_RET_VAL(impl_kind, chpl_kind, data_type) \
void chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\
data_type* val, int* idx,\
void* stream) {\
chpl_internal_error("This function shouldn't have been called. Reduction is not supported with AMD GPUs\n");\
}
#elif ROCM_VERSION_MAJOR >= 5
#if ROCM_VERSION_MAJOR >= 5
#define DEF_ONE_REDUCE_RET_VAL(impl_kind, chpl_kind, data_type) \
void chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\
data_type* val, int* idx,\
Expand All @@ -50,20 +41,21 @@ void chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\
ROCM_CALL(hipMalloc(&result, sizeof(data_type)));\
void* temp = NULL; \
size_t temp_bytes = 0; \
ROCM_CALL(hipcub::DeviceReduce::impl_kind(temp, temp_bytes, data, (data_type*)result, n,\
0, true));\
ROCM_CALL(hipcub::DeviceReduce::impl_kind(temp, temp_bytes, data, \
(data_type*)result, n));\
ROCM_CALL(hipMalloc(((hipDeviceptr_t*)&temp), temp_bytes)); \
ROCM_CALL(hipcub::DeviceReduce::impl_kind(temp, temp_bytes, data, (data_type*)result, n,\
0, true));\
ROCM_CALL(hipcub::DeviceReduce::impl_kind(temp, temp_bytes, data, \
(data_type*)result, n));\
ROCM_CALL(hipMemcpyDtoHAsync(val, result, sizeof(data_type),\
(hipStream_t)stream)); \
ROCM_CALL(hipFree(result)); \
}
#else
#define DEF_ONE_REDUCE_RET_VAL(impl_kind, chpl_kind, data_type) \
void chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\
data_type* val,\
data_type* val, int* idx,\
void* stream) {\
chpl_internal_error("Reduction is not supported with AMD GPUs using ROCm version <5\n");\
chpl_internal_error("Reduction via runtime calls is not supported with AMD GPUs using ROCm version <5\n");\
}
#endif // 1

Expand All @@ -73,22 +65,34 @@ GPU_IMPL_REDUCE(DEF_ONE_REDUCE_RET_VAL, Max, max)

#undef DEF_ONE_REDUCE_RET_VAL

#if 1
#define DEF_ONE_REDUCE_RET_VAL_IDX(cub_kind, chpl_kind, data_type) \
#if ROCM_VERSION_MAJOR >= 5
#define DEF_ONE_REDUCE_RET_VAL_IDX(impl_kind, chpl_kind, data_type) \
void chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\
data_type* val, int* idx,\
void* stream) {\
chpl_internal_error("This function shouldn't have been called. Reduction is not supported with AMD GPUs\n");\
using kvp = hipcub::KeyValuePair<int,data_type>; \
kvp* result; \
ROCM_CALL(hipMalloc(&result, sizeof(kvp))); \
void* temp = NULL; \
size_t temp_bytes = 0; \
hipcub::DeviceReduce::impl_kind(temp, temp_bytes, data, (kvp*)result, n,\
(hipStream_t)stream);\
ROCM_CALL(hipMalloc(&temp, temp_bytes)); \
hipcub::DeviceReduce::impl_kind(temp, temp_bytes, data, (kvp*)result, n,\
(hipStream_t)stream);\
kvp result_host; \
ROCM_CALL(hipMemcpyDtoHAsync(&result_host, result, sizeof(kvp),\
(hipStream_t)stream)); \
*val = result_host.value; \
*idx = result_host.key; \
ROCM_CALL(hipFree(result)); \
}
#else
#define DEF_ONE_REDUCE_RET_VAL_IDX(cub_kind, chpl_kind, data_type) \
#define DEF_ONE_REDUCE_RET_VAL_IDX(impl_kind, chpl_kind, data_type) \
void chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\
data_type* val, int* idx,\
void* stream) {\
// TODO I don't know any other specific issues with these versions. Should be
// able to whip up the implementation quickly once we figure out what's going
// wrong here.
chpl_internal_error("Unimplemented");
chpl_internal_error("Reduction via runtime calls is not supported with AMD GPUs using ROCm version <5\n");\
}
#endif // 1

Expand All @@ -97,7 +101,5 @@ GPU_IMPL_REDUCE(DEF_ONE_REDUCE_RET_VAL_IDX, ArgMax, maxloc)

#undef DEF_ONE_REDUCE_RET_VAL_IDX

#undef DEF_REDUCE

#endif // HAS_GPU_LOCALE

26 changes: 9 additions & 17 deletions runtime/src/gpu/amd/gpu-amd.c
Original file line number Diff line number Diff line change
Expand Up @@ -29,31 +29,18 @@
#include "chplcgfns.h"
#include "chpl-env-gen.h"
#include "chpl-linefile-support.h"
#include "../common/rocm-utils.h"


#include <assert.h>

#ifndef __HIP_PLATFORM_AMD__
#define __HIP_PLATFORM_AMD__
#endif
#include <hip/hip_runtime.h>
#include <hip/hip_runtime_api.h>
#include <hip/hip_common.h>

static void chpl_gpu_rocm_check(int err, const char* file, int line) {
if(err == hipErrorContextAlreadyInUse) { return; }
if(err != hipSuccess) {
const int msg_len = 256;
char msg[msg_len];

snprintf(msg, msg_len,
"%s:%d: Error calling HIP function: %s (Code: %d)",
file, line, hipGetErrorString((hipError_t)err), err);

chpl_internal_error(msg);
}
}

#define ROCM_CALL(call) do {\
chpl_gpu_rocm_check((int)call, __FILE__, __LINE__);\
} while(0);

static inline
void* chpl_gpu_load_module(const char* fatbin_data) {
Expand Down Expand Up @@ -498,4 +485,9 @@ void chpl_gpu_impl_stream_synchronize(void* stream) {
ROCM_CALL(hipStreamSynchronize(stream));
}
}

bool chpl_gpu_impl_can_reduce(void) {
return ROCM_VERSION_MAJOR>=5;
}

#endif // HAS_GPU_LOCALE
63 changes: 63 additions & 0 deletions runtime/src/gpu/common/rocm-utils.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
/*
* Copyright 2020-2023 Hewlett Packard Enterprise Development LP
* Copyright 2004-2019 Cray Inc.
* Other additional copyright holders may be indicated within.
*
* The entirety of this work is licensed under the Apache License,
* Version 2.0 (the "License"); you may not use this file except
* in compliance with the License.
*
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef __HIP_PLATFORM_AMD__
#define __HIP_PLATFORM_AMD__
#endif
#include <hip/hip_common.h>
#include <hip/hip_runtime.h>

#if __has_include(<rocm-core/rocm_version.h>) // 5.x wants this
#include <rocm-core/rocm_version.h>
#elif __has_include(<rocm/rocm_version.h>) // 4.x wants this
#include <rocm/rocm_version.h>
#elif __has_include(<rocm_version.h>) // Deprecated. 5.x used to want this
#include <rocm_version.h>
#elif !defined(ROCM_VERSION_MAJOR)
#define ROCM_VERSION_MAJOR 4 // this is the safe bet
#endif

#ifdef __cplusplus
extern "C" {
#endif

extern void chpl_internal_error(const char*);


static void chpl_gpu_rocm_check(int err, const char* file, int line) {
if(err == hipErrorContextAlreadyInUse) { return; }
if(err != hipSuccess) {
const int msg_len = 256;
char msg[msg_len];

snprintf(msg, msg_len,
"%s:%d: Error calling HIP function: %s (Code: %d)",
file, line, hipGetErrorString((hipError_t)err), err);

chpl_internal_error(msg);
}
}

#ifdef __cplusplus
}
#endif

#define ROCM_CALL(call) do {\
chpl_gpu_rocm_check((int)call, __FILE__, __LINE__);\
} while(0);
4 changes: 4 additions & 0 deletions runtime/src/gpu/cpu/gpu-cpu.c
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,10 @@ bool chpl_gpu_impl_stream_ready(void* stream) {
void chpl_gpu_impl_stream_synchronize(void* stream) {
}

bool chpl_gpu_impl_can_reduce(void) {
return false;
}

#define DEF_ONE_REDUCE_RET_VAL(impl_kind, chpl_kind, data_type) \
void chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\
data_type* val, int* idx,\
Expand Down
4 changes: 4 additions & 0 deletions runtime/src/gpu/nvidia/gpu-nvidia.c
Original file line number Diff line number Diff line change
Expand Up @@ -480,4 +480,8 @@ void chpl_gpu_impl_stream_synchronize(void* stream) {
}
}

bool chpl_gpu_impl_can_reduce(void) {
return true;
}

#endif // HAS_GPU_LOCALE
2 changes: 0 additions & 2 deletions test/gpu/native/noAmd.skipif

This file was deleted.

6 changes: 0 additions & 6 deletions test/gpu/native/noAmd/reduction/largeArrays.execopts

This file was deleted.

File renamed without changes.
File renamed without changes.
Original file line number Diff line number Diff line change
@@ -1,7 +1,15 @@
use GPU;

config const printResult = false;
config const n = 100;
config var n = 100;

// testing large data:
// 1. is only meaningful if the reduction is actually done on a device
// 2. times out testing if we use CPU-based reduction, especially if it is a
// fallback.
// So, override n to be something smaller
extern proc chpl_gpu_can_reduce(): bool;
if !chpl_gpu_can_reduce() then n = 100;

var result: uint(8);
on here.gpus[0] {
Expand Down
11 changes: 11 additions & 0 deletions test/gpu/native/reduction/largeArrays.execopts
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
--n=2_000_000_000
--n=2_000_000_001
--n=2_147_483_647
--n=4_000_000_000
--n=4_000_000_001

# Engin: I want to test the following too, but hit unrelated issues on AMD
# https://github.com/chapel-lang/chapel/issues/23934
# -n=4_294_967_293
# -n=4_294_967_294
# -n=4_294_967_295
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,20 @@ config const kind = "min";
const isMin = kind=="min";
assert(isMin || kind=="max");

config const n = 2*max(int(32));
config const setIdx = n-1;
config var n = 4_000_000_000;
config var setIdx = n-1;

// testing large data:
// 1. is only meaningful if the reduction is actually done on a device
// 2. times out testing if we use CPU-based reduction, especially if it is a
// fallback.
// So, override n/setIdx to be something smaller
extern proc chpl_gpu_can_reduce(): bool;
if !chpl_gpu_can_reduce() {
n = 100;
setIdx = n-1;
}

assert(n>setIdx);

config const printResult = false;
Expand Down
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.

0 comments on commit e547028

Please sign in to comment.