Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add initial support for whole-array reduction on NVIDIA GPUs #23689

Merged
merged 42 commits into from
Nov 6, 2023
Merged
Show file tree
Hide file tree
Changes from 29 commits
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
096fdba
Snapshot
e-kayrakli Sep 19, 2023
9020bf5
Get things to link in a hacky way
e-kayrakli Sep 22, 2023
df97abb
Get the initial example working
e-kayrakli Sep 22, 2023
01bcf84
Cleanup
e-kayrakli Sep 22, 2023
38dc53f
Drop additional makefile logic
e-kayrakli Oct 7, 2023
3417301
Finish implementing basic reductions
e-kayrakli Oct 7, 2023
17118b3
Add test
e-kayrakli Oct 7, 2023
b5e689b
Add minloc and maxloc reduces
e-kayrakli Oct 9, 2023
f036931
Refactor basic reduce runtime interface, too
e-kayrakli Oct 9, 2023
41e30a9
Move the actual reduction functions to the GPU module
e-kayrakli Oct 9, 2023
8d0e3e0
Simplify runtime macros
e-kayrakli Oct 9, 2023
c56df6b
Start separating impl implementations
e-kayrakli Oct 13, 2023
7aa3642
Separate runtime interface from the implementation interface
e-kayrakli Oct 16, 2023
cf1e526
Runtime cleanup
e-kayrakli Oct 16, 2023
1489028
Add a common header
e-kayrakli Oct 16, 2023
60f921c
Blind implementation of reductions on AMD
e-kayrakli Oct 17, 2023
ab9b475
Add a compilerError in the module code
e-kayrakli Oct 17, 2023
5ab3e5e
Test/limit usage on AMD
e-kayrakli Oct 18, 2023
0819e31
Fix an issue, add perf test
e-kayrakli Oct 18, 2023
3a78ab2
Initial attempt to do multi-chunk reduction
e-kayrakli Oct 25, 2023
19ad754
Add a new test and fix an issue exposed by it
e-kayrakli Oct 25, 2023
06be1a8
Add new test
e-kayrakli Oct 25, 2023
d012a9b
Expand test to minloc,maxloc. Fix a bug
e-kayrakli Oct 26, 2023
e85a00c
Make the new functions work with cpu-as-device. Add skipifs
e-kayrakli Oct 26, 2023
186ad54
Add the missing good file
e-kayrakli Oct 26, 2023
4b6953f
Remove a trailing whitespace
e-kayrakli Oct 26, 2023
aca38c4
Revert some of the AMD changes
e-kayrakli Oct 26, 2023
257a6ae
Add the missing execopts
e-kayrakli Oct 26, 2023
9c49498
Remove an include
e-kayrakli Oct 26, 2023
1627385
Relocate tests into a new noAmd directory
e-kayrakli Nov 3, 2023
ab8a348
Add a user facing error message for unknown types and a test to lock …
e-kayrakli Nov 3, 2023
7725662
Add more fall-through otherwises
e-kayrakli Nov 3, 2023
6da77ee
Start adding documentation
e-kayrakli Nov 3, 2023
889ef73
Add one more fallthrough, unify error messages
e-kayrakli Nov 3, 2023
377f97d
Free runtime memory that we were leaking before
e-kayrakli Nov 3, 2023
1675b9e
A big refactor to reduce code duplication significantly
e-kayrakli Nov 3, 2023
dd10162
Fix a bug for non-zero-based arrays, add test
e-kayrakli Nov 3, 2023
7f007b7
Remove trailing whitespaces
e-kayrakli Nov 3, 2023
727f443
More clarifications in doc
e-kayrakli Nov 3, 2023
79ea359
Add a missing space
e-kayrakli Nov 3, 2023
cc39ef7
Add missing commas in AMD runtime
e-kayrakli Nov 6, 2023
00ee967
Move skipif to the parent directory
e-kayrakli Nov 6, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
151 changes: 151 additions & 0 deletions modules/standard/GPU.chpl
Original file line number Diff line number Diff line change
Expand Up @@ -389,4 +389,155 @@ module GPU
/* When run on a GPU, atomically compare the value in 'x' with 'cmp' and, if
   they are equal, store 'val' in 'x'. Returns the old value of 'x'. */
inline proc gpuAtomicCAS( ref x : ?T, cmp : T, val : T) : T { return gpuAtomicTernOp("CAS", x, cmp, val); }

// ============================
// Reductions
// ============================

@chpldoc.nodoc
config param gpuDebugReduce = false;

private inline proc doGpuReduce(param op: string, const ref A: [] ?t) {
  // Only these reduction kinds have runtime support; reject anything else at
  // compile time.
  if op != "sum" && op != "min" && op != "max" &&
     op != "minloc" && op != "maxloc" {
    compilerError("Unexpected reduction kind in doGpuReduce: ", op);
  }

  if CHPL_GPU == "amd" {
    compilerError("gpu*Reduce functions are not supported on AMD GPUs");
  }
  else if CHPL_GPU == "cpu" {
    // "cpu-as-device" mode: fall back to regular Chapel reductions.
    select op {
      when "sum" do return + reduce A;
      when "min" do return min reduce A;
      when "max" do return max reduce A;
      when "minloc" do return minloc reduce zip (A.domain, A);
      when "maxloc" do return maxloc reduce zip (A.domain, A);
    }
  }
  else {
    compilerAssert(CHPL_GPU=="nvidia");
  }

  // Maps a Chapel numeric type to the C type name embedded in the runtime's
  // reduction function names. NOTE(review): unsupported element types fall
  // through to "unknown", which surfaces later as an unresolved extern symbol
  // rather than a user-facing compile error -- confirm whether a
  // compilerError here would regress cpu-as-device mode before adding one.
  proc chplTypeToCTypeName(type t) param {
    select t {
      when int(8) do return "int8_t";
      when int(16) do return "int16_t";
      when int(32) do return "int32_t";
      when int(64) do return "int64_t";
      when uint(8) do return "uint8_t";
      when uint(16) do return "uint16_t";
      when uint(32) do return "uint32_t";
      when uint(64) do return "uint64_t";
      when real(32) do return "float";
      when real(64) do return "double";
    }
    return "unknown";
  }

  // Name of the runtime entry point implementing 'op' over elements of type
  // 't', e.g. "chpl_gpu_sum_reduce_int64_t".
  proc getExternFuncName(param op: string, type t) param: string {
    return "chpl_gpu_"+op+"_reduce_"+chplTypeToCTypeName(t);
  }

  // Folds one chunk's partial result into the running accumulator for the
  // value-only reductions (sum/min/max).
  inline proc subReduceVal(param op, ref accum: ?t, val: t) {
    select op {
      when "sum" do accum += val;
      when "min" do accum = min(accum, val);
      when "max" do accum = max(accum, val);
    }
  }

  // Folds one chunk's (index, value) partial result into the running result
  // for minloc/maxloc. 'baseOffset' translates the chunk-local index into an
  // offset within the whole array.
  inline proc subReduceValIdx(param op, const baseOffset, ref accum: ?t,
                              val: t) {
    select op {
      when "minloc" do
        if accum[1] > val[1] then accum = (val[0]+baseOffset, val[1]);
      when "maxloc" do
        if accum[1] < val[1] then accum = (val[0]+baseOffset, val[1]);
    }
  }

  // Yields (start, size) chunks covering 0..<size such that each chunk's
  // size stays well within 32 bits, working around CUB's use of 'int' for
  // sizes in its interface.
  iter offsetsThatCanFitIn32Bits(size: int) {
    // Engin: I've tried to get max(int(32)) to work as this bug is about CUB
    // using `int` as the size in the interface. However, getting close to
    // max(int(32)) also triggers the bug. So, I am choosing this as a
    // round/safe value for the time being.
    param chunkSize = 2_000_000_000;

    use Math only divCeil;
    const numChunks = divCeil(size, chunkSize);
    const standardChunkSize = divCeil(size, numChunks);

    if gpuDebugReduce then
      writeln("Will use ", numChunks, " chunks of size ", standardChunkSize);

    foreach chunk in 0..<numChunks {
      const start = chunk*standardChunkSize;
      // The last chunk may be shorter than the standard chunk size.
      const curChunkSize = if start+standardChunkSize <= size
                             then standardChunkSize
                             else size-start;
      if gpuDebugReduce then
        writef("Chunk %i: (start=%i, curChunkSize=%i) ", chunk, start,
               curChunkSize);

      yield (start, curChunkSize);
    }
  }

  use CTypes;

  param externFunc = getExternFuncName(op, t);

  if op == "sum" || op == "min" || op == "max" {
    // Value-only reduction: seed the accumulator with the operation's
    // identity (0 for sum via default init, extreme values for min/max),
    // then reduce chunk by chunk.
    var val: t;
    if op == "min" then
      val = max(t);
    else if op == "max" then
      val = min(t);

    extern externFunc proc reduce_fn(data, size, ref val);

    const basePtr = c_ptrToConst(A);
    for (offset,size) in offsetsThatCanFitIn32Bits(A.size) {
      var curVal: t;
      reduce_fn(basePtr+offset, size, curVal);
      if gpuDebugReduce then
        writef(" (curVal=%i)\n", curVal);
      subReduceVal(op, val, curVal);
    }

    return val;
  }
  else {
    // minloc/maxloc: the runtime hands back a chunk-local int(32) index, so
    // keep the running result as an (index, value) tuple and adjust the
    // index by each chunk's offset.
    var ret: (int, t);
    if op == "minloc" then
      ret[1] = max(t);
    else if op == "maxloc" then
      ret[1] = min(t);

    extern externFunc proc reduce_fn(data, size, ref val, ref idx);

    const basePtr = c_ptrToConst(A);
    for (offset,size) in offsetsThatCanFitIn32Bits(A.size) {
      var curIdx: int(32);
      var curVal: t;
      reduce_fn(basePtr+offset, size, curVal, curIdx);
      subReduceValIdx(op, offset, ret, (curIdx, curVal));
      if gpuDebugReduce then
        writef(" (curIdx=%i curVal=%i ret=%?)\n", curIdx, curVal, ret);
    }

    return ret;
  }
}

/* Returns the sum of the elements of the array 'A', reduced on the GPU. */
inline proc gpuSumReduce(const ref A: [] ?t) do return doGpuReduce("sum", A);

/* Returns the minimum element of the array 'A', reduced on the GPU. */
inline proc gpuMinReduce(const ref A: [] ?t) do return doGpuReduce("min", A);

/* Returns the maximum element of the array 'A', reduced on the GPU. */
inline proc gpuMaxReduce(const ref A: [] ?t) do return doGpuReduce("max", A);

/* Returns an (index, value) tuple for a minimum element of the array 'A',
   reduced on the GPU. NOTE(review): the index is computed as an offset from
   the start of 'A' -- confirm how non-zero-based domains are handled. */
inline proc gpuMinLocReduce(const ref A: [] ?t) do return doGpuReduce("minloc", A);

/* Returns an (index, value) tuple for a maximum element of the array 'A',
   reduced on the GPU. NOTE(review): the index is computed as an offset from
   the start of 'A' -- confirm how non-zero-based domains are handled. */
inline proc gpuMaxLocReduce(const ref A: [] ?t) do return doGpuReduce("maxloc", A);

}
21 changes: 21 additions & 0 deletions runtime/include/chpl-gpu-impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,27 @@ void chpl_gpu_impl_stream_destroy(void* stream);
bool chpl_gpu_impl_stream_ready(void* stream);
void chpl_gpu_impl_stream_synchronize(void* stream);

/* Declares the per-vendor device implementations of the value-only
   reductions (sum/min/max): chpl_gpu_impl_<kind>_reduce_<type>(). */
#define DECL_ONE_REDUCE_IMPL_RET_VAL(chpl_kind, data_type) \
void chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\
                                                    data_type* val,\
                                                    void* stream);

GPU_REDUCE(DECL_ONE_REDUCE_IMPL_RET_VAL, sum)
GPU_REDUCE(DECL_ONE_REDUCE_IMPL_RET_VAL, min)
GPU_REDUCE(DECL_ONE_REDUCE_IMPL_RET_VAL, max)

/* Fix: previously #undef'd DECL_ONE_REDUCE_RET_VAL, a name never defined
   here, leaving the helper macro defined for the rest of the translation
   unit. */
#undef DECL_ONE_REDUCE_IMPL_RET_VAL

/* Declares the per-vendor device implementations of the index-returning
   reductions (minloc/maxloc). */
#define DECL_ONE_REDUCE_IMPL_RET_VAL_IDX(chpl_kind, data_type) \
void chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\
                                                    data_type* val, int* idx,\
                                                    void* stream);

GPU_REDUCE(DECL_ONE_REDUCE_IMPL_RET_VAL_IDX, minloc)
GPU_REDUCE(DECL_ONE_REDUCE_IMPL_RET_VAL_IDX, maxloc)

/* Fix: previously #undef'd the wrong name (DECL_ONE_REDUCE_RET_VAL_IDX). */
#undef DECL_ONE_REDUCE_IMPL_RET_VAL_IDX

#ifdef __cplusplus
}
#endif
Expand Down
20 changes: 20 additions & 0 deletions runtime/include/chpl-gpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#include <stdbool.h>
#include "chpl-tasks.h"
#include "chpl-mem-desc.h"
#include "gpu/chpl-gpu-reduce-util.h"

#ifdef __cplusplus
extern "C" {
Expand Down Expand Up @@ -149,6 +150,25 @@ size_t chpl_gpu_get_alloc_size(void* ptr);
bool chpl_gpu_can_access_peer(int dev1, int dev2);
void chpl_gpu_set_peer_access(int dev1, int dev2, bool enable);

#define DECL_ONE_REDUCE_RET_VAL(chpl_kind, data_type) \
void chpl_gpu_##chpl_kind##_reduce_##data_type(data_type* data, int n,\
data_type* val);

GPU_REDUCE(DECL_ONE_REDUCE_RET_VAL, sum);
GPU_REDUCE(DECL_ONE_REDUCE_RET_VAL, min);
GPU_REDUCE(DECL_ONE_REDUCE_RET_VAL, max);

#undef DECL_ONE_REDUCE_RET_VAL

#define DECL_ONE_REDUCE_RET_VAL_IDX(chpl_kind, data_type) \
void chpl_gpu_##chpl_kind##_reduce_##data_type(data_type* data, int n,\
data_type* val, int* idx);

GPU_REDUCE(DECL_ONE_REDUCE_RET_VAL_IDX, minloc);
GPU_REDUCE(DECL_ONE_REDUCE_RET_VAL_IDX, maxloc);

#undef DECL_ONE_REDUCE_RET_VAL_IDX

#endif // HAS_GPU_LOCALE

#ifdef __cplusplus
Expand Down
47 changes: 47 additions & 0 deletions runtime/include/gpu/chpl-gpu-reduce-util.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
/*
* Copyright 2020-2023 Hewlett Packard Enterprise Development LP
* Copyright 2004-2019 Cray Inc.
* Other additional copyright holders may be indicated within.
*
* The entirety of this work is licensed under the Apache License,
* Version 2.0 (the "License"); you may not use this file except
* in compliance with the License.
*
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifdef HAS_GPU_LOCALE

/* Instantiates MACRO(impl_kind, chpl_kind, type) once for each element type
   supported by the GPU reductions. Used by the per-vendor implementation
   layers. No trailing ';' in the expansion: the per-item macros are
   self-terminating, so baking one in would leave a stray file-scope ';' at
   every call site (not valid ISO C). */
#define GPU_IMPL_REDUCE(MACRO, impl_kind, chpl_kind) \
  MACRO(impl_kind, chpl_kind, int8_t) \
  MACRO(impl_kind, chpl_kind, int16_t) \
  MACRO(impl_kind, chpl_kind, int32_t) \
  MACRO(impl_kind, chpl_kind, int64_t) \
  MACRO(impl_kind, chpl_kind, uint8_t) \
  MACRO(impl_kind, chpl_kind, uint16_t) \
  MACRO(impl_kind, chpl_kind, uint32_t) \
  MACRO(impl_kind, chpl_kind, uint64_t) \
  MACRO(impl_kind, chpl_kind, float) \
  MACRO(impl_kind, chpl_kind, double)

/* Instantiates MACRO(chpl_kind, type) once for each supported element type.
   Used to declare/define the driver-agnostic runtime interface. */
#define GPU_REDUCE(MACRO, chpl_kind) \
  MACRO(chpl_kind, int8_t) \
  MACRO(chpl_kind, int16_t) \
  MACRO(chpl_kind, int32_t) \
  MACRO(chpl_kind, int64_t) \
  MACRO(chpl_kind, uint8_t) \
  MACRO(chpl_kind, uint16_t) \
  MACRO(chpl_kind, uint32_t) \
  MACRO(chpl_kind, uint64_t) \
  MACRO(chpl_kind, float) \
  MACRO(chpl_kind, double)

#endif // HAS_GPU_LOCALE

52 changes: 52 additions & 0 deletions runtime/src/chpl-gpu.c
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@ bool chpl_gpu_use_stream_per_task = true;
#include "chpl-env.h"
#include "chpl-comm-compiler-macros.h"

#include "gpu/chpl-gpu-reduce-util.h"

void chpl_gpu_init(void) {
chpl_gpu_impl_init(&chpl_gpu_num_devices);

Expand Down Expand Up @@ -697,4 +699,54 @@ void chpl_gpu_set_peer_access(int dev1, int dev2, bool enable) {
chpl_gpu_impl_set_peer_access(dev1, dev2, enable);
}

/* Driver-agnostic entry point for the value-only reductions (sum/min/max):
   selects the task's requested device, forwards to the per-vendor
   implementation on that device's stream, and eagerly synchronizes with the
   host when chpl_gpu_sync_with_host is set. */
#define DEF_ONE_REDUCE_RET_VAL(kind, data_type)\
void chpl_gpu_##kind##_reduce_##data_type(data_type *data, int n, \
                                          data_type* val) { \
  CHPL_GPU_DEBUG("chpl_gpu_" #kind "_reduce_" #data_type " called\n"); \
  \
  int dev = chpl_task_getRequestedSubloc(); \
  chpl_gpu_impl_use_device(dev); \
  void* stream = get_stream(dev); \
  \
  chpl_gpu_impl_##kind##_reduce_##data_type(data, n, val, stream); \
  \
  if (chpl_gpu_sync_with_host) { \
    CHPL_GPU_DEBUG("Eagerly synchronizing stream %p\n", stream); \
    wait_stream(stream); \
  } \
  \
  CHPL_GPU_DEBUG("chpl_gpu_" #kind "_reduce_" #data_type " returned\n"); \
}

GPU_REDUCE(DEF_ONE_REDUCE_RET_VAL, sum)
GPU_REDUCE(DEF_ONE_REDUCE_RET_VAL, min)
GPU_REDUCE(DEF_ONE_REDUCE_RET_VAL, max)

#undef DEF_ONE_REDUCE_RET_VAL

/* Same shape as above for the index-returning reductions (minloc/maxloc).
   NOTE(review): 'idx' receives the index reported by the impl layer --
   presumably a 0-based offset into 'data'; confirm against the vendor
   implementations. */
#define DEF_ONE_REDUCE_RET_VAL_IDX(kind, data_type)\
void chpl_gpu_##kind##_reduce_##data_type(data_type *data, int n, \
                                          data_type* val, int* idx) { \
  CHPL_GPU_DEBUG("chpl_gpu_" #kind "_reduce_" #data_type " called\n"); \
  \
  int dev = chpl_task_getRequestedSubloc(); \
  chpl_gpu_impl_use_device(dev); \
  void* stream = get_stream(dev); \
  \
  chpl_gpu_impl_##kind##_reduce_##data_type(data, n, val, idx, stream); \
  \
  if (chpl_gpu_sync_with_host) { \
    CHPL_GPU_DEBUG("Eagerly synchronizing stream %p\n", stream); \
    wait_stream(stream); \
  } \
  \
  CHPL_GPU_DEBUG("chpl_gpu_" #kind "_reduce_" #data_type " returned\n"); \
}

/* No trailing ';' here, for consistency with the sum/min/max instantiations
   above (the macro expansion ends in a function definition's '}'). */
GPU_REDUCE(DEF_ONE_REDUCE_RET_VAL_IDX, minloc)
GPU_REDUCE(DEF_ONE_REDUCE_RET_VAL_IDX, maxloc)

#undef DEF_ONE_REDUCE_RET_VAL_IDX


#endif
1 change: 1 addition & 0 deletions runtime/src/gpu/amd/Makefile.include
Original file line number Diff line number Diff line change
Expand Up @@ -21,5 +21,6 @@ GPU_SUBDIR = src/gpu/amd
GPU_OBJDIR = $(RUNTIME_BUILD)/$(GPU_SUBDIR)

ALL_SRCS += $(CURDIR)/$(GPU_SUBDIR)/*.c
ALL_SRCS += $(CURDIR)/$(GPU_SUBDIR)/*.cc

include $(RUNTIME_ROOT)/$(GPU_SUBDIR)/Makefile.share
12 changes: 9 additions & 3 deletions runtime/src/gpu/amd/Makefile.share
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,14 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
SRCS = gpu-amd.c
GPU_SRCS = gpu-amd-reduce.cc gpu-amd.c

GPU_SRCS = gpu-amd.c
SRCS = $(GPU_SRCS)

GPU_OBJS = $(GPU_SRCS:%.c=$(GPU_OBJDIR)/%.o)
GPU_OBJS = $(addprefix $(GPU_OBJDIR)/,$(addsuffix .o,$(basename $(GPU_SRCS))))

RUNTIME_CXXFLAGS += -x hip

$(RUNTIME_OBJ_DIR)/gpu-amd-reduce.o: gpu-amd-reduce.cc \
$(RUNTIME_OBJ_DIR_STAMP)
$(CXX) -c -std=c++14 $(RUNTIME_CXXFLAGS) $(RUNTIME_INCLS) -o $@ $<
Loading