Skip to content

Commit

Permalink
Add support for gpu*Reduce functions on AMD GPUs (#23950)
Browse files Browse the repository at this point in the history
Resolves Cray/chapel-private#5609

Continuation from where I left off in
#23689. In that PR, I
struggled with segfaults with AMD GPUs, so I had to back out of AMD
support. Turns out AMD GPUs tend to segfault at execution time if you
don't use the right `--offload-arch`, and that the default on the system
that I tested this was not right. This PR adds that while compiling the
reduction support code in the runtime to remove the blockage.

### Details
- The runtime now has `chpl_gpu_can_reduce`/`chpl_gpu_impl_can_reduce`
interface that returns true/false depending on whether we can use this
cub-based reduction support
- Today, it returns false for cpu-as-device mode or ROCm 4.x, which
doesn't have hipcub
- For cases where there's no cub-based reduction, we fall back to regular
CPU-based reductions. On ROCm 4.x this means we copy the array to the
host and reduce on host. Clearly, this is less than ideal and just a
portability stopgap. I hope to drop ROCm 4 support as soon as we can
- Adds a new `rocm-utils` header to be able to use `ROCM_VERSION_MAJOR`
portably, and to be able to use `ROCM_CALL` in multiple files
- Moves `test/gpu/native/noAmd/reduction` directory to `test/gpu/native`
and removes `noAmd.skipif`

[Reviewed by @stonea]

### Test
- [x] nvidia
- [x] amd with ROCm 4.2
- [x] amd with ROCm 4.4
- [x] amd with ROCm 5.2 `gpu/native/reduction` only  
- [x] amd with ROCm 5.4 `gpu/native/reduction` only
- [x] cpu `gpu/native/reduction` only
  • Loading branch information
e-kayrakli authored Nov 30, 2023
2 parents 2b26934 + 7cbd647 commit e547028
Show file tree
Hide file tree
Showing 31 changed files with 181 additions and 67 deletions.
30 changes: 24 additions & 6 deletions modules/standard/GPU.chpl
Original file line number Diff line number Diff line change
Expand Up @@ -421,10 +421,14 @@ module GPU
" elements cannot be reduced with gpu*Reduce functions");
}

if CHPL_GPU == "amd" {
compilerError("gpu*Reduce functions are not supported on AMD GPUs");
proc retType(param op: string, const ref A: [] ?t) type {
if isValReduce(op) then return A.eltType;
if isValIdxReduce(op) then return (A.eltType, int);
compilerError("Unknown reduction operation: ", op);
}
else if CHPL_GPU == "cpu" {


proc doCpuReduceHelp(param op: string, const ref A: [] ?t) {
select op {
when "sum" do return + reduce A;
when "min" do return min reduce A;
Expand All @@ -434,10 +438,20 @@ module GPU
otherwise do compilerError("Unknown reduction operation: ", op);
}
}
else {
compilerAssert(CHPL_GPU=="nvidia");
}

proc doCpuReduce(param op: string, const ref A: [] ?t) {
if CHPL_GPU=="cpu" {
return doCpuReduceHelp(op, A);
}
else {
var res: retType(op, A);
on here.parent {
var HostArr = A;
res = doCpuReduceHelp(op, HostArr);
}
return res;
}
}

proc getExternFuncName(param op: string, type t) param: string {
return "chpl_gpu_"+op+"_reduce_"+cTypeName;
Expand Down Expand Up @@ -504,6 +518,10 @@ module GPU
}

use CTypes;
extern proc chpl_gpu_can_reduce(): bool;
if !chpl_gpu_can_reduce() {
return doCpuReduce(op, A);
}

// find the extern function we'll use
param externFunc = getExternFuncName(op, t);
Expand Down
2 changes: 2 additions & 0 deletions runtime/include/chpl-gpu-impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,8 @@ void chpl_gpu_impl_stream_destroy(void* stream);
bool chpl_gpu_impl_stream_ready(void* stream);
void chpl_gpu_impl_stream_synchronize(void* stream);

bool chpl_gpu_impl_can_reduce(void);

#define DECL_ONE_REDUCE_IMPL(chpl_kind, data_type) \
void chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\
data_type* val, int* idx,\
Expand Down
2 changes: 2 additions & 0 deletions runtime/include/chpl-gpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,8 @@ size_t chpl_gpu_get_alloc_size(void* ptr);
bool chpl_gpu_can_access_peer(int dev1, int dev2);
void chpl_gpu_set_peer_access(int dev1, int dev2, bool enable);

bool chpl_gpu_can_reduce(void);

#define DECL_ONE_REDUCE(chpl_kind, data_type) \
void chpl_gpu_##chpl_kind##_reduce_##data_type(data_type* data, int n,\
data_type* val, int* idx);
Expand Down
4 changes: 4 additions & 0 deletions runtime/src/chpl-gpu.c
Original file line number Diff line number Diff line change
Expand Up @@ -973,6 +973,10 @@ void chpl_gpu_set_peer_access(int dev1, int dev2, bool enable) {
chpl_gpu_impl_set_peer_access(dev1, dev2, enable);
}

bool chpl_gpu_can_reduce(void) {
return chpl_gpu_impl_can_reduce();
}

#define DEF_ONE_REDUCE(kind, data_type)\
void chpl_gpu_##kind##_reduce_##data_type(data_type *data, int n, \
data_type* val, int* idx) { \
Expand Down
2 changes: 1 addition & 1 deletion runtime/src/gpu/amd/Makefile.share
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ SRCS = $(GPU_SRCS)

GPU_OBJS = $(addprefix $(GPU_OBJDIR)/,$(addsuffix .o,$(basename $(GPU_SRCS))))

RUNTIME_CXXFLAGS += -x hip
RUNTIME_CXXFLAGS += -x hip --offload-arch=$(CHPL_MAKE_GPU_ARCH)

$(RUNTIME_OBJ_DIR)/gpu-amd-reduce.o: gpu-amd-reduce.cc \
$(RUNTIME_OBJ_DIR_STAMP)
Expand Down
66 changes: 34 additions & 32 deletions runtime/src/gpu/amd/gpu-amd-reduce.cc
Original file line number Diff line number Diff line change
Expand Up @@ -19,29 +19,20 @@

#ifdef HAS_GPU_LOCALE

/* TODO uncomment these when the implementations are in
#include "../common/rocm-utils.h"

#include <hip/hip_common.h>

#if ROCM_VERSION_MAJOR >= 5
#include <hipcub/hipcub.hpp>
#include <rocm_version.h>
*/
#endif


#include "chpl-gpu.h"
#include "chpl-gpu-impl.h"
#include "gpu/chpl-gpu-reduce-util.h"

// Engin: I can't get neither hipCUB nor rocprim to work. (hipCUB is a light
// wrapper around rocprim anyways). I filed
// https://github.com/ROCmSoftwarePlatform/hipCUB/issues/304, but I don't know
// if/when I'll hear back something. For now, I am merging the code that's
// supposed to work but doesn't instead of removing them from my branch.
#if 1
#define DEF_ONE_REDUCE_RET_VAL(impl_kind, chpl_kind, data_type) \
void chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\
data_type* val, int* idx,\
void* stream) {\
chpl_internal_error("This function shouldn't have been called. Reduction is not supported with AMD GPUs\n");\
}
#elif ROCM_VERSION_MAJOR >= 5
#if ROCM_VERSION_MAJOR >= 5
#define DEF_ONE_REDUCE_RET_VAL(impl_kind, chpl_kind, data_type) \
void chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\
data_type* val, int* idx,\
Expand All @@ -50,20 +41,21 @@ void chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\
ROCM_CALL(hipMalloc(&result, sizeof(data_type)));\
void* temp = NULL; \
size_t temp_bytes = 0; \
ROCM_CALL(hipcub::DeviceReduce::impl_kind(temp, temp_bytes, data, (data_type*)result, n,\
0, true));\
ROCM_CALL(hipcub::DeviceReduce::impl_kind(temp, temp_bytes, data, \
(data_type*)result, n));\
ROCM_CALL(hipMalloc(((hipDeviceptr_t*)&temp), temp_bytes)); \
ROCM_CALL(hipcub::DeviceReduce::impl_kind(temp, temp_bytes, data, (data_type*)result, n,\
0, true));\
ROCM_CALL(hipcub::DeviceReduce::impl_kind(temp, temp_bytes, data, \
(data_type*)result, n));\
ROCM_CALL(hipMemcpyDtoHAsync(val, result, sizeof(data_type),\
(hipStream_t)stream)); \
ROCM_CALL(hipFree(result)); \
}
#else
#define DEF_ONE_REDUCE_RET_VAL(impl_kind, chpl_kind, data_type) \
void chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\
data_type* val,\
data_type* val, int* idx,\
void* stream) {\
chpl_internal_error("Reduction is not supported with AMD GPUs using ROCm version <5\n");\
chpl_internal_error("Reduction via runtime calls is not supported with AMD GPUs using ROCm version <5\n");\
}
#endif // 1

Expand All @@ -73,22 +65,34 @@ GPU_IMPL_REDUCE(DEF_ONE_REDUCE_RET_VAL, Max, max)

#undef DEF_ONE_REDUCE_RET_VAL

#if 1
#define DEF_ONE_REDUCE_RET_VAL_IDX(cub_kind, chpl_kind, data_type) \
#if ROCM_VERSION_MAJOR >= 5
#define DEF_ONE_REDUCE_RET_VAL_IDX(impl_kind, chpl_kind, data_type) \
void chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\
data_type* val, int* idx,\
void* stream) {\
chpl_internal_error("This function shouldn't have been called. Reduction is not supported with AMD GPUs\n");\
using kvp = hipcub::KeyValuePair<int,data_type>; \
kvp* result; \
ROCM_CALL(hipMalloc(&result, sizeof(kvp))); \
void* temp = NULL; \
size_t temp_bytes = 0; \
hipcub::DeviceReduce::impl_kind(temp, temp_bytes, data, (kvp*)result, n,\
(hipStream_t)stream);\
ROCM_CALL(hipMalloc(&temp, temp_bytes)); \
hipcub::DeviceReduce::impl_kind(temp, temp_bytes, data, (kvp*)result, n,\
(hipStream_t)stream);\
kvp result_host; \
ROCM_CALL(hipMemcpyDtoHAsync(&result_host, result, sizeof(kvp),\
(hipStream_t)stream)); \
*val = result_host.value; \
*idx = result_host.key; \
ROCM_CALL(hipFree(result)); \
}
#else
#define DEF_ONE_REDUCE_RET_VAL_IDX(cub_kind, chpl_kind, data_type) \
#define DEF_ONE_REDUCE_RET_VAL_IDX(impl_kind, chpl_kind, data_type) \
void chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\
data_type* val, int* idx,\
void* stream) {\
// TODO I don't know any other specific issues with these versions. Should be
// able to whip up the implementation quickly once we figure out what's going
// wrong here.
chpl_internal_error("Unimplemented");
chpl_internal_error("Reduction via runtime calls is not supported with AMD GPUs using ROCm version <5\n");\
}
#endif // 1

Expand All @@ -97,7 +101,5 @@ GPU_IMPL_REDUCE(DEF_ONE_REDUCE_RET_VAL_IDX, ArgMax, maxloc)

#undef DEF_ONE_REDUCE_RET_VAL_IDX

#undef DEF_REDUCE

#endif // HAS_GPU_LOCALE

26 changes: 9 additions & 17 deletions runtime/src/gpu/amd/gpu-amd.c
Original file line number Diff line number Diff line change
Expand Up @@ -29,31 +29,18 @@
#include "chplcgfns.h"
#include "chpl-env-gen.h"
#include "chpl-linefile-support.h"
#include "../common/rocm-utils.h"


#include <assert.h>

#ifndef __HIP_PLATFORM_AMD__
#define __HIP_PLATFORM_AMD__
#endif
#include <hip/hip_runtime.h>
#include <hip/hip_runtime_api.h>
#include <hip/hip_common.h>

static void chpl_gpu_rocm_check(int err, const char* file, int line) {
if(err == hipErrorContextAlreadyInUse) { return; }
if(err != hipSuccess) {
const int msg_len = 256;
char msg[msg_len];

snprintf(msg, msg_len,
"%s:%d: Error calling HIP function: %s (Code: %d)",
file, line, hipGetErrorString((hipError_t)err), err);

chpl_internal_error(msg);
}
}

#define ROCM_CALL(call) do {\
chpl_gpu_rocm_check((int)call, __FILE__, __LINE__);\
} while(0);

static inline
void* chpl_gpu_load_module(const char* fatbin_data) {
Expand Down Expand Up @@ -498,4 +485,9 @@ void chpl_gpu_impl_stream_synchronize(void* stream) {
ROCM_CALL(hipStreamSynchronize(stream));
}
}

bool chpl_gpu_impl_can_reduce(void) {
return ROCM_VERSION_MAJOR>=5;
}

#endif // HAS_GPU_LOCALE
63 changes: 63 additions & 0 deletions runtime/src/gpu/common/rocm-utils.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
/*
* Copyright 2020-2023 Hewlett Packard Enterprise Development LP
* Copyright 2004-2019 Cray Inc.
* Other additional copyright holders may be indicated within.
*
* The entirety of this work is licensed under the Apache License,
* Version 2.0 (the "License"); you may not use this file except
* in compliance with the License.
*
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef __HIP_PLATFORM_AMD__
#define __HIP_PLATFORM_AMD__
#endif
#include <hip/hip_common.h>
#include <hip/hip_runtime.h>

#if __has_include(<rocm-core/rocm_version.h>) // 5.x wants this
#include <rocm-core/rocm_version.h>
#elif __has_include(<rocm/rocm_version.h>) // 4.x wants this
#include <rocm/rocm_version.h>
#elif __has_include(<rocm_version.h>) // Deprecated. 5.x used to want this
#include <rocm_version.h>
#elif !defined(ROCM_VERSION_MAJOR)
#define ROCM_VERSION_MAJOR 4 // this is the safe bet
#endif

#ifdef __cplusplus
extern "C" {
#endif

extern void chpl_internal_error(const char*);


static void chpl_gpu_rocm_check(int err, const char* file, int line) {
if(err == hipErrorContextAlreadyInUse) { return; }
if(err != hipSuccess) {
const int msg_len = 256;
char msg[msg_len];

snprintf(msg, msg_len,
"%s:%d: Error calling HIP function: %s (Code: %d)",
file, line, hipGetErrorString((hipError_t)err), err);

chpl_internal_error(msg);
}
}

#ifdef __cplusplus
}
#endif

#define ROCM_CALL(call) do {\
chpl_gpu_rocm_check((int)call, __FILE__, __LINE__);\
} while(0);
4 changes: 4 additions & 0 deletions runtime/src/gpu/cpu/gpu-cpu.c
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,10 @@ bool chpl_gpu_impl_stream_ready(void* stream) {
void chpl_gpu_impl_stream_synchronize(void* stream) {
}

bool chpl_gpu_impl_can_reduce(void) {
return false;
}

#define DEF_ONE_REDUCE_RET_VAL(impl_kind, chpl_kind, data_type) \
void chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\
data_type* val, int* idx,\
Expand Down
4 changes: 4 additions & 0 deletions runtime/src/gpu/nvidia/gpu-nvidia.c
Original file line number Diff line number Diff line change
Expand Up @@ -480,4 +480,8 @@ void chpl_gpu_impl_stream_synchronize(void* stream) {
}
}

bool chpl_gpu_impl_can_reduce(void) {
return true;
}

#endif // HAS_GPU_LOCALE
2 changes: 0 additions & 2 deletions test/gpu/native/noAmd.skipif

This file was deleted.

6 changes: 0 additions & 6 deletions test/gpu/native/noAmd/reduction/largeArrays.execopts

This file was deleted.

File renamed without changes.
File renamed without changes.
Original file line number Diff line number Diff line change
@@ -1,7 +1,15 @@
use GPU;

config const printResult = false;
config const n = 100;
config var n = 100;

// testing large data:
// 1. is only meaningful if the reduction is actually done on a device
// 2. times out testing if we use CPU-based reduction, especially if it is a
// fallback.
// So, override n to be something smaller
extern proc chpl_gpu_can_reduce(): bool;
if !chpl_gpu_can_reduce() then n = 100;

var result: uint(8);
on here.gpus[0] {
Expand Down
11 changes: 11 additions & 0 deletions test/gpu/native/reduction/largeArrays.execopts
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
--n=2_000_000_000
--n=2_000_000_001
--n=2_147_483_647
--n=4_000_000_000
--n=4_000_000_001

# Engin: I want to test the following too, but hit unrelated issues on AMD
# https://github.com/chapel-lang/chapel/issues/23934
# -n=4_294_967_293
# -n=4_294_967_294
# -n=4_294_967_295
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,20 @@ config const kind = "min";
const isMin = kind=="min";
assert(isMin || kind=="max");

config const n = 2*max(int(32));
config const setIdx = n-1;
config var n = 4_000_000_000;
config var setIdx = n-1;

// testing large data:
// 1. is only meaningful if the reduction is actually done on a device
// 2. times out testing if we use CPU-based reduction, especially if it is a
// fallback.
// So, override n/setIdx to be something smaller
extern proc chpl_gpu_can_reduce(): bool;
if !chpl_gpu_can_reduce() {
n = 100;
setIdx = n-1;
}

assert(n>setIdx);

config const printResult = false;
Expand Down
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.

0 comments on commit e547028

Please sign in to comment.