Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add initial support for whole-array reduction on NVIDIA GPUs #23689

Merged
merged 42 commits into from
Nov 6, 2023
Merged
Show file tree
Hide file tree
Changes from 29 commits
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
096fdba
Snapshot
e-kayrakli Sep 19, 2023
9020bf5
Get things to link in a hacky way
e-kayrakli Sep 22, 2023
df97abb
Get the initial example working
e-kayrakli Sep 22, 2023
01bcf84
Cleanup
e-kayrakli Sep 22, 2023
38dc53f
Drop additional makefile logic
e-kayrakli Oct 7, 2023
3417301
Finish implementing basic reductions
e-kayrakli Oct 7, 2023
17118b3
Add test
e-kayrakli Oct 7, 2023
b5e689b
Add minloc and maxloc reduces
e-kayrakli Oct 9, 2023
f036931
Refactor basic reduce runtime interface, too
e-kayrakli Oct 9, 2023
41e30a9
Move the actual reduction functions to the GPU module
e-kayrakli Oct 9, 2023
8d0e3e0
Simplify runtime macros
e-kayrakli Oct 9, 2023
c56df6b
Start separating impl implementations
e-kayrakli Oct 13, 2023
7aa3642
Separate runtime interface from the implementation interface
e-kayrakli Oct 16, 2023
cf1e526
Runtime cleanup
e-kayrakli Oct 16, 2023
1489028
Add a common header
e-kayrakli Oct 16, 2023
60f921c
Blind implementation of reductions on AMD
e-kayrakli Oct 17, 2023
ab9b475
Add a compilerError in the module code
e-kayrakli Oct 17, 2023
5ab3e5e
Test/limit usage on AMD
e-kayrakli Oct 18, 2023
0819e31
Fix an issue, add perf test
e-kayrakli Oct 18, 2023
3a78ab2
Initial attempt to do multi-chunk reduction
e-kayrakli Oct 25, 2023
19ad754
Add a new test and fix an issue exposed by it
e-kayrakli Oct 25, 2023
06be1a8
Add new test
e-kayrakli Oct 25, 2023
d012a9b
Expand test to minloc,maxloc. Fix a bug
e-kayrakli Oct 26, 2023
e85a00c
Make the new functions work with cpu-as-device. Add skipifs
e-kayrakli Oct 26, 2023
186ad54
Add the missing good file
e-kayrakli Oct 26, 2023
4b6953f
Remove a trailing whitespace
e-kayrakli Oct 26, 2023
aca38c4
Revert some of the AMD changes
e-kayrakli Oct 26, 2023
257a6ae
Add the missing execopts
e-kayrakli Oct 26, 2023
9c49498
Remove an include
e-kayrakli Oct 26, 2023
1627385
Relocate tests into a new noAmd directory
e-kayrakli Nov 3, 2023
ab8a348
Add a user facing error message for unknown types and a test to lock …
e-kayrakli Nov 3, 2023
7725662
Add more fall-through otherwises
e-kayrakli Nov 3, 2023
6da77ee
Start adding documentation
e-kayrakli Nov 3, 2023
889ef73
Add one more fallthrough, unify error messages
e-kayrakli Nov 3, 2023
377f97d
Free runtime memory that we were leaking before
e-kayrakli Nov 3, 2023
1675b9e
A big refactor to reduce code duplication significantly
e-kayrakli Nov 3, 2023
dd10162
Fix a bug for non-zero-based arrays, add test
e-kayrakli Nov 3, 2023
7f007b7
Remove trailing whitespaces
e-kayrakli Nov 3, 2023
727f443
More clarifications in doc
e-kayrakli Nov 3, 2023
79ea359
Add a missing space
e-kayrakli Nov 3, 2023
cc39ef7
Add missing commas in AMD runtime
e-kayrakli Nov 6, 2023
00ee967
Move skipif to the parent directory
e-kayrakli Nov 6, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
151 changes: 151 additions & 0 deletions modules/standard/GPU.chpl
Original file line number Diff line number Diff line change
Expand Up @@ -389,4 +389,155 @@ module GPU
/* When run on a GPU, atomically compare the value in 'x' with 'cmp' and, if
   they are equal, store 'val' in 'x'. Returns the old value of 'x'. */
inline proc gpuAtomicCAS( ref x : ?T, cmp : T, val : T) : T { return gpuAtomicTernOp("CAS", x, cmp, val); }

// ============================
// Reductions
// ============================

@chpldoc.nodoc
config param gpuDebugReduce = false;

private inline proc doGpuReduce(param op: string, const ref A: [] ?t) {
  // Only these reduction kinds have runtime support; reject anything else at
  // compile time.
  if op != "sum" && op != "min" && op != "max" &&
     op != "minloc" && op != "maxloc" {
    compilerError("Unexpected reduction kind in doGpuReduce: ", op);
  }

  if CHPL_GPU == "amd" {
    compilerError("gpu*Reduce functions are not supported on AMD GPUs");
  }
  else if CHPL_GPU == "cpu" {
    // "cpu-as-device" mode: fall back to regular Chapel reductions.
    select op {
      when "sum" do return + reduce A;
      when "min" do return min reduce A;
      when "max" do return max reduce A;
      when "minloc" do return minloc reduce zip (A.domain, A);
      when "maxloc" do return maxloc reduce zip (A.domain, A);
    }
  }
  else {
    compilerAssert(CHPL_GPU=="nvidia");
  }

  // Maps a Chapel numeric type to the C type name embedded in the runtime's
  // reduction function names. NOTE(review): unsupported element types fall
  // through to "unknown", which surfaces later as an unresolved extern symbol
  // rather than a user-facing compile error -- confirm whether a
  // compilerError here would regress cpu-as-device mode before adding one.
  proc chplTypeToCTypeName(type t) param {
    select t {
      when int(8) do return "int8_t";
      when int(16) do return "int16_t";
      when int(32) do return "int32_t";
      when int(64) do return "int64_t";
      when uint(8) do return "uint8_t";
      when uint(16) do return "uint16_t";
      when uint(32) do return "uint32_t";
      when uint(64) do return "uint64_t";
      when real(32) do return "float";
      when real(64) do return "double";
    }
    return "unknown";
  }

  // Name of the runtime entry point implementing 'op' over elements of type
  // 't', e.g. "chpl_gpu_sum_reduce_int64_t".
  proc getExternFuncName(param op: string, type t) param: string {
    return "chpl_gpu_"+op+"_reduce_"+chplTypeToCTypeName(t);
  }

  // Folds one chunk's partial result into the running accumulator for the
  // value-only reductions (sum/min/max).
  inline proc subReduceVal(param op, ref accum: ?t, val: t) {
    select op {
      when "sum" do accum += val;
      when "min" do accum = min(accum, val);
      when "max" do accum = max(accum, val);
    }
  }

  // Folds one chunk's (index, value) partial result into the running result
  // for minloc/maxloc. 'baseOffset' translates the chunk-local index into an
  // offset within the whole array.
  inline proc subReduceValIdx(param op, const baseOffset, ref accum: ?t,
                              val: t) {
    select op {
      when "minloc" do
        if accum[1] > val[1] then accum = (val[0]+baseOffset, val[1]);
      when "maxloc" do
        if accum[1] < val[1] then accum = (val[0]+baseOffset, val[1]);
    }
  }

  // Yields (start, size) chunks covering 0..<size such that each chunk's
  // size stays well within 32 bits, working around CUB's use of 'int' for
  // sizes in its interface.
  iter offsetsThatCanFitIn32Bits(size: int) {
    // Engin: I've tried to get max(int(32)) to work as this bug is about CUB
    // using `int` as the size in the interface. However, getting close to
    // max(int(32)) also triggers the bug. So, I am choosing this as a
    // round/safe value for the time being.
    param chunkSize = 2_000_000_000;

    use Math only divCeil;
    const numChunks = divCeil(size, chunkSize);
    const standardChunkSize = divCeil(size, numChunks);

    if gpuDebugReduce then
      writeln("Will use ", numChunks, " chunks of size ", standardChunkSize);

    foreach chunk in 0..<numChunks {
      const start = chunk*standardChunkSize;
      // The last chunk may be shorter than the standard chunk size.
      const curChunkSize = if start+standardChunkSize <= size
                             then standardChunkSize
                             else size-start;
      if gpuDebugReduce then
        writef("Chunk %i: (start=%i, curChunkSize=%i) ", chunk, start,
               curChunkSize);

      yield (start, curChunkSize);
    }
  }

  use CTypes;

  param externFunc = getExternFuncName(op, t);

  if op == "sum" || op == "min" || op == "max" {
    // Value-only reduction: seed the accumulator with the operation's
    // identity (0 for sum via default init, extreme values for min/max),
    // then reduce chunk by chunk.
    var val: t;
    if op == "min" then
      val = max(t);
    else if op == "max" then
      val = min(t);

    extern externFunc proc reduce_fn(data, size, ref val);

    const basePtr = c_ptrToConst(A);
    for (offset,size) in offsetsThatCanFitIn32Bits(A.size) {
      var curVal: t;
      reduce_fn(basePtr+offset, size, curVal);
      if gpuDebugReduce then
        writef(" (curVal=%i)\n", curVal);
      subReduceVal(op, val, curVal);
    }

    return val;
  }
  else {
    // minloc/maxloc: the runtime hands back a chunk-local int(32) index, so
    // keep the running result as an (index, value) tuple and adjust the
    // index by each chunk's offset.
    var ret: (int, t);
    if op == "minloc" then
      ret[1] = max(t);
    else if op == "maxloc" then
      ret[1] = min(t);

    extern externFunc proc reduce_fn(data, size, ref val, ref idx);

    const basePtr = c_ptrToConst(A);
    for (offset,size) in offsetsThatCanFitIn32Bits(A.size) {
      var curIdx: int(32);
      var curVal: t;
      reduce_fn(basePtr+offset, size, curVal, curIdx);
      subReduceValIdx(op, offset, ret, (curIdx, curVal));
      if gpuDebugReduce then
        writef(" (curIdx=%i curVal=%i ret=%?)\n", curIdx, curVal, ret);
    }

    return ret;
  }
}

/* Returns the sum of the elements of the array 'A', reduced on the GPU. */
inline proc gpuSumReduce(const ref A: [] ?t) do return doGpuReduce("sum", A);

/* Returns the minimum element of the array 'A', reduced on the GPU. */
inline proc gpuMinReduce(const ref A: [] ?t) do return doGpuReduce("min", A);

/* Returns the maximum element of the array 'A', reduced on the GPU. */
inline proc gpuMaxReduce(const ref A: [] ?t) do return doGpuReduce("max", A);

/* Returns an (index, value) tuple for a minimum element of the array 'A',
   reduced on the GPU. NOTE(review): the index is computed as an offset from
   the start of 'A' -- confirm how non-zero-based domains are handled. */
inline proc gpuMinLocReduce(const ref A: [] ?t) do return doGpuReduce("minloc", A);

/* Returns an (index, value) tuple for a maximum element of the array 'A',
   reduced on the GPU. NOTE(review): the index is computed as an offset from
   the start of 'A' -- confirm how non-zero-based domains are handled. */
inline proc gpuMaxLocReduce(const ref A: [] ?t) do return doGpuReduce("maxloc", A);

}
21 changes: 21 additions & 0 deletions runtime/include/chpl-gpu-impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,27 @@ void chpl_gpu_impl_stream_destroy(void* stream);
bool chpl_gpu_impl_stream_ready(void* stream);
void chpl_gpu_impl_stream_synchronize(void* stream);

/* Declares the per-vendor device implementations of the value-only
   reductions (sum/min/max): chpl_gpu_impl_<kind>_reduce_<type>(). */
#define DECL_ONE_REDUCE_IMPL_RET_VAL(chpl_kind, data_type) \
void chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\
                                                    data_type* val,\
                                                    void* stream);

GPU_REDUCE(DECL_ONE_REDUCE_IMPL_RET_VAL, sum)
GPU_REDUCE(DECL_ONE_REDUCE_IMPL_RET_VAL, min)
GPU_REDUCE(DECL_ONE_REDUCE_IMPL_RET_VAL, max)

/* Fix: previously #undef'd DECL_ONE_REDUCE_RET_VAL, a name never defined
   here, leaving the helper macro defined for the rest of the translation
   unit. */
#undef DECL_ONE_REDUCE_IMPL_RET_VAL

/* Declares the per-vendor device implementations of the index-returning
   reductions (minloc/maxloc). */
#define DECL_ONE_REDUCE_IMPL_RET_VAL_IDX(chpl_kind, data_type) \
void chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\
                                                    data_type* val, int* idx,\
                                                    void* stream);

GPU_REDUCE(DECL_ONE_REDUCE_IMPL_RET_VAL_IDX, minloc)
GPU_REDUCE(DECL_ONE_REDUCE_IMPL_RET_VAL_IDX, maxloc)

/* Fix: previously #undef'd the wrong name (DECL_ONE_REDUCE_RET_VAL_IDX). */
#undef DECL_ONE_REDUCE_IMPL_RET_VAL_IDX

#ifdef __cplusplus
}
#endif
Expand Down
20 changes: 20 additions & 0 deletions runtime/include/chpl-gpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#include <stdbool.h>
#include "chpl-tasks.h"
#include "chpl-mem-desc.h"
#include "gpu/chpl-gpu-reduce-util.h"

#ifdef __cplusplus
extern "C" {
Expand Down Expand Up @@ -149,6 +150,25 @@ size_t chpl_gpu_get_alloc_size(void* ptr);
bool chpl_gpu_can_access_peer(int dev1, int dev2);
void chpl_gpu_set_peer_access(int dev1, int dev2, bool enable);

#define DECL_ONE_REDUCE_RET_VAL(chpl_kind, data_type) \
void chpl_gpu_##chpl_kind##_reduce_##data_type(data_type* data, int n,\
data_type* val);

GPU_REDUCE(DECL_ONE_REDUCE_RET_VAL, sum);
GPU_REDUCE(DECL_ONE_REDUCE_RET_VAL, min);
GPU_REDUCE(DECL_ONE_REDUCE_RET_VAL, max);

#undef DECL_ONE_REDUCE_RET_VAL

#define DECL_ONE_REDUCE_RET_VAL_IDX(chpl_kind, data_type) \
void chpl_gpu_##chpl_kind##_reduce_##data_type(data_type* data, int n,\
data_type* val, int* idx);

GPU_REDUCE(DECL_ONE_REDUCE_RET_VAL_IDX, minloc);
GPU_REDUCE(DECL_ONE_REDUCE_RET_VAL_IDX, maxloc);

#undef DECL_ONE_REDUCE_RET_VAL_IDX

#endif // HAS_GPU_LOCALE

#ifdef __cplusplus
Expand Down
47 changes: 47 additions & 0 deletions runtime/include/gpu/chpl-gpu-reduce-util.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
/*
* Copyright 2020-2023 Hewlett Packard Enterprise Development LP
* Copyright 2004-2019 Cray Inc.
* Other additional copyright holders may be indicated within.
*
* The entirety of this work is licensed under the Apache License,
* Version 2.0 (the "License"); you may not use this file except
* in compliance with the License.
*
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifdef HAS_GPU_LOCALE

/* Instantiates MACRO(impl_kind, chpl_kind, type) once for each element type
   supported by the GPU reductions. Used by the per-vendor implementation
   layers. No trailing ';' in the expansion: the per-item macros are
   self-terminating, so baking one in would leave a stray file-scope ';' at
   every call site (not valid ISO C). */
#define GPU_IMPL_REDUCE(MACRO, impl_kind, chpl_kind) \
  MACRO(impl_kind, chpl_kind, int8_t) \
  MACRO(impl_kind, chpl_kind, int16_t) \
  MACRO(impl_kind, chpl_kind, int32_t) \
  MACRO(impl_kind, chpl_kind, int64_t) \
  MACRO(impl_kind, chpl_kind, uint8_t) \
  MACRO(impl_kind, chpl_kind, uint16_t) \
  MACRO(impl_kind, chpl_kind, uint32_t) \
  MACRO(impl_kind, chpl_kind, uint64_t) \
  MACRO(impl_kind, chpl_kind, float) \
  MACRO(impl_kind, chpl_kind, double)

/* Instantiates MACRO(chpl_kind, type) once for each supported element type.
   Used to declare/define the driver-agnostic runtime interface. */
#define GPU_REDUCE(MACRO, chpl_kind) \
  MACRO(chpl_kind, int8_t) \
  MACRO(chpl_kind, int16_t) \
  MACRO(chpl_kind, int32_t) \
  MACRO(chpl_kind, int64_t) \
  MACRO(chpl_kind, uint8_t) \
  MACRO(chpl_kind, uint16_t) \
  MACRO(chpl_kind, uint32_t) \
  MACRO(chpl_kind, uint64_t) \
  MACRO(chpl_kind, float) \
  MACRO(chpl_kind, double)

#endif // HAS_GPU_LOCALE

52 changes: 52 additions & 0 deletions runtime/src/chpl-gpu.c
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@ bool chpl_gpu_use_stream_per_task = true;
#include "chpl-env.h"
#include "chpl-comm-compiler-macros.h"

#include "gpu/chpl-gpu-reduce-util.h"

void chpl_gpu_init(void) {
chpl_gpu_impl_init(&chpl_gpu_num_devices);

Expand Down Expand Up @@ -697,4 +699,54 @@ void chpl_gpu_set_peer_access(int dev1, int dev2, bool enable) {
chpl_gpu_impl_set_peer_access(dev1, dev2, enable);
}

/* Driver-agnostic entry point for the value-only reductions (sum/min/max):
   selects the task's requested device, forwards to the per-vendor
   implementation on that device's stream, and eagerly synchronizes with the
   host when chpl_gpu_sync_with_host is set. */
#define DEF_ONE_REDUCE_RET_VAL(kind, data_type)\
void chpl_gpu_##kind##_reduce_##data_type(data_type *data, int n, \
                                          data_type* val) { \
  CHPL_GPU_DEBUG("chpl_gpu_" #kind "_reduce_" #data_type " called\n"); \
  \
  int dev = chpl_task_getRequestedSubloc(); \
  chpl_gpu_impl_use_device(dev); \
  void* stream = get_stream(dev); \
  \
  chpl_gpu_impl_##kind##_reduce_##data_type(data, n, val, stream); \
  \
  if (chpl_gpu_sync_with_host) { \
    CHPL_GPU_DEBUG("Eagerly synchronizing stream %p\n", stream); \
    wait_stream(stream); \
  } \
  \
  CHPL_GPU_DEBUG("chpl_gpu_" #kind "_reduce_" #data_type " returned\n"); \
}

GPU_REDUCE(DEF_ONE_REDUCE_RET_VAL, sum)
GPU_REDUCE(DEF_ONE_REDUCE_RET_VAL, min)
GPU_REDUCE(DEF_ONE_REDUCE_RET_VAL, max)

#undef DEF_ONE_REDUCE_RET_VAL

/* Same shape as above for the index-returning reductions (minloc/maxloc).
   NOTE(review): 'idx' receives the index reported by the impl layer --
   presumably a 0-based offset into 'data'; confirm against the vendor
   implementations. */
#define DEF_ONE_REDUCE_RET_VAL_IDX(kind, data_type)\
void chpl_gpu_##kind##_reduce_##data_type(data_type *data, int n, \
                                          data_type* val, int* idx) { \
  CHPL_GPU_DEBUG("chpl_gpu_" #kind "_reduce_" #data_type " called\n"); \
  \
  int dev = chpl_task_getRequestedSubloc(); \
  chpl_gpu_impl_use_device(dev); \
  void* stream = get_stream(dev); \
  \
  chpl_gpu_impl_##kind##_reduce_##data_type(data, n, val, idx, stream); \
  \
  if (chpl_gpu_sync_with_host) { \
    CHPL_GPU_DEBUG("Eagerly synchronizing stream %p\n", stream); \
    wait_stream(stream); \
  } \
  \
  CHPL_GPU_DEBUG("chpl_gpu_" #kind "_reduce_" #data_type " returned\n"); \
}

/* No trailing ';' here, for consistency with the sum/min/max instantiations
   above (the macro expansion ends in a function definition's '}'). */
GPU_REDUCE(DEF_ONE_REDUCE_RET_VAL_IDX, minloc)
GPU_REDUCE(DEF_ONE_REDUCE_RET_VAL_IDX, maxloc)

#undef DEF_ONE_REDUCE_RET_VAL_IDX


#endif
1 change: 1 addition & 0 deletions runtime/src/gpu/amd/Makefile.include
Original file line number Diff line number Diff line change
Expand Up @@ -21,5 +21,6 @@ GPU_SUBDIR = src/gpu/amd
GPU_OBJDIR = $(RUNTIME_BUILD)/$(GPU_SUBDIR)

ALL_SRCS += $(CURDIR)/$(GPU_SUBDIR)/*.c
ALL_SRCS += $(CURDIR)/$(GPU_SUBDIR)/*.cc

include $(RUNTIME_ROOT)/$(GPU_SUBDIR)/Makefile.share
12 changes: 9 additions & 3 deletions runtime/src/gpu/amd/Makefile.share
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,14 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
SRCS = gpu-amd.c
GPU_SRCS = gpu-amd-reduce.cc gpu-amd.c

GPU_SRCS = gpu-amd.c
SRCS = $(GPU_SRCS)

GPU_OBJS = $(GPU_SRCS:%.c=$(GPU_OBJDIR)/%.o)
GPU_OBJS = $(addprefix $(GPU_OBJDIR)/,$(addsuffix .o,$(basename $(GPU_SRCS))))

RUNTIME_CXXFLAGS += -x hip

$(RUNTIME_OBJ_DIR)/gpu-amd-reduce.o: gpu-amd-reduce.cc \
$(RUNTIME_OBJ_DIR_STAMP)
$(CXX) -c -std=c++14 $(RUNTIME_CXXFLAGS) $(RUNTIME_INCLS) -o $@ $<
Loading