Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add example calling cuRAND #40

Merged
merged 6 commits into from
Jun 27, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -102,3 +102,6 @@ ENV/

# mypy
.mypy_cache/

# Vim swap files
.*.swp
109 changes: 109 additions & 0 deletions examples/cuda/curand/example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
# Demonstration of calling cuRAND functions from Numba kernels. Shim functions
# in a .cu file are used to access the cuRAND functions from Numba. This is
# based on the cuRAND device API example in:
# https://docs.nvidia.com/cuda/curand/device-api-overview.html#device-api-example
#
# The result produced by this example agrees with the documentation example.
# E.g. on a particular configuration:
#
# Output from this example: 0.4999931156635
# Output from cuRAND documentation example: 0.4999931156635
#
# Note that this example requires an installation of the CUDA toolkit 11.0 or
# later, because NVRTC will use the include files from the installed CUDA
# toolkit.

import sys

try:
from cuda import cuda as cuda_driver # noqa: F401
from numba import config
config.CUDA_USE_NVIDIA_BINDING = True
except ImportError:
print("This example requires the NVIDIA CUDA Python Bindings. "
"Please see https://nvidia.github.io/cuda-python/install.html for "
"installation instructions.")
sys.exit(1)

from numba import cuda
from numba_curand import (curand_init, curand, curand_state_arg_handler,
CurandStates)
import numpy as np

# Various parameters

threads = 64
blocks = 64
nthreads = blocks * threads

sample_count = 10000
repetitions = 50


# State initialization kernel

@cuda.jit(link=['shim.cu'], extensions=[curand_state_arg_handler])
def setup(states):
i = cuda.grid(1)
curand_init(1234, i, 0, states, i)


# Random sampling kernel - computes the fraction of numbers with low bits set
# from a random distribution.

@cuda.jit(link=['shim.cu'], extensions=[curand_state_arg_handler])
def count_low_bits_native(states, sample_count, results):
i = cuda.grid(1)
count = 0

# Copy state to local memory
# XXX: TBC

# Generate pseudo-random numbers
for sample in range(sample_count):
x = curand(states, i)

# Check if low bit set
if(x & 1):
count += 1

# Copy state back to global memory
# XXX: TBC

# Store results
results[i] += count


# Create state on the device. The CUDA Array Interface provides a convenient
# way to get the pointer needed for the shim functions.

# Initialise cuRAND state

states = CurandStates(nthreads)
setup[blocks, threads](states)

# Run random sampling kernel

results = cuda.to_device(np.zeros(nthreads, dtype=np.int32))

for i in range(repetitions):
count_low_bits_native[blocks, threads](
states, sample_count, results)


# Collect the results and summarize them. This could have been done on
# device, but the corresponding CUDA C++ sample does it on the host, and
# we're following that example.

host_results = results.copy_to_host()

total = 0
for i in range(nthreads):
total += host_results[i]

# Use float32 to show an exact match between this and the cuRAND
# documentation example
fraction = (np.float32(total) /
np.float32(nthreads * sample_count * repetitions))

print(f"Fraction with low bit set was {fraction:17.13f}")
118 changes: 118 additions & 0 deletions examples/cuda/curand/numba_curand.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
# Numba extension for cuRAND functions. Presently only implements:
#
# - curand_init()
# - curand()
#
# This is enough for a proof-of-concept embedded calls to cuRAND functions in
# Numba kernels.

from numba import cuda, types
from numba.core.extending import models, register_model, typeof_impl

import numpy as np


# cuRAND state type as a NumPy dtype - this mirrors the state defined in
# curand_kernel.h. Can be used to inspect the state through the device array
# held by CurandStates.

state_fields = [
('d', np.int32),
('v', np.int32, 5),
('boxmuller_flag', np.int32),
('boxmuller_flag_double', np.int32),
('boxmuller_extra', np.float32),
('boxmuller_extra_double', np.float64),
]

curandState = np.dtype(state_fields, align=True)


# Hold an array of cuRAND states - somewhat analagous to a curandState* in
# C/C++.

class CurandStates:
def __init__(self, n):
self._array = cuda.device_array(n, dtype=curandState)

@property
def data(self):
return self._array.__cuda_array_interface__['data'][0]


# Numba typing for cuRAND state.

class CurandState(types.Type):
def __init__(self):
super().__init__(name='CurandState')


curand_state = CurandState()


class CurandStatePointer(types.Type):
def __init__(self):
self.dtype = curand_state
super().__init__(name='CurandState*')


curand_state_pointer = CurandStatePointer()


@typeof_impl.register(CurandStates)
def typeof_curand_states(val, c):
return curand_state_pointer


# The CurandState model mirrors the C/C++ structure, and the state pointer
# represented similarly to other pointers.

@register_model(CurandState)
class curand_state_model(models.StructModel):
def __init__(self, dmm, fe_type):
members = [
('d', types.int32),
('v', types.UniTuple(types.int32, 5)),
('boxmuller_flag', types.int32),
('boxmuller_flag_double', types.int32),
('boxmuller_extra', types.float32),
('boxmuller_extra_double', types.float64),
]
super().__init__(dmm, fe_type, members)


register_model(CurandStatePointer)(models.PointerModel)


# Numba forward declarations of cuRAND functions. These call shim functions
# prepended with _numba, that simply forward arguments to the named cuRAND
# function.

curand_init_sig = types.void(
types.uint64,
types.uint64,
types.uint64,
curand_state_pointer,
types.uint64
)

curand_init = cuda.declare_device('_numba_curand_init', curand_init_sig)
curand = cuda.declare_device('_numba_curand',
types.uint32(curand_state_pointer, types.uint64))


# Argument handling. When a CurandStatePointer is passed into a kernel, we
# really only need to pass the pointer to the data, not the whole underlying
# array structure. Our handler here transforms these arguments into a uint64
# holding the pointer.

class CurandStateArgHandler:
def prepare_args(self, ty, val, **kwargs):
if isinstance(val, CurandStates):
assert ty == curand_state_pointer
return types.uint64, val.data
else:
return ty, val


curand_state_arg_handler = CurandStateArgHandler()
37 changes: 37 additions & 0 deletions examples/cuda/curand/shim.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
// Shim functions for calling cuRAND from Numba functions.
//
// Numba's ABI expects that:
//
// - The return value is used to indicate whether a Python exception occurred
// during function execution. This does not happen in C/C++ kernels, so we
// always return 0.
// - The result returned to Numba is passed as a pointer in the first parameter.
// For void functions (such as curand_init()), a parameter is passed, but is
// unused.

#include <curand_kernel.h>

extern "C"
__device__ int _numba_curand_init(
int* numba_return_value,
unsigned long long seed,
unsigned long long sequence,
unsigned long long offset,
curandState *state,
unsigned long long index)
{
curand_init(seed, sequence, offset, &state[index]);

return 0;
}

extern "C"
__device__ unsigned int _numba_curand(
int* numba_return_value,
curandState *states,
unsigned long long index)
{
*numba_return_value = curand(&states[index]);

return 0;
}