[SYCL][CUDA] Minor fixes required to run BabelStream benchmarks on CUDA #1543

Closed. Wants to merge 13 commits.
18 changes: 16 additions & 2 deletions sycl/plugins/cuda/pi_cuda.cpp
@@ -684,7 +684,7 @@ pi_result cuda_piPlatformGetInfo(pi_platform platform,
switch (param_name) {
case PI_PLATFORM_INFO_NAME:
return getInfo(param_value_size, param_value, param_value_size_ret,
"NVIDIA CUDA");
"NVIDIA CUDA BACKEND");
case PI_PLATFORM_INFO_VENDOR:
return getInfo(param_value_size, param_value, param_value_size_ret,
"NVIDIA Corporation");
@@ -3359,6 +3359,13 @@ pi_result cuda_piEnqueueMemBufferMap(pi_queue command_queue, pi_mem buffer,
ret_err = cuda_piEnqueueMemBufferRead(
command_queue, buffer, blocking_map, offset, size, hostPtr,
num_events_in_wait_list, event_wait_list, retEvent);
} else {
if (retEvent) {
auto new_event =
_pi_event::make_native(PI_COMMAND_TYPE_MEM_BUFFER_MAP, command_queue);
new_event->record();
*retEvent = new_event;
}
}

return ret_err;
@@ -3372,7 +3379,7 @@ pi_result cuda_piEnqueueMemUnmap(pi_queue command_queue, pi_mem memobj,
pi_uint32 num_events_in_wait_list,
const pi_event *event_wait_list,
pi_event *retEvent) {
pi_result ret_err = PI_INVALID_OPERATION;
pi_result ret_err = PI_SUCCESS;

assert(mapped_ptr != nullptr);
assert(memobj != nullptr);
@@ -3385,6 +3392,13 @@ pi_result cuda_piEnqueueMemUnmap(pi_queue command_queue, pi_mem memobj,
command_queue, memobj, true, memobj->get_map_offset(mapped_ptr),
memobj->get_size(), mapped_ptr, num_events_in_wait_list, event_wait_list,
retEvent);
} else {
if (retEvent) {
auto new_event = _pi_event::make_native(PI_COMMAND_TYPE_MEM_BUFFER_UNMAP,
command_queue);
new_event->record();
*retEvent = new_event;
}
}

memobj->unmap(mapped_ptr);
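For context, these map/unmap paths are exercised whenever the runtime copies a buffer back for host access. A minimal sketch (not part of this PR; kernel, names and sizes chosen arbitrarily, and it assumes some SYCL device is available) of the kind of host read-back, e.g. BabelStream-style result verification, that ends up in cuda_piEnqueueMemBufferMap/Unmap:

```cpp
#include <CL/sycl.hpp>

#include <vector>

int main() {
  std::vector<float> data(1024, 1.0f);
  {
    cl::sycl::queue q;
    cl::sycl::buffer<float, 1> buf(data.data(),
                                   cl::sycl::range<1>(data.size()));
    q.submit([&](cl::sycl::handler &cgh) {
      auto acc = buf.get_access<cl::sycl::access::mode::read_write>(cgh);
      cgh.parallel_for<class scale>(cl::sycl::range<1>(data.size()),
                                    [=](cl::sycl::id<1> i) { acc[i] *= 2.0f; });
    });
    // Reading the data on the host forces the runtime to map the buffer back.
    // On the CUDA plugin this goes through cuda_piEnqueueMemBufferMap/Unmap,
    // which now always produce a valid event when the caller asks for one.
    auto host_acc = buf.get_access<cl::sycl::access::mode::read>();
    (void)host_acc;
  }
  return 0;
}
```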
8 changes: 8 additions & 0 deletions sycl/plugins/opencl/pi_opencl.cpp
@@ -194,6 +194,14 @@ pi_result OCL(piDevicesGet)(pi_platform platform, pi_device_type device_type,
*num_devices = 0;
result = PI_SUCCESS;
}

// Absorb the CL_INVALID_DEVICE_TYPE error when the device type is
// not supported in some platforms and just return 0 in num_devices
if (result == CL_INVALID_DEVICE_TYPE) {
assert(num_devices != 0);
*num_devices = 0;
result = PI_SUCCESS;
}
return cast<pi_result>(result);
}

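The change above follows the same idea as the existing CL_DEVICE_NOT_FOUND handling just before it: a device-type query that a platform cannot satisfy is reported as zero devices rather than an error. A small sketch of that calling convention (our illustration, not code from the PR):

```cpp
#include <CL/cl.h>

// Count GPU devices on a platform, treating "no devices of this type" style
// errors as zero devices instead of a hard failure.
cl_uint countGpuDevices(cl_platform_id platform) {
  cl_uint numDevices = 0;
  cl_int err =
      clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, nullptr, &numDevices);
  if (err == CL_DEVICE_NOT_FOUND || err == CL_INVALID_DEVICE_TYPE)
    return 0; // Absorb, as the plugin change above does.
  return (err == CL_SUCCESS) ? numDevices : 0;
}
```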
2 changes: 1 addition & 1 deletion sycl/source/detail/context_impl.cpp
@@ -41,7 +41,7 @@ context_impl::context_impl(const vector_class<cl::sycl::device> Devices,
DeviceIds.push_back(getSyclObjImpl(D)->getHandleRef());
}

if (MPlatform->is_cuda()) {
if (MPlatform->getPlugin().getBackend() == backend::cuda) {
#if USE_PI_CUDA
const pi_context_properties props[] = {PI_CONTEXT_PROPERTIES_CUDA_PRIMARY,
UseCUDAPrimaryContext, 0};
38 changes: 38 additions & 0 deletions sycl/source/detail/platform_impl.cpp
@@ -196,6 +196,40 @@ static void filterAllowList(vector_class<RT::PiDevice> &PiDevices,
PiDevices.resize(InsertIDx);
}

// @return True if the device is invalid for the current backend preferences
static bool isDeviceInvalidForBe(const device &Device) {

if (Device.is_host())
return false;

// Retrieve Platform version to identify CUDA OpenCL platform
// String: OpenCL 1.2 CUDA <version>
const platform platform = Device.get_info<info::device::platform>();
const std::string platformVersion =
platform.get_info<info::platform::version>();
const bool HasOpenCL = (platformVersion.find("OpenCL") != std::string::npos);
const bool HasCUDA = (platformVersion.find("CUDA") != std::string::npos);

backend *PrefBackend = detail::SYCLConfig<detail::SYCL_BE>::get();
Contributor:

Suggested change:
backend *PrefBackend = detail::SYCLConfig<detail::SYCL_BE>::get();

Let's move this line to after rejecting the NVIDIA OpenCL implementation.

auto DeviceBackend = detail::getSyclObjImpl(Device)->getPlugin().getBackend();

// Reject the NVIDIA OpenCL implementation
if (DeviceBackend == backend::opencl && HasCUDA && HasOpenCL)
return true;

Contributor @bader, May 7, 2020:

Suggested change (i.e. move the line down to here):
backend *PrefBackend = detail::SYCLConfig<detail::SYCL_BE>::get();

// If no preference, assume OpenCL and reject CUDA
if (DeviceBackend == backend::cuda && !PrefBackend) {
return true;
} else if (!PrefBackend)
return false;

// If using PI_OPENCL, reject the CUDA backend
if (DeviceBackend == backend::cuda && *PrefBackend == backend::opencl)
return true;

return false;
}
Comment on lines +220 to +231
Contributor, suggested change (collapse the branches above into a single condition):

// If no preference, assume OpenCL and reject CUDA
if (DeviceBackend == backend::cuda && (!PrefBackend || *PrefBackend == backend::opencl))
return true;
return false;
}

Contributor:

Minor fixes to CUDA backend to get BabelStream (https://github.com/UoB-HPC/BabelStream) running, each one in an individual commit:

  • Fixed: default selection for CUDA devices in presence of the NVIDIA OpenCL platform
  • Fixed: Missing return event on Unmap
  • Throw error in queue constructor when backend is not correct (currently segfaults)

From the PR description, this piece of code seems to be "not required to run BabelStream benchmarks on CUDA" and removing the NVIDIA OpenCL implementation is enough. Right?

If so, @Ruyk, can we move this part to a separate PR to make @hiaselhans happy?

Contributor Author:

Sure, I'll split out the changes.

Contributor Author:

> "not required to run BabelStream benchmarks on CUDA" and removing the NVIDIA OpenCL implementation is enough. Right?

Actually, BabelStream doesn't use a device selector but a device list, so the NVIDIA OpenCL platform still appears. There are other workarounds I can do, so not a problem.

Contributor:

> If so, @Ruyk, can we move this part to a separate PR to make @hiaselhans happy?

Thanks @bader 😄
Well, maybe it's me being a dickhead about this.

// If SYCL_BE is set then skip platforms which doesn't have specified

Just saying we're re-implementing things already implemented somewhere else...


vector_class<device>
platform_impl::get_devices(info::device_type DeviceType) const {
vector_class<device> Res;
@@ -211,6 +245,7 @@ platform_impl::get_devices(info::device_type DeviceType) const {

pi_uint32 NumDevices;
const detail::plugin &Plugin = getPlugin();

Plugin.call<PiApiKind::piDevicesGet>(
MPlatform, pi::cast<RT::PiDeviceType>(DeviceType), 0,
pi::cast<RT::PiDevice *>(nullptr), &NumDevices);
@@ -235,6 +270,9 @@ platform_impl::get_devices(info::device_type DeviceType) const {
PiDevice, std::make_shared<platform_impl>(*this)));
});

Res.erase(std::remove_if(Res.begin(), Res.end(), isDeviceInvalidForBe),
Res.end());

return Res;
}

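For context (our sketch, not from the PR): BabelStream enumerates devices from a list rather than through a device selector, which is why the duplicate NVIDIA OpenCL entry was visible to it. The filtering added to get_devices above affects exactly this kind of enumeration, assuming a system where both the CUDA backend and the NVIDIA OpenCL platform are installed:

```cpp
#include <CL/sycl.hpp>

#include <iostream>

int main() {
  // Enumerate every platform and device the runtime exposes. Without the
  // filtering above, the same NVIDIA GPU could be listed twice: once through
  // the CUDA backend and once through the NVIDIA OpenCL platform.
  for (const auto &plt : cl::sycl::platform::get_platforms()) {
    std::cout << plt.get_info<cl::sycl::info::platform::name>() << "\n";
    for (const auto &dev : plt.get_devices()) {
      std::cout << "  " << dev.get_info<cl::sycl::info::device::name>()
                << "\n";
    }
  }
  return 0;
}
```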
8 changes: 0 additions & 8 deletions sycl/source/detail/platform_impl.hpp
@@ -73,14 +73,6 @@ class platform_impl {
/// \return true if this SYCL platform is a host platform.
bool is_host() const { return MHostPlatform; };

bool is_cuda() const {
const string_class CUDA_PLATFORM_STRING = "NVIDIA CUDA";
const string_class PlatformName =
get_platform_info<string_class, info::platform::name>::get(MPlatform,
getPlugin());
return PlatformName == CUDA_PLATFORM_STRING;
}

/// \return an instance of OpenCL cl_platform_id.
cl_platform_id get() const {
if (is_host())
11 changes: 11 additions & 0 deletions sycl/source/detail/plugin.hpp
@@ -81,6 +81,17 @@ class plugin {
RT::PiPlugin MPlugin;
const backend MBackend;
}; // class plugin

/// Two plugins are the same if their string is the same.
/// There is no need to check the actual string, just the pointer, since
/// there is only one instance of the PiPlugin struct per backend.
///
/// \ingroup sycl_pi
///
inline bool operator==(const plugin &lhs, const plugin &rhs) {
return (lhs.getBackend() == rhs.getBackend());
}

} // namespace detail
} // namespace sycl
} // __SYCL_INLINE_NAMESPACE(cl)
24 changes: 1 addition & 23 deletions sycl/source/detail/program_manager/program_manager.cpp
@@ -85,29 +85,7 @@ static RT::PiProgram createBinaryProgram(const ContextImplPtr Context,

RT::PiProgram Program;

bool IsCUDA = false;

// TODO: Implement `piProgramCreateWithBinary` to not require extra logic for
// the CUDA backend.
#if USE_PI_CUDA
// All devices in a context are from the same platform.
RT::PiDevice Device = getFirstDevice(Context);
RT::PiPlatform Platform = nullptr;
Plugin.call<PiApiKind::piDeviceGetInfo>(Device, PI_DEVICE_INFO_PLATFORM, sizeof(Platform),
&Platform, nullptr);
size_t PlatformNameSize = 0u;
Plugin.call<PiApiKind::piPlatformGetInfo>(Platform, PI_PLATFORM_INFO_NAME, 0u, nullptr,
&PlatformNameSize);
std::vector<char> PlatformName(PlatformNameSize, '\0');
Plugin.call<PiApiKind::piPlatformGetInfo>(Platform, PI_PLATFORM_INFO_NAME,
PlatformName.size(), PlatformName.data(), nullptr);
if (PlatformNameSize > 0u &&
std::strncmp(PlatformName.data(), "NVIDIA CUDA", PlatformNameSize) == 0) {
IsCUDA = true;
}
#endif // USE_PI_CUDA

if (IsCUDA) {
if (Context->getPlugin().getBackend() == backend::cuda) {
// TODO: Replace CreateWithSource with CreateWithBinary in CUDA backend
const char *SignedData = reinterpret_cast<const char *>(Data);
Plugin.call<PiApiKind::piclProgramCreateWithSource>(Context->getHandleRef(), 1 /*one binary*/, &SignedData,
10 changes: 7 additions & 3 deletions sycl/source/detail/queue_impl.hpp
@@ -69,14 +69,16 @@ class queue_impl {
: MDevice(Device), MContext(Context), MAsyncHandler(AsyncHandler),
MPropList(PropList), MHostQueue(MDevice->is_host()),
MOpenCLInterop(!MHostQueue) {
if (!MHostQueue) {
MCommandQueue = createQueue(Order);
}

if (!Context->hasDevice(Device))
throw cl::sycl::invalid_parameter_error(
"Queue cannot be constructed with the given context and device "
"as the context does not contain the given device.",
PI_INVALID_DEVICE);

if (!MHostQueue) {
MCommandQueue = createQueue(Order);
}
}

/// Constructs a SYCL queue from plugin interoperability handle.
@@ -240,6 +242,8 @@ class queue_impl {
RT::PiContext Context = MContext->getHandleRef();
RT::PiDevice Device = MDevice->getHandleRef();
const detail::plugin &Plugin = getPlugin();

assert(Plugin == MDevice->getPlugin());
RT::PiResult Error = Plugin.call_nocheck<PiApiKind::piQueueCreate>(
Context, Device, CreationFlags, &Queue);

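The reordering above reports a context/device mismatch as invalid_parameter_error before any backend queue is created (previously this could segfault, per the PR description). A hedged illustration of that error path; whether this exact pairing is reachable depends on how the queue(context, device_selector) constructor resolves its device, so treat it as a sketch rather than a guaranteed reproducer:

```cpp
#include <CL/sycl.hpp>

#include <iostream>

int main() {
  cl::sycl::device dev{cl::sycl::default_selector{}};
  cl::sycl::context ctx{dev};
  try {
    // Pairing the context with a device it does not contain should now be
    // rejected with invalid_parameter_error before any plugin call is made.
    cl::sycl::queue q{ctx, cl::sycl::host_selector{}};
    (void)q;
  } catch (const cl::sycl::exception &e) {
    std::cout << "Rejected as expected: " << e.what() << "\n";
  }
  return 0;
}
```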
9 changes: 6 additions & 3 deletions sycl/source/device_selector.cpp
@@ -11,6 +11,7 @@
#include <CL/sycl/device_selector.hpp>
#include <CL/sycl/exception.hpp>
#include <CL/sycl/stl.hpp>
#include <detail/config.hpp>
#include <detail/device_impl.hpp>
#include <detail/force_device.hpp>
// 4.6.1 Device selection class
@@ -52,7 +53,8 @@ device device_selector::select_device() const {
// preference to the device of the preferred BE.
//
if (score < dev_score ||
(score == dev_score && isDeviceOfPreferredSyclBe(dev))) {
(score == dev_score && isDeviceOfPreferredSyclBe(dev) &&
dev_score != -1)) {
res = &dev;
score = dev_score;
}
@@ -78,9 +80,7 @@ device device_selector::select_device() const {
}

int default_selector::operator()(const device &dev) const {

int Score = -1;

// Give preference to device of SYCL BE.
if (isDeviceOfPreferredSyclBe(dev))
Score = 50;
@@ -114,8 +114,10 @@ int gpu_selector::operator()(const device &dev) const {

int cpu_selector::operator()(const device &dev) const {
int Score = -1;

if (dev.is_cpu()) {
Score = 1000;

// Give preference to device of SYCL BE.
if (isDeviceOfPreferredSyclBe(dev))
Score += 50;
@@ -125,6 +127,7 @@ int cpu_selector::operator()(const device &dev) const {

int accelerator_selector::operator()(const device &dev) const {
int Score = -1;

if (dev.is_accelerator()) {
Score = 1000;
// Give preference to device of SYCL BE.
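The added `dev_score != -1` guard matters because returning a negative score is how a selector rejects a device; without it, a rejected device could still be kept on the tie-break just for belonging to the preferred SYCL backend. An illustrative selector (our example, assuming a GPU is present so the queue construction succeeds):

```cpp
#include <CL/sycl.hpp>

// A selector that rejects everything except GPUs by returning -1.
class gpu_only_selector : public cl::sycl::device_selector {
public:
  int operator()(const cl::sycl::device &dev) const override {
    return dev.is_gpu() ? 100 : -1;
  }
};

int main() {
  // Before the extra `dev_score != -1` check, a device scored -1 could still
  // be chosen on the tie-break simply because it belonged to the preferred
  // SYCL backend and nothing scored higher.
  cl::sycl::queue q{gpu_only_selector{}};
  (void)q;
  return 0;
}
```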
2 changes: 1 addition & 1 deletion sycl/test/basic_tests/get_nonhost_devices.cpp
@@ -1,4 +1,4 @@
// RUN: %clangxx -fsycl %s -o %t.out
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
// RUN: %t.out

// Check that the host device is not included in devices returned by
3 changes: 0 additions & 3 deletions sycl/test/scheduler/DataMovement.cpp
@@ -1,6 +1,3 @@
// XFAIL: cuda
// TODO: Fix accidential error return when unmapping read-only memory objects.
//
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -I %sycl_source_dir %s -o %t.out -g
// RUN: %CPU_RUN_PLACEHOLDER %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out
13 changes: 13 additions & 0 deletions sycl/tools/get_device_count_by_type.cpp
@@ -8,6 +8,7 @@

#include <CL/cl.h>
#include <CL/cl_ext.h>
#include <cstring>

#ifdef USE_PI_CUDA
#include <cuda.h>
@@ -82,6 +83,18 @@ static bool queryOpenCL(cl_device_type deviceType, cl_uint &deviceCount,
}

for (cl_uint i = 0; i < platformCount; i++) {

const size_t MAX_PLATFORM_VENDOR = 100u;
char info[MAX_PLATFORM_VENDOR];
// get platform attribute value
clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, MAX_PLATFORM_VENDOR,
info, NULL);
auto IsNVIDIAOpenCL = strstr(info, "NVIDIA") != NULL;
if (IsNVIDIAOpenCL) {
// Ignore NVIDIA OpenCL platform for testing
continue;
}

cl_uint deviceCountPart = 0;
iRet =
clGetDeviceIDs(platforms[i], deviceType, 0, nullptr, &deviceCountPart);
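The fixed 100-byte buffer above assumes the vendor string fits and is NUL-terminated. A slightly more defensive variant (our sketch, not part of the PR) queries the size first:

```cpp
#include <CL/cl.h>

#include <string>

// Return the CL_PLATFORM_VENDOR string, or an empty string on error.
static std::string getPlatformVendor(cl_platform_id platform) {
  size_t size = 0;
  if (clGetPlatformInfo(platform, CL_PLATFORM_VENDOR, 0, nullptr, &size) !=
          CL_SUCCESS ||
      size == 0)
    return {};
  std::string vendor(size, '\0');
  if (clGetPlatformInfo(platform, CL_PLATFORM_VENDOR, size, &vendor[0],
                        nullptr) != CL_SUCCESS)
    return {};
  vendor.pop_back(); // drop the trailing NUL included in the reported size
  return vendor;
}
```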