Skip to content

Commit

Permalink
[SYCL] Host task implementation (#1471)
Browse files Browse the repository at this point in the history
This patch is part 1 in a series of patches for host-interop-task proposal by
Codeplay. See the proposal at [1].

This patch implements:
 - host-task execution mechanism;
 - enqueue of host-task without interop_handle argument;
 - spin-lock to await for host and synchronous device events completion.

This patch reimplements glue/connection of events within different contexts to
eliminate use of event callback in favor of host-task.

Host-task execution mechanism involves:
 - thread-pool for queue to execute host-task's user lambda in;
 - explicit call to event_impl::setComplete() for host events and device-side
   synchronous events;
 - helper class DispatchHostTask which wraps call to host-task's user lambda.

Thread pool's size is set via `SYCL_QUEUE_THREAD_POOL_SIZE` environment variable
and defaults to 1.

Even though host-task is enqueued to device queue it'll be executed on the
default host queue.
Host-task is represented via distinct ExecCGCommand paired with EmptyCommand.
Any other command, which depends on host-task will really depend on it's
EmptyCommand. The EmptyCommand is in blocked state initially.

Class DispatchHostTask awaits for host-task's dependency events, then calls to
host-task's user lambda and unblocks any dependent commands via unblocking it's
EmptyCommand and enqueueing of leaves for requirements (i.e. host accessors
required for execution of this host-task).

[1] https://github.com/codeplaysoftware/standards-proposals/blob/master/host_task/host_task.md
  • Loading branch information
s-kanaev authored May 20, 2020
1 parent 075361e commit ae3fd5c
Show file tree
Hide file tree
Showing 23 changed files with 1,070 additions and 135 deletions.
1 change: 1 addition & 0 deletions sycl/doc/EnvironmentVariables.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ subject to change. Do not rely on these variables in production code.
| SYCL_THROW_ON_BLOCK | Any(\*) | Throw an exception on attempt to wait for a blocked command. |
| SYCL_DEVICELIB_INHIBIT_NATIVE | String of device library extensions (separated by a whitespace) | Do not rely on device native support for devicelib extensions listed in this option. |
| SYCL_DEVICE_ALLOWLIST | A list of devices and their minimum driver version following the pattern: DeviceName:{{XXX}},DriverVersion:{{X.Y.Z.W}}. Also may contain PlatformName and PlatformVersion | Filter out devices that do not match the pattern specified. Regular expression can be passed and the DPC++ runtime will select only those devices which satisfy the regex. |
| SYCL_QUEUE_THREAD_POOL_SIZE | Positive integer | Number of threads in thread pool of queue. |
`(*) Note: Any means this environment variable is effective when set to any non-null value.`

### SYCL_PRINT_EXECUTION_GRAPH Options
Expand Down
8 changes: 4 additions & 4 deletions sycl/include/CL/sycl/accessor.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -778,7 +778,7 @@ class accessor :
template <int Dims = Dimensions, typename AllocatorT,
typename = typename detail::enable_if_t<
(Dims == 0) &&
(!IsPlaceH && (IsGlobalBuf || IsConstantBuf))>
(!IsPlaceH && (IsGlobalBuf || IsConstantBuf || IsHostBuf))>
>
accessor(buffer<DataT,1,AllocatorT> &BufferRef,
handler &CommandGroupHandler)
Expand Down Expand Up @@ -817,9 +817,9 @@ class accessor :
#endif

template <int Dims = Dimensions, typename AllocatorT,
typename = detail::enable_if_t<(Dims > 0) && (Dims == Dimensions) &&
(!IsPlaceH &&
(IsGlobalBuf || IsConstantBuf))>>
typename = detail::enable_if_t<
(Dims > 0) && (Dims == Dimensions) &&
(!IsPlaceH && (IsGlobalBuf || IsConstantBuf || IsHostBuf))>>
accessor(buffer<DataT, Dims, AllocatorT> &BufferRef,
handler &CommandGroupHandler)
#ifdef __SYCL_DEVICE_ONLY__
Expand Down
31 changes: 30 additions & 1 deletion sycl/include/CL/sycl/detail/cg.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,16 @@ class InteropTask {
void call(cl::sycl::interop_handler &h) { MFunc(h); }
};

class HostTask {
std::function<void()> MHostTask;

public:
HostTask() : MHostTask([]() {}) {}
HostTask(std::function<void()> &&Func) : MHostTask(Func) {}

void call() { MHostTask(); }
};

// Class which stores specific lambda object.
template <class KernelType, class KernelArgType, int Dims>
class HostKernel : public HostKernelBase {
Expand Down Expand Up @@ -391,7 +401,8 @@ class CG {
COPY_USM,
FILL_USM,
PREFETCH_USM,
INTEROP_TASK_CODEPLAY
CODEPLAY_INTEROP_TASK,
CODEPLAY_HOST_TASK
};

CG(CGTYPE Type, vector_class<vector_class<char>> ArgsStorage,
Expand Down Expand Up @@ -631,6 +642,24 @@ class CGInteropTask : public CG {
MInteropTask(std::move(InteropTask)) {}
};

class CGHostTask : public CG {
public:
std::unique_ptr<HostTask> MHostTask;
vector_class<ArgDesc> MArgs;

CGHostTask(std::unique_ptr<HostTask> HostTask, vector_class<ArgDesc> Args,
std::vector<std::vector<char>> ArgsStorage,
std::vector<detail::AccessorImplPtr> AccStorage,
std::vector<std::shared_ptr<const void>> SharedPtrStorage,
std::vector<Requirement *> Requirements,
std::vector<detail::EventImplPtr> Events, CGTYPE Type,
detail::code_location loc = {})
: CG(Type, std::move(ArgsStorage), std::move(AccStorage),
std::move(SharedPtrStorage), std::move(Requirements),
std::move(Events), std::move(loc)),
MHostTask(std::move(HostTask)), MArgs(std::move(Args)) {}
};

} // namespace detail
} // namespace sycl
} // __SYCL_INLINE_NAMESPACE(cl)
39 changes: 38 additions & 1 deletion sycl/include/CL/sycl/handler.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,27 @@ template <typename Type> struct get_kernel_name_t<detail::auto_name, Type> {
using name = Type;
};

template <typename, typename T> struct check_fn_signature {
static_assert(std::integral_constant<T, false>::value,
"Second template parameter is required to be of function type");
};

template <typename F, typename RetT, typename... Args>
struct check_fn_signature<F, RetT(Args...)> {
private:
template <typename T>
static constexpr auto check(T *) -> typename std::is_same<
decltype(std::declval<T>().operator()(std::declval<Args>()...)),
RetT>::type;

template <typename> static constexpr std::false_type check(...);

using type = decltype(check<F>(0));

public:
static constexpr bool value = type::value;
};

__SYCL_EXPORT device getDeviceFromHandler(handler &);

} // namespace detail
Expand Down Expand Up @@ -789,6 +810,20 @@ class __SYCL_EXPORT handler {
MCGType = detail::CG::RUN_ON_HOST_INTEL;
}

template <typename FuncT>
typename std::enable_if<detail::check_fn_signature<
typename std::remove_reference<FuncT>::type, void()>::value>::type
codeplay_host_task(FuncT Func) {
throwIfActionIsCreated();

MNDRDesc.set(range<1>(1));
MArgs = std::move(MAssociatedAccesors);

MHostTask.reset(new detail::HostTask(std::move(Func)));

MCGType = detail::CG::CODEPLAY_HOST_TASK;
}

/// Defines and invokes a SYCL kernel function for the specified range and
/// offset.
///
Expand Down Expand Up @@ -1140,7 +1175,7 @@ class __SYCL_EXPORT handler {
template <typename FuncT> void interop_task(FuncT Func) {

MInteropTask.reset(new detail::InteropTask(std::move(Func)));
MCGType = detail::CG::INTEROP_TASK_CODEPLAY;
MCGType = detail::CG::CODEPLAY_INTEROP_TASK;
}

/// Defines and invokes a SYCL kernel function for the specified range.
Expand Down Expand Up @@ -1598,6 +1633,8 @@ class __SYCL_EXPORT handler {
vector_class<char> MPattern;
/// Storage for a lambda or function object.
unique_ptr_class<detail::HostKernelBase> MHostKernel;
/// Storage for lambda/function when using HostTask
unique_ptr_class<detail::HostTask> MHostTask;
detail::OSModuleHandle MOSModuleHandle;
// Storage for a lambda or function when using InteropTasks
std::unique_ptr<detail::InteropTask> MInteropTask;
Expand Down
49 changes: 40 additions & 9 deletions sycl/source/detail/event_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,11 +50,31 @@ event_impl::~event_impl() {
}

void event_impl::waitInternal() const {
if (!MHostEvent) {
if (!MHostEvent && MEvent) {
getPlugin().call<PiApiKind::piEventsWait>(1, &MEvent);
return;
}
// Waiting of host events is NOP so far as all operations on host device
// are blocking.

while (MState != HES_Complete)
;
}

void event_impl::setComplete() {
if (MHostEvent || !MEvent) {
#ifndef NDEBUG
int Expected = HES_NotComplete;
int Desired = HES_Complete;

bool Succeeded = MState.compare_exchange_strong(Expected, Desired);

assert(Succeeded && "Unexpected state of event");
#else
MState.store(static_cast<int>(HES_Complete));
#endif
return;
}

assert(false && "setComplete is not supported for non-host event");
}

const RT::PiEvent &event_impl::getHandleRef() const { return MEvent; }
Expand All @@ -68,11 +88,15 @@ void event_impl::setContextImpl(const ContextImplPtr &Context) {
MHostEvent = Context->is_host();
MOpenCLInterop = !MHostEvent;
MContext = Context;

MState = HES_NotComplete;
}

event_impl::event_impl() : MState(HES_Complete) {}

event_impl::event_impl(RT::PiEvent Event, const context &SyclContext)
: MEvent(Event), MContext(detail::getSyclObjImpl(SyclContext)),
MOpenCLInterop(true), MHostEvent(false) {
MOpenCLInterop(true), MHostEvent(false), MState(HES_Complete) {

if (MContext->is_host()) {
throw cl::sycl::invalid_parameter_error(
Expand All @@ -96,12 +120,19 @@ event_impl::event_impl(RT::PiEvent Event, const context &SyclContext)
}

event_impl::event_impl(QueueImplPtr Queue) : MQueue(Queue) {
if (Queue->is_host() &&
Queue->has_property<property::queue::enable_profiling>()) {
MHostProfilingInfo.reset(new HostProfilingInfo());
if (!MHostProfilingInfo)
throw runtime_error("Out of host memory", PI_OUT_OF_HOST_MEMORY);
if (Queue->is_host()) {
MState.store(HES_NotComplete);

if (Queue->has_property<property::queue::enable_profiling>()) {
MHostProfilingInfo.reset(new HostProfilingInfo());
if (!MHostProfilingInfo)
throw runtime_error("Out of host memory", PI_OUT_OF_HOST_MEMORY);
}

return;
}

MState.store(HES_Complete);
}

void *event_impl::instrumentationProlog(string_class &Name, int32_t StreamID,
Expand Down
10 changes: 9 additions & 1 deletion sycl/source/detail/event_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
#include <CL/sycl/info/info_desc.hpp>
#include <CL/sycl/stl.hpp>

#include <atomic>
#include <cassert>

__SYCL_INLINE_NAMESPACE(cl) {
Expand All @@ -32,7 +33,7 @@ class event_impl {
/// Constructs a ready SYCL event.
///
/// If the constructed SYCL event is waited on it will complete immediately.
event_impl() = default;
event_impl();
/// Constructs an event instance from a plug-in event handle.
///
/// The SyclContext must match the plug-in context associated with the
Expand Down Expand Up @@ -166,6 +167,13 @@ class event_impl {
bool MHostEvent = true;
std::unique_ptr<HostProfilingInfo> MHostProfilingInfo;
void *MCommand = nullptr;

enum HostEventState : int { HES_NotComplete = 0, HES_Complete };

// State of host event. Employed only for host events and event with no
// backend's representation (e.g. alloca). Used values are listed in
// HostEventState enum.
std::atomic<int> MState;
};

} // namespace detail
Expand Down
24 changes: 24 additions & 0 deletions sycl/source/detail/queue_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,30 @@ void queue_impl::wait(const detail::code_location &CodeLoc) {
#endif
}

void queue_impl::initHostTaskAndEventCallbackThreadPool() {
if (MHostTaskThreadPool)
return;

int Size = 1;

if (const char *val = std::getenv("SYCL_QUEUE_THREAD_POOL_SIZE"))
try {
Size = std::stoi(val);
} catch (...) {
throw invalid_parameter_error(
"Invalid value for SYCL_QUEUE_THREAD_POOL_SIZE environment variable",
PI_INVALID_VALUE);
}

if (Size < 1)
throw invalid_parameter_error(
"Invalid value for SYCL_QUEUE_THREAD_POOL_SIZE environment variable",
PI_INVALID_VALUE);

MHostTaskThreadPool.reset(new ThreadPool(Size));
MHostTaskThreadPool->start();
}

pi_native_handle queue_impl::getNative() const {
auto Plugin = getPlugin();
pi_native_handle Handle;
Expand Down
14 changes: 14 additions & 0 deletions sycl/source/detail/queue_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
#include <detail/event_impl.hpp>
#include <detail/plugin.hpp>
#include <detail/scheduler/scheduler.hpp>
#include <detail/thread_pool.hpp>

__SYCL_INLINE_NAMESPACE(cl) {
namespace sycl {
Expand Down Expand Up @@ -348,6 +349,13 @@ class queue_impl {
MExceptions.PushBack(ExceptionPtr);
}

ThreadPool &getThreadPool() {
if (!MHostTaskThreadPool)
initHostTaskAndEventCallbackThreadPool();

return *MHostTaskThreadPool;
}

/// Gets the native handle of the SYCL queue.
///
/// \return a native handle.
Expand Down Expand Up @@ -380,6 +388,8 @@ class queue_impl {
void instrumentationEpilog(void *TelementryEvent, string_class &Name,
int32_t StreamID, uint64_t IId);

void initHostTaskAndEventCallbackThreadPool();

/// Stores a USM operation event that should be associated with the queue
///
/// \param Event is the event to be stored
Expand Down Expand Up @@ -414,6 +424,10 @@ class queue_impl {
const bool MOpenCLInterop = false;
// Assume OOO support by default.
bool MSupportOOO = true;

// Thread pool for host task and event callbacks execution.
// The thread pool is instantiated upon the very first call to getThreadPool()
std::unique_ptr<ThreadPool> MHostTaskThreadPool;
};

} // namespace detail
Expand Down
Loading

0 comments on commit ae3fd5c

Please sign in to comment.