Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add plumbing through ShadingSystem into Context to support Batched Execution #1301

Merged
merged 2 commits into from
Nov 23, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/include/OSL/platform.h
Original file line number Diff line number Diff line change
Expand Up @@ -486,4 +486,6 @@ DataT* assume_aligned(DataT* ptr)

#endif

static constexpr int MaxSupportedSimdLaneCount = 16;

OSL_NAMESPACE_EXIT
2 changes: 0 additions & 2 deletions src/include/OSL/wide.h
Original file line number Diff line number Diff line change
Expand Up @@ -119,8 +119,6 @@ assign_all(Block<DataT, WidthT>&, const DataT&);
// NOTE: additional constructors & helpers functions exist in the implementation
// that were not specified in the descriptions above for brevity.

static constexpr int MaxSupportedSimdLaneCount = 16;

/// Type for an opaque pointer to whatever the renderer uses to represent a
/// coordinate transformation.
typedef const void* TransformationPtr;
Expand Down
5 changes: 5 additions & 0 deletions src/include/osl_pvt.h
Original file line number Diff line number Diff line change
Expand Up @@ -487,6 +487,7 @@ class Symbol {
, m_layer(-1)
, m_scope(0)
, m_dataoffset(-1)
, m_wide_dataoffset(-1)
, m_initializers(0)
, m_node(declaration_node)
, m_alias(NULL)
Expand Down Expand Up @@ -608,6 +609,9 @@ class Symbol {
void dataoffset(int d) { m_dataoffset = d; }
int dataoffset() const { return m_dataoffset; }

void wide_dataoffset(int d) { m_wide_dataoffset = d; }
int wide_dataoffset() const { return m_wide_dataoffset; }

void initializers(int d) { m_initializers = d; }
int initializers() const { return m_initializers; }

Expand Down Expand Up @@ -828,6 +832,7 @@ class Symbol {
short m_layer; ///< Layer (within the group) this belongs to
int m_scope; ///< Scope where this symbol was declared
int m_dataoffset; ///< Offset of the data (-1 for unknown)
int m_wide_dataoffset; ///< Offset of the wide data (-1 for unknown)
int m_initializers; ///< Number of default initializers
ASTNode* m_node; ///< Ptr to the declaration of this symbol
Symbol* m_alias; ///< Another symbol that this is an alias for
Expand Down
269 changes: 244 additions & 25 deletions src/liboslexec/context.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,23 +5,39 @@
#include <vector>
#include <string>
#include <cstdio>
#include <cstdint>

#include <OpenImageIO/sysutil.h>
#include <OpenImageIO/timer.h>
#include <OpenImageIO/thread.h>

#include <OSL/batched_shaderglobals.h>
#include <OSL/mask.h>
#include <OSL/wide.h>

#include "oslexec_pvt.h"

OSL_NAMESPACE_ENTER

static mutex buffered_errors_mutex;


namespace pvt {

template <size_t ByteAlignmentT, typename T>
inline bool is_aligned(const T *pointer)
{
std::uintptr_t ptrAsUint = reinterpret_cast<std::uintptr_t>(pointer);
return (ptrAsUint%ByteAlignmentT==0);
}

} // namespace pvt

ShadingContext::ShadingContext (ShadingSystemImpl &shadingsys,
PerThreadInfo *threadinfo)
: m_shadingsys(shadingsys), m_renderer(m_shadingsys.renderer()),
m_group(NULL), m_max_warnings(shadingsys.max_warnings_per_thread()), m_dictionary(NULL)
m_group(NULL), m_max_warnings(shadingsys.max_warnings_per_thread()),
m_dictionary(NULL), batch_size_executed(0)
{
m_shadingsys.m_stat_contexts += 1;
m_threadinfo = threadinfo ? threadinfo : shadingsys.get_perthread_info ();
Expand All @@ -44,6 +60,7 @@ ShadingContext::execute_init (ShaderGroup &sgroup, ShaderGlobals &ssg, bool run)
{
if (m_group)
execute_cleanup ();
batch_size_executed = 0;
m_group = &sgroup;
m_ticks = 0;

Expand Down Expand Up @@ -186,6 +203,156 @@ ShadingContext::execute (ShaderGroup &sgroup, ShaderGlobals &ssg, bool run)
return result;
}

template<int WidthT>
bool
ShadingContext::Batched<WidthT>::execute_init
(ShaderGroup &sgroup, int batch_size, BatchedShaderGlobals<WidthT> &bsg, bool run)
{
if (context().m_group)
context().execute_cleanup ();

context().batch_size_executed = batch_size;
context().m_group = &sgroup;
context().m_ticks = 0;

// Optimize if we haven't already
if (sgroup.nlayers()) {
sgroup.start_running ();
if (! sgroup.batch_jitted()) {
shadingsys().template batched<WidthT>().jit_group(sgroup, &context());
if (shadingsys().m_greedyjit && shadingsys().m_groups_to_compile_count) {
// If we are greedily JITing, optimize/JIT everything now
shadingsys().template batched<WidthT>().jit_all_groups();
}
}
// To handle layers that were not used but still possibly had
// render outputs, we always generate a run function even for
// do nothing groups, so that a GroupData on the heap gets built
// and the run function can broadcast default values there.
//
// Observation is that nothing ever overwrites that default value
// so we could just run it once, or deal with broadcasting the
// default value ourselves

} else {
// empty shader - nothing to do!
return false;
}

int profile = shadingsys().m_profile;
OIIO::Timer timer (profile ? OIIO::Timer::StartNow : OIIO::Timer::DontStartNow);

// Allocate enough space on the heap
size_t heap_size_needed = sgroup.llvm_groupdata_wide_size();
context().reserve_heap(heap_size_needed);
// Zero out the heap memory we will be using
if (shadingsys().m_clearmemory)
memset (context().m_heap.get(), 0, heap_size_needed);

// Set up closure storage
context().m_closure_pool.clear();

// Clear the message blackboard
context().m_messages.clear ();
// TODO: implement batched_messages
//context().batched_messages(WidthOf<WidthT>()).clear ();

// Clear miscellaneous scratch space
context().m_scratch_pool.clear ();

// Zero out stats for this execution
context().clear_runtime_stats ();

if (run) {
bsg.uniform.context = &context();
bsg.uniform.renderer = context().renderer();
bsg.uniform.Ci = NULL;
RunLLVMGroupFuncWide run_func = sgroup.llvm_compiled_wide_init();
OSL_DASSERT (run_func);
OSL_DASSERT (sgroup.llvm_groupdata_wide_size() <= context().m_heapsize);

if(batch_size > 0) {
Mask<WidthT> run_mask(false);
run_mask.set_count_on(batch_size);

run_func (&bsg, context().m_heap.get(), run_mask.value());
}
}

if (profile)
context().m_ticks += timer.ticks();
return true;
}

template<int WidthT>
bool
ShadingContext::Batched<WidthT>::execute_layer (int batch_size, BatchedShaderGlobals<WidthT> &bsg, int layernumber)
{
if (!group() || group()->nlayers() == 0 || group()->does_nothing() || (context().batch_size_executed != batch_size))
return false;
OSL_DASSERT (bsg.uniform.context == &context() && bsg.uniform.renderer == context().renderer());

int profile = shadingsys().m_profile;
OIIO::Timer timer (profile ? OIIO::Timer::StartNow : OIIO::Timer::DontStartNow);

RunLLVMGroupFuncWide run_func = group()->llvm_compiled_wide_layer (layernumber);
if (! run_func)
return false;

OSL_ASSERT(pvt::is_aligned<64>(&bsg));
OSL_ASSERT(pvt::is_aligned<64>(context().m_heap.get()));

if (batch_size > 0) {
Mask<WidthT> run_mask(false);
run_mask.set_count_on(batch_size);

run_func (&bsg, context().m_heap.get(), run_mask.value());
}

if (profile)
context().m_ticks += timer.ticks();

return true;
}


template<int WidthT>
bool
ShadingContext::Batched<WidthT>::execute(ShaderGroup &sgroup, int batch_size, BatchedShaderGlobals<WidthT> &bsg, bool run)
{
OSL_ASSERT(is_aligned<64>(&bsg));
int n = sgroup.m_exec_repeat;

Block<Vec3,WidthT> Psave, Nsave; // for repeats
bool repeat = (n > 1);
if (repeat) {
// If we're going to repeat more than once, we need to save any
// globals that might get modified.
Psave = bsg.varying.P;
Nsave = bsg.varying.N;
if (! run)
n = 1;
}

bool result = true;
while (1) {
if (! execute_init (sgroup, batch_size, bsg, run))
return false;
if (run && n)
execute_layer (batch_size, bsg, group()->nlayers()-1);
result = context().execute_cleanup ();
if (--n < 1)
break; // done
if (repeat) {
// Going around for another pass... restore things as best as we
// can.
bsg.varying.P = Psave;
bsg.varying.N = Nsave;
bsg.uniform.Ci = NULL;
}
}
return result;
}


void
Expand All @@ -199,11 +366,48 @@ ShadingContext::record_error (ErrorHandler::ErrCode code,
}


// separate declaration from definition of template function
// to ensure noinline is respected
template<typename ErrorsT, typename TestFunctorT>
static OSL_NOINLINE void process_errors_helper (ShadingSystemImpl &shading_sys, const ErrorsT &errors, int startAtError, int endBeforeError, const TestFunctorT & test_func);

// Given array of ErrorItems emit errors within the range startAtError to
// endBeforeError if and only if the test_func passed each ErrorItem's mask
// returns true. This allows the same batch of errors to be processed for
// each data lane separately effectively serializing emission of errors,
// warnings, info, and messages
template<typename ErrorsT, typename TestFunctorT>
void process_errors_helper (ShadingSystemImpl &shading_sys, const ErrorsT &errors, int startAtError, int endBeforeError, const TestFunctorT & test_func)
{
for (int i = startAtError; i < endBeforeError; ++i) {
const auto & error_item = errors[i];
if (test_func(error_item.mask)) {
switch (errors[i].err_code) {
case ErrorHandler::EH_MESSAGE :
case ErrorHandler::EH_DEBUG :
shading_sys.message (error_item.msgString);
break;
case ErrorHandler::EH_INFO :
shading_sys.info (error_item.msgString);
break;
case ErrorHandler::EH_WARNING :
shading_sys.warning (error_item.msgString);
break;
case ErrorHandler::EH_ERROR :
case ErrorHandler::EH_SEVERE :
shading_sys.error (error_item.msgString);
break;
default:
break;
}
}
}
}

void
ShadingContext::process_errors () const
{
size_t nerrors = m_buffered_errors.size();
int nerrors(m_buffered_errors.size());
if (! nerrors)
return;

Expand All @@ -212,25 +416,27 @@ ShadingContext::process_errors () const
// interleaved with other threads.
lock_guard lock (buffered_errors_mutex);

for (size_t i = 0; i < nerrors; ++i) {
switch (m_buffered_errors[i].first) {
case ErrorHandler::EH_MESSAGE :
case ErrorHandler::EH_DEBUG :
shadingsys().message (m_buffered_errors[i].second);
break;
case ErrorHandler::EH_INFO :
shadingsys().info (m_buffered_errors[i].second);
break;
case ErrorHandler::EH_WARNING :
shadingsys().warning (m_buffered_errors[i].second);
break;
case ErrorHandler::EH_ERROR :
case ErrorHandler::EH_SEVERE :
shadingsys().error (m_buffered_errors[i].second);
break;
default:
break;
if (execution_is_batched()) {
OSL_DASSERT(batch_size_executed <= MaxSupportedSimdLaneCount);
// Process each data lane separately and in the correct order
for(int lane_mask=0; lane_mask < batch_size_executed; ++lane_mask) {
OSL_INTEL_PRAGMA(noinline)
process_errors_helper(shadingsys(), m_buffered_errors, 0, nerrors,
// Test Function returns true to process the ErrorItem
[=](Mask<MaxSupportedSimdLaneCount> mask)->bool
{
return mask.is_on(lane_mask);
});
}
} else {
// Non-batch errors: ignore the mask, just print them out once
OSL_INTEL_PRAGMA(noinline)
process_errors_helper(shadingsys(), m_buffered_errors, 0, nerrors,
// Test Function returns true to process the ErrorItem
[=](Mask<MaxSupportedSimdLaneCount> /*mask*/)->bool
{
return true;
});
}
m_buffered_errors.clear();
}
Expand All @@ -249,12 +455,22 @@ const void *
ShadingContext::symbol_data (const Symbol &sym) const
{
const ShaderGroup &sgroup (*group());
if (! sgroup.optimized())
return NULL; // can't retrieve symbol if we didn't optimize it
if (execution_is_batched()) {
if (! sgroup.batch_jitted())
return NULL; // can't retrieve symbol if we didn't optimize & batched jit

if (sym.wide_dataoffset() >= 0 && (int)m_heapsize > sym.wide_dataoffset()) {
// lives on the heap
return m_heap.get() + sym.wide_dataoffset();
}
} else {
if (! sgroup.jitted())
return NULL; // can't retrieve symbol if we didn't optimize & jit

if (sym.dataoffset() >= 0 && (int)m_heapsize > sym.dataoffset()) {
// lives on the heap
return m_heap.get() + sym.dataoffset();
if (sym.dataoffset() >= 0 && (int)m_heapsize > sym.dataoffset()) {
// lives on the heap
return m_heap.get() + sym.dataoffset();
}
}

// doesn't live on the heap
Expand Down Expand Up @@ -325,5 +541,8 @@ osl_incr_layers_executed (ShaderGlobals *sg)
ctx->incr_layers_executed ();
}

template class ShadingContext::Batched<16>;
template class ShadingContext::Batched<8>;


OSL_NAMESPACE_EXIT
Loading