Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add plumbing through ShadingSystem into Context to support Batched Execution #1301

Merged
merged 2 commits into from
Nov 23, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/include/OSL/platform.h
Original file line number Diff line number Diff line change
Expand Up @@ -486,4 +486,6 @@ DataT* assume_aligned(DataT* ptr)

#endif

static constexpr int MaxSupportedSimdLaneCount = 16;

OSL_NAMESPACE_EXIT
2 changes: 0 additions & 2 deletions src/include/OSL/wide.h
Original file line number Diff line number Diff line change
Expand Up @@ -119,8 +119,6 @@ assign_all(Block<DataT, WidthT>&, const DataT&);
// NOTE: additional constructors & helpers functions exist in the implementation
// that were not specified in the descriptions above for brevity.

static constexpr int MaxSupportedSimdLaneCount = 16;

/// Type for an opaque pointer to whatever the renderer uses to represent a
/// coordinate transformation.
typedef const void* TransformationPtr;
Expand Down
5 changes: 5 additions & 0 deletions src/include/osl_pvt.h
Original file line number Diff line number Diff line change
Expand Up @@ -487,6 +487,7 @@ class Symbol {
, m_layer(-1)
, m_scope(0)
, m_dataoffset(-1)
, m_wide_dataoffset(-1)
, m_initializers(0)
, m_node(declaration_node)
, m_alias(NULL)
Expand Down Expand Up @@ -608,6 +609,9 @@ class Symbol {
void dataoffset(int d) { m_dataoffset = d; }
int dataoffset() const { return m_dataoffset; }

void wide_dataoffset(int d) { m_wide_dataoffset = d; }
int wide_dataoffset() const { return m_wide_dataoffset; }

void initializers(int d) { m_initializers = d; }
int initializers() const { return m_initializers; }

Expand Down Expand Up @@ -828,6 +832,7 @@ class Symbol {
short m_layer; ///< Layer (within the group) this belongs to
int m_scope; ///< Scope where this symbol was declared
int m_dataoffset; ///< Offset of the data (-1 for unknown)
int m_wide_dataoffset; ///< Offset of the wide data (-1 for unknown)
int m_initializers; ///< Number of default initializers
ASTNode* m_node; ///< Ptr to the declaration of this symbol
Symbol* m_alias; ///< Another symbol that this is an alias for
Expand Down
269 changes: 244 additions & 25 deletions src/liboslexec/context.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,23 +5,39 @@
#include <vector>
#include <string>
#include <cstdio>
#include <cstdint>

#include <OpenImageIO/sysutil.h>
#include <OpenImageIO/timer.h>
#include <OpenImageIO/thread.h>

#include <OSL/batched_shaderglobals.h>
#include <OSL/mask.h>
#include <OSL/wide.h>

#include "oslexec_pvt.h"

OSL_NAMESPACE_ENTER

static mutex buffered_errors_mutex;


namespace pvt {

template <size_t ByteAlignmentT, typename T>
inline bool is_aligned(const T *pointer)
{
std::uintptr_t ptrAsUint = reinterpret_cast<std::uintptr_t>(pointer);
return (ptrAsUint%ByteAlignmentT==0);
}

} // namespace pvt

ShadingContext::ShadingContext (ShadingSystemImpl &shadingsys,
PerThreadInfo *threadinfo)
: m_shadingsys(shadingsys), m_renderer(m_shadingsys.renderer()),
m_group(NULL), m_max_warnings(shadingsys.max_warnings_per_thread()), m_dictionary(NULL)
m_group(NULL), m_max_warnings(shadingsys.max_warnings_per_thread()),
m_dictionary(NULL), batch_size_executed(0)
{
m_shadingsys.m_stat_contexts += 1;
m_threadinfo = threadinfo ? threadinfo : shadingsys.get_perthread_info ();
Expand All @@ -44,6 +60,7 @@ ShadingContext::execute_init (ShaderGroup &sgroup, ShaderGlobals &ssg, bool run)
{
if (m_group)
execute_cleanup ();
batch_size_executed = 0;
m_group = &sgroup;
m_ticks = 0;

Expand Down Expand Up @@ -186,6 +203,156 @@ ShadingContext::execute (ShaderGroup &sgroup, ShaderGlobals &ssg, bool run)
return result;
}

template<int WidthT>
bool
ShadingContext::Batched<WidthT>::execute_init
(ShaderGroup &sgroup, int batch_size, BatchedShaderGlobals<WidthT> &bsg, bool run)
{
if (context().m_group)
context().execute_cleanup ();

context().batch_size_executed = batch_size;
context().m_group = &sgroup;
context().m_ticks = 0;

// Optimize if we haven't already
if (sgroup.nlayers()) {
sgroup.start_running ();
if (! sgroup.batch_jitted()) {
shadingsys().template batched<WidthT>().jit_group(sgroup, &context());
if (shadingsys().m_greedyjit && shadingsys().m_groups_to_compile_count) {
// If we are greedily JITing, optimize/JIT everything now
shadingsys().template batched<WidthT>().jit_all_groups();
}
}
// To handle layers that were not used but still possibly had
// render outputs, we always generate a run function even for
// do nothing groups, so that a GroupData on the heap gets built
// and the run function can broadcast default values there.
//
// Observation is that nothing ever overwrites that default value
// so we could just run it once, or deal with broadcasting the
// default value ourselves

} else {
// empty shader - nothing to do!
return false;
}

int profile = shadingsys().m_profile;
OIIO::Timer timer (profile ? OIIO::Timer::StartNow : OIIO::Timer::DontStartNow);

// Allocate enough space on the heap
size_t heap_size_needed = sgroup.llvm_groupdata_wide_size();
context().reserve_heap(heap_size_needed);
// Zero out the heap memory we will be using
if (shadingsys().m_clearmemory)
memset (context().m_heap.get(), 0, heap_size_needed);

// Set up closure storage
context().m_closure_pool.clear();

// Clear the message blackboard
context().m_messages.clear ();
// TODO: implement batched_messages
//context().batched_messages(WidthOf<WidthT>()).clear ();

// Clear miscellaneous scratch space
context().m_scratch_pool.clear ();

// Zero out stats for this execution
context().clear_runtime_stats ();

if (run) {
bsg.uniform.context = &context();
bsg.uniform.renderer = context().renderer();
bsg.uniform.Ci = NULL;
RunLLVMGroupFuncWide run_func = sgroup.llvm_compiled_wide_init();
OSL_DASSERT (run_func);
OSL_DASSERT (sgroup.llvm_groupdata_wide_size() <= context().m_heapsize);

if(batch_size > 0) {
Mask<WidthT> run_mask(false);
run_mask.set_count_on(batch_size);

run_func (&bsg, context().m_heap.get(), run_mask.value());
}
}

if (profile)
context().m_ticks += timer.ticks();
return true;
}

template<int WidthT>
bool
ShadingContext::Batched<WidthT>::execute_layer (int batch_size, BatchedShaderGlobals<WidthT> &bsg, int layernumber)
{
if (!group() || group()->nlayers() == 0 || group()->does_nothing() || (context().batch_size_executed != batch_size))
return false;
OSL_DASSERT (bsg.uniform.context == &context() && bsg.uniform.renderer == context().renderer());

int profile = shadingsys().m_profile;
OIIO::Timer timer (profile ? OIIO::Timer::StartNow : OIIO::Timer::DontStartNow);

RunLLVMGroupFuncWide run_func = group()->llvm_compiled_wide_layer (layernumber);
if (! run_func)
return false;

OSL_ASSERT(pvt::is_aligned<64>(&bsg));
OSL_ASSERT(pvt::is_aligned<64>(context().m_heap.get()));

if (batch_size > 0) {
Mask<WidthT> run_mask(false);
run_mask.set_count_on(batch_size);

run_func (&bsg, context().m_heap.get(), run_mask.value());
}

if (profile)
context().m_ticks += timer.ticks();

return true;
}


template<int WidthT>
bool
ShadingContext::Batched<WidthT>::execute(ShaderGroup &sgroup, int batch_size, BatchedShaderGlobals<WidthT> &bsg, bool run)
{
OSL_ASSERT(is_aligned<64>(&bsg));
int n = sgroup.m_exec_repeat;

Block<Vec3,WidthT> Psave, Nsave; // for repeats
bool repeat = (n > 1);
if (repeat) {
// If we're going to repeat more than once, we need to save any
// globals that might get modified.
Psave = bsg.varying.P;
Nsave = bsg.varying.N;
if (! run)
n = 1;
}

bool result = true;
while (1) {
if (! execute_init (sgroup, batch_size, bsg, run))
return false;
if (run && n)
execute_layer (batch_size, bsg, group()->nlayers()-1);
result = context().execute_cleanup ();
if (--n < 1)
break; // done
if (repeat) {
// Going around for another pass... restore things as best as we
// can.
bsg.varying.P = Psave;
bsg.varying.N = Nsave;
bsg.uniform.Ci = NULL;
}
}
return result;
}


void
Expand All @@ -199,11 +366,48 @@ ShadingContext::record_error (ErrorHandler::ErrCode code,
}


// separate declaration from definition of template function
// to ensure noinline is respected
template<typename ErrorsT, typename TestFunctorT>
static OSL_NOINLINE void process_errors_helper (ShadingSystemImpl &shading_sys, const ErrorsT &errors, int startAtError, int endBeforeError, const TestFunctorT & test_func);

// Given array of ErrorItems emit errors within the range startAtError to
// endBeforeError if and only if the test_func passed each ErrorItem's mask
// returns true. This allows the same batch of errors to be processed for
// each data lane separately effectively serializing emission of errors,
// warnings, info, and messages
template<typename ErrorsT, typename TestFunctorT>
void process_errors_helper (ShadingSystemImpl &shading_sys, const ErrorsT &errors, int startAtError, int endBeforeError, const TestFunctorT & test_func)
{
for (int i = startAtError; i < endBeforeError; ++i) {
const auto & error_item = errors[i];
if (test_func(error_item.mask)) {
switch (errors[i].err_code) {
case ErrorHandler::EH_MESSAGE :
case ErrorHandler::EH_DEBUG :
shading_sys.message (error_item.msgString);
break;
case ErrorHandler::EH_INFO :
shading_sys.info (error_item.msgString);
break;
case ErrorHandler::EH_WARNING :
shading_sys.warning (error_item.msgString);
break;
case ErrorHandler::EH_ERROR :
case ErrorHandler::EH_SEVERE :
shading_sys.error (error_item.msgString);
break;
default:
break;
}
}
}
}

void
ShadingContext::process_errors () const
{
size_t nerrors = m_buffered_errors.size();
int nerrors(m_buffered_errors.size());
if (! nerrors)
return;

Expand All @@ -212,25 +416,27 @@ ShadingContext::process_errors () const
// interleaved with other threads.
lock_guard lock (buffered_errors_mutex);

for (size_t i = 0; i < nerrors; ++i) {
switch (m_buffered_errors[i].first) {
case ErrorHandler::EH_MESSAGE :
case ErrorHandler::EH_DEBUG :
shadingsys().message (m_buffered_errors[i].second);
break;
case ErrorHandler::EH_INFO :
shadingsys().info (m_buffered_errors[i].second);
break;
case ErrorHandler::EH_WARNING :
shadingsys().warning (m_buffered_errors[i].second);
break;
case ErrorHandler::EH_ERROR :
case ErrorHandler::EH_SEVERE :
shadingsys().error (m_buffered_errors[i].second);
break;
default:
break;
if (execution_is_batched()) {
OSL_DASSERT(batch_size_executed <= MaxSupportedSimdLaneCount);
// Process each data lane separately and in the correct order
for(int lane_mask=0; lane_mask < batch_size_executed; ++lane_mask) {
OSL_INTEL_PRAGMA(noinline)
process_errors_helper(shadingsys(), m_buffered_errors, 0, nerrors,
// Test Function returns true to process the ErrorItem
[=](Mask<MaxSupportedSimdLaneCount> mask)->bool
{
return mask.is_on(lane_mask);
});
}
} else {
// Non-batch errors: ignore the mask, just print them out once
OSL_INTEL_PRAGMA(noinline)
process_errors_helper(shadingsys(), m_buffered_errors, 0, nerrors,
// Test Function returns true to process the ErrorItem
[=](Mask<MaxSupportedSimdLaneCount> /*mask*/)->bool
{
return true;
});
}
m_buffered_errors.clear();
}
Expand All @@ -249,12 +455,22 @@ const void *
ShadingContext::symbol_data (const Symbol &sym) const
{
const ShaderGroup &sgroup (*group());
if (! sgroup.optimized())
return NULL; // can't retrieve symbol if we didn't optimize it
if (execution_is_batched()) {
if (! sgroup.batch_jitted())
return NULL; // can't retrieve symbol if we didn't optimize & batched jit

if (sym.wide_dataoffset() >= 0 && (int)m_heapsize > sym.wide_dataoffset()) {
// lives on the heap
return m_heap.get() + sym.wide_dataoffset();
}
} else {
if (! sgroup.jitted())
return NULL; // can't retrieve symbol if we didn't optimize & jit

if (sym.dataoffset() >= 0 && (int)m_heapsize > sym.dataoffset()) {
// lives on the heap
return m_heap.get() + sym.dataoffset();
if (sym.dataoffset() >= 0 && (int)m_heapsize > sym.dataoffset()) {
// lives on the heap
return m_heap.get() + sym.dataoffset();
}
}

// doesn't live on the heap
Expand Down Expand Up @@ -325,5 +541,8 @@ osl_incr_layers_executed (ShaderGlobals *sg)
ctx->incr_layers_executed ();
}

template class ShadingContext::Batched<16>;
template class ShadingContext::Batched<8>;


OSL_NAMESPACE_EXIT
Loading