From c18cf91d918d4101fc7a0f438ab2348b6966bb07 Mon Sep 17 00:00:00 2001 From: Jan Vesely Date: Sun, 14 Aug 2022 00:49:42 -0400 Subject: [PATCH 1/4] llvm/execution: Fix typo "variale" -> "variable" Signed-off-by: Jan Vesely --- psyneulink/core/llvm/execution.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/psyneulink/core/llvm/execution.py b/psyneulink/core/llvm/execution.py index d9226522658..933b561fc4d 100644 --- a/psyneulink/core/llvm/execution.py +++ b/psyneulink/core/llvm/execution.py @@ -723,11 +723,11 @@ def cuda_evaluate(self, variable, num_evaluations): return ct_results def thread_evaluate(self, variable, num_evaluations): - ct_param, ct_state, ct_data, converted_variale, out_ty = \ + ct_param, ct_state, ct_data, converted_variable, out_ty = \ self._prepare_evaluate(variable, num_evaluations) ct_results = out_ty() - ct_variable = converted_variale.ctypes.data_as(self.__bin_func.c_func.argtypes[5]) + ct_variable = converted_variable.ctypes.data_as(self.__bin_func.c_func.argtypes[5]) jobs = min(os.cpu_count(), num_evaluations) evals_per_job = (num_evaluations + jobs - 1) // jobs From 2474d63745e11ceec2aba4a614c7fddf931af1fe Mon Sep 17 00:00:00 2001 From: Jan Vesely Date: Sun, 14 Aug 2022 00:51:33 -0400 Subject: [PATCH 2/4] llvm/helpers: Reuse type of index variable Signed-off-by: Jan Vesely --- psyneulink/core/llvm/helpers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/psyneulink/core/llvm/helpers.py b/psyneulink/core/llvm/helpers.py index bdb887e7ddc..7c3b0414d00 100644 --- a/psyneulink/core/llvm/helpers.py +++ b/psyneulink/core/llvm/helpers.py @@ -443,7 +443,7 @@ def printf_float_array(builder, array, prefix="", suffix="\n", override_debug=Fa printf(builder, prefix, override_debug=override_debug) with array_ptr_loop(builder, array, "print_array_loop") as (b1, i): - printf(b1, "%lf ", b1.load(b1.gep(array, [ir.IntType(32)(0), i])), override_debug=override_debug) + printf(b1, "%lf ", b1.load(b1.gep(array, [i.type(0), i])), override_debug=override_debug) printf(builder, suffix, override_debug=override_debug) @@ -451,7 +451,7 @@ def printf_float_array(builder, array, prefix="", suffix="\n", override_debug=Fa def printf_float_matrix(builder, matrix, prefix="", suffix="\n", override_debug=False): printf(builder, prefix, override_debug=override_debug) with array_ptr_loop(builder, matrix, "print_row_loop") as (b1, i): - row = b1.gep(matrix, [ir.IntType(32)(0), i]) + row = b1.gep(matrix, [i.type(0), i]) printf_float_array(b1, row, suffix="\n", override_debug=override_debug) printf(builder, suffix, override_debug=override_debug) From b8dffcc8050df6cfab840e0a0731d5118f4c6f72 Mon Sep 17 00:00:00 2001 From: Jan Vesely Date: Sun, 14 Aug 2022 21:39:59 -0400 Subject: [PATCH 3/4] llvm/OptimizationControlMechanism: Move construction of simulation input out of "evaluate" function The "evaluate" function input now matches that of the simulated composition. This reduces work for parallel execution of "evaluate" and enables better use of fast "shared" memory when executed on a GPU. 
Signed-off-by: Jan Vesely 
---
 .../nonstateful/optimizationfunctions.py      | 50 ++++++++++++++++---
 .../control/optimizationcontrolmechanism.py   | 39 ++-------------
 2 files changed, 46 insertions(+), 43 deletions(-)

diff --git a/psyneulink/core/components/functions/nonstateful/optimizationfunctions.py b/psyneulink/core/components/functions/nonstateful/optimizationfunctions.py
index df8d182577c..5b32f1df01a 100644
--- a/psyneulink/core/components/functions/nonstateful/optimizationfunctions.py
+++ b/psyneulink/core/components/functions/nonstateful/optimizationfunctions.py
@@ -769,9 +769,13 @@ def _is_static(it:SampleIterator):
                 return False
 
         assert all(_is_static(sample_iterator) for sample_iterator in self.search_space)
+        assert ocm is ocm.agent_rep.controller
 
-        # Compiled evaluate expects the same variable as mech function
-        variable = [input_port.parameters.value.get(context) for input_port in ocm.input_ports]
+
+        # Compiled evaluate expects the same variable as composition
+        # FIXME: simplify this
+        variable = [[oip.parameters.value.get(context) for oip in ocm.input_ports if oip.shadow_inputs is not None and ocm.agent_rep.input_CIM_ports[oip.shadow_inputs][0] is input_port][0] for input_port in ocm.agent_rep.input_CIM.input_ports]
+
         num_evals = np.prod([d.num for d in self.search_space])
 
         # Map allocations to values
@@ -1744,14 +1748,46 @@ def _gen_llvm_select_min_function(self, *, ctx:pnlvm.LLVMBuilderContext, tags:fr
         return builder.function
 
     def _gen_llvm_function_body(self, ctx, builder, params, state_features, arg_in, arg_out, *, tags:frozenset):
-        ocm = self._get_optimized_controller()
-        if ocm is not None:
-            assert ocm.function is self
-            obj_func = ctx.import_llvm_function(ocm, tags=tags.union({"evaluate"}))
+        controller = self._get_optimized_controller()
+        if controller is not None:
+            assert controller.function is self
+            obj_func = ctx.import_llvm_function(controller, tags=tags.union({"evaluate"}))
             comp_args = builder.function.args[-3:]
             obj_param_ptr = comp_args[0]
             obj_state_ptr = comp_args[1]
-            extra_args = [arg_in, comp_args[2]]
+
+            # Construct input
+            comp_input = builder.alloca(obj_func.args[4].type.pointee, name="sim_input")
+
+            input_initialized = [False] * len(comp_input.type.pointee)
+            for src_idx, ip in enumerate(controller.input_ports):
+                if ip.shadow_inputs is None:
+                    continue
+
+                # shadow inputs point to an input port of a node.
+                # If that node takes direct input, it will have an associated
+                # (input_port, output_port) in the input_CIM.
+                # Take the former as an index to composition input variable.
+ cim_in_port = controller.agent_rep.input_CIM_ports[ip.shadow_inputs][0] + dst_idx = controller.agent_rep.input_CIM.input_ports.index(cim_in_port) + + # Check that all inputs are unique + assert not input_initialized[dst_idx], "Double initialization of input {}".format(dst_idx) + input_initialized[dst_idx] = True + + src = builder.gep(arg_in, [ctx.int32_ty(0), ctx.int32_ty(src_idx)]) + # Destination is a struct of 2d arrays + dst = builder.gep(comp_input, [ctx.int32_ty(0), + ctx.int32_ty(dst_idx), + ctx.int32_ty(0)]) + builder.store(builder.load(src), dst) + + # Assert that we have populated all inputs + assert all(input_initialized), \ + "Not all inputs to the simulated composition are initialized: {}".format(input_initialized) + + # Extra args: input and data + extra_args = [comp_input, comp_args[2]] else: obj_func = ctx.import_llvm_function(self.objective_function) obj_state_ptr = pnlvm.helpers.get_state_ptr(builder, self, state_features, diff --git a/psyneulink/core/components/mechanisms/modulatory/control/optimizationcontrolmechanism.py b/psyneulink/core/components/mechanisms/modulatory/control/optimizationcontrolmechanism.py index 67b665ce8cf..8abba09428e 100644 --- a/psyneulink/core/components/mechanisms/modulatory/control/optimizationcontrolmechanism.py +++ b/psyneulink/core/components/mechanisms/modulatory/control/optimizationcontrolmechanism.py @@ -3196,10 +3196,6 @@ def evaluate_agent_rep(self, control_allocation, context=None, return_results=Fa context=context ) - def _get_evaluate_input_struct_type(self, ctx): - # We construct input from optimization function input - return ctx.get_input_struct_type(self.function) - def _get_evaluate_output_struct_type(self, ctx): # Returns a scalar that is the predicted net_outcome return ctx.float_ty @@ -3326,7 +3322,7 @@ def _gen_llvm_evaluate_function(self, *, ctx:pnlvm.LLVMBuilderContext, tags=froz ctx.get_state_struct_type(self.agent_rep).as_pointer(), self._get_evaluate_alloc_struct_type(ctx).as_pointer(), self._get_evaluate_output_struct_type(ctx).as_pointer(), - self._get_evaluate_input_struct_type(ctx).as_pointer(), + ctx.get_input_struct_type(self.agent_rep).as_pointer(), ctx.get_data_struct_type(self.agent_rep).as_pointer()] builder = ctx.create_llvm_function(args, self, str(self) + "_evaluate") @@ -3334,7 +3330,7 @@ def _gen_llvm_evaluate_function(self, *, ctx:pnlvm.LLVMBuilderContext, tags=froz for p in llvm_func.args: p.attributes.add('nonnull') - comp_params, base_comp_state, allocation_sample, arg_out, arg_in, base_comp_data = llvm_func.args + comp_params, base_comp_state, allocation_sample, arg_out, comp_input, base_comp_data = llvm_func.args if "const_params" in debug_env: comp_params = builder.alloca(comp_params.type.pointee, name="const_params_loc") @@ -3390,37 +3386,8 @@ def _gen_llvm_evaluate_function(self, *, ctx:pnlvm.LLVMBuilderContext, tags=froz ctx.int32_ty(0)]) builder.store(builder.load(sample_ptr), sample_dst) - # Construct input - comp_input = builder.alloca(sim_f.args[3].type.pointee, name="sim_input") - - input_initialized = [False] * len(comp_input.type.pointee) - for src_idx, ip in enumerate(self.input_ports): - if ip.shadow_inputs is None: - continue - - # shadow inputs point to an input port of of a node. - # If that node takes direct input, it will have an associated - # (input_port, output_port) in the input_CIM. - # Take the former as an index to composition input variable. 
-            cim_in_port = self.agent_rep.input_CIM_ports[ip.shadow_inputs][0]
-            dst_idx = self.agent_rep.input_CIM.input_ports.index(cim_in_port)
-
-            # Check that all inputs are unique
-            assert not input_initialized[dst_idx], "Double initialization of input {}".format(dst_idx)
-            input_initialized[dst_idx] = True
-
-            src = builder.gep(arg_in, [ctx.int32_ty(0), ctx.int32_ty(src_idx)])
-            # Destination is a struct of 2d arrays
-            dst = builder.gep(comp_input, [ctx.int32_ty(0),
-                                           ctx.int32_ty(dst_idx),
-                                           ctx.int32_ty(0)])
-            builder.store(builder.load(src), dst)
-
-        # Assert that we have populated all inputs
-        assert all(input_initialized), \
-            "Not all inputs to the simulated composition are initialized: {}".format(input_initialized)
-
         if "const_input" in debug_env:
+            comp_input = builder.alloca(sim_f.args[3].type.pointee, name="sim_input")
             if not debug_env["const_input"]:
                 input_init = [[os.defaults.variable.tolist()] for os in self.agent_rep.input_CIM.input_ports]
                 print("Setting default input: ", input_init)

From aa63fc706f9a31eb188a54d9e1a90a47a0e4bd4c Mon Sep 17 00:00:00 2001
From: Jan Vesely 
Date: Sun, 14 Aug 2022 23:40:09 -0400
Subject: [PATCH 4/4] llvm, grid_evaluate: Use 'state_feature_values' to construct input

This enables using the same input preprocessing codepaths as a compiled
composition run.

Signed-off-by: Jan Vesely 
---
 .../nonstateful/optimizationfunctions.py |  8 +++---
 psyneulink/core/llvm/execution.py        | 27 +++++++++----------
 2 files changed, 17 insertions(+), 18 deletions(-)

diff --git a/psyneulink/core/components/functions/nonstateful/optimizationfunctions.py b/psyneulink/core/components/functions/nonstateful/optimizationfunctions.py
index 5b32f1df01a..6f4ce9f8588 100644
--- a/psyneulink/core/components/functions/nonstateful/optimizationfunctions.py
+++ b/psyneulink/core/components/functions/nonstateful/optimizationfunctions.py
@@ -773,8 +773,8 @@ def _is_static(it:SampleIterator):
         assert ocm is ocm.agent_rep.controller
 
         # Compiled evaluate expects the same variable as composition
-        # FIXME: simplify this
-        variable = [[oip.parameters.value.get(context) for oip in ocm.input_ports if oip.shadow_inputs is not None and ocm.agent_rep.input_CIM_ports[oip.shadow_inputs][0] is input_port][0] for input_port in ocm.agent_rep.input_CIM.input_ports]
+        state_features = ocm.parameters.state_feature_values._get(context)
+        inputs, num_inputs_sets = ocm.agent_rep._parse_run_inputs(state_features, context)
 
         num_evals = np.prod([d.num for d in self.search_space])
 
         # Map allocations to values
         comp_exec = pnlvm.execution.CompExecution(ocm.agent_rep, [context.execution_id])
         execution_mode = ocm.parameters.comp_execution_mode._get(context)
         if execution_mode == "PTX":
-            outcomes = comp_exec.cuda_evaluate(variable, num_evals)
+            outcomes = comp_exec.cuda_evaluate(inputs, num_inputs_sets, num_evals)
         elif execution_mode == "LLVM":
-            outcomes = comp_exec.thread_evaluate(variable, num_evals)
+            outcomes = comp_exec.thread_evaluate(inputs, num_inputs_sets, num_evals)
         else:
             assert False, f"Unknown execution mode for {ocm.name}: {execution_mode}."

diff --git a/psyneulink/core/llvm/execution.py b/psyneulink/core/llvm/execution.py
index 933b561fc4d..1075c6b4d24 100644
--- a/psyneulink/core/llvm/execution.py
+++ b/psyneulink/core/llvm/execution.py
@@ -676,7 +676,7 @@ def cuda_run(self, inputs, runs, num_input_sets):
         assert runs_np[0] <= runs, "Composition ran more times than allowed!"
return _convert_ctype_to_python(ct_out)[0:runs_np[0]] - def _prepare_evaluate(self, variable, num_evaluations): + def _prepare_evaluate(self, inputs, num_input_sets, num_evaluations): ocm = self._composition.controller assert len(self._execution_contexts) == 1 @@ -695,25 +695,23 @@ def _prepare_evaluate(self, variable, num_evaluations): ct_comp_data = self._get_compilation_param('_eval_data', '_get_data_initializer', 6) # Construct input variable - var_dty = _element_dtype(bin_func.byref_arg_types[5]) - converted_variable = np.concatenate(variable, dtype=var_dty) + ct_inputs = self._get_run_input_struct(inputs, num_input_sets) # Output ctype out_ty = bin_func.byref_arg_types[4] * num_evaluations # return variable as numpy array. pycuda can use it directly - return ct_comp_param, ct_comp_state, ct_comp_data, converted_variable, out_ty + return ct_comp_param, ct_comp_state, ct_comp_data, ct_inputs, out_ty - def cuda_evaluate(self, variable, num_evaluations): - ct_comp_param, ct_comp_state, ct_comp_data, converted_variable, out_ty = \ - self._prepare_evaluate(variable, num_evaluations) - self._uploaded_bytes['input'] += converted_variable.nbytes + def cuda_evaluate(self, inputs, num_input_sets, num_evaluations): + ct_comp_param, ct_comp_state, ct_comp_data, ct_inputs, out_ty = \ + self._prepare_evaluate(inputs, num_input_sets, num_evaluations) # Output is allocated on device, but we need the ctype (out_ty). cuda_args = (self.upload_ctype(ct_comp_param, 'params'), self.upload_ctype(ct_comp_state, 'state'), jit_engine.pycuda.driver.mem_alloc(ctypes.sizeof(out_ty)), - jit_engine.pycuda.driver.In(converted_variable), + self.upload_ctype(ct_inputs, 'input'), self.upload_ctype(ct_comp_data, 'data'), ) @@ -722,12 +720,11 @@ def cuda_evaluate(self, variable, num_evaluations): return ct_results - def thread_evaluate(self, variable, num_evaluations): - ct_param, ct_state, ct_data, converted_variable, out_ty = \ - self._prepare_evaluate(variable, num_evaluations) + def thread_evaluate(self, inputs, num_input_sets, num_evaluations): + ct_param, ct_state, ct_data, ct_inputs, out_ty = \ + self._prepare_evaluate(inputs, num_input_sets, num_evaluations) ct_results = out_ty() - ct_variable = converted_variable.ctypes.data_as(self.__bin_func.c_func.argtypes[5]) jobs = min(os.cpu_count(), num_evaluations) evals_per_job = (num_evaluations + jobs - 1) // jobs @@ -738,7 +735,9 @@ def thread_evaluate(self, variable, num_evaluations): results = [ex.submit(self.__bin_func, ct_param, ct_state, int(i * evals_per_job), min((i + 1) * evals_per_job, num_evaluations), - ct_results, ct_variable, ct_data) + ct_results, + ctypes.cast(ctypes.byref(ct_inputs), self.__bin_func.c_func.argtypes[5]), + ct_data) for i in range(jobs)] parallel_stop = time.time()
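
Two standalone sketches of the idioms used above (illustrative only: they are
not part of the patches, and the struct and function names in the second one
are invented for the example).

The GEP change in PATCH 2/4 relies on llvmlite allowing a value's type to be
called with a Python integer to build an ir.Constant of that same type, so the
leading zero index follows the loop index's type instead of hard-coding
ir.IntType(32). A minimal example, assuming only llvmlite:

    from llvmlite import ir

    i32 = ir.IntType(32)
    double = ir.DoubleType()
    arr_ty = ir.ArrayType(double, 4)

    module = ir.Module(name="gep_example")
    fn = ir.Function(module, ir.FunctionType(double, [arr_ty.as_pointer(), i32]),
                     name="load_elem")
    array, i = fn.args
    builder = ir.IRBuilder(fn.append_basic_block(name="entry"))

    # 'i.type' is the IntType of the index argument; calling it with 0 builds a
    # zero constant of that type, keeping both GEP indices consistent.
    elem_ptr = builder.gep(array, [i.type(0), i])
    builder.ret(builder.load(elem_ptr))

    print(module)

PATCH 4/4's thread_evaluate splits num_evaluations across worker threads using
a ceiling division and passes every worker the same input struct, cast to the
pointer type the compiled function expects. A hedged sketch of that pattern,
with InputStruct and run_chunk standing in for the compiled "evaluate" entry
point:

    import concurrent.futures
    import ctypes
    import os

    class InputStruct(ctypes.Structure):
        # Stand-in for the composition's run-input structure.
        _fields_ = [("values", ctypes.c_double * 4)]

    def run_chunk(start, stop, inputs_ptr):
        # Stand-in for the compiled evaluate function; reads the shared input.
        base = inputs_ptr.contents.values[0]
        return [base + i for i in range(start, stop)]

    num_evaluations = 10
    jobs = min(os.cpu_count(), num_evaluations)
    # Ceiling division so the last job picks up any remainder.
    evals_per_job = (num_evaluations + jobs - 1) // jobs

    ct_inputs = InputStruct((ctypes.c_double * 4)(1.0, 2.0, 3.0, 4.0))
    # Same cast-by-reference idiom as thread_evaluate: one shared input struct
    # is handed to every worker as a pointer.
    inputs_ptr = ctypes.cast(ctypes.byref(ct_inputs), ctypes.POINTER(InputStruct))

    with concurrent.futures.ThreadPoolExecutor(max_workers=jobs) as ex:
        futures = [ex.submit(run_chunk,
                             i * evals_per_job,
                             min((i + 1) * evals_per_job, num_evaluations),
                             inputs_ptr)
                   for i in range(jobs)]
        outcomes = [r for f in futures for r in f.result()]

    print(outcomes)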