llvm: Switch compiled 'evaluate' to use composition simulation input (#2466)

Prepares for a future change that passes multiple inputs for multiple simulation trials.
Avoids duplicate construction of the "predicated input" in each evaluation.
Allows better use of shared memory in GPU execution.
jvesely authored Aug 16, 2022
2 parents 4fdb0cb + aa63fc7 commit 47f99cc
Showing 4 changed files with 63 additions and 61 deletions.
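In short (a condensed, illustrative digest of the hunks below, not runnable on its own):

    # Before: the controller's "predicated input" was rebuilt for every evaluation.
    variable = [input_port.parameters.value.get(context) for input_port in ocm.input_ports]
    outcomes = comp_exec.thread_evaluate(variable, num_evals)

    # After: the composition-level input is parsed once and shared by every
    # simulated evaluation; num_inputs_sets prepares for multiple trials per run.
    state_features = ocm.parameters.state_feature_values._get(context)
    inputs, num_inputs_sets = ocm.agent_rep._parse_run_inputs(state_features, context)
    outcomes = comp_exec.thread_evaluate(inputs, num_inputs_sets, num_evals)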
@@ -769,18 +769,22 @@ def _is_static(it:SampleIterator):
             return False

         assert all(_is_static(sample_iterator) for sample_iterator in self.search_space)

         assert ocm is ocm.agent_rep.controller
-        # Compiled evaluate expects the same variable as mech function
-        variable = [input_port.parameters.value.get(context) for input_port in ocm.input_ports]
+        # Compiled evaluate expects the same variable as composition
+        state_features = ocm.parameters.state_feature_values._get(context)
+        inputs, num_inputs_sets = ocm.agent_rep._parse_run_inputs(state_features, context)

         num_evals = np.prod([d.num for d in self.search_space])

         # Map allocations to values
         comp_exec = pnlvm.execution.CompExecution(ocm.agent_rep, [context.execution_id])
         execution_mode = ocm.parameters.comp_execution_mode._get(context)
         if execution_mode == "PTX":
-            outcomes = comp_exec.cuda_evaluate(variable, num_evals)
+            outcomes = comp_exec.cuda_evaluate(inputs, num_inputs_sets, num_evals)
         elif execution_mode == "LLVM":
-            outcomes = comp_exec.thread_evaluate(variable, num_evals)
+            outcomes = comp_exec.thread_evaluate(inputs, num_inputs_sets, num_evals)
         else:
             assert False, f"Unknown execution mode for {ocm.name}: {execution_mode}."

@@ -1744,14 +1748,46 @@ def _gen_llvm_select_min_function(self, *, ctx:pnlvm.LLVMBuilderContext, tags:frozenset):
         return builder.function

     def _gen_llvm_function_body(self, ctx, builder, params, state_features, arg_in, arg_out, *, tags:frozenset):
-        ocm = self._get_optimized_controller()
-        if ocm is not None:
-            assert ocm.function is self
-            obj_func = ctx.import_llvm_function(ocm, tags=tags.union({"evaluate"}))
+        controller = self._get_optimized_controller()
+        if controller is not None:
+            assert controller.function is self
+            obj_func = ctx.import_llvm_function(controller, tags=tags.union({"evaluate"}))
             comp_args = builder.function.args[-3:]
             obj_param_ptr = comp_args[0]
             obj_state_ptr = comp_args[1]
-            extra_args = [arg_in, comp_args[2]]
+
+            # Construct input
+            comp_input = builder.alloca(obj_func.args[4].type.pointee, name="sim_input")
+
+            input_initialized = [False] * len(comp_input.type.pointee)
+            for src_idx, ip in enumerate(controller.input_ports):
+                if ip.shadow_inputs is None:
+                    continue
+
+                # shadow inputs point to an input port of a node.
+                # If that node takes direct input, it will have an associated
+                # (input_port, output_port) in the input_CIM.
+                # Take the former as an index to composition input variable.
+                cim_in_port = controller.agent_rep.input_CIM_ports[ip.shadow_inputs][0]
+                dst_idx = controller.agent_rep.input_CIM.input_ports.index(cim_in_port)
+
+                # Check that all inputs are unique
+                assert not input_initialized[dst_idx], "Double initialization of input {}".format(dst_idx)
+                input_initialized[dst_idx] = True
+
+                src = builder.gep(arg_in, [ctx.int32_ty(0), ctx.int32_ty(src_idx)])
+                # Destination is a struct of 2d arrays
+                dst = builder.gep(comp_input, [ctx.int32_ty(0),
+                                               ctx.int32_ty(dst_idx),
+                                               ctx.int32_ty(0)])
+                builder.store(builder.load(src), dst)
+
+            # Assert that we have populated all inputs
+            assert all(input_initialized), \
+                "Not all inputs to the simulated composition are initialized: {}".format(input_initialized)
+
+            # Extra args: input and data
+            extra_args = [comp_input, comp_args[2]]
         else:
             obj_func = ctx.import_llvm_function(self.objective_function)
             obj_state_ptr = pnlvm.helpers.get_state_ptr(builder, self, state_features,
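The index mapping built by the IR above can be mirrored in plain Python. A sketch, assuming controller_inputs is a list indexed in step with controller.input_ports (a simplified model of the logic, not PsyNeuLink API):

    def map_shadow_inputs(controller, controller_inputs):
        # Mirrors the IR above: copy each shadowing input port's value into the
        # slot of the simulated composition's input that feeds the shadowed port.
        comp = controller.agent_rep
        comp_input = [None] * len(comp.input_CIM.input_ports)
        for src_idx, ip in enumerate(controller.input_ports):
            if ip.shadow_inputs is None:
                continue
            # input_CIM_ports maps a shadowed port to its (input_port, output_port)
            # pair on the input_CIM; the input_port's position gives the slot index.
            cim_in_port = comp.input_CIM_ports[ip.shadow_inputs][0]
            dst_idx = comp.input_CIM.input_ports.index(cim_in_port)
            assert comp_input[dst_idx] is None, "Double initialization of input {}".format(dst_idx)
            comp_input[dst_idx] = controller_inputs[src_idx]
        assert all(v is not None for v in comp_input), "Not all inputs initialized"
        return comp_input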
@@ -3196,10 +3196,6 @@ def evaluate_agent_rep(self, control_allocation, context=None, return_results=False):
             context=context
         )

-    def _get_evaluate_input_struct_type(self, ctx):
-        # We construct input from optimization function input
-        return ctx.get_input_struct_type(self.function)
-
     def _get_evaluate_output_struct_type(self, ctx):
         # Returns a scalar that is the predicted net_outcome
         return ctx.float_ty
@@ -3326,15 +3322,15 @@ def _gen_llvm_evaluate_function(self, *, ctx:pnlvm.LLVMBuilderContext, tags=frozenset()):
                 ctx.get_state_struct_type(self.agent_rep).as_pointer(),
                 self._get_evaluate_alloc_struct_type(ctx).as_pointer(),
                 self._get_evaluate_output_struct_type(ctx).as_pointer(),
-                self._get_evaluate_input_struct_type(ctx).as_pointer(),
+                ctx.get_input_struct_type(self.agent_rep).as_pointer(),
                 ctx.get_data_struct_type(self.agent_rep).as_pointer()]

         builder = ctx.create_llvm_function(args, self, str(self) + "_evaluate")
         llvm_func = builder.function
         for p in llvm_func.args:
             p.attributes.add('nonnull')

-        comp_params, base_comp_state, allocation_sample, arg_out, arg_in, base_comp_data = llvm_func.args
+        comp_params, base_comp_state, allocation_sample, arg_out, comp_input, base_comp_data = llvm_func.args

         if "const_params" in debug_env:
             comp_params = builder.alloca(comp_params.type.pointee, name="const_params_loc")
@@ -3390,37 +3386,8 @@ def _gen_llvm_evaluate_function(self, *, ctx:pnlvm.LLVMBuilderContext, tags=frozenset()):
                                                 ctx.int32_ty(0)])
         builder.store(builder.load(sample_ptr), sample_dst)

-        # Construct input
-        comp_input = builder.alloca(sim_f.args[3].type.pointee, name="sim_input")
-
-        input_initialized = [False] * len(comp_input.type.pointee)
-        for src_idx, ip in enumerate(self.input_ports):
-            if ip.shadow_inputs is None:
-                continue
-
-            # shadow inputs point to an input port of a node.
-            # If that node takes direct input, it will have an associated
-            # (input_port, output_port) in the input_CIM.
-            # Take the former as an index to composition input variable.
-            cim_in_port = self.agent_rep.input_CIM_ports[ip.shadow_inputs][0]
-            dst_idx = self.agent_rep.input_CIM.input_ports.index(cim_in_port)
-
-            # Check that all inputs are unique
-            assert not input_initialized[dst_idx], "Double initialization of input {}".format(dst_idx)
-            input_initialized[dst_idx] = True
-
-            src = builder.gep(arg_in, [ctx.int32_ty(0), ctx.int32_ty(src_idx)])
-            # Destination is a struct of 2d arrays
-            dst = builder.gep(comp_input, [ctx.int32_ty(0),
-                                           ctx.int32_ty(dst_idx),
-                                           ctx.int32_ty(0)])
-            builder.store(builder.load(src), dst)
-
-        # Assert that we have populated all inputs
-        assert all(input_initialized), \
-            "Not all inputs to the simulated composition are initialized: {}".format(input_initialized)
-
         if "const_input" in debug_env:
+            comp_input = builder.alloca(sim_f.args[3].type.pointee, name="sim_input")
             if not debug_env["const_input"]:
                 input_init = [[os.defaults.variable.tolist()] for os in self.agent_rep.input_CIM.input_ports]
                 print("Setting default input: ", input_init)
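For reference, the compiled evaluate entry point's argument layout after this change, following the unpacking shown above (role descriptions inferred from the struct types in the args list):

    #   0: comp_params        - parameter struct
    #   1: base_comp_state    - agent_rep state struct
    #   2: allocation_sample  - control allocation being evaluated
    #   3: arg_out            - scalar predicted net_outcome
    #   4: comp_input         - agent_rep (composition) input struct, replacing the OCM's own input struct
    #   5: base_comp_data     - agent_rep data struct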
psyneulink/core/llvm/execution.py (27 changes: 13 additions & 14 deletions)
@@ -676,7 +676,7 @@ def cuda_run(self, inputs, runs, num_input_sets):
         assert runs_np[0] <= runs, "Composition ran more times than allowed!"
         return _convert_ctype_to_python(ct_out)[0:runs_np[0]]

-    def _prepare_evaluate(self, variable, num_evaluations):
+    def _prepare_evaluate(self, inputs, num_input_sets, num_evaluations):
         ocm = self._composition.controller
         assert len(self._execution_contexts) == 1

@@ -695,25 +695,23 @@ def _prepare_evaluate(self, variable, num_evaluations):
         ct_comp_data = self._get_compilation_param('_eval_data', '_get_data_initializer', 6)

         # Construct input variable
-        var_dty = _element_dtype(bin_func.byref_arg_types[5])
-        converted_variable = np.concatenate(variable, dtype=var_dty)
+        ct_inputs = self._get_run_input_struct(inputs, num_input_sets)

         # Output ctype
         out_ty = bin_func.byref_arg_types[4] * num_evaluations

         # return variable as numpy array. pycuda can use it directly
-        return ct_comp_param, ct_comp_state, ct_comp_data, converted_variable, out_ty
+        return ct_comp_param, ct_comp_state, ct_comp_data, ct_inputs, out_ty

-    def cuda_evaluate(self, variable, num_evaluations):
-        ct_comp_param, ct_comp_state, ct_comp_data, converted_variable, out_ty = \
-            self._prepare_evaluate(variable, num_evaluations)
-        self._uploaded_bytes['input'] += converted_variable.nbytes
+    def cuda_evaluate(self, inputs, num_input_sets, num_evaluations):
+        ct_comp_param, ct_comp_state, ct_comp_data, ct_inputs, out_ty = \
+            self._prepare_evaluate(inputs, num_input_sets, num_evaluations)

         # Output is allocated on device, but we need the ctype (out_ty).
         cuda_args = (self.upload_ctype(ct_comp_param, 'params'),
                      self.upload_ctype(ct_comp_state, 'state'),
                      jit_engine.pycuda.driver.mem_alloc(ctypes.sizeof(out_ty)),
-                     jit_engine.pycuda.driver.In(converted_variable),
+                     self.upload_ctype(ct_inputs, 'input'),
                      self.upload_ctype(ct_comp_data, 'data'),
                      )

@@ -722,12 +720,11 @@ def cuda_evaluate(self, variable, num_evaluations):

         return ct_results

-    def thread_evaluate(self, variable, num_evaluations):
-        ct_param, ct_state, ct_data, converted_variable, out_ty = \
-            self._prepare_evaluate(variable, num_evaluations)
+    def thread_evaluate(self, inputs, num_input_sets, num_evaluations):
+        ct_param, ct_state, ct_data, ct_inputs, out_ty = \
+            self._prepare_evaluate(inputs, num_input_sets, num_evaluations)

         ct_results = out_ty()
-        ct_variable = converted_variable.ctypes.data_as(self.__bin_func.c_func.argtypes[5])
         jobs = min(os.cpu_count(), num_evaluations)
         evals_per_job = (num_evaluations + jobs - 1) // jobs

@@ -738,7 +735,9 @@ def thread_evaluate(self, variable, num_evaluations):
             results = [ex.submit(self.__bin_func, ct_param, ct_state,
                                  int(i * evals_per_job),
                                  min((i + 1) * evals_per_job, num_evaluations),
-                                 ct_results, ct_variable, ct_data)
+                                 ct_results,
+                                 ctypes.cast(ctypes.byref(ct_inputs), self.__bin_func.c_func.argtypes[5]),
+                                 ct_data)
                        for i in range(jobs)]

parallel_stop = time.time()
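The job split in thread_evaluate is plain ceil division over a thread pool. A standalone sketch, with a generic evaluate_range callable standing in for the compiled binary function (hypothetical helper, not PsyNeuLink API):

    import concurrent.futures
    import os

    def run_partitioned(evaluate_range, num_evaluations):
        # Same scheme as thread_evaluate above: at most cpu_count() workers,
        # each handed one contiguous [start, stop) slice of the evaluations.
        jobs = min(os.cpu_count(), num_evaluations)
        evals_per_job = (num_evaluations + jobs - 1) // jobs
        with concurrent.futures.ThreadPoolExecutor(max_workers=jobs) as ex:
            futures = [ex.submit(evaluate_range,
                                 i * evals_per_job,
                                 min((i + 1) * evals_per_job, num_evaluations))
                       for i in range(jobs)]
            return [f.result() for f in futures]

    # Example: run_partitioned(lambda start, stop: list(range(start, stop)), 10)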
psyneulink/core/llvm/helpers.py (4 changes: 2 additions & 2 deletions)
@@ -443,15 +443,15 @@ def printf_float_array(builder, array, prefix="", suffix="\n", override_debug=False):
     printf(builder, prefix, override_debug=override_debug)

     with array_ptr_loop(builder, array, "print_array_loop") as (b1, i):
-        printf(b1, "%lf ", b1.load(b1.gep(array, [ir.IntType(32)(0), i])), override_debug=override_debug)
+        printf(b1, "%lf ", b1.load(b1.gep(array, [i.type(0), i])), override_debug=override_debug)

     printf(builder, suffix, override_debug=override_debug)


 def printf_float_matrix(builder, matrix, prefix="", suffix="\n", override_debug=False):
     printf(builder, prefix, override_debug=override_debug)
     with array_ptr_loop(builder, matrix, "print_row_loop") as (b1, i):
-        row = b1.gep(matrix, [ir.IntType(32)(0), i])
+        row = b1.gep(matrix, [i.type(0), i])
         printf_float_array(b1, row, suffix="\n", override_debug=override_debug)
     printf(builder, suffix, override_debug=override_debug)

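The helpers.py change derives the leading zero GEP index from the loop counter's own type (i.type(0)) instead of hard-coding ir.IntType(32)(0). A minimal, self-contained llvmlite sketch of the idiom (assumed setup, not PsyNeuLink code):

    from llvmlite import ir

    mod = ir.Module(name="gep_idiom")
    arr_ty = ir.ArrayType(ir.DoubleType(), 4)
    fnty = ir.FunctionType(ir.DoubleType(), [arr_ty.as_pointer(), ir.IntType(32)])
    fn = ir.Function(mod, fnty, name="load_elem")
    builder = ir.IRBuilder(fn.append_basic_block("entry"))

    array, i = fn.args
    # Taking the zero constant from i's own type keeps both GEP indices
    # consistent without repeating the hard-coded integer width.
    elem_ptr = builder.gep(array, [i.type(0), i])
    builder.ret(builder.load(elem_ptr))
    print(mod)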
