From fc9dc9bfa279f4e9da5d1397bcedd010fec63bbc Mon Sep 17 00:00:00 2001 From: Jan Vesely Date: Sun, 21 Nov 2021 13:35:33 -0500 Subject: [PATCH 1/6] llvm/execution: Do not require declaring cached compiled structures Signed-off-by: Jan Vesely --- psyneulink/core/llvm/execution.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/psyneulink/core/llvm/execution.py b/psyneulink/core/llvm/execution.py index a86dd4bb368..ec6dbf72c79 100644 --- a/psyneulink/core/llvm/execution.py +++ b/psyneulink/core/llvm/execution.py @@ -69,7 +69,7 @@ def __init__(self): self._debug_env = debug_env def _get_compilation_param(self, name, init_method, arg): - struct = getattr(self, name) + struct = getattr(self, name, None) if struct is None: struct_ty = self._bin_func.byref_arg_types[arg] init_f = getattr(self._obj, init_method) @@ -188,9 +188,6 @@ def __init__(self, component, execution_ids=[None], *, tags=frozenset()): ] self._component = component - self._param = None - self._state = None - par_struct_ty, ctx_struct_ty, vi_ty, vo_ty = self._bin_func.byref_arg_types if len(execution_ids) > 1: @@ -271,9 +268,6 @@ def __init__(self, composition, execution_ids=[None], *, additional_tags=frozens self.__tags = frozenset(additional_tags) self.__conds = None - self._state = None - self._param = None - self._data = None if len(execution_ids) > 1: self._ct_len = ctypes.c_int(len(execution_ids)) From 42faac9401ea56f9f208d19bc92538bc0f3666ce Mon Sep 17 00:00:00 2001 From: Jan Vesely Date: Sun, 21 Nov 2021 13:54:14 -0500 Subject: [PATCH 2/6] llvm/execution: Reuse helper functions to generate structures for 'evaluate' Signed-off-by: Jan Vesely --- psyneulink/core/llvm/execution.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/psyneulink/core/llvm/execution.py b/psyneulink/core/llvm/execution.py index ec6dbf72c79..e3d48f4f496 100644 --- a/psyneulink/core/llvm/execution.py +++ b/psyneulink/core/llvm/execution.py @@ -650,7 +650,6 @@ def cuda_run(self, inputs, runs, num_input_sets): def _prepare_evaluate(self, variable, num_evaluations): ocm = self._composition.controller assert len(self._execution_contexts) == 1 - context = self._execution_contexts[0] bin_func = pnlvm.LLVMBinaryFunction.from_obj(ocm, tags=frozenset({"evaluate", "alloc_range"})) self.__bin_func = bin_func @@ -661,9 +660,10 @@ def _prepare_evaluate(self, variable, num_evaluations): # all but #4 are shared # Directly initialized structures - ct_comp_param = bin_func.byref_arg_types[0](*ocm.agent_rep._get_param_initializer(context)) - ct_comp_state = bin_func.byref_arg_types[1](*ocm.agent_rep._get_state_initializer(context)) - ct_comp_data = bin_func.byref_arg_types[6](*ocm.agent_rep._get_data_initializer(context)) + assert ocm.agent_rep is self._composition + ct_comp_param = self._get_compilation_param('_eval_param', '_get_param_initializer', 0) + ct_comp_state = self._get_compilation_param('_eval_state', '_get_state_initializer', 1) + ct_comp_data = self._get_compilation_param('_eval_data', '_get_data_initializer', 6) # Construct input variable var_dty = _element_dtype(bin_func.byref_arg_types[5]) @@ -672,6 +672,7 @@ def _prepare_evaluate(self, variable, num_evaluations): # Output ctype out_ty = bin_func.byref_arg_types[4] * num_evaluations + # return variable as numpy array. pycuda can use it directly return ct_comp_param, ct_comp_state, ct_comp_data, converted_variable, out_ty def cuda_evaluate(self, variable, num_evaluations): From 37b917a2069669d533d43394a23d055d3953993d Mon Sep 17 00:00:00 2001 From: Jan Vesely Date: Sun, 21 Nov 2021 14:06:47 -0500 Subject: [PATCH 3/6] llvm/execution: Report timing information for creating binary structures Signed-off-by: Jan Vesely --- psyneulink/core/llvm/execution.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/psyneulink/core/llvm/execution.py b/psyneulink/core/llvm/execution.py index e3d48f4f496..c0759dc9f7a 100644 --- a/psyneulink/core/llvm/execution.py +++ b/psyneulink/core/llvm/execution.py @@ -18,6 +18,7 @@ from inspect import isgenerator import os import sys +import time from psyneulink.core import llvm as pnlvm @@ -75,11 +76,22 @@ def _get_compilation_param(self, name, init_method, arg): init_f = getattr(self._obj, init_method) if len(self._execution_contexts) > 1: struct_ty = struct_ty * len(self._execution_contexts) + init_start = time.time() initializer = (init_f(ex) for ex in self._execution_contexts) else: + init_start = time.time() initializer = init_f(self._execution_contexts[0]) + init_end = time.time() struct = struct_ty(*initializer) + struct_end = time.time() + + + if "time_stat" in self._debug_env: + print("Time to get initializer for struct:", name, + "for", self._obj.name, ":", init_end - init_start) + print("Time to instantiate struct:", name, + "for", self._obj.name, ":", struct_end - init_end) setattr(self, name, struct) if "stat" in self._debug_env: print("Instantiated struct:", name, "( size:" , From bb97db1f27e7f0cc14348dd731ddaf8698a01dbb Mon Sep 17 00:00:00 2001 From: Jan Vesely Date: Sun, 21 Nov 2021 14:34:35 -0500 Subject: [PATCH 4/6] llvm/debug: Remove unused debug setting 'no_ref_pass' All uses were removed in ba56af82585e2d61f5b5bd13d9a19b7ee3b60124 ("Refactor/autodiff (#1488)") Signed-off-by: Jan Vesely --- psyneulink/core/llvm/debug.py | 1 - 1 file changed, 1 deletion(-) diff --git a/psyneulink/core/llvm/debug.py b/psyneulink/core/llvm/debug.py index e297a7863ab..3c07ae1432a 100644 --- a/psyneulink/core/llvm/debug.py +++ b/psyneulink/core/llvm/debug.py @@ -33,7 +33,6 @@ instead of loading them from the param argument * "const_state" -- hardcode base context values into generate code, instead of laoding them from the context argument - * "no_ref_pass" -- Don't pass arguments to llvm functions by reference Compiled code dump: * "llvm" -- dumps LLVM IR into a file (named after the dumped module). From a2a48294e20e6390a3bfbc990aaa148a552b909c Mon Sep 17 00:00:00 2001 From: Jan Vesely Date: Sun, 21 Nov 2021 16:01:04 -0500 Subject: [PATCH 5/6] llvm/execution: Drop ctype types for input/output in FuncExecution We only need numpy dtype for input Signed-off-by: Jan Vesely --- psyneulink/core/llvm/execution.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/psyneulink/core/llvm/execution.py b/psyneulink/core/llvm/execution.py index c0759dc9f7a..c853594c415 100644 --- a/psyneulink/core/llvm/execution.py +++ b/psyneulink/core/llvm/execution.py @@ -170,7 +170,7 @@ def _cuda_conditions(self): @property def _cuda_out(self): if self._buffer_cuda_out is None: - size = ctypes.sizeof(self._vo_ty) + size = ctypes.sizeof(self._ct_vo) self._buffer_cuda_out = jit_engine.pycuda.driver.mem_alloc(size) return self._buffer_cuda_out @@ -186,7 +186,7 @@ def cuda_execute(self, variable): threads=len(self._execution_contexts)) # Copy the result from the device - ct_res = self.download_ctype(self._cuda_out, self._vo_ty, 'result') + ct_res = self.download_ctype(self._cuda_out, type(self._ct_vo), 'result') return _convert_ctype_to_python(ct_res) @@ -200,7 +200,7 @@ def __init__(self, component, execution_ids=[None], *, tags=frozenset()): ] self._component = component - par_struct_ty, ctx_struct_ty, vi_ty, vo_ty = self._bin_func.byref_arg_types + _, _, vi_ty, vo_ty = self._bin_func.byref_arg_types if len(execution_ids) > 1: self._bin_multirun = self._bin_func.get_multi_run() @@ -208,9 +208,7 @@ def __init__(self, component, execution_ids=[None], *, tags=frozenset()): vo_ty = vo_ty * len(execution_ids) vi_ty = vi_ty * len(execution_ids) - self._vo_ty = vo_ty self._ct_vo = vo_ty() - self._vi_ty = vi_ty self._vi_dty = _element_dtype(vi_ty) if "stat" in self._debug_env: print("Input struct size:", _pretty_size(ctypes.sizeof(vi_ty)), From 0185842dc13d849af2c674ad76bf9115379709ea Mon Sep 17 00:00:00 2001 From: Jan Vesely Date: Sun, 21 Nov 2021 16:02:40 -0500 Subject: [PATCH 6/6] llvm/execution: Improve comments and variable naming Signed-off-by: Jan Vesely --- psyneulink/core/llvm/execution.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/psyneulink/core/llvm/execution.py b/psyneulink/core/llvm/execution.py index c853594c415..ac722dd9d77 100644 --- a/psyneulink/core/llvm/execution.py +++ b/psyneulink/core/llvm/execution.py @@ -230,7 +230,7 @@ def _state_struct(self): def execute(self, variable): # Make sure function inputs are 2d. - # Mechanism inptus are already 3d so the first part is nop. + # Mechanism inputs are already 3d so the first part is nop. new_variable = np.asfarray(np.atleast_2d(variable), dtype=self._vi_dty) @@ -411,11 +411,14 @@ def _extract_node_struct(self, node, data): # followed by a list of projection parameters; get the first one # output structure consists of a list of node outputs, # followed by a list of nested data structures; get the first one - field = data._fields_[0][0] - res_struct = getattr(data, field) + field_name = data._fields_[0][0] + res_struct = getattr(data, field_name) + + # Get the index into the array of all nodes index = self._composition._get_node_index(node) - field = res_struct._fields_[index][0] - res_struct = getattr(res_struct, field) + field_name = res_struct._fields_[index][0] + res_struct = getattr(res_struct, field_name) + return _convert_ctype_to_python(res_struct) def extract_node_struct(self, node, struct): @@ -663,11 +666,11 @@ def _prepare_evaluate(self, variable, num_evaluations): bin_func = pnlvm.LLVMBinaryFunction.from_obj(ocm, tags=frozenset({"evaluate", "alloc_range"})) self.__bin_func = bin_func - assert len(bin_func.byref_arg_types) == 7 # There are 7 arguments to evaluate_alloc_range: # comp_param, comp_state, from, to, results, input, comp_data # all but #4 are shared + assert len(bin_func.byref_arg_types) == 7 # Directly initialized structures assert ocm.agent_rep is self._composition @@ -690,8 +693,7 @@ def cuda_evaluate(self, variable, num_evaluations): self._prepare_evaluate(variable, num_evaluations) self._uploaded_bytes['input'] += converted_variable.nbytes - # Ouput is allocated on device, but we need the ctype. - + # Output is allocated on device, but we need the ctype (out_ty). cuda_args = (self.upload_ctype(ct_comp_param, 'params'), self.upload_ctype(ct_comp_state, 'state'), jit_engine.pycuda.driver.mem_alloc(ctypes.sizeof(out_ty)),