llvm: Cleanup execution of compiled functions (#2214)
Reuse helper functions to generate structures for 'evaluate'.
Remove unused settings and type information.
Clarify comments and improve variable names.
jvesely authored Nov 25, 2021
2 parents 5d32243 + 0185842 commit 9239e36
Showing 2 changed files with 31 additions and 25 deletions.
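[Editor's note] The central piece of the cleanup is `_get_compilation_param`, a helper that builds a ctypes structure from an object's initializer on first use and caches it on the execution instance; the 'evaluate' path below is reworked to go through it. A minimal, self-contained sketch of that pattern follows; the `Executor` class and its constructor arguments are hypothetical stand-ins (single-context case only), while the method body mirrors the diff:

import ctypes
import time

class Executor:
    """Hypothetical stand-in illustrating the caching pattern in execution.py."""

    def __init__(self, obj, bin_func, execution_contexts, debug_env=frozenset()):
        self._obj = obj
        self._bin_func = bin_func
        self._execution_contexts = execution_contexts
        self._debug_env = debug_env

    def _get_compilation_param(self, name, init_method, arg):
        # Return the cached structure if it was already built.
        struct = getattr(self, name, None)
        if struct is None:
            struct_ty = self._bin_func.byref_arg_types[arg]
            init_f = getattr(self._obj, init_method)
            init_start = time.time()
            initializer = init_f(self._execution_contexts[0])
            init_end = time.time()
            # Instantiate the ctypes structure from the initializer tuple.
            struct = struct_ty(*initializer)
            if "time_stat" in self._debug_env:
                print("Time to get initializer for struct:", name,
                      ":", init_end - init_start)
            # Cache on the instance so later calls skip the rebuild.
            setattr(self, name, struct)
        return struct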
1 change: 0 additions & 1 deletion psyneulink/core/llvm/debug.py
@@ -33,7 +33,6 @@
instead of loading them from the param argument
* "const_state" -- hardcode base context values into generate code,
instead of laoding them from the context argument
* "no_ref_pass" -- Don't pass arguments to llvm functions by reference
Compiled code dump:
* "llvm" -- dumps LLVM IR into a file (named after the dumped module).
55 changes: 31 additions & 24 deletions psyneulink/core/llvm/execution.py
@@ -18,6 +18,7 @@
from inspect import isgenerator
import os
import sys
import time


from psyneulink.core import llvm as pnlvm
@@ -69,17 +70,28 @@ def __init__(self):
self._debug_env = debug_env

def _get_compilation_param(self, name, init_method, arg):
struct = getattr(self, name)
struct = getattr(self, name, None)
if struct is None:
struct_ty = self._bin_func.byref_arg_types[arg]
init_f = getattr(self._obj, init_method)
if len(self._execution_contexts) > 1:
struct_ty = struct_ty * len(self._execution_contexts)
init_start = time.time()
initializer = (init_f(ex) for ex in self._execution_contexts)
else:
init_start = time.time()
initializer = init_f(self._execution_contexts[0])

init_end = time.time()
struct = struct_ty(*initializer)
struct_end = time.time()


if "time_stat" in self._debug_env:
print("Time to get initializer for struct:", name,
"for", self._obj.name, ":", init_end - init_start)
print("Time to instantiate struct:", name,
"for", self._obj.name, ":", struct_end - init_end)
setattr(self, name, struct)
if "stat" in self._debug_env:
print("Instantiated struct:", name, "( size:" ,
@@ -158,7 +170,7 @@ def _cuda_conditions(self):
@property
def _cuda_out(self):
if self._buffer_cuda_out is None:
size = ctypes.sizeof(self._vo_ty)
size = ctypes.sizeof(self._ct_vo)
self._buffer_cuda_out = jit_engine.pycuda.driver.mem_alloc(size)
return self._buffer_cuda_out

@@ -174,7 +186,7 @@ def cuda_execute(self, variable):
threads=len(self._execution_contexts))

# Copy the result from the device
ct_res = self.download_ctype(self._cuda_out, self._vo_ty, 'result')
ct_res = self.download_ctype(self._cuda_out, type(self._ct_vo), 'result')
return _convert_ctype_to_python(ct_res)
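[Editor's note] Both hunks above replace the stored `self._vo_ty` with the stored instance `self._ct_vo`: a ctypes instance always knows its own (possibly array-multiplied) type, so keeping the type as a separate attribute was redundant. A quick illustration:

import ctypes

vo_ty = ctypes.c_double * 4        # e.g. output type widened for 4 contexts
ct_vo = vo_ty()                    # the instance kept as self._ct_vo
assert type(ct_vo) is vo_ty        # the type is recoverable via type()
assert ctypes.sizeof(ct_vo) == ctypes.sizeof(vo_ty)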


@@ -188,20 +200,15 @@ def __init__(self, component, execution_ids=[None], *, tags=frozenset()):
]
self._component = component

self._param = None
self._state = None

par_struct_ty, ctx_struct_ty, vi_ty, vo_ty = self._bin_func.byref_arg_types
_, _, vi_ty, vo_ty = self._bin_func.byref_arg_types

if len(execution_ids) > 1:
self._bin_multirun = self._bin_func.get_multi_run()
self._ct_len = ctypes.c_int(len(execution_ids))
vo_ty = vo_ty * len(execution_ids)
vi_ty = vi_ty * len(execution_ids)

self._vo_ty = vo_ty
self._ct_vo = vo_ty()
self._vi_ty = vi_ty
self._vi_dty = _element_dtype(vi_ty)
if "stat" in self._debug_env:
print("Input struct size:", _pretty_size(ctypes.sizeof(vi_ty)),
@@ -223,7 +230,7 @@ def _state_struct(self):

def execute(self, variable):
# Make sure function inputs are 2d.
# Mechanism inptus are already 3d so the first part is nop.
# Mechanism inputs are already 3d so the first part is nop.
new_variable = np.asfarray(np.atleast_2d(variable),
dtype=self._vi_dty)

@@ -271,9 +278,6 @@ def __init__(self, composition, execution_ids=[None], *, additional_tags=frozenset()):
self.__tags = frozenset(additional_tags)

self.__conds = None
self._state = None
self._param = None
self._data = None

if len(execution_ids) > 1:
self._ct_len = ctypes.c_int(len(execution_ids))
@@ -407,11 +411,14 @@ def _extract_node_struct(self, node, data):
# followed by a list of projection parameters; get the first one
# output structure consists of a list of node outputs,
# followed by a list of nested data structures; get the first one
field = data._fields_[0][0]
res_struct = getattr(data, field)
field_name = data._fields_[0][0]
res_struct = getattr(data, field_name)

# Get the index into the array of all nodes
index = self._composition._get_node_index(node)
field = res_struct._fields_[index][0]
res_struct = getattr(res_struct, field)
field_name = res_struct._fields_[index][0]
res_struct = getattr(res_struct, field_name)

return _convert_ctype_to_python(res_struct)

def extract_node_struct(self, node, struct):
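[Editor's note] The rename from `field` to `field_name` above underlines what the code does: `_fields_[i][0]` yields a field's attribute name, which `getattr` then resolves on the structure. A self-contained example of the same two-step extraction on a toy nested ctypes structure (the layouts are illustrative, not PsyNeuLink's real ones):

import ctypes

class NodeOutputs(ctypes.Structure):
    _fields_ = [("node_a", ctypes.c_double * 2),
                ("node_b", ctypes.c_double)]

class Data(ctypes.Structure):
    # node outputs first, then nested data, matching the comment above
    _fields_ = [("outputs", NodeOutputs),
                ("nested", ctypes.c_double)]

data = Data(NodeOutputs((ctypes.c_double * 2)(1.0, 2.0), 3.0), 0.0)

field_name = data._fields_[0][0]             # "outputs": the first field
res_struct = getattr(data, field_name)

index = 1                                    # index of the wanted node
field_name = res_struct._fields_[index][0]   # "node_b"
print(getattr(res_struct, field_name))       # -> 3.0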
@@ -656,20 +663,20 @@ def cuda_run(self, inputs, runs, num_input_sets):
def _prepare_evaluate(self, variable, num_evaluations):
ocm = self._composition.controller
assert len(self._execution_contexts) == 1
context = self._execution_contexts[0]

bin_func = pnlvm.LLVMBinaryFunction.from_obj(ocm, tags=frozenset({"evaluate", "alloc_range"}))
self.__bin_func = bin_func
assert len(bin_func.byref_arg_types) == 7

# There are 7 arguments to evaluate_alloc_range:
# comp_param, comp_state, from, to, results, input, comp_data
# all but #4 are shared
assert len(bin_func.byref_arg_types) == 7

# Directly initialized structures
ct_comp_param = bin_func.byref_arg_types[0](*ocm.agent_rep._get_param_initializer(context))
ct_comp_state = bin_func.byref_arg_types[1](*ocm.agent_rep._get_state_initializer(context))
ct_comp_data = bin_func.byref_arg_types[6](*ocm.agent_rep._get_data_initializer(context))
assert ocm.agent_rep is self._composition
ct_comp_param = self._get_compilation_param('_eval_param', '_get_param_initializer', 0)
ct_comp_state = self._get_compilation_param('_eval_state', '_get_state_initializer', 1)
ct_comp_data = self._get_compilation_param('_eval_data', '_get_data_initializer', 6)

# Construct input variable
var_dty = _element_dtype(bin_func.byref_arg_types[5])
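[Editor's note] Replacing the direct `bin_func.byref_arg_types[i](...)` construction with `_get_compilation_param` means repeated evaluations reuse the cached `_eval_param`/`_eval_state`/`_eval_data` structures instead of rebuilding them on every call. A toy demonstration of the caching semantics, reusing the `Executor` sketch from above with hypothetical stand-ins:

import ctypes

class _Obj:
    def _get_param_initializer(self, context):
        return (1.0, 2.0)

class _BinFunc:
    byref_arg_types = [ctypes.c_double * 2]

ex = Executor(_Obj(), _BinFunc(), [None])
first = ex._get_compilation_param('_eval_param', '_get_param_initializer', 0)
second = ex._get_compilation_param('_eval_param', '_get_param_initializer', 0)
assert first is second   # the second call returns the cached structure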
@@ -678,15 +685,15 @@ def _prepare_evaluate(self, variable, num_evaluations):
# Output ctype
out_ty = bin_func.byref_arg_types[4] * num_evaluations

# return variable as numpy array. pycuda can use it directly
return ct_comp_param, ct_comp_state, ct_comp_data, converted_variable, out_ty

def cuda_evaluate(self, variable, num_evaluations):
ct_comp_param, ct_comp_state, ct_comp_data, converted_variable, out_ty = \
self._prepare_evaluate(variable, num_evaluations)
self._uploaded_bytes['input'] += converted_variable.nbytes

# Ouput is allocated on device, but we need the ctype.

# Output is allocated on device, but we need the ctype (out_ty).
cuda_args = (self.upload_ctype(ct_comp_param, 'params'),
self.upload_ctype(ct_comp_state, 'state'),
jit_engine.pycuda.driver.mem_alloc(ctypes.sizeof(out_ty)),
