llvm: Switch compiled 'evaluate' to use composition simulation input (#2466)

Prepares for a future change that passes multiple inputs for multiple simulation trials.
Avoids duplicate construction of the "predicated input" in each evaluation.
Allows better use of shared memory in GPU execution.
jvesely authored Aug 16, 2022
2 parents 4fdb0cb + aa63fc7 commit 47f99cc
Showing 4 changed files with 63 additions and 61 deletions.
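In short (a condensed, illustrative digest of the hunks below, not runnable on its own):

    # Before: the controller's "predicated input" was rebuilt for every evaluation.
    variable = [input_port.parameters.value.get(context) for input_port in ocm.input_ports]
    outcomes = comp_exec.thread_evaluate(variable, num_evals)

    # After: the composition-level input is parsed once and shared by every
    # simulated evaluation; num_inputs_sets prepares for multiple trials per run.
    state_features = ocm.parameters.state_feature_values._get(context)
    inputs, num_inputs_sets = ocm.agent_rep._parse_run_inputs(state_features, context)
    outcomes = comp_exec.thread_evaluate(inputs, num_inputs_sets, num_evals)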
@@ -769,18 +769,22 @@ def _is_static(it:SampleIterator):
             return False

         assert all(_is_static(sample_iterator) for sample_iterator in self.search_space)

         assert ocm is ocm.agent_rep.controller
-        # Compiled evaluate expects the same variable as mech function
-        variable = [input_port.parameters.value.get(context) for input_port in ocm.input_ports]
+        # Compiled evaluate expects the same variable as composition
+        state_features = ocm.parameters.state_feature_values._get(context)
+        inputs, num_inputs_sets = ocm.agent_rep._parse_run_inputs(state_features, context)

         num_evals = np.prod([d.num for d in self.search_space])

         # Map allocations to values
         comp_exec = pnlvm.execution.CompExecution(ocm.agent_rep, [context.execution_id])
         execution_mode = ocm.parameters.comp_execution_mode._get(context)
         if execution_mode == "PTX":
-            outcomes = comp_exec.cuda_evaluate(variable, num_evals)
+            outcomes = comp_exec.cuda_evaluate(inputs, num_inputs_sets, num_evals)
         elif execution_mode == "LLVM":
-            outcomes = comp_exec.thread_evaluate(variable, num_evals)
+            outcomes = comp_exec.thread_evaluate(inputs, num_inputs_sets, num_evals)
         else:
             assert False, f"Unknown execution mode for {ocm.name}: {execution_mode}."

@@ -1744,14 +1748,46 @@ def _gen_llvm_select_min_function(self, *, ctx:pnlvm.LLVMBuilderContext, tags:frozenset):
         return builder.function

     def _gen_llvm_function_body(self, ctx, builder, params, state_features, arg_in, arg_out, *, tags:frozenset):
-        ocm = self._get_optimized_controller()
-        if ocm is not None:
-            assert ocm.function is self
-            obj_func = ctx.import_llvm_function(ocm, tags=tags.union({"evaluate"}))
+        controller = self._get_optimized_controller()
+        if controller is not None:
+            assert controller.function is self
+            obj_func = ctx.import_llvm_function(controller, tags=tags.union({"evaluate"}))
             comp_args = builder.function.args[-3:]
             obj_param_ptr = comp_args[0]
             obj_state_ptr = comp_args[1]
-            extra_args = [arg_in, comp_args[2]]
+
+            # Construct input
+            comp_input = builder.alloca(obj_func.args[4].type.pointee, name="sim_input")
+
+            input_initialized = [False] * len(comp_input.type.pointee)
+            for src_idx, ip in enumerate(controller.input_ports):
+                if ip.shadow_inputs is None:
+                    continue
+
+                # shadow inputs point to an input port of a node.
+                # If that node takes direct input, it will have an associated
+                # (input_port, output_port) in the input_CIM.
+                # Take the former as an index to composition input variable.
+                cim_in_port = controller.agent_rep.input_CIM_ports[ip.shadow_inputs][0]
+                dst_idx = controller.agent_rep.input_CIM.input_ports.index(cim_in_port)
+
+                # Check that all inputs are unique
+                assert not input_initialized[dst_idx], "Double initialization of input {}".format(dst_idx)
+                input_initialized[dst_idx] = True
+
+                src = builder.gep(arg_in, [ctx.int32_ty(0), ctx.int32_ty(src_idx)])
+                # Destination is a struct of 2d arrays
+                dst = builder.gep(comp_input, [ctx.int32_ty(0),
+                                               ctx.int32_ty(dst_idx),
+                                               ctx.int32_ty(0)])
+                builder.store(builder.load(src), dst)
+
+            # Assert that we have populated all inputs
+            assert all(input_initialized), \
+                "Not all inputs to the simulated composition are initialized: {}".format(input_initialized)
+
+            # Extra args: input and data
+            extra_args = [comp_input, comp_args[2]]
         else:
             obj_func = ctx.import_llvm_function(self.objective_function)
             obj_state_ptr = pnlvm.helpers.get_state_ptr(builder, self, state_features,
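The index mapping built by the IR above can be mirrored in plain Python. A sketch, assuming controller_inputs is a list indexed in step with controller.input_ports (a simplified model of the logic, not PsyNeuLink API):

    def map_shadow_inputs(controller, controller_inputs):
        # Mirrors the IR above: copy each shadowing input port's value into the
        # slot of the simulated composition's input that feeds the shadowed port.
        comp = controller.agent_rep
        comp_input = [None] * len(comp.input_CIM.input_ports)
        for src_idx, ip in enumerate(controller.input_ports):
            if ip.shadow_inputs is None:
                continue
            # input_CIM_ports maps a shadowed port to its (input_port, output_port)
            # pair on the input_CIM; the input_port's position gives the slot index.
            cim_in_port = comp.input_CIM_ports[ip.shadow_inputs][0]
            dst_idx = comp.input_CIM.input_ports.index(cim_in_port)
            assert comp_input[dst_idx] is None, "Double initialization of input {}".format(dst_idx)
            comp_input[dst_idx] = controller_inputs[src_idx]
        assert all(v is not None for v in comp_input), "Not all inputs initialized"
        return comp_input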
@@ -3196,10 +3196,6 @@ def evaluate_agent_rep(self, control_allocation, context=None, return_results=False):
             context=context
         )

-    def _get_evaluate_input_struct_type(self, ctx):
-        # We construct input from optimization function input
-        return ctx.get_input_struct_type(self.function)
-
     def _get_evaluate_output_struct_type(self, ctx):
         # Returns a scalar that is the predicted net_outcome
         return ctx.float_ty
@@ -3326,15 +3322,15 @@ def _gen_llvm_evaluate_function(self, *, ctx:pnlvm.LLVMBuilderContext, tags=frozenset()):
                 ctx.get_state_struct_type(self.agent_rep).as_pointer(),
                 self._get_evaluate_alloc_struct_type(ctx).as_pointer(),
                 self._get_evaluate_output_struct_type(ctx).as_pointer(),
-                self._get_evaluate_input_struct_type(ctx).as_pointer(),
+                ctx.get_input_struct_type(self.agent_rep).as_pointer(),
                 ctx.get_data_struct_type(self.agent_rep).as_pointer()]

         builder = ctx.create_llvm_function(args, self, str(self) + "_evaluate")
         llvm_func = builder.function
         for p in llvm_func.args:
             p.attributes.add('nonnull')

-        comp_params, base_comp_state, allocation_sample, arg_out, arg_in, base_comp_data = llvm_func.args
+        comp_params, base_comp_state, allocation_sample, arg_out, comp_input, base_comp_data = llvm_func.args

         if "const_params" in debug_env:
             comp_params = builder.alloca(comp_params.type.pointee, name="const_params_loc")
@@ -3390,37 +3386,8 @@ def _gen_llvm_evaluate_function(self, *, ctx:pnlvm.LLVMBuilderContext, tags=frozenset()):
                                                 ctx.int32_ty(0)])
         builder.store(builder.load(sample_ptr), sample_dst)

-        # Construct input
-        comp_input = builder.alloca(sim_f.args[3].type.pointee, name="sim_input")
-
-        input_initialized = [False] * len(comp_input.type.pointee)
-        for src_idx, ip in enumerate(self.input_ports):
-            if ip.shadow_inputs is None:
-                continue
-
-            # shadow inputs point to an input port of a node.
-            # If that node takes direct input, it will have an associated
-            # (input_port, output_port) in the input_CIM.
-            # Take the former as an index to composition input variable.
-            cim_in_port = self.agent_rep.input_CIM_ports[ip.shadow_inputs][0]
-            dst_idx = self.agent_rep.input_CIM.input_ports.index(cim_in_port)
-
-            # Check that all inputs are unique
-            assert not input_initialized[dst_idx], "Double initialization of input {}".format(dst_idx)
-            input_initialized[dst_idx] = True
-
-            src = builder.gep(arg_in, [ctx.int32_ty(0), ctx.int32_ty(src_idx)])
-            # Destination is a struct of 2d arrays
-            dst = builder.gep(comp_input, [ctx.int32_ty(0),
-                                           ctx.int32_ty(dst_idx),
-                                           ctx.int32_ty(0)])
-            builder.store(builder.load(src), dst)
-
-        # Assert that we have populated all inputs
-        assert all(input_initialized), \
-            "Not all inputs to the simulated composition are initialized: {}".format(input_initialized)
-
         if "const_input" in debug_env:
+            comp_input = builder.alloca(sim_f.args[3].type.pointee, name="sim_input")
             if not debug_env["const_input"]:
                 input_init = [[os.defaults.variable.tolist()] for os in self.agent_rep.input_CIM.input_ports]
                 print("Setting default input: ", input_init)
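For reference, the compiled evaluate entry point's argument layout after this change, following the unpacking shown above (role descriptions inferred from the struct types in the args list):

    #   0: comp_params        - parameter struct
    #   1: base_comp_state    - agent_rep state struct
    #   2: allocation_sample  - control allocation being evaluated
    #   3: arg_out            - scalar predicted net_outcome
    #   4: comp_input         - agent_rep (composition) input struct, replacing the OCM's own input struct
    #   5: base_comp_data     - agent_rep data struct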
psyneulink/core/llvm/execution.py (27 changes: 13 additions & 14 deletions)
@@ -676,7 +676,7 @@ def cuda_run(self, inputs, runs, num_input_sets):
         assert runs_np[0] <= runs, "Composition ran more times than allowed!"
         return _convert_ctype_to_python(ct_out)[0:runs_np[0]]

-    def _prepare_evaluate(self, variable, num_evaluations):
+    def _prepare_evaluate(self, inputs, num_input_sets, num_evaluations):
         ocm = self._composition.controller
         assert len(self._execution_contexts) == 1

@@ -695,25 +695,23 @@ def _prepare_evaluate(self, variable, num_evaluations):
         ct_comp_data = self._get_compilation_param('_eval_data', '_get_data_initializer', 6)

         # Construct input variable
-        var_dty = _element_dtype(bin_func.byref_arg_types[5])
-        converted_variable = np.concatenate(variable, dtype=var_dty)
+        ct_inputs = self._get_run_input_struct(inputs, num_input_sets)

         # Output ctype
         out_ty = bin_func.byref_arg_types[4] * num_evaluations

         # return variable as numpy array. pycuda can use it directly
-        return ct_comp_param, ct_comp_state, ct_comp_data, converted_variable, out_ty
+        return ct_comp_param, ct_comp_state, ct_comp_data, ct_inputs, out_ty

-    def cuda_evaluate(self, variable, num_evaluations):
-        ct_comp_param, ct_comp_state, ct_comp_data, converted_variable, out_ty = \
-            self._prepare_evaluate(variable, num_evaluations)
-        self._uploaded_bytes['input'] += converted_variable.nbytes
+    def cuda_evaluate(self, inputs, num_input_sets, num_evaluations):
+        ct_comp_param, ct_comp_state, ct_comp_data, ct_inputs, out_ty = \
+            self._prepare_evaluate(inputs, num_input_sets, num_evaluations)

         # Output is allocated on device, but we need the ctype (out_ty).
         cuda_args = (self.upload_ctype(ct_comp_param, 'params'),
                      self.upload_ctype(ct_comp_state, 'state'),
                      jit_engine.pycuda.driver.mem_alloc(ctypes.sizeof(out_ty)),
-                     jit_engine.pycuda.driver.In(converted_variable),
+                     self.upload_ctype(ct_inputs, 'input'),
                      self.upload_ctype(ct_comp_data, 'data'),
                      )

@@ -722,12 +720,11 @@ def cuda_evaluate(self, variable, num_evaluations):

         return ct_results

-    def thread_evaluate(self, variable, num_evaluations):
-        ct_param, ct_state, ct_data, converted_variable, out_ty = \
-            self._prepare_evaluate(variable, num_evaluations)
+    def thread_evaluate(self, inputs, num_input_sets, num_evaluations):
+        ct_param, ct_state, ct_data, ct_inputs, out_ty = \
+            self._prepare_evaluate(inputs, num_input_sets, num_evaluations)

         ct_results = out_ty()
-        ct_variable = converted_variable.ctypes.data_as(self.__bin_func.c_func.argtypes[5])
         jobs = min(os.cpu_count(), num_evaluations)
         evals_per_job = (num_evaluations + jobs - 1) // jobs

@@ -738,7 +735,9 @@ def thread_evaluate(self, variable, num_evaluations):
             results = [ex.submit(self.__bin_func, ct_param, ct_state,
                                  int(i * evals_per_job),
                                  min((i + 1) * evals_per_job, num_evaluations),
-                                 ct_results, ct_variable, ct_data)
+                                 ct_results,
+                                 ctypes.cast(ctypes.byref(ct_inputs), self.__bin_func.c_func.argtypes[5]),
+                                 ct_data)
                        for i in range(jobs)]

parallel_stop = time.time()
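The job split in thread_evaluate is plain ceil division over a thread pool. A standalone sketch, with a generic evaluate_range callable standing in for the compiled binary function (hypothetical helper, not PsyNeuLink API):

    import concurrent.futures
    import os

    def run_partitioned(evaluate_range, num_evaluations):
        # Same scheme as thread_evaluate above: at most cpu_count() workers,
        # each handed one contiguous [start, stop) slice of the evaluations.
        jobs = min(os.cpu_count(), num_evaluations)
        evals_per_job = (num_evaluations + jobs - 1) // jobs
        with concurrent.futures.ThreadPoolExecutor(max_workers=jobs) as ex:
            futures = [ex.submit(evaluate_range,
                                 i * evals_per_job,
                                 min((i + 1) * evals_per_job, num_evaluations))
                       for i in range(jobs)]
            return [f.result() for f in futures]

    # Example: run_partitioned(lambda start, stop: list(range(start, stop)), 10)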
psyneulink/core/llvm/helpers.py (4 changes: 2 additions & 2 deletions)
@@ -443,15 +443,15 @@ def printf_float_array(builder, array, prefix="", suffix="\n", override_debug=False):
     printf(builder, prefix, override_debug=override_debug)

     with array_ptr_loop(builder, array, "print_array_loop") as (b1, i):
-        printf(b1, "%lf ", b1.load(b1.gep(array, [ir.IntType(32)(0), i])), override_debug=override_debug)
+        printf(b1, "%lf ", b1.load(b1.gep(array, [i.type(0), i])), override_debug=override_debug)

     printf(builder, suffix, override_debug=override_debug)


 def printf_float_matrix(builder, matrix, prefix="", suffix="\n", override_debug=False):
     printf(builder, prefix, override_debug=override_debug)
     with array_ptr_loop(builder, matrix, "print_row_loop") as (b1, i):
-        row = b1.gep(matrix, [ir.IntType(32)(0), i])
+        row = b1.gep(matrix, [i.type(0), i])
         printf_float_array(b1, row, suffix="\n", override_debug=override_debug)
     printf(builder, suffix, override_debug=override_debug)

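The helpers.py change derives the leading zero GEP index from the loop counter's own type (i.type(0)) instead of hard-coding ir.IntType(32)(0). A minimal, self-contained llvmlite sketch of the idiom (assumed setup, not PsyNeuLink code):

    from llvmlite import ir

    mod = ir.Module(name="gep_idiom")
    arr_ty = ir.ArrayType(ir.DoubleType(), 4)
    fnty = ir.FunctionType(ir.DoubleType(), [arr_ty.as_pointer(), ir.IntType(32)])
    fn = ir.Function(mod, fnty, name="load_elem")
    builder = ir.IRBuilder(fn.append_basic_block("entry"))

    array, i = fn.args
    # Taking the zero constant from i's own type keeps both GEP indices
    # consistent without repeating the hard-coded integer width.
    elem_ptr = builder.gep(array, [i.type(0), i])
    builder.ret(builder.load(elem_ptr))
    print(mod)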
