llvm: Use more aggressive optimizations for GPU target (#3109)
Drop PTXExec execution mode.
jvesely authored Nov 11, 2024
2 parents 5d16e23 + 86be704 commit f0a2e3b
Showing 6 changed files with 11 additions and 35 deletions.
1 change: 0 additions & 1 deletion conftest.py
@@ -190,7 +190,6 @@ def get_comp_execution_modes():
pytest.param(pnlvm.ExecutionMode.LLVM, marks=pytest.mark.llvm),
pytest.param(pnlvm.ExecutionMode.LLVMExec, marks=pytest.mark.llvm),
pytest.param(pnlvm.ExecutionMode.LLVMRun, marks=pytest.mark.llvm),
pytest.param(pnlvm.ExecutionMode.PTXExec, marks=[pytest.mark.llvm, pytest.mark.cuda]),
pytest.param(pnlvm.ExecutionMode.PTXRun, marks=[pytest.mark.llvm, pytest.mark.cuda])
]
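For reference, a minimal sketch (hypothetical test; assumes the repository's comp_mode parametrization hook, as seen in the test files below) of how these execution modes are consumed — one test body covers the Python, LLVM, and remaining PTX paths:

import numpy as np
import psyneulink as pnl
import pytest

@pytest.mark.composition
def test_identity(comp_mode):
    # comp_mode is parametrized from the list above
    mech = pnl.ProcessingMechanism(name='mech')
    comp = pnl.Composition(nodes=[mech])
    result = comp.run(inputs={mech: [[1.5]]}, execution_mode=comp_mode)
    assert np.allclose(result, [[1.5]])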

21 changes: 6 additions & 15 deletions psyneulink/core/compositions/composition.py
@@ -2053,9 +2053,6 @@ def input_function(env, result):
* `ExecutionMode.PTXRun` -- compile multiple `TRIAL <TimeScale.TRIAL>`\\s for execution on GPU
(see `below <Composition_Compilation_PTX>` for additional details).

* `ExecutionMode.PTXExec` -- compile individual `TRIAL <TimeScale.TRIAL>`\\s for execution on GPU
(see `below <Composition_Compilation_PTX>` for additional details).

.. _Composition_Compilation_PyTorch:

*PyTorch support.* When using an `AutodiffComposition`, `ExecutionMode.PyTorch` can be used to execute its
@@ -2067,15 +2064,11 @@ def input_function(env, result):
*GPU support.* In addition to compilation for CPUs, support is being developed for `CUDA
<https://developer.nvidia.com/about-cuda>`_ capable `Nvidia GPUs
<https://en.wikipedia.org/wiki/List_of_Nvidia_graphics_processing_units>`_. This can be invoked by
specifying either `ExecutionMode.PTXRun` or `ExecutionMode.PTXExec` in the **execution_mode** argument
of a `Composition execution method <Composition_Execution_Methods>`, which are equivalent to the LLVM
counterparts but run in a single thread of a CUDA capable GPU. This requires that a working `pycuda package
<https://documen.tician.de/pycuda/>`_ is `installed <https://wiki.tiker.net/PyCuda/Installation>`_, and that
CUDA execution is explicitly enabled by setting the ``PNL_LLVM_DEBUG`` environment variable to ``cuda``. At present,
compilation using these modes runs on a single GPU thread, and therefore does not produce any performance benefit
over running in compiled mode on a CPU (see `this <https://github.com/PrincetonUniversity/PsyNeuLink/projects/1>`_
for progress on extending parallelization support in compiled modes).

specifying `ExecutionMode.PTXRun` in the **execution_mode** argument of a `Composition execution
method <Composition_Execution_Methods>`, which is equivalent to its LLVM counterpart but runs in a single
thread of a CUDA capable GPU. This requires that a working `pycuda package <https://documen.tician.de/pycuda/>`_ is
`installed <https://wiki.tiker.net/PyCuda/Installation>`_, and that CUDA execution is not explicitly disabled by
setting the ``PNL_LLVM_DEBUG`` environment variable to ``nocuda``.
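For illustration, a minimal sketch (hypothetical model; assumes a working pycuda installation and that ``PNL_LLVM_DEBUG`` is not set to ``nocuda``) of invoking GPU execution:

import psyneulink as pnl

mech = pnl.ProcessingMechanism(name='mech')
comp = pnl.Composition(nodes=[mech])

# Compiles the whole run and executes all trials in a single
# thread of a CUDA-capable GPU.
results = comp.run(inputs={mech: [[1.0], [2.0]]},
                   execution_mode=pnl.ExecutionMode.PTXRun)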

.. _Composition_Execution_Results_and_Reporting:

@@ -11841,7 +11834,7 @@ def execute(
called after each `PASS` is executed and
passed the current *context* (though your callable need not accept it).

execution_mode : enum.Enum[Auto|LLVM|LLVMExec|Python|PTXExec] : default Python
execution_mode : enum.Enum[Auto|LLVM|LLVMExec|Python] : default Python
specifies whether to run using the Python interpreter or a `compiled mode <Composition_Compilation>`.
see **execution_mode** argument of `run <Composition.run>` method for additional details.

@@ -11965,8 +11958,6 @@ def execute(
_comp_ex = pnlvm.CompExecution.get(self, context)
if execution_mode & pnlvm.ExecutionMode.LLVM:
_comp_ex.execute(llvm_inputs)
elif execution_mode & pnlvm.ExecutionMode.PTX:
_comp_ex.cuda_execute(llvm_inputs)
else:
assert False, "Unknown execution mode: {}".format(execution_mode)

4 changes: 0 additions & 4 deletions psyneulink/core/llvm/__init__.py
@@ -70,9 +70,6 @@ class ExecutionMode(enum.Flag):
PTX
compile and run Composition `Nodes <Composition_Nodes>` and `Projections <Projection>` using CUDA for GPU.
PTXExec
compile and run each `TRIAL <TimeScale.TRIAL>` using CUDA for GPU.
PTXRun
compile and run multiple `TRIAL <TimeScale.TRIAL>`\\s using CUDA for GPU.
"""
@@ -89,7 +86,6 @@ class ExecutionMode(enum.Flag):
LLVMRun = LLVM | _Run
LLVMExec = LLVM | _Exec
PTXRun = PTX | _Run
PTXExec = PTX | _Exec
COMPILED = ~ (Python | PyTorch)
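Since ExecutionMode is an enum.Flag, each compiled mode is a backend flag OR-ed with a granularity flag; a minimal sketch (not part of the diff) of the resulting membership tests:

from psyneulink.core import llvm as pnlvm

mode = pnlvm.ExecutionMode.PTXRun
assert mode & pnlvm.ExecutionMode.PTX         # CUDA/PTX backend selected
assert mode & pnlvm.ExecutionMode.COMPILED    # i.e., neither Python nor PyTorch
assert not (mode & pnlvm.ExecutionMode.LLVM)  # distinct from the CPU backend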


9 changes: 0 additions & 9 deletions psyneulink/core/llvm/execution.py
@@ -501,15 +501,6 @@ def execute(self, inputs):
self._data_struct,
self._conditions)

def cuda_execute(self, inputs):
# NOTE: Make sure that input struct generation is inlined.
# We need the binary function to be set up for it to work correctly.
self._bin_exec_func.cuda_call(self._cuda_state_struct,
self._cuda_param_struct,
jit_engine.pycuda.driver.In(self._get_input_struct(inputs)),
self._cuda_data_struct,
self._cuda_conditions)

# Methods used to accelerate "Run"
def _get_run_input_struct(self, inputs, num_input_sets, arg=3):
# Callers that override input arg, should ensure that _bin_func is not None
8 changes: 5 additions & 3 deletions psyneulink/core/llvm/jit_engine.py
@@ -107,15 +107,17 @@ def _ptx_jit_constructor():

# PassManagerBuilder is used only for inlining simple functions
__pass_manager_builder = binding.PassManagerBuilder()
__pass_manager_builder.opt_level = 0
__pass_manager_builder.opt_level = 2
__pass_manager_builder.size_level = 1
# The threshold of '7' is empirically selected.
__pass_manager_builder.inlining_threshold = 7

# The threshold of '64' is empirically selected on GF 3050
__pass_manager_builder.inlining_threshold = 64

# Use default device
# TODO: Add support for multiple devices
__compute_capability = pycuda_default.device.compute_capability()
__ptx_sm = "sm_{}{}".format(__compute_capability[0], __compute_capability[1])

# Create compilation target, use 64bit triple
__ptx_target = binding.Target.from_triple("nvptx64-nvidia-cuda")
__ptx_target_machine = __ptx_target.create_target_machine(cpu=__ptx_sm, opt=opt_level)
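For context, a minimal standalone sketch of the pass pipeline configured above, using llvmlite's legacy pass-manager API (available in the llvmlite versions this code targets) with the new -O2/size settings; the module contents are hypothetical:

from llvmlite import binding

binding.initialize()
binding.initialize_native_target()
binding.initialize_native_asmprinter()

pmb = binding.PassManagerBuilder()
pmb.opt_level = 2            # was 0; enables -O2-style scalar optimizations
pmb.size_level = 1           # prefer smaller code for the GPU target
pmb.inlining_threshold = 64  # empirically selected (see the diff above)

pm = binding.ModulePassManager()
pmb.populate(pm)

mod = binding.parse_assembly("define i32 @f() { ret i32 0 }")
pm.run(mod)                  # optimizes the module in place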
3 changes: 0 additions & 3 deletions tests/composition/test_composition.py
@@ -3737,7 +3737,6 @@ def test_run_2_mechanisms_double_trial_specs(self, comp_mode):
@pytest.mark.parametrize("mode", [pnl.ExecutionMode.Python,
pytest.param(pnl.ExecutionMode.LLVM, marks=pytest.mark.llvm),
pytest.param(pnl.ExecutionMode.LLVMExec, marks=pytest.mark.llvm),
pytest.param(pnl.ExecutionMode.PTXExec, marks=[pytest.mark.llvm, pytest.mark.cuda]),
])
def test_execute_composition(self, mode):
comp = Composition()
@@ -3841,7 +3840,6 @@ def test_LPP_wrong_component(self):
@pytest.mark.parametrize("mode", [pnl.ExecutionMode.Python,
pytest.param(pnl.ExecutionMode.LLVM, marks=pytest.mark.llvm),
pytest.param(pnl.ExecutionMode.LLVMExec, marks=pytest.mark.llvm),
pytest.param(pnl.ExecutionMode.PTXExec, marks=[pytest.mark.llvm, pytest.mark.cuda]),
])
def test_execute_no_inputs(self, mode):
m_inner = ProcessingMechanism(input_shapes=2)
@@ -6606,7 +6604,6 @@ class TestProperties:
pytest.param(pnl.ExecutionMode.LLVM, marks=[_fallback_xfail, pytest.mark.llvm]),
pytest.param(pnl.ExecutionMode.LLVMExec, marks=[_fallback_xfail, pytest.mark.llvm]),
pytest.param(pnl.ExecutionMode.LLVMRun, marks=[_fallback_xfail, pytest.mark.llvm]),
pytest.param(pnl.ExecutionMode.PTXExec, marks=[_fallback_xfail, pytest.mark.llvm, pytest.mark.cuda]),
pytest.param(pnl.ExecutionMode.PTXRun, marks=[_fallback_xfail, pytest.mark.llvm, pytest.mark.cuda]),
])
def test_llvm_fallback(self, mode):
