llvm: Use more aggressive optimizations for GPU target (#3109)
Drop PTXExec execution mode.
jvesely authored Nov 11, 2024
2 parents 5d16e23 + 86be704 commit f0a2e3b
Showing 6 changed files with 11 additions and 35 deletions.
1 change: 0 additions & 1 deletion conftest.py
@@ -190,7 +190,6 @@ def get_comp_execution_modes():
pytest.param(pnlvm.ExecutionMode.LLVM, marks=pytest.mark.llvm),
pytest.param(pnlvm.ExecutionMode.LLVMExec, marks=pytest.mark.llvm),
pytest.param(pnlvm.ExecutionMode.LLVMRun, marks=pytest.mark.llvm),
pytest.param(pnlvm.ExecutionMode.PTXExec, marks=[pytest.mark.llvm, pytest.mark.cuda]),
pytest.param(pnlvm.ExecutionMode.PTXRun, marks=[pytest.mark.llvm, pytest.mark.cuda])
]
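For reference, a minimal sketch (hypothetical test; assumes the repository's comp_mode parametrization hook, as seen in the test files below) of how these execution modes are consumed — one test body covers the Python, LLVM, and remaining PTX paths:

import numpy as np
import psyneulink as pnl
import pytest

@pytest.mark.composition
def test_identity(comp_mode):
    # comp_mode is parametrized from the list above
    mech = pnl.ProcessingMechanism(name='mech')
    comp = pnl.Composition(nodes=[mech])
    result = comp.run(inputs={mech: [[1.5]]}, execution_mode=comp_mode)
    assert np.allclose(result, [[1.5]])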

21 changes: 6 additions & 15 deletions psyneulink/core/compositions/composition.py
@@ -2053,9 +2053,6 @@ def input_function(env, result):
* `ExecutionMode.PTXRun` -- compile multiple `TRIAL <TimeScale.TRIAL>`\\s for execution on GPU
(see `below <Composition_Compilation_PTX>` for additional details).

* `ExecutionMode.PTXExec` -- compile individual `TRIAL <TimeScale.TRIAL>`\\s for execution on GPU
(see `below <Composition_Compilation_PTX>` for additional details).

.. _Composition_Compilation_PyTorch:

*PyTorch support.* When using an `AutodiffComposition`, `ExecutionMode.PyTorch` can be used to execute its
@@ -2067,15 +2064,11 @@ def input_function(env, result):
*GPU support.* In addition to compilation for CPUs, support is being developed for `CUDA
<https://developer.nvidia.com/about-cuda>`_ capable `Nvidia GPUs
<https://en.wikipedia.org/wiki/List_of_Nvidia_graphics_processing_units>`_. This can be invoked by
specifying either `ExecutionMode.PTXRun` or `ExecutionMode.PTXExec` in the **execution_mode** argument
of a `Composition execution method <Composition_Execution_Methods>`, which are equivalent to the LLVM
counterparts but run in a single thread of a CUDA capable GPU. This requires that a working `pycuda package
<https://documen.tician.de/pycuda/>`_ is `installed <https://wiki.tiker.net/PyCuda/Installation>`_, and that
CUDA execution is explicitly enabled by setting the ``PNL_LLVM_DEBUG`` environment variable to ``cuda``. At present,
compilation using these modes runs on a single GPU thread, and therefore does not produce any performance benefit
over running in compiled mode on a CPU (see `this <https://github.com/PrincetonUniversity/PsyNeuLink/projects/1>`_
for progress on extending parallelization support in compiled modes).

specifying `ExecutionMode.PTXRun` in the **execution_mode** argument of a `Composition execution
method <Composition_Execution_Methods>`, which is equivalent to its LLVM counterpart but runs in a single
thread of a CUDA capable GPU. This requires that a working `pycuda package <https://documen.tician.de/pycuda/>`_ is
`installed <https://wiki.tiker.net/PyCuda/Installation>`_, and that CUDA execution is not explicitly disabled by
setting the ``PNL_LLVM_DEBUG`` environment variable to ``nocuda``.
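For illustration, a minimal sketch (hypothetical model; assumes a working pycuda installation and that ``PNL_LLVM_DEBUG`` is not set to ``nocuda``) of invoking GPU execution:

import psyneulink as pnl

mech = pnl.ProcessingMechanism(name='mech')
comp = pnl.Composition(nodes=[mech])

# Compiles the whole run and executes all trials in a single
# thread of a CUDA-capable GPU.
results = comp.run(inputs={mech: [[1.0], [2.0]]},
                   execution_mode=pnl.ExecutionMode.PTXRun)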

.. _Composition_Execution_Results_and_Reporting:

@@ -11841,7 +11834,7 @@ def execute(
called after each `PASS` is executed and
passed the current *context* (though your callable need not accept it).

execution_mode : enum.Enum[Auto|LLVM|LLVMExec|Python|PTXExec] : default Python
execution_mode : enum.Enum[Auto|LLVM|LLVMExec|Python] : default Python
specifies whether to run using the Python interpreter or a `compiled mode <Composition_Compilation>`.
see **execution_mode** argument of `run <Composition.run>` method for additional details.

@@ -11965,8 +11958,6 @@ def execute(
_comp_ex = pnlvm.CompExecution.get(self, context)
if execution_mode & pnlvm.ExecutionMode.LLVM:
_comp_ex.execute(llvm_inputs)
elif execution_mode & pnlvm.ExecutionMode.PTX:
_comp_ex.cuda_execute(llvm_inputs)
else:
assert False, "Unknown execution mode: {}".format(execution_mode)

4 changes: 0 additions & 4 deletions psyneulink/core/llvm/__init__.py
@@ -70,9 +70,6 @@ class ExecutionMode(enum.Flag):
PTX
compile and run Composition `Nodes <Composition_Nodes>` and `Projections <Projection>` using CUDA for GPU.
PTXExec
compile and run each `TRIAL <TimeScale.TRIAL>` using CUDA for GPU.
PTXRun
compile and run multiple `TRIAL <TimeScale.TRIAL>`\\s using CUDA for GPU.
"""
@@ -89,7 +86,6 @@ class ExecutionMode(enum.Flag):
LLVMRun = LLVM | _Run
LLVMExec = LLVM | _Exec
PTXRun = PTX | _Run
PTXExec = PTX | _Exec
COMPILED = ~ (Python | PyTorch)
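Since ExecutionMode is an enum.Flag, each compiled mode is a backend flag OR-ed with a granularity flag; a minimal sketch (not part of the diff) of the resulting membership tests:

from psyneulink.core import llvm as pnlvm

mode = pnlvm.ExecutionMode.PTXRun
assert mode & pnlvm.ExecutionMode.PTX         # CUDA/PTX backend selected
assert mode & pnlvm.ExecutionMode.COMPILED    # i.e., neither Python nor PyTorch
assert not (mode & pnlvm.ExecutionMode.LLVM)  # distinct from the CPU backend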


9 changes: 0 additions & 9 deletions psyneulink/core/llvm/execution.py
@@ -501,15 +501,6 @@ def execute(self, inputs):
self._data_struct,
self._conditions)

def cuda_execute(self, inputs):
# NOTE: Make sure that input struct generation is inlined.
# We need the binary function to be set up for it to work correctly.
self._bin_exec_func.cuda_call(self._cuda_state_struct,
self._cuda_param_struct,
jit_engine.pycuda.driver.In(self._get_input_struct(inputs)),
self._cuda_data_struct,
self._cuda_conditions)

# Methods used to accelerate "Run"
def _get_run_input_struct(self, inputs, num_input_sets, arg=3):
# Callers that override input arg, should ensure that _bin_func is not None
8 changes: 5 additions & 3 deletions psyneulink/core/llvm/jit_engine.py
@@ -107,15 +107,17 @@ def _ptx_jit_constructor():

# PassManagerBuilder is used only for inlining simple functions
__pass_manager_builder = binding.PassManagerBuilder()
__pass_manager_builder.opt_level = 0
__pass_manager_builder.opt_level = 2
__pass_manager_builder.size_level = 1
# The threshold of '7' is empirically selected.
__pass_manager_builder.inlining_threshold = 7

# The threshold of '64' is empirically selected on GF 3050
__pass_manager_builder.inlining_threshold = 64

# Use default device
# TODO: Add support for multiple devices
__compute_capability = pycuda_default.device.compute_capability()
__ptx_sm = "sm_{}{}".format(__compute_capability[0], __compute_capability[1])

# Create compilation target, use 64bit triple
__ptx_target = binding.Target.from_triple("nvptx64-nvidia-cuda")
__ptx_target_machine = __ptx_target.create_target_machine(cpu=__ptx_sm, opt=opt_level)
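For context, a minimal standalone sketch of the pass pipeline configured above, using llvmlite's legacy pass-manager API (available in the llvmlite versions this code targets) with the new -O2/size settings; the module contents are hypothetical:

from llvmlite import binding

binding.initialize()
binding.initialize_native_target()
binding.initialize_native_asmprinter()

pmb = binding.PassManagerBuilder()
pmb.opt_level = 2            # was 0; enables -O2-style scalar optimizations
pmb.size_level = 1           # prefer smaller code for the GPU target
pmb.inlining_threshold = 64  # empirically selected (see the diff above)

pm = binding.ModulePassManager()
pmb.populate(pm)

mod = binding.parse_assembly("define i32 @f() { ret i32 0 }")
pm.run(mod)                  # optimizes the module in place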
3 changes: 0 additions & 3 deletions tests/composition/test_composition.py
@@ -3737,7 +3737,6 @@ def test_run_2_mechanisms_double_trial_specs(self, comp_mode):
@pytest.mark.parametrize("mode", [pnl.ExecutionMode.Python,
pytest.param(pnl.ExecutionMode.LLVM, marks=pytest.mark.llvm),
pytest.param(pnl.ExecutionMode.LLVMExec, marks=pytest.mark.llvm),
pytest.param(pnl.ExecutionMode.PTXExec, marks=[pytest.mark.llvm, pytest.mark.cuda]),
])
def test_execute_composition(self, mode):
comp = Composition()
@@ -3841,7 +3840,6 @@ def test_LPP_wrong_component(self):
@pytest.mark.parametrize("mode", [pnl.ExecutionMode.Python,
pytest.param(pnl.ExecutionMode.LLVM, marks=pytest.mark.llvm),
pytest.param(pnl.ExecutionMode.LLVMExec, marks=pytest.mark.llvm),
pytest.param(pnl.ExecutionMode.PTXExec, marks=[pytest.mark.llvm, pytest.mark.cuda]),
])
def test_execute_no_inputs(self, mode):
m_inner = ProcessingMechanism(input_shapes=2)
@@ -6606,7 +6604,6 @@ class TestProperties:
pytest.param(pnl.ExecutionMode.LLVM, marks=[_fallback_xfail, pytest.mark.llvm]),
pytest.param(pnl.ExecutionMode.LLVMExec, marks=[_fallback_xfail, pytest.mark.llvm]),
pytest.param(pnl.ExecutionMode.LLVMRun, marks=[_fallback_xfail, pytest.mark.llvm]),
pytest.param(pnl.ExecutionMode.PTXExec, marks=[_fallback_xfail, pytest.mark.llvm, pytest.mark.cuda]),
pytest.param(pnl.ExecutionMode.PTXRun, marks=[_fallback_xfail, pytest.mark.llvm, pytest.mark.cuda]),
])
def test_llvm_fallback(self, mode):
