
Commit

Merge pull request #1782 from PrincetonUniversity/devel
Devel
dillontsmith authored Oct 12, 2020
2 parents e415857 + 78a0712 commit 472a2d0
Showing 15 changed files with 172 additions and 173 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/pnl-ci.yml
@@ -23,23 +23,23 @@ jobs:
fetch-depth: 10

- name: Linux wheels cache
- uses: actions/cache@v2.1.1
+ uses: actions/cache@v2.1.2
if: startsWith(runner.os, 'Linux')
with:
path: ~/.cache/pip/wheels
key: ${{ runner.os }}-python-${{ matrix.python-version }}-${{ matrix.python-architecture }}-pip-wheels-${{ github.sha }}
restore-keys: ${{ runner.os }}-python-${{ matrix.python-version }}-${{ matrix.python-architecture }}-pip-wheels

- name: MacOS wheels cache
- uses: actions/cache@v2.1.1
+ uses: actions/cache@v2.1.2
if: startsWith(runner.os, 'macOS')
with:
path: ~/Library/Caches/pip/wheels
key: ${{ runner.os }}-python-${{ matrix.python-version }}-${{ matrix.python-architecture }}-pip-wheels-${{ github.sha }}
restore-keys: ${{ runner.os }}-python-${{ matrix.python-version }}-${{ matrix.python-architecture }}-pip-wheels

- name: Windows wheels cache
- uses: actions/cache@v2.1.1
+ uses: actions/cache@v2.1.2
if: startsWith(runner.os, 'Windows')
with:
path: ~\AppData\Local\pip\Cache\wheels
4 changes: 2 additions & 2 deletions .travis.yml
@@ -67,9 +67,9 @@ before_install:
if [ "$TRAVIS_CPU_ARCH" != "amd64" ]; then
# There are a lot fewer wheels distributed for non-x86 architectures.
# We end up building a lot of them locally, install dev packages
- export EXTRA_PKGS="build-essential gfortran llvm-9-dev libfreetype6-dev libjpeg-dev liblapack-dev zlib1g-dev"
+ export EXTRA_PKGS="build-essential gfortran llvm-10-dev libfreetype6-dev libjpeg-dev liblapack-dev zlib1g-dev"
# Export LLVM_CONFIG for llvmlite
- export LLVM_CONFIG=llvm-config-9
+ export LLVM_CONFIG=llvm-config-10
# Disable coverage
export RUN_COV=""
fi
2 changes: 1 addition & 1 deletion dev_requirements.txt
@@ -1,6 +1,6 @@
jupyter<=1.0.0
psyneulink-sphinx-theme<=1.2.1.7
- pytest<6.1.1
+ pytest<6.1.2
pytest-benchmark<=3.2.3
pytest-cov<=2.10.1
pytest-helpers-namespace<=2019.1.8
2 changes: 1 addition & 1 deletion psyneulink/core/components/mechanisms/mechanism.py
@@ -3003,7 +3003,7 @@ def _gen_llvm_function_internal(self, ctx, builder, params, state, arg_in,
for scale in [TimeScale.TIME_STEP, TimeScale.PASS, TimeScale.TRIAL, TimeScale.RUN]:
num_exec_time_ptr = builder.gep(num_executions_ptr, [ctx.int32_ty(0), ctx.int32_ty(scale.value)])
new_val = builder.load(num_exec_time_ptr)
- new_val = builder.add(new_val, ctx.int32_ty(1))
+ new_val = builder.add(new_val, new_val.type(1))
builder.store(new_val, num_exec_time_ptr)

builder = self._gen_llvm_output_ports(ctx, builder, value, params, state, arg_in, arg_out)
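Note on the change above: building the increment as new_val.type(1) instead of ctx.int32_ty(1) keeps the constant the same integer width as the loaded execution counter, whatever that width happens to be. A minimal standalone llvmlite sketch of the idiom (the increment function and module here are illustrative, not PsyNeuLink code):

from llvmlite import ir

module = ir.Module(name="increment_demo")
int64 = ir.IntType(64)
fn = ir.Function(module, ir.FunctionType(int64, (int64.as_pointer(),)),
                 name="increment")
builder = ir.IRBuilder(fn.append_basic_block("entry"))

counter_ptr, = fn.args
val = builder.load(counter_ptr)
# Calling a type object builds a constant of that type, so val.type(1)
# always matches the width of val (here i64) and the add never mixes widths.
val = builder.add(val, val.type(1))
builder.store(val, counter_ptr)
builder.ret(val)
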
4 changes: 2 additions & 2 deletions psyneulink/core/llvm/__init__.py
@@ -88,12 +88,12 @@ def _cuda_kernel(self):
self.__cuda_kernel = _ptx_engine.get_kernel(self.name)
return self.__cuda_kernel

- def cuda_call(self, *args, threads=1, block_size=32):
+ def cuda_call(self, *args, threads=1, block_size=128):
grid = ((threads + block_size - 1) // block_size, 1)
self._cuda_kernel(*args, np.int32(threads),
block=(block_size, 1, 1), grid=grid)

- def cuda_wrap_call(self, *args, threads=1, block_size=32):
+ def cuda_wrap_call(self, *args, threads=1, block_size=128):
wrap_args = (jit_engine.pycuda.driver.InOut(a) if isinstance(a, np.ndarray) else a for a in args)
self.cuda_call(*wrap_args, threads=threads, block_size=block_size)

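Both entry points above now default to 128-thread blocks; the grid is the usual ceiling division of the requested thread count by the block size. A small sketch of that launch-geometry calculation (the cuda_grid helper name is illustrative only):

def cuda_grid(threads, block_size=128):
    # Enough blocks of block_size threads to cover `threads`, 1-wide in y.
    return ((threads + block_size - 1) // block_size, 1)

assert cuda_grid(1) == (1, 1)
assert cuda_grid(128) == (1, 1)
assert cuda_grid(129) == (2, 1)
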
12 changes: 8 additions & 4 deletions psyneulink/core/llvm/builder_context.py
@@ -431,12 +431,16 @@ def _convert_llvm_ir_to_ctype(t: ir.Type):
if type_t is ir.VoidType:
return None
elif type_t is ir.IntType:
- if t.width == 32:
- return ctypes.c_int
+ if t.width == 8:
+ return ctypes.c_int8
+ elif t.width == 16:
+ return ctypes.c_int16
+ elif t.width == 32:
+ return ctypes.c_int32
elif t.width == 64:
- return ctypes.c_longlong
+ return ctypes.c_int64
else:
- assert False, "Integer type too big!"
+ assert False, "Unknown integer type: {}".format(type_t)
elif type_t is ir.DoubleType:
return ctypes.c_double
elif type_t is ir.FloatType:
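The ctypes conversion above now handles 8-, 16-, 32- and 64-bit integers explicitly rather than only the two widths it previously knew about. A rough standalone equivalent of the new mapping (the int_type_to_ctype helper and its lookup table are illustrative, not part of the module's API):

import ctypes
from llvmlite import ir

_INT_CTYPES = {8: ctypes.c_int8, 16: ctypes.c_int16,
               32: ctypes.c_int32, 64: ctypes.c_int64}

def int_type_to_ctype(t):
    # Any width outside the table is still a hard error, as in the diff above.
    assert t.width in _INT_CTYPES, "Unknown integer type: {}".format(t)
    return _INT_CTYPES[t.width]

assert int_type_to_ctype(ir.IntType(16)) is ctypes.c_int16
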
2 changes: 1 addition & 1 deletion psyneulink/core/llvm/codegen.py
@@ -916,7 +916,7 @@ def gen_composition_run(ctx, composition, *, tags:frozenset):
node_state = builder.gep(state, [ctx.int32_ty(0), ctx.int32_ty(0), ctx.int32_ty(idx)])
num_executions_ptr = helpers.get_state_ptr(builder, node, node_state, "num_executions")
num_exec_time_ptr = builder.gep(num_executions_ptr, [ctx.int32_ty(0), ctx.int32_ty(TimeScale.RUN.value)])
- builder.store(ctx.int32_ty(0), num_exec_time_ptr)
+ builder.store(num_exec_time_ptr.type.pointee(0), num_exec_time_ptr)

# Call execution
exec_tags = tags.difference({"run"})
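As in the mechanism change earlier, the zero stored into the per-run execution counter is now derived from the pointer's own element type instead of being hard-coded as a 32-bit constant. A tiny llvmlite illustration, assuming for the example that the counter slot is an i64:

from llvmlite import ir

ptr_ty = ir.IntType(64).as_pointer()
# ptr_ty.pointee is ir.IntType(64); calling it yields a matching constant,
# equivalent to ir.Constant(ir.IntType(64), 0).
zero = ptr_ty.pointee(0)
assert zero.type.width == 64
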
4 changes: 3 additions & 1 deletion psyneulink/core/llvm/jit_engine.py
@@ -24,6 +24,8 @@
if pycuda.driver.get_version()[0] > 5:
from pycuda import autoinit as pycuda_default
import pycuda.compiler
+ assert pycuda_default.context is not None
+ pycuda_default.context.set_cache_config(pycuda.driver.func_cache.PREFER_L1)
ptx_enabled = True
else:
raise UserWarning("CUDA driver too old (need 6+): " + str(pycuda.driver.get_version()))
@@ -316,5 +318,5 @@ def get_kernel(self, name):
wrapper_mod = _gen_cuda_kernel_wrapper_module(function)
self.compile_modules([wrapper_mod], set())
kernel = self._engine._find_kernel(name + "_cuda_kernel")

+ kernel.set_cache_config(pycuda.driver.func_cache.PREFER_L1)
return kernel
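Both the default CUDA context and every generated wrapper kernel now ask the driver to prefer L1 cache over shared memory. Outside PsyNeuLink the same two pycuda calls look roughly like this (requires a working CUDA/pycuda install; the no-op kernel exists only for the example):

import pycuda.autoinit as pycuda_default   # creates the default context
import pycuda.driver as drv
from pycuda.compiler import SourceModule

# Context-wide preference, as set at import time in the first hunk above.
pycuda_default.context.set_cache_config(drv.func_cache.PREFER_L1)

# Per-kernel preference, as applied to each generated wrapper kernel.
kernel = SourceModule("__global__ void noop() {}").get_function("noop")
kernel.set_cache_config(drv.func_cache.PREFER_L1)
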
2 changes: 1 addition & 1 deletion requirements.txt
@@ -1,7 +1,7 @@
autograd<=1.3
dill<=0.32
elfi<=0.7.6
- graphviz<=0.14.1
+ graphviz<0.14.3
grpcio<=1.31.0
grpcio-tools<=1.31.0
llvmlite<=0.34
42 changes: 42 additions & 0 deletions tests/llvm/test_custom_func.py
@@ -64,3 +64,45 @@ def test_fixed_dimensions__pnl_builtin_vxm(mode):
binf2.cuda_wrap_call(vector, matrix, new_res)

assert np.array_equal(orig_res, new_res)


@pytest.mark.llvm
@pytest.mark.parametrize('mode', ['CPU',
pytest.param('PTX', marks=pytest.mark.cuda)])
@pytest.mark.parametrize('val', [np.int8(0x7e),
np.int16(0x7eec),
np.int32(0x7eedbeee),
np.int64(0x7eedcafedeadbeee)
], ids=lambda x: str(x.dtype))
def test_integer_broadcast(mode, val):
custom_name = None
with pnlvm.LLVMBuilderContext() as ctx:
custom_name = ctx.get_unique_name("broadcast")
int_ty = ctx.convert_python_struct_to_llvm_ir(val)
int_array_ty = ir.ArrayType(int_ty, 8)
func_ty = ir.FunctionType(ir.VoidType(), (int_ty.as_pointer(),
int_array_ty.as_pointer()))
function = ir.Function(ctx.module, func_ty, name=custom_name)

i, o = function.args
block = function.append_basic_block(name="entry")
builder = ir.IRBuilder(block)
ival = builder.load(i)
ival = builder.add(ival, ival.type(1))
with pnlvm.helpers.array_ptr_loop(builder, o, "broadcast") as (b, i):
out_ptr = builder.gep(o, [ctx.int32_ty(0), i])
builder.store(ival, out_ptr)
builder.ret_void()

binf = pnlvm.LLVMBinaryFunction.get(custom_name)
res = np.zeros(8, dtype=val.dtype)

if mode == 'CPU':
ct_res = np.ctypeslib.as_ctypes(res)
ct_in = np.ctypeslib.as_ctypes(val)

binf(ctypes.byref(ct_in), ctypes.byref(ct_res))
else:
binf.cuda_wrap_call(np.asarray(val), res)

assert all(res == np.broadcast_to(val + 1, 8))
17 changes: 17 additions & 0 deletions tests/mechanisms/test_ddm_mechanism.py
@@ -793,3 +793,20 @@ def test_sequence_of_DDM_mechs_in_Composition_Pathway():
# if you do not specify, assert_allcose will use a relative tolerance of 1e-07,
# which WILL FAIL unless you gather higher precision values to use as reference
np.testing.assert_allclose(val, expected, atol=1e-08, err_msg='Failed on expected_output[{0}]'.format(i))


@pytest.mark.mechanism
@pytest.mark.ddm_mechanism
@pytest.mark.parametrize('mode', ['Python',
pytest.param('LLVM', marks=pytest.mark.llvm),
pytest.param('LLVMExec', marks=pytest.mark.llvm),
pytest.param('LLVMRun', marks=pytest.mark.llvm),
pytest.param('PTXExec', marks=[pytest.mark.llvm, pytest.mark.cuda]),
pytest.param('PTXRun', marks=[pytest.mark.llvm, pytest.mark.cuda])])
def test_DDMMechanism_LCA_equivalent(mode):
ddm = DDM(default_variable=[0], function=DriftDiffusionIntegrator(rate=1, time_step_size=0.1))
comp2 = Composition()
comp2.add_node(ddm)
result2 = comp2.run(inputs={ddm:[1]}, bin_execute=mode)
assert np.allclose(np.asfarray(result2[0]), [0.1])
assert np.allclose(np.asfarray(result2[1]), [0.1])
17 changes: 17 additions & 0 deletions tests/mechanisms/test_lca.py
@@ -285,6 +285,23 @@ def test_equivalance_of_threshold_and_termination_specifications_max_vs_next(sel
# result = comp.run(inputs={lca:[1,0]})
# assert np.allclose(result, [[0.71463572, 0.28536428]])

@pytest.mark.mechanism
@pytest.mark.lca_mechanism
@pytest.mark.parametrize('mode', ['Python',
pytest.param('LLVM', marks=pytest.mark.llvm),
pytest.param('LLVMExec', marks=pytest.mark.llvm),
pytest.param('LLVMRun', marks=pytest.mark.llvm),
pytest.param('PTXExec', marks=[pytest.mark.llvm, pytest.mark.cuda]),
pytest.param('PTXRun', marks=[pytest.mark.llvm, pytest.mark.cuda])])
def test_LCAMechanism_DDM_equivalent(self, mode):
lca = LCAMechanism(size=2, leak=0., threshold=1, auto=0, hetero=0,
initial_value=[0, 0], execute_until_finished=False)
comp1 = Composition()
comp1.add_node(lca)
result1 = comp1.run(inputs={lca:[1, -1]}, bin_execute=mode)
assert np.allclose(result1, [[0.52497918747894, 0.47502081252106]],)


class TestLCAReset:

def test_reset_run(self):
Expand Down

