diff --git a/src/core/actor/DipolarBarnesHut.cpp b/src/core/actor/DipolarBarnesHut.cpp
index 61528b4ca4f..79d4b2f3234 100644
--- a/src/core/actor/DipolarBarnesHut.cpp
+++ b/src/core/actor/DipolarBarnesHut.cpp
@@ -27,28 +27,25 @@
 
 #ifdef DIPOLAR_BARNES_HUT
 
+std::unique_ptr<DipolarBarnesHut> dipolarBarnesHut;
+
 void activate_dipolar_barnes_hut(float epssq, float itolsq) {
-  delete dipolarBarnesHut;
-  dipolarBarnesHut = nullptr;
   // also necessary on 1 CPU or GPU, does more than just broadcasting
+  dipole.method = DIPOLAR_BH_GPU;
   mpi_bcast_coulomb_params();
-  dipolarBarnesHut =
-      new DipolarBarnesHut(espressoSystemInterface, epssq, itolsq);
-  forceActors.push_back(dipolarBarnesHut);
-  energyActors.push_back(dipolarBarnesHut);
 
-  dipole.method = DIPOLAR_BH_GPU;
+  dipolarBarnesHut = std::make_unique<DipolarBarnesHut>(espressoSystemInterface,
+                                                        epssq, itolsq);
+  forceActors.push_back(dipolarBarnesHut.get());
+  energyActors.push_back(dipolarBarnesHut.get());
 }
 
 void deactivate_dipolar_barnes_hut() {
   if (dipolarBarnesHut) {
-    forceActors.remove(dipolarBarnesHut);
-    energyActors.remove(dipolarBarnesHut);
+    forceActors.remove(dipolarBarnesHut.get());
+    energyActors.remove(dipolarBarnesHut.get());
+    dipolarBarnesHut.reset();
   }
-  delete dipolarBarnesHut;
-  dipolarBarnesHut = nullptr;
 }
 
-DipolarBarnesHut *dipolarBarnesHut = nullptr;
-
 #endif
diff --git a/src/core/actor/DipolarBarnesHut.hpp b/src/core/actor/DipolarBarnesHut.hpp
index da71640916b..a9ca551fa0e 100644
--- a/src/core/actor/DipolarBarnesHut.hpp
+++ b/src/core/actor/DipolarBarnesHut.hpp
@@ -101,7 +101,7 @@ class DipolarBarnesHut : public Actor {
 void activate_dipolar_barnes_hut(float epssq, float itolsq);
 void deactivate_dipolar_barnes_hut();
 
-extern DipolarBarnesHut *dipolarBarnesHut;
+extern std::unique_ptr<DipolarBarnesHut> dipolarBarnesHut;
 
 #endif
 
diff --git a/testsuite/python/CMakeLists.txt b/testsuite/python/CMakeLists.txt
index 077e89997f0..03105f82423 100644
--- a/testsuite/python/CMakeLists.txt
+++ b/testsuite/python/CMakeLists.txt
@@ -85,6 +85,7 @@ python_test(FILE coulomb_tuning.py MAX_NUM_PROC 4 LABELS gpu)
 python_test(FILE correlation.py MAX_NUM_PROC 4)
 python_test(FILE dawaanr-and-dds-gpu.py MAX_NUM_PROC 1 LABELS gpu)
 python_test(FILE dawaanr-and-bh-gpu.py MAX_NUM_PROC 1 LABELS gpu)
+python_test(FILE dds-and-bh-gpu.py MAX_NUM_PROC 4 LABELS gpu)
 python_test(FILE electrostaticInteractions.py MAX_NUM_PROC 2)
 python_test(FILE engine_langevin.py MAX_NUM_PROC 4)
 python_test(FILE engine_lb.py MAX_NUM_PROC 2)
diff --git a/testsuite/python/dawaanr-and-bh-gpu.py b/testsuite/python/dawaanr-and-bh-gpu.py
index 1ce8236a195..2a1710303e1 100644
--- a/testsuite/python/dawaanr-and-bh-gpu.py
+++ b/testsuite/python/dawaanr-and-bh-gpu.py
@@ -152,9 +152,6 @@ def run_test_case(self):
             self.system.part.clear()
 
     def test(self):
-        if str(espressomd.cuda_init.CudaInitHandle().device_list[0]) == "Device 687f":
-            print("Test skipped on AMD card")
-            return
         if (self.system.cell_system.get_state()["n_nodes"] > 1):
             print("NOTE: Ignoring testcase for n_nodes > 1")
         else:
diff --git a/testsuite/python/dds-and-bh-gpu.py b/testsuite/python/dds-and-bh-gpu.py
new file mode 100644
index 00000000000..467b382400a
--- /dev/null
+++ b/testsuite/python/dds-and-bh-gpu.py
@@ -0,0 +1,160 @@
+# Copyright (C) 2010-2018 The ESPResSo project
+#
+# This file is part of ESPResSo.
+#
+# ESPResSo is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# ESPResSo is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+from __future__ import print_function
+
+import math
+import unittest as ut
+import numpy as np
+from numpy import linalg as la
+from numpy.random import random, seed
+
+import espressomd
+import espressomd.magnetostatics
+import espressomd.analyze
+import espressomd.cuda_init
+import tests_common
+
+
+def stopAll(system):
+    system.part[:].v = np.zeros(3)
+    system.part[:].omega_body = np.zeros(3)
+
+
+@ut.skipIf(not espressomd.gpu_available() or not espressomd.has_features(["DIPOLAR_BARNES_HUT"]), "Features or gpu not available, skipping test!")
+class BH_DDS_gpu_multCPU_test(ut.TestCase):
+    system = espressomd.System(box_l=[1, 1, 1])
+    # just some seeding based on 14
+    system.seed = [s * 14 for s in range(
+        system.cell_system.get_state()["n_nodes"])]
+    # just some seeding different from the previous one
+    np.random.seed(71)
+
+    def vectorsTheSame(self, a, b):
+        tol = 5E-2
+        vec_len = la.norm(a - b)
+        rel = 2 * vec_len / (la.norm(a) + la.norm(b))
+        return rel <= tol
+
+    def run_test_case(self):
+        pf_bh_gpu = 2.34
+        pf_dds_gpu = 3.524
+        ratio_dawaanr_bh_gpu = pf_dds_gpu / pf_bh_gpu
+        l = 15
+        self.system.box_l = [l, l, l]
+        self.system.periodicity = [0, 0, 0]
+        self.system.time_step = 1E-4
+        self.system.cell_system.skin = 0.1
+
+        part_dip = np.zeros((3))
+
+        for n in [128, 541]:
+            dipole_modulus = 1.3
+            # scale the box for a large number of particles:
+            if n > 1000:
+                l *= (n / 541) ** (1 / 3.0)
+            for i in range(n):
+                part_pos = np.array(random(3)) * l
+                costheta = 2 * random() - 1
+                sintheta = np.sin(np.arccos(costheta))
+                phi = 2 * np.pi * random()
+                part_dip[0] = sintheta * np.cos(phi) * dipole_modulus
+                part_dip[1] = sintheta * np.sin(phi) * dipole_modulus
+                part_dip[2] = costheta * dipole_modulus
+                self.system.part.add(id=i, type=0, pos=part_pos, dip=part_dip, v=np.array(
+                    [0, 0, 0]), omega_body=np.array([0, 0, 0]))
+
+            self.system.non_bonded_inter[0, 0].lennard_jones.set_params(
+                epsilon=10.0, sigma=0.5,
+                cutoff=0.55, shift="auto")
+            self.system.thermostat.set_langevin(kT=0.0, gamma=10.0, seed=42)
+            stopAll(self.system)
+            self.system.integrator.set_vv()
+
+            self.system.non_bonded_inter[0, 0].lennard_jones.set_params(
+                epsilon=0.0, sigma=0.0,
+                cutoff=-1, shift=0.0)
+
+            self.system.cell_system.skin = 0.0
+            self.system.time_step = 0.01
+            self.system.thermostat.turn_off()
+
+            # gamma should be zero in order to avoid the noise term in force
+            # and torque
+            self.system.thermostat.set_langevin(kT=1.297, gamma=0.0)
+
+            dds_gpu = espressomd.magnetostatics.DipolarDirectSumGpu(
+                prefactor=pf_dds_gpu)
+            self.system.actors.add(dds_gpu)
+            self.system.integrator.run(steps=0, recalc_forces=True)
+
+            dawaanr_f = []
+            dawaanr_t = []
+
+            for i in range(n):
+                dawaanr_f.append(self.system.part[i].f)
+                dawaanr_t.append(self.system.part[i].torque_lab)
+            dawaanr_e = espressomd.analyze.Analysis(
+                self.system).energy()["total"]
+
+            del dds_gpu
+            self.system.actors.clear()
+
+            self.system.integrator.run(steps=0, recalc_forces=True)
+            bh_gpu = espressomd.magnetostatics.DipolarBarnesHutGpu(
+                prefactor=pf_bh_gpu, epssq=200.0, itolsq=8.0)
+            self.system.actors.add(bh_gpu)
+            self.system.integrator.run(steps=0, recalc_forces=True)
+
+            bhgpu_f = []
+            bhgpu_t = []
+
+            for i in range(n):
+                bhgpu_f.append(self.system.part[i].f)
+                bhgpu_t.append(self.system.part[i].torque_lab)
+            bhgpu_e = espressomd.analyze.Analysis(
+                self.system).energy()["total"]
+
+            # compare
+            for i in range(n):
+                self.assertTrue(
+                    self.vectorsTheSame(
+                        np.array(
+                            dawaanr_t[i]), ratio_dawaanr_bh_gpu * np.array(
+                                bhgpu_t[i])),
+                                msg='Torques on particle do not match. i={0} dawaanr_t={1} ratio_dawaanr_bh_gpu*bhgpu_t={2}'.format(i, np.array(dawaanr_t[i]), ratio_dawaanr_bh_gpu * np.array(bhgpu_t[i])))
+                self.assertTrue(
+                    self.vectorsTheSame(
+                        np.array(
+                            dawaanr_f[i]), ratio_dawaanr_bh_gpu * np.array(
+                                bhgpu_f[i])),
+                                msg='Forces on particle do not match: i={0} dawaanr_f={1} ratio_dawaanr_bh_gpu*bhgpu_f={2}'.format(i, np.array(dawaanr_f[i]), ratio_dawaanr_bh_gpu * np.array(bhgpu_f[i])))
+            self.assertTrue(
+                abs(dawaanr_e - bhgpu_e * ratio_dawaanr_bh_gpu) <= abs(
+                    1E-3 * dawaanr_e),
+                            msg='Energies for dawaanr {0} and bh_gpu {1} do not match.'.format(dawaanr_e, ratio_dawaanr_bh_gpu * bhgpu_e))
+
+            self.system.integrator.run(steps=0, recalc_forces=True)
+
+            del bh_gpu
+            self.system.actors.clear()
+            self.system.part.clear()
+
+    def test(self):
+        self.run_test_case()
+
+if __name__ == '__main__':
+    ut.main()