diff --git a/cpp/src/tsvd/tsvd_mg.cu b/cpp/src/tsvd/tsvd_mg.cu
index 2e912c3492..09b3999f81 100644
--- a/cpp/src/tsvd/tsvd_mg.cu
+++ b/cpp/src/tsvd/tsvd_mg.cu
@@ -310,6 +310,8 @@ void inverse_transform_impl(raft::handle_t& handle,
  */
 template <typename T>
 void fit_transform_impl(raft::handle_t& handle,
+                        cudaStream_t* streams,
+                        size_t n_streams,
                         std::vector<Matrix::Data<T>*>& input_data,
                         Matrix::PartDescriptor& input_desc,
                         std::vector<Matrix::Data<T>*>& trans_data,
@@ -321,16 +323,6 @@ void fit_transform_impl(raft::handle_t& handle,
                         paramsTSVDMG& prms,
                         bool verbose)
 {
-  int rank = handle.get_comms().get_rank();
-
-  // TODO: These streams should come from raft::handle_t
-  auto n_streams = input_desc.blocksOwnedBy(rank).size();
-  ;
-  cudaStream_t streams[n_streams];
-  for (std::size_t i = 0; i < n_streams; i++) {
-    RAFT_CUDA_TRY(cudaStreamCreate(&streams[i]));
-  }
-
   fit_impl(
     handle, input_data, input_desc, components, singular_vals, prms, streams, n_streams, verbose);

@@ -371,13 +363,6 @@ void fit_transform_impl(raft::handle_t& handle,

   raft::linalg::scalarMultiply(
     explained_var_ratio, explained_var, scalar, prms.n_components, streams[0]);
-
-  for (std::size_t i = 0; i < n_streams; i++) {
-    handle.sync_stream(streams[i]);
-  }
-  for (std::size_t i = 0; i < n_streams; i++) {
-    RAFT_CUDA_TRY(cudaStreamDestroy(streams[i]));
-  }
 }

 void fit(raft::handle_t& handle,
@@ -416,7 +401,16 @@ void fit_transform(raft::handle_t& handle,
                    paramsTSVDMG& prms,
                    bool verbose)
 {
+  // TODO: These streams should come from raft::handle_t
+  int rank = handle.get_comms().get_rank();
+  size_t n_streams = input_desc.blocksOwnedBy(rank).size();
+  cudaStream_t streams[n_streams];
+  for (std::size_t i = 0; i < n_streams; i++) {
+    RAFT_CUDA_TRY(cudaStreamCreate(&streams[i]));
+  }
   fit_transform_impl(handle,
+                     streams,
+                     n_streams,
                      input_data,
                      input_desc,
                      trans_data,
@@ -427,6 +421,12 @@ void fit_transform(raft::handle_t& handle,
                      singular_vals,
                      prms,
                      verbose);
+  for (std::size_t i = 0; i < n_streams; i++) {
+    handle.sync_stream(streams[i]);
+  }
+  for (std::size_t i = 0; i < n_streams; i++) {
+    RAFT_CUDA_TRY(cudaStreamDestroy(streams[i]));
+  }
 }

 void fit_transform(raft::handle_t& handle,
@@ -441,7 +441,16 @@ void fit_transform(raft::handle_t& handle,
                    paramsTSVDMG& prms,
                    bool verbose)
 {
+  // TODO: These streams should come from raft::handle_t
+  int rank = handle.get_comms().get_rank();
+  size_t n_streams = input_desc.blocksOwnedBy(rank).size();
+  cudaStream_t streams[n_streams];
+  for (std::size_t i = 0; i < n_streams; i++) {
+    RAFT_CUDA_TRY(cudaStreamCreate(&streams[i]));
+  }
   fit_transform_impl(handle,
+                     streams,
+                     n_streams,
                      input_data,
                      input_desc,
                      trans_data,
@@ -452,6 +461,12 @@ void fit_transform(raft::handle_t& handle,
                      singular_vals,
                      prms,
                      verbose);
+  for (std::size_t i = 0; i < n_streams; i++) {
+    handle.sync_stream(streams[i]);
+  }
+  for (std::size_t i = 0; i < n_streams; i++) {
+    RAFT_CUDA_TRY(cudaStreamDestroy(streams[i]));
+  }
 }

 void transform(raft::handle_t& handle,
diff --git a/python/cuml/benchmark/algorithms.py b/python/cuml/benchmark/algorithms.py
index 0168de32d9..665dd7d300 100644
--- a/python/cuml/benchmark/algorithms.py
+++ b/python/cuml/benchmark/algorithms.py
@@ -245,7 +245,7 @@ def all_algorithms():
         AlgorithmPair(
             sklearn.neighbors.NearestNeighbors,
             cuml.neighbors.NearestNeighbors,
-            shared_args=dict(n_neighbors=1024),
+            shared_args=dict(n_neighbors=64),
             cpu_args=dict(algorithm="brute", n_jobs=-1),
             cuml_args={},
             name="NearestNeighbors",
@@ -619,7 +619,7 @@ def all_algorithms():
         AlgorithmPair(
             None,
             cuml.dask.neighbors.NearestNeighbors,
-            shared_args=dict(n_neighbors=1024),
+            shared_args=dict(n_neighbors=64),
             cpu_args=dict(algorithm="brute", n_jobs=-1),
             cuml_args={},
             name="MNMG.NearestNeighbors",
diff --git a/python/cuml/benchmark/automated/dask/bench_mnmg_regression.py b/python/cuml/benchmark/automated/dask/bench_mnmg_regression.py
index 6929b75807..b1adb9b12a 100644
--- a/python/cuml/benchmark/automated/dask/bench_mnmg_regression.py
+++ b/python/cuml/benchmark/automated/dask/bench_mnmg_regression.py
@@ -25,7 +25,7 @@


 @pytest.fixture(**fixture_generation_helper({
-    'n_samples': [1000, 10000],
+    'n_samples': [10000],
     'n_features': [5, 500]
 }))
 def regression(request):
diff --git a/python/cuml/benchmark/automated/utils/utils.py b/python/cuml/benchmark/automated/utils/utils.py
index fdbe72b6d4..340f5f60bb 100644
--- a/python/cuml/benchmark/automated/utils/utils.py
+++ b/python/cuml/benchmark/automated/utils/utils.py
@@ -31,6 +31,7 @@ def setFixtureParamNames(*args, **kwargs):
 import os
 import json
 import time
+import math
 import itertools as it
 import warnings
 import numpy as np
@@ -40,6 +41,7 @@ def setFixtureParamNames(*args, **kwargs):
 import pytest
 from cuml.benchmark import datagen, algorithms
 from cuml.benchmark.nvtx_benchmark import Profiler
+from dask.distributed import wait
 import dask.array as da
 import dask.dataframe as df
 from copy import copy
@@ -54,13 +56,19 @@ def distribute(client, data):
     if data is not None:
         n_rows = data.shape[0]
         n_workers = len(client.scheduler_info()['workers'])
+        rows_per_chunk = math.ceil(n_rows / n_workers)
         if isinstance(data, (np.ndarray, cp.ndarray)):
             dask_array = da.from_array(x=data,
-                                       chunks={0: n_rows // n_workers, 1: -1})
+                                       chunks={0: rows_per_chunk, 1: -1})
+            dask_array = dask_array.persist()
+            wait(dask_array)
+            client.rebalance()
             return dask_array
         elif isinstance(data, (cudf.DataFrame, cudf.Series)):
-            dask_df = df.from_pandas(data,
-                                     chunksize=n_rows // n_workers)
+            dask_df = df.from_pandas(data, chunksize=rows_per_chunk)
+            dask_df = dask_df.persist()
+            wait(dask_df)
+            client.rebalance()
             return dask_df
         else:
             raise ValueError('Could not distribute data')
diff --git a/python/cuml/benchmark/nvtx_benchmark.py b/python/cuml/benchmark/nvtx_benchmark.py
index 8a2b7338fd..d549fcf46d 100644
--- a/python/cuml/benchmark/nvtx_benchmark.py
+++ b/python/cuml/benchmark/nvtx_benchmark.py
@@ -22,7 +22,7 @@

 class Profiler:
     def __init__(self, tmp_path='/tmp/nsys_report'):
-        self.qdrep_file = tmp_path + '/report.qdrep'
+        self.nsys_file = tmp_path + '/report.nsys-rep'
         self.json_file = tmp_path + '/report.json'
         self._execute(['rm', '-rf', tmp_path])
         self._execute(['mkdir', '-p', tmp_path])
@@ -39,8 +39,8 @@ def _nsys_profile(self, command):
             'profile',
             '--trace=nvtx',
             '--force-overwrite=true',
-            '--output={qdrep_file}'.format(
-                qdrep_file=self.qdrep_file)]
+            '--output={nsys_file}'.format(
+                nsys_file=self.nsys_file)]
         profile_command.extend(command.split(' '))
         self._execute(profile_command)
@@ -52,7 +52,7 @@ def _nsys_export2json(self):
             '--force-overwrite=true',
             '--output={json_file}'.format(
                 json_file=self.json_file),
-            self.qdrep_file]
+            self.nsys_file]
         self._execute(export_command)

     def _parse_json(self):
diff --git a/python/cuml/dask/preprocessing/LabelEncoder.py b/python/cuml/dask/preprocessing/LabelEncoder.py
index a271869a3a..4c731de842 100644
--- a/python/cuml/dask/preprocessing/LabelEncoder.py
+++ b/python/cuml/dask/preprocessing/LabelEncoder.py
@@ -51,6 +51,9 @@ class LabelEncoder(BaseEstimator,
     >>> import dask_cudf
     >>> from cuml.dask.preprocessing import LabelEncoder

+    >>> import pandas as pd
+    >>> pd.set_option('display.max_colwidth', 2000)
+
     >>> cluster = LocalCUDACluster(threads_per_worker=1)
     >>> client = Client(cluster)
     >>> df = cudf.DataFrame({'num_col':[10, 20, 30, 30, 30],
diff --git a/python/cuml/fil/fil.pyx b/python/cuml/fil/fil.pyx
index b4855c3b71..389bc945dc 100644
--- a/python/cuml/fil/fil.pyx
+++ b/python/cuml/fil/fil.pyx
@@ -37,6 +37,8 @@ from raft.common.handle cimport handle_t
 from cuml.common import input_to_cuml_array, logger
 from cuml.common.mixins import CMajorInputTagMixin
 from cuml.common.doc_utils import _parameters_docstrings
+from rmm._lib.memory_resource cimport DeviceMemoryResource
+from rmm._lib.memory_resource cimport get_current_device_resource

 import treelite
 import treelite.sklearn as tl_skl
@@ -256,6 +258,7 @@ cdef class ForestInference_impl():
     cdef size_t num_class
     cdef bool output_class
     cdef char* shape_str
+    cdef DeviceMemoryResource mr

     cdef forest32_t get_forest32(self):
         return get[forest32_t, forest32_t, forest64_t](self.forest_data)
@@ -268,6 +271,7 @@ cdef class ForestInference_impl():
         self.handle = handle
         self.forest_data = forest_variant( NULL)
         self.shape_str = NULL
+        self.mr = get_current_device_resource()

     def get_shape_str(self):
         if self.shape_str:
diff --git a/python/cuml/manifold/umap_utils.pxd b/python/cuml/manifold/umap_utils.pxd
index a9edd64ff8..abf4698b75 100644
--- a/python/cuml/manifold/umap_utils.pxd
+++ b/python/cuml/manifold/umap_utils.pxd
@@ -16,6 +16,7 @@

 # distutils: language = c++

+from rmm._lib.memory_resource cimport DeviceMemoryResource
 from rmm._lib.cuda_stream_view cimport cuda_stream_view
 from libcpp.memory cimport unique_ptr
@@ -73,6 +74,7 @@ cdef extern from "raft/sparse/coo.hpp":

 cdef class GraphHolder:
     cdef unique_ptr[COO] c_graph
+    cdef DeviceMemoryResource mr

     @staticmethod
     cdef GraphHolder new_graph(cuda_stream_view stream)
diff --git a/python/cuml/manifold/umap_utils.pyx b/python/cuml/manifold/umap_utils.pyx
index 499aea605c..210cfb5846 100644
--- a/python/cuml/manifold/umap_utils.pyx
+++ b/python/cuml/manifold/umap_utils.pyx
@@ -16,6 +16,7 @@

 # distutils: language = c++

+from rmm._lib.memory_resource cimport get_current_device_resource
 from raft.common.handle cimport handle_t
 from cuml.manifold.umap_utils cimport *
 from libcpp.utility cimport move
@@ -28,6 +29,7 @@ cdef class GraphHolder:
     cdef GraphHolder new_graph(cuda_stream_view stream):
         cdef GraphHolder graph = GraphHolder.__new__(GraphHolder)
         graph.c_graph.reset(new COO(stream))
+        graph.mr = get_current_device_resource()
         return graph

     @staticmethod
@@ -65,6 +67,7 @@ cdef class GraphHolder:
         copy_from_array(graph.rows(), coo_array.row.astype('int32'))
         copy_from_array(graph.cols(), coo_array.col.astype('int32'))

+        graph.mr = get_current_device_resource()
         return graph

     cdef inline COO* get(self):
diff --git a/python/cuml/tests/dask/test_dask_kmeans.py b/python/cuml/tests/dask/test_dask_kmeans.py
index 970c5a51d2..0fb2cbb927 100644
--- a/python/cuml/tests/dask/test_dask_kmeans.py
+++ b/python/cuml/tests/dask/test_dask_kmeans.py
@@ -256,4 +256,4 @@ def test_score(nrows, ncols, nclusters, n_parts,
     local_model = cumlModel.get_combined_model()
     expected_score = local_model.score(X_train.compute())

-    assert abs(actual_score - expected_score) < 1e-3
+    assert abs(actual_score - expected_score) < 9e-3
diff --git a/python/cuml/tests/dask/test_dask_random_forest.py b/python/cuml/tests/dask/test_dask_random_forest.py
index 97c4e56522..df7ed87424 100644
--- a/python/cuml/tests/dask/test_dask_random_forest.py
+++ b/python/cuml/tests/dask/test_dask_random_forest.py
@@ -93,7 +93,7 @@ def test_rf_classification_multi_class(partitions_per_worker, cluster):
             train_test_split(X, y, test_size=n_workers * 300,
                              random_state=123)
         cu_rf_params = {
-            'n_estimators': 25,
+            'n_estimators': n_workers*8,
             'max_depth': 16,
             'n_bins': 256,
             'random_state': 10,
@@ -115,7 +115,7 @@ def test_rf_classification_multi_class(partitions_per_worker, cluster):

         # Refer to issue : https://github.com/rapidsai/cuml/issues/2806 for
         # more information on the threshold value.
-        assert acc_score_gpu >= 0.55
+        assert acc_score_gpu >= 0.52

     finally:
         c.close()
@@ -603,8 +603,10 @@ def test_rf_broadcast(model_type, fit_broadcast, transform_broadcast, client):
     X_train_df, y_train_df = _prep_training_data(client, X_train, y_train, 1)
     X_test_dask_array = from_array(X_test)

+    n_estimators = n_workers*8
+
     if model_type == 'classification':
-        cuml_mod = cuRFC_mg(n_estimators=10, max_depth=8, n_bins=16,
+        cuml_mod = cuRFC_mg(n_estimators=n_estimators, max_depth=8, n_bins=16,
                             ignore_empty_partitions=True)
         cuml_mod.fit(X_train_df, y_train_df, broadcast_data=fit_broadcast)
         cuml_mod_predict = cuml_mod.predict(X_test_dask_array,
@@ -613,10 +615,10 @@ def test_rf_broadcast(model_type, fit_broadcast, transform_broadcast, client):
         cuml_mod_predict = cuml_mod_predict.compute()
         cuml_mod_predict = cp.asnumpy(cuml_mod_predict)
         acc_score = accuracy_score(cuml_mod_predict, y_test, normalize=True)
-        assert acc_score >= 0.70
+        assert acc_score >= 0.68

     else:
-        cuml_mod = cuRFR_mg(n_estimators=10, max_depth=8, n_bins=16,
+        cuml_mod = cuRFR_mg(n_estimators=n_estimators, max_depth=8, n_bins=16,
                             ignore_empty_partitions=True)
         cuml_mod.fit(X_train_df, y_train_df, broadcast_data=fit_broadcast)
         cuml_mod_predict = cuml_mod.predict(X_test_dask_array,