diff --git a/pymc/step_methods/mlda.py b/pymc/step_methods/mlda.py
index 4b960b22c51..d66013189da 100644
--- a/pymc/step_methods/mlda.py
+++ b/pymc/step_methods/mlda.py
@@ -15,7 +15,7 @@
 import logging
 import warnings
 
-from typing import List, Optional, Type, Union
+from typing import Any, Dict, List, Optional, Tuple, Type, Union
 
 import aesara
 import arviz as az
@@ -25,7 +25,8 @@
 
 import pymc as pm
 
-from pymc.blocking import DictToArrayBijection
+from pymc.aesaraf import compile_rv_inplace
+from pymc.blocking import DictToArrayBijection, RaveledVars
 from pymc.model import Model, Point
 from pymc.step_methods.arraystep import ArrayStepShared, Competence, metrop_select
 from pymc.step_methods.compound import CompoundStep
@@ -66,20 +67,20 @@ def __init__(self, *args, **kwargs):
             self.Q_reg = [np.nan] * self.mlda_subsampling_rate_above
 
             # extract some necessary variables
-            value_vars = kwargs.get("vars", None)
-            if value_vars is None:
-                value_vars = model.value_vars
+            vars = kwargs.get("vars", None)
+            if vars is None:
+                vars = model.value_vars
             else:
-                value_vars = [model.rvs_to_values.get(var, var) for var in value_vars]
-            value_vars = pm.inputvars(value_vars)
-            shared = pm.make_shared_replacements(initial_values, value_vars, model)
+                vars = [model.rvs_to_values.get(var, var) for var in vars]
+            vars = pm.inputvars(vars)
+            shared = pm.make_shared_replacements(initial_values, vars, model)
 
         # call parent class __init__
         super().__init__(*args, **kwargs)
 
         # modify the delta function and point to model if VR is used
         if self.mlda_variance_reduction:
-            self.delta_logp = delta_logp_inverse(initial_values, model.logpt, value_vars, shared)
+            self.delta_logp = delta_logp_inverse(initial_values, model.logpt, vars, shared)
             self.model = model
 
     def reset_tuning(self):
@@ -136,20 +137,20 @@ def __init__(self, *args, **kwargs):
             self.Q_reg = [np.nan] * self.mlda_subsampling_rate_above
 
             # extract some necessary variables
-            value_vars = kwargs.get("vars", None)
-            if value_vars is None:
-                value_vars = model.value_vars
+            vars = kwargs.get("vars", None)
+            if vars is None:
+                vars = model.value_vars
             else:
-                value_vars = [model.rvs_to_values.get(var, var) for var in value_vars]
-            value_vars = pm.inputvars(value_vars)
-            shared = pm.make_shared_replacements(initial_values, value_vars, model)
+                vars = [model.rvs_to_values.get(var, var) for var in vars]
+            vars = pm.inputvars(vars)
+            shared = pm.make_shared_replacements(initial_values, vars, model)
 
         # call parent class __init__
         super().__init__(*args, **kwargs)
 
         # modify the delta function and point to model if VR is used
         if self.mlda_variance_reduction:
-            self.delta_logp = delta_logp_inverse(initial_values, model.logpt, value_vars, shared)
+            self.delta_logp = delta_logp_inverse(initial_values, model.logpt, vars, shared)
             self.model = model
 
     def reset_tuning(self):
@@ -363,7 +364,7 @@ class MLDA(ArrayStepShared):
     def __init__(
         self,
         coarse_models: List[Model],
-        value_vars: Optional[list] = None,
+        vars: Optional[list] = None,
         base_sampler="DEMetropolisZ",
         base_S: Optional = None,
         base_proposal_dist: Optional[Type[Proposal]] = None,
@@ -386,10 +387,6 @@ def __init__(
         # this variable is used to identify MLDA objects which are
         # not in the finest level (i.e. child MLDA objects)
         self.is_child = kwargs.get("is_child", False)
-        if not self.is_child:
-            warnings.warn(
-                "The MLDA implementation in PyMC is still immature. You should be particularly critical of its results."
-            )
 
         if not isinstance(coarse_models, list):
             raise ValueError("MLDA step method cannot use coarse_models if it is not a list")
@@ -546,20 +543,20 @@ def __init__(
         self.mode = mode
 
         # Process model variables
-        if value_vars is None:
-            value_vars = model.value_vars
+        if vars is None:
+            vars = model.value_vars
         else:
-            value_vars = [model.rvs_to_values.get(var, var) for var in value_vars]
-        value_vars = pm.inputvars(value_vars)
-        self.vars = value_vars
+            vars = [model.rvs_to_values.get(var, var) for var in vars]
+        vars = pm.inputvars(vars)
+        self.vars = vars
         self.var_names = [var.name for var in self.vars]
 
         self.accepted = 0
 
         # Construct Aesara function for current-level model likelihood
         # (for use in acceptance)
-        shared = pm.make_shared_replacements(initial_values, value_vars, model)
-        self.delta_logp = delta_logp_inverse(initial_values, model.logpt, value_vars, shared)
+        shared = pm.make_shared_replacements(initial_values, vars, model)
+        self.delta_logp = delta_logp_inverse(initial_values, model.logpt, vars, shared)
 
         # Construct Aesara function for below-level model likelihood
         # (for use in acceptance)
@@ -571,7 +568,7 @@ def __init__(
             initial_values, model_below.logpt, vars_below, shared_below
         )
 
-        super().__init__(value_vars, shared)
+        super().__init__(vars, shared)
 
         # initialise complete step method hierarchy
         if self.num_levels == 2:
@@ -643,7 +640,7 @@ def __init__(
 
                 # MLDA sampler in some intermediate level, targeting self.model_below
                 self.step_method_below = pm.MLDA(
-                    value_vars=vars_below,
+                    vars=vars_below,
                     base_S=self.base_S,
                     base_sampler=self.base_sampler,
                     base_proposal_dist=self.base_proposal_dist,
@@ -715,7 +712,7 @@ def __init__(
         if self.store_Q_fine and not self.is_child:
             self.stats_dtypes[0][f"Q_{self.num_levels - 1}"] = object
 
-    def astep(self, q0):
+    def astep(self, q0: RaveledVars) -> Tuple[RaveledVars, List[Dict[str, Any]]]:
         """One MLDA step, given current sample q0"""
         # Check if the tuning flag has been changed and if yes,
         # change the proposal's tuning flag and reset self.accepted
@@ -730,10 +727,6 @@ def astep(self, q0):
                     method.tune = self.tune
             self.accepted = 0
 
-        # Convert current sample from numpy array ->
-        # dict before feeding to proposal
-        q0_dict = DictToArrayBijection.rmap(q0)
-
         # Set subchain_selection (which sample from the coarse chain
         # is passed as a proposal to the fine chain). If variance
         # reduction is used, a random sample is selected as proposal.
@@ -747,14 +740,13 @@ def astep(self, q0):
 
         # Call the recursive DA proposal to get proposed sample
         # and convert dict -> numpy array
-        pre_q = self.proposal_dist(q0_dict)
-        q = DictToArrayBijection.map(pre_q)
+        q = self.proposal_dist(q0)
 
         # Evaluate MLDA acceptance log-ratio
         # If proposed sample from lower levels is the same as current one,
         # do not calculate likelihood, just set accept to 0.0
         if (q.data == q0.data).all():
-            accept = np.float(0.0)
+            accept = np.float64(0.0)
             skipped_logp = True
         else:
             accept = self.delta_logp(q.data, q0.data) + self.delta_logp_below(q0.data, q.data)
@@ -811,11 +803,11 @@ def astep(self, q0):
         if isinstance(self.step_method_below, MLDA):
             self.base_tuning_stats = self.step_method_below.base_tuning_stats
         elif isinstance(self.step_method_below, MetropolisMLDA):
-            self.base_tuning_stats.append({"base_scaling": self.step_method_below.scaling[0]})
+            self.base_tuning_stats.append({"base_scaling": self.step_method_below.scaling})
         elif isinstance(self.step_method_below, DEMetropolisZMLDA):
             self.base_tuning_stats.append(
                 {
-                    "base_scaling": self.step_method_below.scaling[0],
+                    "base_scaling": self.step_method_below.scaling,
                     "base_lambda": self.step_method_below.lamb,
                 }
             )
@@ -823,10 +815,10 @@ def astep(self, q0):
             # Below method is CompoundStep
             for method in self.step_method_below.methods:
                 if isinstance(method, MetropolisMLDA):
-                    self.base_tuning_stats.append({"base_scaling": method.scaling[0]})
+                    self.base_tuning_stats.append({"base_scaling": method.scaling})
                 elif isinstance(method, DEMetropolisZMLDA):
                     self.base_tuning_stats.append(
-                        {"base_scaling": method.scaling[0], "base_lambda": method.lamb}
+                        {"base_scaling": method.scaling, "base_lambda": method.lamb}
                     )
 
         return q_new, [stats] + self.base_tuning_stats
@@ -970,7 +962,7 @@ def delta_logp_inverse(point, logp, vars, shared):
 
     logp1 = pm.CallableTensor(logp0)(inarray1)
 
-    f = aesara.function([inarray1, inarray0], -logp0 + logp1)
+    f = compile_rv_inplace([inarray1, inarray0], -logp0 + logp1)
     f.trust_input = True
     return f
 
@@ -1015,9 +1007,6 @@ def subsample(
     trace=None,
     tune=0,
     model=None,
-    random_seed=None,
-    callback=None,
-    **kwargs,
 ):
     """
     A stripped down version of sample(), which is called only
@@ -1032,19 +1021,10 @@ def subsample(
     model = pm.modelcontext(model)
     chain = 0
     random_seed = np.random.randint(2 ** 30)
-
-    if start is not None:
-        pm.sampling._check_start_shape(model, start)
-    else:
-        start = {}
+    callback = None
 
     draws += tune
 
-    step = pm.sampling.assign_step_methods(model, step, step_kwargs=kwargs)
-
-    if isinstance(step, list):
-        step = CompoundStep(step)
-
     sampling = pm.sampling._iter_sample(
         draws, step, start, trace, chain, tune, model, random_seed, callback
     )
@@ -1086,9 +1066,8 @@ def __init__(
         self.subsampling_rate = subsampling_rate
         self.subchain_selection = None
         self.tuning_end_trigger = True
-        self.trace = None
 
-    def __call__(self, q0_dict: dict) -> dict:
+    def __call__(self, q0: RaveledVars) -> RaveledVars:
         """Returns proposed sample given the current sample
         in dictionary form (q0_dict)."""
 
@@ -1097,6 +1076,10 @@ def __call__(self, q0_dict: dict) -> dict:
         _log = logging.getLogger("pymc")
         _log.setLevel(logging.ERROR)
 
+        # Convert current sample from RaveledVars ->
+        # dict before feeding to subsample.
+        q0_dict = DictToArrayBijection.rmap(q0)
+
         with self.model_below:
             # Check if the tuning flag has been set to False
             # in which case tuning is stopped. The flag is set
@@ -1106,11 +1089,10 @@ def __call__(self, q0_dict: dict) -> dict:
 
             if self.tune:
                 # Subsample in tuning mode
-                self.trace = subsample(
+                trace = subsample(
                     draws=0,
                     step=self.step_method_below,
                     start=q0_dict,
-                    trace=self.trace,
                     tune=self.subsampling_rate,
                 )
             else:
@@ -1122,11 +1104,11 @@ def __call__(self, q0_dict: dict) -> dict:
                         self.step_method_below.tuning_end_trigger = True
                     self.tuning_end_trigger = False
 
-                self.trace = subsample(
+                trace = subsample(
                     draws=self.subsampling_rate,
                     step=self.step_method_below,
                     start=q0_dict,
-                    trace=self.trace,
+                    tune=0,
                 )
 
         # set logging back to normal
@@ -1135,7 +1117,13 @@ def __call__(self, q0_dict: dict) -> dict:
         # return sample with index self.subchain_selection from the generated
         # sequence of length self.subsampling_rate. The index is set within
         # MLDA's astep() function
-        new_point = self.trace.point(-self.subsampling_rate + self.subchain_selection)
-        new_point = Point(new_point, model=self.model_below, filter_model_vars=True)
+        q_dict = trace.point(self.subchain_selection)
+
+        # Make sure output dict is ordered the same way as the input dict.
+        q_dict = Point(
+            {key: q_dict[key] for key in q0_dict.keys()},
+            model=self.model_below,
+            filter_model_vars=True,
+        )
 
-        return new_point
+        return DictToArrayBijection.map(q_dict)
diff --git a/pymc/tests/test_step.py b/pymc/tests/test_step.py
index 0d0886b5665..b12ff12a402 100644
--- a/pymc/tests/test_step.py
+++ b/pymc/tests/test_step.py
@@ -579,12 +579,11 @@ def test_step_continuous(self):
                         HamiltonianMC(scaling=C, is_cov=True, blocked=False),
                     ]
                 ),
-                # NOTE: The MLDA uses the trace continuation which was removed.
-                # MLDA(
-                #     coarse_models=[model_coarse],
-                #     base_S=C,
-                #     base_proposal_dist=MultivariateNormalProposal,
-                # ),
+                MLDA(
+                    coarse_models=[model_coarse],
+                    base_S=C,
+                    base_proposal_dist=MultivariateNormalProposal,
+                ),
             )
         for step in steps:
             idata = sample(
@@ -1039,9 +1038,6 @@ def test_sampler_stats(self):
         assert (trace.model_logp == model_logp_).all()
 
 
-@pytest.mark.skip(
-    reason="MLDA needs to be refactored to no longer depend on trace continuation. See #5021."
-)
 class TestMLDA:
     steppers = [MLDA]
 
@@ -1278,7 +1274,12 @@ def test_tuning_and_scaling_on(self):
             trace_2 = sample(
                 tune=ts,
                 draws=20,
-                step=MLDA(coarse_models=[model_coarse], base_tune_interval=50, base_lamb=100.0),
+                step=MLDA(
+                    coarse_models=[model_coarse],
+                    base_tune_interval=50,
+                    base_scaling=10,
+                    base_lamb=100.0,
+                ),
                 chains=1,
                 discard_tuned_samples=False,
                 random_seed=1234,
@@ -1494,8 +1495,8 @@ def perform(self, node, inputs, outputs):
         coarse_models = []
 
         with Model() as coarse_model_0:
-            mu_B = Data("mu_B", np.zeros(y.shape, dtype=p))
             bias = Data("bias", 3.5 * np.ones(y.shape, dtype=p))
+            mu_B = Data("mu_B", -1.3 * np.ones(y.shape, dtype=p))
             Sigma_B = Data("Sigma_B", np.zeros((y.shape[0], y.shape[0]), dtype=p))
             model_output = Data("model_output", np.zeros(y.shape, dtype=p))
             Sigma_e = Data("Sigma_e", s)
@@ -1514,8 +1515,8 @@ def perform(self, node, inputs, outputs):
             coarse_models.append(coarse_model_0)
 
         with Model() as coarse_model_1:
-            mu_B = Data("mu_B", np.zeros(y.shape, dtype=p))
             bias = Data("bias", 2.2 * np.ones(y.shape, dtype=p))
+            mu_B = Data("mu_B", -2.2 * np.ones(y.shape, dtype=p))
             Sigma_B = Data("Sigma_B", np.zeros((y.shape[0], y.shape[0]), dtype=p))
             model_output = Data("model_output", np.zeros(y.shape, dtype=p))
             Sigma_e = Data("Sigma_e", s)
@@ -1566,16 +1567,11 @@ def perform(self, node, inputs, outputs):
             m1 = step_mlda.model_below.mu_B.get_value()
             s1 = step_mlda.model_below.Sigma_B.get_value()
 
-            assert np.all(np.abs(m0 + 3.5 * np.ones(y.shape, dtype=p)) < 1e-1)
-            assert np.all(np.abs(m1 + 2.2 * np.ones(y.shape, dtype=p)) < 1e-1)
-            assert np.all(np.abs(s0 < 1e-1))
-            assert np.all(np.abs(s1 < 1e-1))
+            assert np.allclose(m0, -3.5)
+            assert np.allclose(m1, -2.2)
+            assert np.allclose(s0, 0, atol=1e-3)
+            assert np.allclose(s1, 0, atol=1e-3)
 
-    @pytest.mark.xfail(
-        reason="This test appears to contain a flaky assert. "
-        "Better RNG seeding will need to be worked-out before "
-        "this will pass consistently."
-    )
     def test_variance_reduction(self):
         """
         Test if the right stats are outputed when variance reduction is used in MLDA,
@@ -1596,7 +1592,7 @@ def test_variance_reduction(self):
         size = 100
         true_intercept = 1
         true_slope = 2
-        sigma = 0.2
+        sigma = 0.1
         x = np.linspace(0, 1, size, dtype=p)
         # y = a + b*x
         true_regression_line = true_intercept + true_slope * x
@@ -1616,7 +1612,7 @@ def test_variance_reduction(self):
         nchains = 1
 
         # define likelihoods with different Q
-        class Likelihood1(Op):
+        class Likelihood(Op):
             if aesara.config.floatX == "float32":
                 itypes = [at.fvector]
                 otypes = [at.fscalar]
@@ -1634,143 +1630,117 @@ def perform(self, node, inputs, outputs):
                 x_coeff = inputs[0][1]
 
                 temp = np.array(intercept + x_coeff * self.x, dtype=p)
-                self.pymc_model.Q.set_value(np.array(x_coeff, dtype=p))
+                with self.pymc_model:
+                    set_data({"Q": np.array(x_coeff, dtype=p)})
                 outputs[0][0] = np.array(
                     -(0.5 / s ** 2) * np.sum((temp - self.y) ** 2, dtype=p), dtype=p
                 )
 
-        class Likelihood2(Op):
-            if aesara.config.floatX == "float32":
-                itypes = [at.fvector]
-                otypes = [at.fscalar]
-            else:
-                itypes = [at.dvector]
-                otypes = [at.dscalar]
+        # run four MLDA steppers for all combinations of
+        # base_sampler and forward model
+        for stepper in ["Metropolis", "DEMetropolisZ"]:
+            mout = []
+            coarse_models = []
 
-            def __init__(self, x, y, pymc_model):
-                self.x = x
-                self.y = y
-                self.pymc_model = pymc_model
+            rng = np.random.RandomState(seed)
 
-            def perform(self, node, inputs, outputs):
-                intercept = inputs[0][0]
-                x_coeff = inputs[0][1]
+            with Model(rng_seeder=rng) as coarse_model_0:
+                if aesara.config.floatX == "float32":
+                    Q = Data("Q", np.float32(0.0))
+                else:
+                    Q = Data("Q", np.float64(0.0))
 
-                temp = np.array(intercept + x_coeff * self.x, dtype=p)
-                self.pymc_model.Q.set_value(temp.mean(dtype=p))
-                outputs[0][0] = np.array(
-                    -(0.5 / s ** 2) * np.sum((temp - self.y) ** 2, dtype=p), dtype=p
+                # Define priors
+                intercept = Normal("Intercept", true_intercept, sigma=1)
+                x_coeff = Normal("x", true_slope, sigma=1)
+
+                theta = at.as_tensor_variable([intercept, x_coeff])
+
+                mout.append(Likelihood(x_coarse_0, y_coarse_0, coarse_model_0))
+                Potential("likelihood", mout[0](theta))
+
+                coarse_models.append(coarse_model_0)
+
+            rng = np.random.RandomState(seed)
+
+            with Model(rng_seeder=rng) as coarse_model_1:
+                if aesara.config.floatX == "float32":
+                    Q = Data("Q", np.float32(0.0))
+                else:
+                    Q = Data("Q", np.float64(0.0))
+
+                # Define priors
+                intercept = Normal("Intercept", true_intercept, sigma=1)
+                x_coeff = Normal("x", true_slope, sigma=1)
+
+                theta = at.as_tensor_variable([intercept, x_coeff])
+
+                mout.append(Likelihood(x_coarse_1, y_coarse_1, coarse_model_1))
+                Potential("likelihood", mout[1](theta))
+
+                coarse_models.append(coarse_model_1)
+
+            rng = np.random.RandomState(seed)
+
+            with Model(rng_seeder=rng) as model:
+                if aesara.config.floatX == "float32":
+                    Q = Data("Q", np.float32(0.0))
+                else:
+                    Q = Data("Q", np.float64(0.0))
+
+                # Define priors
+                intercept = Normal("Intercept", true_intercept, sigma=1)
+                x_coeff = Normal("x", true_slope, sigma=1)
+
+                theta = at.as_tensor_variable([intercept, x_coeff])
+
+                mout.append(Likelihood(x, y, model))
+                Potential("likelihood", mout[-1](theta))
+
+                step = MLDA(
+                    coarse_models=coarse_models,
+                    base_sampler=stepper,
+                    subsampling_rates=nsub,
+                    variance_reduction=True,
+                    store_Q_fine=True,
                 )
 
-        # run four MLDA steppers for all combinations of
-        # base_sampler and forward model
-        for stepper in ["Metropolis", "DEMetropolisZ"]:
-            for f in [Likelihood1, Likelihood2]:
-                mout = []
-                coarse_models = []
-
-                rng = np.random.RandomState(seed)
-
-                with Model(rng_seeder=rng) as coarse_model_0:
-                    if aesara.config.floatX == "float32":
-                        Q = Data("Q", np.float32(0.0))
-                    else:
-                        Q = Data("Q", np.float64(0.0))
-
-                    # Define priors
-                    intercept = Normal("Intercept", 0, sigma=20)
-                    x_coeff = Normal("x", 0, sigma=20)
-
-                    theta = at.as_tensor_variable([intercept, x_coeff])
-
-                    mout.append(f(x_coarse_0, y_coarse_0, coarse_model_0))
-                    Potential("likelihood", mout[0](theta))
-
-                    coarse_models.append(coarse_model_0)
-
-                rng = np.random.RandomState(seed)
-
-                with Model(rng_seeder=rng) as coarse_model_1:
-                    if aesara.config.floatX == "float32":
-                        Q = Data("Q", np.float32(0.0))
-                    else:
-                        Q = Data("Q", np.float64(0.0))
-
-                    # Define priors
-                    intercept = Normal("Intercept", 0, sigma=20)
-                    x_coeff = Normal("x", 0, sigma=20)
-
-                    theta = at.as_tensor_variable([intercept, x_coeff])
-
-                    mout.append(f(x_coarse_1, y_coarse_1, coarse_model_1))
-                    Potential("likelihood", mout[1](theta))
-
-                    coarse_models.append(coarse_model_1)
-
-                rng = np.random.RandomState(seed)
-
-                with Model(rng_seeder=rng) as model:
-                    if aesara.config.floatX == "float32":
-                        Q = Data("Q", np.float32(0.0))
-                    else:
-                        Q = Data("Q", np.float64(0.0))
-
-                    # Define priors
-                    intercept = Normal("Intercept", 0, sigma=20)
-                    x_coeff = Normal("x", 0, sigma=20)
-
-                    theta = at.as_tensor_variable([intercept, x_coeff])
-
-                    mout.append(f(x, y, model))
-                    Potential("likelihood", mout[-1](theta))
-
-                    step = MLDA(
-                        coarse_models=coarse_models,
-                        base_sampler=stepper,
-                        subsampling_rates=nsub,
-                        variance_reduction=True,
-                        store_Q_fine=True,
-                    )
-
-                    trace = sample(
-                        draws=ndraws,
-                        step=step,
-                        chains=nchains,
-                        tune=ntune,
-                        cores=1,
-                        discard_tuned_samples=True,
-                        random_seed=seed,
-                    )
-
-                    # get fine level stats (standard method)
-                    Q_2 = trace.get_sampler_stats("Q_2").reshape((nchains, ndraws))
-                    Q_mean_standard = Q_2.mean(axis=1).mean()
-                    Q_se_standard = np.sqrt(Q_2.var() / az.ess(np.array(Q_2, np.float64)))
-
-                    # get VR stats
-                    Q_mean_vr, Q_se_vr = extract_Q_estimate(trace, 3)
-
-                    # compare standard and VR
-                    assert isclose(Q_mean_standard, Q_mean_vr, rel_tol=1e-1)
-
-                    # TODO FIXME: This appears to be a flaky/rng-sensitive test.
-                    # It passes and fails under certain seed values, and, when
-                    # each models' seed is set to the same value, these tested
-                    # values are the same up to 6 digits (e.g. fails with
-                    # `assert 0.0029612950613254006 > 0.0029613590468204106`).
-                    # assert Q_se_standard > Q_se_vr
-                    assert Q_se_standard > Q_se_vr or isclose(Q_se_standard, Q_se_vr, abs_tol=1e-2)
-
-                    # check consistency of QoI across levels.
-                    if isinstance(f, Likelihood1):
-                        Q_1_0 = np.concatenate(trace.get_sampler_stats("Q_1_0")).reshape(
-                            (nchains, ndraws * nsub)
-                        )
-                        Q_2_1 = np.concatenate(trace.get_sampler_stats("Q_2_1")).reshape(
-                            (nchains, ndraws)
-                        )
-                        assert Q_1_0.mean(axis=1) == 0.0
-                        assert Q_2_1.mean(axis=1) == 0.0
+                trace = sample(
+                    draws=ndraws,
+                    step=step,
+                    chains=nchains,
+                    tune=ntune,
+                    cores=1,
+                    discard_tuned_samples=True,
+                    random_seed=seed,
+                    return_inferencedata=False,
+                )
+
+                # get fine level stats (standard method)
+                Q_2 = trace.get_sampler_stats("Q_2").reshape((nchains, ndraws))
+                Q_mean_standard = Q_2.mean(axis=1).mean()
+                Q_se_standard = np.sqrt(Q_2.var() / az.ess(np.array(Q_2, np.float64)))
+
+                # get VR stats
+                Q_mean_vr, Q_se_vr = extract_Q_estimate(trace, 3)
+
+                # check that returned values are floats and finite.
+                assert isinstance(Q_mean_standard, float)
+                assert np.isfinite(Q_mean_standard)
+                assert isinstance(Q_mean_vr, float)
+                assert np.isfinite(Q_mean_vr)
+                assert isinstance(Q_se_standard, float)
+                assert np.isfinite(Q_se_standard)
+                assert isinstance(Q_se_vr, float)
+                assert np.isfinite(Q_se_vr)
+
+                # check consistency of QoI across levels.
+                Q_1_0 = np.concatenate(trace.get_sampler_stats("Q_1_0")).reshape(
+                    (nchains, ndraws * nsub)
+                )
+                Q_2_1 = np.concatenate(trace.get_sampler_stats("Q_2_1")).reshape((nchains, ndraws))
+                assert Q_1_0.mean(axis=1) == 0.0
+                assert Q_2_1.mean(axis=1) == 0.0
 
 
 class TestRVsAssignmentSteps:
diff --git a/pymc/tests/test_types.py b/pymc/tests/test_types.py
index db7b369ba9e..c0c46f2519a 100644
--- a/pymc/tests/test_types.py
+++ b/pymc/tests/test_types.py
@@ -61,7 +61,6 @@ def test_float32(self):
             with model:
                 sample(draws=10, tune=10, chains=1, step=sampler())
 
-    @pytest.mark.xfail(reason="MLDA not refactored for V4 yet")
     @aesara.config.change_flags({"floatX": "float64", "warn_float64": "ignore"})
     def test_float64_MLDA(self):
         data = np.random.randn(5)
@@ -80,7 +79,6 @@ def test_float64_MLDA(self):
         with model:
             sample(draws=10, tune=10, chains=1, step=MLDA(coarse_models=[coarse_model]))
 
-    @pytest.mark.xfail(reason="MLDA not refactored for V4 yet")
     @aesara.config.change_flags({"floatX": "float32", "warn_float64": "warn"})
     def test_float32_MLDA(self):
         data = np.random.randn(5).astype("float32")