pyro-ppl · neerajprad · Oct 19, 2018 · Oct 17, 2018 · Oct 17, 2018 · Oct 17, 2018
diff --git a/pyro/infer/mcmc/hmc.py b/pyro/infer/mcmc/hmc.py
@@ -199,7 +199,7 @@ def _find_reasonable_step_size(self, z):
         # We are going to find a step_size which make accept_prob (Metropolis correction)
         # near the target_accept_prob. If accept_prob:=exp(-delta_energy) is small,
         # then we have to decrease step_size; otherwise, increase step_size.
-        r = self._sample_r(name="r_presample")
+        r, _ = self._sample_r(name="r_presample")
         energy_current = self._energy(z, r)
         z_new, r_new, z_grads, potential_energy = single_step_velocity_verlet(
             z, r, self._potential_energy, self._inverse_mass_matrix, step_size)
@@ -322,7 +322,7 @@ def _sample_r(self, name):
             r[name] = r_flat[pos:next_pos].reshape(self._r_shapes[name])
             pos = next_pos
         assert pos == r_flat.size(0)
-        return r
+        return r, r_flat
 
     def _validate_trace(self, trace):
         trace_eval = TraceEinsumEvaluator if self.use_einsum else TraceTreeEvaluator
@@ -378,7 +378,7 @@ def sample(self, trace):
         for name, transform in self.transforms.items():
             z[name] = transform(z[name])
 
-        r = self._sample_r(name="r_t={}".format(self._t))
+        r, _ = self._sample_r(name="r_t={}".format(self._t))
 
         # Temporarily disable distributions args checking as
         # NaNs are expected during step size adaptation

diff --git a/pyro/infer/mcmc/nuts.py b/pyro/infer/mcmc/nuts.py
@@ -14,10 +14,11 @@
 # sum_accept_probs and num_proposals are used to calculate
 # the statistic accept_prob for Dual Averaging scheme;
 # z_left_grads and z_right_grads are kept to avoid recalculating
-# grads at left and right leaves
+# grads at left and right leaves;
+# r_sum (sum of the flattened momenta of the tree's points) is used to check the turning condition
 _TreeInfo = namedtuple("TreeInfo", ["z_left", "r_left", "z_left_grads",
                                     "z_right", "r_right", "z_right_grads",
-                                    "z_proposal", "size", "turning", "diverging",
+                                    "z_proposal", "r_sum", "size", "turning", "diverging",
                                     "sum_accept_probs", "num_proposals"])
 
 
@@ -115,19 +116,27 @@ def __init__(self,
         # Here, as suggested in [1], we set dE_max = 1000.
         self._max_sliced_energy = 1000
 
-    def _is_turning(self, z_left, r_left, z_right, r_right):
-        diff_left = 0
-        diff_right = 0
-        for name in self._r_shapes:
-            dz = z_right[name] - z_left[name]
-            diff_left += (dz * r_left[name]).sum()
-            diff_right += (dz * r_right[name]).sum()
-        return diff_left < 0 or diff_right < 0
+        # Flag deciding whether the initial point is excluded from the candidates that are
+        # picked uniformly along the trajectory. In [1] it is excluded (flag is True), whereas
+        # Stan (implicitly) keeps it (flag is False).
+        self._eliminate_starting_point = True
+
+    def _is_turning(self, r_left, r_right, r_sum):
+        # Criterion from Section A.4.2 of [2]: turning if (r_sum - r) . (M^-1 r) <= 0 at either end.
+        r_left_flat = torch.cat([r_left[site_name].reshape(-1) for site_name in sorted(r_left)])
+        r_right_flat = torch.cat([r_right[site_name].reshape(-1) for site_name in sorted(r_right)])
+        if self.full_mass:
+            return (r_sum - r_left_flat).dot(self._inverse_mass_matrix.matmul(r_left_flat)) <= 0 \
+                or (r_sum - r_right_flat).dot(self._inverse_mass_matrix.matmul(r_right_flat)) <= 0
+        else:
+            return self._inverse_mass_matrix.dot((r_sum - r_left_flat) * r_left_flat) <= 0 \
+                or self._inverse_mass_matrix.dot((r_sum - r_right_flat) * r_right_flat) <= 0
 
     def _build_basetree(self, z, r, z_grads, log_slice, direction, energy_current):
         step_size = self.step_size if direction == 1 else -self.step_size
         z_new, r_new, z_grads, potential_energy = single_step_velocity_verlet(
             z, r, self._potential_energy, self._inverse_mass_matrix, step_size, z_grads=z_grads)
+        r_new_flat = torch.cat([r_new[site_name].reshape(-1) for site_name in sorted(r_new)])
         energy_new = potential_energy + self._kinetic_energy(r_new)
         sliced_energy = energy_new + log_slice
 
@@ -148,7 +157,7 @@ def _build_basetree(self, z, r, z_grads, log_slice, direction, energy_current):
             delta_energy = energy_new - energy_current
             accept_prob = (-delta_energy).exp().clamp(max=1.0)
         return _TreeInfo(z_new, r_new, z_grads, z_new, r_new, z_grads,
-                         z_new, tree_size, False, diverging, accept_prob, 1)
+                         z_new, r_new_flat, tree_size, False, diverging, accept_prob, 1)
 
     def _build_tree(self, z, r, z_grads, log_slice, direction, tree_depth, energy_current):
         if tree_depth == 0:
@@ -180,6 +189,7 @@ def _build_tree(self, z, r, z_grads, log_slice, direction, tree_depth, energy_cu
         tree_size = half_tree.size + other_half_tree.size
         sum_accept_probs = half_tree.sum_accept_probs + other_half_tree.sum_accept_probs
         num_proposals = half_tree.num_proposals + other_half_tree.num_proposals
+        r_sum = half_tree.r_sum + other_half_tree.r_sum
 
         # Under the slice sampling process, a proposal for z is uniformly picked.
         # The probability of that proposal belongs to which half of tree
@@ -212,20 +222,20 @@ def _build_tree(self, z, r, z_grads, log_slice, direction, tree_depth, energy_cu
 
         # We already check if first half tree is turning. Now, we check
         #     if the other half tree or full tree are turning.
-        turning = other_half_tree.turning or self._is_turning(z_left, r_left, z_right, r_right)
+        turning = other_half_tree.turning or self._is_turning(r_left, r_right, r_sum)
 
         # The divergence is checked by the second half tree (the first half is already checked).
         diverging = other_half_tree.diverging
 
         return _TreeInfo(z_left, r_left, z_left_grads, z_right, r_right, z_right_grads, z_proposal,
-                         tree_size, turning, diverging, sum_accept_probs, num_proposals)
+                         r_sum, tree_size, turning, diverging, sum_accept_probs, num_proposals)
 
     def sample(self, trace):
         z = {name: node["value"].detach() for name, node in self._iter_latent_nodes(trace)}
         # automatically transform `z` to unconstrained space, if needed.
         for name, transform in self.transforms.items():
             z[name] = transform(z[name])
-        r = self._sample_r(name="r_t={}".format(self._t))
+        r, r_flat = self._sample_r(name="r_t={}".format(self._t))
         energy_current = self._energy(z, r)
 
         # Ideally, following a symplectic integrator trajectory, the energy is constant.
@@ -240,8 +250,8 @@ def sample(self, trace):
         #     (z, r) ~ Uniform({(z', r') in trajectory | p(z', r') >= u}).
         #
         # For more information about slice sampling method, see [3].
-        # For another version of NUTS which uses multinomial sampling instead of slice sampling, see
-        # [2].
+        # For another version of NUTS which uses multinomial sampling instead of slice sampling,
+        # see [2].
 
         # Rather than sampling the slice variable from `Uniform(0, exp(-energy))`, we can
         # sample log_slice directly using `energy`, so as to avoid potential underflow or
@@ -253,8 +263,9 @@ def sample(self, trace):
         z_left = z_right = z
         r_left = r_right = r
         z_left_grads = z_right_grads = None
-        tree_size = 1
+        tree_size = 0 if self._eliminate_starting_point else 1
         accepted = False
+        r_sum = r_flat
 
         # Temporarily disable distributions args checking as
         # NaNs are expected during step size adaptation.
@@ -283,11 +294,13 @@ def sample(self, trace):
 
                 rand = pyro.sample("rand_t={}_treedepth={}".format(self._t, tree_depth),
                                    dist.Uniform(torch.zeros(1), torch.ones(1)))
-                if rand < new_tree.size / tree_size:
+                if ((tree_size > 0) and (rand < new_tree.size / tree_size)) \
+                        or ((tree_size == 0) and (new_tree.size > 0)):
                     accepted = True
                     z = new_tree.z_proposal
 
-                if self._is_turning(z_left, r_left, z_right, r_right):  # stop doubling
+                r_sum = r_sum + new_tree.r_sum  # not in-place: r_sum aliases r_flat (and thus r)
+                if self._is_turning(r_left, r_right, r_sum):  # stop doubling
                     break
                 else:  # update tree_size
                     tree_size += new_tree.size

diff --git a/tests/infer/mcmc/test_nuts.py b/tests/infer/mcmc/test_nuts.py
@@ -90,8 +90,8 @@ def model(data):
 @pytest.mark.parametrize(
     "step_size, adapt_step_size, adapt_mass_matrix, full_mass",
     [
-        (0.02, False, False, False),
-        (0.02, False, True, False),
+        (0.03, False, False, False),
+        (0.03, False, True, False),
         (None, True, False, False),
         (None, True, True, False),
         (None, True, True, True),
@@ -154,7 +154,7 @@ def model(data):
     true_beta = torch.tensor(1.)
     data = dist.Beta(concentration1=true_alpha, concentration0=true_beta).sample(torch.Size((5000,)))
     nuts_kernel = NUTS(model)
-    mcmc_run = MCMC(nuts_kernel, num_samples=500, warmup_steps=200).run(data)
+    mcmc_run = MCMC(nuts_kernel, num_samples=600, warmup_steps=200).run(data)
     posterior = EmpiricalMarginal(mcmc_run, sites=['alpha', 'beta'])
     assert_equal(posterior.mean, torch.stack([true_alpha, true_beta]), prec=0.05)
 
@@ -177,7 +177,7 @@ def gmm(data):
     cluster_assignments = dist.Categorical(true_mix_proportions).sample(torch.Size((N,)))
     data = dist.Normal(true_cluster_means[cluster_assignments], 1.0).sample()
     nuts_kernel = NUTS(gmm, max_iarange_nesting=1)
-    mcmc_run = MCMC(nuts_kernel, num_samples=300, warmup_steps=100).run(data)
+    mcmc_run = MCMC(nuts_kernel, num_samples=400, warmup_steps=100).run(data)
     posterior = EmpiricalMarginal(mcmc_run, sites=["phi", "cluster_means"]).mean.sort()[0]
     assert_equal(posterior[0], true_mix_proportions, prec=0.05)
     assert_equal(posterior[1], true_cluster_means, prec=0.2)