ABC: Add Kullback Liebler and save pointwise pseudolikelihood values (#…

…4155) * add kl distance rename old ones, save pointwise pseudolikelihood values * fix docstring * fix docstring style * fix docstrings * improve docstrings * fix docstring * add missing header to docstring
pymc-devs · Oct 8, 2020 · a248c3d · a248c3d
1 parent b707791
commit a248c3d
Show file tree

Hide file tree

Showing 4 changed files with 180 additions and 152 deletions.
diff --git a/pymc3/distributions/simulator.py b/pymc3/distributions/simulator.py
@@ -15,65 +15,66 @@
 import logging
 import numpy as np
 from .distribution import NoDistribution, draw_values
+from scipy.spatial import cKDTree
 
 __all__ = ["Simulator"]
 
 _log = logging.getLogger("pymc3")
 
 
 class Simulator(NoDistribution):
+    r"""
+    Define a simulator, from a Python function, to be used in ABC methods.
+
+    Parameters
+    ----------
+    function: callable
+        Python function defined by the user.
+    params: list
+        Parameters passed to function.
+    distance: str or callable
+        Distance functions. Available options are "gaussian" (default), "laplacian",
+        "kullback_leibler" or a user defined function that takes epsilon, the summary statistics of
+        observed_data and the summary statistics of simulated_data as input.
+        ``gaussian`` :math: `-0.5 \left(\left(\frac{xo - xs}{\epsilon}\right)^2\right)`
+        ``laplace`` :math: `{\left(\frac{|xo - xs|}{\epsilon}\right)}`
+        ``kullback_leibler`` `:math: d \sum(-\log(\frac{\nu_d} {\rho_d}) / \epsilon) + log_r`
+        gaussian + ``sum_stat="sort"`` is equivalent to the 1D 2-wasserstein distance
+        laplace + ``sum_stat="sort"`` is equivalent to the the 1D 1-wasserstein distance
+    sum_stat: str or callable
+        Summary statistics. Available options are ``indentity``, ``sort``, ``mean``, ``median``.
+        If a callable is based it should return a number or a 1d numpy array.
+    epsilon: float or array
+        Scaling parameter for the distance functions. It should be a float or an array of the
+        same size of the output of ``sum_stat``.
+    *args and **kwargs:
+        Arguments and keywords arguments that the function takes.
+    """
+
     def __init__(
         self,
         function,
         *args,
         params=None,
-        distance="gaussian_kernel",
+        distance="gaussian",
         sum_stat="identity",
         epsilon=1,
         **kwargs,
     ):
-        r"""
-        This class stores a function defined by the user in Python language.
-
-        function: function
-            Python function defined by the user.
-        params: list
-            Parameters passed to function.
-        distance: str or callable
-            Distance functions. Available options are "gaussian_kernel" (default), "wasserstein",
-            "energy" or a user defined function that takes epsilon (a scalar), and the summary
-            statistics of observed_data, and simulated_data as input.
-            ``gaussian_kernel`` :math: `\sum \left(-0.5  \left(\frac{xo - xs}{\epsilon}\right)^2\right)`
-            ``wasserstein`` :math: `\frac{1}{n} \sum{\left(\frac{|xo - xs|}{\epsilon}\right)}`
-            ``energy`` :math: `\sqrt{2} \sqrt{\frac{1}{n} \sum \left(\frac{|xo - xs|}{\epsilon}\right)^2}`
-            For the wasserstein and energy distances the observed data xo and simulated data xs
-            are internally sorted (i.e. the sum_stat is "sort").
-        sum_stat: str or callable
-            Summary statistics. Available options are ``indentity``, ``sort``, ``mean``, ``median``.
-            If a callable is based it should return a number or a 1d numpy array.
-        epsilon: float
-            Standard deviation of the gaussian_kernel.
-        *args and **kwargs:
-            Arguments and keywords arguments that the function takes.
-        """
-
         self.function = function
         self.params = params
         observed = self.data
         self.epsilon = epsilon
 
-        if distance == "gaussian_kernel":
-            self.distance = gaussian_kernel
-        elif distance == "wasserstein":
-            self.distance = wasserstein
-            if sum_stat != "sort":
-                _log.info(f"Automatically setting sum_stat to sort as expected by {distance}")
-                sum_stat = "sort"
-        elif distance == "energy":
-            self.distance = energy
-            if sum_stat != "sort":
-                _log.info(f"Automatically setting sum_stat to sort as expected by {distance}")
-                sum_stat = "sort"
+        if distance == "gaussian":
+            self.distance = gaussian
+        elif distance == "laplace":
+            self.distance = laplace
+        elif distance == "kullback_leibler":
+            self.distance = KullbackLiebler(observed)
+            if sum_stat != "identity":
+                _log.info(f"Automatically setting sum_stat to identity as expected by {distance}")
+                sum_stat = "identity"
         elif hasattr(distance, "__call__"):
             self.distance = distance
         else:
@@ -96,15 +97,16 @@ def __init__(
 
     def random(self, point=None, size=None):
         """
-        Draw random values from Simulator
+        Draw random values from Simulator.
+
         Parameters
         ----------
         point: dict, optional
-            Dict of variable values on which random values are to be
-            conditioned (uses default point if not specified).
+            Dict of variable values on which random values are to be conditioned (uses default
+            point if not specified).
         size: int, optional
-            Desired size of random sample (returns one sample if not
-            specified).
+            Desired size of random sample (returns one sample if not specified).
+
         Returns
         -------
         array
@@ -122,7 +124,7 @@ def _str_repr(self, name=None, dist=None, formatting="plain"):
         function = dist.function.__name__
         params = ", ".join([var.name for var in dist.params])
         sum_stat = self.sum_stat.__name__ if hasattr(self.sum_stat, "__call__") else self.sum_stat
-        distance = self.distance.__name__
+        distance = getattr(self.distance, "__name__", self.distance.__class__.__name__)
 
         if formatting == "latex":
             return f"$\\text{{{name}}} \\sim  \\text{{Simulator}}(\\text{{{function}}}({params}), \\text{{{distance}}}, \\text{{{sum_stat}}})$"
@@ -135,22 +137,31 @@ def identity(x):
     return x
 
 
-def gaussian_kernel(epsilon, obs_data, sim_data):
-    """gaussian distance function"""
-    return np.sum(-0.5 * ((obs_data - sim_data) / epsilon) ** 2)
+def gaussian(epsilon, obs_data, sim_data):
+    """Gaussian kernel."""
+    return -0.5 * ((obs_data - sim_data) / epsilon) ** 2
 
 
-def wasserstein(epsilon, obs_data, sim_data):
-    """Wasserstein distance function.
+def laplace(epsilon, obs_data, sim_data):
+    """Laplace kernel."""
+    return -np.abs((obs_data - sim_data) / epsilon)
 
-    We are assuming obs_data and sim_data are already sorted!
-    """
-    return np.mean(np.abs((obs_data - sim_data) / epsilon))
 
+class KullbackLiebler:
+    """Approximate Kullback-Liebler."""
 
-def energy(epsilon, obs_data, sim_data):
-    """Energy distance function.
+    def __init__(self, obs_data):
+        if obs_data.ndim == 1:
+            obs_data = obs_data[:, None]
+        n, d = obs_data.shape
+        rho_d, _ = cKDTree(obs_data).query(obs_data, 2)
+        self.rho_d = rho_d[:, 1]
+        self.d_n = d / n
+        self.log_r = np.log(n / (n - 1))
+        self.obs_data = obs_data
 
-    We are assuming obs_data and sim_data are already sorted!
-    """
-    return 1.4142 * np.mean(((obs_data - sim_data) / epsilon) ** 2) ** 0.5
+    def __call__(self, epsilon, obs_data, sim_data):
+        if sim_data.ndim == 1:
+            sim_data = sim_data[:, None]
+        nu_d, _ = cKDTree(sim_data).query(self.obs_data, 1)
+        return self.d_n * np.sum(-np.log(nu_d / self.rho_d) / epsilon) + self.log_r
diff --git a/pymc3/smc/sample_smc.py b/pymc3/smc/sample_smc.py
@@ -24,10 +24,6 @@
 from ..backends.base import MultiTrace
 from ..parallel_sampling import _cpu_count
 
-EXPERIMENTAL_WARNING = (
-    "Warning: SMC-ABC is an experimental step method and not yet recommended for use in PyMC3!"
-)
-
 
 def sample_smc(
     draws=2000,
@@ -38,20 +34,21 @@ def sample_smc(
     p_acc_rate=0.85,
     threshold=0.5,
     save_sim_data=False,
+    save_log_pseudolikelihood=True,
     model=None,
     random_seed=-1,
     parallel=False,
     chains=None,
     cores=None,
 ):
     r"""
-    Sequential Monte Carlo based sampling
+    Sequential Monte Carlo based sampling.
 
     Parameters
     ----------
     draws: int
         The number of samples to draw from the posterior (i.e. last stage). And also the number of
-        independent chains. Defaults to 1000.
+        independent chains. Defaults to 2000.
     kernel: str
         Kernel method for the SMC sampler. Available option are ``metropolis`` (default) and `ABC`.
         Use `ABC` for likelihood free inference togheter with a ``pm.Simulator``.
@@ -66,15 +63,19 @@ def sample_smc(
         Whether to compute the number of steps automatically or not. Defaults to True
     p_acc_rate: float
         Used to compute ``n_steps`` when ``tune_steps == True``. The higher the value of
-        ``p_acc_rate`` the higher the number of steps computed automatically. Defaults to 0.99.
+        ``p_acc_rate`` the higher the number of steps computed automatically. Defaults to 0.85.
         It should be between 0 and 1.
     threshold: float
         Determines the change of beta from stage to stage, i.e.indirectly the number of stages,
         the higher the value of `threshold` the higher the number of stages. Defaults to 0.5.
         It should be between 0 and 1.
     save_sim_data : bool
-        Whether or not to save the simulated data. This parameters only work with the ABC kernel.
-        The stored data corresponds to the posterior predictive distribution.
+        Whether or not to save the simulated data. This parameter only works with the ABC kernel.
+        The stored data corresponds to a samples from the posterior predictive distribution.
+    save_log_pseudolikelihood : bool
+        Whether or not to save the log pseudolikelihood values. This parameter only works with the
+        ABC kernel. The stored data can be used to compute LOO or WAIC values. Computing LOO/WAIC
+        values from log pseudolikelihood values is experimental.
     model: Model (optional if in ``with`` context)).
     random_seed: int
         random seed
@@ -134,7 +135,6 @@ def sample_smc(
         816-832. `link <http://ascelibrary.org/doi/abs/10.1061/%28ASCE%290733-9399
         %282007%29133:7%28816%29>`__
     """
-
     _log = logging.getLogger("pymc3")
     _log.info("Initializing SMC sampler...")
 
@@ -164,7 +164,6 @@ def sample_smc(
         raise TypeError("Invalid value for `random_seed`. Must be tuple, list or int")
 
     if kernel.lower() == "abc":
-        warnings.warn(EXPERIMENTAL_WARNING)
         if len(model.observed_RVs) != 1:
             warnings.warn("SMC-ABC only works properly with models with one observed variable")
         if model.potentials:
@@ -179,6 +178,7 @@ def sample_smc(
         p_acc_rate,
         threshold,
         save_sim_data,
+        save_log_pseudolikelihood,
         model,
     )
 
@@ -197,11 +197,20 @@ def sample_smc(
         for i in range(chains):
             results.append(sample_smc_int(*params, random_seed[i], i, _log))
 
-    traces, sim_data, log_marginal_likelihoods, betas, accept_ratios, nsteps = zip(*results)
+    (
+        traces,
+        sim_data,
+        log_marginal_likelihoods,
+        log_pseudolikelihood,
+        betas,
+        accept_ratios,
+        nsteps,
+    ) = zip(*results)
     trace = MultiTrace(traces)
     trace.report._n_draws = draws
     trace.report._n_tune = 0
     trace.report.log_marginal_likelihood = np.array(log_marginal_likelihoods)
+    trace.report.log_pseudolikelihood = log_pseudolikelihood
     trace.report.betas = betas
     trace.report.accept_ratios = accept_ratios
     trace.report.nsteps = nsteps
@@ -222,12 +231,13 @@ def sample_smc_int(
     p_acc_rate,
     threshold,
     save_sim_data,
+    save_log_pseudolikelihood,
     model,
     random_seed,
     chain,
     _log,
 ):
-
+    """Run one SMC instance."""
     smc = SMC(
         draws=draws,
         kernel=kernel,
@@ -237,6 +247,7 @@ def sample_smc_int(
         p_acc_rate=p_acc_rate,
         threshold=threshold,
         save_sim_data=save_sim_data,
+        save_log_pseudolikelihood=save_log_pseudolikelihood,
         model=model,
         random_seed=random_seed,
         chain=chain,
@@ -266,6 +277,7 @@ def sample_smc_int(
         smc.posterior_to_trace(),
         smc.sim_data,
         smc.log_marginal_likelihood,
+        smc.log_pseudolikelihood,
         betas,
         accept_ratios,
         nsteps,