Skip to content

Commit

Permalink
ABC: Add Kullback Liebler and save pointwise pseudolikelihood values (#…
Browse files Browse the repository at this point in the history
…4155)

* add kl distance rename old ones, save pointwise pseudolikelihood values

* fix docstring

* fix docstring style

* fix docstrings

* improve docstrings

* fix docstring

* add missing header to docstring
  • Loading branch information
aloctavodia authored Oct 8, 2020
1 parent b707791 commit a248c3d
Show file tree
Hide file tree
Showing 4 changed files with 180 additions and 152 deletions.
125 changes: 68 additions & 57 deletions pymc3/distributions/simulator.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,65 +15,66 @@
import logging
import numpy as np
from .distribution import NoDistribution, draw_values
from scipy.spatial import cKDTree

__all__ = ["Simulator"]

_log = logging.getLogger("pymc3")


class Simulator(NoDistribution):
r"""
Define a simulator, from a Python function, to be used in ABC methods.
Parameters
----------
function: callable
Python function defined by the user.
params: list
Parameters passed to function.
distance: str or callable
Distance functions. Available options are "gaussian" (default), "laplacian",
"kullback_leibler" or a user defined function that takes epsilon, the summary statistics of
observed_data and the summary statistics of simulated_data as input.
``gaussian`` :math: `-0.5 \left(\left(\frac{xo - xs}{\epsilon}\right)^2\right)`
``laplace`` :math: `{\left(\frac{|xo - xs|}{\epsilon}\right)}`
``kullback_leibler`` `:math: d \sum(-\log(\frac{\nu_d} {\rho_d}) / \epsilon) + log_r`
gaussian + ``sum_stat="sort"`` is equivalent to the 1D 2-wasserstein distance
laplace + ``sum_stat="sort"`` is equivalent to the the 1D 1-wasserstein distance
sum_stat: str or callable
Summary statistics. Available options are ``indentity``, ``sort``, ``mean``, ``median``.
If a callable is based it should return a number or a 1d numpy array.
epsilon: float or array
Scaling parameter for the distance functions. It should be a float or an array of the
same size of the output of ``sum_stat``.
*args and **kwargs:
Arguments and keywords arguments that the function takes.
"""

def __init__(
self,
function,
*args,
params=None,
distance="gaussian_kernel",
distance="gaussian",
sum_stat="identity",
epsilon=1,
**kwargs,
):
r"""
This class stores a function defined by the user in Python language.
function: function
Python function defined by the user.
params: list
Parameters passed to function.
distance: str or callable
Distance functions. Available options are "gaussian_kernel" (default), "wasserstein",
"energy" or a user defined function that takes epsilon (a scalar), and the summary
statistics of observed_data, and simulated_data as input.
``gaussian_kernel`` :math: `\sum \left(-0.5 \left(\frac{xo - xs}{\epsilon}\right)^2\right)`
``wasserstein`` :math: `\frac{1}{n} \sum{\left(\frac{|xo - xs|}{\epsilon}\right)}`
``energy`` :math: `\sqrt{2} \sqrt{\frac{1}{n} \sum \left(\frac{|xo - xs|}{\epsilon}\right)^2}`
For the wasserstein and energy distances the observed data xo and simulated data xs
are internally sorted (i.e. the sum_stat is "sort").
sum_stat: str or callable
Summary statistics. Available options are ``indentity``, ``sort``, ``mean``, ``median``.
If a callable is based it should return a number or a 1d numpy array.
epsilon: float
Standard deviation of the gaussian_kernel.
*args and **kwargs:
Arguments and keywords arguments that the function takes.
"""

self.function = function
self.params = params
observed = self.data
self.epsilon = epsilon

if distance == "gaussian_kernel":
self.distance = gaussian_kernel
elif distance == "wasserstein":
self.distance = wasserstein
if sum_stat != "sort":
_log.info(f"Automatically setting sum_stat to sort as expected by {distance}")
sum_stat = "sort"
elif distance == "energy":
self.distance = energy
if sum_stat != "sort":
_log.info(f"Automatically setting sum_stat to sort as expected by {distance}")
sum_stat = "sort"
if distance == "gaussian":
self.distance = gaussian
elif distance == "laplace":
self.distance = laplace
elif distance == "kullback_leibler":
self.distance = KullbackLiebler(observed)
if sum_stat != "identity":
_log.info(f"Automatically setting sum_stat to identity as expected by {distance}")
sum_stat = "identity"
elif hasattr(distance, "__call__"):
self.distance = distance
else:
Expand All @@ -96,15 +97,16 @@ def __init__(

def random(self, point=None, size=None):
"""
Draw random values from Simulator
Draw random values from Simulator.
Parameters
----------
point: dict, optional
Dict of variable values on which random values are to be
conditioned (uses default point if not specified).
Dict of variable values on which random values are to be conditioned (uses default
point if not specified).
size: int, optional
Desired size of random sample (returns one sample if not
specified).
Desired size of random sample (returns one sample if not specified).
Returns
-------
array
Expand All @@ -122,7 +124,7 @@ def _str_repr(self, name=None, dist=None, formatting="plain"):
function = dist.function.__name__
params = ", ".join([var.name for var in dist.params])
sum_stat = self.sum_stat.__name__ if hasattr(self.sum_stat, "__call__") else self.sum_stat
distance = self.distance.__name__
distance = getattr(self.distance, "__name__", self.distance.__class__.__name__)

if formatting == "latex":
return f"$\\text{{{name}}} \\sim \\text{{Simulator}}(\\text{{{function}}}({params}), \\text{{{distance}}}, \\text{{{sum_stat}}})$"
Expand All @@ -135,22 +137,31 @@ def identity(x):
return x


def gaussian_kernel(epsilon, obs_data, sim_data):
"""gaussian distance function"""
return np.sum(-0.5 * ((obs_data - sim_data) / epsilon) ** 2)
def gaussian(epsilon, obs_data, sim_data):
"""Gaussian kernel."""
return -0.5 * ((obs_data - sim_data) / epsilon) ** 2


def wasserstein(epsilon, obs_data, sim_data):
"""Wasserstein distance function.
def laplace(epsilon, obs_data, sim_data):
"""Laplace kernel."""
return -np.abs((obs_data - sim_data) / epsilon)

We are assuming obs_data and sim_data are already sorted!
"""
return np.mean(np.abs((obs_data - sim_data) / epsilon))

class KullbackLiebler:
"""Approximate Kullback-Liebler."""

def energy(epsilon, obs_data, sim_data):
"""Energy distance function.
def __init__(self, obs_data):
if obs_data.ndim == 1:
obs_data = obs_data[:, None]
n, d = obs_data.shape
rho_d, _ = cKDTree(obs_data).query(obs_data, 2)
self.rho_d = rho_d[:, 1]
self.d_n = d / n
self.log_r = np.log(n / (n - 1))
self.obs_data = obs_data

We are assuming obs_data and sim_data are already sorted!
"""
return 1.4142 * np.mean(((obs_data - sim_data) / epsilon) ** 2) ** 0.5
def __call__(self, epsilon, obs_data, sim_data):
if sim_data.ndim == 1:
sim_data = sim_data[:, None]
nu_d, _ = cKDTree(sim_data).query(self.obs_data, 1)
return self.d_n * np.sum(-np.log(nu_d / self.rho_d) / epsilon) + self.log_r
38 changes: 25 additions & 13 deletions pymc3/smc/sample_smc.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,6 @@
from ..backends.base import MultiTrace
from ..parallel_sampling import _cpu_count

EXPERIMENTAL_WARNING = (
"Warning: SMC-ABC is an experimental step method and not yet recommended for use in PyMC3!"
)


def sample_smc(
draws=2000,
Expand All @@ -38,20 +34,21 @@ def sample_smc(
p_acc_rate=0.85,
threshold=0.5,
save_sim_data=False,
save_log_pseudolikelihood=True,
model=None,
random_seed=-1,
parallel=False,
chains=None,
cores=None,
):
r"""
Sequential Monte Carlo based sampling
Sequential Monte Carlo based sampling.
Parameters
----------
draws: int
The number of samples to draw from the posterior (i.e. last stage). And also the number of
independent chains. Defaults to 1000.
independent chains. Defaults to 2000.
kernel: str
Kernel method for the SMC sampler. Available option are ``metropolis`` (default) and `ABC`.
Use `ABC` for likelihood free inference togheter with a ``pm.Simulator``.
Expand All @@ -66,15 +63,19 @@ def sample_smc(
Whether to compute the number of steps automatically or not. Defaults to True
p_acc_rate: float
Used to compute ``n_steps`` when ``tune_steps == True``. The higher the value of
``p_acc_rate`` the higher the number of steps computed automatically. Defaults to 0.99.
``p_acc_rate`` the higher the number of steps computed automatically. Defaults to 0.85.
It should be between 0 and 1.
threshold: float
Determines the change of beta from stage to stage, i.e.indirectly the number of stages,
the higher the value of `threshold` the higher the number of stages. Defaults to 0.5.
It should be between 0 and 1.
save_sim_data : bool
Whether or not to save the simulated data. This parameters only work with the ABC kernel.
The stored data corresponds to the posterior predictive distribution.
Whether or not to save the simulated data. This parameter only works with the ABC kernel.
The stored data corresponds to a samples from the posterior predictive distribution.
save_log_pseudolikelihood : bool
Whether or not to save the log pseudolikelihood values. This parameter only works with the
ABC kernel. The stored data can be used to compute LOO or WAIC values. Computing LOO/WAIC
values from log pseudolikelihood values is experimental.
model: Model (optional if in ``with`` context)).
random_seed: int
random seed
Expand Down Expand Up @@ -134,7 +135,6 @@ def sample_smc(
816-832. `link <http://ascelibrary.org/doi/abs/10.1061/%28ASCE%290733-9399
%282007%29133:7%28816%29>`__
"""

_log = logging.getLogger("pymc3")
_log.info("Initializing SMC sampler...")

Expand Down Expand Up @@ -164,7 +164,6 @@ def sample_smc(
raise TypeError("Invalid value for `random_seed`. Must be tuple, list or int")

if kernel.lower() == "abc":
warnings.warn(EXPERIMENTAL_WARNING)
if len(model.observed_RVs) != 1:
warnings.warn("SMC-ABC only works properly with models with one observed variable")
if model.potentials:
Expand All @@ -179,6 +178,7 @@ def sample_smc(
p_acc_rate,
threshold,
save_sim_data,
save_log_pseudolikelihood,
model,
)

Expand All @@ -197,11 +197,20 @@ def sample_smc(
for i in range(chains):
results.append(sample_smc_int(*params, random_seed[i], i, _log))

traces, sim_data, log_marginal_likelihoods, betas, accept_ratios, nsteps = zip(*results)
(
traces,
sim_data,
log_marginal_likelihoods,
log_pseudolikelihood,
betas,
accept_ratios,
nsteps,
) = zip(*results)
trace = MultiTrace(traces)
trace.report._n_draws = draws
trace.report._n_tune = 0
trace.report.log_marginal_likelihood = np.array(log_marginal_likelihoods)
trace.report.log_pseudolikelihood = log_pseudolikelihood
trace.report.betas = betas
trace.report.accept_ratios = accept_ratios
trace.report.nsteps = nsteps
Expand All @@ -222,12 +231,13 @@ def sample_smc_int(
p_acc_rate,
threshold,
save_sim_data,
save_log_pseudolikelihood,
model,
random_seed,
chain,
_log,
):

"""Run one SMC instance."""
smc = SMC(
draws=draws,
kernel=kernel,
Expand All @@ -237,6 +247,7 @@ def sample_smc_int(
p_acc_rate=p_acc_rate,
threshold=threshold,
save_sim_data=save_sim_data,
save_log_pseudolikelihood=save_log_pseudolikelihood,
model=model,
random_seed=random_seed,
chain=chain,
Expand Down Expand Up @@ -266,6 +277,7 @@ def sample_smc_int(
smc.posterior_to_trace(),
smc.sim_data,
smc.log_marginal_likelihood,
smc.log_pseudolikelihood,
betas,
accept_ratios,
nsteps,
Expand Down
Loading

0 comments on commit a248c3d

Please sign in to comment.