diff --git a/docs/source/api/distributions.rst b/docs/source/api/distributions.rst
index 26a1f01ff59..60bb36aa36e 100644
--- a/docs/source/api/distributions.rst
+++ b/docs/source/api/distributions.rst
@@ -8,6 +8,7 @@ Distributions
    distributions/discrete
    distributions/multivariate
    distributions/mixture
+   distributions/simulator
    distributions/timeseries
    distributions/transforms
    distributions/utilities
diff --git a/docs/source/api/distributions/simulator.rst b/docs/source/api/distributions/simulator.rst
new file mode 100644
index 00000000000..1f2cb08a844
--- /dev/null
+++ b/docs/source/api/distributions/simulator.rst
@@ -0,0 +1,12 @@
+**********
+Simulator
+**********
+
+.. currentmodule:: pymc3.distributions.simulator
+.. autosummary::
+
+   SimulatorRV
+   Simulator
+
+.. automodule:: pymc3.distributions.simulator
+   :members:
diff --git a/pymc3/aesaraf.py b/pymc3/aesaraf.py
index fdc6e45e712..75979f1b80d 100644
--- a/pymc3/aesaraf.py
+++ b/pymc3/aesaraf.py
@@ -350,15 +350,26 @@ def rvs_to_value_vars(
     """

+    # Avoid circular dependency
+    from pymc3.distributions.simulator import SimulatorRV
+
     def transform_replacements(var, replacements):
         rv_var, rv_value_var = extract_rv_and_value_vars(var)

         if rv_value_var is None:
-            warnings.warn(
-                f"No value variable found for {rv_var}; "
-                "the random variable will not be replaced."
-            )
-            return []
+            # If a RandomVariable does not have a value_var and corresponds to
+            # a SimulatorRV, we allow further replacements in the upstream graph
+            if isinstance(rv_var.owner.op, SimulatorRV):
+                # The first 3 inputs are just rng, size, and dtype, which don't
+                # need to be replaced.
+                return var.owner.inputs[3:]
+
+            else:
+                warnings.warn(
+                    f"No value variable found for {rv_var}; "
+                    "the random variable will not be replaced."
+                )
+                return []

         transform = getattr(rv_value_var.tag, "transform", None)
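Reviewer note on the ``rvs_to_value_vars`` change above: an Aesara `RandomVariable` node stores its inputs as ``(rng, size, dtype, *dist_params)``, so returning ``inputs[3:]`` continues the graph replacement through the simulator's distribution parameters only. A minimal standalone sketch of that input convention (not part of this PR):

.. code-block:: python

    from aesara.tensor.random.basic import normal

    x = normal(0, 1)
    bookkeeping = x.owner.inputs[:3]  # rng, size, dtype: never replaced
    params = x.owner.inputs[3:]       # the distribution parameters (0 and 1)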
-
-        Returns
-        -------
-        TensorVariable
-        """
-        return at.zeros_like(x)
-
-    def _distr_parameters_for_repr(self):
-        return []
-
-
 class Discrete(Distribution):
     """Base class for discrete distributions"""

@@ -381,6 +339,13 @@ class Continuous(Distribution):
     """Base class for continuous distributions"""


+class NoDistribution(Distribution):
+    """Base class for artificial distributions
+
+    RandomVariables that share this type are allowed in logprob graphs.
+    """
+
+
 class DensityDist(Distribution):
     """Distribution based on a given log density function.

diff --git a/pymc3/distributions/__init__.py b/pymc3/distributions/__init__.py
index 164975e8b7f..76486b98d8f 100644
--- a/pymc3/distributions/__init__.py
+++ b/pymc3/distributions/__init__.py
@@ -100,7 +100,7 @@
     Wishart,
     WishartBartlett,
 )
-from pymc3.distributions.simulator import Simulator
+from pymc3.distributions.simulator import Simulator, SimulatorRV
 from pymc3.distributions.timeseries import (
     AR,
     AR1,
@@ -188,6 +188,7 @@
     "Rice",
     "Moyal",
     "Simulator",
+    "SimulatorRV",
     "BART",
     "CAR",
     "PolyaGamma",
diff --git a/pymc3/distributions/distribution.py b/pymc3/distributions/distribution.py
index 42c33a1afe4..245277031ee 100644
--- a/pymc3/distributions/distribution.py
+++ b/pymc3/distributions/distribution.py
@@ -22,7 +22,6 @@
 from typing import Optional

 import aesara
-import aesara.tensor as at

 from aesara.tensor.random.op import RandomVariable
 from aesara.tensor.random.var import RandomStateSharedVariable
@@ -325,47 +324,6 @@ def dist(
         return rv_out


-class NoDistribution(Distribution):
-    def __init__(
-        self,
-        shape,
-        dtype,
-        initval=None,
-        defaults=(),
-        parent_dist=None,
-        *args,
-        **kwargs,
-    ):
-        super().__init__(
-            shape=shape, dtype=dtype, initval=initval, defaults=defaults, *args, **kwargs
-        )
-        self.parent_dist = parent_dist
-
-    def __getattr__(self, name):
-        # Do not use __getstate__ and __setstate__ from parent_dist
-        # to avoid infinite recursion during unpickling
-        if name.startswith("__"):
-            raise AttributeError("'NoDistribution' has no attribute '%s'" % name)
-        return getattr(self.parent_dist, name)
-
-    def logp(self, x):
-        """Calculate log probability.
-
-        Parameters
-        ----------
-        x: numeric
-            Value for which log-probability is calculated.
-
-        Returns
-        -------
-        TensorVariable
-        """
-        return at.zeros_like(x)
-
-    def _distr_parameters_for_repr(self):
-        return []
-
-
 class Discrete(Distribution):
diff --git a/pymc3/distributions/simulator.py b/pymc3/distributions/simulator.py
index 0b0fba1d30a..d9ecd3eb597 100644
--- a/pymc3/distributions/simulator.py
+++ b/pymc3/distributions/simulator.py
@@ -13,128 +13,330 @@
 #   limitations under the License.

 import logging
+import warnings

+import aesara
+import aesara.tensor as at
 import numpy as np

+from aesara.graph.op import Apply, Op
+from aesara.graph.utils import MetaType
+from aesara.tensor.random.op import RandomVariable
+from aesara.tensor.var import TensorVariable
 from scipy.spatial import cKDTree

+from pymc3.aesaraf import floatX
 from pymc3.distributions.distribution import NoDistribution
+from pymc3.distributions.logprob import _logp

-__all__ = ["Simulator"]
+__all__ = ["Simulator", "SimulatorRV"]

 _log = logging.getLogger("pymc3")


+def identity(x):
+    """Identity function, used as a summary statistic."""
+    return x
+
+
+def gaussian(epsilon, obs_data, sim_data):
+    """Gaussian kernel."""
+    return -0.5 * ((obs_data - sim_data) / epsilon) ** 2
+
+
+def laplace(epsilon, obs_data, sim_data):
+    """Laplace kernel."""
+    return -at.abs_((obs_data - sim_data) / epsilon)
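The two kernels above return elementwise pseudo-log-likelihood terms that the model later sums. Any callable with the same ``(epsilon, obs_data, sim_data)`` signature can be plugged in through the ``distance`` class attribute introduced below; as a hedged illustration (hypothetical, not part of this PR), a heavy-tailed variant could look like:

.. code-block:: python

    import aesara.tensor as at

    def cauchy(epsilon, obs_data, sim_data):
        # Hypothetical Cauchy-like kernel, same signature as gaussian/laplace.
        return -at.log1p(((obs_data - sim_data) / epsilon) ** 2)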
+class SimulatorRV(RandomVariable):
+    """
+    Base class for SimulatorRVs.
+
+    This should be subclassed when defining custom Simulator objects.
+
+    The following class attributes must be defined:
+
+    ndim_supp : int
+        Number of dimensions of the SimulatorRV (0 for scalar, 1 for vector, etc.)
+    ndims_params : list[int]
+        Minimum number of dimensions of each parameter of the RV. For example,
+        if the Simulator accepts two scalar inputs, it should be ``[0, 0]``.
+    fn : callable
+        Python random simulator function with the signature
+        ``(rng, arg1, arg2, ... argn, size)``, where ``rng`` is a
+        ``numpy.random.RandomState`` and ``size`` defines the size of the
+        desired sample. The ``epsilon`` parameter is not passed to the callable.
+
+    The following class attributes can be optionally defined:
+
+    distance : Aesara Op, callable or str
+        Distance function. Available options are ``"gaussian"`` (default),
+        ``"laplace"``, ``"kullback_leibler"`` or a user defined function
+        (or Aesara Op) that takes ``epsilon``, the summary statistics of
+        observed_data and the summary statistics of simulated_data as input.
+
+        ``gaussian`` :math:`-0.5 \\left(\\frac{xo - xs}{\\epsilon}\\right)^2`
+
+        ``laplace`` :math:`-\\left|\\frac{xo - xs}{\\epsilon}\\right|`
+
+        ``kullback_leibler`` :math:`d \\sum \\left(-\\log\\left(\\frac{\\nu_d}{\\rho_d}\\right) / \\epsilon\\right) + \\log r`
+
+        ``gaussian`` + ``sum_stat="sort"`` is equivalent to the 1D 2-Wasserstein distance
+
+        ``laplace`` + ``sum_stat="sort"`` is equivalent to the 1D 1-Wasserstein distance
+
+    sum_stat : Aesara Op, callable or str
+        Summary statistic function. Available options are ``"identity"`` (default),
+        ``"sort"``, ``"mean"``, ``"median"``. If a callable (or Aesara Op) is defined,
+        it should return a 1d numpy array (or Aesara vector).
+
+    epsilon : float or array
+        Scaling parameter for the distance functions. It should be a float or
+        an array of the same size as the output of ``sum_stat``. Defaults to ``1.0``.
+
+    Other class attributes used by ``RandomVariable`` can also be defined. See the
+    documentation of ``aesara.tensor.random.op.RandomVariable`` for more information.
+
+    Examples
+    --------
+    .. code-block:: python
+
+        def my_simulator_fn(rng, loc, scale, size):
+            return rng.normal(loc, scale, size=size)
+
+        class MySimulatorRV(pm.SimulatorRV):
+            ndim_supp = 0
+            ndims_params = [0, 0]
+            fn = my_simulator_fn
+            distance = "gaussian"
+            sum_stat = "sort"
+            epsilon = 1.0
+
+        my_simulator = MySimulatorRV()
+
+        with pm.Model() as m:
+            simulator = pm.Simulator("sim", my_simulator, 0, 1, observed=data)
+
+    """
+
+    name = "SimulatorRV"
+    ndim_supp = None
+    ndims_params = None
+    dtype = "floatX"
+    _print_name = ("Simulator", "\\operatorname{Simulator}")
+
+    fn = None
+    distance = gaussian
+    sum_stat = identity
+    epsilon = 1.0
+
+    def __new__(cls, *args, **kwargs):
+        if cls.fn is None:
+            raise ValueError("SimulatorRV fn was not specified")
+
+        if cls.ndim_supp is None:
+            raise ValueError(
+                "SimulatorRV must specify `ndim_supp`. This is the minimum "
+                "number of dimensions for a single random draw from the simulator."
+                "\nFor a univariate simulator `ndim_supp = 0`"
+            )
+
+        if cls.ndims_params is None:
+            raise ValueError(
+                "SimulatorRV must specify `ndims_params`. This is the minimum "
+                "number of dimensions for every parameter that the Simulator takes."
+                "\nFor a Simulator that takes two scalar parameters, "
+                "`ndims_params = [0, 0]`"
+            )
+
+        distance = cls.distance
+        if not isinstance(distance, Op):
+            if distance == "gaussian":
+                distance = gaussian
+            elif distance == "laplace":
+                distance = laplace
+            elif distance == "kullback_leibler":
+                raise NotImplementedError("KL not refactored yet")
+                # TODO: Wrap KL in aesara OP
+                # distance = KullbackLiebler(observed)
+                # if sum_stat != "identity":
+                #     _log.info(f"Automatically setting sum_stat to identity as expected by {distance}")
+                #     sum_stat = "identity"
+            elif callable(distance):
+                distance = create_distance_op_from_fn(distance)
+            else:
+                raise ValueError(f"The distance metric {distance} is not implemented")
+
+        sum_stat = cls.sum_stat
+        if not isinstance(sum_stat, Op):
+            if sum_stat == "identity":
+                sum_stat = identity
+            elif sum_stat == "sort":
+                sum_stat = at.sort
+            elif sum_stat == "mean":
+                sum_stat = at.mean
+            elif sum_stat == "median":
+                # Missing in Aesara, see aesara/issues/525
+                sum_stat = create_sum_stat_op_from_fn(np.median)
+            elif callable(sum_stat):
+                sum_stat = create_sum_stat_op_from_fn(sum_stat)
+            else:
+                raise ValueError(f"The summary statistic {sum_stat} is not implemented")
+
+        cls.distance = distance
+        cls.sum_stat = sum_stat
+        cls.epsilon = at.as_tensor_variable(floatX(cls.epsilon))
+        return super().__new__(cls)
+
+    @classmethod
+    def rng_fn(cls, *args, **kwargs):
+        return cls.fn(*args, **kwargs)
+
+    @classmethod
+    def _distance(cls, epsilon, value, simulated_value):
+        return cls.distance(epsilon, value, simulated_value)
+
+    @classmethod
+    def _sum_stat(cls, value):
+        return cls.sum_stat(value)
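For review convenience, a sketch of how the ``__new__`` validation above surfaces to users; this mirrors the test classes added by this PR (function and class names here are hypothetical):

.. code-block:: python

    import pymc3 as pm

    def normal_fn(rng, loc, scale, size):
        return rng.normal(loc, scale, size=size)

    class MissingDims(pm.SimulatorRV):
        fn = normal_fn            # ndim_supp / ndims_params not set

    # MissingDims() raises ValueError: "SimulatorRV must specify `ndim_supp` ..."

    class NormalSimRV(pm.SimulatorRV):
        ndim_supp = 0
        ndims_params = [0, 0]     # one entry per simulator parameter
        fn = normal_fn

    normal_sim = NormalSimRV()    # passes validation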
 class Simulator(NoDistribution):
     r"""
-    Define a simulator, from a Python function, to be used in ABC methods.
+    Simulator pseudo-distribution, used for Approximate Bayesian Computation (ABC)
+    with Sequential Monte Carlo (SMC) sampling (i.e., ``pm.sample_smc``).
+
+    The first argument should be an instance of a `pm.SimulatorRV` subclass that
+    defines the Simulator object. See the documentation for `pm.SimulatorRV` for
+    details on how to define a custom Simulator object.
+
+    Simulator distributions have a stochastic pseudo-loglikelihood defined by
+    a distance metric between the observed data and simulated data, scaled by
+    the hyper-parameter ``epsilon``.

     Parameters
     ----------
-    function: callable
-        Python function defined by the user.
+    simulator : ``pm.SimulatorRV`` ``Op``
+        Simulator object. See the ``pm.SimulatorRV`` docstring for more details.
     params: list
-        Parameters passed to function.
-    distance: str or callable
-        Distance functions. Available options are "gaussian" (default), "laplacian",
-        "kullback_leibler" or a user defined function that takes epsilon, the summary statistics of
-        observed_data and the summary statistics of simulated_data as input.
-        ``gaussian`` :math: `-0.5 \left(\left(\frac{xo - xs}{\epsilon}\right)^2\right)`
-        ``laplace`` :math: `{\left(\frac{|xo - xs|}{\epsilon}\right)}`
-        ``kullback_leibler`` `:math: d \sum(-\log(\frac{\nu_d} {\rho_d}) / \epsilon) + log_r`
-        gaussian + ``sum_stat="sort"`` is equivalent to the 1D 2-wasserstein distance
-        laplace + ``sum_stat="sort"`` is equivalent to the the 1D 1-wasserstein distance
-    sum_stat: str or callable
-        Summary statistics. Available options are ``indentity``, ``sort``, ``mean``, ``median``.
-        If a callable is based it should return a number or a 1d numpy array.
-    epsilon: float or array
-        Scaling parameter for the distance functions. It should be a float or an array of the
-        same size of the output of ``sum_stat``.
-    *args and **kwargs:
-        Arguments and keywords arguments that the function takes.
+        Parameters used by the Simulator random function. Parameters can also
+        be passed positionally, in which case the keyword argument ``params``
+        should not be used.
+
+    Examples
+    --------
+    .. code-block:: python
+
+        def my_simulator_fn(rng, loc, scale, size):
+            return rng.normal(loc, scale, size=size)
+
+        class MySimulatorRV(pm.SimulatorRV):
+            ndim_supp = 0
+            ndims_params = [0, 0]
+            fn = my_simulator_fn
+            distance = "gaussian"
+            sum_stat = "sort"
+            epsilon = 1.0
+
+        my_simulator = MySimulatorRV()
+
+        with pm.Model() as m:
+            simulator = pm.Simulator("sim", my_simulator, 0, 1, observed=data)
+
     """

-    def __init__(
-        self,
-        function,
-        *args,
-        params=None,
-        distance="gaussian",
-        sum_stat="identity",
-        epsilon=1,
-        **kwargs,
-    ):
-        self.function = function
-        self.params = params
-        observed = self.data
-        self.epsilon = epsilon
-
-        if distance == "gaussian":
-            self.distance = gaussian
-        elif distance == "laplace":
-            self.distance = laplace
-        elif distance == "kullback_leibler":
-            self.distance = KullbackLiebler(observed)
-            if sum_stat != "identity":
-                _log.info(f"Automatically setting sum_stat to identity as expected by {distance}")
-                sum_stat = "identity"
-        elif hasattr(distance, "__call__"):
-            self.distance = distance
-        else:
-            raise ValueError(f"The distance metric {distance} is not implemented")
-
-        if sum_stat == "identity":
-            self.sum_stat = identity
-        elif sum_stat == "sort":
-            self.sum_stat = np.sort
-        elif sum_stat == "mean":
-            self.sum_stat = np.mean
-        elif sum_stat == "median":
-            self.sum_stat = np.median
-        elif hasattr(sum_stat, "__call__"):
-            self.sum_stat = sum_stat
-        else:
-            raise ValueError(f"The summary statistics {sum_stat} is not implemented")
-
-        super().__init__(shape=np.prod(observed.shape), dtype=observed.dtype, *args, **kwargs)
-
-    def random(self, point=None, size=None):
-        """
-        Draw random values from Simulator.
-
-        Parameters
-        ----------
-        point: dict, optional
-            Dict of variable values on which random values are to be conditioned (uses default
-            point if not specified).
-        size: int, optional
-            Desired size of random sample (returns one sample if not specified).
-
-        Returns
-        -------
-        array
-        """
-        # size = to_tuple(size)
-        # params = draw_values([*self.params], point=point, size=size)
-        # if len(size) == 0:
-        #     return self.function(*params)
-        # else:
-        #     return np.array([self.function(*params) for _ in range(size[0])])
+    def __new__(cls, name, simulator, *params, **kwargs):
+        cls.check_simulator_args(simulator, *params, **kwargs)

+        # Register custom op and logp
+        cls.rv_op = simulator
+        rv_type = type(simulator)
+        NoDistribution.register(rv_type)

-def identity(x):
-    """Identity function, used as a summary statistics."""
-    return x
+        @_logp.register(rv_type)
+        def logp(op, sim_rv, rvs_to_values, *sim_params, **kwargs):
+            value_var = rvs_to_values.get(sim_rv, sim_rv)
+            return cls.logp(
+                value_var,
+                sim_rv,
+                *sim_params,
+            )

+        return super().__new__(cls, name, simulator, *params, **kwargs)

-def gaussian(epsilon, obs_data, sim_data):
-    """Gaussian kernel."""
-    return -0.5 * ((obs_data - sim_data) / epsilon) ** 2
+    @classmethod
+    def dist(cls, simulator, *params, **kwargs):
+        cls.check_simulator_args(simulator, *params, **kwargs)

+        # Compatibility with V3 params keyword argument
+        if "params" in kwargs:
+            warnings.warn(
+                "The kwarg ``params`` will be deprecated. You should pass the "
+                'Simulator parameters in order: `pm.Simulator("sim", param1, param2, ...)`',
+                DeprecationWarning,
+                stacklevel=2,
+            )
+            assert not params
+            params = kwargs.pop("params")

-def laplace(epsilon, obs_data, sim_data):
-    """Laplace kernel."""
-    return -np.abs((obs_data - sim_data) / epsilon)
+        params = [at.as_tensor_variable(floatX(param)) for param in params]
+        return super().dist([*params], **kwargs)
+
+    @classmethod
+    def logp(cls, value, sim_rv, *sim_params):
+        # Create a new SimulatorRV that is identical to the original one
+        sim_op = sim_rv.owner.op
+        sim_value = at.as_tensor_variable(sim_op.make_node(*sim_rv.owner.inputs))
+        sim_value.name = "sim_value"
+
+        # Override rng to avoid identical (non-random) draws in parallel sampling
+        # TODO: Model rngs should be updated prior to the multiprocessing split,
+        # in which case this would not be needed. However, that would have to be
+        # done for every sampler that may accommodate Simulators
+        sim_value.owner.inputs[0].set_value(np.random.RandomState())
+
+        return sim_op._distance(
+            sim_op.epsilon,
+            sim_op._sum_stat(value),
+            sim_op._sum_stat(sim_value),
+        )
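A sketch of how the dispatch registered in ``__new__`` is exercised; this mirrors ``test_multiple_simulators`` in this PR's tests (``MySimulatorRV`` is the hypothetical subclass from the docstring above):

.. code-block:: python

    import aesara
    import numpy as np
    import pymc3 as pm

    data = np.random.normal(0, 1, size=1000)

    with pm.Model() as m:
        a = pm.Normal("a", 0, 1)
        s = pm.Simulator("s", MySimulatorRV(), a, 1.0, observed=data)

    # The registered logp compares summary statistics of the value variable
    # against a fresh draw from the simulator:
    s_val = m.rvs_to_values[s]
    logp_fn = aesara.function([s_val, m.rvs_to_values[a]], pm.logp(s, s_val))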
+
+    @classmethod
+    def check_simulator_args(cls, simulator, *args, **kwargs):
+        if not isinstance(simulator, SimulatorRV):
+            if isinstance(simulator, MetaType):
+                raise ValueError(
+                    f"simulator {simulator} does not seem to be an instantiated "
+                    f"class. Did you forget to call `{simulator}()`?"
+                )
+            raise ValueError(
+                f"simulator {simulator} should be a subclass instance of "
+                f"`pm.SimulatorRV` but got {type(simulator)}"
+            )
+
+        n_params = len(args) + len(kwargs.get("params", []))
+        if n_params != len(simulator.ndims_params):
+            raise ValueError(
+                f"`Simulator` expected {len(simulator.ndims_params)} parameters "
+                f"but got {n_params}."
+            )
+
+        if "distance" in kwargs:
+            raise ValueError(
+                "distance is no longer defined when calling `pm.Simulator`. It "
+                "should be defined as a class attribute of the simulator object. "
+                "See pm.SimulatorRV for more details."
+            )
+
+        if "sum_stat" in kwargs:
+            raise ValueError(
+                "sum_stat is no longer defined when calling `pm.Simulator`. It "
+                "should be defined as a class attribute of the simulator object. "
+                "See pm.SimulatorRV for more details."
+            )
+
+        if "epsilon" in kwargs:
+            raise ValueError(
+                "epsilon is no longer defined when calling `pm.Simulator`. It "
+                "should be defined as a class attribute of the simulator object. "
+                "See pm.SimulatorRV for more details."
+            )


 class KullbackLiebler:
@@ -155,3 +357,53 @@ def __call__(self, epsilon, obs_data, sim_data):
         sim_data = sim_data[:, None]
         nu_d, _ = cKDTree(sim_data).query(self.obs_data, 1)
         return self.d_n * np.sum(-np.log(nu_d / self.rho_d) / epsilon) + self.log_r
+
+
+scalarX = at.dscalar if aesara.config.floatX == "float64" else at.fscalar
+vectorX = at.dvector if aesara.config.floatX == "float64" else at.fvector
+
+
+def create_sum_stat_op_from_fn(fn):
+    # Check if callable returns TensorVariable with dummy inputs
+    try:
+        res = fn(vectorX())
+        if isinstance(res, TensorVariable):
+            return fn
+    except Exception:
+        pass
+
+    # Otherwise, automatically wrap in Aesara Op
+    class SumStat(Op):
+        def make_node(self, x):
+            x = at.as_tensor_variable(x)
+            return Apply(self, [x], [vectorX()])
+
+        def perform(self, node, inputs, outputs):
+            (x,) = inputs
+            outputs[0][0] = np.atleast_1d(fn(x)).astype(aesara.config.floatX)
+
+    return SumStat()
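A note on the helper just above (and its ``distance`` counterpart below): each probes the callable with symbolic dummy inputs and only wraps it in an Op when the result is not already a ``TensorVariable``. A sketch of the two paths — ``quantiles`` is taken verbatim from this PR's tests, while ``sorted_stat`` is a hypothetical symbolic variant:

.. code-block:: python

    import aesara.tensor as at
    import numpy as np

    def quantiles(x):
        # NumPy-only: does not return a TensorVariable when probed,
        # so create_sum_stat_op_from_fn wraps it in a SumStat Op.
        return np.quantile(x, [0.25, 0.5, 0.75])

    def sorted_stat(x):
        # Already symbolic: returned unchanged and used as-is.
        return at.sort(x)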
+
+
+def create_distance_op_from_fn(fn):
+    # Check if callable returns TensorVariable with dummy inputs
+    try:
+        res = fn(scalarX(), vectorX(), vectorX())
+        if isinstance(res, TensorVariable):
+            return fn
+    except Exception:
+        pass
+
+    # Otherwise, automatically wrap in Aesara Op
+    class Distance(Op):
+        def make_node(self, epsilon, obs_data, sim_data):
+            epsilon = at.as_tensor_variable(epsilon)
+            obs_data = at.as_tensor_variable(obs_data)
+            sim_data = at.as_tensor_variable(sim_data)
+            return Apply(self, [epsilon, obs_data, sim_data], [vectorX()])
+
+        def perform(self, node, inputs, outputs):
+            eps, obs_data, sim_data = inputs
+            outputs[0][0] = np.atleast_1d(fn(eps, obs_data, sim_data)).astype(aesara.config.floatX)
+
+    return Distance()
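The ``sample_smc`` changes below drop the ABC-specific arguments; per the new deprecation messages, simulated data is now obtained with a separate posterior-predictive pass. A hedged migration sketch (``model`` is any model containing a ``pm.Simulator``):

.. code-block:: python

    with model:
        trace = pm.sample_smc(draws=1000, return_inferencedata=False)
        # Replaces the old save_sim_data=True return value:
        sim_data = pm.sample_posterior_predictive(trace)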
diff --git a/pymc3/smc/sample_smc.py b/pymc3/smc/sample_smc.py
index 010bd52665c..e28f72d01c1 100644
--- a/pymc3/smc/sample_smc.py
+++ b/pymc3/smc/sample_smc.py
@@ -36,15 +36,15 @@

 def sample_smc(
     draws=2000,
-    kernel="metropolis",
+    kernel=None,
     n_steps=25,
     *,
     start=None,
     tune_steps=True,
     p_acc_rate=0.85,
     threshold=0.5,
-    save_sim_data=False,
-    save_log_pseudolikelihood=True,
+    save_sim_data=None,
+    save_log_pseudolikelihood=None,
     model=None,
     random_seed=-1,
     parallel=None,
@@ -63,9 +63,6 @@
     draws: int
         The number of samples to draw from the posterior (i.e. last stage). And also the number of
         independent chains. Defaults to 2000.
-    kernel: str
-        Kernel method for the SMC sampler. Available option are ``metropolis`` (default) and `ABC`.
-        Use `ABC` for likelihood free inference together with a ``pm.Simulator``.
     n_steps: int
         The number of steps of each Markov Chain. If ``tune_steps == True`` ``n_steps`` will be used
         for the first stage and for the others it will be determined automatically based on the
@@ -83,13 +80,6 @@
         Determines the change of beta from stage to stage, i.e.indirectly the number of stages,
         the higher the value of `threshold` the higher the number of stages. Defaults to 0.5.
         It should be between 0 and 1.
-    save_sim_data : bool
-        Whether or not to save the simulated data. This parameter only works with the ABC kernel.
-        The stored data corresponds to a samples from the posterior predictive distribution.
-    save_log_pseudolikelihood : bool
-        Whether or not to save the log pseudolikelihood values. This parameter only works with the
-        ABC kernel. The stored data can be used to compute LOO or WAIC values. Computing LOO/WAIC
-        values from log pseudolikelihood values is experimental.
     model: Model (optional if in ``with`` context)).
     random_seed: int
         random seed
@@ -157,6 +147,30 @@
     """
+    if isinstance(kernel, str) and kernel.lower() == "abc":
+        warnings.warn(
+            f'The kernel "{kernel}" in sample_smc has been deprecated. '
+            "Specifying it is no longer needed.",
+            DeprecationWarning,
+            stacklevel=2,
+        )
+
+    if save_sim_data is not None:
+        warnings.warn(
+            "save_sim_data has been deprecated. Use pm.sample_posterior_predictive "
+            "to obtain the same type of samples.",
+            DeprecationWarning,
+            stacklevel=2,
+        )
+
+    if save_log_pseudolikelihood is not None:
+        warnings.warn(
+            "save_log_pseudolikelihood has been deprecated. This information is "
+            "now saved as log_likelihood in models with Simulator distributions.",
+            DeprecationWarning,
+            stacklevel=2,
+        )
+
     if parallel is not None:
         warnings.warn(
             "The argument parallel is deprecated, use the argument cores instead.",
@@ -199,22 +213,13 @@
     if not isinstance(random_seed, Iterable):
         raise TypeError("Invalid value for `random_seed`. Must be tuple, list or int")

-    if kernel.lower() == "abc":
-        if len(model.observed_RVs) != 1:
-            warnings.warn("SMC-ABC only works properly with models with one observed variable")
-        if model.potentials:
-            _log.info("Potentials will be added to the prior term")
-
     params = (
         draws,
-        kernel,
         n_steps,
         start,
         tune_steps,
         p_acc_rate,
         threshold,
-        save_sim_data,
-        save_log_pseudolikelihood,
         model,
     )
@@ -245,9 +250,7 @@
     (
         traces,
-        sim_data,
         log_marginal_likelihoods,
-        log_pseudolikelihood,
         betas,
         accept_ratios,
         nsteps,
@@ -263,7 +266,6 @@
         trace.report._n_draws = draws
         trace.report._n_tune = _n_tune
         trace.report.log_marginal_likelihood = log_marginal_likelihoods
-        trace.report.log_pseudolikelihood = log_pseudolikelihood
         trace.report.betas = betas
         trace.report.accept_ratios = accept_ratios
         trace.report.nsteps = nsteps
@@ -313,23 +315,16 @@
         trace.report._run_convergence_checks(idata, model)
     trace.report._log_summary()

-    posterior = idata if return_inferencedata else trace
-    if save_sim_data:
-        return posterior, {modelcontext(model).observed_RVs[0].name: np.array(sim_data)}
-    else:
-        return posterior
+    return idata if return_inferencedata else trace


 def _sample_smc_int(
     draws,
-    kernel,
     n_steps,
     start,
     tune_steps,
     p_acc_rate,
     threshold,
-    save_sim_data,
-    save_log_pseudolikelihood,
     model,
     random_seed,
     chain,
@@ -339,43 +334,26 @@
     in_out_pickled = type(model) == bytes
     if in_out_pickled:
         # function was called in multiprocessing context, deserialize first
-        (
-            draws,
-            kernel,
-            n_steps,
-            start,
-            tune_steps,
-            p_acc_rate,
-            threshold,
-            save_sim_data,
-            save_log_pseudolikelihood,
-            model,
-        ) = map(
+        (draws, n_steps, start, tune_steps, p_acc_rate, threshold, model,) = map(
             cloudpickle.loads,
             (
                 draws,
-                kernel,
                 n_steps,
                 start,
                 tune_steps,
                 p_acc_rate,
                 threshold,
-                save_sim_data,
-                save_log_pseudolikelihood,
                 model,
             ),
         )

     smc = SMC(
         draws=draws,
-        kernel=kernel,
         n_steps=n_steps,
         start=start,
         tune_steps=tune_steps,
         p_acc_rate=p_acc_rate,
         threshold=threshold,
-        save_sim_data=save_sim_data,
-        save_log_pseudolikelihood=save_log_pseudolikelihood,
         model=model,
         random_seed=random_seed,
         chain=chain,
@@ -411,9 +389,7 @@

     results = (
         smc.posterior_to_trace(),
-        smc.sim_data,
         smc.log_marginal_likelihood,
-        smc.log_pseudolikelihood,
         betas,
         accept_ratios,
         nsteps,
diff --git a/pymc3/smc/smc.py b/pymc3/smc/smc.py
index f6c3a031e43..f7b6d772531 100644
--- a/pymc3/smc/smc.py
+++ b/pymc3/smc/smc.py
@@ -16,15 +16,14 @@

 from collections import OrderedDict

-import aesara.tensor as at
 import numpy as np

 from aesara import config
-from aesara import function as aesara_function
 from scipy.special import logsumexp
 from scipy.stats import multivariate_normal

 from pymc3.aesaraf import (
+    compile_rv_inplace,
     floatX,
     inputvars,
     join_nonshared_inputs,
@@ -42,28 +41,22 @@ class SMC:
     def __init__(
         self,
         draws=2000,
-        kernel="metropolis",
         n_steps=25,
         start=None,
         tune_steps=True,
         p_acc_rate=0.85,
         threshold=0.5,
-        save_sim_data=False,
-        save_log_pseudolikelihood=True,
         model=None,
         random_seed=-1,
         chain=0,
     ):
         self.draws = draws
-        self.kernel = kernel.lower()
         self.n_steps = n_steps
         self.start = start
         self.tune_steps = tune_steps
         self.p_acc_rate = p_acc_rate
         self.threshold = threshold
-        self.save_sim_data = save_sim_data
-        self.save_log_pseudolikelihood = save_log_pseudolikelihood
         self.model = model
         self.random_seed = random_seed
         self.chain = chain
@@ -73,15 +66,22 @@
         if self.random_seed != -1:
             np.random.seed(self.random_seed)

+        self.var_info = None
+        self.posterior = None
+        self.prior_logp = None
+        self.likelihood_logp = None
+        self.posterior_logp = None
+        self.prior_logp_func = None
+        self.likelihood_logp_func = None
+        self.log_marginal_likelihood = 0
+        self.beta = 0
+
         self.max_steps = n_steps
         self.proposed = draws * n_steps
         self.acc_rate = 1
         self.variables = inputvars(self.model.value_vars)
         self.weights = np.ones(self.draws) / self.draws
-        self.log_marginal_likelihood = 0
-        self.sim_data = []
-        self.log_pseudolikelihood = []
+        self.cov = None

     def initialize_population(self):
         """Create an initial population from the prior distribution."""
@@ -102,7 +102,6 @@
             var_info[v.name] = (init[v.name].shape, init[v.name].size)

         for i in range(self.draws):
-
             point = Point({v.name: init_rnd[v.name][i] for v in self.variables}, model=self.model)
             population.append(DictToArrayBijection.map(point).data)

@@ -114,36 +113,12 @@ def setup_kernel(self):
         initial_values = self.model.initial_point
         shared = make_shared_replacements(initial_values, self.variables, self.model)

-        if self.kernel == "abc":
-            factors = [var.logpt for var in self.model.free_RVs]
-            factors += [at.sum(factor) for factor in self.model.potentials]
-            self.prior_logp_func = logp_forw(
-                initial_values, [at.sum(factors)], self.variables, shared
-            )
-            simulator = self.model.observed_RVs[0]
-            distance = simulator.distribution.distance
-            sum_stat = simulator.distribution.sum_stat
-            self.likelihood_logp_func = PseudoLikelihood(
-                simulator.distribution.epsilon,
-                simulator.observations,
-                simulator.distribution.function,
-                [v.name for v in simulator.distribution.params],
-                self.model,
-                self.var_info,
-                self.variables,
-                distance,
-                sum_stat,
-                self.draws,
-                self.save_sim_data,
-                self.save_log_pseudolikelihood,
-            )
-        elif self.kernel == "metropolis":
-            self.prior_logp_func = logp_forw(
-                initial_values, [self.model.varlogpt], self.variables, shared
-            )
-            self.likelihood_logp_func = logp_forw(
-                initial_values, [self.model.datalogpt], self.variables, shared
-            )
+        self.prior_logp_func = logp_forw(
+            initial_values, [self.model.varlogpt], self.variables, shared
+        )
+        self.likelihood_logp_func = logp_forw(
+            initial_values, [self.model.datalogpt], self.variables, shared
+        )

     def initialize_logp(self):
         """Initialize the prior and likelihood log probabilities."""
@@ -153,12 +128,6 @@
         self.prior_logp = np.array(priors).squeeze()
         self.likelihood_logp = np.array(likelihoods).squeeze()

-        if self.kernel == "abc" and self.save_sim_data:
-            self.sim_data = self.likelihood_logp_func.get_data()
-
-        if self.kernel == "abc" and self.save_log_pseudolikelihood:
-            self.log_pseudolikelihood = self.likelihood_logp_func.get_lpl()
-
     def update_weights_beta(self):
         """Calculate the next inverse temperature (beta).

@@ -201,8 +170,6 @@
         self.prior_logp = self.prior_logp[resampling_indexes]
         self.likelihood_logp = self.likelihood_logp[resampling_indexes]
         self.posterior_logp = self.prior_logp + self.likelihood_logp * self.beta
-        if self.save_sim_data:
-            self.sim_data = self.sim_data[resampling_indexes]

     def update_proposal(self):
         """Update proposal based on the covariance matrix from tempered posterior."""
@@ -254,12 +221,6 @@
         self.prior_logp[accepted] = pl[accepted]
         self.likelihood_logp[accepted] = ll[accepted]

-        if self.kernel == "abc" and self.save_sim_data:
-            self.sim_data[accepted] = self.likelihood_logp_func.get_data()[accepted]
-
-        if self.kernel == "abc" and self.save_log_pseudolikelihood:
-            self.log_pseudolikelihood[accepted] = self.likelihood_logp_func.get_lpl()[accepted]
-
         self.acc_rate = np.mean(ac_)

     def posterior_to_trace(self):
@@ -304,124 +265,8 @@
             "together with aesara.config.floatX == `float32`",
             UserWarning,
         )
-        f = aesara_function([inarray0], out_list[0], allow_input_downcast=True)
+        f = compile_rv_inplace([inarray0], out_list[0], allow_input_downcast=True)
     else:
-        f = aesara_function([inarray0], out_list[0])
+        f = compile_rv_inplace([inarray0], out_list[0])
     f.trust_input = True
     return f
-
-
-class PseudoLikelihood:
-    """
-    Pseudo Likelihood.
-
-    epsilon: float
-        Standard deviation of the gaussian pseudo likelihood.
-    observations: array-like
-        observed data
-    function: python function
-        data simulator
-    params: list
-        names of the variables parameterizing the simulator.
-    model: PyMC3 model
-    var_info: dict
-        generated by ``SMC.initialize_population``
-    variables: list
-        Model variables.
-    distance : str or callable
-        Distance function.
-    sum_stat: str or callable
-        Summary statistics.
-    size : int
-        Number of simulated datasets to save. When this number is exceeded the counter will be
-        restored to zero and it will start saving again.
-    save_sim_data : bool
-        whether to save or not the simulated data.
-    save_log_pseudolikelihood : bool
-        whether to save or not the log pseudolikelihood values.
- """ - - def __init__( - self, - epsilon, - observations, - function, - params, - model, - var_info, - variables, - distance, - sum_stat, - size, - save_sim_data, - save_log_pseudolikelihood, - ): - self.epsilon = epsilon - self.function = function - self.params = params - self.model = model - self.var_info = var_info - self.variables = variables - self.varnames = [v.name for v in self.variables] - self.distance = distance - self.sum_stat = sum_stat - self.unobserved_RVs = [v.name for v in self.model.unobserved_RVs] - self.get_unobserved_fn = self.model.fastfn( - [v.tag.value_var for v in self.model.unobserved_RVs] - ) - self.size = size - self.save_sim_data = save_sim_data - self.save_log_pseudolikelihood = save_log_pseudolikelihood - self.sim_data_l = [] - self.lpl_l = [] - - self.observations = self.sum_stat(observations) - - def posterior_to_function(self, posterior): - """Turn posterior samples into function parameters to feed the simulator.""" - model = self.model - var_info = self.var_info - - varvalues = [] - samples = {} - size = 0 - for var in self.variables: - shape, new_size = var_info[var.name] - varvalues.append(posterior[size : size + new_size].reshape(shape)) - size += new_size - point = {k: v for k, v in zip(self.varnames, varvalues)} - for varname, value in zip(self.unobserved_RVs, self.get_unobserved_fn(point)): - if varname in self.params: - samples[varname] = value - return samples - - def save_data(self, sim_data): - """Save simulated data.""" - if len(self.sim_data_l) == self.size: - self.sim_data_l = [] - self.sim_data_l.append(sim_data) - - def get_data(self): - """Get simulated data.""" - return np.array(self.sim_data_l) - - def save_lpl(self, elemwise): - """Save log pseudolikelihood values.""" - if len(self.lpl_l) == self.size: - self.lpl_l = [] - self.lpl_l.append(elemwise) - - def get_lpl(self): - """Get log pseudolikelihood values.""" - return np.array(self.lpl_l) - - def __call__(self, posterior): - """Compute the pseudolikelihood.""" - func_parameters = self.posterior_to_function(posterior) - sim_data = self.function(**func_parameters) - if self.save_sim_data: - self.save_data(sim_data) - elemwise = self.distance(self.epsilon, self.observations, self.sum_stat(sim_data)) - if self.save_log_pseudolikelihood: - self.save_lpl(elemwise) - return elemwise.sum() diff --git a/pymc3/tests/test_smc.py b/pymc3/tests/test_smc.py index 3dbf0e37ca8..74eb2a690bb 100644 --- a/pymc3/tests/test_smc.py +++ b/pymc3/tests/test_smc.py @@ -19,6 +19,9 @@ import numpy as np import pytest +from aesara.graph.basic import ancestors +from aesara.tensor.random.op import RandomVariable +from aesara.tensor.sort import SortOp from arviz.data.inference_data import InferenceData import pymc3 as pm @@ -154,7 +157,7 @@ def test_parallel_sampling(self): with self.slow_model: _ = pm.sample_smc(draws=10, chains=1, cores=1, return_inferencedata=False) - chains = 4 + chains = 8 draws = 100 t0 = time.time() @@ -182,82 +185,278 @@ def test_depracated_parallel_arg(self): pm.sample_smc(draws=10, chains=1, parallel=False) -@pytest.mark.xfail(reason="SMC-ABC not refactored yet") -class TestSMCABC(SeededTest): +def normal_sim(rng, a, b, size): + return rng.normal(a, b, size=size) + + +class NormalSimBaseRV(pm.SimulatorRV): + ndim_supp = 0 + ndims_params = [0, 0] + fn = normal_sim + + +class NormalSimRV1(NormalSimBaseRV): + distance = "gaussian" + sum_stat = "sort" + + +class NormalSimRV2(NormalSimBaseRV): + distance = "laplace" + sum_stat = "mean" + epsilon = 0.1 + + +class NormalSimRV3(NormalSimBaseRV): 
+    distance = "gaussian"
+    sum_stat = "mean"
+    epsilon = 0.5
+
+
+def abs_diff(eps, obs_data, sim_data):
+    return np.mean(np.abs((obs_data - sim_data) / eps))
+
+
+def quantiles(x):
+    return np.quantile(x, [0.25, 0.5, 0.75])
+
+
+class NormalSimCustomRV1(NormalSimBaseRV):
+    distance = abs_diff
+    sum_stat = quantiles
+
+
+class NormalSimCustomRV2(NormalSimBaseRV):
+    distance = abs_diff
+    sum_stat = "mean"
+
+
+class TestSimulator(SeededTest):
+    """
+    Tests for pm.Simulator. They are included in this file because Simulator was
+    designed primarily to be used with SMC sampling.
+    """
+
+    @staticmethod
+    def count_rvs(end_node):
+        return len(
+            [
+                node
+                for node in ancestors([end_node])
+                if node.owner is not None and isinstance(node.owner.op, RandomVariable)
+            ]
+        )
+
     def setup_class(self):
         super().setup_class()
         self.data = np.random.normal(loc=0, scale=1, size=1000)

-        def normal_sim(a, b):
-            return np.random.normal(a, b, 1000)
-
         with pm.Model() as self.SMABC_test:
             a = pm.Normal("a", mu=0, sigma=1)
             b = pm.HalfNormal("b", sigma=1)
-            s = pm.Simulator(
-                "s", normal_sim, params=(a, b), sum_stat="sort", epsilon=1, observed=self.data
-            )
+            s = pm.Simulator("s", NormalSimRV1(), a, b, observed=self.data)
             self.s = s

-        def quantiles(x):
-            return np.quantile(x, [0.25, 0.5, 0.75])
-
-        def abs_diff(eps, obs_data, sim_data):
-            return np.mean(np.abs((obs_data - sim_data) / eps))
-
-        with pm.Model() as self.SMABC_test2:
-            a = pm.Normal("a", mu=0, sigma=1)
-            b = pm.HalfNormal("b", sigma=1)
-            s = pm.Simulator(
-                "s",
-                normal_sim,
-                params=(a, b),
-                distance=abs_diff,
-                sum_stat=quantiles,
-                epsilon=1,
-                observed=self.data,
-            )
-
         with pm.Model() as self.SMABC_potential:
-            a = pm.Normal("a", mu=0, sigma=1)
+            a = pm.Normal("a", mu=0, sigma=1, initval=0.5)
             b = pm.HalfNormal("b", sigma=1)
             c = pm.Potential("c", pm.math.switch(a > 0, 0, -np.inf))
-            s = pm.Simulator(
-                "s", normal_sim, params=(a, b), sum_stat="sort", epsilon=1, observed=self.data
-            )
+            s = pm.Simulator("s", NormalSimRV1(), a, b, observed=self.data)

     def test_one_gaussian(self):
+        assert self.count_rvs(self.SMABC_test.logpt) == 1
+
         with self.SMABC_test:
-            trace = pm.sample_smc(draws=1000, kernel="ABC")
-
-            np.testing.assert_almost_equal(self.data.mean(), trace["a"].mean(), decimal=2)
-            np.testing.assert_almost_equal(self.data.std(), trace["b"].mean(), decimal=1)
-
-    def test_sim_data_ppc(self):
-        with self.SMABC_test:
-            trace, sim_data = pm.sample_smc(draws=1000, kernel="ABC", chains=2, save_sim_data=True)
+            trace = pm.sample_smc(draws=1000, return_inferencedata=False)
             pr_p = pm.sample_prior_predictive(1000)
             po_p = pm.sample_posterior_predictive(trace, 1000)
-            assert sim_data["s"].shape == (2, 1000, 1000)
-            np.testing.assert_almost_equal(self.data.mean(), sim_data["s"].mean(), decimal=2)
-            np.testing.assert_almost_equal(self.data.std(), sim_data["s"].std(), decimal=1)
-            assert pr_p["s"].shape == (1000, 1000)
-            np.testing.assert_almost_equal(0, pr_p["s"].mean(), decimal=1)
-            np.testing.assert_almost_equal(1.4, pr_p["s"].std(), decimal=1)
-            assert po_p["s"].shape == (1000, 1000)
-            np.testing.assert_almost_equal(0, po_p["s"].mean(), decimal=2)
-            np.testing.assert_almost_equal(1, po_p["s"].std(), decimal=1)
+
+        assert abs(self.data.mean() - trace["a"].mean()) < 0.05
+        assert abs(self.data.std() - trace["b"].mean()) < 0.05
+
+        assert pr_p["s"].shape == (1000, 1000)
+        assert abs(0 - pr_p["s"].mean()) < 0.10
+        assert abs(1.4 - pr_p["s"].std()) < 0.10
+
+        assert po_p["s"].shape == (1000, 1000)
+        assert abs(self.data.mean() - po_p["s"].mean()) < 0.10
+        assert abs(self.data.std() - po_p["s"].std()) < 0.10

-    def test_custom_dist_sum(self):
-        with self.SMABC_test2:
-            trace = pm.sample_smc(draws=1000, kernel="ABC")
+    def test_custom_dist_sum_stat(self):
+        with pm.Model() as m:
+            a = pm.Normal("a", mu=0, sigma=1)
+            b = pm.HalfNormal("b", sigma=1)
+            s = pm.Simulator("s", NormalSimCustomRV1(), a, b, observed=self.data)
+
+        assert self.count_rvs(m.logpt) == 1
+
+        with m:
+            pm.sample_smc(draws=100, chains=2)

+    def test_custom_dist_sum_stat_scalar(self):
+        """
+        Test that automatically wrapped functions cope well with scalar inputs
+        """
+        scalar_data = 5
+
+        with pm.Model() as m:
+            s = pm.Simulator("s", NormalSimCustomRV1(), 0, 1, observed=scalar_data)
+        assert self.count_rvs(m.logpt) == 1
+
+        with pm.Model() as m:
+            s = pm.Simulator("s", NormalSimCustomRV2(), 0, 1, observed=scalar_data)
+        assert self.count_rvs(m.logpt) == 1
+
+    def test_model_with_potential(self):
+        assert self.count_rvs(self.SMABC_potential.logpt) == 1

-    def test_potential(self):
         with self.SMABC_potential:
-            trace = pm.sample_smc(draws=1000, kernel="ABC")
+            trace = pm.sample_smc(draws=100, chains=1, return_inferencedata=False)
             assert np.all(trace["a"] >= 0)

+    def test_simulator_metropolis_mcmc(self):
+        with self.SMABC_test as m:
+            step = pm.Metropolis([m.rvs_to_values[m["a"]], m.rvs_to_values[m["b"]]])
+            trace = pm.sample(step=step, return_inferencedata=False)
+
+        assert abs(self.data.mean() - trace["a"].mean()) < 0.05
+        assert abs(self.data.std() - trace["b"].mean()) < 0.05
+
+    def test_multiple_simulators(self):
+        true_a = 2
+        true_b = -2
+
+        data1 = np.random.normal(true_a, 0.1, size=1000)
+        data2 = np.random.normal(true_b, 0.1, size=1000)
+
+        with pm.Model() as m:
+            a = pm.Normal("a", mu=0, sigma=3)
+            b = pm.Normal("b", mu=0, sigma=3)
+            sim1 = pm.Simulator("sim1", NormalSimRV1(), a, 0.1, observed=data1)
+            sim2 = pm.Simulator("sim2", NormalSimRV2(), b, 0.1, observed=data2)
+
+        assert self.count_rvs(m.logpt) == 2
+
+        # Check that the logps use the correct methods
+        a_val = m.rvs_to_values[a]
+        sim1_val = m.rvs_to_values[sim1]
+        logp_sim1 = pm.logp(sim1, sim1_val)
+        logp_sim1_fn = aesara.function([sim1_val, a_val], logp_sim1)
+
+        b_val = m.rvs_to_values[b]
+        sim2_val = m.rvs_to_values[sim2]
+        logp_sim2 = pm.logp(sim2, sim2_val)
+        logp_sim2_fn = aesara.function([sim2_val, b_val], logp_sim2)
+
+        assert any(
+            node for node in logp_sim1_fn.maker.fgraph.toposort() if isinstance(node.op, SortOp)
+        )
+
+        assert not any(
+            node for node in logp_sim2_fn.maker.fgraph.toposort() if isinstance(node.op, SortOp)
+        )
+
+        with m:
+            trace = pm.sample_smc(return_inferencedata=False)
+        assert abs(true_a - trace["a"].mean()) < 0.05
+        assert abs(true_b - trace["b"].mean()) < 0.05
+
+    def test_nested_simulators(self):
+        true_a = 2
+        data = np.random.normal(true_a, 0.1, size=1000)
+
+        with pm.Model() as m:
+            sim1 = pm.Simulator("sim1", NormalSimRV3(), 0, 4)
+            sim2 = pm.Simulator("sim2", NormalSimRV3(), sim1, 0.1, observed=data)
+
+        assert self.count_rvs(m.logpt) == 2
+
+        with m:
+            trace = pm.sample_smc(return_inferencedata=False)
+
+        assert abs(true_a - trace["sim1"].mean()) < 0.1
+
+    def test_simulator_rv_error_msg(self):
+        class RV(pm.SimulatorRV):
+            pass
+
+        msg = "SimulatorRV fn was not specified"
+        with pytest.raises(ValueError, match=msg):
+            RV()
+
+        class RV(pm.SimulatorRV):
+            fn = lambda: None
+
+        msg = "SimulatorRV must specify `ndim_supp`"
+        with pytest.raises(ValueError, match=msg):
+            RV()
+
+        class RV(pm.SimulatorRV):
+            fn = lambda: None
+            ndim_supp = 0
+
+        msg = "SimulatorRV must specify `ndims_params`"
+        with pytest.raises(ValueError, match=msg):
+            RV()
+
+        class RV(pm.SimulatorRV):
+            fn = lambda: None
+            ndim_supp = 0
+            ndims_params = [0, 0, 0]
+            distance = "not_real"
+
+        msg = "The distance metric not_real is not implemented"
+        with pytest.raises(ValueError, match=msg):
+            RV()
+
+        class RV(pm.SimulatorRV):
+            fn = lambda: None
+            ndim_supp = 0
+            ndims_params = [0, 0, 0]
+            distance = "gaussian"
+            sum_stat = "not_real"
+
+        msg = "The summary statistic not_real is not implemented"
+        with pytest.raises(ValueError, match=msg):
+            RV()
+
+    def test_simulator_error_msg(self):
+        msg = "does not seem to be an instantiated"
+        with pm.Model() as m:
+            with pytest.raises(ValueError, match=msg):
+                pm.Simulator("sim", NormalSimRV1, 0, 1)
+
+        msg = "should be a subclass instance of"
+        with pm.Model() as m:
+            with pytest.raises(ValueError, match=msg):
+                pm.Simulator("sim", lambda: None, 0, 1)
+
+        msg = "`Simulator` expected 2 parameters"
+        with pm.Model() as m:
+            with pytest.raises(ValueError, match=msg):
+                pm.Simulator("sim", NormalSimRV1(), 0)
+
+        msg = "distance is no longer defined when calling `pm.Simulator`"
+        with pm.Model() as m:
+            with pytest.raises(ValueError, match=msg):
+                pm.Simulator("sim", NormalSimRV1(), 0, 1, distance="gaussian")
+
+        msg = "sum_stat is no longer defined when calling `pm.Simulator`"
+        with pm.Model() as m:
+            with pytest.raises(ValueError, match=msg):
+                pm.Simulator("sim", NormalSimRV1(), 0, 1, sum_stat="sort")
+
+        msg = "epsilon is no longer defined when calling `pm.Simulator`"
+        with pm.Model() as m:
+            with pytest.raises(ValueError, match=msg):
+                pm.Simulator("sim", NormalSimRV1(), 0, 1, epsilon=1.0)
+
+    def test_params_kwarg_deprecation(self):
+        msg = "The kwarg ``params`` will be deprecated."
+        with pm.Model() as m:
+            with pytest.warns(DeprecationWarning, match=msg):
+                pm.Simulator("sim", NormalSimRV1(), params=(0, 1))
+
+    @pytest.mark.xfail(reason="KL not refactored")
     def test_automatic_use_of_sort(self):
         with pm.Model() as model:
             s_k = pm.Simulator(
@@ -270,28 +469,38 @@
             )
         assert s_k.distribution.sum_stat is pm.distributions.simulator.identity

-    def test_repr_latex(self):
-        expected = "$\\text{s} \\sim \\text{Simulator}(\\text{normal_sim}(a, b), \\text{gaussian}, \\text{sort})$"
-        assert expected == self.s._repr_latex_()
-        assert self.s._repr_latex_() == self.s.__latex__()
-        assert self.SMABC_test.model._repr_latex_() == self.SMABC_test.model.__latex__()
-
     def test_name_is_string_type(self):
         with self.SMABC_potential:
             assert not self.SMABC_potential.name
-            trace = pm.sample_smc(draws=10, kernel="ABC")
+            trace = pm.sample_smc(draws=10, cores=1, return_inferencedata=False)
             assert isinstance(trace._straces[0].name, str)

     def test_named_models_are_unsupported(self):
-        def normal_sim(a, b):
-            return np.random.normal(a, b, 1000)
-
         with pm.Model(name="NamedModel"):
             a = pm.Normal("a", mu=0, sigma=1)
             b = pm.HalfNormal("b", sigma=1)
-            c = pm.Potential("c", pm.math.switch(a > 0, 0, -np.inf))
-            s = pm.Simulator(
-                "s", normal_sim, params=(a, b), sum_stat="sort", epsilon=1, observed=self.data
-            )
+            s = pm.Simulator("s", NormalSimRV1(), a, b, observed=self.data)
+
+        # TODO: Why is this?
         with pytest.raises(NotImplementedError, match="named models"):
-            pm.sample_smc(draws=10, kernel="ABC")
+            pm.sample_smc(draws=10, chains=1)
+
+    def test_deprecated_abc_args(self):
+        with self.SMABC_test:
+            with pytest.warns(
+                DeprecationWarning,
+                match='The kernel "ABC" in sample_smc has been deprecated',
+            ):
+                pm.sample_smc(draws=10, chains=1, kernel="ABC")
+
+            with pytest.warns(
+                DeprecationWarning,
+                match="save_sim_data has been deprecated",
+            ):
+                pm.sample_smc(draws=10, chains=1, save_sim_data=True)
+
+            with pytest.warns(
+                DeprecationWarning,
+                match="save_log_pseudolikelihood has been deprecated",
+            ):
+                pm.sample_smc(draws=10, chains=1, save_log_pseudolikelihood=True)
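Taken together, the new user-facing API looks as follows; a sketch assembled from the docstrings and tests above (names hypothetical, numbers illustrative):

.. code-block:: python

    import numpy as np
    import pymc3 as pm

    data = np.random.normal(loc=0, scale=1, size=1000)

    def normal_sim(rng, mu, sigma, size):
        return rng.normal(mu, sigma, size=size)

    class NormalSimRV(pm.SimulatorRV):
        ndim_supp = 0
        ndims_params = [0, 0]
        fn = normal_sim
        distance = "gaussian"
        sum_stat = "sort"
        epsilon = 1.0

    with pm.Model():
        mu = pm.Normal("mu", 0, 1)
        sigma = pm.HalfNormal("sigma", 1)
        s = pm.Simulator("s", NormalSimRV(), mu, sigma, observed=data)
        trace = pm.sample_smc(draws=1000, return_inferencedata=False)
        ppc = pm.sample_posterior_predictive(trace)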