Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

SMC Sampling fails with multivariate distribution #312

Open
mathDR opened this issue Nov 29, 2019 · 2 comments
Open

SMC Sampling fails with multivariate distribution #312

mathDR opened this issue Nov 29, 2019 · 2 comments

Comments

@mathDR
Copy link

mathDR commented Nov 29, 2019

Summary:

When a sequence of thresholds having length > 1 is passed to smc, the inference fails with ValueError: In executing node '_theta_logpdf': Vector 'x' must have either the same number of entries as, or one entry fewer than, parameter vector 'a', but alpha.shape = (10,) and x.shape = (100000,)..

Description:

Using a simulator of a multinomial model, having a dirichlet prior, attempting to run smc for a sequence of thresholds produces an error.

Reproducible Steps:

import numpy as np
import scipy.stats as ss
import elfi
num_categories = 10
num_experiments = 100
theta = elfi.Prior('dirichlet', np.ones(num_categories), name='theta')
def simulator(num_experiments, theta, batch_size=1, random_state=None):
    if len(theta.shape)==1:
        theta = theta.reshape(1,-1)
    return_value = []

    for i in range(theta.shape[0]):
        return_value.append(
            ss.multinomial.rvs(
                100, 
                theta[i,:], 
                random_state=random_state
            )
        )
    return np.array(return_value)
                       
def mean(y):
    return np.mean(y, axis=1)

def ret_identity(y):
    return y

def distance_in_variation(p1,observed=[]):
    arr_p1 = np.asarray(p1)
    y = np.tile(observed[0],(np.asarray(p1).shape[0],1))
    
    return 0.5*np.sum(np.abs(arr_p1-y),axis=1)

theta0 = np.random.dirichlet(np.ones(num_categories))
np.random.seed(20191126) 
y0 = simulator(num_experiments,theta0)

from functools import partial
# Add the simulator node and observed data to the model
sim_fn = partial(simulator,num_experiments)
sim = elfi.Simulator(sim_fn, theta, observed=y0)
d = elfi.Discrepancy(
    distance_in_variation, 
    elfi.Summary(ret_identity, sim, name='ret_identity'), 
    name='d')

smc = elfi.SMC(d, batch_size=10000)

N = 1000
schedule = [100,99]
%time result_smc = smc.sample(N, schedule)

Current Output:

When the threshold schedule is [100], everything runs. When it is changed to [100,99] it fails with the error:

Progress: |█████████████████████████-------------------------| 50.0% Complete
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
~/.pyenv/versions/3.7.4/envs/work/lib/python3.7/site-packages/elfi/executor.py in execute(cls, G)
     69                 try:
---> 70                     G.node[node] = cls._run(op, node, G)
     71                 except Exception as exc:

~/.pyenv/versions/3.7.4/envs/work/lib/python3.7/site-packages/elfi/executor.py in _run(fn, node, G)
    153 
--> 154         output_dict = {'output': fn(*args, **kwargs)}
    155         return output_dict

~/.pyenv/versions/3.7.4/envs/work/lib/python3.7/site-packages/scipy/stats/_multivariate.py in logpdf(self, x, alpha)
   1448         alpha = _dirichlet_check_parameters(alpha)
-> 1449         x = _dirichlet_check_input(alpha, x)
   1450 

~/.pyenv/versions/3.7.4/envs/work/lib/python3.7/site-packages/scipy/stats/_multivariate.py in _dirichlet_check_input(alpha, x)
   1242                          "parameter vector 'a', but alpha.shape = %s "
-> 1243                          "and x.shape = %s." % (alpha.shape, x.shape))
   1244 

ValueError: Vector 'x' must have either the same number of entries as, or one entry fewer than, parameter vector 'a', but alpha.shape = (10,) and x.shape = (100000,).

During handling of the above exception, another exception occurred:

ValueError                                Traceback (most recent call last)
<timed exec> in <module>

~/.pyenv/versions/3.7.4/envs/work/lib/python3.7/site-packages/elfi/methods/parameter_inference.py in sample(self, n_samples, *args, **kwargs)
    404         bar = kwargs.pop('bar', True)
    405 
--> 406         return self.infer(n_samples, *args, bar=bar, **kwargs)
    407 
    408     def _extract_result_kwargs(self):

~/.pyenv/versions/3.7.4/envs/work/lib/python3.7/site-packages/elfi/methods/parameter_inference.py in infer(self, vis, bar, *args, **kwargs)
    262 
    263         while not self.finished:
--> 264             self.iterate()
    265             if vis:
    266                 self.plot_state(interactive=True, **vis_opt)

~/.pyenv/versions/3.7.4/envs/work/lib/python3.7/site-packages/elfi/methods/parameter_inference.py in iterate(self)
    299         # Submit new batches if allowed
    300         while self._allow_submit(self.batches.next_index):
--> 301             next_batch = self.prepare_new_batch(self.batches.next_index)
    302             logger.debug("Submitting batch %d" % self.batches.next_index)
    303             self.batches.submit(next_batch)

~/.pyenv/versions/3.7.4/envs/work/lib/python3.7/site-packages/elfi/methods/parameter_inference.py in prepare_new_batch(self, batch_index)
    716         params = GMDistribution.rvs(*self._gm_params, size=self.batch_size,
    717                                     prior_logpdf=self._prior.logpdf,
--> 718                                     random_state=self._round_random_state)
    719 
    720         batch = arr2d_to_batch(params, self.parameter_names)

~/.pyenv/versions/3.7.4/envs/work/lib/python3.7/site-packages/elfi/methods/utils.py in rvs(cls, means, cov, weights, size, prior_logpdf, random_state)
    229             # check validity of x
    230             if prior_logpdf is not None:
--> 231                 x = x[np.isfinite(prior_logpdf(x))]
    232 
    233             n_accepted1 = len(x)

~/.pyenv/versions/3.7.4/envs/work/lib/python3.7/site-packages/elfi/methods/utils.py in logpdf(self, x)
    356     def logpdf(self, x):
    357         """Return the log density of the joint prior at x."""
--> 358         return self._evaluate_pdf(x, log=True)
    359 
    360     def _evaluate_pdf(self, x, log=False):

~/.pyenv/versions/3.7.4/envs/work/lib/python3.7/site-packages/elfi/methods/utils.py in _evaluate_pdf(self, x, log)
    380             loaded_net.node[k] = {'output': v}
    381 
--> 382         val = self.client.compute(loaded_net)[node]
    383         if ndim == 0 or (ndim == 1 and self.dim > 1):
    384             val = val[0]

~/.pyenv/versions/3.7.4/envs/work/lib/python3.7/site-packages/elfi/client.py in compute(self, loaded_net)
    271     def compute(self, loaded_net):
    272         """Request evaluation of `loaded_net` and wait for result."""
--> 273         return self.apply_sync(Executor.execute, loaded_net)
    274 
    275     @property

~/.pyenv/versions/3.7.4/envs/work/lib/python3.7/site-packages/elfi/clients/native.py in apply_sync(self, kallable, *args, **kwargs)
     51 
     52         """
---> 53         return kallable(*args, **kwargs)
     54 
     55     def get_result(self, task_id):

~/.pyenv/versions/3.7.4/envs/work/lib/python3.7/site-packages/elfi/executor.py in execute(cls, G)
     71                 except Exception as exc:
     72                     raise exc.__class__("In executing node '{}': {}."
---> 73                                         .format(node, exc)).with_traceback(exc.__traceback__)
     74 
     75             elif 'output' not in attr:

~/.pyenv/versions/3.7.4/envs/work/lib/python3.7/site-packages/elfi/executor.py in execute(cls, G)
     68                 op = attr['operation']
     69                 try:
---> 70                     G.node[node] = cls._run(op, node, G)
     71                 except Exception as exc:
     72                     raise exc.__class__("In executing node '{}': {}."

~/.pyenv/versions/3.7.4/envs/work/lib/python3.7/site-packages/elfi/executor.py in _run(fn, node, G)
    152         args = [a[1] for a in sorted(args, key=itemgetter(0))]
    153 
--> 154         output_dict = {'output': fn(*args, **kwargs)}
    155         return output_dict
    156 

~/.pyenv/versions/3.7.4/envs/work/lib/python3.7/site-packages/scipy/stats/_multivariate.py in logpdf(self, x, alpha)
   1447         """
   1448         alpha = _dirichlet_check_parameters(alpha)
-> 1449         x = _dirichlet_check_input(alpha, x)
   1450 
   1451         out = self._logpdf(x, alpha)

~/.pyenv/versions/3.7.4/envs/work/lib/python3.7/site-packages/scipy/stats/_multivariate.py in _dirichlet_check_input(alpha, x)
   1241                          "of entries as, or one entry fewer than, "
   1242                          "parameter vector 'a', but alpha.shape = %s "
-> 1243                          "and x.shape = %s." % (alpha.shape, x.shape))
   1244 
   1245     if x.shape[0] != alpha.shape[0]:

ValueError: In executing node '_theta_logpdf': Vector 'x' must have either the same number of entries as, or one entry fewer than, parameter vector 'a', but alpha.shape = (10,) and x.shape = (100000,)..

Expected Output:

I expect the output to have samples similar to running with threshold [99].

ELFI Version:

0.7.4

Python Version:

3.7.4

Operating System:

MAC OSX

@hpesonen
Copy link
Member

hpesonen commented Dec 3, 2019

Hi,
unfortunately SMC-ABC in ELFI doesn't support multidimensional priors. However, in the case of Dirichlet-prior, you could use the following change of variables based on independent gamma-priors.

import numpy as np
import scipy.stats as ss
import elfi
import collections
from functools import partial

def simulator(*alpha, num_experiments, batch_size=1, random_state=None):

    n_dim=len(alpha)
    batches_alpha = np.zeros(shape=(batch_size, n_dim))

    for idx_dim, param_alpha in enumerate(alpha):
        batches_alpha[:, idx_dim] = param_alpha

    return_value = []
    theta = batches_alpha / np.sum(batches_alpha,axis=1, keepdims=True)
    return_value = np.zeros((theta.shape))
    for i in range(batch_size):
        return_value[i,:] = ss.multinomial.rvs(num_experiments, 
                                           theta[i,:], random_state=random_state)
        
    return np.array(return_value)
                       
def mean(y):
    return np.mean(y, axis=1)

def ret_identity(y):
    return y

def distance_in_variation(p1,observed=[]):
    arr_p1 = np.asarray(p1)
    y = np.tile(observed[0],(np.asarray(p1).shape[0],1))
    return 0.5*np.sum(np.abs(arr_p1-y),axis=1)

num_categories = 10
num_experiments = 100
theta0 = np.random.dirichlet(np.ones(num_categories))
y0 = simulator(*theta0, num_experiments = num_experiments)
m = elfi.new_model()

priors = []

for i in range(num_categories):
    name_prior = 'alpha_{}'.format(i)
    prior_alpha = elfi.Prior('gamma', 1, model=m, name=name_prior)
    priors.append(prior_alpha)

np.random.seed(20191126) 

# Add the simulator node and observed data to the model
sim_fn = partial(simulator,num_experiments=num_experiments)
sim = elfi.Simulator(sim_fn, *priors, observed=y0)
d = elfi.Discrepancy(
    distance_in_variation, 
    elfi.Summary(ret_identity, sim, model=m, name='ret_identity'), name='d')

smc = elfi.SMC(d, batch_size=1000)

N = 10000
schedule = [100,90,80,70,50,40]
result_smc = smc.sample(N, thresholds = schedule, bar=False)

alpha = result_smc.sample_means.values()
total_sum = sum(alpha) 
theta_values = list(alpha) / total_sum

for i in range(num_categories):
    print("theta_% 1d % 10.4f % 10.4f" %(i, theta0[i], theta_values[i]))

@mathDR
Copy link
Author

mathDR commented Dec 4, 2019

Thanks for the tip. I will definitely put this into practice.

If this is functionality you think would help out in ELFI, I would be willing to take a shot at a PR extending the SMC functionality.

Please advise and thanks!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

2 participants