Expand pm.Data capacities (#3925)
* Initial changes to allow pymc3.Data() to support both int and float input data (previously all input data was coerced to float); WIP for #3813. A usage sketch follows this list.

* Added an exception for invalid dtype input to pandas_to_array

* Refined implementation

* Finished dtype conversion handling

* Added SharedVariable option to getattr_value

* Added dtype handling to set_data function

* Added tests for pm.Data used for index variables

* Added tests for using pm.Data as RV input

* Ran Black on data test files

* Added release note

* Updated release notes

* Updated code in light of Luciano's comments

* Fixed implementation of integer checking

* Simplified implementation of type checking

* Corrected implementation for other uses of pandas_to_array
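
As a quick illustration of the headline change, here is a minimal sketch against the post-PR API (the variable names and numbers are illustrative, not taken from the PR):

```python
import numpy as np
import pymc3 as pm

with pm.Model():
    # Integer data now keeps an integer dtype instead of being coerced to
    # float, so it can be used to index another variable (issue #3813):
    idx = pm.Data("idx", np.array([0, 1, 2, 0]))
    mu = pm.Normal("mu", 0.0, 1.0, shape=3)
    pm.Normal("obs", mu[idx], 0.1, observed=np.array([0.1, 0.9, 2.1, 0.2]))
```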

Co-authored-by: hottwaj <[email protected]>
AlexAndorra and hottwaj authored May 18, 2020
1 parent 7f307b9 commit 1ed0478
Showing 5 changed files with 142 additions and 72 deletions.
4 changes: 3 additions & 1 deletion RELEASE-NOTES.md
@@ -3,7 +3,7 @@
## PyMC3 3.9 (On deck)

### New features
- use [fastprogress](https://github.com/fastai/fastprogress) instead of tqdm [#3693](https://github.com/pymc-devs/pymc3/pull/3693)
- Use [fastprogress](https://github.com/fastai/fastprogress) instead of tqdm [#3693](https://github.com/pymc-devs/pymc3/pull/3693).
- `DEMetropolis` can now tune both `lambda` and `scaling` parameters, but by default neither of them are tuned. See [#3743](https://github.com/pymc-devs/pymc3/pull/3743) for more info.
- `DEMetropolisZ`, an improved variant of `DEMetropolis` brings better parallelization and higher efficiency with fewer chains with a slower initial convergence. This implementation is experimental. See [#3784](https://github.com/pymc-devs/pymc3/pull/3784) for more info.
- Notebooks that give insight into `DEMetropolis`, `DEMetropolisZ` and the `DifferentialEquation` interface are now located in the [Tutorials/Deep Dive](https://docs.pymc.io/nb_tutorials/index.html) section.
@@ -14,6 +14,8 @@
- `pm.sample` now has support for adapting dense mass matrix using `QuadPotentialFullAdapt` (see [#3596](https://github.com/pymc-devs/pymc3/pull/3596), [#3705](https://github.com/pymc-devs/pymc3/pull/3705), [#3858](https://github.com/pymc-devs/pymc3/pull/3858), and [#3893](https://github.com/pymc-devs/pymc3/pull/3893)). Use `init="adapt_full"` or `init="jitter+adapt_full"` to use.
- `Moyal` distribution added (see [#3870](https://github.com/pymc-devs/pymc3/pull/3870)).
- `pm.LKJCholeskyCov` now automatically computes and returns the unpacked Cholesky decomposition, the correlations and the standard deviations of the covariance matrix (see [#3881](https://github.com/pymc-devs/pymc3/pull/3881)).
- `pm.Data` container can now be used for index variables, i.e. with integer data and not only floats (issue [#3813](https://github.com/pymc-devs/pymc3/issues/3813), fixed by [#3925](https://github.com/pymc-devs/pymc3/pull/3925)).
- `pm.Data` container can now be used as input for other random variables (issue [#3842](https://github.com/pymc-devs/pymc3/issues/3842), fixed by [#3925](https://github.com/pymc-devs/pymc3/pull/3925)).

### Maintenance
- Tuning results no longer leak into sequentially sampled `Metropolis` chains (see #3733 and #3796).
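Taken together, the two new `pm.Data` entries above amount to the following usage pattern. This sketch is adapted from the tests added in this PR; it is illustrative and not part of the release notes themselves:

```python
import pymc3 as pm

with pm.Model():
    # Integer-valued data as an index variable (#3813):
    index = pm.Data("index", [2, 0, 1, 0, 2])
    y = pm.Data("y", [1.0, 2.0, 3.0, 2.0, 1.0])
    alpha = pm.Normal("alpha", 0, 1.5, shape=3)
    pm.Normal("obs", alpha[index], 0.1, observed=y)

    # Data as direct input to another random variable (#3842):
    x = pm.Data("x", [1.0, 2.0, 3.0])
    pm.Normal("z", mu=x, shape=3)
```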
20 changes: 11 additions & 9 deletions pymc3/data.py
@@ -153,9 +153,9 @@ class Minibatch(tt.TensorVariable):
Examples
--------
Suppose we have `data` as follows:
>>> data = np.random.rand(100, 100)
If we want a 1d slice of size 10, we do
>>> x = Minibatch(data, batch_size=10)
@@ -182,7 +182,7 @@ class Minibatch(tt.TensorVariable):
>>> assert x.eval().shape == (10, 10)
You can pass the Minibatch `x` to your desired model:
>>> with pm.Model() as model:
@@ -192,7 +192,7 @@ class Minibatch(tt.TensorVariable):
Then you can perform regular Variational Inference out of the box
>>> with model:
... approx = pm.fit()
@@ -478,16 +478,19 @@ class Data:
For more information, take a look at this example notebook
https://docs.pymc.io/notebooks/data_container.html
"""

def __new__(self, name, value):
if isinstance(value, list):
value = np.array(value)

# Add data container to the named variables of the model.
try:
model = pm.Model.get_context()
except TypeError:
raise TypeError("No model on context stack, which is needed to "
"instantiate a data container. Add variable "
"inside a 'with model:' block.")

raise TypeError(
"No model on context stack, which is needed to instantiate a data container. "
"Add variable inside a 'with model:' block."
)
name = model.name_for(name)

# `pm.model.pandas_to_array` takes care of parameter `value` and
@@ -498,7 +501,6 @@ def __new__(self, name, value):
# its shape.
shared_object.dshape = tuple(shared_object.shape.eval())


model.add_random_variable(shared_object)

return shared_object
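
Two behavioral notes on the rewritten `Data.__new__`, shown as a hedged sketch (the printed message is abbreviated):

```python
import pymc3 as pm

# A plain list is converted with np.array up front, so the dtype-based
# int/float handling downstream sees a real dtype:
with pm.Model():
    idx = pm.Data("idx", [1, 2, 3])  # stored as an integer-typed shared variable

# Outside a model context, a descriptive TypeError is now raised:
try:
    pm.Data("oops", [1.0, 2.0])
except TypeError as err:
    print(err)  # No model on context stack, which is needed to instantiate ...
```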
3 changes: 3 additions & 0 deletions pymc3/distributions/distribution.py
@@ -111,6 +111,9 @@ def getattr_value(self, val):
if isinstance(val, tt.TensorVariable):
return val.tag.test_value

if isinstance(val, tt.sharedvar.TensorSharedVariable):
return val.get_value()

if isinstance(val, theano_constant):
return val.value

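This branch matters because `pm.Data` hands back a Theano shared variable, which is not a `tt.TensorVariable` and previously fell through `getattr_value` unresolved. A minimal sketch, using `pm.Normal.dist` and `theano.shared` as stand-ins for a full model:

```python
import numpy as np
import theano
import pymc3 as pm

dist = pm.Normal.dist(0.0, 1.0)               # unbound distribution object
shared = theano.shared(np.array([0.0, 1.0]))  # what pm.Data creates internally

# With this change, the shared variable is resolved to its current numpy
# contents rather than being returned as a variable object:
print(dist.getattr_value(shared))  # array([0., 1.])
```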
18 changes: 15 additions & 3 deletions pymc3/model.py
@@ -1244,7 +1244,7 @@ def set_data(new_data, model=None):
----------
new_data: dict
New values for the data containers. The keys of the dictionary are
the variables names in the model and the values are the objects
the variables' names in the model and the values are the objects
with which to update.
model: Model (optional if in `with` context)
@@ -1266,7 +1266,7 @@ def set_data(new_data, model=None):
.. code:: ipython
>>> with model:
... pm.set_data({'x': [5,6,9]})
... pm.set_data({'x': [5., 6., 9.]})
... y_test = pm.sample_posterior_predictive(trace)
>>> y_test['obs'].mean(axis=0)
array([4.6088569 , 5.54128318, 8.32953844])
@@ -1275,6 +1275,8 @@

for variable_name, new_value in new_data.items():
if isinstance(model[variable_name], SharedVariable):
if isinstance(new_value, list):
new_value = np.array(new_value)
model[variable_name].set_value(pandas_to_array(new_value))
else:
message = 'The variable `{}` must be defined as `pymc3.' \
@@ -1501,7 +1503,17 @@ def pandas_to_array(data):
ret = generator(data)
else:
ret = np.asarray(data)
return pm.floatX(ret)

# type handling to enable index variables when data is int:
if hasattr(data, "dtype"):
if "int" in str(data.dtype):
return pm.intX(ret)
# otherwise, assume float:
else:
return pm.floatX(ret)
# needed for uses of this function other than with pm.Data:
else:
return pm.floatX(ret)


def as_tensor(data, name, model, distribution):
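A hedged sketch of the new dtype dispatch in `pandas_to_array`, plus the list handling added to `set_data` (this assumes a source checkout where `pandas_to_array` is importable from `pymc3.model`; exact integer and float widths depend on the `floatX` configuration):

```python
import numpy as np
import pymc3 as pm
from pymc3.model import pandas_to_array

# Integer input keeps an integer dtype (via pm.intX); float input, and any
# input without a dtype attribute such as a list, is still cast with pm.floatX:
print(pandas_to_array(np.array([1, 2, 3])).dtype)   # an integer dtype
print(pandas_to_array(np.array([1.5, 2.5])).dtype)  # floatX, e.g. float64
print(pandas_to_array([1, 2, 3]).dtype)             # no dtype attribute -> floatX

# pm.set_data likewise converts list values with np.array before updating:
with pm.Model() as m:
    x = pm.Data("x", [1.0, 2.0, 3.0])
with m:
    pm.set_data({"x": [5.0, 6.0, 9.0]})
print(x.get_value())  # [5. 6. 9.]
```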
169 changes: 110 additions & 59 deletions pymc3/tests/test_data_container.py
@@ -20,117 +20,168 @@

class TestData(SeededTest):
def test_deterministic(self):
data_values = np.array([.5, .4, 5, 2])
data_values = np.array([0.5, 0.4, 5, 2])
with pm.Model() as model:
X = pm.Data('X', data_values)
pm.Normal('y', 0, 1, observed=X)
X = pm.Data("X", data_values)
pm.Normal("y", 0, 1, observed=X)
model.logp(model.test_point)

def test_sample(self):
x = np.random.normal(size=100)
y = x + np.random.normal(scale=1e-2, size=100)

x_pred = np.linspace(-3, 3, 200, dtype='float32')
x_pred = np.linspace(-3, 3, 200, dtype="float32")

with pm.Model():
x_shared = pm.Data('x_shared', x)
b = pm.Normal('b', 0., 10.)
pm.Normal('obs', b * x_shared, np.sqrt(1e-2), observed=y)
prior_trace0 = pm.sample_prior_predictive(1000)
x_shared = pm.Data("x_shared", x)
b = pm.Normal("b", 0.0, 10.0)
pm.Normal("obs", b * x_shared, np.sqrt(1e-2), observed=y)

prior_trace0 = pm.sample_prior_predictive(1000)
trace = pm.sample(1000, init=None, tune=1000, chains=1)
pp_trace0 = pm.sample_posterior_predictive(trace, 1000)
pp_trace01 = pm.fast_sample_posterior_predictive(trace, 1000)

x_shared.set_value(x_pred)
prior_trace1 = pm.sample_prior_predictive(1000)
pp_trace1 = pm.sample_posterior_predictive(trace, samples=1000)
pp_trace11 = pm.fast_sample_posterior_predictive(trace, samples=1000)
prior_trace1 = pm.sample_prior_predictive(1000)

assert prior_trace0['b'].shape == (1000,)
assert prior_trace0['obs'].shape == (1000, 100)
assert prior_trace1['obs'].shape == (1000, 200)
assert prior_trace0["b"].shape == (1000,)
assert prior_trace0["obs"].shape == (1000, 100)
assert prior_trace1["obs"].shape == (1000, 200)

assert pp_trace0['obs'].shape == (1000, 100)
assert pp_trace01['obs'].shape == (1000, 100)
assert pp_trace0["obs"].shape == (1000, 100)
assert pp_trace01["obs"].shape == (1000, 100)

np.testing.assert_allclose(x, pp_trace0['obs'].mean(axis=0), atol=1e-1)
np.testing.assert_allclose(x, pp_trace01['obs'].mean(axis=0), atol=1e-1)
np.testing.assert_allclose(x, pp_trace0["obs"].mean(axis=0), atol=1e-1)
np.testing.assert_allclose(x, pp_trace01["obs"].mean(axis=0), atol=1e-1)

assert pp_trace1['obs'].shape == (1000, 200)
assert pp_trace11['obs'].shape == (1000, 200)
assert pp_trace1["obs"].shape == (1000, 200)
assert pp_trace11["obs"].shape == (1000, 200)

np.testing.assert_allclose(x_pred, pp_trace1['obs'].mean(axis=0),
atol=1e-1)
np.testing.assert_allclose(x_pred, pp_trace11['obs'].mean(axis=0),
atol=1e-1)
np.testing.assert_allclose(x_pred, pp_trace1["obs"].mean(axis=0), atol=1e-1)
np.testing.assert_allclose(x_pred, pp_trace11["obs"].mean(axis=0), atol=1e-1)

def test_sample_posterior_predictive_after_set_data(self):
with pm.Model() as model:
x = pm.Data('x', [1., 2., 3.])
y = pm.Data('y', [1., 2., 3.])
beta = pm.Normal('beta', 0, 10.)
pm.Normal('obs', beta * x, np.sqrt(1e-2), observed=y)
x = pm.Data("x", [1.0, 2.0, 3.0])
y = pm.Data("y", [1.0, 2.0, 3.0])
beta = pm.Normal("beta", 0, 10.0)
pm.Normal("obs", beta * x, np.sqrt(1e-2), observed=y)
trace = pm.sample(1000, tune=1000, chains=1)
# Predict on new data.
with model:
x_test = [5, 6, 9]
pm.set_data(new_data={'x': x_test})
pm.set_data(new_data={"x": x_test})
y_test = pm.sample_posterior_predictive(trace)
y_test1 = pm.fast_sample_posterior_predictive(trace)

assert y_test['obs'].shape == (1000, 3)
assert y_test1['obs'].shape == (1000, 3)
np.testing.assert_allclose(x_test, y_test['obs'].mean(axis=0),
atol=1e-1)
np.testing.assert_allclose(x_test, y_test1['obs'].mean(axis=0),
atol=1e-1)
assert y_test["obs"].shape == (1000, 3)
assert y_test1["obs"].shape == (1000, 3)
np.testing.assert_allclose(x_test, y_test["obs"].mean(axis=0), atol=1e-1)
np.testing.assert_allclose(x_test, y_test1["obs"].mean(axis=0), atol=1e-1)

def test_sample_after_set_data(self):
with pm.Model() as model:
x = pm.Data('x', [1., 2., 3.])
y = pm.Data('y', [1., 2., 3.])
beta = pm.Normal('beta', 0, 10.)
pm.Normal('obs', beta * x, np.sqrt(1e-2), observed=y)
x = pm.Data("x", [1.0, 2.0, 3.0])
y = pm.Data("y", [1.0, 2.0, 3.0])
beta = pm.Normal("beta", 0, 10.0)
pm.Normal("obs", beta * x, np.sqrt(1e-2), observed=y)
pm.sample(1000, init=None, tune=1000, chains=1)
# Predict on new data.
new_x = [5., 6., 9.]
new_y = [5., 6., 9.]
new_x = [5.0, 6.0, 9.0]
new_y = [5.0, 6.0, 9.0]
with model:
pm.set_data(new_data={'x': new_x, 'y': new_y})
pm.set_data(new_data={"x": new_x, "y": new_y})
new_trace = pm.sample(1000, init=None, tune=1000, chains=1)
pp_trace = pm.sample_posterior_predictive(new_trace, 1000)
pp_tracef = pm.fast_sample_posterior_predictive(new_trace, 1000)

assert pp_trace['obs'].shape == (1000, 3)
assert pp_tracef['obs'].shape == (1000, 3)
np.testing.assert_allclose(new_y, pp_trace['obs'].mean(axis=0),
atol=1e-1)
np.testing.assert_allclose(new_y, pp_tracef['obs'].mean(axis=0),
atol=1e-1)
assert pp_trace["obs"].shape == (1000, 3)
assert pp_tracef["obs"].shape == (1000, 3)
np.testing.assert_allclose(new_y, pp_trace["obs"].mean(axis=0), atol=1e-1)
np.testing.assert_allclose(new_y, pp_tracef["obs"].mean(axis=0), atol=1e-1)

def test_shared_data_as_index(self):
"""
Allow pm.Data to be used for index variables, i.e. with integers as well as floats.
See https://github.com/pymc-devs/pymc3/issues/3813
"""
with pm.Model() as model:
index = pm.Data("index", [2, 0, 1, 0, 2])
y = pm.Data("y", [1.0, 2.0, 3.0, 2.0, 1.0])
alpha = pm.Normal("alpha", 0, 1.5, shape=3)
pm.Normal("obs", alpha[index], np.sqrt(1e-2), observed=y)

prior_trace = pm.sample_prior_predictive(1000, var_names=["alpha"])
trace = pm.sample(1000, init=None, tune=1000, chains=1)

# Predict on new data
new_index = np.array([0, 1, 2])
new_y = [5.0, 6.0, 9.0]
with model:
pm.set_data(new_data={"index": new_index, "y": new_y})
pp_trace = pm.sample_posterior_predictive(
trace, 1000, var_names=["alpha", "obs"]
)
pp_tracef = pm.fast_sample_posterior_predictive(
trace, 1000, var_names=["alpha", "obs"]
)

assert prior_trace["alpha"].shape == (1000, 3)
assert trace["alpha"].shape == (1000, 3)
assert pp_trace["alpha"].shape == (1000, 3)
assert pp_trace["obs"].shape == (1000, 3)
assert pp_tracef["alpha"].shape == (1000, 3)
assert pp_tracef["obs"].shape == (1000, 3)

def test_shared_data_as_rv_input(self):
"""
Allow pm.Data to be used as input for other RVs.
See https://github.com/pymc-devs/pymc3/issues/3842
"""
with pm.Model() as m:
x = pm.Data("x", [1.0, 2.0, 3.0])
_ = pm.Normal("y", mu=x, shape=3)
trace = pm.sample(chains=1)

np.testing.assert_allclose(np.array([1.0, 2.0, 3.0]), x.get_value(), atol=1e-1)
np.testing.assert_allclose(
np.array([1.0, 2.0, 3.0]), trace["y"].mean(0), atol=1e-1
)

with m:
pm.set_data({"x": np.array([2.0, 4.0, 6.0])})
trace = pm.sample(chains=1)

np.testing.assert_allclose(np.array([2.0, 4.0, 6.0]), x.get_value(), atol=1e-1)
np.testing.assert_allclose(
np.array([2.0, 4.0, 6.0]), trace["y"].mean(0), atol=1e-1
)

def test_creation_of_data_outside_model_context(self):
with pytest.raises((IndexError, TypeError)) as error:
pm.Data('data', [1.1, 2.2, 3.3])
error.match('No model on context stack')
pm.Data("data", [1.1, 2.2, 3.3])
error.match("No model on context stack")

def test_set_data_to_non_data_container_variables(self):
with pm.Model() as model:
x = np.array([1., 2., 3.])
y = np.array([1., 2., 3.])
beta = pm.Normal('beta', 0, 10.)
pm.Normal('obs', beta * x, np.sqrt(1e-2), observed=y)
x = np.array([1.0, 2.0, 3.0])
y = np.array([1.0, 2.0, 3.0])
beta = pm.Normal("beta", 0, 10.0)
pm.Normal("obs", beta * x, np.sqrt(1e-2), observed=y)
pm.sample(1000, init=None, tune=1000, chains=1)
with pytest.raises(TypeError) as error:
pm.set_data({'beta': [1.1, 2.2, 3.3]}, model=model)
error.match('defined as `pymc3.Data` inside the model')
pm.set_data({"beta": [1.1, 2.2, 3.3]}, model=model)
error.match("defined as `pymc3.Data` inside the model")

def test_model_to_graphviz_for_model_with_data_container(self):
with pm.Model() as model:
x = pm.Data('x', [1., 2., 3.])
y = pm.Data('y', [1., 2., 3.])
beta = pm.Normal('beta', 0, 10.)
pm.Normal('obs', beta * x, np.sqrt(1e-2), observed=y)
x = pm.Data("x", [1.0, 2.0, 3.0])
y = pm.Data("y", [1.0, 2.0, 3.0])
beta = pm.Normal("beta", 0, 10.0)
pm.Normal("obs", beta * x, np.sqrt(1e-2), observed=y)
pm.sample(1000, init=None, tune=1000, chains=1)

g = pm.model_to_graphviz(model)
@@ -147,7 +198,7 @@ def test_model_to_graphviz_for_model_with_data_container(self):

def test_data_naming():
"""
This is a test for issue #3793 -- `Data` objects in named models are
This is a test for issue #3793 -- `Data` objects in named models are
not given model-relative names.
"""
with pm.Model("named_model") as model:
