Add numerically safe ordered probit distribution. #4232

Merged: 9 commits, Dec 16, 2020
2 changes: 2 additions & 0 deletions RELEASE-NOTES.md
@@ -9,6 +9,8 @@ This is the first release to support Python3.9 and to drop Python3.6.
- Make `sample_shape` same across all contexts in `draw_values` (see [#4305](https://github.com/pymc-devs/pymc3/pull/4305)).
- Removed `theanof.set_theano_config` because it illegally touched Theano's privates (see [#4329](https://github.com/pymc-devs/pymc3/pull/4329)).

### New Features
- `OrderedProbit` distribution added (see [#4232](https://github.com/pymc-devs/pymc3/pull/4232)).

## PyMC3 3.10.0 (7 December 2020)

2 changes: 2 additions & 0 deletions pymc3/distributions/__init__.py
@@ -61,6 +61,7 @@
HyperGeometric,
NegativeBinomial,
OrderedLogistic,
OrderedProbit,
Poisson,
ZeroInflatedBinomial,
ZeroInflatedNegativeBinomial,
@@ -140,6 +141,7 @@
"HyperGeometric",
"Categorical",
"OrderedLogistic",
"OrderedProbit",
"DensityDist",
"Distribution",
"Continuous",
131 changes: 131 additions & 0 deletions pymc3/distributions/discrete.py
@@ -24,7 +24,10 @@
binomln,
bound,
factln,
log_diff_normal_cdf,
logpow,
normal_lccdf,
normal_lcdf,
random_choice,
)
from pymc3.distributions.distribution import Discrete, draw_values, generate_samples
@@ -1638,3 +1641,131 @@ def __init__(self, eta, cutpoints, *args, **kwargs):
p = p_cum[..., 1:] - p_cum[..., :-1]

super().__init__(p=p, *args, **kwargs)


class OrderedProbit(Categorical):
R"""
Ordered Probit log-likelihood.

Useful for regression on ordinal data whose values range
from 1 to K as a function of some predictor, :math:`\eta`. The
cutpoints, :math:`c`, separate which ranges of :math:`\eta` are
mapped to which of the K observed dependent variables. The number
of cutpoints is K - 1. It is recommended that the cutpoints are
constrained to be ordered.

To stabilize the computation, the log-likelihood is evaluated directly
in log space using the scaled complementary error function `erfcx`.
The latent variable is assumed to be standard normal, so no separate
scale parameter is exposed.

.. math::

f(k \mid \eta, c) = \left\{
\begin{array}{l}
1 - \text{normal_cdf}(0, 1, \eta - c_1)
\,, \text{if } k = 0 \\
\text{normal_cdf}(0, 1, \eta - c_{k - 1}) -
\text{normal_cdf}(0, 1, \eta - c_{k})
\,, \text{if } 0 < k < K \\
\text{normal_cdf}(0, 1, \eta - c_{K - 1})
\,, \text{if } k = K \\
\end{array}
\right.

Parameters
----------
eta : float
The predictor.
cutpoints : array
The length K - 1 array of cutpoints which break :math:`\eta` into
ranges. Do not explicitly set the first and last elements of
:math:`c` to negative and positive infinity.

Examples
--------
.. code:: python

import numpy as np
import matplotlib.pyplot as plt
import pymc3 as pm

# Generate data for a simple one-dimensional example problem
n1_c = 300; n2_c = 300; n3_c = 300
cluster1 = np.random.randn(n1_c) + -1
cluster2 = np.random.randn(n2_c) + 0
cluster3 = np.random.randn(n3_c) + 2

x = np.concatenate((cluster1, cluster2, cluster3))
y = np.concatenate((1*np.ones(n1_c),
2*np.ones(n2_c),
3*np.ones(n3_c))) - 1

# Ordered probit regression
with pm.Model() as model:
cutpoints = pm.Normal("cutpoints", mu=[-1,1], sigma=10, shape=2,
transform=pm.distributions.transforms.ordered)
y_ = pm.OrderedProbit("y", cutpoints=cutpoints, eta=x, observed=y)
tr = pm.sample(1000)

# Plot the results
plt.hist(cluster1, 30, alpha=0.5);
plt.hist(cluster2, 30, alpha=0.5);
plt.hist(cluster3, 30, alpha=0.5);
plt.hist(tr["cutpoints"][:,0], 80, alpha=0.2, color='k');
plt.hist(tr["cutpoints"][:,1], 80, alpha=0.2, color='k');

"""

def __init__(self, eta, cutpoints, *args, **kwargs):

self.eta = tt.as_tensor_variable(floatX(eta))
self.cutpoints = tt.as_tensor_variable(cutpoints)

probits = tt.shape_padright(self.eta) - self.cutpoints
_log_p = tt.concatenate(
[
tt.shape_padright(normal_lccdf(0, 1, probits[..., 0])),
log_diff_normal_cdf(0, 1, probits[..., :-1], probits[..., 1:]),
tt.shape_padright(normal_lcdf(0, 1, probits[..., -1])),
],
axis=-1,
)
_log_p = tt.as_tensor_variable(floatX(_log_p))

self._log_p = _log_p
self.mode = tt.argmax(_log_p, axis=-1)
p = tt.exp(_log_p)

super().__init__(p=p, *args, **kwargs)

def logp(self, value):
r"""
Calculate log-probability of Ordered Probit distribution at specified value.

Parameters
----------
value: numeric
Value(s) for which log-probability is calculated. If the log probabilities for multiple
values are desired, the values must be provided in a numpy array or theano tensor.

Returns
-------
TensorVariable
"""
logp = self._log_p
k = self.k

# Clip values before using them for indexing
value_clip = tt.clip(value, 0, k - 1)

if logp.ndim > 1:
if logp.ndim > value_clip.ndim:
value_clip = tt.shape_padleft(value_clip, logp.ndim - value_clip.ndim)
elif logp.ndim < value_clip.ndim:
logp = tt.shape_padleft(logp, value_clip.ndim - logp.ndim)
pattern = (logp.ndim - 1,) + tuple(range(logp.ndim - 1))
a = take_along_axis(
logp.dimshuffle(pattern),
value_clip,
)
else:
a = logp[value_clip]

return bound(a, value >= 0, value <= (k - 1))
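
To make the construction of the class's log-probabilities concrete, here is a minimal NumPy/SciPy sketch (an illustrative aside, not part of the diff; the function name `ordered_probit_logp_numpy` and the example numbers are made up). It mirrors how `__init__` stitches together the upper tail, the interior CDF differences, and the lower tail of a standard normal:

import numpy as np
from scipy.special import log_ndtr  # log of the standard normal CDF
from scipy.stats import norm

def ordered_probit_logp_numpy(eta, cutpoints):
    """Length-K vector of category log-probabilities for a scalar eta."""
    probits = eta - np.asarray(cutpoints)  # shape (K - 1,)
    first = log_ndtr(-probits[0])          # log P(y = 0)     = log Phi(c_1 - eta)
    last = log_ndtr(probits[-1])           # log P(y = K - 1) = log Phi(eta - c_{K-1})
    # Interior categories: log(Phi(eta - c_k) - Phi(eta - c_{k+1})).
    # The Theano code routes this through log_diff_normal_cdf for stability;
    # a plain difference of CDFs is used here for readability.
    middle = np.log(norm.cdf(probits[:-1]) - norm.cdf(probits[1:]))
    return np.concatenate(([first], middle, [last]))

# Three categories, two cutpoints: the probabilities sum to one.
logp = ordered_probit_logp_numpy(eta=0.3, cutpoints=[-1.0, 1.0])
assert np.isclose(np.exp(logp).sum(), 1.0)
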
40 changes: 40 additions & 0 deletions pymc3/distributions/dist_math.py
@@ -134,6 +134,46 @@ def normal_lccdf(mu, sigma, x):
)


def log_diff_normal_cdf(mu, sigma, x, y):
"""
Compute :math:`\\log(\\Phi(\\frac{x - \\mu}{\\sigma}) - \\Phi(\\frac{y - \\mu}{\\sigma}))` safely in log space.

Parameters
----------
mu: float
mean
sigma: float
std

x: float
the larger of the two evaluation points

y: float
the smaller of the two evaluation points; must be strictly less than x.

Returns
-------
log (\\Phi(x) - \\Phi(y))

"""
x = (x - mu) / sigma / tt.sqrt(2.0)
y = (y - mu) / sigma / tt.sqrt(2.0)

# To stabilize the computation, consider these three regions:
# 1) x > y > 0 => Use erf(x) = 1 - e^{-x^2} erfcx(x) and erf(y) = 1 - e^{-y^2} erfcx(y)
# 2) 0 > x > y => Use erf(x) = e^{-x^2} erfcx(-x) - 1 and erf(y) = e^{-y^2} erfcx(-y) - 1
# 3) x > 0 > y => Naive formula log( (erf(x) - erf(y)) / 2 ) works fine.
return tt.log(0.5) + tt.switch(
tt.gt(y, 0),
-tt.square(y) + tt.log(tt.erfcx(y) - tt.exp(tt.square(y) - tt.square(x)) * tt.erfcx(x)),
tt.switch(
tt.lt(x, 0), # 0 > x > y
-tt.square(x)
+ tt.log(tt.erfcx(-x) - tt.exp(tt.square(x) - tt.square(y)) * tt.erfcx(-y)),
tt.log(tt.erf(x) - tt.erf(y)),  # x > 0 > y
),
)


def sigma2rho(sigma):
"""
`sigma -> rho` theano converter
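
As an illustrative aside (not part of the diff; `log_diff_normal_cdf_np` and the test values are made up), the payoff of the erfcx rewrite is easy to see with SciPy: far out in a tail, the naive difference of normal CDFs underflows to zero, while the stabilized form keeps a finite, accurate log:

import numpy as np
from scipy.special import erf, erfcx
from scipy.stats import norm

def log_diff_normal_cdf_np(x, y):
    """log(Phi(x) - Phi(y)) for x > y, standard normal, using the same erfcx identities."""
    x, y = x / np.sqrt(2.0), y / np.sqrt(2.0)
    if y > 0:  # region 1: both points in the upper tail
        return np.log(0.5) - y ** 2 + np.log(erfcx(y) - np.exp(y ** 2 - x ** 2) * erfcx(x))
    if x < 0:  # region 2: both points in the lower tail
        return np.log(0.5) - x ** 2 + np.log(erfcx(-x) - np.exp(x ** 2 - y ** 2) * erfcx(-y))
    return np.log(0.5) + np.log(erf(x) - erf(y))  # region 3: the interval straddles zero

naive = np.log(norm.cdf(11.0) - norm.cdf(10.0))  # -inf: both CDF values round to 1.0 in float64
stable = log_diff_normal_cdf_np(11.0, 10.0)      # roughly -53.2, finite and accurate
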
23 changes: 22 additions & 1 deletion pymc3/tests/test_distributions.py
@@ -26,7 +26,7 @@
from numpy import array, exp, inf, log
from numpy.testing import assert_allclose, assert_almost_equal, assert_equal
from scipy import integrate
from scipy.special import logit
from scipy.special import erf, logit

import pymc3 as pm

@@ -74,6 +74,7 @@
NegativeBinomial,
Normal,
OrderedLogistic,
OrderedProbit,
Pareto,
Poisson,
Rice,
@@ -431,6 +432,17 @@ def orderedlogistic_logpdf(value, eta, cutpoints):
return np.where(np.all(ps >= 0), np.log(p), -np.inf)


def invprobit(x):
return (erf(x / np.sqrt(2)) + 1) / 2


def orderedprobit_logpdf(value, eta, cutpoints):
c = np.concatenate(([-np.inf], cutpoints, [np.inf]))
ps = np.array([invprobit(eta - cc) - invprobit(eta - cc1) for cc, cc1 in zip(c[:-1], c[1:])])
p = ps[value]
return np.where(np.all(ps >= 0), np.log(p), -np.inf)


class Simplex:
def __init__(self, n):
self.vals = list(simplex_values(n))
@@ -1536,6 +1548,15 @@ def test_orderedlogistic(self, n):
lambda value, eta, cutpoints: orderedlogistic_logpdf(value, eta, cutpoints),
)

@pytest.mark.parametrize("n", [2, 3, 4])
def test_orderedprobit(self, n):
self.pymc3_matches_scipy(
OrderedProbit,
Domain(range(n), "int64"),
{"eta": Runif, "cutpoints": UnitSortedVector(n - 1)},
lambda value, eta, cutpoints: orderedprobit_logpdf(value, eta, cutpoints),
)

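A quick, illustrative sanity check of the reference implementation used in this test (not part of the test suite; the cutpoint and eta values are made up, and `orderedprobit_logpdf` with `invprobit` are the helpers defined earlier in this file): exponentiating the reference log-pdf over all categories should give probabilities that sum to one.

import numpy as np

cutpoints = np.array([-1.0, 1.0])
eta = 0.3
probs = np.exp([orderedprobit_logpdf(k, eta, cutpoints) for k in range(3)])
assert np.isclose(probs.sum(), 1.0)  # the three category probabilities form a simplex
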
def test_densitydist(self):
def logp(x):
return -log(2 * 0.5) - abs(x - 0.5) / 0.5