diff --git a/RELEASE-NOTES.md b/RELEASE-NOTES.md index 8858828c065..0700dd84db8 100644 --- a/RELEASE-NOTES.md +++ b/RELEASE-NOTES.md @@ -9,6 +9,8 @@ This is the first release to support Python3.9 and to drop Python3.6. - Make `sample_shape` same across all contexts in `draw_values` (see [#4305](https://github.com/pymc-devs/pymc3/pull/4305)). - Removed `theanof.set_theano_config` because it illegally touched Theano's privates (see [#4329](https://github.com/pymc-devs/pymc3/pull/4329)). +### New Features +- `OrderedProbit` distribution added (see [#4232](https://github.com/pymc-devs/pymc3/pull/4232)). ## PyMC3 3.10.0 (7 December 2020) diff --git a/pymc3/distributions/__init__.py b/pymc3/distributions/__init__.py index 9ebf8ee4eea..ade7c9ef000 100644 --- a/pymc3/distributions/__init__.py +++ b/pymc3/distributions/__init__.py @@ -61,6 +61,7 @@ HyperGeometric, NegativeBinomial, OrderedLogistic, + OrderedProbit, Poisson, ZeroInflatedBinomial, ZeroInflatedNegativeBinomial, @@ -140,6 +141,7 @@ "HyperGeometric", "Categorical", "OrderedLogistic", + "OrderedProbit", "DensityDist", "Distribution", "Continuous", diff --git a/pymc3/distributions/discrete.py b/pymc3/distributions/discrete.py index c70c394e14a..34fb39a48f3 100644 --- a/pymc3/distributions/discrete.py +++ b/pymc3/distributions/discrete.py @@ -24,7 +24,10 @@ binomln, bound, factln, + log_diff_normal_cdf, logpow, + normal_lccdf, + normal_lcdf, random_choice, ) from pymc3.distributions.distribution import Discrete, draw_values, generate_samples @@ -1638,3 +1641,131 @@ def __init__(self, eta, cutpoints, *args, **kwargs): p = p_cum[..., 1:] - p_cum[..., :-1] super().__init__(p=p, *args, **kwargs) + + +class OrderedProbit(Categorical): + R""" + Ordered Probit log-likelihood. + + Useful for regression on ordinal data values whose values range + from 1 to K as a function of some predictor, :math:`\eta`. The + cutpoints, :math:`c`, separate which ranges of :math:`\eta` are + mapped to which of the K observed dependent variables. The number + of cutpoints is K - 1. It is recommended that the cutpoints are + constrained to be ordered. + + In order to stabilize the computation, log-likelihood is computed + in log space using the scaled error function `erfcx`. + + .. math:: + + f(k \mid \eta, c) = \left\{ + \begin{array}{l} + 1 - \text{normal_cdf}(0, \sigma, \eta - c_1) + \,, \text{if } k = 0 \\ + \text{normal_cdf}(0, \sigma, \eta - c_{k - 1}) - + \text{normal_cdf}(0, \sigma, \eta - c_{k}) + \,, \text{if } 0 < k < K \\ + \text{normal_cdf}(0, \sigma, \eta - c_{K - 1}) + \,, \text{if } k = K \\ + \end{array} + \right. + + Parameters + ---------- + eta : float + The predictor. + c : array + The length K - 1 array of cutpoints which break :math:`\eta` into + ranges. Do not explicitly set the first and last elements of + :math:`c` to negative and positive infinity. + + sigma: float + The standard deviation of probit function. + Example + -------- + .. code:: python + + # Generate data for a simple 1 dimensional example problem + n1_c = 300; n2_c = 300; n3_c = 300 + cluster1 = np.random.randn(n1_c) + -1 + cluster2 = np.random.randn(n2_c) + 0 + cluster3 = np.random.randn(n3_c) + 2 + + x = np.concatenate((cluster1, cluster2, cluster3)) + y = np.concatenate((1*np.ones(n1_c), + 2*np.ones(n2_c), + 3*np.ones(n3_c))) - 1 + + # Ordered probit regression + with pm.Model() as model: + cutpoints = pm.Normal("cutpoints", mu=[-1,1], sigma=10, shape=2, + transform=pm.distributions.transforms.ordered) + y_ = pm.OrderedProbit("y", cutpoints=cutpoints, eta=x, observed=y) + tr = pm.sample(1000) + + # Plot the results + plt.hist(cluster1, 30, alpha=0.5); + plt.hist(cluster2, 30, alpha=0.5); + plt.hist(cluster3, 30, alpha=0.5); + plt.hist(tr["cutpoints"][:,0], 80, alpha=0.2, color='k'); + plt.hist(tr["cutpoints"][:,1], 80, alpha=0.2, color='k'); + + """ + + def __init__(self, eta, cutpoints, *args, **kwargs): + + self.eta = tt.as_tensor_variable(floatX(eta)) + self.cutpoints = tt.as_tensor_variable(cutpoints) + + probits = tt.shape_padright(self.eta) - self.cutpoints + _log_p = tt.concatenate( + [ + tt.shape_padright(normal_lccdf(0, 1, probits[..., 0])), + log_diff_normal_cdf(0, 1, probits[..., :-1], probits[..., 1:]), + tt.shape_padright(normal_lcdf(0, 1, probits[..., -1])), + ], + axis=-1, + ) + _log_p = tt.as_tensor_variable(floatX(_log_p)) + + self._log_p = _log_p + self.mode = tt.argmax(_log_p, axis=-1) + p = tt.exp(_log_p) + + super().__init__(p=p, *args, **kwargs) + + def logp(self, value): + r""" + Calculate log-probability of Ordered Probit distribution at specified value. + + Parameters + ---------- + value: numeric + Value(s) for which log-probability is calculated. If the log probabilities for multiple + values are desired the values must be provided in a numpy array or theano tensor + + Returns + ------- + TensorVariable + """ + logp = self._log_p + k = self.k + + # Clip values before using them for indexing + value_clip = tt.clip(value, 0, k - 1) + + if logp.ndim > 1: + if logp.ndim > value_clip.ndim: + value_clip = tt.shape_padleft(value_clip, logp.ndim - value_clip.ndim) + elif logp.ndim < value_clip.ndim: + logp = tt.shape_padleft(logp, value_clip.ndim - logp.ndim) + pattern = (logp.ndim - 1,) + tuple(range(logp.ndim - 1)) + a = take_along_axis( + logp.dimshuffle(pattern), + value_clip, + ) + else: + a = logp[value_clip] + + return bound(a, value >= 0, value <= (k - 1)) diff --git a/pymc3/distributions/dist_math.py b/pymc3/distributions/dist_math.py index 35790e8f60b..e3020007356 100644 --- a/pymc3/distributions/dist_math.py +++ b/pymc3/distributions/dist_math.py @@ -134,6 +134,46 @@ def normal_lccdf(mu, sigma, x): ) +def log_diff_normal_cdf(mu, sigma, x, y): + """ + Compute :math:`\\log(\\Phi(\frac{x - \\mu}{\\sigma}) - \\Phi(\frac{y - \\mu}{\\sigma}))` safely in log space. + + Parameters + ---------- + mu: float + mean + sigma: float + std + + x: float + + y: float + must be strictly less than x. + + Returns + ------- + log (\\Phi(x) - \\Phi(y)) + + """ + x = (x - mu) / sigma / tt.sqrt(2.0) + y = (y - mu) / sigma / tt.sqrt(2.0) + + # To stabilize the computation, consider these three regions: + # 1) x > y > 0 => Use erf(x) = 1 - e^{-x^2} erfcx(x) and erf(y) =1 - e^{-y^2} erfcx(y) + # 2) 0 > x > y => Use erf(x) = e^{-x^2} erfcx(-x) and erf(y) = e^{-y^2} erfcx(-y) + # 3) x > 0 > y => Naive formula log( (erf(x) - erf(y)) / 2 ) works fine. + return tt.log(0.5) + tt.switch( + tt.gt(y, 0), + -tt.square(y) + tt.log(tt.erfcx(y) - tt.exp(tt.square(y) - tt.square(x)) * tt.erfcx(x)), + tt.switch( + tt.lt(x, 0), # 0 > x > y + -tt.square(x) + + tt.log(tt.erfcx(-x) - tt.exp(tt.square(x) - tt.square(y)) * tt.erfcx(-y)), + tt.log(tt.erf(x) - tt.erf(y)), # x >0 > y + ), + ) + + def sigma2rho(sigma): """ `sigma -> rho` theano converter diff --git a/pymc3/tests/test_distributions.py b/pymc3/tests/test_distributions.py index 8d3e4af7270..daffdbe8e16 100644 --- a/pymc3/tests/test_distributions.py +++ b/pymc3/tests/test_distributions.py @@ -26,7 +26,7 @@ from numpy import array, exp, inf, log from numpy.testing import assert_allclose, assert_almost_equal, assert_equal from scipy import integrate -from scipy.special import logit +from scipy.special import erf, logit import pymc3 as pm @@ -74,6 +74,7 @@ NegativeBinomial, Normal, OrderedLogistic, + OrderedProbit, Pareto, Poisson, Rice, @@ -431,6 +432,17 @@ def orderedlogistic_logpdf(value, eta, cutpoints): return np.where(np.all(ps >= 0), np.log(p), -np.inf) +def invprobit(x): + return (erf(x / np.sqrt(2)) + 1) / 2 + + +def orderedprobit_logpdf(value, eta, cutpoints): + c = np.concatenate(([-np.inf], cutpoints, [np.inf])) + ps = np.array([invprobit(eta - cc) - invprobit(eta - cc1) for cc, cc1 in zip(c[:-1], c[1:])]) + p = ps[value] + return np.where(np.all(ps >= 0), np.log(p), -np.inf) + + class Simplex: def __init__(self, n): self.vals = list(simplex_values(n)) @@ -1536,6 +1548,15 @@ def test_orderedlogistic(self, n): lambda value, eta, cutpoints: orderedlogistic_logpdf(value, eta, cutpoints), ) + @pytest.mark.parametrize("n", [2, 3, 4]) + def test_orderedprobit(self, n): + self.pymc3_matches_scipy( + OrderedProbit, + Domain(range(n), "int64"), + {"eta": Runif, "cutpoints": UnitSortedVector(n - 1)}, + lambda value, eta, cutpoints: orderedprobit_logpdf(value, eta, cutpoints), + ) + def test_densitydist(self): def logp(x): return -log(2 * 0.5) - abs(x - 0.5) / 0.5