From 731958aa04f466edb0fe3cf1bf1259bda0f04743 Mon Sep 17 00:00:00 2001 From: jorenham Date: Tue, 28 Nov 2023 18:02:39 +0100 Subject: [PATCH 1/6] implemented the harmonic numbers --- lmo/special.py | 50 +++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 49 insertions(+), 1 deletion(-) diff --git a/lmo/special.py b/lmo/special.py index bbddf5dd..6ec080f0 100644 --- a/lmo/special.py +++ b/lmo/special.py @@ -1,6 +1,6 @@ """Mathematical "special" functions, extending `scipy.special`.""" -__all__ = ('fpow', 'gamma2') +__all__ = ('fpow', 'gamma2', 'harmonic') from typing import cast, overload @@ -148,3 +148,51 @@ def gamma2( ) res *= cast(float, _special.gamma(a)) # type: ignore return res + + +def harmonic( + n: npt.ArrayLike, + /, + out: npt.NDArray[np.float64 | np.float128] | None = None, +) -> float | complex | npt.NDArray[np.float64 | np.float128]: + r""" + Harmonic number \( H_n = \sum_{k=1}^{n} 1 / k \), extended for real and + complex argument via analytic contunuation. + + Examples: + >>> harmonic(0) + 0.0 + >>> harmonic(1) + 1.0 + >>> harmonic(2) + 1.5 + >>> harmonic(42) + 4.3267... + >>> harmonic(np.pi) + 1.8727... + >>> harmonic(-1 / 12) + -0.1461... + >>> harmonic(1 - 1j) + (1.1718...-0.5766...j) + + Args: + n: Real- or complex- valued parameter, as array-like or scalar. + out: Optional real or complex output array for the results. + + Returns: + out: Array or scalar with the value(s) of the function. + + See Also: + - [Harmonic number - Wikipedia + ](https://wikipedia.org/wiki/Harmonic_number) + """ + _n = np.asanyarray(n) + + _out = cast( + npt.NDArray[np.float64 | np.float128], + _special.digamma(_n + 1, out), # type: ignore + ) + _out += np.euler_gamma + + return _out[()] if np.isscalar(n) else _out + From 426d35a93c65c6b235f19e987aa1d3d30435632d Mon Sep 17 00:00:00 2001 From: jorenham Date: Tue, 28 Nov 2023 18:42:58 +0100 Subject: [PATCH 2/6] cleaner Kumaraswamy L-moment expression --- docs/distributions.md | 13 +- lmo/distributions.py | 509 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 515 insertions(+), 7 deletions(-) create mode 100644 lmo/distributions.py diff --git a/docs/distributions.md b/docs/distributions.md index e46a6665..ccd10a99 100644 --- a/docs/distributions.md +++ b/docs/distributions.md @@ -1041,13 +1041,12 @@ Its general \( r \)-th trimmed L-moment are: \[ \begin{equation} \tlmoment{s,t}{r} = - \beta \ - \frac{r + s + t}{r} - \sum_{k = t}^{r + s + t - 1} - (-1)^k - \binom{k + r - 1}{k - t} - \binom{r + s + t - 1}{k} - \B\bigl(1 + 1 / \alpha,\ \beta + k \beta \bigr) + \frac{1}{r} + \sum_{k = t + 1}^{r + s + t} + (-1)^{k - 1} + \binom{r + k - 2}{r + t - 1} + \binom{r + s + t}{k} + \frac{\B\bigl(1 / \alpha,\ 1 + k \beta \bigr)}{\alpha} \label{eq:lr_kum} \end{equation} \] diff --git a/lmo/distributions.py b/lmo/distributions.py new file mode 100644 index 00000000..3e8618b8 --- /dev/null +++ b/lmo/distributions.py @@ -0,0 +1,509 @@ +"""Probability distributions, compatible with [`scipy.stats`][scipy.stats].""" + +__all__ = ( + 'l_rv_nonparametric', + 'kumaraswamy', +) + +# pyright: reportIncompatibleMethodOverride=false + +import functools +import math +import warnings +from collections.abc import Callable, Mapping +from typing import ( + Any, + Final, + Sequence, + SupportsIndex, + TypeVar, + cast, +) + +import numpy as np +import numpy.polynomial as npp +import numpy.typing as npt +import scipy.special as sc # type: ignore +from scipy.stats._distn_infrastructure import _ShapeInfo # type: ignore +from scipy.stats.distributions import ( # type: ignore + rv_continuous, +) + +from .special import harmonic + +from ._poly import jacobi_series, roots +from ._utils import ( + clean_order, + clean_trim, +) +from .diagnostic import l_ratio_bounds +from .typing import ( + AnyTrim, + FloatVector, + PolySeries, +) + +T = TypeVar('T') +X = TypeVar('X', bound='l_rv_nonparametric') +F = TypeVar('F', bound=np.floating[Any]) +M = TypeVar('M', bound=Callable[..., Any]) +V = TypeVar('V', bound=float | npt.NDArray[np.float64]) + +_F_EPS: Final[np.float64] = np.finfo(float).eps + + +# Non-parametric + +def _check_lmoments(l_r: npt.NDArray[np.floating[Any]], s: float, t: float): + if (n := len(l_r)) < 2: + msg = f'at least 2 L-moments required, got {n}' + raise ValueError(msg) + if n == 2: + return + + r = np.arange(1, n + 1) + t_r = l_r[2:] / l_r[1] + t_r_max = l_ratio_bounds(r[2:], (s, t)) + if np.any(rs0_oob := np.abs(t_r) > t_r_max): + r_oob = np.argwhere(rs0_oob)[0] + 3 + t_oob = t_r[rs0_oob][0] + t_max = t_r_max[rs0_oob][0] + msg = ( + f'invalid L-moment ratio for r={list(r_oob)}: ' + f'|{t_oob}| <= {t_max} does not hold' + ) + raise ArithmeticError(msg) + + +def _ppf_poly_series( + l_r: npt.NDArray[np.floating[Any]], + s: float, + t: float, +) -> PolySeries: + # Corrected version of Theorem 3. from Hosking (2007). + # + r = np.arange(1, len(l_r) + 1) + c = (s + t - 1 + 2 * r) * r / (s + t + r) + + return jacobi_series( + c * l_r, + t, + s, + domain=[0, 1], + # convert to Legendre, even if trimmed; this avoids huge coefficient + kind=npp.Legendre, + symbol='q', + ) + +class l_rv_nonparametric(rv_continuous): # noqa: N801 + r""" + Estimate a distribution using the given L-moments. + See [`scipy.stats.rv_continuous`][scipy.stats.rv_continuous] for the + available method. + + The PPF (quantile function) is estimated using generalized Fourier series, + with the (shifted) Jacobi orthogonal polynomials as basis, and the (scaled) + L-moments as coefficients. + + The *corrected* version of theorem 3 from Hosking (2007) states that + + $$ + \hat{Q}(q) = \sum_{r=1}^{R} + \frac{(r + 1) (2r + s + t - 1)}{r + s + t + 1} + \lambda^{(s, t)}_r + P^{(t, s)}_{r - 1}(2u - 1) \; , + $$ + + converges almost everywhere as $R \rightarrow \infty$, for any + sufficiently smooth (quantile) function $Q(u)$ with $0 < u < 1$. + + References: + - [J.R.M. Hosking (2007) - Some theory and practical uses of trimmed + L-moments](https://doi.org/10.1016/j.jspi.2006.12.002) + - [Wolfram Research - Jacobi polynomial Fourier Expansion]( + http://functions.wolfram.com/05.06.25.0007.01) + + See Also: + - [Generalized Fourier series - Wikipedia]( + https://wikipedia.org/wiki/Generalized_Fourier_series) + """ + + _lm: Final[npt.NDArray[np.floating[Any]]] + _trim: Final[tuple[int, int] | tuple[float, float]] + + _ppf_poly: Final[PolySeries] + _isf_poly: Final[PolySeries] + + a: float + b: float + badvalue: float = np.nan + + def __init__( + self, + l_moments: FloatVector, + trim: AnyTrim = (0, 0), + a: float | None = None, + b: float | None = None, + **kwargs: Any, + ) -> None: + r""" + Args: + l_moments: + Vector containing the first $R$ consecutive L-moments + $\left[ + \lambda^{(s, t)}_1 \; + \lambda^{(s, t)}_2 \; + \dots \; + \lambda^{(s, t)}_R + \right]$, where $R \ge 2$. + + Sample L-moments can be estimated using e.g. + `lmo.l_moment(x, np.mgrid[:R] + 1, trim=(s, t))`. + + The trim-lengths $(s, t)$ should be the same for all + L-moments. + trim: + The left and right trim-lengths $(s, t)$, that correspond + to the provided `l_moments`. + a: + Lower bound of the support of the distribution. + By default it is estimated from the L-moments. + b: + Upper bound of the support of the distribution. + By default it is estimated from the L-moments. + **kwargs: + Optional params for `scipy.stats.rv_continuous`. + + Raises: + ValueError: If `len(l_moments) < 2`, `l_moments.ndim != 1`, or + there are invalid L-moments / trim-lengths. + """ + l_r = np.asarray_chkfinite(l_moments) + l_r.setflags(write=False) + + self._trim = (s, t) = clean_trim(trim) + + _check_lmoments(l_r, s, t) + self._lm = l_r + + # quantile function (inverse of cdf) + self._ppf_poly = ppf = _ppf_poly_series(l_r, s, t).trim(_F_EPS) + + # inverse survival function + self._isf_poly = ppf(1 - ppf.identity(domain=[0, 1])).trim(_F_EPS) + + # empirical support + self._a0, self._b0 = (q0, q1) = ppf(np.array([0, 1])) + if q0 >= q1: + msg = 'invalid l_rv_nonparametric: ppf(0) >= ppf(1)' + raise ArithmeticError(msg) + + kwargs.setdefault('momtype', 1) + super().__init__( # type: ignore [reportUnknownMemberType] + a=q0 if a is None else a, + b=q1 if b is None else b, + **kwargs, + ) + + @property + def l_moments(self) -> npt.NDArray[np.float64]: + r"""Initial L-moments, for orders $r = 1, 2, \dots, R$.""" + return self._lm + + @property + def trim(self) -> tuple[int, int] | tuple[float, float]: + """The provided trim-lengths $(s, t)$.""" + return self._trim + + @property + def ppf_poly(self) -> PolySeries: + r""" + Polynomial estimate of the percent point function (PPF), a.k.a. + the quantile function (QF), or the inverse cumulative distribution + function (ICDF). + + Note: + Converges to the "true" PPF in the mean-squared sense, with + weight function $q^s (1 - q)^t$ of quantile $q \in \[0, 1\]$, + and trim-lengths $(t_1, t_2) \in \mathbb{R^+} \times \mathbb{R^+}$. + + Returns: + A [`numpy.polynomial.Legendre`][numpy.polynomial.legendre.Legendre] + orthogonal polynomial series instance. + """ + return self._ppf_poly + + @functools.cached_property + def cdf_poly(self) -> PolySeries: + """ + Polynomial least-squares interpolation of the CDF. + + Returns: + A [`numpy.polynomial.Legendre`][numpy.polynomial.legendre.Legendre] + orthogonal polynomial series instance. + """ + ppf = self._ppf_poly + # number of variables of the PPF poly + k0 = ppf.degree() + 1 + assert k0 > 1 + + n = max(100, k0 * 10) + x = np.linspace(self.a, self.b, n) + q = cast(npt.NDArray[np.float64], self.cdf(x)) # type: ignore + y = ppf.deriv()(q) + w = np.sqrt(self._weights(q) + 0.01) + + # choose the polynomial that minimizes the BIC + bic_min = np.inf + cdf_best = None + for k in range(max(k0 // 2, 2), k0 + max(k0 // 2, 8)): + # fit + cdf = ppf.fit(x, q, k - 1).trim(_F_EPS) + k = cdf.degree() + 1 + + # according to the inverse function theorem, this should be 0 + eps = 1 / cdf.deriv()(x) - y + + # Bayesian information criterion (BIC) + bic = (k - 1) * np.log(n) + n * np.log( + np.average(eps**2, weights=w), + ) + + # minimize the BIC + if bic < bic_min: + bic_min = bic + cdf_best = cdf + + assert cdf_best is not None + return cdf_best + + @functools.cached_property + def pdf_poly(self) -> PolySeries: + """ + Derivative of the polynomial interpolation of the CDF, i.e. the + polynomial estimate of the PDF. + + Returns: + A [`numpy.polynomial.Legendre`][numpy.polynomial.legendre.Legendre] + orthogonal polynomial series instance. + """ + return self.cdf_poly.deriv() + + def _weights(self, q: npt.ArrayLike) -> npt.NDArray[np.float64]: + _q = np.asarray(q, np.float64) + s, t = self._trim + return np.where( + (_q >= 0) & (_q <= 1), + _q**s * (1 - _q) ** t, + cast(float, getattr(self, 'badvalue', np.nan)), # type: ignore + ) + + def _ppf(self, q: npt.NDArray[np.float64]) -> npt.NDArray[np.float64]: + return cast(npt.NDArray[np.float64], self._ppf_poly(q)) + + def _isf(self, q: npt.NDArray[np.float64]) -> npt.NDArray[np.float64]: + return cast(npt.NDArray[np.float64], self._isf_poly(q)) + + def _cdf_single(self, x: float) -> float: + # find all q where Q(q) == x + q0 = roots(self._ppf_poly - x) + + if (n := len(q0)) == 0: + return self.badvalue + if n > 1: + warnings.warn( + f'multiple fixed points at {x = :.6f}: ' + f'{list(np.round(q0, 6))}', + stacklevel=3, + ) + + if cast(float, np.ptp(q0)) <= 1 / 4: + # "close enough" if within the same quartile; + # probability-weighted interpolation + return np.average(q0, weights=q0 * (1 - q0)) # type: ignore + + return self.badvalue + + return q0[0] + + def _pdf(self, x: npt.NDArray[np.float64]) -> npt.NDArray[np.float64]: + return np.clip(cast(npt.NDArray[np.float64], self.pdf_poly(x)), 0, 1) + + def _munp(self, n: int): + # non-central product-moment $E[X^n]$ + return (self._ppf_poly**n).integ(lbnd=0)(1) + + def _updated_ctor_param(self) -> Mapping[str, Any]: + return cast( + Mapping[str, Any], + super()._updated_ctor_param() + | { + 'l_moments': self._lm, + 'trim': self._trim, + }, + ) + + @classmethod + def fit( + cls, + data: npt.ArrayLike, + /, + rmax: SupportsIndex | None = None, + trim: AnyTrim = (0, 0), + ) -> 'l_rv_nonparametric': + r""" + Estimate L-moment from the samples, and return a new + `l_rv_nonparametric` instance. + + Args: + data: + 1d array-like with univariate sample observations. + rmax: + The (maximum) amount of L-moment orders to use. + Defaults to $\lceil 4 \log_{10} N \rceil$. + The quantile polynomial will be of degree `rmax - 1`. + trim: + The left and right trim-lengths $(s, t)$, that correspond + to the provided `l_moments`. + + Returns: + A fitted + [`l_rv_nonparametric`][lmo.distributions.l_rv_nonparametric] + instance. + + Todo: + - Optimal `rmax` selection (the error appears to be periodic..?) + - Optimal `trim` selection + """ + # avoid circular imports + from ._lm import l_moment + + # x needs to be sorted anyway + x: npt.NDArray[np.floating[Any]] = np.sort(data) + + a, b = x[[0, -1]] + + if rmax is None: + _rmax = math.ceil(np.log10(x.size) * 4) + else: + _rmax = clean_order(rmax, name='rmax', rmin=2) + + _trim = clean_trim(trim) + + # sort kind 'stable' if already sorted + l_r = l_moment( + x, + np.arange(1, _rmax + 1), + trim=_trim, + sort='stable', # stable sort if fastest if already sorted + ) + + return cls(l_r, trim=_trim, a=a, b=b) + + +# Parametric + +class kumaraswamy_gen(rv_continuous): # noqa: N801 + r""" + A Kumaraswamy random variable, similar to [`beta`][scipy.stats.beta]. + + The probability density function for + [`kumaraswamy`][kumaraswamy] is: + + \[ + f(x, a, b) = a x^{a - 1} b \left(1 - x^a\right)^{b - 1} + \] + + for \( 0 < x < 1,\ a > 0,\ b > 0 \). + + [`kumaraswamy`][kumaraswamy] takes \( a \) and \( b \) as shape parameters. + + See Also: + - [Kumaraswamy distribution - Wikipedia + ](https://wikipedia.org/wiki/Kumaraswamy_distribution) + + """ + def _shape_info(self) -> Sequence[_ShapeInfo]: + ia = _ShapeInfo('a', False, (0, np.inf), (False, False)) + ib = _ShapeInfo('b', False, (0, np.inf), (False, False)) + return [ia, ib] + + def _pdf( + self, + x: npt.NDArray[np.float64], + a: float, + b: float, + ) -> npt.NDArray[np.float64]: + return a * b * x**(a - 1) * (1 - x**a)**(b - 1) + + def _logpdf( + self, + x: npt.NDArray[np.float64], + a: float, + b: float, + ) -> npt.NDArray[np.float64]: + return ( + np.log(a * b) + + (a - 1) * np.log(x) + + (b - 1) * np.log(1 - x**a) + ) + + def _cdf( + self, + x: npt.NDArray[np.float64], + a: float, + b: float, + ) -> npt.NDArray[np.float64]: + return 1 - (1 - x**a)**b + + def _sf( + self, + x: npt.NDArray[np.float64], + a: float, + b: float, + ) -> npt.NDArray[np.float64]: + return (1 - x**a)**(b - 1) + + def _isf( + self, + q: npt.NDArray[np.float64], + a: float, + b: float, + ) -> npt.NDArray[np.float64]: + return (1 - q**(1 / b))**(1 / a) + + def _ppf( + self, + q: npt.NDArray[np.float64], + a: float, + b: float, + ) -> npt.NDArray[np.float64]: + return (1 - (1 - q)**(1 / b))**(1 / a) + + def _entropy(self, a: float, b: float) -> float: + # https://en.wikipedia.org/wiki/Kumaraswamy_distribution + return (1 - 1 / b) + (1 - 1 / a) * harmonic(b) - np.log(a * b) + + def _munp( + self, + n: int, + a: float, + b: float, + ) -> float: + return b * cast(float, sc.beta(1 + n / a, b)) # type: ignore + + # def _l_moment( + # self, + # r: npt.NDArray[np.int64], + # a: float, + # b: float, + # trim: tuple[int, int], + # ) -> npt.NDArray[np.float64]: + # ... + + +kumaraswamy: Final[rv_continuous] = kumaraswamy_gen( + a=0.0, + b=1.0, + name='kumaraswamy', +) From 1cee10854a48020750ee5223e3ae13d4549fc06d Mon Sep 17 00:00:00 2001 From: jorenham Date: Tue, 28 Nov 2023 19:28:43 +0100 Subject: [PATCH 3/6] Kumaraswamy distribution implementation --- README.md | 2 - docs/api.md | 32 ++-- lmo/__init__.py | 7 +- lmo/_distns.py | 389 ------------------------------------------- lmo/distributions.py | 70 ++++++-- 5 files changed, 75 insertions(+), 425 deletions(-) delete mode 100644 lmo/_distns.py diff --git a/README.md b/README.md index 1ebc8061..684d1f98 100644 --- a/README.md +++ b/README.md @@ -41,8 +41,6 @@ So Lmo is as fast as sorting your samples (in terms of time-complexity). - Fast estimation of L-*co*moment matrices from your multidimensional data or multivariate distribution. - Goodness-of-fit test, using L-moment or L-moment ratio's. -- Non-parametric estimation of continuous distributions - with `lmo.l_rv_nonparametric` - Exact (co)variance structure of the sample- and population L-moments. - Theoretical & empirical influence functions of L-moments & L-ratio's. - Complete [docs](https://jorenham.github.io/lmo/), including detailed API diff --git a/docs/api.md b/docs/api.md index 1e1ca178..ed466535 100644 --- a/docs/api.md +++ b/docs/api.md @@ -19,37 +19,37 @@ filters: ["^l_co"] heading_level: 4 -### `scipy.stats` extensions +### Distributions -::: lmo.contrib.scipy_stats +::: lmo.distributions options: - show_bases: false - members: - - l_rv_generic heading_level: 4 -### `pandas` extensions (optional) +### Statistical test and tools -::: lmo.contrib.pandas +::: lmo.diagnostic options: - show_bases: false - members: - - Series - - DataFrame heading_level: 4 -### Distributions +## Third party integration -::: lmo +### `scipy.stats` + +::: lmo.contrib.scipy_stats options: + show_bases: false members: - - l_rv_nonparametric + - l_rv_generic heading_level: 4 -### Statistical test and tools +### `pandas` (optional) -::: lmo.diagnostic +::: lmo.contrib.pandas options: + show_bases: false + members: + - Series + - DataFrame heading_level: 4 ## Low-level API diff --git a/lmo/__init__.py b/lmo/__init__.py index 1f115506..1a7d5dd1 100644 --- a/lmo/__init__.py +++ b/lmo/__init__.py @@ -31,15 +31,10 @@ 'l_comoment', 'l_coratio', 'l_costats', - - 'l_rv_nonparametric', ) from typing import TYPE_CHECKING, Final -from ._distns import ( - l_rv_nonparametric, -) from ._lm import ( l_kurtosis, l_loc, @@ -69,7 +64,7 @@ from ._meta import get_version as _get_version if not TYPE_CHECKING: -# install contrib module extensions + # install contrib module extensions from .contrib import install as _install _install() diff --git a/lmo/_distns.py b/lmo/_distns.py deleted file mode 100644 index cab04445..00000000 --- a/lmo/_distns.py +++ /dev/null @@ -1,389 +0,0 @@ -# pyright: reportIncompatibleMethodOverride=false - -__all__ = ('l_rv_nonparametric',) - -import functools -import math -import warnings -from collections.abc import Callable, Mapping -from typing import ( - Any, - Final, - SupportsIndex, - TypeVar, - cast, -) - -import numpy as np -import numpy.polynomial as npp -import numpy.typing as npt -from scipy.stats.distributions import ( # type: ignore - rv_continuous, -) - -from ._poly import jacobi_series, roots -from ._utils import ( - clean_order, - clean_trim, -) -from .diagnostic import l_ratio_bounds -from .typing import ( - AnyTrim, - FloatVector, - PolySeries, -) - -T = TypeVar('T') -X = TypeVar('X', bound='l_rv_nonparametric') -F = TypeVar('F', bound=np.floating[Any]) -M = TypeVar('M', bound=Callable[..., Any]) -V = TypeVar('V', bound=float | npt.NDArray[np.float64]) - -_F_EPS: Final[np.float64] = np.finfo(float).eps - - -def _check_lmoments(l_r: npt.NDArray[np.floating[Any]], s: float, t: float): - if (n := len(l_r)) < 2: - msg = f'at least 2 L-moments required, got {n}' - raise ValueError(msg) - if n == 2: - return - - r = np.arange(1, n + 1) - t_r = l_r[2:] / l_r[1] - t_r_max = l_ratio_bounds(r[2:], (s, t)) - if np.any(rs0_oob := np.abs(t_r) > t_r_max): - r_oob = np.argwhere(rs0_oob)[0] + 3 - t_oob = t_r[rs0_oob][0] - t_max = t_r_max[rs0_oob][0] - msg = ( - f'invalid L-moment ratio for r={list(r_oob)}: ' - f'|{t_oob}| <= {t_max} does not hold' - ) - raise ArithmeticError(msg) - - -def _ppf_poly_series( - l_r: npt.NDArray[np.floating[Any]], - s: float, - t: float, -) -> PolySeries: - # Corrected version of Theorem 3. from Hosking (2007). - # - r = np.arange(1, len(l_r) + 1) - c = (s + t - 1 + 2 * r) * r / (s + t + r) - - return jacobi_series( - c * l_r, - t, - s, - domain=[0, 1], - # convert to Legendre, even if trimmed; this avoids huge coefficient - kind=npp.Legendre, - symbol='q', - ) - - -class l_rv_nonparametric(rv_continuous): # noqa: N801 - r""" - Estimate a distribution using the given L-moments. - See [`scipy.stats.rv_continuous`][scipy.stats.rv_continuous] for the - available method. - - The PPF (quantile function) is estimated using generalized Fourier series, - with the (shifted) Jacobi orthogonal polynomials as basis, and the (scaled) - L-moments as coefficients. - - The *corrected* version of theorem 3 from Hosking (2007) states that - - $$ - \hat{Q}(q) = \sum_{r=1}^{R} - \frac{(r + 1) (2r + s + t - 1)}{r + s + t + 1} - \lambda^{(s, t)}_r - P^{(t, s)}_{r - 1}(2u - 1) \; , - $$ - - converges almost everywhere as $R \rightarrow \infty$, for any - sufficiently smooth (quantile) function $Q(u)$ with $0 < u < 1$. - - References: - - [J.R.M. Hosking (2007) - Some theory and practical uses of trimmed - L-moments](https://doi.org/10.1016/j.jspi.2006.12.002) - - [Wolfram Research - Jacobi polynomial Fourier Expansion]( - http://functions.wolfram.com/05.06.25.0007.01) - - See Also: - - [Generalized Fourier series - Wikipedia]( - https://wikipedia.org/wiki/Generalized_Fourier_series) - """ - - _lm: Final[npt.NDArray[np.floating[Any]]] - _trim: Final[tuple[int, int] | tuple[float, float]] - - _ppf_poly: Final[PolySeries] - _isf_poly: Final[PolySeries] - - a: float - b: float - badvalue: float = np.nan - - def __init__( - self, - l_moments: FloatVector, - trim: AnyTrim = (0, 0), - a: float | None = None, - b: float | None = None, - **kwargs: Any, - ) -> None: - r""" - Args: - l_moments: - Vector containing the first $R$ consecutive L-moments - $\left[ - \lambda^{(s, t)}_1 \; - \lambda^{(s, t)}_2 \; - \dots \; - \lambda^{(s, t)}_R - \right]$, where $R \ge 2$. - - Sample L-moments can be estimated using e.g. - `lmo.l_moment(x, np.mgrid[:R] + 1, trim=(s, t))`. - - The trim-lengths $(s, t)$ should be the same for all - L-moments. - trim: - The left and right trim-lengths $(s, t)$, that correspond - to the provided `l_moments`. - a: - Lower bound of the support of the distribution. - By default it is estimated from the L-moments. - b: - Upper bound of the support of the distribution. - By default it is estimated from the L-moments. - **kwargs: - Optional params for `scipy.stats.rv_continuous`. - - Raises: - ValueError: If `len(l_moments) < 2`, `l_moments.ndim != 1`, or - there are invalid L-moments / trim-lengths. - """ - l_r = np.asarray_chkfinite(l_moments) - l_r.setflags(write=False) - - self._trim = (s, t) = clean_trim(trim) - - _check_lmoments(l_r, s, t) - self._lm = l_r - - # quantile function (inverse of cdf) - self._ppf_poly = ppf = _ppf_poly_series(l_r, s, t).trim(_F_EPS) - - # inverse survival function - self._isf_poly = ppf(1 - ppf.identity(domain=[0, 1])).trim(_F_EPS) - - # empirical support - self._a0, self._b0 = (q0, q1) = ppf(np.array([0, 1])) - if q0 >= q1: - msg = 'invalid l_rv_nonparametric: ppf(0) >= ppf(1)' - raise ArithmeticError(msg) - - kwargs.setdefault('momtype', 1) - super().__init__( # type: ignore [reportUnknownMemberType] - a=q0 if a is None else a, - b=q1 if b is None else b, - **kwargs, - ) - - @property - def l_moments(self) -> npt.NDArray[np.float64]: - r"""Initial L-moments, for orders $r = 1, 2, \dots, R$.""" - return self._lm - - @property - def trim(self) -> tuple[int, int] | tuple[float, float]: - """The provided trim-lengths $(s, t)$.""" - return self._trim - - @property - def ppf_poly(self) -> PolySeries: - r""" - Polynomial estimate of the percent point function (PPF), a.k.a. - the quantile function (QF), or the inverse cumulative distribution - function (ICDF). - - Note: - Converges to the "true" PPF in the mean-squared sense, with - weight function $q^s (1 - q)^t$ of quantile $q \in \[0, 1\]$, - and trim-lengths $(t_1, t_2) \in \mathbb{R^+} \times \mathbb{R^+}$. - - Returns: - A [`numpy.polynomial.Legendre`][numpy.polynomial.legendre.Legendre] - orthogonal polynomial series instance. - """ - return self._ppf_poly - - @functools.cached_property - def cdf_poly(self) -> PolySeries: - """ - Polynomial least-squares interpolation of the CDF. - - Returns: - A [`numpy.polynomial.Legendre`][numpy.polynomial.legendre.Legendre] - orthogonal polynomial series instance. - """ - ppf = self._ppf_poly - # number of variables of the PPF poly - k0 = ppf.degree() + 1 - assert k0 > 1 - - n = max(100, k0 * 10) - x = np.linspace(self.a, self.b, n) - q = cast(npt.NDArray[np.float64], self.cdf(x)) # type: ignore - y = ppf.deriv()(q) - w = np.sqrt(self._weights(q) + 0.01) - - # choose the polynomial that minimizes the BIC - bic_min = np.inf - cdf_best = None - for k in range(max(k0 // 2, 2), k0 + max(k0 // 2, 8)): - # fit - cdf = ppf.fit(x, q, k - 1).trim(_F_EPS) - k = cdf.degree() + 1 - - # according to the inverse function theorem, this should be 0 - eps = 1 / cdf.deriv()(x) - y - - # Bayesian information criterion (BIC) - bic = (k - 1) * np.log(n) + n * np.log( - np.average(eps**2, weights=w), - ) - - # minimize the BIC - if bic < bic_min: - bic_min = bic - cdf_best = cdf - - assert cdf_best is not None - return cdf_best - - @functools.cached_property - def pdf_poly(self) -> PolySeries: - """ - Derivative of the polynomial interpolation of the CDF, i.e. the - polynomial estimate of the PDF. - - Returns: - A [`numpy.polynomial.Legendre`][numpy.polynomial.legendre.Legendre] - orthogonal polynomial series instance. - """ - return self.cdf_poly.deriv() - - def _weights(self, q: npt.ArrayLike) -> npt.NDArray[np.float64]: - _q = np.asarray(q, np.float64) - s, t = self._trim - return np.where( - (_q >= 0) & (_q <= 1), - _q**s * (1 - _q) ** t, - cast(float, getattr(self, 'badvalue', np.nan)), # type: ignore - ) - - def _ppf(self, q: npt.NDArray[np.float64]) -> npt.NDArray[np.float64]: - return cast(npt.NDArray[np.float64], self._ppf_poly(q)) - - def _isf(self, q: npt.NDArray[np.float64]) -> npt.NDArray[np.float64]: - return cast(npt.NDArray[np.float64], self._isf_poly(q)) - - def _cdf_single(self, x: float) -> float: - # find all q where Q(q) == x - q0 = roots(self._ppf_poly - x) - - if (n := len(q0)) == 0: - return self.badvalue - if n > 1: - warnings.warn( - f'multiple fixed points at {x = :.6f}: ' - f'{list(np.round(q0, 6))}', - stacklevel=3, - ) - - if cast(float, np.ptp(q0)) <= 1 / 4: - # "close enough" if within the same quartile; - # probability-weighted interpolation - return np.average(q0, weights=q0 * (1 - q0)) # type: ignore - - return self.badvalue - - return q0[0] - - def _pdf(self, x: npt.NDArray[np.float64]) -> npt.NDArray[np.float64]: - return np.clip(cast(npt.NDArray[np.float64], self.pdf_poly(x)), 0, 1) - - def _munp(self, n: int): - # non-central product-moment $E[X^n]$ - return (self._ppf_poly**n).integ(lbnd=0)(1) - - def _updated_ctor_param(self) -> Mapping[str, Any]: - return cast( - Mapping[str, Any], - super()._updated_ctor_param() - | { - 'l_moments': self._lm, - 'trim': self._trim, - }, - ) - - @classmethod - def fit( - cls, - data: npt.ArrayLike, - /, - rmax: SupportsIndex | None = None, - trim: AnyTrim = (0, 0), - ) -> 'l_rv_nonparametric': - r""" - Estimate L-moment from the samples, and return a new - `l_rv_nonparametric` instance. - - Args: - data: - 1d array-like with univariate sample observations. - rmax: - The (maximum) amount of L-moment orders to use. - Defaults to $\lceil 4 \log_{10} N \rceil$. - The quantile polynomial will be of degree `rmax - 1`. - trim: - The left and right trim-lengths $(s, t)$, that correspond - to the provided `l_moments`. - - Returns: - A fitted [`l_rv_nonparametric`][lmo.l_rv_nonparametric] instance. - - Todo: - - Optimal `rmax` selection (the error appears to be periodic..?) - - Optimal `trim` selection - """ - # avoid circular imports - from ._lm import l_moment - - # x needs to be sorted anyway - x: npt.NDArray[np.floating[Any]] = np.sort(data) - - a, b = x[[0, -1]] - - if rmax is None: - _rmax = math.ceil(np.log10(x.size) * 4) - else: - _rmax = clean_order(rmax, name='rmax', rmin=2) - - _trim = clean_trim(trim) - - # sort kind 'stable' if already sorted - l_r = l_moment( - x, - np.arange(1, _rmax + 1), - trim=_trim, - sort='stable', # stable sort if fastest if already sorted - ) - - return cls(l_r, trim=_trim, a=a, b=b) - diff --git a/lmo/distributions.py b/lmo/distributions.py index 3e8618b8..3699b4ec 100644 --- a/lmo/distributions.py +++ b/lmo/distributions.py @@ -10,12 +10,12 @@ import functools import math import warnings -from collections.abc import Callable, Mapping +from collections.abc import Callable, Mapping, Sequence from typing import ( Any, Final, - Sequence, SupportsIndex, + TypeAlias, TypeVar, cast, ) @@ -29,18 +29,19 @@ rv_continuous, ) -from .special import harmonic - from ._poly import jacobi_series, roots from ._utils import ( clean_order, clean_trim, ) from .diagnostic import l_ratio_bounds +from .special import harmonic +from .theoretical import l_moment_from_cdf from .typing import ( AnyTrim, FloatVector, PolySeries, + QuadOptions, ) T = TypeVar('T') @@ -403,6 +404,34 @@ def fit( # Parametric + +_ArrF8: TypeAlias = npt.NDArray[np.float64] + +def _l_moment_kumaraswamy_single( + r: int, + s: int, + t: int, + a: float, + b: float, +) -> float: + if r == 0: + return 1.0 + + k = np.arange(t + 1, r + s + t + 1) + return ( + (-1)**(k - 1) + * cast(_ArrF8, sc.comb(r + k - 2, r + t - 1)) # type: ignore + * cast(_ArrF8, sc.comb(r + s + t, k)) # type: ignore + * cast(_ArrF8, sc.beta(1 / a, 1 + k * b)) / a # type: ignore + ).sum() / r + +_l_moment_kumaraswamy = np.vectorize( + _l_moment_kumaraswamy_single, + otypes=[float], + excluded={1, 2, 3, 4}, +) + + class kumaraswamy_gen(rv_continuous): # noqa: N801 r""" A Kumaraswamy random variable, similar to [`beta`][scipy.stats.beta]. @@ -492,14 +521,31 @@ def _munp( ) -> float: return b * cast(float, sc.beta(1 + n / a, b)) # type: ignore - # def _l_moment( - # self, - # r: npt.NDArray[np.int64], - # a: float, - # b: float, - # trim: tuple[int, int], - # ) -> npt.NDArray[np.float64]: - # ... + def _l_moment( + self, + r: npt.NDArray[np.int64], + a: float, + b: float, + trim: tuple[int, int] | tuple[float, float], + quad_opts: QuadOptions | None = None, + ) -> npt.NDArray[np.float64]: + s, t = trim + lmbda_r: float | npt.NDArray[np.float64] + if isinstance(s, float) or isinstance(t, float): + lmbda_r = cast( + float | npt.NDArray[np.float64], + l_moment_from_cdf( + functools.partial(self._cdf, a=a, b=b), # type: ignore + r, + trim=trim, + support=(0, 1), + ppf=functools.partial(self._ppf, a=a, b=b), # type: ignore + quad_opts=quad_opts, + ), # type: ignore + ) + return np.asarray(lmbda_r) + + return np.atleast_1d(_l_moment_kumaraswamy(r, s, t, a, b)) kumaraswamy: Final[rv_continuous] = kumaraswamy_gen( From 00d5178b2649e5c5d6fe7c2165e3f2541ef92bc1 Mon Sep 17 00:00:00 2001 From: jorenham Date: Tue, 28 Nov 2023 19:34:17 +0100 Subject: [PATCH 4/6] kumaraswamy distribution implementation references --- docs/distributions.md | 26 ++------------------------ lmo/distributions.py | 39 ++++++++++++++++++++------------------- 2 files changed, 22 insertions(+), 43 deletions(-) diff --git a/docs/distributions.md b/docs/distributions.md index ccd10a99..803713c7 100644 --- a/docs/distributions.md +++ b/docs/distributions.md @@ -1051,8 +1051,8 @@ Its general \( r \)-th trimmed L-moment are: \end{equation} \] -Unfortunately, the Kumaraswamy distribution is not implemented in -`scipy.stats`. +The Kumaraswamy distribution is implemented in +[`lmo.distributions.kumaraswamy`][lmo.distributions.kumaraswamy]. ### Wakeby @@ -1072,28 +1072,6 @@ Each of the scale- \( \alpha, \gamma \) and shape parameters Lmo figured out that the L-moments with any order \( r \in \mathbb{N}_{\ge 1} \) and trim \( s, t \in \mathbb{N}^2_{\ge 1} \) can be expressed as - \[ \begin{equation} \tlmoment{s,t}{r} diff --git a/lmo/distributions.py b/lmo/distributions.py index 3699b4ec..5fb98aed 100644 --- a/lmo/distributions.py +++ b/lmo/distributions.py @@ -433,25 +433,6 @@ def _l_moment_kumaraswamy_single( class kumaraswamy_gen(rv_continuous): # noqa: N801 - r""" - A Kumaraswamy random variable, similar to [`beta`][scipy.stats.beta]. - - The probability density function for - [`kumaraswamy`][kumaraswamy] is: - - \[ - f(x, a, b) = a x^{a - 1} b \left(1 - x^a\right)^{b - 1} - \] - - for \( 0 < x < 1,\ a > 0,\ b > 0 \). - - [`kumaraswamy`][kumaraswamy] takes \( a \) and \( b \) as shape parameters. - - See Also: - - [Kumaraswamy distribution - Wikipedia - ](https://wikipedia.org/wiki/Kumaraswamy_distribution) - - """ def _shape_info(self) -> Sequence[_ShapeInfo]: ia = _ShapeInfo('a', False, (0, np.inf), (False, False)) ib = _ShapeInfo('b', False, (0, np.inf), (False, False)) @@ -553,3 +534,23 @@ def _l_moment( b=1.0, name='kumaraswamy', ) +r""" +A Kumaraswamy random variable, similar to +[`scipy.stats.beta`][scipy.stats.beta]. + +The probability density function for +[`kumaraswamy`][lmo.distributions.kumaraswamy] is: + +\[ + f(x, a, b) = a x^{a - 1} b \left(1 - x^a\right)^{b - 1} +\] + +for \( 0 < x < 1,\ a > 0,\ b > 0 \). + +[`kumaraswamy`][kumaraswamy] takes \( a \) and \( b \) as shape parameters. + +See Also: + - [Kumaraswamy distribution - Wikipedia + ](https://wikipedia.org/wiki/Kumaraswamy_distribution) + +""" From b1aa75c49a41b89937e1ffb1f8c6c3b1318a1c01 Mon Sep 17 00:00:00 2001 From: jorenham Date: Tue, 28 Nov 2023 19:41:24 +0100 Subject: [PATCH 5/6] pyright error fix --- lmo/distributions.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/lmo/distributions.py b/lmo/distributions.py index 5fb98aed..a71d46e5 100644 --- a/lmo/distributions.py +++ b/lmo/distributions.py @@ -509,7 +509,7 @@ def _l_moment( b: float, trim: tuple[int, int] | tuple[float, float], quad_opts: QuadOptions | None = None, - ) -> npt.NDArray[np.float64]: + ) -> _ArrF8: s, t = trim lmbda_r: float | npt.NDArray[np.float64] if isinstance(s, float) or isinstance(t, float): @@ -526,7 +526,9 @@ def _l_moment( ) return np.asarray(lmbda_r) - return np.atleast_1d(_l_moment_kumaraswamy(r, s, t, a, b)) + return np.atleast_1d( + cast(_ArrF8, _l_moment_kumaraswamy(r, s, t, a, b)), + ) kumaraswamy: Final[rv_continuous] = kumaraswamy_gen( From 5240454b7186ddd8dd7ef2a7e0d87fc80f5705d6 Mon Sep 17 00:00:00 2001 From: jorenham Date: Tue, 28 Nov 2023 19:46:41 +0100 Subject: [PATCH 6/6] fix incorrect dtype annotation --- lmo/special.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lmo/special.py b/lmo/special.py index 6ec080f0..bdb80d1e 100644 --- a/lmo/special.py +++ b/lmo/special.py @@ -153,8 +153,8 @@ def gamma2( def harmonic( n: npt.ArrayLike, /, - out: npt.NDArray[np.float64 | np.float128] | None = None, -) -> float | complex | npt.NDArray[np.float64 | np.float128]: + out: npt.NDArray[np.float64] | npt.NDArray[np.complex128] | None = None, +) -> float | complex | npt.NDArray[np.float64] | npt.NDArray[np.complex128]: r""" Harmonic number \( H_n = \sum_{k=1}^{n} 1 / k \), extended for real and complex argument via analytic contunuation. @@ -189,7 +189,7 @@ def harmonic( _n = np.asanyarray(n) _out = cast( - npt.NDArray[np.float64 | np.float128], + npt.NDArray[np.float64] | npt.NDArray[np.complex128], _special.digamma(_n + 1, out), # type: ignore ) _out += np.euler_gamma