handley-lab · DilyOng · Aug 24, 2023 · Aug 24, 2023 · Aug 31, 2023 · Sep 30, 2023
diff --git a/README.rst b/README.rst
@@ -2,7 +2,7 @@
 anesthetic: nested sampling post-processing
 ===========================================
 :Authors: Will Handley and Lukas Hergt
-:Version: 2.9.0
+:Version: 2.10.0
 :Homepage: https://github.com/handley-lab/anesthetic
 :Documentation: http://anesthetic.readthedocs.io/
 

diff --git a/anesthetic/_version.py b/anesthetic/_version.py
@@ -1 +1 @@
-__version__ = '2.9.0'
+__version__ = '2.10.0'
diff --git a/anesthetic/read/csv.py b/anesthetic/read/csv.py
@@ -5,7 +5,7 @@
 
 
 def read_csv(filename, *args, **kwargs):
-    """Read a CSV file into a :class:`Samples` object."""
+    """Read a CSV file into a :class:`anesthetic.samples.Samples` object."""
     filename = Path(filename)
     kwargs['label'] = kwargs.get('label', filename.stem)
     wldf = wl_read_csv(filename.with_suffix('.csv'))

diff --git a/anesthetic/tension.py b/anesthetic/tension.py
@@ -0,0 +1,104 @@
+"""Tension statistics between two datasets."""
+from anesthetic.samples import Samples
+from scipy.stats import chi2
+
+
+def stats(A, B, AB, nsamples=None, beta=None):  # noqa: D301
+    r"""Compute tension statistics between two samples.
+
+    Using nested sampling we can compute:
+
+    - ``logR``: R statistic for dataset consistency
+
+      .. math::
+        \log R = \log Z_{AB} - \log Z_{A} - \log Z_{B}
+
+    - ``logI``: information ratio
+
+      .. math::
+        \log I = D_{KL}^{A} + D_{KL}^{B} - D_{KL}^{AB}
+
+    - ``logS``: suspiciousness
+
+      .. math::
+        \log S = \log L_{AB} - \log L_{A} - \log L_{B}
+
+    - ``d_G``: Gaussian model dimensionality of shared constrained parameters
+
+      .. math::
+        d = d_{A} + d_{B} - d_{AB}
+
+    - ``p``: p-value for the tension between two samples
+
+      .. math::
+        p = \int_{d-2\log{S}}^{\infty} \chi^2_d(x) dx
+
+    Parameters
+    ----------
+    A : :class:`anesthetic.samples.Samples`
+        :class:`anesthetic.samples.NestedSamples` object from a sampling run
+        using only dataset A.
+        Alternatively, you can pass a precomputed stats object returned from
+        :meth:`anesthetic.samples.NestedSamples.stats`.
+
+    B : :class:`anesthetic.samples.Samples`
+        :class:`anesthetic.samples.NestedSamples` object from a sampling run
+        using only dataset B.
+        Alternatively, you can pass the precomputed stats object returned from
+        :meth:`anesthetic.samples.NestedSamples.stats`.
+
+    AB : :class:`anesthetic.samples.Samples`
+        :class:`anesthetic.samples.NestedSamples` object from a sampling run
+        using both datasets A and B jointly.
+        Alternatively, you can pass the precomputed stats object returned from
+        :meth:`anesthetic.samples.NestedSamples.stats`.
+
+    nsamples : int, optional
+        - If nsamples is not supplied, calculate mean value.
+        - If nsamples is integer, draw nsamples from the distribution of
+          values inferred by nested sampling.
+
+    beta : float, array-like, default=1
+        Inverse temperature(s) `beta=1/kT`.
+
+    Returns
+    -------
+    samples : :class:`anesthetic.samples.Samples`
+        DataFrame containing the following tension statistics in columns:
+        ['logR', 'logI', 'logS', 'd_G', 'p']
+    """
+    columns = ['logZ', 'D_KL', 'logL_P', 'd_G']
+    if set(columns).issubset(A.drop_labels().columns):
+        statsA = A
+    else:
+        statsA = A.stats(nsamples=nsamples, beta=beta)
+    if set(columns).issubset(B.drop_labels().columns):
+        statsB = B
+    else:
+        statsB = B.stats(nsamples=nsamples, beta=beta)
+    if set(columns).issubset(AB.drop_labels().columns):
+        statsAB = AB
+    else:
+        statsAB = AB.stats(nsamples=nsamples, beta=beta)
+    if statsA.shape != statsAB.shape or statsB.shape != statsAB.shape:
+        raise ValueError("Shapes of stats_A, stats_B, and stats_AB do not "
+                         "match. Make sure to pass consistent `nsamples`.")
+
+    samples = Samples(index=statsA.index)
+
+    samples['logR'] = statsAB['logZ'] - statsA['logZ'] - statsB['logZ']
+    samples.set_label('logR', r'$\ln\mathcal{R}$')
+
+    samples['logI'] = statsA['D_KL'] + statsB['D_KL'] - statsAB['D_KL']
+    samples.set_label('logI', r'$\ln\mathcal{I}$')
+
+    samples['logS'] = statsAB['logL_P'] - statsA['logL_P'] - statsB['logL_P']
+    samples.set_label('logS', r'$\ln\mathcal{S}$')
+
+    samples['d_G'] = statsA['d_G'] + statsB['d_G'] - statsAB['d_G']
+    samples.set_label('d_G', r'$d_\mathrm{G}$')
+
+    p = chi2.sf(samples['d_G'] - 2 * samples['logS'], df=samples['d_G'])
+    samples['p'] = p
+    samples.set_label('p', '$p$')
+    return samples
diff --git a/docs/source/anesthetic.read.rst b/docs/source/anesthetic.read.rst
@@ -25,6 +25,15 @@ anesthetic.read.cobaya module
    :show-inheritance:
 
 
+anesthetic.read.csv module
+--------------------------
+
+.. automodule:: anesthetic.read.csv
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+
 anesthetic.read.getdist module
 ------------------------------
 

diff --git a/docs/source/anesthetic.rst b/docs/source/anesthetic.rst
@@ -72,6 +72,7 @@ anesthetic.samples module
 .. automodule:: anesthetic.samples
    :members:
    :undoc-members:
+   :show-inheritance:
 
 
 anesthetic.scripts module
@@ -83,6 +84,15 @@ anesthetic.scripts module
    :show-inheritance:
 
 
+anesthetic.tension module
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. automodule:: anesthetic.tension
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+
 anesthetic.testing module
 ~~~~~~~~~~~~~~~~~~~~~~~~~
 

diff --git a/tests/test_tension.py b/tests/test_tension.py
@@ -0,0 +1,126 @@
+from pytest import approx, raises
+from anesthetic.examples.perfect_ns import correlated_gaussian
+import numpy as np
+from numpy.linalg import inv, slogdet
+from pandas.testing import assert_series_equal
+from anesthetic.tension import stats
+
+
+def test_tension_stats_compatible_gaussian():
+    np.random.seed(42)
+    d = 3
+    nlive = 10 * d
+    bounds = [[-1, 1], [0, 3], [0, 1]]
+    logV = np.log(np.diff(bounds).ravel().prod())
+
+    muA = np.array([0.1, 0.3, 0.5])
+    covA = 0.01 * np.array([[0.010, 0.009, 0.0],
+                            [0.009, 0.010, 0.0],
+                            [0.000, 0.000, 0.1]])
+    invcovA = inv(covA)
+    logLmaxA = 0
+    samplesA = correlated_gaussian(nlive, muA, covA, bounds, logLmaxA)
+
+    muB = np.array([0.1, 0.3, 0.5])
+    covB = 0.01 * np.array([[+0.010, -0.009, +0.010],
+                            [-0.009, +0.010, -0.001],
+                            [+0.010, -0.001, +0.100]])
+    invcovB = inv(covB)
+    logLmaxB = 0
+    samplesB = correlated_gaussian(nlive, muB, covB, bounds, logLmaxB)
+
+    covAB = inv(invcovA + invcovB)
+    muAB = covAB @ (invcovA @ muA + invcovB @ muB)
+    dmuAB = muA - muB
+    dmu_cov_dmu_AB = dmuAB @ invcovA @ covAB @ invcovB @ dmuAB
+    logLmaxAB = logLmaxA + logLmaxB - dmu_cov_dmu_AB / 2
+    samplesAB = correlated_gaussian(nlive, muAB, covAB, bounds, logLmaxAB)
+
+    nsamples = 10
+    beta = 1
+    s = stats(samplesA, samplesB, samplesAB, nsamples, beta)
+
+    logR_exact = logV - dmu_cov_dmu_AB/2 - slogdet(2*np.pi*(covA+covB))[1]/2
+    assert s.logR.mean() == approx(logR_exact, abs=3*s.logR.std())
+
+    logS_exact = d / 2 - dmu_cov_dmu_AB / 2
+    assert s.logS.mean() == approx(logS_exact, abs=3*s.logS.std())
+
+    logI_exact = logV - d / 2 - slogdet(2*np.pi*(covA+covB))[1] / 2
+    assert s.logI.mean() == approx(logI_exact, abs=3*s.logI.std())
+
+    assert s.logS.mean() == approx(s.logR.mean() - s.logI.mean(),
+                                   abs=3*s.logS.std())
+
+    assert s.get_labels().tolist() == ([r'$\ln\mathcal{R}$',
+                                        r'$\ln\mathcal{I}$',
+                                        r'$\ln\mathcal{S}$',
+                                        r'$d_\mathrm{G}$',
+                                        r'$p$'])
+
+    with raises(ValueError):
+        stats(samplesA.stats(nsamples=5), samplesB, samplesAB, nsamples)
+    s2 = stats(samplesA.stats(nsamples=nsamples),
+               samplesB.stats(nsamples=nsamples),
+               samplesAB.stats(nsamples=nsamples))
+    assert_series_equal(s2.mean(), s.mean(), atol=s2.std().max())
+
+
+def test_tension_stats_incompatible_gaussian():
+    np.random.seed(42)
+    d = 3
+    nlive = 10 * d
+    bounds = [[-1, 1], [0, 3], [0, 1]]
+    logV = np.log(np.diff(bounds).ravel().prod())
+
+    muA = np.array([0.1, 0.3, 0.5])
+    covA = 0.01 * np.array([[0.010, 0.009, 0.0],
+                            [0.009, 0.010, 0.0],
+                            [0.000, 0.000, 0.1]])
+    invcovA = inv(covA)
+    logLmaxA = 0
+    samplesA = correlated_gaussian(nlive, muA, covA, bounds, logLmaxA)
+
+    muB = np.array([0.15, 0.25, 0.45])
+    covB = 0.01 * np.array([[+0.010, -0.009, +0.010],
+                            [-0.009, +0.010, -0.001],
+                            [+0.010, -0.001, +0.100]])
+    invcovB = inv(covB)
+    logLmaxB = 0
+    samplesB = correlated_gaussian(nlive, muB, covB, bounds, logLmaxB)
+
+    covAB = inv(invcovA + invcovB)
+    muAB = covAB @ (invcovA @ muA + invcovB @ muB)
+    dmuAB = muA - muB
+    dmu_cov_dmu_AB = dmuAB @ invcovA @ covAB @ invcovB @ dmuAB
+    logLmaxAB = logLmaxA + logLmaxB - dmu_cov_dmu_AB / 2
+    samplesAB = correlated_gaussian(nlive, muAB, covAB, bounds, logLmaxAB)
+
+    nsamples = 10
+    beta = 1
+    s = stats(samplesA, samplesB, samplesAB, nsamples, beta)
+
+    logR_exact = logV - dmu_cov_dmu_AB/2 - slogdet(2*np.pi*(covA+covB))[1]/2
+    assert s.logR.mean() == approx(logR_exact, abs=3*s.logR.std())
+
+    logS_exact = d / 2 - dmu_cov_dmu_AB / 2
+    assert s.logS.mean() == approx(logS_exact, abs=3*s.logS.std())
+
+    logI_exact = logV - d / 2 - slogdet(2*np.pi*(covA+covB))[1] / 2
+    assert s.logI.mean() == approx(logI_exact, abs=3*s.logI.std())
+
+    assert s.logS.mean() == approx(s.logR.mean() - s.logI.mean(),
+                                   abs=3*s.logS.std())
+
+    assert s.get_labels().tolist() == ([r'$\ln\mathcal{R}$',
+                                        r'$\ln\mathcal{I}$',
+                                        r'$\ln\mathcal{S}$',
+                                        r'$d_\mathrm{G}$',
+                                        r'$p$'])
+
+    with raises(ValueError):
+        stats(samplesA.stats(nsamples=5), samplesB, samplesAB, nsamples)
+    s2 = stats(samplesA.stats(nsamples=nsamples),
+               samplesB.stats(nsamples=nsamples),
+               samplesAB.stats(nsamples=nsamples))
+    assert_series_equal(s2.mean(), s.mean(), atol=s2.std().max())