HDF5 read/write functionality (#303)

* Added example hdf5 reader/writer
* HDF5 functionality and tests
* Bumped version number
* Updated pyproject.toml
* Removed typing for python3.8
* Lint corrections
* Windows corrections
* Fixed problems with pandas documentation
* Actually now making read_hdf documentation
* Removed uncovered AttributeError
* Updated the documentation
* bumped version
* Version bump
* Import read_hdf
* Updated docs for new location
* Moved hdf functionality to anesthetic.read
* Updated documentation
* Bumped version
* moved circular import back out again
* Removed inheritance from anesthetic
* fix hdf5 docstring thing
* Adjusted pandas.hdf to anesthetic.read_hdf
* Added back in class

---------

Co-authored-by: lukashergt <[email protected]>
handley-lab · Jun 29, 2023 · 59af500 · 59af500
1 parent 61bed43
commit 59af500
Show file tree

Hide file tree

Showing 12 changed files with 173 additions and 22 deletions.
diff --git a/README.rst b/README.rst
@@ -2,7 +2,7 @@
 anesthetic: nested sampling post-processing
 ===========================================
 :Authors: Will Handley and Lukas Hergt
-:Version: 2.0.0-beta.43
+:Version: 2.0.0-beta.44
 :Homepage: https://github.com/handley-lab/anesthetic
 :Documentation: http://anesthetic.readthedocs.io/
 

diff --git a/anesthetic/__init__.py b/anesthetic/__init__.py
@@ -2,6 +2,7 @@
 import anesthetic.samples
 import anesthetic.plot
 import anesthetic.read.chain
+import anesthetic.read.hdf
 
 import pandas
 import pandas.plotting._core
@@ -47,4 +48,5 @@ def wrapper(backend=None):
 make_2d_axes = anesthetic.plot.make_2d_axes
 make_1d_axes = anesthetic.plot.make_1d_axes
 
+read_hdf = anesthetic.read.hdf.read_hdf
 read_chains = anesthetic.read.chain.read_chains
diff --git a/anesthetic/_version.py b/anesthetic/_version.py
@@ -1 +1 @@
-__version__ = '2.0.0b43'
+__version__ = '2.0.0b44'
diff --git a/anesthetic/read/hdf.py b/anesthetic/read/hdf.py
@@ -0,0 +1,67 @@
+"""Anesthetic overwrites for pandas hdf functionality."""
+from pandas import HDFStore as _HDFStore
+from pandas.io.pytables import to_hdf as _to_hdf, read_hdf as _read_hdf
+from anesthetic.utils import adjust_docstrings
+from anesthetic.samples import NestedSamples, MCMCSamples, Samples
+
+
+class HDFStore(_HDFStore):  # noqa: D101
+    anesthetic_types = {x.__name__: x
+                        for x in [NestedSamples, MCMCSamples, Samples]}
+
+    def get(self, key, *args, **kwargs):  # noqa: D102
+        storer = self.get_storer(key)
+        anesthetic_type = storer.attrs.anesthetic_type
+        anesthetic_type = self.anesthetic_types[anesthetic_type]
+        value = super().get(key, *args, **kwargs)
+        value = anesthetic_type(value)
+        _metadata = storer.attrs._metadata.keys()
+        value._metadata = list(_metadata)
+        for k, v in storer.attrs._metadata.items():
+            setattr(value, k, v)
+        return value
+
+    def put(self, key, value, *args, **kwargs):  # noqa: D102
+        from anesthetic import __version__
+        super().put(key, value, *args, **kwargs)
+        storer = self.get_storer(key)
+        storer.attrs._metadata = {
+                k: getattr(value, k)
+                for k in value._metadata
+                }
+        storer.attrs.anesthetic_type = type(value).__name__
+        storer.attrs.anesthetic_version = __version__
+
+    def select(self, key, *args, **kwargs):  # noqa: D102
+        storer = self.get_storer(key)
+        anesthetic_type = storer.attrs.anesthetic_type
+        anesthetic_type = self.anesthetic_types[anesthetic_type]
+        value = super().select(key, *args, **kwargs)
+        value = anesthetic_type(value)
+        _metadata = storer.attrs._metadata.keys()
+        value._metadata = list(_metadata)
+        for k, v in storer.attrs._metadata.items():
+            setattr(value, k, v)
+        return value
+
+
+def to_hdf(path_or_buf, key, value, mode="a", complevel=None, complib=None,
+           *args, **kwargs):  # noqa: D103
+
+    store = HDFStore(path_or_buf, mode=mode, complevel=complevel,
+                     complib=complib)
+    store.__fspath__ = lambda: store
+    return _to_hdf(store, key, value, *args, **kwargs)
+
+
+def read_hdf(path_or_buf, *args, **kwargs):  # noqa: D103
+    store = HDFStore(path_or_buf)
+    return _read_hdf(store, *args, **kwargs)
+
+
+to_hdf.__doc__ = _to_hdf.__doc__
+read_hdf.__doc__ = _read_hdf.__doc__
+adjust_docstrings(read_hdf, 'read_hdf', 'anesthetic.read_hdf')
+adjust_docstrings(read_hdf, 'DataFrame', 'pandas.DataFrame')
+adjust_docstrings(read_hdf, ':func:`open`', '`open`')
+adjust_docstrings(read_hdf, ':class:`pandas.HDFStore`', '`pandas.HDFStore`')
diff --git a/anesthetic/samples.py b/anesthetic/samples.py
@@ -19,6 +19,7 @@
 from pandas.core.accessor import CachedAccessor
 from anesthetic.plot import (make_1d_axes, make_2d_axes,
                              AxesSeries, AxesDataFrame)
+from anesthetic.utils import adjust_docstrings
 import anesthetic.weighted_pandas
 from anesthetic.plotting import PlotAccessor
 anesthetic.weighted_pandas._WeightedObject.plot =\
@@ -483,6 +484,11 @@ def tex(self):
             "tex = samples.get_label(label)  # anesthetic 2.0"
             )
 
+    def to_hdf(self, path_or_buf, key, *args, **kwargs):  # noqa: D102
+        import anesthetic.read.hdf
+        return anesthetic.read.hdf.to_hdf(path_or_buf, key, self,
+                                          *args, **kwargs)
+
 
 class MCMCSamples(Samples):
     """Storage and plotting tools for MCMC samples.
@@ -1338,3 +1344,10 @@ def merge_samples_weighted(samples, weights=None, label=None):
     new_samples.label = label
 
     return new_samples
+
+
+adjust_docstrings(Samples.to_hdf, r'(pd|pandas)\.DataFrame', 'DataFrame')
+adjust_docstrings(Samples.to_hdf, 'DataFrame', 'pandas.DataFrame')
+adjust_docstrings(Samples.to_hdf, r'(pd|pandas)\.read_hdf', 'read_hdf')
+adjust_docstrings(Samples.to_hdf, 'read_hdf', 'pandas.read_hdf')
+adjust_docstrings(Samples.to_hdf, ':func:`open`', '`open`')
diff --git a/anesthetic/testing.py b/anesthetic/testing.py
@@ -0,0 +1,13 @@
+"""Anesthetic testing utilities."""
+import pandas.testing
+import numpy.testing
+
+
+def assert_frame_equal(left, right, *args, **kwargs):
+    """Assert frames are equal, including metadata."""
+    check_metadata = kwargs.pop('check_metadata', True)
+    pandas.testing.assert_frame_equal(left, right, *args, **kwargs)
+    numpy.testing.assert_array_equal(left._metadata, right._metadata)
+    if check_metadata:
+        for key in left._metadata:
+            assert getattr(left, key) == getattr(right, key)
diff --git a/anesthetic/utils.py b/anesthetic/utils.py
@@ -582,7 +582,7 @@ def temporary_seed(seed):
         np.random.set_state(state)
 
 
-def adjust_docstrings(cls, pattern, repl, *args, **kwargs):
+def adjust_docstrings(obj, pattern, repl, *args, **kwargs):
     """Adjust the docstrings of a class using regular expressions.
 
     After the first argument, the remaining arguments are identical to re.sub.
@@ -598,11 +598,17 @@ class to adjust
     repl : str
         replacement string
     """
-    for key, val in cls.__dict__.items():
-        doc = inspect.getdoc(val)
+    if inspect.isclass(obj):
+        for key, val in obj.__dict__.items():
+            doc = inspect.getdoc(val)
+            if doc is not None:
+                newdoc = re.sub(pattern, repl, doc, *args, **kwargs)
+                try:
+                    obj.__dict__[key].__doc__ = newdoc
+                except AttributeError:
+                    pass
+    else:
+        doc = inspect.getdoc(obj)
         if doc is not None:
             newdoc = re.sub(pattern, repl, doc, *args, **kwargs)
-            try:
-                cls.__dict__[key].__doc__ = newdoc
-            except AttributeError:
-                pass
+            obj.__doc__ = newdoc
diff --git a/docs/source/anesthetic.read.rst b/docs/source/anesthetic.read.rst
@@ -34,6 +34,14 @@ anesthetic.read.getdist module
    :show-inheritance:
 
 
+anesthetic.read.hdf module
+--------------------------
+
+.. automodule:: anesthetic.read.hdf
+   :members:
+   :undoc-members:
+
+
 anesthetic.read.multinest module
 --------------------------------
 

diff --git a/docs/source/anesthetic.rst b/docs/source/anesthetic.rst
@@ -84,6 +84,15 @@ anesthetic.scripts module
    :show-inheritance:
 
 
+anesthetic.testing module
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. automodule:: anesthetic.testing
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+
 anesthetic.utils module
 ~~~~~~~~~~~~~~~~~~~~~~~
 

diff --git a/docs/source/reading_writing.rst b/docs/source/reading_writing.rst
@@ -75,18 +75,17 @@ or ``parquet`` files for reading and writing.
   :meth:`pandas.DataFrame.to_csv` for the various options of saving the data
   (e.g. choosing the delimiter etc.).
 
-* ``samples.to_parquet("filename.parquet")``: When reading and writing speed is
-  an issue, we recommend using the ``parquet`` file format, which should be
+* ``samples.to_hdf("filename.h5", "samples")``: When reading and writing speed
+  is an issue, we recommend using the ``hdf5`` file format, which should be
   faster than ``to_csv`` while still capable of handling the
-  :class:`pandas.MultiIndex` format.  Check out
-  :meth:`pandas.DataFrame.to_parquet` for more information.
+  :class:`pandas.MultiIndex` format.
 
 
 Loading ``NestedSamples`` or ``MCMCSamples``
 ============================================
 
-When loading in previously saved samples, make sure to use the appropriate
-class: ``Samples``, ``MCMCSamples``, or ``NestedSamples``.
+When loading in previously saved samples from csv, make sure to use the
+appropriate class: ``Samples``, ``MCMCSamples``, or ``NestedSamples``.
 
 * ``read_csv``:
 
@@ -96,13 +95,18 @@ class: ``Samples``, ``MCMCSamples``, or ``NestedSamples``.
       from anesthetic import Samples  # or MCMCSamples, or NestedSamples
       samples = Samples(read_csv("filename.csv"))
 
-* ``read_parquet``:
+When loading in previously saved samples from hdf5, make sure to import the
+``anesthetic.read_hdf`` function, and not the ``pandas.read_hdf`` version. If
+you forget to do this, the samples will be read in as a ``DataFrame``, with a
+consequent loss of functionality.
+
+
+* ``read_hdf``:
 
   ::
   
-      from pandas import read_parquet
-      from anesthetic import Samples  # or MCMCSamples, or NestedSamples
-      samples = Samples(read_parquet("filename.parquet"))
+      from anesthetic import read_hdf
+      samples = read_hdf("filename.h5", "samples")
 
 
 Converting to GetDist

diff --git a/pyproject.toml b/pyproject.toml
@@ -62,7 +62,8 @@ test = ["pytest", "pytest-cov", "flake8", "pydocstyle", "packaging", "pre-commit
 astropy = ["astropy"]
 fastkde = ["fastkde"]
 getdist = ["getdist"]
-all = ["astropy", "fastkde", "getdist"]
+hdf5 = ["tables"]
+all = ["astropy", "fastkde", "getdist", "tables"]
 
 [project.scripts]
 anesthetic = "anesthetic.scripts:gui"

diff --git a/tests/test_reader.py b/tests/test_reader.py
@@ -4,14 +4,15 @@
 import pytest
 import numpy as np
 from numpy.testing import assert_array_equal, assert_array_almost_equal
-from pandas.testing import assert_frame_equal
+from anesthetic.testing import assert_frame_equal
 from anesthetic import MCMCSamples, NestedSamples
 from anesthetic import read_chains
 from anesthetic.read.polychord import read_polychord
 from anesthetic.read.getdist import read_getdist
 from anesthetic.read.cobaya import read_cobaya
 from anesthetic.read.multinest import read_multinest
 import pandas._testing as tm
+from anesthetic.read.hdf import HDFStore, read_hdf
 try:
     import getdist
 except ImportError:
@@ -232,4 +233,31 @@ def test_read_fail():
 def test_regex_escape():
     mcmc_1 = read_chains('./tests/example_data/gd_single+X')
     mcmc_2 = read_chains('./tests/example_data/gd_single')
-    assert_frame_equal(mcmc_1, mcmc_2)
+    assert_frame_equal(mcmc_1, mcmc_2, check_metadata=False)
+
+
+@pytest.mark.parametrize('root', ['pc', 'gd'])
+@pytest.mark.xfail('tables' not in sys.modules,
+                   raises=ImportError,
+                   reason="requires tables package")
+def test_hdf5(root):
+    samples = read_chains('./tests/example_data/' + root)
+    filename = 'test_hdf5.h5'
+    key = "samples"
+
+    with HDFStore(filename) as store:
+        store[key] = samples
+
+    with HDFStore(filename) as store:
+        assert_frame_equal(samples, store[key])
+        assert type(store[key]) == type(samples)
+
+    samples.to_hdf(filename, key)
+
+    with HDFStore(filename) as store:
+        assert_frame_equal(samples, store[key])
+        assert type(store[key]) == type(samples)
+
+    samples_ = read_hdf(filename, key)
+    assert_frame_equal(samples_, samples)
+    assert type(samples_) == type(samples)