Merge pull request #98 from OpenCOMPES/bugfix/binning-tuple

Binning bug
OpenCOMPES · May 15, 2023 · 462bbeb · 462bbeb
2 parents c549ab1 + 69c14ab
commit 462bbeb
Show file tree

Hide file tree

Showing 4 changed files with 281 additions and 159 deletions.
diff --git a/sed/binning/binning.py b/sed/binning/binning.py
@@ -19,22 +19,22 @@
 
 from .numba_bin import numba_histogramdd
 from .utils import _arraysum
-from .utils import _simplify_binning_arguments
+from .utils import bin_centers_to_bin_edges
+from .utils import simplify_binning_arguments
 
 N_CPU = psutil.cpu_count()
 
 
 def bin_partition(
-    part: Union[dask.dataframe.core.DataFrame, pd.DataFrame],
+    part: Union[dask.dataframe.DataFrame, pd.DataFrame],
     bins: Union[
         int,
         dict,
-        tuple,
-        List[int],
-        List[np.ndarray],
-        List[tuple],
+        Sequence[int],
+        Sequence[np.ndarray],
+        Sequence[tuple],
     ] = 100,
-    axes: Union[str, Sequence[str]] = None,
+    axes: Sequence[str] = None,
     ranges: Sequence[Tuple[float, float]] = None,
     hist_mode: str = "numba",
     jitter: Union[list, dict] = None,
@@ -44,24 +44,35 @@ def bin_partition(
     """Compute the n-dimensional histogram of a single dataframe partition.
 
     Args:
-        part (Union[dask.dataframe.core.DataFrame, pd.DataFrame]): dataframe on which
+        part (Union[dask.dataframe.DataFrame, pd.DataFrame]): dataframe on which
             to perform the histogram. Usually a partition of a dask DataFrame.
-        bins (int, dict, tuple, List[int], List[np.ndarray], List[tuple], optional):
+            bins (int, dict, Sequence[int], Sequence[np.ndarray], Sequence[tuple], optional):
             Definition of the bins. Can  be any of the following cases:
 
-                - an integer describing the number of bins in on all dimensions
-                - a tuple of 3 numbers describing start, end and step of the binning
-                  range
-                - a np.arrays defining the binning edges
-                - a list (NOT a tuple) of any of the above (int, tuple or np.ndarray)
-                - a dictionary made of the axes as keys and any of the above as values.
-
-            This takes priority over the axes and range arguments. Defaults to 100.
-        axes (Union[str, Sequence[str]], optional): The names of the axes (columns) on
-            which to calculate the histogram. The order will be the order of the
-            dimensions in the resulting array. Defaults to None.
-        ranges (Sequence[Tuple[float, float]], optional): list of tuples containing the
-            start and end point of the binning range. Defaults to None.
+                - an integer describing the number of bins for all dimensions. This
+                  requires "ranges" to be defined as well.
+                - A sequence containing one entry of the following types for each
+                  dimenstion:
+
+                    - an integer describing the number of bins. This requires "ranges"
+                      to be defined as well.
+                    - a np.arrays defining the bin centers
+                    - a tuple of 3 numbers describing start, end and step of the binning
+                      range.
+
+                - a dictionary made of the axes as keys and any of the above as
+                  values.
+
+            The last option takes priority over the axes and range arguments.
+            Defaults to 100.
+        axes (Sequence[str], optional): Sequence containing the names of
+            the axes (columns) on which to calculate the histogram. The order will be
+            the order of the dimensions in the resulting array. Only not required if
+            bins are provided as dictionary containing the axis names.
+            Defaults to None.
+        ranges (Sequence[Tuple[float, float]], optional): Sequence of tuples containing
+            the start and end point of the binning range. Required if bins given as
+            int or Sequence[int]. Defaults to None.
         hist_mode (str, optional): Histogram calculation method.
 
                 - "numpy": use ``numpy.histogramdd``,
@@ -98,11 +109,40 @@ def bin_partition(
         - **edges**: A list of D arrays describing the bin edges for each dimension.
     """
     if not skip_test:
-        bins, axes, ranges = _simplify_binning_arguments(bins, axes, ranges)
+        bins, axes, ranges = simplify_binning_arguments(bins, axes, ranges)
     else:
+        if not isinstance(bins, list) or not (
+            all(isinstance(x, int) for x in bins)
+            or all(isinstance(x, np.ndarray) for x in bins)
+        ):
+            raise TypeError(
+                "bins needs to be of type 'List[int] or List[np.ndarray]' if tests are skipped!",
+            )
+        if not (isinstance(axes, list)) or not all(
+            isinstance(axis, str) for axis in axes
+        ):
+            raise TypeError(
+                "axes needs to be of type 'List[str]' if tests are skipped!",
+            )
         bins = cast(Union[List[int], List[np.ndarray]], bins)
-        axes = cast(Sequence[str], axes)
-        ranges = cast(Sequence[Tuple[float, float]], ranges)
+        axes = cast(List[str], axes)
+        ranges = cast(List[Tuple[float, float]], ranges)
+
+    # convert bin centers to bin edges:
+    if all(isinstance(x, np.ndarray) for x in bins):
+        bins = cast(List[np.ndarray], bins)
+        for i, bin_centers in enumerate(bins):
+            bins[i] = bin_centers_to_bin_edges(bin_centers)
+    else:
+        bins = cast(List[int], bins)
+        # shift ranges by half a bin size to align the bin centers to the given ranges,
+        # as the histogram functions interprete the ranges as limits for the edges.
+        for i, nbins in enumerate(bins):
+            halfbinsize = (ranges[i][1] - ranges[i][0]) / (nbins) / 2
+            ranges[i] = (
+                ranges[i][0] - halfbinsize,
+                ranges[i][1] - halfbinsize,
+            )
 
     # Locate columns for binning operation
     col_id = [part.columns.get_loc(axis) for axis in axes]
@@ -162,12 +202,11 @@ def bin_dataframe(
     bins: Union[
         int,
         dict,
-        tuple,
-        List[int],
-        List[np.ndarray],
-        List[tuple],
+        Sequence[int],
+        Sequence[np.ndarray],
+        Sequence[tuple],
     ] = 100,
-    axes: Union[str, Sequence[str]] = None,
+    axes: Sequence[str] = None,
     ranges: Sequence[Tuple[float, float]] = None,
     hist_mode: str = "numba",
     mode: str = "fast",
@@ -185,22 +224,33 @@ def bin_dataframe(
     Args:
         df (dask.dataframe.DataFrame): a dask.DataFrame on which to perform the
             histogram.
-        bins (int, dict, tuple, List[int], List[np.ndarray], List[tuple], optional):
-            Definition of the bins. Can  be any of the following cases:
-
-                - an integer describing the number of bins in on all dimensions
-                - a tuple of 3 numbers describing start, end and step of the binning
-                  range
-                - a np.arrays defining the binning edges
-                - a list (NOT a tuple) of any of the above (int, tuple or np.ndarray)
-                - a dictionary made of the axes as keys and any of the above as values.
-
-            This takes priority over the axes and range arguments. Defaults to 100.
-        axes (Union[str, Sequence[str]], optional): The names of the axes (columns) on
-            which to calculate the histogram. The order will be the order of the
-            dimensions in the resulting array. Defaults to None.
-        ranges (Sequence[Tuple[float, float]], optional): list of tuples containing the
-            start and end point of the binning range. Defaults to None.
+            bins (int, dict, Sequence[int], Sequence[np.ndarray], Sequence[tuple], optional):
+            Definition of the bins. Can be any of the following cases:
+
+                - an integer describing the number of bins for all dimensions. This
+                  requires "ranges" to be defined as well.
+                - A sequence containing one entry of the following types for each
+                  dimenstion:
+
+                    - an integer describing the number of bins. This requires "ranges"
+                      to be defined as well.
+                    - a np.arrays defining the bin centers
+                    - a tuple of 3 numbers describing start, end and step of the binning
+                      range.
+
+                - a dictionary made of the axes as keys and any of the above as
+                  values.
+
+            The last option takes priority over the axes and range arguments.
+            Defaults to 100.
+        axes (Sequence[str], optional): Sequence containing the names of
+            the axes (columns) on which to calculate the histogram. The order will be
+            the order of the dimensions in the resulting array. Only not required if
+            bins are provided as dictionary containing the axis names.
+            Defaults to None.
+        ranges (Sequence[Tuple[float, float]], optional): Sequence of tuples containing
+            the start and end point of the binning range. Required if bins given as
+            int or Sequence[int]. Defaults to None.
         hist_mode (str, optional): Histogram calculation method.
 
                 - "numpy": use ``numpy.histogramdd``,
@@ -243,30 +293,27 @@ def bin_dataframe(
 
     Returns:
         xr.DataArray: The result of the n-dimensional binning represented in an
-        xarray object, combining the data with the axes.
+        xarray object, combining the data with the axes (bin centers).
     """
-    bins, axes, ranges = _simplify_binning_arguments(bins, axes, ranges)
+    bins, axes, ranges = simplify_binning_arguments(bins, axes, ranges)
 
     # create the coordinate axes for the xarray output
+    # if provided as array, they are interpreted as bin centers
     if isinstance(bins[0], np.ndarray):
+        bins = cast(List[np.ndarray], bins)
         coords = dict(zip(axes, bins))
     elif ranges is None:
         raise ValueError(
-            "bins is not an array and range is none.. this shouldn't happen.",
+            "bins is not an array and range is none. this shouldn't happen.",
         )
     else:
         bins = cast(List[int], bins)
         coords = {
-            ax: np.linspace(r[0], r[1], n)
+            ax: np.linspace(r[0], r[1], n, endpoint=False)
             for ax, r, n in zip(axes, ranges, bins)
         }
 
-    if isinstance(bins[0], np.ndarray):
-        bins = cast(List[np.ndarray], bins)
-        full_shape = tuple(x.size for x in bins)
-    else:
-        bins = cast(List[int], bins)
-        full_shape = tuple(bins)
+    full_shape = tuple(axis.size for axis in coords.values())
 
     full_result = np.zeros(full_shape)
     partition_results = []  # Partition-level results

diff --git a/sed/binning/numba_bin.py b/sed/binning/numba_bin.py
@@ -163,10 +163,10 @@ def numba_histogramdd(
     Args:
         sample (np.ndarray): The data to be histogrammed with shape N,D
         bins (Union[int, Sequence[int], Sequence[np.ndarray], np.ndarray]): The number
-            of bins for each dimension D, or a sequence of bins on which to calculate
+            of bins for each dimension D, or a sequence of bin edges on which to calculate
             the histogram.
-        ranges (Sequence, optional): The range to use for binning when bins is a list
-            of integers. Defaults to None.
+        ranges (Sequence, optional): The range(s) to use for binning when bins is a sequence
+            of integers or sequence of arrays. Defaults to None.
 
     Raises:
         ValueError: In case of dimension mismatch.