diff --git a/sed/binning/binning.py b/sed/binning/binning.py index b150ad52..99c66d81 100644 --- a/sed/binning/binning.py +++ b/sed/binning/binning.py @@ -19,22 +19,22 @@ from .numba_bin import numba_histogramdd from .utils import _arraysum -from .utils import _simplify_binning_arguments +from .utils import bin_centers_to_bin_edges +from .utils import simplify_binning_arguments N_CPU = psutil.cpu_count() def bin_partition( - part: Union[dask.dataframe.core.DataFrame, pd.DataFrame], + part: Union[dask.dataframe.DataFrame, pd.DataFrame], bins: Union[ int, dict, - tuple, - List[int], - List[np.ndarray], - List[tuple], + Sequence[int], + Sequence[np.ndarray], + Sequence[tuple], ] = 100, - axes: Union[str, Sequence[str]] = None, + axes: Sequence[str] = None, ranges: Sequence[Tuple[float, float]] = None, hist_mode: str = "numba", jitter: Union[list, dict] = None, @@ -44,24 +44,35 @@ def bin_partition( """Compute the n-dimensional histogram of a single dataframe partition. Args: - part (Union[dask.dataframe.core.DataFrame, pd.DataFrame]): dataframe on which + part (Union[dask.dataframe.DataFrame, pd.DataFrame]): dataframe on which to perform the histogram. Usually a partition of a dask DataFrame. - bins (int, dict, tuple, List[int], List[np.ndarray], List[tuple], optional): + bins (int, dict, Sequence[int], Sequence[np.ndarray], Sequence[tuple], optional): Definition of the bins. Can be any of the following cases: - - an integer describing the number of bins in on all dimensions - - a tuple of 3 numbers describing start, end and step of the binning - range - - a np.arrays defining the binning edges - - a list (NOT a tuple) of any of the above (int, tuple or np.ndarray) - - a dictionary made of the axes as keys and any of the above as values. - - This takes priority over the axes and range arguments. Defaults to 100. - axes (Union[str, Sequence[str]], optional): The names of the axes (columns) on - which to calculate the histogram. 
The order will be the order of the - dimensions in the resulting array. Defaults to None. - ranges (Sequence[Tuple[float, float]], optional): list of tuples containing the - start and end point of the binning range. Defaults to None. + - an integer describing the number of bins for all dimensions. This + requires "ranges" to be defined as well. + - A sequence containing one entry of the following types for each + dimension: + + - an integer describing the number of bins. This requires "ranges" + to be defined as well. + - a np.ndarray defining the bin centers + - a tuple of 3 numbers describing start, end and number of bins of + the binning range. + + - a dictionary made of the axes as keys and any of the above as + values. + + The last option takes priority over the axes and range arguments. + Defaults to 100. + axes (Sequence[str], optional): Sequence containing the names of + the axes (columns) on which to calculate the histogram. The order will be + the order of the dimensions in the resulting array. Not required only if + bins are provided as a dictionary containing the axis names. + Defaults to None. + ranges (Sequence[Tuple[float, float]], optional): Sequence of tuples containing + the start and end point of the binning range. Required if bins given as + int or Sequence[int]. Defaults to None. hist_mode (str, optional): Histogram calculation method. - "numpy": use ``numpy.histogramdd``, @@ -98,11 +109,40 @@ def bin_partition( - **edges**: A list of D arrays describing the bin edges for each dimension. 
""" if not skip_test: - bins, axes, ranges = _simplify_binning_arguments(bins, axes, ranges) + bins, axes, ranges = simplify_binning_arguments(bins, axes, ranges) else: + if not isinstance(bins, list) or not ( + all(isinstance(x, int) for x in bins) + or all(isinstance(x, np.ndarray) for x in bins) + ): + raise TypeError( + "bins needs to be of type 'List[int] or List[np.ndarray]' if tests are skipped!", + ) + if not (isinstance(axes, list)) or not all( + isinstance(axis, str) for axis in axes + ): + raise TypeError( + "axes needs to be of type 'List[str]' if tests are skipped!", + ) bins = cast(Union[List[int], List[np.ndarray]], bins) - axes = cast(Sequence[str], axes) - ranges = cast(Sequence[Tuple[float, float]], ranges) + axes = cast(List[str], axes) + ranges = cast(List[Tuple[float, float]], ranges) + + # convert bin centers to bin edges: + if all(isinstance(x, np.ndarray) for x in bins): + bins = cast(List[np.ndarray], bins) + for i, bin_centers in enumerate(bins): + bins[i] = bin_centers_to_bin_edges(bin_centers) + else: + bins = cast(List[int], bins) + # shift ranges by half a bin size to align the bin centers to the given ranges, + # as the histogram functions interprete the ranges as limits for the edges. + for i, nbins in enumerate(bins): + halfbinsize = (ranges[i][1] - ranges[i][0]) / (nbins) / 2 + ranges[i] = ( + ranges[i][0] - halfbinsize, + ranges[i][1] - halfbinsize, + ) # Locate columns for binning operation col_id = [part.columns.get_loc(axis) for axis in axes] @@ -162,12 +202,11 @@ def bin_dataframe( bins: Union[ int, dict, - tuple, - List[int], - List[np.ndarray], - List[tuple], + Sequence[int], + Sequence[np.ndarray], + Sequence[tuple], ] = 100, - axes: Union[str, Sequence[str]] = None, + axes: Sequence[str] = None, ranges: Sequence[Tuple[float, float]] = None, hist_mode: str = "numba", mode: str = "fast", @@ -185,22 +224,33 @@ def bin_dataframe( Args: df (dask.dataframe.DataFrame): a dask.DataFrame on which to perform the histogram. 
- bins (int, dict, tuple, List[int], List[np.ndarray], List[tuple], optional): - Definition of the bins. Can be any of the following cases: - - - an integer describing the number of bins in on all dimensions - - a tuple of 3 numbers describing start, end and step of the binning - range - - a np.arrays defining the binning edges - - a list (NOT a tuple) of any of the above (int, tuple or np.ndarray) - - a dictionary made of the axes as keys and any of the above as values. - - This takes priority over the axes and range arguments. Defaults to 100. - axes (Union[str, Sequence[str]], optional): The names of the axes (columns) on - which to calculate the histogram. The order will be the order of the - dimensions in the resulting array. Defaults to None. - ranges (Sequence[Tuple[float, float]], optional): list of tuples containing the - start and end point of the binning range. Defaults to None. + bins (int, dict, Sequence[int], Sequence[np.ndarray], Sequence[tuple], optional): + Definition of the bins. Can be any of the following cases: + + - an integer describing the number of bins for all dimensions. This + requires "ranges" to be defined as well. + - A sequence containing one entry of the following types for each + dimension: + + - an integer describing the number of bins. This requires "ranges" + to be defined as well. + - a np.ndarray defining the bin centers + - a tuple of 3 numbers describing start, end and number of bins of + the binning range. + + - a dictionary made of the axes as keys and any of the above as + values. + + The last option takes priority over the axes and range arguments. + Defaults to 100. + axes (Sequence[str], optional): Sequence containing the names of + the axes (columns) on which to calculate the histogram. The order will be + the order of the dimensions in the resulting array. Not required only if + bins are provided as a dictionary containing the axis names. + Defaults to None. 
+ ranges (Sequence[Tuple[float, float]], optional): Sequence of tuples containing + the start and end point of the binning range. Required if bins given as + int or Sequence[int]. Defaults to None. hist_mode (str, optional): Histogram calculation method. - "numpy": use ``numpy.histogramdd``, @@ -243,30 +293,27 @@ def bin_dataframe( Returns: xr.DataArray: The result of the n-dimensional binning represented in an - xarray object, combining the data with the axes. + xarray object, combining the data with the axes (bin centers). """ - bins, axes, ranges = _simplify_binning_arguments(bins, axes, ranges) + bins, axes, ranges = simplify_binning_arguments(bins, axes, ranges) # create the coordinate axes for the xarray output + # if provided as array, they are interpreted as bin centers if isinstance(bins[0], np.ndarray): + bins = cast(List[np.ndarray], bins) coords = dict(zip(axes, bins)) elif ranges is None: raise ValueError( - "bins is not an array and range is none.. this shouldn't happen.", + "bins is not an array and range is none. 
this shouldn't happen.", ) else: bins = cast(List[int], bins) coords = { - ax: np.linspace(r[0], r[1], n) + ax: np.linspace(r[0], r[1], n, endpoint=False) for ax, r, n in zip(axes, ranges, bins) } - if isinstance(bins[0], np.ndarray): - bins = cast(List[np.ndarray], bins) - full_shape = tuple(x.size for x in bins) - else: - bins = cast(List[int], bins) - full_shape = tuple(bins) + full_shape = tuple(axis.size for axis in coords.values()) full_result = np.zeros(full_shape) partition_results = [] # Partition-level results diff --git a/sed/binning/numba_bin.py b/sed/binning/numba_bin.py index 5c8ea95e..8b6f22ab 100644 --- a/sed/binning/numba_bin.py +++ b/sed/binning/numba_bin.py @@ -163,10 +163,10 @@ def numba_histogramdd( Args: sample (np.ndarray): The data to be histogrammed with shape N,D bins (Union[int, Sequence[int], Sequence[np.ndarray], np.ndarray]): The number - of bins for each dimension D, or a sequence of bins on which to calculate + of bins for each dimension D, or a sequence of bin edges on which to calculate the histogram. - ranges (Sequence, optional): The range to use for binning when bins is a list - of integers. Defaults to None. + ranges (Sequence, optional): The range(s) to use for binning when bins is a sequence + of integers or sequence of arrays. Defaults to None. Raises: ValueError: In case of dimension mismatch. 
diff --git a/sed/binning/utils.py b/sed/binning/utils.py index 442e6d8a..a85e9183 100644 --- a/sed/binning/utils.py +++ b/sed/binning/utils.py @@ -15,21 +15,20 @@ def _arraysum(array_a, array_b): return array_a + array_b -def _simplify_binning_arguments( +def simplify_binning_arguments( bins: Union[ int, dict, - tuple, - List[int], - List[np.ndarray], - List[tuple], - ] = 100, - axes: Union[str, Sequence[str]] = None, + Sequence[int], + Sequence[np.ndarray], + Sequence[tuple], + ], + axes: Sequence[str] = None, ranges: Sequence[Tuple[float, float]] = None, ) -> Tuple[ Union[List[int], List[np.ndarray]], - Sequence[str], - Sequence[Tuple[float, float]], + List[str], + List[Tuple[float, float]], ]: """Convert the flexible input for defining bins into a simple "axes" "bins" "ranges" tuple. @@ -38,23 +37,32 @@ def _simplify_binning_arguments( binning functions defined here. Args: - bins (int, dict, tuple, List[int], List[np.ndarray], List[tuple], optional): + bins (int, dict, Sequence[int], Sequence[np.ndarray], Sequence[tuple]): Definition of the bins. Can be any of the following cases: - - an integer describing the number of bins in on all dimensions - - a tuple of 3 numbers describing start, end and step of the binning - range. - - a np.arrays defining the binning edges - - a list (NOT a tuple) of any of the above (int, tuple or np.ndarray) + - an integer describing the number of bins for all dimensions. This + requires "ranges" to be defined as well. + - A sequence containing one entry of the following types for each + dimension: + + - an integer describing the number of bins. This requires "ranges" + to be defined as well. + - a np.ndarray defining the bin centers + - a tuple of 3 numbers describing start, end and number of bins of + the binning range. + - a dictionary made of the axes as keys and any of the above as values. - This takes priority over the axes and range arguments. Defaults to 100. 
- axes (Union[str, Sequence[str]], optional): The names of the axes (columns) - on which to calculate the histogram. The order will be the order of the - dimensions in the resulting array. Defaults to None. - ranges (Sequence[Tuple[float, float]], optional): list of tuples containing - the start and end point of the binning range. Defaults to None. + The last option takes priority over the axes and range arguments. + axes (Sequence[str], optional): Sequence containing the names of + the axes (columns) on which to calculate the histogram. The order will be + the order of the dimensions in the resulting array. Not required only if + bins are provided as a dictionary containing the axis names. + Defaults to None. + ranges (Sequence[Tuple[float, float]], optional): Sequence of tuples containing + the start and end point of the binning range. Required if bins given as + int or Sequence[int]. Defaults to None. Raises: ValueError: Wrong shape of bins, @@ -63,13 +71,10 @@ def _simplify_binning_arguments( AttributeError: Shape mismatch Returns: - Tuple[List[np.ndarray], Sequence[str], Sequence[Tuple[float, float]]]: - Tuple containing bins, axes, and ranges. + Tuple[Union[List[int], List[np.ndarray]], List[str], List[Tuple[float, float]]]: + Tuple containing lists of bins (numbers of bins or bin centers), axes, and ranges. """ - if isinstance(axes, str): - axes = [axes] # if bins is a dictionary: unravel to axes and bins - if isinstance(bins, dict): axes = [] bins_ = [] @@ -77,26 +82,33 @@ def _simplify_binning_arguments( axes.append(k) bins_.append(v) bins = bins_ - elif isinstance(bins, (int, np.ndarray)): + + # if bins provided as single int, apply to all dimensions + if isinstance(bins, int): bins = [bins] * len(axes) - elif isinstance(bins, tuple): - if len(bins) == 3: - bins = [bins] - else: - raise ValueError( - "Bins defined as tuples should only be used to define start ", - "stop and step of the bins. i.e. 
should always have lenght 3.", - ) - if not isinstance(bins, list): + + # Check that we have a sequence of bins now + if not isinstance(bins, Sequence): raise TypeError(f"Cannot interpret bins of type {type(bins)}") + + # check that we have axes if axes is None: raise AttributeError("Must define on which axes to bin") - if not all(isinstance(x, type(bins[0])) for x in bins): - raise TypeError('All elements in "bins" must be of the same type') - if isinstance(bins[0], tuple): - bins = cast(List[tuple], bins) - assert len(bins[0]) == 3 + # check that axes is a sequence + if not isinstance(axes, Sequence): + raise TypeError(f"Cannot interpret axes of type {type(axes)}") + + # check that all elements of axes are str + if not all(isinstance(axis, str) for axis in axes): + raise TypeError("Axes has to contain only strings!") + + # we got tuples as bins, expand to bins and ranges + if all(isinstance(x, tuple) for x in bins): + bins = cast(Sequence[tuple], bins) + assert ( + len(bins[0]) == 3 + ), "Tuples as bins need to have format (start, end, num_bins)." 
ranges = [] bins_ = [] for tpl in bins: @@ -104,32 +116,32 @@ def _simplify_binning_arguments( ranges.append((tpl[0], tpl[1])) bins_.append(tpl[2]) bins = bins_ - elif not isinstance(bins[0], (int, np.ndarray)): - raise TypeError(f"Could not interpret bins of type {type(bins[0])}") - - if ranges is not None: - if (len(axes) == len(bins) == 1) and isinstance( - ranges[0], - (int, float), - ): - ranges = (cast(Tuple[float, float], ranges),) - elif not len(axes) == len(bins) == len(ranges): + + # if bins are provided as int, check that ranges are present + if all(isinstance(x, int) for x in bins): + bins = cast(List[int], list(bins)) + if ranges is None: raise AttributeError( - "axes and range and bins must have the same number of elements", + "Must provide a range if bins is an integer or list of integers", ) - elif isinstance(bins[0], int): - raise AttributeError( - "Must provide a range if bins is an integer or list of integers", - ) - elif len(axes) != len(bins): + if not isinstance(ranges, Sequence): + raise AttributeError( + f"Ranges must be a sequence, not {type(ranges)}.", + ) + + # otherwise, all bins should be np.ndarrays here + elif all(isinstance(x, np.ndarray) for x in bins): + bins = cast(List[np.ndarray], list(bins)) + else: + raise TypeError(f"Could not interpret bins of type {type(bins)}") + + # check that number of bins and number of axes is the same. + if len(axes) != len(bins): raise AttributeError( "axes and bins must have the same number of elements", ) - # TODO: mypy still thinks List[tuple] is a possible type for bins, nut sure why. 
- bins = cast(Union[List[int], List[np.ndarray]], bins) - - return bins, axes, ranges + return bins, list(axes), list(ranges) if ranges else None def bin_edges_to_bin_centers(bin_edges: np.ndarray) -> np.ndarray: @@ -141,7 +153,6 @@ def bin_edges_to_bin_centers(bin_edges: np.ndarray) -> np.ndarray: Returns: bin_centers: 1d array of bin centers """ - bin_centers = (bin_edges[1:] + bin_edges[:-1]) / 2 return bin_centers diff --git a/tests/test_binning.py b/tests/test_binning.py index 2ce15dd1..0d5c4d67 100644 --- a/tests/test_binning.py +++ b/tests/test_binning.py @@ -1,15 +1,17 @@ """This file contains code that performs several tests for the sed.binning module """ +from typing import Any from typing import List from typing import Sequence from typing import Tuple +from typing import Union import numpy as np import pandas as pd import pytest -from sed.binning.binning import _simplify_binning_arguments from sed.binning.binning import numba_histogramdd +from sed.binning.binning import simplify_binning_arguments from sed.binning.numba_bin import _hist_from_bin_range from sed.binning.utils import bin_centers_to_bin_edges from sed.binning.utils import bin_edges_to_bin_centers @@ -199,44 +201,106 @@ def test_bin_edges_to_bin_centers(): ranges = [(-1, 1), (-2, 2), (-3, 3)] -def test_simplify_binning_arguments_direct(): +@pytest.mark.parametrize( + "args", + [ + (bins[:1], axes[:1], ranges[:1], 1), + (bins[:2], axes[:2], ranges[:2], 2), + (bins[:3], axes[:3], ranges[:3], 3), + ], + ids=lambda x: f"ndim: {x[3]}", +) +@pytest.mark.parametrize( + "arg_type", + [ + "int", + "list_int", + "array", + "tuple", + "dict_int", + "dict_tuple", + "dict_array", + ], +) +def test_simplify_binning_arguments( + args: Tuple[List[int], List[str], List[Tuple[float, float]]], + arg_type: str, +): """Test the result of the _simplify_binning_arguments functions for number of bins and ranges """ - bins_, axes_, ranges_ = _simplify_binning_arguments(bins, axes, ranges) - assert bins_ == bins - 
assert axes_ == axes - assert ranges_ == ranges - - -def test_simplify_binning_arguments_1d(): - """Test the result of the _simplify_binning_arguments functions for number of - bins and ranges, 1D case - """ - bins_, axes_, ranges_ = _simplify_binning_arguments( - bins[0], - axes[0], - ranges[0], + bins_: Union[int, list, dict] = None + axes_: List[str] = None + ranges_: List[Tuple[float, float]] = None + bins_expected: List[Any] = None + axes_expected: List[Any] = None + ranges_expected: List[Any] = None + + bin_centers = [] + for i in range(len(args[1])): + bin_centers.append( + np.linspace(args[2][i][0], args[2][i][1], args[0][i] + 1), + ) + + if arg_type == "int": + bins_ = args[0][0] + axes_ = args[1] + ranges_ = args[2] + bins_expected = [bins_] * len(args[0]) + axes_expected = axes_ + ranges_expected = ranges_ + elif arg_type == "list_int": + bins_ = args[0] + axes_ = args[1] + ranges_ = args[2] + bins_expected = bins_ + axes_expected = axes_ + ranges_expected = ranges_ + elif arg_type == "array": + bins_ = [] + for i in range(len(args[0])): + bins_.append(bin_centers[i]) + axes_ = args[1] + bins_expected = bins_ + axes_expected = axes_ + elif arg_type == "tuple": + bins_ = [] + for i in range(len(args[0])): + bins_.append((args[2][i][0], args[2][i][1], args[0][i])) + axes_ = args[1] + bins_expected = args[0] + axes_expected = axes_ + ranges_expected = args[2] + elif arg_type == "dict_int": + bins_ = {} + for i, axis in enumerate(args[1]): + bins_[axis] = args[0][i] + ranges_ = args[2] + bins_expected = args[0] + axes_expected = args[1] + ranges_expected = args[2] + elif arg_type == "dict_array": + bins_ = {} + for i, axis in enumerate(args[1]): + bins_[axis] = bin_centers[i] + bins_expected = bin_centers + axes_expected = args[1] + elif arg_type == "dict_tuple": + bins_ = {} + for i, axis in enumerate(args[1]): + bins_[axis] = (args[2][i][0], args[2][i][1], args[0][i]) + bins_expected = args[0] + axes_expected = args[1] + ranges_expected = args[2] + + 
bins__, axes__, ranges__ = simplify_binning_arguments( + bins_, + axes_, + ranges_, ) - assert bins_ == [bins[0]] - assert axes_ == [axes[0]] - assert ranges_ == (ranges[0],) - - -def test_simplify_binning_arguments_edges(): - """Test the result of the _simplify_binning_arguments functions for bin edges""" - bin_edges = [np.linspace(r[0], r[1], b) for r, b in zip(ranges, bins)] - bin_edges_, axes_, ranges_ = _simplify_binning_arguments(bin_edges, axes) - for bin_, bin_edges_ in zip(bin_edges_, bin_edges): - np.testing.assert_allclose(bin_, bin_edges_) - assert axes_ == axes - assert ranges_ is None - - -def test_simplify_binning_arguments_tuple(): - """Test the result of the _simplify_binning_arguments functions for bin tuples""" - bin_tuple = [tuple((r[0], r[1], b)) for r, b in zip(ranges, bins)] - bins_, axes_, ranges_ = _simplify_binning_arguments(bin_tuple, axes) - assert bins_ == bins - assert axes_ == axes - assert ranges_ == ranges + + for i, bin_ in enumerate(bins__): + np.testing.assert_array_equal(bin_, bins_expected[i]) + np.testing.assert_array_equal(axes__[i], axes_expected[i]) + if ranges__ is not None: + np.testing.assert_array_equal(ranges__[i], ranges_expected[i])