Skip to content

Commit

Permalink
Merge pull request #98 from OpenCOMPES/bugfix/binning-tuple
Browse files Browse the repository at this point in the history
Binning bug
  • Loading branch information
rettigl authored May 15, 2023
2 parents c549ab1 + 69c14ab commit 462bbeb
Show file tree
Hide file tree
Showing 4 changed files with 281 additions and 159 deletions.
159 changes: 103 additions & 56 deletions sed/binning/binning.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,22 +19,22 @@

from .numba_bin import numba_histogramdd
from .utils import _arraysum
from .utils import _simplify_binning_arguments
from .utils import bin_centers_to_bin_edges
from .utils import simplify_binning_arguments

N_CPU = psutil.cpu_count()


def bin_partition(
part: Union[dask.dataframe.core.DataFrame, pd.DataFrame],
part: Union[dask.dataframe.DataFrame, pd.DataFrame],
bins: Union[
int,
dict,
tuple,
List[int],
List[np.ndarray],
List[tuple],
Sequence[int],
Sequence[np.ndarray],
Sequence[tuple],
] = 100,
axes: Union[str, Sequence[str]] = None,
axes: Sequence[str] = None,
ranges: Sequence[Tuple[float, float]] = None,
hist_mode: str = "numba",
jitter: Union[list, dict] = None,
Expand All @@ -44,24 +44,35 @@ def bin_partition(
"""Compute the n-dimensional histogram of a single dataframe partition.
Args:
part (Union[dask.dataframe.core.DataFrame, pd.DataFrame]): dataframe on which
part (Union[dask.dataframe.DataFrame, pd.DataFrame]): dataframe on which
to perform the histogram. Usually a partition of a dask DataFrame.
bins (int, dict, tuple, List[int], List[np.ndarray], List[tuple], optional):
bins (int, dict, Sequence[int], Sequence[np.ndarray], Sequence[tuple], optional):
Definition of the bins. Can be any of the following cases:
- an integer describing the number of bins in on all dimensions
- a tuple of 3 numbers describing start, end and step of the binning
range
- a np.arrays defining the binning edges
- a list (NOT a tuple) of any of the above (int, tuple or np.ndarray)
- a dictionary made of the axes as keys and any of the above as values.
This takes priority over the axes and range arguments. Defaults to 100.
axes (Union[str, Sequence[str]], optional): The names of the axes (columns) on
which to calculate the histogram. The order will be the order of the
dimensions in the resulting array. Defaults to None.
ranges (Sequence[Tuple[float, float]], optional): list of tuples containing the
start and end point of the binning range. Defaults to None.
- an integer describing the number of bins for all dimensions. This
requires "ranges" to be defined as well.
- A sequence containing one entry of the following types for each
dimenstion:
- an integer describing the number of bins. This requires "ranges"
to be defined as well.
- a np.arrays defining the bin centers
- a tuple of 3 numbers describing start, end and step of the binning
range.
- a dictionary made of the axes as keys and any of the above as
values.
The last option takes priority over the axes and range arguments.
Defaults to 100.
axes (Sequence[str], optional): Sequence containing the names of
the axes (columns) on which to calculate the histogram. The order will be
the order of the dimensions in the resulting array. Only not required if
bins are provided as dictionary containing the axis names.
Defaults to None.
ranges (Sequence[Tuple[float, float]], optional): Sequence of tuples containing
the start and end point of the binning range. Required if bins given as
int or Sequence[int]. Defaults to None.
hist_mode (str, optional): Histogram calculation method.
- "numpy": use ``numpy.histogramdd``,
Expand Down Expand Up @@ -98,11 +109,40 @@ def bin_partition(
- **edges**: A list of D arrays describing the bin edges for each dimension.
"""
if not skip_test:
bins, axes, ranges = _simplify_binning_arguments(bins, axes, ranges)
bins, axes, ranges = simplify_binning_arguments(bins, axes, ranges)
else:
if not isinstance(bins, list) or not (
all(isinstance(x, int) for x in bins)
or all(isinstance(x, np.ndarray) for x in bins)
):
raise TypeError(
"bins needs to be of type 'List[int] or List[np.ndarray]' if tests are skipped!",
)
if not (isinstance(axes, list)) or not all(
isinstance(axis, str) for axis in axes
):
raise TypeError(
"axes needs to be of type 'List[str]' if tests are skipped!",
)
bins = cast(Union[List[int], List[np.ndarray]], bins)
axes = cast(Sequence[str], axes)
ranges = cast(Sequence[Tuple[float, float]], ranges)
axes = cast(List[str], axes)
ranges = cast(List[Tuple[float, float]], ranges)

# convert bin centers to bin edges:
if all(isinstance(x, np.ndarray) for x in bins):
bins = cast(List[np.ndarray], bins)
for i, bin_centers in enumerate(bins):
bins[i] = bin_centers_to_bin_edges(bin_centers)
else:
bins = cast(List[int], bins)
# shift ranges by half a bin size to align the bin centers to the given ranges,
# as the histogram functions interprete the ranges as limits for the edges.
for i, nbins in enumerate(bins):
halfbinsize = (ranges[i][1] - ranges[i][0]) / (nbins) / 2
ranges[i] = (
ranges[i][0] - halfbinsize,
ranges[i][1] - halfbinsize,
)

# Locate columns for binning operation
col_id = [part.columns.get_loc(axis) for axis in axes]
Expand Down Expand Up @@ -162,12 +202,11 @@ def bin_dataframe(
bins: Union[
int,
dict,
tuple,
List[int],
List[np.ndarray],
List[tuple],
Sequence[int],
Sequence[np.ndarray],
Sequence[tuple],
] = 100,
axes: Union[str, Sequence[str]] = None,
axes: Sequence[str] = None,
ranges: Sequence[Tuple[float, float]] = None,
hist_mode: str = "numba",
mode: str = "fast",
Expand All @@ -185,22 +224,33 @@ def bin_dataframe(
Args:
df (dask.dataframe.DataFrame): a dask.DataFrame on which to perform the
histogram.
bins (int, dict, tuple, List[int], List[np.ndarray], List[tuple], optional):
Definition of the bins. Can be any of the following cases:
- an integer describing the number of bins in on all dimensions
- a tuple of 3 numbers describing start, end and step of the binning
range
- a np.arrays defining the binning edges
- a list (NOT a tuple) of any of the above (int, tuple or np.ndarray)
- a dictionary made of the axes as keys and any of the above as values.
This takes priority over the axes and range arguments. Defaults to 100.
axes (Union[str, Sequence[str]], optional): The names of the axes (columns) on
which to calculate the histogram. The order will be the order of the
dimensions in the resulting array. Defaults to None.
ranges (Sequence[Tuple[float, float]], optional): list of tuples containing the
start and end point of the binning range. Defaults to None.
bins (int, dict, Sequence[int], Sequence[np.ndarray], Sequence[tuple], optional):
Definition of the bins. Can be any of the following cases:
- an integer describing the number of bins for all dimensions. This
requires "ranges" to be defined as well.
- A sequence containing one entry of the following types for each
dimenstion:
- an integer describing the number of bins. This requires "ranges"
to be defined as well.
- a np.arrays defining the bin centers
- a tuple of 3 numbers describing start, end and step of the binning
range.
- a dictionary made of the axes as keys and any of the above as
values.
The last option takes priority over the axes and range arguments.
Defaults to 100.
axes (Sequence[str], optional): Sequence containing the names of
the axes (columns) on which to calculate the histogram. The order will be
the order of the dimensions in the resulting array. Only not required if
bins are provided as dictionary containing the axis names.
Defaults to None.
ranges (Sequence[Tuple[float, float]], optional): Sequence of tuples containing
the start and end point of the binning range. Required if bins given as
int or Sequence[int]. Defaults to None.
hist_mode (str, optional): Histogram calculation method.
- "numpy": use ``numpy.histogramdd``,
Expand Down Expand Up @@ -243,30 +293,27 @@ def bin_dataframe(
Returns:
xr.DataArray: The result of the n-dimensional binning represented in an
xarray object, combining the data with the axes.
xarray object, combining the data with the axes (bin centers).
"""
bins, axes, ranges = _simplify_binning_arguments(bins, axes, ranges)
bins, axes, ranges = simplify_binning_arguments(bins, axes, ranges)

# create the coordinate axes for the xarray output
# if provided as array, they are interpreted as bin centers
if isinstance(bins[0], np.ndarray):
bins = cast(List[np.ndarray], bins)
coords = dict(zip(axes, bins))
elif ranges is None:
raise ValueError(
"bins is not an array and range is none.. this shouldn't happen.",
"bins is not an array and range is none. this shouldn't happen.",
)
else:
bins = cast(List[int], bins)
coords = {
ax: np.linspace(r[0], r[1], n)
ax: np.linspace(r[0], r[1], n, endpoint=False)
for ax, r, n in zip(axes, ranges, bins)
}

if isinstance(bins[0], np.ndarray):
bins = cast(List[np.ndarray], bins)
full_shape = tuple(x.size for x in bins)
else:
bins = cast(List[int], bins)
full_shape = tuple(bins)
full_shape = tuple(axis.size for axis in coords.values())

full_result = np.zeros(full_shape)
partition_results = [] # Partition-level results
Expand Down
6 changes: 3 additions & 3 deletions sed/binning/numba_bin.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,10 +163,10 @@ def numba_histogramdd(
Args:
sample (np.ndarray): The data to be histogrammed with shape N,D
bins (Union[int, Sequence[int], Sequence[np.ndarray], np.ndarray]): The number
of bins for each dimension D, or a sequence of bins on which to calculate
of bins for each dimension D, or a sequence of bin edges on which to calculate
the histogram.
ranges (Sequence, optional): The range to use for binning when bins is a list
of integers. Defaults to None.
ranges (Sequence, optional): The range(s) to use for binning when bins is a sequence
of integers or sequence of arrays. Defaults to None.
Raises:
ValueError: In case of dimension mismatch.
Expand Down
Loading

0 comments on commit 462bbeb

Please sign in to comment.