Skip to content

Commit

Permalink
Handle zero-extent arrays during apply_over_* functions.
Browse files Browse the repository at this point in the history
Now, these functions just don't do any iterations at all. We also
document that all block/chunk shapes are expected to be positive, so as
to avoid divide-by-zero errors in downstream functions.
  • Loading branch information
LTLA committed Jan 30, 2024
1 parent bed54ba commit 4c12481
Show file tree
Hide file tree
Showing 7 changed files with 73 additions and 25 deletions.
14 changes: 10 additions & 4 deletions src/delayedarray/apply_over_blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,14 @@ def choose_block_shape_for_iteration(x, memory: int = 10000000) -> Tuple[int, ..
memory: Available memory in bytes, to hold a single block in memory.
Returns:
Dimensions of the blocks.
Dimensions of the blocks. All values are guaranteed to be positive,
even if the extent of any dimension of ``x`` is zero.
"""
# Checking for empty dimensions and bailing out if we find any.
for d in x.shape:
if d == 0:
return (*(max(1, d) for d in x.shape),)

num_elements = memory / x.dtype.itemsize
chunk_dims = chunk_shape(x)
block_size = 1
Expand Down Expand Up @@ -55,7 +61,6 @@ def choose_block_shape_for_iteration(x, memory: int = 10000000) -> Tuple[int, ..
block_size = block_size_other
block_dims[i] = 1


return (*block_dims,)


Expand All @@ -74,8 +79,9 @@ def apply_over_blocks(x, fun: Callable, block_shape: Optional[Tuple] = None, all
block is typically provided as a :py:class:`~numpy.ndarray`.
block_shape:
Dimensionsof the block on the iteration dimension. If None, this is
chosen by :py:func:`~choose_block_shape_for_iteration`.
Dimensions of the block. All entries should be positive, even for
zero-extent dimensions of ``x``. If None, this is chosen by
:py:func:`~choose_block_shape_for_iteration`.
allow_sparse:
Whether to allow extraction of sparse subarrays. If true and
Expand Down
18 changes: 14 additions & 4 deletions src/delayedarray/apply_over_dimension.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,14 @@
__copyright__ = "ltla"
__license__ = "MIT"


def guess_iteration_block_size(x, dimension, memory: int = 10000000) -> int:
"""
Soft-deprecated alias for :py:func:`~choose_block_size_for_1d_iteration`.
"""
return choose_block_size_for_1d_iteration(x, dimension, memory)


def choose_block_size_for_1d_iteration(x, dimension: int, memory: int = 10000000) -> int:
"""
Choose a block size for iterating over an array on a certain dimension,
Expand All @@ -29,24 +31,31 @@ def choose_block_size_for_1d_iteration(x, dimension: int, memory: int = 10000000
memory: Available memory in bytes, to hold a single block in memory.
Returns:
Size of the block on the iteration dimension.
Size of the block on the iteration dimension. This is guaranteed to be
positive, even if the extent of the dimension of ``x`` is zero.
"""
num_elements = memory / x.dtype.itemsize
shape = x.shape
fulldim = shape[dimension]

prod_other = 1
for i, s in enumerate(shape):
if s == 0:
# Bailing out if there's a zero-length dimension anywhere.
# We set a floor of 1 to avoid divide-by-zero errors.
return max(1, fulldim)
if i != dimension:
prod_other *= s

num_elements = memory / x.dtype.itemsize
ideal = int(num_elements / prod_other)
if ideal == 0:
return 1
if ideal >= fulldim:
return fulldim

curdim = chunk_shape(x)[dimension]
if ideal <= curdim:
return ideal

return int(ideal / curdim) * curdim


Expand All @@ -69,7 +78,8 @@ def apply_over_dimension(x, dimension: int, fun: Callable, block_size: Optional[
Each block is typically provided as a :py:class:`~numpy.ndarray`.
block_size:
Size of the block on the iteration dimension. If None, this is
Size of the block on the iteration dimension. This should be a
positive integer, even for zero-extent dimensions. If None, this is
chosen by :py:func:`~choose_block_size_for_1d_iteration`.
allow_sparse:
Expand Down
9 changes: 7 additions & 2 deletions src/delayedarray/chunk_shape.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,13 @@

@singledispatch
def chunk_shape(x: Any) -> Tuple[int, ...]:
"""Get the dimensions of the array chunks. These define the preferred
blocks with which to iterate over the array in each dimension.
"""
Get the dimensions of the array chunks. These define the preferred
intervals with which to iterate over the array in each dimension, usually
reflecting a particular layout on disk or in memory. The extent of each
chunk dimension should be positive and less than that of the array,
except for zero-length dimensions, in which case the chunk's extent should
be greater than that of the array (typically 1, to avoid divide-by-zero errors).
Args:
x: An array-like object.
Expand Down
28 changes: 16 additions & 12 deletions tests/test_DelayedArray.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,9 +150,10 @@ def test_SparseNdarray_mean_dense(mask_rate, buffer_size):
y = delayedarray.wrap(numpy.ma.MaskedArray([1], mask=True)) + 20
assert y.mean() is numpy.ma.masked

# # Zero-length array is respected.
# y = delayedarray.wrap(numpy.ndarray((10, 0))) * 50
# assert numpy.isnan(y.mean())
# Zero-length array is respected.
with pytest.warns(RuntimeWarning):
y = delayedarray.wrap(numpy.ndarray((10, 0))) * 50
assert numpy.isnan(y.mean())


@pytest.mark.parametrize("mask_rate", [0, 0.5])
Expand All @@ -179,9 +180,10 @@ def test_SparseNdarray_mean_sparse(mask_rate, buffer_size):
y = delayedarray.wrap(ref) / 5
assert y.mean() is numpy.ma.masked

# # Zero-length array is respected.
# y = delayedarray.wrap(delayedarray.SparseNdarray((0,), None)) * 50
# assert numpy.isnan(y.mean())
# Zero-length array is respected.
with pytest.warns(RuntimeWarning):
y = delayedarray.wrap(delayedarray.SparseNdarray((0,), None, dtype=numpy.int32, index_dtype=numpy.int32)) * 50
assert numpy.isnan(y.mean())


@pytest.mark.parametrize("mask_rate", [0, 0.5])
Expand Down Expand Up @@ -210,9 +212,10 @@ def test_SparseNdarray_var_dense(mask_rate, buffer_size):
with pytest.warns(RuntimeWarning):
assert y.var() is numpy.ma.masked

# # Zero-length array is respected.
# y = delayedarray.wrap(numpy.ndarray((10, 0))) * 50
# assert numpy.isnan(y.var())
# Zero-length array is respected.
with pytest.warns(RuntimeWarning):
y = delayedarray.wrap(numpy.ndarray((10, 0))) * 50
assert numpy.isnan(y.var())


@pytest.mark.parametrize("mask_rate", [0, 0.5])
Expand Down Expand Up @@ -240,6 +243,7 @@ def test_SparseNdarray_var_sparse(mask_rate, buffer_size):
with pytest.warns(RuntimeWarning):
assert y.var() is numpy.ma.masked

# # Zero-length array is respected.
# y = delayedarray.wrap(delayedarray.SparseNdarray((0,), None)) * 50
# assert numpy.isnan(y.var())
# Zero-length array is respected.
with pytest.warns(RuntimeWarning):
y = delayedarray.wrap(delayedarray.SparseNdarray((0,), None, dtype=numpy.int32, index_dtype=numpy.int32)) * 50
assert numpy.isnan(y.var())
1 change: 0 additions & 1 deletion tests/test_SparseNdarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -1223,4 +1223,3 @@ def test_SparseNdarray_var(mask_rate):
y = delayedarray.SparseNdarray((0,), None, dtype=numpy.dtype("float64"), index_dtype=numpy.dtype("int8"))
with pytest.warns(RuntimeWarning):
assert numpy.isnan(y.var())

14 changes: 12 additions & 2 deletions tests/test_apply_over_blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,10 @@ def test_choose_block_shape_for_iteration():
assert da.choose_block_shape_for_iteration(x, memory=0) == (1, 1)
assert da.choose_block_shape_for_iteration(x, memory=40) == (1, 5)

# Behaves correctly with empty objects.
empty = np.random.rand(100, 0)
assert da.choose_block_shape_for_iteration(empty) == (100, 1)

x = _ChunkyBoi((100, 200), (20, 25))
assert da.choose_block_shape_for_iteration(x, memory=4000) == (20, 25)
assert da.choose_block_shape_for_iteration(x, memory=40000) == (100, 50)
Expand All @@ -48,7 +52,7 @@ def _dense_sum(position, block):


@pytest.mark.parametrize("mask_rate", [0, 0.2])
def test_apply_over_dimension_dense(mask_rate):
def test_apply_over_block_dense(mask_rate):
x = np.ndarray([100, 200])
counter = 0
for i in range(x.shape[0]):
Expand All @@ -72,7 +76,7 @@ def test_apply_over_dimension_dense(mask_rate):


@pytest.mark.parametrize("mask_rate", [0, 0.2])
def test_apply_over_dimension_sparse(mask_rate):
def test_apply_over_block_sparse(mask_rate):
x = simulate_SparseNdarray((100, 200), mask_rate=mask_rate)

expected = 0
Expand Down Expand Up @@ -105,3 +109,9 @@ def _sparse_sum(position, block):
assert np.allclose(expected, sum(y[1] for y in output))
assert output[0][0] == [(0, 3), (0, 7)]
assert output[-1][0] == [(99, 100), (196, 200)]


def test_apply_over_block_empty():
x = np.ndarray([100, 0])
output = da.apply_over_blocks(x, _dense_sum)
assert len(output) == 0
14 changes: 14 additions & 0 deletions tests/test_apply_over_dimension.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,11 @@ def test_choose_block_size_for_1d_iteration():
assert da.choose_block_size_for_1d_iteration(x, 0, memory=0) == 1
assert da.choose_block_size_for_1d_iteration(x, 1, memory=0) == 1

# Behaves correctly with empty objects.
empty = np.random.rand(100, 0)
assert da.choose_block_size_for_1d_iteration(empty, 0) == 100
assert da.choose_block_size_for_1d_iteration(empty, 1) == 1

# Making a slightly more complex situation.
x = _ChunkyBoi((100, 200), (20, 25))
assert da.choose_block_size_for_1d_iteration(x, 0, memory=4000) == 2
Expand Down Expand Up @@ -125,3 +130,12 @@ def _sparse_sum(position, block):
assert np.allclose(expected, sum(y[1] for y in output))
assert output[0][0] == (0, 7)
assert output[-1][0] == (196, 200)


def test_apply_over_dimension_empty():
x = np.ndarray([100, 0])
output = da.apply_over_dimension(x, 0, _dense_sum)
assert len(output) == 1

output = da.apply_over_dimension(x, 1, _dense_sum)
assert len(output) == 0

0 comments on commit 4c12481

Please sign in to comment.