Skip to content

Commit

Permalink
Renamed the SciPy conversion functions and made them generics.
Browse files Browse the repository at this point in the history
This should make it easier to extend for classes that don't need to go
through block processing, e.g., HDF5 compressed sparse matrices.
  • Loading branch information
LTLA committed Jan 30, 2024
1 parent 1df3701 commit 585feb7
Show file tree
Hide file tree
Showing 4 changed files with 55 additions and 77 deletions.
4 changes: 1 addition & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -192,9 +192,7 @@ delayedarray.to_sparse_array(d)
Users can easily convert a 2-dimensional `SparseNdarray` to some of the common SciPy sparse matrix classes for downstream calculations.

```python
delayedarray.to_scipy_csc_matrix(current)
delayedarray.to_scipy_csr_matrix(current)
delayedarray.to_scipy_coo_matrix(current)
delayedarray.to_scipy_sparse_matrix(current, "csc")
```

More simply, users can just call `numpy.array()` to realize the delayed operations into a standard NumPy array for consumption.
Expand Down
3 changes: 1 addition & 2 deletions src/delayedarray/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
from .extract_sparse_array import extract_sparse_array
from .to_dense_array import to_dense_array
from .to_sparse_array import to_sparse_array
from .to_scipy_sparse_matrix import *

from .create_dask_array import create_dask_array
from .is_sparse import is_sparse
Expand All @@ -40,5 +41,3 @@
from .apply_over_dimension import apply_over_dimension, choose_block_size_for_1d_iteration, guess_iteration_block_size
from .apply_over_blocks import apply_over_blocks, choose_block_shape_for_iteration
from .wrap import wrap

from .to_scipy import *
Original file line number Diff line number Diff line change
@@ -1,38 +1,17 @@
import numpy
from typing import Any
from functools import singledispatch
from typing import Any, Literal
from biocutils.package_utils import is_package_installed

from .SparseNdarray import SparseNdarray
from .extract_sparse_array import to_sparse_array
from .to_sparse_array import to_sparse_array


if is_package_installed("scipy"):
import scipy.sparse


def to_scipy_csc_matrix(x: Any) -> scipy.sparse.csc_matrix:
"""
Convert a 2-dimensional ``DelayedArray`` or ``SparseNdarray`` into a
SciPy compressed sparse column (CSC) matrix.
Args:
x:
Input matrix where :py:func:`~delayedarray.is_sparse.is_sparse`
returns True and :py:func:`~delayedarray.is_sparse.is_masked`
returns False.
Returns:
A CSC matrix with the contents of ``x``.
"""
# One might think that we could be more memory-efficient by doing block
# processing. However, there is no advantage from doing so as we eventually
# need to hold all the blocks in memory before concatenation. We'd only
# avoid this if we did two passes; one to collect the total size for
# allocation, and another to actually fill the vectors; not good, so we
# just forget about it and load it all into memory up-front.
if not isinstance(x, SparseNdarray):
x = to_sparse_array(x)

def _to_csc(x: Any) -> scipy.sparse.csc_matrix:
all_indptrs = numpy.zeros(x.shape[1] + 1, dtype=numpy.uint64)
if x.contents is not None:
all_indices = []
Expand All @@ -53,26 +32,7 @@ def to_scipy_csc_matrix(x: Any) -> scipy.sparse.csc_matrix:
return scipy.sparse.csc_matrix((all_values, all_indices, all_indptrs), shape=x.shape)


def to_scipy_csr_matrix(x: Any) -> scipy.sparse.csr_matrix:
"""
Convert a 2-dimensional ``DelayedArray`` or ``SparseNdarray`` into a
SciPy compressed sparse row (CSR) matrix.
Args:
x:
Input matrix where :py:func:`~delayedarray.is_sparse.is_sparse`
returns True and :py:func:`~delayedarray.is_sparse.is_masked`
returns False.
Returns:
A CSR matrix with the contents of ``x``.
"""
# Same logic as above; block processing just ends up reading the entire
# thing into memory before forming the full arrays, so we just load it
# all in to start with and save ourselves the trouble.
if not isinstance(x, SparseNdarray):
x = to_sparse_array(x)

def _to_csr(x: Any) -> scipy.sparse.csr_matrix:
all_indptrs = numpy.zeros(x.shape[0] + 1, dtype=numpy.uint64)
if x.contents is not None:
# First pass (in memory) to obtain the total sizes.
Expand Down Expand Up @@ -103,24 +63,7 @@ def to_scipy_csr_matrix(x: Any) -> scipy.sparse.csr_matrix:
return scipy.sparse.csr_matrix((all_values, all_indices, all_indptrs), shape=x.shape)


def to_scipy_coo_matrix(x: Any) -> scipy.sparse.coo_matrix:
"""
Convert a 2-dimensional ``DelayedArray`` or ``SparseNdarray`` into a
SciPy sparse coordinate (COO) matrix.
Args:
x:
Input matrix where :py:func:`~delayedarray.is_sparse.is_sparse`
returns True and :py:func:`~delayedarray.is_sparse.is_masked`
returns False.
Returns:
A COO matrix with the contents of ``x``.
"""
# Same logic as above.
if not isinstance(x, SparseNdarray):
x = to_sparse_array(x)

def _to_coo(x: Any) -> scipy.sparse.coo_matrix:
if x.contents is not None:
# First pass (in memory) to obtain the total sizes.
total_count = 0
Expand All @@ -147,3 +90,41 @@ def to_scipy_coo_matrix(x: Any) -> scipy.sparse.coo_matrix:
all_values = numpy.zeros(0, dtype=x.dtype)

return scipy.sparse.coo_matrix((all_values, (all_rows, all_cols)), shape=x.shape)


@singledispatch
def to_scipy_sparse_matrix(x: Any, format: Literal["coo", "csr", "csc"] = "csc") -> scipy.sparse.spmatrix:
    """
    Create a SciPy sparse matrix from a 2-dimensional array.

    This is the fallback of the generic: ``x`` is first realized with
    :py:func:`~delayedarray.to_sparse_array.to_sparse_array` and the result
    is handed to :py:func:`~to_scipy_sparse_matrix_from_SparseNdarray`.

    Args:
        x:
            Input matrix where :py:func:`~delayedarray.is_sparse.is_sparse`
            returns True and :py:func:`~delayedarray.is_masked.is_masked`
            returns False.

        format:
            Type of SciPy matrix to create - coordinate (coo), compressed
            sparse row (csr) or compressed sparse column (csc).

    Returns:
        A SciPy sparse matrix with the contents of ``x``.
    """
    # Block processing might look like the more memory-efficient choice, but
    # it gains nothing here: every block would still have to be held in memory
    # until the final concatenation. The only way around that is a two-pass
    # scheme (one pass to count entries for allocation, a second to fill the
    # vectors), which is not worth it, so everything is loaded up-front.
    realized = to_sparse_array(x)
    return to_scipy_sparse_matrix_from_SparseNdarray(realized, format=format)


@to_scipy_sparse_matrix.register
def to_scipy_sparse_matrix_from_SparseNdarray(x: SparseNdarray, format: Literal["coo", "csr", "csc"] = "csc") -> scipy.sparse.spmatrix:
    """
    Convert a ``SparseNdarray`` into a SciPy sparse matrix.
    See :py:meth:`~to_scipy_sparse_matrix`.

    Args:
        x:
            The ``SparseNdarray`` to convert.

        format:
            Type of SciPy matrix to create - coordinate (coo), compressed
            sparse row (csr) or compressed sparse column (csc).

    Returns:
        A SciPy sparse matrix of the requested ``format`` with the contents
        of ``x``.

    Raises:
        ValueError: If ``format`` is not one of ``"coo"``, ``"csr"`` or
            ``"csc"``.
    """
    if format == "csc":
        return _to_csc(x)
    if format == "csr":
        return _to_csr(x)
    if format == "coo":
        return _to_coo(x)
    # Reject anything else explicitly; silently falling back to COO would
    # hide typos such as "CSC" and hand back a matrix of the wrong class.
    raise ValueError(
        "unsupported 'format' " + repr(format) + ", expected one of 'coo', 'csr' or 'csc'"
    )
18 changes: 9 additions & 9 deletions tests/test_to_scipy.py → tests/test_to_scipy_sparse_matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,37 +5,37 @@
from utils import simulate_SparseNdarray


def test_to_scipy_csc_matrix():
def test_to_scipy_sparse_matrix_csc():
test_shape = (100, 150)
y = simulate_SparseNdarray(test_shape)
z = delayedarray.to_scipy_csc_matrix(y)
z = delayedarray.to_scipy_sparse_matrix(y, "csc")
assert isinstance(z, scipy.sparse.csc_matrix)
assert (z.toarray() == delayedarray.to_dense_array(y)).all()

z = delayedarray.to_scipy_csc_matrix(delayedarray.wrap(y))
z = delayedarray.to_scipy_sparse_matrix(delayedarray.wrap(y), "csc")
assert isinstance(z, scipy.sparse.csc_matrix)
assert (z.toarray() == delayedarray.to_dense_array(y)).all()


def test_to_scipy_csr_matrix():
def test_to_scipy_sparse_matrix_csr():
test_shape = (150, 80)
y = simulate_SparseNdarray(test_shape)
z = delayedarray.to_scipy_csr_matrix(y)
z = delayedarray.to_scipy_sparse_matrix(y, "csr")
assert isinstance(z, scipy.sparse.csr_matrix)
assert (z.toarray() == delayedarray.to_dense_array(y)).all()

z = delayedarray.to_scipy_csr_matrix(delayedarray.wrap(y))
z = delayedarray.to_scipy_sparse_matrix(delayedarray.wrap(y), "csr")
assert isinstance(z, scipy.sparse.csr_matrix)
assert (z.toarray() == delayedarray.to_dense_array(y)).all()


def test_to_scipy_coo_matrix():
def test_to_scipy_sparse_matrix_coo():
test_shape = (70, 90)
y = simulate_SparseNdarray(test_shape)
z = delayedarray.to_scipy_coo_matrix(y)
z = delayedarray.to_scipy_sparse_matrix(y, "coo")
assert isinstance(z, scipy.sparse.coo_matrix)
assert (z.toarray() == delayedarray.to_dense_array(y)).all()

z = delayedarray.to_scipy_coo_matrix(delayedarray.wrap(y))
z = delayedarray.to_scipy_sparse_matrix(delayedarray.wrap(y), "coo")
assert isinstance(z, scipy.sparse.coo_matrix)
assert (z.toarray() == delayedarray.to_dense_array(y)).all()

0 comments on commit 585feb7

Please sign in to comment.