Skip to content

Commit

Permalink
Migrate the combining generics from biocgenerics to biocutils. (#8)
Browse files Browse the repository at this point in the history
This aims to consolidate all generics into a single package, rather than
scattering the various functions between here and biocgenerics.
  • Loading branch information
LTLA authored Nov 8, 2023
1 parent c7a1058 commit d828f38
Show file tree
Hide file tree
Showing 13 changed files with 597 additions and 12 deletions.
1 change: 1 addition & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ testing =
pytest
pytest-cov
pandas
scipy

[options.entry_points]
# Add here console scripts like:
Expand Down
7 changes: 7 additions & 0 deletions src/biocutils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,3 +27,10 @@
from .print_wrapped_table import create_floating_names, print_type, print_wrapped_table, truncate_strings
from .subset import subset
from .union import union

from .combine import combine
from .combine_rows import combine_rows
from .combine_columns import combine_columns
from .combine_sequences import combine_sequences

from .convert_to_dense import convert_to_dense
43 changes: 43 additions & 0 deletions src/biocutils/_utils_combine.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
def _check_array_dimensions(x, active: int) -> None:
    """Check that a collection of arrays can be combined along one dimension.

    Every entry of ``x`` is compared against the first entry. It must have
    the same number of dimensions, and the same extent on every dimension
    other than ``active``.

    Args:
        x:
            Non-empty sequence of objects that expose a ``shape`` attribute.

        active:
            Index of the dimension along which the arrays will be combined.
            Extents on this dimension are allowed to differ.

    Raises:
        ValueError:
            If an entry has a different number of dimensions than the first
            entry, or a different extent on any non-active dimension.
    """
    first = x[0].shape
    for i in range(1, len(x)):
        current = x[i].shape
        if len(first) != len(current):
            raise ValueError(
                "inconsistent dimensions for combining arrays (expected "
                + str(len(first))
                + ", got "
                + str(len(current))
                + " for array "
                + str(i)
                + ")"
            )
        for j in range(len(first)):
            if j != active and first[j] != current[j]:
                # Report the extents of the dimension that actually differs
                # (j); the active dimension is allowed to vary between arrays.
                raise ValueError(
                    "inconsistent dimension extents for combining arrays on dimension "
                    + str(active)
                    + " (expected "
                    + str(first[j])
                    + ", got "
                    + str(current[j])
                    + " on dimension "
                    + str(j)
                    + " for array "
                    + str(i)
                    + ")"
                )


def _coerce_sparse_matrix(first, combined, module):
    """Convert ``combined`` to the sparse matrix format used by ``first``.

    Args:
        first:
            Object whose class selects the target format.

        combined:
            Sparse object offering the ``tocsr``, ``tocsc``, ``tobsr``,
            ``tocoo``, ``todia`` and ``tolil`` conversion methods.

        module:
            Module providing the ``*_matrix`` classes to test ``first``
            against (callers in this package pass ``scipy.sparse``).

    Returns:
        ``combined`` converted to the format matching ``first``, or
        ``combined`` itself when ``first`` is none of the known classes.
    """
    # (class name on ``module``, conversion method on ``combined``), checked
    # in this order; attributes are looked up lazily, one pair at a time.
    conversions = (
        ("csr_matrix", "tocsr"),
        ("csc_matrix", "tocsc"),
        ("bsr_matrix", "tobsr"),
        ("coo_matrix", "tocoo"),
        ("dia_matrix", "todia"),
        ("lil_matrix", "tolil"),
    )
    for class_name, method_name in conversions:
        if isinstance(first, getattr(module, class_name)):
            return getattr(combined, method_name)()
    return combined


def _coerce_sparse_array(first, combined, module):
    """Convert ``combined`` to the sparse array format used by ``first``.

    Args:
        first:
            Object whose class selects the target format.

        combined:
            Sparse object offering the ``tocsr``, ``tocsc``, ``tobsr``,
            ``tocoo``, ``todia`` and ``tolil`` conversion methods.

        module:
            Module providing the ``*_array`` classes to test ``first``
            against (callers in this package pass ``scipy.sparse``).

    Returns:
        ``combined`` converted to the format matching ``first``, or
        ``combined`` itself when ``first`` is none of the known classes.
    """
    # (class name on ``module``, conversion method on ``combined``), checked
    # in this order; attributes are looked up lazily, one pair at a time.
    conversions = (
        ("csr_array", "tocsr"),
        ("csc_array", "tocsc"),
        ("bsr_array", "tobsr"),
        ("coo_array", "tocoo"),
        ("dia_array", "todia"),
        ("lil_array", "tolil"),
    )
    for class_name, method_name in conversions:
        if isinstance(first, getattr(module, class_name)):
            return getattr(combined, method_name)()
    return combined
24 changes: 24 additions & 0 deletions src/biocutils/combine.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
from typing import Any

from .combine_rows import combine_rows
from .combine_sequences import combine_sequences


def combine(*x: Any):
    """Combine objects by rows or as sequences, depending on dimensionality.

    Only the first object is inspected. If it is n-dimensional for n > 1
    (i.e. it has a ``shape`` property of length greater than 1), this calls
    :py:func:`~biocutils.combine_rows.combine_rows` to combine the objects
    along the first dimension. Otherwise the objects are assumed to be
    vector-like and
    :py:func:`~biocutils.combine_sequences.combine_sequences` is called
    instead.

    Args:
        x:
            Objects to combine. At least one object must be supplied.

    Returns:
        A combined object, typically the same type as the first element
        in ``x``.

    Raises:
        IndexError:
            If no objects are supplied.
    """
    first = x[0]
    if hasattr(first, "shape") and len(first.shape) > 1:
        return combine_rows(*x)
    return combine_sequences(*x)
82 changes: 82 additions & 0 deletions src/biocutils/combine_columns.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
from functools import singledispatch
from typing import Any
from warnings import warn
import numpy

from ._utils_combine import _check_array_dimensions, _coerce_sparse_matrix, _coerce_sparse_array
from .is_list_of_type import is_list_of_type
from .package_utils import is_package_installed
from .convert_to_dense import convert_to_dense

__author__ = "jkanche"
__copyright__ = "jkanche"
__license__ = "MIT"


@singledispatch
def combine_columns(*x: Any):
    """Combine n-dimensional objects along their second dimension.

    The implementation is chosen from the type of the first object:

    - :py:class:`~numpy.ndarray` objects are combined with numpy's
      :py:func:`~numpy.concatenate`.
    - :py:class:`~scipy.sparse.spmatrix` or
      :py:class:`~scipy.sparse.sparray` objects are combined with scipy's
      :py:class:`~scipy.sparse.hstack`.
    - :py:class:`~pandas.DataFrame` objects are combined with
      :py:func:`~pandas.concat` along the second axis.

    Args:
        x:
            n-dimensional objects to combine. All elements of x are
            expected to be the same class.

    Returns:
        Combined object, typically the same type as the first entry of ``x``.

    Raises:
        NotImplementedError:
            If no method is registered for the type of the first object.
    """
    type_name = type(x[0]).__name__
    raise NotImplementedError(f"no `combine_columns` method implemented for '{type_name}' objects")


@combine_columns.register(numpy.ndarray)
def _combine_columns_dense_arrays(*x: numpy.ndarray):
    """Combine arrays along the second dimension with ``numpy.concatenate``.

    The shapes are validated first, then every element is passed through
    ``convert_to_dense`` before concatenation.
    """
    _check_array_dimensions(x, active=1)
    dense = list(map(convert_to_dense, x))
    return numpy.concatenate(dense, axis=1)


if is_package_installed("scipy") is True:
    import scipy.sparse as sp

    def _combine_columns_sparse_matrices(*x):
        """Combine SciPy sparse matrices along the second dimension.

        When ``is_list_of_type`` reports that every element is an
        ``sp.spmatrix``, the elements are stacked with ``sp.hstack`` and the
        result is coerced to the format of the first element. Otherwise a
        warning is issued, all elements are converted with
        ``convert_to_dense`` and combined with ``numpy.concatenate``.
        """
        _check_array_dimensions(x, 1)
        if is_list_of_type(x, sp.spmatrix):
            combined = sp.hstack(x)
            return _coerce_sparse_matrix(x[0], combined, sp)

        warn("not all elements are scipy sparse matrices")
        x = [convert_to_dense(y) for y in x]
        return numpy.concatenate(x, axis=1)

    try:
        combine_columns.register(sp.spmatrix, _combine_columns_sparse_matrices)
    except AttributeError:
        # Best effort: skip registration if this SciPy lacks the class.
        pass

    def _combine_columns_sparse_arrays(*x):
        """Combine SciPy sparse arrays along the second dimension.

        When ``is_list_of_type`` reports that every element is an
        ``sp.sparray``, the elements are stacked with ``sp.hstack`` and the
        result is coerced to the format of the first element. Otherwise a
        warning is issued, all elements are converted with
        ``convert_to_dense`` and combined with ``numpy.concatenate``.
        """
        _check_array_dimensions(x, 1)
        if is_list_of_type(x, sp.sparray):
            combined = sp.hstack(x)
            return _coerce_sparse_array(x[0], combined, sp)

        warn("not all elements are scipy sparse arrays")
        x = [convert_to_dense(y) for y in x]
        return numpy.concatenate(x, axis=1)

    try:
        # Without this registration the sparse-array method is never
        # reachable through ``combine_columns``.
        combine_columns.register(sp.sparray, _combine_columns_sparse_arrays)
    except AttributeError:
        # Presumably some SciPy releases do not expose ``sparray``; in that
        # case there is nothing to register.
        pass


if is_package_installed("pandas") is True:
    from pandas import DataFrame, concat

    @combine_columns.register(DataFrame)
    def _combine_columns_pandas_dataframe(*x):
        """Combine data frames along the second axis with ``pandas.concat``."""
        frames = x
        return concat(frames, axis=1)
87 changes: 87 additions & 0 deletions src/biocutils/combine_rows.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
from functools import singledispatch
from typing import Any
from warnings import warn
import numpy

from ._utils_combine import _check_array_dimensions, _coerce_sparse_matrix, _coerce_sparse_array
from .is_list_of_type import is_list_of_type
from .package_utils import is_package_installed
from .convert_to_dense import convert_to_dense

__author__ = "jkanche"
__copyright__ = "jkanche"
__license__ = "MIT"


@singledispatch
def combine_rows(*x: Any):
    """Combine n-dimensional objects along their first dimension.

    The implementation is chosen from the type of the first object:

    - :py:class:`~numpy.ndarray` objects are combined with numpy's
      :py:func:`~numpy.concatenate`.
    - :py:class:`~scipy.sparse.spmatrix` or
      :py:class:`~scipy.sparse.sparray` objects are combined with scipy's
      :py:class:`~scipy.sparse.vstack`.
    - :py:class:`~pandas.DataFrame` objects are combined with
      :py:func:`~pandas.concat` along the first axis.

    Args:
        x:
            One or more n-dimensional objects to combine. All elements of x
            are expected to be the same class.

    Returns:
        Combined object, typically the same type as the first entry of ``x``.

    Raises:
        NotImplementedError:
            If no method is registered for the type of the first object.
    """
    type_name = type(x[0]).__name__
    raise NotImplementedError(f"no `combine_rows` method implemented for '{type_name}' objects")


@combine_rows.register(numpy.ndarray)
def _combine_rows_dense_arrays(*x: numpy.ndarray):
    """Combine arrays along the first dimension with ``numpy.concatenate``.

    The shapes are validated first, then every element is passed through
    ``convert_to_dense`` before concatenation.
    """
    _check_array_dimensions(x, active=0)
    dense = list(map(convert_to_dense, x))
    return numpy.concatenate(dense, axis=0)


if is_package_installed("scipy"):
    import scipy.sparse as sp

    def _combine_rows_sparse_matrices(*x):
        """Combine SciPy sparse matrices along the first dimension.

        When ``is_list_of_type`` reports that every element is an
        ``sp.spmatrix``, the elements are stacked with ``sp.vstack`` and the
        result is coerced to the format of the first element. Otherwise a
        warning is issued, all elements are converted with
        ``convert_to_dense`` and combined with ``numpy.concatenate``.
        """
        _check_array_dimensions(x, 0)
        if is_list_of_type(x, sp.spmatrix):
            combined = sp.vstack(x)
            return _coerce_sparse_matrix(x[0], combined, sp)

        warn("not all elements are SciPy sparse matrices")
        x = [convert_to_dense(y) for y in x]
        return numpy.concatenate(x)

    try:
        # Each method is registered right after its own definition, so the
        # function name is always bound when ``register`` is called.
        combine_rows.register(sp.spmatrix, _combine_rows_sparse_matrices)
    except AttributeError:
        # Best effort: skip registration if this SciPy lacks the class.
        pass

    def _combine_rows_sparse_arrays(*x):
        """Combine SciPy sparse arrays along the first dimension.

        When ``is_list_of_type`` reports that every element is an
        ``sp.sparray``, the elements are stacked with ``sp.vstack`` and the
        result is coerced to the format of the first element. Otherwise a
        warning is issued, all elements are converted with
        ``convert_to_dense`` and combined with ``numpy.concatenate``.
        """
        _check_array_dimensions(x, 0)
        if is_list_of_type(x, sp.sparray):
            combined = sp.vstack(x)
            # The first input decides the output format.
            return _coerce_sparse_array(x[0], combined, sp)

        warn("not all elements are SciPy sparse arrays")
        x = [convert_to_dense(y) for y in x]
        return numpy.concatenate(x)

    try:
        combine_rows.register(sp.sparray, _combine_rows_sparse_arrays)
    except AttributeError:
        # Presumably some SciPy releases do not expose ``sparray``; in that
        # case there is nothing to register.
        pass


if is_package_installed("pandas"):
    from pandas import DataFrame, concat

    @combine_rows.register(DataFrame)
    def _combine_rows_pandas_dataframe(*x):
        """Combine data frames along the first axis with ``pandas.concat``."""
        frames = x
        return concat(frames, axis=0)
63 changes: 63 additions & 0 deletions src/biocutils/combine_sequences.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
from functools import singledispatch
from itertools import chain
from typing import Any
from warnings import warn
import numpy

from .is_list_of_type import is_list_of_type
from .package_utils import is_package_installed

__author__ = "jkanche"
__copyright__ = "jkanche"
__license__ = "MIT"


@singledispatch
def combine_sequences(*x: Any):
    """Combine vector-like objects (1-dimensional arrays).

    The implementation is chosen from the type of the first object:

    - :py:class:`~list` objects are chained into a single list.
    - :py:class:`~numpy.ndarray` objects are combined using numpy's
      :py:func:`~numpy.concatenate`.
    - :py:class:`~pandas.Series` objects are combined using
      :py:func:`~pandas.concat`.

    There is no generic fallback: if the first object is of any other type,
    this function raises an error instead of coercing the inputs.

    Args:
        x:
            Vector-like objects to combine. All elements of ``x`` are
            expected to be the same class or at least compatible with
            each other.

    Returns:
        A combined object, ideally of the same type as the first element
        in ``x``.

    Raises:
        NotImplementedError:
            If no method is registered for the type of the first object.
    """
    type_name = type(x[0]).__name__
    raise NotImplementedError(f"no `combine_sequences` method implemented for '{type_name}' objects")


@combine_sequences.register(list)
def _combine_sequences_lists(*x: list):
    """Chain the items of every input, in order, into one new list."""
    return list(chain.from_iterable(x))


@combine_sequences.register(numpy.ndarray)
def _combine_sequences_dense_arrays(*x: numpy.ndarray):
    """Combine the inputs with ``numpy.concatenate`` using ``axis=None``."""
    joined = numpy.concatenate(x, axis=None)
    return joined


if is_package_installed("pandas") is True:
    from pandas import Series, concat

    @combine_sequences.register(Series)
    def _combine_sequences_pandas_series(*x):
        """Combine the inputs into one ``Series`` with ``pandas.concat``.

        Inputs that are not already ``Series`` instances are wrapped in
        one before concatenation.
        """
        if is_list_of_type(x, Series):
            return concat(x)

        # Mixed input: leave existing Series untouched, wrap everything else.
        as_series = [item if isinstance(item, Series) else Series(item) for item in x]
        return concat(as_series)
44 changes: 44 additions & 0 deletions src/biocutils/convert_to_dense.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
from functools import singledispatch
from typing import Any
import numpy

from .package_utils import is_package_installed


@singledispatch
def convert_to_dense(x: Any) -> numpy.ndarray:
    """Convert something to a NumPy dense array of the same shape.

    This is typically used as a fallback for the various combining methods
    when there are lots of different array types that ``numpy.concatenate``
    doesn't understand. This generic implementation passes ``x`` to
    :py:func:`~numpy.array`.

    Args:
        x:
            Some array-like object to be stored as a NumPy array.

    Returns:
        A NumPy array.
    """
    dense = numpy.array(x)
    return dense


@convert_to_dense.register(numpy.ndarray)
def _convert_to_dense_numpy(x: numpy.ndarray) -> numpy.ndarray:
    """Return ``x`` itself; a NumPy array needs no conversion."""
    return x


if is_package_installed("scipy"):
    import scipy.sparse as sp

    def _convert_sparse_to_dense(x):
        """Densify a SciPy sparse object by calling its ``todense`` method."""
        return x.todense()

    # Registration stays best-effort, but only a missing class is tolerated;
    # any other registration failure is surfaced instead of being hidden.
    try:
        convert_to_dense.register(sp.spmatrix, _convert_sparse_to_dense)
    except AttributeError:
        pass

    try:
        convert_to_dense.register(sp.sparray, _convert_sparse_to_dense)
    except AttributeError:
        # Presumably some SciPy releases do not expose ``sparray``.
        pass
Loading

0 comments on commit d828f38

Please sign in to comment.