Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Migrate the combining generics from biocgenerics to biocutils. #8

Merged
merged 4 commits into from
Nov 8, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ testing =
pytest
pytest-cov
pandas
scipy

[options.entry_points]
# Add here console scripts like:
Expand Down
7 changes: 7 additions & 0 deletions src/biocutils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,3 +27,10 @@
from .print_wrapped_table import create_floating_names, print_type, print_wrapped_table, truncate_strings
from .subset import subset
from .union import union

from .combine import combine
from .combine_rows import combine_rows
from .combine_columns import combine_columns
from .combine_sequences import combine_sequences

from .convert_to_dense import convert_to_dense
43 changes: 43 additions & 0 deletions src/biocutils/_utils_combine.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
def _check_array_dimensions(x, active: int) -> bool:
    """Check that a collection of arrays can be combined along one dimension.

    Every array must have the same number of dimensions as the first one,
    and must match the first one's extent on every dimension other than
    ``active`` (the dimension along which the arrays will be combined).

    Args:
        x: Sequence of objects exposing a ``shape`` attribute.
        active: Index of the dimension being combined; its extent is
            allowed to differ between arrays.

    Returns:
        True if all arrays are compatible.

    Raises:
        ValueError: If an array has a different number of dimensions, or a
            different extent on any dimension other than ``active``.
    """
    first = x[0].shape
    for i, arr in enumerate(x[1:], start=1):
        current = arr.shape
        if len(first) != len(current):
            raise ValueError(
                f"inconsistent dimensions for combining arrays (expected {len(first)}, "
                f"got {len(current)} for array {i})"
            )
        for j, (expected, observed) in enumerate(zip(first, current)):
            if j != active and expected != observed:
                # Report the extents of the dimension that actually mismatched,
                # not those of the dimension being combined.
                raise ValueError(
                    f"inconsistent dimension extents for combining arrays on dimension {active} "
                    f"(expected {expected}, got {observed} on dimension {j} for array {i})"
                )
    return True


def _coerce_sparse_matrix(first, combined, module):
    """Convert ``combined`` to the sparse matrix format of ``first``.

    Args:
        first: Object whose class decides the target format.
        combined: Object exposing the ``to<format>()`` conversion methods.
        module: Namespace providing the ``*_matrix`` classes to test
            ``first`` against (the callers pass ``scipy.sparse``).

    Returns:
        ``combined`` converted to the first matching format, or ``combined``
        unchanged when ``first`` matches none of the known classes.
    """
    # Checked in order; the first class that ``first`` is an instance of wins.
    format_table = (
        ("csr_matrix", "tocsr"),
        ("csc_matrix", "tocsc"),
        ("bsr_matrix", "tobsr"),
        ("coo_matrix", "tocoo"),
        ("dia_matrix", "todia"),
        ("lil_matrix", "tolil"),
    )
    for class_name, converter_name in format_table:
        if isinstance(first, getattr(module, class_name)):
            return getattr(combined, converter_name)()
    return combined


def _coerce_sparse_array(first, combined, module):
    """Convert ``combined`` to the sparse array format of ``first``.

    Args:
        first: Object whose class decides the target format.
        combined: Object exposing the ``to<format>()`` conversion methods.
        module: Namespace providing the ``*_array`` classes to test
            ``first`` against (the callers pass ``scipy.sparse``).

    Returns:
        ``combined`` converted to the first matching format, or ``combined``
        unchanged when ``first`` matches none of the known classes.
    """
    # Checked in order; the first class that ``first`` is an instance of wins.
    format_table = (
        ("csr_array", "tocsr"),
        ("csc_array", "tocsc"),
        ("bsr_array", "tobsr"),
        ("coo_array", "tocoo"),
        ("dia_array", "todia"),
        ("lil_array", "tolil"),
    )
    for class_name, converter_name in format_table:
        if isinstance(first, getattr(module, class_name)):
            return getattr(combined, converter_name)()
    return combined
24 changes: 24 additions & 0 deletions src/biocutils/combine.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
from typing import Any

from .combine_rows import combine_rows
from .combine_sequences import combine_sequences


def combine(*x: Any):
    """Combine objects, choosing the strategy from the first object.

    If the first element of ``x`` has a ``shape`` attribute of length
    greater than 1, all objects are treated as n-dimensional and passed to
    :py:func:`~biocutils.combine_rows.combine_rows`, which combines them
    along the first dimension. Otherwise they are assumed to be vector-like
    and passed to
    :py:func:`~biocutils.combine_sequences.combine_sequences`.

    Args:
        x: Objects to combine.

    Returns:
        A combined object, typically the same type as the first element in ``x``.
    """
    leader = x[0]
    is_multidimensional = hasattr(leader, "shape") and len(leader.shape) > 1
    combiner = combine_rows if is_multidimensional else combine_sequences
    return combiner(*x)
82 changes: 82 additions & 0 deletions src/biocutils/combine_columns.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
from functools import singledispatch
from typing import Any
from warnings import warn
import numpy

from ._utils_combine import _check_array_dimensions, _coerce_sparse_matrix, _coerce_sparse_array
from .is_list_of_type import is_list_of_type
from .package_utils import is_package_installed
from .convert_to_dense import convert_to_dense

__author__ = "jkanche"
__copyright__ = "jkanche"
__license__ = "MIT"


@singledispatch
def combine_columns(*x: Any):
    """Combine n-dimensional objects along the second dimension.

    This is a generic function that dispatches on the type of the first
    element of ``x``. Methods are registered for:

    - :py:class:`~numpy.ndarray`, combined with numpy's
      :py:func:`~numpy.concatenate`.
    - scipy's :py:class:`~scipy.sparse.spmatrix`, combined with
      :py:func:`~scipy.sparse.hstack` (only when scipy is installed).
    - :py:class:`~pandas.DataFrame`, combined with
      :py:func:`~pandas.concat` along the second axis (only when pandas
      is installed).

    Args:
        x:
            n-dimensional objects to combine. All elements of x are expected
            to be the same class.

    Returns:
        Combined object, typically the same type as the first entry of ``x``.

    Raises:
        NotImplementedError: If no method is registered for the type of the
            first element of ``x``.
    """
    unsupported = type(x[0]).__name__
    raise NotImplementedError("no `combine_columns` method implemented for '" + unsupported + "' objects")


@combine_columns.register(numpy.ndarray)
def _combine_columns_dense_arrays(*x: numpy.ndarray):
    """Combine arrays along the second dimension when the first is a NumPy array.

    The dimensions are validated first (all extents except the second must
    agree), then every element is passed through ``convert_to_dense`` and
    the results are joined with ``numpy.concatenate`` on axis 1.
    """
    _check_array_dimensions(x, 1)
    dense_parts = [convert_to_dense(part) for part in x]
    return numpy.concatenate(dense_parts, axis=1)


if is_package_installed("scipy") is True:
    import scipy.sparse as sp

    def _combine_columns_sparse_matrices(*x):
        """Combine along the second dimension when the first element is a sparse matrix.

        If every element is a ``scipy.sparse.spmatrix``, they are stacked with
        ``scipy.sparse.hstack`` and the result is converted to the format of
        the first element. Otherwise a warning is emitted and all elements are
        converted to dense arrays and joined with ``numpy.concatenate``.
        """
        _check_array_dimensions(x, 1)
        if is_list_of_type(x, sp.spmatrix):
            combined = sp.hstack(x)
            return _coerce_sparse_matrix(x[0], combined, sp)

        warn("not all elements are scipy sparse matrices")
        x = [convert_to_dense(y) for y in x]
        return numpy.concatenate(x, axis=1)

    def _combine_columns_sparse_arrays(*x):
        """Combine along the second dimension when the first element is a sparse array.

        If every element is a ``scipy.sparse.sparray``, they are stacked with
        ``scipy.sparse.hstack`` and the result is converted to the format of
        the first element. Otherwise a warning is emitted and all elements are
        converted to dense arrays and joined with ``numpy.concatenate``.
        """
        _check_array_dimensions(x, 1)
        if is_list_of_type(x, sp.sparray):
            combined = sp.hstack(x)
            return _coerce_sparse_array(x[0], combined, sp)

        warn("not all elements are scipy sparse arrays")
        x = [convert_to_dense(y) for y in x]
        return numpy.concatenate(x, axis=1)

    # Register each handler only when the installed scipy exposes the base
    # class. This replaces a blanket ``except Exception: pass`` that would
    # have hidden any other registration failure.
    if hasattr(sp, "spmatrix"):
        combine_columns.register(sp.spmatrix, _combine_columns_sparse_matrices)

    # The sparse-array handler was previously defined but never registered,
    # so sparse arrays fell through to the NotImplementedError default.
    if hasattr(sp, "sparray"):
        combine_columns.register(sp.sparray, _combine_columns_sparse_arrays)


if is_package_installed("pandas") is True:
    from pandas import DataFrame, concat

    def _combine_columns_pandas_dataframe(*x):
        """Combine along the second axis when the first element is a DataFrame.

        All elements are handed to ``pandas.concat`` with ``axis=1``.
        """
        return concat(x, axis=1)

    # Explicit registration; equivalent to decorating the function above.
    combine_columns.register(DataFrame, _combine_columns_pandas_dataframe)
87 changes: 87 additions & 0 deletions src/biocutils/combine_rows.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
from functools import singledispatch
from typing import Any
from warnings import warn
import numpy

from ._utils_combine import _check_array_dimensions, _coerce_sparse_matrix, _coerce_sparse_array
from .is_list_of_type import is_list_of_type
from .package_utils import is_package_installed
from .convert_to_dense import convert_to_dense

__author__ = "jkanche"
__copyright__ = "jkanche"
__license__ = "MIT"


@singledispatch
def combine_rows(*x: Any):
    """Combine n-dimensional objects along their first dimension.

    This is a generic function that dispatches on the type of the first
    element of ``x``. Methods are provided for:

    - :py:class:`~numpy.ndarray`, combined with numpy's
      :py:func:`~numpy.concatenate`.
    - scipy's :py:class:`~scipy.sparse.spmatrix` and
      :py:class:`~scipy.sparse.sparray`, combined with
      :py:func:`~scipy.sparse.vstack` (only when scipy is installed).
    - :py:class:`~pandas.DataFrame`, combined with
      :py:func:`~pandas.concat` along the first axis (only when pandas is
      installed).

    Args:
        x:
            One or more n-dimensional objects to combine. All elements of x
            are expected to be the same class.

    Returns:
        Combined object, typically the same type as the first entry of ``x``.

    Raises:
        NotImplementedError: If no method is registered for the type of the
            first element of ``x``.
    """
    unsupported = type(x[0]).__name__
    raise NotImplementedError("no `combine_rows` method implemented for '" + unsupported + "' objects")


@combine_rows.register(numpy.ndarray)
def _combine_rows_dense_arrays(*x: numpy.ndarray):
    """Combine arrays along the first dimension when the first is a NumPy array.

    The dimensions are validated first (all extents except the first must
    agree), then every element is passed through ``convert_to_dense`` and
    the results are joined with ``numpy.concatenate`` on axis 0.
    """
    _check_array_dimensions(x, 0)
    dense_parts = [convert_to_dense(part) for part in x]
    return numpy.concatenate(dense_parts, axis=0)


if is_package_installed("scipy"):
    import scipy.sparse as sp

    def _combine_rows_sparse_matrices(*x):
        """Combine along the first dimension when the first element is a sparse matrix.

        If every element is a ``scipy.sparse.spmatrix``, they are stacked with
        ``scipy.sparse.vstack`` and the result is converted to the format of
        the first element. Otherwise a warning is emitted and all elements are
        converted to dense arrays and joined with ``numpy.concatenate``.
        """
        _check_array_dimensions(x, 0)
        if is_list_of_type(x, sp.spmatrix):
            combined = sp.vstack(x)
            return _coerce_sparse_matrix(x[0], combined, sp)

        warn("not all elements are SciPy sparse matrices")
        x = [convert_to_dense(y) for y in x]
        return numpy.concatenate(x)

    def _combine_rows_sparse_arrays(*x):
        """Combine along the first dimension when the first element is a sparse array.

        If every element is a ``scipy.sparse.sparray``, they are stacked with
        ``scipy.sparse.vstack`` and the result is converted to the format of
        the first element. Otherwise a warning is emitted and all elements are
        converted to dense arrays and joined with ``numpy.concatenate``.
        """
        _check_array_dimensions(x, 0)
        if is_list_of_type(x, sp.sparray):
            combined = sp.vstack(x)
            # Use the first input as the format template; the previous code
            # referenced an undefined name here.
            return _coerce_sparse_array(x[0], combined, sp)

        warn("not all elements are SciPy sparse arrays")
        x = [convert_to_dense(y) for y in x]
        return numpy.concatenate(x)

    # Register only after both handlers exist. Previously the sparse-array
    # handler was registered before its definition, and the resulting
    # NameError was swallowed, so it was never registered at all.
    # The guards skip a base class that the installed scipy does not expose.
    if hasattr(sp, "spmatrix"):
        combine_rows.register(sp.spmatrix, _combine_rows_sparse_matrices)

    if hasattr(sp, "sparray"):
        combine_rows.register(sp.sparray, _combine_rows_sparse_arrays)


if is_package_installed("pandas"):
    from pandas import DataFrame, concat

    def _combine_rows_pandas_dataframe(*x):
        """Combine along the first axis when the first element is a DataFrame.

        All elements are handed to ``pandas.concat`` with ``axis=0``.
        """
        return concat(x, axis=0)

    # Explicit registration; equivalent to decorating the function above.
    combine_rows.register(DataFrame, _combine_rows_pandas_dataframe)
63 changes: 63 additions & 0 deletions src/biocutils/combine_sequences.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
from functools import singledispatch
from itertools import chain
from typing import Any
from warnings import warn
import numpy

from .is_list_of_type import is_list_of_type
from .package_utils import is_package_installed

__author__ = "jkanche"
__copyright__ = "jkanche"
__license__ = "MIT"


@singledispatch
def combine_sequences(*x: Any):
    """Combine vector-like objects (1-dimensional arrays).

    This is a generic function that dispatches on the type of the first
    element of ``x``. Methods are provided for:

    - :py:class:`~list`, where the elements are chained into a single list.
    - :py:class:`~numpy.ndarray`, combined with numpy's
      :py:func:`~numpy.concatenate`.
    - :py:class:`~pandas.Series`, combined with :py:func:`~pandas.concat`
      (only when pandas is installed).

    Args:
        x:
            Vector-like objects to combine.
            All elements of ``x`` are expected to be the same class or
            at least compatible with each other.

    Returns:
        A combined object, ideally of the same type as the first element in ``x``.

    Raises:
        NotImplementedError: If no method is registered for the type of the
            first element of ``x``.
    """
    unsupported = type(x[0]).__name__
    raise NotImplementedError("no `combine_sequences` method implemented for '" + unsupported + "' objects")


@combine_sequences.register(list)
def _combine_sequences_lists(*x: list):
    """Flatten the given iterables, in order, into one new list.

    Selected when the first element of ``x`` is a list.
    """
    return [item for sequence in x for item in sequence]


@combine_sequences.register(numpy.ndarray)
def _combine_sequences_dense_arrays(*x: numpy.ndarray):
    """Join the inputs with ``numpy.concatenate`` using ``axis=None``.

    Selected when the first element of ``x`` is a NumPy array. With
    ``axis=None`` NumPy flattens each input before joining.
    """
    flattened = numpy.concatenate(x, axis=None)
    return flattened


if is_package_installed("pandas") is True:
    from pandas import Series, concat

    @combine_sequences.register(Series)
    def _combine_sequences_pandas_series(*x):
        """Concatenate the inputs with ``pandas.concat``.

        Selected when the first element of ``x`` is a Series. Any element
        that is not already a Series is wrapped in one before concatenation.
        """
        if is_list_of_type(x, Series):
            return concat(x)

        # Mixed input: promote the non-Series elements, keep the rest as-is.
        promoted = [item if isinstance(item, Series) else Series(item) for item in x]
        return concat(promoted)
44 changes: 44 additions & 0 deletions src/biocutils/convert_to_dense.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
from functools import singledispatch
from typing import Any
import numpy

from .package_utils import is_package_installed


@singledispatch
def convert_to_dense(x: Any) -> numpy.ndarray:
    """
    Convert something to a NumPy dense array of the same shape.
    This is typically used as a fallback for the various combining
    methods when there are lots of different array types that
    ``numpy.concatenate`` doesn't understand.

    This default implementation passes ``x`` to ``numpy.array``;
    other types may register their own conversion.

    Args:
        x: Some array-like object to be stored as a NumPy array.

    Returns:
        A NumPy array.
    """
    dense = numpy.array(x)
    return dense


@convert_to_dense.register(numpy.ndarray)
def _convert_to_dense_numpy(x: numpy.ndarray) -> numpy.ndarray:
    """Return a NumPy array unchanged; it is already dense, so no copy is made."""
    return x


if is_package_installed("scipy"):
    import scipy.sparse as sp

    def _convert_sparse_to_dense(x):
        """Densify a scipy sparse object by calling its ``todense()`` method."""
        return x.todense()

    # Register the handler for whichever sparse base classes the installed
    # scipy exposes. The attribute guards replace a blanket
    # ``except Exception: pass`` that would have hidden any other
    # registration failure.
    if hasattr(sp, "spmatrix"):
        convert_to_dense.register(sp.spmatrix, _convert_sparse_to_dense)

    if hasattr(sp, "sparray"):
        convert_to_dense.register(sp.sparray, _convert_sparse_to_dense)
Loading
Loading