From d828f380023ad7c97292a65d8745685c3e046bca Mon Sep 17 00:00:00 2001 From: Aaron Lun Date: Tue, 7 Nov 2023 16:40:46 -0800 Subject: [PATCH] Migrate the combining generics from biocgenerics to biocutils. (#8) This aims to consolidate all generics into a single package, rather than scattering the various functions between here and biocgenerics. --- setup.cfg | 1 + src/biocutils/__init__.py | 7 +++ src/biocutils/_utils_combine.py | 43 +++++++++++++++ src/biocutils/combine.py | 24 +++++++++ src/biocutils/combine_columns.py | 82 ++++++++++++++++++++++++++++ src/biocutils/combine_rows.py | 87 ++++++++++++++++++++++++++++++ src/biocutils/combine_sequences.py | 63 ++++++++++++++++++++++ src/biocutils/convert_to_dense.py | 44 +++++++++++++++ tests/test_combine.py | 38 +++++++++++++ tests/test_combine_columns.py | 65 ++++++++++++++++++++++ tests/test_combine_rows.py | 64 ++++++++++++++++++++++ tests/test_combine_sequences.py | 75 ++++++++++++++++++++++++++ tests/test_package_utils.py | 16 ++---- 13 files changed, 597 insertions(+), 12 deletions(-) create mode 100644 src/biocutils/_utils_combine.py create mode 100644 src/biocutils/combine.py create mode 100644 src/biocutils/combine_columns.py create mode 100644 src/biocutils/combine_rows.py create mode 100644 src/biocutils/combine_sequences.py create mode 100644 src/biocutils/convert_to_dense.py create mode 100644 tests/test_combine.py create mode 100644 tests/test_combine_columns.py create mode 100644 tests/test_combine_rows.py create mode 100644 tests/test_combine_sequences.py diff --git a/setup.cfg b/setup.cfg index 26ce5e1..6025eba 100644 --- a/setup.cfg +++ b/setup.cfg @@ -68,6 +68,7 @@ testing = pytest pytest-cov pandas + scipy [options.entry_points] # Add here console scripts like: diff --git a/src/biocutils/__init__.py b/src/biocutils/__init__.py index 28c74c6..783b530 100644 --- a/src/biocutils/__init__.py +++ b/src/biocutils/__init__.py @@ -27,3 +27,10 @@ from .print_wrapped_table import create_floating_names, 
def _check_array_dimensions(x, active: int) -> None:
    """Check that a sequence of arrays can be combined along ``active``.

    Every entry of ``x`` is compared against the first entry: it must have the
    same number of dimensions, and the same extent on every dimension other
    than ``active``.

    Args:
        x: Sequence of objects that expose a ``shape`` attribute.
        active: Index of the dimension being combined. Extents on this
            dimension are allowed to differ and are not compared.

    Raises:
        ValueError: If an entry differs from the first one in its number of
            dimensions, or in its extent on any non-active dimension.
    """
    first = x[0].shape
    for i in range(1, len(x)):
        current = x[i].shape
        if len(first) != len(current):
            raise ValueError(
                f"inconsistent dimensions for combining arrays (expected {len(first)}, "
                f"got {len(current)} for array {i})"
            )
        for j, (expected, got) in enumerate(zip(first, current)):
            if j != active and expected != got:
                # Report the dimension that actually mismatched; the extents of
                # the active dimension are irrelevant (and may even be equal).
                raise ValueError(
                    "inconsistent dimension extents for combining arrays on dimension "
                    f"{active} (mismatch on dimension {j}: expected {expected}, "
                    f"got {got} for array {i})"
                )


def _coerce_sparse_matrix(first, combined, module):
    """Convert ``combined`` to the sparse-matrix format used by ``first``.

    Args:
        first: Object whose sparse format should be matched.
        combined: Sparse object providing ``tocsr()``, ``tocsc()``, etc.
        module: Module exposing the ``*_matrix`` classes (callers in this
            package pass ``scipy.sparse``).

    Returns:
        ``combined`` converted to the format of ``first`` if ``first`` is a
        CSR, CSC, BSR, COO, DIA or LIL matrix; otherwise ``combined`` itself.
    """
    # Same order of checks as an explicit if/elif ladder over the formats.
    for fmt in ("csr", "csc", "bsr", "coo", "dia", "lil"):
        if isinstance(first, getattr(module, fmt + "_matrix")):
            return getattr(combined, "to" + fmt)()
    return combined
def combine(*x: Any):
    """Combine objects, choosing the method from the first object.

    If the first object is n-dimensional for n > 1 (i.e. it has a ``shape``
    attribute of length greater than 1), the objects are combined along the
    first dimension by :py:func:`~biocutils.combine_rows.combine_rows`.
    Otherwise they are assumed to be vector-like and are combined by
    :py:func:`~biocutils.combine_sequences.combine_sequences`.

    Args:
        x: Objects to combine. Only the first one is inspected to decide
            which combining function is used.

    Returns:
        A combined object, typically the same type as the first element in ``x``.
    """
    first = x[0]
    if hasattr(first, "shape") and len(first.shape) > 1:
        return combine_rows(*x)
    return combine_sequences(*x)
@combine_columns.register(numpy.ndarray)
def _combine_columns_dense_arrays(*x: numpy.ndarray):
    """Combine arrays along the second dimension with ``numpy.concatenate``.

    Dimensions are validated first, then every element is passed through
    ``convert_to_dense`` before concatenation.
    """
    _check_array_dimensions(x, active=1)
    x = [convert_to_dense(y) for y in x]
    return numpy.concatenate(x, axis=1)


if is_package_installed("scipy") is True:
    import scipy.sparse as sp

    def _combine_columns_sparse_matrices(*x):
        """Combine along the second dimension when the first element is a
        SciPy sparse matrix.

        If every element is a sparse matrix the result of ``hstack`` is
        coerced to the format of the first element; otherwise a warning is
        issued and the elements are densified and concatenated.
        """
        _check_array_dimensions(x, 1)
        if is_list_of_type(x, sp.spmatrix):
            combined = sp.hstack(x)
            return _coerce_sparse_matrix(x[0], combined, sp)

        warn("not all elements are scipy sparse matrices")
        x = [convert_to_dense(y) for y in x]
        return numpy.concatenate(x, axis=1)

    try:
        combine_columns.register(sp.spmatrix, _combine_columns_sparse_matrices)
    except AttributeError:
        # The installed SciPy does not expose this base class; skip it.
        pass

    def _combine_columns_sparse_arrays(*x):
        """Combine along the second dimension when the first element is a
        SciPy sparse array.

        If every element is a sparse array the result of ``hstack`` is
        coerced to the format of the first element; otherwise a warning is
        issued and the elements are densified and concatenated.
        """
        _check_array_dimensions(x, 1)
        if is_list_of_type(x, sp.sparray):
            combined = sp.hstack(x)
            return _coerce_sparse_array(x[0], combined, sp)

        warn("not all elements are scipy sparse arrays")
        x = [convert_to_dense(y) for y in x]
        return numpy.concatenate(x, axis=1)

    # This handler must be registered too, otherwise sparse arrays fall
    # through to the generic and raise NotImplementedError.
    try:
        combine_columns.register(sp.sparray, _combine_columns_sparse_arrays)
    except AttributeError:
        # Older SciPy releases may not define ``sparray``; skip it there.
        pass


if is_package_installed("pandas") is True:
    from pandas import DataFrame, concat

    @combine_columns.register(DataFrame)
    def _combine_columns_pandas_dataframe(*x):
        """Combine data frames along the second axis with ``pandas.concat``."""
        return concat(x, axis=1)
@singledispatch
def combine_rows(*x: Any):
    """Combine n-dimensional objects along their first dimension.

    If all elements are :py:class:`~numpy.ndarray`, we combine them using
    numpy's :py:func:`~numpy.concatenate`.

    If all elements are either :py:class:`~scipy.sparse.spmatrix` or
    :py:class:`~scipy.sparse.sparray`, these objects are combined using scipy's
    :py:func:`~scipy.sparse.vstack`.

    If all elements are :py:class:`~pandas.DataFrame` objects, they are
    combined using :py:func:`~pandas.concat` along the first axis.

    Args:
        x:
            One or more n-dimensional objects to combine. All elements of x
            are expected to be the same class.

    Returns:
        Combined object, typically the same type as the first entry of ``x``.

    Raises:
        NotImplementedError: If no method is registered for the type of the
            first element.
    """
    raise NotImplementedError("no `combine_rows` method implemented for '" + type(x[0]).__name__ + "' objects")


@combine_rows.register(numpy.ndarray)
def _combine_rows_dense_arrays(*x: numpy.ndarray):
    """Combine arrays along the first dimension with ``numpy.concatenate``.

    Dimensions are validated first, then every element is passed through
    ``convert_to_dense`` before concatenation.
    """
    _check_array_dimensions(x, active=0)
    x = [convert_to_dense(y) for y in x]
    return numpy.concatenate(x)


if is_package_installed("scipy"):
    import scipy.sparse as sp

    def _combine_rows_sparse_matrices(*x):
        """Combine along the first dimension when the first element is a
        SciPy sparse matrix.

        If every element is a sparse matrix the result of ``vstack`` is
        coerced to the format of the first element; otherwise a warning is
        issued and the elements are densified and concatenated.
        """
        _check_array_dimensions(x, 0)
        if is_list_of_type(x, sp.spmatrix):
            combined = sp.vstack(x)
            return _coerce_sparse_matrix(x[0], combined, sp)

        warn("not all elements are SciPy sparse matrices")
        x = [convert_to_dense(y) for y in x]
        return numpy.concatenate(x)

    # Register each handler only after it has been defined; registering a
    # not-yet-defined name raises NameError, which would silently disable it.
    try:
        combine_rows.register(sp.spmatrix, _combine_rows_sparse_matrices)
    except AttributeError:
        # The installed SciPy does not expose this base class; skip it.
        pass

    def _combine_rows_sparse_arrays(*x):
        """Combine along the first dimension when the first element is a
        SciPy sparse array.

        If every element is a sparse array the result of ``vstack`` is
        coerced to the format of the first element; otherwise a warning is
        issued and the elements are densified and concatenated.
        """
        _check_array_dimensions(x, 0)
        if is_list_of_type(x, sp.sparray):
            combined = sp.vstack(x)
            return _coerce_sparse_array(x[0], combined, sp)

        warn("not all elements are SciPy sparse arrays")
        x = [convert_to_dense(y) for y in x]
        return numpy.concatenate(x)

    try:
        combine_rows.register(sp.sparray, _combine_rows_sparse_arrays)
    except AttributeError:
        # Older SciPy releases may not define ``sparray``; skip it there.
        pass


if is_package_installed("pandas"):
    from pandas import DataFrame, concat

    @combine_rows.register(DataFrame)
    def _combine_rows_pandas_dataframe(*x):
        """Combine data frames along the first axis with ``pandas.concat``."""
        return concat(x, axis=0)
@singledispatch
def convert_to_dense(x: Any) -> numpy.ndarray:
    """
    Convert something to a NumPy dense array of the same shape.
    This is typically used as a fallback for the various combining
    methods when there are lots of different array types that
    ``numpy.concatenate`` doesn't understand.

    Args:
        x: Some array-like object to be stored as a NumPy array.

    Returns:
        A NumPy array.
    """
    return numpy.array(x)


@convert_to_dense.register(numpy.ndarray)
def _convert_to_dense_numpy(x: numpy.ndarray) -> numpy.ndarray:
    """Return ``x`` unchanged; it is already a NumPy array (no copy is made)."""
    return x


if is_package_installed("scipy"):
    import scipy.sparse as sp

    def _convert_sparse_to_dense(x):
        """Densify a SciPy sparse object into a ``numpy.ndarray``."""
        # ``toarray()`` is used rather than ``todense()`` because the latter
        # returns a ``numpy.matrix`` for sparse matrices.
        return x.toarray()

    try:
        convert_to_dense.register(sp.spmatrix, _convert_sparse_to_dense)
    except AttributeError:
        # The installed SciPy does not expose this base class; skip it.
        pass

    try:
        convert_to_dense.register(sp.sparray, _convert_sparse_to_dense)
    except AttributeError:
        # Older SciPy releases may not define ``sparray``; skip it there.
        pass
y) + + assert isinstance(z, np.ndarray) + assert z.shape == (20, 15) + + +def test_combine_columns_sparse(): + num_rows = 20 + + x = sp.random(num_rows, 10) + y = sp.identity(num_rows) + + z = combine_columns(x, y) + + assert isinstance(z, sp.spmatrix) + assert z.shape == (20, 30) + + +def test_combine_columns_mixed(): + num_rows = 20 + x = np.ones(shape=(num_rows, 10)) + y = sp.identity(num_rows) + + print(x, y) + z = combine_columns(x, y) + + assert isinstance(z, np.ndarray) + assert z.shape == (20, 30) + + +def test_pandas_dataframe(): + df1 = pd.DataFrame([["a", 1], ["b", 2]], columns=["letter", "number"]) + + df2 = pd.DataFrame( + [["c", 3, "cat"], ["d", 4, "dog"]], columns=["letter", "number", "animal"] + ) + + z = combine_columns(df1, df2) + assert isinstance(z, pd.DataFrame) + + +def test_combine_columns_ndim(): + num_rows = 20 + x = np.ones(shape=(num_rows, 10, 20)) + y = np.ones(shape=(num_rows, 20, 20)) + + z = combine_columns(x, y) + + assert isinstance(z, np.ndarray) + assert z.shape == (20, 30, 20) diff --git a/tests/test_combine_rows.py b/tests/test_combine_rows.py new file mode 100644 index 0000000..a25aa50 --- /dev/null +++ b/tests/test_combine_rows.py @@ -0,0 +1,64 @@ +import numpy as np +import pandas as pd +from biocutils import combine_rows +from scipy import sparse as sp + +__author__ = "jkanche" +__copyright__ = "jkanche" +__license__ = "MIT" + + +def test_combine_rows_dense(): + num_cols = 20 + x = np.ones(shape=(10, num_cols)) + y = np.random.rand(5, num_cols) + + z = combine_rows(x, y) + + assert isinstance(z, np.ndarray) + assert z.shape == (15, 20) + + +def test_combine_rows_sparse(): + num_cols = 20 + + x = sp.random(10, num_cols) + y = sp.identity(num_cols) + + z = combine_rows(x, y) + + assert isinstance(z, sp.spmatrix) + assert z.shape == (30, 20) + + +def test_combine_rows_mixed(): + num_cols = 20 + x = np.ones(shape=(10, num_cols)) + y = sp.identity(num_cols) + + z = combine_rows(x, y) + + assert isinstance(z, np.ndarray) + assert 
z.shape == (30, 20) + + +def test_pandas_dataframe(): + df1 = pd.DataFrame([["a", 1], ["b", 2]], columns=["letter", "number"]) + + df2 = pd.DataFrame( + [["c", 3, "cat"], ["d", 4, "dog"]], columns=["letter", "number", "animal"] + ) + + z = combine_rows(df1, df2) + assert isinstance(z, pd.DataFrame) + + +def test_combine_rows_ndim(): + num_cols = 20 + x = np.ones(shape=(20, num_cols, 20)) + y = np.ones(shape=(10, num_cols, num_cols)) + + z = combine_rows(x, y, y) + + assert isinstance(z, np.ndarray) + assert z.shape == (40, 20, 20) diff --git a/tests/test_combine_sequences.py b/tests/test_combine_sequences.py new file mode 100644 index 0000000..4e15ac5 --- /dev/null +++ b/tests/test_combine_sequences.py @@ -0,0 +1,75 @@ +import numpy as np +import pandas as pd +from biocutils import combine_sequences +from scipy import sparse as sp + +__author__ = "jkanche" +__copyright__ = "jkanche" +__license__ = "MIT" + + +def test_basic_list(): + x = [1, 2, "c"] + y = ["a", "b"] + + z = combine_sequences(x, y) + + assert z == x + y + assert isinstance(z, list) + assert len(z) == len(x) + len(y) + + +def test_basic_dense(): + x = [1, 2, 3] + y = [0.1, 0.2] + xd = np.array([1, 2, 3]) + yd = np.array([0.1, 0.2], dtype=float) + + zcomb = combine_sequences(xd, yd) + + z = x + y + zd = np.array(z) + + assert all(np.isclose(zcomb, zd)) is True + assert isinstance(zcomb, np.ndarray) + assert len(zcomb) == len(zd) + + +def test_basic_mixed_dense_list(): + x = [1, 2, 3] + y = [0.1, 0.2] + xd = np.array([1, 2, 3]) + + zcomb = combine_sequences(xd, y) + + z = x + y + assert (zcomb == z).all() + assert len(zcomb) == len(xd) + len(y) + + +def test_basic_mixed_tuple_list(): + x = [1, 2, 3] + y = (0.1, 0.2) + xd = np.array([1, 2, 3]) + + zcomb = combine_sequences(xd, y, x) + + z = x + list(y) + x + assert (zcomb == z).all() + assert len(zcomb) == 2 * len(xd) + len(y) + + +def test_pandas_series(): + s1 = pd.Series(["a", "b"]) + s2 = pd.Series(["c", "d"]) + + z = combine_sequences(s1, s2) + + 
assert isinstance(z, pd.Series) + assert len(z) == 4 + + x = ["gg", "ff"] + + z = combine_sequences(s1, x) + assert isinstance(z, pd.Series) + assert len(z) == 4 diff --git a/tests/test_package_utils.py b/tests/test_package_utils.py index 2e023a8..9b7c96e 100644 --- a/tests/test_package_utils.py +++ b/tests/test_package_utils.py @@ -4,15 +4,7 @@ __copyright__ = "jkanche" __license__ = "MIT" - - -def test_for_scipy(): - pkg = is_package_installed("scipy") - - assert pkg is False - - -def test_for_numpy(): - pkg = is_package_installed("numpy") - - assert pkg is True +def test_installed_package(): + assert is_package_installed("scipy") + assert is_package_installed("numpy") + assert not is_package_installed("some_random_package")