From d828f380023ad7c97292a65d8745685c3e046bca Mon Sep 17 00:00:00 2001
From: Aaron Lun <infinite.monkeys.with.keyboards@gmail.com>
Date: Tue, 7 Nov 2023 16:40:46 -0800
Subject: [PATCH] Migrate the combining generics from biocgenerics to
 biocutils.  (#8)

This aims to consolidate all generics into a single package, rather than
scattering the various functions between here and biocgenerics.
---
 setup.cfg                          |  1 +
 src/biocutils/__init__.py          |  7 +++
 src/biocutils/_utils_combine.py    | 43 +++++++++++++++
 src/biocutils/combine.py           | 24 +++++++++
 src/biocutils/combine_columns.py   | 82 ++++++++++++++++++++++++++++
 src/biocutils/combine_rows.py      | 87 ++++++++++++++++++++++++++++++
 src/biocutils/combine_sequences.py | 63 ++++++++++++++++++++++
 src/biocutils/convert_to_dense.py  | 44 +++++++++++++++
 tests/test_combine.py              | 38 +++++++++++++
 tests/test_combine_columns.py      | 65 ++++++++++++++++++++++
 tests/test_combine_rows.py         | 64 ++++++++++++++++++++++
 tests/test_combine_sequences.py    | 75 ++++++++++++++++++++++++++
 tests/test_package_utils.py        | 16 ++----
 13 files changed, 597 insertions(+), 12 deletions(-)
 create mode 100644 src/biocutils/_utils_combine.py
 create mode 100644 src/biocutils/combine.py
 create mode 100644 src/biocutils/combine_columns.py
 create mode 100644 src/biocutils/combine_rows.py
 create mode 100644 src/biocutils/combine_sequences.py
 create mode 100644 src/biocutils/convert_to_dense.py
 create mode 100644 tests/test_combine.py
 create mode 100644 tests/test_combine_columns.py
 create mode 100644 tests/test_combine_rows.py
 create mode 100644 tests/test_combine_sequences.py

diff --git a/setup.cfg b/setup.cfg
index 26ce5e1..6025eba 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -68,6 +68,7 @@ testing =
     pytest
     pytest-cov
     pandas
+    scipy
 
 [options.entry_points]
 # Add here console scripts like:
diff --git a/src/biocutils/__init__.py b/src/biocutils/__init__.py
index 28c74c6..783b530 100644
--- a/src/biocutils/__init__.py
+++ b/src/biocutils/__init__.py
@@ -27,3 +27,10 @@
 from .print_wrapped_table import create_floating_names, print_type, print_wrapped_table, truncate_strings
 from .subset import subset
 from .union import union
+
+from .combine import combine
+from .combine_rows import combine_rows
+from .combine_columns import combine_columns
+from .combine_sequences import combine_sequences
+
+from .convert_to_dense import convert_to_dense
diff --git a/src/biocutils/_utils_combine.py b/src/biocutils/_utils_combine.py
new file mode 100644
index 0000000..ba8e513
--- /dev/null
+++ b/src/biocutils/_utils_combine.py
@@ -0,0 +1,43 @@
+def _check_array_dimensions(x, active: int) -> None:
+    """Raise ``ValueError`` unless every array in ``x`` has the same rank as ``x[0]`` and the same extent on all dimensions other than ``active``."""
+    first = x[0].shape
+    for i, current in enumerate((y.shape for y in x[1:]), start=1):
+        if len(first) != len(current):
+            raise ValueError("inconsistent dimensions for combining arrays (expected " + str(len(first)) + ", got " + str(len(current)) + " for array " + str(i) + ")")
+        for j in range(len(first)):
+            if j != active and first[j] != current[j]:  # name the dimension that differs, not ``active``
+                raise ValueError("inconsistent dimension extents for combining arrays on dimension " + str(j) + " (expected " + str(first[j]) + ", got " + str(current[j]) + " for array " + str(i) + ")")
+
+
+def _coerce_sparse_matrix(first, combined, module):
+    """Return ``combined`` in the sparse matrix format of ``first``, or unchanged if that format is not recognized."""
+    # Pairs of (class attribute on ``module``, conversion method on ``combined``), tried in order.
+    conversions = (
+        ("csr_matrix", "tocsr"),
+        ("csc_matrix", "tocsc"),
+        ("bsr_matrix", "tobsr"),
+        ("coo_matrix", "tocoo"),
+        ("dia_matrix", "todia"),
+        ("lil_matrix", "tolil"),
+    )
+    for class_name, method_name in conversions:
+        if isinstance(first, getattr(module, class_name)):
+            return getattr(combined, method_name)()
+    return combined
+
+
+def _coerce_sparse_array(first, combined, module):
+    """Return ``combined`` in the sparse array format of ``first``, or unchanged if that format is not recognized."""
+    # Pairs of (class attribute on ``module``, conversion method on ``combined``), tried in order.
+    conversions = (
+        ("csr_array", "tocsr"),
+        ("csc_array", "tocsc"),
+        ("bsr_array", "tobsr"),
+        ("coo_array", "tocoo"),
+        ("dia_array", "todia"),
+        ("lil_array", "tolil"),
+    )
+    for class_name, method_name in conversions:
+        if isinstance(first, getattr(module, class_name)):
+            return getattr(combined, method_name)()
+    return combined
diff --git a/src/biocutils/combine.py b/src/biocutils/combine.py
new file mode 100644
index 0000000..8f2f5fb
--- /dev/null
+++ b/src/biocutils/combine.py
@@ -0,0 +1,24 @@
+from typing import Any
+
+from .combine_rows import combine_rows
+from .combine_sequences import combine_sequences
+
+
+def combine(*x: Any):
+    """
+    Generic combine that inspects only the first object: if it is n-dimensional
+    for n > 1 (i.e. has a ``shape`` property of length greater than 1), it calls
+    :py:func:`~biocutils.combine_rows.combine_rows` to combine by the first
+    dimension; otherwise it assumes the objects are vector-like and calls
+    :py:func:`~biocutils.combine_sequences.combine_sequences` instead.
+
+    Args:
+        x: Objects to combine.
+
+    Returns:
+        A combined object, typically the same type as the first element in ``x``.
+    """
+    if hasattr(x[0], "shape") and len(x[0].shape) > 1:
+        return combine_rows(*x)
+    else:
+        return combine_sequences(*x)
diff --git a/src/biocutils/combine_columns.py b/src/biocutils/combine_columns.py
new file mode 100644
index 0000000..8605fcc
--- /dev/null
+++ b/src/biocutils/combine_columns.py
@@ -0,0 +1,82 @@
+from functools import singledispatch
+from typing import Any
+from warnings import warn
+import numpy
+
+from ._utils_combine import _check_array_dimensions, _coerce_sparse_matrix, _coerce_sparse_array
+from .is_list_of_type import is_list_of_type
+from .package_utils import is_package_installed
+from .convert_to_dense import convert_to_dense
+
+__author__ = "jkanche"
+__copyright__ = "jkanche"
+__license__ = "MIT"
+
+
+@singledispatch
+def combine_columns(*x: Any):
+    """Combine n-dimensional objects along their second dimension.
+
+    The implementation is selected from the type of the first element of ``x``.
+    :py:class:`~numpy.ndarray` objects are joined with numpy's
+    :py:func:`~numpy.concatenate`.
+
+    SciPy sparse objects such as :py:class:`~scipy.sparse.spmatrix` are joined
+    with scipy's :py:func:`~scipy.sparse.hstack`.
+
+    :py:class:`~pandas.DataFrame` objects are joined with
+    :py:func:`~pandas.concat` along the second axis.
+
+    Args:
+        x:
+            n-dimensional objects to combine. All elements of x are expected
+            to be the same class.
+
+    Returns:
+        Combined object, typically the same type as the first entry of ``x``.
+    """
+    raise NotImplementedError(f"no `combine_columns` method implemented for '{type(x[0]).__name__}' objects")
+
+
+@combine_columns.register(numpy.ndarray)
+def _combine_columns_dense_arrays(*x: numpy.ndarray):
+    """Check dimensions, densify every element and join them along the second axis."""
+    _check_array_dimensions(x, active=1)
+    return numpy.concatenate([convert_to_dense(y) for y in x], axis=1)
+
+
+if is_package_installed("scipy") is True:
+    import scipy.sparse as sp
+
+    def _combine_columns_sparse_matrices(*x):  # hstack if all are sparse matrices, else densify
+        _check_array_dimensions(x, 1)
+        if is_list_of_type(x, sp.spmatrix):
+            return _coerce_sparse_matrix(x[0], sp.hstack(x), sp)
+        warn("not all elements are scipy sparse matrices")
+        return numpy.concatenate([convert_to_dense(y) for y in x], axis=1)
+
+    try:
+        combine_columns.register(sp.spmatrix, _combine_columns_sparse_matrices)
+    except AttributeError:
+        pass
+
+    def _combine_columns_sparse_arrays(*x):  # hstack if all are sparse arrays, else densify
+        _check_array_dimensions(x, 1)
+        if is_list_of_type(x, sp.sparray):
+            return _coerce_sparse_array(x[0], sp.hstack(x), sp)
+        warn("not all elements are scipy sparse arrays")
+        return numpy.concatenate([convert_to_dense(y) for y in x], axis=1)
+
+    # Register the sparse array method too; otherwise the dispatcher can never reach it.
+    try:
+        combine_columns.register(sp.sparray, _combine_columns_sparse_arrays)
+    except AttributeError:  # SciPy releases that do not provide ``sparray``
+        pass
+
+
+if is_package_installed("pandas") is True:
+    from pandas import DataFrame, concat
+
+    @combine_columns.register(DataFrame)
+    def _combine_columns_pandas_dataframe(*frames):
+        return concat(frames, axis="columns")  # place the frames side by side
diff --git a/src/biocutils/combine_rows.py b/src/biocutils/combine_rows.py
new file mode 100644
index 0000000..160c598
--- /dev/null
+++ b/src/biocutils/combine_rows.py
@@ -0,0 +1,87 @@
+from functools import singledispatch
+from typing import Any
+from warnings import warn
+import numpy
+
+from ._utils_combine import _check_array_dimensions, _coerce_sparse_matrix, _coerce_sparse_array
+from .is_list_of_type import is_list_of_type
+from .package_utils import is_package_installed
+from .convert_to_dense import convert_to_dense
+
+__author__ = "jkanche"
+__copyright__ = "jkanche"
+__license__ = "MIT"
+
+
+@singledispatch
+def combine_rows(*x: Any):
+    """Combine n-dimensional objects along their first dimension.
+
+    The implementation is selected from the type of the first element of ``x``.
+    :py:class:`~numpy.ndarray` objects are joined with numpy's
+    :py:func:`~numpy.concatenate`.
+
+    SciPy sparse objects such as :py:class:`~scipy.sparse.spmatrix` are joined
+    with scipy's :py:func:`~scipy.sparse.vstack`.
+
+    :py:class:`~pandas.DataFrame` objects are joined with
+    :py:func:`~pandas.concat` along the first axis.
+
+    Args:
+        x:
+            One or more n-dimensional objects to combine. All elements of x
+            are expected to be the same class.
+
+    Returns:
+        Combined object, typically the same type as the first entry of ``x``.
+    """
+    raise NotImplementedError(f"no `combine_rows` method implemented for '{type(x[0]).__name__}' objects")
+
+
+@combine_rows.register(numpy.ndarray)
+def _combine_rows_dense_arrays(*x: numpy.ndarray):
+    """Check dimensions, densify every element and join them along the first axis."""
+    _check_array_dimensions(x, active=0)
+    return numpy.concatenate([convert_to_dense(y) for y in x], axis=0)
+
+
+if is_package_installed("scipy"):
+    import scipy.sparse as sp
+
+    def _combine_rows_sparse_matrices(*x):
+        """Stack sparse matrices by row; if other types are mixed in, warn and concatenate densified inputs."""
+        _check_array_dimensions(x, 0)
+        if is_list_of_type(x, sp.spmatrix):
+            return _coerce_sparse_matrix(x[0], sp.vstack(x), sp)
+
+        warn("not all elements are SciPy sparse matrices")
+        return numpy.concatenate([convert_to_dense(y) for y in x])
+
+    # Each method is registered only after its definition: registering an
+    # undefined name raises NameError, which a broad ``except`` would hide.
+    try:
+        combine_rows.register(sp.spmatrix, _combine_rows_sparse_matrices)
+    except AttributeError:
+        pass
+
+    def _combine_rows_sparse_arrays(*x):
+        """Stack sparse arrays by row; if other types are mixed in, warn and concatenate densified inputs."""
+        _check_array_dimensions(x, 0)
+        if is_list_of_type(x, sp.sparray):
+            return _coerce_sparse_array(x[0], sp.vstack(x), sp)
+
+        warn("not all elements are SciPy sparse arrays")
+        return numpy.concatenate([convert_to_dense(y) for y in x])
+
+    try:
+        combine_rows.register(sp.sparray, _combine_rows_sparse_arrays)
+    except AttributeError:  # SciPy releases that do not provide ``sparray``
+        pass
+
+
+if is_package_installed("pandas"):
+    from pandas import DataFrame, concat
+
+    @combine_rows.register(DataFrame)
+    def _combine_rows_pandas_dataframe(*frames):
+        return concat(frames, axis="index")  # stack the frames on top of each other
diff --git a/src/biocutils/combine_sequences.py b/src/biocutils/combine_sequences.py
new file mode 100644
index 0000000..c4ea345
--- /dev/null
+++ b/src/biocutils/combine_sequences.py
@@ -0,0 +1,63 @@
+from functools import singledispatch
+from itertools import chain
+from typing import Any
+from warnings import warn
+import numpy
+
+from .is_list_of_type import is_list_of_type
+from .package_utils import is_package_installed
+
+__author__ = "jkanche"
+__copyright__ = "jkanche"
+__license__ = "MIT"
+
+
+@singledispatch
+def combine_sequences(*x: Any):
+    """Combine vector-like objects (1-dimensional arrays).
+
+    The implementation is selected from the type of the first element of ``x``.
+    :py:class:`list` objects are chained into a new list,
+    :py:class:`~numpy.ndarray` objects are joined with numpy's
+    :py:func:`~numpy.concatenate`, and :py:class:`~pandas.Series` objects are
+    joined with :py:func:`~pandas.concat`.
+
+    Args:
+        x:
+            Vector-like objects to combine.
+            All elements of ``x`` are expected to be the same class or
+            at least compatible with each other.
+
+    Returns:
+        A combined object, ideally of the same type as the first element in ``x``.
+
+    Raises:
+        NotImplementedError: If no method is registered for the type of the first element.
+    """
+    raise NotImplementedError(f"no `combine_sequences` method implemented for '{type(x[0]).__name__}' objects")
+
+
+@combine_sequences.register(list)
+def _combine_sequences_lists(*x: list):
+    return list(chain.from_iterable(x))  # a new list holding the items of each input, in order
+
+
+@combine_sequences.register(numpy.ndarray)
+def _combine_sequences_dense_arrays(*x: numpy.ndarray):
+    return numpy.concatenate(x, axis=None)  # axis=None flattens each input before joining
+
+
+if is_package_installed("pandas") is True:
+    from pandas import Series, concat
+
+    @combine_sequences.register(Series)
+    def _combine_sequences_pandas_series(*x):
+        """Concatenate the inputs as Series.
+
+        Dispatch only guarantees that the first element is a Series, so any
+        other element that is not one is wrapped in a Series before the call
+        to :py:func:`~pandas.concat`.
+        """
+        if not is_list_of_type(x, Series):
+            x = [elem if isinstance(elem, Series) else Series(elem) for elem in x]
+        return concat(x)
diff --git a/src/biocutils/convert_to_dense.py b/src/biocutils/convert_to_dense.py
new file mode 100644
index 0000000..3144b6e
--- /dev/null
+++ b/src/biocutils/convert_to_dense.py
@@ -0,0 +1,44 @@
+from functools import singledispatch
+from typing import Any
+import numpy
+
+from .package_utils import is_package_installed
+
+
+@singledispatch
+def convert_to_dense(x: Any) -> numpy.ndarray:
+    """
+    Convert something to a NumPy dense array of the same shape.
+    This is typically used as a fallback for the various combining
+    methods when there are lots of different array types that
+    ``numpy.concatenate`` doesn't understand.
+
+    Args:
+        x: Some array-like object to be stored as a NumPy array.
+
+    Returns:
+        A NumPy array.
+    """
+    return numpy.array(x)
+
+
+@convert_to_dense.register(numpy.ndarray)
+def _convert_to_dense_numpy(x: numpy.ndarray) -> numpy.ndarray:
+    return x  # already dense: hand back the same object without copying
+
+
+if is_package_installed("scipy"):
+    import scipy.sparse as sp
+
+    def _convert_sparse_to_dense(x):
+        return x.toarray()  # plain ndarray; ``todense`` would give numpy.matrix for spmatrix
+
+    try:
+        convert_to_dense.register(sp.spmatrix, _convert_sparse_to_dense)
+    except AttributeError:
+        pass
+
+    try:
+        convert_to_dense.register(sp.sparray, _convert_sparse_to_dense)
+    except AttributeError:  # SciPy releases that do not provide ``sparray``
+        pass
diff --git a/tests/test_combine.py b/tests/test_combine.py
new file mode 100644
index 0000000..15e1ddf
--- /dev/null
+++ b/tests/test_combine.py
@@ -0,0 +1,38 @@
+import numpy as np
+import pandas as pd
+from biocutils import combine
+from scipy import sparse as sp
+import pytest
+
+__author__ = "jkanche"
+__copyright__ = "jkanche"
+__license__ = "MIT"
+
+
+def test_basic_list():
+    x = [1, 2, "c"]
+    y = ["a", "b"]
+    # Lists have no ``shape`` attribute, so ``combine`` uses ``combine_sequences``.
+    z = combine(x, y)
+    # Expect a list holding the items of ``x`` followed by the items of ``y``.
+    assert z == x + y
+    assert isinstance(z, list)
+    assert len(z) == len(x) + len(y)
+
+
+def test_basic_mixed_dense_list():
+    x = [1, 2, 3]
+    y = [0.1, 0.2]
+    xd = np.array(x)
+    zcomb = combine(xd, y)
+    # The combined values must match plain-list concatenation, element by element.
+    z = x + y
+    assert (zcomb == z).all()
+    assert len(zcomb) == len(xd) + len(y)
+
+
+def test_basic_mixed_dense_array():
+    x = np.array([1, 2, 3, 4]).reshape((2,2))
+    y = np.array([4, 5, 6, 7]).reshape((2,2))
+    zcomb = combine(x, y)  # both inputs are 2-D, so they are combined by row
+    assert zcomb.shape == (4, 2)
diff --git a/tests/test_combine_columns.py b/tests/test_combine_columns.py
new file mode 100644
index 0000000..24cea40
--- /dev/null
+++ b/tests/test_combine_columns.py
@@ -0,0 +1,65 @@
+import numpy as np
+import pandas as pd
+from biocutils import combine_columns
+from scipy import sparse as sp
+
+__author__ = "jkanche"
+__copyright__ = "jkanche"
+__license__ = "MIT"
+
+
+def test_combine_columns_dense():
+    num_rows = 20
+    x = np.ones(shape=(num_rows, 10))
+    y = np.random.rand(num_rows, 5)
+    # Both inputs share the row count; only the column counts differ.
+    z = combine_columns(x, y)
+    # Columns add up (10 + 5) while the row count is unchanged.
+    assert isinstance(z, np.ndarray)
+    assert z.shape == (20, 15)
+
+
+def test_combine_columns_sparse():
+    num_rows = 20
+    # Two SciPy sparse inputs with the same number of rows.
+    x = sp.random(num_rows, 10)
+    y = sp.identity(num_rows)
+    # Dispatch is decided by the type of the first argument.
+    z = combine_columns(x, y)
+    # The result must stay sparse and have 10 + 20 columns.
+    assert isinstance(z, sp.spmatrix)
+    assert z.shape == (20, 30)
+
+
+def test_combine_columns_mixed():
+    num_rows = 20
+    x = np.ones(shape=(num_rows, 10))
+    y = sp.identity(num_rows)
+
+    # The dense first argument selects the NumPy method, which densifies ``y``.
+    z = combine_columns(x, y)
+
+    assert isinstance(z, np.ndarray)
+    assert z.shape == (20, 30)
+
+
+def test_pandas_dataframe():
+    df1 = pd.DataFrame([["a", 1], ["b", 2]], columns=["letter", "number"])
+    # The second frame repeats both column names of the first and adds a third.
+    df2 = pd.DataFrame(
+        [["c", 3, "cat"], ["d", 4, "dog"]], columns=["letter", "number", "animal"]
+    )
+    # Only the type of the result is checked here, not its contents.
+    z = combine_columns(df1, df2)
+    assert isinstance(z, pd.DataFrame)
+
+
+def test_combine_columns_ndim():
+    num_rows = 20
+    x = np.ones(shape=(num_rows, 10, 20))
+    y = np.ones(shape=(num_rows, 20, 20))
+    # 3-D inputs that differ only in the extent of the second dimension.
+    z = combine_columns(x, y)
+    # The second dimension grows to 10 + 20; the others are unchanged.
+    assert isinstance(z, np.ndarray)
+    assert z.shape == (20, 30, 20)
diff --git a/tests/test_combine_rows.py b/tests/test_combine_rows.py
new file mode 100644
index 0000000..a25aa50
--- /dev/null
+++ b/tests/test_combine_rows.py
@@ -0,0 +1,64 @@
+import numpy as np
+import pandas as pd
+from biocutils import combine_rows
+from scipy import sparse as sp
+
+__author__ = "jkanche"
+__copyright__ = "jkanche"
+__license__ = "MIT"
+
+
+def test_combine_rows_dense():
+    num_cols = 20
+    x = np.ones(shape=(10, num_cols))
+    y = np.random.rand(5, num_cols)
+    # Both inputs share the column count; only the row counts differ.
+    z = combine_rows(x, y)
+    # Rows add up (10 + 5) while the column count is unchanged.
+    assert isinstance(z, np.ndarray)
+    assert z.shape == (15, 20)
+
+
+def test_combine_rows_sparse():
+    num_cols = 20
+    # Two SciPy sparse inputs with the same number of columns.
+    x = sp.random(10, num_cols)
+    y = sp.identity(num_cols)
+    # Dispatch is decided by the type of the first argument.
+    z = combine_rows(x, y)
+    # The result must stay sparse and have 10 + 20 rows.
+    assert isinstance(z, sp.spmatrix)
+    assert z.shape == (30, 20)
+
+
+def test_combine_rows_mixed():
+    num_cols = 20
+    x = np.ones(shape=(10, num_cols))
+    y = sp.identity(num_cols)
+    # The dense first argument selects the NumPy method, which densifies ``y``.
+    z = combine_rows(x, y)
+    # A dense result with 10 + 20 rows is expected.
+    assert isinstance(z, np.ndarray)
+    assert z.shape == (30, 20)
+
+
+def test_pandas_dataframe():
+    df1 = pd.DataFrame([["a", 1], ["b", 2]], columns=["letter", "number"])
+    # The second frame has one column ("animal") that the first frame lacks.
+    df2 = pd.DataFrame(
+        [["c", 3, "cat"], ["d", 4, "dog"]], columns=["letter", "number", "animal"]
+    )
+    # Only the type of the result is checked here, not its contents.
+    z = combine_rows(df1, df2)
+    assert isinstance(z, pd.DataFrame)
+
+
+def test_combine_rows_ndim():
+    num_cols = 20
+    x = np.ones(shape=(20, num_cols, 20))
+    y = np.ones(shape=(10, num_cols, num_cols))
+    # Three 3-D inputs (``y`` is passed twice) that differ only in the first extent.
+    z = combine_rows(x, y, y)
+    # The first dimension grows to 20 + 10 + 10; the others are unchanged.
+    assert isinstance(z, np.ndarray)
+    assert z.shape == (40, 20, 20)
diff --git a/tests/test_combine_sequences.py b/tests/test_combine_sequences.py
new file mode 100644
index 0000000..4e15ac5
--- /dev/null
+++ b/tests/test_combine_sequences.py
@@ -0,0 +1,75 @@
+import numpy as np
+import pandas as pd
+from biocutils import combine_sequences
+from scipy import sparse as sp
+
+__author__ = "jkanche"
+__copyright__ = "jkanche"
+__license__ = "MIT"
+
+
+def test_basic_list():
+    x = [1, 2, "c"]
+    y = ["a", "b"]
+    # A list as first argument selects the list method.
+    z = combine_sequences(x, y)
+    # Expect a list holding the items of ``x`` followed by the items of ``y``.
+    assert z == x + y
+    assert isinstance(z, list)
+    assert len(z) == len(x) + len(y)
+
+
+def test_basic_dense():
+    x = [1, 2, 3]
+    y = [0.1, 0.2]
+    xd = np.array([1, 2, 3])
+    yd = np.array([0.1, 0.2], dtype=float)
+    # Combine an integer array with a float array.
+    zcomb = combine_sequences(xd, yd)
+    # Build the expected values from the equivalent plain lists.
+    z = x + y
+    zd = np.array(z)
+    # Compare with a tolerance because the values are floating point.
+    assert all(np.isclose(zcomb, zd)) is True
+    assert isinstance(zcomb, np.ndarray)
+    assert len(zcomb) == len(zd)
+
+
+def test_basic_mixed_dense_list():
+    x = [1, 2, 3]
+    y = [0.1, 0.2]
+    xd = np.array([1, 2, 3])
+    # The array comes first, so the NumPy method also has to accept the list ``y``.
+    zcomb = combine_sequences(xd, y)
+    # The combined values must match plain-list concatenation, element by element.
+    z = x + y
+    assert (zcomb == z).all()
+    assert len(zcomb) == len(xd) + len(y)
+
+
+def test_basic_mixed_tuple_list():
+    x = [1, 2, 3]
+    y = (0.1, 0.2)
+    xd = np.array([1, 2, 3])
+    # An array first, followed by a tuple and a list.
+    zcomb = combine_sequences(xd, y, x)
+    # The expected values keep the order in which the inputs were passed.
+    z = x + list(y) + x
+    assert (zcomb == z).all()
+    assert len(zcomb) == 2 * len(xd) + len(y)
+
+
+def test_pandas_series():
+    s1 = pd.Series(["a", "b"])
+    s2 = pd.Series(["c", "d"])
+    # Case 1: every input is already a Series.
+    z = combine_sequences(s1, s2)
+    # Two inputs of length two give a Series of length four.
+    assert isinstance(z, pd.Series)
+    assert len(z) == 4
+    # Case 2: a Series followed by a plain list.
+    x = ["gg", "ff"]
+    # The result is still expected to be a Series of length four.
+    z = combine_sequences(s1, x)
+    assert isinstance(z, pd.Series)
+    assert len(z) == 4
diff --git a/tests/test_package_utils.py b/tests/test_package_utils.py
index 2e023a8..9b7c96e 100644
--- a/tests/test_package_utils.py
+++ b/tests/test_package_utils.py
@@ -4,15 +4,7 @@
 __copyright__ = "jkanche"
 __license__ = "MIT"
 
-
-
-def test_for_scipy():
-    pkg = is_package_installed("scipy")
-
-    assert pkg is False
-
-
-def test_for_numpy():
-    pkg = is_package_installed("numpy")
-
-    assert pkg is True
+def test_installed_package():
+    assert is_package_installed("scipy")  # scipy is listed in the testing extras in setup.cfg
+    assert is_package_installed("numpy")
+    assert not is_package_installed("some_random_package")  # a name that is not expected to be installed