reduce dependence on pandas, use the combine generic more often

BiocPy · Oct 17, 2023 · 971f69d · 971f69d
1 parent fa774cc
commit 971f69d
Show file tree

Hide file tree

Showing 3 changed files with 92 additions and 74 deletions.
diff --git a/src/genomicranges/GenomicRanges.py b/src/genomicranges/GenomicRanges.py
@@ -18,7 +18,7 @@
 from biocgenerics.combine_cols import combine_cols
 from biocgenerics.combine_rows import combine_rows
 from biocutils import is_list_of_type
-from numpy import concatenate, count_nonzero, ndarray, sum, zeros
+from numpy import count_nonzero, ndarray, sum, zeros
 from pandas import DataFrame, concat, isna
 
 from .interval import (
@@ -55,29 +55,30 @@ class GenomicRanges(BiocFrame):
     :py:class:`~genomicranges.SeqInfo.SeqInfo`) might also contain metadata about the
     genome, e.g. if it's circular (`is_circular`) or not.
 
-    Note: The documentation for some of the methods comes from the
+    Note: The documentation for some of the methods is derived from the
     `GenomicRanges R/Bioconductor package <https://github.com/Bioconductor/GenomicRanges>`_.
 
-    Typical usage example:
+    Typical usage:
 
     To construct a **GenomicRanges** object, simply pass in the column representation as a
     dictionary. This dictionary must contain "seqnames", "starts", "ends" columns, and optionally,
     specify "strand". If the "strand" column is not provided, "*" is used as the default value for
     each genomic interval.
 
-    ```python
+    .. code-block:: python
+
         gr = GenomicRanges(
             {
                 "seqnames": ["chr1", "chr2", "chr3"],
                 "starts": [100, 115, 119],
                 "ends": [103, 116, 120],
             }
         )
-    ```
 
     Alternatively, you may also convert a :py:class:`~pandas.DataFrame` to ``GenomicRanges``.
 
-    ```python
+    .. code-block:: python
+
         df = pd.DataFrame(
             {
                 "seqnames": ["chr1", "chr2", "chr3"],
@@ -87,7 +88,6 @@ class GenomicRanges(BiocFrame):
         )
 
         gr = genomicranges.from_pandas(df)
-    ```
 
     All columns other than "seqnames", "starts", "ends", and "strand" are considered
     metadata columns and can be accessed by
@@ -97,19 +97,13 @@ class GenomicRanges(BiocFrame):
 
         gr.mcols()
 
-    or slice the object
-
-    .. code-block:: python
-
-        sliced_gr = gr[1:2, [True, False, False]]
-
     Attributes:
-        data (Dict[str, Any], optional): Dictionary of column names as `keys` and their values.
-            All columns must have the same length. Defaults to {}.
-        number_of_rows (int, optional): Number of rows.
-        row_names (List, optional): Row index names.
-        column_names (List[str], optional): Column names, if not provided, they are automatically inferred
-            from the data.
+        data (Dict[str, Any], optional): Dictionary of column names as `keys` and
+            their values. All columns must have the same length. Defaults to {}.
+        number_of_rows (int, optional): Number of rows. If not specified, inferred from ``data``.
+        row_names (list, optional): Row names.
+        column_names (list, optional): Column names. If not provided,
+            inferred from ``data``.
         metadata (dict): Additional metadata. Defaults to {}.
     """
 
@@ -127,12 +121,12 @@ def __init__(
 
         Args:
             data (Dict[str, Any], optional): Dictionary of column names as `keys` and
-                their values. All columns must have the same length. Defaults to None.
-            number_of_rows (int, optional): Number of rows. Defaults to None.
-            row_names (List, optional): Row index names. Defaults to None.
-            column_names (List[str], optional): Column names, if not provided,
-                they are automatically inferred from the data. Defaults to None.
-            metadata (dict, optional): Additional metadata. Defaults to None.
+                their values. All columns must have the same length. Defaults to {}.
+            number_of_rows (int, optional): Number of rows. If not specified, inferred from ``data``.
+            row_names (list, optional): Row names.
+            column_names (list, optional): Column names. If not provided,
+                inferred from ``data``.
+            metadata (dict): Additional metadata. Defaults to {}.
         """
         super().__init__(data, number_of_rows, row_names, column_names, metadata)
 
@@ -209,7 +203,8 @@ def ranges(
             ValueError: If ``return_type`` is not supported.
 
         Returns:
-            Union[DataFrame, Dict, "GenomicRanges", Any]: Genomic regions.
+            Union[DataFrame, Dict, "GenomicRanges", Any]: Genomic regions in type specified by
+            ``return_type``.
         """
 
         obj = {
@@ -228,7 +223,9 @@ def ranges(
             try:
                 return return_type(obj)
             except Exception as e:
-                raise ValueError(f"{return_type} is not supported, {str(e)}")
+                raise RuntimeError(
+                    f"Cannot convert ranges to '{return_type}', {str(e)}"
+                )
 
     @property
     def strand(self) -> List[str]:
@@ -261,7 +258,7 @@ def seq_info(self) -> Optional[SeqInfo]:
         """Get sequence information, if available.
 
         Returns:
-            (SeqInfo, optional): List information, otherwise None.
+            (SeqInfo, optional): Sequence information, otherwise None.
         """
 
         if self.metadata and "seq_info" in self.metadata:
@@ -274,7 +271,8 @@ def seq_info(self, seq_info: Optional[SeqInfo]):
         """Set sequence information.
 
         Args:
-            (SeqInfo, optional): List information, otherwise None.
+            seq_info (SeqInfo): Sequence information. Can be None to remove sequence
+                information from the object.
 
         Raises:
             ValueError: If `seq_info` is not a `SeqInfo` class.
@@ -367,7 +365,8 @@ def genome(self) -> Optional[str]:
         return None
 
     def granges(self) -> "GenomicRanges":
-        """Create a new ``GenomicRanges`` object with only ranges (``seqnames``, ``starts``, ``ends``, and ``strand``).
+        """Create a new ``GenomicRanges`` object with only ranges
+        (``seqnames``, ``starts``, ``ends``, and ``strand``).
 
         Returns:
             GenomicRanges: A new ``GenomicRanges`` with only ranges.
@@ -410,7 +409,9 @@ def mcols(self, return_type: Optional[Callable] = None) -> Any:
             try:
                 return return_type(new_data)
             except Exception as e:
-                raise ValueError(f"{return_type} not supported, {str(e)}")
+                raise RuntimeError(
+                    f"Cannot convert metadata to '{return_type}', {str(e)}"
+                )
 
     def __repr__(self) -> str:
         from io import StringIO
@@ -507,7 +508,7 @@ def __getitem__(self, args: SlicerArgTypes) -> Union["GenomicRanges", dict, list
             gr[<List of column names>]
 
         Args:
-            args (SlicerArgTypes): A Tuple of slicer arguments to subset rows and
+            args (SlicerArgTypes): A Tuple of arguments to subset rows and
                 columns. An element in ``args`` may be,
 
                 - List of booleans, True to keep the row/column, False to remove.
@@ -1037,7 +1038,7 @@ def _calc_gap_widths(self, ignore_strand: bool = False) -> List[int]:
     # inter range methods
 
     # TODO: implement dropEmptyRanges
-    # TODO: this is a very ineffecient implementation, can do a better job later.
+    # TODO: this is a very inefficient implementation, can do better.
     def reduce(
         self,
         with_reverse_map: bool = False,
@@ -1536,7 +1537,7 @@ def coverage(
             )
 
             if shift > 0:
-                cov = concatenate((shift_arr, cov))
+                cov = combine(shift_arr, cov)
 
             if weight > 0:
                 cov = cov * weight
@@ -2219,18 +2220,18 @@ def invert_strand(self) -> "GenomicRanges":
         )
 
     def combine(self, *other: "GenomicRanges") -> "GenomicRanges":
-        """Combine multiple GenomicRanges objects by row.
+        """Combine multiple `GenomicRanges` objects by row.
 
         Note: Fills missing columns with an array of `None`s.
 
         Args:
-            *other (GenomicRanges): GenomicRanges objects.
+            *other (GenomicRanges): Objects to combine.
 
         Raises:
             TypeError: If all objects are not of type GenomicRanges.
 
         Returns:
-            BiocFrame: A combined BiocFrame.
+            GenomicRanges: A combined GenomicRanges object.
         """
         if not is_list_of_type(other, GenomicRanges):
             raise TypeError("All objects to combine must be GenomicRanges objects.")

diff --git a/src/genomicranges/GenomicRangesList.py b/src/genomicranges/GenomicRangesList.py
@@ -1,17 +1,17 @@
 from typing import Dict, List, Optional, Union
 
 from biocframe import BiocFrame
-from pandas import DataFrame
+from biocgenerics.combine import combine
+from biocgenerics.combine_cols import combine_cols
+from biocgenerics.combine_rows import combine_rows
 
 from .GenomicRanges import GenomicRanges
-from .utils import is_list_of_type
+from biocutils import is_list_of_type
 
 __author__ = "jkanche"
 __copyright__ = "jkanche"
 __license__ = "MIT"
 
-BiocOrPandasFrame = Union[DataFrame, BiocFrame]
-
 
 class GenomicRangesList:
     """Just as it sounds, a `GenomicRangesList` is a named-list like object.
@@ -22,12 +22,13 @@ class GenomicRangesList:
 
     Currently, this class is limited in functionality, purely a read-only class with basic accessors.
 
-    Typical usage example:
+    Typical usage:
 
-    To construct a `GenomicRangesList` object, simply pass in a list of
+    To construct a **GenomicRangesList** object, simply pass in a list of
     :py:class:`genomicranges.GenomicRanges.GenomicRanges` objects and Optionally ``names``.
 
-    ```python
+    .. code-block:: python
+
         gr1 = GenomicRanges(
             {
                 "seqnames": ["chr1", "chr2", "chr1", "chr3"],
@@ -49,7 +50,6 @@ class GenomicRangesList:
         )
 
         grl = GenomicRangesList(ranges=[gr1, gr2], names=["gene1", "gene2"])
-    ```
 
     Additionally, you may also provide metadata about the genomic elements in the dictionary
     using mcols attribute.
@@ -60,7 +60,7 @@ def __init__(
         ranges: Union[GenomicRanges, List[GenomicRanges]],
         range_lengths: Optional[List[int]] = None,
         names: Optional[List[str]] = None,
-        mcols: BiocOrPandasFrame = None,
+        mcols: Optional[BiocFrame] = None,
         metadata: Optional[dict] = None,
     ):
         """Initialize a `GenomicRangesList` object.
@@ -74,7 +74,7 @@ def __init__(
             names (Optional[List[str]], optional): Names of the genomic elements.
                 The length of this must match the number of genomic elements in ``ranges``.
                 Defaults to None.
-            mcols (BiocOrPandasFrame, optional): Metadata about each genomic element. Defaults to None.
+            mcols (BiocFrame, optional): Metadata about each genomic element. Defaults to None.
             metadata (Optional[Dict], optional): Additional metadata. Defaults to None.
         """
         self._validate(ranges)
@@ -152,11 +152,12 @@ def names(self) -> Optional[list]:
         return self.groups
 
     @property
-    def mcols(self) -> Optional[BiocOrPandasFrame]:
+    def mcols(self) -> Optional[BiocFrame]:
         """Get metadata across all genomic elements.
 
         Returns:
-            (BiocOrPandasFrame, optional): Metadata frame or None if there is no element level metadata.
+            (BiocFrame, optional): Metadata :py:class:`~biocframe.BiocFrame.BiocFrame` or
+            None if there is no element level metadata.
         """
         if "mcols" in self._data:
             return self._data["mcols"]
@@ -298,7 +299,7 @@ def score(self) -> Dict[str, List[int]]:
         """
         return self._generic_accessor("score")
 
-    def to_pandas(self) -> DataFrame:
+    def to_pandas(self) -> "DataFrame":
         """Coerce object to a :py:class:`pandas.DataFrame`.
 
         Returns:
@@ -368,7 +369,7 @@ def __getitem__(
             new_metadata = self.metadata
 
             if isinstance(args, tuple):
-                # TODO: probably should figure out what to do with the second dimension later.
+                # TODO: should figure out what to do with the second dimension later.
                 if len(args) >= 1:
                     args = args[0]
 
@@ -397,13 +398,13 @@ def __getitem__(
                 if self.mcols is not None:
                     new_mcols = self.mcols[args, :]
             else:
-                raise TypeError("`args` is not supported.")
+                raise TypeError("Arguments to slice is not a list of supported types.")
 
             return GenomicRangesList(
                 new_ranges, new_range_lengths, new_names, new_mcols, new_metadata
             )
 
-        raise TypeError("`args` must be either a string or an integer.")
+        raise TypeError("Arguments to slice is not supported.")
 
     def __len__(self) -> int:
         """Number of genomic elements.
@@ -415,11 +416,47 @@ def __len__(self) -> int:
 
     @classmethod
     def empty(cls, n: int):
-        """Create an ``n``-length `GenomicRangesList` object.
+        """Create an empty ``n``-length `GenomicRangesList` object.
 
         Returns:
             same type as caller, in this case a `GenomicRangesList`.
         """
         _range_lengths = [0] * n
 
         return cls(ranges=GenomicRanges.empty(), range_lengths=_range_lengths)
+
+
+@combine.register(GenomicRangesList)
+def _combine_grl(*x: GenomicRangesList):
+    if not is_list_of_type(x, GenomicRangesList):
+        raise ValueError(
+            "All elements to `combine` must be `GenomicRangesList` objects."
+        )
+
+    raise NotImplementedError(
+        "`combine` is not implemented for `GenomicRangesList` objects."
+    )
+
+
+@combine_rows.register(GenomicRangesList)
+def _combine_rows_grl(*x: GenomicRangesList):
+    if not is_list_of_type(x, GenomicRangesList):
+        raise ValueError(
+            "All elements to `combine_rows` must be `GenomicRangesList` objects."
+        )
+
+    raise NotImplementedError(
+        "`combine_rows` is not implemented for `GenomicRangesList` objects."
+    )
+
+
+@combine_cols.register(GenomicRangesList)
+def _combine_cols_grl(*x: GenomicRangesList):
+    if not is_list_of_type(x, GenomicRangesList):
+        raise ValueError(
+            "All elements to `combine_cols` must be `GenomicRangesList` objects."
+        )
+
+    raise NotImplementedError(
+        "`combine_cols` is not implemented for `GenomicRangesList` objects."
+    )
diff --git a/src/genomicranges/utils.py b/src/genomicranges/utils.py