Skip to content

Commit

Permalink
reduce dependence on pandas, use the combine generic more often
Browse files Browse the repository at this point in the history
  • Loading branch information
jkanche committed Oct 17, 2023
1 parent fa774cc commit 971f69d
Show file tree
Hide file tree
Showing 3 changed files with 92 additions and 74 deletions.
75 changes: 38 additions & 37 deletions src/genomicranges/GenomicRanges.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
from biocgenerics.combine_cols import combine_cols
from biocgenerics.combine_rows import combine_rows
from biocutils import is_list_of_type
from numpy import concatenate, count_nonzero, ndarray, sum, zeros
from numpy import count_nonzero, ndarray, sum, zeros
from pandas import DataFrame, concat, isna

from .interval import (
Expand Down Expand Up @@ -55,29 +55,30 @@ class GenomicRanges(BiocFrame):
:py:class:`~genomicranges.SeqInfo.SeqInfo`) might also contain metadata about the
genome, e.g. if it's circular (`is_circular`) or not.
Note: The documentation for some of the methods comes from the
Note: The documentation for some of the methods is derived from the
`GenomicRanges R/Bioconductor package <https://github.com/Bioconductor/GenomicRanges>`_.
Typical usage example:
Typical usage:
To construct a **GenomicRanges** object, simply pass in the column representation as a
dictionary. This dictionary must contain "seqnames", "starts", "ends" columns, and optionally,
specify "strand". If the "strand" column is not provided, "*" is used as the default value for
each genomic interval.
```python
.. code-block:: python
gr = GenomicRanges(
{
"seqnames": ["chr1", "chr2", "chr3"],
"starts": [100, 115, 119],
"ends": [103, 116, 120],
}
)
```
Alternatively, you may also convert a :py:class:`~pandas.DataFrame` to ``GenomicRanges``.
```python
.. code-block:: python
df = pd.DataFrame(
{
"seqnames": ["chr1", "chr2", "chr3"],
Expand All @@ -87,7 +88,6 @@ class GenomicRanges(BiocFrame):
)
gr = genomicranges.from_pandas(df)
```
All columns other than "seqnames", "starts", "ends", and "strand" are considered
metadata columns and can be accessed by
Expand All @@ -97,19 +97,13 @@ class GenomicRanges(BiocFrame):
gr.mcols()
or slice the object
.. code-block:: python
sliced_gr = gr[1:2, [True, False, False]]
Attributes:
data (Dict[str, Any], optional): Dictionary of column names as `keys` and their values.
All columns must have the same length. Defaults to {}.
number_of_rows (int, optional): Number of rows.
row_names (List, optional): Row index names.
column_names (List[str], optional): Column names, if not provided, they are automatically inferred
from the data.
data (Dict[str, Any], optional): Dictionary of column names as `keys` and
their values. All columns must have the same length. Defaults to {}.
number_of_rows (int, optional): Number of rows. If not specified, inferred from ``data``.
row_names (list, optional): Row names.
column_names (list, optional): Column names. If not provided,
inferred from ``data``.
metadata (dict): Additional metadata. Defaults to {}.
"""

Expand All @@ -127,12 +121,12 @@ def __init__(
Args:
data (Dict[str, Any], optional): Dictionary of column names as `keys` and
their values. All columns must have the same length. Defaults to None.
number_of_rows (int, optional): Number of rows. Defaults to None.
row_names (List, optional): Row index names. Defaults to None.
column_names (List[str], optional): Column names, if not provided,
they are automatically inferred from the data. Defaults to None.
metadata (dict, optional): Additional metadata. Defaults to None.
their values. All columns must have the same length. Defaults to {}.
number_of_rows (int, optional): Number of rows. If not specified, inferred from ``data``.
row_names (list, optional): Row names.
column_names (list, optional): Column names. If not provided,
inferred from ``data``.
metadata (dict): Additional metadata. Defaults to {}.
"""
super().__init__(data, number_of_rows, row_names, column_names, metadata)

Expand Down Expand Up @@ -209,7 +203,8 @@ def ranges(
ValueError: If ``return_type`` is not supported.
Returns:
Union[DataFrame, Dict, "GenomicRanges", Any]: Genomic regions.
Union[DataFrame, Dict, "GenomicRanges", Any]: Genomic regions in type specified by
``return_type``.
"""

obj = {
Expand All @@ -228,7 +223,9 @@ def ranges(
try:
return return_type(obj)
except Exception as e:
raise ValueError(f"{return_type} is not supported, {str(e)}")
raise RuntimeError(
f"Cannot convert ranges to '{return_type}', {str(e)}"
)

@property
def strand(self) -> List[str]:
Expand Down Expand Up @@ -261,7 +258,7 @@ def seq_info(self) -> Optional[SeqInfo]:
"""Get sequence information, if available.
Returns:
(SeqInfo, optional): List information, otherwise None.
(SeqInfo, optional): Sequence information, otherwise None.
"""

if self.metadata and "seq_info" in self.metadata:
Expand All @@ -274,7 +271,8 @@ def seq_info(self, seq_info: Optional[SeqInfo]):
"""Set sequence information.
Args:
(SeqInfo, optional): List information, otherwise None.
seq_info (SeqInfo, optional): Sequence information. Can be None to remove
sequence information from the object.
Raises:
ValueError: If `seq_info` is not a `SeqInfo` class.
Expand Down Expand Up @@ -367,7 +365,8 @@ def genome(self) -> Optional[str]:
return None

def granges(self) -> "GenomicRanges":
"""Create a new ``GenomicRanges`` object with only ranges (``seqnames``, ``starts``, ``ends``, and ``strand``).
"""Create a new ``GenomicRanges`` object with only ranges
(``seqnames``, ``starts``, ``ends``, and ``strand``).
Returns:
GenomicRanges: A new ``GenomicRanges`` with only ranges.
Expand Down Expand Up @@ -410,7 +409,9 @@ def mcols(self, return_type: Optional[Callable] = None) -> Any:
try:
return return_type(new_data)
except Exception as e:
raise ValueError(f"{return_type} not supported, {str(e)}")
raise RuntimeError(
f"Cannot convert metadata to '{return_type}', {str(e)}"
)

def __repr__(self) -> str:
from io import StringIO
Expand Down Expand Up @@ -507,7 +508,7 @@ def __getitem__(self, args: SlicerArgTypes) -> Union["GenomicRanges", dict, list
gr[<List of column names>]
Args:
args (SlicerArgTypes): A Tuple of slicer arguments to subset rows and
args (SlicerArgTypes): A Tuple of arguments to subset rows and
columns. An element in ``args`` may be,
- List of booleans, True to keep the row/column, False to remove.
Expand Down Expand Up @@ -1037,7 +1038,7 @@ def _calc_gap_widths(self, ignore_strand: bool = False) -> List[int]:
# inter range methods

# TODO: implement dropEmptyRanges
# TODO: this is a very ineffecient implementation, can do a better job later.
# TODO: this is a very inefficient implementation; we can do better.
def reduce(
self,
with_reverse_map: bool = False,
Expand Down Expand Up @@ -1536,7 +1537,7 @@ def coverage(
)

if shift > 0:
cov = concatenate((shift_arr, cov))
cov = combine(shift_arr, cov)

if weight > 0:
cov = cov * weight
Expand Down Expand Up @@ -2219,18 +2220,18 @@ def invert_strand(self) -> "GenomicRanges":
)

def combine(self, *other: "GenomicRanges") -> "GenomicRanges":
"""Combine multiple GenomicRanges objects by row.
"""Combine multiple `GenomicRanges` objects by row.
Note: Fills missing columns with an array of `None`s.
Args:
*other (GenomicRanges): GenomicRanges objects.
*other (GenomicRanges): Objects to combine.
Raises:
TypeError: If all objects are not of type GenomicRanges.
Returns:
BiocFrame: A combined BiocFrame.
GenomicRanges: A combined GenomicRanges object.
"""
if not is_list_of_type(other, GenomicRanges):
raise TypeError("All objects to combine must be GenomicRanges objects.")
Expand Down
71 changes: 54 additions & 17 deletions src/genomicranges/GenomicRangesList.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,17 @@
from typing import Dict, List, Optional, Union

from biocframe import BiocFrame
from pandas import DataFrame
from biocgenerics.combine import combine
from biocgenerics.combine_cols import combine_cols
from biocgenerics.combine_rows import combine_rows

from .GenomicRanges import GenomicRanges
from .utils import is_list_of_type
from biocutils import is_list_of_type

__author__ = "jkanche"
__copyright__ = "jkanche"
__license__ = "MIT"

BiocOrPandasFrame = Union[DataFrame, BiocFrame]


class GenomicRangesList:
"""Just as it sounds, a `GenomicRangesList` is a named-list like object.
Expand All @@ -22,12 +22,13 @@ class GenomicRangesList:
Currently, this class is limited in functionality, purely a read-only class with basic accessors.
Typical usage example:
Typical usage:
To construct a `GenomicRangesList` object, simply pass in a list of
To construct a **GenomicRangesList** object, simply pass in a list of
:py:class:`genomicranges.GenomicRanges.GenomicRanges` objects and, optionally, ``names``.
```python
.. code-block:: python
gr1 = GenomicRanges(
{
"seqnames": ["chr1", "chr2", "chr1", "chr3"],
Expand All @@ -49,7 +50,6 @@ class GenomicRangesList:
)
grl = GenomicRangesList(ranges=[gr1, gr2], names=["gene1", "gene2"])
```
Additionally, you may also provide metadata about the genomic elements in the dictionary
using mcols attribute.
Expand All @@ -60,7 +60,7 @@ def __init__(
ranges: Union[GenomicRanges, List[GenomicRanges]],
range_lengths: Optional[List[int]] = None,
names: Optional[List[str]] = None,
mcols: BiocOrPandasFrame = None,
mcols: Optional[BiocFrame] = None,
metadata: Optional[dict] = None,
):
"""Initialize a `GenomicRangesList` object.
Expand All @@ -74,7 +74,7 @@ def __init__(
names (Optional[List[str]], optional): Names of the genomic elements.
The length of this must match the number of genomic elements in ``ranges``.
Defaults to None.
mcols (BiocOrPandasFrame, optional): Metadata about each genomic element. Defaults to None.
mcols (BiocFrame, optional): Metadata about each genomic element. Defaults to None.
metadata (Optional[Dict], optional): Additional metadata. Defaults to None.
"""
self._validate(ranges)
Expand Down Expand Up @@ -152,11 +152,12 @@ def names(self) -> Optional[list]:
return self.groups

@property
def mcols(self) -> Optional[BiocOrPandasFrame]:
def mcols(self) -> Optional[BiocFrame]:
"""Get metadata across all genomic elements.
Returns:
(BiocOrPandasFrame, optional): Metadata frame or None if there is no element level metadata.
(BiocFrame, optional): Metadata :py:class:`~biocframe.BiocFrame.BiocFrame` or
None if there is no element-level metadata.
"""
if "mcols" in self._data:
return self._data["mcols"]
Expand Down Expand Up @@ -298,7 +299,7 @@ def score(self) -> Dict[str, List[int]]:
"""
return self._generic_accessor("score")

def to_pandas(self) -> DataFrame:
def to_pandas(self) -> "DataFrame":
"""Coerce object to a :py:class:`pandas.DataFrame`.
Returns:
Expand Down Expand Up @@ -368,7 +369,7 @@ def __getitem__(
new_metadata = self.metadata

if isinstance(args, tuple):
# TODO: probably should figure out what to do with the second dimension later.
# TODO: should figure out what to do with the second dimension later.
if len(args) >= 1:
args = args[0]

Expand Down Expand Up @@ -397,13 +398,13 @@ def __getitem__(
if self.mcols is not None:
new_mcols = self.mcols[args, :]
else:
raise TypeError("`args` is not supported.")
raise TypeError("Arguments to slice is not a list of supported types.")

return GenomicRangesList(
new_ranges, new_range_lengths, new_names, new_mcols, new_metadata
)

raise TypeError("`args` must be either a string or an integer.")
raise TypeError("Arguments to slice is not supported.")

def __len__(self) -> int:
"""Number of genomic elements.
Expand All @@ -415,11 +416,47 @@ def __len__(self) -> int:

@classmethod
def empty(cls, n: int):
"""Create an ``n``-length `GenomicRangesList` object.
"""Create an empty ``n``-length `GenomicRangesList` object.
Returns:
same type as caller, in this case a `GenomicRangesList`.
"""
_range_lengths = [0] * n

return cls(ranges=GenomicRanges.empty(), range_lengths=_range_lengths)


@combine.register(GenomicRangesList)
def _combine_grl(*x: GenomicRangesList):
    """``combine`` handler registered for ``GenomicRangesList`` arguments.

    This is a placeholder: it never returns a value. It only checks the
    argument types and then reports that the operation is unavailable.

    Args:
        *x (GenomicRangesList): Objects passed to ``combine``.

    Raises:
        ValueError: If ``is_list_of_type`` reports that ``x`` is not made up
            solely of ``GenomicRangesList`` objects.
        NotImplementedError: In every other case.
    """
    every_element_is_grl = is_list_of_type(x, GenomicRangesList)

    if every_element_is_grl:
        # Valid input, but there is no implementation behind this generic yet.
        raise NotImplementedError(
            "`combine` is not implemented for `GenomicRangesList` objects."
        )

    raise ValueError(
        "All elements to `combine` must be `GenomicRangesList` objects."
    )


@combine_rows.register(GenomicRangesList)
def _combine_rows_grl(*x: GenomicRangesList):
    """``combine_rows`` handler registered for ``GenomicRangesList`` arguments.

    This is a placeholder: it never returns a value. It only checks the
    argument types and then reports that the operation is unavailable.

    Args:
        *x (GenomicRangesList): Objects passed to ``combine_rows``.

    Raises:
        ValueError: If ``is_list_of_type`` reports that ``x`` is not made up
            solely of ``GenomicRangesList`` objects.
        NotImplementedError: In every other case.
    """
    every_element_is_grl = is_list_of_type(x, GenomicRangesList)

    if every_element_is_grl:
        # Valid input, but there is no implementation behind this generic yet.
        raise NotImplementedError(
            "`combine_rows` is not implemented for `GenomicRangesList` objects."
        )

    raise ValueError(
        "All elements to `combine_rows` must be `GenomicRangesList` objects."
    )


@combine_cols.register(GenomicRangesList)
def _combine_cols_grl(*x: GenomicRangesList):
    """``combine_cols`` handler registered for ``GenomicRangesList`` arguments.

    This is a placeholder: it never returns a value. It only checks the
    argument types and then reports that the operation is unavailable.

    Args:
        *x (GenomicRangesList): Objects passed to ``combine_cols``.

    Raises:
        ValueError: If ``is_list_of_type`` reports that ``x`` is not made up
            solely of ``GenomicRangesList`` objects.
        NotImplementedError: In every other case.
    """
    every_element_is_grl = is_list_of_type(x, GenomicRangesList)

    if every_element_is_grl:
        # Valid input, but there is no implementation behind this generic yet.
        raise NotImplementedError(
            "`combine_cols` is not implemented for `GenomicRangesList` objects."
        )

    raise ValueError(
        "All elements to `combine_cols` must be `GenomicRangesList` objects."
    )
20 changes: 0 additions & 20 deletions src/genomicranges/utils.py

This file was deleted.

0 comments on commit 971f69d

Please sign in to comment.