diff --git a/src/genomicranges/GenomicRanges.py b/src/genomicranges/GenomicRanges.py index ad38992..4bb72ad 100644 --- a/src/genomicranges/GenomicRanges.py +++ b/src/genomicranges/GenomicRanges.py @@ -18,7 +18,7 @@ from biocgenerics.combine_cols import combine_cols from biocgenerics.combine_rows import combine_rows from biocutils import is_list_of_type -from numpy import concatenate, count_nonzero, ndarray, sum, zeros +from numpy import count_nonzero, ndarray, sum, zeros from pandas import DataFrame, concat, isna from .interval import ( @@ -55,17 +55,18 @@ class GenomicRanges(BiocFrame): :py:class:`~genomicranges.SeqInfo.SeqInfo`) might also contain metadata about the genome, e.g. if it's circular (`is_circular`) or not. - Note: The documentation for some of the methods comes from the + Note: The documentation for some of the methods is derived from the `GenomicRanges R/Bioconductor package `_. - Typical usage example: + Typical usage: To construct a **GenomicRanges** object, simply pass in the column representation as a dictionary. This dictionary must contain "seqnames", "starts", "ends" columns, and optionally, specify "strand". If the "strand" column is not provided, "*" is used as the default value for each genomic interval. - ```python + .. code-block:: python + gr = GenomicRanges( { "seqnames": ["chr1", "chr2", "chr3"], @@ -73,11 +74,11 @@ class GenomicRanges(BiocFrame): "ends": [103, 116, 120], } ) - ``` Alternatively, you may also convert a :py:class:`~pandas.DataFrame` to ``GenomicRanges``. - ```python + .. code-block:: python + df = pd.DataFrame( { "seqnames": ["chr1", "chr2", "chr3"], @@ -87,7 +88,6 @@ class GenomicRanges(BiocFrame): ) gr = genomicranges.from_pandas(df) - ``` All columns other than "seqnames", "starts", "ends", and "strand" are considered metadata columns and can be accessed by @@ -97,19 +97,13 @@ class GenomicRanges(BiocFrame): gr.mcols() - or slice the object - - .. 
code-block:: python - - sliced_gr = gr[1:2, [True, False, False]] - Attributes: - data (Dict[str, Any], optional): Dictionary of column names as `keys` and their values. - All columns must have the same length. Defaults to {}. - number_of_rows (int, optional): Number of rows. - row_names (List, optional): Row index names. - column_names (List[str], optional): Column names, if not provided, they are automatically inferred - from the data. + data (Dict[str, Any], optional): Dictionary of column names as `keys` and + their values. All columns must have the same length. Defaults to {}. + number_of_rows (int, optional): Number of rows. If not specified, inferred from ``data``. + row_names (list, optional): Row names. + column_names (list, optional): Column names. If not provided, + inferred from ``data``. metadata (dict): Additional metadata. Defaults to {}. """ @@ -127,12 +121,12 @@ def __init__( Args: data (Dict[str, Any], optional): Dictionary of column names as `keys` and - their values. All columns must have the same length. Defaults to None. - number_of_rows (int, optional): Number of rows. Defaults to None. - row_names (List, optional): Row index names. Defaults to None. - column_names (List[str], optional): Column names, if not provided, - they are automatically inferred from the data. Defaults to None. - metadata (dict, optional): Additional metadata. Defaults to None. + their values. All columns must have the same length. Defaults to {}. + number_of_rows (int, optional): Number of rows. If not specified, inferred from ``data``. + row_names (list, optional): Row names. + column_names (list, optional): Column names. If not provided, + inferred from ``data``. + metadata (dict): Additional metadata. Defaults to {}. """ super().__init__(data, number_of_rows, row_names, column_names, metadata) @@ -209,7 +203,8 @@ def ranges( ValueError: If ``return_type`` is not supported. Returns: - Union[DataFrame, Dict, "GenomicRanges", Any]: Genomic regions. 
+ Union[DataFrame, Dict, "GenomicRanges", Any]: Genomic regions in type specified by + ``return_type``. """ obj = { @@ -228,7 +223,9 @@ def ranges( try: return return_type(obj) except Exception as e: - raise ValueError(f"{return_type} is not supported, {str(e)}") + raise RuntimeError( + f"Cannot convert ranges to '{return_type}', {str(e)}" + ) @property def strand(self) -> List[str]: @@ -261,7 +258,7 @@ def seq_info(self) -> Optional[SeqInfo]: """Get sequence information, if available. Returns: - (SeqInfo, optional): List information, otherwise None. + (SeqInfo, optional): Sequence information, otherwise None. """ if self.metadata and "seq_info" in self.metadata: @@ -274,7 +271,8 @@ def seq_info(self, seq_info: Optional[SeqInfo]): """Set sequence information. Args: - (SeqInfo, optional): List information, otherwise None. + (SeqInfo): Sequence information. Can be None to remove sequence + information from the object. Raises: ValueError: If `seq_info` is not a `SeqInfo` class. @@ -367,7 +365,8 @@ def genome(self) -> Optional[str]: return None def granges(self) -> "GenomicRanges": - """Create a new ``GenomicRanges`` object with only ranges (``seqnames``, ``starts``, ``ends``, and ``strand``). + """Create a new ``GenomicRanges`` object with only ranges + (``seqnames``, ``starts``, ``ends``, and ``strand``). Returns: GenomicRanges: A new ``GenomicRanges`` with only ranges. @@ -410,7 +409,9 @@ def mcols(self, return_type: Optional[Callable] = None) -> Any: try: return return_type(new_data) except Exception as e: - raise ValueError(f"{return_type} not supported, {str(e)}") + raise RuntimeError( + f"Cannot convert metadata to '{return_type}', {str(e)}" + ) def __repr__(self) -> str: from io import StringIO @@ -507,7 +508,7 @@ def __getitem__(self, args: SlicerArgTypes) -> Union["GenomicRanges", dict, list gr[] Args: - args (SlicerArgTypes): A Tuple of slicer arguments to subset rows and + args (SlicerArgTypes): A Tuple of arguments to subset rows and columns. 
An element in ``args`` may be, - List of booleans, True to keep the row/column, False to remove. @@ -1037,7 +1038,7 @@ def _calc_gap_widths(self, ignore_strand: bool = False) -> List[int]: # inter range methods # TODO: implement dropEmptyRanges - # TODO: this is a very ineffecient implementation, can do a better job later. + # TODO: this is a very inefficient implementation, can do better. def reduce( self, with_reverse_map: bool = False, @@ -1536,7 +1537,7 @@ def coverage( ) if shift > 0: - cov = concatenate((shift_arr, cov)) + cov = combine(shift_arr, cov) if weight > 0: cov = cov * weight @@ -2219,18 +2220,18 @@ def invert_strand(self) -> "GenomicRanges": ) def combine(self, *other: "GenomicRanges") -> "GenomicRanges": - """Combine multiple GenomicRanges objects by row. + """Combine multiple `GenomicRanges` objects by row. Note: Fills missing columns with an array of `None`s. Args: - *other (GenomicRanges): GenomicRanges objects. + *other (GenomicRanges): Objects to combine. Raises: TypeError: If all objects are not of type GenomicRanges. Returns: - BiocFrame: A combined BiocFrame. + GenomicRanges: A combined GenomicRanges object. 
""" if not is_list_of_type(other, GenomicRanges): raise TypeError("All objects to combine must be GenomicRanges objects.") diff --git a/src/genomicranges/GenomicRangesList.py b/src/genomicranges/GenomicRangesList.py index c7b48d0..190a98a 100644 --- a/src/genomicranges/GenomicRangesList.py +++ b/src/genomicranges/GenomicRangesList.py @@ -1,17 +1,17 @@ from typing import Dict, List, Optional, Union from biocframe import BiocFrame -from pandas import DataFrame +from biocgenerics.combine import combine +from biocgenerics.combine_cols import combine_cols +from biocgenerics.combine_rows import combine_rows from .GenomicRanges import GenomicRanges -from .utils import is_list_of_type +from biocutils import is_list_of_type __author__ = "jkanche" __copyright__ = "jkanche" __license__ = "MIT" -BiocOrPandasFrame = Union[DataFrame, BiocFrame] - class GenomicRangesList: """Just as it sounds, a `GenomicRangesList` is a named-list like object. @@ -22,12 +22,13 @@ class GenomicRangesList: Currently, this class is limited in functionality, purely a read-only class with basic accessors. - Typical usage example: + Typical usage: - To construct a `GenomicRangesList` object, simply pass in a list of + To construct a **GenomicRangesList** object, simply pass in a list of :py:class:`genomicranges.GenomicRanges.GenomicRanges` objects and Optionally ``names``. - ```python + .. code-block:: python + gr1 = GenomicRanges( { "seqnames": ["chr1", "chr2", "chr1", "chr3"], @@ -49,7 +50,6 @@ class GenomicRangesList: ) grl = GenomicRangesList(ranges=[gr1, gr2], names=["gene1", "gene2"]) - ``` Additionally, you may also provide metadata about the genomic elements in the dictionary using mcols attribute. 
@@ -60,7 +60,7 @@ def __init__( ranges: Union[GenomicRanges, List[GenomicRanges]], range_lengths: Optional[List[int]] = None, names: Optional[List[str]] = None, - mcols: BiocOrPandasFrame = None, + mcols: Optional[BiocFrame] = None, metadata: Optional[dict] = None, ): """Initialize a `GenomicRangesList` object. @@ -74,7 +74,7 @@ def __init__( names (Optional[List[str]], optional): Names of the genomic elements. The length of this must match the number of genomic elements in ``ranges``. Defaults to None. - mcols (BiocOrPandasFrame, optional): Metadata about each genomic element. Defaults to None. + mcols (BiocFrame, optional): Metadata about each genomic element. Defaults to None. metadata (Optional[Dict], optional): Additional metadata. Defaults to None. """ self._validate(ranges) @@ -152,11 +152,12 @@ def names(self) -> Optional[list]: return self.groups @property - def mcols(self) -> Optional[BiocOrPandasFrame]: + def mcols(self) -> Optional[BiocFrame]: """Get metadata across all genomic elements. Returns: - (BiocOrPandasFrame, optional): Metadata frame or None if there is no element level metadata. + (BiocFrame, optional): Metadata :py:class:`~biocframe.BiocFrame.BiocFrame` or + None if there is no element level metadata. """ if "mcols" in self._data: return self._data["mcols"] @@ -298,7 +299,7 @@ def score(self) -> Dict[str, List[int]]: """ return self._generic_accessor("score") - def to_pandas(self) -> DataFrame: + def to_pandas(self) -> "DataFrame": """Coerce object to a :py:class:`pandas.DataFrame`. Returns: @@ -368,7 +369,7 @@ def __getitem__( new_metadata = self.metadata if isinstance(args, tuple): - # TODO: probably should figure out what to do with the second dimension later. + # TODO: should figure out what to do with the second dimension later. 
if len(args) >= 1: args = args[0] @@ -397,13 +398,13 @@ def __getitem__( if self.mcols is not None: new_mcols = self.mcols[args, :] else: - raise TypeError("`args` is not supported.") + raise TypeError("Arguments to slice is not a list of supported types.") return GenomicRangesList( new_ranges, new_range_lengths, new_names, new_mcols, new_metadata ) - raise TypeError("`args` must be either a string or an integer.") + raise TypeError("Arguments to slice is not supported.") def __len__(self) -> int: """Number of genomic elements. @@ -415,7 +416,7 @@ def __len__(self) -> int: @classmethod def empty(cls, n: int): - """Create an ``n``-length `GenomicRangesList` object. + """Create an empty ``n``-length `GenomicRangesList` object. Returns: same type as caller, in this case a `GenomicRangesList`. @@ -423,3 +424,39 @@ def empty(cls, n: int): _range_lengths = [0] * n return cls(ranges=GenomicRanges.empty(), range_lengths=_range_lengths) + + +@combine.register(GenomicRangesList) +def _combine_grl(*x: GenomicRangesList): + if not is_list_of_type(x, GenomicRangesList): + raise ValueError( + "All elements to `combine` must be `GenomicRangesList` objects." + ) + + raise NotImplementedError( + "`combine` is not implemented for `GenomicRangesList` objects." + ) + + +@combine_rows.register(GenomicRangesList) +def _combine_rows_grl(*x: GenomicRangesList): + if not is_list_of_type(x, GenomicRangesList): + raise ValueError( + "All elements to `combine_rows` must be `GenomicRangesList` objects." + ) + + raise NotImplementedError( + "`combine_rows` is not implemented for `GenomicRangesList` objects." + ) + + +@combine_cols.register(GenomicRangesList) +def _combine_cols_grl(*x: GenomicRangesList): + if not is_list_of_type(x, GenomicRangesList): + raise ValueError( + "All elements to `combine_cols` must be `GenomicRangesList` objects." + ) + + raise NotImplementedError( + "`combine_cols` is not implemented for `GenomicRangesList` objects." 
+ ) diff --git a/src/genomicranges/utils.py b/src/genomicranges/utils.py deleted file mode 100644 index 26a40e9..0000000 --- a/src/genomicranges/utils.py +++ /dev/null @@ -1,20 +0,0 @@ -from typing import Any, Callable - -__author__ = "jkanche, keviny2" -__copyright__ = "jkanche" -__license__ = "MIT" - - -def is_list_of_type(x: Any, target_type: Callable) -> bool: - """Checks if ``x`` is a list, and whether all elements of the list are of the same type. - - Args: - x (Any): Any list-like object. - target_type (callable): Type to check for, e.g. ``str``, ``int``. - - Returns: - bool: True if ``x`` is :py:class:`list` and all elements are of the same type. - """ - return isinstance(x, (list, tuple)) and all( - isinstance(item, target_type) for item in x - )