diff --git a/src/summarizedexperiment/BaseSE.py b/src/summarizedexperiment/BaseSE.py index 848f0f8..8dfd0de 100644 --- a/src/summarizedexperiment/BaseSE.py +++ b/src/summarizedexperiment/BaseSE.py @@ -101,7 +101,7 @@ def __init__( assays: Dict[str, Any], row_data: Optional[BiocFrame] = None, col_data: Optional[BiocFrame] = None, - metadata: Optional[Dict] = None, + metadata: Optional[dict] = None, validate: bool = True, ) -> None: """Initialize an instance of ``BaseSE``. @@ -243,7 +243,7 @@ def dims(self) -> Tuple[int, int]: def __repr__(self) -> str: pattern = ( - f"Class BaseSE with {self.shape[0]} features and {self.shape[1]} samples \n" + f"Class {type(self).__name__} with {self.shape[0]} features and {self.shape[1]} samples \n" f" assays: {', '.join(list(self.assays.keys()))} \n" f" row_data: {self._rows.names if self._rows is not None else None} \n" f" col_data: {self._cols.names if self._cols is not None else None}" @@ -711,9 +711,10 @@ def get_slice( columns: Optional[Union[str, int, bool, Sequence]], ) -> "BaseSE": """Alias for :py:attr:`~__getitem__`, for back-compatibility.""" - current_class_const = type(self) + slicer = self._generic_slice(rows=rows, columns=columns) + current_class_const = type(self) return current_class_const( assays=slicer.assays, row_data=slicer.rows, @@ -729,8 +730,9 @@ def __getitem__( Args: args: - A sequence or a scalar integer or string, specifying the - columns to retain based on their names or indices. + Integer indices, a boolean filter, or (if the current object is + named) names specifying the ranges to be extracted, see + :py:meth:`~biocutils.normalize_subscript.normalize_subscript`. Alternatively a tuple of length 1. The first entry specifies the rows to retain based on their names or indices. diff --git a/src/summarizedexperiment/RangedSummarizedExperiment.py b/src/summarizedexperiment/RangedSummarizedExperiment.py index bbcaa38..c54d0b0 100644 --- a/src/summarizedexperiment/RangedSummarizedExperiment.py +++ b/src/summarizedexperiment/RangedSummarizedExperiment.py @@ -1,10 +1,13 @@ -from typing import Dict, List, Literal, Optional, Union +from typing import Dict, List, Literal, Optional, Sequence, Union +from warnings import warn import numpy as np +from biocframe import BiocFrame from genomicranges import GenomicRanges, GenomicRangesList, SeqInfo +from ._frameutils import is_pandas from .SummarizedExperiment import SummarizedExperiment -from .types import MatrixTypes, SlicerArgTypes +from .types import MatrixTypes __author__ = "jkanche" __copyright__ = "jkanche" @@ -14,23 +17,23 @@ GRangesOrRangeSE = Union[GRangesOrGRangesList, "RangedSummarizedExperiment"] -# TODO: technically should be in _type_checks but fails due to circular imports. +# TODO: technically should be in _type_checks but causes circular imports. def _check_gr_or_rse(x: GRangesOrRangeSE): """Check if ``x`` is either a `RangedSummarizedExperiment` or `GenomicRanges`. Args: - x (Union[GenomicRanges, RangedSummarizedExperiment]): Object to check. + x: + Object to check. Raises: - TypeError: Object is not a `RangedSummarizedExperiment` or `GenomicRanges`. + TypeError: + Object is not a `RangedSummarizedExperiment` or `GenomicRanges`. """ - if not ( - isinstance(x, RangedSummarizedExperiment) - or isinstance(x, GenomicRanges) - or isinstance(x, GenomicRangesList) + if not isinstance( + x, (RangedSummarizedExperiment, GenomicRanges, GenomicRangesList) ): raise TypeError( - "object is not a `RangedSummarizedExperiment`, `GenomicRanges` or `GenomicRangesList`." + "'x' is not a `RangedSummarizedExperiment`, `GenomicRanges` or `GenomicRangesList`." ) @@ -38,10 +41,11 @@ def _access_granges(x: GRangesOrRangeSE) -> GenomicRanges: """Access ranges from the object. Args: - x (Union[GenomicRanges, RangedSummarizedExperiment]): Input object. + x: + Input object. Returns: - GenomicRanges: Genomic ranges object. + `GenomicRanges` object. """ qranges = x if isinstance(x, RangedSummarizedExperiment): @@ -50,154 +54,230 @@ def _access_granges(x: GRangesOrRangeSE) -> GenomicRanges: return qranges -class RangedSummarizedExperiment(SummarizedExperiment): - """RangedSummarizedExperiment class to represent genomic experiment data, genomic features as - :py:class:`~genomicranges.GenomicRanges.GenomicRanges` or - :py:class:`~genomicranges.GenomicRangesList.GenomicRangesList` sample data and any additional experimental metadata. - - The key difference between this and `SummarizedExperiment` is enforcing type for - feature information (`row_ranges`), must be a `GenomicRanges` object. This allows us to - provides new methods, to perform genomic range based operations over experimental data. +def _validate_rowranges(row_ranges, shape): + if not (isinstance(row_ranges, (GenomicRanges, GenomicRangesList))): + raise TypeError( + "`row_ranges` must be a `GenomicRanges` or `GenomicRangesList`" + f" , provided {type(row_ranges)}." + ) - Note: If ``row_ranges`` is empty, None or not a - :py:class:`genomicranges.GenomicRanges.GenomicRanges` object, use a - :py:class:`~summarizedexperiment.SummarizedExperiment.SummarizedExperiment` instead. + if len(row_ranges) != shape[0]: + raise ValueError( + "Number of features in `row_ranges` and number of rows in assays do not match." + ) - Attributes: - assays (Dict[str, MatrixTypes]): A dictionary containing matrices, with assay names as keys - and 2-dimensional matrices represented as either - :py:class:`~numpy.ndarray` or :py:class:`~scipy.sparse.spmatrix`. - Alternatively, you may use any 2-dimensional matrix that has the ``shape`` property and - implements the slice operation using the ``__getitem__`` dunder method. +def _sanitize_frame(frame, num_rows: int): + frame = frame if frame is not None else GenomicRanges.empty(num_rows) - All matrices in assays must be 2-dimensional and have the same shape - (number of rows, number of columns). + if is_pandas(frame): + frame = GenomicRanges.from_pandas(frame) - row_ranges (GRangesOrGRangesList, optional): Genomic features, must be the same length as - rows of the matrices in assays. + return frame - row_data (BiocOrPandasFrame, optional): Features, which must be of the same length as the rows of - the matrices in assays. Features can be either a :py:class:`~pandas.DataFrame` or - :py:class:`~biocframe.BiocFrame.BiocFrame`. Defaults to None. - col_data (BiocOrPandasFrame, optional): Sample data, which must be of the same length as the - columns of the matrices in assays. Sample Information can be either a :py:class:`~pandas.DataFrame` - or :py:class:`~biocframe.BiocFrame.BiocFrame`. Defaults to None. +class RangedSummarizedExperiment(SummarizedExperiment): + """RangedSummarizedExperiment class to represent genomic experiment data, genomic features as + :py:class:`~genomicranges.GenomicRanges.GenomicRanges` or + :py:class:`~genomicranges.GenomicRangesList.GenomicRangesList` sample data and any additional experimental metadata. - metadata (Dict, optional): Additional experimental metadata describing the methods. Defaults to None. + Note: If ``row_ranges`` is empty, None or not a + :py:class:`genomicranges.GenomicRanges.GenomicRanges` object, use a + :py:class:`~summarizedexperiment.SummarizedExperiment.SummarizedExperiment` instead. """ def __init__( self, assays: Dict[str, MatrixTypes], row_ranges: Optional[GRangesOrGRangesList] = None, - row_data: Optional[BiocOrPandasFrame] = None, - col_data: Optional[BiocOrPandasFrame] = None, - metadata: Optional[Dict] = None, + row_data: Optional[BiocFrame] = None, + col_data: Optional[BiocFrame] = None, + metadata: Optional[dict] = None, + validate: bool = True, ) -> None: """Initialize a `RangedSummarizedExperiment` (RSE) object. Args: - assays (Dict[str, MatrixTypes]): Dictionary - of matrices, with assay names as keys and 2-dimensional matrices represented as - :py:class:`~numpy.ndarray` or :py:class:`scipy.sparse.spmatrix` matrices. + assays: + A dictionary containing matrices, with assay names as keys + and 2-dimensional matrices represented as either + :py:class:`~numpy.ndarray` or :py:class:`~scipy.sparse.spmatrix`. - Alternatively, you may use any 2-dimensional matrix that contains the property ``shape`` - and implements the slice operation using the ``__getitem__`` dunder method. + Alternatively, you may use any 2-dimensional matrix that has + the ``shape`` property and implements the slice operation + using the ``__getitem__`` dunder method. - All matrices in ``assays`` must be 2-dimensional and have the same - shape (number of rows, number of columns). + All matrices in assays must be 2-dimensional and have the + same shape (number of rows, number of columns). - row_ranges (GRangesOrGRangesList, optional): Genomic features, must be the same length as - rows of the matrices in assays. + row_ranges: + Genomic features, must be the same length as the number of rows of + the matrices in assays. - row_data (BiocOrPandasFrame, optional): Features, must be the same length as - rows of the matrices in assays. + row_data: + Features, must be the same length as the number of rows of + the matrices in assays. - Features may be either a :py:class:`~pandas.DataFrame` or - :py:class:`~biocframe.BiocFrame.BiocFrame`. + Feature information is coerced to a + :py:class:`~biocframe.BiocFrame.BiocFrame`. Defaults to None. - Defaults to None. - col_data (BiocOrPandasFrame, optional): Sample data, must be - the same length as columns of the matrices in assays. + col_data: + Sample data, must be the same length as the number of + columns of the matrices in assays. - Sample Information may be either a :py:class:`~pandas.DataFrame` or - :py:class:`~biocframe.BiocFrame.BiocFrame`. + Sample information is coerced to a + :py:class:`~biocframe.BiocFrame.BiocFrame`. Defaults to None. + metadata: + Additional experimental metadata describing the methods. Defaults to None. - metadata (Dict, optional): Additional experimental metadata describing the - methods. Defaults to None. + + validate: + Internal use only. """ super().__init__( - assays, row_data=row_data, col_data=col_data, metadata=metadata + assays, + row_data=row_data, + col_data=col_data, + metadata=metadata, + validate=validate, ) - if row_ranges is None: - row_ranges = GenomicRangesList.empty(n=self._shape[0]) - - self._validate_row_ranges(row_ranges) + self._row_ranges = _sanitize_frame(row_ranges, self._shape[0]) self._row_ranges = row_ranges - def _validate_row_ranges(self, row_ranges: GRangesOrGRangesList): - """Internal method to validate feature information (``row_ranges``). + if validate: + _validate_rowranges(row_ranges, self._shape) - Args: - rows (GRangesOrGRangesList): Genomic features. + ######################### + ######>> Copying <<###### + ######################### - Raises: - ValueError: If number of rows does not match the number of features in - `row_ranges` & `assays`. - TypeError: If `row_ranges` is not a `GenomicRanges` or `GenomicRangesList`. + def __deepcopy__(self, memo=None, _nil=[]): """ - if not ( - isinstance(row_ranges, GenomicRanges) - or isinstance(row_ranges, GenomicRangesList) - ): - raise TypeError( - "`row_ranges` must be a `GenomicRanges` or `GenomicRangesList`" - f" , provided {type(row_ranges)}." - ) - - if len(row_ranges) != self._shape[0]: - raise ValueError( - "Number of features and number of rows in assays do not match." - ) + Returns: + A deep copy of the current ``RangedSummarizedExperiment``. + """ + from copy import deepcopy + + _assays_copy = deepcopy(self._assays) + _rows_copy = deepcopy(self._rows) + _rowranges_copy = deepcopy(self._row_ranges) + _cols_copy = deepcopy(self._cols) + _metadata_copy = deepcopy(self.metadata) + + current_class_const = type(self) + return current_class_const( + assays=_assays_copy, + row_ranges=_rowranges_copy, + row_data=_rows_copy, + col_data=_cols_copy, + metadata=_metadata_copy, + ) - @property - def row_ranges(self) -> GRangesOrGRangesList: - """Get features. + def __copy__(self): + """ + Returns: + A shallow copy of the current ``RangedSummarizedExperiment``. + """ + current_class_const = type(self) + return current_class_const( + assays=self._assays, + row_ranges=self._row_ranges, + row_data=self._rows, + col_data=self._cols, + metadata=self._metadata, + ) + + def copy(self): + """Alias for :py:meth:`~__copy__`.""" + return self.__copy__() + + ########################## + ######>> Printing <<###### + ########################## + + def __repr__(self) -> str: + pattern = ( + f"Class {type(self).__name__} with {self.shape[0]} features and {self.shape[1]} " + "samples \n" + f" assays: {list(self.assays.keys())} \n" + f" row_data: {self._rows.columns if self._rows is not None else None} \n" + f" col_data: {self._cols.columns if self._cols is not None else None}" + ) + return pattern + + ############################ + ######>> row_ranges <<###### + ############################ + + def get_rowranges(self) -> GRangesOrGRangesList: + """Get genomic feature information. Returns: - GRangesOrGRangesList: Genomic features. + Genomic feature information. """ return self._row_ranges - @row_ranges.setter - def row_ranges(self, ranges: GRangesOrGRangesList) -> None: - """Set features. + def set_rowranges( + self, row_ranges: Optional[GRangesOrGRangesList], in_place: bool = False + ): + """Set new genomic features Args: - ranges (GRangesOrGRangesList): Features to update. + row_ranges: + Genomic features, must be the same length as the number of rows of + the matrices in assays. + + in_place: + Whether to modify the ``RangeSummarizedExperiment`` in place. + + Returns: + A modified ``RangeSummarizedExperiment`` object, either as a copy of the original + or as a reference to the (in-place-modified) original. """ - self._validate_row_ranges(ranges) - self._row_ranges = ranges + rows = _sanitize_frame(row_ranges, self._shape[0]) + _validate_rowranges(rows, self._shape) + + output = self._define_output(in_place) + output._rows = rows + return output + + @property + def row_ranges(self) -> GRangesOrGRangesList: + """Alias for :py:meth:`~get_rowranges`.""" + return self.get_rowranges() + + @row_ranges.setter + def row_ranges(self, row_ranges: GRangesOrGRangesList) -> None: + """Alias for :py:meth:`~set_rowranges`.""" + warn( + "Setting property 'row_ranges' is an in-place operation, use 'set_rowranges' instead", + UserWarning, + ) + return self.set_rowranges(row_ranges=row_ranges, in_place=True) + + ################################# + ######>> range accessors <<###### + ################################# @property - def end(self) -> List[int]: - """Get genomic end positions for each feature or row in experimental data. + def end(self) -> np.ndarray: + """Get genomic end positions for each feature + or row in experimental data. Returns: - List[int]: List of end positions. + A :py:class:`numpy.ndarray` of end positions. """ return self.row_ranges.end @property - def start(self) -> List[int]: - """Get genomic start positions for each feature or row in experimental data. + def start(self) -> np.ndarray: + """Get genomic start positions for each feature + or row in experimental data. Returns: - List[int]: List of start positions. + A :py:class:`numpy.ndarray` of start positions. """ return self.row_ranges.start @@ -206,65 +286,75 @@ def seqnames(self) -> List[str]: """Get sequence or chromosome names. Returns: - List[str]: List of all chromosome names. + List of all chromosome names. """ return self.row_ranges.seqnames @property - def strand(self) -> List[str]: + def strand(self) -> np.ndarray: """Get strand information. Returns: - List[str]: List of strand information across all features. + A :py:class:`numpy.ndarray` of strand information. """ return self.row_ranges.strand @property - def width(self) -> List[int]: + def width(self) -> np.ndarray: """Get widths of ``row_ranges``. Returns: - List[int]: List of widths for each interval. + A :py:class:`numpy.ndarray` of widths for each interval. """ return self.row_ranges.width @property - def seq_info(self) -> Optional[SeqInfo]: + def seq_info(self) -> SeqInfo: """Get sequence information object (if available). Returns: - (SeqInfo, optional): Sequence information if available, otherwise None. + Sequence information. """ return self.row_ranges.seq_info + ########################## + ######>> slicers <<####### + ########################## + def __getitem__( self, - args: SlicerArgTypes, + args: Union[int, str, Sequence, tuple], ) -> "RangedSummarizedExperiment": """Subset a `RangedSummarizedExperiment`. Args: - args (SlicerArgTypes): Indices or names to slice. The tuple contains - slices along dimensions (rows, cols). + args: + Integer indices, a boolean filter, or (if the current object is + named) names specifying the ranges to be extracted, see + :py:meth:`~biocutils.normalize_subscript.normalize_subscript`. - Each element in the tuple, might be either a integer vector (integer positions), - boolean vector or :py:class:`~slice` object. + Alternatively a tuple of length 1. The first entry specifies + the rows to retain based on their names or indices. - Defaults to None. + Alternatively a tuple of length 2. The first entry specifies + the rows to retain, while the second entry specifies the + columns to retain, based on their names or indices. Raises: - ValueError: If too many or too few slices are provided. + ValueError: + If too many or too few slices are provided. Returns: - RangedSummarizedExperiment: Sliced `RangedSummarizedExperiment` object. + Sliced `RangedSummarizedExperiment` object. """ - sliced_objs = self._slice(args) + sliced_objs = self._generic_slice(args) new_row_ranges = None - if sliced_objs.row_indices is not None and self.row_ranges is not None: + if sliced_objs.row_indices != slice(None): new_row_ranges = self.row_ranges[sliced_objs.row_indices] - return RangedSummarizedExperiment( + current_class_const = type(self) + return current_class_const( assays=sliced_objs.assays, row_ranges=new_row_ranges, row_data=sliced_objs.row_data, @@ -272,15 +362,9 @@ def __getitem__( metadata=self.metadata, ) - def __repr__(self) -> str: - pattern = ( - f"Class RangedSummarizedExperiment with {self.shape[0]} features and {self.shape[1]} " - "samples \n" - f" assays: {list(self.assays.keys())} \n" - f" row_data: {self.row_data.columns if self.row_data is not None else None} \n" - f" col_data: {self.col_data.columns if self.col_data is not None else None}" - ) - return pattern + ############################ + ######>> range ops <<####### + ############################ def coverage( self, shift: int = 0, width: Optional[int] = None, weight: int = 1 @@ -288,142 +372,143 @@ def coverage( """Calculate coverage for each chromosome. Args: - shift (int, optional): Shifts all genomic positions by specified number - of positions. Defaults to 0. - width (int, optional): Restrict the width of all chromosomes. - Defaults to None. - weight (int, optional): Weight to use. Defaults to 1. + shift: + Shift all genomic positions. Defaults to 0. + + width: + Restrict the width of all + chromosomes. Defaults to None. + + weight: + Weight to use. Defaults to 1. Returns: - Dict[str, np.ndarray]: A dictionary containing chromosome names - as keys and the coverage vector as value. + A dictionary with chromosome names as keys and the + coverage vector as value. """ return self.row_ranges.coverage(shift=shift, width=width, weight=weight) def nearest( self, query: GRangesOrRangeSE, + select: Literal["all", "arbitrary"] = "all", ignore_strand: bool = False, ) -> Optional[List[Optional[int]]]: - """Search nearest positions, both upstream and downstream that overlap with each genomic interval in ``query``. + """Search nearest positions both upstream and downstream that overlap with each range in ``query``. Args: - query (GRangesOrRangeSE): Query intervals to find nearest positions. + query: + Query intervals to find nearest positions. + ``query`` may be a :py:class:`~genomicranges.GenomicRanges.GenomicRanges` or a :py:class:`~summarizedexperiment.RangedSummarizedExperiment.RangedSummarizedExperiment` object. - ignore_strand (bool, optional): Whether to ignore strand during looksups. - Defaults to False. + select: + Determine what hit to choose when there are + multiple hits for an interval in ``query``. + + ignore_strand: + Whether to ignore strand. Defaults to False. Raises: - TypeError: If ``query`` is not a ``RangedSummarizedExperiment`` + If ``query`` is not a ``RangedSummarizedExperiment`` or ``GenomicRanges``. Returns: - (List[Optional[int]], optional): List of possible `hit` indices - for each interval in `query`, Otherwise returns None. + A list with the same length as ``query``, + containing hits to nearest indices. """ _check_gr_or_rse(query) qranges = _access_granges(query) - res = self.row_ranges.nearest(query=qranges, ignore_strand=ignore_strand) - return res.column("hits") + res = self.row_ranges.nearest( + query=qranges, select=select, ignore_strand=ignore_strand + ) + return res def precede( self, query: GRangesOrRangeSE, + select: Literal["all", "arbitrary"] = "all", ignore_strand: bool = False, ) -> Optional[List[Optional[int]]]: - """Search nearest positions, only downstream that overlap with each range in ``query``. + """Search nearest positions only downstream that overlap with each range in ``query``. Args: - query (GRangesOrRangeSE): Query intervals to find nearest positions. + query: + Query intervals to find nearest positions. + ``query`` may be a :py:class:`~genomicranges.GenomicRanges.GenomicRanges` or a :py:class:`~summarizedexperiment.RangedSummarizedExperiment.RangedSummarizedExperiment` object. - ignore_strand (bool, optional): Whether to ignore strand. Defaults to False. + + select: + Determine what hit to choose when there are + multiple hits for an interval in ``query``. + + ignore_strand: + Whether to ignore strand. Defaults to False. Raises: - TypeError: If ``query`` is not a ``RangedSummarizedExperiment`` or + If ``query`` is not a ``RangedSummarizedExperiment`` or ``GenomicRanges``. Returns: - (List[Optional[int]], optional): List of possible hit indices - for each interval in ``query``. If there are no hits, returns None. + A List with the same length as ``query``, + containing hits to nearest indices. """ _check_gr_or_rse(query) qranges = _access_granges(query) - res = self.row_ranges.precede(query=qranges, ignore_strand=ignore_strand) - return res.column("hits") + res = self.row_ranges.precede( + query=qranges, select=select, ignore_strand=ignore_strand + ) + return res def follow( self, query: GRangesOrRangeSE, + select: Literal["all", "arbitrary"] = "all", ignore_strand: bool = False, ) -> Optional[List[Optional[int]]]: - """Search nearest positions, only upstream that overlap with the each range in ``query``. + """Search nearest positions only upstream that overlap with each range in ``query``. Args: - query (GRangesOrRangeSE): Query intervals to find nearest positions. + query: + Query intervals to find nearest positions. + ``query`` may be a :py:class:`~genomicranges.GenomicRanges.GenomicRanges` or a :py:class:`~summarizedexperiment.RangedSummarizedExperiment.RangedSummarizedExperiment` object. - ignore_strand (bool, optional): Whether to ignore strand. Defaults to False. - - Raises: - TypeError: If ``query`` is not a ``RangedSummarizedExperiment`` or - ``GenomicRanges``. - Returns: - (List[Optional[int]], optional): List of possible hit indices - for each interval in ``query``. If there are no hits, returns None. - """ - _check_gr_or_rse(query) - - qranges = _access_granges(query) + select: + Determine what hit to choose when there are + multiple hits for an interval in ``query``. - res = self.row_ranges.follow(query=qranges, ignore_strand=ignore_strand) - return res.column("hits") - - def distance_to_nearest( - self, - query: GRangesOrRangeSE, - ignore_strand: bool = False, - ) -> Optional[List[Optional[int]]]: - """Search nearest positions only downstream that overlap with the each genomics interval in ``query``. - - Technically same as - :py:meth:`~summarizedexperiment.RangedSummarizedExperiment.RangedSummarizedExperiment.nearest` - since we also return `distance` to the nearest match. - - Args: - query (GRangesOrRangeSE): Query intervals to find nearest positions. - ``query`` may be a :py:class:`~genomicranges.GenomicRanges.GenomicRanges` or a - :py:class:`~summarizedexperiment.RangedSummarizedExperiment.RangedSummarizedExperiment` - object. - ignore_strand (bool, optional): Whether to ignore strand. Defaults to False. + ignore_strand: + Whether to ignore strand. Defaults to False. Raises: - TypeError: If ``query`` is not a ``RangedSummarizedExperiment`` or ``GenomicRanges``. + If ``query`` is not a ``RangedSummarizedExperiment`` or + ``GenomicRanges``. Returns: - (List[Optional[int]], optional): List of possible hit indices - for each interval in ``query``. If there are no hits, returns None. + A List with the same length as ``query``, + containing hits to nearest indices. """ _check_gr_or_rse(query) qranges = _access_granges(query) - res = self.row_ranges.distance_to_nearest( - query=qranges, ignore_strand=ignore_strand + res = self.row_ranges.follow( + query=qranges, select=select, ignore_strand=ignore_strand ) - return res.column("distance") + return res def flank( self, @@ -431,211 +516,268 @@ def flank( start: bool = True, both: bool = False, ignore_strand: bool = False, + in_place: bool = False, ) -> "RangedSummarizedExperiment": - """Generates flanking ranges for each range in - :py:attr:`~summarizedexperiment.RangedSummarizedExperiment.RangedSummarizedExperiment.row_ranges`. + """Compute flanking ranges for each range. Refer to either :py:meth:`~genomicranges.GenomicRanges.GenomicRanges.flank` or the - Bioconductor documentation for what it this method does. + Bioconductor documentation for more details. Args: - width (int): Width to flank by. - start (bool, optional): Whether to only flank starts positions. Defaults to True. - both (bool, optional): Whether to flank both starts and ends positions. - Defaults to False. - ignore_strand (bool, optional): Whether to ignore strand. Defaults to False. + width: + Width to flank by. May be negative. + + start: + Whether to only flank starts. Defaults to True. + + both: + Whether to flank both starts and ends. Defaults to False. + + ignore_strand: + Whether to ignore strands. Defaults to False. + + in_place: + Whether to modify the ``GenomicRanges`` object in place. Returns: - RangedSummarizedExperiment: A new `RangedSummarizedExperiment` object - with the flanked ranges. + A new `RangedSummarizedExperiment` object with the flanked ranges, + either as a copy of the original or as a reference to the + (in-place-modified) original. """ new_ranges = self.row_ranges.flank( - width=width, start=start, both=both, ignore_strand=ignore_strand + width=width, + start=start, + both=both, + ignore_strand=ignore_strand, + in_place=False, ) - return RangedSummarizedExperiment( - assays=self.assays, - row_ranges=new_ranges, - col_data=self.col_data, - metadata=self.metadata, - ) + output = self._define_output(in_place=in_place) + output._row_ranges = new_ranges + return output def resize( self, - width: int, + width: Union[int, List[int], np.ndarray], fix: Literal["start", "end", "center"] = "start", ignore_strand: bool = False, + in_place: bool = False, ) -> "RangedSummarizedExperiment": - """Resize :py:attr:`~summarizedexperiment.RangedSummarizedExperiment.RangedSummarizedExperiment.row_ranges` to - the specified ``width`` where either the ``start``, ``end``, or ``center`` is used as an anchor. + """Resize ranges to the specified ``width`` where either the ``start``, ``end``, or ``center`` is used as an + anchor. Args: - width (int): Width to resize by. - fix (Literal["start", "end", "center"], optional): Fix positions by "start", "end" - or "center". Defaults to "start". - ignore_strand (bool, optional): Whether to ignore strand. Defaults to False. + width: + Width to resize, cannot be negative! + + fix: + Fix positions by "start", "end", or "center". + Defaults to "start". + + ignore_strand: + Whether to ignore strands. Defaults to False. + + in_place: + Whether to modify the ``GenomicRanges`` object in place. Raises: - ValueError: If ``fix`` is neither ``start``, ``center``, or ``end``. + ValueError: + If ``fix`` is neither ``start``, ``center``, or ``end``. Returns: - RangedSummarizedExperiment: A new `RangedSummarizedExperiment` object - with the resized ranges. + A new `RangedSummarizedExperiment` object with the resized ranges, + either as a copy of the original or as a reference to the + (in-place-modified) original. """ new_ranges = self.row_ranges.resize( - width=width, fix=fix, ignore_strand=ignore_strand + width=width, fix=fix, ignore_strand=ignore_strand, in_place=False ) - return RangedSummarizedExperiment( - assays=self.assays, - row_ranges=new_ranges, - col_data=self.col_data, - metadata=self.metadata, - ) + output = self._define_output(in_place=in_place) + output._row_ranges = new_ranges + return output - def shift(self, shift: int = 0) -> "RangedSummarizedExperiment": - """Shift all ranges in - :py:attr:`~summarizedexperiment.RangedSummarizedExperiment.RangedSummarizedExperiment.row_ranges` by the - specified ``shift`` parameter. + def shift( + self, shift: Union[int, List[int], np.ndarray] = 0, in_place: bool = False + ) -> "RangedSummarizedExperiment": + """Shift all intervals. ``shift`` may be be negative. Args: - shift (int, optional): Shift interval. Defaults to 0. + shift: + Shift interval. If shift is 0, the current + object is returned. Defaults to 0. + + in_place: + Whether to modify the ``GenomicRanges`` object in place. Returns: - RangedSummarizedExperiment: A new `RangedSummarizedExperiment` - object with the shifted ranges. + A new `RangedSummarizedExperiment` object with the shifted ranges, + either as a copy of the original or as a reference to the + (in-place-modified) original. """ - new_ranges = self.row_ranges.shift(shift=shift) + new_ranges = self.row_ranges.shift(shift=shift, in_place=False) - return RangedSummarizedExperiment( - assays=self.assays, - row_ranges=new_ranges, - col_data=self.col_data, - metadata=self.metadata, - ) + output = self._define_output(in_place=in_place) + output._row_ranges = new_ranges + return output def promoters( - self, upstream: int = 2000, downstream: int = 200 + self, upstream: int = 2000, downstream: int = 200, in_place: bool = False ) -> "RangedSummarizedExperiment": - """Extend :py:attr:`~summarizedexperiment.RangedSummarizedExperiment.RangedSummarizedExperiment.row_ranges` to - promoter regions. + """Extend intervals to promoter regions. Args: - upstream (int, optional): Number of positions to extend in the 5' direction. + upstream: + Number of positions to extend in the 5' direction. Defaults to 2000. - downstream (int, optional): Number of positions to extend in the 3' direction. + + downstream: + Number of positions to extend in the 3' direction. Defaults to 200. + in_place: + Whether to modify the ``GenomicRanges`` object in place. + Returns: - RangedSummarizedExperiment: A new `RangedSummarizedExperiment` - object with the extended ranges for promoter regions. + A new `RangedSummarizedExperiment` object with the extended ranges for + promoter regions, either as a copy of the original or as a reference to the + (in-place-modified) original. """ - new_ranges = self.row_ranges.promoters(upstream=upstream, downstream=downstream) - - return RangedSummarizedExperiment( - assays=self.assays, - row_ranges=new_ranges, - col_data=self.col_data, - metadata=self.metadata, + new_ranges = self.row_ranges.promoters( + upstream=upstream, downstream=downstream, in_place=False ) + output = self._define_output(in_place=in_place) + output._row_ranges = new_ranges + return output + def restrict( self, - start: Optional[int] = None, - end: Optional[int] = None, + start: Optional[Union[int, List[int], np.ndarray]] = None, + end: Optional[Union[int, List[int], np.ndarray]] = None, keep_all_ranges: bool = False, + in_place: bool = False, ) -> "RangedSummarizedExperiment": - """Restrict :py:attr:`~summarizedexperiment.RangedSummarizedExperiment.RangedSummarizedExperiment.row_ranges` to - a given start and end positions. + """Restrict ranges to a given start and end positions. Args: - start (int, optional): Start position. Defaults to None. - end (int, optional): End position. Defaults to None. - keep_all_ranges (bool, optional): Whether to keep intervals that do not - overlap with start and end. Defaults to False. + start: + Start position. Defaults to None. + + end: + End position. Defaults to None. + + keep_all_ranges: + Whether to keep intervals that do not overlap with start and end. + Defaults to False. + + in_place: + Whether to modify the ``GenomicRanges`` object in place. Returns: - RangedSummarizedExperiment: A new `RangedSummarizedExperiment` - object with restricted intervals. + A new `RangedSummarizedExperiment` object with restricted intervals, + either as a copy of the original or as a reference to the + (in-place-modified) original. """ new_ranges = self.row_ranges.restrict( - start=start, end=end, keep_all_ranges=keep_all_ranges + start=start, end=end, keep_all_ranges=keep_all_ranges, in_place=False ) - return RangedSummarizedExperiment( - assays=self.assays, - row_ranges=new_ranges, - col_data=self.col_data, - metadata=self.metadata, - ) + output = self._define_output(in_place=in_place) + output._row_ranges = new_ranges + return output def narrow( self, - start: Optional[int] = None, - width: Optional[int] = None, - end: Optional[int] = None, + start: Optional[Union[int, List[int], np.ndarray]] = None, + width: Optional[Union[int, List[int], np.ndarray]] = None, + end: Optional[Union[int, List[int], np.ndarray]] = None, + in_place: bool = False, ) -> "RangedSummarizedExperiment": - """Narrow :py:attr:`~summarizedexperiment.RangedSummarizedExperiment.RangedSummarizedExperiment.row_ranges`. + """Narrow genomic positions by provided ``start``, ``width`` and ``end`` parameters. Important: these parameters are relative shift in positions for each range. Args: - start (int, optional): Relative start position. Defaults to None. - width (int, optional): Relative end position. Defaults to None. - end (int, optional): Relative width of the interval. Defaults to None. + start: + Relative start position. Defaults to None. + + width: + Relative end position. Defaults to None. + + end: + Relative width of the interval. Defaults to None. + + in_place: + Whether to modify the ``GenomicRanges`` object in place. Raises: - ValueError: When parameters were set incorrectly or row_ranges is empty + When parameters were set incorrectly or row_ranges is empty Returns: - RangedSummarizedExperiment: A new `RangedSummarizedExperiment` - object with narrow positions. + A new `RangedSummarizedExperiment` object with narrow positions, + either as a copy of the original or as a reference to the + (in-place-modified) original. """ - new_ranges = self.row_ranges.narrow(start=start, width=width, end=end) - - return RangedSummarizedExperiment( - assays=self.assays, - row_ranges=new_ranges, - col_data=self.col_data, - metadata=self.metadata, + new_ranges = self.row_ranges.narrow( + start=start, width=width, end=end, in_place=False ) + output = self._define_output(in_place=in_place) + output._row_ranges = new_ranges + return output + def find_overlaps( self, query: GRangesOrRangeSE, query_type: str = "any", + select: Literal["all", "first", "last", "arbitrary"] = "all", max_gap: int = -1, min_overlap: int = 1, ignore_strand: bool = False, - ) -> Optional["RangedSummarizedExperiment"]: + ) -> List[List[int]]: """Find overlaps between subject (self) and query ranges. Args: - query (GRangesOrRangeSE): Query intervals to find overlapping positions. + query: + Query intervals to find nearest positions. + ``query`` may be a :py:class:`~genomicranges.GenomicRanges.GenomicRanges` or a :py:class:`~summarizedexperiment.RangedSummarizedExperiment.RangedSummarizedExperiment` object. - query_type (str, optional): Overlap query type, must be one of - - "any": any overlap is good - - "start": overlap at the beginning of the intervals - - "end": must overlap at the end of the intervals - - "within": fully contain the query interval + query_type: + Overlap query type, must be one of + + - "any": Any overlap is good + - "start": Overlap at the beginning of the intervals + - "end": Must overlap at the end of the intervals + - "within": Fully contain the query interval Defaults to "any". - max_gap (int, optional): Maximum gap allowed in the overlap. + + select: + Determine what hit to choose when + there are multiple hits for an interval in ``subject``. + + max_gap: + Maximum gap allowed in the overlap. Defaults to -1 (no gap allowed). - min_overlap (int, optional): Minimum overlap with query. Defaults to 1. - ignore_strand (bool, optional): Whether to ignore strand.. Defaults to False. + + min_overlap: + Minimum overlap with query. Defaults to 1. + + ignore_strand: + Whether to ignore strands. Defaults to False. Raises: TypeError: If query is not a `RangedSummarizedExperiment` or `GenomicRanges`. Returns: - ("RangedSummarizedExperiment", optional): A `RangedSummarizedExperiment` object - with the same length as query, containing hits to overlapping indices. + A list with the same length as ``query``, + containing hits to overlapping indices. """ _check_gr_or_rse(query) @@ -644,6 +786,7 @@ def find_overlaps( return self.row_ranges.find_overlaps( query=qranges, query_type=query_type, + select=select, max_gap=max_gap, min_overlap=min_overlap, ignore_strand=ignore_strand, @@ -656,32 +799,43 @@ def subset_by_overlaps( max_gap: int = -1, min_overlap: int = 1, ignore_strand: bool = False, - ) -> Optional["RangedSummarizedExperiment"]: + ) -> "RangedSummarizedExperiment": """Subset a `RangedSummarizedExperiment` by feature overlaps. Args: - query (GRangesOrRangeSE): Query intervals to subset by overlapping positions. + query: + Query `GenomicRanges`. + ``query`` may be a :py:class:`~genomicranges.GenomicRanges.GenomicRanges` or a :py:class:`~summarizedexperiment.RangedSummarizedExperiment.RangedSummarizedExperiment` object. - query_type (str, optional): Overlap query type, must be one of - - "any": any overlap is good - - "start": overlap at the beginning of the intervals - - "end": must overlap at the end of the intervals - - "within": fully contain the query interval + query_type: + Overlap query type, must be one of + + - "any": Any overlap is good + - "start": Overlap at the beginning of the intervals + - "end": Must overlap at the end of the intervals + - "within": Fully contain the query interval Defaults to "any". - max_gap (int, optional): Maximum gap allowed in the overlap. + + max_gap: + Maximum gap allowed in the overlap. Defaults to -1 (no gap allowed). - min_overlap (int, optional): Minimum overlap with query. Defaults to 1. - ignore_strand (bool, optional): Whether to ignore strand.. Defaults to False. + + min_overlap: + Minimum overlap with query. Defaults to 1. + + ignore_strand: + Whether to ignore strands. Defaults to False. Raises: - TypeError: If query is not a `RangedSummarizedExperiment` or `GenomicRanges`. + TypeError: + If query is not a `RangedSummarizedExperiment` or `GenomicRanges`. Returns: - Optional["RangedSummarizedExperiment"]: A new `RangedSummarizedExperiment` + A new `RangedSummarizedExperiment` object. None if there are no indices to slice. """ _check_gr_or_rse(query) @@ -696,47 +850,41 @@ def subset_by_overlaps( ignore_strand=ignore_strand, ) - if result is None: - return None - - hits = result.column("hits") - hit_counts = [len(ht) for ht in hits] - indices = [idx for idx in range(len(hit_counts)) if hit_counts[idx] > 0] + import itertools - if indices is None: - return None + indices = list(itertools.chain.from_iterable(result)) - return self[indices, :] + return self[indices,] - def order(self, decreasing: bool = False) -> List[int]: + def order(self, decreasing: bool = False) -> np.ndarray: """Get the order of indices to sort. Args: - decreasing (bool, optional): Whether to sort in descending order. Defaults to False. + decreasing: + Whether to sort in descending order. Defaults to False. Returns: - List[int]: Order of indices. + NumPy vector containing index positions in the sorted order. """ return self.row_ranges.order(decreasing=decreasing) def sort( - self, decreasing: bool = False, ignore_strand: bool = False + self, decreasing: bool = False, in_place: bool = False ) -> "RangedSummarizedExperiment": - """Sort `RangedSummarizedExperiment` by - :py:attr:`~summarizedexperiment.RangedSummarizedExperiment.RangedSummarizedExperiment.row_ranges`. + """Sort by ranges. Args: - decreasing (bool, optional): Whether to sort in decreasing order. Defaults to False. - ignore_strand (bool, optional): Whether to ignore strand. Defaults to False. + decreasing: + Whether to sort in descending order. Defaults to False. + + in_place: + Whether to modify the object in place. Defaults to False. Returns: - "RangedSummarizedExperiment": A new sorted + A new sorted `RangedSummarizedExperiment` object. """ - order = self.row_ranges._generic_order(ignore_strand=ignore_strand) - - if decreasing: - order = order[::-1] + _order = self.row_ranges.order(decreasing=decreasing) - new_order = order.to_list() - return self[new_order, :] + output = self._define_output(in_place=in_place) + return output[_order,] diff --git a/src/summarizedexperiment/SummarizedExperiment.py b/src/summarizedexperiment/SummarizedExperiment.py index e0d38e1..edc3b79 100644 --- a/src/summarizedexperiment/SummarizedExperiment.py +++ b/src/summarizedexperiment/SummarizedExperiment.py @@ -24,7 +24,7 @@ def __init__( assays: Dict[str, MatrixTypes], row_data: Optional[BiocFrame] = None, col_data: Optional[BiocFrame] = None, - metadata: Optional[Dict] = None, + metadata: Optional[dict] = None, validate: bool = True, ) -> None: """Initialize a Summarized Experiment (SE). @@ -76,42 +76,3 @@ def __init__( metadata=metadata, validate=validate, ) - - # def __getitem__( - # self, - # args: Union[int, str, Sequence, tuple], - # ) -> "SummarizedExperiment": - # """Subset a `SummarizedExperiment`. - - # Args: - # args: - # Indices or names to slice. The tuple contains - # slices along dimensions (rows, cols). - - # Each element in the tuple, might be either a integer vector (integer positions), - # boolean vector or :py:class:`~slice` object. Defaults to None. - - # Raises: - # ValueError: - # If too many or too few slices provided. - - # Returns: - # Sliced `SummarizedExperiment` object. - # """ - # sliced_objs = self._generic_slice(args) - # return SummarizedExperiment( - # assays=sliced_objs.assays, - # row_data=sliced_objs.rows, - # col_data=sliced_objs.cols, - # metadata=self.metadata, - # ) - - def __repr__(self) -> str: - pattern = ( - f"Class SummarizedExperiment with {self.shape[0]} features and {self.shape[1]} " - "samples \n" - f" assays: {list(self.assays.keys())} \n" - f" row_data: {self.row_data.columns if self.row_data is not None else None} \n" - f" col_data: {self.col_data.columns if self.col_data is not None else None}" - ) - return pattern