diff --git a/setup.cfg b/setup.cfg index 03d6d32..8eda853 100644 --- a/setup.cfg +++ b/setup.cfg @@ -49,8 +49,8 @@ package_dir = # For more information, check out https://semver.org/. install_requires = importlib-metadata; python_version<"3.8" - genomicranges>=0.3.6 - scipy + genomicranges>=0.3.7 + biocgenerics [options.packages.find] where = src diff --git a/src/summarizedexperiment/BaseSE.py b/src/summarizedexperiment/BaseSE.py index 99c0cba..404beb9 100644 --- a/src/summarizedexperiment/BaseSE.py +++ b/src/summarizedexperiment/BaseSE.py @@ -1,23 +1,21 @@ -import warnings from collections import OrderedDict -from typing import Dict, List, Optional, Tuple, Union +from typing import Any, Dict, List, Optional, Tuple, Union +from warnings import warn -from biocframe import BiocFrame +from biocframe import BiocFrame, from_pandas +from biocgenerics import colnames, rownames, set_colnames, set_rownames +from biocutils import is_list_of_type from genomicranges import GenomicRanges from pandas import DataFrame -from .dispatchers.colnames import get_colnames, set_colnames -from .dispatchers.rownames import get_rownames, set_rownames from .type_checks import ( is_bioc_or_pandas_frame, is_list_of_subclass, - is_list_of_type, is_matrix_like, + is_pandas, ) from .types import ( - BiocOrPandasFrame, MatrixSlicerTypes, - MatrixTypes, SlicerArgTypes, SlicerResult, ) @@ -28,7 +26,6 @@ ) from .utils.slicer import get_indexes_from_bools, get_indexes_from_names - __author__ = "jkanche, keviny2" __copyright__ = "jkanche" __license__ = "MIT" @@ -42,7 +39,7 @@ class BaseSE: features in ``row_data``, sample data in ``col_data``, and any other relevant ``metadata``. Attributes: - assays (Dict[str, MatrixTypes]): A dictionary containing matrices, with assay names as keys + assays (Dict[str, Any]): A dictionary containing matrices, with assay names as keys and 2-dimensional matrices represented as either :py:class:`~numpy.ndarray` or :py:class:`~scipy.sparse.spmatrix`. @@ -52,11 +49,11 @@ class BaseSE: All matrices in assays must be 2-dimensional and have the same shape (number of rows, number of columns). - row_data (BiocOrPandasFrame, optional): Features, which must be of the same length as the rows of + row_data (BiocFrame, optional): Features, must be the same length as the numner of rows of the matrices in assays. Features can be either a :py:class:`~pandas.DataFrame` or :py:class:`~biocframe.BiocFrame.BiocFrame`. Defaults to None. - col_data (BiocOrPandasFrame, optional): Sample data, which must be of the same length as the + col_data (BiocFrame, optional): Sample data, which be the same length as the number of columns of the matrices in assays. Sample Information can be either a :py:class:`~pandas.DataFrame` or :py:class:`~biocframe.BiocFrame.BiocFrame`. Defaults to None. @@ -65,15 +62,15 @@ class BaseSE: def __init__( self, - assays: Dict[str, MatrixTypes], - rows: Optional[BiocOrPandasFrame] = None, - cols: Optional[BiocOrPandasFrame] = None, - metadata: Optional[Dict] = None, + assays: Dict[str, Any], + rows: Optional[BiocFrame] = None, + cols: Optional[BiocFrame] = None, + metadata: Optional[dict] = None, ) -> None: """Initialize an instance of `BaseSE`. Args: - assays (Dict[str, MatrixTypes]): A dictionary containing matrices, with assay names as keys + assays (Dict[str, Any]): A dictionary containing matrices, with assay names as keys and 2-dimensional matrices represented as either :py:class:`~numpy.ndarray` or :py:class:`~scipy.sparse.spmatrix`. @@ -83,22 +80,22 @@ def __init__( All matrices in assays must be 2-dimensional and have the same shape (number of rows, number of columns). - row_data (BiocOrPandasFrame, optional): Features, which must be of the same length as the rows of + row_data (BiocFrame, optional): Features, must be the same length as the numner of rows of the matrices in assays. Features can be either a :py:class:`~pandas.DataFrame` or :py:class:`~biocframe.BiocFrame.BiocFrame`. Defaults to None. - col_data (BiocOrPandasFrame, optional): Sample data, which must be of the same length as the + col_data (BiocFrame, optional): Sample data, which be the same length as the number of columns of the matrices in assays. Sample Information can be either a :py:class:`~pandas.DataFrame` or :py:class:`~biocframe.BiocFrame.BiocFrame`. Defaults to None. - metadata (Dict, optional): Additional experimental metadata describing the methods. Defaults to None. + metadata (dict, optional): Additional experimental metadata describing the methods. Defaults to None. """ self._shape: Optional[Tuple] = None if assays is None or not isinstance(assays, dict) or len(assays.keys()) == 0: raise Exception( - "`assays` must be a dictionary and contain at least one matrix." + "`assays` must be a dictionary and contain atleast one 2-dimensional matrix." ) self._validate_assays(assays) @@ -106,31 +103,31 @@ def __init__( # should have _shape by now if self._shape is None: - raise TypeError("This should not happen! `assays` is not consistent.") + raise RuntimeError("Cannot extract shape from assays!") self._set_rows(rows) self._set_cols(cols) self._metadata = metadata - def _set_rows(self, rows: Optional[BiocOrPandasFrame]): + def _set_rows(self, rows: Optional[BiocFrame]): rows = ( rows if rows is not None else BiocFrame({}, number_of_rows=self._shape[0]) ) - # if is_pandas(rows): - # rows = from_pandas(rows) + if is_pandas(rows): + rows = from_pandas(rows) self._validate_rows(rows) self._rows = rows - def _set_cols(self, cols: Optional[BiocOrPandasFrame]): + def _set_cols(self, cols: Optional[BiocFrame]): cols = ( cols if cols is not None else BiocFrame({}, number_of_rows=self._shape[1]) ) - # if is_pandas(cols): - # cols = from_pandas(cols) + if is_pandas(cols): + cols = from_pandas(cols) self._validate_cols(cols) self._cols = cols @@ -144,25 +141,23 @@ def _validate(self): def _validate_assays( self, - assays: Dict[str, MatrixTypes], + assays: Dict[str, Any], ): """Internal method to validate experiment data (assays). Args: - assays (Dict[str, MatrixTypes]): Experiment data. + assays (Dict[str, Any]): Experiment data. Raises: ValueError: If ``assays`` contain more than 2 dimensions. If all ``assays`` do not have the same dimensions. - TypeError: If ``assays`` includes an unsupported matrix representation. + TypeError: If ``assays`` contains an unsupported matrix representation. """ for asy, mat in assays.items(): if not is_matrix_like(mat): - raise TypeError( - f"Assay: '{asy}' is not a supported matrix representation." - ) + raise TypeError(f"Assay: '{asy}' is not a supported matrix type.") if len(mat.shape) > 2: raise ValueError( @@ -176,15 +171,15 @@ def _validate_assays( if mat.shape != self._shape: raise ValueError( - f"Assay: '{asy}' must be of shape '{self._shape}'" + f"dimensions mismatch, '{asy}' must be of shape '{self._shape}'" f" but provided '{mat.shape}'." ) - def _validate_rows(self, rows: BiocOrPandasFrame): + def _validate_rows(self, rows: BiocFrame): """Internal method to validate feature information (row_data). Args: - rows (BiocOrPandasFrame): Feature data frame to validate. + rows (BiocFrame): Features to validate. Features may be either a :py:class:`~pandas.DataFrame` or :py:class:`~biocframe.BiocFrame.BiocFrame`. @@ -201,15 +196,15 @@ def _validate_rows(self, rows: BiocOrPandasFrame): if rows.shape[0] != self._shape[0]: raise ValueError( - f"`Features` and `assays` do not match. must be '{self._shape[0]}'" + f"`Features` and `assays` mismatch. Must be '{self._shape[0]}'" f" but provided '{rows.shape[0]}'." ) - def _validate_cols(self, cols: BiocOrPandasFrame): + def _validate_cols(self, cols: BiocFrame): """Internal method to validate sample information (col_data). Args: - cols (BiocOrPandasFrame): Sample information (col_data). + cols (BiocFrame): Sample information (col_data). Sample may be either a :py:class:`~pandas.DataFrame` or :py:class:`~biocframe.BiocFrame.BiocFrame`. @@ -226,18 +221,18 @@ def _validate_cols(self, cols: BiocOrPandasFrame): if cols.shape[0] != self._shape[1]: raise ValueError( - f"`Sample` data and `assays` do not match. must be '{self._shape[1]}'" + f"`Sample` data and `assays` mismatch. Must be '{self._shape[1]}'" f" but provided '{cols.shape[0]}'." ) @property def assays( self, - ) -> Dict[str, MatrixTypes]: - """Retrieve all assays. + ) -> Dict[str, Any]: + """Retrieve all experiment data (assays). Returns: - Dict[str, MatrixTypes]: A dictionary where experiment names serve as keys, + Dict[str, Any]: A dictionary with experiment names as keys, and matrices as corresponding values. """ return self._assays @@ -245,12 +240,12 @@ def assays( @assays.setter def assays( self, - assays: Dict[str, MatrixTypes], + assays: Dict[str, Any], ): """Set new experiment data (assays). Args: - assays (Dict[str, MatrixTypes]): New assays. + assays (Dict[str, Any]): New assays. """ self._validate_assays(assays) self._assays = assays @@ -265,52 +260,55 @@ def row_data(self) -> BiocFrame: return self._rows @row_data.setter - def row_data(self, rows: Optional[BiocOrPandasFrame]): + def row_data(self, rows: Optional[BiocFrame]): """Set features. Args: - rows (BiocOrPandasFrame, optional): New feature information. + rows (BiocFrame, optional): New feature information. If ``rows`` is None, an empty :py:class:`~biocframe.BiocFrame.BiocFrame` object is created. """ self._set_rows(rows) @property - def col_data(self) -> BiocOrPandasFrame: + def col_data(self) -> BiocFrame: """Get sample data. Returns: - BiocOrPandasFrame: Sample information. + BiocFrame: Sample information. """ return self._cols @col_data.setter - def col_data(self, cols: Optional[BiocOrPandasFrame]): + def col_data(self, cols: Optional[BiocFrame]): """Set sample data. Args: - cols (BiocOrPandasFrame, optional): New sample data. + cols (BiocFrame, optional): New sample data. If ``cols`` is None, an empty :py:class:`~biocframe.BiocFrame.BiocFrame` object is created. """ self._set_cols(cols) @property - def metadata(self) -> Optional[Dict]: + def metadata(self) -> Optional[dict]: """Retrieve metadata. Returns: - Optional[Dict]: A metadata object, typically in the form of a dictionary. + Optional[dict]: A metadata object, typically in the form of a dictionary. """ return self._metadata @metadata.setter - def metadata(self, metadata: Optional[Dict]): + def metadata(self, metadata: Optional[dict]): """Set metadata. Args: - metadata (Optional[Dict]): New metadata object. + metadata (Optional[dict]): New metadata object. """ + if metadata is None: + metadata = {} + self._metadata = metadata @property @@ -356,7 +354,7 @@ def assay_names(self, names: List[str]): """ current_names = self.assay_names if len(names) != len(current_names): - raise ValueError("Length of `names` do not match the number of `assays`.") + raise ValueError("Length of `names` does not match the number of `assays`.") new_assays = OrderedDict() for idx in range(len(names)): @@ -365,16 +363,17 @@ def assay_names(self, names: List[str]): self._assays = new_assays def __repr__(self) -> str: + current_class_const = type(self) pattern = ( - f"Class BaseSE with {self.shape[0]} features and {self.shape[1]} samples \n" + f"Class {current_class_const.__name__} with {self.shape[0]} features and {self.shape[1]} samples \n" f" assays: {', '.join(list(self.assays.keys()))} \n" f" features: {self.row_data.columns if self.row_data is not None else None} \n" f" sample data: {self.col_data.columns if self.col_data is not None else None}" ) return pattern - def assay(self, index_or_name: Union[int, str]) -> MatrixTypes: - """Convenience function to access an :py:attr:`~summarizedexperiment.BaseSE.BaseSE.assays` by name or index. + def assay(self, index_or_name: Union[int, str]) -> Any: + """Convenience method to access an :py:attr:`~summarizedexperiment.BaseSE.BaseSE.assays` by name or index. Args: name (Union[int, str]): Name or index position of the assay. @@ -384,7 +383,7 @@ def assay(self, index_or_name: Union[int, str]) -> MatrixTypes: IndexError: If index is greater than the number of assays. Returns: - MatrixTypes: Experiment data. + Any: Experiment data. """ if isinstance(index_or_name, int): if index_or_name < 0 or index_or_name > len(self.assay_names): @@ -405,7 +404,7 @@ def subset_assays( self, row_indices: Optional[MatrixSlicerTypes] = None, col_indices: Optional[MatrixSlicerTypes] = None, - ) -> Dict[str, MatrixTypes]: + ) -> Dict[str, Any]: """Subset all assays using a slice defined by rows and columns. If both ``row_indices`` and ``col_indices`` are None, a copy of the @@ -417,7 +416,7 @@ def subset_assays( ``row_indices`` may be a list of integer indices to subset. Alternatively ``row_indices`` may be a boolean vector specifying - `True` to keep the index or `False` to remove. The length of the boolean + `True` to keep the row or `False` to remove. The length of the boolean vector must match the number of rows in the experiment. Alternatively, ``row_indices`` may be a :py:class:`~slice` object. @@ -429,7 +428,7 @@ def subset_assays( ``col_indices`` may be a list of integer indices to subset. Alternatively ``col_indices`` may be a boolean vector specifying - `True` to keep the index or `False` to remove. The length of the boolean + `True` to keep the column or `False` to remove. The length of the boolean vector must match the number of columns in the experiment. Alternatively, ``col_indices`` may be a :py:class:`~slice` object. @@ -440,11 +439,11 @@ def subset_assays( warning: If ``row_indices`` and ``col_indices`` are both None. Returns: - Dict[str, MatrixTypes]: Sliced experiment data. + Dict[str, Any]: Sliced experimental data. """ if row_indices is None and col_indices is None: - warnings.warn("No slice is provided, this returns a copy of all assays!") + warn("No slice is provided, this returns a copy of all assays!") return self.assays.copy() new_assays = OrderedDict() @@ -469,13 +468,13 @@ def _slice( args (SlicerArgTypes): Indices or names to slice. The tuple contains slices along dimensions (rows, cols). - Each element in the tuple may be either a integer vector (integer positions), + Each element in the tuple may be either an list of indices, boolean vector or :py:class:`~slice` object. Defaults to None. Raises: - ValueError: If too many or too few slices provided. + ValueError: If too many or too few slices are provided. Returns: SlicerResult: The sliced tuple. @@ -505,7 +504,7 @@ def _slice( if row_indices is not None and self.row_data is not None: if is_list_of_type(row_indices, str): row_indices = get_indexes_from_names( - get_rownames(self.row_data), row_indices + rownames(self.row_data), row_indices ) elif is_list_of_type(row_indices, bool): if len(row_indices) != self.shape[0]: @@ -521,12 +520,12 @@ def _slice( else: new_rows = new_rows[row_indices, :] else: - raise TypeError("`row_indices` is not supported!") + raise TypeError("`row_indices` is not a supported type!") if col_indices is not None and self.col_data is not None: if is_list_of_type(col_indices, str): col_indices = get_indexes_from_names( - get_rownames(self.col_data), col_indices + rownames(self.col_data), col_indices ) elif is_list_of_type(col_indices, bool): if len(col_indices) != self.shape[1]: @@ -542,7 +541,7 @@ def _slice( else: new_cols = new_cols[col_indices, :] else: - raise TypeError("`col_indices` not supported!") + raise TypeError("`col_indices` not a supported type!") new_assays = self.subset_assays( row_indices=row_indices, col_indices=col_indices @@ -557,7 +556,7 @@ def row_names(self) -> List[str]: Returns: List[str]: List of row names. """ - return get_rownames(self.row_data) + return rownames(self.row_data) @row_names.setter def row_names(self, names: List[str]): @@ -581,7 +580,7 @@ def colnames(self) -> List[str]: Returns: List[str]: List of sample names. """ - return get_colnames(self.col_data) + return colnames(self.col_data) @colnames.setter def colnames(self, names: List[str]): @@ -599,7 +598,7 @@ def colnames(self, names: List[str]): self._cols = set_colnames(self.col_data, names) def to_anndata(self): - """Transform :py:class:`summarizedexperiment.BaseSE`-like into a :py:class:`~anndata.AnnData` representation. + """Coerce :py:class:`summarizedexperiment.BaseSE`-like into an :py:class:`~anndata.AnnData` representation. Returns: AnnData: An `AnnData` representation of the experiment. diff --git a/src/summarizedexperiment/RangedSummarizedExperiment.py b/src/summarizedexperiment/RangedSummarizedExperiment.py index a1dfe46..e1bca66 100644 --- a/src/summarizedexperiment/RangedSummarizedExperiment.py +++ b/src/summarizedexperiment/RangedSummarizedExperiment.py @@ -1,10 +1,11 @@ -from typing import Dict, List, Literal, Optional, Union +from typing import Any, Dict, List, Literal, Optional, Union import numpy as np +from biocframe import BiocFrame from genomicranges import GenomicRanges, GenomicRangesList, SeqInfo from .SummarizedExperiment import SummarizedExperiment -from .types import BiocOrPandasFrame, MatrixTypes, SlicerArgTypes +from .types import SlicerArgTypes __author__ = "jkanche" __copyright__ = "jkanche" @@ -57,14 +58,14 @@ class RangedSummarizedExperiment(SummarizedExperiment): The key difference between this and `SummarizedExperiment` is enforcing type for feature information (`row_ranges`), must be a `GenomicRanges` object. This allows us to - provides new methods, to perform genomic range based operations over experimental data. + implement methods to perform genomic range based operations over experimental data. Note: If ``row_ranges`` is empty, None or not a :py:class:`genomicranges.GenomicRanges.GenomicRanges` object, use a :py:class:`~summarizedexperiment.SummarizedExperiment.SummarizedExperiment` instead. Attributes: - assays (Dict[str, MatrixTypes]): A dictionary containing matrices, with assay names as keys + assays (Dict[str, Any]): A dictionary containing matrices, with assay names as keys and 2-dimensional matrices represented as either :py:class:`~numpy.ndarray` or :py:class:`~scipy.sparse.spmatrix`. @@ -77,11 +78,11 @@ class RangedSummarizedExperiment(SummarizedExperiment): row_ranges (GRangesOrGRangesList, optional): Genomic features, must be the same length as rows of the matrices in assays. - row_data (BiocOrPandasFrame, optional): Features, which must be of the same length as the rows of + row_data (BiocFrame, optional): Features, which must be of the same length as the rows of the matrices in assays. Features can be either a :py:class:`~pandas.DataFrame` or :py:class:`~biocframe.BiocFrame.BiocFrame`. Defaults to None. - col_data (BiocOrPandasFrame, optional): Sample data, which must be of the same length as the + col_data (BiocFrame, optional): Sample data, which must be of the same length as the columns of the matrices in assays. Sample Information can be either a :py:class:`~pandas.DataFrame` or :py:class:`~biocframe.BiocFrame.BiocFrame`. Defaults to None. @@ -90,16 +91,16 @@ class RangedSummarizedExperiment(SummarizedExperiment): def __init__( self, - assays: Dict[str, MatrixTypes], + assays: Dict[str, Any], row_ranges: Optional[GRangesOrGRangesList] = None, - row_data: Optional[BiocOrPandasFrame] = None, - col_data: Optional[BiocOrPandasFrame] = None, + row_data: Optional[BiocFrame] = None, + col_data: Optional[BiocFrame] = None, metadata: Optional[Dict] = None, ) -> None: """Initialize a `RangedSummarizedExperiment` (RSE) object. Args: - assays (Dict[str, MatrixTypes]): Dictionary + assays (Dict[str, Any]): Dictionary of matrices, with assay names as keys and 2-dimensional matrices represented as :py:class:`~numpy.ndarray` or :py:class:`scipy.sparse.spmatrix` matrices. @@ -112,14 +113,14 @@ def __init__( row_ranges (GRangesOrGRangesList, optional): Genomic features, must be the same length as rows of the matrices in assays. - row_data (BiocOrPandasFrame, optional): Features, must be the same length as + row_data (BiocFrame, optional): Features, must be the same length as rows of the matrices in assays. Features may be either a :py:class:`~pandas.DataFrame` or :py:class:`~biocframe.BiocFrame.BiocFrame`. Defaults to None. - col_data (BiocOrPandasFrame, optional): Sample data, must be + col_data (BiocFrame, optional): Sample data, must be the same length as columns of the matrices in assays. Sample Information may be either a :py:class:`~pandas.DataFrame` or @@ -148,10 +149,7 @@ def _validate_row_ranges(self, row_ranges: GRangesOrGRangesList): `row_ranges` & `assays`. TypeError: If `row_ranges` is not a `GenomicRanges` or `GenomicRangesList`. """ - if not ( - isinstance(row_ranges, GenomicRanges) - or isinstance(row_ranges, GenomicRangesList) - ): + if not isinstance(row_ranges, (GenomicRanges, GenomicRangesList)): raise TypeError( "`row_ranges` must be a `GenomicRanges` or `GenomicRangesList`" f" , provided {type(row_ranges)}." @@ -245,11 +243,9 @@ def __getitem__( args (SlicerArgTypes): Indices or names to slice. The tuple contains slices along dimensions (rows, cols). - Each element in the tuple, might be either a integer vector (integer positions), + Each element in the tuple may be either an list of indices, boolean vector or :py:class:`~slice` object. - Defaults to None. - Raises: ValueError: If too many or too few slices are provided. @@ -262,7 +258,8 @@ def __getitem__( if sliced_objs.row_indices is not None and self.row_ranges is not None: new_row_ranges = self.row_ranges[sliced_objs.row_indices] - return RangedSummarizedExperiment( + current_class_const = type(self) + return current_class_const( assays=sliced_objs.assays, row_ranges=new_row_ranges, row_data=sliced_objs.row_data, @@ -271,11 +268,13 @@ def __getitem__( ) def __repr__(self) -> str: + current_class_const = type(self) pattern = ( - f"Class RangedSummarizedExperiment with {self.shape[0]} features and {self.shape[1]} " + f"Class {current_class_const.__name__} with {self.shape[0]} features and {self.shape[1]} " "samples \n" f" assays: {list(self.assays.keys())} \n" f" row_data: {self.row_data.columns if self.row_data is not None else None} \n" + f" row_ranges: {self.row_ranges.columns if self.row_ranges is not None else None} \n" f" col_data: {self.col_data.columns if self.col_data is not None else None}" ) return pattern @@ -315,8 +314,7 @@ def nearest( Defaults to False. Raises: - TypeError: If ``query`` is not a ``RangedSummarizedExperiment`` - or ``GenomicRanges``. + TypeError: If query is neither `RangedSummarizedExperiment` nor `GenomicRanges`. Returns: (List[Optional[int]], optional): List of possible `hit` indices @@ -345,8 +343,7 @@ def precede( ignore_strand (bool, optional): Whether to ignore strand. Defaults to False. Raises: - TypeError: If ``query`` is not a ``RangedSummarizedExperiment`` or - ``GenomicRanges``. + TypeError: If query is neither `RangedSummarizedExperiment` nor `GenomicRanges`. Returns: (List[Optional[int]], optional): List of possible hit indices @@ -629,7 +626,7 @@ def find_overlaps( ignore_strand (bool, optional): Whether to ignore strand.. Defaults to False. Raises: - TypeError: If query is not a `RangedSummarizedExperiment` or `GenomicRanges`. + TypeError: If query is neither `RangedSummarizedExperiment` nor `GenomicRanges`. Returns: ("RangedSummarizedExperiment", optional): A `RangedSummarizedExperiment` object @@ -676,7 +673,7 @@ def subset_by_overlaps( ignore_strand (bool, optional): Whether to ignore strand.. Defaults to False. Raises: - TypeError: If query is not a `RangedSummarizedExperiment` or `GenomicRanges`. + TypeError: If query is neither `RangedSummarizedExperiment` nor `GenomicRanges`. Returns: Optional["RangedSummarizedExperiment"]: A new `RangedSummarizedExperiment` @@ -733,7 +730,7 @@ def sort( """ order = self.row_ranges._generic_order(ignore_strand=ignore_strand) - if decreasing: + if decreasing is True: order = order[::-1] new_order = order.to_list() diff --git a/src/summarizedexperiment/SummarizedExperiment.py b/src/summarizedexperiment/SummarizedExperiment.py index ab205f9..698cdf2 100644 --- a/src/summarizedexperiment/SummarizedExperiment.py +++ b/src/summarizedexperiment/SummarizedExperiment.py @@ -12,14 +12,14 @@ class SummarizedExperiment(BaseSE): - """Container to represents genomic experiment data (`assays`), features (`row_data`), sample data (`col_data`) and + """Container to represent genomic experimental data (`assays`), features (`row_data`), sample data (`col_data`) and any other `metadata`. SummarizedExperiment follows the R/Bioconductor specification; rows are features, columns are samples. Attributes: - assays (Dict[str, MatrixTypes]): A dictionary containing matrices, with assay names as keys + assays (Dict[str, Any]): A dictionary containing matrices, with assay names as keys and 2-dimensional matrices represented as either :py:class:`~numpy.ndarray` or :py:class:`~scipy.sparse.spmatrix`. @@ -29,11 +29,11 @@ class SummarizedExperiment(BaseSE): All matrices in assays must be 2-dimensional and have the same shape (number of rows, number of columns). - row_data (BiocOrPandasFrame, optional): Features, which must be of the same length as the rows of + row_data (BiocFrame, optional): Features, must be the same length as the numner of rows of the matrices in assays. Features can be either a :py:class:`~pandas.DataFrame` or :py:class:`~biocframe.BiocFrame.BiocFrame`. Defaults to None. - col_data (BiocOrPandasFrame, optional): Sample data, which must be of the same length as the + col_data (BiocFrame, optional): Sample data, which be the same length as the number of columns of the matrices in assays. Sample Information can be either a :py:class:`~pandas.DataFrame` or :py:class:`~biocframe.BiocFrame.BiocFrame`. Defaults to None. @@ -50,7 +50,7 @@ def __init__( """Initialize a Summarized Experiment (SE). Args: - assays (Dict[str, MatrixTypes]): A dictionary containing matrices, with assay names as keys + assays (Dict[str, Any]): A dictionary containing matrices, with assay names as keys and 2-dimensional matrices represented as either :py:class:`~numpy.ndarray` or :py:class:`~scipy.sparse.spmatrix`. @@ -60,11 +60,11 @@ def __init__( All matrices in assays must be 2-dimensional and have the same shape (number of rows, number of columns). - row_data (BiocOrPandasFrame, optional): Features, which must be of the same length as the rows of + row_data (BiocFrame, optional): Features, must be the same length as the numner of rows of the matrices in assays. Features can be either a :py:class:`~pandas.DataFrame` or :py:class:`~biocframe.BiocFrame.BiocFrame`. Defaults to None. - col_data (BiocOrPandasFrame, optional): Sample data, which must be of the same length as the + col_data (BiocFrame, optional): Sample data, which be the same length as the number of columns of the matrices in assays. Sample Information can be either a :py:class:`~pandas.DataFrame` or :py:class:`~biocframe.BiocFrame.BiocFrame`. Defaults to None. @@ -88,17 +88,19 @@ def __getitem__( args (SlicerArgTypes): Indices or names to slice. The tuple contains slices along dimensions (rows, cols). - Each element in the tuple, might be either a integer vector (integer positions), - boolean vector or :py:class:`~slice` object. Defaults to None. + Each element in the tuple may be either an list of indices, + boolean vector or :py:class:`~slice` object. Raises: - ValueError: If too many or too few slices provided. + ValueError: If too many or too few slices are provided. Returns: - SummarizedExperiment: Sliced `SummarizedExperiment` object. + The same type as caller, with the sliced entries. """ sliced_objs = self._slice(args) - return SummarizedExperiment( + + current_class_const = type(self) + return current_class_const( assays=sliced_objs.assays, row_data=sliced_objs.row_data, col_data=sliced_objs.col_data, @@ -106,8 +108,9 @@ def __getitem__( ) def __repr__(self) -> str: + current_class_const = type(self) pattern = ( - f"Class SummarizedExperiment with {self.shape[0]} features and {self.shape[1]} " + f"Class {current_class_const.__name__} with {self.shape[0]} features and {self.shape[1]} " "samples \n" f" assays: {list(self.assays.keys())} \n" f" row_data: {self.row_data.columns if self.row_data is not None else None} \n" diff --git a/src/summarizedexperiment/dispatchers/__init__.py b/src/summarizedexperiment/dispatchers/__init__.py deleted file mode 100644 index a38de3f..0000000 --- a/src/summarizedexperiment/dispatchers/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from .colnames import get_colnames, set_colnames -from .rownames import get_rownames, set_rownames diff --git a/src/summarizedexperiment/dispatchers/colnames.py b/src/summarizedexperiment/dispatchers/colnames.py deleted file mode 100644 index 6ffd8e3..0000000 --- a/src/summarizedexperiment/dispatchers/colnames.py +++ /dev/null @@ -1,84 +0,0 @@ -from functools import singledispatch -from typing import List - -from biocframe import BiocFrame -from pandas import DataFrame - -__author__ = "jkanche" -__copyright__ = "jkanche" -__license__ = "MIT" - - -@singledispatch -def get_colnames(x) -> List[str]: - """Access column names from various representations. - - Args: - x: Any object. - - ``x`` may be a :py:class:`~pandas.DataFrame`. - - Alternatively, ``x`` may be a :py:class:`~biocframe.BiocFrame.BiocFrame` object. - - Alternatively, ``x`` may also contain a property or attribute ``colnames`` for - custom representations. - - Raises: - NotImplementedError: If ``x`` is not a supported type. - - Returns: - List[str]: List of column names. - """ - if hasattr(x, "colnames"): - return x.colnames - - raise NotImplementedError(f"`colnames` is not supported for class: '{type(x)}'.") - - -@get_colnames.register -def _(x: DataFrame) -> List[str]: - return x.index.tolist() - - -@get_colnames.register -def _(x: BiocFrame) -> List[str]: - return x.row_names - - -@singledispatch -def set_colnames(x, names: List[str]): - """Set column names for various representations. - - Args: - x: Any object. - - ``x`` may be a :py:class:`~pandas.DataFrame`. - - Alternatively, ``x`` may be a :py:class:`biocframe.BiocFrame.BiocFrame` object. - - Alternatively, ``x`` may also contain a property or attribute ``colnames`` for - custom representations. - - names (List[str]): New names. - - Raises: - NotImplementedError: if type is not supported. - - Returns: - An object with the same type as ``x``. - """ - raise NotImplementedError( - f"`set_colnames` is not supported for class: '{type(x)}'." - ) - - -@set_colnames.register -def _(x: DataFrame, names: List[str]) -> DataFrame: - x.index = names - return x - - -@set_colnames.register -def _(x: BiocFrame, names: List[str]) -> BiocFrame: - x.row_names = names - return x diff --git a/src/summarizedexperiment/dispatchers/rownames.py b/src/summarizedexperiment/dispatchers/rownames.py deleted file mode 100644 index 26ac083..0000000 --- a/src/summarizedexperiment/dispatchers/rownames.py +++ /dev/null @@ -1,82 +0,0 @@ -from functools import singledispatch -from typing import Any, List - -from biocframe import BiocFrame -from pandas import DataFrame - -__author__ = "jkanche" -__copyright__ = "jkanche" -__license__ = "MIT" - - -@singledispatch -def get_rownames(x) -> List[str]: - """Access row names from various representations. - - Args: - x: Any object. - - ``x`` may be a :py:class:`~pandas.DataFrame`. - - Alternatively, ``x`` may be a :py:class:`~biocframe.BiocFrame.BiocFrame` object. - - Alternatively, ``x`` may also contain a property or attribute ``row_names`` for - custom representations. - - Raises: - NotImplementedError: If ``x`` is not a supported type. - - Returns: - List[str]: List of row names. - """ - if hasattr(x, "row_names"): - return x.row_names - - raise NotImplementedError(f"`row_names` do not exist for class: '{type(x)}'.") - - -@get_rownames.register -def _(x: DataFrame) -> List[str]: - return x.index.tolist() - - -@get_rownames.register -def _(x: BiocFrame) -> List[str]: - return x.row_names - - -@singledispatch -def set_rownames(x: Any, names: List[str]): - """Set row names for various representations. - - Args: - x (Any): supported object. - - ``x`` may be a :py:class:`~pandas.DataFrame`. - - Alternatively, ``x`` may be a :py:class:`~biocframe.BiocFrame.BiocFrame` object. - - Alternatively, ``x`` may also contain a property or attribute ``row_names`` for - custom representations. - - names (List[str]): New names. - - Raises: - NotImplementedError: If ``x`` is not a supported type. - - Returns: - An object with the same type as ``x``. - """ - raise NotImplementedError(f"Cannot set row_names for class: {type(x)}") - - -@set_rownames.register -def _(x: DataFrame, names: List[str]) -> List[str]: - x.index = names - return x - - -@set_rownames.register -def _(x: BiocFrame, names: List[str]) -> List[str]: - x.row_names = names - return x diff --git a/src/summarizedexperiment/type_checks.py b/src/summarizedexperiment/type_checks.py index f2f5feb..932fed0 100644 --- a/src/summarizedexperiment/type_checks.py +++ b/src/summarizedexperiment/type_checks.py @@ -21,21 +21,6 @@ def is_bioc_or_pandas_frame(x: Any) -> bool: return is_pandas(x) or isinstance(x, BiocFrame) -# def is_gr_or_rse(x: Union[GenomicRanges, RangedSummarizedExperiment]): -# """Check if the object is either a `RangedSummarizedExperiment` or `GenomicRanges`. - -# Args: -# x (Union[GenomicRanges, RangedSummarizedExperiment]): object to check. - -# Raises: -# TypeError: object is not a `RangedSummarizedExperiment` or `GenomicRanges`. -# """ -# if not (isinstance(x, RangedSummarizedExperiment) or isinstance(x, GenomicRanges)): -# raise TypeError( -# "object is not a `RangedSummarizedExperiment` or `GenomicRanges`" -# ) - - # def _get_python_minor(): # return version_info[1] < 8 # Expectations on Assays, these should be matrices or matrix-like objects @@ -70,21 +55,6 @@ def is_matrix_like(x: Any) -> bool: return hasattr(x, "__getitem__") and hasattr(x, "shape") -def is_list_of_type(x: Any, target_type: Callable) -> bool: - """Checks if ``x`` is a list or tuple and and whether all elements are of the same type. - - Args: - x (Any): Any object. - target_type (callable): Type to check for, e.g. ``str``, ``int``. - - Returns: - bool: True if ``x`` is :py:class:`~list` and all elements are of the same type. - """ - return (isinstance(x, list) or isinstance(x, tuple)) and all( - isinstance(item, target_type) for item in x - ) - - def is_list_of_subclass(x: Any, target_type: Callable) -> bool: """Checks if all provided objects subclass of ``target_type``. @@ -95,7 +65,7 @@ def is_list_of_subclass(x: Any, target_type: Callable) -> bool: Returns: bool: True if ``x`` is :py:class:`~list` and all objects are derivatives of the same class. """ - return (isinstance(x, list) or isinstance(x, tuple)) and all( + return (isinstance(x, (list, tuple))) and all( issubclass(type(item), target_type) for item in x ) diff --git a/src/summarizedexperiment/types.py b/src/summarizedexperiment/types.py index f5a5978..9c904da 100644 --- a/src/summarizedexperiment/types.py +++ b/src/summarizedexperiment/types.py @@ -1,18 +1,10 @@ from collections import namedtuple from typing import List, Tuple, Union -import numpy as np -import pandas as pd -from biocframe import BiocFrame -from scipy import sparse as sp - __author__ = "jkanche" __copyright__ = "jkanche" __license__ = "MIT" -MatrixTypes = Union[np.ndarray, sp.spmatrix] -ArrayTypes = Union[np.ndarray, sp.lil_matrix] -BiocOrPandasFrame = Union[pd.DataFrame, BiocFrame] MatrixSlicerTypes = Union[List[int], List[bool], slice] SlicerTypes = Union[List[int], List[bool], List[str], slice] SlicerArgTypes = Union[Tuple[SlicerTypes], List[SlicerTypes], slice] diff --git a/src/summarizedexperiment/utils/combiners.py b/src/summarizedexperiment/utils/combiners.py index 1116fc8..733fae3 100644 --- a/src/summarizedexperiment/utils/combiners.py +++ b/src/summarizedexperiment/utils/combiners.py @@ -1,11 +1,11 @@ from typing import Dict, List, Literal, Tuple from biocframe import BiocFrame +from biocgenerics import rownames from numpy import argwhere, find_common_type, ndarray from pandas import DataFrame, Index, concat from scipy.sparse import lil_matrix -from ..dispatchers import get_rownames from ..types import ArrayTypes, BiocOrPandasFrame from .validators import validate_names, validate_shapes @@ -97,8 +97,8 @@ def combine_frames( concat_df = concat(all_as_pandas, axis=axis) - if (use_names is False) and (get_rownames(x[0]) is not None): - concat_df.index = get_rownames(x[0]) + if (use_names is False) and (rownames(x[0]) is not None): + concat_df.index = rownames(x[0]) if remove_duplicate_columns: return _remove_duplicate_columns(concat_df)