diff --git a/src/summarizedexperiment/BaseSE.py b/src/summarizedexperiment/BaseSE.py index fc8f007..848f0f8 100644 --- a/src/summarizedexperiment/BaseSE.py +++ b/src/summarizedexperiment/BaseSE.py @@ -99,8 +99,8 @@ class BaseSE: def __init__( self, assays: Dict[str, Any], - rows: Optional[BiocFrame] = None, - cols: Optional[BiocFrame] = None, + row_data: Optional[BiocFrame] = None, + col_data: Optional[BiocFrame] = None, metadata: Optional[Dict] = None, validate: bool = True, ) -> None: @@ -142,9 +142,9 @@ def __init__( """ self._assays = assays - self._shape = _guess_assay_shape(assays, rows, cols) - self._rows = _sanitize_frame(rows, self._shape[0]) - self._cols = _sanitize_frame(cols, self._shape[1]) + self._shape = _guess_assay_shape(assays, row_data, col_data) + self._rows = _sanitize_frame(row_data, self._shape[0]) + self._cols = _sanitize_frame(col_data, self._shape[1]) self._metadata = metadata if metadata is not None else {} if validate: @@ -182,8 +182,8 @@ def __deepcopy__(self, memo=None, _nil=[]): current_class_const = type(self) return current_class_const( assays=_assays_copy, - rows=_rows_copy, - cols=_cols_copy, + row_data=_rows_copy, + col_data=_cols_copy, metadata=_metadata_copy, ) @@ -195,8 +195,8 @@ def __copy__(self): current_class_const = type(self) return current_class_const( assays=self._assays, - rows=self._rows, - cols=self._cols, + row_data=self._rows, + col_data=self._cols, metadata=self._metadata, ) @@ -245,8 +245,8 @@ def __repr__(self) -> str: pattern = ( f"Class BaseSE with {self.shape[0]} features and {self.shape[1]} samples \n" f" assays: {', '.join(list(self.assays.keys()))} \n" - f" features: {self.rowdata.columns if self.rowdata is not None else None} \n" - f" sample data: {self.coldata.columns if self.coldata is not None else None}" + f" row_data: {self._rows.names if self._rows is not None else None} \n" + f" col_data: {self._cols.names if self._cols is not None else None}" ) return pattern @@ -716,8 +716,8 @@ def get_slice( return current_class_const( assays=slicer.assays, - rows=slicer.rows, - columns=slicer.columns, + row_data=slicer.rows, + col_data=slicer.columns, metadata=self._metadata, ) @@ -801,7 +801,7 @@ def to_anndata(self): """Transform :py:class:`summarizedexperiment.BaseSE`-like into a :py:class:`~anndata.AnnData` representation. Returns: - AnnData: An `AnnData` representation of the experiment. + An `AnnData` representation of the experiment. """ from anndata import AnnData @@ -809,12 +809,10 @@ def to_anndata(self): for asy, mat in self.assays.items(): layers[asy] = mat.transpose() - trows = self.row_data - if isinstance(self.row_data, GenomicRanges): - trows = self.row_data.to_pandas() + trows = self._rows.to_pandas() obj = AnnData( - obs=self.col_data, + obs=self._cols.to_pandas(), var=trows, uns=self.metadata, layers=layers, diff --git a/src/summarizedexperiment/RangedSummarizedExperiment.py b/src/summarizedexperiment/RangedSummarizedExperiment.py index 0f2849e..bc2e118 100644 --- a/src/summarizedexperiment/RangedSummarizedExperiment.py +++ b/src/summarizedexperiment/RangedSummarizedExperiment.py @@ -4,7 +4,7 @@ from genomicranges import GenomicRanges, GenomicRangesList, SeqInfo from .SummarizedExperiment import SummarizedExperiment -from .types import BiocOrPandasFrame, MatrixTypes, SlicerArgTypes +from .types import MatrixTypes, SlicerArgTypes __author__ = "jkanche" __copyright__ = "jkanche" @@ -129,7 +129,7 @@ def __init__( metadata (Dict, optional): Additional experimental metadata describing the methods. Defaults to None. """ - super().__init__(assays, rows=row_data, cols=col_data, metadata=metadata) + super().__init__(assays, row_data=row_data, col_data=col_data, metadata=metadata) if row_ranges is None: row_ranges = GenomicRangesList.empty(n=self._shape[0]) diff --git a/src/summarizedexperiment/SummarizedExperiment.py b/src/summarizedexperiment/SummarizedExperiment.py index c740953..93548bb 100644 --- a/src/summarizedexperiment/SummarizedExperiment.py +++ b/src/summarizedexperiment/SummarizedExperiment.py @@ -1,11 +1,11 @@ -from typing import Dict, Optional +from typing import Dict, Optional, Union, Sequence from warnings import warn from genomicranges import GenomicRanges from biocframe import BiocFrame from .BaseSE import BaseSE -from .types import MatrixTypes, SlicerArgTypes +from .types import MatrixTypes __author__ = "jkanche" __copyright__ = "jkanche" @@ -13,32 +13,11 @@ class SummarizedExperiment(BaseSE): - """Container to represents genomic experiment data (`assays`), features (`row_data`), sample data (`col_data`) and - any other `metadata`. + """Container to represents genomic experiment data (`assays`), + features (`row_data`), sample data (`col_data`) and any other `metadata`. - SummarizedExperiment follows the R/Bioconductor specification; rows are features, columns - are samples. - - Attributes: - assays (Dict[str, MatrixTypes]): A dictionary containing matrices, with assay names as keys - and 2-dimensional matrices represented as either - :py:class:`~numpy.ndarray` or :py:class:`~scipy.sparse.spmatrix`. - - Alternatively, you may use any 2-dimensional matrix that has the ``shape`` property and - implements the slice operation using the ``__getitem__`` dunder method. - - All matrices in assays must be 2-dimensional and have the same shape - (number of rows, number of columns). - - row_data (BiocFrame, optional): Features, which must be of the same length as the rows of - the matrices in assays. Features can be either a :py:class:`~pandas.DataFrame` or - :py:class:`~biocframe.BiocFrame.BiocFrame`. Defaults to None. - - col_data (BiocFrame, optional): Sample data, which must be of the same length as the - columns of the matrices in assays. Sample Information can be either a :py:class:`~pandas.DataFrame` - or :py:class:`~biocframe.BiocFrame.BiocFrame`. Defaults to None. - - metadata (Dict, optional): Additional experimental metadata describing the methods. Defaults to None. + SummarizedExperiment follows the R/Bioconductor specification; + rows are features, columns are samples. """ def __init__( @@ -47,29 +26,43 @@ def __init__( row_data: Optional[BiocFrame] = None, col_data: Optional[BiocFrame] = None, metadata: Optional[Dict] = None, + validate: bool = True, ) -> None: """Initialize a Summarized Experiment (SE). Args: - assays (Dict[str, MatrixTypes]): A dictionary containing matrices, with assay names as keys + assays: + A dictionary containing matrices, with assay names as keys and 2-dimensional matrices represented as either :py:class:`~numpy.ndarray` or :py:class:`~scipy.sparse.spmatrix`. - Alternatively, you may use any 2-dimensional matrix that has the ``shape`` property and - implements the slice operation using the ``__getitem__`` dunder method. + Alternatively, you may use any 2-dimensional matrix that has + the ``shape`` property and implements the slice operation + using the ``__getitem__`` dunder method. + + All matrices in assays must be 2-dimensional and have the + same shape (number of rows, number of columns). + + row_data: + Features, must be the same length as the number of rows of + the matrices in assays. + + Feature information is coerced to a + :py:class:`~biocframe.BiocFrame.BiocFrame`. Defaults to None. - All matrices in assays must be 2-dimensional and have the same shape - (number of rows, number of columns). + col_data: + Sample data, must be the same length as the number of + columns of the matrices in assays. - row_data (BiocFrame, optional): Features, which must be of the same length as the rows of - the matrices in assays. Features can be either a :py:class:`~pandas.DataFrame` or + Sample information is coerced to a :py:class:`~biocframe.BiocFrame.BiocFrame`. Defaults to None. - col_data (BiocFrame, optional): Sample data, which must be of the same length as the - columns of the matrices in assays. Sample Information can be either a :py:class:`~pandas.DataFrame` - or :py:class:`~biocframe.BiocFrame.BiocFrame`. Defaults to None. + metadata: + Additional experimental metadata describing the methods. + Defaults to None. - metadata (Dict, optional): Additional experimental metadata describing the methods. Defaults to None. + validate: + Internal use only. """ if isinstance(row_data, GenomicRanges): @@ -77,35 +70,39 @@ def __init__( "`row_data` is `GenomicRanges`, consider using `RangeSummarizedExperiment`." ) - super().__init__(assays, rows=row_data, cols=col_data, metadata=metadata) - - def __getitem__( - self, - args: SlicerArgTypes, - ) -> "SummarizedExperiment": - """Subset a `SummarizedExperiment`. - - Args: - args (SlicerArgTypes): Indices or names to slice. The tuple contains - slices along dimensions (rows, cols). - - Each element in the tuple, might be either a integer vector (integer positions), - boolean vector or :py:class:`~slice` object. Defaults to None. - - Raises: - ValueError: If too many or too few slices provided. - - Returns: - SummarizedExperiment: Sliced `SummarizedExperiment` object. - """ - sliced_objs = self._slice(args) - return SummarizedExperiment( - assays=sliced_objs.assays, - row_data=sliced_objs.row_data, - col_data=sliced_objs.col_data, - metadata=self.metadata, + super().__init__( + assays, row_data=row_data, col_data=col_data, metadata=metadata, validate=validate ) + # def __getitem__( + # self, + # args: Union[int, str, Sequence, tuple], + # ) -> "SummarizedExperiment": + # """Subset a `SummarizedExperiment`. + + # Args: + # args: + # Indices or names to slice. The tuple contains + # slices along dimensions (rows, cols). + + # Each element in the tuple, might be either a integer vector (integer positions), + # boolean vector or :py:class:`~slice` object. Defaults to None. + + # Raises: + # ValueError: + # If too many or too few slices provided. + + # Returns: + # Sliced `SummarizedExperiment` object. + # """ + # sliced_objs = self._generic_slice(args) + # return SummarizedExperiment( + # assays=sliced_objs.assays, + # row_data=sliced_objs.rows, + # col_data=sliced_objs.cols, + # metadata=self.metadata, + # ) + def __repr__(self) -> str: pattern = ( f"Class SummarizedExperiment with {self.shape[0]} features and {self.shape[1]} " diff --git a/tests/test_RSE.py b/tests/test_RSE.py index e4652ed..870379d 100644 --- a/tests/test_RSE.py +++ b/tests/test_RSE.py @@ -37,7 +37,7 @@ } ) -gr = genomicranges.from_pandas(df_gr) +gr = genomicranges.GenomicRanges.from_pandas(df_gr) col_data = pd.DataFrame( { @@ -45,24 +45,24 @@ } ) -a = genomicranges.GenomicRanges( - { +a = genomicranges.GenomicRanges.from_pandas( + pd.DataFrame({ "seqnames": ["chr1", "chr2", "chr1", "chr3"], "starts": [1, 3, 2, 4], "ends": [10, 30, 50, 60], "strand": ["-", "+", "*", "+"], "score": [1, 2, 3, 4], - } + }) ) -b = genomicranges.GenomicRanges( - { +b = genomicranges.GenomicRanges.from_pandas( + pd.DataFrame({ "seqnames": ["chr2", "chr4", "chr5"], "starts": [3, 6, 4], "ends": [30, 50, 60], "strand": ["-", "+", "*"], "score": [2, 3, 4], - } + }) ) grl = genomicranges.GenomicRangesList(ranges=[a, b], names=["a", "b"]) diff --git a/tests/test_RSE_methods.py b/tests/test_RSE_methods.py index 341895c..1681aa7 100644 --- a/tests/test_RSE_methods.py +++ b/tests/test_RSE_methods.py @@ -37,7 +37,7 @@ } ) -gr = genomicranges.from_pandas(df_gr) +gr = genomicranges.GenomicRanges.from_pandas(df_gr) col_data = pd.DataFrame( { diff --git a/tests/test_SE.py b/tests/test_SE.py index bbe9a33..5075c35 100644 --- a/tests/test_SE.py +++ b/tests/test_SE.py @@ -1,6 +1,7 @@ from random import random import genomicranges +from biocframe import BiocFrame import numpy as np import pandas as pd from summarizedexperiment.SummarizedExperiment import SummarizedExperiment @@ -13,7 +14,7 @@ nrows = 200 ncols = 6 counts = np.random.rand(nrows, ncols) -df_gr = pd.DataFrame( +row_data = BiocFrame( { "seqnames": [ "chr1", @@ -36,8 +37,6 @@ } ) -gr = genomicranges.from_pandas(df_gr) - col_data = pd.DataFrame( { "treatment": ["ChIP", "Input"] * 3, @@ -45,54 +44,82 @@ ) -def test_SE_creation(): +def test_SE_init(): tse = SummarizedExperiment( - assays={"counts": counts}, row_data=gr, col_data=col_data + assays={"counts": counts}, row_data=row_data, col_data=col_data ) assert tse is not None assert isinstance(tse, SummarizedExperiment) assert tse.shape == (200, 6) + assert tse.row_data is not None + assert isinstance(tse.row_data, BiocFrame) + assert tse.col_data is not None + assert isinstance(tse.col_data, BiocFrame) -def test_SE_df(): +def test_SE_with_df(): tse = SummarizedExperiment( - assays={"counts": counts}, row_data=df_gr, col_data=col_data + assays={"counts": counts}, row_data=row_data.to_pandas(), col_data=col_data ) assert tse is not None assert isinstance(tse, SummarizedExperiment) assert tse.shape == (200, 6) + assert tse.row_data is not None + assert isinstance(tse.row_data, BiocFrame) + assert tse.col_data is not None + assert isinstance(tse.col_data, BiocFrame) -def test_SE_none(): +def test_SE_no_row_or_col_data(): tse = SummarizedExperiment(assays={"counts": counts}) assert tse is not None assert isinstance(tse, SummarizedExperiment) assert tse.shape == (200, 6) - - tse.row_names = [f"row_{i}" for i in range(200)] - assert tse.row_names is not None - assert len(tse.row_names) == 200 + assert tse.row_data is not None + assert isinstance(tse.row_data, BiocFrame) + assert tse.col_data is not None + assert isinstance(tse.col_data, BiocFrame) + + tse.row_data = tse.row_data.set_row_names([f"row_{i}" for i in range(200)]) + assert tse.rownames is not None + assert len(tse.rownames) == 200 assert tse.row_data.shape[0] == 200 + assert tse.row_data is not None + assert isinstance(tse.row_data, BiocFrame) + assert tse.col_data is not None + assert isinstance(tse.col_data, BiocFrame) - tse.colnames = [f"col_{i}" for i in range(6)] + tse.col_data = tse.coldata.set_row_names([f"col_{i}" for i in range(6)]) assert tse.colnames is not None assert len(tse.colnames) == 6 assert tse.col_data.shape[0] == 6 + assert tse.row_data is not None + assert isinstance(tse.row_data, BiocFrame) + assert tse.col_data is not None + assert isinstance(tse.col_data, BiocFrame) def test_SE_export(): tse = SummarizedExperiment( - assays={"counts": counts}, row_data=gr, col_data=col_data + assays={"counts": counts}, row_data=row_data, col_data=col_data ) assert tse is not None assert isinstance(tse, SummarizedExperiment) assert tse.shape == (200, 6) + assert tse.row_data is not None + assert isinstance(tse.row_data, BiocFrame) + assert tse.col_data is not None + assert isinstance(tse.col_data, BiocFrame) adata = tse.to_anndata() assert adata is not None assert adata.shape == (6, 200) + assert tse.row_data is not None + assert isinstance(tse.row_data, BiocFrame) + assert tse.col_data is not None + assert isinstance(tse.col_data, BiocFrame) diff --git a/tests/test_SE_methods.py b/tests/test_SE_methods.py index 626037f..b242910 100644 --- a/tests/test_SE_methods.py +++ b/tests/test_SE_methods.py @@ -1,8 +1,7 @@ from random import random -import genomicranges +from biocframe import BiocFrame import numpy as np -import pandas as pd from summarizedexperiment.SummarizedExperiment import SummarizedExperiment __author__ = "jkanche" @@ -13,7 +12,7 @@ nrows = 200 ncols = 6 counts = np.random.rand(nrows, ncols) -df_gr = pd.DataFrame( +row_data = BiocFrame( { "seqnames": [ "chr1", @@ -36,9 +35,7 @@ } ) -gr = genomicranges.from_pandas(df_gr) - -col_data = pd.DataFrame( +col_data = BiocFrame( { "treatment": ["ChIP", "Input"] * 3, } @@ -47,7 +44,7 @@ def test_SE_props(): tse = SummarizedExperiment( - assays={"counts": counts}, row_data=df_gr, col_data=col_data + assays={"counts": counts}, row_data=row_data, col_data=col_data ) assert tse is not None @@ -56,17 +53,18 @@ def test_SE_props(): assert tse.assay_names is not None assert len(tse.assay_names) == 1 - assert tse.col_data is not None assert tse.row_data is not None + assert isinstance(tse.row_data, BiocFrame) + assert tse.col_data is not None + assert isinstance(tse.col_data, BiocFrame) assert tse.dims == tse.shape - - assert tse.metadata is None + assert tse.metadata is not None def test_SE_set_props(): tse = SummarizedExperiment( - assays={"counts": counts}, row_data=df_gr, col_data=col_data + assays={"counts": counts}, row_data=row_data, col_data=col_data ) assert tse is not None @@ -78,37 +76,23 @@ def test_SE_set_props(): assert len(tse.assay_names) == 1 - tse.col_data = None - assert tse.col_data is not None - tse.row_data = None assert tse.row_data is not None + assert isinstance(tse.row_data, BiocFrame) + + tse.col_data = None + assert tse.col_data is not None + assert isinstance(tse.col_data, BiocFrame) assert tse.dims == tse.shape tse.metadata = {"something": "random"} - assert tse.metadata is not None - - -def test_SE_subset_assays(): - tse = SummarizedExperiment( - assays={"counts": counts}, row_data=df_gr, col_data=col_data - ) - - assert tse is not None - assert isinstance(tse, SummarizedExperiment) - - subset_asys = tse.subset_assays(row_indices=slice(1, 10), col_indices=[0, 1, 2]) - assert subset_asys is not None - assert isinstance(subset_asys, type(tse.assays)) - - assert len(subset_asys.keys()) == 1 - assert subset_asys["counts"].shape == (9, 3) + assert tse.metadata is not {} def test_SE_assay(): tse = SummarizedExperiment( - assays={"counts": counts}, row_data=df_gr, col_data=col_data + assays={"counts": counts}, row_data=row_data, col_data=col_data ) assert tse is not None diff --git a/tests/test_SE_subset.py b/tests/test_SE_subset.py index bc82502..d464347 100644 --- a/tests/test_SE_subset.py +++ b/tests/test_SE_subset.py @@ -1,6 +1,6 @@ from random import random -import genomicranges +from biocframe import BiocFrame import numpy as np import pandas as pd import pytest @@ -13,7 +13,7 @@ nrows = 200 ncols = 6 counts = np.random.rand(nrows, ncols) -df_gr = pd.DataFrame( +row_data = BiocFrame( { "seqnames": [ "chr1", @@ -36,8 +36,6 @@ } ) -gr = genomicranges.from_pandas(df_gr) - col_data = pd.DataFrame( { "treatment": ["ChIP", "Input"] * 3, @@ -45,9 +43,25 @@ ) +def test_SE_subset_assays(): + tse = SummarizedExperiment( + assays={"counts": counts}, row_data=row_data, col_data=col_data + ) + + assert tse is not None + assert isinstance(tse, SummarizedExperiment) + + subset_asys = tse.subset_assays(rows=slice(1, 10), columns=[0, 1, 2]) + assert subset_asys is not None + assert isinstance(subset_asys, type(tse.assays)) + + assert len(subset_asys.keys()) == 1 + assert subset_asys["counts"].shape == (9, 3) + + def test_SE_subset(): tse = SummarizedExperiment( - assays={"counts": counts}, row_data=df_gr, col_data=col_data + assays={"counts": counts}, row_data=row_data, col_data=col_data ) assert tse is not None @@ -88,9 +102,20 @@ def test_SE_subset_by_name(summarized_experiments): # subset with non-existent sample name se = summarized_experiments.se1 - with pytest.raises(ValueError): + with pytest.raises(Exception): subset_se = se[["HER2", "BRCA1", "something random"], ["cell_1", "cell_3"]] +def test_scalar_arg(summarized_experiments): + # subset with scalar + se = summarized_experiments.se1 + subset_se = se["HER2", ["cell_1", "cell_3"]] + + assert subset_se is not None + assert isinstance(subset_se, SummarizedExperiment) + assert len(subset_se.row_data) == 1 + assert len(subset_se.col_data) == 2 + + assert subset_se.assay("counts").shape == (1, 2) def test_SE_subset_by_name_fails(summarized_experiments): # subset by name with some that do not exist @@ -160,7 +185,7 @@ def test_SE_subset_biocframe_with_bools(summarized_experiments): def test_SE_subset_biocframe_with_bools_should_fail(summarized_experiments): se = summarized_experiments.se_biocframe_1 with pytest.raises(Exception): - se[[True, False],] + se[[True, False, "True"],] def test_SE_subset_fails_with_indexes(summarized_experiments): @@ -171,19 +196,19 @@ def test_SE_subset_fails_with_indexes(summarized_experiments): # subset by name when index is not available tse = SummarizedExperiment( - assays={"counts": counts}, row_data=df_gr, col_data=col_data + assays={"counts": counts}, row_data=row_data, col_data=col_data ) assert tse is not None assert isinstance(tse, SummarizedExperiment) - with pytest.raises(ValueError): + with pytest.raises(Exception): tse[["0", "1", "2"], ["2", "3"]] def test_SE_subset_single_indexer_list(summarized_experiments): se = summarized_experiments.se1 - subset_se = se[[True, False, True]] + subset_se = se[[True, False, True],] assert subset_se is not None assert isinstance(subset_se, SummarizedExperiment) @@ -195,7 +220,7 @@ def test_SE_subset_single_indexer_list(summarized_experiments): def test_SE_subset_single_indexer_slicer(summarized_experiments): se = summarized_experiments.se1 - subset_se = se[0:2] + subset_se = se[0:2,] assert subset_se is not None assert isinstance(subset_se, SummarizedExperiment)