From 78dfa669f2dda9ae897f2431ee6b1ed10ca8cb07 Mon Sep 17 00:00:00 2001 From: Jayaram Kancherla Date: Tue, 2 Jan 2024 14:50:18 -0800 Subject: [PATCH 01/21] untested code changes --- setup.cfg | 7 +- .../MultiAssayExperiment.py | 1185 ++++++++++------- src/multiassayexperiment/types.py | 20 - 3 files changed, 733 insertions(+), 479 deletions(-) delete mode 100644 src/multiassayexperiment/types.py diff --git a/setup.cfg b/setup.cfg index 8c7f19c..c066e67 100644 --- a/setup.cfg +++ b/setup.cfg @@ -41,7 +41,7 @@ package_dir = =src # Require a min/specific Python version (comma-separated conditions) -# python_requires = >=3.8 +python_requires = >=3.8 # Add here dependencies of your project (line-separated), e.g. requests>=2.2,<3.0. # Version specifiers like >=2.2,<3.0 avoid problems due to API changes in @@ -49,9 +49,8 @@ package_dir = # For more information, check out https://semver.org/. install_requires = importlib-metadata; python_version<"3.8" - summarizedexperiment>=0.3.0 - singlecellexperiment>=0.3.0 - mudata + biocframe>=0.5.6,<0.6.0 + biocutils>=0.1.4,<0.2.0 [options.packages.find] where = src diff --git a/src/multiassayexperiment/MultiAssayExperiment.py b/src/multiassayexperiment/MultiAssayExperiment.py index b3cee41..da606b2 100644 --- a/src/multiassayexperiment/MultiAssayExperiment.py +++ b/src/multiassayexperiment/MultiAssayExperiment.py @@ -1,614 +1,854 @@ -from collections import OrderedDict +from collections import OrderedDict, namedtuple from copy import deepcopy -from typing import Dict, MutableMapping, Optional, Sequence +from typing import Any, Dict, List, Optional, Sequence, Union from warnings import warn -from mudata import MuData -from pandas import DataFrame, concat +import biocframe +import biocutils as ut +import summarizedexperiment as se from singlecellexperiment import SingleCellExperiment -from summarizedexperiment import SummarizedExperiment -from summarizedexperiment.type_checks import is_bioc_or_pandas_frame, is_list_of_type - -from .types import SlicerArgTypes, SlicerResult, SlicerTypes, StrOrListStr __author__ = "jkanche" __copyright__ = "jkanche" __license__ = "MIT" +SlicerResult = namedtuple("SlicerResult", ["experiments", "sample_map", "column_data"]) -class MultiAssayExperiment: - """Container class for representing and managing multi-omics genomic experiments. Checkout the - `R/MultiAssayExperiment `_ - for more information. - Attributes: - experiments (MutableMapping[str, SummarizedExperiment]): A dictionary of - experiments with experiment names as keys and the experiments as values. +def _sanitize_frame(frame): + if se._frameutils.is_pandas(frame): + frame = biocframe.from_pandas(frame) + + return frame - Each ``experiment`` may be either a - :py:class:`~summarizedexperiment.SummarizedExperiment.SummarizedExperiment` - and any class that extends `SummarizedExperiment`. - col_data (DataFrame]): Bio-specimen/sample information. - The ``col_data`` may provide information about patients, cell lines, or - other biological units. +def _validate_experiments(experiments): + if not isinstance(experiments, dict): + raise TypeError("experiments must be a dictionary.") - Each row in this table is an independent biological unit. Must contain an `index` - that maps to primary in ``sample_map``. + for k, v in experiments.items(): + if not hasattr(v, "shape"): + raise ValueError(f"experiment: {k} is not supported.") - sample_map (DataFrame): Map biological units from - :py:attr:`~multiassayexperiment.MultiAssayExperiment.MultiAssayExperiment.col_data` - to the list of - :py:attr:`~multiassayexperiment.MultiAssayExperiment.MultiAssayExperiment.experiments`. - Must contain columns "assay", "primary" and "colname". +def _validate_column_data(column_data): + if column_data is None: + raise ValueError("'column_data' cannot be None.") - - **assay** provides the names of the different experiments performed on the - biological units. All experiment names from - :py:attr:`~multiassayexperiment.MultiAssayExperiment.MultiAssayExperiment.experiments` - must be present in this column. - - **primary** contains the sample name. All names in this column must match with - row labels from - :py:attr:`~multiassayexperiment.MultiAssayExperiment.MultiAssayExperiment.col_data`. - - **colname** is the mapping of samples/cells within each experiment back to its - biosample information in - :py:attr:`~multiassayexperiment.MultiAssayExperiment.MultiAssayExperiment.col_data`. + if not isinstance(column_data, biocframe.BiocFrame): + raise TypeError("'column_data' is not a `BiocFrame` object.") - Each sample in ``col_data`` may map to one or more columns per assay. + if column_data.row_names is None: + raise ValueError("`column_data` must have row names or labels.") - :py:class:`~multiassayexperiment.io.interface.make_mae`, or import functions to - read data as ``MultiAssayExperiment`` from - :py:class:`~multiassayexperiment.io.mudata.from_mudata` and - :py:class:`~multiassayexperiment.io.anndata.from_anndata`. - metadata (MutableMapping, optional): Additional study level metadata. Defaults to None. +def _validate_sample_map_with_column_data(sample_map, column_data): + # check if all samples are from primary exist in col data + _samples = sample_map.get_column("primary") + _sample_set = set(_samples) + _sample_diff = _sample_set.difference(column_data.row_names) + if len(_sample_diff) > 0: + raise ValueError( + "`sample_map`'s 'primary' contains samples not represented by 'row_names' from `column_data`." + ) + + if len(_sample_set) != column_data.shape[0]: + warn("'primary' from `sample_map` & `column_data` mismatch.", UserWarning) + + +def _validate_sample_map_with_expts(sample_map, experiments): + # check if all assay names are in experiments + smap_unique_assays = set(sample_map.get_column("assay")) + unique_expt_names = set(list(experiments.keys())) + + if (len(unique_expt_names) != len(smap_unique_assays)) or ( + unique_expt_names != smap_unique_assays + ): + raise ValueError("'assays' mismatch between `sample_map` and `experiments`.") + + # check if colnames exist + agroups = sample_map.split("assay") + for grp, rows in agroups: + if grp not in experiments: + warn( + f"Experiment '{grp}' exists in `sample_map` but not in `experiments`.", + UserWarning, + ) + + if set(rows.get_column("colname")) != set(experiments[grp].column_names): + raise ValueError( + f"Experiment '{grp}' does not contain all columns mentioned in `sample_map`." + ) + + +def _validate_sample_map(sample_map, column_data, experiments): + if sample_map is None: + raise ValueError("'sample_map' cannot be None.") + + if not isinstance(sample_map, biocframe.BiocFrame): + raise TypeError("'sample_map' is not a `BiocFrame` object.") + + if not set(["assay", "primary", "colname"]).issubset(sample_map.column_names): + raise ValueError( + "'sample_map' does not contain required columns: 'assay', 'primary' and 'colname'." + ) + + _validate_column_data(column_data) + _validate_sample_map_with_column_data(sample_map, column_data) + _validate_sample_map_with_expts(sample_map, experiments) + + +class MultiAssayExperiment: + """Container class for representing and managing multi-omics genomic experiments. Checkout the + `R/MultiAssayExperiment `_ + for more information. """ def __init__( self, - experiments: MutableMapping[str, SummarizedExperiment], - col_data: DataFrame, - sample_map: DataFrame, - metadata: Optional[MutableMapping] = None, + experiments: Dict[str, Any], + column_data: biocframe.BiocFrame, + sample_map: biocframe.BiocFrame, + metadata: Optional[dict] = None, + validate: bool = True, ) -> None: - """Construct an MAE.""" - self._validate_experiments(experiments) - self._validate_sample_map( - sample_map=sample_map, col_data=col_data, experiments=experiments - ) - self._sample_map = sample_map - self._col_data = col_data - self._experiments = experiments + """Initialize an instance of ``MultiAssayExperiment``. - self._metadata = metadata + You may also initialize an ``MultiAssayExperiment`` using + :py:class:`~multiassayexperiment.io.interface.make_mae` or by + transform from :py:class:`~multiassayexperiment.io.mudata.from_mudata` and + :py:class:`~multiassayexperiment.io.anndata.from_anndata` objects. - def _validate_experiments( - self, experiments: MutableMapping[str, SummarizedExperiment] - ): - """Internal method to validate experiments. + Args: + experiments: + A dictionary containing experiments, with experiment names as keys and + the experiments as values. - Raises: - TypeError: If experiments is not a :py:class:`~dict`. - """ - if not isinstance(experiments, dict): - raise TypeError("experiments must be a dictionary.") + Each ``experiment`` may be either a + :py:class:`~summarizedexperiment.SummarizedExperiment.SummarizedExperiment` + or any class that extends ``SummarizedExperiment``. - def _validate_col_data(self, col_data: DataFrame): - """Internal method to validate ``col_data``. + column_data: + Bio-specimen/sample information. - Args: - col_data (DataFrame): Column data. + ``column_data`` may provide information about patients, cell lines, or other biological units. - ``col_data`` may be either a :py:class:`~pandas.DataFrame` or - :py:class:`~biocframe.BiocFrame.BiocFrame` object. + Each row in this table represents an independent biological unit. It must contain an `index` + that maps to the 'primary' in + :py:attr:`~multiassayexperiment.MultiAssayExperiment.MultiAssayExperiment.sample_map`. - Raises: - TypeError: If object is not an expected type. - """ - if not is_bioc_or_pandas_frame(col_data): - raise TypeError( - "`col_data` must be either a pandas DataFrame or a BiocFrame object." - ) + sample_map: + Map biological units from + :py:attr:`~multiassayexperiment.MultiAssayExperiment.MultiAssayExperiment.column_data` + to the list of experiments. - if isinstance(col_data, DataFrame): - if col_data.index is None: - raise ValueError("`col_data` must have an index column.") - else: - if col_data.row_names is None: - raise ValueError("`col_data` must have row names or labels.") + Must contain columns "assay", "primary", and "colname". - def _validate_sample_map_with_col_data( - self, sample_map: DataFrame, col_data: DataFrame - ): - """Internal method to validate ``sample_map`` and ``col_data``. + - `assay` provides the names of the different experiments performed on the biological units. + All experiment names from + :py:attr:`~multiassayexperiment.MultiAssayExperiment.MultiAssayExperiment.experiments` must + be present in this column. + - `primary` contains the sample name. All names in this column must match with row labels from + :py:attr:`~multiassayexperiment.MultiAssayExperiment.MultiAssayExperiment.column_data`. + - `colname` is the mapping of samples/cells within each experiment back to its biosample information in + :py:attr:`~multiassayexperiment.MultiAssayExperiment.MultiAssayExperiment.column_data`. - Args: - sample_map (DataFrame): Sample mapping. - col_data (DataFrame): Column data. + Each sample in ``column_data`` may map to one or more columns per assay. - Raises: - ValueError: If any of the checks fail. + metadata: + Additional study-level metadata. + Defaults to None. + + validate: + Internal use only. """ - # check if unique samples is same as in sample data - _samples = list(sample_map["primary"]) - _sample_set = set(_samples) - _sample_diff = _sample_set.difference(col_data.index.tolist()) - if len(_sample_diff) > 0: - raise ValueError( - "'primary' from `sample_map` has unknown samples not present in `col_data`." - ) - if len(_sample_set) != col_data.shape[0]: - warn("'primary' from `sample_map` & `col_data` has missing samples.") + self._sample_map = _sanitize_frame(sample_map) + self._column_data = _sanitize_frame(column_data) + self._experiments = experiments if experiments is not None else {} + self._metadata = metadata if metadata is not None else {} - def _validate_sample_map_with_Expts( - self, - sample_map: DataFrame, - experiments: MutableMapping[str, SummarizedExperiment], - ): - """Internal method to validate ``sample_map`` and ``experiments``. + if validate: + _validate_experiments(self._experiments) + _validate_column_data(self._column_data) + _validate_sample_map(self._sample_map, self._column_data, self._experiments) - Args: - sample_map (DataFrame): Sample mapping. - experiments (MutableMapping[str, SummarizedExperiment]): Experiments. + def _define_output(self, in_place: bool = False) -> "MultiAssayExperiment": + if in_place is True: + return self + else: + return self.__copy__() - Raises: - ValueError: If any of the checks fail. + ######################### + ######>> Copying <<###### + ######################### + + def __deepcopy__(self, memo=None, _nil=[]): """ - # check if all assay names are in experiments - smapUniqueAssaynames = set(sample_map["assay"]) - UniqueExperimentname = set(list(experiments.keys())) + Returns: + A deep copy of the current ``MultiAssayExperiment``. + """ + from copy import deepcopy + + _expts_copy = deepcopy(self._experiments) + _sample_map_copy = deepcopy(self._sample_map) + _column_data_copy = deepcopy(self._column_data) + _metadata_copy = deepcopy(self.metadata) + + current_class_const = type(self) + return current_class_const( + experiment=_expts_copy, + column_data=_column_data_copy, + sample_map=_sample_map_copy, + metadata=_metadata_copy, + ) - if (len(UniqueExperimentname) != len(smapUniqueAssaynames)) or ( - UniqueExperimentname != smapUniqueAssaynames - ): - raise ValueError( - "'assays' from sample_map does not match with `experiments`." - ) + def __copy__(self): + """ + Returns: + A shallow copy of the current ``MultiAssayExperiment``. + """ + current_class_const = type(self) + return current_class_const( + experiment=self._experiments, + column_data=self._column_data, + sample_map=self._sample_map, + metadata=self._metadata, + ) - # check if colnames exist - agroups = sample_map.groupby("assay") - for group, rows in agroups: - if group not in experiments: - raise ValueError( - f"Experiment '{group}' exists in `sample_map` but not in `experiments`." - ) + def copy(self): + """Alias for :py:meth:`~__copy__`.""" + return self.__copy__() - gcol_data = experiments[group].col_data + ########################## + ######>> Printing <<###### + ########################## - if set(rows["colname"].tolist()) != set(gcol_data.index.tolist()): - raise ValueError( - f"Experiment '{group}' does not contain all columns in `sample_map`." - ) + def __repr__(self) -> str: + """ + Returns: + A string representation. + """ + output = f"{type(self).__name__}(" + output += ", experiments=" + ut.print_truncated_list(self._experiments) + output += ", column_data=" + self._column_data.__repr__() + output += ", sample_map=" + self._sample_map.__repr__() - def _validate_sample_map( - self, - sample_map: DataFrame, - col_data: DataFrame, - experiments: MutableMapping[str, SummarizedExperiment], - ): - """Validate sample map. + if len(self._metadata) > 0: + output += ", metadata=" + ut.print_truncated_dict(self._metadata) - Args: - sample_map (DataFrame): Sample map. - col_data (DataFrame): Column data. - experiments (MutableMapping[str, SummarizedExperiment]): Experiments. + output += ")" + return output - Raises: - TypeError, ValueError: If any of the checks fail. + def __str__(self) -> str: """ - if not is_bioc_or_pandas_frame(sample_map): - raise TypeError( - "`sample_map` must be either a pandas `DataFrame` or a `BiocFrame` object." - ) + Returns: + A pretty-printed string containing the contents of this object. + """ + output = f"class: {type(self).__name__} containing {len(self.experiment_names)} experiments\n" - if not set(["assay", "primary", "colname"]).issubset(list(sample_map.columns)): - raise ValueError( - "`sample_map` does not contain required columns: 'assay', 'primary/ and `'colname'." - ) + for idx in range(len(self.experiment_names)): + expt_name = self.experiment_names[idx] + expt = self._experiments[expt_name] + output += f"[{idx}] {expt_name}: {type(expt).__name} with {expt.shape[0]} rows and {expt.shape[1]} columns" - self._validate_sample_map_with_col_data(sample_map, col_data) - self._validate_sample_map_with_Expts(sample_map, experiments) + output += f"column_data columns({len(self._column_data.column_names)}): {ut.print_truncated_list(self._column_data.column_names)}\n" + output += f"sample_map columns({len(self._sample_map.column_names)}): {ut.print_truncated_list(self._sample_map.column_names)}\n" - def _validate(self): - """Internal method to validate the object. + output += f"metadata({str(len(self.metadata))}): {ut.print_truncated_list(list(self.metadata.keys()), sep=' ', include_brackets=False, transform=lambda y: y)}\n" - Raises: - ValueError: If attributes don't match expectations. - """ - self._validate_experiments(self._experiments) - self._validate_col_data(self._col_data) - self._validate_sample_map(self._sample_map, self._col_data, self._experiments) + return output - @property - def experiments( - self, - ) -> Dict[str, SummarizedExperiment]: - """Get experiments. + ############################# + ######>> experiments <<###### + ############################# + + def get_experiments(self) -> Dict[str, Any]: + """Access experiments. Returns: - Dict[str, SummarizedExperiment]: A dictionary of all experiments, with experiment + A dictionary of all experiments, with experiment names as keys and experiment data as value. """ return self._experiments + def set_experiments( + self, experiments: Dict[str, Any], in_place: bool = False + ) -> "MultiAssayExperiment": + """Set new experiments. + + Args: + experiments: + New experiments to set. A dictionary of experiments with experiment names as keys and + the experiments as values. + + Each ``experiment`` may be either a + :py:class:`~summarizedexperiment.SummarizedExperiment.SummarizedExperiment` + or any class that extends ``SummarizedExperiment``. + + in_place: + Whether to modify the ``MultiAssayExperiment`` in place. + + Returns: + A modified ``MultiAssayExperiment`` object, either as a copy of the original + or as a reference to the (in-place-modified) original. + """ + + _validate_experiments(experiments) + _validate_sample_map_with_expts(self._sample_map, experiments) + + output = self._define_output(in_place) + output._experiments = experiments + return output + + @property + def experiments( + self, + ) -> Dict[str, Any]: + """Alias for :py:meth:`~get_experiments`.""" + return self.get_experiments() + @experiments.setter def experiments( self, - experiments: MutableMapping[str, SummarizedExperiment], + experiments: Dict[str, Any], ): - """Set new experiments. + """Alias for :py:meth:`~set_experiments` with ``in_place = True``. + + As this mutates the original object, a warning is raised. + """ + warn( + "Setting property 'experiments' is an in-place operation, use 'set_experiments' instead", + UserWarning, + ) + self.set_experiments(experiments, in_place=True) + + @property + def assays(self) -> Dict[str, Any]: + """Alias for :py:meth:`~get_experiments`.""" + return self.get_experiments() + + ################################## + ######>> experiment names <<###### + ################################## + + def get_experiment_names(self) -> List[str]: + """Get experiment names. + + Returns: + List of experiment names. + """ + return list(self._experiments.keys()) + + def set_experiment_names( + self, names: List[str], in_place: bool = False + ) -> "MultiAssayExperiment": + """Replace :py:attr:`~experiments`'s names. Args: - experiments (MutableMapping[str, SummarizedExperiment]): New experiments to set. - A dictionary of experiments with experiment names as keys and the experiments - as values. + names: + New names. - Each ``experiment`` may be either a - :py:class:`~summarizedexperiment.SummarizedExperiment.SummarizedExperiment` - and any class that extends `SummarizedExperiment`. + in_place: + Whether to modify the ``MultiAssayExperiment`` in place. + + Returns: + A modified ``MultiAssayExperiment`` object, either as a copy of the original + or as a reference to the (in-place-modified) original. """ + current_names = self.get_experiment_names() + if len(names) != len(current_names): + raise ValueError( + "Length of 'names' does not match the number of `experiments`." + ) - self._validate_experiments(experiments) - self._validate_sample_map_with_Expts(self._sample_map, experiments) - self._experiments = experiments + new_experiments = OrderedDict() + for idx in range(len(names)): + new_experiments[names[idx]] = self._experiments.pop(current_names[idx]) + + output = self._define_output(in_place) + output._experiments = new_experiments + return output + + @property + def experiment_names(self) -> List[str]: + """Alias for :py:meth:`~get_experiment_names`.""" + return self.get_experiment_names() + + @experiment_names.setter + def experiment_names(self, names: List[str]): + """Alias for :py:meth:`~set_experiment_names` with ``in_place = True``. + + As this mutates the original object, a warning is raised. + """ + warn( + "Setting property 'experiment_names' is an in-place operation, use 'set_experiment_names' instead", + UserWarning, + ) + self.set_experiment_names(names, in_place=True) - def experiment( - self, name: str, with_sample_data: bool = False - ) -> SummarizedExperiment: - """Get experiment by name. + ##################################### + ######>> experiment accessor <<###### + ##################################### - If ``with_sample_data`` is True, a copy of the experiment object is returned. + def experiment(self, name: str, with_sample_data: bool = False) -> Any: + """Get an experiment by name. Args: - name (str): Experiment name. - with_sampleData (bool, optional): Whether to merge column data of the experiment with - sample data from the MAE. Defaults to False. + name: + Experiment name. - Raises: - ValueError: If experiment name does not exist. + with_sample_data: + Whether to merge column data of the experiment with + :py:attr:`~sample_data` from the MAE. + + Defaults to False. Returns: - SummarizedExperiment: A class that extends - :py:class:`~summarizedexperiment.SummarizedExperiment.SummarizedExperiment`. + The experiment object. + + If ``with_sample_data`` is `True`, a copy of the experiment object is returned. """ if name not in self._experiments: - raise ValueError(f"Experiment '{name}' does not exist.") + raise ValueError(f"'{name}' is not a valid experiment name.") expt = self.experiments[name] if with_sample_data is True: expt = deepcopy(expt) - subset_map = self.sample_map[self.sample_map["assay"] == name] - subset_map = subset_map.set_index("colname") + assay_splits = self.sample_map.split("assay", indices_only=True) + subset_map = self.sample_map[assay_splits[name]] + subset_map = subset_map.set_row_names(subset_map.get_column("colname")) - expt_col_data = expt.col_data - new_col_data = concat([subset_map, expt_col_data], axis=1) - expt.col_data = new_col_data + expt_column_data = expt.column_data + new_column_data = biocframe.merge( + [subset_map, expt_column_data], join="outer" + ) + + expt.column_data = new_column_data return expt - @property - def sample_map(self) -> DataFrame: - """Get sample map between experiments and sample metadata. + ############################ + ######>> sample map <<###### + ############################ + + def get_sample_map(self) -> biocframe.BiocFrame: + """Acess sample map. Returns: - DataFrame: a DataFrame with sample mapping information. + A :py:class:`~biocframe.BiocFrame.BiocFrame` with sample mapping information. """ return self._sample_map - @sample_map.setter - def sample_map(self, sample_map: DataFrame): + def set_sample_map( + self, sample_map: biocframe.BiocFrame, in_place: bool = False + ) -> "MultiAssayExperiment": """Set new sample mapping. Args: - sample_map (DataFrame): New sample map. + sample_map: + New sample map. + + in_place: + Whether to modify the ``MultiAssayExperiment`` in place. + + Returns: + A modified ``MultiAssayExperiment`` object, either as a copy of the original + or as a reference to the (in-place-modified) original. """ - self._validate_sample_map(sample_map, self._col_data, self._experiments) - self._sample_map = sample_map + sample_map = _sanitize_frame(sample_map) + _validate_sample_map(sample_map, self._column_data, self._experiments) + + output = self._define_output(in_place) + output._sample_map = sample_map + return output @property - def col_data(self) -> DataFrame: + def sample_map(self) -> biocframe.BiocFrame: + """Alias for :py:meth:`~get_sample_map`.""" + return self.get_sample_map() + + @sample_map.setter + def sample_map(self, sample_map: biocframe.BiocFrame): + """Alias for :py:meth:`~set_sample_map` with ``in_place = True``. + + As this mutates the original object, a warning is raised. + """ + warn( + "Setting property 'sample_map' is an in-place operation, use 'set_sample_map' instead", + UserWarning, + ) + self.set_sample_map(sample_map, in_place=True) + + ############################# + ######>> column_data <<###### + ############################# + + def get_column_data(self) -> biocframe.BiocFrame: """Get sample metadata. Returns: - DataFrame: Sample metadata. + A :py:class:`~biocframe.BiocFrame.BiocFrame` containing sample metadata. """ - return self._col_data + return self._column_data - @col_data.setter - def col_data(self, col_data: DataFrame): - """Set sample metadata. + def set_column_data( + self, column_data: biocframe.BiocFrame, in_place: bool = False + ) -> "MultiAssayExperiment": + """Set new sample metadata. Args: - col_data (DataFrame): New sample metadata. + column_data: + New sample metadata. + + in_place: + Whether to modify the ``MultiAssayExperiment`` in place. + + Returns: + A modified ``MultiAssayExperiment`` object, either as a copy of the original + or as a reference to the (in-place-modified) original. """ - self._validate_col_data(col_data) - self._validate_sample_map_with_col_data(self._sample_map, col_data) - self._col_data = col_data + column_data = _sanitize_frame(column_data) - @property - def assays( - self, - ) -> Dict[str, SummarizedExperiment]: - """Get experiments. + self._validate_column_data(column_data) + self._validate_sample_map_with_column_data(self._sample_map, column_data) - Alias to the - :py:attr:`~multiassayexperiment.MultiAssayExperiment.MultiAssayExperiment.experiments`. + output = self._define_output(in_place) + output._column_data = column_data + return output - Returns: - Dict[str, SummarizedExperiment]: All experiments. - A dictionary of experiments with experiment names as keys and the experiments - as values. + @property + def column_data(self) -> biocframe.BiocFrame: + """Alias for :py:meth:`~get_column_data`.""" + return self.get_column_data() - Each ``experiment`` may be either a - :py:class:`~summarizedexperiment.SummarizedExperiment.SummarizedExperiment` - and any class that extends `SummarizedExperiment`. + @column_data.setter + def column_data(self, column_data: biocframe.BiocFrame): + """Alias for :py:meth:`~set_column_data` with ``in_place = True``. + + As this mutates the original object, a warning is raised. """ - return self.experiments + warn( + "Setting property 'column_data' is an in-place operation, use 'set_column_data' instead", + UserWarning, + ) + self.set_column_data(column_data, in_place=True) - @property - def metadata(self) -> Optional[Dict]: - """Get metadata. + ########################### + ######>> metadata <<####### + ########################### + def get_metadata(self) -> dict: + """ Returns: - Optional[Dict]: Metadata if available. + Dictionary of metadata for this object. """ return self._metadata - @metadata.setter - def metadata(self, metadata: MutableMapping): - """Set metadata. + def set_metadata( + self, metadata: dict, in_place: bool = False + ) -> "MultiAssayExperiment": + """Set additional metadata. Args: - metadata (MutableMapping): New metadata object. - """ - self._metadata = metadata + metadata: + New metadata for this object. - def _subset_experiments( - self, subset: StrOrListStr - ) -> Dict[str, SummarizedExperiment]: - """Internal method to subset experiments. - - Args: - subset (StrOrListStr): May be an single experiment name to keep. - Alternatively, ``subset`` may be a list of experiment names. + in_place: + Whether to modify the ``MultiAssayExperiment`` in place. Returns: - Dict[str, SummarizedExperiment]: A dictionary with experiment names as keys - and the subsetted experiment data as value. + A modified ``MultiAssayExperiment`` object, either as a copy of the original + or as a reference to the (in-place-modified) original. """ - if isinstance(subset, str): - subset = [subset] + if not isinstance(metadata, dict): + raise TypeError( + f"`metadata` must be a dictionary, provided {type(metadata)}." + ) + output = self._define_output(in_place) + output._metadata = metadata + return output - if not is_list_of_type(subset, str): - raise ValueError("All provided experiment names must be `strings`.") + @property + def metadata(self) -> dict: + """Alias for :py:attr:`~get_metadata`.""" + return self.get_metadata() - newExpt = OrderedDict() + @metadata.setter + def metadata(self, metadata: dict): + """Alias for :py:attr:`~set_metadata` with ``in_place = True``. - for texpt in subset: - if texpt not in self.experiments: - raise ValueError( - f"experiment {texpt} does not exist. should be {list(self.experiments.keys())}" - ) - newExpt[texpt] = self.experiments[texpt] + As this mutates the original object, a warning is raised. + """ + warn( + "Setting property 'metadata' is an in-place operation, use 'set_metadata' instead", + UserWarning, + ) + self.set_metadata(metadata, in_place=True) - return newExpt + ######################### + ######>> subset <<####### + ######################### - def _slice( + def subset_experiments( self, - args: SlicerArgTypes, - ) -> SlicerResult: - """Internal method to slice by index. + rows: Optional[Union[str, int, bool, Sequence]], + columns: Optional[Union[str, int, bool, Sequence]], + experiment_names: Union[str, int, bool, Sequence], + ) -> Dict[str, Any]: + """Subset experiments. Args: - args (SlicerArgTypes): Indices or names to slice. Tuple - contains slices along dimensions (rows, columns, experiments). + rows: + Row indices to subset. - Each element in the tuple, might be either a integer vector (integer positions), - boolean vector or :py:class:`~slice` object. Defaults to None. + Integer indices, a boolean filter, or (if the current object is + named) names specifying the ranges to be extracted, see + :py:meth:`~biocutils.normalize_subscript.normalize_subscript`. - Raises: - ValueError: Too many or too few slices. + columns: + Column indices to subset. + + Integer indices, a boolean filter, or (if the current object is + named) names specifying the ranges to be extracted, see + :py:meth:`~biocutils.normalize_subscript.normalize_subsc + + experiment_names: + Experiment name to keep. + + Integer indices, a boolean filter, or (if the current object is + named) names specifying the ranges to be extracted, see + :py:meth:`~biocutils.normalize_subscript.normalize_subscript`. + + Check :py:attr:`~experiment_names` for a list of valid experiment names. Returns: - SlicerResult: Sliced row, cols and assays. + A dictionary with experiment names as keys + and the subsetted experiment data as value. """ + _expts_copy = self._experiments.copy() + if experiment_names is None: + experiment_names = slice(None) - if len(args) == 0: - raise ValueError("`args` must contain at least one slice.") + if isinstance(experiment_names, slice) and experiment_names != slice(None): + expts, _ = ut.normalize_subscript( + experiment_names, len(self.experiment_names), self.experiment_names + ) - rowIndices = args[0] - colIndices = None - exptIndices = None + to_keep = self.experiment_names[expts] - if len(args) > 1: - colIndices = args[1] + new_expt = OrderedDict() + for texpt in to_keep: + new_expt[texpt] = _expts_copy[texpt] - if len(args) > 2: - exptIndices = args[2] + _expts_copy = new_expt - if exptIndices is not None: - if isinstance(exptIndices, str): - exptIndices = [exptIndices] + for k, v in _expts_copy.items(): + _expts_copy[k] = v[rows, columns] - if not all([isinstance(x, str) for x in exptIndices]): - raise ValueError( - "All `experiments` in the 3rd slice for `args` must be strings." - ) + return new_expt - if len(args) > 3: - raise ValueError("`args` contains too many slices.") + def _generic_slice( + self, + rows: Optional[Union[str, int, bool, Sequence]], + columns: Optional[Union[str, int, bool, Sequence]], + experiments: Optional[Union[str, int, bool, Sequence]], + ) -> SlicerResult: + """Slice ``MultiAssayExperiment`` along the rows and/or columns, based on their indices or names. - subsetExpts = self.experiments.copy() + Args: + rows: + Rows to be extracted. - if exptIndices is not None: - subsetExpts = self._subset_experiments(exptIndices) + Integer indices, a boolean filter, or (if the current object is + named) names specifying the ranges to be extracted, see + :py:meth:`~biocutils.normalize_subscript.normalize_subscript`. - if rowIndices is not None: - if isinstance(rowIndices, dict): - incorrect = set(list(rowIndices.keys())).difference( - list(subsetExpts.keys()) - ) - if len(incorrect) > 0: - raise ValueError( - f"Incorrect experiment names provided: {', '.join(incorrect)}." - ) - - for expname, expt in subsetExpts.items(): - if expname in rowIndices: - subsetExpts[expname] = expt[rowIndices[expname], :] - else: - subsetExpts[expname] = expt - - elif isinstance(rowIndices, slice) or all( - isinstance(x, int) for x in rowIndices - ): - for expname, expt in subsetExpts.items(): - subsetExpts[expname] = expt[rowIndices, :] - else: - raise TypeError("Row indices is not an expected type.") + columns: + Columns to be extracted. - if colIndices is not None: - if isinstance(colIndices, dict): - incorrect = set(list(colIndices.keys())).difference( - list(subsetExpts.keys()) - ) - if len(incorrect) > 0: - raise ValueError( - f"Incorrect experiment names provided: {', '.join(incorrect)}." - ) - - for expname, expt in subsetExpts.items(): - if expname in colIndices: - subsetExpts[expname] = expt[:, colIndices[expname]] - else: - subsetExpts[expname] = expt - - elif isinstance(colIndices, slice) or all( - isinstance(x, int) for x in colIndices - ): - for expname, expt in subsetExpts.items(): - subsetExpts[expname] = expt[:, colIndices] - else: - raise TypeError("Columns indices is not an expected type.") + Integer indices, a boolean filter, or (if the current object is + named) names specifying the ranges to be extracted, see + :py:meth:`~biocutils.normalize_subscript.normalize_subscript`. - # filter sample_map - subsetColnames = [] - subsetsample_map = DataFrame() - for expname, expt in subsetExpts.items(): - subsetColnames.extend(expt.colnames) - subsetsample_map = concat( - [ - subsetsample_map, - self.sample_map[ - (self.sample_map["assay"] == expname) - & (self.sample_map["colname"].isin(expt.colnames)) - ], - ] - ) + experiment: + Experiments to extract. + + Integer indices, a boolean filter, or (if the current object is + named) names specifying the ranges to be extracted, see + :py:meth:`~biocutils.normalize_subscript.normalize_subscript`. + + Check :py:attr:`~experiment_names` for a list of valid experiment names. + + Returns: + The sliced tuple containing the new sample_map, column_data and experiments + for use in downstream methods. + """ - # filter col_data - subsetcol_data = self.col_data[ - self.col_data.index.isin(subsetsample_map["primary"].unique().tolist()) - ] + if rows is None: + rows = slice(None) - return SlicerResult(subsetExpts, subsetsample_map, subsetcol_data) + if columns is None: + columns = slice(None) - def subset_by_experiments(self, subset: StrOrListStr) -> "MultiAssayExperiment": + if experiments is None: + experiments = slice(None) + + _new_experiments = self.subset_experiments( + experiment_names=experiments, rows=rows, columns=columns + ) + + # filter sample_map + smap_indices_to_keep = [] + for expname, expt in _new_experiments.items(): + counter = 0 + for _, row in self._sample_map: + if row["assay"] == expname and row["colname"] in expt.column_names: + smap_indices_to_keep.append(counter) + counter += 1 + + _new_sample_map = self._sample_map[list(set(smap_indices_to_keep)),] + + # filter column_data + subset_primary = list(set(_new_sample_map.get_column("primary"))) + coldata_indices_to_keep = [] + counter = 0 + for row in self._column_data._row_names: + if row in subset_primary: + coldata_indices_to_keep.append(counter) + + _new_column_data = self._column_data[list(set(coldata_indices_to_keep)),] + + return SlicerResult(_new_experiments, _new_sample_map, _new_column_data) + + def subset_by_experiments( + self, experiments: Union[str, int, bool, Sequence] + ) -> "MultiAssayExperiment": """Subset by experiment(s). Args: - subset (StrOrListStr): May be an single experiment name to keep. - Alternatively, ``subset`` may be a list of experiment names. + experiments: + Experiments to extract. + + Integer indices, a boolean filter, or (if the current object is + named) names specifying the ranges to be extracted, see + :py:meth:`~biocutils.normalize_subscript.normalize_subscript`. + + Check :py:attr:`~experiment_names` for a list of valid experiment names. Returns: - MultiAssayExperiment: A new `MultiAssayExperiment` with the subset experiments. + A new `MultiAssayExperiment` with the subset experiments. """ - sresult = self._slice(args=(None, None, subset)) + sresult = self._generic_slice(experiments=experiments) return MultiAssayExperiment( - sresult.experiments, sresult.col_data, sresult.sample_map, self.metadata + sresult.experiments, sresult.column_data, sresult.sample_map, self.metadata ) - def subset_by_row(self, subset: SlicerTypes) -> "MultiAssayExperiment": + def subset_by_row( + self, rows: Union[str, int, bool, Sequence] + ) -> "MultiAssayExperiment": """Subset by rows. Args: - subset (SlicerTypes): Row indices or names to slice. + rows: + Rows to be extracted. - May be either a integer vector (integer positions), - boolean vector or :py:class:`~slice` object. Defaults to None. + Integer indices, a boolean filter, or (if the current object is + named) names specifying the ranges to be extracted, see + :py:meth:`~biocutils.normalize_subscript.normalize_subscript`. Returns: - MultiAssayExperiment: A new `MultiAssayExperiment` with the subset. + A new `MultiAssayExperiment` with the subsetted rows. """ - sresult = self._slice(args=(subset, None, None)) + sresult = self._generic_slice(rows=rows) return MultiAssayExperiment( - sresult.experiments, sresult.col_data, sresult.sample_map, self.metadata + sresult.experiments, sresult.column_data, sresult.sample_map, self.metadata ) - def subset_by_column(self, subset: SlicerTypes) -> "MultiAssayExperiment": + def subset_by_column( + self, columns: Union[str, int, bool, Sequence] + ) -> "MultiAssayExperiment": """Subset by column. Args: - subset (SlicerTypes): Column indices or names to slice. + columns: + Columns to be extracted. - May be either a integer vector (integer positions), - boolean vector or :py:class:`~slice` object. Defaults to None. + Integer indices, a boolean filter, or (if the current object is + named) names specifying the ranges to be extracted, see + :py:meth:`~biocutils.normalize_subscript.normalize_subscript`. Returns: - MultiAssayExperiment: A new `MultiAssayExperiment` with the subset. + A new `MultiAssayExperiment` with the subsetted columns. """ - sresult = self._slice(args=(None, subset, None)) + sresult = self._generic_slice(columns=columns) return MultiAssayExperiment( - sresult.experiments, sresult.col_data, sresult.sample_map, self.metadata + sresult.experiments, sresult.column_data, sresult.sample_map, self.metadata ) - def __getitem__(self, args: SlicerArgTypes) -> "MultiAssayExperiment": + def __getitem__(self, args: tuple) -> "MultiAssayExperiment": """Subset a `MultiAssayExperiment`. Args: - args (SlicerArgTypes): Indices or names to slice. Tuple - contains slices along dimensions (rows, columns, experiments). + args: + Tuple containing slices along dimensions (rows, columns, experiments). Each element in the tuple, might be either a integer vector (integer positions), boolean vector or :py:class:`~slice` object. Defaults to None. Raises: - ValueError: Too many or too few slices. + ValueError: + Too many or too few slices. Returns: - MultiAssayExperiment: A new sliced `MultiAssayExperiment` object with the subsets. + A new sliced `MultiAssayExperiment` object with the subsets. """ - sresult = self._slice(args=args) - return MultiAssayExperiment( - sresult.experiments, sresult.col_data, sresult.sample_map, self.metadata - ) - def __str__(self) -> str: - pattern = ( - f"Class MultiAssayExperiment with {len(self.experiments.keys())} experiments and {len(self.col_data)} samples \n" # noqa: E501 - f" experiments: " - ) + if isinstance(args, tuple): + if len(args) == 0: + raise ValueError("At least one slice argument must be provided.") + + if len(args) == 1: + return self._generic_slice(rows=args[0]) + elif len(args) == 2: + return self._generic_slice(rows=args[0], columns=args[1]) + elif len(args) == 3: + return self._generic_slice( + rows=args[0], columns=args[1], experiments=args[2] + ) + else: + raise ValueError( + f"`{type(self).__name__}` only supports 3-dimensional slicing along rows, columns and/or experiments." + ) - for expname, expt in self.experiments.items(): - pattern = f"{pattern} \n {expname}: {str(expt)}" - return pattern + raise TypeError("'args' must be a tuple") + + ################################ + ######>> miscellaneous <<####### + ################################ def complete_cases(self) -> Sequence[bool]: """Identify samples that have data across all experiments. Returns: - Sequence[bool]: A boolean vector, where each element is - True if sample is present in all experiments or False. + A boolean vector same as the number of samples in column_data, + where each element is True if sample is present in all experiments or False. """ vec = [] - for x in self.col_data.index.tolist(): - subset = self.sample_map[self.sample_map["primary"] == x] + for x in self._column_data.row_names: + _primary = self._sample_map.get_column("primary") + + smap_indices_to_keep = [] + for rdx in range(len(_primary)): + if _primary[rdx] == x: + smap_indices_to_keep.append(rdx) + + subset = self.sample_map[list(set(smap_indices_to_keep)),] - vec.append(len(subset["assay"].unique()) == len(self.experiments.keys())) + vec.append(set(subset.get_column("assay")) == set(self.experiment_names)) return vec @@ -616,103 +856,138 @@ def replicated(self) -> Dict[str, Dict[str, Sequence[bool]]]: """Identify samples with replicates within each experiment. Returns: - Dict[str, Dict[str, Sequence[bool]]]: A dictionary where experiment names + A dictionary where experiment names are keys and values specify if the sample is replicated within each experiment. """ replicates = {} - allSamples = self.col_data.index.tolist() - for expname, expt in self.experiments.items(): + all_samples = self.column_data.row_names + for expname, expt in self._experiments.items(): if expname not in replicates: replicates[expname] = {} - for s in allSamples: + for s in all_samples: replicates[expname][s] = [] - colnames = expt.colnames - smap = self.sample_map[self.sample_map["assay"] == expname] + colnames = expt.column_names + smap_indices_to_keep = [] + + _assay = self._sample_map.get_column("assay") + for adx in range(len(_assay)): + if _assay[adx] == expname: + smap_indices_to_keep.append(adx) + + subset_smap = self.sample_map[list(set(smap_indices_to_keep)),] for x in colnames: - colmap = smap[smap["colname"] == x] - for s in allSamples: - replicates[expname][s].append(s in colmap["primary"]) + _subset_smap_colnames = subset_smap.get_column("colname") + _indices = [] + for cdx in range(len(_subset_smap_colnames)): + if _subset_smap_colnames[cdx] == x: + _indices.append(cdx) + + __subset_smap = subset_smap[_indices,] + + for s in all_samples: + replicates[expname][s].append(__subset_smap.get_column("primary")) return replicates + ################################# + ######>> add experiment <<####### + ################################# + def add_experiment( self, name: str, - experiment: SummarizedExperiment, - sample_map: DataFrame, - col_data: Optional[DataFrame] = None, - ): + experiment: Any, + sample_map: biocframe.BiocFrame, + column_data: Optional[biocframe.BiocFrame] = None, + in_place: bool = False, + ) -> "MultiAssayExperiment": """Add a new experiment to `MultiAssayExperiment`. - ``sample_map`` must be provided to map the cells or sample from this experiment back to - :py:attr:`~multiassayexperiment.MultiAssayExperiment.MultiAssayExperiment.col_data`. This - will be appended to the existing + ``sample_map`` must be provided to map the columns from this experiment to + :py:attr:`~multiassayexperiment.MultiAssayExperiment.MultiAssayExperiment.column_data`. + This will be appended to the existing :py:attr:`~multiassayexperiment.MultiAssayExperiment.MultiAssayExperiment.sample_map`. - Optionally, ``col_data`` may be provided to add new sample information to the - `MultiAssayExperiment` + Optionally, ``column_data`` may be provided to add new sample information. Args: - name (str): Name of the new experiment. - experiment (SummarizedExperiment): The experiment to add. - Must extend - :py:class:`~summarizedexperiment.SummarizedExperiment.SummarizedExperiment` class. + name: + Name of the new experiment. - sample_map (DataFrame): Sample map to append to the MAE. + experiment: + The experiment to add. - col_data (DataFrame, optional): Sample data to append to the MAE. Defaults to None. + sample_map: + Sample map to append to the MAE. - Raises: - ValueError: If ``name`` is an existing experiment name in ``experiments``. + column_data: + Sample data to append to the MAE. + + Defaults to None. + + in_place: + Whether to modify the ``MultiAssayExperiment`` in place. + Defaults to False. + + Returns: + A modified ``MultiAssayExperiment`` object, either as a copy of the original + or as a reference to the (in-place-modified) original. """ if name in self.experiments: raise ValueError(f"An experiment with {name} already exists.") - self._validate_col_data(col_data) + _new_column_data = self._column_data + if column_data is not None: + _new_column_data = ut.combine_rows(self._column_data, column_data) - new_experiments = self.experiments.copy() - new_experiments[name] = experiment + _new_sample_map = ut.combine_rows(self._sample_map, sample_map) - new_col_data = col_data - if new_col_data is not None: - new_col_data = concat([self.col_data, col_data], axis=0) + _new_experiments = self._experiments.copy() + _new_experiments[name] = experiment - new_sample_map = concat([self.sample_map, sample_map], axis=0) - - self._validate_experiments(new_experiments) - self._validate_sample_map( - sample_map=new_sample_map, - col_data=new_col_data, - experiments=new_experiments, + _validate_column_data(_new_column_data) + _validate_experiments(_new_experiments) + _validate_sample_map( + sample_map=_new_sample_map, + column_data=_new_column_data, + experiments=_new_experiments, ) - self._experiments = new_experiments - self._sample_map = new_sample_map - self._col_data = new_col_data + output = self._define_output(in_place) + output._experiments = _new_experiments + output._sample_map = _new_sample_map + output._column_data = _new_column_data + + return output + + ################################# + ######>> mudata interop <<####### + ################################# - def to_mudata(self) -> MuData: - """Transform `SingleCellExperiment` object to :py:class:`~mudata.MuData`. + def to_mudata(self): + """Transform ``MultiAssayExperiment`` object to :py:class:`~mudata.MuData`. Returns: - MuData: A `MuData` representation. + A `MuData` representation. """ + from mudata import MuData exptsList = OrderedDict() - for expname, expt in self.experiments.items(): + for expname, expt in self._experiments.items(): if isinstance(expt, SingleCellExperiment): - obj, adatas = expt.to_anndata(alts=True) + obj, adatas = expt.to_anndata(include_alternative_experiments=True) exptsList[expname] = obj if adatas is not None: for aname, aexpt in adatas.items(): exptsList[f"{expname}_{aname}"] = aexpt - elif isinstance(expt, SummarizedExperiment): + elif isinstance(expt, se.SummarizedExperiment): exptsList[expname] = expt.to_anndata() else: print(f"Experiment: '{expname}' is not supported!") diff --git a/src/multiassayexperiment/types.py b/src/multiassayexperiment/types.py deleted file mode 100644 index 34f82e9..0000000 --- a/src/multiassayexperiment/types.py +++ /dev/null @@ -1,20 +0,0 @@ -from collections import namedtuple -from typing import MutableMapping, Optional, Sequence, Tuple, Union - -__author__ = "jkanche" -__copyright__ = "jkanche" -__license__ = "MIT" - -StrOrListStr = Union[str, Sequence[str]] -SlicerTypes = Union[ - MutableMapping[str, Union[Sequence[int], slice]], - Union[Sequence[int], slice], -] - -SlicerArgTypes = Tuple[ - Optional[SlicerTypes], - Optional[SlicerTypes], - Optional[Sequence[str]], -] - -SlicerResult = namedtuple("SlicerResult", ["experiments", "sample_map", "col_data"]) From 9c4483716ce0def6bb66295ec74eb19accd3a2a2 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 2 Jan 2024 22:50:46 +0000 Subject: [PATCH 02/21] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/multiassayexperiment/MultiAssayExperiment.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/multiassayexperiment/MultiAssayExperiment.py b/src/multiassayexperiment/MultiAssayExperiment.py index da606b2..a5de488 100644 --- a/src/multiassayexperiment/MultiAssayExperiment.py +++ b/src/multiassayexperiment/MultiAssayExperiment.py @@ -99,7 +99,9 @@ def _validate_sample_map(sample_map, column_data, experiments): class MultiAssayExperiment: - """Container class for representing and managing multi-omics genomic experiments. Checkout the + """Container class for representing and managing multi-omics genomic experiments. + + Checkout the `R/MultiAssayExperiment `_ for more information. """ From 07c54a136a8d296b184cb27d1c44e24350d8bf59 Mon Sep 17 00:00:00 2001 From: Jayaram Kancherla Date: Wed, 3 Jan 2024 09:05:21 -0800 Subject: [PATCH 03/21] theoretically all changes make sense --- setup.cfg | 5 + .../MultiAssayExperiment.py | 177 +++++++++++++++++- src/multiassayexperiment/__init__.py | 4 +- src/multiassayexperiment/io/__init__.py | 3 +- src/multiassayexperiment/io/anndata.py | 72 ------- src/multiassayexperiment/io/h5ad.py | 25 +++ src/multiassayexperiment/io/interface.py | 70 +++---- src/multiassayexperiment/io/mudata.py | 71 ------- 8 files changed, 227 insertions(+), 200 deletions(-) delete mode 100644 src/multiassayexperiment/io/anndata.py create mode 100644 src/multiassayexperiment/io/h5ad.py delete mode 100644 src/multiassayexperiment/io/mudata.py diff --git a/setup.cfg b/setup.cfg index c066e67..4e4eb7c 100644 --- a/setup.cfg +++ b/setup.cfg @@ -51,6 +51,7 @@ install_requires = importlib-metadata; python_version<"3.8" biocframe>=0.5.6,<0.6.0 biocutils>=0.1.4,<0.2.0 + summarizedexperiment>=0.4.0,<0.5.0 [options.packages.find] where = src @@ -61,6 +62,10 @@ exclude = # Add here additional requirements for extra features, to install with: # `pip install MultiAssayExperiment[PDF]` like: # PDF = ReportLab; RXP +optional = + singlecellexperiment + anndata + mudata # Add here test requirements (semicolon/line-separated) testing = diff --git a/src/multiassayexperiment/MultiAssayExperiment.py b/src/multiassayexperiment/MultiAssayExperiment.py index da606b2..81bfbb5 100644 --- a/src/multiassayexperiment/MultiAssayExperiment.py +++ b/src/multiassayexperiment/MultiAssayExperiment.py @@ -98,6 +98,29 @@ def _validate_sample_map(sample_map, column_data, experiments): _validate_sample_map_with_expts(sample_map, experiments) +def _create_smap_from_experiments(experiments): + _all_assays = [] + _all_primary = [] + _all_colnames = [] + samples = [] + + for expname, expt in experiments.items(): + colnames = expt.colnames + asy_sample = f"unknown_sample_{expname}" + _all_assays.extend([expname] * len(colnames)) + _all_primary.extend([asy_sample] * len(colnames)) + _all_colnames.extend(colnames) + + samples.append(asy_sample) + + sample_map = biocframe.BiocFrame( + {"assays": _all_assays, "primary": _all_primary, "colname": _all_colnames} + ) + col_data = biocframe.BiocFrame({"samples": samples}, row_names=samples) + + return col_data, sample_map + + class MultiAssayExperiment: """Container class for representing and managing multi-omics genomic experiments. Checkout the `R/MultiAssayExperiment `_ @@ -107,8 +130,8 @@ class MultiAssayExperiment: def __init__( self, experiments: Dict[str, Any], - column_data: biocframe.BiocFrame, - sample_map: biocframe.BiocFrame, + column_data: biocframe.BiocFrame = None, + sample_map: biocframe.BiocFrame = None, metadata: Optional[dict] = None, validate: bool = True, ) -> None: @@ -119,6 +142,13 @@ def __init__( transform from :py:class:`~multiassayexperiment.io.mudata.from_mudata` and :py:class:`~multiassayexperiment.io.anndata.from_anndata` objects. + If both ``column_data`` and ``sample_map`` are None, the constructor naively creates + sample mapping, with each ``experiment`` considered to be a independent `sample`. + We add a sample to :py:attr:`~multiassayexperiment.MultiAssayExperiment.MultiAssayExperiment.col_data` + in this pattern - ``unknown_sample_{experiment_name}``. All cells from the same experiment are + considered to be from the same sample and is reflected in + :py:attr:`~multiassayexperiment.MultiAssayExperiment.MultiAssayExperiment.sample_map`. + Args: experiments: A dictionary containing experiments, with experiment names as keys and @@ -137,6 +167,8 @@ def __init__( that maps to the 'primary' in :py:attr:`~multiassayexperiment.MultiAssayExperiment.MultiAssayExperiment.sample_map`. + Defaults to None. + sample_map: Map biological units from :py:attr:`~multiassayexperiment.MultiAssayExperiment.MultiAssayExperiment.column_data` @@ -155,19 +187,30 @@ def __init__( Each sample in ``column_data`` may map to one or more columns per assay. - metadata: - Additional study-level metadata. Defaults to None. + metadata: + Additional study-level metadata. Defaults to None. + validate: Internal use only. """ - - self._sample_map = _sanitize_frame(sample_map) - self._column_data = _sanitize_frame(column_data) self._experiments = experiments if experiments is not None else {} self._metadata = metadata if metadata is not None else {} + if self._sample_map is not None and self._column_data is not None: + self._sample_map = _sanitize_frame(sample_map) + self._column_data = _sanitize_frame(column_data) + elif self._sample_map is None and self._column_data is None: + # make a sample map + self._column_data, self._sample_map = _create_smap_from_experiments( + self._experiments + ) + else: + raise ValueError( + "Either 'sample_map' or 'column_data' is `None`. Either both should be provided or set both to `None`." + ) + if validate: _validate_experiments(self._experiments) _validate_column_data(self._column_data) @@ -993,3 +1036,123 @@ def to_mudata(self): print(f"Experiment: '{expname}' is not supported!") return MuData(exptsList) + + @classmethod + def from_mudata(cls, input: "mudata.MuData") -> "MultiAssayExperiment": + """Create a ``MultiAssayExperiment`` object from :py:class:`~mudata.MuData`. + + The import naively creates sample mapping, each ``experiment`` is considered to be a `sample`. + We add a sample with the following pattern - ``"unknown_sample_{experiment_name}"`` to + :py:attr:`~col_data`. All cells from the same experiment are considered to be extracted from + the same sample and is reflected in :py:attr:`~sample_map`. + + Args: + input: + MuData object. + + Raises: + Exception: + If ``mudata`` object is read in backed mode :py:attr:`~mudata.MuData.isbacked`. + + Returns: + ``MultiAssayExperiment`` object. + """ + + import singlecellexperiment + + if input.isbacked is True: + raise Exception("backed mode is currently not supported.") + + experiments = OrderedDict() + + _all_assays = [] + _all_primary = [] + _all_colnames = [] + samples = [] + + for asy, adata in input.mod.items(): + experiments[asy] = singlecellexperiment.SingleCellExperiment.from_anndata( + adata + ) + + colnames = None + if adata.obs.index.tolist() is not None: + colnames = adata.obs.index.tolist() + else: + colnames = range(len(adata.shape[0])) + + asy_sample = f"unknown_sample_{asy}" + + _all_assays.extend([asy] * len(colnames)) + _all_primary.extend([asy_sample] * len(colnames)) + _all_colnames.extend(colnames) + + samples.append(asy_sample) + + sample_map = biocframe.BiocFrame( + {"assays": _all_assays, "primary": _all_primary, "colname": _all_colnames} + ) + col_data = biocframe.BiocFrame({"samples": samples}, row_names=samples) + + return cls( + experiments=experiments, + column_data=col_data, + sample_map=sample_map, + metadata=input.uns, + ) + + @classmethod + def from_anndata( + cls, input: "anndata.AnnData", name: str = "unknown" + ) -> "MultiAssayExperiment": + """Create a ``MultiAssayExperiment`` from :py:class:`~anndata.AnnData`. + + Since :py:class:`~anndata.AnnData` does not contain sample information, + sample named ``"unknown_sample"`` will be added to + :py:attr:`~multiassayexperiment.MultiAssayExperiment.MultiAssayExperiment.col_data`. + All cells are considered to be extracted from this sample and is reflected in + :py:attr:`~multiassayexperiment.MultiAssayExperiment.MultiAssayExperiment.sample_map`. + + Args: + input: + An ``AnnData`` object. + + name: + Name for the experiment. + + Defaults to "unknown". + + Returns: + An ``MultiAssayExperiment``. + """ + import singlecellexperiment + + scexpt = singlecellexperiment.SingleCellExperiment.from_anndata(input=input) + + experiments = {name: scexpt} + + col_data = biocframe.BiocFrame( + {"samples": ["unknown_sample"]}, row_names=["unknown_sample"] + ) + + colnames = None + + if input.obs.index.tolist() is not None: + colnames = input.obs.index.tolist() + else: + colnames = range(len(input.shape[0])) + + sample_map = biocframe.BiocFrame( + { + "colname": colnames, + "assay": ["unknown"] * len(colnames), + "primary": ["unknown_sample"] * len(colnames), + } + ) + + return cls( + experiments=experiments, + column_data=col_data, + sample_map=sample_map, + metadata=input.uns, + ) diff --git a/src/multiassayexperiment/__init__.py b/src/multiassayexperiment/__init__.py index 40119fd..a57e93a 100644 --- a/src/multiassayexperiment/__init__.py +++ b/src/multiassayexperiment/__init__.py @@ -15,7 +15,5 @@ finally: del version, PackageNotFoundError -from .io.anndata import from_anndata, read_h5ad -from .io.interface import make_mae -from .io.mudata import from_mudata +from .io import read_h5ad, make_mae from .MultiAssayExperiment import MultiAssayExperiment diff --git a/src/multiassayexperiment/io/__init__.py b/src/multiassayexperiment/io/__init__.py index c57287f..5c5306c 100644 --- a/src/multiassayexperiment/io/__init__.py +++ b/src/multiassayexperiment/io/__init__.py @@ -1,3 +1,2 @@ -from .anndata import from_anndata, read_h5ad +from .h5ad import read_h5ad from .interface import make_mae -from .mudata import from_mudata diff --git a/src/multiassayexperiment/io/anndata.py b/src/multiassayexperiment/io/anndata.py deleted file mode 100644 index bc51557..0000000 --- a/src/multiassayexperiment/io/anndata.py +++ /dev/null @@ -1,72 +0,0 @@ -import anndata -import singlecellexperiment -from anndata import AnnData -from pandas import DataFrame - -from ..MultiAssayExperiment import MultiAssayExperiment - -__author__ = "jkanche" -__copyright__ = "jkanche" -__license__ = "MIT" - - -def from_anndata(adata: AnnData, name: str = "unknown") -> MultiAssayExperiment: - """Read :py:class:`~anndata.AnnData` objects as a - :py:class:`~multiassayexperiment.MultiAssayExperiment.MultiAssayExperiment`. - - Since :py:class:`~anndata.AnnData` does not contain sample information, - sample named ``"unknown_sample"`` will be added to - :py:attr:`~multiassayexperiment.MultiAssayExperiment.MultiAssayExperiment.col_data`. - All cells are considered to be extracted from this sample and is reflected in - :py:attr:`~multiassayexperiment.MultiAssayExperiment.MultiAssayExperiment.sample_map`. - - Args: - data (AnnData): An `AnnData` object. - name (str, optional): Name for the experiment. Defaults to "unknown". - - Returns: - MultiAssayExperiment: An MAE of ``data``. - """ - - if not isinstance(adata, AnnData): - raise TypeError("data is not an `AnnData` object.") - - scexpt = singlecellexperiment.from_anndata(adata=adata) - - experiments = {name: scexpt} - - col_data = DataFrame({"samples": ["unknown_sample"]}, index=["unknown_sample"]) - - sample_map = DataFrame() - colnames = None - if adata.obs.index.tolist() is not None: - colnames = adata.obs.index.tolist() - else: - colnames = range(len(adata.shape[0])) - sample_map["colname"] = colnames - sample_map["assay"] = "unknown" - sample_map["primary"] = "unknown_sample" - - return MultiAssayExperiment( - experiments=experiments, - col_data=col_data, - sample_map=sample_map, - metadata=adata.uns, - ) - - -def read_h5ad(path: str) -> MultiAssayExperiment: - """Read a H5ad file as a :py:class:`~multiassayexperiment.MultiAssayExperiment.MultiAssayExperiment`. - - This function reads the h5ad at the ``path`` using :py:func:`~anndata.read_h5ad` and converts - it into an MAE using :py:func:`~multiassayexperiment.io.anndata.from_anndata`. - - Args: - path (str): Path to a H5AD file - - Returns: - MultiAssayExperiment: An MAE from the H5ad file. - """ - - adata = anndata.read_h5ad(path) - return from_anndata(adata) diff --git a/src/multiassayexperiment/io/h5ad.py b/src/multiassayexperiment/io/h5ad.py new file mode 100644 index 0000000..c2ec520 --- /dev/null +++ b/src/multiassayexperiment/io/h5ad.py @@ -0,0 +1,25 @@ +from ..MultiAssayExperiment import MultiAssayExperiment + +__author__ = "jkanche" +__copyright__ = "jkanche" +__license__ = "MIT" + + +def read_h5ad(path: str) -> MultiAssayExperiment: + """Create a :py:class:`~multiassayexperiment.MultiAssayExperiment.MultiAssayExperiment` from + a H5AD file. + + This function reads the h5ad at the ``path`` using :py:func:`~anndata.read_h5ad` and converts + it into an MAE using :py:func:`~multiassayexperiment.io.anndata.from_anndata`. + + Args: + path: + Path to a H5AD file + + Returns: + An MAE from the H5ad file. + """ + import anndata + + adata = anndata.read_h5ad(path) + return MultiAssayExperiment.from_anndata(adata) diff --git a/src/multiassayexperiment/io/interface.py b/src/multiassayexperiment/io/interface.py index 32497de..13bc998 100644 --- a/src/multiassayexperiment/io/interface.py +++ b/src/multiassayexperiment/io/interface.py @@ -1,26 +1,21 @@ from collections import OrderedDict -from typing import MutableMapping, Union +from typing import Any, Dict -import pandas as pd -import singlecellexperiment as sce -from anndata import AnnData -from summarizedexperiment import SummarizedExperiment - -from ..MultiAssayExperiment import MultiAssayExperiment +from ..MultiAssayExperiment import MultiAssayExperiment, _create_smap_from_experiments __author__ = "jkanche" __copyright__ = "jkanche" __license__ = "MIT" -def make_mae( - experiments: MutableMapping[ - str, - Union[AnnData, SummarizedExperiment], - ] -) -> MultiAssayExperiment: - """Read a dictionary of experiments as an - :py:class:`~multiassayexperiment.MultiAssayExperiment.MultiAssayExperiment`. +def make_mae(experiments: Dict[str, Any]) -> MultiAssayExperiment: + """Create an + :py:class:`~multiassayexperiment.MultiAssayExperiment.MultiAssayExperiment` from + a dictionary of experiment objects. Each experiment is either an + :py:class:`~anndata.AnnData` object or a subclass of + :py:class:`~summarizedexperiment.SummarizedExperiment.SummarizedExperiment`. + :py:class:`~anndata.AnnData` objects will be converted to a + :py:class:`~singlecellexperiment.SingleCellExperiment.SingleCellExperiment`. The import naively creates sample mapping, with each ``experiment`` considered to be a independent `sample`. We add a sample to @@ -29,26 +24,28 @@ def make_mae( considered to be from the same sample and is reflected in :py:attr:`~multiassayexperiment.MultiAssayExperiment.MultiAssayExperiment.sample_map`. - Additionally, converts :py:class:`~anndata.AnnData` objects to - :py:class:`~singlecellexperiment.SingleCellExperiment.SingleCellExperiment` objects. - Args: - experiments (MutableMapping[str, Union[AnnData, SummarizedExperiment]]): A dictionary of - experiments with experiment names as keys and the experiments as values. + experiments: + A dictionary of experiments with experiment names as keys and the + experiments as values. - Each ``experiment`` can be represented as :py:class:`~anndata.AnnData` objects or any + Each ``experiment`` can be either a :py:class:`~anndata.AnnData` object or a subclass of :py:class:`~summarizedexperiment.SummarizedExperiment.SummarizedExperiment`. Raises: - TypeError: If any of the provided objects are not an expected types. - TypeError: If ``experiments`` is not a dictionary. + TypeError: + - If any of the provided objects are not an expected types. + - If ``experiments`` is not a dictionary. Returns: - MultiAssayExperiment: An MAE from the experiments. + An MAE from the experiments. """ + import singlecellexperiment as sce + from anndata import AnnData + from summarizedexperiment import SummarizedExperiment if not isinstance(experiments, dict): - raise TypeError("`experiments` is not a dictionary.") + raise TypeError("'experiments' is not a dictionary.") failedExpts = [] for expname, expt in experiments.items(): @@ -60,37 +57,20 @@ def make_mae( if len(failedExpts) > 0: raise TypeError( f"Experiments '{', '.join(failedExpts)}' are not compatible, Must be either an " - "AnnData, or a subclass derived from SummarizedExperiment." + "AnnData, or a subclass derived from `SummarizedExperiment`." ) newExpts = OrderedDict() - - sample_map = pd.DataFrame() - samples = [] - for expname, expt in experiments.items(): if isinstance(expt, AnnData): newExpts[expname] = sce.from_anndata(expt) else: newExpts[expname] = expt - colnames = newExpts[expname].colnames - asy_sample = f"unknown_sample_{expname}" - asy_df = pd.DataFrame( - { - "assay": [expname] * len(colnames), - "primary": [asy_sample] * len(colnames), - "colname": colnames, - } - ) - - sample_map = pd.concat([sample_map, asy_df]) - samples.append(asy_sample) - - col_data = pd.DataFrame({"samples": samples}, index=samples) + col_data, sample_map = _create_smap_from_experiments(newExpts) return MultiAssayExperiment( experiments=newExpts, - col_data=col_data, + column_data=col_data, sample_map=sample_map, ) diff --git a/src/multiassayexperiment/io/mudata.py b/src/multiassayexperiment/io/mudata.py deleted file mode 100644 index de67008..0000000 --- a/src/multiassayexperiment/io/mudata.py +++ /dev/null @@ -1,71 +0,0 @@ -from collections import OrderedDict - -from mudata import MuData -from pandas import DataFrame, concat -from singlecellexperiment import from_anndata - -from ..MultiAssayExperiment import MultiAssayExperiment - -__author__ = "jkanche" -__copyright__ = "jkanche" -__license__ = "MIT" - - -def from_mudata(mudata: MuData) -> MultiAssayExperiment: - """Read :py:class:`~mudata.MuData` as :py:class:`~multiassayexperiment.MultiAssayExperiment.MultiAssayExperiment`. - - The import naively creates sample mapping, each ``experiment`` is considered to be a `sample`. - We add a sample with the following pattern - ``"unknown_sample_{experiment_name}"`` to - :py:attr:`~multiassayexperiment.MultiAssayExperiment.MultiAssayExperiment.col_data` - All cells from the same experiment are considered to be extracted from the same sample and is - reflected in - :py:attr:`~multiassayexperiment.MultiAssayExperiment.MultiAssayExperiment.sample_map`. - - Args: - mudata (MuData): MuData object. - - Raises: - Exception: If ``mudata`` object is read in backed mode :py:attr:`~mudata.MuData.isbacked`. - - Returns: - MultiAssayExperiment: MAE representation. - """ - - if mudata.isbacked is True: - raise Exception("backed mode is currently not supported.") - - experiments = OrderedDict() - - sample_map = DataFrame() - samples = [] - - for asy, adata in mudata.mod.items(): - experiments[asy] = from_anndata(adata) - - colnames = None - if adata.obs.index.tolist() is not None: - colnames = adata.obs.index.tolist() - else: - colnames = range(len(adata.shape[0])) - - asy_sample = f"unknown_sample_{asy}" - - asy_df = DataFrame( - { - "assay": [asy] * len(colnames), - "primary": [asy_sample] * len(colnames), - "colname": colnames, - } - ) - - sample_map = concat([sample_map, asy_df]) - samples.append(asy_sample) - - col_data = DataFrame({"samples": samples}, index=samples) - - return MultiAssayExperiment( - experiments=experiments, - col_data=col_data, - sample_map=sample_map, - metadata=mudata.uns, - ) From 7a3f2c1293cb0e0ec544d60329dc9066fa987b30 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 3 Jan 2024 17:05:49 +0000 Subject: [PATCH 04/21] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- setup.cfg | 2 +- src/multiassayexperiment/io/h5ad.py | 3 +-- src/multiassayexperiment/io/interface.py | 11 ++++------- 3 files changed, 6 insertions(+), 10 deletions(-) diff --git a/setup.cfg b/setup.cfg index 4e4eb7c..c032527 100644 --- a/setup.cfg +++ b/setup.cfg @@ -62,7 +62,7 @@ exclude = # Add here additional requirements for extra features, to install with: # `pip install MultiAssayExperiment[PDF]` like: # PDF = ReportLab; RXP -optional = +optional = singlecellexperiment anndata mudata diff --git a/src/multiassayexperiment/io/h5ad.py b/src/multiassayexperiment/io/h5ad.py index c2ec520..3c0a462 100644 --- a/src/multiassayexperiment/io/h5ad.py +++ b/src/multiassayexperiment/io/h5ad.py @@ -6,8 +6,7 @@ def read_h5ad(path: str) -> MultiAssayExperiment: - """Create a :py:class:`~multiassayexperiment.MultiAssayExperiment.MultiAssayExperiment` from - a H5AD file. + """Create a :py:class:`~multiassayexperiment.MultiAssayExperiment.MultiAssayExperiment` from a H5AD file. This function reads the h5ad at the ``path`` using :py:func:`~anndata.read_h5ad` and converts it into an MAE using :py:func:`~multiassayexperiment.io.anndata.from_anndata`. diff --git a/src/multiassayexperiment/io/interface.py b/src/multiassayexperiment/io/interface.py index 13bc998..2d7589c 100644 --- a/src/multiassayexperiment/io/interface.py +++ b/src/multiassayexperiment/io/interface.py @@ -9,13 +9,10 @@ def make_mae(experiments: Dict[str, Any]) -> MultiAssayExperiment: - """Create an - :py:class:`~multiassayexperiment.MultiAssayExperiment.MultiAssayExperiment` from - a dictionary of experiment objects. Each experiment is either an - :py:class:`~anndata.AnnData` object or a subclass of - :py:class:`~summarizedexperiment.SummarizedExperiment.SummarizedExperiment`. - :py:class:`~anndata.AnnData` objects will be converted to a - :py:class:`~singlecellexperiment.SingleCellExperiment.SingleCellExperiment`. + """Create an :py:class:`~multiassayexperiment.MultiAssayExperiment.MultiAssayExperiment` from a dictionary of + experiment objects. Each experiment is either an :py:class:`~anndata.AnnData` object or a subclass of + :py:class:`~summarizedexperiment.SummarizedExperiment.SummarizedExperiment`. :py:class:`~anndata.AnnData` objects + will be converted to a :py:class:`~singlecellexperiment.SingleCellExperiment.SingleCellExperiment`. The import naively creates sample mapping, with each ``experiment`` considered to be a independent `sample`. We add a sample to From fb681d82e62e79c2b8208a7745f39e65ec24d5f0 Mon Sep 17 00:00:00 2001 From: Jayaram Kancherla Date: Wed, 3 Jan 2024 10:29:07 -0800 Subject: [PATCH 05/21] all changes to fix tests --- setup.cfg | 6 ++ .../MultiAssayExperiment.py | 82 ++++++++++------ src/multiassayexperiment/io/interface.py | 4 +- tests/test_add_expt.py | 18 ++-- tests/test_create.py | 32 +++---- tests/test_io.py | 10 +- tests/test_methods.py | 34 +++---- tests/test_slices.py | 94 +++++++++---------- tests/test_with_coldata.py | 16 ++-- 9 files changed, 161 insertions(+), 135 deletions(-) diff --git a/setup.cfg b/setup.cfg index 4e4eb7c..30fa8f1 100644 --- a/setup.cfg +++ b/setup.cfg @@ -66,12 +66,18 @@ optional = singlecellexperiment anndata mudata + genomicranges # Add here test requirements (semicolon/line-separated) testing = setuptools pytest pytest-cov + anndata + pandas + mudata + singlecellexperiment + genomicranges [options.entry_points] # Add here console scripts like: diff --git a/src/multiassayexperiment/MultiAssayExperiment.py b/src/multiassayexperiment/MultiAssayExperiment.py index 56c04c8..2706bc0 100644 --- a/src/multiassayexperiment/MultiAssayExperiment.py +++ b/src/multiassayexperiment/MultiAssayExperiment.py @@ -6,7 +6,6 @@ import biocframe import biocutils as ut import summarizedexperiment as se -from singlecellexperiment import SingleCellExperiment __author__ = "jkanche" __copyright__ = "jkanche" @@ -68,7 +67,7 @@ def _validate_sample_map_with_expts(sample_map, experiments): # check if colnames exist agroups = sample_map.split("assay") - for grp, rows in agroups: + for grp, rows in agroups.items(): if grp not in experiments: warn( f"Experiment '{grp}' exists in `sample_map` but not in `experiments`.", @@ -114,7 +113,7 @@ def _create_smap_from_experiments(experiments): samples.append(asy_sample) sample_map = biocframe.BiocFrame( - {"assays": _all_assays, "primary": _all_primary, "colname": _all_colnames} + {"assay": _all_assays, "primary": _all_primary, "colname": _all_colnames} ) col_data = biocframe.BiocFrame({"samples": samples}, row_names=samples) @@ -200,10 +199,10 @@ def __init__( self._experiments = experiments if experiments is not None else {} self._metadata = metadata if metadata is not None else {} - if self._sample_map is not None and self._column_data is not None: + if sample_map is not None and column_data is not None: self._sample_map = _sanitize_frame(sample_map) self._column_data = _sanitize_frame(column_data) - elif self._sample_map is None and self._column_data is None: + elif sample_map is None and column_data is None: # make a sample map self._column_data, self._sample_map = _create_smap_from_experiments( self._experiments @@ -242,7 +241,7 @@ def __deepcopy__(self, memo=None, _nil=[]): current_class_const = type(self) return current_class_const( - experiment=_expts_copy, + experiments=_expts_copy, column_data=_column_data_copy, sample_map=_sample_map_copy, metadata=_metadata_copy, @@ -255,7 +254,7 @@ def __copy__(self): """ current_class_const = type(self) return current_class_const( - experiment=self._experiments, + experiments=self._experiments, column_data=self._column_data, sample_map=self._sample_map, metadata=self._metadata, @@ -463,8 +462,8 @@ def experiment(self, name: str, with_sample_data: bool = False) -> Any: if with_sample_data is True: expt = deepcopy(expt) - assay_splits = self.sample_map.split("assay", indices_only=True) - subset_map = self.sample_map[assay_splits[name]] + assay_splits = self.sample_map.split("assay", only_indices=True) + subset_map = self.sample_map[assay_splits[name],] subset_map = subset_map.set_row_names(subset_map.get_column("colname")) expt_column_data = expt.column_data @@ -678,12 +677,12 @@ def subset_experiments( if experiment_names is None: experiment_names = slice(None) - if isinstance(experiment_names, slice) and experiment_names != slice(None): + if experiment_names != slice(None): expts, _ = ut.normalize_subscript( experiment_names, len(self.experiment_names), self.experiment_names ) - to_keep = self.experiment_names[expts] + to_keep = [self.experiment_names[idx] for idx in expts] new_expt = OrderedDict() for texpt in to_keep: @@ -691,16 +690,17 @@ def subset_experiments( _expts_copy = new_expt - for k, v in _expts_copy.items(): - _expts_copy[k] = v[rows, columns] + if rows != slice(None) and columns != slice(None): + for k, v in _expts_copy.items(): + _expts_copy[k] = v[rows, columns] - return new_expt + return _expts_copy def _generic_slice( self, - rows: Optional[Union[str, int, bool, Sequence]], - columns: Optional[Union[str, int, bool, Sequence]], - experiments: Optional[Union[str, int, bool, Sequence]], + rows: Optional[Union[str, int, bool, Sequence]] = None, + columns: Optional[Union[str, int, bool, Sequence]] = None, + experiments: Optional[Union[str, int, bool, Sequence]] = None, ) -> SlicerResult: """Slice ``MultiAssayExperiment`` along the rows and/or columns, based on their indices or names. @@ -746,6 +746,8 @@ def _generic_slice( experiment_names=experiments, rows=rows, columns=columns ) + print("new expt keys", _new_experiments.keys()) + # filter sample_map smap_indices_to_keep = [] for expname, expt in _new_experiments.items(): @@ -754,16 +756,14 @@ def _generic_slice( if row["assay"] == expname and row["colname"] in expt.column_names: smap_indices_to_keep.append(counter) counter += 1 - _new_sample_map = self._sample_map[list(set(smap_indices_to_keep)),] # filter column_data subset_primary = list(set(_new_sample_map.get_column("primary"))) coldata_indices_to_keep = [] - counter = 0 - for row in self._column_data._row_names: + for idx, row in enumerate(self._column_data._row_names): if row in subset_primary: - coldata_indices_to_keep.append(counter) + coldata_indices_to_keep.append(idx) _new_column_data = self._column_data[list(set(coldata_indices_to_keep)),] @@ -857,13 +857,32 @@ def __getitem__(self, args: tuple) -> "MultiAssayExperiment": raise ValueError("At least one slice argument must be provided.") if len(args) == 1: - return self._generic_slice(rows=args[0]) + sresult = self._generic_slice(rows=args[0]) + return MultiAssayExperiment( + sresult.experiments, + sresult.column_data, + sresult.sample_map, + self.metadata, + ) elif len(args) == 2: - return self._generic_slice(rows=args[0], columns=args[1]) + sresult = self._generic_slice(rows=args[0], columns=args[1]) + return MultiAssayExperiment( + sresult.experiments, + sresult.column_data, + sresult.sample_map, + self.metadata, + ) elif len(args) == 3: - return self._generic_slice( + print("SHOULD BER HERE???????") + sresult = self._generic_slice( rows=args[0], columns=args[1], experiments=args[2] ) + return MultiAssayExperiment( + sresult.experiments, + sresult.column_data, + sresult.sample_map, + self.metadata, + ) else: raise ValueError( f"`{type(self).__name__}` only supports 3-dimensional slicing along rows, columns and/or experiments." @@ -987,8 +1006,10 @@ def add_experiment( _new_column_data = self._column_data if column_data is not None: + column_data = _sanitize_frame(column_data) _new_column_data = ut.combine_rows(self._column_data, column_data) + sample_map = _sanitize_frame(sample_map) _new_sample_map = ut.combine_rows(self._sample_map, sample_map) _new_experiments = self._experiments.copy() @@ -1020,6 +1041,7 @@ def to_mudata(self): A `MuData` representation. """ from mudata import MuData + from singlecellexperiment import SingleCellExperiment exptsList = OrderedDict() @@ -1060,7 +1082,7 @@ def from_mudata(cls, input: "mudata.MuData") -> "MultiAssayExperiment": ``MultiAssayExperiment`` object. """ - import singlecellexperiment + from singlecellexperiment import SingleCellExperiment if input.isbacked is True: raise Exception("backed mode is currently not supported.") @@ -1073,9 +1095,7 @@ def from_mudata(cls, input: "mudata.MuData") -> "MultiAssayExperiment": samples = [] for asy, adata in input.mod.items(): - experiments[asy] = singlecellexperiment.SingleCellExperiment.from_anndata( - adata - ) + experiments[asy] = SingleCellExperiment.from_anndata(adata) colnames = None if adata.obs.index.tolist() is not None: @@ -1092,7 +1112,7 @@ def from_mudata(cls, input: "mudata.MuData") -> "MultiAssayExperiment": samples.append(asy_sample) sample_map = biocframe.BiocFrame( - {"assays": _all_assays, "primary": _all_primary, "colname": _all_colnames} + {"assay": _all_assays, "primary": _all_primary, "colname": _all_colnames} ) col_data = biocframe.BiocFrame({"samples": samples}, row_names=samples) @@ -1127,9 +1147,9 @@ def from_anndata( Returns: An ``MultiAssayExperiment``. """ - import singlecellexperiment + from singlecellexperiment import SingleCellExperiment - scexpt = singlecellexperiment.SingleCellExperiment.from_anndata(input=input) + scexpt = SingleCellExperiment.from_anndata(input=input) experiments = {name: scexpt} diff --git a/src/multiassayexperiment/io/interface.py b/src/multiassayexperiment/io/interface.py index 13bc998..d3a821c 100644 --- a/src/multiassayexperiment/io/interface.py +++ b/src/multiassayexperiment/io/interface.py @@ -40,7 +40,7 @@ def make_mae(experiments: Dict[str, Any]) -> MultiAssayExperiment: Returns: An MAE from the experiments. """ - import singlecellexperiment as sce + from singlecellexperiment import SingleCellExperiment from anndata import AnnData from summarizedexperiment import SummarizedExperiment @@ -63,7 +63,7 @@ def make_mae(experiments: Dict[str, Any]) -> MultiAssayExperiment: newExpts = OrderedDict() for expname, expt in experiments.items(): if isinstance(expt, AnnData): - newExpts[expname] = sce.from_anndata(expt) + newExpts[expname] = SingleCellExperiment.from_anndata(expt) else: newExpts[expname] = expt diff --git a/tests/test_add_expt.py b/tests/test_add_expt.py index 8fce121..7590b18 100644 --- a/tests/test_add_expt.py +++ b/tests/test_add_expt.py @@ -39,9 +39,9 @@ } ) -gr = genomicranges.from_pandas(df_gr) +gr = genomicranges.GenomicRanges.from_pandas(df_gr) -col_data_sce = pd.DataFrame( +column_data_sce = pd.DataFrame( { "treatment": ["ChIP", "Input"] * 3, }, @@ -56,9 +56,9 @@ } ) -sample_col_data_sce = pd.DataFrame({"samples": ["sample1"]}, index=["sample1"]) +sample_column_data_sce = pd.DataFrame({"samples": ["sample1"]}, index=["sample1"]) -col_data_se = pd.DataFrame( +column_data_se = pd.DataFrame( { "treatment": ["ChIP", "Input"] * 3, }, @@ -73,17 +73,17 @@ } ) -sample_col_data_se = pd.DataFrame({"samples": ["sample2"]}, index=["sample2"]) +sample_column_data_se = pd.DataFrame({"samples": ["sample2"]}, index=["sample2"]) def test_MAE_addExpt(): tsce = SingleCellExperiment( - assays={"counts": counts}, row_data=df_gr, col_data=col_data_sce + assays={"counts": counts}, row_data=df_gr, column_data=column_data_sce ) mae = MultiAssayExperiment( experiments={"sce": tsce}, - col_data=sample_col_data_sce, + column_data=sample_column_data_sce, sample_map=sample_map_sce, metadata={"could be": "anything"}, ) @@ -94,14 +94,14 @@ def test_MAE_addExpt(): tse2 = SummarizedExperiment( assays={"counts": counts.copy()}, row_data=df_gr.copy(), - col_data=col_data_se, + column_data=column_data_se, ) mae.add_experiment( name="se", experiment=tse2, sample_map=sample_map_se, - col_data=sample_col_data_se, + column_data=sample_column_data_se, ) assert mae is not None diff --git a/tests/test_create.py b/tests/test_create.py index bad852d..831cabb 100644 --- a/tests/test_create.py +++ b/tests/test_create.py @@ -40,15 +40,15 @@ } ) -gr = genomicranges.from_pandas(df_gr) +gr = genomicranges.GenomicRanges.from_pandas(df_gr) -col_data_sce = pd.DataFrame( +column_data_sce = pd.DataFrame( { "treatment": ["ChIP", "Input"] * 3, }, index=["sce"] * 6, ) -col_data_se = pd.DataFrame( +column_data_se = pd.DataFrame( { "treatment": ["ChIP", "Input"] * 3, }, @@ -70,18 +70,18 @@ def test_MAE_creation(): tsce = SingleCellExperiment( - assays={"counts": counts}, row_data=df_gr, col_data=col_data_sce + assays={"counts": counts}, row_data=df_gr, column_data=column_data_sce ) tse2 = SummarizedExperiment( assays={"counts": counts.copy()}, row_data=df_gr.copy(), - col_data=col_data_se.copy(), + column_data=column_data_se.copy(), ) mae = MultiAssayExperiment( experiments={"sce": tsce, "se": tse2}, - col_data=sample_data, + column_data=sample_data, sample_map=sample_map, metadata={"could be": "anything"}, ) @@ -92,25 +92,25 @@ def test_MAE_creation(): def test_MAE_creation_with_alts(): tse = SummarizedExperiment( - assays={"counts": counts}, row_data=df_gr, col_data=col_data_se + assays={"counts": counts}, row_data=df_gr, column_data=column_data_se ) tsce = SingleCellExperiment( assays={"counts": counts}, row_data=df_gr, - col_data=col_data_sce, + column_data=column_data_sce, alternative_experiments={"alt": tse}, ) tse2 = SummarizedExperiment( assays={"counts": counts.copy()}, row_data=df_gr.copy(), - col_data=col_data_se.copy(), + column_data=column_data_se.copy(), ) mae = MultiAssayExperiment( experiments={"sce": tsce, "se": tse2}, - col_data=sample_data, + column_data=sample_data, sample_map=sample_map, metadata={"could be": "anything"}, ) @@ -121,19 +121,19 @@ def test_MAE_creation_with_alts(): def test_MAE_creation_fails(): tsce = SingleCellExperiment( - assays={"counts": counts}, row_data=df_gr, col_data=col_data_sce + assays={"counts": counts}, row_data=df_gr, column_data=column_data_sce ) tse2 = SummarizedExperiment( assays={"counts": counts.copy()}, row_data=df_gr.copy(), - col_data=col_data_sce.copy(), + column_data=column_data_sce.copy(), ) with pytest.raises(Exception): MultiAssayExperiment( experiments={"sce": tsce, "se": tse2}, - col_data=sample_data, + column_data=sample_data, sample_map=sample_map, metadata={"could be": "anything"}, ) @@ -141,18 +141,18 @@ def test_MAE_creation_fails(): def test_MAE_save(): tsce = SingleCellExperiment( - assays={"counts": counts}, row_data=df_gr, col_data=col_data_sce + assays={"counts": counts}, row_data=df_gr, column_data=column_data_sce ) tse2 = SummarizedExperiment( assays={"counts": counts.copy()}, row_data=df_gr.copy(), - col_data=col_data_se.copy(), + column_data=column_data_se.copy(), ) mae = MultiAssayExperiment( experiments={"sce": tsce, "se": tse2}, - col_data=sample_data, + column_data=sample_data, sample_map=sample_map, metadata={"could be": "anything"}, ) diff --git a/tests/test_io.py b/tests/test_io.py index f76aa5a..0e1a52f 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -18,7 +18,7 @@ def test_MAE_fromH5AD(): assert tse.experiments is not None assert tse.sample_map is not None - assert tse.col_data is not None + assert tse.column_data is not None # credit: MuData docs @@ -45,7 +45,7 @@ def test_MAE_from_mudata(): mdata = MuData({"A": adata, "B": adata2}) - muMAE = mae.from_mudata(mudata=mdata) + muMAE = mae.MultiAssayExperiment.from_mudata(input=mdata) assert muMAE is not None assert isinstance(muMAE, mae.MultiAssayExperiment) @@ -81,7 +81,7 @@ def test_MAE_make_mae(): assert muMAE.experiments is not None assert muMAE.sample_map is not None - assert muMAE.col_data is not None + assert muMAE.column_data is not None - assert len(muMAE.sample_map["assay"].unique()) == 3 - assert len(muMAE.sample_map["primary"].unique()) == 3 + assert len(set(muMAE.sample_map["assay"])) == 3 + assert len(set(muMAE.sample_map["primary"])) == 3 diff --git a/tests/test_methods.py b/tests/test_methods.py index a12a6c8..eac56f7 100644 --- a/tests/test_methods.py +++ b/tests/test_methods.py @@ -42,15 +42,15 @@ } ) -gr = genomicranges.from_pandas(df_gr) +gr = genomicranges.GenomicRanges.from_pandas(df_gr) -col_data_sce = pd.DataFrame( +column_data_sce = pd.DataFrame( { "treatment": ["ChIP", "Input"] * 3, }, index=["sce"] * 6, ) -col_data_se = pd.DataFrame( +column_data_se = pd.DataFrame( { "treatment": ["ChIP", "Input"] * 3, }, @@ -72,18 +72,18 @@ def test_MAE_creation(): tsce = SingleCellExperiment( - assays={"counts": counts}, row_data=df_gr, col_data=col_data_sce + assays={"counts": counts}, row_data=df_gr, column_data=column_data_sce ) tse2 = SummarizedExperiment( assays={"counts": counts.copy()}, row_data=df_gr.copy(), - col_data=col_data_se.copy(), + column_data=column_data_se.copy(), ) mae = MultiAssayExperiment( experiments={"sce": tsce, "se": tse2}, - col_data=sample_data, + column_data=sample_data, sample_map=sample_map, metadata={"could be": "anything"}, ) @@ -94,25 +94,25 @@ def test_MAE_creation(): def test_MAE_creation_with_alts(): tse = SummarizedExperiment( - assays={"counts": counts}, row_data=df_gr, col_data=col_data_se + assays={"counts": counts}, row_data=df_gr, column_data=column_data_se ) tsce = SingleCellExperiment( assays={"counts": counts}, row_data=df_gr, - col_data=col_data_sce, + column_data=column_data_sce, alternative_experiments={"alt": tse}, ) tse2 = SummarizedExperiment( assays={"counts": counts.copy()}, row_data=df_gr.copy(), - col_data=col_data_se.copy(), + column_data=column_data_se.copy(), ) mae = MultiAssayExperiment( experiments={"sce": tsce, "se": tse2}, - col_data=sample_data, + column_data=sample_data, sample_map=sample_map, metadata={"could be": "anything"}, ) @@ -123,34 +123,34 @@ def test_MAE_creation_with_alts(): assert mae.experiments is not None assert mae.experiment("sce") is not None assert mae.assays is not None - assert mae.col_data is not None + assert mae.column_data is not None assert mae.sample_map is not None with pytest.raises(Exception): - mae.col_data = None + mae.column_data = None with pytest.raises(Exception): mae.sample_map = None assert mae.metadata is not None - mae.metadata = None - assert mae.metadata is None + mae.metadata = {} + assert mae.metadata is not None def test_MAE_completedcases(): tsce = SingleCellExperiment( - assays={"counts": counts}, row_data=df_gr, col_data=col_data_sce + assays={"counts": counts}, row_data=df_gr, column_data=column_data_sce ) tse2 = SummarizedExperiment( assays={"counts": counts.copy()}, row_data=df_gr.copy(), - col_data=col_data_se.copy(), + column_data=column_data_se.copy(), ) mae = MultiAssayExperiment( experiments={"sce": tsce, "se": tse2}, - col_data=sample_data, + column_data=sample_data, sample_map=sample_map, metadata={"could be": "anything"}, ) diff --git a/tests/test_slices.py b/tests/test_slices.py index ab18e11..04a7829 100644 --- a/tests/test_slices.py +++ b/tests/test_slices.py @@ -38,10 +38,10 @@ def test_MAE_slice(): assert muMAE.experiments is not None assert muMAE.sample_map is not None - assert muMAE.col_data is not None + assert muMAE.column_data is not None - assert len(muMAE.sample_map["assay"].unique()) == 3 - assert len(muMAE.sample_map["primary"].unique()) == 3 + assert len(set(muMAE.sample_map["assay"])) == 3 + assert len(set(muMAE.sample_map["primary"])) == 3 sliced_MAE = muMAE[1:3, 1:3] assert sliced_MAE is not None @@ -49,10 +49,10 @@ def test_MAE_slice(): assert sliced_MAE.experiments is not None assert sliced_MAE.sample_map is not None - assert sliced_MAE.col_data is not None + assert sliced_MAE.column_data is not None - assert len(sliced_MAE.sample_map["assay"].unique()) == 3 - assert len(sliced_MAE.sample_map["primary"].unique()) == 3 + assert len(set(sliced_MAE.sample_map["assay"])) == 3 + assert len(set(sliced_MAE.sample_map["primary"])) == 3 assert sliced_MAE.sample_map.shape[0] == 6 sliced_MAE_assay = muMAE[None, None, ["rna", "spatial"]] @@ -61,10 +61,10 @@ def test_MAE_slice(): assert sliced_MAE_assay.experiments is not None assert sliced_MAE_assay.sample_map is not None - assert sliced_MAE_assay.col_data is not None + assert sliced_MAE_assay.column_data is not None - assert len(sliced_MAE_assay.sample_map["assay"].unique()) == 2 - assert len(sliced_MAE_assay.sample_map["primary"].unique()) == 2 + assert len(set(sliced_MAE_assay.sample_map["assay"])) == 2 + assert len(set(sliced_MAE_assay.sample_map["primary"])) == 2 assert sliced_MAE_assay.sample_map.shape[0] == 2000 sliced_MAE_assay = muMAE[1:3, 0:5, ["rna"]] @@ -73,29 +73,29 @@ def test_MAE_slice(): assert sliced_MAE_assay.experiments is not None assert sliced_MAE_assay.sample_map is not None - assert sliced_MAE_assay.col_data is not None + assert sliced_MAE_assay.column_data is not None - assert len(sliced_MAE_assay.sample_map["assay"].unique()) == 1 - assert len(sliced_MAE_assay.sample_map["primary"].unique()) == 1 + assert len(set(sliced_MAE_assay.sample_map["assay"])) == 1 + assert len(set(sliced_MAE_assay.sample_map["primary"])) == 1 assert sliced_MAE_assay.sample_map.shape[0] == 5 -def test_MAE_slice_dict(): - muMAE = mae.make_mae(experiments={"rna": adata, "spatial": adata2, "multi": adata3}) +# def test_MAE_slice_dict(): +# muMAE = mae.make_mae(experiments={"rna": adata, "spatial": adata2, "multi": adata3}) - sliced_MAE_assay = muMAE[ - {"rna": slice(0, 5)}, {"spatial": slice(0, 10)}, ["rna", "spatial"] - ] - assert sliced_MAE_assay is not None - assert isinstance(sliced_MAE_assay, mae.MultiAssayExperiment) +# sliced_MAE_assay = muMAE[ +# {"rna": slice(0, 5)}, {"spatial": slice(0, 10)}, ["rna", "spatial"] +# ] +# assert sliced_MAE_assay is not None +# assert isinstance(sliced_MAE_assay, mae.MultiAssayExperiment) - assert sliced_MAE_assay.experiments is not None - assert sliced_MAE_assay.sample_map is not None - assert sliced_MAE_assay.col_data is not None +# assert sliced_MAE_assay.experiments is not None +# assert sliced_MAE_assay.sample_map is not None +# assert sliced_MAE_assay.column_data is not None - assert len(sliced_MAE_assay.sample_map["assay"].unique()) == 2 - assert len(sliced_MAE_assay.sample_map["primary"].unique()) == 2 - assert sliced_MAE_assay.sample_map.shape[0] == 1010 +# assert len(sliced_MAE_assay.sample_map["assay"].unique()) == 2 +# assert len(sliced_MAE_assay.sample_map["primary"].unique()) == 2 +# assert sliced_MAE_assay.sample_map.shape[0] == 1010 def test_MAE_subset_by_row(): @@ -106,21 +106,21 @@ def test_MAE_subset_by_row(): assert muMAE.experiments is not None assert muMAE.sample_map is not None - assert muMAE.col_data is not None + assert muMAE.column_data is not None - assert len(muMAE.sample_map["assay"].unique()) == 3 - assert len(muMAE.sample_map["primary"].unique()) == 3 + assert len(set(muMAE.sample_map["assay"])) == 3 + assert len(set(muMAE.sample_map["primary"])) == 3 - sliced_MAE = muMAE.subset_by_row(subset=[10, 2, 5]) + sliced_MAE = muMAE.subset_by_row(rows=[10, 2, 5]) assert sliced_MAE is not None assert isinstance(sliced_MAE, mae.MultiAssayExperiment) assert sliced_MAE.experiments is not None assert sliced_MAE.sample_map is not None - assert sliced_MAE.col_data is not None + assert sliced_MAE.column_data is not None - assert len(sliced_MAE.sample_map["assay"].unique()) == 3 - assert len(sliced_MAE.sample_map["primary"].unique()) == 3 + assert len(set(sliced_MAE.sample_map["assay"])) == 3 + assert len(set(sliced_MAE.sample_map["primary"])) == 3 assert sliced_MAE.sample_map.shape == (2030, 3) @@ -132,22 +132,22 @@ def test_MAE_subset_by_column(): assert muMAE.experiments is not None assert muMAE.sample_map is not None - assert muMAE.col_data is not None + assert muMAE.column_data is not None - assert len(muMAE.sample_map["assay"].unique()) == 3 - assert len(muMAE.sample_map["primary"].unique()) == 3 + assert len(set(muMAE.sample_map["assay"])) == 3 + assert len(set(muMAE.sample_map["primary"])) == 3 - sliced_MAE = muMAE.subset_by_column(subset=[10, 2, 5]) + sliced_MAE = muMAE.subset_by_column(columns=[10, 2, 5]) assert sliced_MAE is not None assert isinstance(sliced_MAE, mae.MultiAssayExperiment) assert sliced_MAE.experiments is not None assert sliced_MAE.sample_map is not None - assert sliced_MAE.col_data is not None + assert sliced_MAE.column_data is not None - assert len(sliced_MAE.sample_map["assay"].unique()) == 3 - assert len(sliced_MAE.sample_map["primary"].unique()) == 3 - assert sliced_MAE.sample_map.shape == (9, 3) + assert len(set(sliced_MAE.sample_map["assay"])) == 3 + assert len(set(sliced_MAE.sample_map["primary"])) == 3 + assert sliced_MAE.sample_map.shape == (2030, 3) def test_MAE_subsetByExpt(): @@ -158,20 +158,20 @@ def test_MAE_subsetByExpt(): assert muMAE.experiments is not None assert muMAE.sample_map is not None - assert muMAE.col_data is not None + assert muMAE.column_data is not None - assert len(muMAE.sample_map["assay"].unique()) == 3 - assert len(muMAE.sample_map["primary"].unique()) == 3 + assert len(set(muMAE.sample_map["assay"])) == 3 + assert len(set(muMAE.sample_map["primary"])) == 3 - sliced_MAE = muMAE.subset_by_experiments(subset=["rna", "spatial"]) + sliced_MAE = muMAE.subset_by_experiments(experiments=["rna", "spatial"]) assert sliced_MAE is not None assert isinstance(sliced_MAE, mae.MultiAssayExperiment) assert sliced_MAE.experiments is not None assert len(sliced_MAE.experiments.keys()) == 2 assert sliced_MAE.sample_map is not None - assert sliced_MAE.col_data is not None + assert sliced_MAE.column_data is not None - assert len(sliced_MAE.sample_map["assay"].unique()) == 2 - assert len(sliced_MAE.sample_map["primary"].unique()) == 2 + assert len(set(sliced_MAE.sample_map["assay"])) == 2 + assert len(set(sliced_MAE.sample_map["primary"])) == 2 assert sliced_MAE.sample_map.shape == (2000, 3) diff --git a/tests/test_with_coldata.py b/tests/test_with_coldata.py index 4e70119..8d59ac5 100644 --- a/tests/test_with_coldata.py +++ b/tests/test_with_coldata.py @@ -38,15 +38,15 @@ } ) -gr = genomicranges.from_pandas(df_gr) +gr = genomicranges.GenomicRanges.from_pandas(df_gr) -col_data_sce = pd.DataFrame( +column_data_sce = pd.DataFrame( { "treatment": ["ChIP", "Input"] * 3, }, index=[f"sce_{i}" for i in range(6)], ) -col_data_se = pd.DataFrame( +column_data_se = pd.DataFrame( { "treatment": ["ChIP", "Input"] * 3, }, @@ -79,24 +79,24 @@ ) tsce = SingleCellExperiment( - assays={"counts": counts}, row_data=df_gr, col_data=col_data_sce + assays={"counts": counts}, row_data=df_gr, column_data=column_data_sce ) tse2 = SummarizedExperiment( assays={"counts": counts.copy()}, row_data=df_gr.copy(), - col_data=col_data_se.copy(), + column_data=column_data_se.copy(), ) mae = MultiAssayExperiment( experiments={"sce": tsce, "se": tse2}, - col_data=sample_data, + column_data=sample_data, sample_map=sample_map, metadata={"could be": "anything"}, ) -def test_access_expt_with_col_data(): +def test_access_expt_with_column_data(): assert mae is not None se = mae.experiment("se") @@ -105,4 +105,4 @@ def test_access_expt_with_col_data(): sce = mae.experiment("sce", with_sample_data=True) assert sce.shape == tsce.shape - assert len(sce.col_data.columns) >= len(tsce.col_data.columns) + assert len(sce.column_data.columns) >= len(tsce.column_data.columns) From b50ac4575c614f7a20f31ade648abe00c840d81c Mon Sep 17 00:00:00 2001 From: Jayaram Kancherla Date: Wed, 3 Jan 2024 10:33:51 -0800 Subject: [PATCH 06/21] lint issues --- src/multiassayexperiment/MultiAssayExperiment.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/multiassayexperiment/MultiAssayExperiment.py b/src/multiassayexperiment/MultiAssayExperiment.py index 2706bc0..aff8011 100644 --- a/src/multiassayexperiment/MultiAssayExperiment.py +++ b/src/multiassayexperiment/MultiAssayExperiment.py @@ -296,10 +296,13 @@ def __str__(self) -> str: expt = self._experiments[expt_name] output += f"[{idx}] {expt_name}: {type(expt).__name} with {expt.shape[0]} rows and {expt.shape[1]} columns" - output += f"column_data columns({len(self._column_data.column_names)}): {ut.print_truncated_list(self._column_data.column_names)}\n" - output += f"sample_map columns({len(self._sample_map.column_names)}): {ut.print_truncated_list(self._sample_map.column_names)}\n" + output += f"column_data columns({len(self._column_data.column_names)}): " + output += "{ut.print_truncated_list(self._column_data.column_names)}\n" - output += f"metadata({str(len(self.metadata))}): {ut.print_truncated_list(list(self.metadata.keys()), sep=' ', include_brackets=False, transform=lambda y: y)}\n" + output += f"sample_map columns({len(self._sample_map.column_names)}): " + output += "{ut.print_truncated_list(self._sample_map.column_names)}\n" + + output += f"metadata({str(len(self.metadata))}): {ut.print_truncated_list(list(self.metadata.keys()), sep=' ', include_brackets=False, transform=lambda y: y)}\n" # noqa return output From 4b0ee951c15a8a7264bd9597b4ab96bb7d037289 Mon Sep 17 00:00:00 2001 From: Jayaram Kancherla Date: Wed, 3 Jan 2024 10:34:47 -0800 Subject: [PATCH 07/21] yet another lint --- src/multiassayexperiment/MultiAssayExperiment.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/multiassayexperiment/MultiAssayExperiment.py b/src/multiassayexperiment/MultiAssayExperiment.py index aff8011..20889cd 100644 --- a/src/multiassayexperiment/MultiAssayExperiment.py +++ b/src/multiassayexperiment/MultiAssayExperiment.py @@ -888,7 +888,7 @@ def __getitem__(self, args: tuple) -> "MultiAssayExperiment": ) else: raise ValueError( - f"`{type(self).__name__}` only supports 3-dimensional slicing along rows, columns and/or experiments." + f"`{type(self).__name__}` only supports 3-dimensional slicing along rows, columns and/or experiments." # noqa ) raise TypeError("'args' must be a tuple") From c4da4ecadd0596b7c879b16faec1df7ec652875f Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 3 Jan 2024 18:34:56 +0000 Subject: [PATCH 08/21] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/multiassayexperiment/MultiAssayExperiment.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/multiassayexperiment/MultiAssayExperiment.py b/src/multiassayexperiment/MultiAssayExperiment.py index 20889cd..64fefff 100644 --- a/src/multiassayexperiment/MultiAssayExperiment.py +++ b/src/multiassayexperiment/MultiAssayExperiment.py @@ -888,7 +888,7 @@ def __getitem__(self, args: tuple) -> "MultiAssayExperiment": ) else: raise ValueError( - f"`{type(self).__name__}` only supports 3-dimensional slicing along rows, columns and/or experiments." # noqa + f"`{type(self).__name__}` only supports 3-dimensional slicing along rows, columns and/or experiments." # noqa ) raise TypeError("'args' must be a tuple") From 18f8eac28086812bf3177fbbd490e35b4ecbac2d Mon Sep 17 00:00:00 2001 From: Jayaram Kancherla Date: Wed, 3 Jan 2024 10:55:03 -0800 Subject: [PATCH 09/21] readme and docs changes --- README.md | 65 ++++++------ docs/tutorial.md | 98 ++++++++----------- .../MultiAssayExperiment.py | 6 +- 3 files changed, 78 insertions(+), 91 deletions(-) diff --git a/README.md b/README.md index 9e2b44b..a9b054d 100644 --- a/README.md +++ b/README.md @@ -19,16 +19,18 @@ pip install multiassayexperiment First create mock sample data ```python -import pandas as pd +from random import random + import numpy as np +from biocframe import BiocFrame from genomicranges import GenomicRanges +from iranges import IRanges nrows = 200 ncols = 6 counts = np.random.rand(nrows, ncols) gr = GenomicRanges( - { - "seqnames": [ + seqnames=[ "chr1", "chr2", "chr2", @@ -39,39 +41,30 @@ gr = GenomicRanges( "chr3", "chr3", "chr3", - ] - * 20, - "starts": range(100, 300), - "ends": range(110, 310), - "strand": ["-", "+", "+", "*", "*", "+", "+", "+", "-", "-"] * 20, + ] * 20, + ranges=IRanges(range(100, 300), range(110, 310)), + strand = ["-", "+", "+", "*", "*", "+", "+", "+", "-", "-"] * 20, + mcols=BiocFrame({ "score": range(0, 200), "GC": [random() for _ in range(10)] * 20, - } + }) ) -col_data_sce = pd.DataFrame( - { - "treatment": ["ChIP", "Input"] * 3, - }, - index=["sce"] * 6, +col_data_sce = BiocFrame({"treatment": ["ChIP", "Input"] * 3}, + row_names=["sce"] * 6, ) -col_data_se = pd.DataFrame( - { - "treatment": ["ChIP", "Input"] * 3, - }, - index=["se"] * 6, +col_data_se = BiocFrame({"treatment": ["ChIP", "Input"] * 3}, + row_names=["se"] * 6, ) -sample_map = pd.DataFrame( - { - "assay": ["sce", "se"] * 6, - "primary": ["sample1", "sample2"] * 6, - "colname": ["sce", "se"] * 6, - } -) +sample_map = BiocFrame({ + "assay": ["sce", "se"] * 6, + "primary": ["sample1", "sample2"] * 6, + "colname": ["sce", "se"] * 6 +}) -sample_data = pd.DataFrame({"samples": ["sample1", "sample2"]}) +sample_data = BiocFrame({"samples": ["sample1", "sample2"]}) ``` Now we can create an instance of an MAE - @@ -79,26 +72,34 @@ Now we can create an instance of an MAE - ```python from multiassayexperiment import MultiAssayExperiment from singlecellexperiment import SingleCellExperiment -from summarizedExperiment import SummarizedExperiment +from summarizedexperiment import SummarizedExperiment tsce = SingleCellExperiment( - assays={"counts": counts}, row_data=df_gr, col_data=col_data_sce + assays={"counts": counts}, row_data=gr.to_pandas(), column_data=col_data_sce ) tse2 = SummarizedExperiment( assays={"counts": counts.copy()}, - row_data=df_gr.copy(), - col_data=col_data_se.copy(), + row_data=gr.to_pandas().copy(), + column_data=col_data_se.copy(), ) mae = MultiAssayExperiment( experiments={"sce": tsce, "se": tse2}, - col_data=sample_data, + column_data=sample_data, sample_map=sample_map, metadata={"could be": "anything"}, ) ``` + ## output + class: MultiAssayExperiment containing 2 experiments + [0] sce: SingleCellExperiment with 200 rows and 6 columns + [1] se: SummarizedExperiment with 200 rows and 6 columns + column_data columns(1): ['samples'] + sample_map columns(3): ['assay', 'primary', 'colname'] + metadata(1): could be + For more use cases, checkout the [documentation](https://biocpy.github.io/MultiAssayExperiment/). diff --git a/docs/tutorial.md b/docs/tutorial.md index 430df07..b220834 100644 --- a/docs/tutorial.md +++ b/docs/tutorial.md @@ -22,16 +22,17 @@ An MAE contains three main entities, Lets create these objects ```python -import pandas as pd +from biocframe import BiocFrame +from iranges import IRanges import numpy as np from genomicranges import GenomicRanges +from random import random nrows = 200 ncols = 6 counts = np.random.rand(nrows, ncols) -df_gr = pd.DataFrame( - { - "seqnames": [ +gr = GenomicRanges( + seqnames=[ "chr1", "chr2", "chr2", @@ -42,57 +43,46 @@ df_gr = pd.DataFrame( "chr3", "chr3", "chr3", - ] - * 20, - "starts": range(100, 300), - "ends": range(110, 310), - "strand": ["-", "+", "+", "*", "*", "+", "+", "+", "-", "-"] * 20, + ] * 20, + ranges=IRanges(range(100, 300), range(110, 310)), + strand = ["-", "+", "+", "*", "*", "+", "+", "+", "-", "-"] * 20, + mcols=BiocFrame({ "score": range(0, 200), "GC": [random() for _ in range(10)] * 20, - } + }) ) -gr = GenomicRanges.from_pandas(df_gr) - -col_data_sce = pd.DataFrame( - { - "treatment": ["ChIP", "Input"] * 3, - }, - index=["sce"] * 6, +col_data_sce = BiocFrame({"treatment": ["ChIP", "Input"] * 3}, + row_names=["sce"] * 6, ) -col_data_se = pd.DataFrame( - { - "treatment": ["ChIP", "Input"] * 3, - }, - index=["se"] * 6, +col_data_se = BiocFrame({"treatment": ["ChIP", "Input"] * 3}, + row_names=["se"] * 6, ) -sample_map = pd.DataFrame( - { - "assay": ["sce", "se"] * 6, - "primary": ["sample1", "sample2"] * 6, - "colname": ["sce", "se"] * 6, - } -) +sample_map = BiocFrame({ + "assay": ["sce", "se"] * 6, + "primary": ["sample1", "sample2"] * 6, + "colname": ["sce", "se"] * 6 +}) -sample_data = pd.DataFrame({"samples": ["sample1", "sample2"]}) +sample_data = BiocFrame({"samples": ["sample1", "sample2"]}, row_names=["sample1", "sample2"]) ``` Then, create various experiment classes, ```python from singlecellexperiment import SingleCellExperiment -from summarizedExperiment import SummarizedExperiment +from summarizedexperiment import SummarizedExperiment tsce = SingleCellExperiment( - assays={"counts": counts}, row_data=df_gr, col_data=col_data_sce + assays={"counts": counts}, row_data=gr.to_pandas(), column_data=col_data_sce ) tse2 = SummarizedExperiment( assays={"counts": counts.copy()}, - row_data=df_gr.copy(), - col_data=col_data_se.copy(), + row_data=gr.to_pandas().copy(), + column_data=col_data_se.copy(), ) ``` @@ -101,9 +91,9 @@ Now that we have all the pieces together, we can now create an MAE, ```python from multiassayexperiment import MultiAssayExperiment -maeObj = MultiAssayExperiment( +mae = MultiAssayExperiment( experiments={"sce": tsce, "se": tse2}, - col_data=sample_data, + column_data=sample_data, sample_map=sample_map, metadata={"could be": "anything"}, ) @@ -114,7 +104,8 @@ To make your life easier, we also provide methods to naively create sample mappi **_This is not a recommended approach, but if you don't have sample mapping, then it doesn't matter._** ```python -maeObj = mae.make_mae(experiments={"sce": tsce, "se": tse2}) +import multiassayexperiment +maeObj = multiassayexperiment.make_mae(experiments={"sce": tsce, "se": tse2}) ``` ## Import `MuData` and `AnnData` as `MultiAssayExperiment` @@ -152,15 +143,18 @@ adata2.var_names = [f"var2_{j+1}" for j in range(d2)] we can now construct a `MuData` object and convert that to an MAE ```python +from mudata import MuData +from multiassayexperiment import MultiAssayExperiment mdata = MuData({"rna": adata, "spatial": adata2}) -maeObj = mae.from_mudata(mudata=mdata) +maeObj = MultiAssayExperiment.from_mudata(input=mdata) ``` Methods are also available to convert an `AnnData` object to `MAE`. ```python -maeObj = mae.read_h5ad("tests/data/adata.h5ad") +import multiassayexperiment +maeObj = multiassayexperiment.read_h5ad("tests/data/adata.h5ad") ``` # Accessors @@ -168,11 +162,11 @@ maeObj = mae.read_h5ad("tests/data/adata.h5ad") Multiple methods are available to access various slots of a `MultiAssayExperiment` object ```python -maeObj.assays -maeObj.col_data -maeObj.sample_map -maeObj.experiments -maeObj.metadata +mae.assays +mae.column_data +mae.sample_map +mae.experiments +mae.metadata ``` ## Access experiments @@ -181,7 +175,7 @@ if you want to access a specific experiment ```python # access a specific experiment -maeObj.experiment(experiment_name) +mae.experiment("se") ``` This does not include the sample data stored in the MAE. If you want to include this information @@ -199,7 +193,7 @@ expt_with_sampleData = maeObj.experiment(experiment_name, with_sample_data=True) The structure for slicing, ``` -maeObj[rows, columns, experiments] +mae[rows, columns, experiments] ``` - rows, columns: accepts either a slice, list of indices or a dictionary to specify slices per experiment. @@ -217,21 +211,13 @@ maeObj[1:5, 0:4] maeObj[1:5, 0:4, ["spatial"]] ``` -## Specify slices per experiment - -You can specify slices by experiment, rest of the experiments are not sliced. - -```python -maeObj[{"rna": slice(0,10)}, {"spatial": slice(0,5)}, ["spatial"]] -``` - Checkout other methods that perform similar operations - `subset_by_rows`, `subset_by_columns` & `subset_by_experiments`. # Helper methods ## completedCases -This method returns a boolean vector that specifies which biospecimens have data across all experiments. +This method returns a boolean vector that specifies which bio specimens have data across all experiments. ```python maeObj.completed_cases() @@ -239,7 +225,7 @@ maeObj.completed_cases() ## replicated -replicated identifies biospecimens that have multiple observations per experiment. +replicated identifies bio specimens that have multiple observations per experiment. ```python maeObj.replicated() diff --git a/src/multiassayexperiment/MultiAssayExperiment.py b/src/multiassayexperiment/MultiAssayExperiment.py index 64fefff..d22202f 100644 --- a/src/multiassayexperiment/MultiAssayExperiment.py +++ b/src/multiassayexperiment/MultiAssayExperiment.py @@ -294,13 +294,13 @@ def __str__(self) -> str: for idx in range(len(self.experiment_names)): expt_name = self.experiment_names[idx] expt = self._experiments[expt_name] - output += f"[{idx}] {expt_name}: {type(expt).__name} with {expt.shape[0]} rows and {expt.shape[1]} columns" + output += f"[{idx}] {expt_name}: {type(expt).__name__} with {expt.shape[0]} rows and {expt.shape[1]} columns \n" output += f"column_data columns({len(self._column_data.column_names)}): " - output += "{ut.print_truncated_list(self._column_data.column_names)}\n" + output += f"{ut.print_truncated_list(self._column_data.column_names)}\n" output += f"sample_map columns({len(self._sample_map.column_names)}): " - output += "{ut.print_truncated_list(self._sample_map.column_names)}\n" + output += f"{ut.print_truncated_list(self._sample_map.column_names)}\n" output += f"metadata({str(len(self.metadata))}): {ut.print_truncated_list(list(self.metadata.keys()), sep=' ', include_brackets=False, transform=lambda y: y)}\n" # noqa From 5766c1bd72197f8d87527f350c2b8cef96692d27 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 3 Jan 2024 18:56:01 +0000 Subject: [PATCH 10/21] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index a9b054d..922ab41 100644 --- a/README.md +++ b/README.md @@ -94,8 +94,8 @@ mae = MultiAssayExperiment( ## output class: MultiAssayExperiment containing 2 experiments - [0] sce: SingleCellExperiment with 200 rows and 6 columns - [1] se: SummarizedExperiment with 200 rows and 6 columns + [0] sce: SingleCellExperiment with 200 rows and 6 columns + [1] se: SummarizedExperiment with 200 rows and 6 columns column_data columns(1): ['samples'] sample_map columns(3): ['assay', 'primary', 'colname'] metadata(1): could be From d8ecb94306e67364bc528a5054c5691a2bdc5003 Mon Sep 17 00:00:00 2001 From: Jayaram Kancherla Date: Wed, 3 Jan 2024 11:11:15 -0800 Subject: [PATCH 11/21] add tests for experiment --- README.md | 6 +- .../MultiAssayExperiment.py | 8 ++- tests/test_methods.py | 64 +++++++++++++++---- 3 files changed, 58 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index a9b054d..7cf0cca 100644 --- a/README.md +++ b/README.md @@ -51,17 +51,17 @@ gr = GenomicRanges( ) col_data_sce = BiocFrame({"treatment": ["ChIP", "Input"] * 3}, - row_names=["sce"] * 6, + row_names=[f"sce_{i}" for i in range(6)], ) col_data_se = BiocFrame({"treatment": ["ChIP", "Input"] * 3}, - row_names=["se"] * 6, + row_names=[f"se_{i}" for i in range(6)], ) sample_map = BiocFrame({ "assay": ["sce", "se"] * 6, "primary": ["sample1", "sample2"] * 6, - "colname": ["sce", "se"] * 6 + "colname": ["sce_0", "se_0", "sce_1", "se_1", "sce_2", "se_2", "sce_3", "se_3", "sce_4", "se_4", "sce_5", "se_5"] }) sample_data = BiocFrame({"samples": ["sample1", "sample2"]}) diff --git a/src/multiassayexperiment/MultiAssayExperiment.py b/src/multiassayexperiment/MultiAssayExperiment.py index d22202f..e808341 100644 --- a/src/multiassayexperiment/MultiAssayExperiment.py +++ b/src/multiassayexperiment/MultiAssayExperiment.py @@ -463,8 +463,6 @@ def experiment(self, name: str, with_sample_data: bool = False) -> Any: expt = self.experiments[name] if with_sample_data is True: - expt = deepcopy(expt) - assay_splits = self.sample_map.split("assay", only_indices=True) subset_map = self.sample_map[assay_splits[name],] subset_map = subset_map.set_row_names(subset_map.get_column("colname")) @@ -474,7 +472,11 @@ def experiment(self, name: str, with_sample_data: bool = False) -> Any: [subset_map, expt_column_data], join="outer" ) - expt.column_data = new_column_data + new_column_data = biocframe.merge( + [new_column_data, self._column_data], join="left" + ) + + return expt.set_column_data(new_column_data, in_place=False) return expt diff --git a/tests/test_methods.py b/tests/test_methods.py index eac56f7..5b14d84 100644 --- a/tests/test_methods.py +++ b/tests/test_methods.py @@ -5,6 +5,7 @@ import pandas as pd import pytest from anndata import AnnData +from biocframe import BiocFrame from singlecellexperiment import SingleCellExperiment from summarizedexperiment import SummarizedExperiment @@ -44,29 +45,39 @@ gr = genomicranges.GenomicRanges.from_pandas(df_gr) -column_data_sce = pd.DataFrame( - { - "treatment": ["ChIP", "Input"] * 3, - }, - index=["sce"] * 6, +column_data_sce = BiocFrame( + {"treatment": ["ChIP", "Input"] * 3}, + row_names=[f"sce_{i}" for i in range(6)], ) -column_data_se = pd.DataFrame( - { - "treatment": ["ChIP", "Input"] * 3, - }, - index=["se"] * 6, + +column_data_se = BiocFrame( + {"treatment": ["ChIP", "Input"] * 3}, + row_names=[f"se_{i}" for i in range(6)], ) -sample_map = pd.DataFrame( +sample_map = BiocFrame( { "assay": ["sce", "se"] * 6, "primary": ["sample1", "sample2"] * 6, - "colname": ["sce", "se"] * 6, + "colname": [ + "sce_0", + "se_0", + "sce_1", + "se_1", + "sce_2", + "se_2", + "sce_3", + "se_3", + "sce_4", + "se_4", + "sce_5", + "se_5", + ], } ) -sample_data = pd.DataFrame( - {"samples": ["sample1", "sample2"]}, index=["sample1", "sample2"] +sample_data = BiocFrame( + {"samples": ["sample1", "sample2"]}, row_names=["sample1", "sample2"] ) @@ -194,3 +205,28 @@ def test_MAE_replicated(): assert repls is not None assert len(repls) == len(mae.experiments.keys()) + +def test_with_sample_data(): + tsce = SingleCellExperiment( + assays={"counts": counts}, row_data=gr.to_pandas(), column_data=column_data_sce + ) + + tse2 = SummarizedExperiment( + assays={"counts": counts.copy()}, + row_data=gr.to_pandas().copy(), + column_data=column_data_se.copy(), + ) + + mae = MultiAssayExperiment( + experiments={"sce": tsce, "se": tse2}, + column_data=sample_data, + sample_map=sample_map, + metadata={"could be": "anything"}, + ) + + expt_with_sample_data = mae.experiment("se", with_sample_data=True) + + assert expt_with_sample_data is not None + assert expt_with_sample_data.column_data is not None + print(expt_with_sample_data.column_data) + assert expt_with_sample_data.column_data.get_column("samples") is not None \ No newline at end of file From a6983a990eb913091f11bd8cf1eb856e6ed77913 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 3 Jan 2024 19:11:38 +0000 Subject: [PATCH 12/21] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/multiassayexperiment/MultiAssayExperiment.py | 1 - tests/test_methods.py | 3 ++- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/multiassayexperiment/MultiAssayExperiment.py b/src/multiassayexperiment/MultiAssayExperiment.py index e808341..3e1168b 100644 --- a/src/multiassayexperiment/MultiAssayExperiment.py +++ b/src/multiassayexperiment/MultiAssayExperiment.py @@ -1,5 +1,4 @@ from collections import OrderedDict, namedtuple -from copy import deepcopy from typing import Any, Dict, List, Optional, Sequence, Union from warnings import warn diff --git a/tests/test_methods.py b/tests/test_methods.py index 5b14d84..a16fbfd 100644 --- a/tests/test_methods.py +++ b/tests/test_methods.py @@ -206,6 +206,7 @@ def test_MAE_replicated(): assert repls is not None assert len(repls) == len(mae.experiments.keys()) + def test_with_sample_data(): tsce = SingleCellExperiment( assays={"counts": counts}, row_data=gr.to_pandas(), column_data=column_data_sce @@ -229,4 +230,4 @@ def test_with_sample_data(): assert expt_with_sample_data is not None assert expt_with_sample_data.column_data is not None print(expt_with_sample_data.column_data) - assert expt_with_sample_data.column_data.get_column("samples") is not None \ No newline at end of file + assert expt_with_sample_data.column_data.get_column("samples") is not None From 70b6a77d1b8571303df97bf7eb8e608a18c08251 Mon Sep 17 00:00:00 2001 From: Jayaram Kancherla Date: Wed, 3 Jan 2024 11:15:10 -0800 Subject: [PATCH 13/21] warn user if column names is None --- README.md | 2 +- src/multiassayexperiment/MultiAssayExperiment.py | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 232ff5c..dd6488a 100644 --- a/README.md +++ b/README.md @@ -64,7 +64,7 @@ sample_map = BiocFrame({ "colname": ["sce_0", "se_0", "sce_1", "se_1", "sce_2", "se_2", "sce_3", "se_3", "sce_4", "se_4", "sce_5", "se_5"] }) -sample_data = BiocFrame({"samples": ["sample1", "sample2"]}) +sample_data = BiocFrame({"samples": ["sample1", "sample2"]}, row_names= ["sample1", "sample2"]) ``` Now we can create an instance of an MAE - diff --git a/src/multiassayexperiment/MultiAssayExperiment.py b/src/multiassayexperiment/MultiAssayExperiment.py index 3e1168b..5b10d3c 100644 --- a/src/multiassayexperiment/MultiAssayExperiment.py +++ b/src/multiassayexperiment/MultiAssayExperiment.py @@ -28,6 +28,9 @@ def _validate_experiments(experiments): if not hasattr(v, "shape"): raise ValueError(f"experiment: {k} is not supported.") + if v.column_data is None: + warn(f"Experiment '{k}' does not contain column (cell/sample) names.", UserWarning) + def _validate_column_data(column_data): if column_data is None: From fd57667da50d59408a4777285cfee3bf3605434d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 3 Jan 2024 19:16:08 +0000 Subject: [PATCH 14/21] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/multiassayexperiment/MultiAssayExperiment.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/multiassayexperiment/MultiAssayExperiment.py b/src/multiassayexperiment/MultiAssayExperiment.py index 5b10d3c..539dad0 100644 --- a/src/multiassayexperiment/MultiAssayExperiment.py +++ b/src/multiassayexperiment/MultiAssayExperiment.py @@ -29,7 +29,10 @@ def _validate_experiments(experiments): raise ValueError(f"experiment: {k} is not supported.") if v.column_data is None: - warn(f"Experiment '{k}' does not contain column (cell/sample) names.", UserWarning) + warn( + f"Experiment '{k}' does not contain column (cell/sample) names.", + UserWarning, + ) def _validate_column_data(column_data): From 5db897a1fddfa327226354c994381f59e7ecb2f1 Mon Sep 17 00:00:00 2001 From: Jayaram Kancherla Date: Wed, 3 Jan 2024 11:17:06 -0800 Subject: [PATCH 15/21] ignore lint --- src/multiassayexperiment/MultiAssayExperiment.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/multiassayexperiment/MultiAssayExperiment.py b/src/multiassayexperiment/MultiAssayExperiment.py index 539dad0..f39d138 100644 --- a/src/multiassayexperiment/MultiAssayExperiment.py +++ b/src/multiassayexperiment/MultiAssayExperiment.py @@ -299,7 +299,7 @@ def __str__(self) -> str: for idx in range(len(self.experiment_names)): expt_name = self.experiment_names[idx] expt = self._experiments[expt_name] - output += f"[{idx}] {expt_name}: {type(expt).__name__} with {expt.shape[0]} rows and {expt.shape[1]} columns \n" + output += f"[{idx}] {expt_name}: {type(expt).__name__} with {expt.shape[0]} rows and {expt.shape[1]} columns \n" # noqa output += f"column_data columns({len(self._column_data.column_names)}): " output += f"{ut.print_truncated_list(self._column_data.column_names)}\n" From 2dabf955f89fe2fcec43b9e600cafcc28b8b70f7 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 3 Jan 2024 19:18:38 +0000 Subject: [PATCH 16/21] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/multiassayexperiment/MultiAssayExperiment.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/multiassayexperiment/MultiAssayExperiment.py b/src/multiassayexperiment/MultiAssayExperiment.py index f39d138..50f3c35 100644 --- a/src/multiassayexperiment/MultiAssayExperiment.py +++ b/src/multiassayexperiment/MultiAssayExperiment.py @@ -299,7 +299,7 @@ def __str__(self) -> str: for idx in range(len(self.experiment_names)): expt_name = self.experiment_names[idx] expt = self._experiments[expt_name] - output += f"[{idx}] {expt_name}: {type(expt).__name__} with {expt.shape[0]} rows and {expt.shape[1]} columns \n" # noqa + output += f"[{idx}] {expt_name}: {type(expt).__name__} with {expt.shape[0]} rows and {expt.shape[1]} columns \n" # noqa output += f"column_data columns({len(self._column_data.column_names)}): " output += f"{ut.print_truncated_list(self._column_data.column_names)}\n" From 451e95916d10a7e2e4bfdcb850a7360c063078ff Mon Sep 17 00:00:00 2001 From: Jayaram Kancherla Date: Wed, 3 Jan 2024 11:20:24 -0800 Subject: [PATCH 17/21] typo from column_data to column_names --- src/multiassayexperiment/MultiAssayExperiment.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/multiassayexperiment/MultiAssayExperiment.py b/src/multiassayexperiment/MultiAssayExperiment.py index f39d138..3794bc0 100644 --- a/src/multiassayexperiment/MultiAssayExperiment.py +++ b/src/multiassayexperiment/MultiAssayExperiment.py @@ -28,7 +28,7 @@ def _validate_experiments(experiments): if not hasattr(v, "shape"): raise ValueError(f"experiment: {k} is not supported.") - if v.column_data is None: + if v.column_names is None: warn( f"Experiment '{k}' does not contain column (cell/sample) names.", UserWarning, From de605f871c8486e71775cd85dea76c56d7c60fe2 Mon Sep 17 00:00:00 2001 From: Jayaram Kancherla Date: Wed, 3 Jan 2024 11:24:14 -0800 Subject: [PATCH 18/21] remove prints --- src/multiassayexperiment/MultiAssayExperiment.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/multiassayexperiment/MultiAssayExperiment.py b/src/multiassayexperiment/MultiAssayExperiment.py index edfa595..10e97c2 100644 --- a/src/multiassayexperiment/MultiAssayExperiment.py +++ b/src/multiassayexperiment/MultiAssayExperiment.py @@ -756,8 +756,6 @@ def _generic_slice( experiment_names=experiments, rows=rows, columns=columns ) - print("new expt keys", _new_experiments.keys()) - # filter sample_map smap_indices_to_keep = [] for expname, expt in _new_experiments.items(): @@ -883,7 +881,6 @@ def __getitem__(self, args: tuple) -> "MultiAssayExperiment": self.metadata, ) elif len(args) == 3: - print("SHOULD BER HERE???????") sresult = self._generic_slice( rows=args[0], columns=args[1], experiments=args[2] ) From 8b9a13cbd32c8d309e42e686dd32e3a19f6331aa Mon Sep 17 00:00:00 2001 From: Jayaram Kancherla Date: Wed, 3 Jan 2024 13:04:07 -0800 Subject: [PATCH 19/21] warn when row names of column data contain duplicates --- src/multiassayexperiment/MultiAssayExperiment.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/multiassayexperiment/MultiAssayExperiment.py b/src/multiassayexperiment/MultiAssayExperiment.py index 10e97c2..b52d9f1 100644 --- a/src/multiassayexperiment/MultiAssayExperiment.py +++ b/src/multiassayexperiment/MultiAssayExperiment.py @@ -45,6 +45,9 @@ def _validate_column_data(column_data): if column_data.row_names is None: raise ValueError("`column_data` must have row names or labels.") + if len(set(column_data.row_names)) != len(column_data.row_names): + warn("'column_data' has duplicate row_names.", UserWarning) + def _validate_sample_map_with_column_data(sample_map, column_data): # check if all samples are from primary exist in col data @@ -109,7 +112,7 @@ def _create_smap_from_experiments(experiments): samples = [] for expname, expt in experiments.items(): - colnames = expt.colnames + colnames = expt.column_names asy_sample = f"unknown_sample_{expname}" _all_assays.extend([expname] * len(colnames)) _all_primary.extend([asy_sample] * len(colnames)) From 9fbf319a68a2794aa00f4629d23eb1b3ee54452a Mon Sep 17 00:00:00 2001 From: Jayaram Kancherla Date: Wed, 3 Jan 2024 13:06:54 -0800 Subject: [PATCH 20/21] create empty MAE --- tests/test_create.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/test_create.py b/tests/test_create.py index 831cabb..a9f7fcf 100644 --- a/tests/test_create.py +++ b/tests/test_create.py @@ -164,3 +164,9 @@ def test_MAE_save(): assert mdata is not None assert len(mdata.mod.keys()) == 2 + +def test_empty_mae(): + mae = MultiAssayExperiment(experiments={}) + + assert mae is not None + assert isinstance(mae, MultiAssayExperiment) \ No newline at end of file From 07c1df5c54d7a0e73e28e8b39d118035b70a0825 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 3 Jan 2024 21:07:04 +0000 Subject: [PATCH 21/21] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/test_create.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_create.py b/tests/test_create.py index a9f7fcf..6b7af71 100644 --- a/tests/test_create.py +++ b/tests/test_create.py @@ -165,8 +165,9 @@ def test_MAE_save(): assert mdata is not None assert len(mdata.mod.keys()) == 2 + def test_empty_mae(): mae = MultiAssayExperiment(experiments={}) assert mae is not None - assert isinstance(mae, MultiAssayExperiment) \ No newline at end of file + assert isinstance(mae, MultiAssayExperiment)