From d625a858caf5d69321bf37617c453f2f054daca7 Mon Sep 17 00:00:00 2001
From: Jayaram Kancherla
Date: Fri, 24 May 2024 14:51:25 -0700
Subject: [PATCH] Optimize/polish dataset for upload (#4)

including tests and documentation
---
 setup.cfg                      |   2 +-
 src/scrnaseq/__init__.py       |   4 +-
 src/scrnaseq/polish_dataset.py | 131 ++++++++++++++++++++++++
 tests/test_polish_dataset.py   | 177 +++++++++++++++++++++++++++++++++
 4 files changed, 312 insertions(+), 2 deletions(-)
 create mode 100644 src/scrnaseq/polish_dataset.py
 create mode 100644 tests/test_polish_dataset.py

diff --git a/setup.cfg b/setup.cfg
index 013c9e6..37f3a0c 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -53,7 +53,7 @@ install_requires =
     dolomite_matrix
     dolomite_sce>=0.1.2
    gypsum_client>=0.1.1
-    delayedarray
+    delayedarray>=0.5.1
     summarizedexperiment
     singlecellexperiment
     pandas
diff --git a/src/scrnaseq/__init__.py b/src/scrnaseq/__init__.py
index 254d666..9441006 100644
--- a/src/scrnaseq/__init__.py
+++ b/src/scrnaseq/__init__.py
@@ -18,4 +18,6 @@
 from .fetch_dataset import fetch_dataset, fetch_metadata
 from .list_datasets import list_datasets
 from .list_versions import fetch_latest_version, list_versions
-from .save_dataset import save_dataset
\ No newline at end of file
+from .polish_dataset import polish_dataset
+from .save_dataset import save_dataset
+from .upload_dataset import upload_dataset
diff --git a/src/scrnaseq/polish_dataset.py b/src/scrnaseq/polish_dataset.py
new file mode 100644
index 0000000..04f1fa2
--- /dev/null
+++ b/src/scrnaseq/polish_dataset.py
@@ -0,0 +1,131 @@
+from typing import Type
+from warnings import warn
+
+import numpy as np
+from delayedarray import DelayedArray
+from scipy import sparse as sp
+from singlecellexperiment import SingleCellExperiment
+from summarizedexperiment import SummarizedExperiment
+
+__author__ = "Jayaram Kancherla"
+__copyright__ = "Jayaram Kancherla"
+__license__ = "MIT"
+
+
+def polish_dataset(
+    x: Type[SummarizedExperiment],
+    reformat_assay_by_density: float = 0.3,
+    attempt_integer_conversion: bool = True,
+    remove_altexp_coldata: bool = True,
+    forbid_nested_altexp: bool = True,
+) -> Type[SummarizedExperiment]:
+    """Optimize a dataset for saving.
+
+    Prepare a
+    :py:class:`~summarizedexperiment.SummarizedExperiment.SummarizedExperiment` or
+    :py:class:`~singlecellexperiment.SingleCellExperiment.SingleCellExperiment`
+    to be saved with :py:func:`scrnaseq.save_dataset.save_dataset`.
+
+    This makes minor changes to improve storage efficiency, especially
+    for matrices.
+
+    Args:
+        x:
+            A :py:class:`~summarizedexperiment.SummarizedExperiment.SummarizedExperiment`
+            or one of its derivatives.
+
+        reformat_assay_by_density:
+            Whether to optimize assay formats based on the density of non-zero values.
+            Assays with densities above this number are converted to ordinary dense
+            arrays (if they are not already), while those with lower densities are
+            converted to sparse matrices.
+
+            This can be disabled by setting it to `None`.
+
+        attempt_integer_conversion:
+            Whether to convert double-precision assays containing integer values
+            to actually have the integer type.
+
+            This can improve the efficiency of downstream applications by avoiding
+            the need to operate in double precision.
+
+        remove_altexp_coldata:
+            Whether column data for alternative experiments should be removed.
+            Defaults to `True`, as the alternative experiment column data is
+            usually redundant compared to the main experiment.
+
+        forbid_nested_altexp:
+            Whether nested alternative experiments (i.e., alternative experiments of
+            alternative experiments) should be forbidden.
+
+    Returns:
+        A modified object with the same type as ``x``.
+    """
+    return _polish_dataset(
+        x,
+        reformat_assay_by_density,
+        attempt_integer_conversion,
+        remove_altexp_coldata,
+        forbid_nested_altexp,
+    )
+
+
+def _polish_dataset(
+    x: Type[SummarizedExperiment],
+    reformat_assay_by_density: float,
+    attempt_integer_conversion: bool,
+    remove_altexp_coldata: bool,
+    forbid_nested_altexp: bool,
+    level: int = 0,
+):
+    new_assays = {}
+    for asyname, asy in x.assays.items():
+        if reformat_assay_by_density is not None:
+            density = min(np.mean(asy != 0), np.mean(asy != np.nan))
+            if density < reformat_assay_by_density:
+                if not sp.issparse(asy):
+                    asy = sp.csr_matrix(asy)
+            else:
+                if sp.issparse(asy):
+                    asy = asy.toarray()
+
+        if attempt_integer_conversion:
+            if np.issubdtype(asy.dtype, np.floating):
+                _cast = False
+                if sp.issparse(asy):
+                    if not np.any(asy.data % 1 != 0):
+                        _cast = True
+                elif not np.any(asy % 1 != 0):
+                    _cast = True
+
+                if _cast is True:
+                    asy = asy.astype(np.int_)
+
+        new_assays[asyname] = asy
+
+    x = x.set_assays(new_assays)
+
+    if isinstance(x, SingleCellExperiment):
+        if len(x.get_alternative_experiment_names()) > 0:
+            if forbid_nested_altexp and level > 0:
+                raise ValueError("Nested alternative experiments are forbidden.")
+
+            new_alts = {}
+            for altname, altexp in x.alternative_experiments.items():
+                if remove_altexp_coldata:
+                    altexp = altexp.set_column_data(None)
+
+                altexp = _polish_dataset(
+                    altexp,
+                    reformat_assay_by_density=reformat_assay_by_density,
+                    attempt_integer_conversion=attempt_integer_conversion,
+                    remove_altexp_coldata=remove_altexp_coldata,
+                    forbid_nested_altexp=forbid_nested_altexp,
+                    level=level + 1,
+                )
+
+                new_alts[altname] = altexp
+
+            x = x.set_alternative_experiments(new_alts)
+
+    return x
diff --git a/tests/test_polish_dataset.py b/tests/test_polish_dataset.py
new file mode 100644
index 0000000..9a310ff
--- /dev/null
+++ b/tests/test_polish_dataset.py
@@ -0,0 +1,177 @@
+import numpy as np
+import pytest
+from biocframe import BiocFrame
+from scipy import sparse
+from scrnaseq import fetch_dataset, polish_dataset
+from singlecellexperiment import SingleCellExperiment
+
+__author__ = "Jayaram Kancherla"
+__copyright__ = "Jayaram Kancherla"
+__license__ = "MIT"
+
+
+def generate_matrix():
+    mat = np.random.poisson(1, (100, 10))
+    return mat
+
+
+def test_polish_dataset_strips_assay_dimnames():
+    mat = generate_matrix()
+    row_names = [f"GENE_{i}" for i in range(mat.shape[0])]
+    col_names = list("ABCDEFGHIJ")
+    sce = SingleCellExperiment(
+        assays={"counts": mat},
+        row_data=BiocFrame(row_names=row_names),
+        column_data=BiocFrame(row_names=col_names),
+    )
+
+    y = polish_dataset(sce)
+    assert sce.shape == y.shape
+    assert sce.get_assay_names() == y.get_assay_names()
+    assert y.get_row_names() is not None
+    assert y.get_column_names() is not None
+
+    sce.row_data = sce.row_data.set_row_names([name.lower() for name in row_names])
+    sce.column_data = sce.column_data.set_row_names(
+        [name.lower() for name in col_names]
+    )
+    y = polish_dataset(sce)
+    assert sce.shape == y.shape
+    assert sce.get_assay_names() == y.get_assay_names()
+    assert y.get_row_names() == sce.get_row_names()
+    assert y.get_column_names() == sce.get_column_names()
+
+
+def test_polish_dataset_strips_reduced_dimension_names():
+    mat = generate_matrix()
+    row_names = [f"GENE_{i}" for i in range(mat.shape[0])]
+    col_names = list("ABCDEFGHIJ")
+    pca = np.random.randn(10, 5)
+    sce = SingleCellExperiment(
+        assays={"counts": mat},
+        row_data=BiocFrame(row_names=row_names),
+        column_data=BiocFrame(row_names=col_names),
+        reduced_dims={"pca": pca},
+    )
+
+    y = polish_dataset(sce)
+    assert sce.shape == y.shape
+    assert sce.get_assay_names() == y.get_assay_names()
+    assert y.get_row_names() is not None
+    assert y.get_column_names() is not None
+    assert sce.get_reduced_dim_names() == y.get_reduced_dim_names()
+    assert y.reduced_dims["pca"].shape == pca.shape
+
+
+def test_polish_dataset_strips_alternative_experiment_names():
+    mat = generate_matrix()
+    row_names = [f"GENE_{i}" for i in range(mat.shape[0])]
+    col_names = list("ABCDEFGHIJ")
+    pca = np.random.randn(10, 5)
+    altexp = SingleCellExperiment(
+        assays={"counts": mat},
+        row_data=BiocFrame(row_names=row_names),
+        column_data=BiocFrame(row_names=col_names),
+    )
+    sce = SingleCellExperiment(
+        assays={"counts": mat},
+        row_data=BiocFrame(row_names=row_names),
+        column_data=BiocFrame(row_names=col_names),
+        reduced_dims={"pca": pca},
+        alternative_experiments={"rando": altexp},
+    )
+
+    y = polish_dataset(sce)
+    assert y.alternative_experiments["rando"].shape == altexp.shape
+    assert y.alternative_experiments["rando"].get_row_names() == altexp.get_row_names()
+
+
+def test_polish_dataset_converts_dense_to_sparse():
+    mat = np.random.poisson(0.2, (100, 10))
+    sce = SingleCellExperiment(assays={"counts": mat})
+
+    y = polish_dataset(sce)
+    assert sparse.issparse(y.assays["counts"])
+
+    y = polish_dataset(sce, reformat_assay_by_density=0)
+    assert not sparse.issparse(y.assays["counts"])
+    assert y.shape == mat.shape
+
+    y = polish_dataset(sce, reformat_assay_by_density=None)
+    assert not sparse.issparse(y.assays["counts"])
+
+
+def test_polish_dataset_converts_sparse_to_dense():
+    mat = np.random.poisson(3, (100, 10))
+    sce = SingleCellExperiment(assays={"counts": sparse.csr_matrix(mat)})
+
+    y = polish_dataset(sce)
+    assert not sparse.issparse(y.assays["counts"])
+
+    y = polish_dataset(sce, reformat_assay_by_density=1)
+    assert sparse.issparse(y.assays["counts"])
+
+    y = polish_dataset(sce, reformat_assay_by_density=None)
+    assert sparse.issparse(y.assays["counts"])
+
+
+def test_polish_dataset_attempts_integer_conversions():
+    mat = np.random.poisson(3, (100, 10)).astype(float)
+    sce = SingleCellExperiment(assays={"counts": mat})
+
+    y = polish_dataset(sce)
+    assert y.assays["counts"].dtype == np.int_
+
+    mat = np.random.poisson(0.1, (100, 10)).astype(float)
+    sce = SingleCellExperiment(assays={"counts": sparse.csr_matrix(mat)})
+
+    y = polish_dataset(sce)
+    assert sparse.issparse(y.assays["counts"])
+    assert y.assays["counts"].dtype == np.int_
+
+    mat = np.random.poisson(3, (100, 10)) * 1.5
+    sce = SingleCellExperiment(assays={"counts": mat})
+
+    y = polish_dataset(sce)
+    assert y.assays["counts"].dtype == np.float_
+
+
+def test_polish_dataset_works_with_na_values():
+    mat = np.random.poisson(0.1, (100, 10))
+    mat = mat.astype(np.float_)
+    mat.ravel()[np.random.choice(mat.size, 10, replace=False)] = np.nan
+    sce = SingleCellExperiment(assays={"counts": sparse.csr_matrix(mat)})
+
+    y = polish_dataset(sce)
+    assert sparse.issparse(y.assays["counts"])
+    assert y.assays["counts"].dtype == np.float_
+
+
+def test_polish_dataset_forbids_highly_nested_altexps():
+    mat = np.random.poisson(1, (100, 10))
+    sce2 = SingleCellExperiment(assays={"counts": mat[:5] + 2})
+
+    sce1 = SingleCellExperiment(
+        assays={"counts": mat[:10] + 1}, alternative_experiments={"rando": sce2}
+ ) + + sce0 = SingleCellExperiment( + assays={"counts": mat}, alternative_experiments={"rando": sce1} + ) + + with pytest.raises(Exception): + polish_dataset(sce0) + + y = polish_dataset(sce0, forbid_nested_altexp=False) + assert ( + y.get_alternative_experiment_names() == sce0.get_alternative_experiment_names() + ) + + +def test_polish_existing_dataset(): + sce = fetch_dataset("zeisel-brain-2015", "2023-12-14") + + y = polish_dataset(sce) + + assert y.shape == sce.shape + assert type(y.assays["counts"]) != type(sce.assays["counts"])