Optimize/polish dataset for upload (#4)

including tests and documentation
BiocPy · May 24, 2024 · d625a85 · d625a85
1 parent ca14950
commit d625a85
Show file tree

Hide file tree

Showing 4 changed files with 312 additions and 2 deletions.
diff --git a/setup.cfg b/setup.cfg
@@ -53,7 +53,7 @@ install_requires =
     dolomite_matrix
     dolomite_sce>=0.1.2
     gypsum_client>=0.1.1
-    delayedarray
+    delayedarray>=0.5.1
     summarizedexperiment
     singlecellexperiment
     pandas

diff --git a/src/scrnaseq/__init__.py b/src/scrnaseq/__init__.py
@@ -18,4 +18,6 @@
 from .fetch_dataset import fetch_dataset, fetch_metadata
 from .list_datasets import list_datasets
 from .list_versions import fetch_latest_version, list_versions
-from .save_dataset import save_dataset
+from .polish_dataset import polish_dataset
+from .save_dataset import save_dataset
+from .upload_dataset import upload_dataset
diff --git a/src/scrnaseq/polish_dataset.py b/src/scrnaseq/polish_dataset.py
@@ -0,0 +1,131 @@
+from typing import Type
+from warnings import warn
+
+import numpy as np
+from delayedarray import DelayedArray
+from scipy import sparse as sp
+from singlecellexperiment import SingleCellExperiment
+from summarizedexperiment import SummarizedExperiment
+
+__author__ = "Jayaram Kancherla"
+__copyright__ = "Jayaram Kancherla"
+__license__ = "MIT"
+
+
+def polish_dataset(
+    x: Type[SummarizedExperiment],
+    reformat_assay_by_density: float = 0.3,
+    attempt_integer_conversion: bool = True,
+    remove_altexp_coldata: bool = True,
+    forbid_nested_altexp: bool = True,
+) -> Type[SummarizedExperiment]:
+    """Optimize dataset for saving.
+
+    Prepare a
+    :py:class:`~summarizedexperiment.SummarizedExperiment.SummarizedExperiment` or
+    :py:class:`~singlecellexperiment.SingleCellExperiment.SingleCellExperiment`
+    to be saved with :py:func:`scrnaseq.save_dataset.save_dataset`.
+
+    This performs minor changes to improve storage efficiency, especially
+    with matrices.
+
+    Args:
+        x:
+            A :py:class:`~summarizedexperiment.SummarizedExperiment.SummarizedExperiment`
+            or one of its derivative.
+
+        reformat_assay_by_density:
+            Whether to optimize assay formats based on the density of non-zero values.
+            Assays with densities above this number are converted to ordinary dense
+            arrays (if they are not already), while those with lower densities are
+            converted to sparse matrices.
+
+            This can be disabled by setting it to `None`.
+
+        attempt_integer_conversion:
+            Whether to convert double-precision assays containing integer values
+            to actually have the integer type.
+
+            This can improve efficiency of downstream applications by avoiding
+            the need to operate in double precision.
+
+        remove_altexp_coldata:
+            Whether column data for alternative experiments should be removed.
+            Defaults to `True` as the alternative experiment column data is
+            usually redundant compared to the main experiment.
+
+        forbid_nested_altexp:
+            Whether nested alternative experiments (i.e., alternative experiments of
+            alternative experiments) should be forbidden.
+
+    Returns:
+        A modifed object with the same type as ``x``.
+    """
+    return _polish_dataset(
+        x,
+        reformat_assay_by_density,
+        attempt_integer_conversion,
+        remove_altexp_coldata,
+        forbid_nested_altexp,
+    )
+
+
+def _polish_dataset(
+    x: Type[SummarizedExperiment],
+    reformat_assay_by_density: float,
+    attempt_integer_conversion: bool,
+    remove_altexp_coldata: bool,
+    forbid_nested_altexp: bool,
+    level: int = 0,
+):
+    new_assays = {}
+    for asyname, asy in x.assays.items():
+        if reformat_assay_by_density is not None:
+            density = min(np.mean(asy != 0), np.mean(asy != np.nan))
+            if density < reformat_assay_by_density:
+                if not sp.issparse(asy):
+                    asy = sp.csr_matrix(asy)
+            else:
+                if sp.issparse(asy):
+                    asy = asy.toarray()
+
+        if attempt_integer_conversion:
+            if np.issubdtype(asy.dtype, np.floating):
+                _cast = False
+                if sp.issparse(asy):
+                    if not np.any(asy.data % 1 != 0):
+                        _cast = True
+                elif not np.any(asy % 1 != 0):
+                    _cast = True
+
+                if _cast is True:
+                    asy = asy.astype(np.int_)
+
+        new_assays[asyname] = asy
+
+    x = x.set_assays(new_assays)
+
+    if isinstance(x, SingleCellExperiment):
+        if len(x.get_alternative_experiment_names()) > 0:
+            if forbid_nested_altexp and level > 0:
+                raise ValueError("Nested alternative experiments are forbidden.")
+
+        new_alts = {}
+        for altname, altexp in x.alternative_experiments.items():
+            if remove_altexp_coldata:
+                altexp = altexp.set_column_data(None)
+
+            altexp = _polish_dataset(
+                altexp,
+                reformat_assay_by_density=reformat_assay_by_density,
+                attempt_integer_conversion=attempt_integer_conversion,
+                remove_altexp_coldata=remove_altexp_coldata,
+                forbid_nested_altexp=forbid_nested_altexp,
+                level=level + 1,
+            )
+
+            new_alts[altname] = altexp
+
+        x = x.set_alternative_experiments(new_alts)
+
+    return x
diff --git a/tests/test_polish_dataset.py b/tests/test_polish_dataset.py
@@ -0,0 +1,177 @@
+import numpy as np
+import pytest
+from biocframe import BiocFrame
+from scipy import sparse
+from scrnaseq import fetch_dataset, polish_dataset
+from singlecellexperiment import SingleCellExperiment
+
+__author__ = "Jayaram Kancherla"
+__copyright__ = "Jayaram Kancherla"
+__license__ = "MIT"
+
+
+def generate_matrix():
+    mat = np.random.poisson(1, (100, 10))
+    return mat
+
+
+def test_polish_dataset_strips_assay_dimnames():
+    mat = generate_matrix()
+    row_names = [f"GENE_{i}" for i in range(mat.shape[0])]
+    col_names = list("ABCDEFGHIJ")
+    sce = SingleCellExperiment(
+        assays={"counts": mat},
+        row_data=BiocFrame(row_names=row_names),
+        column_data=BiocFrame(row_names=col_names),
+    )
+
+    y = polish_dataset(sce)
+    assert sce.shape == y.shape
+    assert sce.get_assay_names() == y.get_assay_names()
+    assert y.get_row_names() is not None
+    assert y.get_column_names() is not None
+
+    sce.row_data = sce.row_data.set_row_names([name.lower() for name in row_names])
+    sce.column_names = sce.column_data.set_row_names(
+        [name.lower() for name in col_names]
+    )
+    y = polish_dataset(sce)
+    assert sce.shape == y.shape
+    assert sce.get_assay_names() == y.get_assay_names()
+    assert y.get_row_names() == sce.get_row_names()
+    assert y.get_column_names() == sce.get_column_names()
+
+
+def test_polish_dataset_strips_reduced_dimension_names():
+    mat = generate_matrix()
+    row_names = [f"GENE_{i}" for i in range(mat.shape[0])]
+    col_names = list("ABCDEFGHIJ")
+    pca = np.random.randn(10, 5)
+    sce = SingleCellExperiment(
+        assays={"counts": mat},
+        row_data=BiocFrame(row_names=row_names),
+        column_data=BiocFrame(row_names=col_names),
+        reduced_dims={"pca": pca},
+    )
+
+    y = polish_dataset(sce)
+    assert sce.shape == y.shape
+    assert sce.get_assay_names() == y.get_assay_names()
+    assert y.get_row_names() is not None
+    assert y.get_column_names() is not None
+    assert sce.get_reduced_dim_names() == y.get_reduced_dim_names()
+    assert y.reduced_dims["pca"].shape == pca.shape
+
+
+def test_polish_dataset_strips_alternative_experiment_names():
+    mat = generate_matrix()
+    row_names = [f"GENE_{i}" for i in range(mat.shape[0])]
+    col_names = list("ABCDEFGHIJ")
+    pca = np.random.randn(10, 5)
+    altexp = SingleCellExperiment(
+        assays={"counts": mat},
+        row_data=BiocFrame(row_names=row_names),
+        column_data=BiocFrame(row_names=col_names),
+    )
+    sce = SingleCellExperiment(
+        assays={"counts": mat},
+        row_data=BiocFrame(row_names=row_names),
+        column_data=BiocFrame(row_names=col_names),
+        reduced_dims={"pca": pca},
+        alternative_experiments={"rando": altexp},
+    )
+
+    y = polish_dataset(sce)
+    assert y.alternative_experiments["rando"].shape == altexp.shape
+    assert y.alternative_experiments["rando"].get_row_names() == altexp.get_row_names()
+
+
+def test_polish_dataset_converts_dense_to_sparse():
+    mat = np.random.poisson(0.2, (100, 10))
+    sce = SingleCellExperiment(assays={"counts": mat})
+
+    y = polish_dataset(sce)
+    assert sparse.issparse(y.assays["counts"])
+
+    y = polish_dataset(sce, reformat_assay_by_density=0)
+    assert not sparse.issparse(y.assays["counts"])
+    assert y.shape == mat.shape
+
+    y = polish_dataset(sce, reformat_assay_by_density=None)
+    assert not sparse.issparse(y.assays["counts"])
+
+
+def test_polish_dataset_converts_sparse_to_dense():
+    mat = np.random.poisson(3, (100, 10))
+    sce = SingleCellExperiment(assays={"counts": sparse.csr_matrix(mat)})
+
+    y = polish_dataset(sce)
+    assert not sparse.issparse(y.assays["counts"])
+
+    y = polish_dataset(sce, reformat_assay_by_density=1)
+    assert sparse.issparse(y.assays["counts"])
+
+    y = polish_dataset(sce, reformat_assay_by_density=None)
+    assert sparse.issparse(y.assays["counts"])
+
+
+def test_polish_dataset_attempts_integer_conversions():
+    mat = np.random.poisson(3, (100, 10)).astype(float)
+    sce = SingleCellExperiment(assays={"counts": mat})
+
+    y = polish_dataset(sce)
+    assert y.assays["counts"].dtype == np.int_
+
+    mat = np.random.poisson(0.1, (100, 10)).astype(float)
+    sce = SingleCellExperiment(assays={"counts": sparse.csr_matrix(mat)})
+
+    y = polish_dataset(sce)
+    assert sparse.issparse(y.assays["counts"])
+    assert y.assays["counts"].dtype == np.int_
+
+    mat = np.random.poisson(3, (100, 10)) * 1.5
+    sce = SingleCellExperiment(assays={"counts": mat})
+
+    y = polish_dataset(sce)
+    assert y.assays["counts"].dtype == np.float_
+
+
+def test_polish_dataset_works_with_na_values():
+    mat = np.random.poisson(0.1, (100, 10))
+    mat = mat.astype(np.float_)
+    mat.ravel()[np.random.choice(mat.size, 10, replace=False)] = np.nan
+    sce = SingleCellExperiment(assays={"counts": sparse.csr_matrix(mat)})
+
+    y = polish_dataset(sce)
+    assert sparse.issparse(y.assays["counts"])
+    assert y.assays["counts"].dtype == np.float_
+
+
+def test_polish_dataset_forbids_highly_nested_altexps():
+    mat = np.random.poisson(1, (100, 10))
+    sce2 = SingleCellExperiment(assays={"counts": mat[:5] + 2})
+
+    sce1 = SingleCellExperiment(
+        assays={"counts": mat[:10] + 1}, alternative_experiments={"rando": sce2}
+    )
+
+    sce0 = SingleCellExperiment(
+        assays={"counts": mat}, alternative_experiments={"rando": sce1}
+    )
+
+    with pytest.raises(Exception):
+        polish_dataset(sce0)
+
+    y = polish_dataset(sce0, forbid_nested_altexp=False)
+    assert (
+        y.get_alternative_experiment_names() == sce0.get_alternative_experiment_names()
+    )
+
+
+def test_polish_existing_dataset():
+    sce = fetch_dataset("zeisel-brain-2015", "2023-12-14")
+
+    y = polish_dataset(sce)
+
+    assert y.shape == sce.shape
+    assert type(y.assays["counts"]) != type(sce.assays["counts"])