Skip to content

Commit

Permalink
Optimize/polish dataset for upload (#4)
Browse files Browse the repository at this point in the history
including tests and documentation
  • Loading branch information
jkanche authored May 24, 2024
1 parent ca14950 commit d625a85
Show file tree
Hide file tree
Showing 4 changed files with 312 additions and 2 deletions.
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ install_requires =
dolomite_matrix
dolomite_sce>=0.1.2
gypsum_client>=0.1.1
delayedarray
delayedarray>=0.5.1
summarizedexperiment
singlecellexperiment
pandas
Expand Down
4 changes: 3 additions & 1 deletion src/scrnaseq/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,4 +18,6 @@
from .fetch_dataset import fetch_dataset, fetch_metadata
from .list_datasets import list_datasets
from .list_versions import fetch_latest_version, list_versions
from .save_dataset import save_dataset
from .polish_dataset import polish_dataset
from .save_dataset import save_dataset
from .upload_dataset import upload_dataset
131 changes: 131 additions & 0 deletions src/scrnaseq/polish_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
from typing import Type
from warnings import warn

import numpy as np
from delayedarray import DelayedArray
from scipy import sparse as sp
from singlecellexperiment import SingleCellExperiment
from summarizedexperiment import SummarizedExperiment

__author__ = "Jayaram Kancherla"
__copyright__ = "Jayaram Kancherla"
__license__ = "MIT"


def polish_dataset(
x: Type[SummarizedExperiment],
reformat_assay_by_density: float = 0.3,
attempt_integer_conversion: bool = True,
remove_altexp_coldata: bool = True,
forbid_nested_altexp: bool = True,
) -> Type[SummarizedExperiment]:
"""Optimize dataset for saving.
Prepare a
:py:class:`~summarizedexperiment.SummarizedExperiment.SummarizedExperiment` or
:py:class:`~singlecellexperiment.SingleCellExperiment.SingleCellExperiment`
to be saved with :py:func:`scrnaseq.save_dataset.save_dataset`.
This performs minor changes to improve storage efficiency, especially
with matrices.
Args:
x:
A :py:class:`~summarizedexperiment.SummarizedExperiment.SummarizedExperiment`
or one of its derivative.
reformat_assay_by_density:
Whether to optimize assay formats based on the density of non-zero values.
Assays with densities above this number are converted to ordinary dense
arrays (if they are not already), while those with lower densities are
converted to sparse matrices.
This can be disabled by setting it to `None`.
attempt_integer_conversion:
Whether to convert double-precision assays containing integer values
to actually have the integer type.
This can improve efficiency of downstream applications by avoiding
the need to operate in double precision.
remove_altexp_coldata:
Whether column data for alternative experiments should be removed.
Defaults to `True` as the alternative experiment column data is
usually redundant compared to the main experiment.
forbid_nested_altexp:
Whether nested alternative experiments (i.e., alternative experiments of
alternative experiments) should be forbidden.
Returns:
A modifed object with the same type as ``x``.
"""
return _polish_dataset(
x,
reformat_assay_by_density,
attempt_integer_conversion,
remove_altexp_coldata,
forbid_nested_altexp,
)


def _polish_dataset(
x: Type[SummarizedExperiment],
reformat_assay_by_density: float,
attempt_integer_conversion: bool,
remove_altexp_coldata: bool,
forbid_nested_altexp: bool,
level: int = 0,
):
new_assays = {}
for asyname, asy in x.assays.items():
if reformat_assay_by_density is not None:
density = min(np.mean(asy != 0), np.mean(asy != np.nan))
if density < reformat_assay_by_density:
if not sp.issparse(asy):
asy = sp.csr_matrix(asy)
else:
if sp.issparse(asy):
asy = asy.toarray()

if attempt_integer_conversion:
if np.issubdtype(asy.dtype, np.floating):
_cast = False
if sp.issparse(asy):
if not np.any(asy.data % 1 != 0):
_cast = True
elif not np.any(asy % 1 != 0):
_cast = True

if _cast is True:
asy = asy.astype(np.int_)

new_assays[asyname] = asy

x = x.set_assays(new_assays)

if isinstance(x, SingleCellExperiment):
if len(x.get_alternative_experiment_names()) > 0:
if forbid_nested_altexp and level > 0:
raise ValueError("Nested alternative experiments are forbidden.")

new_alts = {}
for altname, altexp in x.alternative_experiments.items():
if remove_altexp_coldata:
altexp = altexp.set_column_data(None)

altexp = _polish_dataset(
altexp,
reformat_assay_by_density=reformat_assay_by_density,
attempt_integer_conversion=attempt_integer_conversion,
remove_altexp_coldata=remove_altexp_coldata,
forbid_nested_altexp=forbid_nested_altexp,
level=level + 1,
)

new_alts[altname] = altexp

x = x.set_alternative_experiments(new_alts)

return x
177 changes: 177 additions & 0 deletions tests/test_polish_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,177 @@
import numpy as np
import pytest
from biocframe import BiocFrame
from scipy import sparse
from scrnaseq import fetch_dataset, polish_dataset
from singlecellexperiment import SingleCellExperiment

__author__ = "Jayaram Kancherla"
__copyright__ = "Jayaram Kancherla"
__license__ = "MIT"


def generate_matrix():
mat = np.random.poisson(1, (100, 10))
return mat


def test_polish_dataset_strips_assay_dimnames():
mat = generate_matrix()
row_names = [f"GENE_{i}" for i in range(mat.shape[0])]
col_names = list("ABCDEFGHIJ")
sce = SingleCellExperiment(
assays={"counts": mat},
row_data=BiocFrame(row_names=row_names),
column_data=BiocFrame(row_names=col_names),
)

y = polish_dataset(sce)
assert sce.shape == y.shape
assert sce.get_assay_names() == y.get_assay_names()
assert y.get_row_names() is not None
assert y.get_column_names() is not None

sce.row_data = sce.row_data.set_row_names([name.lower() for name in row_names])
sce.column_names = sce.column_data.set_row_names(
[name.lower() for name in col_names]
)
y = polish_dataset(sce)
assert sce.shape == y.shape
assert sce.get_assay_names() == y.get_assay_names()
assert y.get_row_names() == sce.get_row_names()
assert y.get_column_names() == sce.get_column_names()


def test_polish_dataset_strips_reduced_dimension_names():
mat = generate_matrix()
row_names = [f"GENE_{i}" for i in range(mat.shape[0])]
col_names = list("ABCDEFGHIJ")
pca = np.random.randn(10, 5)
sce = SingleCellExperiment(
assays={"counts": mat},
row_data=BiocFrame(row_names=row_names),
column_data=BiocFrame(row_names=col_names),
reduced_dims={"pca": pca},
)

y = polish_dataset(sce)
assert sce.shape == y.shape
assert sce.get_assay_names() == y.get_assay_names()
assert y.get_row_names() is not None
assert y.get_column_names() is not None
assert sce.get_reduced_dim_names() == y.get_reduced_dim_names()
assert y.reduced_dims["pca"].shape == pca.shape


def test_polish_dataset_strips_alternative_experiment_names():
mat = generate_matrix()
row_names = [f"GENE_{i}" for i in range(mat.shape[0])]
col_names = list("ABCDEFGHIJ")
pca = np.random.randn(10, 5)
altexp = SingleCellExperiment(
assays={"counts": mat},
row_data=BiocFrame(row_names=row_names),
column_data=BiocFrame(row_names=col_names),
)
sce = SingleCellExperiment(
assays={"counts": mat},
row_data=BiocFrame(row_names=row_names),
column_data=BiocFrame(row_names=col_names),
reduced_dims={"pca": pca},
alternative_experiments={"rando": altexp},
)

y = polish_dataset(sce)
assert y.alternative_experiments["rando"].shape == altexp.shape
assert y.alternative_experiments["rando"].get_row_names() == altexp.get_row_names()


def test_polish_dataset_converts_dense_to_sparse():
mat = np.random.poisson(0.2, (100, 10))
sce = SingleCellExperiment(assays={"counts": mat})

y = polish_dataset(sce)
assert sparse.issparse(y.assays["counts"])

y = polish_dataset(sce, reformat_assay_by_density=0)
assert not sparse.issparse(y.assays["counts"])
assert y.shape == mat.shape

y = polish_dataset(sce, reformat_assay_by_density=None)
assert not sparse.issparse(y.assays["counts"])


def test_polish_dataset_converts_sparse_to_dense():
mat = np.random.poisson(3, (100, 10))
sce = SingleCellExperiment(assays={"counts": sparse.csr_matrix(mat)})

y = polish_dataset(sce)
assert not sparse.issparse(y.assays["counts"])

y = polish_dataset(sce, reformat_assay_by_density=1)
assert sparse.issparse(y.assays["counts"])

y = polish_dataset(sce, reformat_assay_by_density=None)
assert sparse.issparse(y.assays["counts"])


def test_polish_dataset_attempts_integer_conversions():
mat = np.random.poisson(3, (100, 10)).astype(float)
sce = SingleCellExperiment(assays={"counts": mat})

y = polish_dataset(sce)
assert y.assays["counts"].dtype == np.int_

mat = np.random.poisson(0.1, (100, 10)).astype(float)
sce = SingleCellExperiment(assays={"counts": sparse.csr_matrix(mat)})

y = polish_dataset(sce)
assert sparse.issparse(y.assays["counts"])
assert y.assays["counts"].dtype == np.int_

mat = np.random.poisson(3, (100, 10)) * 1.5
sce = SingleCellExperiment(assays={"counts": mat})

y = polish_dataset(sce)
assert y.assays["counts"].dtype == np.float_


def test_polish_dataset_works_with_na_values():
mat = np.random.poisson(0.1, (100, 10))
mat = mat.astype(np.float_)
mat.ravel()[np.random.choice(mat.size, 10, replace=False)] = np.nan
sce = SingleCellExperiment(assays={"counts": sparse.csr_matrix(mat)})

y = polish_dataset(sce)
assert sparse.issparse(y.assays["counts"])
assert y.assays["counts"].dtype == np.float_


def test_polish_dataset_forbids_highly_nested_altexps():
mat = np.random.poisson(1, (100, 10))
sce2 = SingleCellExperiment(assays={"counts": mat[:5] + 2})

sce1 = SingleCellExperiment(
assays={"counts": mat[:10] + 1}, alternative_experiments={"rando": sce2}
)

sce0 = SingleCellExperiment(
assays={"counts": mat}, alternative_experiments={"rando": sce1}
)

with pytest.raises(Exception):
polish_dataset(sce0)

y = polish_dataset(sce0, forbid_nested_altexp=False)
assert (
y.get_alternative_experiment_names() == sce0.get_alternative_experiment_names()
)


def test_polish_existing_dataset():
sce = fetch_dataset("zeisel-brain-2015", "2023-12-14")

y = polish_dataset(sce)

assert y.shape == sce.shape
assert type(y.assays["counts"]) != type(sce.assays["counts"])

0 comments on commit d625a85

Please sign in to comment.