-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Optimize/polish dataset for upload (#4)
including tests and documentation
- Loading branch information
Showing
4 changed files
with
312 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,131 @@ | ||
from typing import Type | ||
from warnings import warn | ||
|
||
import numpy as np | ||
from delayedarray import DelayedArray | ||
from scipy import sparse as sp | ||
from singlecellexperiment import SingleCellExperiment | ||
from summarizedexperiment import SummarizedExperiment | ||
|
||
__author__ = "Jayaram Kancherla" | ||
__copyright__ = "Jayaram Kancherla" | ||
__license__ = "MIT" | ||
|
||
|
||
def polish_dataset( | ||
x: Type[SummarizedExperiment], | ||
reformat_assay_by_density: float = 0.3, | ||
attempt_integer_conversion: bool = True, | ||
remove_altexp_coldata: bool = True, | ||
forbid_nested_altexp: bool = True, | ||
) -> Type[SummarizedExperiment]: | ||
"""Optimize dataset for saving. | ||
Prepare a | ||
:py:class:`~summarizedexperiment.SummarizedExperiment.SummarizedExperiment` or | ||
:py:class:`~singlecellexperiment.SingleCellExperiment.SingleCellExperiment` | ||
to be saved with :py:func:`scrnaseq.save_dataset.save_dataset`. | ||
This performs minor changes to improve storage efficiency, especially | ||
with matrices. | ||
Args: | ||
x: | ||
A :py:class:`~summarizedexperiment.SummarizedExperiment.SummarizedExperiment` | ||
or one of its derivative. | ||
reformat_assay_by_density: | ||
Whether to optimize assay formats based on the density of non-zero values. | ||
Assays with densities above this number are converted to ordinary dense | ||
arrays (if they are not already), while those with lower densities are | ||
converted to sparse matrices. | ||
This can be disabled by setting it to `None`. | ||
attempt_integer_conversion: | ||
Whether to convert double-precision assays containing integer values | ||
to actually have the integer type. | ||
This can improve efficiency of downstream applications by avoiding | ||
the need to operate in double precision. | ||
remove_altexp_coldata: | ||
Whether column data for alternative experiments should be removed. | ||
Defaults to `True` as the alternative experiment column data is | ||
usually redundant compared to the main experiment. | ||
forbid_nested_altexp: | ||
Whether nested alternative experiments (i.e., alternative experiments of | ||
alternative experiments) should be forbidden. | ||
Returns: | ||
A modifed object with the same type as ``x``. | ||
""" | ||
return _polish_dataset( | ||
x, | ||
reformat_assay_by_density, | ||
attempt_integer_conversion, | ||
remove_altexp_coldata, | ||
forbid_nested_altexp, | ||
) | ||
|
||
|
||
def _polish_dataset( | ||
x: Type[SummarizedExperiment], | ||
reformat_assay_by_density: float, | ||
attempt_integer_conversion: bool, | ||
remove_altexp_coldata: bool, | ||
forbid_nested_altexp: bool, | ||
level: int = 0, | ||
): | ||
new_assays = {} | ||
for asyname, asy in x.assays.items(): | ||
if reformat_assay_by_density is not None: | ||
density = min(np.mean(asy != 0), np.mean(asy != np.nan)) | ||
if density < reformat_assay_by_density: | ||
if not sp.issparse(asy): | ||
asy = sp.csr_matrix(asy) | ||
else: | ||
if sp.issparse(asy): | ||
asy = asy.toarray() | ||
|
||
if attempt_integer_conversion: | ||
if np.issubdtype(asy.dtype, np.floating): | ||
_cast = False | ||
if sp.issparse(asy): | ||
if not np.any(asy.data % 1 != 0): | ||
_cast = True | ||
elif not np.any(asy % 1 != 0): | ||
_cast = True | ||
|
||
if _cast is True: | ||
asy = asy.astype(np.int_) | ||
|
||
new_assays[asyname] = asy | ||
|
||
x = x.set_assays(new_assays) | ||
|
||
if isinstance(x, SingleCellExperiment): | ||
if len(x.get_alternative_experiment_names()) > 0: | ||
if forbid_nested_altexp and level > 0: | ||
raise ValueError("Nested alternative experiments are forbidden.") | ||
|
||
new_alts = {} | ||
for altname, altexp in x.alternative_experiments.items(): | ||
if remove_altexp_coldata: | ||
altexp = altexp.set_column_data(None) | ||
|
||
altexp = _polish_dataset( | ||
altexp, | ||
reformat_assay_by_density=reformat_assay_by_density, | ||
attempt_integer_conversion=attempt_integer_conversion, | ||
remove_altexp_coldata=remove_altexp_coldata, | ||
forbid_nested_altexp=forbid_nested_altexp, | ||
level=level + 1, | ||
) | ||
|
||
new_alts[altname] = altexp | ||
|
||
x = x.set_alternative_experiments(new_alts) | ||
|
||
return x |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,177 @@ | ||
import numpy as np | ||
import pytest | ||
from biocframe import BiocFrame | ||
from scipy import sparse | ||
from scrnaseq import fetch_dataset, polish_dataset | ||
from singlecellexperiment import SingleCellExperiment | ||
|
||
__author__ = "Jayaram Kancherla" | ||
__copyright__ = "Jayaram Kancherla" | ||
__license__ = "MIT" | ||
|
||
|
||
def generate_matrix(): | ||
mat = np.random.poisson(1, (100, 10)) | ||
return mat | ||
|
||
|
||
def test_polish_dataset_strips_assay_dimnames(): | ||
mat = generate_matrix() | ||
row_names = [f"GENE_{i}" for i in range(mat.shape[0])] | ||
col_names = list("ABCDEFGHIJ") | ||
sce = SingleCellExperiment( | ||
assays={"counts": mat}, | ||
row_data=BiocFrame(row_names=row_names), | ||
column_data=BiocFrame(row_names=col_names), | ||
) | ||
|
||
y = polish_dataset(sce) | ||
assert sce.shape == y.shape | ||
assert sce.get_assay_names() == y.get_assay_names() | ||
assert y.get_row_names() is not None | ||
assert y.get_column_names() is not None | ||
|
||
sce.row_data = sce.row_data.set_row_names([name.lower() for name in row_names]) | ||
sce.column_names = sce.column_data.set_row_names( | ||
[name.lower() for name in col_names] | ||
) | ||
y = polish_dataset(sce) | ||
assert sce.shape == y.shape | ||
assert sce.get_assay_names() == y.get_assay_names() | ||
assert y.get_row_names() == sce.get_row_names() | ||
assert y.get_column_names() == sce.get_column_names() | ||
|
||
|
||
def test_polish_dataset_strips_reduced_dimension_names(): | ||
mat = generate_matrix() | ||
row_names = [f"GENE_{i}" for i in range(mat.shape[0])] | ||
col_names = list("ABCDEFGHIJ") | ||
pca = np.random.randn(10, 5) | ||
sce = SingleCellExperiment( | ||
assays={"counts": mat}, | ||
row_data=BiocFrame(row_names=row_names), | ||
column_data=BiocFrame(row_names=col_names), | ||
reduced_dims={"pca": pca}, | ||
) | ||
|
||
y = polish_dataset(sce) | ||
assert sce.shape == y.shape | ||
assert sce.get_assay_names() == y.get_assay_names() | ||
assert y.get_row_names() is not None | ||
assert y.get_column_names() is not None | ||
assert sce.get_reduced_dim_names() == y.get_reduced_dim_names() | ||
assert y.reduced_dims["pca"].shape == pca.shape | ||
|
||
|
||
def test_polish_dataset_strips_alternative_experiment_names(): | ||
mat = generate_matrix() | ||
row_names = [f"GENE_{i}" for i in range(mat.shape[0])] | ||
col_names = list("ABCDEFGHIJ") | ||
pca = np.random.randn(10, 5) | ||
altexp = SingleCellExperiment( | ||
assays={"counts": mat}, | ||
row_data=BiocFrame(row_names=row_names), | ||
column_data=BiocFrame(row_names=col_names), | ||
) | ||
sce = SingleCellExperiment( | ||
assays={"counts": mat}, | ||
row_data=BiocFrame(row_names=row_names), | ||
column_data=BiocFrame(row_names=col_names), | ||
reduced_dims={"pca": pca}, | ||
alternative_experiments={"rando": altexp}, | ||
) | ||
|
||
y = polish_dataset(sce) | ||
assert y.alternative_experiments["rando"].shape == altexp.shape | ||
assert y.alternative_experiments["rando"].get_row_names() == altexp.get_row_names() | ||
|
||
|
||
def test_polish_dataset_converts_dense_to_sparse(): | ||
mat = np.random.poisson(0.2, (100, 10)) | ||
sce = SingleCellExperiment(assays={"counts": mat}) | ||
|
||
y = polish_dataset(sce) | ||
assert sparse.issparse(y.assays["counts"]) | ||
|
||
y = polish_dataset(sce, reformat_assay_by_density=0) | ||
assert not sparse.issparse(y.assays["counts"]) | ||
assert y.shape == mat.shape | ||
|
||
y = polish_dataset(sce, reformat_assay_by_density=None) | ||
assert not sparse.issparse(y.assays["counts"]) | ||
|
||
|
||
def test_polish_dataset_converts_sparse_to_dense(): | ||
mat = np.random.poisson(3, (100, 10)) | ||
sce = SingleCellExperiment(assays={"counts": sparse.csr_matrix(mat)}) | ||
|
||
y = polish_dataset(sce) | ||
assert not sparse.issparse(y.assays["counts"]) | ||
|
||
y = polish_dataset(sce, reformat_assay_by_density=1) | ||
assert sparse.issparse(y.assays["counts"]) | ||
|
||
y = polish_dataset(sce, reformat_assay_by_density=None) | ||
assert sparse.issparse(y.assays["counts"]) | ||
|
||
|
||
def test_polish_dataset_attempts_integer_conversions(): | ||
mat = np.random.poisson(3, (100, 10)).astype(float) | ||
sce = SingleCellExperiment(assays={"counts": mat}) | ||
|
||
y = polish_dataset(sce) | ||
assert y.assays["counts"].dtype == np.int_ | ||
|
||
mat = np.random.poisson(0.1, (100, 10)).astype(float) | ||
sce = SingleCellExperiment(assays={"counts": sparse.csr_matrix(mat)}) | ||
|
||
y = polish_dataset(sce) | ||
assert sparse.issparse(y.assays["counts"]) | ||
assert y.assays["counts"].dtype == np.int_ | ||
|
||
mat = np.random.poisson(3, (100, 10)) * 1.5 | ||
sce = SingleCellExperiment(assays={"counts": mat}) | ||
|
||
y = polish_dataset(sce) | ||
assert y.assays["counts"].dtype == np.float_ | ||
|
||
|
||
def test_polish_dataset_works_with_na_values(): | ||
mat = np.random.poisson(0.1, (100, 10)) | ||
mat = mat.astype(np.float_) | ||
mat.ravel()[np.random.choice(mat.size, 10, replace=False)] = np.nan | ||
sce = SingleCellExperiment(assays={"counts": sparse.csr_matrix(mat)}) | ||
|
||
y = polish_dataset(sce) | ||
assert sparse.issparse(y.assays["counts"]) | ||
assert y.assays["counts"].dtype == np.float_ | ||
|
||
|
||
def test_polish_dataset_forbids_highly_nested_altexps(): | ||
mat = np.random.poisson(1, (100, 10)) | ||
sce2 = SingleCellExperiment(assays={"counts": mat[:5] + 2}) | ||
|
||
sce1 = SingleCellExperiment( | ||
assays={"counts": mat[:10] + 1}, alternative_experiments={"rando": sce2} | ||
) | ||
|
||
sce0 = SingleCellExperiment( | ||
assays={"counts": mat}, alternative_experiments={"rando": sce1} | ||
) | ||
|
||
with pytest.raises(Exception): | ||
polish_dataset(sce0) | ||
|
||
y = polish_dataset(sce0, forbid_nested_altexp=False) | ||
assert ( | ||
y.get_alternative_experiment_names() == sce0.get_alternative_experiment_names() | ||
) | ||
|
||
|
||
def test_polish_existing_dataset(): | ||
sce = fetch_dataset("zeisel-brain-2015", "2023-12-14") | ||
|
||
y = polish_dataset(sce) | ||
|
||
assert y.shape == sce.shape | ||
assert type(y.assays["counts"]) != type(sce.assays["counts"]) |