Skip to content

Commit

Permalink
update tests for integrated and also the README
Browse files Browse the repository at this point in the history
  • Loading branch information
jkanche committed Jun 7, 2024
1 parent 680f92e commit 040d983
Show file tree
Hide file tree
Showing 5 changed files with 157 additions and 73 deletions.
68 changes: 29 additions & 39 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,19 +31,28 @@ import singlecellexperiment as sce
data = sce.read_tenx_h5("pbmc4k-tenx.h5")
mat = data.assay("counts")
features = [str(x) for x in data.row_data["name"]]

import delayedarray as da
mat_csr = da.to_scipy_sparse_matrix(mat, "csr")
```

Now, we fetch the Blueprint/ENCODE reference:

```python
import celldex

ref_data = celldex.fetch_reference("blueprint_encode", "2024-02-26", realize_assays=True)
```

Now we use the Blueprint/ENCODE reference to annotate each cell in `mat`:
We can annotate each cell in `mat` with the reference:

```python
import singler
results = singler.annotate_single(
mat,
features,
ref_data = "BlueprintEncode",
ref_features = "symbol",
ref_labels = "main",
cache_dir = "_cache"
test_data = mat_csr,
test_features = features,
ref_data = ref_data,
ref_labels = "label.main",
)
```

Expand Down Expand Up @@ -74,34 +83,12 @@ The `annotate_single()` function is a convenient wrapper around a number of lowe
Advanced users may prefer to build the reference and run the classification separately.
This allows us to re-use the same reference for multiple datasets without repeating the build step.

We start by fetching the reference of interest from [GitHub](https://github.com/kanaverse/singlepp-references).
Note the use of `cache_dir` to avoid repeated downloads from GitHub.

```python
ref = singler.fetch_github_reference("BlueprintEncode", cache_dir="_cache")
```

We'll be using the gene symbols here with the markers for the main labels.
We need to set `restrict_to` to the features in our test data, so as to avoid picking marker genes in the reference that won't be present in the test.

```python
ref_features = ref.row_data.column("symbol")

markers = singler.realize_github_markers(
ref.metadata["main"],
ref_features,
restrict_to=set(features),
)
```

Now we build the reference from the ranked expression values and the associated labels in the reference:

```python
built = singler.build_single_reference(
ref_data=ref.assay("ranks"),
ref_labels=ref.col_data.column("main"),
ref_features=ref_features,
markers=markers,
ref_data=ref_data.assay("logcounts"),
ref_labels=ref_data.col_data.column("label.main"),
ref_features=ref_data.get_row_names(),
restrict_to=features,
)
```

Expand All @@ -110,7 +97,7 @@ This can be repeated with different datasets that have the same features or a su

```python
output = singler.classify_single_reference(
mat,
mat_csr,
test_features=features,
ref_prebuilt=built,
)
Expand All @@ -134,14 +121,17 @@ We can use annotations from multiple references through the `annotate_integrated

```python
import singler
import celldex

blueprint_ref = celldex.fetch_reference("blueprint_encode", "2024-02-26", realize_assays=True)

immune_cell_ref = celldex.fetch_reference("dice", "2024-02-26", realize_assays=True)

single_results, integrated = singler.annotate_integrated(
mat,
features,
ref_data_list = ("BlueprintEncode", "DatabaseImmuneCellExpression"),
ref_features_list= "symbol",
ref_labels_list = "main",
build_integrated_args = { "ref_names": ("Blueprint", "DICE") },
cache_dir = "_cache",
ref_data_list = (blueprint_ref, immune_cell_ref),
ref_labels_list = "label.main",
num_threads = 6
)
```
Expand Down
35 changes: 24 additions & 11 deletions src/singler/annotate_integrated.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,14 @@

def annotate_integrated(
test_data: Any,
test_features: Sequence,
ref_data_list: Sequence[Union[Any, str]],
ref_labels_list: Union[str, Sequence[Union[Sequence, str]]],
ref_features_list: Union[str, Sequence[Union[Sequence, str]]],
test_features: Optional[Union[Sequence, str]] = None,
ref_labels_list: Optional[Union[Optional[str], Sequence[Union[Sequence, str]]]] = None,
ref_features_list: Optional[Union[Optional[str], Sequence[Union[Sequence, str]]]] = None,
test_assay_type: Union[str, int] = 0,
test_check_missing: bool = True,
ref_assay_type: Union[str, int] = "logcounts",
ref_check_missing: bool = True,
cache_dir: Optional[str] = None,
build_single_args: dict = {},
classify_single_args: dict = {},
build_integrated_args: dict = {},
Expand All @@ -45,6 +44,11 @@ def annotate_integrated(
Sequence of length equal to the number of rows in
``test_data``, containing the feature identifier for each row.
Alternatively, if ``test_data`` is a ``SummarizedExperiment``, ``test_features``
may be a string specifying the column name in `row_data` that contains the
features. It can also be set to `None`, to use the `row_names` of the
experiment as features.
ref_data_list:
Sequence consisting of one or more of the following:
Expand All @@ -69,6 +73,10 @@ def annotate_integrated(
- If ``ref_data_list[i]`` is a string, ``ref_labels_list[i]`` should be a string
specifying the label type to use, e.g., "main", "fine", "ont".
If a single string is supplied, it is recycled for all ``ref_data``.
- If ``ref_data_list[i]`` is a ``SummarizedExperiment``, ``ref_labels_list[i]``
may be a string specifying the column name in `column_data` that contains the
labels. It can also be set to `None`, to use the `column_names` of the
experiment as features.
ref_features_list:
Sequence of the same length as ``ref_data_list``, where the contents
Expand All @@ -80,6 +88,10 @@ def annotate_integrated(
- If ``ref_data_list[i]`` is a string, ``ref_features_list[i]`` should be a string
specifying the feature type to use, e.g., "ensembl", "symbol".
If a single string is supplied, it is recycled for all ``ref_data``.
- If ``ref_data_list[i]`` is a ``SummarizedExperiment``, ``ref_features_list[i]``
may be a string specifying the column name in `row_data` that contains the
features. It can also be set to `None`, to use the `row_names` of the
experiment as features.
test_assay_type:
Assay of ``test_data`` containing the expression matrix, if ``test_data`` is a
Expand All @@ -95,11 +107,6 @@ def annotate_integrated(
ref_check_missing:
Whether to check for and remove missing (i.e., NaN) values from the reference datasets.
cache_dir:
Path to a cache directory for downloading reference files, see
:py:meth:`~singler.fetch_reference.fetch_github_reference` for details.
Only used if ``ref_data`` is a string.
build_single_args:
Further arguments to pass to
:py:meth:`~singler.build_single_reference.build_single_reference`.
Expand Down Expand Up @@ -131,12 +138,18 @@ def annotate_integrated(

if isinstance(ref_labels_list, str):
ref_labels_list = [ref_labels_list] * nrefs
elif nrefs != len(ref_labels_list):
elif ref_labels_list is None:
ref_labels_list = [None] * nrefs

if nrefs != len(ref_labels_list):
raise ValueError("'ref_data_list' and 'ref_labels_list' must be the same length")

if isinstance(ref_features_list, str):
ref_features_list = [ref_features_list] * nrefs
elif nrefs != len(ref_features_list):
elif ref_features_list is None:
ref_features_list = [None] * nrefs

if nrefs != len(ref_features_list):
raise ValueError("'ref_data_list' and 'ref_features_list' must be the same length")

test_ptr, test_features = _clean_matrix(
Expand Down
39 changes: 20 additions & 19 deletions src/singler/annotate_single.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import warnings
from typing import Any, Optional, Sequence, Union

from biocframe import BiocFrame
Expand All @@ -8,9 +9,7 @@


def _resolve_reference(ref_data, ref_labels, ref_features, build_args):
if isinstance(ref_data, SummarizedExperiment) or issubclass(
type(ref_data), SummarizedExperiment
):
if isinstance(ref_data, SummarizedExperiment) or issubclass(type(ref_data), SummarizedExperiment):
if ref_features is None:
ref_features = ref_data.get_row_names()
elif isinstance(ref_features, str):
Expand All @@ -32,9 +31,7 @@ def _resolve_reference(ref_data, ref_labels, ref_features, build_args):

ref_data = ref_data.assay(_default_asy)
except Exception as _:
raise ValueError(
f"Reference dataset must contain log-normalized count ('{_default_asy}') assay."
)
raise ValueError(f"Reference dataset must contain log-normalized count ('{_default_asy}') assay.")

if ref_labels is None:
raise ValueError("'ref_labels' cannot be `None`.")
Expand Down Expand Up @@ -73,10 +70,10 @@ def annotate_single(
Sequence of length equal to the number of rows in
``test_data``, containing the feature identifier for each row.
If ``test_data`` is a ``SummarizedExperiment``, ``test_features``
may be a string speciying the column name in `row_data`that contains the
features. Alternatively can be set to `None`, to use the `row_names` of
the experiment as used as features.
Alternatively, if ``test_data`` is a ``SummarizedExperiment``, ``test_features``
may be a string specifying the column name in `row_data` that contains the
features. It can also be set to `None`, to use the `row_names` of
the experiment as features.
ref_data:
A matrix-like object representing the reference dataset, where rows
Expand All @@ -94,20 +91,20 @@ def annotate_single(
a sequence of length equal to the number of columns of ``ref_data``,
containing the label associated with each column.
If ``ref_data`` is a ``SummarizedExperiment``, ``ref_labels``
may be a string specifying the label type to use,
e.g., "main", "fine", "ont". Alternatively can be set to
`None`, to use the `row_names` of the experiment as used as features.
Alternatively, if ``ref_data`` is a ``SummarizedExperiment``,
``ref_labels`` may be a string specifying the label type to use,
e.g., "main", "fine", "ont". Unlike ``ref_features``, it cannot be
`None`; a ``ValueError`` is raised if no labels are supplied.
ref_features:
If ``ref_data`` is a matrix-like object, ``ref_features`` should be
a sequence of length equal to the number of rows of ``ref_data``,
containing the feature identifier associated with each row.
If ``ref_data`` is a ``SummarizedExperiment``, ``ref_features``
may be a string speciying the column name in `column_data`
that contains the features. Alternatively can be set to
`None`, to use the `row_names` of the experiment as used as features.
Alternatively, if ``ref_data`` is a ``SummarizedExperiment``,
``ref_features`` may be a string specifying the column name in `row_data`
that contains the features. It can also be set to `None`, to use the
`row_names` of the experiment as features.
build_args:
Further arguments to pass to
Expand Down Expand Up @@ -138,6 +135,10 @@ def annotate_single(
raise ValueError("'test_features' cannot be `None`.")

test_features_set = set(test_features)
if len(test_features_set) != len(test_features):
warnings.warn("'test_features' is not unique, subsetting test matrix...", UserWarning)
_idxs = [test_features.index(x) for x in test_features_set]
test_data = test_data[_idxs,]

ref_data, ref_labels, ref_features = _resolve_reference(
ref_data=ref_data,
Expand All @@ -157,7 +158,7 @@ def annotate_single(

output = classify_single_reference(
test_data,
test_features=test_features,
test_features=test_features_set,
ref_prebuilt=built,
**classify_args,
num_threads=num_threads,
Expand Down
75 changes: 75 additions & 0 deletions tests/test_integrated_with_celldex.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
import singler
import numpy
import celldex
import scrnaseq
import pandas as pd
import scipy
import pytest
from biocframe import BiocFrame


def test_with_minimal_args():
    """Run ``annotate_integrated`` with only data, references and a label column.

    The first call passes just the ``counts`` assay as ``test_data`` and is
    required to raise; the second passes the experiment object itself and is
    required to return one result per reference plus a ``BiocFrame``.
    """
    test_sce = scrnaseq.fetch_dataset("zeisel-brain-2015", "2023-12-14", realize_assays=True)

    references = (
        celldex.fetch_reference("blueprint_encode", "2024-02-26", realize_assays=True),
        celldex.fetch_reference("dice", "2024-02-26", realize_assays=True),
    )

    # Both calls share everything except the form of the test data.
    shared_kwargs = dict(
        ref_data_list=references,
        ref_labels_list="label.main",
        num_threads=6,
    )

    with pytest.raises(Exception):
        singler.annotate_integrated(test_data=test_sce.assays["counts"], **shared_kwargs)

    per_reference, combined = singler.annotate_integrated(test_data=test_sce, **shared_kwargs)

    assert len(per_reference) == 2
    assert isinstance(combined, BiocFrame)


def test_with_all_supplied():
    """Run ``annotate_integrated`` with features and labels passed explicitly.

    Test features come from the experiment's row names; for each reference the
    labels are the ``label.main`` column of its column data and the features
    are its row names.
    """
    test_sce = scrnaseq.fetch_dataset("zeisel-brain-2015", "2023-12-14", realize_assays=True)

    references = (
        celldex.fetch_reference("blueprint_encode", "2024-02-26", realize_assays=True),
        celldex.fetch_reference("dice", "2024-02-26", realize_assays=True),
    )

    # One entry per reference, in the same order as ``references``.
    label_columns = [ref.get_column_data().column("label.main") for ref in references]
    feature_names = [ref.get_row_names() for ref in references]

    per_reference, combined = singler.annotate_integrated(
        test_data=test_sce,
        test_features=test_sce.get_row_names(),
        ref_data_list=references,
        ref_labels_list=label_columns,
        ref_features_list=feature_names,
    )

    assert len(per_reference) == 2
    assert isinstance(combined, BiocFrame)


def test_with_colname():
    """Run ``annotate_integrated`` with the labels named by a single column string.

    ``ref_labels_list`` is the string ``"label.main"`` rather than a sequence,
    and no features are passed for the test data or the references.
    """
    test_sce = scrnaseq.fetch_dataset("zeisel-brain-2015", "2023-12-14", realize_assays=True)

    references = (
        celldex.fetch_reference("blueprint_encode", "2024-02-26", realize_assays=True),
        celldex.fetch_reference("dice", "2024-02-26", realize_assays=True),
    )

    outcome = singler.annotate_integrated(
        test_data=test_sce,
        ref_data_list=references,
        ref_labels_list="label.main",
    )
    per_reference, combined = outcome

    assert len(per_reference) == 2
    assert isinstance(combined, BiocFrame)
Loading

0 comments on commit 040d983

Please sign in to comment.