Skip to content

Commit

Permalink
Migrate to using celldex to fetch reference datasets (#25)
Browse files Browse the repository at this point in the history
* Removes code that fetches references from GitHub built for kana.
* Support test_data and ref_data to be SummarizedExperiment or its derivatives
* Include tests to support newer use cases
* Update README and docstrings
* Update package dependencies
  • Loading branch information
jkanche authored Jun 7, 2024
1 parent 04350c9 commit 9dd0d2e
Show file tree
Hide file tree
Showing 13 changed files with 285 additions and 492 deletions.
65 changes: 26 additions & 39 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,22 +28,28 @@ Firstly, let's load in the famous PBMC 4k dataset from 10X Genomics:

```python
import singlecellexperiment as sce
data = sce.read_tenx_h5("pbmc4k-tenx.h5")
data = sce.read_tenx_h5("pbmc4k-tenx.h5", realize_assays=True)
mat = data.assay("counts")
features = [str(x) for x in data.row_data["name"]]
```

Now we use the Blueprint/ENCODE reference to annotate each cell in `mat`:
Now, we fetch the Blueprint/ENCODE reference:

```python
import celldex

ref_data = celldex.fetch_reference("blueprint_encode", "2024-02-26", realize_assays=True)
```

We can annotate each cell in `mat` with the reference:

```python
import singler
results = singler.annotate_single(
mat,
features,
ref_data = "BlueprintEncode",
ref_features = "symbol",
ref_labels = "main",
cache_dir = "_cache"
test_data = mat,
test_features = features,
ref_data = ref_data,
ref_labels = "label.main",
)
```

Expand Down Expand Up @@ -74,34 +80,12 @@ The `annotate_single()` function is a convenient wrapper around a number of lowe
Advanced users may prefer to build the reference and run the classification separately.
This allows us to re-use the same reference for multiple datasets without repeating the build step.

We start by fetching the reference of interest from [GitHub](https://github.com/kanaverse/singlepp-references).
Note the use of `cache_dir` to avoid repeated downloads from GitHub.

```python
ref = singler.fetch_github_reference("BlueprintEncode", cache_dir="_cache")
```

We'll be using the gene symbols here with the markers for the main labels.
We need to set `restrict_to` to the features in our test data, so as to avoid picking marker genes in the reference that won't be present in the test.

```python
ref_features = ref.row_data.column("symbol")

markers = singler.realize_github_markers(
ref.metadata["main"],
ref_features,
restrict_to=set(features),
)
```

Now we build the reference from the ranked expression values and the associated labels in the reference:

```python
built = singler.build_single_reference(
ref_data=ref.assay("ranks"),
ref_labels=ref.col_data.column("main"),
ref_features=ref_features,
markers=markers,
ref_data=ref_data.assay("logcounts"),
ref_labels=ref_data.col_data.column("label.main"),
ref_features=ref_data.get_row_names(),
restrict_to=features,
)
```

Expand Down Expand Up @@ -134,14 +118,17 @@ We can use annotations from multiple references through the `annotate_integrated

```python
import singler
import celldex

blueprint_ref = celldex.fetch_reference("blueprint_encode", "2024-02-26", realize_assays=True)

immune_cell_ref = celldex.fetch_reference("dice", "2024-02-26", realize_assays=True)

single_results, integrated = singler.annotate_integrated(
mat,
features,
ref_data_list = ("BlueprintEncode", "DatabaseImmuneCellExpression"),
ref_features_list= "symbol",
ref_labels_list = "main",
build_integrated_args = { "ref_names": ("Blueprint", "DICE") },
cache_dir = "_cache",
ref_data_list = (blueprint_ref, immune_cell_ref),
ref_labels_list = "label.main",
num_threads = 6
)
```
Expand Down
4 changes: 4 additions & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ install_requires =
delayedarray
biocframe>=0.5.0
summarizedexperiment>=0.4.0
singlecellexperiment>=0.4.6
biocutils

[options.packages.find]
Expand All @@ -71,6 +72,9 @@ testing =
setuptools
pytest
pytest-cov
celldex
scrnaseq
scipy

[options.entry_points]
# Add here console scripts like:
Expand Down
11 changes: 5 additions & 6 deletions src/singler/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,10 @@
del version, PackageNotFoundError


from .get_classic_markers import get_classic_markers, number_of_classic_markers
from .annotate_integrated import annotate_integrated
from .annotate_single import annotate_single
from .build_integrated_references import IntegratedReferences, build_integrated_references
from .build_single_reference import build_single_reference
from .build_integrated_references import build_integrated_references, IntegratedReferences
from .classify_single_reference import classify_single_reference
from .classify_integrated_references import classify_integrated_references
from .fetch_reference import fetch_github_reference, realize_github_markers
from .annotate_single import annotate_single
from .annotate_integrated import annotate_integrated
from .classify_single_reference import classify_single_reference
from .get_classic_markers import get_classic_markers, number_of_classic_markers
7 changes: 7 additions & 0 deletions src/singler/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,11 +74,18 @@ def _clean_matrix(x, features, assay_type, check_missing, num_threads):
return x, features

if isinstance(x, SummarizedExperiment):
if features is None:
features = x.get_row_names()
elif isinstance(features, str):
features = x.get_row_data().column(features)
features = list(features)

x = x.assay(assay_type)

curshape = x.shape
if len(curshape) != 2:
raise ValueError("each entry of 'ref' should be a 2-dimensional array")

if curshape[0] != len(features):
raise ValueError(
"number of rows of 'x' should be equal to the length of 'features'"
Expand Down
54 changes: 31 additions & 23 deletions src/singler/annotate_integrated.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from biocframe import BiocFrame

from ._utils import _clean_matrix
from .annotate_single import _attach_markers, _resolve_reference
from .annotate_single import _resolve_reference
from .build_integrated_references import build_integrated_references
from .build_single_reference import build_single_reference
from .classify_integrated_references import classify_integrated_references
Expand All @@ -12,15 +12,14 @@

def annotate_integrated(
test_data: Any,
test_features: Sequence,
ref_data_list: Sequence[Union[Any, str]],
ref_labels_list: Union[str, Sequence[Union[Sequence, str]]],
ref_features_list: Union[str, Sequence[Union[Sequence, str]]],
test_features: Optional[Union[Sequence, str]] = None,
ref_labels_list: Optional[Union[Optional[str], Sequence[Union[Sequence, str]]]] = None,
ref_features_list: Optional[Union[Optional[str], Sequence[Union[Sequence, str]]]] = None,
test_assay_type: Union[str, int] = 0,
test_check_missing: bool = True,
ref_assay_type: Union[str, int] = "logcounts",
ref_check_missing: bool = True,
cache_dir: Optional[str] = None,
build_single_args: dict = {},
classify_single_args: dict = {},
build_integrated_args: dict = {},
Expand All @@ -45,6 +44,11 @@ def annotate_integrated(
Sequence of length equal to the number of rows in
``test_data``, containing the feature identifier for each row.
Alternatively, if ``test_data`` is a ``SummarizedExperiment``, ``test_features``
may be a string specifying the column name in `row_data` that contains the
features. It can also be set to `None`, to use the `row_names` of the
experiment as features.
ref_data_list:
Sequence consisting of one or more of the following:
Expand All @@ -69,6 +73,10 @@ def annotate_integrated(
- If ``ref_data_list[i]`` is a string, ``ref_labels_list[i]`` should be a string
specifying the label type to use, e.g., "main", "fine", "ont".
If a single string is supplied, it is recycled for all ``ref_data``.
- If ``ref_data_list[i]`` is a ``SummarizedExperiment``, ``ref_labels_list[i]``
may be a string specifying the column name in `column_data` that contains the
labels. It can also be set to `None`, to use the `column_names` of the
experiment as labels.
ref_features_list:
Sequence of the same length as ``ref_data_list``, where the contents
Expand All @@ -80,6 +88,10 @@ def annotate_integrated(
- If ``ref_data_list[i]`` is a string, ``ref_features_list[i]`` should be a string
specifying the feature type to use, e.g., "ensembl", "symbol".
If a single string is supplied, it is recycled for all ``ref_data``.
- If ``ref_data_list[i]`` is a ``SummarizedExperiment``, ``ref_features_list[i]``
may be a string specifying the column name in `row_data` that contains the
features. It can also be set to `None`, to use the `row_names` of the
experiment as features.
test_assay_type:
Assay of ``test_data`` containing the expression matrix, if ``test_data`` is a
Expand All @@ -95,11 +107,6 @@ def annotate_integrated(
ref_check_missing:
Whether to check for and remove missing (i.e., NaN) values from the reference datasets.
cache_dir:
Path to a cache directory for downloading reference files, see
:py:meth:`~singler.fetch_reference.fetch_github_reference` for details.
Only used if ``ref_data`` is a string.
build_single_args:
Further arguments to pass to
:py:meth:`~singler.build_single_reference.build_single_reference`.
Expand Down Expand Up @@ -128,18 +135,22 @@ def annotate_integrated(
:py:meth:`~singler.classify_integrated_references.classify_integrated_references`).
"""
nrefs = len(ref_data_list)

if isinstance(ref_labels_list, str):
ref_labels_list = [ref_labels_list] * nrefs
elif nrefs != len(ref_labels_list):
raise ValueError(
"'ref_data_list' and 'ref_labels_list' must be the same length"
)
elif ref_labels_list is None:
ref_labels_list = [None] * nrefs

if nrefs != len(ref_labels_list):
raise ValueError("'ref_data_list' and 'ref_labels_list' must be the same length")

if isinstance(ref_features_list, str):
ref_features_list = [ref_features_list] * nrefs
elif nrefs != len(ref_features_list):
raise ValueError(
"'ref_data_list' and 'ref_features_list' must be the same length"
)
elif ref_features_list is None:
ref_features_list = [None] * nrefs

if nrefs != len(ref_features_list):
raise ValueError("'ref_data_list' and 'ref_features_list' must be the same length")

test_ptr, test_features = _clean_matrix(
test_data,
Expand All @@ -157,13 +168,11 @@ def annotate_integrated(
test_features_set = set(test_features)

for r in range(nrefs):
curref_mat, curref_labels, curref_features, curref_markers = _resolve_reference(
curref_mat, curref_labels, curref_features = _resolve_reference(
ref_data=ref_data_list[r],
ref_labels=ref_labels_list[r],
ref_features=ref_features_list[r],
cache_dir=cache_dir,
build_args=build_single_args,
test_features_set=test_features_set,
)

curref_ptr, curref_features = _clean_matrix(
Expand All @@ -174,13 +183,12 @@ def annotate_integrated(
num_threads=num_threads,
)

bargs = _attach_markers(curref_markers, build_single_args)
curbuilt = build_single_reference(
ref_data=curref_ptr,
ref_labels=curref_labels,
ref_features=curref_features,
restrict_to=test_features_set,
**bargs,
**build_single_args,
num_threads=num_threads,
)

Expand Down
Loading

0 comments on commit 9dd0d2e

Please sign in to comment.