update tests for integrated and also the README

SingleR-inc · Jun 7, 2024 · 040d983 · 040d983
1 parent 680f92e
commit 040d983
Show file tree

Hide file tree

Showing 5 changed files with 157 additions and 73 deletions.
diff --git a/README.md b/README.md
@@ -31,19 +31,28 @@ import singlecellexperiment as sce
 data = sce.read_tenx_h5("pbmc4k-tenx.h5")
 mat = data.assay("counts")
 features = [str(x) for x in data.row_data["name"]]
+
+import delayedarray as da
+mat_csr = da.to_scipy_sparse_matrix(mat, "csr")
+```
+
+Now, we fetch the Blueprint/ENCODE reference:
+
+```python
+import celldex
+
+ref_data = celldex.fetch_reference("blueprint_encode", "2024-02-26", realize_assays=True)
 ```
 
-Now we use the Blueprint/ENCODE reference to annotate each cell in `mat`:
+We can annotate each cell in `mat` with the reference:
 
 ```python
 import singler
 results = singler.annotate_single(
-    mat,
-    features,
-    ref_data = "BlueprintEncode",
-    ref_features = "symbol",
-    ref_labels = "main",
-    cache_dir = "_cache"
+    test_data = mat_csr,
+    test_features = features,
+    ref_data = ref_data,
+    ref_labels = "label.main",
 )
 ```
 
@@ -74,34 +83,12 @@ The `annotate_single()` function is a convenient wrapper around a number of lowe
 Advanced users may prefer to build the reference and run the classification separately.
 This allows us to re-use the same reference for multiple datasets without repeating the build step.
 
-We start by fetching the reference of interest from [GitHub](https://github.com/kanaverse/singlepp-references).
-Note the use of `cache_dir` to avoid repeated downloads from GitHub.
-
-```python
-ref = singler.fetch_github_reference("BlueprintEncode", cache_dir="_cache")
-```
-
-We'll be using the gene symbols here with the markers for the main labels.
-We need to set `restrict_to` to the features in our test data, so as to avoid picking marker genes in the reference that won't be present in the test.
-
-```python
-ref_features = ref.row_data.column("symbol")
-
-markers = singler.realize_github_markers(
-    ref.metadata["main"],
-    ref_features,
-    restrict_to=set(features),
-)
-```
-
-Now we build the reference from the ranked expression values and the associated labels in the reference:
-
 ```python
 built = singler.build_single_reference(
-    ref_data=ref.assay("ranks"),
-    ref_labels=ref.col_data.column("main"),
-    ref_features=ref_features,
-    markers=markers,
+    ref_data=ref_data.assay("logcounts"),
+    ref_labels=ref_data.col_data.column("label.main"),
+    ref_features=ref_data.get_row_names(),
+    restrict_to=features,
 )
 ```
 
@@ -110,7 +97,7 @@ This can be repeated with different datasets that have the same features or a su
 
 ```python
 output = singler.classify_single_reference(
-    mat,
+    mat_csr,
     test_features=features,
     ref_prebuilt=built,
 )
@@ -134,14 +121,17 @@ We can use annotations from multiple references through the `annotate_integrated
 
 ```python
 import singler
+import celldex
+
+blueprint_ref = celldex.fetch_reference("blueprint_encode", "2024-02-26", realize_assays=True)
+
+immune_cell_ref = celldex.fetch_reference("dice", "2024-02-26", realize_assays=True)
+
 single_results, integrated = singler.annotate_integrated(
     mat,
     features,
-    ref_data_list = ("BlueprintEncode", "DatabaseImmuneCellExpression"),
-    ref_features_list= "symbol",
-    ref_labels_list = "main",
-    build_integrated_args = { "ref_names": ("Blueprint", "DICE") },
-    cache_dir = "_cache",
+    ref_data_list = (blueprint_ref, immune_cell_ref),
+    ref_labels_list = "label.main",
     num_threads = 6
 )
 ```

diff --git a/src/singler/annotate_integrated.py b/src/singler/annotate_integrated.py
@@ -12,15 +12,14 @@
 
 def annotate_integrated(
     test_data: Any,
-    test_features: Sequence,
     ref_data_list: Sequence[Union[Any, str]],
-    ref_labels_list: Union[str, Sequence[Union[Sequence, str]]],
-    ref_features_list: Union[str, Sequence[Union[Sequence, str]]],
+    test_features: Optional[Union[Sequence, str]] = None,
+    ref_labels_list: Optional[Union[Optional[str], Sequence[Union[Sequence, str]]]] = None,
+    ref_features_list: Optional[Union[Optional[str], Sequence[Union[Sequence, str]]]] = None,
     test_assay_type: Union[str, int] = 0,
     test_check_missing: bool = True,
     ref_assay_type: Union[str, int] = "logcounts",
     ref_check_missing: bool = True,
-    cache_dir: Optional[str] = None,
     build_single_args: dict = {},
     classify_single_args: dict = {},
     build_integrated_args: dict = {},
@@ -45,6 +44,11 @@ def annotate_integrated(
             Sequence of length equal to the number of rows in
             ``test_data``, containing the feature identifier for each row.
 
+            Alternatively, if ``test_data`` is a ``SummarizedExperiment``, ``test_features``
+            may be a string specifying the column name in `row_data` that contains the
+            features. It can also be set to `None`, to use the `row_names` of the
+            experiment as features.
+
         ref_data_list:
             Sequence consisting of one or more of the following:
 
@@ -69,6 +73,10 @@ def annotate_integrated(
             - If ``ref_data_list[i]`` is a string, ``ref_labels_list[i]`` should be a string
               specifying the label type to use, e.g., "main", "fine", "ont".
               If a single string is supplied, it is recycled for all ``ref_data``.
+            - If ``ref_data_list[i]`` is a ``SummarizedExperiment``, ``ref_labels_list[i]``
+              may be a string specifying the column name in `column_data` that contains the
+              labels. It can also be set to `None`, to use the `column_names` of the
+              experiment as labels.
 
         ref_features_list:
             Sequence of the same length as ``ref_data_list``, where the contents
@@ -80,6 +88,10 @@ def annotate_integrated(
             - If ``ref_data_list[i]`` is a string, ``ref_features_list[i]`` should be a string
               specifying the feature type to use, e.g., "ensembl", "symbol".
               If a single string is supplied, it is recycled for all ``ref_data``.
+            - If ``ref_data_list[i]`` is a ``SummarizedExperiment``, ``ref_features_list[i]``
+              may be a string specifying the column name in `row_data` that contains the
+              features. It can also be set to `None`, to use the `row_names` of the
+              experiment as features.
 
         test_assay_type:
             Assay of ``test_data`` containing the expression matrix, if ``test_data`` is a
@@ -95,11 +107,6 @@ def annotate_integrated(
         ref_check_missing:
             Whether to check for and remove missing (i.e., NaN) values from the reference datasets.
 
-        cache_dir:
-            Path to a cache directory for downloading reference files, see
-            :py:meth:`~singler.fetch_reference.fetch_github_reference` for details.
-            Only used if ``ref_data`` is a string.
-
         build_single_args:
             Further arguments to pass to
             :py:meth:`~singler.build_single_reference.build_single_reference`.
@@ -131,12 +138,18 @@ def annotate_integrated(
 
     if isinstance(ref_labels_list, str):
         ref_labels_list = [ref_labels_list] * nrefs
-    elif nrefs != len(ref_labels_list):
+    elif ref_labels_list is None:
+        ref_labels_list = [None] * nrefs
+
+    if nrefs != len(ref_labels_list):
         raise ValueError("'ref_data_list' and 'ref_labels_list' must be the same length")
 
     if isinstance(ref_features_list, str):
         ref_features_list = [ref_features_list] * nrefs
-    elif nrefs != len(ref_features_list):
+    elif ref_features_list is None:
+        ref_features_list = [None] * nrefs
+
+    if nrefs != len(ref_features_list):
         raise ValueError("'ref_data_list' and 'ref_features_list' must be the same length")
 
     test_ptr, test_features = _clean_matrix(

diff --git a/src/singler/annotate_single.py b/src/singler/annotate_single.py
@@ -1,3 +1,4 @@
+import warnings
 from typing import Any, Optional, Sequence, Union
 
 from biocframe import BiocFrame
@@ -8,9 +9,7 @@
 
 
 def _resolve_reference(ref_data, ref_labels, ref_features, build_args):
-    if isinstance(ref_data, SummarizedExperiment) or issubclass(
-        type(ref_data), SummarizedExperiment
-    ):
+    if isinstance(ref_data, SummarizedExperiment) or issubclass(type(ref_data), SummarizedExperiment):
         if ref_features is None:
             ref_features = ref_data.get_row_names()
         elif isinstance(ref_features, str):
@@ -32,9 +31,7 @@ def _resolve_reference(ref_data, ref_labels, ref_features, build_args):
 
             ref_data = ref_data.assay(_default_asy)
         except Exception as _:
-            raise ValueError(
-                f"Reference dataset must contain log-normalized count ('{_default_asy}') assay."
-            )
+            raise ValueError(f"Reference dataset must contain log-normalized count ('{_default_asy}') assay.")
 
     if ref_labels is None:
         raise ValueError("'ref_labels' cannot be `None`.")
@@ -73,10 +70,10 @@ def annotate_single(
             Sequence of length equal to the number of rows in
             ``test_data``, containing the feature identifier for each row.
 
-            If ``test_data`` is a ``SummarizedExperiment``, ``test_features``
-            may be a string speciying the column name in `row_data`that contains the
-            features. Alternatively can be set to `None`, to use the `row_names` of
-            the experiment as used as features.
+            Alternatively, if ``test_data`` is a ``SummarizedExperiment``, ``test_features``
+            may be a string specifying the column name in `row_data` that contains the
+            features. It can also be set to `None`, to use the `row_names` of
+            the experiment as features.
 
         ref_data:
             A matrix-like object representing the reference dataset, where rows
@@ -94,20 +91,20 @@ def annotate_single(
             a sequence of length equal to the number of columns of ``ref_data``,
             containing the label associated with each column.
 
-            If ``ref_data`` is a ``SummarizedExperiment``, ``ref_labels``
-            may be a string specifying the label type to use,
-            e.g., "main", "fine", "ont". Alternatively can be set to
-            `None`, to use the `row_names` of the experiment as used as features.
+            Alternatively, if ``ref_data`` is a ``SummarizedExperiment``,
+            ``ref_labels`` may be a string specifying the column name in `column_data`
+            that contains the labels, e.g., "label.main". It can also be set to `None`,
+            to use the `column_names` of the experiment as labels.
 
         ref_features:
             If ``ref_data`` is a matrix-like object, ``ref_features`` should be
             a sequence of length equal to the number of rows of ``ref_data``,
             containing the feature identifier associated with each row.
 
-            If ``ref_data`` is a ``SummarizedExperiment``, ``ref_features``
-            may be a string speciying the column name in `column_data`
-            that contains the features. Alternatively can be set to
-            `None`, to use the `row_names` of the experiment as used as features.
+            Alternatively, if ``ref_data`` is a ``SummarizedExperiment``,
+            ``ref_features`` may be a string specifying the column name in `row_data`
+            that contains the features. It can also be set to `None`, to use the
+            `row_names` of the experiment as features.
 
         build_args:
             Further arguments to pass to
@@ -138,6 +135,10 @@ def annotate_single(
         raise ValueError("'test_features' cannot be `None`.")
 
     test_features_set = set(test_features)
+    if len(test_features_set) != len(test_features):
+        warnings.warn("'test_features' is not unique, subsetting test matrix...", UserWarning)
+        _idxs = [test_features.index(x) for x in test_features_set]
+        test_data = test_data[_idxs,]
 
     ref_data, ref_labels, ref_features = _resolve_reference(
         ref_data=ref_data,
@@ -157,7 +158,7 @@ def annotate_single(
 
     output = classify_single_reference(
         test_data,
-        test_features=test_features,
+        test_features=test_features_set,
         ref_prebuilt=built,
         **classify_args,
         num_threads=num_threads,

diff --git a/tests/test_integrated_with_celldex.py b/tests/test_integrated_with_celldex.py
@@ -0,0 +1,75 @@
+import singler
+import numpy
+import celldex
+import scrnaseq
+import pandas as pd
+import scipy
+import pytest
+from biocframe import BiocFrame
+
+
+def test_with_minimal_args():
+    sce = scrnaseq.fetch_dataset("zeisel-brain-2015", "2023-12-14", realize_assays=True)
+
+    blueprint_ref = celldex.fetch_reference(
+        "blueprint_encode", "2024-02-26", realize_assays=True
+    )
+    immune_cell_ref = celldex.fetch_reference("dice", "2024-02-26", realize_assays=True)
+
+    with pytest.raises(Exception):
+        singler.annotate_integrated(
+            test_data=sce.assays["counts"],
+            ref_data_list=(blueprint_ref, immune_cell_ref),
+            ref_labels_list="label.main",
+            num_threads=6,
+        )
+
+    single, integrated = singler.annotate_integrated(
+        test_data=sce,
+        ref_data_list=(blueprint_ref, immune_cell_ref),
+        ref_labels_list="label.main",
+        num_threads=6,
+    )
+    assert len(single) == 2
+    assert isinstance(integrated, BiocFrame)
+
+
+def test_with_all_supplied():
+    sce = scrnaseq.fetch_dataset("zeisel-brain-2015", "2023-12-14", realize_assays=True)
+
+    blueprint_ref = celldex.fetch_reference(
+        "blueprint_encode", "2024-02-26", realize_assays=True
+    )
+    immune_cell_ref = celldex.fetch_reference("dice", "2024-02-26", realize_assays=True)
+
+    single, integrated = singler.annotate_integrated(
+        test_data=sce,
+        test_features=sce.get_row_names(),
+        ref_data_list=(blueprint_ref, immune_cell_ref),
+        ref_labels_list=[
+            x.get_column_data().column("label.main")
+            for x in (blueprint_ref, immune_cell_ref)
+        ],
+        ref_features_list=[x.get_row_names() for x in (blueprint_ref, immune_cell_ref)],
+    )
+
+    assert len(single) == 2
+    assert isinstance(integrated, BiocFrame)
+
+
+def test_with_colname():
+    sce = scrnaseq.fetch_dataset("zeisel-brain-2015", "2023-12-14", realize_assays=True)
+
+    blueprint_ref = celldex.fetch_reference(
+        "blueprint_encode", "2024-02-26", realize_assays=True
+    )
+    immune_cell_ref = celldex.fetch_reference("dice", "2024-02-26", realize_assays=True)
+
+    single, integrated = singler.annotate_integrated(
+        test_data=sce,
+        ref_data_list=(blueprint_ref, immune_cell_ref),
+        ref_labels_list="label.main",
+    )
+
+    assert len(single) == 2
+    assert isinstance(integrated, BiocFrame)