Migrate to using celldex to fetch reference datasets (#25)

* Removes code that fetches references from GitHub built for kana.
* Support test_data and ref_data to be SummarizedExperiment or its derivatives.
* Include tests to support newer use cases.
* Update README and docstrings.
* Update package dependencies.
SingleR-inc · Jun 7, 2024 · 9dd0d2e · 9dd0d2e
1 parent 04350c9
commit 9dd0d2e
Show file tree

Hide file tree

Showing 13 changed files with 285 additions and 492 deletions.
diff --git a/README.md b/README.md
@@ -28,22 +28,28 @@ Firstly, let's load in the famous PBMC 4k dataset from 10X Genomics:
 
 ```python
 import singlecellexperiment as sce
-data = sce.read_tenx_h5("pbmc4k-tenx.h5")
+data = sce.read_tenx_h5("pbmc4k-tenx.h5", realize_assays=True)
 mat = data.assay("counts")
 features = [str(x) for x in data.row_data["name"]]
 ```
 
-Now we use the Blueprint/ENCODE reference to annotate each cell in `mat`:
+Now, we fetch the Blueprint/ENCODE reference:
+
+```python
+import celldex
+
+ref_data = celldex.fetch_reference("blueprint_encode", "2024-02-26", realize_assays=True)
+```
+
+We can annotate each cell in `mat` with the reference:
 
 ```python
 import singler
 results = singler.annotate_single(
-    mat,
-    features,
-    ref_data = "BlueprintEncode",
-    ref_features = "symbol",
-    ref_labels = "main",
-    cache_dir = "_cache"
+    test_data = mat,
+    test_features = features,
+    ref_data = ref_data,
+    ref_labels = "label.main",
 )
 ```
 
@@ -74,34 +80,12 @@ The `annotate_single()` function is a convenient wrapper around a number of lowe
 Advanced users may prefer to build the reference and run the classification separately.
 This allows us to re-use the same reference for multiple datasets without repeating the build step.
 
-We start by fetching the reference of interest from [GitHub](https://github.com/kanaverse/singlepp-references).
-Note the use of `cache_dir` to avoid repeated downloads from GitHub.
-
-```python
-ref = singler.fetch_github_reference("BlueprintEncode", cache_dir="_cache")
-```
-
-We'll be using the gene symbols here with the markers for the main labels.
-We need to set `restrict_to` to the features in our test data, so as to avoid picking marker genes in the reference that won't be present in the test.
-
-```python
-ref_features = ref.row_data.column("symbol")
-
-markers = singler.realize_github_markers(
-    ref.metadata["main"],
-    ref_features,
-    restrict_to=set(features),
-)
-```
-
-Now we build the reference from the ranked expression values and the associated labels in the reference:
-
 ```python
 built = singler.build_single_reference(
-    ref_data=ref.assay("ranks"),
-    ref_labels=ref.col_data.column("main"),
-    ref_features=ref_features,
-    markers=markers,
+    ref_data=ref_data.assay("logcounts"),
+    ref_labels=ref_data.col_data.column("label.main"),
+    ref_features=ref_data.get_row_names(),
+    restrict_to=features,
 )
 ```
 
@@ -134,14 +118,17 @@ We can use annotations from multiple references through the `annotate_integrated
 
 ```python
 import singler
+import celldex
+
+blueprint_ref = celldex.fetch_reference("blueprint_encode", "2024-02-26", realize_assays=True)
+
+immune_cell_ref = celldex.fetch_reference("dice", "2024-02-26", realize_assays=True)
+
 single_results, integrated = singler.annotate_integrated(
     mat,
     features,
-    ref_data_list = ("BlueprintEncode", "DatabaseImmuneCellExpression"),
-    ref_features_list= "symbol",
-    ref_labels_list = "main",
-    build_integrated_args = { "ref_names": ("Blueprint", "DICE") },
-    cache_dir = "_cache",
+    ref_data_list = (blueprint_ref, immune_cell_ref),
+    ref_labels_list = "label.main",
     num_threads = 6
 )
 ```

diff --git a/setup.cfg b/setup.cfg
@@ -54,6 +54,7 @@ install_requires =
     delayedarray
     biocframe>=0.5.0
     summarizedexperiment>=0.4.0
+    singlecellexperiment>=0.4.6
     biocutils
 
 [options.packages.find]
@@ -71,6 +72,9 @@ testing =
     setuptools
     pytest
     pytest-cov
+    celldex
+    scrnaseq
+    scipy
 
 [options.entry_points]
 # Add here console scripts like:

diff --git a/src/singler/__init__.py b/src/singler/__init__.py
@@ -16,11 +16,10 @@
     del version, PackageNotFoundError
 
 
-from .get_classic_markers import get_classic_markers, number_of_classic_markers
+from .annotate_integrated import annotate_integrated
+from .annotate_single import annotate_single
+from .build_integrated_references import IntegratedReferences, build_integrated_references
 from .build_single_reference import build_single_reference
-from .build_integrated_references import build_integrated_references, IntegratedReferences
-from .classify_single_reference import classify_single_reference
 from .classify_integrated_references import classify_integrated_references
-from .fetch_reference import fetch_github_reference, realize_github_markers
-from .annotate_single import annotate_single
-from .annotate_integrated import annotate_integrated
+from .classify_single_reference import classify_single_reference
+from .get_classic_markers import get_classic_markers, number_of_classic_markers
diff --git a/src/singler/_utils.py b/src/singler/_utils.py
@@ -74,11 +74,18 @@ def _clean_matrix(x, features, assay_type, check_missing, num_threads):
         return x, features
 
     if isinstance(x, SummarizedExperiment):
+        if features is None:
+            features = x.get_row_names()
+        elif isinstance(features, str):
+            features = x.get_row_data().column(features)
+        features = list(features)
+
         x = x.assay(assay_type)
 
     curshape = x.shape
     if len(curshape) != 2:
         raise ValueError("each entry of 'ref' should be a 2-dimensional array")
+
     if curshape[0] != len(features):
         raise ValueError(
             "number of rows of 'x' should be equal to the length of 'features'"

diff --git a/src/singler/annotate_integrated.py b/src/singler/annotate_integrated.py
@@ -3,7 +3,7 @@
 from biocframe import BiocFrame
 
 from ._utils import _clean_matrix
-from .annotate_single import _attach_markers, _resolve_reference
+from .annotate_single import _resolve_reference
 from .build_integrated_references import build_integrated_references
 from .build_single_reference import build_single_reference
 from .classify_integrated_references import classify_integrated_references
@@ -12,15 +12,14 @@
 
 def annotate_integrated(
     test_data: Any,
-    test_features: Sequence,
     ref_data_list: Sequence[Union[Any, str]],
-    ref_labels_list: Union[str, Sequence[Union[Sequence, str]]],
-    ref_features_list: Union[str, Sequence[Union[Sequence, str]]],
+    test_features: Optional[Union[Sequence, str]] = None,
+    ref_labels_list: Optional[Union[Optional[str], Sequence[Union[Sequence, str]]]] = None,
+    ref_features_list: Optional[Union[Optional[str], Sequence[Union[Sequence, str]]]] = None,
     test_assay_type: Union[str, int] = 0,
     test_check_missing: bool = True,
     ref_assay_type: Union[str, int] = "logcounts",
     ref_check_missing: bool = True,
-    cache_dir: Optional[str] = None,
     build_single_args: dict = {},
     classify_single_args: dict = {},
     build_integrated_args: dict = {},
@@ -45,6 +44,11 @@ def annotate_integrated(
             Sequence of length equal to the number of rows in
             ``test_data``, containing the feature identifier for each row.
 
+            Alternatively, if ``test_data`` is a ``SummarizedExperiment``, ``test_features``
+            may be a string specifying the column name in `row_data` that contains the
+            features. It can also be set to `None`, to use the `row_names` of the
+            experiment as features.
+
         ref_data_list:
             Sequence consisting of one or more of the following:
 
@@ -69,6 +73,10 @@ def annotate_integrated(
             - If ``ref_data_list[i]`` is a string, ``ref_labels_list[i]`` should be a string
               specifying the label type to use, e.g., "main", "fine", "ont".
               If a single string is supplied, it is recycled for all ``ref_data``.
+            - If ``ref_data_list[i]`` is a ``SummarizedExperiment``, ``ref_labels_list[i]``
+              may be a string specifying the column name in `column_data` that contains the
+              labels. It can also be set to `None`, to use the `column_names` of the
+              experiment as labels.
 
         ref_features_list:
             Sequence of the same length as ``ref_data_list``, where the contents
@@ -80,6 +88,10 @@ def annotate_integrated(
             - If ``ref_data_list[i]`` is a string, ``ref_features_list[i]`` should be a string
               specifying the feature type to use, e.g., "ensembl", "symbol".
               If a single string is supplied, it is recycled for all ``ref_data``.
+            - If ``ref_data_list[i]`` is a ``SummarizedExperiment``, ``ref_features_list[i]``
+              may be a string specifying the column name in `row_data` that contains the
+              features. It can also be set to `None`, to use the `row_names` of the
+              experiment as features.
 
         test_assay_type:
             Assay of ``test_data`` containing the expression matrix, if ``test_data`` is a
@@ -95,11 +107,6 @@ def annotate_integrated(
         ref_check_missing:
             Whether to check for and remove missing (i.e., NaN) values from the reference datasets.
 
-        cache_dir:
-            Path to a cache directory for downloading reference files, see
-            :py:meth:`~singler.fetch_reference.fetch_github_reference` for details.
-            Only used if ``ref_data`` is a string.
-
         build_single_args:
             Further arguments to pass to
             :py:meth:`~singler.build_single_reference.build_single_reference`.
@@ -128,18 +135,22 @@ def annotate_integrated(
         :py:meth:`~singler.classify_integrated_references.classify_integrated_references`).
     """
     nrefs = len(ref_data_list)
+
     if isinstance(ref_labels_list, str):
         ref_labels_list = [ref_labels_list] * nrefs
-    elif nrefs != len(ref_labels_list):
-        raise ValueError(
-            "'ref_data_list' and 'ref_labels_list' must be the same length"
-        )
+    elif ref_labels_list is None:
+        ref_labels_list = [None] * nrefs
+
+    if nrefs != len(ref_labels_list):
+        raise ValueError("'ref_data_list' and 'ref_labels_list' must be the same length")
+
     if isinstance(ref_features_list, str):
         ref_features_list = [ref_features_list] * nrefs
-    elif nrefs != len(ref_features_list):
-        raise ValueError(
-            "'ref_data_list' and 'ref_features_list' must be the same length"
-        )
+    elif ref_features_list is None:
+        ref_features_list = [None] * nrefs
+
+    if nrefs != len(ref_features_list):
+        raise ValueError("'ref_data_list' and 'ref_features_list' must be the same length")
 
     test_ptr, test_features = _clean_matrix(
         test_data,
@@ -157,13 +168,11 @@ def annotate_integrated(
     test_features_set = set(test_features)
 
     for r in range(nrefs):
-        curref_mat, curref_labels, curref_features, curref_markers = _resolve_reference(
+        curref_mat, curref_labels, curref_features = _resolve_reference(
             ref_data=ref_data_list[r],
             ref_labels=ref_labels_list[r],
             ref_features=ref_features_list[r],
-            cache_dir=cache_dir,
             build_args=build_single_args,
-            test_features_set=test_features_set,
         )
 
         curref_ptr, curref_features = _clean_matrix(
@@ -174,13 +183,12 @@ def annotate_integrated(
             num_threads=num_threads,
         )
 
-        bargs = _attach_markers(curref_markers, build_single_args)
         curbuilt = build_single_reference(
             ref_data=curref_ptr,
             ref_labels=curref_labels,
             ref_features=curref_features,
             restrict_to=test_features_set,
-            **bargs,
+            **build_single_args,
             num_threads=num_threads,
         )