Added an overlord function for the integrated annotation. (#20)

- Renamed annotate to annotate_single to distinguish it. - Factored out the reference string handler from annotate_single. - Minor fixes to classify_integrated to deal with BiocFrame inputs. - Updated the README.
SingleR-inc · Sep 19, 2023 · 7bbf662 · 7bbf662
1 parent 15dfdd5
commit 7bbf662
Show file tree

Hide file tree

Showing 7 changed files with 322 additions and 52 deletions.
diff --git a/README.md b/README.md
@@ -30,14 +30,14 @@ Firstly, let's load in the famous PBMC 4k dataset from 10X Genomics:
 import singlecellexperiment as sce
 data = sce.read_tenx_h5("pbmc4k-tenx.h5")
 mat = data.assay("counts")
-features = [x.encode("ASCII") for x in data.row_data["name"]]
+features = [str(x) for x in data.row_data["name"]]
 ```
 
 Now we use the Blueprint/ENCODE reference to annotate each cell in `mat`:
 
 ```python
 import singler
-results = singler.annotate(
+results = singler.annotate_single(
     mat,
     features,
     ref_data = "BlueprintEncode",
@@ -63,14 +63,14 @@ results.column("best")
 ##  ...
 ## ]
 
-results.column("scores").column("Macrophage")
+results.column("scores").column("Macrophages")
 ## array([0.35935275, 0.40833545, 0.37430726, ..., 0.32135929, 0.29728435,
 ##        0.40208581])
 ```
 
 ## Calling low-level functions
 
-The `annotate()` function is a convenient wrapper around a number of lower-level functions in **singler**.
+The `annotate_single()` function is a convenient wrapper around a number of lower-level functions in **singler**.
 Advanced users may prefer to build the reference and run the classification separately.
 This allows us to re-use the same reference for multiple datasets without repeating the build step.
 
@@ -116,6 +116,53 @@ output = singler.classify_single_reference(
 )
 ```
 
+## Integrating labels across references
+
+We can use annotations from multiple references through the `annotate_integrated()` function:
+
+```python
+import singler
+single_results, integrated = singler.annotate_integrated(
+    mat,
+    features,
+    ref_data = ("BlueprintEncode", "DatabaseImmuneCellExpression"),
+    ref_features = "symbol",
+    ref_labels = "main",
+    build_integrated_args = { "ref_names": ("Blueprint", "DICE") },
+    cache_dir = "_cache",
+    num_threads = 6
+)
+```
+
+This annotates the test dataset against each reference individually to obtain the best per-reference label,
+and then it compares across references to find the best label from all references.
+Both the single and integrated annotations are reported for diagnostics.
+
+```python
+integrated.column("best_label")
+## ['Monocytes', 
+##  'Monocytes',
+##  'Monocytes',
+##  'CD8+ T-cells',
+##  'CD4+ T-cells',
+##  'CD8+ T-cells',
+##  'Monocytes',
+##  'Monocytes',
+##  ...
+## ]
+
+integrated.column("best_reference")
+## ['Blueprint',
+## 'Blueprint',
+## 'Blueprint',
+## 'Blueprint',
+## 'Blueprint',
+## 'Blueprint',
+## 'Blueprint',
+## ...
+##]
+```
+
 ## Developer notes
 
 Build the shared object file:

diff --git a/src/singler/__init__.py b/src/singler/__init__.py
@@ -22,4 +22,5 @@
 from .classify_single_reference import classify_single_reference
 from .classify_integrated_references import classify_integrated_references
 from .fetch_reference import fetch_github_reference, realize_github_markers
-from .annotate import annotate
+from .annotate_single import annotate_single
+from .annotate_integrated import annotate_integrated
diff --git a/src/singler/annotate_integrated.py b/src/singler/annotate_integrated.py
@@ -0,0 +1,170 @@
+from typing import Union, Sequence, Optional, Any, Tuple
+from biocframe import BiocFrame
+
+from .fetch_reference import fetch_github_reference, realize_github_markers
+from .build_single_reference import build_single_reference
+from .classify_single_reference import classify_single_reference
+from .build_integrated_references import build_integrated_references
+from .classify_integrated_references import classify_integrated_references
+from .annotate_single import _build_reference
+
+
+def annotate_integrated(
+    test_data: Any,
+    test_features: Sequence,
+    ref_data: Sequence[Union[Any, str]],
+    ref_labels: Union[str, Sequence[Union[Sequence, str]]],
+    ref_features: Union[str, Sequence[Union[Sequence, str]]],
+    cache_dir: Optional[str] = None,
+    build_single_args={},
+    classify_single_args={},
+    build_integrated_args={},
+    classify_integrated_args={},
+    num_threads=1,
+) -> Tuple[list[BiocFrame], BiocFrame]:
+    """Annotate a single-cell expression dataset based on the correlation 
+    of each cell to profiles in multiple labelled references, where the
+    annotation from each reference is then integrated across references.
+
+    Args:
+        test_data: A matrix-like object representing the test dataset, where rows are
+            features and columns are samples (usually cells). Entries should be expression
+            values; only the ranking within each column will be used.
+
+        test_features (Sequence): Sequence of length equal to the number of rows in
+            ``test_data``, containing the feature identifier for each row.
+
+        ref_data: 
+            Sequence consisting of one or more of the following:
+
+            - A matrix-like object representing the reference dataset, where rows
+              are features and columns are samples. Entries should be expression values,
+              usually log-transformed (see comments for the ``ref`` argument in
+              :py:meth:`~singler.build_single_reference.build_single_reference`).
+            - A string that can be passed as ``name`` to
+              :py:meth:`~singler.fetch_reference.fetch_github_reference`.
+              This will use the specified dataset as the reference.
+
+        ref_labels (Union[str, Sequence[Union[Sequence, str]]]):
+            Sequence of the same length as ``ref_data``, where the contents
+            depend on the type of value in the corresponding entry of ``ref_data``:
+
+            - If ``ref_data[i]`` is a matrix-like object, ``ref_labels[i]`` should be
+              a sequence of length equal to the number of columns of ``ref_data[i]``,
+              containing the label associated with each column.
+            - If ``ref_data[i]`` is a string, ``ref_labels[i]`` should be a string
+              specifying the label type to use, e.g., "main", "fine", "ont".
+
+             If a single string is supplied, it is recycled for all ``ref_data``.
+
+        ref_features (Union[str, Sequence[Union[Sequence, str]]]):
+            Sequence of the same length as ``ref_data``, where the contents
+            depend on the type of value in the corresponding entry of ``ref_data``:
+
+            - If ``ref_data[i]`` is a matrix-like object, ``ref_features[i]`` should be
+              a sequence of length equal to the number of rows of ``ref_data``,
+              containing the feature identifier associated with each row.
+            - If ``ref_data[i]`` is a string, ``ref_features[i]`` should be a string
+              specifying the feature type to use, e.g., "ensembl", "symbol".
+
+             If a single string is supplied, it is recycled for all ``ref_data``.
+
+        cache_dir (str):
+            Path to a cache directory for downloading reference files, see
+            :py:meth:`~singler.fetch_reference.fetch_github_reference` for details.
+            Only used if ``ref_data`` is a string.
+
+        build_single_args (dict):
+            Further arguments to pass to
+            :py:meth:`~singler.build_single_reference.build_single_reference`.
+
+        classify_single_args (dict):
+            Further arguments to pass to
+            :py:meth:`~singler.classify_single_reference.classify_single_reference`.
+
+        build_integrated_args (dict):
+            Further arguments to pass to
+            :py:meth:`~singler.build_integrated_reference.build_integrated_reference`.
+
+        classify_integrated_args (dict):
+            Further arguments to pass to
+            :py:meth:`~singler.classify_integrated_reference.classify_integrated_reference`.
+
+        num_threads (int):
+            Number of threads to use for the various steps.
+
+    Returns:
+        Tuple[list[BiocFrame], BiocFrame]: Tuple where the first element
+        contains per-reference results (i.e. a list of BiocFrame outputs
+        equivalent to running
+        :py:meth:`~singler.annotate_single.annotate_single` on each reference)
+        and the second element contains integrated results across references
+        (i.e., a BiocFrame from
+        :py:meth:`~singler.classify_integrated_references.classify_integrated_references`).
+    """
+    nrefs = len(ref_data)
+    if isinstance(ref_labels, str):
+        ref_labels = [ref_labels] * nrefs
+    elif nrefs != len(ref_labels):
+        raise ValueError("'ref_data' and 'ref_labels' must be the same length")
+    if isinstance(ref_features, str):
+        ref_features = [ref_features] * nrefs
+    elif nrefs != len(ref_features):
+        raise ValueError("'ref_data' and 'ref_features' must be the same length")
+
+    all_ref_data = []
+    all_ref_labels = []
+    all_ref_features = []
+    all_built = []
+    all_results = []
+    test_features_set = set(test_features)
+
+    for r in range(nrefs):
+        curref_data, curref_labels, curref_features, curbuilt = _build_reference(
+            ref_data=ref_data[r],
+            ref_labels=ref_labels[r],
+            ref_features=ref_features[r],
+            test_features_set=test_features_set,
+            cache_dir=cache_dir,
+            build_args=build_single_args,
+            num_threads=num_threads,
+        )
+
+        res = classify_single_reference(
+            test_data,
+            test_features=test_features,
+            ref_prebuilt=curbuilt,
+            **classify_single_args,
+            num_threads=num_threads,
+        )
+
+        res.metadata = {
+            "markers": curbuilt.markers,
+            "unique_markers": curbuilt.marker_subset(),
+        }
+
+        all_ref_data.append(curref_data)
+        all_ref_labels.append(curref_labels)
+        all_ref_features.append(curref_features)
+        all_built.append(curbuilt)
+        all_results.append(res)
+
+    ibuilt = build_integrated_references(
+        test_features=test_features,
+        ref_data_list=all_ref_data,
+        ref_labels_list=all_ref_labels,
+        ref_features_list=all_ref_features,
+        ref_prebuilt_list=all_built,
+        num_threads=num_threads,
+        **build_integrated_args,
+    )
+
+    ires = classify_integrated_references(
+        test_data=test_data,
+        results=all_results,
+        integrated_prebuilt=ibuilt,
+        **classify_integrated_args,
+        num_threads=num_threads,
+    )
+
+    return all_results, ires
diff --git a/src/singler/annotate.py → src/singler/annotate_single.py b/src/singler/annotate.py → src/singler/annotate_single.py
@@ -6,7 +6,49 @@
 from .classify_single_reference import classify_single_reference
 
 
-def annotate(
+def _build_reference(ref_data, ref_labels, ref_features, test_features_set, cache_dir, build_args, num_threads):
+    if isinstance(ref_data, str):
+        ref = fetch_github_reference(ref_data, cache_dir=cache_dir)
+        ref_features = ref.row_data.column(ref_features)
+
+        num_de = None
+        if "marker_args" in build_args:
+            marker_args = build_args["marker_args"]
+            if "num_de" in marker_args:
+                num_de = marker_args["num_de"]
+
+        markers = realize_github_markers(
+            ref.metadata[ref_labels],
+            ref_features,
+            num_markers=num_de,
+            restrict_to=test_features_set,
+        )
+
+        ref_data = ref.assay("ranks")
+        ref_labels=ref.col_data.column(ref_labels)
+        built = build_single_reference(
+            ref_data=ref_data,
+            ref_labels=ref_labels,
+            ref_features=ref_features,
+            markers=markers,
+            num_threads=num_threads,
+            **build_args,
+        )
+
+    else:
+        built = build_single_reference(
+            ref_data=ref_data,
+            ref_labels=ref_labels,
+            ref_features=ref_features,
+            restrict_to=test_features_set,
+            num_threads=num_threads,
+            **build_args,
+        )
+
+    return ref_data, ref_labels, ref_features, built
+
+
+def annotate_single(
     test_data,
     test_features: Sequence,
     ref_data,
@@ -17,7 +59,8 @@ def annotate(
     classify_args={},
     num_threads=1,
 ) -> BiocFrame:
-    """Annotate an expression dataset based on the correlation to a labelled reference.
+    """Annotate a single-cell expression dataset based on the correlation 
+    of each cell to profiles in a labelled reference.
 
     Args:
         test_data: A matrix-like object representing the test dataset, where rows are
@@ -50,7 +93,7 @@ def annotate(
             containing the feature identifier associated with each row.
 
             If ``ref_data`` is a string, ``ref_features`` should be a string
-            specifying the label type to use, e.g., "main", "fine", "ont".
+            specifying the label type to use, e.g., "ensembl", "symbol".
 
         cache_dir (str):
             Path to a cache directory for downloading reference files, see
@@ -75,42 +118,15 @@ def annotate(
         specifying the markers that were used for each pairwise comparison
         between labels; and a list of ``unique_markers`` across all labels.
     """
-
-    if isinstance(ref_data, str):
-        ref = fetch_github_reference(ref_data, cache_dir=cache_dir)
-        ref_features = ref.row_data.column(ref_features)
-
-        num_de = None
-        if "marker_args" in build_args:
-            marker_args = build_args["marker_args"]
-            if "num_de" in marker_args:
-                num_de = marker_args["num_de"]
-
-        markers = realize_github_markers(
-            ref.metadata[ref_labels],
-            ref_features,
-            num_markers=num_de,
-            restrict_to=set(test_features),
-        )
-
-        built = build_single_reference(
-            ref_data=ref.assay("ranks"),
-            ref_labels=ref.col_data.column(ref_labels),
-            ref_features=ref_features,
-            markers=markers,
-            num_threads=num_threads,
-            **build_args,
-        )
-
-    else:
-        built = build_single_reference(
-            ref_data=ref_data,
-            ref_labels=ref_labels,
-            ref_features=ref_features,
-            restrict_to=set(test_features),
-            num_threads=num_threads,
-            **build_args,
-        )
+    ref_data, ref_labels, ref_features, built = _build_reference(
+        ref_data=ref_data,
+        ref_labels=ref_labels,
+        ref_features=ref_features,
+        test_features_set=set(test_features),
+        cache_dir=cache_dir,
+        build_args=build_args,
+        num_threads=num_threads,
+    )
 
     output = classify_single_reference(
         test_data,