From 7bbf662aa064df64e25d58a666359dbeb439bb18 Mon Sep 17 00:00:00 2001 From: Aaron Lun Date: Tue, 19 Sep 2023 10:37:02 -0700 Subject: [PATCH] Added an overlord function for the integrated annotation. (#20) - Renamed annotate to annotate_single to distinguish it. - Factored out the reference string handler from annotate_single. - Minor fixes to classify_integrated to deal with BiocFrame inputs. - Updated the README. --- README.md | 55 +++++- src/singler/__init__.py | 3 +- src/singler/annotate_integrated.py | 170 ++++++++++++++++++ .../{annotate.py => annotate_single.py} | 94 ++++++---- src/singler/classify_integrated_references.py | 8 +- tests/test_annotate_integrated.py | 30 ++++ ...st_annotate.py => test_annotate_single.py} | 14 +- 7 files changed, 322 insertions(+), 52 deletions(-) create mode 100644 src/singler/annotate_integrated.py rename src/singler/{annotate.py => annotate_single.py} (82%) create mode 100644 tests/test_annotate_integrated.py rename tests/{test_annotate.py => test_annotate_single.py} (91%) diff --git a/README.md b/README.md index 5db9f90..21a6a93 100644 --- a/README.md +++ b/README.md @@ -30,14 +30,14 @@ Firstly, let's load in the famous PBMC 4k dataset from 10X Genomics: import singlecellexperiment as sce data = sce.read_tenx_h5("pbmc4k-tenx.h5") mat = data.assay("counts") -features = [x.encode("ASCII") for x in data.row_data["name"]] +features = [str(x) for x in data.row_data["name"]] ``` Now we use the Blueprint/ENCODE reference to annotate each cell in `mat`: ```python import singler -results = singler.annotate( +results = singler.annotate_single( mat, features, ref_data = "BlueprintEncode", @@ -63,14 +63,14 @@ results.column("best") ## ... 
## ] -results.column("scores").column("Macrophage") +results.column("scores").column("Macrophages") ## array([0.35935275, 0.40833545, 0.37430726, ..., 0.32135929, 0.29728435, ## 0.40208581]) ``` ## Calling low-level functions -The `annotate()` function is a convenient wrapper around a number of lower-level functions in **singler**. +The `annotate_single()` function is a convenient wrapper around a number of lower-level functions in **singler**. Advanced users may prefer to build the reference and run the classification separately. This allows us to re-use the same reference for multiple datasets without repeating the build step. @@ -116,6 +116,53 @@ output = singler.classify_single_reference( ) ``` +## Integrating labels across references + +We can use annotations from multiple references through the `annotate_integrated()` function: + +```python +import singler +single_results, integrated = singler.annotate_integrated( + mat, + features, + ref_data = ("BlueprintEncode", "DatabaseImmuneCellExpression"), + ref_features = "symbol", + ref_labels = "main", + build_integrated_args = { "ref_names": ("Blueprint", "DICE") }, + cache_dir = "_cache", + num_threads = 6 +) +``` + +This annotates the test dataset against each reference individually to obtain the best per-reference label, +and then it compares across references to find the best label from all references. +Both the single and integrated annotations are reported for diagnostics. + +```python +integrated.column("best_label") +## ['Monocytes', +## 'Monocytes', +## 'Monocytes', +## 'CD8+ T-cells', +## 'CD4+ T-cells', +## 'CD8+ T-cells', +## 'Monocytes', +## 'Monocytes', +## ... +## ] + +integrated.column("best_reference") +## ['Blueprint', +## 'Blueprint', +## 'Blueprint', +## 'Blueprint', +## 'Blueprint', +## 'Blueprint', +## 'Blueprint', +## ... 
+##] +``` + ## Developer notes Build the shared object file: diff --git a/src/singler/__init__.py b/src/singler/__init__.py index 351f836..dc17568 100644 --- a/src/singler/__init__.py +++ b/src/singler/__init__.py @@ -22,4 +22,5 @@ from .classify_single_reference import classify_single_reference from .classify_integrated_references import classify_integrated_references from .fetch_reference import fetch_github_reference, realize_github_markers -from .annotate import annotate +from .annotate_single import annotate_single +from .annotate_integrated import annotate_integrated diff --git a/src/singler/annotate_integrated.py b/src/singler/annotate_integrated.py new file mode 100644 index 0000000..df79156 --- /dev/null +++ b/src/singler/annotate_integrated.py @@ -0,0 +1,170 @@ +from typing import Union, Sequence, Optional, Any, Tuple +from biocframe import BiocFrame + +from .fetch_reference import fetch_github_reference, realize_github_markers +from .build_single_reference import build_single_reference +from .classify_single_reference import classify_single_reference +from .build_integrated_references import build_integrated_references +from .classify_integrated_references import classify_integrated_references +from .annotate_single import _build_reference + + +def annotate_integrated( + test_data: Any, + test_features: Sequence, + ref_data: Sequence[Union[Any, str]], + ref_labels: Union[str, Sequence[Union[Sequence, str]]], + ref_features: Union[str, Sequence[Union[Sequence, str]]], + cache_dir: Optional[str] = None, + build_single_args={}, + classify_single_args={}, + build_integrated_args={}, + classify_integrated_args={}, + num_threads=1, +) -> Tuple[list[BiocFrame], BiocFrame]: + """Annotate a single-cell expression dataset based on the correlation + of each cell to profiles in multiple labelled references, where the + annotation from each reference is then integrated across references. 
+ + Args: + test_data: A matrix-like object representing the test dataset, where rows are + features and columns are samples (usually cells). Entries should be expression + values; only the ranking within each column will be used. + + test_features (Sequence): Sequence of length equal to the number of rows in + ``test_data``, containing the feature identifier for each row. + + ref_data: + Sequence consisting of one or more of the following: + + - A matrix-like object representing the reference dataset, where rows + are features and columns are samples. Entries should be expression values, + usually log-transformed (see comments for the ``ref`` argument in + :py:meth:`~singler.build_single_reference.build_single_reference`). + - A string that can be passed as ``name`` to + :py:meth:`~singler.fetch_reference.fetch_github_reference`. + This will use the specified dataset as the reference. + + ref_labels (Union[str, Sequence[Union[Sequence, str]]]): + Sequence of the same length as ``ref_data``, where the contents + depend on the type of value in the corresponding entry of ``ref_data``: + + - If ``ref_data[i]`` is a matrix-like object, ``ref_labels[i]`` should be + a sequence of length equal to the number of columns of ``ref_data[i]``, + containing the label associated with each column. + - If ``ref_data[i]`` is a string, ``ref_labels[i]`` should be a string + specifying the label type to use, e.g., "main", "fine", "ont". + + If a single string is supplied, it is recycled for all ``ref_data``. + + ref_features (Union[str, Sequence[Union[Sequence, str]]]): + Sequence of the same length as ``ref_data``, where the contents + depend on the type of value in the corresponding entry of ``ref_data``: + + - If ``ref_data[i]`` is a matrix-like object, ``ref_features[i]`` should be + a sequence of length equal to the number of rows of ``ref_data[i]``, + containing the feature identifier associated with each row. 
+ - If ``ref_data[i]`` is a string, ``ref_features[i]`` should be a string + specifying the feature type to use, e.g., "ensembl", "symbol". + + If a single string is supplied, it is recycled for all ``ref_data``. + + cache_dir (str): + Path to a cache directory for downloading reference files, see + :py:meth:`~singler.fetch_reference.fetch_github_reference` for details. + Only used if any entry of ``ref_data`` is a string. + + build_single_args (dict): + Further arguments to pass to + :py:meth:`~singler.build_single_reference.build_single_reference`. + + classify_single_args (dict): + Further arguments to pass to + :py:meth:`~singler.classify_single_reference.classify_single_reference`. + + build_integrated_args (dict): + Further arguments to pass to + :py:meth:`~singler.build_integrated_references.build_integrated_references`. + + classify_integrated_args (dict): + Further arguments to pass to + :py:meth:`~singler.classify_integrated_references.classify_integrated_references`. + + num_threads (int): + Number of threads to use for the various steps. + + Returns: + Tuple[list[BiocFrame], BiocFrame]: Tuple where the first element + contains per-reference results (i.e. a list of BiocFrame outputs + equivalent to running + :py:meth:`~singler.annotate_single.annotate_single` on each reference) + and the second element contains integrated results across references + (i.e., a BiocFrame from + :py:meth:`~singler.classify_integrated_references.classify_integrated_references`). 
+ """ + nrefs = len(ref_data) + if isinstance(ref_labels, str): + ref_labels = [ref_labels] * nrefs + elif nrefs != len(ref_labels): + raise ValueError("'ref_data' and 'ref_labels' must be the same length") + if isinstance(ref_features, str): + ref_features = [ref_features] * nrefs + elif nrefs != len(ref_features): + raise ValueError("'ref_data' and 'ref_features' must be the same length") + + all_ref_data = [] + all_ref_labels = [] + all_ref_features = [] + all_built = [] + all_results = [] + test_features_set = set(test_features) + + for r in range(nrefs): + curref_data, curref_labels, curref_features, curbuilt = _build_reference( + ref_data=ref_data[r], + ref_labels=ref_labels[r], + ref_features=ref_features[r], + test_features_set=test_features_set, + cache_dir=cache_dir, + build_args=build_single_args, + num_threads=num_threads, + ) + + res = classify_single_reference( + test_data, + test_features=test_features, + ref_prebuilt=curbuilt, + **classify_single_args, + num_threads=num_threads, + ) + + res.metadata = { + "markers": curbuilt.markers, + "unique_markers": curbuilt.marker_subset(), + } + + all_ref_data.append(curref_data) + all_ref_labels.append(curref_labels) + all_ref_features.append(curref_features) + all_built.append(curbuilt) + all_results.append(res) + + ibuilt = build_integrated_references( + test_features=test_features, + ref_data_list=all_ref_data, + ref_labels_list=all_ref_labels, + ref_features_list=all_ref_features, + ref_prebuilt_list=all_built, + num_threads=num_threads, + **build_integrated_args, + ) + + ires = classify_integrated_references( + test_data=test_data, + results=all_results, + integrated_prebuilt=ibuilt, + **classify_integrated_args, + num_threads=num_threads, + ) + + return all_results, ires diff --git a/src/singler/annotate.py b/src/singler/annotate_single.py similarity index 82% rename from src/singler/annotate.py rename to src/singler/annotate_single.py index 5c07042..40c6fa9 100644 --- a/src/singler/annotate.py +++ 
b/src/singler/annotate_single.py @@ -6,7 +6,49 @@ from .classify_single_reference import classify_single_reference -def annotate( +def _build_reference(ref_data, ref_labels, ref_features, test_features_set, cache_dir, build_args, num_threads): + if isinstance(ref_data, str): + ref = fetch_github_reference(ref_data, cache_dir=cache_dir) + ref_features = ref.row_data.column(ref_features) + + num_de = None + if "marker_args" in build_args: + marker_args = build_args["marker_args"] + if "num_de" in marker_args: + num_de = marker_args["num_de"] + + markers = realize_github_markers( + ref.metadata[ref_labels], + ref_features, + num_markers=num_de, + restrict_to=test_features_set, + ) + + ref_data = ref.assay("ranks") + ref_labels=ref.col_data.column(ref_labels) + built = build_single_reference( + ref_data=ref_data, + ref_labels=ref_labels, + ref_features=ref_features, + markers=markers, + num_threads=num_threads, + **build_args, + ) + + else: + built = build_single_reference( + ref_data=ref_data, + ref_labels=ref_labels, + ref_features=ref_features, + restrict_to=test_features_set, + num_threads=num_threads, + **build_args, + ) + + return ref_data, ref_labels, ref_features, built + + +def annotate_single( test_data, test_features: Sequence, ref_data, @@ -17,7 +59,8 @@ def annotate( classify_args={}, num_threads=1, ) -> BiocFrame: - """Annotate an expression dataset based on the correlation to a labelled reference. + """Annotate a single-cell expression dataset based on the correlation + of each cell to profiles in a labelled reference. Args: test_data: A matrix-like object representing the test dataset, where rows are @@ -50,7 +93,7 @@ def annotate( containing the feature identifier associated with each row. If ``ref_data`` is a string, ``ref_features`` should be a string - specifying the label type to use, e.g., "main", "fine", "ont". + specifying the label type to use, e.g., "ensembl", "symbol". 
cache_dir (str): Path to a cache directory for downloading reference files, see @@ -75,42 +118,15 @@ def annotate( specifying the markers that were used for each pairwise comparison between labels; and a list of ``unique_markers`` across all labels. """ - - if isinstance(ref_data, str): - ref = fetch_github_reference(ref_data, cache_dir=cache_dir) - ref_features = ref.row_data.column(ref_features) - - num_de = None - if "marker_args" in build_args: - marker_args = build_args["marker_args"] - if "num_de" in marker_args: - num_de = marker_args["num_de"] - - markers = realize_github_markers( - ref.metadata[ref_labels], - ref_features, - num_markers=num_de, - restrict_to=set(test_features), - ) - - built = build_single_reference( - ref_data=ref.assay("ranks"), - ref_labels=ref.col_data.column(ref_labels), - ref_features=ref_features, - markers=markers, - num_threads=num_threads, - **build_args, - ) - - else: - built = build_single_reference( - ref_data=ref_data, - ref_labels=ref_labels, - ref_features=ref_features, - restrict_to=set(test_features), - num_threads=num_threads, - **build_args, - ) + ref_data, ref_labels, ref_features, built = _build_reference( + ref_data=ref_data, + ref_labels=ref_labels, + ref_features=ref_features, + test_features_set=set(test_features), + cache_dir=cache_dir, + build_args=build_args, + num_threads=num_threads, + ) output = classify_single_reference( test_data, diff --git a/src/singler/classify_integrated_references.py b/src/singler/classify_integrated_references.py index 7eec576..1507126 100644 --- a/src/singler/classify_integrated_references.py +++ b/src/singler/classify_integrated_references.py @@ -106,7 +106,13 @@ def classify_integrated_references( num_threads, ) - best_label = [results[b][i] for i, b in enumerate(best)] + best_label = [] + for i, b in enumerate(best): + if isinstance(results[b], BiocFrame): + best_label.append(results[b].column("best")[i]) + else: + best_label.append(results[b][i]) + if has_names: best = 
[all_refs[b] for b in best] diff --git a/tests/test_annotate_integrated.py b/tests/test_annotate_integrated.py new file mode 100644 index 0000000..987d648 --- /dev/null +++ b/tests/test_annotate_integrated.py @@ -0,0 +1,30 @@ +import singler +import numpy + + +def test_annotate_integrated(): + all_features = [str(i) for i in range(10000)] + + ref1 = numpy.random.rand(8000, 10) + labels1 = ["A", "B", "C", "D", "E", "E", "D", "C", "B", "A"] + features1 = [all_features[i] for i in range(8000)] + + ref2 = numpy.random.rand(8000, 6) + labels2 = ["z", "y", "x", "z", "y", "z"] + features2 = [all_features[i] for i in range(2000, 10000)] + + test_features = [all_features[i] for i in range(0, 10000, 2)] + test = numpy.random.rand(len(test_features), 50) + + single_results, integrated_results = singler.annotate_integrated( + test, + test_features=test_features, + ref_data=[ref1, ref2], + ref_labels=[labels1, labels2], + ref_features=[features1, features2], + ) + + assert len(single_results) == 2 + assert set(single_results[0].column("best")) == set(labels1) + assert set(single_results[1].column("best")) == set(labels2) + assert set(integrated_results.column("best_reference")) == set([0, 1]) diff --git a/tests/test_annotate.py b/tests/test_annotate_single.py similarity index 91% rename from tests/test_annotate.py rename to tests/test_annotate_single.py index 9f5724e..be5cc54 100644 --- a/tests/test_annotate.py +++ b/tests/test_annotate_single.py @@ -2,7 +2,7 @@ import numpy -def test_annotate_sanity(): +def test_annotate_single_sanity(): ref = numpy.random.rand(10000, 10) + 1 ref[:2000, :2] = 0 ref[2000:4000, 2:4] = 0 @@ -19,7 +19,7 @@ def test_annotate_sanity(): test[4000:6000, 4] = 0 # C all_features = [str(i) for i in range(10000)] - output = singler.annotate( + output = singler.annotate_single( test, test_features=all_features, ref_data=ref, @@ -31,14 +31,14 @@ def test_annotate_sanity(): assert output.column("best") == ["B", "D", "A", "E", "C"] -def 
test_annotate_intersect(): +def test_annotate_single_intersect(): ref = numpy.random.rand(10000, 10) ref_features = [str(i) for i in range(10000)] ref_labels = ["A", "A", "B", "B", "C", "C", "D", "D", "E", "E"] test = numpy.random.rand(10000, 50) test_features = [str(i + 2000) for i in range(10000)] - output = singler.annotate( + output = singler.annotate_single( test, test_features=test_features, ref_data=ref, @@ -60,7 +60,7 @@ def test_annotate_intersect(): ).all() -def test_annotate_github(): +def test_annotate_single_github(): se = singler.fetch_github_reference("ImmGen", cache_dir="_cache") keep = range(5, se.shape[0], 2) @@ -68,7 +68,7 @@ def test_annotate_github(): ref_features = se.row_data.column("symbol") test_features = [ref_features[i] for i in keep] - output = singler.annotate( + output = singler.annotate_single( test, test_features=test_features, ref_data="ImmGen", @@ -86,7 +86,7 @@ def test_annotate_github(): assert output.metadata["markers"] == expected_markers # Checking that we handle the number of markers correctly. - more_output = singler.annotate( + more_output = singler.annotate_single( test, test_features=test_features, ref_data="ImmGen",