From 7bbf662aa064df64e25d58a666359dbeb439bb18 Mon Sep 17 00:00:00 2001 From: Aaron Lun Date: Tue, 19 Sep 2023 10:37:02 -0700 Subject: [PATCH] Added an overlord function for the integrated annotation. (#20) - Renamed annotate to annotate_single to distinguish it. - Factored out the reference string handler from annotate_single. - Minor fixes to classify_integrated to deal with BiocFrame inputs. - Updated the README. --- README.md | 55 +++++- src/singler/__init__.py | 3 +- src/singler/annotate_integrated.py | 170 ++++++++++++++++++ .../{annotate.py => annotate_single.py} | 94 ++++++---- src/singler/classify_integrated_references.py | 8 +- tests/test_annotate_integrated.py | 30 ++++ ...st_annotate.py => test_annotate_single.py} | 14 +- 7 files changed, 322 insertions(+), 52 deletions(-) create mode 100644 src/singler/annotate_integrated.py rename src/singler/{annotate.py => annotate_single.py} (82%) create mode 100644 tests/test_annotate_integrated.py rename tests/{test_annotate.py => test_annotate_single.py} (91%) diff --git a/README.md b/README.md index 5db9f90..21a6a93 100644 --- a/README.md +++ b/README.md @@ -30,14 +30,14 @@ Firstly, let's load in the famous PBMC 4k dataset from 10X Genomics: import singlecellexperiment as sce data = sce.read_tenx_h5("pbmc4k-tenx.h5") mat = data.assay("counts") -features = [x.encode("ASCII") for x in data.row_data["name"]] +features = [str(x) for x in data.row_data["name"]] ``` Now we use the Blueprint/ENCODE reference to annotate each cell in `mat`: ```python import singler -results = singler.annotate( +results = singler.annotate_single( mat, features, ref_data = "BlueprintEncode", @@ -63,14 +63,14 @@ results.column("best") ## ... 
## ] -results.column("scores").column("Macrophage") +results.column("scores").column("Macrophages") ## array([0.35935275, 0.40833545, 0.37430726, ..., 0.32135929, 0.29728435, ## 0.40208581]) ``` ## Calling low-level functions -The `annotate()` function is a convenient wrapper around a number of lower-level functions in **singler**. +The `annotate_single()` function is a convenient wrapper around a number of lower-level functions in **singler**. Advanced users may prefer to build the reference and run the classification separately. This allows us to re-use the same reference for multiple datasets without repeating the build step. @@ -116,6 +116,53 @@ output = singler.classify_single_reference( ) ``` +## Integrating labels across references + +We can use annotations from multiple references through the `annotate_integrated()` function: + +```python +import singler +single_results, integrated = singler.annotate_integrated( + mat, + features, + ref_data = ("BlueprintEncode", "DatabaseImmuneCellExpression"), + ref_features = "symbol", + ref_labels = "main", + build_integrated_args = { "ref_names": ("Blueprint", "DICE") }, + cache_dir = "_cache", + num_threads = 6 +) +``` + +This annotates the test dataset against each reference individually to obtain the best per-reference label, +and then it compares across references to find the best label from all references. +Both the single and integrated annotations are reported for diagnostics. + +```python +integrated.column("best_label") +## ['Monocytes', +## 'Monocytes', +## 'Monocytes', +## 'CD8+ T-cells', +## 'CD4+ T-cells', +## 'CD8+ T-cells', +## 'Monocytes', +## 'Monocytes', +## ... +## ] + +integrated.column("best_reference") +## ['Blueprint', +## 'Blueprint', +## 'Blueprint', +## 'Blueprint', +## 'Blueprint', +## 'Blueprint', +## 'Blueprint', +## ... 
+##] +``` + ## Developer notes Build the shared object file: diff --git a/src/singler/__init__.py b/src/singler/__init__.py index 351f836..dc17568 100644 --- a/src/singler/__init__.py +++ b/src/singler/__init__.py @@ -22,4 +22,5 @@ from .classify_single_reference import classify_single_reference from .classify_integrated_references import classify_integrated_references from .fetch_reference import fetch_github_reference, realize_github_markers -from .annotate import annotate +from .annotate_single import annotate_single +from .annotate_integrated import annotate_integrated diff --git a/src/singler/annotate_integrated.py b/src/singler/annotate_integrated.py new file mode 100644 index 0000000..df79156 --- /dev/null +++ b/src/singler/annotate_integrated.py @@ -0,0 +1,170 @@ +from typing import Union, Sequence, Optional, Any, Tuple +from biocframe import BiocFrame + +from .fetch_reference import fetch_github_reference, realize_github_markers +from .build_single_reference import build_single_reference +from .classify_single_reference import classify_single_reference +from .build_integrated_references import build_integrated_references +from .classify_integrated_references import classify_integrated_references +from .annotate_single import _build_reference + + +def annotate_integrated( + test_data: Any, + test_features: Sequence, + ref_data: Sequence[Union[Any, str]], + ref_labels: Union[str, Sequence[Union[Sequence, str]]], + ref_features: Union[str, Sequence[Union[Sequence, str]]], + cache_dir: Optional[str] = None, + build_single_args={}, + classify_single_args={}, + build_integrated_args={}, + classify_integrated_args={}, + num_threads=1, +) -> Tuple[list[BiocFrame], BiocFrame]: + """Annotate a single-cell expression dataset based on the correlation + of each cell to profiles in multiple labelled references, where the + annotation from each reference is then integrated across references. 
+ + Args: + test_data: A matrix-like object representing the test dataset, where rows are + features and columns are samples (usually cells). Entries should be expression + values; only the ranking within each column will be used. + + test_features (Sequence): Sequence of length equal to the number of rows in + ``test_data``, containing the feature identifier for each row. + + ref_data: + Sequence consisting of one or more of the following: + + - A matrix-like object representing the reference dataset, where rows + are features and columns are samples. Entries should be expression values, + usually log-transformed (see comments for the ``ref`` argument in + :py:meth:`~singler.build_single_reference.build_single_reference`). + - A string that can be passed as ``name`` to + :py:meth:`~singler.fetch_reference.fetch_github_reference`. + This will use the specified dataset as the reference. + + ref_labels (Union[str, Sequence[Union[Sequence, str]]]): + Sequence of the same length as ``ref_data``, where the contents + depend on the type of value in the corresponding entry of ``ref_data``: + + - If ``ref_data[i]`` is a matrix-like object, ``ref_labels[i]`` should be + a sequence of length equal to the number of columns of ``ref_data[i]``, + containing the label associated with each column. + - If ``ref_data[i]`` is a string, ``ref_labels[i]`` should be a string + specifying the label type to use, e.g., "main", "fine", "ont". + + If a single string is supplied, it is recycled for all ``ref_data``. + + ref_features (Union[str, Sequence[Union[Sequence, str]]]): + Sequence of the same length as ``ref_data``, where the contents + depend on the type of value in the corresponding entry of ``ref_data``: + + - If ``ref_data[i]`` is a matrix-like object, ``ref_features[i]`` should be + a sequence of length equal to the number of rows of ``ref_data[i]``, + containing the feature identifier associated with each row. 
+ - If ``ref_data[i]`` is a string, ``ref_features[i]`` should be a string + specifying the feature type to use, e.g., "ensembl", "symbol". + + If a single string is supplied, it is recycled for all ``ref_data``. + + cache_dir (str): + Path to a cache directory for downloading reference files, see + :py:meth:`~singler.fetch_reference.fetch_github_reference` for details. + Only used if any entry of ``ref_data`` is a string. + + build_single_args (dict): + Further arguments to pass to + :py:meth:`~singler.build_single_reference.build_single_reference`. + + classify_single_args (dict): + Further arguments to pass to + :py:meth:`~singler.classify_single_reference.classify_single_reference`. + + build_integrated_args (dict): + Further arguments to pass to + :py:meth:`~singler.build_integrated_references.build_integrated_references`. + + classify_integrated_args (dict): + Further arguments to pass to + :py:meth:`~singler.classify_integrated_references.classify_integrated_references`. + + num_threads (int): + Number of threads to use for the various steps. + + Returns: + Tuple[list[BiocFrame], BiocFrame]: Tuple where the first element + contains per-reference results (i.e. a list of BiocFrame outputs + equivalent to running + :py:meth:`~singler.annotate_single.annotate_single` on each reference) + and the second element contains integrated results across references + (i.e., a BiocFrame from + :py:meth:`~singler.classify_integrated_references.classify_integrated_references`). 
+ """ + nrefs = len(ref_data) + if isinstance(ref_labels, str): + ref_labels = [ref_labels] * nrefs + elif nrefs != len(ref_labels): + raise ValueError("'ref_data' and 'ref_labels' must be the same length") + if isinstance(ref_features, str): + ref_features = [ref_features] * nrefs + elif nrefs != len(ref_features): + raise ValueError("'ref_data' and 'ref_features' must be the same length") + + all_ref_data = [] + all_ref_labels = [] + all_ref_features = [] + all_built = [] + all_results = [] + test_features_set = set(test_features) + + for r in range(nrefs): + curref_data, curref_labels, curref_features, curbuilt = _build_reference( + ref_data=ref_data[r], + ref_labels=ref_labels[r], + ref_features=ref_features[r], + test_features_set=test_features_set, + cache_dir=cache_dir, + build_args=build_single_args, + num_threads=num_threads, + ) + + res = classify_single_reference( + test_data, + test_features=test_features, + ref_prebuilt=curbuilt, + **classify_single_args, + num_threads=num_threads, + ) + + res.metadata = { + "markers": curbuilt.markers, + "unique_markers": curbuilt.marker_subset(), + } + + all_ref_data.append(curref_data) + all_ref_labels.append(curref_labels) + all_ref_features.append(curref_features) + all_built.append(curbuilt) + all_results.append(res) + + ibuilt = build_integrated_references( + test_features=test_features, + ref_data_list=all_ref_data, + ref_labels_list=all_ref_labels, + ref_features_list=all_ref_features, + ref_prebuilt_list=all_built, + num_threads=num_threads, + **build_integrated_args, + ) + + ires = classify_integrated_references( + test_data=test_data, + results=all_results, + integrated_prebuilt=ibuilt, + **classify_integrated_args, + num_threads=num_threads, + ) + + return all_results, ires diff --git a/src/singler/annotate.py b/src/singler/annotate_single.py similarity index 82% rename from src/singler/annotate.py rename to src/singler/annotate_single.py index 5c07042..40c6fa9 100644 --- a/src/singler/annotate.py +++ 
b/src/singler/annotate_single.py @@ -6,7 +6,49 @@ from .classify_single_reference import classify_single_reference -def annotate( +def _build_reference(ref_data, ref_labels, ref_features, test_features_set, cache_dir, build_args, num_threads): + if isinstance(ref_data, str): + ref = fetch_github_reference(ref_data, cache_dir=cache_dir) + ref_features = ref.row_data.column(ref_features) + + num_de = None + if "marker_args" in build_args: + marker_args = build_args["marker_args"] + if "num_de" in marker_args: + num_de = marker_args["num_de"] + + markers = realize_github_markers( + ref.metadata[ref_labels], + ref_features, + num_markers=num_de, + restrict_to=test_features_set, + ) + + ref_data = ref.assay("ranks") + ref_labels=ref.col_data.column(ref_labels) + built = build_single_reference( + ref_data=ref_data, + ref_labels=ref_labels, + ref_features=ref_features, + markers=markers, + num_threads=num_threads, + **build_args, + ) + + else: + built = build_single_reference( + ref_data=ref_data, + ref_labels=ref_labels, + ref_features=ref_features, + restrict_to=test_features_set, + num_threads=num_threads, + **build_args, + ) + + return ref_data, ref_labels, ref_features, built + + +def annotate_single( test_data, test_features: Sequence, ref_data, @@ -17,7 +59,8 @@ def annotate( classify_args={}, num_threads=1, ) -> BiocFrame: - """Annotate an expression dataset based on the correlation to a labelled reference. + """Annotate a single-cell expression dataset based on the correlation + of each cell to profiles in a labelled reference. Args: test_data: A matrix-like object representing the test dataset, where rows are @@ -50,7 +93,7 @@ def annotate( containing the feature identifier associated with each row. If ``ref_data`` is a string, ``ref_features`` should be a string - specifying the label type to use, e.g., "main", "fine", "ont". + specifying the label type to use, e.g., "ensembl", "symbol". 
cache_dir (str): Path to a cache directory for downloading reference files, see @@ -75,42 +118,15 @@ def annotate( specifying the markers that were used for each pairwise comparison between labels; and a list of ``unique_markers`` across all labels. """ - - if isinstance(ref_data, str): - ref = fetch_github_reference(ref_data, cache_dir=cache_dir) - ref_features = ref.row_data.column(ref_features) - - num_de = None - if "marker_args" in build_args: - marker_args = build_args["marker_args"] - if "num_de" in marker_args: - num_de = marker_args["num_de"] - - markers = realize_github_markers( - ref.metadata[ref_labels], - ref_features, - num_markers=num_de, - restrict_to=set(test_features), - ) - - built = build_single_reference( - ref_data=ref.assay("ranks"), - ref_labels=ref.col_data.column(ref_labels), - ref_features=ref_features, - markers=markers, - num_threads=num_threads, - **build_args, - ) - - else: - built = build_single_reference( - ref_data=ref_data, - ref_labels=ref_labels, - ref_features=ref_features, - restrict_to=set(test_features), - num_threads=num_threads, - **build_args, - ) + ref_data, ref_labels, ref_features, built = _build_reference( + ref_data=ref_data, + ref_labels=ref_labels, + ref_features=ref_features, + test_features_set=set(test_features), + cache_dir=cache_dir, + build_args=build_args, + num_threads=num_threads, + ) output = classify_single_reference( test_data, diff --git a/src/singler/classify_integrated_references.py b/src/singler/classify_integrated_references.py index 7eec576..1507126 100644 --- a/src/singler/classify_integrated_references.py +++ b/src/singler/classify_integrated_references.py @@ -106,7 +106,13 @@ def classify_integrated_references( num_threads, ) - best_label = [results[b][i] for i, b in enumerate(best)] + best_label = [] + for i, b in enumerate(best): + if isinstance(results[b], BiocFrame): + best_label.append(results[b].column("best")[i]) + else: + best_label.append(results[b][i]) + if has_names: best = 
[all_refs[b] for b in best] diff --git a/tests/test_annotate_integrated.py b/tests/test_annotate_integrated.py new file mode 100644 index 0000000..987d648 --- /dev/null +++ b/tests/test_annotate_integrated.py @@ -0,0 +1,30 @@ +import singler +import numpy + + +def test_annotate_integrated(): + all_features = [str(i) for i in range(10000)] + + ref1 = numpy.random.rand(8000, 10) + labels1 = ["A", "B", "C", "D", "E", "E", "D", "C", "B", "A"] + features1 = [all_features[i] for i in range(8000)] + + ref2 = numpy.random.rand(8000, 6) + labels2 = ["z", "y", "x", "z", "y", "z"] + features2 = [all_features[i] for i in range(2000, 10000)] + + test_features = [all_features[i] for i in range(0, 10000, 2)] + test = numpy.random.rand(len(test_features), 50) + + single_results, integrated_results = singler.annotate_integrated( + test, + test_features=test_features, + ref_data=[ref1, ref2], + ref_labels=[labels1, labels2], + ref_features=[features1, features2], + ) + + assert len(single_results) == 2 + assert set(single_results[0].column("best")) == set(labels1) + assert set(single_results[1].column("best")) == set(labels2) + assert set(integrated_results.column("best_reference")) == set([0, 1]) diff --git a/tests/test_annotate.py b/tests/test_annotate_single.py similarity index 91% rename from tests/test_annotate.py rename to tests/test_annotate_single.py index 9f5724e..be5cc54 100644 --- a/tests/test_annotate.py +++ b/tests/test_annotate_single.py @@ -2,7 +2,7 @@ import numpy -def test_annotate_sanity(): +def test_annotate_single_sanity(): ref = numpy.random.rand(10000, 10) + 1 ref[:2000, :2] = 0 ref[2000:4000, 2:4] = 0 @@ -19,7 +19,7 @@ def test_annotate_sanity(): test[4000:6000, 4] = 0 # C all_features = [str(i) for i in range(10000)] - output = singler.annotate( + output = singler.annotate_single( test, test_features=all_features, ref_data=ref, @@ -31,14 +31,14 @@ def test_annotate_sanity(): assert output.column("best") == ["B", "D", "A", "E", "C"] -def 
test_annotate_intersect(): +def test_annotate_single_intersect(): ref = numpy.random.rand(10000, 10) ref_features = [str(i) for i in range(10000)] ref_labels = ["A", "A", "B", "B", "C", "C", "D", "D", "E", "E"] test = numpy.random.rand(10000, 50) test_features = [str(i + 2000) for i in range(10000)] - output = singler.annotate( + output = singler.annotate_single( test, test_features=test_features, ref_data=ref, @@ -60,7 +60,7 @@ def test_annotate_intersect(): ).all() -def test_annotate_github(): +def test_annotate_single_github(): se = singler.fetch_github_reference("ImmGen", cache_dir="_cache") keep = range(5, se.shape[0], 2) @@ -68,7 +68,7 @@ def test_annotate_github(): ref_features = se.row_data.column("symbol") test_features = [ref_features[i] for i in keep] - output = singler.annotate( + output = singler.annotate_single( test, test_features=test_features, ref_data="ImmGen", @@ -86,7 +86,7 @@ def test_annotate_github(): assert output.metadata["markers"] == expected_markers # Checking that we handle the number of markers correctly. - more_output = singler.annotate( + more_output = singler.annotate_single( test, test_features=test_features, ref_data="ImmGen",