From f23d8095d9964ecdb10c659f50cf8e1a693e8e7c Mon Sep 17 00:00:00 2001 From: LTLA Date: Wed, 20 Sep 2023 11:04:42 -0700 Subject: [PATCH] Ensure that annotate_integrated uses the same matrix consistently. In other words, we need to make sure that it uses the same assay_type and check_missing across both single and integrated classifications. This is most easily done by running _clean_matrix externally on test/ ref datasets and then passing the tatami pointer into each of the lower-level functions, which will then skip all NaN and assay checks. --- src/singler/annotate_integrated.py | 110 ++++++++++++++++++----------- src/singler/annotate_single.py | 50 +++++++------ tests/test_annotate_integrated.py | 6 +- 3 files changed, 101 insertions(+), 65 deletions(-) diff --git a/src/singler/annotate_integrated.py b/src/singler/annotate_integrated.py index 4ac5f95..7f03a4b 100644 --- a/src/singler/annotate_integrated.py +++ b/src/singler/annotate_integrated.py @@ -6,17 +6,20 @@ from .classify_single_reference import classify_single_reference from .build_integrated_references import build_integrated_references from .classify_integrated_references import classify_integrated_references -from .annotate_single import _build_reference +from .annotate_single import _resolve_reference, _attach_markers +from ._utils import _clean_matrix def annotate_integrated( test_data: Any, test_features: Sequence, - ref_data: Sequence[Union[Any, str]], - ref_labels: Union[str, Sequence[Union[Sequence, str]]], - ref_features: Union[str, Sequence[Union[Sequence, str]]], + ref_data_list: Sequence[Union[Any, str]], + ref_labels_list: Union[str, Sequence[Union[Sequence, str]]], + ref_features_list: Union[str, Sequence[Union[Sequence, str]]], test_assay_type: Union[str, int] = 0, + test_check_missing: bool = True, ref_assay_type: Union[str, int] = "logcounts", + ref_check_missing: bool = True, cache_dir: Optional[str] = None, build_single_args: dict = {}, classify_single_args: dict = {}, @@ -40,7 +43,7 @@ def annotate_integrated( test_features: Sequence of length equal to the number of rows in ``test_data``, containing the feature identifier for each row. - ref_data: + ref_data_list: Sequence consisting of one or more of the following: - A matrix-like object representing the reference dataset, where rows @@ -54,25 +57,25 @@ def annotate_integrated( :py:meth:`~singler.fetch_reference.fetch_github_reference`. This will use the specified dataset as the reference. - ref_labels: + ref_labels_list: Sequence of the same length as ``ref_data``, where the contents depend on the type of value in the corresponding entry of ``ref_data``: - - If ``ref_data[i]`` is a matrix-like object, ``ref_labels[i]`` should be - a sequence of length equal to the number of columns of ``ref_data[i]``, + - If ``ref_data_list[i]`` is a matrix-like object, ``ref_labels_list[i]`` should be + a sequence of length equal to the number of columns of ``ref_data_list[i]``, containing the label associated with each column. - - If ``ref_data[i]`` is a string, ``ref_labels[i]`` should be a string + - If ``ref_data_list[i]`` is a string, ``ref_labels_list[i]`` should be a string specifying the label type to use, e.g., "main", "fine", "ont". If a single string is supplied, it is recycled for all ``ref_data``. - ref_features: - Sequence of the same length as ``ref_data``, where the contents + ref_features_list: + Sequence of the same length as ``ref_data_list``, where the contents depend on the type of value in the corresponding entry of ``ref_data``: - - If ``ref_data[i]`` is a matrix-like object, ``ref_features[i]`` should be - a sequence of length equal to the number of rows of ``ref_data``, + - If ``ref_data_list[i]`` is a matrix-like object, ``ref_features_list[i]`` should be + a sequence of length equal to the number of rows of ``ref_data_list[i]``, containing the feature identifier associated with each row. - - If ``ref_data[i]`` is a string, ``ref_features[i]`` should be a string + - If ``ref_data_list[i]`` is a string, ``ref_features_list[i]`` should be a string specifying the feature type to use, e.g., "ensembl", "symbol". If a single string is supplied, it is recycled for all ``ref_data``. @@ -80,10 +83,16 @@ def annotate_integrated( Assay of ``test_data`` containing the expression matrix, if ``test_data`` is a :py:class:`~summarizedexperiment.SummarizedExperiment.SummarizedExperiment`. + test_check_missing: + Whether to check for and remove missing (i.e., NaN) values from the test dataset. + ref_assay_type: Assay containing the expression matrix for any entry of ``ref_data_list`` that is a :py:class:`~summarizedexperiment.SummarizedExperiment.SummarizedExperiment`. + ref_check_missing: + Whether to check for and remove missing (i.e., NaN) values from the reference datasets. + cache_dir: Path to a cache directory for downloading reference files, see :py:meth:`~singler.fetch_reference.fetch_github_reference` for details. @@ -116,15 +125,23 @@ def annotate_integrated( (i.e., a BiocFrame from :py:meth:`~singler.classify_integrated_references.classify_integrated_references`). """ - nrefs = len(ref_data) - if isinstance(ref_labels, str): - ref_labels = [ref_labels] * nrefs - elif nrefs != len(ref_labels): - raise ValueError("'ref_data' and 'ref_labels' must be the same length") - if isinstance(ref_features, str): - ref_features = [ref_features] * nrefs - elif nrefs != len(ref_features): - raise ValueError("'ref_data' and 'ref_features' must be the same length") + nrefs = len(ref_data_list) + if isinstance(ref_labels_list, str): + ref_labels_list = [ref_labels_list] * nrefs + elif nrefs != len(ref_labels_list): + raise ValueError("'ref_data_list' and 'ref_labels_list' must be the same length") + if isinstance(ref_features_list, str): + ref_features_list = [ref_features_list] * nrefs + elif nrefs != len(ref_features_list): + raise ValueError("'ref_data_list' and 'ref_features_list' must be the same length") + + test_ptr, test_features = _clean_matrix( + test_data, + test_features, + assay_type = test_assay_type, + check_missing = test_check_missing, + num_threads = num_threads, + ) all_ref_data = [] all_ref_labels = [] @@ -134,14 +151,30 @@ def annotate_integrated( test_features_set = set(test_features) for r in range(nrefs): - curref_data, curref_labels, curref_features, curbuilt = _build_reference( - ref_data=ref_data[r], - ref_labels=ref_labels[r], - ref_features=ref_features[r], - test_features_set=test_features_set, + curref_mat, curref_labels, curref_features, curref_markers = _resolve_reference( + ref_data=ref_data_list[r], + ref_labels=ref_labels_list[r], + ref_features=ref_features_list[r], cache_dir=cache_dir, - build_args=build_single_args, + build_single_args=build_single_args, + test_features_set=test_features_set, + ) + + curref_ptr, curref_features = _clean_matrix( + curref_mat, + curref_features, assay_type = ref_assay_type, + check_missing = ref_check_missing, + num_threads = num_threads, + ) + + bargs = _attach_markers(curref_markers, build_single_args) + curbuilt = build_single_reference( + ref_data=curref_ptr, + ref_labels=curref_labels, + ref_features=curref_features, + restrict_to=test_features_set, + **bargs, num_threads=num_threads, ) @@ -150,38 +183,35 @@ def annotate_integrated( test_features=test_features, ref_prebuilt=curbuilt, **classify_single_args, - assay_type = test_assay_type, num_threads=num_threads, ) - res.metadata = { - "markers": curbuilt.markers, - "unique_markers": curbuilt.marker_subset(), - } - - all_ref_data.append(curref_data) + all_ref_data.append(curref_ptr) all_ref_labels.append(curref_labels) all_ref_features.append(curref_features) all_built.append(curbuilt) all_results.append(res) + res.metadata = { + "markers": curbuilt.markers, + "unique_markers": curbuilt.marker_subset(), + } + ibuilt = build_integrated_references( test_features=test_features, ref_data_list=all_ref_data, ref_labels_list=all_ref_labels, ref_features_list=all_ref_features, ref_prebuilt_list=all_built, - assay_type = ref_assay_type, - num_threads=num_threads, **build_integrated_args, + num_threads=num_threads, ) ires = classify_integrated_references( - test_data=test_data, + test_data=test_ptr, results=all_results, integrated_prebuilt=ibuilt, **classify_integrated_args, - assay_type = test_assay_type, num_threads=num_threads, ) diff --git a/src/singler/annotate_single.py b/src/singler/annotate_single.py index abfe668..01b9cc1 100644 --- a/src/singler/annotate_single.py +++ b/src/singler/annotate_single.py @@ -1,12 +1,14 @@ from typing import Union, Sequence, Optional, Any from biocframe import BiocFrame +from copy import copy from .fetch_reference import fetch_github_reference, realize_github_markers from .build_single_reference import build_single_reference from .classify_single_reference import classify_single_reference +from ._utils import _clean_matrix -def _build_reference(ref_data, ref_labels, ref_features, test_features_set, cache_dir, build_args, num_threads): +def _resolve_reference(ref_data, ref_labels, ref_features, cache_dir, build_args, test_features_set): if isinstance(ref_data, str): ref = fetch_github_reference(ref_data, cache_dir=cache_dir) ref_features = ref.row_data.column(ref_features) @@ -17,7 +19,7 @@ def _build_reference(ref_data, ref_labels, ref_features, test_features_set, cach if "num_de" in marker_args: num_de = marker_args["num_de"] - markers = realize_github_markers( + ref_markers = realize_github_markers( ref.metadata[ref_labels], ref_features, num_markers=num_de, @@ -26,26 +28,19 @@ def _build_reference(ref_data, ref_labels, ref_features, test_features_set, cach ref_data = ref.assay("ranks") ref_labels=ref.col_data.column(ref_labels) - built = build_single_reference( - ref_data=ref_data, - ref_labels=ref_labels, - ref_features=ref_features, - markers=markers, - num_threads=num_threads, - **build_args, - ) - else: - built = build_single_reference( - ref_data=ref_data, - ref_labels=ref_labels, - ref_features=ref_features, - restrict_to=test_features_set, - num_threads=num_threads, - **build_args, - ) + ref_markers = None + + return ref_data, ref_labels, ref_features, ref_markers + - return ref_data, ref_labels, ref_features, built +def _attach_markers(markers, build_args): + if markers is not None and "markers" not in build_args: + tmp = copy(build_args) + tmp["markers"] = markers + print(tmp) + return tmp + return build_args def annotate_single( @@ -123,13 +118,24 @@ def annotate_single( specifying the markers that were used for each pairwise comparison between labels; and a list of ``unique_markers`` across all labels. """ - ref_data, ref_labels, ref_features, built = _build_reference( + test_features_set = set(test_features) + + ref_data, ref_labels, ref_features, markers = _resolve_reference( ref_data=ref_data, ref_labels=ref_labels, ref_features=ref_features, - test_features_set=set(test_features), cache_dir=cache_dir, build_args=build_args, + test_features_set=test_features_set, + ) + + bargs = _attach_markers(markers, build_args) + built = build_single_reference( + ref_data=ref_data, + ref_labels=ref_labels, + ref_features=ref_features, + restrict_to=test_features_set, + **bargs, num_threads=num_threads, ) diff --git a/tests/test_annotate_integrated.py b/tests/test_annotate_integrated.py index 987d648..fc6aa7c 100644 --- a/tests/test_annotate_integrated.py +++ b/tests/test_annotate_integrated.py @@ -19,9 +19,9 @@ def test_annotate_integrated(): single_results, integrated_results = singler.annotate_integrated( test, test_features=test_features, - ref_data=[ref1, ref2], - ref_labels=[labels1, labels2], - ref_features=[features1, features2], + ref_data_list=[ref1, ref2], + ref_labels_list=[labels1, labels2], + ref_features_list=[features1, features2], ) assert len(single_results) == 2