Refactor and migrate code resulting from other packages (#21)

SingleR-inc · Jan 4, 2024 · 04350c9
1 parent c1736e8
commit 04350c9
Show file tree

Hide file tree

Showing 16 changed files with 169 additions and 161 deletions.
diff --git a/README.md b/README.md
@@ -116,6 +116,18 @@ output = singler.classify_single_reference(
 )
 ```
 
+    ## output
+    BiocFrame with 4340 rows and 3 columns
+                best                                   scores                delta
+            <list>                              <BiocFrame>   <ndarray[float64]>
+    [0] Monocytes 0.33265560369962943:0.407117403330602...  0.40706830113982534
+    [1] Monocytes 0.4078771641637374:0.4783396310685646...  0.07000418564184802
+    [2] Monocytes 0.3517036021728629:0.4076971245524348...  0.30997293412307647
+                ...                                      ...                  ...
+    [4337]  NK cells 0.3472631136865701:0.3937898240670208...  0.09640242155786138
+    [4338]   B-cells 0.26974632191999887:0.334862058137758... 0.061215905058676856
+    [4339] Monocytes 0.39390119034537324:0.468867490667427...  0.06678168346812047
+
 ## Integrating labels across references
 
 We can use annotations from multiple references through the `annotate_integrated()` function:
@@ -125,9 +137,9 @@ import singler
 single_results, integrated = singler.annotate_integrated(
     mat,
     features,
-    ref_data = ("BlueprintEncode", "DatabaseImmuneCellExpression"),
-    ref_features = "symbol",
-    ref_labels = "main",
+    ref_data_list = ("BlueprintEncode", "DatabaseImmuneCellExpression"),
+    ref_features_list = "symbol",
+    ref_labels_list = "main",
     build_integrated_args = { "ref_names": ("Blueprint", "DICE") },
     cache_dir = "_cache",
     num_threads = 6

diff --git a/setup.cfg b/setup.cfg
@@ -5,17 +5,17 @@
 
 [metadata]
 name = singler
-description = Add a short description here!
+description = Python bindings to the singleR algorithm to annotate cell types from known references.
 author = Aaron Lun
 author_email = [email protected]
 license = MIT
 license_files = LICENSE.txt
 long_description = file: README.md
 long_description_content_type = text/markdown; charset=UTF-8; variant=GFM
-url = https://github.com/pyscaffold/pyscaffold/
+url = https://github.com/BiocPy/singler
 # Add here related links, for example:
 project_urls =
-    Documentation = https://pyscaffold.org/
+    Documentation = https://github.com/BiocPy/singler
 #    Source = https://github.com/pyscaffold/pyscaffold/
 #    Changelog = https://pyscaffold.org/en/latest/changelog.html
 #    Tracker = https://github.com/pyscaffold/pyscaffold/issues
@@ -41,7 +41,7 @@ package_dir =
     =src
 
 # Require a min/specific Python version (comma-separated conditions)
-# python_requires = >=3.8
+python_requires = >=3.8
 
 # Add here dependencies of your project (line-separated), e.g. requests>=2.2,<3.0.
 # Version specifiers like >=2.2,<3.0 avoid problems due to API changes in
@@ -50,10 +50,11 @@ package_dir =
 install_requires =
     importlib-metadata; python_version<"3.8"
     mattress>=0.1.4
-    assorthead
+    assorthead>=0.0.11
     delayedarray
-    biocframe
-    summarizedexperiment
+    biocframe>=0.5.0
+    summarizedexperiment>=0.4.0
+    biocutils
 
 [options.packages.find]
 where = src

diff --git a/src/singler/_Markers.py b/src/singler/_Markers.py
@@ -1,6 +1,8 @@
+from typing import Any, Sequence
+
+from numpy import array, int32, ndarray
+
 from . import _cpphelpers as lib
-from numpy import ndarray, int32, array
-from typing import Sequence, Any
 
 
 class _Markers:

diff --git a/src/singler/_utils.py b/src/singler/_utils.py
@@ -1,37 +1,15 @@
-from numpy import ndarray
 from typing import Sequence, Tuple
-from summarizedexperiment import SummarizedExperiment
-from mattress import tatamize, TatamiNumericPointer
-from delayedarray import DelayedArray
-
-
-def _factorize(x: Sequence) -> Tuple[Sequence, ndarray]:
-    levels = []
-    mapping = {}
-    indices = []
-
-    for i, lev in enumerate(x):
-        if lev is None:
-            indices.append(None)
-        else:
-            if lev not in mapping:
-                mapping[lev] = len(levels)
-                levels.append(lev)
-            indices.append(mapping[lev])
-
-    return levels, indices
 
+import biocutils as ut
+import numpy as np
+from delayedarray import DelayedArray
+from mattress import TatamiNumericPointer, tatamize
+from summarizedexperiment import SummarizedExperiment
 
-def _match(x: Sequence, levels: Sequence) -> ndarray:
-    mapping = _create_map(levels)
-    indices = []
-    for i, y in enumerate(x):
-        if y is None or y not in mapping:
-            indices.append(None)
-        else:
-            indices.append(mapping[y])
 
-    return indices
+def _factorize(x: Sequence) -> Tuple[list, np.ndarray]:
+    _factor = ut.Factor.from_sequence(x, sort_levels=False)
+    return _factor.levels, np.array(_factor.codes, np.int32)
 
 
 def _create_map(x: Sequence) -> dict:
@@ -92,7 +70,7 @@ def _clean_matrix(x, features, assay_type, check_missing, num_threads):
     if isinstance(x, TatamiNumericPointer):
         # Assume the pointer was previously generated from _clean_matrix,
         # so it's 2-dimensional, matches up with features and it's already
-        # clean of NaNs... so we no-op and just return it directly. 
+        # clean of NaNs... so we no-op and just return it directly.
         return x, features
 
     if isinstance(x, SummarizedExperiment):

diff --git a/src/singler/annotate_integrated.py b/src/singler/annotate_integrated.py
@@ -1,13 +1,13 @@
-from typing import Union, Sequence, Optional, Any, Tuple
+from typing import Any, Optional, Sequence, Tuple, Union
+
 from biocframe import BiocFrame
 
-from .fetch_reference import fetch_github_reference, realize_github_markers
-from .build_single_reference import build_single_reference
-from .classify_single_reference import classify_single_reference
+from ._utils import _clean_matrix
+from .annotate_single import _attach_markers, _resolve_reference
 from .build_integrated_references import build_integrated_references
+from .build_single_reference import build_single_reference
 from .classify_integrated_references import classify_integrated_references
-from .annotate_single import _resolve_reference, _attach_markers
-from ._utils import _clean_matrix
+from .classify_single_reference import classify_single_reference
 
 
 def annotate_integrated(
@@ -27,20 +27,22 @@ def annotate_integrated(
     classify_integrated_args: dict = {},
     num_threads: int = 1,
 ) -> Tuple[list[BiocFrame], BiocFrame]:
-    """Annotate a single-cell expression dataset based on the correlation 
+    """Annotate a single-cell expression dataset based on the correlation
     of each cell to profiles in multiple labelled references, where the
     annotation from each reference is then integrated across references.
 
     Args:
-        test_data: A matrix-like object representing the test dataset, where rows are
+        test_data:
+            A matrix-like object representing the test dataset, where rows are
             features and columns are samples (usually cells). Entries should be expression
             values; only the ranking within each column will be used.
 
             Alternatively, a
             :py:class:`~summarizedexperiment.SummarizedExperiment.SummarizedExperiment`
-            containing such a matrix in one of its assays. 
+            containing such a matrix in one of its assays.
 
-        test_features: Sequence of length equal to the number of rows in
+        test_features:
+            Sequence of length equal to the number of rows in
             ``test_data``, containing the feature identifier for each row.
 
         ref_data_list:
@@ -50,7 +52,7 @@ def annotate_integrated(
               are features and columns are samples. Entries should be expression values,
               usually log-transformed (see comments for the ``ref`` argument in
               :py:meth:`~singler.build_single_reference.build_single_reference`).
-            - A 
+            - A
               :py:class:`~summarizedexperiment.SummarizedExperiment.SummarizedExperiment`
               object containing such a matrix in its assays.
             - A string that can be passed as ``name`` to
@@ -129,18 +131,22 @@ def annotate_integrated(
     if isinstance(ref_labels_list, str):
         ref_labels_list = [ref_labels_list] * nrefs
     elif nrefs != len(ref_labels_list):
-        raise ValueError("'ref_data_list' and 'ref_labels_list' must be the same length")
+        raise ValueError(
+            "'ref_data_list' and 'ref_labels_list' must be the same length"
+        )
     if isinstance(ref_features_list, str):
         ref_features_list = [ref_features_list] * nrefs
     elif nrefs != len(ref_features_list):
-        raise ValueError("'ref_data_list' and 'ref_features_list' must be the same length")
+        raise ValueError(
+            "'ref_data_list' and 'ref_features_list' must be the same length"
+        )
 
     test_ptr, test_features = _clean_matrix(
         test_data,
         test_features,
-        assay_type = test_assay_type,
-        check_missing = test_check_missing,
-        num_threads = num_threads,
+        assay_type=test_assay_type,
+        check_missing=test_check_missing,
+        num_threads=num_threads,
     )
 
     all_ref_data = []
@@ -163,9 +169,9 @@ def annotate_integrated(
         curref_ptr, curref_features = _clean_matrix(
             curref_mat,
             curref_features,
-            assay_type = ref_assay_type,
-            check_missing = ref_check_missing,
-            num_threads = num_threads,
+            assay_type=ref_assay_type,
+            check_missing=ref_check_missing,
+            num_threads=num_threads,
         )
 
         bargs = _attach_markers(curref_markers, build_single_args)

diff --git a/src/singler/annotate_single.py b/src/singler/annotate_single.py
@@ -1,14 +1,16 @@
-from typing import Union, Sequence, Optional, Any
-from biocframe import BiocFrame
 from copy import copy
+from typing import Any, Optional, Sequence, Union
+
+from biocframe import BiocFrame
 
-from .fetch_reference import fetch_github_reference, realize_github_markers
 from .build_single_reference import build_single_reference
 from .classify_single_reference import classify_single_reference
-from ._utils import _clean_matrix
+from .fetch_reference import fetch_github_reference, realize_github_markers
 
 
-def _resolve_reference(ref_data, ref_labels, ref_features, cache_dir, build_args, test_features_set):
+def _resolve_reference(
+    ref_data, ref_labels, ref_features, cache_dir, build_args, test_features_set
+):
     if isinstance(ref_data, str):
         ref = fetch_github_reference(ref_data, cache_dir=cache_dir)
         ref_features = ref.row_data.column(ref_features)
@@ -27,7 +29,7 @@ def _resolve_reference(ref_data, ref_labels, ref_features, cache_dir, build_args
         )
 
         ref_data = ref.assay("ranks")
-        ref_labels=ref.col_data.column(ref_labels)
+        ref_labels = ref.col_data.column(ref_labels)
     else:
         ref_markers = None
 
@@ -54,11 +56,12 @@ def annotate_single(
     classify_args: dict = {},
     num_threads: int = 1,
 ) -> BiocFrame:
-    """Annotate a single-cell expression dataset based on the correlation 
+    """Annotate a single-cell expression dataset based on the correlation
     of each cell to profiles in a labelled reference.
 
     Args:
-        test_data: A matrix-like object representing the test dataset, where rows are
+        test_data:
+            A matrix-like object representing the test dataset, where rows are
             features and columns are samples (usually cells). Entries should be expression
             values; only the ranking within each column will be used.
 
@@ -67,10 +70,12 @@ def annotate_single(
             containing such a matrix in one of its assays. Non-default assay
             types can be specified in ``classify_args``.
 
-        test_features: Sequence of length equal to the number of rows in
+        test_features:
+            Sequence of length equal to the number of rows in
             ``test_data``, containing the feature identifier for each row.
 
-        ref_data: A matrix-like object representing the reference dataset, where rows
+        ref_data:
+            A matrix-like object representing the reference dataset, where rows
             are features and columns are samples. Entries should be expression values,
             usually log-transformed (see comments for the ``ref`` argument in
             :py:meth:`~singler.build_single_reference.build_single_reference`).