Skip to content

Commit

Permalink
Refactor and migrate code resulting from other packages (#21)
Browse files Browse the repository at this point in the history
  • Loading branch information
jkanche authored Jan 4, 2024
1 parent c1736e8 commit 04350c9
Show file tree
Hide file tree
Showing 16 changed files with 169 additions and 161 deletions.
18 changes: 15 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,18 @@ output = singler.classify_single_reference(
)
```

## output
BiocFrame with 4340 rows and 3 columns
best scores delta
<list> <BiocFrame> <ndarray[float64]>
[0] Monocytes 0.33265560369962943:0.407117403330602... 0.40706830113982534
[1] Monocytes 0.4078771641637374:0.4783396310685646... 0.07000418564184802
[2] Monocytes 0.3517036021728629:0.4076971245524348... 0.30997293412307647
... ... ...
[4337] NK cells 0.3472631136865701:0.3937898240670208... 0.09640242155786138
[4338] B-cells 0.26974632191999887:0.334862058137758... 0.061215905058676856
[4339] Monocytes 0.39390119034537324:0.468867490667427... 0.06678168346812047

## Integrating labels across references

We can use annotations from multiple references through the `annotate_integrated()` function:
Expand All @@ -125,9 +137,9 @@ import singler
single_results, integrated = singler.annotate_integrated(
mat,
features,
ref_data = ("BlueprintEncode", "DatabaseImmuneCellExpression"),
ref_features = "symbol",
ref_labels = "main",
ref_data_list = ("BlueprintEncode", "DatabaseImmuneCellExpression"),
ref_features_list= "symbol",
ref_labels_list = "main",
build_integrated_args = { "ref_names": ("Blueprint", "DICE") },
cache_dir = "_cache",
num_threads = 6
Expand Down
15 changes: 8 additions & 7 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,17 @@

[metadata]
name = singler
description = Add a short description here!
description = Python bindings to the singleR algorithm to annotate cell types from known references.
author = Aaron Lun
author_email = [email protected]
license = MIT
license_files = LICENSE.txt
long_description = file: README.md
long_description_content_type = text/markdown; charset=UTF-8; variant=GFM
url = https://github.com/pyscaffold/pyscaffold/
url = https://github.com/BiocPy/singler
# Add here related links, for example:
project_urls =
Documentation = https://pyscaffold.org/
Documentation = https://github.com/BiocPy/singler
# Source = https://github.com/pyscaffold/pyscaffold/
# Changelog = https://pyscaffold.org/en/latest/changelog.html
# Tracker = https://github.com/pyscaffold/pyscaffold/issues
Expand All @@ -41,7 +41,7 @@ package_dir =
=src

# Require a min/specific Python version (comma-separated conditions)
# python_requires = >=3.8
python_requires = >=3.8

# Add here dependencies of your project (line-separated), e.g. requests>=2.2,<3.0.
# Version specifiers like >=2.2,<3.0 avoid problems due to API changes in
Expand All @@ -50,10 +50,11 @@ package_dir =
install_requires =
importlib-metadata; python_version<"3.8"
mattress>=0.1.4
assorthead
assorthead>=0.0.11
delayedarray
biocframe
summarizedexperiment
biocframe>=0.5.0
summarizedexperiment>=0.4.0
biocutils

[options.packages.find]
where = src
Expand Down
6 changes: 4 additions & 2 deletions src/singler/_Markers.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
from typing import Any, Sequence

from numpy import array, int32, ndarray

from . import _cpphelpers as lib
from numpy import ndarray, int32, array
from typing import Sequence, Any


class _Markers:
Expand Down
40 changes: 9 additions & 31 deletions src/singler/_utils.py
Original file line number Diff line number Diff line change
@@ -1,37 +1,15 @@
from numpy import ndarray
from typing import Sequence, Tuple
from summarizedexperiment import SummarizedExperiment
from mattress import tatamize, TatamiNumericPointer
from delayedarray import DelayedArray


def _factorize(x: Sequence) -> Tuple[Sequence, ndarray]:
levels = []
mapping = {}
indices = []

for i, lev in enumerate(x):
if lev is None:
indices.append(None)
else:
if lev not in mapping:
mapping[lev] = len(levels)
levels.append(lev)
indices.append(mapping[lev])

return levels, indices

import biocutils as ut
import numpy as np
from delayedarray import DelayedArray
from mattress import TatamiNumericPointer, tatamize
from summarizedexperiment import SummarizedExperiment

def _match(x: Sequence, levels: Sequence) -> list:
    """Look up each element of ``x`` in a table built from ``levels``.

    Args:
        x: Sequence of values to look up.
        levels: Sequence of reference values, turned into a lookup table by
            ``_create_map``.

    Returns:
        List with one entry per element of ``x``: the value stored for that
        element in the lookup table, or ``None`` if the element is ``None``
        or absent from the table.
    """
    # NOTE(review): presumably the table maps each level to its position in
    # ``levels``; confirm against ``_create_map``.
    mapping = _create_map(levels)
    return [
        None if y is None or y not in mapping else mapping[y]
        for y in x
    ]
def _factorize(x: Sequence) -> Tuple[list, np.ndarray]:
    """Split a sequence into its levels and 32-bit integer codes.

    The encoding is delegated to ``biocutils.Factor.from_sequence``, called
    with ``sort_levels=False``.

    Args:
        x: Sequence of values to encode.

    Returns:
        Tuple of the factor's levels and its codes as an ``int32`` NumPy
        array.
    """
    encoded = ut.Factor.from_sequence(x, sort_levels=False)
    levels = encoded.levels
    codes = np.array(encoded.codes, dtype=np.int32)
    return levels, codes


def _create_map(x: Sequence) -> dict:
Expand Down Expand Up @@ -92,7 +70,7 @@ def _clean_matrix(x, features, assay_type, check_missing, num_threads):
if isinstance(x, TatamiNumericPointer):
# Assume the pointer was previously generated from _clean_matrix,
# so it's 2-dimensional, matches up with features and it's already
# clean of NaNs... so we no-op and just return it directly.
# clean of NaNs... so we no-op and just return it directly.
return x, features

if isinstance(x, SummarizedExperiment):
Expand Down
44 changes: 25 additions & 19 deletions src/singler/annotate_integrated.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
from typing import Union, Sequence, Optional, Any, Tuple
from typing import Any, Optional, Sequence, Tuple, Union

from biocframe import BiocFrame

from .fetch_reference import fetch_github_reference, realize_github_markers
from .build_single_reference import build_single_reference
from .classify_single_reference import classify_single_reference
from ._utils import _clean_matrix
from .annotate_single import _attach_markers, _resolve_reference
from .build_integrated_references import build_integrated_references
from .build_single_reference import build_single_reference
from .classify_integrated_references import classify_integrated_references
from .annotate_single import _resolve_reference, _attach_markers
from ._utils import _clean_matrix
from .classify_single_reference import classify_single_reference


def annotate_integrated(
Expand All @@ -27,20 +27,22 @@ def annotate_integrated(
classify_integrated_args: dict = {},
num_threads: int = 1,
) -> Tuple[list[BiocFrame], BiocFrame]:
"""Annotate a single-cell expression dataset based on the correlation
"""Annotate a single-cell expression dataset based on the correlation
of each cell to profiles in multiple labelled references, where the
annotation from each reference is then integrated across references.
Args:
test_data: A matrix-like object representing the test dataset, where rows are
test_data:
A matrix-like object representing the test dataset, where rows are
features and columns are samples (usually cells). Entries should be expression
values; only the ranking within each column will be used.
Alternatively, a
:py:class:`~summarizedexperiment.SummarizedExperiment.SummarizedExperiment`
containing such a matrix in one of its assays.
containing such a matrix in one of its assays.
test_features: Sequence of length equal to the number of rows in
test_features:
Sequence of length equal to the number of rows in
``test_data``, containing the feature identifier for each row.
ref_data_list:
Expand All @@ -50,7 +52,7 @@ def annotate_integrated(
are features and columns are samples. Entries should be expression values,
usually log-transformed (see comments for the ``ref`` argument in
:py:meth:`~singler.build_single_reference.build_single_reference`).
- A
- A
:py:class:`~summarizedexperiment.SummarizedExperiment.SummarizedExperiment`
object containing such a matrix in its assays.
- A string that can be passed as ``name`` to
Expand Down Expand Up @@ -129,18 +131,22 @@ def annotate_integrated(
if isinstance(ref_labels_list, str):
ref_labels_list = [ref_labels_list] * nrefs
elif nrefs != len(ref_labels_list):
raise ValueError("'ref_data_list' and 'ref_labels_list' must be the same length")
raise ValueError(
"'ref_data_list' and 'ref_labels_list' must be the same length"
)
if isinstance(ref_features_list, str):
ref_features_list = [ref_features_list] * nrefs
elif nrefs != len(ref_features_list):
raise ValueError("'ref_data_list' and 'ref_features_list' must be the same length")
raise ValueError(
"'ref_data_list' and 'ref_features_list' must be the same length"
)

test_ptr, test_features = _clean_matrix(
test_data,
test_features,
assay_type = test_assay_type,
check_missing = test_check_missing,
num_threads = num_threads,
assay_type=test_assay_type,
check_missing=test_check_missing,
num_threads=num_threads,
)

all_ref_data = []
Expand All @@ -163,9 +169,9 @@ def annotate_integrated(
curref_ptr, curref_features = _clean_matrix(
curref_mat,
curref_features,
assay_type = ref_assay_type,
check_missing = ref_check_missing,
num_threads = num_threads,
assay_type=ref_assay_type,
check_missing=ref_check_missing,
num_threads=num_threads,
)

bargs = _attach_markers(curref_markers, build_single_args)
Expand Down
25 changes: 15 additions & 10 deletions src/singler/annotate_single.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,16 @@
from typing import Union, Sequence, Optional, Any
from biocframe import BiocFrame
from copy import copy
from typing import Any, Optional, Sequence, Union

from biocframe import BiocFrame

from .fetch_reference import fetch_github_reference, realize_github_markers
from .build_single_reference import build_single_reference
from .classify_single_reference import classify_single_reference
from ._utils import _clean_matrix
from .fetch_reference import fetch_github_reference, realize_github_markers


def _resolve_reference(ref_data, ref_labels, ref_features, cache_dir, build_args, test_features_set):
def _resolve_reference(
ref_data, ref_labels, ref_features, cache_dir, build_args, test_features_set
):
if isinstance(ref_data, str):
ref = fetch_github_reference(ref_data, cache_dir=cache_dir)
ref_features = ref.row_data.column(ref_features)
Expand All @@ -27,7 +29,7 @@ def _resolve_reference(ref_data, ref_labels, ref_features, cache_dir, build_args
)

ref_data = ref.assay("ranks")
ref_labels=ref.col_data.column(ref_labels)
ref_labels = ref.col_data.column(ref_labels)
else:
ref_markers = None

Expand All @@ -54,11 +56,12 @@ def annotate_single(
classify_args: dict = {},
num_threads: int = 1,
) -> BiocFrame:
"""Annotate a single-cell expression dataset based on the correlation
"""Annotate a single-cell expression dataset based on the correlation
of each cell to profiles in a labelled reference.
Args:
test_data: A matrix-like object representing the test dataset, where rows are
test_data:
A matrix-like object representing the test dataset, where rows are
features and columns are samples (usually cells). Entries should be expression
values; only the ranking within each column will be used.
Expand All @@ -67,10 +70,12 @@ def annotate_single(
containing such a matrix in one of its assays. Non-default assay
types can be specified in ``classify_args``.
test_features: Sequence of length equal to the number of rows in
test_features:
Sequence of length equal to the number of rows in
``test_data``, containing the feature identifier for each row.
ref_data: A matrix-like object representing the reference dataset, where rows
ref_data:
A matrix-like object representing the reference dataset, where rows
are features and columns are samples. Entries should be expression values,
usually log-transformed (see comments for the ``ref`` argument in
:py:meth:`~singler.build_single_reference.build_single_reference`).
Expand Down
Loading

0 comments on commit 04350c9

Please sign in to comment.