From 37d441f6d32fe7eaf95598d68ad74ceacd645a38 Mon Sep 17 00:00:00 2001 From: LTLA Date: Wed, 11 Dec 2024 09:53:49 -0800 Subject: [PATCH 01/29] Begin hauling ass to update to the latest C++ libraries. --- lib/CMakeLists.txt | 28 ++ .../build_integrated_references.cpp | 0 .../lib => lib}/build_single_reference.cpp | 0 .../classify_integrated_references.cpp | 0 .../lib => lib}/classify_single_reference.cpp | 0 lib/src/classify_single.cpp | 47 +++ lib/src/def.h | 16 + lib/src/find_classic_markers.cpp | 59 ++++ lib/src/init.cpp | 11 + lib/src/train_single.cpp | 83 ++++++ lib/src/utils.h | 37 +++ pyproject.toml | 2 +- setup.cfg | 1 - setup.py | 79 +++-- src/singler/lib/Markers.cpp | 42 --- src/singler/lib/bindings.cpp | 282 ------------------ src/singler/lib/find_classic_markers.cpp | 28 -- src/singler/lib/utils.h | 11 - 18 files changed, 339 insertions(+), 387 deletions(-) create mode 100644 lib/CMakeLists.txt rename {src/singler/lib => lib}/build_integrated_references.cpp (100%) rename {src/singler/lib => lib}/build_single_reference.cpp (100%) rename {src/singler/lib => lib}/classify_integrated_references.cpp (100%) rename {src/singler/lib => lib}/classify_single_reference.cpp (100%) create mode 100644 lib/src/classify_single.cpp create mode 100644 lib/src/def.h create mode 100644 lib/src/find_classic_markers.cpp create mode 100644 lib/src/init.cpp create mode 100644 lib/src/train_single.cpp create mode 100644 lib/src/utils.h delete mode 100644 src/singler/lib/Markers.cpp delete mode 100644 src/singler/lib/bindings.cpp delete mode 100644 src/singler/lib/find_classic_markers.cpp delete mode 100644 src/singler/lib/utils.h diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt new file mode 100644 index 0000000..3817cfc --- /dev/null +++ b/lib/CMakeLists.txt @@ -0,0 +1,28 @@ +cmake_minimum_required(VERSION 3.24) + +project(singler + VERSION 1.0.0 + DESCRIPTION "Building the singler shared library" + LANGUAGES CXX) + +# Defining the targets. 
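+# (pybind11 is located via the pybind11_DIR hint passed in from setup.py, and
+# ASSORTHEAD_INCLUDE_DIR is expected to point at assorthead's bundled headers.)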
+find_package(pybind11 CONFIG)
+
+# pybind11 method:
+pybind11_add_module(singler
+    src/find_classic_markers.cpp
+    src/train_single.cpp
+    src/classify_single.cpp
+    src/init.cpp
+)
+
+target_include_directories(singler PRIVATE "${ASSORTHEAD_INCLUDE_DIR}")
+
+set_property(TARGET singler PROPERTY CXX_STANDARD 17)
+
+target_link_libraries(singler PRIVATE pybind11::pybind11)
+
+set_target_properties(singler PROPERTIES
+    OUTPUT_NAME lib_singler
+    PREFIX ""
+)
diff --git a/src/singler/lib/build_integrated_references.cpp b/lib/build_integrated_references.cpp
similarity index 100%
rename from src/singler/lib/build_integrated_references.cpp
rename to lib/build_integrated_references.cpp
diff --git a/src/singler/lib/build_single_reference.cpp b/lib/build_single_reference.cpp
similarity index 100%
rename from src/singler/lib/build_single_reference.cpp
rename to lib/build_single_reference.cpp
diff --git a/src/singler/lib/classify_integrated_references.cpp b/lib/classify_integrated_references.cpp
similarity index 100%
rename from src/singler/lib/classify_integrated_references.cpp
rename to lib/classify_integrated_references.cpp
diff --git a/src/singler/lib/classify_single_reference.cpp b/lib/classify_single_reference.cpp
similarity index 100%
rename from src/singler/lib/classify_single_reference.cpp
rename to lib/classify_single_reference.cpp
diff --git a/lib/src/classify_single.cpp b/lib/src/classify_single.cpp
new file mode 100644
index 0000000..090eed7
--- /dev/null
+++ b/lib/src/classify_single.cpp
@@ -0,0 +1,47 @@
+#include "def.h"
+#include "utils.h"
+
+#include "singlepp/singlepp.hpp"
+#include "tatami/tatami.hpp"
+#include "pybind11/pybind11.h"
+
+#include <cstdint>
+#include <memory>
+#include <vector>
+
+pybind11::tuple classify_single(const MatrixPointer& test, const TrainedSingleIntersectPointer& built, double quantile, bool use_fine_tune, double fine_tune_threshold, int nthreads) {
+    // Setting up outputs.
+    size_t ncells = test->ncol();
+    pybind11::array_t<uint32_t> best(ncells);
+    pybind11::array_t<double> delta(ncells);
+
+    singlepp::ClassifySingleBuffers<uint32_t, double> buffers;
+    buffers.best = static_cast<uint32_t*>(best.request().ptr);
+    buffers.delta = static_cast<double*>(delta.request().ptr);
+
+    size_t nlabels = built->num_labels();
+    pybind11::array_t<double> scores({ ncells, nlabels });
+    buffers.scores.resize(nlabels);
+    auto sptr = static_cast<double*>(scores.request().ptr);
+    for (size_t l = 0; l < nlabels; ++l) {
+        buffers.scores[l] = sptr + ncells * l; // already size_t, no need to cast.
+    }
+
+    // Running the analysis.
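+    // (classify_single_intersect() writes its results directly into 'buffers':
+    // the best label per cell, the per-label score matrix, and the delta
+    // between the best and next-best scores for each cell.)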
+    singlepp::ClassifySingleOptions<double> opts;
+    opts.num_threads = nthreads;
+    opts.quantile = quantile;
+    opts.fine_tune = use_fine_tune;
+    opts.fine_tune_threshold = fine_tune_threshold;
+    singlepp::classify_single_intersect(*test, *built, buffers, opts);
+
+    pybind11::tuple output(3);
+    output[0] = best;
+    output[1] = scores;
+    output[2] = delta;
+    return output;
+}
+
+void init_classify_single(pybind11::module& m) {
+    m.def("classify_single", &classify_single);
+}
diff --git a/lib/src/def.h b/lib/src/def.h
new file mode 100644
index 0000000..773dfdd
--- /dev/null
+++ b/lib/src/def.h
@@ -0,0 +1,16 @@
+#ifndef DEF_H
+#define DEF_H
+
+#include <cstdint>
+#include <memory>
+#include "tatami/tatami.hpp"
+#include "singlepp/singlepp.hpp"
+
+typedef uint32_t MatrixIndex;
+typedef double MatrixValue;
+typedef std::shared_ptr<tatami::Matrix<MatrixValue, MatrixIndex> > MatrixPointer;
+
+typedef std::shared_ptr<singlepp::TrainedSingleIntersect<MatrixIndex, MatrixValue> > TrainedSingleIntersectPointer;
+typedef std::shared_ptr<singlepp::TrainedIntegrated<MatrixIndex> > TrainedIntegratedPointer;
+
+#endif
diff --git a/lib/src/find_classic_markers.cpp b/lib/src/find_classic_markers.cpp
new file mode 100644
index 0000000..5ef6e86
--- /dev/null
+++ b/lib/src/find_classic_markers.cpp
@@ -0,0 +1,59 @@
+#include "def.h"
+#include "utils.h"
+
+#include "singlepp/singlepp.hpp"
+#include "tatami/tatami.hpp"
+#include "pybind11/pybind11.h"
+
+#include <cstdint>
+#include <vector>
+#include <stdexcept>
+
+pybind11::list find_classic_markers(uint32_t num_labels, uint32_t num_genes, const pybind11::list& reference, const pybind11::list& labels, int de_n, int nthreads) {
+    size_t num_ref = reference.size();
+    if (num_ref != static_cast<size_t>(labels.size())) {
+        throw std::runtime_error("'ref' and 'labels' should have the same length");
+    }
+
+    std::vector<const tatami::Matrix<MatrixValue, MatrixIndex>*> ref_ptrs;
+    ref_ptrs.reserve(num_ref);
+    std::vector<const uint32_t*> lab_ptrs;
+    lab_ptrs.reserve(num_ref);
+
+    for (size_t r = 0; r < num_ref; ++r) {
+        auto ptr = reference[r].cast<MatrixPointer>().get();
+        ref_ptrs.emplace_back(ptr);
+        if (ptr->nrow() != num_genes) {
+            throw std::runtime_error("each entry of 'ref' should have number of rows equal to 'ngenes'");
+        }
+
+        // No copy, so it's fine to create a pointer and discard the casted array.
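+        // (The raw pointer stays valid because the underlying NumPy buffer is
+        // owned by the 'labels' list for the duration of this call.)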
+        auto lab = labels[r].cast<pybind11::array>();
+        if (lab.size() != static_cast<size_t>(ptr->ncol())) {
+            throw std::runtime_error("number of columns in each 'ref' should equal the length of the corresponding entry of 'labels'");
+        }
+
+        lab_ptrs.push_back(check_numpy_array<uint32_t>(lab));
+    }
+
+    singlepp::ChooseClassicMarkersOptions opts;
+    opts.number = de_n;
+    opts.num_threads = nthreads;
+    auto store = singlepp::choose_classic_markers(ref_ptrs, lab_ptrs, opts);
+
+    pybind11::list output(num_labels);
+    for (uint32_t l = 0; l < num_labels; ++l) {
+        const auto& src = store[l];
+        pybind11::list dest(num_labels);
+        for (uint32_t l2 = 0; l2 < num_labels; ++l2) {
+            dest[l2] = pybind11::array_t<MatrixIndex>(src[l2].size(), src[l2].data());
+        }
+        output[l] = dest;
+    }
+
+    return output;
+}
+
+void init_find_classic_markers(pybind11::module& m) {
+    m.def("find_classic_markers", &find_classic_markers);
+}
diff --git a/lib/src/init.cpp b/lib/src/init.cpp
new file mode 100644
index 0000000..2aefc65
--- /dev/null
+++ b/lib/src/init.cpp
@@ -0,0 +1,11 @@
+#include "pybind11/pybind11.h"
+
+void init_find_classic_markers(pybind11::module&);
+void init_train_single(pybind11::module&);
+void init_classify_single(pybind11::module&);
+
+PYBIND11_MODULE(lib_singler, m) {
+    init_find_classic_markers(m);
+    init_train_single(m);
+    init_classify_single(m);
+}
diff --git a/lib/src/train_single.cpp b/lib/src/train_single.cpp
new file mode 100644
index 0000000..ea752df
--- /dev/null
+++ b/lib/src/train_single.cpp
@@ -0,0 +1,83 @@
+#include "def.h"
+#include "utils.h"
+
+#include "singlepp/singlepp.hpp"
+#include "tatami/tatami.hpp"
+#include "knncolle/knncolle.hpp"
+#include "pybind11/pybind11.h"
+
+#include <cstdint>
+#include <memory>
+
+TrainedSingleIntersectPointer train_single(
+    const pybind11::array& test_features,
+    const MatrixPointer& ref,
+    const pybind11::array& ref_features,
+    const pybind11::array& labels,
+    const pybind11::list& markers,
+    const std::shared_ptr<knncolle::Builder<knncolle::SimpleMatrix<uint32_t, uint32_t, double>, double> >& builder,
+    int nthreads)
+{
+    singlepp::TrainSingleOptions<uint32_t, double> opts;
+    opts.num_threads = nthreads;
+    opts.top = -1; // Use all available markers; assume subsetting was applied on the R side.
+
+    opts.trainer = builder; // std::shared_ptr(std::shared_ptr{}, bptr.get()); // make a no-op shared pointer.
+
+    auto NR = ref->nrow();
+    auto NC = ref->ncol();
+    if (static_cast<MatrixIndex>(labels.size()) != NC) {
+        throw std::runtime_error("length of 'labels' should be equal to the number of columns of 'ref'");
+    }
+
+    // Setting up the markers. We assume that these are already 0-indexed on the R side.
+    size_t ngroups = markers.size();
+    singlepp::Markers<MatrixIndex> markers2(ngroups);
+    for (size_t m = 0; m < ngroups; ++m) {
+        auto curmarkers = markers[m].cast<pybind11::list>();
+        auto& curmarkers2 = markers2[m];
+        size_t inner_ngroups = curmarkers.size();
+        curmarkers2.resize(inner_ngroups);
+
+        for (size_t n = 0; n < inner_ngroups; ++n) {
+            auto seq = curmarkers[n].cast<pybind11::array>();
+            auto sptr = check_numpy_array<uint32_t>(seq);
+            auto& seq2 = curmarkers2[n];
+            seq2.insert(seq2.end(), sptr, sptr + seq.size());
+        }
+    }
+
+    // Preparing the features.
+    size_t ninter = test_features.size();
+    if (ninter != static_cast<size_t>(ref_features.size())) {
+        throw std::runtime_error("length of 'test_features' and 'ref_features' should be the same");
+    }
+    auto tfptr = check_numpy_array<uint32_t>(test_features);
+    auto rfptr = check_numpy_array<uint32_t>(ref_features);
+    singlepp::Intersection<MatrixIndex> inter;
+    inter.reserve(ninter);
+    for (size_t i = 0; i < ninter; ++i) {
+        inter.emplace_back(tfptr[i], rfptr[i]);
+    }
+
+    // Building the indices.
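+    // (train_single_intersect() only considers the feature pairs in 'inter',
+    // so the test and reference matrices do not need the same row order.)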
+    auto built = singlepp::train_single_intersect(
+        inter,
+        *ref,
+        check_numpy_array<uint32_t>(labels),
+        std::move(markers2),
+        opts
+    );
+
+    return TrainedSingleIntersectPointer(new decltype(built)(std::move(built)));
+}
+
+pybind11::array_t<MatrixIndex> get_ref_subset(const TrainedSingleIntersectPointer& ptr) {
+    const auto& rsub = ptr->get_ref_subset();
+    return pybind11::array_t<MatrixIndex>(rsub.size(), rsub.data());
+}
+
+void init_train_single(pybind11::module& m) {
+    m.def("train_single", &train_single);
+    m.def("get_ref_subset", &get_ref_subset);
+}
diff --git a/lib/src/utils.h b/lib/src/utils.h
new file mode 100644
index 0000000..621e430
--- /dev/null
+++ b/lib/src/utils.h
@@ -0,0 +1,37 @@
+#ifndef UTILS_H
+#define UTILS_H
+
+#include "pybind11/pybind11.h"
+#include "pybind11/numpy.h"
+
+#include <stdexcept>
+
+// As a general rule, we avoid using pybind11::array_t as function arguments,
+// because pybind11 might auto-cast and create an allocation that we then
+// create a view on; on function exit, our view would be a dangling reference
+// once the allocation gets destructed. So, we accept instead a pybind11::array
+// and make sure it has our desired type and contiguous storage.
+
+template<typename Expected_>
+const Expected_* get_numpy_array_data(const pybind11::array& x) {
+    return static_cast<const Expected_*>(x.request().ptr);
+}
+
+template<typename Expected_>
+const Expected_* check_contiguous_numpy_array(const pybind11::array& x) {
+    auto flag = x.flags();
+    if (!(flag & pybind11::array::c_style) && !(flag & pybind11::array::f_style)) {
+        throw std::runtime_error("NumPy array contents should be contiguous");
+    }
+    return get_numpy_array_data<Expected_>(x);
+}
+
+template<typename Expected_>
+const Expected_* check_numpy_array(const pybind11::array& x) {
+    if (!x.dtype().is(pybind11::dtype::of<Expected_>())) {
+        throw std::runtime_error("unexpected dtype for NumPy array");
+    }
+    return check_contiguous_numpy_array<Expected_>(x);
+}
+
+#endif
diff --git a/pyproject.toml b/pyproject.toml
index badefeb..b584fea 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [build-system]
 # AVOID CHANGING REQUIRES: IT WILL BE UPDATED BY PYSCAFFOLD!
-requires = ["setuptools>=46.1.0", "setuptools_scm[toml]>=5", "mattress>=0.1.4", "assorthead"]
+requires = ["setuptools>=46.1.0", "setuptools_scm[toml]>=5", "mattress>=0.2.0", "pybind11", "assorthead>=0.1.0"]
 build-backend = "setuptools.build_meta"
 
 [tool.setuptools_scm]
diff --git a/setup.cfg b/setup.cfg
index 796ce89..9b7b350 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -50,7 +50,6 @@ python_requires = >=3.8
 install_requires =
     importlib-metadata; python_version<"3.8"
     mattress>=0.1.4
-    assorthead>=0.0.11
    delayedarray
    biocframe>=0.5.0
    summarizedexperiment>=0.4.0
diff --git a/setup.py b/setup.py
index 93465ab..14f0176 100644
--- a/setup.py
+++ b/setup.py
@@ -1,36 +1,71 @@
-"""Setup file for singler. Use setup.cfg to configure your project.
+"""Setup file for mattress. Use setup.cfg to configure your project.
 
 This file was generated with PyScaffold 4.5. PyScaffold helps you to put up the
 scaffold of your new Python project. Learn more under: https://pyscaffold.org/
 """
 from setuptools import setup, Extension
-import assorthead
-import mattress
+from setuptools.command.build_ext import build_ext as build_ext_orig
+from glob import glob
+import pathlib
+import os
+import shutil
+import sys
+import pybind11
+
+## Adapted from https://stackoverflow.com/questions/42585210/extending-setuptools-extension-to-use-cmake-in-setup-py.
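+## The gist: declare a source-less Extension so that setuptools still builds a
+## platform-specific wheel, then override build_ext to hand the actual
+## compilation of lib_singler over to CMake.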
+class CMakeExtension(Extension): + def __init__(self, name): + super().__init__(name, sources=[]) + +class build_ext(build_ext_orig): + def run(self): + for ext in self.extensions: + self.build_cmake(ext) + + def build_cmake(self, ext): + build_temp = pathlib.Path(self.build_temp) + build_lib = pathlib.Path(self.build_lib) + outpath = os.path.join(build_lib.absolute(), ext.name) + + if not os.path.exists(build_temp): + import assorthead + cmd = [ + "cmake", + "-S", "lib", + "-B", build_temp, + "-Dpybind11_DIR=" + os.path.join(os.path.dirname(pybind11.__file__), "share", "cmake", "pybind11"), + "-DPYTHON_EXECUTABLE=" + sys.executable, + "-DASSORTHEAD_INCLUDE_DIR=" + assorthead.includes() + ] + if os.name != "nt": + cmd.append("-DCMAKE_BUILD_TYPE=Release") + cmd.append("-DCMAKE_LIBRARY_OUTPUT_DIRECTORY=" + outpath) + + if "MORE_CMAKE_OPTIONS" in os.environ: + cmd += os.environ["MORE_CMAKE_OPTIONS"].split() + self.spawn(cmd) + + if not self.dry_run: + cmd = ['cmake', '--build', build_temp] + if os.name == "nt": + cmd += ["--config", "Release"] + self.spawn(cmd) + if os.name == "nt": + # Gave up trying to get MSVC to respect the output directory. + # Delvewheel also needs it to have a 'pyd' suffix... whatever. + shutil.copyfile(os.path.join(build_temp, "Release", "_core.dll"), os.path.join(outpath, "_core.pyd")) + if __name__ == "__main__": + import os try: setup( use_scm_version={"version_scheme": "no-guess-dev"}, - ext_modules=[ - Extension( - "singler._core", - [ - "src/singler/lib/Markers.cpp", - "src/singler/lib/bindings.cpp", - "src/singler/lib/find_classic_markers.cpp", - "src/singler/lib/build_single_reference.cpp", - "src/singler/lib/build_integrated_references.cpp", - "src/singler/lib/classify_single_reference.cpp", - "src/singler/lib/classify_integrated_references.cpp", - ], - include_dirs=[assorthead.includes()] + mattress.includes(), - language="c++", - extra_compile_args=[ - "-std=c++17", - ], - ) - ], + ext_modules=[CMakeExtension("mattress")], + cmdclass={ + 'build_ext': build_ext + } ) except: # noqa print( diff --git a/src/singler/lib/Markers.cpp b/src/singler/lib/Markers.cpp deleted file mode 100644 index 96cba3d..0000000 --- a/src/singler/lib/Markers.cpp +++ /dev/null @@ -1,42 +0,0 @@ -#include "utils.h" - -#include - -//[[export]] -void * create_markers(int32_t nlabels) { - auto ptr = new singlepp::Markers(nlabels); - auto& mrk = *ptr; - for (int32_t l = 0; l < nlabels; ++l) { - mrk[l].resize(nlabels); - } - return ptr; -} - -//[[export]] -void free_markers(void * ptr) { - delete reinterpret_cast(ptr); -} - -//[[export]] -int32_t get_nlabels_from_markers(void* ptr) { - return reinterpret_cast(ptr)->size(); -} - -//[[export]] -int32_t get_nmarkers_for_pair(void * ptr, int32_t label1, int32_t label2) { - const auto& current = (*reinterpret_cast(ptr))[label1][label2]; - return current.size(); -} - -//[[export]] -void get_markers_for_pair(void * ptr, int32_t label1, int32_t label2, int32_t* buffer /** numpy */) { - const auto& current = (*reinterpret_cast(ptr))[label1][label2]; - std::copy(current.begin(), current.end(), buffer); -} - -//[[export]] -void set_markers_for_pair(void* ptr, int32_t label1, int32_t label2, int32_t n, const int32_t* values /** numpy */) { - auto& current = (*reinterpret_cast(ptr))[label1][label2]; - current.clear(); - current.insert(current.end(), values, values + n); -} diff --git a/src/singler/lib/bindings.cpp b/src/singler/lib/bindings.cpp deleted file mode 100644 index 8674984..0000000 --- a/src/singler/lib/bindings.cpp +++ /dev/null @@ -1,282 
+0,0 @@ -/* DO NOT MODIFY: this is automatically generated by the cpptypes */ - -#include -#include -#include - -#ifdef _WIN32 -#define PYAPI __declspec(dllexport) -#else -#define PYAPI -#endif - -static char* copy_error_message(const char* original) { - auto n = std::strlen(original); - auto copy = new char[n + 1]; - std::strcpy(copy, original); - return copy; -} - -void* build_integrated_references(int32_t, const int32_t*, int32_t, const uintptr_t*, const uintptr_t*, const uintptr_t*, const uintptr_t*, int32_t); - -void* build_single_reference(void*, const int32_t*, void*, uint8_t, int32_t); - -void classify_integrated_references(void*, const uintptr_t*, void*, double, uintptr_t*, int32_t*, double*, int32_t); - -void classify_single_reference(void*, const int32_t*, void*, double, uint8_t, double, int32_t, const uintptr_t*, int32_t*, double*); - -void* create_markers(int32_t); - -void* find_classic_markers(int32_t, const uintptr_t*, const uintptr_t*, int32_t, int32_t); - -void free_integrated_references(void*); - -void free_markers(void*); - -void free_single_reference(void*); - -void get_markers_for_pair(void*, int32_t, int32_t, int32_t*); - -int32_t get_nlabels_from_markers(void*); - -int32_t get_nlabels_from_single_reference(void*); - -int32_t get_nmarkers_for_pair(void*, int32_t, int32_t); - -int32_t get_nsubset_from_single_reference(void*); - -void get_subset_from_single_reference(void*, int32_t*); - -int32_t number_of_classic_markers(int32_t); - -void set_markers_for_pair(void*, int32_t, int32_t, int32_t, const int32_t*); - -extern "C" { - -PYAPI void free_error_message(char** msg) { - delete [] *msg; -} - -PYAPI void* py_build_integrated_references(int32_t test_nrow, const int32_t* test_features, int32_t nrefs, const uintptr_t* references, const uintptr_t* labels, const uintptr_t* ref_ids, const uintptr_t* prebuilt, int32_t nthreads, int32_t* errcode, char** errmsg) { - void* output = NULL; - try { - output = build_integrated_references(test_nrow, test_features, nrefs, references, labels, ref_ids, prebuilt, nthreads); - } catch(std::exception& e) { - *errcode = 1; - *errmsg = copy_error_message(e.what()); - } catch(...) { - *errcode = 1; - *errmsg = copy_error_message("unknown C++ exception"); - } - return output; -} - -PYAPI void* py_build_single_reference(void* ref, const int32_t* labels, void* markers, uint8_t approximate, int32_t nthreads, int32_t* errcode, char** errmsg) { - void* output = NULL; - try { - output = build_single_reference(ref, labels, markers, approximate, nthreads); - } catch(std::exception& e) { - *errcode = 1; - *errmsg = copy_error_message(e.what()); - } catch(...) { - *errcode = 1; - *errmsg = copy_error_message("unknown C++ exception"); - } - return output; -} - -PYAPI void py_classify_integrated_references(void* mat, const uintptr_t* assigned, void* prebuilt, double quantile, uintptr_t* scores, int32_t* best, double* delta, int32_t nthreads, int32_t* errcode, char** errmsg) { - try { - classify_integrated_references(mat, assigned, prebuilt, quantile, scores, best, delta, nthreads); - } catch(std::exception& e) { - *errcode = 1; - *errmsg = copy_error_message(e.what()); - } catch(...) 
{ - *errcode = 1; - *errmsg = copy_error_message("unknown C++ exception"); - } -} - -PYAPI void py_classify_single_reference(void* mat, const int32_t* subset, void* prebuilt, double quantile, uint8_t use_fine_tune, double fine_tune_threshold, int32_t nthreads, const uintptr_t* scores, int32_t* best, double* delta, int32_t* errcode, char** errmsg) { - try { - classify_single_reference(mat, subset, prebuilt, quantile, use_fine_tune, fine_tune_threshold, nthreads, scores, best, delta); - } catch(std::exception& e) { - *errcode = 1; - *errmsg = copy_error_message(e.what()); - } catch(...) { - *errcode = 1; - *errmsg = copy_error_message("unknown C++ exception"); - } -} - -PYAPI void* py_create_markers(int32_t nlabels, int32_t* errcode, char** errmsg) { - void* output = NULL; - try { - output = create_markers(nlabels); - } catch(std::exception& e) { - *errcode = 1; - *errmsg = copy_error_message(e.what()); - } catch(...) { - *errcode = 1; - *errmsg = copy_error_message("unknown C++ exception"); - } - return output; -} - -PYAPI void* py_find_classic_markers(int32_t nref, const uintptr_t* labels, const uintptr_t* ref, int32_t de_n, int32_t nthreads, int32_t* errcode, char** errmsg) { - void* output = NULL; - try { - output = find_classic_markers(nref, labels, ref, de_n, nthreads); - } catch(std::exception& e) { - *errcode = 1; - *errmsg = copy_error_message(e.what()); - } catch(...) { - *errcode = 1; - *errmsg = copy_error_message("unknown C++ exception"); - } - return output; -} - -PYAPI void py_free_integrated_references(void* ptr, int32_t* errcode, char** errmsg) { - try { - free_integrated_references(ptr); - } catch(std::exception& e) { - *errcode = 1; - *errmsg = copy_error_message(e.what()); - } catch(...) { - *errcode = 1; - *errmsg = copy_error_message("unknown C++ exception"); - } -} - -PYAPI void py_free_markers(void* ptr, int32_t* errcode, char** errmsg) { - try { - free_markers(ptr); - } catch(std::exception& e) { - *errcode = 1; - *errmsg = copy_error_message(e.what()); - } catch(...) { - *errcode = 1; - *errmsg = copy_error_message("unknown C++ exception"); - } -} - -PYAPI void py_free_single_reference(void* ptr, int32_t* errcode, char** errmsg) { - try { - free_single_reference(ptr); - } catch(std::exception& e) { - *errcode = 1; - *errmsg = copy_error_message(e.what()); - } catch(...) { - *errcode = 1; - *errmsg = copy_error_message("unknown C++ exception"); - } -} - -PYAPI void py_get_markers_for_pair(void* ptr, int32_t label1, int32_t label2, int32_t* buffer, int32_t* errcode, char** errmsg) { - try { - get_markers_for_pair(ptr, label1, label2, buffer); - } catch(std::exception& e) { - *errcode = 1; - *errmsg = copy_error_message(e.what()); - } catch(...) { - *errcode = 1; - *errmsg = copy_error_message("unknown C++ exception"); - } -} - -PYAPI int32_t py_get_nlabels_from_markers(void* ptr, int32_t* errcode, char** errmsg) { - int32_t output = 0; - try { - output = get_nlabels_from_markers(ptr); - } catch(std::exception& e) { - *errcode = 1; - *errmsg = copy_error_message(e.what()); - } catch(...) { - *errcode = 1; - *errmsg = copy_error_message("unknown C++ exception"); - } - return output; -} - -PYAPI int32_t py_get_nlabels_from_single_reference(void* ptr, int32_t* errcode, char** errmsg) { - int32_t output = 0; - try { - output = get_nlabels_from_single_reference(ptr); - } catch(std::exception& e) { - *errcode = 1; - *errmsg = copy_error_message(e.what()); - } catch(...) 
{ - *errcode = 1; - *errmsg = copy_error_message("unknown C++ exception"); - } - return output; -} - -PYAPI int32_t py_get_nmarkers_for_pair(void* ptr, int32_t label1, int32_t label2, int32_t* errcode, char** errmsg) { - int32_t output = 0; - try { - output = get_nmarkers_for_pair(ptr, label1, label2); - } catch(std::exception& e) { - *errcode = 1; - *errmsg = copy_error_message(e.what()); - } catch(...) { - *errcode = 1; - *errmsg = copy_error_message("unknown C++ exception"); - } - return output; -} - -PYAPI int32_t py_get_nsubset_from_single_reference(void* ptr, int32_t* errcode, char** errmsg) { - int32_t output = 0; - try { - output = get_nsubset_from_single_reference(ptr); - } catch(std::exception& e) { - *errcode = 1; - *errmsg = copy_error_message(e.what()); - } catch(...) { - *errcode = 1; - *errmsg = copy_error_message("unknown C++ exception"); - } - return output; -} - -PYAPI void py_get_subset_from_single_reference(void* ptr, int32_t* buffer, int32_t* errcode, char** errmsg) { - try { - get_subset_from_single_reference(ptr, buffer); - } catch(std::exception& e) { - *errcode = 1; - *errmsg = copy_error_message(e.what()); - } catch(...) { - *errcode = 1; - *errmsg = copy_error_message("unknown C++ exception"); - } -} - -PYAPI int32_t py_number_of_classic_markers(int32_t num_labels, int32_t* errcode, char** errmsg) { - int32_t output = 0; - try { - output = number_of_classic_markers(num_labels); - } catch(std::exception& e) { - *errcode = 1; - *errmsg = copy_error_message(e.what()); - } catch(...) { - *errcode = 1; - *errmsg = copy_error_message("unknown C++ exception"); - } - return output; -} - -PYAPI void py_set_markers_for_pair(void* ptr, int32_t label1, int32_t label2, int32_t n, const int32_t* values, int32_t* errcode, char** errmsg) { - try { - set_markers_for_pair(ptr, label1, label2, n, values); - } catch(std::exception& e) { - *errcode = 1; - *errmsg = copy_error_message(e.what()); - } catch(...) { - *errcode = 1; - *errmsg = copy_error_message("unknown C++ exception"); - } -} - -} diff --git a/src/singler/lib/find_classic_markers.cpp b/src/singler/lib/find_classic_markers.cpp deleted file mode 100644 index 6e5bec7..0000000 --- a/src/singler/lib/find_classic_markers.cpp +++ /dev/null @@ -1,28 +0,0 @@ -#include "utils.h" // must be before all other includes. - -#include -#include - -//[[export]] -void* find_classic_markers(int32_t nref, const uintptr_t* labels /** void_p */, const uintptr_t* ref /** void_p */, int32_t de_n, int32_t nthreads) { - std::vector*> ref_ptrs; - ref_ptrs.reserve(nref); - std::vector lab_ptrs; - lab_ptrs.reserve(nref); - - for (int32_t r = 0; r < nref; ++r) { - const auto& ptr = reinterpret_cast(ref[r])->ptr; - ref_ptrs.push_back(ptr.get()); - lab_ptrs.push_back(reinterpret_cast(labels[r])); - } - - singlepp::ChooseClassicMarkers mrk; - mrk.set_number(de_n).set_num_threads(nthreads); - auto store = mrk.run(ref_ptrs, lab_ptrs); - return new singlepp::Markers(std::move(store)); -} - -//[[export]] -int32_t number_of_classic_markers(int32_t num_labels) { - return singlepp::ChooseClassicMarkers::number_of_markers(num_labels); -} diff --git a/src/singler/lib/utils.h b/src/singler/lib/utils.h deleted file mode 100644 index 603d6cd..0000000 --- a/src/singler/lib/utils.h +++ /dev/null @@ -1,11 +0,0 @@ -#ifndef UTILS_H -#define UTILS_H - -#include "Mattress.h" - -// must be before singlepp includes. 
-#define SINGLEPP_CUSTOM_PARALLEL tatami::parallelize - -#include "singlepp/singlepp.hpp" - -#endif From 8b24ff56257de93468626ac7f3ac2223647950fa Mon Sep 17 00:00:00 2001 From: LTLA Date: Wed, 11 Dec 2024 10:26:42 -0800 Subject: [PATCH 02/29] Continue the grind of getting the damn thing to install. --- lib/src/find_classic_markers.cpp | 5 + setup.py | 2 +- src/singler/_Markers.py | 78 -------- src/singler/__init__.py | 6 - src/singler/_cpphelpers.py | 252 -------------------------- src/singler/_utils.py | 26 +-- src/singler/build_single_reference.py | 1 - src/singler/get_classic_markers.py | 61 ++++--- 8 files changed, 50 insertions(+), 381 deletions(-) delete mode 100644 src/singler/_Markers.py delete mode 100644 src/singler/_cpphelpers.py diff --git a/lib/src/find_classic_markers.cpp b/lib/src/find_classic_markers.cpp index 5ef6e86..2f2888d 100644 --- a/lib/src/find_classic_markers.cpp +++ b/lib/src/find_classic_markers.cpp @@ -54,6 +54,11 @@ pybind11::list find_classic_markers(uint32_t num_labels, uint32_t num_genes, con return output; } +uint32_t number_of_classic_markers(uint32_t num_labels) { + return singlepp::number_of_classic_markers(num_labels); +} + void init_find_classic_markers(pybind11::module& m) { m.def("find_classic_markers", &find_classic_markers); + m.def("number_of_classic_markers", &number_of_classic_markers); } diff --git a/setup.py b/setup.py index 14f0176..0d70471 100644 --- a/setup.py +++ b/setup.py @@ -62,7 +62,7 @@ def build_cmake(self, ext): try: setup( use_scm_version={"version_scheme": "no-guess-dev"}, - ext_modules=[CMakeExtension("mattress")], + ext_modules=[CMakeExtension("singler")], cmdclass={ 'build_ext': build_ext } diff --git a/src/singler/_Markers.py b/src/singler/_Markers.py deleted file mode 100644 index 9259f7d..0000000 --- a/src/singler/_Markers.py +++ /dev/null @@ -1,78 +0,0 @@ -from typing import Any, Sequence - -from numpy import array, int32, ndarray - -from . 
import _cpphelpers as lib - - -class _Markers: - def __init__(self, ptr): - self._ptr = ptr - self._num_labels = lib.get_nlabels_from_markers(self._ptr) - - def __del__(self): - lib.free_markers(self._ptr) - - def num_labels(self) -> int: - return self._num_labels - - def _check(self, i: int): - if i < 0 or i >= self._num_labels: - raise IndexError("label " + str(i) + " out of range for marker list") - - def get(self, first: int, second: int): - self._check(first) - self._check(second) - n = lib.get_nmarkers_for_pair(self._ptr, first, second) - output = ndarray(n, dtype=int32) - lib.get_markers_for_pair(self._ptr, first, second, output) - return output - - def set(self, first: int, second: int, markers: Sequence): - self._check(first) - self._check(second) - out = array(markers, dtype=int32, copy=False) - lib.set_markers_for_pair(self._ptr, first, second, len(out), out) - - def to_dict( - self, labels: Sequence, features: Sequence - ) -> dict[Any, dict[Any, Sequence]]: - if len(labels) != self._num_labels: - raise ValueError( - "length of 'labels' should be equal to the number of labels" - ) - - markers = {} - for i, x in enumerate(labels): - current = {} - for j, y in enumerate(labels): - current[y] = [features[k] for k in self.get(i, j)] - markers[x] = current - - return markers - - @classmethod - def from_dict( - cls, - markers: dict[Any, dict[Any, Sequence]], - labels: Sequence, - features: Sequence, - ): - fmapping = {} - for i, x in enumerate(features): - fmapping[x] = i - - instance = cls(lib.create_markers(len(labels))) - - for outer_i, outer_k in enumerate(labels): - for inner_i, inner_k in enumerate(labels): - current = markers[outer_k][inner_k] - - mapped = [] - for x in current: - if x in fmapping: # just skipping features that aren't present. - mapped.append(fmapping[x]) - - instance.set(outer_i, inner_i, mapped) - - return instance diff --git a/src/singler/__init__.py b/src/singler/__init__.py index 8c512e7..80f0fca 100644 --- a/src/singler/__init__.py +++ b/src/singler/__init__.py @@ -16,10 +16,4 @@ del version, PackageNotFoundError -from .annotate_integrated import annotate_integrated -from .annotate_single import annotate_single -from .build_integrated_references import IntegratedReferences, build_integrated_references -from .build_single_reference import build_single_reference -from .classify_integrated_references import classify_integrated_references -from .classify_single_reference import classify_single_reference from .get_classic_markers import get_classic_markers, number_of_classic_markers diff --git a/src/singler/_cpphelpers.py b/src/singler/_cpphelpers.py deleted file mode 100644 index 15f5b2b..0000000 --- a/src/singler/_cpphelpers.py +++ /dev/null @@ -1,252 +0,0 @@ -# DO NOT MODIFY: this is automatically generated by the cpptypes - -import os -import ctypes as ct -import numpy as np - -def _catch_errors(f): - def wrapper(*args): - errcode = ct.c_int32(0) - errmsg = ct.c_char_p(0) - output = f(*args, ct.byref(errcode), ct.byref(errmsg)) - if errcode.value != 0: - msg = errmsg.value.decode('ascii') - lib.free_error_message(errmsg) - raise RuntimeError(msg) - return output - return wrapper - -# TODO: surely there's a better way than whatever this is. 
-dirname = os.path.dirname(os.path.abspath(__file__)) -contents = os.listdir(dirname) -lib = None -for x in contents: - if x.startswith('_core') and not x.endswith("py"): - lib = ct.CDLL(os.path.join(dirname, x)) - break - -if lib is None: - raise ImportError("failed to find the _core.* module") - -lib.free_error_message.argtypes = [ ct.POINTER(ct.c_char_p) ] - -def _np2ct(x, expected, contiguous=True): - if not isinstance(x, np.ndarray): - raise ValueError('expected a NumPy array') - if x.dtype != expected: - raise ValueError('expected a NumPy array of type ' + str(expected) + ', got ' + str(x.dtype)) - if contiguous: - if not x.flags.c_contiguous and not x.flags.f_contiguous: - raise ValueError('only contiguous NumPy arrays are supported') - return x.ctypes.data - -lib.py_build_integrated_references.restype = ct.c_void_p -lib.py_build_integrated_references.argtypes = [ - ct.c_int32, - ct.c_void_p, - ct.c_int32, - ct.c_void_p, - ct.c_void_p, - ct.c_void_p, - ct.c_void_p, - ct.c_int32, - ct.POINTER(ct.c_int32), - ct.POINTER(ct.c_char_p) -] - -lib.py_build_single_reference.restype = ct.c_void_p -lib.py_build_single_reference.argtypes = [ - ct.c_void_p, - ct.c_void_p, - ct.c_void_p, - ct.c_uint8, - ct.c_int32, - ct.POINTER(ct.c_int32), - ct.POINTER(ct.c_char_p) -] - -lib.py_classify_integrated_references.restype = None -lib.py_classify_integrated_references.argtypes = [ - ct.c_void_p, - ct.c_void_p, - ct.c_void_p, - ct.c_double, - ct.c_void_p, - ct.c_void_p, - ct.c_void_p, - ct.c_int32, - ct.POINTER(ct.c_int32), - ct.POINTER(ct.c_char_p) -] - -lib.py_classify_single_reference.restype = None -lib.py_classify_single_reference.argtypes = [ - ct.c_void_p, - ct.c_void_p, - ct.c_void_p, - ct.c_double, - ct.c_uint8, - ct.c_double, - ct.c_int32, - ct.c_void_p, - ct.c_void_p, - ct.c_void_p, - ct.POINTER(ct.c_int32), - ct.POINTER(ct.c_char_p) -] - -lib.py_create_markers.restype = ct.c_void_p -lib.py_create_markers.argtypes = [ - ct.c_int32, - ct.POINTER(ct.c_int32), - ct.POINTER(ct.c_char_p) -] - -lib.py_find_classic_markers.restype = ct.c_void_p -lib.py_find_classic_markers.argtypes = [ - ct.c_int32, - ct.c_void_p, - ct.c_void_p, - ct.c_int32, - ct.c_int32, - ct.POINTER(ct.c_int32), - ct.POINTER(ct.c_char_p) -] - -lib.py_free_integrated_references.restype = None -lib.py_free_integrated_references.argtypes = [ - ct.c_void_p, - ct.POINTER(ct.c_int32), - ct.POINTER(ct.c_char_p) -] - -lib.py_free_markers.restype = None -lib.py_free_markers.argtypes = [ - ct.c_void_p, - ct.POINTER(ct.c_int32), - ct.POINTER(ct.c_char_p) -] - -lib.py_free_single_reference.restype = None -lib.py_free_single_reference.argtypes = [ - ct.c_void_p, - ct.POINTER(ct.c_int32), - ct.POINTER(ct.c_char_p) -] - -lib.py_get_markers_for_pair.restype = None -lib.py_get_markers_for_pair.argtypes = [ - ct.c_void_p, - ct.c_int32, - ct.c_int32, - ct.c_void_p, - ct.POINTER(ct.c_int32), - ct.POINTER(ct.c_char_p) -] - -lib.py_get_nlabels_from_markers.restype = ct.c_int32 -lib.py_get_nlabels_from_markers.argtypes = [ - ct.c_void_p, - ct.POINTER(ct.c_int32), - ct.POINTER(ct.c_char_p) -] - -lib.py_get_nlabels_from_single_reference.restype = ct.c_int32 -lib.py_get_nlabels_from_single_reference.argtypes = [ - ct.c_void_p, - ct.POINTER(ct.c_int32), - ct.POINTER(ct.c_char_p) -] - -lib.py_get_nmarkers_for_pair.restype = ct.c_int32 -lib.py_get_nmarkers_for_pair.argtypes = [ - ct.c_void_p, - ct.c_int32, - ct.c_int32, - ct.POINTER(ct.c_int32), - ct.POINTER(ct.c_char_p) -] - -lib.py_get_nsubset_from_single_reference.restype = ct.c_int32 
-lib.py_get_nsubset_from_single_reference.argtypes = [ - ct.c_void_p, - ct.POINTER(ct.c_int32), - ct.POINTER(ct.c_char_p) -] - -lib.py_get_subset_from_single_reference.restype = None -lib.py_get_subset_from_single_reference.argtypes = [ - ct.c_void_p, - ct.c_void_p, - ct.POINTER(ct.c_int32), - ct.POINTER(ct.c_char_p) -] - -lib.py_number_of_classic_markers.restype = ct.c_int32 -lib.py_number_of_classic_markers.argtypes = [ - ct.c_int32, - ct.POINTER(ct.c_int32), - ct.POINTER(ct.c_char_p) -] - -lib.py_set_markers_for_pair.restype = None -lib.py_set_markers_for_pair.argtypes = [ - ct.c_void_p, - ct.c_int32, - ct.c_int32, - ct.c_int32, - ct.c_void_p, - ct.POINTER(ct.c_int32), - ct.POINTER(ct.c_char_p) -] - -def build_integrated_references(test_nrow, test_features, nrefs, references, labels, ref_ids, prebuilt, nthreads): - return _catch_errors(lib.py_build_integrated_references)(test_nrow, _np2ct(test_features, np.int32), nrefs, references, labels, ref_ids, prebuilt, nthreads) - -def build_single_reference(ref, labels, markers, approximate, nthreads): - return _catch_errors(lib.py_build_single_reference)(ref, _np2ct(labels, np.int32), markers, approximate, nthreads) - -def classify_integrated_references(mat, assigned, prebuilt, quantile, scores, best, delta, nthreads): - return _catch_errors(lib.py_classify_integrated_references)(mat, assigned, prebuilt, quantile, scores, _np2ct(best, np.int32), _np2ct(delta, np.float64), nthreads) - -def classify_single_reference(mat, subset, prebuilt, quantile, use_fine_tune, fine_tune_threshold, nthreads, scores, best, delta): - return _catch_errors(lib.py_classify_single_reference)(mat, _np2ct(subset, np.int32), prebuilt, quantile, use_fine_tune, fine_tune_threshold, nthreads, scores, _np2ct(best, np.int32), _np2ct(delta, np.float64)) - -def create_markers(nlabels): - return _catch_errors(lib.py_create_markers)(nlabels) - -def find_classic_markers(nref, labels, ref, de_n, nthreads): - return _catch_errors(lib.py_find_classic_markers)(nref, labels, ref, de_n, nthreads) - -def free_integrated_references(ptr): - return _catch_errors(lib.py_free_integrated_references)(ptr) - -def free_markers(ptr): - return _catch_errors(lib.py_free_markers)(ptr) - -def free_single_reference(ptr): - return _catch_errors(lib.py_free_single_reference)(ptr) - -def get_markers_for_pair(ptr, label1, label2, buffer): - return _catch_errors(lib.py_get_markers_for_pair)(ptr, label1, label2, _np2ct(buffer, np.int32)) - -def get_nlabels_from_markers(ptr): - return _catch_errors(lib.py_get_nlabels_from_markers)(ptr) - -def get_nlabels_from_single_reference(ptr): - return _catch_errors(lib.py_get_nlabels_from_single_reference)(ptr) - -def get_nmarkers_for_pair(ptr, label1, label2): - return _catch_errors(lib.py_get_nmarkers_for_pair)(ptr, label1, label2) - -def get_nsubset_from_single_reference(ptr): - return _catch_errors(lib.py_get_nsubset_from_single_reference)(ptr) - -def get_subset_from_single_reference(ptr, buffer): - return _catch_errors(lib.py_get_subset_from_single_reference)(ptr, _np2ct(buffer, np.int32)) - -def number_of_classic_markers(num_labels): - return _catch_errors(lib.py_number_of_classic_markers)(num_labels) - -def set_markers_for_pair(ptr, label1, label2, n, values): - return _catch_errors(lib.py_set_markers_for_pair)(ptr, label1, label2, n, _np2ct(values, np.int32)) diff --git a/src/singler/_utils.py b/src/singler/_utils.py index 4e31b51..4c1e0ae 100644 --- a/src/singler/_utils.py +++ b/src/singler/_utils.py @@ -1,15 +1,15 @@ from typing import Sequence, Tuple -import 
biocutils as ut -import numpy as np -from delayedarray import DelayedArray -from mattress import TatamiNumericPointer, tatamize -from summarizedexperiment import SummarizedExperiment +import biocutils +import numpy +import delayedarray +import mattress +import summarizedexperiment -def _factorize(x: Sequence) -> Tuple[list, np.ndarray]: - _factor = ut.Factor.from_sequence(x, sort_levels=False) - return _factor.levels, np.array(_factor.codes, np.int32) +def _factorize(x: Sequence) -> Tuple[list, numpy.ndarray]: + _factor = biocutils.Factor.from_sequence(x, sort_levels=False) + return _factor.levels, numpy.array(_factor.codes, numpy.int32) def _create_map(x: Sequence) -> dict: @@ -67,13 +67,13 @@ def _stable_union(*args) -> list: def _clean_matrix(x, features, assay_type, check_missing, num_threads): - if isinstance(x, TatamiNumericPointer): + if isinstance(x, mattress.InitializedMatrix): # Assume the pointer was previously generated from _clean_matrix, # so it's 2-dimensional, matches up with features and it's already # clean of NaNs... so we no-op and just return it directly. return x, features - if isinstance(x, SummarizedExperiment): + if isinstance(x, summarizedexperiment.SummarizedExperiment): if features is None: features = x.get_row_names() elif isinstance(features, str): @@ -104,8 +104,8 @@ def _clean_matrix(x, features, assay_type, check_missing, num_threads): if k: new_features.append(features[i]) - sub = DelayedArray(ptr)[retain, :] # avoid re-tatamizing 'x'. - return tatamize(sub), new_features + sub = delayedarray.DelayedArray(ptr)[retain, :] # avoid re-tatamizing 'x'. + return mattress.initialize(sub), new_features def _restrict_features(ptr, features, restrict_to): @@ -117,5 +117,5 @@ def _restrict_features(ptr, features, restrict_to): keep.append(i) new_features.append(x) features = new_features - ptr = tatamize(DelayedArray(ptr)[keep, :]) + ptr = mattress.initialize(delayedarray.DelayedArray(ptr)[keep, :]) return ptr, features diff --git a/src/singler/build_single_reference.py b/src/singler/build_single_reference.py index 5761a65..5d5b6c8 100644 --- a/src/singler/build_single_reference.py +++ b/src/singler/build_single_reference.py @@ -3,7 +3,6 @@ import biocutils as ut from numpy import array, int32, ndarray -from . import _cpphelpers as lib from ._Markers import _Markers from ._utils import _clean_matrix, _factorize, _restrict_features from .get_classic_markers import _get_classic_markers_raw diff --git a/src/singler/get_classic_markers.py b/src/singler/get_classic_markers.py index 14fb675..2b55acf 100644 --- a/src/singler/get_classic_markers.py +++ b/src/singler/get_classic_markers.py @@ -1,11 +1,10 @@ from typing import Any, Optional, Sequence, Union import delayedarray -from mattress import tatamize -from numpy import int32, ndarray, uintp +import mattress +import numpy -from . import _cpphelpers as lib -from ._Markers import _Markers +from . import lib_singler as lib from ._utils import ( _clean_matrix, _create_map, @@ -15,9 +14,7 @@ ) -def _get_classic_markers_raw( - ref_ptrs, ref_labels, ref_features, num_de=None, num_threads=1 -): +def _get_classic_markers_raw(ref_ptrs: list, ref_labels: list, ref_features: list, num_de=None, num_threads=1): nrefs = len(ref_ptrs) # We assume that ref_ptrs and ref_features contains the outputs of @@ -32,15 +29,15 @@ def _get_classic_markers_raw( # Defining the intersection of features. 
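+    # (_stable_intersect() keeps features in order of first appearance rather
+    # than sorting them, so the index remapping below is deterministic.)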
common_features = _stable_intersect(*ref_features) if len(common_features) == 0: - for feat in ref_features: - if len(feat): + for r in ref_ptrs: + if r.nrow(): raise ValueError("no common feature names across 'features'") common_features_map = _create_map(common_features) - # Creating medians. + # Computing medians for each group within each median. ref2 = [] - ref2_ptrs = ndarray((nrefs,), dtype=uintp) + ref2_ptrs = [] tmp_labels = [] for i, x in enumerate(ref_ptrs): @@ -52,28 +49,26 @@ def _get_classic_markers_raw( remap[common_features_map[f]] = len(survivors) - 1 da = delayedarray.DelayedArray(x)[survivors, :] - ptr = tatamize(da) + ptr = initialize(da) med, lev = ptr.row_medians_by_group(ref_labels[i], num_threads=num_threads) tmp_labels.append(lev) - finalptr = tatamize(med[remap, :]) + finalptr = initialize(med[remap, :]) ref2.append(finalptr) ref2_ptrs[i] = finalptr.ptr ref_labels = tmp_labels - # Defining the union of labels. + # Defining the union of labels across all references. common_labels = _stable_union(*ref_labels) common_labels_map = _create_map(common_labels) labels2 = [] - labels2_ptrs = ndarray((nrefs,), dtype=uintp) for i, lab in enumerate(ref_labels): - converted = ndarray(len(lab), dtype=int32) + converted = numpy.ndarray(len(lab), dtype=numpy.uint32) for j, x in enumerate(lab): converted[j] = common_labels_map[x] labels2.append(converted) - labels2_ptrs[i] = converted.ctypes.data # Finally getting around to calling markers. if num_de is None: @@ -81,14 +76,13 @@ def _get_classic_markers_raw( elif num_de <= 0: raise ValueError("'num_de' should be positive") - raw_markers = _Markers( - lib.find_classic_markers( - nref=nrefs, - labels=labels2_ptrs.ctypes.data, - ref=ref2_ptrs.ctypes.data, - de_n=num_de, - nthreads=num_threads, - ) + raw_markers = lib.find_classic_markers( + len(common_labels), + len(common_features), + ref2_ptrs, + labels2, + num_de, + num_threads ) return raw_markers, common_labels, common_features @@ -142,8 +136,8 @@ def get_classic_markers( restrict_to: Subset of available features to restrict to. Only features in - ``restrict_to`` will be used in the reference building. If None, - no restriction is performed. + ``restrict_to`` will be used for marker detection. If None, no + restriction is performed. num_de: Number of differentially expressed genes to use as markers for each pairwise comparison between labels. @@ -195,12 +189,19 @@ def get_classic_markers( num_threads=num_threads, ) - return raw_markers.to_dict(common_labels, common_features) + markers = {} + for i, x in enumerate(common_labels): + current = {} + for j, y in enumerate(common_labels): + current[y] = [common_features[k] for k in raw_markers[i][j]] + markers[x] = current + + return markers def number_of_classic_markers(num_labels: int) -> int: - """Compute the number of markers to detect for a given number of labels, using the classic SingleR marker detection - algorithm. + """Compute the number of markers to detect for a given number of labels, + using the classic SingleR marker detection algorithm. Args: num_labels: Number of labels. From 3dbd44d1b531a17f80aa6778c9909e14e2ca5b24 Mon Sep 17 00:00:00 2001 From: LTLA Date: Wed, 11 Dec 2024 11:22:32 -0800 Subject: [PATCH 03/29] Continue grinding. 
---
 lib/src/def.h                                 |   8 +-
 lib/src/init.cpp                              |   4 +
 lib/src/train_single.cpp                      |  23 ++--
 src/singler/__init__.py                       |   1 +
 src/singler/_utils.py                         |   2 +-
 src/singler/get_classic_markers.py            |  20 ++-
 ...ld_single_reference.py => train_single.py} | 114 +++++++++---------
 tests/test_get_classic_markers.py             |  13 --
 ...ngle_reference.py => test_train_single.py} |  18 +--
 9 files changed, 103 insertions(+), 100 deletions(-)
 rename src/singler/{build_single_reference.py => train_single.py} (69%)
 rename tests/{test_build_single_reference.py => test_train_single.py} (78%)

diff --git a/lib/src/def.h b/lib/src/def.h
index 773dfdd..df0491b 100644
--- a/lib/src/def.h
+++ b/lib/src/def.h
@@ -10,7 +10,11 @@ typedef uint32_t MatrixIndex;
 typedef double MatrixValue;
 typedef std::shared_ptr<tatami::Matrix<MatrixValue, MatrixIndex> > MatrixPointer;
 
-typedef std::shared_ptr<singlepp::TrainedSingleIntersect<MatrixIndex, MatrixValue> > TrainedSingleIntersectPointer;
-typedef std::shared_ptr<singlepp::TrainedIntegrated<MatrixIndex> > TrainedIntegratedPointer;
+typedef std::shared_ptr<knncolle::Builder<knncolle::SimpleMatrix<MatrixIndex, MatrixIndex, MatrixValue>, double> > BuilderPointer;
+
+typedef singlepp::TrainedSingleIntersect<MatrixIndex, MatrixValue> TrainedSingleIntersect;
+typedef std::shared_ptr<TrainedSingleIntersect> TrainedSingleIntersectPointer;
+typedef singlepp::TrainedIntegrated<MatrixIndex> TrainedIntegrated;
+typedef std::shared_ptr<TrainedIntegrated> TrainedIntegratedPointer;
 
 #endif
diff --git a/lib/src/init.cpp b/lib/src/init.cpp
index 2aefc65..67dd2e9 100644
--- a/lib/src/init.cpp
+++ b/lib/src/init.cpp
@@ -1,3 +1,4 @@
+#include "def.h"
 #include "pybind11/pybind11.h"
 
 void init_find_classic_markers(pybind11::module&);
@@ -8,4 +9,7 @@ PYBIND11_MODULE(lib_singler, m) {
     init_find_classic_markers(m);
     init_train_single(m);
     init_classify_single(m);
+
+    pybind11::class_<TrainedSingleIntersect, TrainedSingleIntersectPointer>(m, "TrainSingleIntersect");
+    pybind11::class_<TrainedIntegrated, TrainedIntegratedPointer>(m, "TrainIntegrated");
 }
diff --git a/lib/src/train_single.cpp b/lib/src/train_single.cpp
index ea752df..f138699 100644
--- a/lib/src/train_single.cpp
+++ b/lib/src/train_single.cpp
@@ -15,14 +15,13 @@ TrainedSingleIntersectPointer train_single(
     const pybind11::array& ref_features,
     const pybind11::array& labels,
     const pybind11::list& markers,
-    const std::shared_ptr<knncolle::Builder<knncolle::SimpleMatrix<uint32_t, uint32_t, double>, double> >& builder,
+    const BuilderPointer& builder,
     int nthreads)
 {
     singlepp::TrainSingleOptions<uint32_t, double> opts;
     opts.num_threads = nthreads;
-    opts.top = -1; // Use all available markers; assume subsetting was applied on the R side.
-
-    opts.trainer = builder; // std::shared_ptr(std::shared_ptr{}, bptr.get()); // make a no-op shared pointer.
+    opts.top = -1; // Use all available markers; assume subsetting was applied on the Python side.
+    opts.trainer = BuilderPointer(BuilderPointer{}, builder.get()); // make a no-op shared pointer.
 
     auto NR = ref->nrow();
     auto NC = ref->ncol();
@@ -30,7 +29,7 @@ TrainedSingleIntersectPointer train_single(
         throw std::runtime_error("length of 'labels' should be equal to the number of columns of 'ref'");
     }
 
-    // Setting up the markers. We assume that these are already 0-indexed on the R side.
+    // Setting up the markers.
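+    // (Nested list-of-lists layout: markers[i][j] holds the row indices of
+    // genes that are upregulated in label i relative to label j.)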
     size_t ngroups = markers.size();
     singlepp::Markers<MatrixIndex> markers2(ngroups);
     for (size_t m = 0; m < ngroups; ++m) {
@@ -72,12 +71,22 @@ TrainedSingleIntersectPointer train_single(
     return TrainedSingleIntersectPointer(new decltype(built)(std::move(built)));
 }
 
-pybind11::array_t<MatrixIndex> get_ref_subset(const TrainedSingleIntersectPointer& ptr) {
+pybind11::array_t<MatrixIndex> get_markers_from_single_reference(const TrainedSingleIntersectPointer& ptr) {
     const auto& rsub = ptr->get_ref_subset();
     return pybind11::array_t<MatrixIndex>(rsub.size(), rsub.data());
 }
 
+uint32_t get_num_markers_from_single_reference(const TrainedSingleIntersectPointer& ptr) {
+    return ptr->get_ref_subset().size();
+}
+
+uint32_t get_num_labels_from_single_reference(const TrainedSingleIntersectPointer& ptr) {
+    return ptr->num_labels();
+}
+
 void init_train_single(pybind11::module& m) {
     m.def("train_single", &train_single);
-    m.def("get_ref_subset", &get_ref_subset);
+    m.def("get_markers_from_single_reference", &get_markers_from_single_reference);
+    m.def("get_num_markers_from_single_reference", &get_num_markers_from_single_reference);
+    m.def("get_num_labels_from_single_reference", &get_num_labels_from_single_reference);
 }
diff --git a/src/singler/__init__.py b/src/singler/__init__.py
index 80f0fca..fcd6031 100644
--- a/src/singler/__init__.py
+++ b/src/singler/__init__.py
@@ -17,3 +17,4 @@
 
 
 from .get_classic_markers import get_classic_markers, number_of_classic_markers
+from .train_single import train_single, TrainedSingleReference
diff --git a/src/singler/_utils.py b/src/singler/_utils.py
index 4c1e0ae..66f810d 100644
--- a/src/singler/_utils.py
+++ b/src/singler/_utils.py
@@ -91,7 +91,7 @@ def _clean_matrix(x, features, assay_type, check_missing, num_threads):
             "number of rows of 'x' should be equal to the length of 'features'"
         )
 
-    ptr = tatamize(x)
+    ptr = mattress.initialize(x)
     if not check_missing:
         return ptr, features
 
diff --git a/src/singler/get_classic_markers.py b/src/singler/get_classic_markers.py
index 2b55acf..39ab6a8 100644
--- a/src/singler/get_classic_markers.py
+++ b/src/singler/get_classic_markers.py
@@ -49,13 +49,13 @@ def _get_classic_markers_raw(ref_ptrs: list, ref_labels: list, ref_features: lis
             remap[common_features_map[f]] = len(survivors) - 1
 
         da = delayedarray.DelayedArray(x)[survivors, :]
-        ptr = initialize(da)
+        ptr = mattress.initialize(da)
         med, lev = ptr.row_medians_by_group(ref_labels[i], num_threads=num_threads)
         tmp_labels.append(lev)
 
-        finalptr = initialize(med[remap, :])
+        finalptr = mattress.initialize(med[remap, :])
         ref2.append(finalptr)
-        ref2_ptrs[i] = finalptr.ptr
+        ref2_ptrs.append(finalptr.ptr)
 
     ref_labels = tmp_labels
 
@@ -94,7 +94,6 @@ def get_classic_markers(
     ref_features: Union[Sequence, list[Sequence]],
     assay_type: Union[str, int] = "logcounts",
     check_missing: bool = True,
-    restrict_to: Optional[Union[set, dict]] = None,
     num_de: Optional[int] = None,
     num_threads: int = 1,
 ) -> dict[Any, dict[Any, list]]:
@@ -134,11 +133,6 @@ def get_classic_markers(
         Whether to check for and remove rows with missing (NaN) values
         in the reference matrices. This can be set to False if it is known
         that no NaN values exist.
-        restrict_to:
-            Subset of available features to restrict to. Only features in
-            ``restrict_to`` will be used for marker detection. If None, no
-            restriction is performed.
-
         num_de:
             Number of differentially expressed genes to use as markers for each
             pairwise comparison between labels. If None, an appropriate number of genes is automatically determined.
@@ -173,9 +167,6 @@ def get_classic_markers( check_missing=check_missing, num_threads=num_threads, ) - - r, f = _restrict_features(r, f, restrict_to) - ref_ptrs.append(r) tmp_features.append(f) @@ -189,13 +180,16 @@ def get_classic_markers( num_threads=num_threads, ) + return _markers_to_dict(raw_markers, common_labels, common_features) + + +def _markers_to_dict(raw_markers, common_labels, common_features): markers = {} for i, x in enumerate(common_labels): current = {} for j, y in enumerate(common_labels): current[y] = [common_features[k] for k in raw_markers[i][j]] markers[x] = current - return markers diff --git a/src/singler/build_single_reference.py b/src/singler/train_single.py similarity index 69% rename from src/singler/build_single_reference.py rename to src/singler/train_single.py index 5d5b6c8..e06cd56 100644 --- a/src/singler/build_single_reference.py +++ b/src/singler/train_single.py @@ -1,14 +1,14 @@ from typing import Any, Literal, Optional, Sequence, Union -import biocutils as ut -from numpy import array, int32, ndarray +import biocutils +import numpy -from ._Markers import _Markers -from ._utils import _clean_matrix, _factorize, _restrict_features -from .get_classic_markers import _get_classic_markers_raw +from . import lib_singler as lib +from ._utils import _clean_matrix, _factorize +from .get_classic_markers import _get_classic_markers_raw, _markers_to_dict -class SinglePrebuiltReference: +class TrainedSingleReference: """A prebuilt reference object, typically created by :py:meth:`~singler.build_single_reference.build_single_reference`. This is intended for advanced users only and should not be serialized. @@ -26,49 +26,37 @@ def __init__( self._labels = labels self._markers = markers - def __del__(self): - lib.free_single_reference(self._ptr) - def num_markers(self) -> int: """ Returns: Number of markers to be used for classification. This is the same as the size of the array from :py:meth:`~marker_subset`. """ - return lib.get_nsubset_from_single_reference(self._ptr) + return lib.get_num_markers_from_single_reference(self._ptr) def num_labels(self) -> int: """ Returns: Number of unique labels in this reference. """ - return lib.get_nlabels_from_single_reference(self._ptr) + return lib.get_num_labels_from_single_reference(self._ptr) @property - def features(self) -> Sequence: - """ - Returns: - The universe of features known to this reference, usually as strings. - """ + def features(self) -> list: + """The universe of features known to this reference.""" return self._features @property def labels(self) -> Sequence: - """ - Returns: - Unique labels in this reference. - """ + """Unique labels in this reference.""" return self._labels @property - def markers(self) -> dict[Any, dict[Any, Sequence]]: - """ - Returns: - Markers for every pairwise comparison between labels. - """ + def markers(self) -> dict[Any, dict[Any, list]]: + """Markers for every pairwise comparison between labels.""" return self._markers - def marker_subset(self, indices_only: bool = False) -> Union[ndarray, list]: + def marker_subset(self, indices_only: bool = False) -> Union[numpy.ndarray, list]: """ Args: indices_only: @@ -81,28 +69,48 @@ def marker_subset(self, indices_only: bool = False) -> Union[ndarray, list]: If ``indices_only = True``, a NumPy array containing the integer indices of features in ``features`` that were chosen as markers. 
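+
+        A hypothetical usage sketch::
+
+            idx = built.marker_subset(indices_only=True)  # integer indices
+            names = built.marker_subset()  # feature identifiers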
""" - nmarkers = self.num_markers() - buffer = ndarray(nmarkers, dtype=int32) - lib.get_subset_from_single_reference(self._ptr, buffer) + buffer = lib.get_markers_from_single_reference(self._ptr) if indices_only: return buffer else: return [self._features[i] for i in buffer] -def build_single_reference( +def _markers_from_dict(markers: dict[Any, dict[Any, Sequence]], labels: Sequence, features: Sequence): + fmapping = {} + for i, x in enumerate(features): + fmapping[x] = i + + outer_instance = [None] * len(labels) + for outer_i, outer_k in enumerate(labels): + inner_instance = [None] * len(labels) + + for inner_i, inner_k in enumerate(labels): + current = markers[outer_k][inner_k] + mapped = [] + for x in current: + if x in fmapping: # just skipping features that aren't present. + mapped.append(fmapping[x]) + inner_instance[inner_i] = numpy.array(mapped, dtype=numpy.uint32) + + outer_instance[outer_i] = inner_instance + + return outer_instance + + +def train_single( ref_data: Any, ref_labels: Sequence, ref_features: Sequence, + test_features: Optional[Sequence] = None, assay_type: Union[str, int] = "logcounts", check_missing: bool = True, - restrict_to: Optional[Union[set, dict]] = None, markers: Optional[dict[Any, dict[Any, Sequence]]] = None, marker_method: Literal["classic"] = "classic", marker_args: dict = {}, approximate: bool = True, num_threads: int = 1, -) -> SinglePrebuiltReference: + ) -> TrainedSingleReference: """Build a single reference dataset in preparation for classification. Args: @@ -118,13 +126,14 @@ def build_single_reference( :py:class:`~summarizedexperiment.SummarizedExperiment.SummarizedExperiment` containing such a matrix in one of its assays. - labels: - Sequence of labels for each reference profile, - i.e., column in ``ref``. + ref_labels: + Sequence of labels for each reference profile, i.e., column in ``ref_data``. - features: - Sequence of identifiers for each feature, - i.e., row in ``ref``. + ref_features: + Sequence of identifiers for each feature, i.e., row in ``ref_data``. + + test_features: + Sequence of identifiers for each feature in the test dataset. assay_type: Assay containing the expression matrix, @@ -135,11 +144,6 @@ def build_single_reference( Whether to check for and remove rows with missing (NaN) values from ``ref_data``. - restrict_to: - Subset of available features to restrict to. Only features in - ``restrict_to`` will be used in the reference building. If None, - no restriction is performed. - markers: Upregulated markers for each pairwise comparison between labels. Specifically, ``markers[a][b]`` should be a sequence of features @@ -166,7 +170,7 @@ def build_single_reference( Returns: The pre-built reference, ready for use in downstream methods like - :py:meth:`~singler.classify_single_reference.classify_single_reference`. + :py:meth:`~singler.classify_single_reference.classify_single`. 
""" ref_ptr, ref_features = _clean_matrix( @@ -188,22 +192,22 @@ def build_single_reference( num_threads=num_threads, **marker_args, ) - markers = mrk.to_dict(lablev, ref_features) - labind = array(ut.match(ref_labels, lablev), dtype=int32) + markers = _markers_to_dict(mrk, lablev, ref_features) + labind = numpy.array(biocutils.match(ref_labels, lablev), dtype=numpy.uint32) else: - raise NotImplementedError("other marker methods are not implemented, sorry") + raise NotImplementedError("other marker methods are not yet implemented, sorry") else: lablev, labind = _factorize(ref_labels) - labind = array(labind, dtype=int32) - mrk = _Markers.from_dict(markers, lablev, ref_features) + labind = numpy.array(labind, dtype=numpy.uint32) + mrk = _markers_from_dict(markers, lablev, ref_features) - return SinglePrebuiltReference( - lib.build_single_reference( + return TrainedSingleReference( + lib.train_single( ref_ptr.ptr, - labels=labind, - markers=mrk._ptr, - approximate=approximate, - nthreads=num_threads, + labind, + mrk, + approximate, + num_threads, ), labels=lablev, features=ref_features, diff --git a/tests/test_get_classic_markers.py b/tests/test_get_classic_markers.py index e4c9c27..797368a 100644 --- a/tests/test_get_classic_markers.py +++ b/tests/test_get_classic_markers.py @@ -95,16 +95,3 @@ def test_get_classic_markers_intersected_features(): assert sorted(alpha.keys()) == sorted(bravo.keys()) for k2 in alpha.keys(): assert alpha[k2] == bravo[k2] - - -def test_get_classic_markers_restricted(): - ref = numpy.random.rand(10000, 10) - labels = ["A", "B", "C", "D", "E", "E", "D", "C", "B", "A"] - features = [str(i) for i in range(ref.shape[0])] - - keep = range(2000, 8000, 5) - restricted = [str(i) for i in keep] - markers = singler.get_classic_markers(ref, labels, features, restrict_to=restricted) - - expected = singler.get_classic_markers(ref[keep, :], labels, restricted) - assert markers == expected diff --git a/tests/test_build_single_reference.py b/tests/test_train_single.py similarity index 78% rename from tests/test_build_single_reference.py rename to tests/test_train_single.py index 2f81106..a7e0e3c 100644 --- a/tests/test_build_single_reference.py +++ b/tests/test_train_single.py @@ -2,13 +2,13 @@ import numpy -def test_build_single_reference(): +def test_train_single(): ref = numpy.random.rand(10000, 10) labels = ["A", "B", "C", "D", "E", "E", "D", "C", "B", "A"] features = [str(i) for i in range(ref.shape[0])] markers = singler.get_classic_markers(ref, labels, features) - built = singler.build_single_reference(ref, labels, features, markers) + built = singler.train_single(ref, labels, features, markers) assert built.num_labels() == 5 assert built.num_markers() < len(features) assert built.features == features @@ -21,35 +21,35 @@ def test_build_single_reference(): assert m in feat_set # Same results when run in parallel. 
- pbuilt = singler.build_single_reference( + pbuilt = singler.train_single( ref, labels, features, markers, num_threads=2 ) assert all_markers == pbuilt.marker_subset() -def test_build_single_reference_markers(): +def test_train_single_markers(): ref = numpy.random.rand(10000, 10) labels = ["A", "B", "C", "D", "E", "E", "D", "C", "B", "A"] features = [str(i) for i in range(ref.shape[0])] - built = singler.build_single_reference(ref, labels, features) + built = singler.train_single(ref, labels, features) markers = singler.get_classic_markers(ref, labels, features) - mbuilt = singler.build_single_reference(ref, labels, features, markers) + mbuilt = singler.train_single(ref, labels, features, markers) assert built.markers == mbuilt.markers -def test_build_single_reference_restricted(): +def test_train_single_restricted(): ref = numpy.random.rand(10000, 10) labels = ["A", "B", "C", "D", "E", "E", "D", "C", "B", "A"] features = [str(i) for i in range(ref.shape[0])] keep = range(1, ref.shape[0], 3) restricted = [str(i) for i in keep] - built = singler.build_single_reference( + built = singler.train_single( ref, labels, features, restrict_to=set(restricted) ) - expected = singler.build_single_reference(ref[keep, :], labels, restricted) + expected = singler.train_single(ref[keep, :], labels, restricted) assert built.markers == expected.markers assert built.marker_subset() == expected.marker_subset() From ae2d0c8541fa8a4d70325afa6770e96c52d5fd2a Mon Sep 17 00:00:00 2001 From: LTLA Date: Wed, 11 Dec 2024 14:00:06 -0800 Subject: [PATCH 04/29] Get even closer to finishing the grind. --- setup.cfg | 3 +- src/singler/_utils.py | 32 +++---- src/singler/get_classic_markers.py | 22 ++--- src/singler/train_single.py | 138 ++++++++++++++++++++--------- tests/test_train_single.py | 14 +-- 5 files changed, 125 insertions(+), 84 deletions(-) diff --git a/setup.cfg b/setup.cfg index 9b7b350..b4171a7 100644 --- a/setup.cfg +++ b/setup.cfg @@ -49,7 +49,8 @@ python_requires = >=3.8 # For more information, check out https://semver.org/. install_requires = importlib-metadata; python_version<"3.8" - mattress>=0.1.4 + mattress>=0.3.0 + knncolle delayedarray biocframe>=0.5.0 summarizedexperiment>=0.4.0 diff --git a/src/singler/_utils.py b/src/singler/_utils.py index 66f810d..96b1567 100644 --- a/src/singler/_utils.py +++ b/src/singler/_utils.py @@ -3,13 +3,13 @@ import biocutils import numpy import delayedarray -import mattress import summarizedexperiment +import mattress def _factorize(x: Sequence) -> Tuple[list, numpy.ndarray]: - _factor = biocutils.Factor.from_sequence(x, sort_levels=False) - return _factor.levels, numpy.array(_factor.codes, numpy.int32) + f = biocutils.Factor.from_sequence(x, sort_levels=False) + return f.levels, numpy.array(f.codes, dtype=numpy.uint32) def _create_map(x: Sequence) -> dict: @@ -67,19 +67,12 @@ def _stable_union(*args) -> list: def _clean_matrix(x, features, assay_type, check_missing, num_threads): - if isinstance(x, mattress.InitializedMatrix): - # Assume the pointer was previously generated from _clean_matrix, - # so it's 2-dimensional, matches up with features and it's already - # clean of NaNs... so we no-op and just return it directly. 
- return x, features - if isinstance(x, summarizedexperiment.SummarizedExperiment): if features is None: features = x.get_row_names() elif isinstance(features, str): features = x.get_row_data().column(features) features = list(features) - x = x.assay(assay_type) curshape = x.shape @@ -91,31 +84,32 @@ def _clean_matrix(x, features, assay_type, check_missing, num_threads): "number of rows of 'x' should be equal to the length of 'features'" ) - ptr = mattress.initialize(x) if not check_missing: - return ptr, features + return x, features + ptr = mattress.initialize(x) retain = ptr.row_nan_counts(num_threads=num_threads) == 0 if retain.all(): - return ptr, features + return x, features new_features = [] for i, k in enumerate(retain): if k: new_features.append(features[i]) - sub = delayedarray.DelayedArray(ptr)[retain, :] # avoid re-tatamizing 'x'. - return mattress.initialize(sub), new_features + sub = delayedarray.DelayedArray(x)[retain, :] + return sub, new_features -def _restrict_features(ptr, features, restrict_to): +def _restrict_features(data, features, restrict_to): if restrict_to is not None: + if not isinstance(restrict_to, set): + restrict_to = set(restrict_to) keep = [] new_features = [] for i, x in enumerate(features): if x in restrict_to: keep.append(i) new_features.append(x) - features = new_features - ptr = mattress.initialize(delayedarray.DelayedArray(ptr)[keep, :]) - return ptr, features + return delayedarray.DelayedArray(data)[keep, :], new_features + return data, features diff --git a/src/singler/get_classic_markers.py b/src/singler/get_classic_markers.py index 39ab6a8..3ca7f2a 100644 --- a/src/singler/get_classic_markers.py +++ b/src/singler/get_classic_markers.py @@ -8,7 +8,6 @@ from ._utils import ( _clean_matrix, _create_map, - _restrict_features, _stable_intersect, _stable_union, ) @@ -141,10 +140,9 @@ def get_classic_markers( Number of threads to use for the calculations. Returns: - A dictionary of dictionary of lists - containing the markers for each pairwise comparison between labels, - i.e., ``markers[a][b]`` contains the upregulated markers for label - ``a`` over label ``b``. + A dictionary of dictionary of lists containing the markers for each + pairwise comparison between labels, i.e., ``markers[a][b]`` contains + the upregulated markers for label ``a`` over label ``b``. 
""" if not isinstance(ref_data, list): ref_data = [ref_data] @@ -158,7 +156,7 @@ def get_classic_markers( raise ValueError("length of 'ref' and 'features' should be the same") ref_ptrs = [] - tmp_features = [] + cleaned_features = [] for i in range(nrefs): r, f = _clean_matrix( ref_data[i], @@ -167,23 +165,17 @@ def get_classic_markers( check_missing=check_missing, num_threads=num_threads, ) - ref_ptrs.append(r) - tmp_features.append(f) - - ref_features = tmp_features + ref_ptrs.append(mattress.initialize(r)) + cleaned_features.append(f) raw_markers, common_labels, common_features = _get_classic_markers_raw( ref_ptrs=ref_ptrs, ref_labels=ref_labels, - ref_features=ref_features, + ref_features=cleaned_features, num_de=num_de, num_threads=num_threads, ) - return _markers_to_dict(raw_markers, common_labels, common_features) - - -def _markers_to_dict(raw_markers, common_labels, common_features): markers = {} for i, x in enumerate(common_labels): current = {} diff --git a/src/singler/train_single.py b/src/singler/train_single.py index e06cd56..b82f26a 100644 --- a/src/singler/train_single.py +++ b/src/singler/train_single.py @@ -2,10 +2,13 @@ import biocutils import numpy +import knncolle +import mattress +import delayedarray from . import lib_singler as lib -from ._utils import _clean_matrix, _factorize -from .get_classic_markers import _get_classic_markers_raw, _markers_to_dict +from ._utils import _clean_matrix, _factorize, _restrict_features, _stable_intersect +from .get_classic_markers import get_classic_markers class TrainedSingleReference: @@ -76,25 +79,10 @@ def marker_subset(self, indices_only: bool = False) -> Union[numpy.ndarray, list return [self._features[i] for i in buffer] -def _markers_from_dict(markers: dict[Any, dict[Any, Sequence]], labels: Sequence, features: Sequence): +def _markers_from_dict(markers: dict[Any, dict[Any, Sequence]], labels: Sequence, available_features: Sequence): fmapping = {} - for i, x in enumerate(features): + for i, x in enumerate(available_features): fmapping[x] = i - - outer_instance = [None] * len(labels) - for outer_i, outer_k in enumerate(labels): - inner_instance = [None] * len(labels) - - for inner_i, inner_k in enumerate(labels): - current = markers[outer_k][inner_k] - mapped = [] - for x in current: - if x in fmapping: # just skipping features that aren't present. - mapped.append(fmapping[x]) - inner_instance[inner_i] = numpy.array(mapped, dtype=numpy.uint32) - - outer_instance[outer_i] = inner_instance - return outer_instance @@ -104,11 +92,12 @@ def train_single( ref_features: Sequence, test_features: Optional[Sequence] = None, assay_type: Union[str, int] = "logcounts", + restrict_to: Optional[Union[set, dict]] = None, check_missing: bool = True, markers: Optional[dict[Any, dict[Any, Sequence]]] = None, marker_method: Literal["classic"] = "classic", marker_args: dict = {}, - approximate: bool = True, + nn_parameters: Optional[knncolle.Parameters] = knncolle.AnnoyParameters(), num_threads: int = 1, ) -> TrainedSingleReference: """Build a single reference dataset in preparation for classification. @@ -144,6 +133,11 @@ def train_single( Whether to check for and remove rows with missing (NaN) values from ``ref_data``. + restrict_to: + Subset of available features to restrict to. Only features in + ``restrict_to`` will be used in the reference building. If None, + no restriction is performed. + markers: Upregulated markers for each pairwise comparison between labels. 
Specifically, ``markers[a][b]`` should be a sequence of features @@ -173,7 +167,7 @@ def train_single( :py:meth:`~singler.classify_single_reference.classify_single`. """ - ref_ptr, ref_features = _clean_matrix( + ref_data, ref_features = _clean_matrix( ref_data, ref_features, assay_type=assay_type, @@ -181,35 +175,95 @@ def train_single( num_threads=num_threads, ) - ref_ptr, ref_features = _restrict_features(ref_ptr, ref_features, restrict_to) + unique_labels, label_idx = _factorize(ref_labels) + markers = identify_genes(ref_data, ref_features, ref_labels, unique_labels, markers, marker_method, test_features, restrict_to, marker_args, num_threads) + markers_idx = [None] * len(unique_labels) + for outer_i, outer_k in enumerate(unique_labels): + inner_instance = [None] * len(unique_labels) + for inner_i, inner_k in enumerate(unique_labels): + current = markers[outer_k][inner_k] + inner_instance[inner_i] = numpy.array(biocutils.match(current, ref_features), dtype=numpy.uint32) + markers_idx[outer_i] = inner_instance + + if test_features is None: + test_features_idx = numpy.array(range(len(ref_features)), dtype=numpy.uint32) + ref_features_idx = numpy.array(range(len(ref_features)), dtype=numpy.uint32) + else: + common_features = _stable_intersect(test_features, ref_features) + test_features_idx = numpy.array(biocutils.match(test_features, common_features), dtype=numpy.uint32) + ref_features_idx = numpy.array(biocutils.match(ref_features, common_features), dtype=numpy.uint32) + ref_ptr = mattress.initialize(ref_data) + builder, _ = knncolle.define_builder(nn_parameters) + return TrainedSingleReference( + lib.train_single( + test_features_idx, + ref_ptr.ptr, + ref_features_idx, + label_idx, + markers_idx, + builder, + num_threads, + ), + labels = unique_labels, + features = ref_features, + markers = markers, + ) + + +def identify_genes(ref_data, ref_features, ref_labels, unique_labels, markers, marker_method, test_features, restrict_to, marker_args, num_threads): + ref_data, ref_features = _restrict_features(ref_data, ref_features, test_features) + ref_data, ref_features = _restrict_features(ref_data, ref_features, restrict_to) + + # Deriving the markers from expression data. if markers is None: if marker_method == "classic": - mrk, lablev, ref_features = _get_classic_markers_raw( - ref_ptrs=[ref_ptr], + markers = get_classic_markers( + ref_data=[ref_data], ref_labels=[ref_labels], ref_features=[ref_features], num_threads=num_threads, **marker_args, ) - markers = _markers_to_dict(mrk, lablev, ref_features) - labind = numpy.array(biocutils.match(ref_labels, lablev), dtype=numpy.uint32) else: raise NotImplementedError("other marker methods are not yet implemented, sorry") - else: - lablev, labind = _factorize(ref_labels) - labind = numpy.array(labind, dtype=numpy.uint32) - mrk = _markers_from_dict(markers, lablev, ref_features) + print(markers) + return markers + + # Validating a user-supplied list of markers. 
+    if not isinstance(markers, dict):
+        raise ValueError("'markers' should be a dictionary where the labels are keys")
+    if len(unique_labels) != len(markers):
+        raise ValueError("'markers' should have length equal to the number of unique labels")
+
+    available_features = set(ref_features)
+    new_markers = {}
+    for x, y in markers.items():
+        if x not in unique_labels:
+            raise ValueError("unknown label '" + str(x) + "' in 'markers'")
+
+        if isinstance(y, list):
+            collected = []
+            for g in y:
+                if g in available_features:
+                    collected.append(g)
+            output = {}
+            for l in unique_labels:
+                output[l] = []
+            output[x] = collected
+        elif isinstance(y, dict):
+            if len(unique_labels) != len(y):
+                raise ValueError("each value of 'markers' should have length equal to the number of unique labels")
+            output = {}
+            for x_inner, y_inner in y.items():
+                collected = []
+                for g in y_inner:
+                    if g in available_features:
+                        collected.append(g)
+                output[x_inner] = collected
+        else:
+            raise ValueError("values of 'markers' should be dictionaries")
 
-    return TrainedSingleReference(
-        lib.train_single(
-            ref_ptr.ptr,
-            labind,
-            mrk,
-            approximate,
-            num_threads,
-        ),
-        labels=lablev,
-        features=ref_features,
-        markers=markers,
-    )
+        new_markers[x] = output
+
+    return new_markers
diff --git a/tests/test_train_single.py b/tests/test_train_single.py
index a7e0e3c..f8b2f1b 100644
--- a/tests/test_train_single.py
+++ b/tests/test_train_single.py
@@ -2,17 +2,17 @@
 import numpy
 
 
-def test_train_single():
+def test_train_single_basic():
     ref = numpy.random.rand(10000, 10)
     labels = ["A", "B", "C", "D", "E", "E", "D", "C", "B", "A"]
     features = [str(i) for i in range(ref.shape[0])]
     markers = singler.get_classic_markers(ref, labels, features)
-    built = singler.train_single(ref, labels, features, markers)
+    built = singler.train_single(ref, labels, features, markers=markers)
     assert built.num_labels() == 5
     assert built.num_markers() < len(features)
     assert built.features == features
-    assert built.labels == ["A", "B", "C", "D", "E"]
+    assert built.labels.as_list() == ["A", "B", "C", "D", "E"]
 
     all_markers = built.marker_subset()
     assert len(all_markers) == built.num_markers()
@@ -22,7 +22,7 @@
 
     # Same results when run in parallel.
     pbuilt = singler.train_single(
-        ref, labels, features, markers, num_threads=2
+        ref, labels, features, markers=markers, num_threads=2
     )
     assert all_markers == pbuilt.marker_subset()
 
@@ -34,7 +34,7 @@ def test_train_single_markers():
     built = singler.train_single(ref, labels, features)
     markers = singler.get_classic_markers(ref, labels, features)
-    mbuilt = singler.train_single(ref, labels, features, markers)
+    mbuilt = singler.train_single(ref, labels, features, markers=markers)
 
     assert built.markers == mbuilt.markers
 
@@ -48,12 +48,12 @@ def test_train_single_restricted():
     built = singler.train_single(
         ref, labels, features, restrict_to=set(restricted)
    )
+    assert built.features == features
 
     expected = singler.train_single(ref[keep, :], labels, restricted)
-
     assert built.markers == expected.markers
     assert built.marker_subset() == expected.marker_subset()
+    return
 
     # Check that the actual C++ content is the same.
     test = numpy.random.rand(10000, 50)
From 6cb69b2ec67dfa993f18b4dbb681b850b6118944 Mon Sep 17 00:00:00 2001
From: LTLA
Date: Wed, 11 Dec 2024 15:10:57 -0800
Subject: [PATCH 05/29] Bring the classifier back online.
--- lib/build_single_reference.cpp | 41 ------ lib/classify_single_reference.cpp | 52 ------- lib/src/classify_single.cpp | 6 +- lib/src/train_single.cpp | 2 +- src/singler/__init__.py | 1 + src/singler/_utils.py | 7 +- src/singler/classify_single.py | 94 +++++++++++++ src/singler/classify_single_reference.py | 128 ------------------ src/singler/train_single.py | 5 +- ...e_reference.py => test_classify_single.py} | 38 +++--- tests/test_train_single.py | 7 +- 11 files changed, 129 insertions(+), 252 deletions(-) delete mode 100644 lib/build_single_reference.cpp delete mode 100644 lib/classify_single_reference.cpp create mode 100644 src/singler/classify_single.py delete mode 100644 src/singler/classify_single_reference.py rename tests/{test_classify_single_reference.py => test_classify_single.py} (54%) diff --git a/lib/build_single_reference.cpp b/lib/build_single_reference.cpp deleted file mode 100644 index ead7f94..0000000 --- a/lib/build_single_reference.cpp +++ /dev/null @@ -1,41 +0,0 @@ -#include "utils.h" // must be before all other includes. - -#include -#include -#include - -//[[export]] -void* build_single_reference(void* ref, const int32_t* labels /** numpy */, void* markers, uint8_t approximate, int32_t nthreads) { - singlepp::BasicBuilder builder; - builder.set_num_threads(nthreads); - builder.set_top(-1); // Use all available markers; assume subsetting was applied on the Python side. - builder.set_approximate(approximate); - - auto marker_ptr = reinterpret_cast(markers); - const auto& ptr = reinterpret_cast(ref)->ptr; - std::vector labels2(labels, labels + ptr->ncol()); // need to copy as int may not be int32 and singlepp isn't templated on the labels (for now). - auto built = builder.run(ptr.get(), labels2.data(), *marker_ptr); - - return new singlepp::BasicBuilder::Prebuilt(std::move(built)); -} - -//[[export]] -int32_t get_nsubset_from_single_reference(void* ptr) { - return reinterpret_cast(ptr)->subset.size(); -} - -//[[export]] -int32_t get_nlabels_from_single_reference(void* ptr) { - return reinterpret_cast(ptr)->num_labels(); -} - -//[[export]] -void get_subset_from_single_reference(void* ptr, int32_t* buffer /** numpy */) { - const auto& sub = reinterpret_cast(ptr)->subset; - std::copy(sub.begin(), sub.end(), buffer); -} - -//[[export]] -void free_single_reference(void* ptr) { - delete reinterpret_cast(ptr); -} diff --git a/lib/classify_single_reference.cpp b/lib/classify_single_reference.cpp deleted file mode 100644 index f9d7713..0000000 --- a/lib/classify_single_reference.cpp +++ /dev/null @@ -1,52 +0,0 @@ -#include "utils.h" // must be before raticate, singlepp includes. - -#include -#include -#include - -//[[export]] -void classify_single_reference( - void* mat, - const int32_t* subset /** numpy */, - void* prebuilt, - double quantile, - uint8_t use_fine_tune, - double fine_tune_threshold, - int32_t nthreads, - const uintptr_t* scores /** void_p */, - int32_t* best /** numpy */, - double* delta /** numpy */) -{ - auto mptr = reinterpret_cast(mat); - auto bptr = reinterpret_cast(prebuilt); - - singlepp::BasicScorer runner; - runner.set_num_threads(nthreads); - runner.set_quantile(quantile); - runner.set_fine_tune(use_fine_tune); - runner.set_fine_tune_threshold(fine_tune_threshold); - - // Only necessary as BasicScorer::run() isn't templated yet. 
- std::vector subset_copy(subset, subset + bptr->subset.size()); - size_t NC = mptr->ptr->ncol(); - std::vector best_copy(NC); - - size_t nlabels = bptr->num_labels(); - std::vector score_ptrs; - score_ptrs.reserve(nlabels); - for (size_t l = 0; l < nlabels; ++l) { - score_ptrs.push_back(reinterpret_cast(scores[l])); - } - - runner.run( - mptr->ptr.get(), - *bptr, - subset_copy.data(), - best_copy.data(), - score_ptrs, - delta - ); - - std::copy(best_copy.begin(), best_copy.end(), best); - return; -} diff --git a/lib/src/classify_single.cpp b/lib/src/classify_single.cpp index 090eed7..b6784c5 100644 --- a/lib/src/classify_single.cpp +++ b/lib/src/classify_single.cpp @@ -20,11 +20,11 @@ pybind11::tuple classify_single(const MatrixPointer& test, const TrainedSingleIn buffers.delta = static_cast(delta.request().ptr); size_t nlabels = built->num_labels(); - pybind11::array_t scores({ ncells, nlabels }); + pybind11::list scores(nlabels); buffers.scores.resize(nlabels); - auto sptr = static_cast(scores.request().ptr); for (size_t l = 0; l < nlabels; ++l) { - buffers.scores[l] = sptr + ncells * l; // already size_t, no need to cast. + scores[l] = pybind11::array_t(ncells); + buffers.scores[l] = static_cast(scores[l].cast().request().ptr); } // Running the analysis. diff --git a/lib/src/train_single.cpp b/lib/src/train_single.cpp index f138699..912dbd4 100644 --- a/lib/src/train_single.cpp +++ b/lib/src/train_single.cpp @@ -18,7 +18,7 @@ TrainedSingleIntersectPointer train_single( const BuilderPointer& builder, int nthreads) { - singlepp::TrainSingleOptions opts; + singlepp::TrainSingleOptions opts; opts.num_threads = nthreads; opts.top = -1; // Use all available markers; assume subsetting was applied on the Python side. opts.trainer = BuilderPointer(BuilderPointer{}, builder.get()); // make a no-op shared pointer. diff --git a/src/singler/__init__.py b/src/singler/__init__.py index fcd6031..3e8a4a9 100644 --- a/src/singler/__init__.py +++ b/src/singler/__init__.py @@ -18,3 +18,4 @@ from .get_classic_markers import get_classic_markers, number_of_classic_markers from .train_single import train_single, TrainedSingleReference +from .classify_single import classify_single diff --git a/src/singler/_utils.py b/src/singler/_utils.py index 96b1567..68518e8 100644 --- a/src/singler/_utils.py +++ b/src/singler/_utils.py @@ -66,7 +66,7 @@ def _stable_union(*args) -> list: return output -def _clean_matrix(x, features, assay_type, check_missing, num_threads): +def _extract_assay(x, features, assay_type): if isinstance(x, summarizedexperiment.SummarizedExperiment): if features is None: features = x.get_row_names() @@ -74,6 +74,11 @@ def _clean_matrix(x, features, assay_type, check_missing, num_threads): features = x.get_row_data().column(features) features = list(features) x = x.assay(assay_type) + return x, features + + +def _clean_matrix(x, features, assay_type, check_missing, num_threads): + x, features = _extract_assay(x, features, assay_type) curshape = x.shape if len(curshape) != 2: diff --git a/src/singler/classify_single.py b/src/singler/classify_single.py new file mode 100644 index 0000000..34de9ff --- /dev/null +++ b/src/singler/classify_single.py @@ -0,0 +1,94 @@ +from typing import Any, Sequence, Union + +import biocframe +import mattress + +from . 
import lib_singler as lib
+from ._utils import _extract_assay
+from .train_single import TrainedSingleReference
+
+
+def classify_single(
+    test_data: Any,
+    ref_prebuilt: TrainedSingleReference,
+    assay_type: Union[str, int] = 0,
+    quantile: float = 0.8,
+    use_fine_tune: bool = True,
+    fine_tune_threshold: float = 0.05,
+    num_threads: int = 1,
+) -> biocframe.BiocFrame:
+    """Classify a test dataset against a reference by assigning labels from the latter to each column of the former,
+    using the SingleR algorithm.
+
+    Args:
+        test_data:
+            A matrix-like object where each row is a feature and each column
+            is a test sample (usually a single cell), containing expression values.
+            Normalized and transformed expression values are also acceptable as only
+            the ranking is used within this function.
+
+            Alternatively, a
+            :py:class:`~summarizedexperiment.SummarizedExperiment.SummarizedExperiment`
+            containing such a matrix in one of its assays.
+
+        ref_prebuilt:
+            A pre-built reference created with
+            :py:meth:`~singler.train_single.train_single`.
+
+        assay_type:
+            Assay containing the expression matrix, if `test_data` is a
+            :py:class:`~summarizedexperiment.SummarizedExperiment.SummarizedExperiment`.
+
+        quantile:
+            Quantile of the correlation distribution for computing the score for each label.
+            Larger values increase the sensitivity of matches at the expense of
+            similarity to the average behavior of each label.
+
+        use_fine_tune:
+            Whether fine-tuning should be performed. This improves accuracy for distinguishing
+            between similar labels but requires more computational work.
+
+        fine_tune_threshold:
+            Maximum difference from the maximum correlation to use in fine-tuning.
+            All labels above this threshold are used for another round of fine-tuning.
+
+        num_threads:
+            Number of threads to use during classification.
+
+    Returns:
+        A :py:class:`~biocframe.BiocFrame.BiocFrame` containing the ``best``
+        label, the ``scores`` for each label (as a nested BiocFrame), and the
+        ``delta`` from the best to the second-best label. Each row corresponds
+        to a column of ``test_data``. The metadata contains ``markers``, a list of
+        the markers from each pairwise comparison between labels; and ``used``,
+        a list containing the union of markers from all comparisons.
+    """
+    test_data, _ = _extract_assay(test_data, None, assay_type=assay_type)
+    test_ptr = mattress.initialize(test_data)
+
+    best, raw_scores, delta = lib.classify_single(
+        test_ptr.ptr,
+        ref_prebuilt._ptr,
+        quantile,
+        use_fine_tune,
+        fine_tune_threshold,
+        num_threads
+    )
+
+    all_labels = ref_prebuilt.labels
+    scores = {}
+    for i, l in enumerate(all_labels):
+        scores[l] = raw_scores[i]
+    scores_df = biocframe.BiocFrame(scores, number_of_rows=test_data.shape[1], column_names=all_labels)
+
+    output = biocframe.BiocFrame({
+        "best": [all_labels[b] for b in best],
+        "scores": scores_df,
+        "delta": delta
+    })
+    output = output.set_metadata({
+        "used": ref_prebuilt.marker_subset(),
+        "markers": ref_prebuilt.markers,
+    })
+
+    return output
diff --git a/src/singler/classify_single_reference.py b/src/singler/classify_single_reference.py
deleted file mode 100644
index fa233f0..0000000
--- a/src/singler/classify_single_reference.py
+++ /dev/null
@@ -1,128 +0,0 @@
-from typing import Any, Sequence, Union
-
-from biocframe import BiocFrame
-from numpy import float64, int32, ndarray, uintp
-
-from .
import _cpphelpers as lib -from ._utils import _clean_matrix, _create_map -from .build_single_reference import SinglePrebuiltReference - - -def classify_single_reference( - test_data: Any, - test_features: Sequence, - ref_prebuilt: SinglePrebuiltReference, - assay_type: Union[str, int] = 0, - check_missing: bool = True, - quantile: float = 0.8, - use_fine_tune: bool = True, - fine_tune_threshold: float = 0.05, - num_threads: int = 1, -) -> BiocFrame: - """Classify a test dataset against a reference by assigning labels from the latter to each column of the former - using the SingleR algorithm. - - Args: - test_data: - A matrix-like object where each row is a feature and each column - is a test sample (usually a single cell), containing expression values. - Normalized and transformed expression values are also acceptable as only - the ranking is used within this function. - - Alternatively, a - :py:class:`~summarizedexperiment.SummarizedExperiment.SummarizedExperiment` - containing such a matrix in one of its assays. - - test_features: - Sequence of identifiers for each feature in the test - dataset, i.e., row in ``test_data``. - - If ``test_data`` is a ``SummarizedExperiment``, ``test_features`` - may be a string speciying the column name in `row_data`that contains the - features. Alternatively can be set to `None`, to use the `row_names` of - the experiment as used as features. - - ref_prebuilt: - A pre-built reference created with - :py:meth:`~singler.build_single_reference.build_single_reference`. - - assay_type: - Assay containing the expression matrix, - if `test_data` is a - :py:class:`~summarizedexperiment.SummarizedExperiment.SummarizedExperiment`. - - check_missing: - Whether to check for and remove rows with missing (NaN) values - from ``test_data``. - - quantile: - Quantile of the correlation distribution for computing the score for each label. - Larger values increase sensitivity of matches at the expense of - similarity to the average behavior of each label. - - use_fine_tune: - Whether fine-tuning should be performed. This improves accuracy for distinguishing - between similar labels but requires more computational work. - - fine_tune_threshold: - Maximum difference from the maximum correlation to use in fine-tuning. - All labels above this threshold are used for another round of fine-tuning. - - num_threads: - Number of threads to use during classification. - - Returns: - A data frame containing the ``best`` label, the ``scores`` - for each label (as a nested BiocFrame), and the ``delta`` from the best - to the second-best label. Each row corresponds to a column of ``test``. 
- """ - mat_ptr, test_features = _clean_matrix( - test_data, - test_features, - assay_type=assay_type, - check_missing=check_missing, - num_threads=num_threads, - ) - - nl = ref_prebuilt.num_labels() - nc = mat_ptr.ncol() - - best = ndarray((nc,), dtype=int32) - delta = ndarray((nc,), dtype=float64) - - scores = {} - all_labels = ref_prebuilt.labels - score_ptrs = ndarray((nl,), dtype=uintp) - for i in range(nl): - current = ndarray((nc,), dtype=float64) - scores[all_labels[i]] = current - score_ptrs[i] = current.ctypes.data - - mapping = _create_map(test_features) - - ref_subset = ref_prebuilt.marker_subset(indices_only=True) - ref_features = ref_prebuilt.features - subset = ndarray((len(ref_subset),), dtype=int32) - for i, y in enumerate(ref_subset): - x = ref_features[y] - if x not in mapping: - raise KeyError("failed to find gene '" + str(x) + "' in the test dataset") - subset[i] = mapping[x] - - lib.classify_single_reference( - mat_ptr.ptr, - subset, - ref_prebuilt._ptr, - quantile=quantile, - use_fine_tune=use_fine_tune, - fine_tune_threshold=fine_tune_threshold, - nthreads=num_threads, - scores=score_ptrs.ctypes.data, - best=best, - delta=delta, - ) - - scores_df = BiocFrame(scores, number_of_rows=nc) - return BiocFrame( - {"best": [all_labels[b] for b in best], "scores": scores_df, "delta": delta} - ) diff --git a/src/singler/train_single.py b/src/singler/train_single.py index b82f26a..58e3532 100644 --- a/src/singler/train_single.py +++ b/src/singler/train_single.py @@ -190,8 +190,8 @@ def train_single( ref_features_idx = numpy.array(range(len(ref_features)), dtype=numpy.uint32) else: common_features = _stable_intersect(test_features, ref_features) - test_features_idx = numpy.array(biocutils.match(test_features, common_features), dtype=numpy.uint32) - ref_features_idx = numpy.array(biocutils.match(ref_features, common_features), dtype=numpy.uint32) + test_features_idx = numpy.array(biocutils.match(common_features, test_features), dtype=numpy.uint32) + ref_features_idx = numpy.array(biocutils.match(common_features, ref_features), dtype=numpy.uint32) ref_ptr = mattress.initialize(ref_data) builder, _ = knncolle.define_builder(nn_parameters) @@ -227,7 +227,6 @@ def identify_genes(ref_data, ref_features, ref_labels, unique_labels, markers, m ) else: raise NotImplementedError("other marker methods are not yet implemented, sorry") - print(markers) return markers # Validating a user-supplied list of markers. 
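For orientation, here is a minimal end-to-end sketch of the renamed API, assembled from the tests in this series; it assumes a dense NumPy reference with string feature names and random data, so the labels themselves are meaningless:

    import numpy
    import singler

    # Reference: rows are features, columns are profiles with known labels.
    ref = numpy.random.rand(10000, 10)
    labels = ["A", "B", "C", "D", "E", "E", "D", "C", "B", "A"]
    features = [str(i) for i in range(ref.shape[0])]

    # train_single() supersedes build_single_reference(); classic markers
    # are computed automatically when none are supplied.
    built = singler.train_single(ref, labels, features)

    # classify_single() supersedes classify_single_reference(); the test
    # matrix is assumed to share the reference's feature order here.
    test = numpy.random.rand(10000, 50)
    res = singler.classify_single(test, built)
    best = res.column("best")  # one assigned label per test column
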
diff --git a/tests/test_classify_single_reference.py b/tests/test_classify_single.py similarity index 54% rename from tests/test_classify_single_reference.py rename to tests/test_classify_single.py index 1e45b0e..1428dff 100644 --- a/tests/test_classify_single_reference.py +++ b/tests/test_classify_single.py @@ -1,16 +1,16 @@ import singler import numpy +import knncolle -def test_classify_single_reference_simple(): +def test_classify_single_simple(): ref = numpy.random.rand(10000, 10) labels = ["A", "B", "C", "D", "E", "E", "D", "C", "B", "A"] features = [str(i) for i in range(ref.shape[0])] - markers = singler.get_classic_markers(ref, labels, features) - built = singler.build_single_reference(ref, labels, features, markers) + built = singler.train_single(ref, labels, features) test = numpy.random.rand(10000, 50) - output = singler.classify_single_reference(test, features, built) + output = singler.classify_single(test, built) assert output.shape[0] == 50 assert sorted(output.column("scores").column_names) == ["A", "B", "C", "D", "E"] @@ -19,7 +19,7 @@ def test_classify_single_reference_simple(): assert x in all_names -def test_classify_single_reference_sanity(): +def test_classify_single_sanity(): ref = numpy.random.rand(10000, 10) + 1 ref[:2000, :2] = 0 ref[2000:4000, 2:4] = 0 @@ -29,8 +29,7 @@ def test_classify_single_reference_sanity(): labels = ["A", "A", "B", "B", "C", "C", "D", "D", "E", "E"] features = [str(i) for i in range(ref.shape[0])] - markers = singler.get_classic_markers(ref, labels, features) - built = singler.build_single_reference(ref, labels, features, markers) + built = singler.train_single(ref, labels, features) test = numpy.random.rand(10000, 5) + 1 test[2000:4000, 0] = 0 # B @@ -39,26 +38,27 @@ def test_classify_single_reference_sanity(): test[8000:, 3] = 0 # E test[4000:6000, 4] = 0 # C - output = singler.classify_single_reference(test, features, built) + output = singler.classify_single(test, built) assert output.shape[0] == 5 assert output.column("best") == ["B", "D", "A", "E", "C"] -def test_classify_single_reference_features(): +def test_classify_single_features(): ref = numpy.random.rand(10000, 10) labels = ["A", "A", "B", "B", "C", "C", "D", "D", "E", "E"] features = [str(i) for i in range(ref.shape[0])] - markers = singler.get_classic_markers(ref, labels, features) - built = singler.build_single_reference(ref, labels, features, markers) + # Using an exact NN algorithm so that the ordering doesn't change the results. + built = singler.train_single(ref, labels, features, nn_parameters=knncolle.VptreeParameters()) test = numpy.random.rand(10000, 50) - revfeatures = features[::-1] - output = singler.classify_single_reference(test, revfeatures, built) + output = singler.classify_single(test, built) + # Checking that a different ordering of features in the test is respected. 
+ revfeatures = features[::-1] + revbuilt = singler.train_single(ref, labels, features, test_features=revfeatures, nn_parameters=knncolle.VptreeParameters()) revtest = numpy.array(test[::-1, :]) - unscrambled = singler.classify_single_reference(revtest, features, built) - assert output.column("best") == unscrambled.column("best") - assert (output.column("delta") == unscrambled.column("delta")).all() - assert ( - output.column("scores").column("A") == unscrambled.column("scores").column("A") - ).all() + revoutput = singler.classify_single(revtest, revbuilt) + + assert output.column("best") == revoutput.column("best") + assert numpy.isclose(output.column("delta"), revoutput.column("delta")).all() + assert numpy.isclose(output.column("scores").column("A"), revoutput.column("scores").column("A")).all() diff --git a/tests/test_train_single.py b/tests/test_train_single.py index f8b2f1b..bbb12b0 100644 --- a/tests/test_train_single.py +++ b/tests/test_train_single.py @@ -50,14 +50,13 @@ def test_train_single_restricted(): ) assert built.features == features - expected = singler.train_single(ref[keep, :], labels, restricted) + expected = singler.train_single(ref[keep,:], labels, restricted) assert built.markers == expected.markers assert built.marker_subset() == expected.marker_subset() - return # Check that the actual C++ content is the same. test = numpy.random.rand(10000, 50) - output = singler.classify_single_reference(test, features, built) - expected_output = singler.classify_single_reference(test, features, expected) + output = singler.classify_single(test, built) + expected_output = singler.classify_single(test[keep,:], expected) assert (output.column("delta") == expected_output.column("delta")).all() assert output.column("best") == expected_output.column("best") From be176facc71cd00a1cf2438b6677f2c31144ab56 Mon Sep 17 00:00:00 2001 From: LTLA Date: Wed, 11 Dec 2024 21:11:31 -0800 Subject: [PATCH 06/29] Got all the C++ code compiling again. --- lib/CMakeLists.txt | 2 + lib/build_integrated_references.cpp | 38 ---------------- lib/classify_integrated_references.cpp | 56 ----------------------- lib/src/classify_integrated.cpp | 63 ++++++++++++++++++++++++++ lib/src/init.cpp | 4 ++ lib/src/train_integrated.cpp | 62 +++++++++++++++++++++++++ setup.cfg | 2 +- 7 files changed, 132 insertions(+), 95 deletions(-) delete mode 100644 lib/build_integrated_references.cpp delete mode 100644 lib/classify_integrated_references.cpp create mode 100644 lib/src/classify_integrated.cpp create mode 100644 lib/src/train_integrated.cpp diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt index 3817cfc..a8eacf3 100644 --- a/lib/CMakeLists.txt +++ b/lib/CMakeLists.txt @@ -13,6 +13,8 @@ pybind11_add_module(singler src/find_classic_markers.cpp src/train_single.cpp src/classify_single.cpp + src/train_integrated.cpp + src/classify_integrated.cpp src/init.cpp ) diff --git a/lib/build_integrated_references.cpp b/lib/build_integrated_references.cpp deleted file mode 100644 index 0a20691..0000000 --- a/lib/build_integrated_references.cpp +++ /dev/null @@ -1,38 +0,0 @@ -#include "utils.h" // must be before all other includes. 
- -#include - -//[[export]] -void* build_integrated_references( - int32_t test_nrow, - const int32_t* /** numpy */ test_features, - int32_t nrefs, - const uintptr_t* /** void_p */ references, - const uintptr_t* /** void_p */ labels, - const uintptr_t* /** void_p */ ref_ids, - const uintptr_t* /** void_p */ prebuilt, - int32_t nthreads) -{ - singlepp::IntegratedBuilder runner; - runner.set_num_threads(nthreads); - - for (int32_t r = 0; r < nrefs; ++r) { - const auto& ptr = reinterpret_cast(references[r])->ptr; - runner.add( - test_nrow, - test_features, - ptr.get(), - reinterpret_cast(ref_ids[r]), - reinterpret_cast(labels[r]), - *reinterpret_cast(prebuilt[r]) - ); - } - - auto o = runner.finish(); - return new decltype(o)(std::move(o)); -} - -//[[export]] -void free_integrated_references(void* ptr) { - delete reinterpret_cast(ptr); -} diff --git a/lib/classify_integrated_references.cpp b/lib/classify_integrated_references.cpp deleted file mode 100644 index 3df92ea..0000000 --- a/lib/classify_integrated_references.cpp +++ /dev/null @@ -1,56 +0,0 @@ -#include "utils.h" // must be before raticate, singlepp includes. - -#include -#include -#include - -//[[export]] -void classify_integrated_references( - void* mat, - const uintptr_t* assigned /** void_p */, - void* prebuilt, - double quantile, - uintptr_t* scores /** void_p */, - int32_t* best /** numpy */, - double* delta /** numpy */, - int32_t nthreads) -{ - auto mptr = reinterpret_cast(mat); - auto NC = mptr->ptr->ncol(); - auto bptr = reinterpret_cast(prebuilt); - - // Only necessary as IntegratedScorer::run() isn't templated yet. - size_t nrefs = bptr->num_references(); - std::vector > single_results; - std::vector single_results_ptr; - single_results.reserve(nrefs); - single_results_ptr.reserve(nrefs); - - for (size_t r = 0; r < nrefs; ++r) { - auto current = reinterpret_cast(assigned[r]); - single_results.emplace_back(current, current + NC); - single_results_ptr.emplace_back(single_results.back().data()); - } - - singlepp::IntegratedScorer runner; - runner.set_num_threads(nthreads); - runner.set_quantile(quantile); - - std::vector best_copy(NC); - std::vector score_ptrs(nrefs); - for (size_t r = 0; r < nrefs; ++r) { - score_ptrs[r] = reinterpret_cast(scores[r]); - } - - runner.run( - mptr->ptr.get(), - single_results_ptr, - *bptr, - best_copy.data(), - score_ptrs, - delta - ); - - std::copy(best_copy.begin(), best_copy.end(), best); - return; -} diff --git a/lib/src/classify_integrated.cpp b/lib/src/classify_integrated.cpp new file mode 100644 index 0000000..afa1df2 --- /dev/null +++ b/lib/src/classify_integrated.cpp @@ -0,0 +1,63 @@ +#include "def.h" +#include "utils.h" + +#include "singlepp/singlepp.hpp" +#include "tatami/tatami.hpp" +#include "pybind11/pybind11.h" + +#include +#include +#include + +pybind11::tuple classify_integrated( + const MatrixPointer& test, + const pybind11::list& results, + const TrainedIntegratedPointer& integrated_build, + double quantile, + bool use_fine_tune, + double fine_tune_threshold, + int nthreads) +{ + // Setting up the previous results. + size_t num_refs = results.size(); + std::vector previous_results; + previous_results.reserve(num_refs); + for (size_t r = 0; r < num_refs; ++r) { + const auto& curres = results[r].cast(); + previous_results.push_back(check_numpy_array(curres)); + } + + // Setting up outputs. 
+ size_t ncells = test->ncol(); + pybind11::array_t best(ncells); + pybind11::array_t delta(ncells); + + singlepp::ClassifyIntegratedBuffers buffers; + buffers.best = static_cast(best.request().ptr); + buffers.delta = static_cast(delta.request().ptr); + + pybind11::list scores(num_refs); + buffers.scores.resize(num_refs); + for (size_t l = 0; l < num_refs; ++l) { + scores[l] = pybind11::array_t(ncells); + buffers.scores[l] = static_cast(scores[l].cast().request().ptr); + } + + // Running the integrated scoring. + singlepp::ClassifyIntegratedOptions opts; + opts.num_threads = nthreads; + opts.quantile = quantile; + opts.fine_tune = use_fine_tune; + opts.fine_tune_threshold = fine_tune_threshold; + singlepp::classify_integrated(*test, previous_results, *integrated_build, buffers, opts); + + pybind11::tuple output(3); + output[0] = best; + output[1] = scores; + output[2] = delta; + return output; +} + +void init_classify_integrated(pybind11::module& m) { + m.def("classify_integrated", &classify_integrated); +} diff --git a/lib/src/init.cpp b/lib/src/init.cpp index 67dd2e9..7a7a700 100644 --- a/lib/src/init.cpp +++ b/lib/src/init.cpp @@ -4,11 +4,15 @@ void init_find_classic_markers(pybind11::module&); void init_train_single(pybind11::module&); void init_classify_single(pybind11::module&); +void init_train_integrated(pybind11::module&); +void init_classify_integrated(pybind11::module&); PYBIND11_MODULE(lib_singler, m) { init_find_classic_markers(m); init_train_single(m); init_classify_single(m); + init_train_integrated(m); + init_classify_integrated(m); pybind11::class_(m, "TrainSingleIntersect"); pybind11::class_(m, "TrainIntegrated"); diff --git a/lib/src/train_integrated.cpp b/lib/src/train_integrated.cpp new file mode 100644 index 0000000..12a154f --- /dev/null +++ b/lib/src/train_integrated.cpp @@ -0,0 +1,62 @@ +#include "def.h" +#include "utils.h" + +#include "singlepp/singlepp.hpp" +#include "tatami/tatami.hpp" +#include "pybind11/pybind11.h" + +#include +#include +#include + +TrainedIntegratedPointer train_integrated( + const pybind11::list& test_features, + const pybind11::list& references, + const pybind11::list& ref_features, + const pybind11::list& labels, + const pybind11::list& prebuilt, + int nthreads) +{ + size_t nrefs = references.size(); + std::vector > inputs; + inputs.reserve(nrefs); + std::vector > intersections(nrefs); + + for (size_t r = 0; r < nrefs; ++r) { + const auto& curref = references[r].cast(); + const auto& curlabels = labels[r].cast(); + const auto& curbuilt = prebuilt[r].cast(); + + const auto& test_ids = test_features[r].cast(); + const auto& ref_ids = ref_features[r].cast(); + size_t ninter = test_ids.size(); + if (ninter != static_cast(ref_ids.size())) { + throw std::runtime_error("length of each entry of 'test_features' and 'ref_features' should be the same"); + } + const auto* test_ids_ptr = check_numpy_array(test_ids); + const auto* ref_ids_ptr = check_numpy_array(ref_ids); + + auto& curinter = intersections[r]; + curinter.reserve(ninter); + for (size_t i = 0; i < ninter; ++i) { + curinter.emplace_back(test_ids_ptr[i], ref_ids_ptr[i]); + } + + inputs.push_back(singlepp::prepare_integrated_input_intersect( + curinter, + *curref, + check_numpy_array(curlabels), + *curbuilt + )); + } + + singlepp::TrainIntegratedOptions opts; + opts.num_threads = nthreads; + auto finished = singlepp::train_integrated(std::move(inputs), opts); + + return TrainedIntegratedPointer(new TrainedIntegrated(std::move(finished))); +} + +void init_train_integrated(pybind11::module& m) 
{ + m.def("train_integrated", &train_integrated); +} diff --git a/setup.cfg b/setup.cfg index b4171a7..24fa793 100644 --- a/setup.cfg +++ b/setup.cfg @@ -49,7 +49,7 @@ python_requires = >=3.8 # For more information, check out https://semver.org/. install_requires = importlib-metadata; python_version<"3.8" - mattress>=0.3.0 + mattress knncolle delayedarray biocframe>=0.5.0 From 24de82a1b8cacea945db62dbd46c7b7a071fb822 Mon Sep 17 00:00:00 2001 From: LTLA Date: Wed, 11 Dec 2024 22:22:37 -0800 Subject: [PATCH 07/29] Fixed up the Python code in preparation for the finishing touch tomorrow. --- src/singler/__init__.py | 2 + src/singler/build_integrated_references.py | 171 ------------------ src/singler/classify_integrated.py | 130 +++++++++++++ src/singler/classify_integrated_references.py | 143 --------------- src/singler/train_integrated.py | 98 ++++++++++ src/singler/train_single.py | 8 +- 6 files changed, 237 insertions(+), 315 deletions(-) delete mode 100644 src/singler/build_integrated_references.py create mode 100644 src/singler/classify_integrated.py delete mode 100644 src/singler/classify_integrated_references.py create mode 100644 src/singler/train_integrated.py diff --git a/src/singler/__init__.py b/src/singler/__init__.py index 3e8a4a9..6938bbc 100644 --- a/src/singler/__init__.py +++ b/src/singler/__init__.py @@ -19,3 +19,5 @@ from .get_classic_markers import get_classic_markers, number_of_classic_markers from .train_single import train_single, TrainedSingleReference from .classify_single import classify_single +from .train_integrated import train_integrated, TrainedIntegratedReferences +from .classify_integrated import classify_integrated diff --git a/src/singler/build_integrated_references.py b/src/singler/build_integrated_references.py deleted file mode 100644 index 19cb2d7..0000000 --- a/src/singler/build_integrated_references.py +++ /dev/null @@ -1,171 +0,0 @@ -from typing import Sequence, Optional, Union -from numpy import array, ndarray, int32, uintp - -import biocutils as ut - -from .build_single_reference import SinglePrebuiltReference -from . import _cpphelpers as lib -from ._utils import _stable_union, _factorize, _clean_matrix - - -class IntegratedReferences: - """Object containing integrated references, typically constructed by - :py:meth:`~singler.build_integrated_references.build_integrated_references`.""" - - def __init__(self, ptr, ref_names, ref_labels, test_features): - self._ptr = ptr - self._names = ref_names - self._labels = ref_labels - self._features = test_features - - def __del__(self): - lib.free_integrated_references(self._ptr) - - @property - def reference_names(self) -> Union[Sequence[str], None]: - """Sequence containing the names of the references. Alternatively - None, if no names were supplied.""" - return self._names - - @property - def reference_labels(self) -> list: - """List of lists containing the names of the labels for each reference. - - Each entry corresponds to a reference in :py:attr:`~reference_names`, - if ``reference_names`` is not None. 
- """ - return self._labels - - @property - def test_features(self) -> list[str]: - """Sequence containing the names of the test features.""" - return self._features - - -def build_integrated_references( - test_features: Sequence, - ref_data_list: dict, - ref_labels_list: list[Sequence], - ref_features_list: list[Sequence], - ref_prebuilt_list: list[SinglePrebuiltReference], - ref_names: Optional[Sequence[str]] = None, - assay_type: Union[str, int] = "logcounts", - check_missing: bool = True, - num_threads: int = 1, -) -> IntegratedReferences: - """Build a set of integrated references for classification of a test dataset. - - Arguments: - test_features: - Sequence of features for the test dataset. - - ref_data_list: - List of reference datasets, where each entry is equivalent to ``ref_data`` in - :py:meth:`~singler.build_single_reference.build_single_reference`. - - ref_labels_list: - List of reference labels, where each entry is equivalent to ``ref_labels`` in - :py:meth:`~singler.build_single_reference.build_single_reference`. - - ref_features_list: - List of reference features, where each entry is equivalent to ``ref_features`` in - :py:meth:`~singler.build_single_reference.build_single_reference`. - - ref_prebuilt_list: - List of prebuilt references, typically created by - calling :py:meth:`~singler.build_single_reference.build_single_reference` on the corresponding - elements of ``ref_data_list``, ``ref_labels_list`` and ``ref_features_list``. - - ref_names: - Sequence of names for the references. - If None, these are automatically generated. - - assasy_type: - Assay containing the expression matrix for any entry of ``ref_data_list`` that is a - :py:class:`~summarizedexperiment.SummarizedExperiment.SummarizedExperiment`. - - check_missing: - Whether to check for and remove rows with missing (NaN) values - from each entry of ``ref_data_list``. - - num_threads: - Number of threads. - - Returns: - Integrated references for classification with - :py:meth:`~singler.classify_integrated_references.classify_integrated_references`. 
- """ - nrefs = len(ref_data_list) - if nrefs != len(ref_features_list): - raise ValueError( - "'ref_features_list' and 'ref_data_list' should have the same length" - ) - - universe = _stable_union(test_features, *ref_features_list) - original_test_features = test_features - test_features = array(ut.match(test_features, universe), dtype=int32) - - converted_ref_data = [] - ref_data_ptrs = ndarray(nrefs, dtype=uintp) - converted_feature_data = [] - ref_features_ptrs = ndarray(nrefs, dtype=uintp) - - for i, x in enumerate(ref_data_list): - curptr, curfeatures = _clean_matrix( - x, - ref_features_list[i], - assay_type=assay_type, - check_missing=check_missing, - num_threads=num_threads, - ) - converted_ref_data.append(curptr) - ref_data_ptrs[i] = curptr.ptr - - ind = array(ut.match(curfeatures, universe), dtype=int32) - converted_feature_data.append(ind) - ref_features_ptrs[i] = ind.ctypes.data - - if nrefs != len(ref_labels_list): - raise ValueError( - "'ref_labels_list' and 'ref_data_list' should have the same length" - ) - converted_label_levels = [] - converted_label_indices = [] - ref_labels_ptrs = ndarray(nrefs, dtype=uintp) - for i, x in enumerate(ref_labels_list): - lev, ind = _factorize(x) - converted_label_levels.append(lev) - ind = array(ind, dtype=int32) - converted_label_indices.append(ind) - ref_labels_ptrs[i] = ind.ctypes.data - - if nrefs != len(ref_prebuilt_list): - raise ValueError( - "'ref_prebuilt_list' and 'ref_data_list' should have the same length" - ) - ref_prebuilt_ptrs = ndarray(nrefs, dtype=uintp) - for i, x in enumerate(ref_prebuilt_list): - ref_prebuilt_ptrs[i] = x._ptr - - if ref_names is not None: - if nrefs != len(ref_names): - raise ValueError( - "'ref_names' and 'ref_data_list' should have the same length" - ) - elif nrefs != len(set(ref_names)): - raise ValueError("'ref_names' should contain unique names") - - output = lib.build_integrated_references( - len(test_features), - test_features, - nrefs, - ref_data_ptrs.ctypes.data, - ref_labels_ptrs.ctypes.data, - ref_features_ptrs.ctypes.data, - ref_prebuilt_ptrs.ctypes.data, - num_threads, - ) - - return IntegratedReferences( - output, ref_names, converted_label_levels, original_test_features - ) diff --git a/src/singler/classify_integrated.py b/src/singler/classify_integrated.py new file mode 100644 index 0000000..13aa002 --- /dev/null +++ b/src/singler/classify_integrated.py @@ -0,0 +1,130 @@ +from typing import Any, Sequence, Union + +import biocutils +import biocframe +import mattress +import numpy + +from . import lib_singler as lib +from .train_integrated import TrainedIntegratedReferences +from ._utils import _extract_assay + + +def classify_integrated( + test_data: Any, + results: list[biocframe.BiocFrame], + integrated_prebuilt: TrainedIntegratedReferences, + assay_type: Union[str, int] = 0, + quantile: float = 0.8, + use_fine_tune: bool = True, + fine_tune_threshold: float = 0.05, + num_threads: int = 1, +) -> biocframe.BiocFrame: + """Integrate classification results across multiple references for a single test dataset. + + Args: + test_data: + A matrix-like object where each row is a feature and each column + is a test sample (usually a single cell), containing expression values. + Normalized and/or transformed expression values are also acceptable as only + the ranking is used within this function. + + Alternatively, a + :py:class:`~summarizedexperiment.SummarizedExperiment.SummarizedExperiment` + containing such a matrix in one of its assays. 
+
+        results:
+            List of classification results generated by running
+            :py:meth:`~singler.classify_single.classify_single` on
+            ``test_data`` with each reference. This may be either the full data
+            frame or just the ``"best"`` column. References should be ordered
+            as in ``integrated_prebuilt.reference_names``.
+
+        integrated_prebuilt:
+            Integrated reference object, constructed with
+            :py:meth:`~singler.train_integrated.train_integrated`.
+
+        assay_type: Assay containing the expression matrix,
+            if `test_data` is a
+            :py:class:`~summarizedexperiment.SummarizedExperiment.SummarizedExperiment`.
+
+        quantile:
+            Quantile of the correlation distribution for computing the score for each label.
+            Larger values increase the sensitivity of matches at the expense of
+            similarity to the average behavior of each label.
+
+        use_fine_tune:
+            Whether fine-tuning should be performed. This improves accuracy for
+            distinguishing between references with similar best labels but
+            requires more computational work.
+
+        fine_tune_threshold:
+            Maximum difference from the maximum correlation to use in
+            fine-tuning. All references above this threshold are used for
+            another round of fine-tuning.
+
+        num_threads:
+            Number of threads to use during classification.
+
+    Returns:
+        A :py:class:`~biocframe.BiocFrame.BiocFrame` containing the
+        ``best_label`` across all references, defined as the assigned label in
+        the best reference; the identity of the ``best_reference``, either as a
+        name string or an integer index; the ``scores`` for the best label in
+        each reference, as a nested BiocFrame of BiocFrames; and the ``delta``
+        from the best to the second-best reference. Each row corresponds to a
+        column of ``test_data``.
+    """
+    test_data, _ = _extract_assay(test_data, None, assay_type=assay_type)
+    ref_labs = integrated_prebuilt.reference_labels
+
+    # Applying the sanity checks.
+    if len(results) != len(ref_labs):
+        raise ValueError("length of 'results' should equal the number of references")
+
+    collated = []
+    all_best = []
+    for i, curres in enumerate(results):
+        if isinstance(curres, biocframe.BiocFrame):
+            curres = curres.column("best")
+        if test_data.shape[1] != len(curres):
+            raise ValueError("each entry of 'results' should have results for all cells in 'test_data'")
+        mapped = biocutils.match(curres, ref_labs[i])
+        for m in mapped:
+            if m < 0:
+                raise ValueError("not all labels in 'results' are present in the corresponding reference")
+        all_best.append(curres)
+        collated.append(numpy.array(mapped, dtype=numpy.uint32))
+
+    test_ptr = mattress.initialize(test_data)
+    best_ref, raw_scores, delta = lib.classify_integrated(
+        test_ptr.ptr,
+        collated,
+        integrated_prebuilt._ptr,
+        quantile,
+        use_fine_tune,
+        fine_tune_threshold,
+        num_threads
+    )
+
+    best_label = [all_best[b][i] for i, b in enumerate(best_ref)]
+
+    all_refs = integrated_prebuilt.reference_names
+    has_names = all_refs is not None
+    if not has_names:
+        all_refs = [str(i) for i in range(len(ref_labs))]
+    scores = {}
+    for i, r in enumerate(all_refs):
+        scores[r] = biocframe.BiocFrame({ "label": all_best[i], "score": raw_scores[i] })
+    scores_df = biocframe.BiocFrame(scores, number_of_rows=test_data.shape[1], column_names=all_refs)
+
+    if has_names:
+        best_ref = [all_refs[b] for b in best_ref]
+
+    return biocframe.BiocFrame({
+        "best_label": best_label,
+        "best_reference": best_ref,
+        "scores": scores_df,
+        "delta": delta,
+    })
diff --git a/src/singler/classify_integrated_references.py b/src/singler/classify_integrated_references.py
deleted file mode 100644
index 79f1845..0000000
--- a/src/singler/classify_integrated_references.py
+++ /dev/null
@@ -1,143 +0,0 @@
-from typing import Any, Sequence, Union
-
-import biocutils as ut
-from biocframe import BiocFrame
-from mattress import TatamiNumericPointer, tatamize
-from numpy import array, float64, int32, ndarray, uintp
-from summarizedexperiment import SummarizedExperiment
-
-from . import _cpphelpers as lib
-from .build_integrated_references import IntegratedReferences
-
-
-def classify_integrated_references(
-    test_data: Any,
-    results: list[Union[BiocFrame, Sequence]],
-    integrated_prebuilt: IntegratedReferences,
-    assay_type: Union[str, int] = 0,
-    quantile: float = 0.8,
-    num_threads: int = 1,
-) -> BiocFrame:
-    """Integrate classification results across multiple references for a single test dataset.
-
-    Args:
-        test_data:
-            A matrix-like object where each row is a feature and each column
-            is a test sample (usually a single cell), containing expression values.
-            Normalized and/or transformed expression values are also acceptable as only
-            the ranking is used within this function.
-
-            Alternatively, a
-            :py:class:`~summarizedexperiment.SummarizedExperiment.SummarizedExperiment`
-            containing such a matrix in one of its assays.
-
-        results:
-            List of classification results generated by running
-            :py:meth:`~singler.classify_single_reference.classify_single_reference`
-            on ``test_data`` with each reference. This may be either the full
-            data frame or just the ``"best"`` column. References should be ordered
-            as in ``integrated_prebuilt.reference_names``.
-
-        integrated_prebuilt:
-            Integrated reference object, constructed with
-            :py:meth:`~singler.build_integrated_references.build_integrated_references`.
- - assay_type: Assay containing the expression matrix, - if `test_data` is a - :py:class:`~summarizedexperiment.SummarizedExperiment.SummarizedExperiment`. - - quantile: - Quantile of the correlation distribution for computing the score for each label. - Larger values increase sensitivity of matches at the expense of - similarity to the average behavior of each label. - - num_threads: - Number of threads to use during classification. - - Returns: - A data frame containing the ``best_label`` across all - references, defined as the assigned label in the best reference; the - identity of the ``best_reference``, either as a name string or an - integer index; the ``scores`` for each reference, as a nested - BiocFrame; and the ``delta`` from the best to the second-best - reference. Each row corresponds to a column of ``test``. - """ - # Don't use _clean_matrix; the features are fixed so no filtering is possible at this point. - if not isinstance(test_data, TatamiNumericPointer): - if isinstance(test_data, SummarizedExperiment): - test_data = test_data.assay(assay_type) - - test_ptr = tatamize(test_data) - if test_ptr.nrow() != len(integrated_prebuilt.test_features): - raise ValueError( - "number of rows in 'test_data' should equal number of features in 'integrated_prebuilt'" - ) - nc = test_ptr.ncol() - - all_labels = integrated_prebuilt.reference_labels - nrefs = len(all_labels) - coerced_labels = [] - - all_refs = integrated_prebuilt.reference_names - has_names = all_refs is not None - if not has_names: - all_refs = [str(i) for i in range(nrefs)] - - scores = {} - score_ptrs = ndarray((nrefs,), dtype=uintp) - assign_ptrs = ndarray((nrefs,), dtype=uintp) - - if len(all_refs) != len(results): - raise ValueError( - "length of 'results' should equal number of references in 'integrated_prebuilt'" - ) - - for i, r in enumerate(all_refs): - current = ndarray((nc,), dtype=float64) - scores[r] = current - score_ptrs[i] = current.ctypes.data - - curlabs = results[i] - if isinstance(curlabs, BiocFrame): - curlabs = curlabs.column("best") - if len(curlabs) != nc: - raise ValueError( - "each entry of 'results' should have results for all cells in 'test_data'" - ) - - ind = array(ut.match(curlabs, all_labels[i]), dtype=int32) - coerced_labels.append(ind) - assign_ptrs[i] = ind.ctypes.data - - best = ndarray((nc,), dtype=int32) - delta = ndarray((nc,), dtype=float64) - lib.classify_integrated_references( - test_ptr.ptr, - assign_ptrs.ctypes.data, - integrated_prebuilt._ptr, - quantile, - score_ptrs.ctypes.data, - best, - delta, - num_threads, - ) - - best_label = [] - for i, b in enumerate(best): - if isinstance(results[b], BiocFrame): - best_label.append(results[b].column("best")[i]) - else: - best_label.append(results[b][i]) - - if has_names: - best = [all_refs[b] for b in best] - - scores_df = BiocFrame(scores, number_of_rows=nc) - return BiocFrame( - { - "best_label": best_label, - "best_reference": best, - "scores": scores_df, - "delta": delta, - } - ) diff --git a/src/singler/train_integrated.py b/src/singler/train_integrated.py new file mode 100644 index 0000000..15b4828 --- /dev/null +++ b/src/singler/train_integrated.py @@ -0,0 +1,98 @@ +from typing import Sequence, Optional, Union +import numpy +import biocutils +import warnings + +from .train_single import TrainedSingleReference +from . 
import lib_singler as lib +from ._utils import _stable_union, _stable_intersect + + +class TrainedIntegratedReferences: + """Object containing integrated references, typically constructed by + :py:meth:`~singler.train_integrated.train_integrated`.""" + + def __init__(self, ptr, ref_names, ref_labels): + self._ptr = ptr + self._names = ref_names + self._labels = ref_labels + + @property + def reference_names(self) -> Union[Sequence[str], None]: + """Sequence containing the names of the references. Alternatively + None, if no names were supplied.""" + return self._names + + @property + def reference_labels(self) -> list: + """List of lists containing the names of the labels for each reference. + + Each entry corresponds to a reference in :py:attr:`~reference_names`, + if ``reference_names`` is not None. + """ + return self._labels + + +def train_integrated( + test_features: Sequence, + ref_prebuilt: list[TrainedSingleReference], + ref_names: Optional[Sequence[str]] = None, + warn_lost: bool = True, + num_threads: int = 1, +) -> TrainedIntegratedReferences: + """Build a set of integrated references for classification of a test dataset. + + Arguments: + test_features: + Sequence of features for the test dataset. + + ref_prebuilt: + List of prebuilt references, typically created by calling + :py:meth:`~singler.build_single_reference.train_single`. + + ref_names: + Sequence of names for the references. If None, these are + automatically generated. + + warn_lost: + Whether to emit a warning if the markers for each reference are not + all present in all references. + + num_threads: + Number of threads. + + Returns: + Integrated references for classification with + :py:meth:`~singler.classify_integrated_references.classify_integrated`. + """ + # Checking the genes. + if warn_lost: + all_refnames = [x.features for x in ref_prebuilt] + intersected = set(_stable_intersect(*all_refnames)) + for trained in ref_prebuilt: + for g in trained.marker_subset(): + if g not in intersected: + warnings.warn("not all markers in 'ref_prebuilt' are available in each reference") + + all_inter_test = [] + all_inter_ref = [] + for i, trained in enumerate(ref_prebuilt): + common = _stable_intersect(test_features, trained.features) + all_inter_test.append(numpy.array(biocutils.match(common, test_features), numpy.uint32)) + all_inter_ref.append(numpy.array(biocutils.match(common, ref_features), numpy.uint32)) + + # Applying the integration. 
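+    # For each reference, the binding receives the row indices that map the
+    # shared features back to the test and reference matrices, along with the
+    # label codes and the prebuilt single-reference pointer.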
+ ibuilt = train_integrated( + all_inter_test, + [mattress.initialize(x._all_data) for x in ref_prebuilt], + all_inter_ref, + [x._all_label_codes for x in ref_prebuilt], + [x._ptr for x in ref_prebuilt], + num_threads + ) + + return TrainedIntegratedReferences( + ptr=ibuilt, + ref_names=ref_names, + ref_labels=[x.labels for x in ref_prebuilt] + ) diff --git a/src/singler/train_single.py b/src/singler/train_single.py index 58e3532..6495976 100644 --- a/src/singler/train_single.py +++ b/src/singler/train_single.py @@ -20,11 +20,15 @@ class TrainedSingleReference: def __init__( self, ptr, + full_data, + full_label_codes: numpy.ndarray, labels: Sequence, features: Sequence, - markers: dict[Any, dict[Any, Sequence]], + markers: dict[Any, dict[Any, Sequence]] ): self._ptr = ptr + self._full_data = full_data + self._full_label_codes = full_label_codes self._features = features self._labels = labels self._markers = markers @@ -205,6 +209,8 @@ def train_single( builder, num_threads, ), + full_data = ref_data, + full_label_codes = label_idx, labels = unique_labels, features = ref_features, markers = markers, From 957ca88409a34cc951589514ecdaa7d772745500 Mon Sep 17 00:00:00 2001 From: LTLA Date: Thu, 12 Dec 2024 11:14:54 -0800 Subject: [PATCH 08/29] Bring the integrated functions back online. --- src/singler/classify_integrated.py | 9 ++--- src/singler/train_integrated.py | 12 ++++-- ...erences.py => test_classify_integrated.py} | 38 ++++++++----------- ...references.py => test_train_integrated.py} | 22 ++++------- 4 files changed, 35 insertions(+), 46 deletions(-) rename tests/{test_classify_integrated_references.py => test_classify_integrated.py} (56%) rename tests/{test_build_integrated_references.py => test_train_integrated.py} (55%) diff --git a/src/singler/classify_integrated.py b/src/singler/classify_integrated.py index 13aa002..332f6e9 100644 --- a/src/singler/classify_integrated.py +++ b/src/singler/classify_integrated.py @@ -36,16 +36,15 @@ def classify_integrated( results: List of classification results generated by running :py:meth:`~singler.classify_single.classify_single` on - ``test_data`` with each reference. This may be either the full data - frame or just the ``"best"`` column. References should be ordered - as in ``integrated_prebuilt.reference_names``. + ``test_data`` with each reference. References should be ordered as + in ``integrated_prebuilt.reference_names``. integrated_prebuilt: Integrated reference object, constructed with :py:meth:`~singler.train_integrated.train_integrated`. - assay_type: Assay containing the expression matrix, - if `test_data` is a + assay_type: + Assay containing the expression matrix, if `test_data` is a :py:class:`~summarizedexperiment.SummarizedExperiment.SummarizedExperiment`. quantile: diff --git a/src/singler/train_integrated.py b/src/singler/train_integrated.py index 15b4828..7573353 100644 --- a/src/singler/train_integrated.py +++ b/src/singler/train_integrated.py @@ -1,7 +1,9 @@ from typing import Sequence, Optional, Union + import numpy import biocutils import warnings +import mattress from .train_single import TrainedSingleReference from . 
import lib_singler as lib @@ -79,14 +81,16 @@ def train_integrated( for i, trained in enumerate(ref_prebuilt): common = _stable_intersect(test_features, trained.features) all_inter_test.append(numpy.array(biocutils.match(common, test_features), numpy.uint32)) - all_inter_ref.append(numpy.array(biocutils.match(common, ref_features), numpy.uint32)) + all_inter_ref.append(numpy.array(biocutils.match(common, trained.features), numpy.uint32)) + + all_data = [mattress.initialize(x._full_data) for x in ref_prebuilt] # Applying the integration. - ibuilt = train_integrated( + ibuilt = lib.train_integrated( all_inter_test, - [mattress.initialize(x._all_data) for x in ref_prebuilt], + [x.ptr for x in all_data], all_inter_ref, - [x._all_label_codes for x in ref_prebuilt], + [x._full_label_codes for x in ref_prebuilt], [x._ptr for x in ref_prebuilt], num_threads ) diff --git a/tests/test_classify_integrated_references.py b/tests/test_classify_integrated.py similarity index 56% rename from tests/test_classify_integrated_references.py rename to tests/test_classify_integrated.py index 0663a8b..9b03456 100644 --- a/tests/test_classify_integrated_references.py +++ b/tests/test_classify_integrated.py @@ -2,7 +2,7 @@ import numpy -def test_classify_integrated_references(): +def test_classify_integrated(): all_features = [str(i) for i in range(10000)] test_features = [all_features[i] for i in range(0, 10000, 2)] test_set = set(test_features) @@ -10,34 +10,31 @@ def test_classify_integrated_references(): ref1 = numpy.random.rand(8000, 10) labels1 = ["A", "B", "C", "D", "E", "E", "D", "C", "B", "A"] features1 = [all_features[i] for i in range(8000)] - built1 = singler.build_single_reference( - ref1, labels1, features1, restrict_to=test_set + built1 = singler.train_single( + ref1, labels1, features1, test_features=test_set ) ref2 = numpy.random.rand(8000, 6) labels2 = ["z", "y", "x", "z", "y", "z"] features2 = [all_features[i] for i in range(2000, 10000)] - built2 = singler.build_single_reference( - ref2, labels2, features2, restrict_to=test_set + built2 = singler.train_single( + ref2, labels2, features2, test_features=test_set ) - integrated = singler.build_integrated_references( + integrated = singler.train_integrated( test_features, - ref_data_list=[ref1, ref2], - ref_labels_list=[labels1, labels2], - ref_features_list=[features1, features2], - ref_prebuilt_list=[built1, built2], + ref_prebuilt=[built1, built2], ref_names=["first", "second"], ) # Running the full analysis. test = numpy.random.rand(len(test_features), 50) - results1 = singler.classify_single_reference(test, test_features, built1) - results2 = singler.classify_single_reference(test, test_features, built2) + results1 = singler.classify_single(test, built1) + results2 = singler.classify_single(test, built2) - results = singler.classify_integrated_references( + results = singler.classify_integrated( test, - results=[results1, results2.column("best")], + results=[results1, results2], integrated_prebuilt=integrated, ) @@ -54,18 +51,15 @@ def test_classify_integrated_references(): assert results2.column("best")[i] in labels2_set # Repeating without names. 
- integrated = singler.build_integrated_references( + integrated_un = singler.build_integrated_references( test_features, - ref_data_list=[ref1, ref2], - ref_labels_list=[labels1, labels2], - ref_features_list=[features1, features2], - ref_prebuilt_list=[built1, built2], + ref_prebuilt=[built1, built2], ) - results = singler.classify_integrated_references( + results_un = singler.classify_integrated( test, - results=[results1, results2.column("best")], - integrated_prebuilt=integrated, + results=[results1, results2], + integrated_prebuilt=integrated_un, ) assert results.shape[0] == 50 diff --git a/tests/test_build_integrated_references.py b/tests/test_train_integrated.py similarity index 55% rename from tests/test_build_integrated_references.py rename to tests/test_train_integrated.py index 0ea2f93..064d8b4 100644 --- a/tests/test_build_integrated_references.py +++ b/tests/test_train_integrated.py @@ -2,44 +2,36 @@ import numpy -def test_build_integrated_references(): +def test_train_integrated(): all_features = [str(i) for i in range(10000)] ref1 = numpy.random.rand(8000, 10) labels1 = ["A", "B", "C", "D", "E", "E", "D", "C", "B", "A"] features1 = [all_features[i] for i in range(8000)] - built1 = singler.build_single_reference(ref1, labels1, features1) + built1 = singler.train_single(ref1, labels1, features1) ref2 = numpy.random.rand(8000, 6) labels2 = ["z", "y", "x", "z", "y", "z"] features2 = [all_features[i] for i in range(2000, 10000)] - built2 = singler.build_single_reference(ref2, labels2, features2) + built2 = singler.train_single(ref2, labels2, features2) test_features = [all_features[i] for i in range(0, 10000, 2)] - integrated = singler.build_integrated_references( + integrated = singler.train_integrated( test_features, - ref_data_list=[ref1, ref2], - ref_labels_list=[labels1, labels2], - ref_features_list=[features1, features2], - ref_prebuilt_list=[built1, built2], + ref_prebuilt=[built1, built2] ) assert integrated.reference_names == None assert list(integrated.reference_labels[0]) == ["A", "B", "C", "D", "E"] assert list(integrated.reference_labels[1]) == ["z", "y", "x"] - assert integrated.test_features == test_features # Works in parallel. - pintegrated = singler.build_integrated_references( + pintegrated = singler.train_integrated( test_features, - ref_data_list=[ref1, ref2], - ref_labels_list=[labels1, labels2], - ref_features_list=[features1, features2], - ref_prebuilt_list=[built1, built2], + ref_prebuilt=[built1, built2], ref_names=["FOO", "BAR"], num_threads=3, ) assert pintegrated.reference_names == ["FOO", "BAR"] assert pintegrated.reference_labels == integrated.reference_labels - assert pintegrated.test_features == test_features From d23284992e92b3dc458ca3f4a6bc2edd3e29258a Mon Sep 17 00:00:00 2001 From: LTLA Date: Thu, 12 Dec 2024 11:17:40 -0800 Subject: [PATCH 09/29] Use the latest match() that allows a variable dtype. 
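
The new dtype= argument in biocutils.match() collapses the old two-step
pattern where the returned list had to be copied into a typed array for
the C++ layer. A minimal sketch of the difference on toy labels
(assuming every value is present in the targets):

```python
import numpy
import biocutils

labels = ["B", "A", "B", "C"]
levels = ["A", "B", "C"]

# Old pattern: match() returns a list, which needs an extra copy to get
# a typed array.
codes_old = numpy.array(biocutils.match(labels, levels), dtype=numpy.uint32)

# New pattern with biocutils>=0.1.7: request the dtype directly.
codes_new = biocutils.match(labels, levels, dtype=numpy.uint32)

assert (codes_old == codes_new).all()  # both are [1, 0, 1, 2]
```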
--- setup.cfg | 2 +- src/singler/classify_integrated.py | 2 +- src/singler/train_integrated.py | 4 ++-- src/singler/train_single.py | 6 +++--- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/setup.cfg b/setup.cfg index 24fa793..e37e2b7 100644 --- a/setup.cfg +++ b/setup.cfg @@ -55,7 +55,7 @@ install_requires = biocframe>=0.5.0 summarizedexperiment>=0.4.0 singlecellexperiment>=0.4.6 - biocutils + biocutils>=0.1.7 [options.packages.find] where = src diff --git a/src/singler/classify_integrated.py b/src/singler/classify_integrated.py index 332f6e9..91d89a1 100644 --- a/src/singler/classify_integrated.py +++ b/src/singler/classify_integrated.py @@ -91,7 +91,7 @@ def classify_integrated( collated = [] for i, curres in enumerate(results): available = set(ref_labs[i]) - collated.append(numpy.array(biocutils.match(curres.column("labels"), available), dtype=numpy.uint32)) + collated.append(biocutils.match(curres.column("labels"), available, dtype=numpy.uint32)) best_ref, raw_scores, delta = classify_integrated( mattress.initialize(test_data), diff --git a/src/singler/train_integrated.py b/src/singler/train_integrated.py index 7573353..e4bcc46 100644 --- a/src/singler/train_integrated.py +++ b/src/singler/train_integrated.py @@ -80,8 +80,8 @@ def train_integrated( all_inter_ref = [] for i, trained in enumerate(ref_prebuilt): common = _stable_intersect(test_features, trained.features) - all_inter_test.append(numpy.array(biocutils.match(common, test_features), numpy.uint32)) - all_inter_ref.append(numpy.array(biocutils.match(common, trained.features), numpy.uint32)) + all_inter_test.append(biocutils.match(common, test_features, dtype=numpy.uint32)) + all_inter_ref.append(biocutils.match(common, trained.features, dtype=numpy.uint32)) all_data = [mattress.initialize(x._full_data) for x in ref_prebuilt] diff --git a/src/singler/train_single.py b/src/singler/train_single.py index 6495976..5bb2834 100644 --- a/src/singler/train_single.py +++ b/src/singler/train_single.py @@ -186,7 +186,7 @@ def train_single( inner_instance = [None] * len(unique_labels) for inner_i, inner_k in enumerate(unique_labels): current = markers[outer_k][inner_k] - inner_instance[inner_i] = numpy.array(biocutils.match(current, ref_features), dtype=numpy.uint32) + inner_instance[inner_i] = biocutils.match(current, ref_features, dtype=numpy.uint32) markers_idx[outer_i] = inner_instance if test_features is None: @@ -194,8 +194,8 @@ def train_single( ref_features_idx = numpy.array(range(len(ref_features)), dtype=numpy.uint32) else: common_features = _stable_intersect(test_features, ref_features) - test_features_idx = numpy.array(biocutils.match(common_features, test_features), dtype=numpy.uint32) - ref_features_idx = numpy.array(biocutils.match(common_features, ref_features), dtype=numpy.uint32) + test_features_idx = biocutils.match(common_features, test_features, dtype=numpy.uint32) + ref_features_idx = biocutils.match(common_features, ref_features, dtype=numpy.uint32) ref_ptr = mattress.initialize(ref_data) builder, _ = knncolle.define_builder(nn_parameters) From 32d336e4309f920d70226e0d1703c009e7103d9a Mon Sep 17 00:00:00 2001 From: LTLA Date: Thu, 12 Dec 2024 11:38:18 -0800 Subject: [PATCH 10/29] Got the annotate_single wrapper working again. 
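
With this, the end-to-end wrapper runs on the new train/classify pair.
A sketch of the intended call pattern on made-up toy matrices (names
and sizes here are illustrative only; see the tests below for real
usage):

```python
import numpy
import singler

test = numpy.random.rand(1000, 50)  # genes x cells
ref = numpy.random.rand(1000, 20)   # genes x reference samples
labels = ["A", "B"] * 10            # one label per reference column
features = [str(i) for i in range(1000)]

results = singler.annotate_single(
    test_data=test,
    test_features=features,
    ref_data=ref,
    ref_labels=labels,
    ref_features=features,
)
print(results.column("best")[:5])
```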
--- src/singler/__init__.py | 1 + src/singler/annotate_single.py | 144 +++++++++++++-------------------- tests/test_annotate_single.py | 11 +-- 3 files changed, 61 insertions(+), 95 deletions(-) diff --git a/src/singler/__init__.py b/src/singler/__init__.py index 6938bbc..0effd1e 100644 --- a/src/singler/__init__.py +++ b/src/singler/__init__.py @@ -19,5 +19,6 @@ from .get_classic_markers import get_classic_markers, number_of_classic_markers from .train_single import train_single, TrainedSingleReference from .classify_single import classify_single +from .annotate_single import annotate_single from .train_integrated import train_integrated, TrainedIntegratedReferences from .classify_integrated import classify_integrated diff --git a/src/singler/annotate_single.py b/src/singler/annotate_single.py index 6b1ad36..9922b5a 100644 --- a/src/singler/annotate_single.py +++ b/src/singler/annotate_single.py @@ -1,57 +1,26 @@ import warnings from typing import Any, Optional, Sequence, Union -from biocframe import BiocFrame -from summarizedexperiment import SummarizedExperiment +import biocframe -from .build_single_reference import build_single_reference -from .classify_single_reference import classify_single_reference - - -def _resolve_reference(ref_data, ref_labels, ref_features, build_args): - if isinstance(ref_data, SummarizedExperiment) or issubclass(type(ref_data), SummarizedExperiment): - if ref_features is None: - ref_features = ref_data.get_row_names() - elif isinstance(ref_features, str): - ref_features = ref_data.get_row_data().column(ref_features) - - ref_features = list(ref_features) - - if ref_labels is None: - ref_labels = ref_data.get_column_names() - elif isinstance(ref_labels, str): - ref_labels = ref_data.get_column_data().column(ref_labels) - - ref_labels = list(ref_labels) - - try: - _default_asy = "logcounts" - if "assay_type" in build_args: - _default_asy = build_args["assay_type"] - - ref_data = ref_data.assay(_default_asy) - except Exception as _: - raise ValueError(f"Reference dataset must contain log-normalized count ('{_default_asy}') assay.") - - if ref_labels is None: - raise ValueError("'ref_labels' cannot be `None`.") - - if ref_features is None: - raise ValueError("'ref_features' cannot be `None`.") - - return ref_data, ref_labels, ref_features +from .train_single import train_single +from .classify_single import classify_single +from ._utils import _clean_matrix def annotate_single( test_data: Any, ref_data: Any, - ref_labels: Optional[Union[Sequence, str]], + ref_labels: Union[Sequence, str], test_features: Optional[Union[Sequence, str]] = None, ref_features: Optional[Union[Sequence, str]] = None, - build_args: dict = {}, + test_assay_type: Union[str, int] = 0, + ref_assay_type: Union[str, int] = 0, + check_missing: bool = True, + train_args: dict = {}, classify_args: dict = {}, num_threads: int = 1, -) -> BiocFrame: +) -> biocframe.BiocFrame: """Annotate a single-cell expression dataset based on the correlation of each cell to profiles in a labelled reference. @@ -66,15 +35,6 @@ def annotate_single( containing such a matrix in one of its assays. Non-default assay types can be specified in ``classify_args``. - test_features: - Sequence of length equal to the number of rows in - ``test_data``, containing the feature identifier for each row. - - Alternatively, if ``test_data`` is a ``SummarizedExperiment``, ``test_features`` - may be a string speciying the column name in `row_data` that contains the - features. 
It can also be set to `None`, to use the `row_names` of - the experiment as features. - ref_data: A matrix-like object representing the reference dataset, where rows are features and columns are samples. Entries should be expression values, @@ -96,6 +56,15 @@ def annotate_single( e.g., "main", "fine", "ont". It can also be set to `None`, to use the `row_names` of the experiment as features. + test_features: + Sequence of length equal to the number of rows in + ``test_data``, containing the feature identifier for each row. + + Alternatively, if ``test_data`` is a ``SummarizedExperiment``, ``test_features`` + may be a string speciying the column name in `row_data` that contains the + features. It can also be set to `None`, to use the `row_names` of + the experiment as features. + ref_features: If ``ref_data`` is a matrix-like object, ``ref_features`` should be a sequence of length equal to the number of rows of ``ref_data``, @@ -106,67 +75,62 @@ def annotate_single( that contains the features. It can also be set to `None`, to use the `row_names` of the experiment as features. - build_args: + test_assay_type: + Assay containing the expression matrix, if `test_data` is a + :py:class:`~summarizedexperiment.SummarizedExperiment.SummarizedExperiment`. + + ref_assay_type: + Assay containing the expression matrix, if `ref_data` is a + :py:class:`~summarizedexperiment.SummarizedExperiment.SummarizedExperiment`. + + train_args: Further arguments to pass to - :py:meth:`~singler.build_single_reference.build_single_reference`. + :py:meth:`~singler.train_single.train_single`. classify_args: Further arguments to pass to - :py:meth:`~singler.classify_single_reference.classify_single_reference`. + :py:meth:`~singler.classify_single.classify_single`. num_threads: Number of threads to use for the various steps. Returns: - A data frame containing the labelling results, see - :py:meth:`~singler.classify_single_reference.classify_single_reference` - for details. The metadata also contains a ``markers`` dictionary, - specifying the markers that were used for each pairwise comparison - between labels; and a list of ``unique_markers`` across all labels. + A :py:class:`~biocframe.BiocFrame.BiocFrame` of labelling results, see + :py:meth:`~singler.classify_single.classify_single` for details. 
""" - - if isinstance(test_data, SummarizedExperiment): - if test_features is None: - test_features = test_data.get_row_names() - elif isinstance(test_features, str): - test_features = test_data.get_row_data().column(test_features) - + test_data, test_features = _clean_matrix( + test_data, + test_features, + assay_type=test_assay_type, + check_missing=check_missing, + num_threads=num_threads + ) if test_features is None: - raise ValueError("'test_features' cannot be `None`.") - - test_features_set = set(test_features) - if len(test_features_set) != len(test_features): - warnings.warn("'test_features' is not unique, subsetting test matrix...", UserWarning) - _idxs = [test_features.index(x) for x in test_features_set] - print("modifying test data") - test_data = test_data[_idxs,] - - ref_data, ref_labels, ref_features = _resolve_reference( - ref_data=ref_data, - ref_labels=ref_labels, - ref_features=ref_features, - build_args=build_args, + raise ValueError("could not determine 'test_features'") + + ref_data, ref_features = _clean_matrix( + ref_data, + ref_features, + assay_type=ref_assay_type, + check_missing=check_missing, + num_threads=num_threads ) + if ref_features is None: + raise ValueError("could not determine 'ref_features'") - built = build_single_reference( + built = train_single( ref_data=ref_data, ref_labels=ref_labels, ref_features=ref_features, - restrict_to=test_features_set, - **build_args, + test_features=test_features, + check_missing=False, num_threads=num_threads, + **train_args, ) - output = classify_single_reference( + return classify_single( test_data, - test_features=test_features, ref_prebuilt=built, **classify_args, num_threads=num_threads, ) - - output.metadata = { - "markers": built.markers, - "unique_markers": built.marker_subset(), - } - return output diff --git a/tests/test_annotate_single.py b/tests/test_annotate_single.py index ff123a3..8f3dc80 100644 --- a/tests/test_annotate_single.py +++ b/tests/test_annotate_single.py @@ -46,12 +46,13 @@ def test_annotate_single_intersect(): ref_labels=ref_labels, ) - built = singler.build_single_reference( - ref[2000:, :], ref_labels=ref_labels, ref_features=ref_features[2000:] - ) - expected = singler.classify_single_reference( - test[:8000, :], test_features[:8000], built + built = singler.train_single( + ref[2000:, :], + ref_labels=ref_labels, + ref_features=ref_features[2000:], + test_features=test_features[:8000], ) + expected = singler.classify_single(test[:8000, :], built) assert output.column("best") == expected.column("best") assert (output.column("delta") == expected.column("delta")).all() From 8b5372826a170b9f2cdbb91f1f11641979b3cac7 Mon Sep 17 00:00:00 2001 From: LTLA Date: Thu, 12 Dec 2024 14:36:14 -0800 Subject: [PATCH 11/29] Bring the annotated_integrated checks back online. Also prohibit use of strings to refer to row/column data. Users should just fetch the values explicitly. 
--- src/singler/__init__.py | 1 + src/singler/_utils.py | 12 +- src/singler/annotate_integrated.py | 225 +++++++++++++---------------- src/singler/annotate_single.py | 61 ++++---- src/singler/classify_integrated.py | 35 +++-- src/singler/classify_single.py | 5 +- tests/test_annotate_integrated.py | 6 +- tests/test_classify_integrated.py | 8 +- 8 files changed, 163 insertions(+), 190 deletions(-) diff --git a/src/singler/__init__.py b/src/singler/__init__.py index 0effd1e..b3e96e0 100644 --- a/src/singler/__init__.py +++ b/src/singler/__init__.py @@ -22,3 +22,4 @@ from .annotate_single import annotate_single from .train_integrated import train_integrated, TrainedIntegratedReferences from .classify_integrated import classify_integrated +from .annotate_integrated import annotate_integrated diff --git a/src/singler/_utils.py b/src/singler/_utils.py index 68518e8..dff601a 100644 --- a/src/singler/_utils.py +++ b/src/singler/_utils.py @@ -66,19 +66,13 @@ def _stable_union(*args) -> list: return output -def _extract_assay(x, features, assay_type): +def _clean_matrix(x, features, assay_type, check_missing, num_threads): if isinstance(x, summarizedexperiment.SummarizedExperiment): if features is None: features = x.get_row_names() - elif isinstance(features, str): - features = x.get_row_data().column(features) - features = list(features) + elif not isinstance(features, list): + features = list(features) x = x.assay(assay_type) - return x, features - - -def _clean_matrix(x, features, assay_type, check_missing, num_threads): - x, features = _extract_assay(x, features, assay_type) curshape = x.shape if len(curshape) != 2: diff --git a/src/singler/annotate_integrated.py b/src/singler/annotate_integrated.py index afb38f4..4de6ce9 100644 --- a/src/singler/annotate_integrated.py +++ b/src/singler/annotate_integrated.py @@ -1,31 +1,30 @@ from typing import Any, Optional, Sequence, Tuple, Union -from biocframe import BiocFrame +import biocframe -from ._utils import _clean_matrix -from .annotate_single import _resolve_reference -from .build_integrated_references import build_integrated_references -from .build_single_reference import build_single_reference -from .classify_integrated_references import classify_integrated_references -from .classify_single_reference import classify_single_reference +from ._utils import _clean_matrix, _restrict_features +from .train_single import train_single +from .train_integrated import train_integrated +from .classify_single import classify_single +from .classify_integrated import classify_integrated def annotate_integrated( test_data: Any, - ref_data_list: Sequence[Union[Any, str]], - test_features: Optional[Union[Sequence, str]] = None, - ref_labels_list: Optional[Union[Optional[str], Sequence[Union[Sequence, str]]]] = None, - ref_features_list: Optional[Union[Optional[str], Sequence[Union[Sequence, str]]]] = None, + ref_data: Sequence, + ref_labels: list[Sequence], + test_features: Optional[Sequence] = None, + ref_features: Optional[list[Optional[Sequence]]] = None, test_assay_type: Union[str, int] = 0, - test_check_missing: bool = True, ref_assay_type: Union[str, int] = "logcounts", + test_check_missing: bool = False, ref_check_missing: bool = True, - build_single_args: dict = {}, + train_single_args: dict = {}, classify_single_args: dict = {}, - build_integrated_args: dict = {}, + train_integrated_args: dict = {}, classify_integrated_args: dict = {}, num_threads: int = 1, -) -> Tuple[list[BiocFrame], BiocFrame]: +) -> Tuple[list[biocframe.BiocFrame], biocframe.BiocFrame]: 
"""Annotate a single-cell expression dataset based on the correlation of each cell to profiles in multiple labelled references, where the annotation from each reference is then integrated across references. @@ -33,65 +32,58 @@ def annotate_integrated( Args: test_data: A matrix-like object representing the test dataset, where rows are - features and columns are samples (usually cells). Entries should be expression - values; only the ranking within each column will be used. + features and columns are samples (usually cells). Entries should be + expression values; only the ranking within each column is used. Alternatively, a :py:class:`~summarizedexperiment.SummarizedExperiment.SummarizedExperiment` containing such a matrix in one of its assays. - test_features: - Sequence of length equal to the number of rows in - ``test_data``, containing the feature identifier for each row. - - Alternatively, if ``test_data`` is a ``SummarizedExperiment``, ``test_features`` - may be a string speciying the column name in `row_data` that contains the - features. It can also be set to `None`, to use the `row_names` of the - experiment as features. - - ref_data_list: + ref_data: Sequence consisting of one or more of the following: - A matrix-like object representing the reference dataset, where rows are features and columns are samples. Entries should be expression values, - usually log-transformed (see comments for the ``ref`` argument in - :py:meth:`~singler.build_single_reference.build_single_reference`). - - A - :py:class:`~summarizedexperiment.SummarizedExperiment.SummarizedExperiment` - object containing such a matrix in its assays. - - A string that can be passed as ``name`` to - :py:meth:`~singler.fetch_reference.fetch_github_reference`. - This will use the specified dataset as the reference. - - ref_labels_list: + usually log-transformed (see comments for the ``ref_data`` argument in + :py:func:`~singler.train_single.train_single`). + - A ``SummarizedExperiment`` object containing such a matrix in its assays. + + ref_labels: Sequence of the same length as ``ref_data``, where the contents depend on the type of value in the corresponding entry of ``ref_data``: - - If ``ref_data_list[i]`` is a matrix-like object, ``ref_labels_list[i]`` should be - a sequence of length equal to the number of columns of ``ref_data_list[i]``, + - If ``ref_data[i]`` is a matrix-like object, ``ref_labels[i]`` should be + a sequence of length equal to the number of columns of ``ref_data[i]``, containing the label associated with each column. - - If ``ref_data_list[i]`` is a string, ``ref_labels_list[i]`` should be a string - specifying the label type to use, e.g., "main", "fine", "ont". - If a single string is supplied, it is recycled for all ``ref_data``. - - If ``ref_data_list[i]`` is a ``SummarizedExperiment``, ``ref_labels_list[i]`` + - If ``ref_data[i]`` is a ``SummarizedExperiment``, ``ref_labels[i]`` may be a string speciying the column name in `column_data` that contains the - features. It can also be set to `None`, to use the `column_names`of the + features. It can also be set to ``None`` to use the row names of the experiment as features. - ref_features_list: - Sequence of the same length as ``ref_data_list``, where the contents + test_features: + Sequence of length equal to the number of rows in + ``test_data``, containing the feature identifier for each row. 
+ + Alternatively, if ``test_data`` is a ``SummarizedExperiment``, + ``test_features`` may be a string speciying the column name in + `row_data` that contains the features. It can also be set to + ``None`` to use the row names of the experiment as features. + + ref_features: + Sequence of the same length as ``ref_data``, where the contents depend on the type of value in the corresponding entry of ``ref_data``: - - If ``ref_data_list[i]`` is a matrix-like object, ``ref_features_list[i]`` should be - a sequence of length equal to the number of rows of ``ref_data_list[i]``, + - If ``ref_data[i]`` is a matrix-like object, ``ref_features[i]`` should be + a sequence of length equal to the number of rows of ``ref_data[i]``, containing the feature identifier associated with each row. - - If ``ref_data_list[i]`` is a string, ``ref_features_list[i]`` should be a string - specifying the feature type to use, e.g., "ensembl", "symbol". - If a single string is supplied, it is recycled for all ``ref_data``. - - If ``ref_data_list[i]`` is a ``SummarizedExperiment``, ``ref_features_list[i]`` - may be a string speciying the column name in `row_data` that contains the - features. It can also be set to `None`, to use the `row_names` of the - experiment as features. + - If ``ref_data[i]`` is a ``SummarizedExperiment``, + ``ref_features[i]`` may be a string speciying the column name in + `row_data` that contains the features. It can also be set to + ``None`` to use the row names of the experiment as features. + + This can also be ``None`` to indicate that the row names should be + used for all references, assuming ``ref_data`` only contains + ``SummarizedExperiment`` objects. test_assay_type: Assay of ``test_data`` containing the expression matrix, if ``test_data`` is a @@ -101,58 +93,50 @@ def annotate_integrated( Whether to check for and remove missing (i.e., NaN) values from the test dataset. ref_assay_type: - Assay containing the expression matrix for any entry of ``ref_data_list`` that is a + Assay containing the expression matrix for any entry of ``ref_data`` that is a :py:class:`~summarizedexperiment.SummarizedExperiment.SummarizedExperiment`. ref_check_missing: Whether to check for and remove missing (i.e., NaN) values from the reference datasets. - build_single_args: + train_single_args: Further arguments to pass to - :py:meth:`~singler.build_single_reference.build_single_reference`. + :py:func:`~singler.train_single.train_single`. classify_single_args: Further arguments to pass to - :py:meth:`~singler.classify_single_reference.classify_single_reference`. + :py:func:`~singler.classify_single.classify_single`. - build_integrated_args: + train_integrated_args: Further arguments to pass to - :py:meth:`~singler.build_integrated_references.build_integrated_references`. + :py:func:`~singler.train_integrated.train_integrated`. classify_integrated_args: Further arguments to pass to - :py:meth:`~singler.classify_integrated_references.classify_integrated_references`. + :py:func:`~singler.classify_integrated.classify_integrated`. num_threads: Number of threads to use for the various steps. Returns: Tuple where the first element contains per-reference results (i.e. 
a - list of BiocFrame outputs equivalent to running - :py:meth:`~singler.annotate_single.annotate_single` on each reference) + list of BiocFrame outputs, roughly equivalent to running + :py:func:`~singler.annotate_single.annotate_single` on each reference) and the second element contains integrated results across references (i.e., a BiocFrame from - :py:meth:`~singler.classify_integrated_references.classify_integrated_references`). + :py:func:`~singler.classify_integrated.classify_integrated`). """ - nrefs = len(ref_data_list) - - if isinstance(ref_labels_list, str): - ref_labels_list = [ref_labels_list] * nrefs - elif ref_labels_list is None: - ref_labels_list = [None] * nrefs - - if nrefs != len(ref_labels_list): - raise ValueError("'ref_data_list' and 'ref_labels_list' must be the same length") - - if isinstance(ref_features_list, str): - ref_features_list = [ref_features_list] * nrefs - elif ref_features_list is None: - ref_features_list = [None] * nrefs - - if nrefs != len(ref_features_list): - raise ValueError("'ref_data_list' and 'ref_features_list' must be the same length") - - test_ptr, test_features = _clean_matrix( + nrefs = len(ref_data) + if nrefs != len(ref_labels): + raise ValueError("'ref_data' and 'ref_labels' must be the same length") + if ref_features is None: + ref_features = [None] * nrefs + if nrefs != len(ref_features): + raise ValueError("'ref_data' and 'ref_features' must be the same length") + if nrefs != len(ref_labels): + raise ValueError("'ref_data' and 'ref_labels' must be the same length") + + test_data, test_features = _clean_matrix( test_data, test_features, assay_type=test_assay_type, @@ -163,66 +147,63 @@ def annotate_integrated( all_ref_data = [] all_ref_labels = [] all_ref_features = [] - all_built = [] - all_results = [] - test_features_set = set(test_features) - for r in range(nrefs): - curref_mat, curref_labels, curref_features = _resolve_reference( - ref_data=ref_data_list[r], - ref_labels=ref_labels_list[r], - ref_features=ref_features_list[r], - build_args=build_single_args, - ) - - curref_ptr, curref_features = _clean_matrix( - curref_mat, - curref_features, + curref_data, curref_features = _clean_matrix( + ref_data[r], + ref_features[r], assay_type=ref_assay_type, check_missing=ref_check_missing, num_threads=num_threads, ) - curbuilt = build_single_reference( - ref_data=curref_ptr, - ref_labels=curref_labels, - ref_features=curref_features, - restrict_to=test_features_set, - **build_single_args, + all_ref_data.append(curref_data) + all_ref_features.append(curref_features) + + # Pre-slicing the test dataset so that we force the downstream analysis + # to only consider genes that are present in all datasets (test and + # reference). This avoids the warning in classify_integrated(). 
+ test_data, test_features = _restrict_features( + test_data, + test_features, + curref_features + ) + + if test_data.shape[0] == 0: + raise ValueError("no genes in common across test and reference datasets") + + all_built = [] + all_results = [] + + for r in range(nrefs): + curbuilt = train_single( + ref_data=all_ref_data[r], + ref_labels=ref_labels[r], + ref_features=all_ref_features[r], + test_features=test_features, + **train_single_args, + check_missing=False, num_threads=num_threads, ) - res = classify_single_reference( + res = classify_single( test_data, - test_features=test_features, ref_prebuilt=curbuilt, **classify_single_args, num_threads=num_threads, ) - all_ref_data.append(curref_ptr) - all_ref_labels.append(curref_labels) - all_ref_features.append(curref_features) all_built.append(curbuilt) all_results.append(res) - res.metadata = { - "markers": curbuilt.markers, - "unique_markers": curbuilt.marker_subset(), - } - - ibuilt = build_integrated_references( + ibuilt = train_integrated( test_features=test_features, - ref_data_list=all_ref_data, - ref_labels_list=all_ref_labels, - ref_features_list=all_ref_features, - ref_prebuilt_list=all_built, - **build_integrated_args, + ref_prebuilt=all_built, + **train_integrated_args, num_threads=num_threads, ) - ires = classify_integrated_references( - test_data=test_ptr, + ires = classify_integrated( + test_data=test_data, results=all_results, integrated_prebuilt=ibuilt, **classify_integrated_args, diff --git a/src/singler/annotate_single.py b/src/singler/annotate_single.py index 9922b5a..8f1da09 100644 --- a/src/singler/annotate_single.py +++ b/src/singler/annotate_single.py @@ -2,6 +2,7 @@ from typing import Any, Optional, Sequence, Union import biocframe +import summarizedexperiment from .train_single import train_single from .classify_single import classify_single @@ -11,12 +12,13 @@ def annotate_single( test_data: Any, ref_data: Any, - ref_labels: Union[Sequence, str], - test_features: Optional[Union[Sequence, str]] = None, - ref_features: Optional[Union[Sequence, str]] = None, + ref_labels: Sequence, + test_features: Optional[Sequence] = None, + ref_features: Optional[Sequence] = None, test_assay_type: Union[str, int] = 0, ref_assay_type: Union[str, int] = 0, - check_missing: bool = True, + test_check_missing: bool = False, + ref_check_missing: bool = True, train_args: dict = {}, classify_args: dict = {}, num_threads: int = 1, @@ -39,7 +41,7 @@ def annotate_single( A matrix-like object representing the reference dataset, where rows are features and columns are samples. Entries should be expression values, usually log-transformed (see comments for the ``ref`` argument in - :py:meth:`~singler.build_single_reference.build_single_reference`). + :py:func:`~singler.train_single.train_single`). Alternatively, a :py:class:`~summarizedexperiment.SummarizedExperiment.SummarizedExperiment` @@ -47,62 +49,53 @@ def annotate_single( types can be specified in ``classify_args``. ref_labels: - If ``ref_data`` is a matrix-like object, ``ref_labels`` should be - a sequence of length equal to the number of columns of ``ref_data``, + Sequence of length equal to the number of columns of ``ref_data``, containing the label associated with each column. - Alternatively, if ``ref_data`` is a ``SummarizedExperiment``, - ``ref_labels`` may be a string specifying the label type to use, - e.g., "main", "fine", "ont". It can also be set to `None`, to use - the `row_names` of the experiment as features. 
-
     test_features:
-        Sequence of length equal to the number of rows in
-        ``test_data``, containing the feature identifier for each row.
-
-        Alternatively, if ``test_data`` is a ``SummarizedExperiment``, ``test_features``
-        may be a string speciying the column name in `row_data` that contains the
-        features. It can also be set to `None`, to use the `row_names` of
-        the experiment as features.
+        Sequence of length equal to the number of rows in ``test_data``,
+        containing the feature identifier for each row. Alternatively
+        ``None``, to use the row names of the experiment as features.
 
     ref_features:
-        If ``ref_data`` is a matrix-like object, ``ref_features`` should be
-        a sequence of length equal to the number of rows of ``ref_data``,
-        containing the feature identifier associated with each row.
-
-        Alternatively, if ``ref_data`` is a ``SummarizedExperiment``,
-        ``ref_features`` may be a string speciying the column name in `column_data`
-        that contains the features. It can also be set to `None`, to use the
-        `row_names` of the experiment as features.
+        Sequence of length equal to the number of rows of ``ref_data``,
+        containing the feature identifier for each row. Alternatively
+        ``None``, to use the `row_names` of the experiment as features.
 
     test_assay_type:
-        Assay containing the expression matrix, if `test_data` is a
+        Assay containing the expression matrix, if ``test_data`` is a
         :py:class:`~summarizedexperiment.SummarizedExperiment.SummarizedExperiment`.
 
     ref_assay_type:
-        Assay containing the expression matrix, if `ref_data` is a
+        Assay containing the expression matrix, if ``ref_data`` is a
         :py:class:`~summarizedexperiment.SummarizedExperiment.SummarizedExperiment`.
 
+    test_check_missing:
+        Whether to remove rows with missing values from the test dataset.
+
+    ref_check_missing:
+        Whether to remove rows with missing values from the reference dataset.
+
     train_args:
         Further arguments to pass to
-        :py:meth:`~singler.train_single.train_single`.
+        :py:func:`~singler.train_single.train_single`.
 
     classify_args:
         Further arguments to pass to
-        :py:meth:`~singler.classify_single.classify_single`.
+        :py:func:`~singler.classify_single.classify_single`.
 
     num_threads:
         Number of threads to use for the various steps.
 
     Returns:
         A :py:class:`~biocframe.BiocFrame.BiocFrame` of labelling results, see
-        :py:meth:`~singler.classify_single.classify_single` for details.
+        :py:func:`~singler.classify_single.classify_single` for details.
     """
     test_data, test_features = _clean_matrix(
         test_data,
         test_features,
         assay_type=test_assay_type,
-        check_missing=check_missing,
+        check_missing=test_check_missing,
         num_threads=num_threads
     )
     if test_features is None:
@@ -112,7 +105,7 @@
         ref_data,
         ref_features,
         assay_type=ref_assay_type,
-        check_missing=check_missing,
+        check_missing=ref_check_missing,
         num_threads=num_threads
     )
     if ref_features is None:
diff --git a/src/singler/classify_integrated.py b/src/singler/classify_integrated.py
index 91d89a1..c73a1a9 100644
--- a/src/singler/classify_integrated.py
+++ b/src/singler/classify_integrated.py
@@ -3,11 +3,11 @@
 
 import biocutils
 import biocframe
 import mattress
+import summarizedexperiment
 import numpy
 
 from . import lib_singler as lib
 from .train_integrated import TrainedIntegratedReferences
-from ._utils import _extract_assay
 
 
 def classify_integrated(
@@ -74,27 +74,29 @@ def classify_integrated(
         from the best to the second-best reference. Each row corresponds to a
         column of ``test``. 
""" - test_data, _ = _extract_assay(test_data, None, assay_type=assay_type) - ref_labs = integrated_prebuilt.reference_labels() + if isinstance(test_data, summarizedexperiment.SummarizedExperiment): + test_data = test_data.assay(assay_type) + ref_labs = integrated_prebuilt.reference_labels # Applying the sanity checks. if len(results) != len(ref_labs): raise ValueError("length of 'results' should equal the number of references") for i, curres in enumerate(results): - if test.shape[1] != curres.number_of_rows(): + if test_data.shape[1] != curres.shape[0]: raise ValueError("numbers of cells in 'results' are not identical") - available = set(integrated_prebuilt.reference_labels()[i]) - for l in curres.column("labels"): + available = set(ref_labs[i]) + for l in curres.column("best"): if l not in available: raise ValueError("not all labels in 'results' are present in the corresponding reference") collated = [] for i, curres in enumerate(results): available = set(ref_labs[i]) - collated.append(biocutils.match(curres.column("labels"), available, dtype=numpy.uint32)) + collated.append(biocutils.match(curres.column("best"), available, dtype=numpy.uint32)) - best_ref, raw_scores, delta = classify_integrated( - mattress.initialize(test_data), + test_ptr = mattress.initialize(test_data) + best_ref, raw_scores, delta = lib.classify_integrated( + test_ptr.ptr, collated, integrated_prebuilt._ptr, quantile, @@ -106,24 +108,25 @@ def classify_integrated( best_label = [None] * test_data.shape[1] for ref in set(best_ref): curbest = results[ref].column("best") - for i, b in best: + for i, b in enumerate(curbest): if b == ref: best_label[i] = curbest[i] - all_refs = integrated_prebuilt.reference_names() - if all_refs is None: + all_refs = integrated_prebuilt.reference_names + has_names = not (all_refs is None) + if not has_names: all_refs = [str(i) for i in range(len(ref_labs))] scores = {} - for i, l in enumerate(all_ref): - scores[l] = biocframe.BiocFrame({ "label": results[i].column("labels"), "score": raw_scores[i] }) + for i, l in enumerate(all_refs): + scores[l] = biocframe.BiocFrame({ "label": results[i].column("best"), "score": raw_scores[i] }) scores_df = biocframe.BiocFrame(scores, number_of_rows=test_data.shape[1], column_names=all_refs) if has_names: best_ref = [all_refs[b] for b in best_ref] - return bioc.BiocFrame({ + return biocframe.BiocFrame({ "best_label": best_label, - "best_reference": best, + "best_reference": best_ref, "scores": scores_df, "delta": delta, }) diff --git a/src/singler/classify_single.py b/src/singler/classify_single.py index 34de9ff..fa79ffa 100644 --- a/src/singler/classify_single.py +++ b/src/singler/classify_single.py @@ -2,9 +2,9 @@ import biocframe import mattress +import summarizedexperiment from . import lib_singler as lib -from ._utils import _extract_assay, _create_map from .train_single import TrainedSingleReference @@ -63,7 +63,8 @@ def classify_single( the markers from each pairwise comparison between labels; and ``used``, a list containing the union of markers from all comparisons. 
""" - test_data, _ = _extract_assay(test_data, None, assay_type=assay_type) + if isinstance(test_data, summarizedexperiment.SummarizedExperiment): + test_data = test_data.assay(assay_type) test_ptr = mattress.initialize(test_data) best, raw_scores, delta = lib.classify_single( diff --git a/tests/test_annotate_integrated.py b/tests/test_annotate_integrated.py index fc6aa7c..987d648 100644 --- a/tests/test_annotate_integrated.py +++ b/tests/test_annotate_integrated.py @@ -19,9 +19,9 @@ def test_annotate_integrated(): single_results, integrated_results = singler.annotate_integrated( test, test_features=test_features, - ref_data_list=[ref1, ref2], - ref_labels_list=[labels1, labels2], - ref_features_list=[features1, features2], + ref_data=[ref1, ref2], + ref_labels=[labels1, labels2], + ref_features=[features1, features2], ) assert len(single_results) == 2 diff --git a/tests/test_classify_integrated.py b/tests/test_classify_integrated.py index 9b03456..c6b2759 100644 --- a/tests/test_classify_integrated.py +++ b/tests/test_classify_integrated.py @@ -51,7 +51,7 @@ def test_classify_integrated(): assert results2.column("best")[i] in labels2_set # Repeating without names. - integrated_un = singler.build_integrated_references( + integrated_un = singler.train_integrated( test_features, ref_prebuilt=[built1, built2], ) @@ -62,6 +62,6 @@ def test_classify_integrated(): integrated_prebuilt=integrated_un, ) - assert results.shape[0] == 50 - assert set(results.column("best_reference")) == set([0, 1]) - assert list(results.column("scores").column_names) == ['0', '1'] + assert results_un.shape[0] == 50 + assert set(results_un.column("best_reference")) == set([0, 1]) + assert list(results_un.column("scores").column_names) == ['0', '1'] From 990a2fa95ef273d06419ed24877d7a049e781e96 Mon Sep 17 00:00:00 2001 From: LTLA Date: Thu, 12 Dec 2024 14:42:49 -0800 Subject: [PATCH 12/29] Bumped the required version of mattress. --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index e37e2b7..71e6b73 100644 --- a/setup.cfg +++ b/setup.cfg @@ -49,7 +49,7 @@ python_requires = >=3.8 # For more information, check out https://semver.org/. install_requires = importlib-metadata; python_version<"3.8" - mattress + mattress>=0.3.0 knncolle delayedarray biocframe>=0.5.0 From 3365a3b73683fa87af0bad12a838679ff4ce4f46 Mon Sep 17 00:00:00 2001 From: LTLA Date: Thu, 12 Dec 2024 15:47:49 -0800 Subject: [PATCH 13/29] Updated the README. --- README.md | 60 ++++++++++++++++--------------------------------------- 1 file changed, 17 insertions(+), 43 deletions(-) diff --git a/README.md b/README.md index c08e43d..1853b68 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,7 @@ ## Overview -This package provides Python bindings to the [C++ implementation](https://github.com/LTLA/singlepp) of the [SingleR algorithm](https://github.com/LTLA/SingleR), +This package provides Python bindings to the [C++ implementation](https://github.com/SingleR-inc/singlepp) of the [SingleR algorithm](https://github.com/SingleR-inc/SingleR), originally developed by [Aran et al. (2019)](https://www.nature.com/articles/s41590-018-0276-y). It is designed to annotate cell types by matching cells to known references based on their expression profiles. So kind of like Tinder, but for cells. 
@@ -62,7 +62,7 @@ results = singler.annotate_single( test_data = mat, test_features = features, ref_data = ref_data, - ref_labels = "label.main", + ref_labels = ref_data.get_column_data().column("label.main"), ) ``` @@ -94,23 +94,19 @@ Advanced users may prefer to build the reference and run the classification sepa This allows us to re-use the same reference for multiple datasets without repeating the build step. ```python -built = singler.build_single_reference( - ref_data=ref_data.assay("logcounts"), - ref_labels=ref_data.col_data.column("label.main"), - ref_features=ref_data.get_row_names(), - restrict_to=features, +built = singler.train_single( + ref_data = ref_data.assay("logcounts"), + ref_labels = ref_data.get_column_data().column("label.main"), + ref_features = ref_data.get_row_names(), + test_features = features, ) ``` And finally, we apply the pre-built reference to the test dataset to obtain our label assignments. -This can be repeated with different datasets that have the same features or a superset of `features`. +This can be repeated with different datasets that have the same features as `test_features=`. ```python -output = singler.classify_single_reference( - mat, - test_features=features, - ref_prebuilt=built, -) +output = singler.classify_single(mat, ref_prebuilt=built) ``` ## output @@ -134,21 +130,25 @@ import singler import celldex blueprint_ref = celldex.fetch_reference("blueprint_encode", "2024-02-26", realize_assays=True) - immune_cell_ref = celldex.fetch_reference("dice", "2024-02-26", realize_assays=True) single_results, integrated = singler.annotate_integrated( mat, features, - ref_data_list = (blueprint_ref, immune_cell_ref), - ref_labels_list = "label.main", + ref_data = [ + blueprint_ref, + immune_cell_ref + ], + ref_labels = [ + blueprint_ref.get_column_data().column("label.main"), + immune_cell_ref.get_column_data().column("label.main") + ], num_threads = 6 ) ``` This annotates the test dataset against each reference individually to obtain the best per-reference label, and then it compares across references to find the best label from all references. -Both the single and integrated annotations are reported for diagnostics. ```python integrated.column("best_label") @@ -174,29 +174,3 @@ integrated.column("best_reference") ## ... ##] ``` - -## Developer notes - -Build the shared object file: - -```shell -python setup.py build_ext --inplace -``` - -For quick testing: - -```shell -pytest -``` - -For more complex testing: - -```shell -python setup.py build_ext --inplace && tox -``` - -To rebuild the **ctypes** bindings with [**cpptypes**](https://github.com/BiocPy/ctypes-wrapper): - -```shell -cpptypes src/singler/lib --py src/singler/_cpphelpers.py --cpp src/singler/lib/bindings.cpp --dll _core -``` From c1bd77bd3c025e533b10738ced21a0d5810ece96 Mon Sep 17 00:00:00 2001 From: LTLA Date: Thu, 12 Dec 2024 16:11:24 -0800 Subject: [PATCH 14/29] Cleaned up docstring for proper API generation. 
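
For the sphinx-apidoc call in docs/conf.py, the extra trailing path acts
as an exclusion pattern so the compiled lib_singler bindings are
skipped, and -M puts module pages above submodule pages. The docstring
edits themselves follow one convention, illustrated by this toy example
(not part of the package):

```python
def example(ref_data, ref_labels):
    """Show the cross-referencing style used after this patch.

    Args:
        ref_data:
            A matrix-like object. Literals such as ``None`` are wrapped in
            double backticks.

        ref_labels:
            Sequence of labels. Module-level functions are referenced via
            :py:func:`~singler.train_single.train_single` instead of
            ``:py:meth:``.

    Returns:
        Nothing; this exists only to show the formatting.
    """
```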
--- docs/conf.py | 2 +- docs/index.md | 18 +---------- src/singler/annotate_integrated.py | 38 +++++++---------------- src/singler/annotate_single.py | 2 +- src/singler/classify_integrated.py | 12 ++++---- src/singler/classify_single.py | 6 ++-- src/singler/get_classic_markers.py | 49 ++++++++++++++++++------------ src/singler/train_integrated.py | 8 ++--- src/singler/train_single.py | 18 +++++------ 9 files changed, 66 insertions(+), 87 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 3495622..5ec32e8 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -43,7 +43,7 @@ try: import sphinx - cmd_line = f"sphinx-apidoc --implicit-namespaces -f -o {output_dir} {module_dir}" + cmd_line = f"sphinx-apidoc -M --implicit-namespaces -f -o {output_dir} {module_dir} {module_dir}/lib_*" args = cmd_line.split(" ") if tuple(sphinx.__version__.split(".")) >= ("1", "7"): diff --git a/docs/index.md b/docs/index.md index 8362299..8d88c79 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,25 +1,9 @@ -# singler - -Add a short description here! - - -## Note - -> This is the main page of your project's [Sphinx] documentation. It is -> formatted in [Markdown]. Add additional pages by creating md-files in -> `docs` or rst-files (formatted in [reStructuredText]) and adding links to -> them in the `Contents` section below. -> -> Please check [Sphinx] and [MyST] for more information -> about how to document your project and how to configure your preferences. - - ## Contents ```{toctree} :maxdepth: 2 -Overview +Usage Contributions & Help License Authors diff --git a/src/singler/annotate_integrated.py b/src/singler/annotate_integrated.py index 4de6ce9..e1de142 100644 --- a/src/singler/annotate_integrated.py +++ b/src/singler/annotate_integrated.py @@ -49,37 +49,21 @@ def annotate_integrated( - A ``SummarizedExperiment`` object containing such a matrix in its assays. ref_labels: - Sequence of the same length as ``ref_data``, where the contents - depend on the type of value in the corresponding entry of ``ref_data``: - - - If ``ref_data[i]`` is a matrix-like object, ``ref_labels[i]`` should be - a sequence of length equal to the number of columns of ``ref_data[i]``, - containing the label associated with each column. - - If ``ref_data[i]`` is a ``SummarizedExperiment``, ``ref_labels[i]`` - may be a string speciying the column name in `column_data` that contains the - features. It can also be set to ``None`` to use the row names of the - experiment as features. + Sequence of the same length as ``ref_data``. The ``i``-th entry + should be a sequence of length equal to the number of columns of + ``ref_data[i]``, containing the label associated with each column. test_features: - Sequence of length equal to the number of rows in - ``test_data``, containing the feature identifier for each row. - - Alternatively, if ``test_data`` is a ``SummarizedExperiment``, - ``test_features`` may be a string speciying the column name in - `row_data` that contains the features. It can also be set to - ``None`` to use the row names of the experiment as features. + Sequence of length equal to the number of rows in ``test_data``, + containing the feature identifier for each row. Alternatively + ``None``, to use the row names of the experiment as features. 
ref_features: - Sequence of the same length as ``ref_data``, where the contents - depend on the type of value in the corresponding entry of ``ref_data``: - - - If ``ref_data[i]`` is a matrix-like object, ``ref_features[i]`` should be - a sequence of length equal to the number of rows of ``ref_data[i]``, - containing the feature identifier associated with each row. - - If ``ref_data[i]`` is a ``SummarizedExperiment``, - ``ref_features[i]`` may be a string speciying the column name in - `row_data` that contains the features. It can also be set to - ``None`` to use the row names of the experiment as features. + Sequence of the same length as ``ref_data``. The ``i``-th entry + should be a sequence of length equal to the number of rows of + ``ref_data[i]``, containing the feature identifier associated with + each row. It can also be set to ``None`` to use the row names of + the experiment as features. This can also be ``None`` to indicate that the row names should be used for all references, assuming ``ref_data`` only contains diff --git a/src/singler/annotate_single.py b/src/singler/annotate_single.py index 8f1da09..11914c6 100644 --- a/src/singler/annotate_single.py +++ b/src/singler/annotate_single.py @@ -60,7 +60,7 @@ def annotate_single( ref_features: Sequence of length equal to the number of rows of ``ref_data``, containing the feature identifier for each row. Alternatively - ``None``, to use the `row_names` of the experiment as features. + ``None``, to use the row names of the experiment as features. test_assay_type: Assay containing the expression matrix, if ``test_data`` is a diff --git a/src/singler/classify_integrated.py b/src/singler/classify_integrated.py index c73a1a9..0ee2009 100644 --- a/src/singler/classify_integrated.py +++ b/src/singler/classify_integrated.py @@ -35,16 +35,16 @@ def classify_integrated( results: List of classification results generated by running - :py:meth:`~singler.classify_single.classify_single` on + :py:func:`~singler.classify_single.classify_single` on ``test_data`` with each reference. References should be ordered as in ``integrated_prebuilt.reference_names``. integrated_prebuilt: Integrated reference object, constructed with - :py:meth:`~singler.train_integrated.train_integrated`. + :py:func:`~singler.train_integrated.train_integrated`. assay_type: - Assay containing the expression matrix, if `test_data` is a + Assay containing the expression matrix, if ``test_data`` is a :py:class:`~summarizedexperiment.SummarizedExperiment.SummarizedExperiment`. quantile: @@ -70,9 +70,9 @@ def classify_integrated( ``best_label`` across all references, defined as the assigned label in the best reference; the identity of the ``best_reference``, either as a name string or an integer index; the ``scores`` for the best label in - each reference, as a nested BiocFrame of BiocFrames; and the ``delta`` - from the best to the second-best reference. Each row corresponds to a - column of ``test``. + each reference, as a nested ``BiocFrame``; and the ``delta`` from the + best to the second-best reference. Each row corresponds to a column of + ``test_data``. 
""" if isinstance(test_data, summarizedexperiment.SummarizedExperiment): test_data = test_data.assay(assay_type) diff --git a/src/singler/classify_single.py b/src/singler/classify_single.py index fa79ffa..aacbe51 100644 --- a/src/singler/classify_single.py +++ b/src/singler/classify_single.py @@ -33,10 +33,10 @@ def classify_single( ref_prebuilt: A pre-built reference created with - :py:meth:`~singler.build_single_reference.build_single_reference`. + :py:func:`~singler.train_single.train_single`. assay_type: - Assay containing the expression matrix, if `test_data` is a + Assay containing the expression matrix, if ``test_data`` is a :py:class:`~summarizedexperiment.SummarizedExperiment.SummarizedExperiment`. quantile: @@ -57,7 +57,7 @@ def classify_single( Returns: A :py:class:`~BiocFrame.BiocFrame.BiocFrame` containing the ``best`` - label, the ``scores`` for each label (as a nested BiocFrame), and the + label, the ``scores`` for each label as a nested ``BiocFrame``, and the ``delta`` from the best to the second-best label. Each row corresponds to a column of ``test``. The metadata contains ``markers``, a list of the markers from each pairwise comparison between labels; and ``used``, diff --git a/src/singler/get_classic_markers.py b/src/singler/get_classic_markers.py index 3ca7f2a..1e6a54c 100644 --- a/src/singler/get_classic_markers.py +++ b/src/singler/get_classic_markers.py @@ -101,40 +101,51 @@ def get_classic_markers( Args: ref_data: - A matrix-like object containing the log-normalized expression values of a reference dataset. - Each column is a sample and each row is a feature. + A matrix-like object containing the log-normalized expression + values of a reference dataset. Each column is a sample and each + row is a feature. - Alternatively, this can be a :py:class:`~summarizedexperiment.SummarizedExperiment.SummarizedExperiment` + Alternatively, this can be a + :py:class:`~summarizedexperiment.SummarizedExperiment.SummarizedExperiment` containing a matrix-like object in one of its assays. - Alternatively, a list of such matrices or ``SummarizedExperiment`` objects, - typically for multiple batches of the same reference; - it is assumed that different batches exhibit at least some overlap in their ``features`` and ``labels``. + Alternatively, a list of such matrices or ``SummarizedExperiment`` + objects, typically for multiple batches of the same reference; it + is assumed that different batches exhibit at least some overlap in + their ``ref_features`` and ``ref_labels``. ref_labels: - A sequence of length equal to the number of columns of ``ref``, + If ``ref_data`` is not a list, ``ref_labels`` should be a sequence + of length equal to the number of columns of ``ref_data``, containing a label (usually a string) for each column. - Alternatively, a list of such sequences of length equal to that of a list ``ref``; - each sequence should have length equal to the number of columns of the corresponding entry of ``ref``. + + If ``ref_data`` is a list, ``ref_labels`` should also be a list of + the same length. Each entry should be a sequence of length equal to + the number of columns of the corresponding entry of ``ref_data``. ref_features: - A sequence of length equal to the number of rows of ``ref``, + If ``ref_data`` is not a list, ``ref_features`` should be a + sequence of length equal to the number of rows of ``ref_data``, containing the feature name (usually a string) for each row. 
-        Alternatively, a list of such sequences of length equal to that of a list ``ref``;
-        each sequence should have length equal to the number of rows of the corresponding entry of ``ref``.
+
+        If ``ref_data`` is a list, ``ref_features`` should also be a list
+        of the same length. Each entry should be a sequence of length
+        equal to the number of rows of the corresponding entry of ``ref_data``.
 
     assay_type:
-        Name or index of the assay containing the assay of interest,
-        if ``ref`` is or contains
-        :py:class:`~summarizedexperiment.SummarizedExperiment.SummarizedExperiment` objects.
+        Name or index of the assay of interest, if ``ref_data`` is or contains
+        :py:class:`~summarizedexperiment.SummarizedExperiment.SummarizedExperiment`
+        objects.
 
     check_missing:
-        Whether to check for and remove rows with missing (NaN) values in the reference matrices.
-        This can be set to False if it is known that no NaN values exist.
+        Whether to check for and remove rows with missing (NaN) values in
+        the reference matrices. This can be set to False if it is known
+        that no NaN values exist.
 
     num_de:
-        Number of differentially expressed genes to use as markers for each pairwise comparison between labels.
-        If None, an appropriate number of genes is automatically determined.
+        Number of differentially expressed genes to use as markers for each
+        pairwise comparison between labels. If ``None``, an appropriate
+        number of genes is automatically determined.
 
     num_threads:
         Number of threads to use for the calculations.
diff --git a/src/singler/train_integrated.py b/src/singler/train_integrated.py
index e4bcc46..600e1b0 100644
--- a/src/singler/train_integrated.py
+++ b/src/singler/train_integrated.py
@@ -22,7 +22,7 @@ def __init__(self, ptr, ref_names, ref_labels):
     @property
     def reference_names(self) -> Union[Sequence[str], None]:
         """Sequence containing the names of the references. Alternatively
-        None, if no names were supplied."""
+        ``None``, if no names were supplied."""
         return self._names
 
     @property
@@ -30,7 +30,7 @@ def reference_labels(self) -> list:
         """List of lists containing the names of the labels for each reference.
 
         Each entry corresponds to a reference in :py:attr:`~reference_names`,
-        if ``reference_names`` is not None.
+        if ``reference_names`` is not ``None``.
         """
         return self._labels
 
@@ -50,10 +50,10 @@ def train_integrated(
 
         ref_prebuilt:
             List of prebuilt references, typically created by calling
-            :py:meth:`~singler.build_single_reference.train_single`.
+            :py:meth:`~singler.train_single.train_single`.
 
         ref_names:
-            Sequence of names for the references. If None, these are
+            Sequence of names for the references. If ``None``, these are
             automatically generated.
 
         warn_lost:
diff --git a/src/singler/train_single.py b/src/singler/train_single.py
index 5bb2834..20c9de7 100644
--- a/src/singler/train_single.py
+++ b/src/singler/train_single.py
@@ -13,8 +13,8 @@
 
 class TrainedSingleReference:
     """A prebuilt reference object, typically created by
-    :py:meth:`~singler.build_single_reference.build_single_reference`. This is intended for advanced users only and
-    should not be serialized.
+    :py:meth:`~singler.train_single.train_single`. This is intended for
+    advanced users only and should not be serialized.
     """
 
     def __init__(
@@ -110,7 +110,7 @@ def train_single(
         ref_data:
             A matrix-like object where rows are features, columns are
             reference profiles, and each entry is the expression value.
- If `markers` is not provided, expression should be normalized + If ``markers`` is not provided, expression should be normalized and log-transformed in preparation for marker prioritization via differential expression analyses. Otherwise, any expression values are acceptable as only the ranking within each column is used. @@ -130,7 +130,7 @@ def train_single( assay_type: Assay containing the expression matrix, - if `ref_data` is a + if ``ref_data`` is a :py:class:`~summarizedexperiment.SummarizedExperiment.SummarizedExperiment`. check_missing: @@ -139,8 +139,8 @@ def train_single( restrict_to: Subset of available features to restrict to. Only features in - ``restrict_to`` will be used in the reference building. If None, - no restriction is performed. + ``restrict_to`` will be used in the reference building. If + ``None``, no restriction is performed. markers: Upregulated markers for each pairwise comparison between labels. @@ -159,9 +159,9 @@ def train_single( Further arguments to pass to the chosen marker detection method. Only used if ``markers`` is not supplied. - approximate: - Whether to use an approximate neighbor search to compute scores - during classification. + nn_parameters: + Algorithm for constructing the neighbor search index, used to + compute scores during classification. num_threads: Number of threads to use for reference building. From 6f5fd77f7c41800e0ad007dd73a1989a525a46a3 Mon Sep 17 00:00:00 2001 From: LTLA Date: Thu, 12 Dec 2024 16:46:58 -0800 Subject: [PATCH 15/29] Remove duplicated row names from the reference before marker detection. These cause trouble as they confuse our name-based approach to matters. --- src/singler/_utils.py | 25 ++++++++++++++----------- src/singler/train_single.py | 22 +++++++++++++++++++--- tests/test_train_single.py | 10 ++++++++++ 3 files changed, 43 insertions(+), 14 deletions(-) diff --git a/src/singler/_utils.py b/src/singler/_utils.py index dff601a..b9b061d 100644 --- a/src/singler/_utils.py +++ b/src/singler/_utils.py @@ -101,14 +101,17 @@ def _clean_matrix(x, features, assay_type, check_missing, num_threads): def _restrict_features(data, features, restrict_to): - if restrict_to is not None: - if not isinstance(restrict_to, set): - restrict_to = set(restrict_to) - keep = [] - new_features = [] - for i, x in enumerate(features): - if x in restrict_to: - keep.append(i) - new_features.append(x) - return delayedarray.DelayedArray(data)[keep, :], new_features - return data, features + if restrict_to is None: + return data, features + + if not isinstance(restrict_to, set): + restrict_to = set(restrict_to) + keep = [] + for i, x in enumerate(features): + if x in restrict_to: + keep.append(i) + + return ( + delayedarray.DelayedArray(data)[keep, :], + biocutils.subset_sequence(features, keep) + ) diff --git a/src/singler/train_single.py b/src/singler/train_single.py index 20c9de7..aa84f80 100644 --- a/src/singler/train_single.py +++ b/src/singler/train_single.py @@ -102,8 +102,9 @@ def train_single( marker_method: Literal["classic"] = "classic", marker_args: dict = {}, nn_parameters: Optional[knncolle.Parameters] = knncolle.AnnoyParameters(), + check_duplicated: bool = True, num_threads: int = 1, - ) -> TrainedSingleReference: +) -> TrainedSingleReference: """Build a single reference dataset in preparation for classification. Args: @@ -163,6 +164,10 @@ def train_single( Algorithm for constructing the neighbor search index, used to compute scores during classification. 
+ check_duplicated: + Whether to remove rows with duplicate feature names. This can be + set to False if ``ref_features`` does not contain any duplicates. + num_threads: Number of threads to use for reference building. @@ -179,8 +184,19 @@ def train_single( num_threads=num_threads, ) + if check_duplicated: + encountered = set() + keep = [] + for i, rg in enumerate(ref_features): + if rg not in encountered: + encountered.add(rg) + keep.append(i) + if len(keep) != len(ref_features): + ref_features = biocutils.subset_sequence(ref_features, keep) + ref_data = delayedarray.DelayedArray(ref_data)[keep,:] + unique_labels, label_idx = _factorize(ref_labels) - markers = identify_genes(ref_data, ref_features, ref_labels, unique_labels, markers, marker_method, test_features, restrict_to, marker_args, num_threads) + markers = _identify_genes(ref_data, ref_features, ref_labels, unique_labels, markers, marker_method, test_features, restrict_to, marker_args, num_threads) markers_idx = [None] * len(unique_labels) for outer_i, outer_k in enumerate(unique_labels): inner_instance = [None] * len(unique_labels) @@ -217,7 +233,7 @@ def train_single( ) -def identify_genes(ref_data, ref_features, ref_labels, unique_labels, markers, marker_method, test_features, restrict_to, marker_args, num_threads): +def _identify_genes(ref_data, ref_features, ref_labels, unique_labels, markers, marker_method, test_features, restrict_to, marker_args, num_threads): ref_data, ref_features = _restrict_features(ref_data, ref_features, test_features) ref_data, ref_features = _restrict_features(ref_data, ref_features, restrict_to) diff --git a/tests/test_train_single.py b/tests/test_train_single.py index bbb12b0..6cd6351 100644 --- a/tests/test_train_single.py +++ b/tests/test_train_single.py @@ -38,6 +38,16 @@ def test_train_single_markers(): assert built.markers == mbuilt.markers +def test_train_single_dedup(): + ref = numpy.random.rand(10000, 10) + labels = ["A", "B", "C", "D", "E", "E", "D", "C", "B", "A"] + features = [str(i) for i in range(ref.shape[0])] + features[0] = "1" + built = singler.train_single(ref, labels, features) + assert built.features == features[1:] # duplicates are ignored + assert built._full_data.shape[0] == len(built.features) + + def test_train_single_restricted(): ref = numpy.random.rand(10000, 10) labels = ["A", "B", "C", "D", "E", "E", "D", "C", "B", "A"] From ff7035d60ea4d30b4eb62ea412d87e032c243f42 Mon Sep 17 00:00:00 2001 From: Jayaram Kancherla Date: Thu, 12 Dec 2024 21:27:07 -0800 Subject: [PATCH 16/29] chore: remove Python 3.8 support (#28) * chore: remove Python 3.8 support * update actions * simplify build deps * bump assorthead and mattress * make changes to gh action * sync actions with mattress --- .github/workflows/build-docs.yml | 28 +++--- .github/workflows/pypi-publish.yml | 146 +++++++++++++++++++++++++++++ .github/workflows/pypi-test.yml | 101 +++----------------- .pre-commit-config.yaml | 29 +++--- CHANGELOG.md | 5 + pyproject.toml | 3 - setup.cfg | 2 +- 7 files changed, 189 insertions(+), 125 deletions(-) create mode 100644 .github/workflows/pypi-publish.yml diff --git a/.github/workflows/build-docs.yml b/.github/workflows/build-docs.yml index fbfe825..24a9c78 100644 --- a/.github/workflows/build-docs.yml +++ b/.github/workflows/build-docs.yml @@ -2,28 +2,23 @@ name: Build documentation on: push: - branches: - - master + tags: + - "*" jobs: - test: - name: Build docs + build_docs: + name: Build docs runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 - with: - submodules: true 
+ - uses: actions/checkout@v4 - - name: Set up Python 3.9 - uses: actions/setup-python@v2 + - name: Set up python + uses: actions/setup-python@v5 with: - python-version: 3.9 + python-version: 3.12 cache: 'pip' - - name: Set up ccache - uses: hendrikmuhs/ccache-action@v1.2 - - name: Install Python dependencies run: | python -m pip install --upgrade pip setuptools @@ -32,17 +27,16 @@ jobs: DOCDEPENDENCIES=$(python -c 'with open("docs/requirements.txt") as a: available = list(a); print(" ".join(map(lambda x : x.strip(), filter(lambda x : not x.startswith("#"), available))))') pip install ${DOCDEPENDENCIES} - # Note that doc building requires the inplace shared library. - name: Build docs run: | - CC="ccache gcc" python setup.py build_ext --inplace + touch src/singler/lib_singler.py sphinx-build --color -b html -d docs/doctrees docs docs/_build/html touch ./docs/_build/html/.nojekyll - name: GH Pages Deployment if: github.ref == 'refs/heads/master' || startsWith(github.ref, 'refs/tags/') - uses: JamesIves/github-pages-deploy-action@4.1.3 + uses: JamesIves/github-pages-deploy-action@v4 with: branch: gh-pages # The branch the action should deploy to. folder: ./docs/_build/html - clean: true # Automatically remove deleted files from the deploy branch + clean: true # Automatically remove deleted files from the deploy branch \ No newline at end of file diff --git a/.github/workflows/pypi-publish.yml b/.github/workflows/pypi-publish.yml new file mode 100644 index 0000000..6b2f949 --- /dev/null +++ b/.github/workflows/pypi-publish.yml @@ -0,0 +1,146 @@ +name: Publish to PyPI + +on: + push: + tags: + - "*" + +jobs: + build_linux_x86_64: + name: Build wheels for linux x86_64 + runs-on: ubuntu-latest + steps: + - name: Check out repository + uses: actions/checkout@v4 + + - name: Build wheels + uses: pypa/cibuildwheel@v2.16.2 + env: + CIBW_ARCHS: x86_64 + CIBW_PROJECT_REQUIRES_PYTHON: ">=3.9" + CIBW_MANYLINUX_X86_64_IMAGE: ghcr.io/artifactdb/prebuilt-hdf5/manylinux_x86_64:0.0.4 + CIBW_MUSLLINUX_X86_64_IMAGE: ghcr.io/artifactdb/prebuilt-hdf5/musllinux_x86_64:0.0.4 + CIBW_SKIP: pp* + + - uses: actions/upload-artifact@v4 + with: + name: cibw-wheels-ubuntu-x86_64 + path: ./wheelhouse/*.whl + + build_macosx_x86_64: + name: Build wheels for macosx x86_64 + runs-on: macos-13 + steps: + - name: Check out repository + uses: actions/checkout@v4 + + - name: Grab prebuilt dependencies + run: | + curl -L https://github.com/ArtifactDB/prebuilt-hdf5/releases/download/0.0.4/macosx_x86_64.tar.gz > bundle.tar.gz + tar -xvf bundle.tar.gz + + - name: Build wheels + uses: pypa/cibuildwheel@v2.16.2 + env: + CIBW_ARCHS: x86_64 + CIBW_PROJECT_REQUIRES_PYTHON: ">=3.9" + CIBW_ENVIRONMENT: "MORE_CMAKE_OPTIONS=\"-DCMAKE_INSTALL_PREFIX=$(pwd)/installed -DCMAKE_OSX_ARCHITECTURES=x86_64\"" + CIBW_BUILD_VERBOSITY: 3 + CIBW_SKIP: pp* + MACOSX_DEPLOYMENT_TARGET: 11.7 + + - uses: actions/upload-artifact@v4 + with: + name: cibw-wheels-macos-x86_64 + path: ./wheelhouse/*.whl + + build_macosx_arm64: + name: Build wheels for macosx arm64 + runs-on: macos-latest + steps: + - name: Check out repository + uses: actions/checkout@v4 + + - name: Grab prebuilt dependencies + run: | + curl -L https://github.com/ArtifactDB/prebuilt-hdf5/releases/download/0.0.4-manual/macosx_arm64.tar.gz > bundle.tar.gz + tar -xvf bundle.tar.gz + + - name: Build wheels + uses: pypa/cibuildwheel@v2.16.2 + env: + CIBW_ARCHS: arm64 + CIBW_PROJECT_REQUIRES_PYTHON: ">=3.9" + CIBW_ENVIRONMENT: "MORE_CMAKE_OPTIONS=\"-DCMAKE_INSTALL_PREFIX=$(pwd)/installed 
-DCMAKE_OSX_ARCHITECTURES=arm64\"" + CIBW_BUILD_VERBOSITY: 3 + MACOSX_DEPLOYMENT_TARGET: 13.0 + + - uses: actions/upload-artifact@v4 + with: + name: cibw-wheels-maxos_arm64 + path: ./wheelhouse/*.whl + +# build_windows_x86_64: +# name: Build wheels for windows x86_64 +# runs-on: windows-2019 +# steps: +# - name: Check out repository +# uses: actions/checkout@v4 +# +# - name: Grab prebuilt dependencies +# run: | +# curl -L https://github.com/ArtifactDB/prebuilt-hdf5/releases/download/0.0.4/windows_x86_64.tar.gz > bundle.tar.gz +# tar -xvf bundle.tar.gz +# shell: bash +# +# - name: Store path +# run: | +# $wd = pwd +# echo "INSTALL_DIR=$wd\\installed" >> $env:GITHUB_ENV +# shell: powershell +# +# - name: Build wheels +# uses: pypa/cibuildwheel@v2.16.2 +# env: +# CIBW_ARCHS: AMD64 +# CIBW_PROJECT_REQUIRES_PYTHON: ">=3.9" +# CIBW_ENVIRONMENT: "MORE_CMAKE_OPTIONS=\"-DCMAKE_INSTALL_PREFIX=${INSTALL_DIR}\" VERBOSE=1" +# CIBW_BEFORE_BUILD_WINDOWS: "pip install delvewheel" +# CIBW_REPAIR_WHEEL_COMMAND_WINDOWS: "delvewheel repair -w {dest_dir} {wheel}" +# CIBW_TEST_EXTRAS: "testing" +# CIBW_TEST_COMMAND: "pytest {package}/tests" +# CIBW_BUILD_VERBOSITY: 3 + # - uses: actions/upload-artifact@v3 + # with: + # path: ./wheelhouse/*.whl + + build_sdist: + name: Build source distribution + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + with: + submodules: true + + - name: Build sdist + run: pipx run build --sdist + + - uses: actions/upload-artifact@v4 + with: + name: cibw-sdist + path: dist/*.tar.gz + + upload_pypi: + needs: [build_linux_x86_64, build_macosx_x86_64, build_macosx_arm64, build_sdist] + runs-on: ubuntu-latest + steps: + - uses: actions/download-artifact@v4 + with: + pattern: cibw-* + path: dist + merge-multiple: true + + - uses: pypa/gh-action-pypi-publish@v1.12.2 + with: + user: __token__ + password: ${{ secrets.PYPI_PASSWORD }} diff --git a/.github/workflows/pypi-test.yml b/.github/workflows/pypi-test.yml index fa01acf..e583020 100644 --- a/.github/workflows/pypi-test.yml +++ b/.github/workflows/pypi-test.yml @@ -1,111 +1,32 @@ -# This workflow will install Python dependencies, run tests and lint with a single version of Python -# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions - name: Test the library on: push: branches: - master - tags: - - "*" pull_request: jobs: test: - name: Running tests runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"] + name: Python ${{ matrix.python-version }} steps: - - uses: actions/checkout@v3 - with: - submodules: true + - uses: actions/checkout@v4 - - name: Set up Python 3.9 - uses: actions/setup-python@v2 + - name: Setup Python + uses: actions/setup-python@v5 with: - python-version: 3.9 - cache: 'pip' + python-version: ${{ matrix.python-version }} + cache: "pip" - - name: Install python dependencies - run: | - python -m pip install --upgrade pip setuptools - DEPENDENCIES=$(python -c 'from setuptools.config.setupcfg import read_configuration as c; a = c("setup.cfg"); print(" ".join(a["options"]["install_requires"][1:] + a["options"]["extras_require"]["testing"][1:]))') - pip install ${DEPENDENCIES} + - name: Get latest CMake + uses: lukka/get-cmake@latest - # We do proper tests if we're on the master branch, or if we're creating a new release. 
- name: Test with tox - if: github.ref == 'refs/heads/master' || startsWith(github.ref, 'refs/tags/') run: | pip install tox tox - - # Otherwise, in a PR, we don't need the full clean test. - - name: Set up ccache - if: github.ref != 'refs/heads/master' && !startsWith(github.ref, 'refs/tags') - uses: hendrikmuhs/ccache-action@v1.2 - - - name: Quickly build and test - if: github.ref != 'refs/heads/master' && !startsWith(github.ref, 'refs/tags') - run: | - CC="ccache gcc" python setup.py install - pytest - - build_wheels: - name: Build wheels on ${{ matrix.os }} - if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/') - runs-on: ${{ matrix.os }} - strategy: - matrix: - os: [ubuntu-20.04, macos-11] # at some point get this to work on windows-2019 - - steps: - - uses: actions/checkout@v3 - with: - submodules: true - - - name: Build wheels - uses: pypa/cibuildwheel@v2.12.1 - env: - CIBW_ARCHS_MACOS: x86_64 arm64 - CIBW_ARCHS_LINUX: x86_64 # remove this later so we build for all linux archs - CIBW_PROJECT_REQUIRES_PYTHON: ">=3.9" - CIBW_SKIP: pp* - - uses: actions/upload-artifact@v3 - with: - path: ./wheelhouse/*.whl - - build_sdist: - name: Build source distribution - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - with: - submodules: true - - - name: Build sdist - run: pipx run build --sdist - - - uses: actions/upload-artifact@v3 - with: - path: dist/*.tar.gz - - upload_pypi: - needs: [test, build_wheels, build_sdist] - runs-on: ubuntu-latest - # upload to PyPI on every tag starting with 'v' - if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/') - # alternatively, to publish when a GitHub Release is created, use the following rule: - # if: github.event_name == 'release' && github.event.action == 'published' - steps: - - uses: actions/download-artifact@v3 - with: - # unpacks default artifact into dist/ - # if `name: artifact` is omitted, the action will create extra parent dir - name: artifact - path: dist - - - uses: pypa/gh-action-pypi-publish@v1.8.3 - with: - user: __token__ - password: ${{ secrets.PYPI_PASSWORD }} diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 9d8c075..e60a5f4 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -2,7 +2,7 @@ exclude: '^docs/conf.py' repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.4.0 + rev: v4.6.0 hooks: - id: trailing-whitespace - id: check-added-large-files @@ -17,26 +17,27 @@ repos: - id: mixed-line-ending args: ['--fix=auto'] # replace 'auto' with 'lf' to enforce Linux/Mac line endings or 'crlf' for Windows -- repo: https://github.com/PyCQA/docformatter - rev: v1.7.5 - hooks: - - id: docformatter - additional_dependencies: [tomli] - args: [--in-place, --wrap-descriptions=120, --wrap-summaries=120] - # --config, ./pyproject.toml +# - repo: https://github.com/PyCQA/docformatter +# rev: master +# hooks: +# - id: docformatter +# additional_dependencies: [tomli] +# args: [--in-place, --wrap-descriptions=120, --wrap-summaries=120] +# # --config, ./pyproject.toml -- repo: https://github.com/psf/black - rev: 23.9.1 - hooks: - - id: black - language_version: python3 +# - repo: https://github.com/psf/black +# rev: 24.8.0 +# hooks: +# - id: black +# language_version: python3 - repo: https://github.com/astral-sh/ruff-pre-commit # Ruff version. 
- rev: v0.0.287 + rev: v0.6.8 hooks: - id: ruff args: [--fix, --exit-non-zero-on-fix] + - id: ruff-format ## If like to embrace black styles even in the docs: # - repo: https://github.com/asottile/blacken-docs diff --git a/CHANGELOG.md b/CHANGELOG.md index d9204f8..0b850d3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,10 @@ # Changelog +## Version 0.4.0 + +- chore: Remove Python 3.8 (EOL). +- precommit: Replace docformatter with ruff's formatter. + ## Version 0.3.0 Compatibility with NumPy 2.0 diff --git a/pyproject.toml b/pyproject.toml index b584fea..2c40e41 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,6 +19,3 @@ convention = "google" [tool.ruff.per-file-ignores] "__init__.py" = ["E402", "F401"] - -[tool.black] -force-exclude = "__init__.py|cpphelpers|cpp" diff --git a/setup.cfg b/setup.cfg index 71e6b73..eca616d 100644 --- a/setup.cfg +++ b/setup.cfg @@ -41,7 +41,7 @@ package_dir = =src # Require a min/specific Python version (comma-separated conditions) -python_requires = >=3.8 +python_requires = >=3.9 # Add here dependencies of your project (line-separated), e.g. requests>=2.2,<3.0. # Version specifiers like >=2.2,<3.0 avoid problems due to API changes in From 462b05d9f1c42aefc9906e1855aedb74e62a951e Mon Sep 17 00:00:00 2001 From: LTLA Date: Thu, 12 Dec 2024 21:33:05 -0800 Subject: [PATCH 17/29] Added deprecation warnings for passing row/column data names. At least it'll still work. --- src/singler/_utils.py | 7 +++++++ src/singler/train_single.py | 21 ++++++++++++++++++++- 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/src/singler/_utils.py b/src/singler/_utils.py index b9b061d..07fcc92 100644 --- a/src/singler/_utils.py +++ b/src/singler/_utils.py @@ -5,6 +5,7 @@ import delayedarray import summarizedexperiment import mattress +import warnings def _factorize(x: Sequence) -> Tuple[list, numpy.ndarray]: @@ -70,6 +71,12 @@ def _clean_matrix(x, features, assay_type, check_missing, num_threads): if isinstance(x, summarizedexperiment.SummarizedExperiment): if features is None: features = x.get_row_names() + elif isinstance(features, str): + warnings.warn( + "setting 'features' to a column name of the row data is deprecated", + category=DeprecationWarning + ) + features = x.get_row_data().column(features) elif not isinstance(features, list): features = list(features) x = x.assay(assay_type) diff --git a/src/singler/train_single.py b/src/singler/train_single.py index aa84f80..79dc958 100644 --- a/src/singler/train_single.py +++ b/src/singler/train_single.py @@ -5,6 +5,7 @@ import knncolle import mattress import delayedarray +import warnings from . 
import lib_singler as lib from ._utils import _clean_matrix, _factorize, _restrict_features, _stable_intersect @@ -195,8 +196,26 @@ def train_single( ref_features = biocutils.subset_sequence(ref_features, keep) ref_data = delayedarray.DelayedArray(ref_data)[keep,:] + if isinstance(ref_labels, str): + warnings.warn( + "setting 'labels' to a column name of the column data is deprecated", + category=DeprecationWarning + ) + ref_labels = ref_data.get_column_data().column(ref_labels) unique_labels, label_idx = _factorize(ref_labels) - markers = _identify_genes(ref_data, ref_features, ref_labels, unique_labels, markers, marker_method, test_features, restrict_to, marker_args, num_threads) + + markers = _identify_genes( + ref_data=ref_data, + ref_features=ref_features, + ref_labels=ref_labels, + unique_labels=unique_labels, + markers=markers, + marker_method=marker_method, + test_features=test_features, + restrict_to=restrict_to, + marker_args=marker_args, + num_threads=num_threads + ) markers_idx = [None] * len(unique_labels) for outer_i, outer_k in enumerate(unique_labels): inner_instance = [None] * len(unique_labels) From bdbb9a80f829926470cd2514c53350e3db2e3230 Mon Sep 17 00:00:00 2001 From: LTLA Date: Thu, 12 Dec 2024 21:40:34 -0800 Subject: [PATCH 18/29] Got the tests to pass, more or less. --- src/singler/_utils.py | 1 + tests/test_utils.py | 55 ++++++++++++++++++++----------------------- 2 files changed, 27 insertions(+), 29 deletions(-) diff --git a/src/singler/_utils.py b/src/singler/_utils.py index 07fcc92..7b397b1 100644 --- a/src/singler/_utils.py +++ b/src/singler/_utils.py @@ -10,6 +10,7 @@ def _factorize(x: Sequence) -> Tuple[list, numpy.ndarray]: f = biocutils.Factor.from_sequence(x, sort_levels=False) + print(f) return f.levels, numpy.array(f.codes, dtype=numpy.uint32) diff --git a/tests/test_utils.py b/tests/test_utils.py index db333d6..e112230 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -4,9 +4,8 @@ _stable_union, _clean_matrix, ) -import numpy as np -from mattress import tatamize -from summarizedexperiment import SummarizedExperiment +import numpy +import summarizedexperiment def test_factorize(): @@ -19,10 +18,6 @@ def test_factorize(): assert list(lev) == ["C", "D", "A", "B"] assert (ind == [0, 1, 2, 3, 0, 2]).all() - # Handles None-ness. - lev, ind = _factorize([1, None, 5, None, 3, None]) - assert list(lev) == ["1", "5", "3"] - assert (ind == [0, -1, 1, -1, 2, -1]).all() def test_intersect(): # Preserves the order in the first argument. 
@@ -82,42 +77,44 @@ def test_union(): def test_clean_matrix(): - out = np.random.rand(20, 10) + out = numpy.random.rand(20, 10) features = ["FEATURE_" + str(i) for i in range(out.shape[0])] - ptr, feats = _clean_matrix( + out2, feats = _clean_matrix( out, features, assay_type=None, check_missing=True, num_threads=1 ) assert feats == features - assert (ptr.row(1) == out[1, :]).all() - assert (ptr.column(2) == out[:, 2]).all() + assert (out2[1, :] == out[1, :]).all() + assert (out2[:, 2] == out[:, 2]).all() - ptr, feats = _clean_matrix( + out2, feats = _clean_matrix( out, features, assay_type=None, check_missing=False, num_threads=1 ) assert feats == features - assert (ptr.row(3) == out[3, :]).all() - assert (ptr.column(4) == out[:, 4]).all() + assert (out2[3, :] == out[3, :]).all() + assert (out2[:, 4] == out[:, 4]).all() - tmp = np.copy(out) - tmp[0, 5] = np.nan - ptr, feats = _clean_matrix( + tmp = numpy.copy(out) + tmp[0, 5] = numpy.nan + out2, feats = _clean_matrix( tmp, features, assay_type=None, check_missing=True, num_threads=1 ) assert feats == features[1:] - assert (ptr.row(2) == out[3, :]).all() - assert (ptr.column(4) == out[1:, 4]).all() - - ptr = tatamize(out) - ptr2, feats = _clean_matrix( - ptr, features, assay_type=None, check_missing=True, num_threads=1 - ) - assert ptr2.ptr == ptr.ptr + assert (out2[2, :] == out[3, :]).all() + assert (out2[:, 4] == out[1:, 4]).all() - se = SummarizedExperiment({"counts": out}) - ptr, feats = _clean_matrix( + se = summarizedexperiment.SummarizedExperiment({"counts": out}) + out2, feats = _clean_matrix( se, features, assay_type="counts", check_missing=True, num_threads=1 ) assert feats == features - assert (ptr.row(1) == out[1, :]).all() - assert (ptr.column(2) == out[:, 2]).all() + assert (out2[1, :] == out[1, :]).all() + assert (out2[:, 2] == out[:, 2]).all() + + se2 = se.set_row_names(features) + out2, feats = _clean_matrix( + se2, None, assay_type="counts", check_missing=True, num_threads=1 + ) + assert feats.as_list() == features + assert (out2[1, :] == out[1, :]).all() + assert (out2[:, 2] == out[:, 2]).all() From dec49712ba174d1201c93d34d61b20933f693586 Mon Sep 17 00:00:00 2001 From: LTLA Date: Thu, 12 Dec 2024 22:14:00 -0800 Subject: [PATCH 19/29] Got more tests to pass. 
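One of the changes below replaces the private `_factorize()` helper with a direct use of `biocutils.Factor`. A minimal sketch of the equivalent behavior, matching the new code in `train_single()`:

```python
import numpy
import biocutils

labels = ["B", "A", "B", "C"]
f = biocutils.Factor.from_sequence(labels, sort_levels=False)
unique_labels = f.levels                              # ["B", "A", "C"], in order of appearance
label_idx = f.codes.astype(numpy.uint32, copy=False)  # array([0, 1, 0, 2], dtype=uint32)
```

Unlike the old helper, missing labels are no longer encoded as -1; `train_single()` now raises a `ValueError` if any entry of `ref_labels` is `None`.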
--- src/singler/_utils.py | 6 ---- src/singler/annotate_integrated.py | 12 +++++++- src/singler/annotate_single.py | 7 +++++ src/singler/train_single.py | 22 +++++++++----- tests/test_integrated_with_celldex.py | 44 +++++++++++++++------------ tests/test_single_with_celldex.py | 13 +------- tests/test_train_integrated.py | 2 +- tests/test_utils.py | 12 -------- 8 files changed, 59 insertions(+), 59 deletions(-) diff --git a/src/singler/_utils.py b/src/singler/_utils.py index 7b397b1..603edd1 100644 --- a/src/singler/_utils.py +++ b/src/singler/_utils.py @@ -8,12 +8,6 @@ import warnings -def _factorize(x: Sequence) -> Tuple[list, numpy.ndarray]: - f = biocutils.Factor.from_sequence(x, sort_levels=False) - print(f) - return f.levels, numpy.array(f.codes, dtype=numpy.uint32) - - def _create_map(x: Sequence) -> dict: mapping = {} for i, val in enumerate(x): diff --git a/src/singler/annotate_integrated.py b/src/singler/annotate_integrated.py index e1de142..a574c00 100644 --- a/src/singler/annotate_integrated.py +++ b/src/singler/annotate_integrated.py @@ -1,6 +1,7 @@ from typing import Any, Optional, Sequence, Tuple, Union import biocframe +import warnings from ._utils import _clean_matrix, _restrict_features from .train_single import train_single @@ -132,6 +133,15 @@ def annotate_integrated( all_ref_labels = [] all_ref_features = [] for r in range(nrefs): + curref_labels = ref_labels[r] + if isinstance(curref_labels, str): + warnings.warn( + "setting 'ref_labels' to a column name of the column data is deprecated", + category=DeprecationWarning + ) + curref_labels = ref_data[r].get_column_data().column(curref_labels) + all_ref_labels.append(curref_labels) + curref_data, curref_features = _clean_matrix( ref_data[r], ref_features[r], @@ -161,7 +171,7 @@ def annotate_integrated( for r in range(nrefs): curbuilt = train_single( ref_data=all_ref_data[r], - ref_labels=ref_labels[r], + ref_labels=all_ref_labels[r], ref_features=all_ref_features[r], test_features=test_features, **train_single_args, diff --git a/src/singler/annotate_single.py b/src/singler/annotate_single.py index 11914c6..ae25c70 100644 --- a/src/singler/annotate_single.py +++ b/src/singler/annotate_single.py @@ -91,6 +91,13 @@ def annotate_single( A :py:class:`~biocframe.BiocFrame.BiocFrame` of labelling results, see :py:func:`~singler.classify_single.classify_single` for details. """ + if isinstance(ref_labels, str): + warnings.warn( + "setting 'ref_labels' to a column name of the column data is deprecated", + category=DeprecationWarning + ) + ref_labels = ref_data.get_column_data().column(ref_labels) + test_data, test_features = _clean_matrix( test_data, test_features, diff --git a/src/singler/train_single.py b/src/singler/train_single.py index 79dc958..1eaaaab 100644 --- a/src/singler/train_single.py +++ b/src/singler/train_single.py @@ -8,7 +8,7 @@ import warnings from . import lib_singler as lib -from ._utils import _clean_matrix, _factorize, _restrict_features, _stable_intersect +from ._utils import _clean_matrix, _restrict_features, _stable_intersect from .get_classic_markers import get_classic_markers @@ -176,6 +176,12 @@ def train_single( The pre-built reference, ready for use in downstream methods like :py:meth:`~singler.classify_single_reference.classify_single`. 
""" + if isinstance(ref_labels, str): + warnings.warn( + "setting 'ref_labels' to a column name of the column data is deprecated", + category=DeprecationWarning + ) + ref_labels = ref_data.get_column_data().column(ref_labels) ref_data, ref_features = _clean_matrix( ref_data, @@ -196,13 +202,13 @@ def train_single( ref_features = biocutils.subset_sequence(ref_features, keep) ref_data = delayedarray.DelayedArray(ref_data)[keep,:] - if isinstance(ref_labels, str): - warnings.warn( - "setting 'labels' to a column name of the column data is deprecated", - category=DeprecationWarning - ) - ref_labels = ref_data.get_column_data().column(ref_labels) - unique_labels, label_idx = _factorize(ref_labels) + for f in ref_labels: + if f is None: + raise ValueError("entries of 'ref_labels' cannot be missing") + if not isinstance(ref_labels, biocutils.Factor): # TODO: move over to biocutils so coercion can no-op. + ref_labels = biocutils.Factor.from_sequence(ref_labels, sort_levels=False) # TODO: add a dtype= option. + unique_labels = ref_labels.levels + label_idx = ref_labels.codes.astype(dtype=numpy.uint32, copy=False) markers = _identify_genes( ref_data=ref_data, diff --git a/tests/test_integrated_with_celldex.py b/tests/test_integrated_with_celldex.py index 8181d66..dc53c0c 100644 --- a/tests/test_integrated_with_celldex.py +++ b/tests/test_integrated_with_celldex.py @@ -16,19 +16,17 @@ def test_with_minimal_args(): ) immune_cell_ref = celldex.fetch_reference("dice", "2024-02-26", realize_assays=True) - with pytest.raises(Exception): - singler.annotate_integrated( - test_data=sce.assays["counts"], - ref_data_list=(blueprint_ref, immune_cell_ref), - ref_labels_list="label.main", - num_threads=6, - ) - single, integrated = singler.annotate_integrated( test_data=sce, - ref_data_list=(blueprint_ref, immune_cell_ref), - ref_labels_list="label.main", - num_threads=6, + ref_data=( + blueprint_ref, + immune_cell_ref + ), + ref_labels=[ + blueprint_ref.get_column_data().column("label.main"), + immune_cell_ref.get_column_data().column("label.main") + ], + num_threads=2 ) assert len(single) == 2 assert isinstance(integrated, BiocFrame) @@ -43,14 +41,21 @@ def test_with_all_supplied(): immune_cell_ref = celldex.fetch_reference("dice", "2024-02-26", realize_assays=True) single, integrated = singler.annotate_integrated( - test_data=sce, + test_data=sce.assays["counts"], test_features=sce.get_row_names(), - ref_data_list=(blueprint_ref, immune_cell_ref), - ref_labels_list=[ - x.get_column_data().column("label.main") - for x in (blueprint_ref, immune_cell_ref) + ref_data=( + blueprint_ref, + immune_cell_ref + ), + ref_labels=[ + blueprint_ref.get_column_data().column("label.main"), + immune_cell_ref.get_column_data().column("label.main") + ], + ref_features=[ + blueprint_ref.get_row_names(), + immune_cell_ref.get_row_names() ], - ref_features_list=[x.get_row_names() for x in (blueprint_ref, immune_cell_ref)], + num_threads=2 ) assert len(single) == 2 @@ -67,8 +72,9 @@ def test_with_colname(): single, integrated = singler.annotate_integrated( test_data=sce, - ref_data_list=(blueprint_ref, immune_cell_ref), - ref_labels_list="label.main", + ref_data=(blueprint_ref, immune_cell_ref), + ref_labels=["label.main"] * 2, + num_threads=2 ) assert len(single) == 2 diff --git a/tests/test_single_with_celldex.py b/tests/test_single_with_celldex.py index b5cd6db..eeef181 100644 --- a/tests/test_single_with_celldex.py +++ b/tests/test_single_with_celldex.py @@ -4,21 +4,12 @@ import scrnaseq import pandas as pd import scipy -import 
pytest from biocframe import BiocFrame def test_with_minimal_args(): sce = scrnaseq.fetch_dataset("zeisel-brain-2015", "2023-12-14", realize_assays=True) - immgen_ref = celldex.fetch_reference("immgen", "2024-02-26", realize_assays=True) - with pytest.raises(Exception): - matches = singler.annotate_single( - test_data=sce.assays["counts"], - ref_data=immgen_ref, - ref_labels=immgen_ref.get_column_data().column("label.main"), - ) - matches = singler.annotate_single( test_data=sce, ref_data=immgen_ref, @@ -32,11 +23,10 @@ def test_with_minimal_args(): def test_with_all_supplied(): sce = scrnaseq.fetch_dataset("zeisel-brain-2015", "2023-12-14", realize_assays=True) - immgen_ref = celldex.fetch_reference("immgen", "2024-02-26", realize_assays=True) matches = singler.annotate_single( - test_data=sce, + test_data=sce.assays["counts"], test_features=sce.get_row_names(), ref_data=immgen_ref, ref_labels=immgen_ref.get_column_data().column("label.main"), @@ -50,7 +40,6 @@ def test_with_all_supplied(): def test_with_colname(): sce = scrnaseq.fetch_dataset("zeisel-brain-2015", "2023-12-14", realize_assays=True) - immgen_ref = celldex.fetch_reference("immgen", "2024-02-26", realize_assays=True) matches = singler.annotate_single( diff --git a/tests/test_train_integrated.py b/tests/test_train_integrated.py index 064d8b4..7e2a581 100644 --- a/tests/test_train_integrated.py +++ b/tests/test_train_integrated.py @@ -30,7 +30,7 @@ def test_train_integrated(): test_features, ref_prebuilt=[built1, built2], ref_names=["FOO", "BAR"], - num_threads=3, + num_threads=2, ) assert pintegrated.reference_names == ["FOO", "BAR"] diff --git a/tests/test_utils.py b/tests/test_utils.py index e112230..4d525f0 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,5 +1,4 @@ from singler._utils import ( - _factorize, _stable_intersect, _stable_union, _clean_matrix, @@ -8,17 +7,6 @@ import summarizedexperiment -def test_factorize(): - lev, ind = _factorize([1, 3, 5, 5, 3, 1]) - assert list(lev) == ["1", "3", "5"] - assert (ind == [0, 1, 2, 2, 1, 0]).all() - - # Preserves the order. - lev, ind = _factorize(["C", "D", "A", "B", "C", "A"]) - assert list(lev) == ["C", "D", "A", "B"] - assert (ind == [0, 1, 2, 3, 0, 2]).all() - - def test_intersect(): # Preserves the order in the first argument. out = _stable_intersect(["B", "C", "A", "D", "E"], ["A", "C", "E"]) From 4880cf4da0469cc23ee4c1bb4124e2efbf2987a1 Mon Sep 17 00:00:00 2001 From: LTLA Date: Thu, 12 Dec 2024 22:35:31 -0800 Subject: [PATCH 20/29] Skip 3.13 builds for now as dolomite doesn't want to build. --- .github/workflows/pypi-test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pypi-test.yml b/.github/workflows/pypi-test.yml index e583020..ca0b9ab 100644 --- a/.github/workflows/pypi-test.yml +++ b/.github/workflows/pypi-test.yml @@ -11,7 +11,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"] + python-version: ["3.9", "3.10", "3.11", "3.12"] name: Python ${{ matrix.python-version }} steps: From 386d4b1be5e96c1c0018a9c4ea64c66ed50caf54 Mon Sep 17 00:00:00 2001 From: LTLA Date: Thu, 12 Dec 2024 22:45:06 -0800 Subject: [PATCH 21/29] Remove unnecessary mattress instruction in the install_requires. 
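A quick, hypothetical way to confirm the effect of this change is to inspect the build requirements from the repository root using only the standard library (`tomllib` needs Python 3.11+). Note that PATCH 24/29 below reinstates the requirement, so the assertion only holds at this point in the series.

```python
import tomllib

with open("pyproject.toml", "rb") as handle:
    requires = tomllib.load(handle)["build-system"]["requires"]

# mattress should no longer appear among the build-time dependencies.
assert not any(dep.startswith("mattress") for dep in requires)
```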
--- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 2c40e41..525baed 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [build-system] # AVOID CHANGING REQUIRES: IT WILL BE UPDATED BY PYSCAFFOLD! -requires = ["setuptools>=46.1.0", "setuptools_scm[toml]>=5", "mattress>=0.2.0", "pybind11", "assorthead>=0.1.0"] +requires = ["setuptools>=46.1.0", "setuptools_scm[toml]>=5", "pybind11", "assorthead>=0.1.0"] build-backend = "setuptools.build_meta" [tool.setuptools_scm] From d55db15815aab2a3084717a1204c0e62a9adeb7b Mon Sep 17 00:00:00 2001 From: LTLA Date: Fri, 13 Dec 2024 11:16:31 -0800 Subject: [PATCH 22/29] Switch to the latest mattress. --- lib/CMakeLists.txt | 2 ++ lib/src/classify_integrated.cpp | 19 +++++++++++-------- lib/src/classify_single.cpp | 19 +++++++++++-------- lib/src/def.h | 10 +++------- lib/src/find_classic_markers.cpp | 7 ++++--- lib/src/train_integrated.cpp | 11 ++++++----- lib/src/train_single.cpp | 20 ++++++++++++-------- setup.cfg | 2 +- setup.py | 6 +++++- src/singler/train_single.py | 2 +- 10 files changed, 56 insertions(+), 42 deletions(-) diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt index a8eacf3..9aa174e 100644 --- a/lib/CMakeLists.txt +++ b/lib/CMakeLists.txt @@ -19,6 +19,8 @@ pybind11_add_module(singler ) target_include_directories(singler PRIVATE "${ASSORTHEAD_INCLUDE_DIR}") +target_include_directories(singler PRIVATE "${MATTRESS_INCLUDE_DIR}") +target_include_directories(singler PRIVATE "${KNNCOLLE_INCLUDE_DIR}") set_property(TARGET singler PROPERTY CXX_STANDARD 17) diff --git a/lib/src/classify_integrated.cpp b/lib/src/classify_integrated.cpp index afa1df2..2b1cda3 100644 --- a/lib/src/classify_integrated.cpp +++ b/lib/src/classify_integrated.cpp @@ -1,5 +1,6 @@ #include "def.h" #include "utils.h" +#include "mattress.h" #include "singlepp/singlepp.hpp" #include "tatami/tatami.hpp" @@ -10,7 +11,7 @@ #include pybind11::tuple classify_integrated( - const MatrixPointer& test, + uintptr_t test_ptr, const pybind11::list& results, const TrainedIntegratedPointer& integrated_build, double quantile, @@ -18,6 +19,8 @@ pybind11::tuple classify_integrated( double fine_tune_threshold, int nthreads) { + const auto& test = mattress::cast(test_ptr)->ptr; + // Setting up the previous results. size_t num_refs = results.size(); std::vector previous_results; @@ -29,18 +32,18 @@ pybind11::tuple classify_integrated( // Setting up outputs. size_t ncells = test->ncol(); - pybind11::array_t best(ncells); - pybind11::array_t delta(ncells); + pybind11::array_t best(ncells); + pybind11::array_t delta(ncells); - singlepp::ClassifyIntegratedBuffers buffers; - buffers.best = static_cast(best.request().ptr); - buffers.delta = static_cast(delta.request().ptr); + singlepp::ClassifyIntegratedBuffers buffers; + buffers.best = static_cast(best.request().ptr); + buffers.delta = static_cast(delta.request().ptr); pybind11::list scores(num_refs); buffers.scores.resize(num_refs); for (size_t l = 0; l < num_refs; ++l) { - scores[l] = pybind11::array_t(ncells); - buffers.scores[l] = static_cast(scores[l].cast().request().ptr); + scores[l] = pybind11::array_t(ncells); + buffers.scores[l] = static_cast(scores[l].cast().request().ptr); } // Running the integrated scoring. 
diff --git a/lib/src/classify_single.cpp b/lib/src/classify_single.cpp index b6784c5..742aa83 100644 --- a/lib/src/classify_single.cpp +++ b/lib/src/classify_single.cpp @@ -1,5 +1,6 @@ #include "def.h" #include "utils.h" +#include "mattress.h" #include "singlepp/singlepp.hpp" #include "tatami/tatami.hpp" @@ -9,22 +10,24 @@ #include #include -pybind11::tuple classify_single(const MatrixPointer& test, const TrainedSingleIntersectPointer& built, double quantile, bool use_fine_tune, double fine_tune_threshold, int nthreads) { +pybind11::tuple classify_single(uintptr_t test_ptr, const TrainedSingleIntersectPointer& built, double quantile, bool use_fine_tune, double fine_tune_threshold, int nthreads) { + const auto& test = mattress::cast(test_ptr)->ptr; + // Setting up outputs. size_t ncells = test->ncol(); - pybind11::array_t best(ncells); - pybind11::array_t delta(ncells); + pybind11::array_t best(ncells); + pybind11::array_t delta(ncells); - singlepp::ClassifySingleBuffers buffers; - buffers.best = static_cast(best.request().ptr); - buffers.delta = static_cast(delta.request().ptr); + singlepp::ClassifySingleBuffers buffers; + buffers.best = static_cast(best.request().ptr); + buffers.delta = static_cast(delta.request().ptr); size_t nlabels = built->num_labels(); pybind11::list scores(nlabels); buffers.scores.resize(nlabels); for (size_t l = 0; l < nlabels; ++l) { - scores[l] = pybind11::array_t(ncells); - buffers.scores[l] = static_cast(scores[l].cast().request().ptr); + scores[l] = pybind11::array_t(ncells); + buffers.scores[l] = static_cast(scores[l].cast().request().ptr); } // Running the analysis. diff --git a/lib/src/def.h b/lib/src/def.h index df0491b..883d2c8 100644 --- a/lib/src/def.h +++ b/lib/src/def.h @@ -3,18 +3,14 @@ #include #include -#include "tatami/tatami.hpp" #include "singlepp/singlepp.hpp" - -typedef uint32_t MatrixIndex; -typedef double MatrixValue; -typedef std::shared_ptr > MatrixPointer; +#include "mattress.h" typedef std::shared_ptr, double> > BuilderPointer; -typedef singlepp::TrainedSingleIntersect TrainedSingleIntersect; +typedef singlepp::TrainedSingleIntersect TrainedSingleIntersect; typedef std::shared_ptr TrainedSingleIntersectPointer; -typedef singlepp::TrainedIntegrated TrainedIntegrated; +typedef singlepp::TrainedIntegrated TrainedIntegrated; typedef std::shared_ptr TrainedIntegratedPointer; #endif diff --git a/lib/src/find_classic_markers.cpp b/lib/src/find_classic_markers.cpp index 2f2888d..9e7a9ec 100644 --- a/lib/src/find_classic_markers.cpp +++ b/lib/src/find_classic_markers.cpp @@ -1,5 +1,6 @@ #include "def.h" #include "utils.h" +#include "mattress.h" #include "singlepp/singlepp.hpp" #include "tatami/tatami.hpp" @@ -15,13 +16,13 @@ pybind11::list find_classic_markers(uint32_t num_labels, uint32_t num_genes, con throw std::runtime_error("'ref' and 'labels' should have the same length"); } - std::vector*> ref_ptrs; + std::vector*> ref_ptrs; ref_ptrs.reserve(num_ref); std::vector lab_ptrs; lab_ptrs.reserve(num_ref); for (size_t r = 0; r < num_ref; ++r) { - auto ptr = reference[r].cast().get(); + auto ptr = mattress::cast(reference[r].cast())->ptr.get(); ref_ptrs.emplace_back(ptr); if (ptr->nrow() != num_genes) { throw std::runtime_error("each entry of 'ref' should have number of rows equal to 'ngenes'"); @@ -46,7 +47,7 @@ pybind11::list find_classic_markers(uint32_t num_labels, uint32_t num_genes, con const auto& src = store[l]; pybind11::list dest(num_labels); for (uint32_t l2 = 0; l2 < num_labels; ++l2) { - dest[l2] = pybind11::array_t(src[l2].size(), 
src[l2].data()); + dest[l2] = pybind11::array_t(src[l2].size(), src[l2].data()); } output[l] = dest; } diff --git a/lib/src/train_integrated.cpp b/lib/src/train_integrated.cpp index 12a154f..a62840d 100644 --- a/lib/src/train_integrated.cpp +++ b/lib/src/train_integrated.cpp @@ -1,5 +1,6 @@ #include "def.h" #include "utils.h" +#include "mattress.h" #include "singlepp/singlepp.hpp" #include "tatami/tatami.hpp" @@ -18,12 +19,12 @@ TrainedIntegratedPointer train_integrated( int nthreads) { size_t nrefs = references.size(); - std::vector > inputs; + std::vector > inputs; inputs.reserve(nrefs); - std::vector > intersections(nrefs); + std::vector > intersections(nrefs); for (size_t r = 0; r < nrefs; ++r) { - const auto& curref = references[r].cast(); + const auto& curref = mattress::cast(references[r].cast())->ptr; const auto& curlabels = labels[r].cast(); const auto& curbuilt = prebuilt[r].cast(); @@ -33,8 +34,8 @@ TrainedIntegratedPointer train_integrated( if (ninter != static_cast(ref_ids.size())) { throw std::runtime_error("length of each entry of 'test_features' and 'ref_features' should be the same"); } - const auto* test_ids_ptr = check_numpy_array(test_ids); - const auto* ref_ids_ptr = check_numpy_array(ref_ids); + const auto* test_ids_ptr = check_numpy_array(test_ids); + const auto* ref_ids_ptr = check_numpy_array(ref_ids); auto& curinter = intersections[r]; curinter.reserve(ninter); diff --git a/lib/src/train_single.cpp b/lib/src/train_single.cpp index 912dbd4..e74137c 100644 --- a/lib/src/train_single.cpp +++ b/lib/src/train_single.cpp @@ -1,5 +1,7 @@ #include "def.h" #include "utils.h" +#include "mattress.h" +#include "knncolle_py.h" #include "singlepp/singlepp.hpp" #include "tatami/tatami.hpp" @@ -11,27 +13,29 @@ TrainedSingleIntersectPointer train_single( const pybind11::array& test_features, - const MatrixPointer& ref, + uintptr_t ref_ptr, const pybind11::array& ref_features, const pybind11::array& labels, const pybind11::list& markers, const BuilderPointer& builder, int nthreads) { - singlepp::TrainSingleOptions opts; + const auto& ref = mattress::cast(ref_ptr)->ptr; + + singlepp::TrainSingleOptions opts; opts.num_threads = nthreads; opts.top = -1; // Use all available markers; assume subsetting was applied on the Python side. opts.trainer = BuilderPointer(BuilderPointer{}, builder.get()); // make a no-op shared pointer. auto NR = ref->nrow(); auto NC = ref->ncol(); - if (static_cast(labels.size()) != NC) { + if (static_cast(labels.size()) != NC) { throw std::runtime_error("length of 'labels' is equal to the number of columns of 'ref'"); } // Setting up the markers. 
size_t ngroups = markers.size(); - singlepp::Markers markers2(ngroups); + singlepp::Markers markers2(ngroups); for (size_t m = 0; m < ngroups; ++m) { auto curmarkers = markers[m].cast(); auto& curmarkers2 = markers2[m]; @@ -40,7 +44,7 @@ TrainedSingleIntersectPointer train_single( for (size_t n = 0; n < inner_ngroups; ++n) { auto seq = curmarkers[n].cast(); - auto sptr = check_numpy_array(seq); + auto sptr = check_numpy_array(seq); auto& seq2 = curmarkers2[n]; seq2.insert(seq2.end(), sptr, sptr + seq.size()); } @@ -53,7 +57,7 @@ TrainedSingleIntersectPointer train_single( } auto tfptr = check_numpy_array(test_features); auto rfptr = check_numpy_array(ref_features); - singlepp::Intersection inter; + singlepp::Intersection inter; inter.reserve(ninter); for (size_t i = 0; i < ninter; ++i) { inter.emplace_back(tfptr[i], rfptr[i]); @@ -71,9 +75,9 @@ TrainedSingleIntersectPointer train_single( return TrainedSingleIntersectPointer(new decltype(built)(std::move(built))); } -pybind11::array_t get_markers_from_single_reference(const TrainedSingleIntersectPointer& ptr) { +pybind11::array_t get_markers_from_single_reference(const TrainedSingleIntersectPointer& ptr) { const auto& rsub = ptr->get_ref_subset(); - return pybind11::array_t(rsub.size(), rsub.data()); + return pybind11::array_t(rsub.size(), rsub.data()); } uint32_t get_num_markers_from_single_reference(const TrainedSingleIntersectPointer& ptr) { diff --git a/setup.cfg b/setup.cfg index eca616d..79034a0 100644 --- a/setup.cfg +++ b/setup.cfg @@ -49,7 +49,7 @@ python_requires = >=3.9 # For more information, check out https://semver.org/. install_requires = importlib-metadata; python_version<"3.8" - mattress>=0.3.0 + mattress>=0.3.1 knncolle delayedarray biocframe>=0.5.0 diff --git a/setup.py b/setup.py index 0d70471..6305619 100644 --- a/setup.py +++ b/setup.py @@ -30,13 +30,17 @@ def build_cmake(self, ext): if not os.path.exists(build_temp): import assorthead + import mattress + import knncolle cmd = [ "cmake", "-S", "lib", "-B", build_temp, "-Dpybind11_DIR=" + os.path.join(os.path.dirname(pybind11.__file__), "share", "cmake", "pybind11"), "-DPYTHON_EXECUTABLE=" + sys.executable, - "-DASSORTHEAD_INCLUDE_DIR=" + assorthead.includes() + "-DASSORTHEAD_INCLUDE_DIR=" + assorthead.includes(), + "-DMATTRESS_INCLUDE_DIR=" + mattress.includes(), + "-DKNNCOLLE_INCLUDE_DIR=" + knncolle.includes() ] if os.name != "nt": cmd.append("-DCMAKE_BUILD_TYPE=Release") diff --git a/src/singler/train_single.py b/src/singler/train_single.py index 1eaaaab..449e449 100644 --- a/src/singler/train_single.py +++ b/src/singler/train_single.py @@ -247,7 +247,7 @@ def train_single( ref_features_idx, label_idx, markers_idx, - builder, + builder.ptr, num_threads, ), full_data = ref_data, From 60b7fd1e66113ba88d1127754093cb4eee13f1e8 Mon Sep 17 00:00:00 2001 From: LTLA Date: Fri, 13 Dec 2024 15:03:05 -0800 Subject: [PATCH 23/29] Updated to use the latest knncolle. 
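Nothing changes for end users on the Python side: the neighbor-search algorithm is still chosen via `nn_parameters`, and `train_single()` hands the resulting builder to the C++ layer as a raw pointer (`builder.ptr`, introduced in PATCH 22/29). A minimal sketch, assuming `ref`, `labels` and `features` are defined as in the tests:

```python
import knncolle
import singler

built = singler.train_single(
    ref_data=ref,
    ref_labels=labels,
    ref_features=features,
    nn_parameters=knncolle.AnnoyParameters(),  # the default; any knncolle.Parameters should work
)
```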
---
 lib/src/train_single.cpp | 3 ++-
 setup.cfg                | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/lib/src/train_single.cpp b/lib/src/train_single.cpp
index e74137c..540b649 100644
--- a/lib/src/train_single.cpp
+++ b/lib/src/train_single.cpp
@@ -17,10 +17,11 @@ TrainedSingleIntersectPointer train_single(
     const pybind11::array& ref_features,
     const pybind11::array& labels,
     const pybind11::list& markers,
-    const BuilderPointer& builder,
+    uintptr_t builder_ptr,
     int nthreads) {
     const auto& ref = mattress::cast(ref_ptr)->ptr;
+    const auto& builder = knncolle_py::cast_builder(builder_ptr)->ptr;
 
     singlepp::TrainSingleOptions opts;
     opts.num_threads = nthreads;
diff --git a/setup.cfg b/setup.cfg
index 79034a0..2b8e9cc 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -50,7 +50,7 @@ python_requires = >=3.9
 install_requires =
     importlib-metadata; python_version<"3.8"
     mattress>=0.3.1
-    knncolle
+    knncolle>=0.1.1
     delayedarray
     biocframe>=0.5.0
     summarizedexperiment>=0.4.0

From a6017bf4d4b62f813b31b92c535995d89f8db250 Mon Sep 17 00:00:00 2001
From: LTLA
Date: Fri, 13 Dec 2024 15:04:49 -0800
Subject: [PATCH 24/29] Turned out that we do, in fact, need mattress for the installation.
---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 525baed..cf9b812 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [build-system]
 # AVOID CHANGING REQUIRES: IT WILL BE UPDATED BY PYSCAFFOLD!
-requires = ["setuptools>=46.1.0", "setuptools_scm[toml]>=5", "pybind11", "assorthead>=0.1.0"]
+requires = ["setuptools>=46.1.0", "setuptools_scm[toml]>=5", "pybind11", "assorthead>=0.1.0", "mattress>=0.3.1", "knncolle>=0.1.1"]
 build-backend = "setuptools.build_meta"
 
 [tool.setuptools_scm]

From 7db04857f15508a8dcef8dae1719ec5f905e4596 Mon Sep 17 00:00:00 2001
From: LTLA
Date: Fri, 13 Dec 2024 15:16:56 -0800
Subject: [PATCH 25/29] Updated CHANGELOG.
---
 CHANGELOG.md | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 0b850d3..90322e9 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,10 @@
 # Changelog
 
+## Version 0.5.0
+
+- Switch to using **pybind11** for the Python/C++ interface.
+- Update to the latest **singlepp** C++ library.
+
 ## Version 0.4.0
 
 - chore: Remove Python 3.8 (EOL).

From 6d751235ffb2f44770227b7cbdf7c02055371217 Mon Sep 17 00:00:00 2001
From: LTLA
Date: Fri, 13 Dec 2024 16:07:14 -0800
Subject: [PATCH 26/29] Cache gypsum assets to avoid re-pinging the API.
---
 .github/workflows/pypi-test.yml | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/.github/workflows/pypi-test.yml b/.github/workflows/pypi-test.yml
index ca0b9ab..8d609c3 100644
--- a/.github/workflows/pypi-test.yml
+++ b/.github/workflows/pypi-test.yml
@@ -22,6 +22,15 @@ jobs:
         with:
           python-version: ${{ matrix.python-version }}
           cache: "pip"
+
+      - name: Specify gypsum cache
+        run: echo "GYPSUM_CACHE_DIR=$(pwd)/.gypsum_cache" >> $GITHUB_ENV
+
+      - name: Cache gypsum assets
+        uses: actions/cache@v4
+        with:
+          path: ${{ env.GYPSUM_CACHE_DIR }}
+          key: gypsum-cache
 
       - name: Get latest CMake
         uses: lukka/get-cmake@latest

From 1ff1d7e5b7f765e6650d3949b1ea810f6099ac61 Mon Sep 17 00:00:00 2001
From: LTLA
Date: Fri, 13 Dec 2024 16:19:16 -0800
Subject: [PATCH 27/29] Update the ini file.
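
tox builds an isolated environment for the test run and only forwards
environment variables that are explicitly whitelisted via passenv. Without
this entry, the GYPSUM_CACHE_DIR exported in the workflow (see the previous
two commits) never reaches the pytest process, so the gypsum cache set up
there would be silently ignored.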
---
 tox.ini | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tox.ini b/tox.ini
index 69f8159..d5228c1 100644
--- a/tox.ini
+++ b/tox.ini
@@ -15,6 +15,7 @@ setenv =
 passenv =
     HOME
     SETUPTOOLS_*
+    GYPSUM_CACHE_DIR
 extras =
     testing
 commands =

From 3a9204f9770a6d129d27c17f88048b4f0460e4ec Mon Sep 17 00:00:00 2001
From: LTLA
Date: Fri, 13 Dec 2024 16:23:25 -0800
Subject: [PATCH 28/29] Finally get this thing to work.
---
 .github/workflows/pypi-test.yml | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/pypi-test.yml b/.github/workflows/pypi-test.yml
index 8d609c3..53e26cb 100644
--- a/.github/workflows/pypi-test.yml
+++ b/.github/workflows/pypi-test.yml
@@ -24,7 +24,10 @@ jobs:
           cache: "pip"
 
       - name: Specify gypsum cache
-        run: echo "GYPSUM_CACHE_DIR=$(pwd)/.gypsum_cache" >> $GITHUB_ENV
+        run: |
+          LOCATION=$(pwd)/.gypsum_cache
+          mkdir -p ${LOCATION} # TODO: move to gypsum_client
+          echo "GYPSUM_CACHE_DIR=${LOCATION}" >> $GITHUB_ENV
 
       - name: Cache gypsum assets
         uses: actions/cache@v4

From 64aeafcf2420e502487cdbd5dfb62760470ef3c6 Mon Sep 17 00:00:00 2001
From: LTLA
Date: Fri, 13 Dec 2024 16:44:14 -0800
Subject: [PATCH 29/29] Check that the correct test dataset is being used for each trained reference.
---
 src/singler/classify_integrated.py |  4 ++++
 src/singler/classify_single.py     |  4 ++++
 src/singler/train_integrated.py    |  8 +++++---
 src/singler/train_single.py        | 11 ++++++++---
 4 files changed, 21 insertions(+), 6 deletions(-)

diff --git a/src/singler/classify_integrated.py b/src/singler/classify_integrated.py
index 0ee2009..8bb91d3 100644
--- a/src/singler/classify_integrated.py
+++ b/src/singler/classify_integrated.py
@@ -76,6 +76,10 @@ def classify_integrated(
     """
     if isinstance(test_data, summarizedexperiment.SummarizedExperiment):
         test_data = test_data.assay(assay_type)
+
+    if test_data.shape[0] != integrated_prebuilt._test_num_features: # TODO: move to singlepp.
+        raise ValueError("number of rows in 'test_data' is not consistent with 'test_features=' used to create 'integrated_prebuilt'")
+
     ref_labs = integrated_prebuilt.reference_labels
 
     # Applying the sanity checks.
diff --git a/src/singler/classify_single.py b/src/singler/classify_single.py
index aacbe51..6c96f5f 100644
--- a/src/singler/classify_single.py
+++ b/src/singler/classify_single.py
@@ -65,6 +65,10 @@ def classify_single(
     """
     if isinstance(test_data, summarizedexperiment.SummarizedExperiment):
         test_data = test_data.assay(assay_type)
+
+    if test_data.shape[0] != ref_prebuilt._test_num_features: # TODO: move to singlepp
+        raise ValueError("number of rows in 'test_data' is not consistent with 'test_features=' used to create 'ref_prebuilt'")
+
     test_ptr = mattress.initialize(test_data)
 
     best, raw_scores, delta = lib.classify_single(
diff --git a/src/singler/train_integrated.py b/src/singler/train_integrated.py
index 600e1b0..9511618 100644
--- a/src/singler/train_integrated.py
+++ b/src/singler/train_integrated.py
@@ -14,13 +14,14 @@ class TrainedIntegratedReferences:
     """Object containing integrated references, typically constructed by
     :py:meth:`~singler.train_integrated.train_integrated`."""
 
-    def __init__(self, ptr, ref_names, ref_labels):
+    def __init__(self, ptr: int, ref_names: Optional[Sequence], ref_labels: list, test_num_features: int):
         self._ptr = ptr
         self._names = ref_names
         self._labels = ref_labels
+        self._test_num_features = test_num_features # TODO: move to singlepp.
 
     @property
-    def reference_names(self) -> Union[Sequence[str], None]:
+    def reference_names(self) -> Union[Sequence, None]:
         """Sequence containing the names of the references. Alternatively
         ``None``, if no names were supplied."""
         return self._names
@@ -98,5 +99,6 @@ def train_integrated(
     return TrainedIntegratedReferences(
         ptr=ibuilt,
         ref_names=ref_names,
-        ref_labels=[x.labels for x in ref_prebuilt]
+        ref_labels=[x.labels for x in ref_prebuilt],
+        test_num_features = len(test_features),
     )
diff --git a/src/singler/train_single.py b/src/singler/train_single.py
index 449e449..cec52e8 100644
--- a/src/singler/train_single.py
+++ b/src/singler/train_single.py
@@ -20,12 +20,13 @@ class TrainedSingleReference:
 
     def __init__(
         self,
-        ptr,
-        full_data,
+        ptr: int,
+        full_data: Any,
         full_label_codes: numpy.ndarray,
         labels: Sequence,
         features: Sequence,
-        markers: dict[Any, dict[Any, Sequence]]
+        markers: dict[Any, dict[Any, Sequence]],
+        test_num_features: int,
     ):
         self._ptr = ptr
         self._full_data = full_data
@@ -33,6 +34,7 @@ def __init__(
         self._features = features
         self._labels = labels
         self._markers = markers
+        self._test_num_features = test_num_features # TODO: move to singlepp.
 
     def num_markers(self) -> int:
         """
@@ -233,10 +235,12 @@ def train_single(
     if test_features is None:
         test_features_idx = numpy.array(range(len(ref_features)), dtype=numpy.uint32)
         ref_features_idx = numpy.array(range(len(ref_features)), dtype=numpy.uint32)
+        test_num_features = len(ref_features)
    else:
         common_features = _stable_intersect(test_features, ref_features)
         test_features_idx = biocutils.match(common_features, test_features, dtype=numpy.uint32)
         ref_features_idx = biocutils.match(common_features, ref_features, dtype=numpy.uint32)
+        test_num_features = len(test_features)
 
     ref_ptr = mattress.initialize(ref_data)
     builder, _ = knncolle.define_builder(nn_parameters)
@@ -255,6 +259,7 @@ def train_single(
         labels = unique_labels,
         features = ref_features,
         markers = markers,
+        test_num_features = test_num_features,
     )
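
For reference, a rough sketch of what the PATCH 29 check catches from the
user's side. The ref_data, ref_features and test_features keywords appear in
the diffs above; ref_labels is assumed from the same naming scheme, the
shapes, labels and gene names are made up, and the real signatures take
further optional arguments:

    import numpy
    import singler

    ref = numpy.random.rand(1000, 20)    # 1000 genes x 20 reference cells
    test = numpy.random.rand(800, 50)    # 800 genes x 50 test cells

    trained = singler.train_single(
        ref_data=ref,
        ref_labels=["A", "B"] * 10,      # one label per reference column
        ref_features=["gene" + str(i) for i in range(1000)],
        test_features=["gene" + str(i) for i in range(800)],
    )

    singler.classify_single(test, trained)        # row count matches, runs fine
    singler.classify_single(test[:500], trained)  # raises the new ValueError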