diff --git a/.github/workflows/check-install.yaml b/.github/workflows/check-install.yaml
new file mode 100644
index 0000000..114513d
--- /dev/null
+++ b/.github/workflows/check-install.yaml
@@ -0,0 +1,36 @@
+on:
+  push:
+    branches:
+      - master
+  pull_request:
+
+name: Check CMake install
+
+jobs:
+  install:
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v4
+
+    - name: Get latest CMake
+      uses: lukka/get-cmake@latest
+
+    - name: Configure the build
+      run: cmake -S . -B build -DSCRAN_PCA_TESTS=OFF
+
+    - name: Install the library
+      run: sudo cmake --install build
+
+    - name: Test downstream usage
+      run: |
+        mkdir _downstream
+        touch _downstream/source.cpp
+        cat << EOF > _downstream/CMakeLists.txt
+        cmake_minimum_required(VERSION 3.24)
+        project(test_install)
+        add_executable(whee source.cpp)
+        find_package(libscran_scran_pca)
+        target_link_libraries(whee libscran::scran_pca)
+        EOF
+        cd _downstream && cmake -S . -B build
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 736ee34..d950acc 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,20 +1,77 @@
 cmake_minimum_required(VERSION 3.14)
 
-project(scran_principal_component_analysis
+project(scran_pca
     VERSION 1.0.0
     DESCRIPTION "Principal components analysis for single-cell data"
     LANGUAGES CXX)
 
-add_library(scran_principal_component_analysis INTERFACE)
-target_compile_features(scran_principal_component_analysis INTERFACE cxx_std_17)
-target_include_directories(scran_principal_component_analysis INTERFACE include/scran)
+include(GNUInstallDirs)
+include(CMakePackageConfigHelpers)
 
-add_subdirectory(extern)
-target_link_libraries(scran_principal_component_analysis INTERFACE scran_core_utils tatami::tatami tatami::tatami_stats ltla::irlba Eigen3::Eigen)
+# Library
+add_library(scran_pca INTERFACE)
+add_library(libscran::scran_pca ALIAS scran_pca)
+target_include_directories(scran_pca INTERFACE
+    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
+    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/scran_pca>)
+target_compile_features(scran_pca INTERFACE cxx_std_17)
+
+# Dependencies
+option(SCRAN_PCA_FETCH_EXTERN "Automatically fetch scran_pca's external dependencies." ON)
+if(SCRAN_PCA_FETCH_EXTERN)
+    add_subdirectory(extern)
+else()
+    find_package(tatami_tatami 3.0.0 CONFIG REQUIRED)
+    find_package(tatami_tatami_stats 1.0.0 CONFIG REQUIRED)
+    find_package(libscran_scran_blocks 1.0.0 CONFIG REQUIRED)
+    find_package(ltla_irlba 2.0.0 CONFIG REQUIRED)
+    find_package(Eigen3 3.4.0 REQUIRED NO_MODULE)
+endif()
+
+target_link_libraries(
+    scran_pca INTERFACE
+    tatami::tatami
+    tatami::tatami_stats
+    libscran::scran_blocks
+    ltla::irlba
+    Eigen3::Eigen
+)
+
+# Tests
 if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME)
+    option(SCRAN_PCA_TESTS "Build scran_pca's test suite." ON)
+else()
+    option(SCRAN_PCA_TESTS "Build scran_pca's test suite." OFF)
+endif()
+
+if(SCRAN_PCA_TESTS)
     include(CTest)
     if(BUILD_TESTING)
         add_subdirectory(tests)
-    endif() 
+    endif()
 endif()
+
+# Install
+install(DIRECTORY include/
+    DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/scran_pca)
+
+install(TARGETS scran_pca
+    EXPORT scran_pcaTargets)
+
+install(EXPORT scran_pcaTargets
+    FILE libscran_scran_pcaTargets.cmake
+    NAMESPACE libscran::
+    DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/libscran_scran_pca)
+
+configure_package_config_file(${CMAKE_CURRENT_SOURCE_DIR}/cmake/Config.cmake.in
+    "${CMAKE_CURRENT_BINARY_DIR}/libscran_scran_pcaConfig.cmake"
+    INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/libscran_scran_pca)
+
+write_basic_package_version_file(
+    "${CMAKE_CURRENT_BINARY_DIR}/libscran_scran_pcaConfigVersion.cmake"
+    COMPATIBILITY SameMajorVersion)
+
+install(FILES "${CMAKE_CURRENT_BINARY_DIR}/libscran_scran_pcaConfig.cmake"
+    "${CMAKE_CURRENT_BINARY_DIR}/libscran_scran_pcaConfigVersion.cmake"
+    DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/libscran_scran_pca)
diff --git a/README.md b/README.md
index 36c8ae0..deb1dfa 100644
--- a/README.md
+++ b/README.md
@@ -1,8 +1,8 @@
 # Principal components analysis, duh
 
-![Unit tests](https://github.com/libscran/principal_component_analysis/actions/workflows/run-tests.yaml/badge.svg)
-![Documentation](https://github.com/libscran/principal_component_analysis/actions/workflows/doxygenate.yaml/badge.svg)
-[![Codecov](https://codecov.io/gh/libscran/principal_component_analysis/graph/badge.svg?token=qklLZtJSE9)](https://codecov.io/gh/libscran/principal_component_analysis)
+![Unit tests](https://github.com/libscran/scran_pca/actions/workflows/run-tests.yaml/badge.svg)
+![Documentation](https://github.com/libscran/scran_pca/actions/workflows/doxygenate.yaml/badge.svg)
+[![Codecov](https://codecov.io/gh/libscran/scran_pca/graph/badge.svg?token=qklLZtJSE9)](https://codecov.io/gh/libscran/scran_pca)
 
 ## Overview
@@ -16,13 +16,13 @@ factored out into a separate C++ library for easier re-use.
-Given a [`tatami::Matrix`](https://github.com/tatami-inc/tatami), the `simple_pca::compute()` function will compute the PCA to obtain a low-dimensional representation of the cells:
+Given a [`tatami::Matrix`](https://github.com/tatami-inc/tatami), the `simple_pca()` function will compute the PCA to obtain a low-dimensional representation of the cells:
 
 ```cpp
-#include "scran/simple_pca.hpp"
+#include "scran_pca/scran_pca.hpp"
 
-tatami::Matrix* ptr = some_data_source();
+const tatami::Matrix<double, int>& mat = some_data_source();
 
 // Take the top 20 PCs:
-scran::simple_pca::Options opt;
-auto res = scran::simple_pca::compute(ptr, 20, opt);
+scran_pca::SimplePcaOptions opt;
+auto res = scran_pca::simple_pca(mat, 20, opt);
 
 res.components; // rows are PCs, columns are cells.
 res.rotation; // rows are genes, columns correspond to PCs.
@@ -45,8 +45,8 @@ This ensures that the inter-block differences do not contribute to the first few
 ```cpp
 std::vector<int> blocks = some_blocks();
 
-scran::blocked_pca::Options bopt;
-auto bres = scran::blocked_pca::compute(ptr, blocks.data(), 20, bopt);
+scran_pca::BlockedPcaOptions bopt;
+auto bres = scran_pca::blocked_pca(mat, blocks.data(), 20, bopt);
 
 bres.components; // rows are PCs, columns are cells.
 bres.center; // rows are blocks, columns are genes.
@@ -59,31 +59,59 @@ we can use `blocked_pca::compute()` to obtain an appropriate matrix that focuses
 ```cpp
 bopt.components_from_residuals = false;
-auto bres2 = scran::blocked_pca::compute(ptr, blocks.data(), 20, bopt);
+auto bres2 = scran_pca::blocked_pca(mat, blocks.data(), 20, bopt);
 ```
 
-Check out the [reference documentation](https://libscran.github.io/principal_component_analysis) for more details.
+Check out the [reference documentation](https://libscran.github.io/scran_pca) for more details.
 
 ## Building projects
 
-This repository is part of the broader [**libscran**](https://github.com/libscran/libscran) library,
-so users are recommended to use the latter in their projects.
-**libscran** developers should just use CMake with `FetchContent`:
+### CMake with `FetchContent`
+
+If you're using CMake, you just need to add something like this to your `CMakeLists.txt`:
 
 ```cmake
 include(FetchContent)
 
 FetchContent_Declare(
-  scran_principal_component_analysis
-  GIT_REPOSITORY https://github.com/libscran/principal_component_analysis
+  scran_pca
+  GIT_REPOSITORY https://github.com/libscran/scran_pca
   GIT_TAG master # or any version of interest
 )
 
-FetchContent_MakeAvailable(scran_principal_component_analysis)
+FetchContent_MakeAvailable(scran_pca)
+```
+
+Then you can link to **scran_pca** to make the headers available during compilation:
+```cmake
 # For executables:
-target_link_libraries(myexe scran_principal_component_analysis)
+target_link_libraries(myexe libscran::scran_pca)
 
-# For libaries
-target_link_libraries(mylib INTERFACE scran_principal_component_analysis)
+# For libraries
+target_link_libraries(mylib INTERFACE libscran::scran_pca)
+```
+
+### CMake with `find_package()`
+
+```cmake
+find_package(libscran_scran_pca CONFIG REQUIRED)
+target_link_libraries(mylib INTERFACE libscran::scran_pca)
+```
+
+To install the library, use:
+
+```sh
+mkdir build && cd build
+cmake .. -DSCRAN_PCA_TESTS=OFF
+cmake --build . --target install
 ```
 
+By default, this will use `FetchContent` to fetch all external dependencies.
+If you want to install them manually, use `-DSCRAN_PCA_FETCH_EXTERN=OFF`.
+See the tags in [`extern/CMakeLists.txt`](extern/CMakeLists.txt) to find compatible versions of each dependency.
+
+### Manual
+
+If you're not using CMake, the simple approach is to just copy the files in `include/` - either directly or with Git submodules - and include their path during compilation with, e.g., GCC's `-I`.
+This requires the external dependencies listed in [`extern/CMakeLists.txt`](extern/CMakeLists.txt), which also need to be made available during compilation.
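+For example, the compiler invocation might look something like the following sketch, where the `-I` paths and `my_analysis.cpp` are just placeholders for wherever each dependency's headers and your own source actually live:
+
+```sh
+# Illustrative only: point each -I flag at the real location of that dependency's headers.
+g++ -std=c++17 -O2 \
+    -I/path/to/scran_pca/include \
+    -I/path/to/tatami/include \
+    -I/path/to/tatami_stats/include \
+    -I/path/to/scran_blocks/include \
+    -I/path/to/irlba/include \
+    -I/path/to/eigen \
+    -c my_analysis.cpp
+```
+
+No extra linker flags should be needed, as all of these libraries are header-only.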
diff --git a/cmake/Config.cmake.in b/cmake/Config.cmake.in new file mode 100644 index 0000000..5e2e334 --- /dev/null +++ b/cmake/Config.cmake.in @@ -0,0 +1,10 @@ +@PACKAGE_INIT@ + +include(CMakeFindDependencyMacro) +find_dependency(tatami_tatami 3.0.0 CONFIG REQUIRED) +find_dependency(tatami_tatami_stats 1.0.0 CONFIG REQUIRED) +find_dependency(libscran_scran_blocks 1.0.0 CONFIG REQUIRED) +find_dependency(ltla_irlba 2.0.0 CONFIG REQUIRED) +find_dependency(Eigen3 3.4.0 REQUIRED NO_MODULE) + +include("${CMAKE_CURRENT_LIST_DIR}/libscran_scran_pcaTargets.cmake") diff --git a/extern/CMakeLists.txt b/extern/CMakeLists.txt index 97c8f7f..33aee6d 100644 --- a/extern/CMakeLists.txt +++ b/extern/CMakeLists.txt @@ -25,13 +25,13 @@ FetchContent_Declare( ) FetchContent_Declare( - scran_core_utils - GIT_REPOSITORY https://github.com/libscran/core_utils - GIT_TAG master + scran_blocks + GIT_REPOSITORY https://github.com/libscran/scran_blocks + GIT_TAG master # ^1.0.0 ) FetchContent_MakeAvailable(tatami) FetchContent_MakeAvailable(tatami_stats) FetchContent_MakeAvailable(irlba) FetchContent_MakeAvailable(eigen) -FetchContent_MakeAvailable(scran_core_utils) +FetchContent_MakeAvailable(scran_blocks) diff --git a/include/scran/scran.hpp b/include/scran/scran.hpp deleted file mode 100644 index e71314e..0000000 --- a/include/scran/scran.hpp +++ /dev/null @@ -1,15 +0,0 @@ -#ifndef SCRAN_SCRAN_HPP -#define SCRAN_SCRAN_HPP - -/** - * @file scran.hpp - * @brief Methods for single-cell analysis. - */ - -/** - * @namespace scran - * @brief Methods for single-cell analysis. - */ -namespace scran {} - -#endif diff --git a/include/scran/blocked_pca.hpp b/include/scran_pca/blocked_pca.hpp similarity index 83% rename from include/scran/blocked_pca.hpp rename to include/scran_pca/blocked_pca.hpp index c9a5baa..5528bf3 100644 --- a/include/scran/blocked_pca.hpp +++ b/include/scran_pca/blocked_pca.hpp @@ -1,5 +1,5 @@ -#ifndef SCRAN_BLOCKED_PCA_HPP -#define SCRAN_BLOCKED_PCA_HPP +#ifndef SCRAN_PCA_BLOCKED_PCA_HPP +#define SCRAN_PCA_BLOCKED_PCA_HPP #include "tatami/tatami.hpp" #include "irlba/irlba.hpp" @@ -9,8 +9,8 @@ #include #include -#include "block_weights.hpp" -#include "pca_utils.hpp" +#include "scran_blocks/scran_blocks.hpp" +#include "utils.hpp" /** * @file blocked_pca.hpp @@ -18,47 +18,16 @@ * @brief Perform PCA on residuals after regressing out a blocking factor. */ -namespace scran { +namespace scran_pca { /** - * @namespace scran::blocked_pca - * @brief Perform PCA on residuals after regressing out a blocking factor. - * - * In the presence of a blocking factor (e.g., batches, samples), we want to ensure that the PCA is not driven by uninteresting differences between blocks. - * To achieve this, `blocked_pca::compute()` centers the expression of each gene in each blocking level and uses the residuals for PCA. - * The gene-gene covariance matrix will thus focus on variation within each batch, - * ensuring that the top rotation vectors/principal components capture biological heterogeneity instead of inter-block differences. - * Internally, `blocked_pca::compute()` defers the residual calculation until the matrix multiplication steps within [IRLBA](https://github.com/LTLA/CppIrlba). - * This yields the same results as the naive calculation of residuals but is much faster as it can take advantage of efficient sparse operations. - * - * By default, the principal components are computed from the (conceptual) matrix of residuals. 
- * This yields a low-dimensional space where all inter-block differences have been removed, - * assuming that all blocks have the same composition and the inter-block differences are consistent for all cell subpopulations. - * Under these assumptions, we could use these components for downstream analysis without any concern for block-wise effects. - * In practice, these assumptions do not hold and more sophisticated batch correction methods like [MNN correction](https://github.com/LTLA/CppMnnCorrect) are required. - * Some of these methods accept a low-dimensional embedding of cells as input, which can be created by `blocked_pca::compute()` with `Options::components_from_residuals = false`. - * In this mode, only the rotation vectors are computed from the residuals. - * The original expression values for each cell are then projected onto the associated subspace to obtain PC coordinates that can be used for further batch correction. - * This approach aims to preserve the benefits of blocking to focus on intra-block biology instead of inter-block differences, - * without making strong assumptions about the nature of those differences. - * - * If one batch has many more cells than the others, it will dominate the PCA by driving the axes of maximum variance. - * This may mask interesting aspects of variation in the smaller batches. - * To mitigate this, we scale each batch in inverse proportion to its size (see `Options::block_weight_policy`). - * This ensures that each batch contributes equally to the (conceptual) gene-gene covariance matrix and thus the rotation vectors. - * The vector of residuals for each cell (or the original expression values, if `Options::components_from_residuals = false`) - * is then projected to the subspace defined by these rotation vectors to obtain that cell's PC coordinates. - */ -namespace blocked_pca { - -/** - * @brief Options for `compute()`. + * @brief BlockedPcaOptions for `blocked_pca()`. */ -struct Options { +struct BlockedPcaOptions { /** * @cond */ - Options() { + BlockedPcaOptions() { irlba_options.cap_number = true; } /** @@ -80,13 +49,13 @@ struct Options { /** * Policy to use for weighting batches of different size. */ - block_weights::Policy block_weight_policy = block_weights::Policy::VARIABLE; + scran_blocks::WeightPolicy block_weight_policy = scran_blocks::WeightPolicy::VARIABLE; /** * Parameters for the variable block weights. - * Only used when `Options::block_weight_policy = block_weights::Policy::VARIABLE`. + * Only used when `BlockedPcaOptions::block_weight_policy = scran_blocks::WeightPolicy::VARIABLE`. */ - block_weights::VariableParameters variable_block_weight_parameters; + scran_blocks::VariableWeightParameters variable_block_weight_parameters; /** * Compute the principal components from the residuals. @@ -138,12 +107,12 @@ template BlockingDetails compute_blocking_details( Index_ ncells, const Block_* block, - block_weights::Policy block_weight_policy, - const block_weights::VariableParameters& variable_block_weight_parameters) + scran_blocks::WeightPolicy block_weight_policy, + const scran_blocks::VariableWeightParameters& variable_block_weight_parameters) { BlockingDetails output; output.block_size = tatami_stats::tabulate_groups(block, ncells); - if (block_weight_policy == block_weights::Policy::NONE) { + if (block_weight_policy == scran_blocks::WeightPolicy::NONE) { return output; } @@ -162,8 +131,8 @@ BlockingDetails compute_blocking_details( // 'compute_blockwise_mean_and_variance*()' functions. 
if (bsize) { typename EigenVector_::Scalar block_weight = 1; - if (block_weight_policy == block_weights::Policy::VARIABLE) { - block_weight = block_weights::compute_variable(bsize, variable_block_weight_parameters); + if (block_weight_policy == scran_blocks::WeightPolicy::VARIABLE) { + block_weight = scran_blocks::compute_variable_weight(bsize, variable_block_weight_parameters); } element_weight[i] = block_weight / bsize; @@ -198,7 +167,7 @@ BlockingDetails compute_blocking_details( *****************************************************************/ template -void compute_sparse_mean_and_variance( +void compute_sparse_mean_and_variance_blocked( Num_ num_nonzero, const Value_* values, const Index_* indices, @@ -257,12 +226,12 @@ void compute_sparse_mean_and_variance( // COMMENT ON DENOMINATOR: // If we're not dealing with weights, we compute the actual sample // variance for easy interpretation (and to match up with the - // per-PC calculations in pca_utils::clean_up). + // per-PC calculations in internal::clean_up). // // If we're dealing with weights, the concept of the sample variance // becomes somewhat weird, but we just use the same denominator for // consistency in clean_up_projected. Magnitude doesn't matter when - // scaling for pca_utils::process_scale_vector anyway. + // scaling for internal::process_scale_vector anyway. variance /= num_all - 1; } @@ -288,7 +257,7 @@ void compute_blockwise_mean_and_variance_realized_sparse( for (size_t c = start, end = start + length; c < end; ++c, mptr += nblocks) { auto offset = ptrs[c]; - compute_sparse_mean_and_variance( + compute_sparse_mean_and_variance_blocked( ptrs[c + 1] - offset, values.data() + offset, indices.data() + offset, @@ -304,7 +273,7 @@ void compute_blockwise_mean_and_variance_realized_sparse( } template -void compute_dense_mean_and_variance( +void compute_dense_mean_and_variance_blocked( Num_ number, const Value_* values, const Block_* block, @@ -363,14 +332,14 @@ void compute_blockwise_mean_and_variance_realized_dense( auto mptr = centers.data() + static_cast(start) * nblocks; // cast to avoid overflow. for (size_t c = start, end = start + length; c < end; ++c, ptr += ncells, mptr += nblocks) { - compute_dense_mean_and_variance(ncells, ptr, block, block_details, mptr, variances[c]); + compute_dense_mean_and_variance_blocked(ncells, ptr, block, block_details, mptr, variances[c]); } }, emat.cols(), nthreads); } template void compute_blockwise_mean_and_variance_tatami( - const tatami::Matrix* mat, // this should have genes in the rows! + const tatami::Matrix& mat, // this should have genes in the rows! const Block_* block, const BlockingDetails& block_details, EigenMatrix_& centers, @@ -379,10 +348,10 @@ void compute_blockwise_mean_and_variance_tatami( { const auto& block_size = block_details.block_size; size_t nblocks = block_size.size(); - Index_ ngenes = mat->nrow(); - Index_ ncells = mat->ncol(); + Index_ ngenes = mat.nrow(); + Index_ ncells = mat.ncol(); - if (mat->prefer_rows()) { + if (mat.prefer_rows()) { tatami::parallelize([&](size_t, Index_ start, Index_ length) -> void { static_assert(!EigenMatrix_::IsRowMajor); auto mptr = centers.data() + static_cast(start) * nblocks; // cast to avoid overflow. 
@@ -390,18 +359,18 @@ void compute_blockwise_mean_and_variance_tatami( std::vector vbuffer(ncells); - if (mat->is_sparse()) { + if (mat.is_sparse()) { std::vector ibuffer(ncells); - auto ext = tatami::consecutive_extractor(mat, true, start, length); + auto ext = tatami::consecutive_extractor(&mat, true, start, length); for (Index_ r = start, end = start + length; r < end; ++r, mptr += nblocks) { auto range = ext->fetch(vbuffer.data(), ibuffer.data()); - compute_sparse_mean_and_variance(range.number, range.value, range.index, block, block_details, mptr, variances[r], block_copy, ncells); + compute_sparse_mean_and_variance_blocked(range.number, range.value, range.index, block, block_details, mptr, variances[r], block_copy, ncells); } } else { - auto ext = tatami::consecutive_extractor(mat, true, start, length); + auto ext = tatami::consecutive_extractor(&mat, true, start, length); for (Index_ r = start, end = start + length; r < end; ++r, mptr += nblocks) { auto ptr = ext->fetch(vbuffer.data()); - compute_dense_mean_and_variance(ncells, ptr, block, block_details, mptr, variances[r]); + compute_dense_mean_and_variance_blocked(ncells, ptr, block, block_details, mptr, variances[r]); } } }, ngenes, nthreads); @@ -433,7 +402,7 @@ void compute_blockwise_mean_and_variance_tatami( std::vector vbuffer(length); - if (mat->is_sparse()) { + if (mat.is_sparse()) { std::vector > running; running.reserve(nblocks); for (size_t b = 0; b < nblocks; ++b) { @@ -441,7 +410,7 @@ void compute_blockwise_mean_and_variance_tatami( } std::vector ibuffer(length); - auto ext = tatami::consecutive_extractor(mat, false, static_cast(0), ncells, start, length); + auto ext = tatami::consecutive_extractor(&mat, false, static_cast(0), ncells, start, length); for (Index_ c = 0; c < ncells; ++c) { auto range = ext->fetch(vbuffer.data(), ibuffer.data()); running[block[c]].add(range.value, range.index, range.number); @@ -458,7 +427,7 @@ void compute_blockwise_mean_and_variance_tatami( running.emplace_back(length, re_centers[b].data(), re_variances[b].data(), /* skip_nan = */ false); } - auto ext = tatami::consecutive_extractor(mat, false, static_cast(0), ncells, start, length); + auto ext = tatami::consecutive_extractor(&mat, false, static_cast(0), ncells, start, length); for (Index_ c = 0; c < ncells; ++c) { auto ptr = ext->fetch(vbuffer.data()); running[block[c]].add(ptr); @@ -563,18 +532,18 @@ inline void project_matrix_realized_sparse( template inline void project_matrix_transposed_tatami( - const tatami::Matrix* mat, // genes in rows, cells in columns + const tatami::Matrix& mat, // genes in rows, cells in columns EigenMatrix_& components, const EigenMatrix_& scaled_rotation, // genes in rows, dims in columns int nthreads) { size_t rank = scaled_rotation.cols(); - auto ngenes = mat->nrow(), ncells = mat->ncol(); + auto ngenes = mat.nrow(), ncells = mat.ncol(); typedef typename EigenMatrix_::Scalar Scalar; components.resize(rank, ncells); // used a transposed version for more cache efficiency. 
- if (mat->prefer_rows()) { + if (mat.prefer_rows()) { tatami::parallelize([&](size_t, Index_ start, Index_ length) -> void { static_assert(!EigenMatrix_::IsRowMajor); auto vptr = scaled_rotation.data(); @@ -586,9 +555,9 @@ inline void project_matrix_transposed_tatami( local_buffers.emplace_back(length); } - if (mat->is_sparse()) { + if (mat.is_sparse()) { std::vector ibuffer(length); - auto ext = tatami::consecutive_extractor(mat, true, static_cast(0), ngenes, start, length); + auto ext = tatami::consecutive_extractor(&mat, true, static_cast(0), ngenes, start, length); for (Index_ g = 0; g < ngenes; ++g) { auto range = ext->fetch(vbuffer.data(), ibuffer.data()); for (size_t r = 0; r < rank; ++r) { @@ -601,7 +570,7 @@ inline void project_matrix_transposed_tatami( } } else { - auto ext = tatami::consecutive_extractor(mat, true, static_cast(0), ngenes, start, length); + auto ext = tatami::consecutive_extractor(&mat, true, static_cast(0), ngenes, start, length); for (Index_ g = 0; g < ngenes; ++g) { auto ptr = ext->fetch(vbuffer.data()); for (size_t r = 0; r < rank; ++r) { @@ -628,9 +597,9 @@ inline void project_matrix_transposed_tatami( static_assert(!EigenMatrix_::IsRowMajor); auto optr = components.data() + static_cast(start) * rank; // cast to avoid overflow. - if (mat->is_sparse()) { + if (mat.is_sparse()) { std::vector ibuffer(ngenes); - auto ext = tatami::consecutive_extractor(mat, false, start, length); + auto ext = tatami::consecutive_extractor(&mat, false, start, length); for (Index_ c = start, end = start + length; c < end; ++c, optr += rank) { auto range = ext->fetch(vbuffer.data(), ibuffer.data()); @@ -646,7 +615,7 @@ inline void project_matrix_transposed_tatami( } } else { - auto ext = tatami::consecutive_extractor(mat, false, start, length); + auto ext = tatami::consecutive_extractor(&mat, false, start, length); for (Index_ c = start, end = start + length; c < end; ++c, optr += rank) { auto ptr = ext->fetch(vbuffer.data()); static_assert(!EigenMatrix_::IsRowMajor); @@ -776,12 +745,12 @@ class ResidualWrapper { **************************/ template -void run_general( - const tatami::Matrix* mat, +void run_blocked( + const tatami::Matrix& mat, const Block_* block, const BlockingDetails& block_details, int rank, - const Options& options, + const BlockedPcaOptions& options, EigenMatrix_& components, EigenMatrix_& rotation, EigenVector_& variance_explained, @@ -789,22 +758,22 @@ void run_general( EigenVector_& scale_v, typename EigenVector_::Scalar& total_var) { - Index_ ngenes = mat->nrow(), ncells = mat->ncol(); + Index_ ngenes = mat.nrow(), ncells = mat.ncol(); auto emat = [&]{ if constexpr(!realize_matrix_) { - return pca_utils::TransposedTatamiWrapper(mat, options.num_threads); + return internal::TransposedTatamiWrapper(mat, options.num_threads); } else if constexpr(sparse_) { // 'extracted' contains row-major contents... but we implicitly transpose it to CSC with genes in columns. - auto extracted = tatami::retrieve_compressed_sparse_contents(mat, /* row = */ true, /* two_pass = */ false, /* threads = */ options.num_threads); + auto extracted = tatami::retrieve_compressed_sparse_contents(&mat, /* row = */ true, /* two_pass = */ false, /* threads = */ options.num_threads); return irlba::ParallelSparseMatrix(ncells, ngenes, std::move(extracted.value), std::move(extracted.index), std::move(extracted.pointers), true, options.num_threads); } else { // Perform an implicit transposition by performing a row-major extraction into a column-major transposed matrix. 
EigenMatrix_ emat(ncells, ngenes); static_assert(!EigenMatrix_::IsRowMajor); - tatami::convert_to_dense(mat, /* row_major = */ true, emat.data(), options.num_threads); + tatami::convert_to_dense(&mat, /* row_major = */ true, emat.data(), options.num_threads); return emat; } }(); @@ -819,7 +788,7 @@ void run_general( } else { compute_blockwise_mean_and_variance_realized_dense(emat, block, block_details, center_m, scale_v, options.num_threads); } - total_var = pca_utils::process_scale_vector(options.scale, scale_v); + total_var = internal::process_scale_vector(options.scale, scale_v); ResidualWrapper centered(emat, block, center_m); @@ -867,7 +836,7 @@ void run_general( } if (options.components_from_residuals) { - pca_utils::clean_up(mat->ncol(), components, variance_explained); + internal::clean_up(mat.ncol(), components, variance_explained); if (options.transpose) { components.adjointInPlace(); } @@ -892,54 +861,24 @@ void run_general( } } -template -void dispatch( - const tatami::Matrix* mat, - const Block_* block, - int rank, - const Options& options, - EigenMatrix_& components, - EigenMatrix_& rotation, - EigenVector_& variance_explained, - EigenMatrix_& center_m, - EigenVector_& scale_v, - double& total_var) -{ - irlba::EigenThreadScope t(options.num_threads); - auto bdetails = compute_blocking_details(mat->ncol(), block, options.block_weight_policy, options.variable_block_weight_parameters); - - if (mat->sparse()) { - if (options.realize_matrix) { - run_general(mat, block, bdetails, rank, options, components, rotation, variance_explained, center_m, scale_v, total_var); - } else { - run_general(mat, block, bdetails, rank, options, components, rotation, variance_explained, center_m, scale_v, total_var); - } - } else { - if (options.realize_matrix) { - run_general(mat, block, bdetails, rank, options, components, rotation, variance_explained, center_m, scale_v, total_var); - } else { - run_general(mat, block, bdetails, rank, options, components, rotation, variance_explained, center_m, scale_v, total_var); - } - } -} - } /** * @endcond */ /** - * @brief Results of the PCA on the residuals. + * @brief Results of `blocked_pca()`. * - * Instances should generally be constructed by the `compute()` function. + * @tparam EigenMatrix_ A floating-point `Eigen::Matrix` class. + * @tparam EigenVector_ A floating-point `Eigen::Vector` class. */ template -struct Results { +struct BlockedPcaResults { /** * Matrix of principal components. * By default, each row corresponds to a PC while each column corresponds to a cell in the input matrix. - * If `Options::transpose = false`, rows are cells instead. - * The number of PCs is determined by the `rank` used in `compute()`. + * If `BlockedPcaOptions::transpose = false`, rows are cells instead. + * The number of PCs is determined by the `rank` used in `blocked_pca()`. */ EigenMatrix_ components; @@ -950,7 +889,7 @@ struct Results { EigenVector_ variance_explained; /** - * Total variance of the dataset (possibly after scaling, if `Options::scale = true`). + * Total variance of the dataset (possibly after scaling, if `BlockedPcaOptions::scale = true`). * This can be used to divide `variance_explained` to obtain the percentage of variance explained. */ typename EigenVector_::Scalar total_variance = 0; @@ -958,7 +897,7 @@ struct Results { /** * Rotation matrix. * Each row corresponds to a gene while each column corresponds to a PC. - * The number of PCs is determined by the `rank` used in `compute()`. 
+ * The number of PCs is determined by the `rank` used in `blocked_pca()`. */ EigenMatrix_ rotation; @@ -970,14 +909,89 @@ struct Results { EigenMatrix_ center; /** - * Scaling vector, only returned if `Options::scale = true`. - * Each entry corresponds to a row in the input matrix and contains the scaling factor used to divide that gene's values if `Options::scale = true`. + * Scaling vector, only returned if `BlockedPcaOptions::scale = true`. + * Each entry corresponds to a row in the input matrix and contains the scaling factor used to divide that gene's values if `BlockedPcaOptions::scale = true`. */ EigenVector_ scale; }; /** - * Run PCA on the residuals after regressing out a blocking factor from the rows of a gene-by-cell matrix. + * In the presence of a blocking factor (e.g., batches, samples), we want to ensure that the PCA is not driven by uninteresting differences between blocks. + * To achieve this, `blocked_pca()` centers the expression of each gene in each blocking level and uses the residuals for PCA. + * The gene-gene covariance matrix will thus focus on variation within each batch, + * ensuring that the top rotation vectors/principal components capture biological heterogeneity instead of inter-block differences. + * Internally, `blocked_pca()` defers the residual calculation until the matrix multiplication steps within [IRLBA](https://github.com/LTLA/CppIrlba). + * This yields the same results as the naive calculation of residuals but is much faster as it can take advantage of efficient sparse operations. + * + * By default, the principal components are computed from the (conceptual) matrix of residuals. + * This yields a low-dimensional space where all inter-block differences have been removed, + * assuming that all blocks have the same composition and the inter-block differences are consistent for all cell subpopulations. + * Under these assumptions, we could use these components for downstream analysis without any concern for block-wise effects. + * In practice, these assumptions do not hold and more sophisticated batch correction methods like [MNN correction](https://github.com/LTLA/CppMnnCorrect) are required. + * Some of these methods accept a low-dimensional embedding of cells as input, which can be created by `blocked_pca()` with `BlockedPcaOptions::components_from_residuals = false`. + * In this mode, only the rotation vectors are computed from the residuals. + * The original expression values for each cell are then projected onto the associated subspace to obtain PC coordinates that can be used for further batch correction. + * This approach aims to preserve the benefits of blocking to focus on intra-block biology instead of inter-block differences, + * without making strong assumptions about the nature of those differences. + * + * If one batch has many more cells than the others, it will dominate the PCA by driving the axes of maximum variance. + * This may mask interesting aspects of variation in the smaller batches. + * To mitigate this, we scale each batch in inverse proportion to its size (see `BlockedPcaOptions::block_weight_policy`). + * This ensures that each batch contributes equally to the (conceptual) gene-gene covariance matrix and thus the rotation vectors. + * The vector of residuals for each cell (or the original expression values, if `BlockedPcaOptions::components_from_residuals = false`) + * is then projected to the subspace defined by these rotation vectors to obtain that cell's PC coordinates. 
+ * + * @tparam Value_ Type of the matrix data. + * @tparam Index_ Integer type for the indices. + * @tparam Block_ Integer type for the blocking factor. + * @tparam EigenMatrix_ A floating-point `Eigen::Matrix` class. + * @tparam EigenVector_ A floating-point `Eigen::Vector` class. + * + * @param[in] mat Input expression matrix. + * Columns should contain cells while rows should contain genes. + * @param[in] block Pointer to an array of length equal to the number of cells, + * containing the block assignment for each cell. + * Each assignment should be an integer in \f$[0, N)\f$ where \f$N\f$ is the number of blocks. + * @param rank Number of PCs to compute. + * This should be no greater than the maximum number of PCs, i.e., the smaller dimension of the input matrix; + * otherwise, only the maximum number of PCs will be reported in the results. + * @param options Further options. + * @param[out] output On output, the results of the PCA on the residuals. + * This can be re-used across multiple calls to `blocked_pca()`. + */ +template +void blocked_pca(const tatami::Matrix& mat, const Block_* block, int rank, const BlockedPcaOptions& options, BlockedPcaResults& output) { + irlba::EigenThreadScope t(options.num_threads); + auto bdetails = internal::compute_blocking_details(mat.ncol(), block, options.block_weight_policy, options.variable_block_weight_parameters); + + EigenMatrix_& components = output.components; + EigenMatrix_& rotation = output.rotation; + EigenVector_& variance_explained = output.variance_explained; + EigenMatrix_& center_m = output.center; + EigenVector_& scale_v = output.scale; + auto& total_var = output.total_variance; + + if (mat.sparse()) { + if (options.realize_matrix) { + internal::run_blocked(mat, block, bdetails, rank, options, components, rotation, variance_explained, center_m, scale_v, total_var); + } else { + internal::run_blocked(mat, block, bdetails, rank, options, components, rotation, variance_explained, center_m, scale_v, total_var); + } + } else { + if (options.realize_matrix) { + internal::run_blocked(mat, block, bdetails, rank, options, components, rotation, variance_explained, center_m, scale_v, total_var); + } else { + internal::run_blocked(mat, block, bdetails, rank, options, components, rotation, variance_explained, center_m, scale_v, total_var); + } + } + + if (!options.scale) { + output.scale = EigenVector_(); + } +} + +/** + * Overload of `blocked_pca()` that allocates memory for the output. * * @tparam EigenMatrix_ A floating-point `Eigen::Matrix` class. * @tparam EigenVector_ A floating-point `Eigen::Vector` class. @@ -985,7 +999,7 @@ struct Results { * @tparam Index_ Integer type for the indices. * @tparam Block_ Integer type for the blocking factor. * - * @param[in] mat Pointer to the input matrix. + * @param[in] mat Input expression matrix. * Columns should contain cells while rows should contain genes. * @param[in] block Pointer to an array of length equal to the number of cells, * containing the block assignment for each cell. @@ -995,17 +1009,15 @@ struct Results { * otherwise, only the maximum number of PCs will be reported in the results. * @param options Further options. * - * @return The results of the PCA on the residuals. + * @return Results of the PCA on the residuals. 
*/ -template -Results compute(const tatami::Matrix* mat, const Block_* block, int rank, const Options& options) { - Results output; - internal::dispatch(mat, block, rank, options, output.components, output.rotation, output.variance_explained, output.center, output.scale, output.total_variance); +template +BlockedPcaResults blocked_pca(const tatami::Matrix& mat, const Block_* block, int rank, const BlockedPcaOptions& options) { + BlockedPcaResults output; + blocked_pca(mat, block, rank, options, output); return output; } } -} - #endif diff --git a/include/scran_pca/scran_pca.hpp b/include/scran_pca/scran_pca.hpp new file mode 100644 index 0000000..aa22b3c --- /dev/null +++ b/include/scran_pca/scran_pca.hpp @@ -0,0 +1,18 @@ +#ifndef SCRAN_PCA_HPP +#define SCRAN_PCA_HPP + +#include "simple_pca.hpp" +#include "blocked_pca.hpp" + +/** + * @file scran_pca.hpp + * @brief Principal component analysis on single-cell data + */ + +/** + * @namespace scran_pca + * @brief Principal component analysis on single-cell data + */ +namespace scran_pca {} + +#endif diff --git a/include/scran/simple_pca.hpp b/include/scran_pca/simple_pca.hpp similarity index 73% rename from include/scran/simple_pca.hpp rename to include/scran_pca/simple_pca.hpp index d535294..8b19136 100644 --- a/include/scran/simple_pca.hpp +++ b/include/scran_pca/simple_pca.hpp @@ -1,5 +1,5 @@ -#ifndef SCRAN_SIMPLE_PCA_HPP -#define SCRAN_SIMPLE_PCA_HPP +#ifndef SCRAN_PCA_SIMPLE_PCA_HPP +#define SCRAN_PCA_SIMPLE_PCA_HPP #include "tatami/tatami.hpp" #include "tatami_stats/tatami_stats.hpp" @@ -10,34 +10,23 @@ #include #include -#include "pca_utils.hpp" +#include "utils.hpp" /** * @file simple_pca.hpp * @brief Perform a simple PCA on a gene-by-cell matrix. */ -namespace scran { +namespace scran_pca { /** - * @namespace scran::simple_pca - * @brief Perform a simple PCA on a gene-cell matrix. - * - * Principal components analysis (PCA) is a helpful technique for data compression and denoising. - * The idea is that the earlier PCs capture most of the systematic biological variation while the later PCs capture random technical noise. - * Thus, we can reduce the size of the data and eliminate noise by only using the earlier PCs for further analyses. - * Most practitioners will keep the first 10-50 PCs, though the exact choice is fairly arbitrary. - */ -namespace simple_pca { - -/** - * @brief Options for `compute()`. + * @brief Options for `simple_pca()`. 
*/ -struct Options { +struct SimplePcaOptions { /** * @cond */ - Options() { + SimplePcaOptions() { irlba_options.cap_number = true; } /** @@ -79,13 +68,13 @@ struct Options { namespace internal { template -void compute_row_means_and_variances(const tatami::Matrix* mat, int num_threads, EigenVector_& center_v, EigenVector_& scale_v) { - if (mat->prefer_rows()) { +void compute_row_means_and_variances(const tatami::Matrix& mat, int num_threads, EigenVector_& center_v, EigenVector_& scale_v) { + if (mat.prefer_rows()) { tatami::parallelize([&](size_t, Index_ start, Index_ length) -> void { tatami::Options opt; opt.sparse_extract_index = false; - auto ext = tatami::consecutive_extractor(mat, true, start, length, opt); - auto ncells = mat->ncol(); + auto ext = tatami::consecutive_extractor(&mat, true, start, length, opt); + auto ncells = mat.ncol(); std::vector vbuffer(ncells); for (Index_ r = start, end = start + length; r < end; ++r) { @@ -101,13 +90,13 @@ void compute_row_means_and_variances(const tatami::Matrix* mat, center_v.coeffRef(r) = results.first; scale_v.coeffRef(r) = results.second; } - }, mat->nrow(), num_threads); + }, mat.nrow(), num_threads); } else { tatami::parallelize([&](size_t t, Index_ start, Index_ length) -> void { tatami::Options opt; - auto ncells = mat->ncol(); - auto ext = tatami::consecutive_extractor(mat, false, 0, ncells, start, length, opt); + auto ncells = mat.ncol(); + auto ext = tatami::consecutive_extractor(&mat, false, 0, ncells, start, length, opt); typedef typename EigenVector_::Scalar Scalar; tatami_stats::LocalOutputBuffer cbuffer(t, start, length, center_v.data()); @@ -136,7 +125,7 @@ void compute_row_means_and_variances(const tatami::Matrix* mat, running.finish(); cbuffer.transfer(); sbuffer.transfer(); - }, mat->nrow(), num_threads); + }, mat.nrow(), num_threads); } } @@ -144,7 +133,7 @@ template void run_irlba_deferred( const IrlbaMatrix_& mat, int rank, - const Options& options, + const SimplePcaOptions& options, EigenMatrix_& components, EigenMatrix_& rotation, EigenVector_& variance_explained, @@ -162,9 +151,9 @@ void run_irlba_deferred( template void run_sparse( - const tatami::Matrix* mat, + const tatami::Matrix& mat, int rank, - const Options& options, + const SimplePcaOptions& options, EigenMatrix_& components, EigenMatrix_& rotation, EigenVector_& variance_explained, @@ -172,21 +161,21 @@ void run_sparse( EigenVector_& scale_v, typename EigenVector_::Scalar& total_var) { - Index_ ngenes = mat->nrow(); + Index_ ngenes = mat.nrow(); center_v.resize(ngenes); scale_v.resize(ngenes); if (options.realize_matrix) { // 'extracted' contains row-major contents... auto extracted = tatami::retrieve_compressed_sparse_contents( - mat, + &mat, /* row = */ true, /* two_pass = */ false, /* threads = */ options.num_threads ); // But we effectively transpose it to CSC with genes in columns. 
- Index_ ncells = mat->ncol(); + Index_ ncells = mat.ncol(); irlba::ParallelSparseMatrix emat( ncells, ngenes, @@ -209,14 +198,14 @@ void run_sparse( } }, ngenes, options.num_threads); - total_var = pca_utils::process_scale_vector(options.scale, scale_v); + total_var = internal::process_scale_vector(options.scale, scale_v); run_irlba_deferred(emat, rank, options, components, rotation, variance_explained, center_v, scale_v); } else { compute_row_means_and_variances(mat, options.num_threads, center_v, scale_v); - total_var = pca_utils::process_scale_vector(options.scale, scale_v); + total_var = internal::process_scale_vector(options.scale, scale_v); run_irlba_deferred( - pca_utils::TransposedTatamiWrapper(mat, options.num_threads), + internal::TransposedTatamiWrapper(mat, options.num_threads), rank, options, components, @@ -230,9 +219,9 @@ void run_sparse( template void run_dense( - const tatami::Matrix* mat, + const tatami::Matrix& mat, int rank, - const Options& options, + const SimplePcaOptions& options, EigenMatrix_& components, EigenMatrix_& rotation, EigenVector_& variance_explained, @@ -240,18 +229,18 @@ void run_dense( EigenVector_& scale_v, typename EigenVector_::Scalar& total_var) { - Index_ ngenes = mat->nrow(); + Index_ ngenes = mat.nrow(); center_v.resize(ngenes); scale_v.resize(ngenes); if (options.realize_matrix) { // Create a matrix with genes in columns. - Index_ ncells = mat->ncol(); + Index_ ncells = mat.ncol(); EigenMatrix_ emat(ncells, ngenes); // If emat is row-major, we want to fill it with columns of 'mat', so row_major = false. // If emat is column-major, we want to fill it with rows of 'mat', so row_major = true. - tatami::convert_to_dense(mat, /* row_major = */ !emat.IsRowMajor, emat.data(), options.num_threads); + tatami::convert_to_dense(&mat, /* row_major = */ !emat.IsRowMajor, emat.data(), options.num_threads); center_v.array() = emat.array().colwise().sum(); if (ncells) { @@ -268,7 +257,7 @@ void run_dense( std::fill(scale_v.begin(), scale_v.end(), std::numeric_limits::quiet_NaN()); } - total_var = pca_utils::process_scale_vector(options.scale, scale_v); + total_var = internal::process_scale_vector(options.scale, scale_v); if (options.scale) { emat.array().rowwise() /= scale_v.adjoint().array(); } @@ -277,9 +266,9 @@ void run_dense( } else { compute_row_means_and_variances(mat, options.num_threads, center_v, scale_v); - total_var = pca_utils::process_scale_vector(options.scale, scale_v); + total_var = internal::process_scale_vector(options.scale, scale_v); run_irlba_deferred( - pca_utils::TransposedTatamiWrapper(mat, options.num_threads), + internal::TransposedTatamiWrapper(mat, options.num_threads), rank, options, components, @@ -291,51 +280,23 @@ void run_dense( } } -template -void dispatch( - const tatami::Matrix* mat, - int rank, - const Options& options, - EigenMatrix_& components, - EigenMatrix_& rotation, - EigenVector_& variance_explained, - EigenVector_& center_v, - EigenVector_& scale_v, - typename EigenVector_::Scalar& total_var) -{ - irlba::EigenThreadScope t(options.num_threads); - - if (mat->sparse()) { - run_sparse(mat, rank, options, components, rotation, variance_explained, center_v, scale_v, total_var); - } else { - run_dense(mat, rank, options, components, rotation, variance_explained, center_v, scale_v, total_var); - } - - pca_utils::clean_up(mat->ncol(), components, variance_explained); - if (options.transpose) { - components.adjointInPlace(); - } -} - } /** * @endcond */ /** - * @brief Container for the PCA results. 
+ * @brief Results of `simple_pca()`. * @tparam EigenMatrix_ A floating-point `Eigen::Matrix` class. * @tparam EigenVector_ A floating-point `Eigen::Vector` class. - * - * Instances should be constructed by the `compute()` function. */ template -struct Results { +struct SimplePcaResults { /** * Matrix of principal components. * By default, each row corresponds to a PC while each column corresponds to a cell in the input matrix. - * If `Options::transpose = false`, rows are cells instead. - * The number of PCs is determined by the `rank` used in `compute()`. + * If `SimplePcaOptions::transpose = false`, rows are cells instead. + * The number of PCs is determined by the `rank` used in `simple_pca()`. */ EigenMatrix_ components; @@ -346,7 +307,7 @@ struct Results { EigenVector_ variance_explained; /** - * Total variance of the dataset (possibly after scaling, if `Options::scale = true`). + * Total variance of the dataset (possibly after scaling, if `SimplePcaOptions::scale = true`). * This can be used to divide `variance_explained` to obtain the percentage of variance explained. */ typename EigenVector_::Scalar total_variance = 0; @@ -354,7 +315,7 @@ struct Results { /** * Rotation matrix. * Each row corresponds to a feature while each column corresponds to a PC. - * The number of PCs is determined by the `rank` used in `compute()`. + * The number of PCs is determined by the `rank` used in `simple_pca()`. */ EigenMatrix_ rotation; @@ -365,41 +326,74 @@ struct Results { EigenVector_ center; /** - * Scaling vector, only returned if `Options::scale = true`. - * Each entry corresponds to a row in the matrix and contains the scaling factor used to divide the feature values if `Options::scale = true`. + * Scaling vector, only returned if `SimplePcaOptions::scale = true`. + * Each entry corresponds to a row in the matrix and contains the scaling factor used to divide the feature values if `SimplePcaOptions::scale = true`. */ EigenVector_ scale; }; /** - * Run PCA on an input gene-by-cell matrix. + * Principal components analysis (PCA) is a helpful technique for data compression and denoising. + * The idea is that the earlier PCs capture most of the systematic biological variation while the later PCs capture random technical noise. + * Thus, we can reduce the size of the data and eliminate noise by only using the earlier PCs for further analyses. + * Most practitioners will keep the first 10-50 PCs, though the exact choice is fairly arbitrary. * * - * @tparam EigenMatrix_ A floating-point `Eigen::Matrix` class. - * @tparam EigenVector_ A floating-point `Eigen::Vector` class. * @tparam Value_ Type of the matrix data. * @tparam Index_ Integer type for the indices. + * @tparam EigenMatrix_ A floating-point `Eigen::Matrix` class. + * @tparam EigenVector_ A floating-point `Eigen::Vector` class. * - * @param[in] mat Pointer to the input matrix. + * @param[in] mat The input expression matrix. * Columns should contain cells while rows should contain genes. * @param rank Number of PCs to compute. * This should be no greater than the maximum number of PCs, i.e., the smaller dimension of the input matrix; * otherwise, only the maximum number of PCs will be reported in the results. * @param options Further options. - * - * @return The results of the PCA on `mat`. + * @param[out] output On output, the results of the PCA on `mat`. + * This can be re-used across multiple calls to `simple_pca()`. 
*/ -template -Results compute(const tatami::Matrix* mat, int rank, const Options& options) { - Results output; - internal::dispatch(mat, rank, options, output.components, output.rotation, output.variance_explained, output.center, output.scale, output.total_variance); +template +void simple_pca(const tatami::Matrix& mat, int rank, const SimplePcaOptions& options, SimplePcaResults& output) { + irlba::EigenThreadScope t(options.num_threads); + + if (mat.sparse()) { + internal::run_sparse(mat, rank, options, output.components, output.rotation, output.variance_explained, output.center, output.scale, output.total_variance); + } else { + internal::run_dense(mat, rank, options, output.components, output.rotation, output.variance_explained, output.center, output.scale, output.total_variance); + } + + internal::clean_up(mat.ncol(), output.components, output.variance_explained); + if (options.transpose) { + output.components.adjointInPlace(); + } if (!options.scale) { output.scale = EigenVector_(); } - - return output; } +/** + * Overload of `simple_pca()` that allocates memory for the output. + * + * @tparam EigenMatrix_ A floating-point `Eigen::Matrix` class. + * @tparam EigenVector_ A floating-point `Eigen::Vector` class. + * @tparam Value_ Type of the matrix data. + * @tparam Index_ Integer type for the indices. + * + * @param[in] mat The input expression matrix. + * Columns should contain cells while rows should contain genes. + * @param rank Number of PCs to compute. + * This should be no greater than the maximum number of PCs, i.e., the smaller dimension of the input matrix; + * otherwise, only the maximum number of PCs will be reported in the results. + * @param options Further options. + * + * @return Results of the PCA. + */ +template +SimplePcaResults simple_pca(const tatami::Matrix& mat, int rank, const SimplePcaOptions& options) { + SimplePcaResults output; + simple_pca(mat, rank, options, output); + return output; } } diff --git a/include/scran/pca_utils.hpp b/include/scran_pca/utils.hpp similarity index 96% rename from include/scran/pca_utils.hpp rename to include/scran_pca/utils.hpp index 5f486a4..dc75fc5 100644 --- a/include/scran/pca_utils.hpp +++ b/include/scran_pca/utils.hpp @@ -9,9 +9,9 @@ #include "tatami/tatami.hpp" #include "tatami_stats/tatami_stats.hpp" -namespace scran { +namespace scran_pca { -namespace pca_utils { +namespace internal { template auto process_scale_vector(bool scale, EigenVector_& scale_v) { @@ -43,12 +43,12 @@ void clean_up(size_t NC, EigenMatrix_& U, EigenVector_& D) { template class TransposedTatamiWrapper { public: - TransposedTatamiWrapper(const tatami::Matrix* mat, int num_threads) : - my_mat(mat), - my_nrow(mat->nrow()), - my_ncol(mat->ncol()), - my_is_sparse(mat->is_sparse()), - my_prefer_rows(mat->prefer_rows()), + TransposedTatamiWrapper(const tatami::Matrix& mat, int num_threads) : + my_mat(&mat), + my_nrow(mat.nrow()), + my_ncol(mat.ncol()), + my_is_sparse(mat.is_sparse()), + my_prefer_rows(mat.prefer_rows()), my_num_threads(num_threads) {} diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 766d06d..57b42fc 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -1,20 +1,10 @@ include(FetchContent) FetchContent_Declare( - googletest - URL https://github.com/google/googletest/archive/1d17ea141d2c11b8917d2c7d029f1c4e2b9769b2.zip -) - -FetchContent_Declare( - scran_test_utils - GIT_REPOSITORY https://github.com/libscran/test_utils + scran_tests + GIT_REPOSITORY https://github.com/libscran/scran_tests 
GIT_TAG master ) - -# For Windows: Prevent overriding the parent project's compiler/linker settings -set(gtest_force_shared_crt ON CACHE BOOL "" FORCE) -FetchContent_MakeAvailable(googletest) - -FetchContent_MakeAvailable(scran_test_utils) +FetchContent_MakeAvailable(scran_tests) enable_testing() @@ -35,9 +25,8 @@ macro(create_test name) target_link_libraries( ${name} - gtest_main - scran_principal_component_analysis - scran_test_utils + scran_pca + scran_tests ) target_compile_options(${name} PRIVATE -Wall -Wpedantic -Wextra) diff --git a/tests/src/blocked_pca.cpp b/tests/src/blocked_pca.cpp index 4522458..6032486 100644 --- a/tests/src/blocked_pca.cpp +++ b/tests/src/blocked_pca.cpp @@ -2,12 +2,12 @@ #include "compare_pcs.h" #include "utils.h" -#include "simulate_vector.h" +#include "scran_tests/scran_tests.hpp" #include "tatami/tatami.hpp" -#include "blocked_pca.hpp" -#include "simple_pca.hpp" +#include "scran_pca/blocked_pca.hpp" +#include "scran_pca/simple_pca.hpp" TEST(ResidualWrapperTest, EigenDense) { size_t NR = 30, NC = 10, NB = 3; @@ -29,7 +29,7 @@ TEST(ResidualWrapperTest, EigenDense) { } } - scran::blocked_pca::internal::ResidualWrapper blocked(thing, block.data(), centers); + scran_pca::internal::ResidualWrapper blocked(thing, block.data(), centers); auto realized = blocked.template realize(); // Trying in the normal orientation. @@ -101,12 +101,12 @@ TEST(ResidualWrapperTest, CustomSparse) { } irlba::ParallelSparseMatrix thing(NR, NC, std::move(values), std::move(indices), std::move(ptrs), /* column_major = */ true, 1); - scran::blocked_pca::internal::ResidualWrapper blocked(thing, block.data(), centers); + scran_pca::internal::ResidualWrapper blocked(thing, block.data(), centers); auto realized = blocked.template realize(); // Checking that the dense reference matches up. { - scran::blocked_pca::internal::ResidualWrapper blockedref(thing, block.data(), centers); + scran_pca::internal::ResidualWrapper blockedref(thing, block.data(), centers); auto realizedref = blockedref.template realize(); for (Eigen::Index i = 0; i < realizedref.cols(); ++i) { @@ -155,7 +155,14 @@ class BlockedPcaTestCore { static void assemble() { size_t nr = 121, nc = 155; - auto vec = simulate_sparse_vector(nr * nc, 0.1, /* lower = */ -10, /* upper = */ 10, /* seed = */ 123456); + auto vec = scran_tests::simulate_vector(nr * nc, [&]{ + scran_tests::SimulationParameters sparams; + sparams.density = 0.1; + sparams.lower = -10; + sparams.upper = 10; + sparams.seed = 123456; + return sparams; + }()); dense_row.reset(new tatami::DenseRowMatrix(nr, nc, std::move(vec))); return; } @@ -185,11 +192,11 @@ TEST_P(BlockedPcaBasicTest, BasicConsistency) { auto block = generate_blocks(dense_row->ncol(), nblocks); - scran::blocked_pca::Options opts; + scran_pca::BlockedPcaOptions opts; opts.scale = scale; opts.components_from_residuals = use_resids; - opts.block_weight_policy = scran::block_weights::Policy::NONE; - auto ref = scran::blocked_pca::compute(dense_row.get(), block.data(), rank, opts); + opts.block_weight_policy = scran_blocks::WeightPolicy::NONE; + auto ref = scran_pca::blocked_pca(*dense_row, block.data(), rank, opts); if (nthreads == 1) { EXPECT_EQ(ref.components.rows(), rank); @@ -218,7 +225,7 @@ TEST_P(BlockedPcaBasicTest, BasicConsistency) { } else { opts.num_threads = nthreads; - auto res1 = scran::blocked_pca::compute(dense_row.get(), block.data(), rank, opts); + auto res1 = scran_pca::blocked_pca(*dense_row, block.data(), rank, opts); // Results should be EXACTLY the same with parallelization. 
EXPECT_EQ(ref.components, res1.components); @@ -227,39 +234,39 @@ TEST_P(BlockedPcaBasicTest, BasicConsistency) { } // Checking that we get more-or-less the same results. - auto res2 = scran::blocked_pca::compute(dense_column.get(), block.data(), rank, opts); + auto res2 = scran_pca::blocked_pca(*dense_column, block.data(), rank, opts); expect_equal_pcs(ref.components, res2.components); expect_equal_vectors(ref.variance_explained, res2.variance_explained); EXPECT_FLOAT_EQ(ref.total_variance, res2.total_variance); - auto res3 = scran::blocked_pca::compute(sparse_row.get(), block.data(), rank, opts); + auto res3 = scran_pca::blocked_pca(*sparse_row, block.data(), rank, opts); expect_equal_pcs(ref.components, res3.components); expect_equal_vectors(ref.variance_explained, res3.variance_explained); EXPECT_FLOAT_EQ(ref.total_variance, res3.total_variance); - auto res4 = scran::blocked_pca::compute(sparse_column.get(), block.data(), rank, opts); + auto res4 = scran_pca::blocked_pca(*sparse_column, block.data(), rank, opts); expect_equal_pcs(ref.components, res4.components); expect_equal_vectors(ref.variance_explained, res4.variance_explained); EXPECT_FLOAT_EQ(ref.total_variance, res4.total_variance); // Checking that we get more-or-less the same results. opts.realize_matrix = false; - auto tres1 = scran::blocked_pca::compute(dense_row.get(), block.data(), rank, opts); + auto tres1 = scran_pca::blocked_pca(*dense_row, block.data(), rank, opts); expect_equal_pcs(ref.components, tres1.components); expect_equal_vectors(ref.variance_explained, tres1.variance_explained); EXPECT_FLOAT_EQ(ref.total_variance, tres1.total_variance); - auto tres2 = scran::blocked_pca::compute(dense_column.get(), block.data(), rank, opts); + auto tres2 = scran_pca::blocked_pca(*dense_column, block.data(), rank, opts); expect_equal_pcs(ref.components, tres2.components); expect_equal_vectors(ref.variance_explained, tres2.variance_explained); EXPECT_FLOAT_EQ(ref.total_variance, tres2.total_variance); - auto tres3 = scran::blocked_pca::compute(sparse_row.get(), block.data(), rank, opts); + auto tres3 = scran_pca::blocked_pca(*sparse_row, block.data(), rank, opts); expect_equal_pcs(ref.components, tres3.components); expect_equal_vectors(ref.variance_explained, tres3.variance_explained); EXPECT_FLOAT_EQ(ref.total_variance, tres3.total_variance); - auto tres4 = scran::blocked_pca::compute(sparse_column.get(), block.data(), rank, opts); + auto tres4 = scran_pca::blocked_pca(*sparse_column, block.data(), rank, opts); expect_equal_pcs(ref.components, tres4.components); expect_equal_vectors(ref.variance_explained, tres4.variance_explained); EXPECT_FLOAT_EQ(ref.total_variance, tres4.total_variance); @@ -275,11 +282,11 @@ TEST_P(BlockedPcaBasicTest, WeightedConsistency) { auto block = generate_blocks(dense_row->ncol(), nblocks); - scran::blocked_pca::Options opts; + scran_pca::BlockedPcaOptions opts; opts.scale = scale; opts.components_from_residuals = use_resids; - opts.block_weight_policy = scran::block_weights::Policy::EQUAL; - auto ref = scran::blocked_pca::compute(dense_row.get(), block.data(), rank, opts); + opts.block_weight_policy = scran_blocks::WeightPolicy::EQUAL; + auto ref = scran_pca::blocked_pca(*dense_row, block.data(), rank, opts); if (nthreads == 1) { are_pcs_centered(ref.components); @@ -304,7 +311,7 @@ TEST_P(BlockedPcaBasicTest, WeightedConsistency) { } else { opts.num_threads = nthreads; - auto res1 = scran::blocked_pca::compute(dense_row.get(), block.data(), rank, opts); + auto res1 = 

         // Results should be EXACTLY the same with parallelization.
         EXPECT_EQ(ref.components, res1.components);
@@ -313,39 +320,39 @@ TEST_P(BlockedPcaBasicTest, WeightedConsistency) {
     }

     // Checking that we get more-or-less the same results.
-    auto res2 = scran::blocked_pca::compute(dense_column.get(), block.data(), rank, opts);
+    auto res2 = scran_pca::blocked_pca(*dense_column, block.data(), rank, opts);
     expect_equal_pcs(ref.components, res2.components);
     expect_equal_vectors(ref.variance_explained, res2.variance_explained);
     EXPECT_FLOAT_EQ(ref.total_variance, res2.total_variance);

-    auto res3 = scran::blocked_pca::compute(sparse_row.get(), block.data(), rank, opts);
+    auto res3 = scran_pca::blocked_pca(*sparse_row, block.data(), rank, opts);
     expect_equal_pcs(ref.components, res3.components);
     expect_equal_vectors(ref.variance_explained, res3.variance_explained);
     EXPECT_FLOAT_EQ(ref.total_variance, res3.total_variance);

-    auto res4 = scran::blocked_pca::compute(sparse_column.get(), block.data(), rank, opts);
+    auto res4 = scran_pca::blocked_pca(*sparse_column, block.data(), rank, opts);
     expect_equal_pcs(ref.components, res4.components);
     expect_equal_vectors(ref.variance_explained, res4.variance_explained);
     EXPECT_FLOAT_EQ(ref.total_variance, res4.total_variance);

     // Checking that we get more-or-less the same results.
     opts.realize_matrix = false;

-    auto tres1 = scran::blocked_pca::compute(dense_row.get(), block.data(), rank, opts);
+    auto tres1 = scran_pca::blocked_pca(*dense_row, block.data(), rank, opts);
     expect_equal_pcs(ref.components, tres1.components);
     expect_equal_vectors(ref.variance_explained, tres1.variance_explained);
     EXPECT_FLOAT_EQ(ref.total_variance, tres1.total_variance);

-    auto tres2 = scran::blocked_pca::compute(dense_column.get(), block.data(), rank, opts);
+    auto tres2 = scran_pca::blocked_pca(*dense_column, block.data(), rank, opts);
     expect_equal_pcs(ref.components, tres2.components);
     expect_equal_vectors(ref.variance_explained, tres2.variance_explained);
     EXPECT_FLOAT_EQ(ref.total_variance, tres2.total_variance);

-    auto tres3 = scran::blocked_pca::compute(sparse_row.get(), block.data(), rank, opts);
+    auto tres3 = scran_pca::blocked_pca(*sparse_row, block.data(), rank, opts);
     expect_equal_pcs(ref.components, tres3.components);
     expect_equal_vectors(ref.variance_explained, tres3.variance_explained);
     EXPECT_FLOAT_EQ(ref.total_variance, tres3.total_variance);

-    auto tres4 = scran::blocked_pca::compute(sparse_column.get(), block.data(), rank, opts);
+    auto tres4 = scran_pca::blocked_pca(*sparse_column, block.data(), rank, opts);
     expect_equal_pcs(ref.components, tres4.components);
     expect_equal_vectors(ref.variance_explained, tres4.variance_explained);
     EXPECT_FLOAT_EQ(ref.total_variance, tres4.total_variance);
@@ -380,19 +387,19 @@ TEST_P(BlockedPcaMoreTest, VersusSimple) {
     int nblocks = std::get<3>(param);

     auto block = generate_blocks(dense_row->ncol(), nblocks);

-    scran::blocked_pca::Options opt;
+    scran_pca::BlockedPcaOptions opt;
     opt.scale = scale;
     opt.components_from_residuals = use_resids;
-    opt.block_weight_policy = scran::block_weights::Policy::NONE;
-    auto res1 = scran::blocked_pca::compute(dense_row.get(), block.data(), rank, opt);
+    opt.block_weight_policy = scran_blocks::WeightPolicy::NONE;
+    auto res1 = scran_pca::blocked_pca(*dense_row, block.data(), rank, opt);

-    scran::simple_pca::Options refopt;
+    scran_pca::SimplePcaOptions refopt;
     refopt.scale = scale;

     if (nblocks == 1) {
         // Checking that we get more-or-less the same results
         // from the vanilla PCA algorithm in the absence of blocks.
-        auto res2 = scran::simple_pca::compute(dense_row.get(), rank, refopt);
+        auto res2 = scran_pca::simple_pca(*dense_row, rank, refopt);

         expect_equal_pcs(res1.components, res2.components);
         expect_equal_vectors(res1.variance_explained, res2.variance_explained);
@@ -432,7 +439,7 @@ TEST_P(BlockedPcaMoreTest, VersusSimple) {
         }

         tatami::DenseColumnMatrix refmat(nr, nc, std::move(regressed));
-        auto res2 = scran::simple_pca::compute(&refmat, rank, refopt);
+        auto res2 = scran_pca::simple_pca(refmat, rank, refopt);

         expect_equal_rotation(res1.rotation, res2.rotation);
         expect_equal_vectors(res1.variance_explained, res2.variance_explained);
@@ -486,21 +493,27 @@ TEST_P(BlockedPcaWeightedTest, VersusReference) {
     std::vector blocking;
     size_t nr = 80, nc = 50;
     for (int b = 0; b < nblocks; ++b) {
-        auto vec = simulate_vector(nr * nc, /* lower = */ -10.0, /* upper = */ 10.0, /* seed = */ b + 100);
+        auto vec = scran_tests::simulate_vector(nr * nc, [&]{
+            scran_tests::SimulationParameters sparams;
+            sparams.lower = -10;
+            sparams.upper = 10;
+            sparams.seed = b + 100;
+            return sparams;
+        }());
         components.emplace_back(new tatami::DenseRowMatrix(nr, nc, std::move(vec)));
         blocking.insert(blocking.end(), nc, b);
     }

-    scran::blocked_pca::Options base_opt;
+    scran_pca::BlockedPcaOptions base_opt;
     base_opt.scale = scale;
     base_opt.components_from_residuals = use_resids;
     base_opt.num_threads = nthreads;

     auto combined = tatami::make_DelayedBind(components, false);
-    auto ref = scran::blocked_pca::compute(combined.get(), blocking.data(), rank, [&]{
+    auto ref = scran_pca::blocked_pca(*combined, blocking.data(), rank, [&]{
         auto opt = base_opt;
         opt.num_threads = 1; // using a single thread for a consistent reference.
-        opt.block_weight_policy = scran::block_weights::Policy::NONE;
+        opt.block_weight_policy = scran_blocks::WeightPolicy::NONE;
         return opt;
     }());
@@ -512,9 +525,9 @@ TEST_P(BlockedPcaWeightedTest, VersusReference) {
     // Checking that we get more-or-less the same results with equiweighting
     // when all blocks are of the same size.
     auto opt = base_opt;
-    opt.block_weight_policy = scran::block_weights::Policy::EQUAL;
+    opt.block_weight_policy = scran_blocks::WeightPolicy::EQUAL;

-    auto res1 = scran::blocked_pca::compute(combined.get(), blocking.data(), rank, opt);
+    auto res1 = scran_pca::blocked_pca(*combined, blocking.data(), rank, opt);
     res1.components.array() /= res1.components.norm();

     expect_equal_pcs(ref.components, res1.components);
@@ -536,12 +549,12 @@ TEST_P(BlockedPcaWeightedTest, VersusReference) {
     // which is equivalent to the total absence of re-weighting.
     {
         auto opt = base_opt;
-        opt.block_weight_policy = scran::block_weights::Policy::VARIABLE;
+        opt.block_weight_policy = scran_blocks::WeightPolicy::VARIABLE;
         opt.variable_block_weight_parameters.upper_bound = 1000000;
-        auto res2 = scran::blocked_pca::compute(expanded.get(), expanded_block.data(), rank, opt);
+        auto res2 = scran_pca::blocked_pca(*expanded, expanded_block.data(), rank, opt);

-        opt.block_weight_policy = scran::block_weights::Policy::NONE;
-        auto ref2 = scran::blocked_pca::compute(expanded.get(), expanded_block.data(), rank, opt);
+        opt.block_weight_policy = scran_blocks::WeightPolicy::NONE;
+        auto ref2 = scran_pca::blocked_pca(*expanded, expanded_block.data(), rank, opt);

         ref2.components.array() /= ref2.components.norm();
         res2.components.array() /= res2.components.norm();
@@ -557,9 +570,9 @@ TEST_P(BlockedPcaWeightedTest, VersusReference) {
     // when the PCA was performed with batches of equal size.
     {
         auto opt = base_opt;
-        opt.block_weight_policy = scran::block_weights::Policy::VARIABLE;
+        opt.block_weight_policy = scran_blocks::WeightPolicy::VARIABLE;
         opt.variable_block_weight_parameters.upper_bound = 0;
-        auto res2 = scran::blocked_pca::compute(expanded.get(), expanded_block.data(), rank, opt);
+        auto res2 = scran_pca::blocked_pca(*expanded, expanded_block.data(), rank, opt);

         // Mocking up the expected results.
         Eigen::MatrixXd expanded_pcs(rank, expanded->ncol());
diff --git a/tests/src/compare_pcs.h b/tests/src/compare_pcs.h
index 0aff5b4..c2df66c 100644
--- a/tests/src/compare_pcs.h
+++ b/tests/src/compare_pcs.h
@@ -5,7 +5,7 @@
 #include

-#include "compare_almost_equal.h"
+#include "scran_tests/scran_tests.hpp"

 #include "Eigen/Dense"
 #include "tatami/tatami.hpp"
@@ -35,7 +35,7 @@ inline void expect_equal_pcs(const Eigen::MatrixXd& left, const Eigen::MatrixXd&
             auto aleft = std::abs(left(i, j));
             auto aright = std::abs(right(i, j));
             if (relative) {
-                compare_almost_equal(aleft, aright, tol);
+                scran_tests::compare_almost_equal(aleft, aright, tol);
             } else if (std::abs(aleft - aright) > tol) {
                 EXPECT_TRUE(false) << "mismatch in almost-equal floats (expected " << aleft << ", got " << aright << ")";
             }
@@ -45,7 +45,6 @@ inline void expect_equal_pcs(const Eigen::MatrixXd& left, const Eigen::MatrixXd&
         EXPECT_TRUE(std::abs(left.row(i).sum()) < tol);
         EXPECT_TRUE(std::abs(right.row(i).sum()) < tol);
     }
-    return;
 }

 inline void expect_equal_rotation(const Eigen::MatrixXd& left, const Eigen::MatrixXd& right, double tol=1e-8) {
@@ -57,17 +56,16 @@ inline void expect_equal_rotation(const Eigen::MatrixXd& left, const Eigen::Matr
         for (int j = 0; j < ngenes; ++j) {
             auto aleft = std::abs(left(i, j));
             auto aright = std::abs(right(i, j));
-            compare_almost_equal(aleft, aright, tol);
+            scran_tests::compare_almost_equal(aleft, aright, tol);
         }
     }
-    return;
 }

 inline void expect_equal_vectors(const Eigen::VectorXd& left, const Eigen::VectorXd& right, double tol=1e-8) {
     int n = left.size();
     ASSERT_EQ(n, right.size());
     for (int i = 0; i < n; ++i) {
-        compare_almost_equal(left[i], right[i], tol);
+        scran_tests::compare_almost_equal(left[i], right[i], tol);
     }
 }

@@ -78,7 +76,7 @@ inline void compare_almost_equal(const Eigen::MatrixXd& left, const Eigen::Matri
     for (int c = 0; c < ngenes; ++c) {
         for (int r = 0; r < ndims; ++r) {
-            compare_almost_equal(left(r, c), right(r, c), tol);
+            scran_tests::compare_almost_equal(left(r, c), right(r, c), tol);
         }
     }
 }
diff --git a/tests/src/simple_pca.cpp b/tests/src/simple_pca.cpp
index 706aeef..954e1c9 100644
--- a/tests/src/simple_pca.cpp
+++ b/tests/src/simple_pca.cpp
@@ -1,11 +1,11 @@
 #include

-#include "simulate_vector.h"
 #include "compare_pcs.h"
+#include "scran_tests/scran_tests.hpp"

 #include "tatami/tatami.hpp"

-#include "simple_pca.hpp"
+#include "scran_pca/simple_pca.hpp"

 class SimplePcaTestCore {
 protected:
@@ -17,7 +17,15 @@ class SimplePcaTestCore {
         }

         size_t nr = 199, nc = 165;
-        auto vec = simulate_sparse_vector(nr * nc, 0.1, /* lower = */ -10, /* upper = */ 10, /* seed = */ 69);
+        auto vec = scran_tests::simulate_vector(nr * nc, [&]() {
+            scran_tests::SimulationParameters sparams;
+            sparams.density = 0.1;
+            sparams.lower = -10;
+            sparams.upper = 10;
+            sparams.seed = 69;
+            return sparams;
+        }());
+
         dense_row.reset(new tatami::DenseRowMatrix(nr, nc, std::move(vec)));
         dense_column = tatami::convert_to_dense(dense_row.get(), false);
         sparse_row = tatami::convert_to_compressed_sparse(dense_row.get(), true);
@@ -40,9 +48,9 @@ TEST_P(SimplePcaBasicTest, Test) {
     int rank = std::get<1>(param);
     int threads = std::get<2>(param);

-    scran::simple_pca::Options opt;
+    scran_pca::SimplePcaOptions opt;
     opt.scale = scale;
-    auto ref = scran::simple_pca::compute(dense_row.get(), rank, opt);
+    auto ref = scran_pca::simple_pca(*dense_row, rank, opt);

     if (threads == 1) {
         EXPECT_EQ(ref.variance_explained.size(), rank);
@@ -81,46 +89,46 @@ TEST_P(SimplePcaBasicTest, Test) {
     } else {
         // Results should be EXACTLY the same with parallelization.
         opt.num_threads = threads;
-        auto res1 = scran::simple_pca::compute(dense_row.get(), rank, opt);
+        auto res1 = scran_pca::simple_pca(*dense_row, rank, opt);
         EXPECT_EQ(ref.components, res1.components);
         EXPECT_EQ(ref.variance_explained, res1.variance_explained);
         EXPECT_EQ(ref.total_variance, res1.total_variance);
     }

     // Checking that we get more-or-less the same results.
-    auto res2 = scran::simple_pca::compute(dense_column.get(), rank, opt);
+    auto res2 = scran_pca::simple_pca(*dense_column, rank, opt);
     expect_equal_pcs(ref.components, res2.components);
     expect_equal_vectors(ref.variance_explained, res2.variance_explained);
     EXPECT_FLOAT_EQ(ref.total_variance, res2.total_variance);

-    auto res3 = scran::simple_pca::compute(sparse_row.get(), rank, opt);
+    auto res3 = scran_pca::simple_pca(*sparse_row, rank, opt);
     expect_equal_pcs(ref.components, res3.components);
     expect_equal_vectors(ref.variance_explained, res3.variance_explained);
     EXPECT_FLOAT_EQ(ref.total_variance, res3.total_variance);

-    auto res4 = scran::simple_pca::compute(sparse_column.get(), rank, opt);
+    auto res4 = scran_pca::simple_pca(*sparse_column, rank, opt);
     expect_equal_pcs(ref.components, res4.components);
     expect_equal_vectors(ref.variance_explained, res4.variance_explained);
     EXPECT_FLOAT_EQ(ref.total_variance, res4.total_variance);

     // Checking that we get more-or-less the same results.
     opt.realize_matrix = false;

-    auto tres1 = scran::simple_pca::compute(dense_row.get(), rank, opt);
+    auto tres1 = scran_pca::simple_pca(*dense_row, rank, opt);
     expect_equal_pcs(ref.components, tres1.components);
     expect_equal_vectors(ref.variance_explained, tres1.variance_explained);
     EXPECT_FLOAT_EQ(ref.total_variance, tres1.total_variance);

-    auto tres2 = scran::simple_pca::compute(dense_column.get(), rank, opt);
+    auto tres2 = scran_pca::simple_pca(*dense_column, rank, opt);
     expect_equal_pcs(ref.components, tres2.components);
     expect_equal_vectors(ref.variance_explained, tres2.variance_explained);
     EXPECT_FLOAT_EQ(ref.total_variance, tres2.total_variance);

-    auto tres3 = scran::simple_pca::compute(sparse_row.get(), rank, opt);
+    auto tres3 = scran_pca::simple_pca(*sparse_row, rank, opt);
     expect_equal_pcs(ref.components, tres3.components);
     expect_equal_vectors(ref.variance_explained, tres3.variance_explained);
     EXPECT_FLOAT_EQ(ref.total_variance, tres3.total_variance);

-    auto tres4 = scran::simple_pca::compute(sparse_column.get(), rank, opt);
+    auto tres4 = scran_pca::simple_pca(*sparse_column, rank, opt);
     expect_equal_pcs(ref.components, tres4.components);
     expect_equal_vectors(ref.variance_explained, tres4.variance_explained);
     EXPECT_FLOAT_EQ(ref.total_variance, tres4.total_variance);
@@ -146,32 +154,46 @@ TEST_P(SimplePcaMoreTest, ZeroVariance) {
     int rank = std::get<1>(param);

     size_t nr = 109, nc = 153;
-    auto vec = simulate_sparse_vector(nr * nc, 0.1, /* lower = */ -10, /* upper = */ 10, /* seed = */ scale + rank + 100);
+    auto vec = scran_tests::simulate_vector(nr * nc, [&]() {
+        scran_tests::SimulationParameters sparams;
+        sparams.density = 0.1;
+        sparams.lower = -10;
+        sparams.upper = 10;
+        sparams.seed = scale * 100 + rank;
+        return sparams;
+    }());

     auto copy = vec;
     size_t last_row = (nr - 1) * nc;
-    std::fill(copy.begin() + last_row, copy.begin() + last_row + nc, 0);
+    std::fill_n(copy.begin() + last_row, nc, 0);
     tatami::DenseRowMatrix has_zero(nr, nc, std::move(copy));

     std::vector removed(vec.begin(), vec.begin() + last_row);
     tatami::DenseRowMatrix leftovers(nr - 1, nc, std::move(removed));

-    scran::simple_pca::Options opt;
+    scran_pca::SimplePcaOptions opt;
     opt.scale = scale;

     // The initial vector is slightly different when we lose a feature, so we manually force our own random initialization.
-    auto raw_init = simulate_vector(nr, /* lower = */ -2, /* upper = */ 2, /* seed = */ scale + rank + 10);
+    auto raw_init = scran_tests::simulate_vector(nr, [&]() {
+        scran_tests::SimulationParameters sparams;
+        sparams.lower = -2;
+        sparams.upper = 2;
+        sparams.seed = scale * 10 + rank;
+        return sparams;
+    }());
+
     raw_init.back() = 0;

     Eigen::VectorXd init(nr - 1);
     std::copy_n(raw_init.begin(), nr - 1, init.data());
     opt.irlba_options.initial = &init;
-    auto ref = scran::simple_pca::compute(&leftovers, rank, opt);
+    auto ref = scran_pca::simple_pca(leftovers, rank, opt);

     Eigen::VectorXd init2(nr);
     std::copy_n(raw_init.begin(), nr, init2.data());
     opt.irlba_options.initial = &init2;
-    auto out = scran::simple_pca::compute(&has_zero, rank, opt);
+    auto out = scran_pca::simple_pca(has_zero, rank, opt);

     expect_equal_pcs(ref.components, out.components);
     expect_equal_vectors(ref.variance_explained, out.variance_explained);
@@ -179,7 +201,7 @@ TEST_P(SimplePcaMoreTest, ZeroVariance) {

     // Same behavior with sparse representation.
     auto sparse_zero = tatami::convert_to_compressed_sparse(&has_zero, true);
-    auto spout = scran::simple_pca::compute(sparse_zero.get(), rank, opt);
+    auto spout = scran_pca::simple_pca(*sparse_zero, rank, opt);
     expect_equal_pcs(spout.components, out.components);
     expect_equal_vectors(spout.variance_explained, out.variance_explained);