diff --git a/dynamo/external/pearson_residual_recipe.py b/dynamo/external/pearson_residual_recipe.py index e92b215b3..35d5f7122 100644 --- a/dynamo/external/pearson_residual_recipe.py +++ b/dynamo/external/pearson_residual_recipe.py @@ -28,7 +28,7 @@ is_nonnegative_integer_arr, seurat_get_mean_var, ) -from ..preprocessing.utils import pca_monocle +from ..preprocessing.utils import pca main_logger = LoggerManager.main_logger diff --git a/dynamo/preprocessing/Preprocessor.py b/dynamo/preprocessing/Preprocessor.py index 1cf017e70..a17ea7fa8 100644 --- a/dynamo/preprocessing/Preprocessor.py +++ b/dynamo/preprocessing/Preprocessor.py @@ -22,7 +22,7 @@ select_genes_by_pearson_residuals, ) from ..tools.connectivity import neighbors as default_neighbors -from .preprocess import normalize_cell_expr_by_size_factors_legacy, pca_monocle +from .preprocess import normalize_cell_expr_by_size_factors_legacy, pca from .preprocessor_utils import _infer_labeling_experiment_type from .preprocessor_utils import ( filter_cells_by_outliers as monocle_filter_cells_by_outliers, @@ -62,7 +62,7 @@ def __init__( normalize_selected_genes_kwargs: dict = {}, use_log1p: bool = True, log1p_kwargs: dict = {}, - pca_function: bool = pca_monocle, + pca_function: Callable = pca, pca_kwargs: dict = {}, gene_append_list: List[str] = [], gene_exclude_list: List[str] = [], @@ -72,36 +72,50 @@ def __init__( """Preprocessor constructor. The default preprocess functions are those of monocle recipe by default. - You can pass your own Callable objects (functions) to this constructor directly, which wil be used in the - preprocess steps later. These functions parameters are saved into Preprocessor instances. You can set these - attributes directly to your own implementation. + You can pass your own Callable objects (functions) to this constructor + directly, which will be used in the preprocess steps later. These + functions' parameters are saved into Preprocessor instances. You can set + these attributes directly to your own implementation. Args: - collapse_speicies_adata_function: function for collapsing the species data. Defaults to - collapse_species_adata. - convert_gene_name_function: transform gene names, by default convert2symbol, which transforms unofficial - gene names to official gene names. Defaults to convert2symbol. - filter_cells_by_outliers_function: filter cells by thresholds. Defaults to monocle_filter_cells_by_outliers. - filter_cells_by_outliers_kwargs: arguments that will be passed to filter_cells_by_outliers. Defaults to {}. - filter_genes_by_outliers_function: filter genes by thresholds. Defaults to monocle_filter_genes_by_outliers. - filter_genes_by_outliers_kwargs: arguments that will be passed to filter_genes_by_outliers. Defaults to {}. - normalize_by_cells_function: function for performing cell-wise normalization. Defaults to - normalize_cell_expr_by_size_factors. - normalize_by_cells_function_kwargs: arguments that will be passed to normalize_by_cells_function. Defaults - to {}. - select_genes_function: function for selecting gene features. Defaults to select_genes_by_dispersion_general. - select_genes_kwargs: arguments that will be passed to select_genes. Defaults to {}. - normalize_selected_genes_function: function for normalize selected genes. Defaults to None. - normalize_selected_genes_kwargs: arguments that will be passed to normalize_selected_genes. Defaults to {}. - use_log1p: whether to use log1p to normalize layers in adata. Defaults to True. 
+ collapse_species_adata_function: function for collapsing the + species data. Defaults to collapse_species_adata. + convert_gene_name_function: transform gene names, by default + convert2symbol, which transforms unofficial gene names to + official gene names. Defaults to convert2symbol. + filter_cells_by_outliers_function: filter cells by thresholds. + Defaults to monocle_filter_cells_by_outliers. + filter_cells_by_outliers_kwargs: arguments that will be passed to + filter_cells_by_outliers. Defaults to {}. + filter_genes_by_outliers_function: filter genes by thresholds. + Defaults to monocle_filter_genes_by_outliers. + filter_genes_by_outliers_kwargs: arguments that will be passed to + filter_genes_by_outliers. Defaults to {}. + normalize_by_cells_function: function for performing cell-wise + normalization. Defaults to normalize_cell_expr_by_size_factors. + normalize_by_cells_function_kwargs: arguments that will be passed to + normalize_by_cells_function. Defaults to {}. + select_genes_function: function for selecting gene features. + Defaults to select_genes_by_dispersion_general. + select_genes_kwargs: arguments that will be passed to select_genes. + Defaults to {}. + normalize_selected_genes_function: function for normalizing selected + genes. Defaults to None. + normalize_selected_genes_kwargs: arguments that will be passed to + normalize_selected_genes. Defaults to {}. + use_log1p: whether to use log1p to normalize layers in adata. + Defaults to True. log1p_kwargs: arguments passed to use_log1p. Defaults to {}. - pca_function: function to perform pca. Defaults to pca_monocle. + pca_function: function to perform pca. Defaults to pca in utils.py. pca_kwargs: arguments that will be passed pca. Defaults to {}. - gene_append_list: ensure that a list of genes show up in selected genes in monocle recipe pipeline. Defaults - to []. - gene_exclude_list: exclude a list of genes in monocle recipe pipeline. Defaults to []. - force_gene_list: use this gene list as selected genes in monocle recipe pipeline. Defaults to None. - sctransform_kwargs: arguments passed into sctransform function. Defaults to {}. + gene_append_list: ensure that a list of genes show up in selected + genes in the monocle recipe pipeline. Defaults to []. + gene_exclude_list: exclude a list of genes in the monocle recipe + pipeline. Defaults to []. + force_gene_list: use this gene list as selected genes in the monocle + recipe pipeline. Defaults to None. + sctransform_kwargs: arguments passed into sctransform function. + Defaults to {}. """ self.convert_layers2csr = convert_layers2csr @@ -138,13 +152,15 @@ def __init__( def add_experiment_info( self, adata: AnnData, tkey: Optional[str] = None, experiment_type: Optional[str] = None ) -> None: - """Infer the experiment type and experiment layers stored in the AnnData object and record the info in unstructured metadata (.uns). + """Infer the experiment type and experiment layers stored in the AnnData + object and record the info in unstructured metadata (.uns). Args: adata: an AnnData object. - tkey: the key for time information (labeling time period for the cells) in .obs. Defaults to None. - experiment_type: the experiment type. If set to None, the experiment type would be inferred from the data. - Defaults to None. + tkey: the key for time information (labeling time period for the + cells) in .obs. Defaults to None. + experiment_type: the experiment type. If set to None, the experiment + type would be inferred from the data. Defaults to None. Raises: ValueError: the tkey is invalid. 
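Note: a minimal usage sketch of the pluggable-Callable design documented in the constructor above (an editorial illustration, not part of the patch; it assumes Preprocessor is importable from dynamo.preprocessing, as the updated tests below do, and that the constructor defaults already match the monocle recipe):

import dynamo as dyn
from dynamo.preprocessing import Preprocessor
from dynamo.preprocessing.utils import pca

adata = dyn.sample_data.zebrafish()

# Callables and kwargs passed at construction are stored on the instance
# and used by the corresponding preprocess steps later; any compatible
# Callable could be swapped in for pca here.
preprocessor = Preprocessor(pca_function=pca, pca_kwargs={"pca_key": "X_pca"})
preprocessor.preprocess_adata_monocle(adata)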
@@ -213,13 +229,15 @@ def standardize_adata(self, adata: AnnData, tkey: str, experiment_type: str) -> """Process the AnnData object to make it meet the standards of dynamo. The index of the observations would be ensured to be unique. - The layers with sparse matrix would be converted to compressed csr_matrix. - DKM.allowed_layer_raw_names() will be used to define only_splicing, only_labeling and splicing_labeling keys. - The genes would be renamed to their official name. + The layers with sparse matrix would be converted to compressed + csr_matrix. DKM.allowed_layer_raw_names() will be used to define + only_splicing, only_labeling and splicing_labeling keys. The genes + would be renamed to their official name. Args: adata: an AnnData object. - tkey: the key for time information (labeling time period for the cells) in .obs. + tkey: the key for time information (labeling time period for the + cells) in .obs. experiment_type: the experiment type. """ @@ -229,7 +247,6 @@ def standardize_adata(self, adata: AnnData, tkey: str, experiment_type: str) -> main_info_insert_adata("tkey=%s" % tkey, "uns['pp']", indent_level=2) main_info_insert_adata("experiment_type=%s" % experiment_type, "uns['pp']", indent_level=2) main_info("making adata observation index unique...") - self.unique_var_obs_adata(adata) self.convert_layers2csr(adata) if self.collapse_species_adata: @@ -239,11 +256,13 @@ def standardize_adata(self, adata: AnnData, tkey: str, experiment_type: str) -> if self.convert_gene_name: main_info("applying convert_gene_name function...") self.convert_gene_name(adata) - main_info("making adata observation index unique after gene name conversion...") - self.unique_var_obs_adata(adata) + + main_info("making adata observation index unique after gene name conversion...") + self.unique_var_obs_adata(adata) def _filter_cells_by_outliers(self, adata: AnnData) -> None: - """Select valid cells based on the method specified as the preprocessor's `filter_cells_by_outliers`. + """Select valid cells based on the method specified as the + preprocessor's `filter_cells_by_outliers`. Args: adata: an AnnData object. @@ -255,7 +274,8 @@ def _filter_cells_by_outliers(self, adata: AnnData) -> None: self.filter_cells_by_outliers(adata, **self.filter_cells_by_outliers_kwargs) def _filter_genes_by_outliers(self, adata: AnnData) -> None: - """Select valid genes based on the method specified as the preprocessor's `filter_genes_by_outliers`. + """Select valid genes based on the method specified as the + preprocessor's `filter_genes_by_outliers`. Args: adata: an AnnData object. @@ -267,7 +287,8 @@ def _filter_genes_by_outliers(self, adata: AnnData) -> None: self.filter_genes_by_outliers(adata, **self.filter_genes_by_outliers_kwargs) def _select_genes(self, adata: AnnData) -> None: - """selecting gene by features, based on method specified as the preprocessor's `select_genes`. + """Select genes by features, based on the method specified as the + preprocessor's `select_genes`. Args: adata: an AnnData object. @@ -279,7 +300,8 @@ def _select_genes(self, adata: AnnData) -> None: self.select_genes(adata, **self.select_genes_kwargs) def _append_gene_list(self, adata: AnnData) -> None: - """Add genes to the feature gene list detected by the preprocessing steps. + """Add genes to the feature gene list detected by the preprocessing + steps. Args: adata: an AnnData object. @@ -291,7 +313,8 @@ def _append_gene_list(self, adata: AnnData) -> None: main_info("appended %d extra genes as required..." 
% len(append_genes)) def _exclude_gene_list(self, adata: AnnData) -> None: - """Remove genes from the feature gene list detected by the preprocessing steps. + """Remove genes from the feature gene list detected by the preprocessing + steps. Args: adata: an AnnData object. @@ -303,7 +326,8 @@ def _exclude_gene_list(self, adata: AnnData) -> None: main_info("excluded %d genes as required..." % len(exclude_genes)) def _force_gene_list(self, adata: AnnData) -> None: - """Use the provided gene list as the feature gene list, overwrite the gene list detected by the preprocessing steps. + """Use the provided gene list as the feature gene list, overwriting the + gene list detected by the preprocessing steps. Args: adata: an AnnData object. @@ -321,7 +345,8 @@ def _force_gene_list(self, adata: AnnData) -> None: main_info("self.force_gene_list is None, skipping filtering by gene list...") def _normalize_selected_genes(self, adata: AnnData) -> None: - """Normalize selected genes with method specified in the preprocessor's `normalize_selected_genes` + """Normalize selected genes with the method specified in the preprocessor's + `normalize_selected_genes`. Args: adata: an AnnData object. @@ -337,7 +362,8 @@ def _normalize_selected_genes(self, adata: AnnData) -> None: self.normalize_selected_genes(adata, **self.normalize_selected_genes_kwargs) def _normalize_by_cells(self, adata: AnnData) -> None: - """Performing cell-wise normalization based on method specified as the preprocessor's `normalize_by_cells`. + """Perform cell-wise normalization based on the method specified as the + preprocessor's `normalize_by_cells`. Args: adata: an AnnData object. @@ -351,7 +377,8 @@ def _normalize_by_cells(self, adata: AnnData) -> None: self.normalize_by_cells(adata, **self.normalize_by_cells_function_kwargs) def _log1p(self, adata: AnnData) -> None: - """Perform log1p on the data with args specified in the preprocessor's `log1p_kwargs`. + """Perform log1p on the data with args specified in the preprocessor's + `log1p_kwargs`. Args: adata: an AnnData object. @@ -369,7 +396,8 @@ def _log1p(self, adata: AnnData) -> None: self.log1p(adata, **self.log1p_kwargs) def _pca(self, adata: AnnData) -> None: - """Perform pca reduction with args specified in the preprocessor's `pca_kwargs`. + """Perform pca reduction with args specified in the preprocessor's + `pca_kwargs`. Args: adata: an AnnData object. @@ -379,6 +407,31 @@ def _pca(self, adata: AnnData) -> None: main_info("reducing dimension by PCA...") self.pca(adata, **self.pca_kwargs) + def preprocess_adata_seurat_wo_pca( + self, + adata: AnnData, + tkey: Optional[str] = None, + experiment_type: Optional[str] = None + ) -> None: + """Preprocess the anndata object according to the standard preprocessing in + the Seurat recipe without PCA. This can be used to test different + dimension reduction methods. + """ + main_info("Running preprocessing pipeline...") + temp_logger = LoggerManager.gen_logger("preprocessor-seurat_wo_pca") + temp_logger.log_time() + + self.standardize_adata(adata, tkey, experiment_type) + self._filter_cells_by_outliers(adata) + self._filter_genes_by_outliers(adata) + self._normalize_by_cells(adata) + self._select_genes(adata) + self._log1p(adata) + + temp_logger.finish_progress( + progress_name="preprocess by seurat wo pca recipe" + ) + def config_monocle_recipe( self, adata: AnnData, n_top_genes: int = 2000, gene_selection_method: str = "SVR" ) -> None: @@ -386,8 +439,10 @@ def config_monocle_recipe( Args: adata: an AnnData object. 
- n_top_genes: Number of top feature genes to select in the preprocessing step. Defaults to 2000. - gene_selection_method: Which sorting method to be used to select genes. Defaults to "SVR". + n_top_genes: Number of top feature genes to select in the + preprocessing step. Defaults to 2000. + gene_selection_method: Which sorting method to use to select + genes. Defaults to "SVR". """ n_obs, n_genes = adata.n_obs, adata.n_vars @@ -445,7 +500,7 @@ def config_monocle_recipe( # recipe monocle log1p all raw data in normalize_by_cells (dynamo version), so we do not need extra log1p transform. self.use_log1p = False - self.pca = pca_monocle + self.pca = pca self.pca_kwargs = {"pca_key": "X_pca"} def preprocess_adata_monocle( @@ -455,9 +510,10 @@ Args: adata: an AnnData object. - tkey: the key for time information (labeling time period for the cells) in .obs. Defaults to None. - experiment_type: the experiment type of the data. If not provided, would be inferred from the data. Defaults - to None. + tkey: the key for time information (labeling time period for the + cells) in .obs. Defaults to None. + experiment_type: the experiment type of the data. If not provided, + would be inferred from the data. Defaults to None. """ main_info("Running preprocessing pipeline...") @@ -485,7 +541,8 @@ def preprocess_adata_monocle( temp_logger.finish_progress(progress_name="preprocess") def config_seurat_recipe(self, adata: AnnData) -> None: - """Automatically configure the preprocessor for using the seurat style recipe. + """Automatically configure the preprocessor for using the seurat style + recipe. Args: adata: an AnnData object. @@ -503,16 +560,19 @@ def config_seurat_recipe(self, adata: AnnData) -> None: def preprocess_adata_seurat( self, adata: AnnData, tkey: Optional[str] = None, experiment_type: Optional[str] = None ) -> None: - """The preprocess pipeline in Seurat based on dispersion, implemented by dynamo authors. + """The preprocess pipeline in Seurat based on dispersion, implemented by + the dynamo authors. - Stuart and Butler et al. Comprehensive Integration of Single-Cell Data. Cell (2019) - Butler et al. Integrating single-cell transcriptomic data across different conditions, technologies, and species. Nat Biotechnol + Stuart and Butler et al. Comprehensive Integration of Single-Cell Data. + Cell (2019). Butler et al. Integrating single-cell transcriptomic data + across different conditions, technologies, and species. Nat Biotechnol. Args: adata: an AnnData object - tkey: the key for time information (labeling time period for the cells) in .obs. Defaults to None. - experiment_type: the experiment type of the data. If not provided, would be inferred from the data. Defaults - to None. + tkey: the key for time information (labeling time period for the + cells) in .obs. Defaults to None. + experiment_type: the experiment type of the data. If not provided, + would be inferred from the data. Defaults to None. """ temp_logger = LoggerManager.gen_logger("preprocessor-seurat") @@ -528,7 +588,8 @@ def preprocess_adata_seurat( temp_logger.finish_progress(progress_name="preprocess by seurat recipe") def config_sctransform_recipe(self, adata: AnnData) -> None: - """Automatically configure the preprocessor for using the sctransform style recipe. + """Automatically configure the preprocessor for using the sctransform + style recipe. Args: adata: an AnnData object. 
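Note: the new preprocess_adata_seurat_wo_pca entry point added above stops just before dimension reduction, so an arbitrary reducer can be applied afterwards. A sketch under that assumption (editorial, not part of the patch; sklearn's TruncatedSVD stands in for whatever method is being tested, and "X_svd" is an arbitrary key):

import dynamo as dyn
from dynamo.preprocessing import Preprocessor
from sklearn.decomposition import TruncatedSVD

adata = dyn.sample_data.zebrafish()
preprocessor = Preprocessor()
preprocessor.preprocess_adata_seurat_wo_pca(adata)

# Reduce only the selected feature genes, mirroring what pca() does internally.
X = adata.X[:, adata.var.use_for_pca.values]
adata.obsm["X_svd"] = TruncatedSVD(n_components=30, random_state=0).fit_transform(X)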
@@ -553,13 +614,15 @@ def preprocess_adata_sctransform( ) -> None: """Python implementation of https://github.com/satijalab/sctransform. - Hao and Hao et al. Integrated analysis of multimodal single-cell data. Cell (2021) + Hao and Hao et al. Integrated analysis of multimodal single-cell data. + Cell (2021) Args: adata: an AnnData object - tkey: the key for time information (labeling time period for the cells) in .obs. Defaults to None. - experiment_type: the experiment type of the data. If not provided, would be inferred from the data. Defaults - to None. + tkey: the key for time information (labeling time period for the + cells) in .obs. Defaults to None. + experiment_type: the experiment type of the data. If not provided, + would be inferred from the data. Defaults to None. """ temp_logger = LoggerManager.gen_logger("preprocessor-sctransform") @@ -579,7 +642,8 @@ def preprocess_adata_sctransform( temp_logger.finish_progress(progress_name="preprocess by sctransform recipe") def config_pearson_residuals_recipe(self, adata: AnnData) -> None: - """Automatically configure the preprocessor for using the Pearson residuals style recipe. + """Automatically configure the preprocessor for using the Pearson + residuals style recipe. Args: adata: an AnnData object. @@ -602,13 +666,16 @@ def preprocess_adata_pearson_residuals( ) -> None: """A pipeline proposed in Pearson residuals (Lause, Berens & Kobak, 2021). - Lause, J., Berens, P. & Kobak, D. Analytic Pearson residuals for normalization of single-cell RNA-seq UMI data. Genome Biol 22, 258 (2021). https://doi.org/10.1186/s13059-021-02451-7 + Lause, J., Berens, P. & Kobak, D. Analytic Pearson residuals for + normalization of single-cell RNA-seq UMI data. Genome Biol 22, 258 + (2021). https://doi.org/10.1186/s13059-021-02451-7 Args: adata: an AnnData object - tkey: the key for time information (labeling time period for the cells) in .obs. Defaults to None. - experiment_type: the experiment type of the data. If not provided, would be inferred from the data. Defaults - to None. + tkey: the key for time information (labeling time period for the + cells) in .obs. Defaults to None. + experiment_type: the experiment type of the data. If not provided, + would be inferred from the data. Defaults to None. """ temp_logger = LoggerManager.gen_logger("preprocessor-sctransform") @@ -622,10 +689,13 @@ def preprocess_adata_pearson_residuals( temp_logger.finish_progress(progress_name="preprocess by pearson residual recipe") def config_monocle_pearson_residuals_recipe(self, adata: AnnData) -> None: - """Automatically configure the preprocessor for using the Monocle-Pearson-residuals style recipe. + """Automatically configure the preprocessor for using the + Monocle-Pearson-residuals style recipe. - Useful when you want to use Pearson residual to obtain feature genes and perform PCA but also using the standard - size-factor normalization and log1p analyses to normalize data for RNA velocity and vector field analyses. + Useful when you want to use Pearson residual to obtain feature genes and + perform PCA but also use the standard size-factor normalization and + log1p analyses to normalize data for RNA velocity and vector field + analyses. Args: adata: an AnnData object. @@ -649,16 +719,19 @@ def preprocess_adata_monocle_pearson_residuals( ) -> None: """A combined pipeline of monocle and pearson_residuals. - Results after running pearson_residuals can contain negative values, an undesired feature for later RNA velocity - analysis. 
This function combine pearson_residual and monocle recipes so that it uses Pearson residual to obtain - feature genes and perform PCA but also uses monocle recipe to generate X_spliced, X_unspliced, X_new, X_total or - other data values for RNA velocity and downstream vector field analyses. + Results after running pearson_residuals can contain negative values, an + undesired feature for later RNA velocity analysis. This function combines + the pearson_residual and monocle recipes so that it uses Pearson residual to + obtain feature genes and perform PCA but also uses the monocle recipe to + generate X_spliced, X_unspliced, X_new, X_total or other data values for + RNA velocity and downstream vector field analyses. Args: adata: an AnnData object - tkey: the key for time information (labeling time period for the cells) in .obs. Defaults to None. - experiment_type: the experiment type of the data. If not provided, would be inferred from the data. Defaults - to None. + tkey: the key for time information (labeling time period for the + cells) in .obs. Defaults to None. + experiment_type: the experiment type of the data. If not provided, + would be inferred from the data. Defaults to None. """ temp_logger = LoggerManager.gen_logger("preprocessor-monocle-pearson-residual") @@ -692,8 +765,10 @@ def preprocess_adata( Args: adata: An AnnData object. - recipe: The recipe used to preprocess the data. Defaults to "monocle". - tkey: the key for time information (labeling time period for the cells) in .obs. Defaults to None. + recipe: The recipe used to preprocess the data. Defaults to + "monocle". + tkey: the key for time information (labeling time period for the + cells) in .obs. Defaults to None. Raises: NotImplementedError: the recipe is invalid. diff --git a/dynamo/preprocessing/__init__.py b/dynamo/preprocessing/__init__.py index 4ce7baefc..69cc46c09 100755 --- a/dynamo/preprocessing/__init__.py +++ b/dynamo/preprocessing/__init__.py @@ -26,7 +26,7 @@ cook_dist, decode, filter_genes_by_pattern, - pca_monocle, + pca, relative2abs, scale, top_pca_genes, @@ -65,7 +65,7 @@ "cell_cycle_scores", "basic_stats", "cook_dist", - "pca_monocle", + "pca", "top_pca_genes", "relative2abs", "scale", diff --git a/dynamo/preprocessing/preprocess.py b/dynamo/preprocessing/preprocess.py index a6299d7a6..8a02f49df 100755 --- a/dynamo/preprocessing/preprocess.py +++ b/dynamo/preprocessing/preprocess.py @@ -53,7 +53,7 @@ get_sz_exprs, merge_adata_attrs, normalize_mat_monocle, - pca_monocle, + pca, sz_util, unique_var_obs_adata, ) @@ -1285,7 +1285,7 @@ def recipe_monocle( logger.info("applying %s ..." 
% (method.upper())) if method == "pca": - adata = pca_monocle(adata, pca_input, num_dim, "X_" + method.lower()) + adata = pca(adata, pca_input, num_dim, "X_" + method.lower()) # TODO remove adata.obsm["X"] in future, use adata.obsm.X_pca instead adata.obsm["X"] = adata.obsm["X_" + method.lower()] @@ -1438,7 +1438,7 @@ def recipe_velocyto( CM = CM[:, valid_ind] if method == "pca": - adata, fit, _ = pca_monocle(adata, CM, num_dim, "X_" + method.lower(), return_all=True) + adata, fit, _ = pca(adata, CM, num_dim, "X_" + method.lower(), return_all=True) # adata.obsm['X_' + method.lower()] = reduce_dim elif method == "ica": diff --git a/dynamo/preprocessing/preprocessor_utils.py b/dynamo/preprocessing/preprocessor_utils.py index 5a15bc9a4..46a2602a3 100644 --- a/dynamo/preprocessing/preprocessor_utils.py +++ b/dynamo/preprocessing/preprocessor_utils.py @@ -49,7 +49,7 @@ get_sz_exprs, merge_adata_attrs, normalize_mat_monocle, - pca_monocle, + pca, sz_util, unique_var_obs_adata, ) @@ -65,7 +65,7 @@ def is_log1p_transformed_adata(adata: anndata.AnnData) -> bool: A flag shows whether the adata object is log transformed. """ - chosen_gene_indices = np.random.choice(adata.n_obs, 10) + chosen_gene_indices = np.random.choice(adata.n_vars, 10) _has_log1p_transformed = not np.allclose( np.array(adata.X[:, chosen_gene_indices].sum(1)), np.array(adata.layers["spliced"][:, chosen_gene_indices].sum(1)), @@ -1528,7 +1528,7 @@ def is_nonnegative_integer_arr(mat: Union[np.ndarray, spmatrix, list]) -> bool: def pca_selected_genes_wrapper( adata: AnnData, pca_input: Union[np.ndarray, None] = None, n_pca_components: int = 30, key: str = "X_pca" ): - """A wrapper for pca_monocle function to reduce dimensions of the Adata with PCA. + """A wrapper for the pca function to reduce dimensions of the AnnData with PCA. Args: adata: an AnnData object. @@ -1537,4 +1537,4 @@ key: the key to store the calculation result. Defaults to "X_pca". """ - adata = pca_monocle(adata, pca_input, n_pca_components=n_pca_components, pca_key=key) + adata = pca(adata, pca_input, n_pca_components=n_pca_components, pca_key=key) diff --git a/dynamo/preprocessing/utils.py b/dynamo/preprocessing/utils.py index bf8286e53..5e04a5b1b 100755 --- a/dynamo/preprocessing/utils.py +++ b/dynamo/preprocessing/utils.py @@ -1,5 +1,5 @@ import warnings -from typing import Callable, Iterable, List, Tuple, Union +from typing import Callable, Dict, Iterable, List, Optional, Tuple, Union try: from typing import Literal @@ -14,8 +14,12 @@ import scipy.sparse import statsmodels.api as sm from anndata import AnnData -from scipy.sparse import csr_matrix, issparse +from scipy.sparse.linalg import LinearOperator, svds +from scipy.sparse import csc_matrix, csr_matrix, issparse from sklearn.decomposition import PCA, TruncatedSVD +from sklearn.utils import check_random_state +from sklearn.utils.extmath import svd_flip +from sklearn.utils.sparsefuncs import mean_variance_axis from ..configuration import DKM, DynamoAdataKeyManager from ..dynamo_logger import ( @@ -774,8 +778,131 @@ def decode(adata: anndata.AnnData) -> None: # --------------------------------------------------------------------------------------------------- # pca +def _truncatedSVD_with_center( + X: Union[csc_matrix, csr_matrix], + n_components: int = 30, + random_state: int = 0, +) -> Tuple: + """Center a sparse matrix and perform truncated SVD on it. 
-def pca_monocle( + It uses `scipy.sparse.linalg.LinearOperator` to express the centered sparse + input by given matrix-vector and matrix-matrix products. Then truncated + singular value decomposition (SVD) can be solved without calculating the + individual entries of the centered matrix. The right singular vectors after + decomposition represent the principal components. This function is inspired + by the implementation of scanpy (https://github.com/scverse/scanpy). + + Args: + X: The input sparse matrix to perform truncated SVD on. + n_components: The number of components to keep. Default is 30. + random_state: Seed for the random number generator. Default is 0. + + Returns: + A sklearn PCA object containing the right singular vectors and the + amount of variance explained by each principal component, together + with the transformed input matrix. + """ + random_state = check_random_state(random_state) + np.random.set_state(random_state.get_state()) + v0 = random_state.uniform(-1, 1, np.min(X.shape)) + n_components = min(n_components, X.shape[1] - 1) + + mean = X.mean(0) + X_H = X.T.conj() + mean_H = mean.T.conj() + ones = np.ones(X.shape[0])[None, :].dot + + # The following callables implement the different types of matrix calculation. + def matvec(x): + """Matrix-vector multiplication. Performs the operation X_centered*x + where x is a column vector or a 1-D array.""" + return X.dot(x) - mean.dot(x) + + def matmat(x): + """Matrix-matrix multiplication. Performs the operation X_centered*x + where x is a matrix or ndarray.""" + return X.dot(x) - mean.dot(x) + + def rmatvec(x): + """Adjoint matrix-vector multiplication. Performs the operation + X_centered^H * x where x is a column vector or a 1-d array.""" + return X_H.dot(x) - mean_H.dot(ones(x)) + + def rmatmat(x): + """Adjoint matrix-matrix multiplication. Performs the operation + X_centered^H * x where x is a matrix or ndarray.""" + return X_H.dot(x) - mean_H.dot(ones(x)) + + # Construct the LinearOperator with the callables above. + X_centered = LinearOperator( + shape=X.shape, + matvec=matvec, + matmat=matmat, + rmatvec=rmatvec, + rmatmat=rmatmat, + dtype=X.dtype, + ) + + # Solve the SVD without calculating the individual entries of the centered matrix. + U, Sigma, VT = svds(X_centered, solver='arpack', k=n_components, v0=v0) + Sigma = Sigma[::-1] + U, VT = svd_flip(U[:, ::-1], VT[::-1]) + X_transformed = U * Sigma + components_ = VT + exp_var = np.var(X_transformed, axis=0) + _, full_var = mean_variance_axis(X, axis=0) + full_var = full_var.sum() + + result_dict = { + "X_pca": X_transformed, + "components_": components_, + "explained_variance_ratio_": exp_var / full_var, + } + + fit = PCA( + n_components=n_components, + random_state=random_state, + ) + X_pca = result_dict["X_pca"] + fit.components_ = result_dict["components_"] + fit.explained_variance_ratio_ = result_dict[ + "explained_variance_ratio_"] + + return fit, X_pca + +def _pca_fit( + X: np.ndarray, + pca_func: Callable, + n_components: int = 30, + **kwargs, +) -> Tuple: + """Apply PCA to the input data array X using the specified PCA function. + + Args: + X: the input data array of shape (n_samples, n_features). + pca_func: the PCA function to use, which should have a 'fit' and + 'transform' method, such as the PCA class or the IncrementalPCA + class from sklearn.decomposition. + n_components: the number of principal components to compute. If + n_components is greater than or equal to the number of features in + X, it will be set to n_features - 1, since the solvers require + n_components to be strictly smaller than the number of features. 
+ **kwargs: any additional keyword arguments that will be passed to the + PCA function. + + Returns: + A tuple containing two elements: + - The fitted PCA object, which has a 'fit' and 'transform' method. + - The transformed array X_pca of shape (n_samples, n_components). + """ + fit = pca_func( + n_components=min(n_components, X.shape[1] - 1), + **kwargs, + ).fit(X) + X_pca = fit.transform(X) + return fit, X_pca + + +def pca( adata: AnnData, X_data: np.ndarray = None, n_pca_components: int = 30, @@ -783,29 +910,54 @@ def pca_monocle( pcs_key: str = "PCs", genes_to_append: Union[List[str], None] = None, layer: Union[List[str], str, None] = None, + svd_solver: Literal["randomized", "arpack"] = "randomized", + random_state: int = 0, + use_truncated_SVD_threshold: int = 500000, + use_incremental_PCA: bool = False, + incremental_batch_size: Optional[int] = None, return_all: bool = False, ) -> Union[AnnData, Tuple[AnnData, Union[PCA, TruncatedSVD], np.ndarray]]: """Perform PCA reduction for monocle recipe. + When a large dataset is used (e.g. 1 million cells), Incremental PCA is + recommended to avoid memory issues. When the cell number is less than half + a million, PCA or _truncatedSVD_with_center (which works on the sparse + matrix without explicitly centering it) will be used by default. TruncatedSVD + is the fastest method. Unlike the other methods, which center the data first, + it performs SVD decomposition on the raw input. Only use it when the dataset + is too large for the other methods. + Args: adata: an AnnData object. X_data: the data to perform dimension reduction on. Defaults to None. n_pca_components: number of PCA components reduced to. Defaults to 30. pca_key: the key to store the reduced data. Defaults to "X". - pcs_key: the key to store the principle axes in feature space. Defaults to "PCs". + pcs_key: the key to store the principal axes in feature space. Defaults + to "PCs". genes_to_append: a list of genes should be inspected. Defaults to None. - layer: the layer(s) to perform dimension reduction on. Would be overrided by X_data. Defaults to None. - return_all: whether to return the PCA fit model and the reduced array together with the updated AnnData object. - Defaults to False. + layer: the layer(s) to perform dimension reduction on. Would be + overridden by X_data. Defaults to None. + svd_solver: the svd_solver to solve svd decomposition in PCA. + random_state: the seed used to initialize the random state for PCA. + use_truncated_SVD_threshold: the threshold of observations to use + truncated SVD instead of standard PCA for efficiency. + use_incremental_PCA: whether to use Incremental PCA. Recommended when + the dataset is too large to fit in memory. + incremental_batch_size: The number of samples to use for each batch when + performing incremental PCA. If batch_size is None, then batch_size + is inferred from the data and set to 5 * n_features. + return_all: whether to return the PCA fit model and the reduced array + together with the updated AnnData object. Defaults to False. Raises: ValueError: layer provided is not invalid. ValueError: list of genes to append is invalid. Returns: - The the updated AnnData object with reduced data if `return_all` is False. Otherwise, a tuple (adata, fit, - X_pca), where adata is the updated AnnData object, fit is the fit model for dimension reduction, and X_pca is - the reduced array, will be returned. + The updated AnnData object with reduced data if `return_all` is False. 
+ Otherwise, a tuple (adata, fit, X_pca), where adata is the updated + AnnData object, fit is the fit model for dimension reduction, and X_pca + is the reduced array, will be returned. """ # only use genes pass filter (based on use_for_pca) to perform dimension reduction. @@ -847,33 +999,53 @@ def pca_monocle( adata.var.iloc[bad_genes, adata.var.columns.tolist().index("use_for_pca")] = False X_data = X_data[:, valid_ind] - USE_TRUNCATED_SVD_THRESHOLD = 100000 - if adata.n_obs < USE_TRUNCATED_SVD_THRESHOLD: - pca = PCA( - n_components=min(n_pca_components, X_data.shape[1] - 1), - svd_solver="arpack", - random_state=0, + if use_incremental_PCA: + from sklearn.decomposition import IncrementalPCA + fit, X_pca = _pca_fit( + X_data, + pca_func=IncrementalPCA, + n_components=n_pca_components, + batch_size=incremental_batch_size, ) - fit = pca.fit(X_data.toarray()) if issparse(X_data) else pca.fit(X_data) - X_pca = fit.transform(X_data.toarray()) if issparse(X_data) else fit.transform(X_data) - adata.obsm[pca_key] = X_pca - adata.uns[pcs_key] = fit.components_.T + else: + if adata.n_obs < use_truncated_SVD_threshold: + if not issparse(X_data): + fit, X_pca = _pca_fit( + X_data, + pca_func=PCA, + n_components=n_pca_components, + svd_solver=svd_solver, + random_state=random_state, + ) + else: + fit, X_pca = _truncatedSVD_with_center( + X_data, + n_components=n_pca_components, + random_state=random_state, + ) + else: + # TruncatedSVD is the fastest method we have. It doesn't center the + # data. It only performs SVD decomposition, which is the second part + # in our _truncatedSVD_with_center function. + fit, X_pca = _pca_fit( + X_data, + pca_func=TruncatedSVD, + n_components=n_pca_components + 1, + random_state=random_state + ) + # first column is related to the total UMI (or library size) + X_pca = X_pca[:, 1:] - adata.uns["explained_variance_ratio_"] = fit.explained_variance_ratio_ + adata.obsm[pca_key] = X_pca + if use_incremental_PCA or adata.n_obs < use_truncated_SVD_threshold: + adata.uns[pcs_key] = fit.components_.T + adata.uns[ + "explained_variance_ratio_"] = fit.explained_variance_ratio_ else: - # unscaled PCA - fit = TruncatedSVD( - n_components=min(n_pca_components + 1, X_data.shape[1] - 1), - random_state=0, - ) # first columns is related to the total UMI (or library size) - X_pca = fit.fit_transform(X_data)[:, 1:] - adata.obsm[pca_key] = X_pca adata.uns[pcs_key] = fit.components_.T[:, 1:] - - adata.uns["explained_variance_ratio_"] = fit.explained_variance_ratio_[1:] - - adata.uns["explained_variance_ratio_"] = fit.explained_variance_ratio_[1:] + adata.uns[ + "explained_variance_ratio_"] = fit.explained_variance_ratio_[1:] adata.uns["pca_mean"] = fit.mean_ if hasattr(fit, "mean_") else None if return_all: diff --git a/dynamo/tools/cell_velocities.py b/dynamo/tools/cell_velocities.py index d5a114710..32713aa42 100755 --- a/dynamo/tools/cell_velocities.py +++ b/dynamo/tools/cell_velocities.py @@ -541,9 +541,9 @@ def cell_velocities( if "pca_fit" not in adata.uns_keys() or type(adata.uns["pca_fit"]) == str: CM = adata.X[:, adata.var.use_for_dynamics.values] - from ..preprocessing.utils import pca_monocle + from ..preprocessing.utils import pca - adata, pca_fit, X_pca = pca_monocle(adata, CM, n_pca_components, "X", return_all=True) + adata, pca_fit, X_pca = pca(adata, CM, n_pca_components, "X", return_all=True) adata.uns["pca_fit"] = pca_fit X_pca, pca_fit = adata.obsm["X"], adata.uns["pca_fit"] diff --git a/dynamo/tools/clustering.py b/dynamo/tools/clustering.py index 590de4be1..2725b0fb8 
100644 --- a/dynamo/tools/clustering.py +++ b/dynamo/tools/clustering.py @@ -11,7 +11,7 @@ from ..preprocessing.preprocessor_utils import filter_genes_by_outliers as filter_genes from ..preprocessing.preprocessor_utils import log1p_adata as log1p from ..preprocessing.preprocessor_utils import normalize_cell_expr_by_size_factors -from ..preprocessing.utils import pca_monocle +from ..preprocessing.utils import pca from ..utils import LoggerManager, copy_adata from .connectivity import _gen_neighbor_keys, neighbors from .utils import update_dict @@ -642,7 +642,7 @@ def scc( adata.uns["pp"] = {} normalize_cell_expr_by_size_factors(adata, layers="X") log1p(adata) - pca_monocle(adata, n_pca_components=30, pca_key="X_pca") + pca(adata, n_pca_components=30, pca_key="X_pca") neighbors(adata, n_neighbors=e_neigh) if "X_" + spatial_key not in adata.obsm.keys(): diff --git a/dynamo/tools/connectivity.py b/dynamo/tools/connectivity.py index fe03afdc9..2bd91d15a 100755 --- a/dynamo/tools/connectivity.py +++ b/dynamo/tools/connectivity.py @@ -564,14 +564,14 @@ def neighbors( logger.info("X_data is None, fetching or recomputing...", indent_level=2) if basis == "pca" and "X_pca" not in adata.obsm_keys(): logger.info("PCA as basis not X_pca not found, doing PCAs", indent_level=2) - from ..preprocessing.utils import pca_monocle + from ..preprocessing.utils import pca CM = adata.X if genes is None else adata[:, genes].X cm_genesums = CM.sum(axis=0) valid_ind = np.logical_and(np.isfinite(cm_genesums), cm_genesums != 0) valid_ind = np.array(valid_ind).flatten() CM = CM[:, valid_ind] - adata, _, _ = pca_monocle(adata, CM, pca_key="X_pca", n_pca_components=n_pca_components, return_all=True) + adata, _, _ = pca(adata, CM, pca_key="X_pca", n_pca_components=n_pca_components, return_all=True) X_data = adata.obsm["X_pca"] else: diff --git a/dynamo/tools/moments.py b/dynamo/tools/moments.py index 35400c613..7c2dea5c9 100755 --- a/dynamo/tools/moments.py +++ b/dynamo/tools/moments.py @@ -8,7 +8,7 @@ from ..configuration import DKM, DynamoAdataKeyManager from ..dynamo_logger import LoggerManager -from ..preprocessing.utils import pca_monocle +from ..preprocessing.utils import pca from ..utils import copy_adata from .connectivity import mnn, normalize_knn_graph, umap_conn_indices_dist_embedding from .utils import elem_prod, get_mapper, inverse_norm @@ -121,7 +121,7 @@ def moments( valid_ind = np.logical_and(np.isfinite(cm_genesums), cm_genesums != 0) valid_ind = np.array(valid_ind).flatten() CM = CM[:, valid_ind] - adata, fit, _ = pca_monocle( + adata, fit, _ = pca( adata, CM, n_pca_components=n_pca_components, diff --git a/dynamo/tools/recipes.py b/dynamo/tools/recipes.py index 8332752f6..4bbea67b2 100644 --- a/dynamo/tools/recipes.py +++ b/dynamo/tools/recipes.py @@ -1,7 +1,7 @@ import numpy as np from ..configuration import DynamoAdataConfig -from ..preprocessing.utils import pca_monocle +from ..preprocessing.utils import pca from .cell_velocities import cell_velocities from .connectivity import neighbors, normalize_knn_graph from .dimension_reduction import reduceDimension @@ -86,7 +86,7 @@ def recipe_kin_data( An updated adata object that went through a proper and typical time-resolved RNA velocity analysis. 
""" from ..preprocessing import recipe_monocle - from ..preprocessing.utils import detect_experiment_datatype, pca_monocle + from ..preprocessing.utils import detect_experiment_datatype, pca keep_filtered_cells = DynamoAdataConfig.use_default_var_if_none( keep_filtered_cells, DynamoAdataConfig.RECIPE_KEEP_FILTERED_CELLS_KEY @@ -144,7 +144,7 @@ def recipe_kin_data( valid_ind = np.logical_and(np.isfinite(cm_genesums), cm_genesums != 0) valid_ind = np.array(valid_ind).flatten() - pca_monocle(adata, CM[:, valid_ind], pca_key="X_spliced_pca") + pca(adata, CM[:, valid_ind], pca_key="X_spliced_pca") # then get neighbors graph based on X_spliced_pca neighbors(adata, X_data=adata.obsm["X_spliced_pca"], layer="X_spliced") # then normalize neighbors graph so that each row sums up to be 1 @@ -268,7 +268,7 @@ def recipe_deg_data( """ from ..preprocessing import recipe_monocle - from ..preprocessing.utils import detect_experiment_datatype, pca_monocle + from ..preprocessing.utils import detect_experiment_datatype, pca keep_filtered_cells = DynamoAdataConfig.use_default_var_if_none( keep_filtered_cells, DynamoAdataConfig.RECIPE_KEEP_FILTERED_CELLS_KEY @@ -322,7 +322,7 @@ def recipe_deg_data( cm_genesums = CM.sum(axis=0) valid_ind = np.logical_and(np.isfinite(cm_genesums), cm_genesums != 0) valid_ind = np.array(valid_ind).flatten() - pca_monocle(adata, CM[:, valid_ind], pca_key="X_total_pca") + pca(adata, CM[:, valid_ind], pca_key="X_total_pca") # then get neighbors graph based on X_spliced_pca neighbors(adata, X_data=adata.obsm["X_total_pca"], layer="X_total") # then normalize neighbors graph so that each row sums up to be 1 @@ -454,7 +454,7 @@ def recipe_mix_kin_deg_data( An updated adata object that went through a proper and typical time-resolved RNA velocity analysis. """ from ..preprocessing import recipe_monocle - from ..preprocessing.utils import detect_experiment_datatype, pca_monocle + from ..preprocessing.utils import detect_experiment_datatype, pca keep_filtered_cells = DynamoAdataConfig.use_default_var_if_none( keep_filtered_cells, DynamoAdataConfig.RECIPE_KEEP_FILTERED_CELLS_KEY @@ -512,7 +512,7 @@ def recipe_mix_kin_deg_data( valid_ind = np.logical_and(np.isfinite(cm_genesums), cm_genesums != 0) valid_ind = np.array(valid_ind).flatten() - pca_monocle(adata, CM[:, valid_ind], pca_key="X_spliced_pca") + pca(adata, CM[:, valid_ind], pca_key="X_spliced_pca") # then get neighbors graph based on X_spliced_pca neighbors(adata, X_data=adata.obsm["X_spliced_pca"], layer="X_spliced") # then normalize neighbors graph so that each row sums up to be 1 @@ -636,7 +636,7 @@ def recipe_one_shot_data( An updated adata object that went through a proper and typical time-resolved RNA velocity analysis. 
""" from ..preprocessing import recipe_monocle - from ..preprocessing.utils import detect_experiment_datatype, pca_monocle + from ..preprocessing.utils import detect_experiment_datatype, pca keep_filtered_cells = DynamoAdataConfig.use_default_var_if_none( keep_filtered_cells, DynamoAdataConfig.RECIPE_KEEP_FILTERED_CELLS_KEY @@ -694,7 +694,7 @@ def recipe_one_shot_data( valid_ind = np.logical_and(np.isfinite(cm_genesums), cm_genesums != 0) valid_ind = np.array(valid_ind).flatten() - pca_monocle(adata, CM[:, valid_ind], pca_key="X_spliced_pca") + pca(adata, CM[:, valid_ind], pca_key="X_spliced_pca") # then get neighbors graph based on X_spliced_pca neighbors(adata, X_data=adata.obsm["X_spliced_pca"], layer="X_spliced") # then normalize neighbors graph so that each row sums up to be 1 @@ -850,7 +850,7 @@ def velocity_N( # now let us first run pca with new RNA if recalculate_pca: - pca_monocle(adata, np.log1p(adata[:, adata.var.use_for_pca].layers["X_new"]), pca_key="X_pca") + pca(adata, np.log1p(adata[:, adata.var.use_for_pca].layers["X_new"]), pca_key="X_pca") # if there are unspliced / spliced data, delete them for now: for i in ["spliced", "unspliced", "X_spliced", "X_unspliced"]: diff --git a/dynamo/tools/utils_reduceDimension.py b/dynamo/tools/utils_reduceDimension.py index 144870dfb..75d68ab2e 100644 --- a/dynamo/tools/utils_reduceDimension.py +++ b/dynamo/tools/utils_reduceDimension.py @@ -3,7 +3,7 @@ import numpy as np from ..configuration import DKM -from ..preprocessing.utils import pca_monocle +from ..preprocessing.utils import pca from .connectivity import ( _gen_neighbor_keys, knn_to_adj, @@ -93,7 +93,7 @@ def prepare_dim_reduction( valid_ind = np.logical_and(np.isfinite(cm_genesums), cm_genesums != 0) valid_ind = np.array(valid_ind).flatten() CM = CM[:, valid_ind] - adata, fit, _ = pca_monocle( + adata, fit, _ = pca( adata, CM, n_pca_components=n_pca_components, @@ -123,7 +123,7 @@ def prepare_dim_reduction( valid_ind = np.logical_and(np.isfinite(cm_genesums), cm_genesums != 0) valid_ind = np.array(valid_ind).flatten() CM = CM[:, valid_ind] - adata, fit, _ = pca_monocle(adata, CM, n_pca_components=n_pca_components, pca_key=pca_key, return_all=True) + adata, fit, _ = pca(adata, CM, n_pca_components=n_pca_components, pca_key=pca_key, return_all=True) adata.uns["explained_variance_ratio_"] = fit.explained_variance_ratio_[1:] # valid genes used for dimension reduction calculation diff --git a/dynamo/vectorfield/clustering.py b/dynamo/vectorfield/clustering.py index 4c93387ba..d4ef2c038 100644 --- a/dynamo/vectorfield/clustering.py +++ b/dynamo/vectorfield/clustering.py @@ -8,7 +8,7 @@ from sklearn.neighbors import NearestNeighbors from ..dynamo_logger import main_info -from ..preprocessing.utils import pca_monocle +from ..preprocessing.utils import pca from ..tools.clustering import hdbscan, infomap, leiden, louvain from ..tools.Markov import ( grid_velocity_filter, @@ -390,7 +390,7 @@ def streamline_clusters( # clustering feature_adata = AnnData(feature_df) - pca_monocle(feature_adata, X_data=feature_df, pca_key="X_pca") + pca(feature_adata, X_data=feature_df, pca_key="X_pca") if clustering_method == "louvain": louvain(feature_adata, obsm_key="X_pca") elif clustering_method == "leiden": diff --git a/tests/test_preprocess.py b/tests/test_preprocess.py index 0f6842aef..d7a0d5895 100644 --- a/tests/test_preprocess.py +++ b/tests/test_preprocess.py @@ -5,6 +5,7 @@ import scipy.sparse from scipy import sparse from scipy.sparse.csr import csr_matrix +from sklearn.decomposition import PCA 
# from utils import * import dynamo as dyn @@ -164,6 +165,21 @@ def test_compute_gene_exp_fraction(): print("frac:", list(frac)) assert np.all(np.isclose(frac.flatten(), [2 / 5, 3 / 5])) +def test_pca(): + adata = dyn.sample_data.zebrafish() + preprocessor = Preprocessor() + preprocessor.preprocess_adata_seurat_wo_pca(adata) + adata = dyn.pp.pca(adata, n_pca_components=30) + assert adata.obsm["X"].shape[1] == 30 + assert adata.uns['PCs'].shape[1] == 30 + assert adata.uns['explained_variance_ratio_'].shape[0] == 30 + + X_filtered = adata.X[:, adata.var.use_for_pca.values].copy() + pca = PCA(n_components=30, random_state=0) + X_pca_sklearn = pca.fit_transform(X_filtered.toarray()) + assert np.linalg.norm(X_pca_sklearn[:, :10] - adata.obsm['X'][:, :10]) < 1e-1 + assert np.linalg.norm(pca.components_.T[:, :10] - adata.uns['PCs'][:, :10]) < 1e-1 + assert np.linalg.norm(pca.explained_variance_ratio_[:10] - adata.uns['explained_variance_ratio_'][:10]) < 1e-1 def test_preprocessor_seurat(adata): adata = dyn.sample_data.zebrafish() @@ -221,6 +237,7 @@ def test_is_nonnegative(): # # generate data if needed # adata = utils.gen_or_read_zebrafish_data() # test_is_log_transformed() + # test_pca() # test_Preprocessor_simple_run(dyn.sample_data.zebrafish()) # test_calc_dispersion_sparse()
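Note: the implicit-centering trick used by _truncatedSVD_with_center can be sanity-checked independently of dynamo. The sketch below (editorial; plain numpy/scipy only) builds the same kind of LinearOperator and compares its leading singular values against a dense SVD of the explicitly centered matrix:

import numpy as np
from scipy.sparse import random as sparse_random
from scipy.sparse.linalg import LinearOperator, svds

X = sparse_random(300, 50, density=0.1, format="csr", random_state=0)
mean = np.asarray(X.mean(axis=0))        # 1 x n_features row of column means
ones = np.ones(X.shape[0])[None, :].dot  # v -> sum of v's entries, kept 2-D

# Behaves like (X - 1 * mean) without ever densifying X.
X_centered = LinearOperator(
    shape=X.shape,
    dtype=X.dtype,
    matvec=lambda v: X.dot(v) - mean.dot(v),
    rmatvec=lambda v: X.T.dot(v) - mean.T.dot(ones(v)),
)

U, S, VT = svds(X_centered, k=10, solver="arpack")

# Reference: dense SVD of the explicitly centered matrix.
Xd = X.toarray()
S_ref = np.linalg.svd(Xd - Xd.mean(axis=0), compute_uv=False)[:10]
assert np.allclose(np.sort(S)[::-1], S_ref)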